Code Example #1
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


# Extract the last num_letters characters of the word (lowercased) as the feature
def gender_features(word, num_letters=2):
    return {'feature': word[-num_letters:].lower()}


if __name__ == '__main__':
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    random.seed(7)
    random.shuffle(labeled_names)

    input_names = ['Leonardo', 'Amy', 'Sam', 'Rock']

    for i in range(1, 5):
        print('\nNumber of letters:', i)
        featuresets = [(gender_features(n, i), gender)
                       for (n, gender) in labeled_names]

        train_set, test_set = featuresets[500:], featuresets[:500]
        classifier = NaiveBayesClassifier.train(train_set)

        print('Accuracy ==>',
              str(100 * nltk_accuracy(classifier, test_set)) + '%')

        # test the input data
        for name in input_names:
            print(name, '==>', classifier.classify(gender_features(name, i)))
Code Example #2
# Import libraries for Naive Bayes classification, accuracy evaluation, and the names corpus
import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names 
# Use the last N letters of the word as the feature
def extract_features(word, N=2):
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}
# Create the labeled data from the male and female name files that ship with NLTK
if __name__ == '__main__':
    male_list = [(name, 'male') for name in names.words('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    data = male_list + female_list
    random.seed(5)
    random.shuffle(data)
# Names to be tested on
namesInput = ['rajesh', 'gaurav', 'swati', 'shubha']

# Use 80% of the data for training and the rest for testing
train_sample = int(0.8 * len(data))

for i in range(1, 6):
    print("\nNumber of end letters:", i)
    # Feature extraction for every (name, gender) pair
    features = [(extract_features(n, i), gender) for (n, gender) in data]
    train_data, test_data = features[:train_sample], features[train_sample:]
    classifier = NaiveBayesClassifier.train(train_data)
    # Accuracy of the classifier on the held-out test data
    accuracy_classifier = round(100 * nltk_accuracy(classifier, test_data), 2)
    print('Accuracy = ' + str(accuracy_classifier) + '%')
    # Classify each input name with the current number of end letters
    for name in namesInput:
        print(name, '==>', classifier.classify(extract_features(name, i)))
Code Example #3
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


# Feature extractor (as defined in Code Example #1): the last num_letters characters, lowercased
def gender_features(word, num_letters=2):
    return {'feature': word[-num_letters:].lower()}


if __name__ == '__main__':
    # Extract labeled names
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    # Seed the random number generator and shuffle the training data
    random.seed(7)
    random.shuffle(labeled_names)

    # Define some input names
    input_names = ['Leonardo', 'Amy', 'Sam', 'Werner']

    # Sweep the parameter space
    for i in range(1, 5):
        print("Number of letters: {}".format(i))
        featuresets = [(gender_features(n, i), gender)
                       for (n, gender) in labeled_names]

        # Split the data into training and test sets
        train_set, test_set = featuresets[500:], featuresets[:500]

        # Classify with a Naive Bayes classifier
        classifier = NaiveBayesClassifier.train(train_set)

        # Print the accuracy of the classifier
        print("Accuracy: {}%".format(100 * nltk_accuracy(classifier, test_set)))

        # Predict outputs for the input names
        for name in input_names:
            print("{} ==> {}".format(
                name, classifier.classify(gender_features(name, i))))
Code Example #4
File: 情感分析.py  Project: ywyyy/PracticePush

# Build and train the model
import numpy as np

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy

# `dataset` is assumed to be a list of (feature_dict, label) pairs prepared earlier in the file
np.random.shuffle(dataset)
rows = int(len(dataset) * 0.8)  # 80% for the training set
train_set, test_set = dataset[:rows], dataset[rows:]
print('Num of train_set: ', len(train_set),
      '\nNum of test_set: ', len(test_set))
clf = NaiveBayesClassifier.train(train_set)

# Evaluate the model on the test set
acc = nltk_accuracy(clf, test_set)
# acc = clf.prob_classify(test_set)  # note: prob_classify expects a single feature dict, not a test set
print('Accuracy: {:.2f}%'.format(acc * 100))

# Use the model to predict new samples and check whether each sentence's sentiment is positive or negative
new_samples = [
    "It is an amazing movie",
    "This is a dull movie. I would never recommend it to anyone.",
    "The cinematography is pretty great in this movie",
    "The direction was terrible and the story was all over the place",
]
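
The snippet stops before the prediction step that its final comment describes. The sketch below shows one way that step could look, assuming the model was trained on the usual NLTK dict-of-words features ({word: True} per review); the extract_features helper here is a hypothetical stand-in, not part of the original file.

# Minimal sketch (assumption): bag-of-words features as commonly used with
# NLTK's NaiveBayesClassifier for movie-review sentiment.
def extract_features(words):
    return {word: True for word in words}

for sample in new_samples:
    features = extract_features(sample.split())
    predicted = clf.classify(features)      # most likely sentiment label
    prob = clf.prob_classify(features)      # probability distribution over labels
    print('{} ==> {} ({:.2f})'.format(sample, predicted, prob.prob(predicted)))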
Code Example #5
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


def extract_features(word, N=2):
    last_n_letters = word[-N:]
    return {'feature': last_n_letters.lower()}


male_list = [(name, 'male') for name in names.words('male.txt')]
female_list = [(name, 'female') for name in names.words('female.txt')]
data = (male_list + female_list)

num_train = int(0.8 * len(data))

random.seed(5)
random.shuffle(data)

for i in range(1, 6):
    print('\nNumber of end letters:', i)
    features = [(extract_features(n, i), gender) for (n, gender) in data]
    train_data, test_data = features[:num_train], features[num_train:]
    classifier = NaiveBayesClassifier.train(train_data)
    accuracy = round(100 * nltk_accuracy(classifier, test_data), 2)
    print('Accuracy = ' + str(accuracy) + '%')
    input_names = ['Alexander', 'Danielle', 'David', 'Cheryl']
    for name in input_names:
        print(name, '==>', classifier.classify(extract_features(name, i)))
Code Example #6
# coding: utf-8
import random

import input_data as datain
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import PlaintextCorpusReader

# Export the spam/ham data to CSV files
datain.out2csv()

# Load the exported CSV files as a plaintext corpus
message_corpus = PlaintextCorpusReader('./', ['spam_data.csv', 'ham_data.csv'])
all_message = message_corpus.words()


def massage_feature(word, num_letter=1):
    # Use the last num_letter characters of the word as the feature
    return {'feature': word[-num_letter:]}


# Label words from the spam file as '垃圾' (spam) and words from the ham file as '正常' (ham)
labels_name = ([(massage, '垃圾') for massage in message_corpus.words('spam_data.csv')] +
               [(massage, '正常') for massage in message_corpus.words('ham_data.csv')])
random.seed(7)
random.shuffle(labels_name)

featuresets = [(massage_feature(n), massage) for (n, massage) in labels_name]
train_set, test_set = featuresets[2000:], featuresets[:2000]
classifier = NaiveBayesClassifier.train(train_set)
print('Accuracy:', str(100 * nltk_accuracy(classifier, test_set)) + '%')
Code Example #7
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


# Feature extractor (as defined in Code Example #1): the last num_letters characters, lowercased
def gender_features(word, num_letters=2):
    return {'feature': word[-num_letters:].lower()}


if __name__ == "__main__":
    # Extract labeled names
    labeled_names = ([(name, 'male') for name in names.words('male.txt')]) + \
                    [(name, 'female') for name in names.words('female.txt')]

    # Seed the random number generator and shuffle the training data
    random.seed(7)
    random.shuffle(labeled_names)

    input_names = ['Leonardo', 'Amy', 'Sam']

    # Sweep the parameter space
    for i in range(1, 5):
        print('Number of letters: ', i)
        featuresets = [(gender_features(n, i), gender) for (n, gender) in labeled_names]

        # Split the data into training and test sets
        train_set, test_set = featuresets[500:], featuresets[:500]

        # Classify with a Naive Bayes classifier
        classifier = NaiveBayesClassifier.train(train_set)

        # Print the accuracy of the classifier
        print('Accuracy ==>', str(100 * nltk_accuracy(classifier, test_set)) + '%')

        # Predict outputs for the new inputs
        for name in input_names:
            print(name, '==>', classifier.classify(gender_features(name, i)))

Code Example #8
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names


# Feature extractor: the last num_letters characters of the name, lowercased
def gender_features(word, num_letters=2):
    return {'feature': word[-num_letters:].lower()}

if __name__ == '__main__':
    # Extract labeled names
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
                     [(name, 'female') for name in names.words('female.txt')])

    # Seed the random number generator and shuffle the training data
    random.seed(7)
    random.shuffle(labeled_names)

    # Define some input names
    input_names = ['Leonardo', 'Amy', 'Sam', 'Werner']

    # Sweep the parameter space
    for i in range(1, 5):
        print("Number of letters: {}".format(i))
        featuresets = [(gender_features(n, i), gender) for (n, gender) in labeled_names]
        
        # Split the data into training and test sets
        train_set, test_set = featuresets[500:], featuresets[:500]

        # Classify with a Naive Bayes classifier
        classifier = NaiveBayesClassifier.train(train_set)

        # Print the accuracy of the classifier
        print("Accuracy: {}%".format(100 * nltk_accuracy(classifier, test_set)))

        # Predict outputs for the input names
        for name in input_names:
            print("{} ==> {}".format(name, classifier.classify(gender_features(name, i))))
Code Example #9
File: NaiveBayes.py  Project: Wilkersoon/naivebayes
import random

import pandas as pd
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import PlaintextCorpusReader

# `normallist` is assumed to hold the segmented normal (ham) messages built earlier in the file
dataframe = pd.DataFrame({'normal': normallist})  # Wrap the list in a DataFrame so it can be saved as CSV
dataframe.to_csv('normal.csv', encoding='utf_8_sig', header=False,
                 index=False)  # Save to file

message_corpus = PlaintextCorpusReader('./',
                                       ['spam.csv', 'normal.csv'])  # Load the segmented-word files as a corpus

all_message = message_corpus.words()  # All segmented words as a list


def massage_feature(word, num_letter=1):  # Featurize a word by its last num_letter characters
    return {'feature': word[-num_letter:]}


labels_name = ([(massage, '垃圾')  # '垃圾' = spam
                for massage in message_corpus.words('spam.csv')] +
               [(massage, '正常')  # '正常' = normal (ham)
                for massage in message_corpus.words('normal.csv')])  # Label each word by its source file

random.seed(7)
random.shuffle(labels_name)

featuresets = [(massage_feature(n), massage)
               for (n, massage) in labels_name]  # Build (feature, label) pairs

# Of the 2000 samples, the first 400 are the test set and the remaining 1600 the training set
train_set, test_set = featuresets[400:], featuresets[:400]

# Train NLTK's NaiveBayesClassifier on the training set
classifier = NaiveBayesClassifier.train(train_set)

# Evaluate accuracy on the test set
print('Accuracy:', str(100 * nltk_accuracy(classifier, test_set)) + '%')
Code Example #10
import random

from nltk import NaiveBayesClassifier
from nltk.classify import accuracy as nltk_accuracy
from nltk.corpus import names

# Extract features from the input word
def gender_features(word, num_letters=2):
    return {'feature': word[-num_letters:].lower()}

if __name__=='__main__':
    # Extract labeled names
    labeled_names = ([(name, 'male') for name in names.words('male.txt')] +
            [(name, 'female') for name in names.words('female.txt')])

    random.seed(7)
    random.shuffle(labeled_names)
    input_names = ['Leonardo', 'Amy', 'Sam']

    # Sweeping the parameter space
    for i in range(1, 5):
        print('\nNumber of letters:', i)
        featuresets = [(gender_features(n, i), gender) for (n, gender) in labeled_names]
        train_set, test_set = featuresets[500:], featuresets[:500]
        classifier = NaiveBayesClassifier.train(train_set)

        # Print classifier accuracy
        print('Accuracy ==>', str(100 * nltk_accuracy(classifier, test_set)) + '%')

        # Predict outputs for new inputs
        for name in input_names:
            print(name, '==>', classifier.classify(gender_features(name, i)))