Example #1
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    # Split the dataset: 80% for training, 20% for testing, keeping the corresponding class labels
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print('Feature selection...')
    print('fs method: ' + fs_method, 'fs num: ' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()

    # doc_terms_list_train: list of tokenized documents from the training set
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]

    # doc_class_list_train: list of class indices, one per training document
    term_set_fs = FeatureSelections.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]
    print "term_set_fs length %s " % (len(term_set_fs))

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)
    # Naive Bayes classifier
    # clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit a MultinomialNB classifier
    # doc_test_predicted = clf.predict(doc_test_vec)

    # SVM classifier
    svclf = SVC(kernel='linear')
    svclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = svclf.predict(doc_test_vec)

    # KNN
    # knnclf = KNeighborsClassifier()  # default with k=5
    # knnclf.fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = knnclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)

    print('Accuracy:', acc)

    from sklearn.metrics import classification_report
    print('precision, recall, F1-score:')
    print(classification_report(doc_class_list_test, doc_test_predicted))

    return acc
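A note on the pattern above: the documented way to fix the feature set is to pass vocabulary= to the CountVectorizer constructor, which the attribute assignment emulates. A minimal, self-contained sketch of that form; the documents and term list below are made up for illustration only:

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical documents and selected feature terms, for illustration only.
train_docs = ["good movie", "bad movie", "good plot bad acting"]
test_docs = ["good acting"]
selected_terms = ["good", "bad", "acting"]

# vocabulary= fixes the feature set; fit_transform/transform then only
# count the listed terms, in the listed column order.
vectorizer = CountVectorizer(binary=True, vocabulary=selected_terms)
doc_train_vec = vectorizer.fit_transform(train_docs)
doc_test_vec = vectorizer.transform(test_docs)
print(doc_train_vec.toarray())  # columns: good, bad, acting
print(doc_test_vec.toarray())   # [[1 0 1]]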
Example #2
def vectorize(text_tab, feature_names=None):
    """return a numpy digit number array
    text_tab : numpy array of strings
    """
    process = []
    for tweet in text_tab:
        if type(tweet) == str:
            process.append(clean(tweet))
    process = np.array(process)
    vect = CountVectorizer()
    if feature_names is None:
        vect.fit(process)
    else:
        vect.vocabulary = feature_names
    res = vect.transform(process)
    return res.toarray(), vect.get_feature_names()
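A possible usage sketch for vectorize(). The clean() helper lives elsewhere in the original project, so a trivial stand-in is defined here, and the tweets are made up:

import numpy as np

def clean(text):
    # Stand-in for the project's real text-cleaning helper.
    return text.lower()

train_tweets = np.array(["Nice day today", "Awful traffic again"])
test_tweets = np.array(["nice traffic day"])

X_train, train_features = vectorize(train_tweets)
# Passing the training feature names keeps the test matrix column-aligned.
X_test, _ = vectorize(test_tweets, feature_names=train_features)
print(X_train.shape, X_test.shape)  # e.g. (2, 6) (1, 6)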
Example #3
def sent2bow(self, labeledList, idlist):
    print(len(idlist))
    x = []
    y = []
    total = [0] * len(self.avilableLabels)
    vectorizer = CountVectorizer()
    vectorizer.vocabulary = self.inverseVocab
    print(idlist)
    for i in idlist:
        print(i)
        doc = labeledList[i]
        for line in doc[2]:
            currentLabel = line[0]
            if currentLabel in self.avilableLabels:
                idx = self.avilableLabels.index(currentLabel)
                currentbow = vectorizer.fit_transform(
                    [line[1]]).toarray().tolist()[0]
                total[idx] += 1
                x.append(currentbow)
                y.append(idx)
    return x, y, total
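The method above turns each labelled sentence into one bag-of-words row over the fixed vocabulary in self.inverseVocab. A standalone sketch of that per-sentence step, with a toy vocabulary standing in for inverseVocab:

from sklearn.feature_extraction.text import CountVectorizer

inverse_vocab = {"price": 0, "battery": 1, "screen": 2}  # illustrative only
vectorizer = CountVectorizer(vocabulary=inverse_vocab)

sentence = "battery life is great, battery charges fast"
bow = vectorizer.fit_transform([sentence]).toarray().tolist()[0]
print(bow)  # [0, 2, 0] -> counts aligned with the fixed vocabulary indices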
Example #4
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    # corpus=["我 来到 北京 清华大学",#第一类文本切词后的结果,词之间以空格隔开
    # "他 来到 了 网易 杭研 大厦", #第二类文本的切词结果
    # "小明 硕士 毕业 与 中国 科学院",#第三类文本的切词结果
    # "我 爱 北京 天安门"] #第四类文本的切词结果
    vectorizer = CountVectorizer()  # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
    vectorizer.vocabulary = ['交通事故']
    transformer = TfidfTransformer()  # 该类会统计每个词语的tf-idf权值
    # frequency_matrix = vectorizer.fit_transform(corpus)
    frequency_matrix = [[0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 15529, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 4766, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 151, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13],
                        [0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    tfidf = transformer.fit_transform(frequency_matrix)  # compute tf-idf weights from the precomputed term-frequency matrix
    # print(vectorizer.get_feature_names())
    # print(frequency_matrix.toarray())
    # word = vectorizer.get_feature_names()  # all terms in the bag-of-words model
    word = ['离退休人员返聘合同纠纷', '请求确认人民调解协议效力', '劳动合同纠纷', '社会保险纠纷', '福利待遇纠纷', '劳动合同纠纷', '社会保险纠纷', '侵害商业秘密纠纷']
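A hypothetical continuation of the script above (not in the original): inspect each document's tf-idf weights alongside its label in word.

    weights = tfidf.toarray()
    for label, row in zip(word, weights):
        # Report the column index and weight of the highest-weighted term.
        top = row.argmax()
        print(label, top, round(float(row[top]), 4))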
Example #5
    train_vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 50000) 
    test_vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 50000) 
    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of strings.
    total_training_Vectorized = train_vectorizer.fit_transform(bag_pos_train + bag_neg_train).toarray()

    #Assign the vocabulary of the test vectorizer to be the same as the vocabulary of the train vectorizer:
    test_vectorizer.vocabulary = train_vectorizer.get_feature_names()

    total_testing_Vectorized = test_vectorizer.fit_transform(bag_pos_test + bag_neg_test).toarray()

    ###############

    #Create vectors with signs (1 = positive, -1=negative) for reviews, which will be combined with the vectorized arrays:
    train_signs = np.empty(shape = (1600, 1))
    train_signs[0:800,0] = 1
    train_signs[800:1600, 0] = -1

    test_signs = np.empty(shape = (400, 1))
    test_signs[0:200,0] = 1
    test_signs[200:400, 0] = -1

    ###############
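Sharing the vocabulary between two vectorizers, as above, can also be done by reusing the fitted training vectorizer directly; its transform() keeps the test columns identical to the training columns. A minimal sketch with made-up review lists standing in for bag_pos_train and friends:

from sklearn.feature_extraction.text import CountVectorizer

train_reviews = ["great film", "terrible film", "great plot"]  # illustrative
test_reviews = ["terrible plot"]

vectorizer = CountVectorizer(analyzer="word", max_features=50000)
X_train = vectorizer.fit_transform(train_reviews).toarray()
# transform() reuses the vocabulary learned by fit_transform(), so the test
# matrix has exactly the same columns as the training matrix.
X_test = vectorizer.transform(test_reviews).toarray()
print(X_train.shape, X_test.shape)  # (3, 4) (1, 4)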
Example #6
def Build_Title_WordVector(filename):
    '''
    QueryoneGramDict=ReadDictFromFile("D:\\Featureusedfortest\\Title_word_vectorgram_1_all.txt")
    QuerytwoGramDict=ReadDictFromFile("D:\\Featureusedfortest\\Title_word_vectorgram_2_all.txt")
    QuerytriGramDict=ReadDictFromFile("D:\\Featureusedfortest\\Title_word_vectorgram_3_all.txt")
    '''

    QueryoneGramDict = json.load(
        open("D:\\Featureusedfortest\\Title_word_vectorgram_1_all.txt"))
    QuerytwoGramDict = json.load(
        open("D:\\Featureusedfortest\\Title_word_vectorgram_2_all.txt"))
    QuerytriGramDict = json.load(
        open("D:\\Featureusedfortest\\Title_word_vectorgram_3_all.txt"))

    Query = ReadDictFromFile(filename)

    inputlen = len(Query)
    Gramone = []
    Gramtwo = []
    Gramtri = []
    for item in Query:
        #ngram
        item = item.strip('\n')
        tokens = item.split(' ')

        paddbi = []
        paddbi.extend('p')
        paddbi.extend(tokens)
        paddbi.extend('p')
        paddtr = []
        paddtr.extend(['p', 'p'])
        paddtr.extend(tokens)
        paddtr.extend(['p', 'p'])
        bi_tokens = nltk.bigrams(paddbi)
        #tri_tokens = nltk.trigrams(paddtr)
        bi_tokens = [(token[0] + token[1]) for token in bi_tokens]
        #tri_tokens =[(token[0]+token[1]+token[2]) for token in tri_tokens]
        onetmp = ""
        for token in tokens:
            onetmp += token
            onetmp += ' '
        Gramone.append(str(onetmp[0:-1]))

        twotmp = ""
        for token in bi_tokens:
            twotmp += token
            twotmp += ' '
        Gramtwo.append(str(twotmp[0:-1]))
        '''
           tritmp=""
           for token in tri_tokens:
                tritmp+= token
                tritmp+=' '
           Gramtri.append(str(tritmp[0:-1]))
           '''

    if inputlen != len(Gramone) or inputlen != len(Gramtwo):
        print("train size don't match !!\n")

    query_vectorizerone = CountVectorizer(stop_words=stopWords)
    query_vectorizerone.vocabulary = QueryoneGramDict
    onegramVectorizerArray = query_vectorizerone.transform(Gramone).toarray()
    #savetxt("D:\\abstractfeatures\\query_tf_idf_featuret.txt",tf_idf_featuret)

    query_vectorizertwo = CountVectorizer(stop_words=stopWords)
    query_vectorizertwo.vocabulary = QuerytwoGramDict
    twogramVectorizerArray = query_vectorizertwo.transform(Gramtwo).toarray()
    #savetxt("D:\\abstractfeatures\\query_tf_idf_featuret.txt",tf_idf_featuret)
    '''
    query_vectorizertri =CountVectorizer(stop_words = stopWords)
    query_vectorizertri.vocabulary=QuerytriGramDict
    trigramVectorizerArray = query_vectorizertri.transform(Gramtri).toarray()
     #savetxt("D:\\abstractfeatures\\query_tf_idf_featuret.txt",tf_idf_featuret)
     '''

    return onegramVectorizerArray, twogramVectorizerArray  #,trigramVectorizerArray
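The function above concatenates token pairs by hand and vectorizes them against a preloaded vocabulary. For reference, CountVectorizer can emit unigrams and bigrams itself via ngram_range; a minimal sketch with made-up titles:

from sklearn.feature_extraction.text import CountVectorizer

titles = ["cheap flight tickets", "flight delay compensation"]  # illustrative

# ngram_range=(1, 2) produces both unigrams and bigrams as features.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(titles)
print(X.shape)  # (2, 9): one column per distinct unigram and bigram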
Example #8
        cleaned_docs.append(
                ' '.join([lemmatizer.lemmatize(word.lower())
                    for word in doc.split()
                    if letters_only(word)
                    and word not in all_names]))
    return cleaned_docs

cleaned_emails = clean_text(emails)

cv = CountVectorizer(stop_words="english", max_features=500)

term_docs = cv.fit_transform(cleaned_emails)
print(term_docs[0])
feature_names = cv.get_feature_names()
print(feature_names[100])
feature_mapping = cv.vocabulary_  # vocabulary_ is the fitted term-to-index dict

def get_label_index(labels):
    from collections import defaultdict
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index

label_index = get_label_index(labels)

def get_prior(label_index):
    """Compute prior based on training samples
    Args:
        label_index (grouped sample indices by class)
    Returns: