Example #1
def text_classifly_twang(dataset_dir_name, fs_method, fs_num):
    print('Loading dataset, 80% for training, 20% for testing...')
    movie_reviews = load_files(dataset_dir_name)
    # Split the dataset: 80% for training, 20% for testing, keeping the corresponding class labels
    doc_str_list_train, doc_str_list_test, doc_class_list_train, doc_class_list_test = train_test_split(
        movie_reviews.data,
        movie_reviews.target,
        test_size=0.2,
        random_state=0)

    print('Feature selection...')
    print('fs method: ' + fs_method, 'fs num: ' + str(fs_num))
    vectorizer = CountVectorizer(binary=True)
    word_tokenizer = vectorizer.build_tokenizer()

    # doc_terms_list_train: list of tokenized documents from the training set
    doc_terms_list_train = [
        word_tokenizer(doc_str) for doc_str in doc_str_list_train
    ]

    # doc_class_list_train: list of class indices, one per training document
    term_set_fs = FeatureSelections.feature_selection(doc_terms_list_train,
                                                      doc_class_list_train,
                                                      fs_method)[:fs_num]
    print "term_set_fs length %s " % (len(term_set_fs))

    print('Building VSM model...')
    term_dict = dict(zip(term_set_fs, range(len(term_set_fs))))
    vectorizer.fixed_vocabulary = True
    vectorizer.vocabulary = term_dict
    doc_train_vec = vectorizer.fit_transform(doc_str_list_train)
    doc_test_vec = vectorizer.transform(doc_str_list_test)
    # Naive Bayes classifier
    # clf = MultinomialNB().fit(doc_train_vec, doc_class_list_train)  # fit a MultinomialNB classifier
    # doc_test_predicted = clf.predict(doc_test_vec)

    # SVM classifier
    svclf = SVC(kernel='linear')
    svclf.fit(doc_train_vec, doc_class_list_train)
    doc_test_predicted = svclf.predict(doc_test_vec)

    # KNN
    # knnclf = KNeighborsClassifier()  # default with k=5
    # knnclf.fit(doc_train_vec, doc_class_list_train)
    # doc_test_predicted = knnclf.predict(doc_test_vec)

    acc = np.mean(doc_test_predicted == doc_class_list_test)

    print('Accuracy:', acc)

    from sklearn.metrics import classification_report
    print('precision, recall, F1-score:')
    print(classification_report(doc_class_list_test, doc_test_predicted))

    return acc
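A note on the pattern above: the documented way to fix the feature set is to pass vocabulary= to the CountVectorizer constructor, which the attribute assignment emulates. A minimal, self-contained sketch of that form; the documents and term list below are made up for illustration only:

from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical documents and selected feature terms, for illustration only.
train_docs = ["good movie", "bad movie", "good plot bad acting"]
test_docs = ["good acting"]
selected_terms = ["good", "bad", "acting"]

# vocabulary= fixes the feature set; fit_transform/transform then only
# count the listed terms, in the listed column order.
vectorizer = CountVectorizer(binary=True, vocabulary=selected_terms)
doc_train_vec = vectorizer.fit_transform(train_docs)
doc_test_vec = vectorizer.transform(test_docs)
print(doc_train_vec.toarray())  # columns: good, bad, acting
print(doc_test_vec.toarray())   # [[1 0 1]]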
Example #2
def vectorize(text_tab, feature_names=None):
    """return a numpy digit number array
    text_tab : numpy array of strings
    """
    process = []
    for tweet in text_tab:
        if type(tweet) == str:
            process.append(clean(tweet))
    process = np.array(process)
    vect = CountVectorizer()
    if feature_names is None:
        vect.fit(process)
    else:
        vect.vocabulary = feature_names
    res = vect.transform(process)
    return res.toarray(), vect.get_feature_names()
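A possible usage sketch for vectorize(). The clean() helper lives elsewhere in the original project, so a trivial stand-in is defined here, and the tweets are made up:

import numpy as np

def clean(text):
    # Stand-in for the project's real text-cleaning helper.
    return text.lower()

train_tweets = np.array(["Nice day today", "Awful traffic again"])
test_tweets = np.array(["nice traffic day"])

X_train, train_features = vectorize(train_tweets)
# Passing the training feature names keeps the test matrix column-aligned.
X_test, _ = vectorize(test_tweets, feature_names=train_features)
print(X_train.shape, X_test.shape)  # e.g. (2, 6) (1, 6)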
Example #3
def sent2bow(self, labeledList, idlist):
    print(len(idlist))
    x = []
    y = []
    total = [0] * len(self.avilableLabels)
    vectorizer = CountVectorizer()
    vectorizer.vocabulary = self.inverseVocab
    print(idlist)
    for i in idlist:
        print(i)
        doc = labeledList[i]
        for line in doc[2]:
            currentLabel = line[0]
            if currentLabel in self.avilableLabels:
                idx = self.avilableLabels.index(currentLabel)
                currentbow = vectorizer.fit_transform(
                    [line[1]]).toarray().tolist()[0]
                total[idx] += 1
                x.append(currentbow)
                y.append(idx)
    return x, y, total
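The method above turns each labelled sentence into one bag-of-words row over the fixed vocabulary in self.inverseVocab. A standalone sketch of that per-sentence step, with a toy vocabulary standing in for inverseVocab:

from sklearn.feature_extraction.text import CountVectorizer

inverse_vocab = {"price": 0, "battery": 1, "screen": 2}  # illustrative only
vectorizer = CountVectorizer(vocabulary=inverse_vocab)

sentence = "battery life is great, battery charges fast"
bow = vectorizer.fit_transform([sentence]).toarray().tolist()[0]
print(bow)  # [0, 2, 0] -> counts aligned with the fixed vocabulary indices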
Example #4
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    # corpus=["我 来到 北京 清华大学",#第一类文本切词后的结果,词之间以空格隔开
    # "他 来到 了 网易 杭研 大厦", #第二类文本的切词结果
    # "小明 硕士 毕业 与 中国 科学院",#第三类文本的切词结果
    # "我 爱 北京 天安门"] #第四类文本的切词结果
    vectorizer = CountVectorizer()  # 该类会将文本中的词语转换为词频矩阵,矩阵元素a[i][j] 表示j词在i类文本下的词频
    vectorizer.vocabulary = ['交通事故']
    transformer = TfidfTransformer()  # 该类会统计每个词语的tf-idf权值
    # frequency_matrix = vectorizer.fit_transform(corpus)
    frequency_matrix = [[0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 55, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 15529, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 4766, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 151, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 28, 0, 0],
                        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 13],
                        [0, 0, 0, 0, 0, 27, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
    tfidf = transformer.fit_transform(frequency_matrix)  # compute tf-idf weights from the precomputed term-frequency matrix
    # print(vectorizer.get_feature_names())
    # print(frequency_matrix.toarray())
    # word = vectorizer.get_feature_names()  # all terms in the bag-of-words model
    word = ['离退休人员返聘合同纠纷', '请求确认人民调解协议效力', '劳动合同纠纷', '社会保险纠纷', '福利待遇纠纷', '劳动合同纠纷', '社会保险纠纷', '侵害商业秘密纠纷']
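A hypothetical continuation of the script above (not in the original): inspect each document's tf-idf weights alongside its label in word.

    weights = tfidf.toarray()
    for label, row in zip(word, weights):
        # Report the column index and weight of the highest-weighted term.
        top = row.argmax()
        print(label, top, round(float(row[top]), 4))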
Example #5
    train_vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 50000) 
    test_vectorizer = CountVectorizer(analyzer = "word",   \
                                 tokenizer = None,    \
                                 preprocessor = None, \
                                 stop_words = None,   \
                                 max_features = 50000) 
    # fit_transform() does two functions: First, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of strings.
    total_training_Vectorized = train_vectorizer.fit_transform(bag_pos_train + bag_neg_train).toarray()

    #Assign the vocabulary of the test vectorizer to be the same as the vocabulary of the train vectorizer:
    test_vectorizer.vocabulary = train_vectorizer.get_feature_names()

    total_testing_Vectorized = test_vectorizer.fit_transform(bag_pos_test + bag_neg_test).toarray()

    ###############

    #Create vectors with signs (1 = positive, -1=negative) for reviews, which will be combined with the vectorized arrays:
    train_signs = np.empty(shape = (1600, 1))
    train_signs[0:800,0] = 1
    train_signs[800:1600, 0] = -1

    test_signs = np.empty(shape = (400, 1))
    test_signs[0:200,0] = 1
    test_signs[200:400, 0] = -1

    ###############
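Sharing the vocabulary between two vectorizers, as above, can also be done by reusing the fitted training vectorizer directly; its transform() keeps the test columns identical to the training columns. A minimal sketch with made-up review lists standing in for bag_pos_train and friends:

from sklearn.feature_extraction.text import CountVectorizer

train_reviews = ["great film", "terrible film", "great plot"]  # illustrative
test_reviews = ["terrible plot"]

vectorizer = CountVectorizer(analyzer="word", max_features=50000)
X_train = vectorizer.fit_transform(train_reviews).toarray()
# transform() reuses the vocabulary learned by fit_transform(), so the test
# matrix has exactly the same columns as the training matrix.
X_test = vectorizer.transform(test_reviews).toarray()
print(X_train.shape, X_test.shape)  # (3, 4) (1, 4)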
Example #6
def Build_Title_WordVector(filename):
    '''
    QueryoneGramDict=ReadDictFromFile("D:\\Featureusedfortest\\Title_word_vectorgram_1_all.txt")
    QuerytwoGramDict=ReadDictFromFile("D:\\Featureusedfortest\\Title_word_vectorgram_2_all.txt")
    QuerytriGramDict=ReadDictFromFile("D:\\Featureusedfortest\\Title_word_vectorgram_3_all.txt")
    '''

    QueryoneGramDict = json.load(
        open("D:\\Featureusedfortest\\Title_word_vectorgram_1_all.txt"))
    QuerytwoGramDict = json.load(
        open("D:\\Featureusedfortest\\Title_word_vectorgram_2_all.txt"))
    QuerytriGramDict = json.load(
        open("D:\\Featureusedfortest\\Title_word_vectorgram_3_all.txt"))

    Query = ReadDictFromFile(filename)

    inputlen = len(Query)
    Gramone = []
    Gramtwo = []
    Gramtri = []
    for item in Query:
        #ngram
        item = item.strip('\n')
        tokens = item.split(' ')

        paddbi = []
        paddbi.extend('p')
        paddbi.extend(tokens)
        paddbi.extend('p')
        paddtr = []
        paddtr.extend(['p', 'p'])
        paddtr.extend(tokens)
        paddtr.extend(['p', 'p'])
        bi_tokens = nltk.bigrams(paddbi)
        #tri_tokens = nltk.trigrams(paddtr)
        bi_tokens = [(token[0] + token[1]) for token in bi_tokens]
        #tri_tokens =[(token[0]+token[1]+token[2]) for token in tri_tokens]
        onetmp = ""
        for token in tokens:
            onetmp += token
            onetmp += ' '
        Gramone.append(str(onetmp[0:-1]))

        twotmp = ""
        for token in bi_tokens:
            twotmp += token
            twotmp += ' '
        Gramtwo.append(str(twotmp[0:-1]))
        '''
           tritmp=""
           for token in tri_tokens:
                tritmp+= token
                tritmp+=' '
           Gramtri.append(str(tritmp[0:-1]))
           '''

    if inputlen != len(Gramone) or inputlen != len(Gramtwo):
        print("train size don't match !!\n")

    query_vectorizerone = CountVectorizer(stop_words=stopWords)
    query_vectorizerone.vocabulary = QueryoneGramDict
    onegramVectorizerArray = query_vectorizerone.transform(Gramone).toarray()
    #savetxt("D:\\abstractfeatures\\query_tf_idf_featuret.txt",tf_idf_featuret)

    query_vectorizertwo = CountVectorizer(stop_words=stopWords)
    query_vectorizertwo.vocabulary = QuerytwoGramDict
    twogramVectorizerArray = query_vectorizertwo.transform(Gramtwo).toarray()
    #savetxt("D:\\abstractfeatures\\query_tf_idf_featuret.txt",tf_idf_featuret)
    '''
    query_vectorizertri =CountVectorizer(stop_words = stopWords)
    query_vectorizertri.vocabulary=QuerytriGramDict
    trigramVectorizerArray = query_vectorizertri.transform(Gramtri).toarray()
     #savetxt("D:\\abstractfeatures\\query_tf_idf_featuret.txt",tf_idf_featuret)
     '''

    return onegramVectorizerArray, twogramVectorizerArray  #,trigramVectorizerArray
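The function above concatenates token pairs by hand and vectorizes them against a preloaded vocabulary. For reference, CountVectorizer can emit unigrams and bigrams itself via ngram_range; a minimal sketch with made-up titles:

from sklearn.feature_extraction.text import CountVectorizer

titles = ["cheap flight tickets", "flight delay compensation"]  # illustrative

# ngram_range=(1, 2) produces both unigrams and bigrams as features.
vectorizer = CountVectorizer(ngram_range=(1, 2))
X = vectorizer.fit_transform(titles)
print(X.shape)  # (2, 9): one column per distinct unigram and bigram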
Example #8
        cleaned_docs.append(
                ' '.join([lemmatizer.lemmatize(word.lower())
                    for word in doc.split()
                    if letters_only(word)
                    and word not in all_names]))
    return cleaned_docs

cleaned_emails = clean_text(emails)

cv = CountVectorizer(stop_words="english", max_features=500)

term_docs = cv.fit_transform(cleaned_emails)
print(term_docs[0])
feature_names = cv.get_feature_names()
print(feature_names[100])
feature_mapping = cv.vocabulary_  # vocabulary_ is the fitted term-to-index dict

def get_label_index(labels):
    from collections import defaultdict
    label_index = defaultdict(list)
    for index, label in enumerate(labels):
        label_index[label].append(index)
    return label_index

label_index = get_label_index(labels)

def get_prior(label_index):
    """Compute prior based on training samples
    Args:
        label_index (grouped sample indices by class)
    Returns: