Example #1
def wordcloud(datafile):

    #remove stop words, the most common words in a language
    vectorizer=CountVectorizer(stop_words='english')

    for word in vectorizer.get_stop_words():
        STOPWORDS.add(word)
    STOPWORDS.add("said")

    pony_mask = np.array(Image.open("../pinkyB.jpg"))
    wc = WordCloud(background_color="black", max_words=2000, mask=pony_mask, stopwords=STOPWORDS)

    #init dictionary with the five categories
    categoriesSet = set(datafile["Category"])
    categoriesDict = dict.fromkeys(categoriesSet,"")

    #Conditional Selection
    # business = datafile.ix[datafile["Category"]=="Business"]
    # print business["Content"].size

    #fill index with data from cv
    for index, row in datafile.iterrows():
        categoriesDict[row["Category"]] += str(row["Content"])

    for category, text in categoriesDict.items():
        wc.generate(text)
        image = wc.to_image()
        image.save("../wordcloud/wordcloud_" + category + ".jpg")
    return
Example #2
def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words="english")
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)
    cv.set_params(stop_words="_bad_str_stop_")
    assert_raises(ValueError, cv.get_stop_words)
    cv.set_params(stop_words="_bad_unicode_stop_")
    assert_raises(ValueError, cv.get_stop_words)
    stoplist = ["some", "other", "words"]
    cv.set_params(stop_words=stoplist)
    assert_equal(cv.get_stop_words(), stoplist)
Example #3
def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words='english')
    assert_equal(cv.get_stop_words(), ENGLISH_STOP_WORDS)
    cv.set_params(stop_words='_bad_str_stop_')
    assert_raises(ValueError, cv.get_stop_words)
    cv.set_params(stop_words='_bad_unicode_stop_')
    assert_raises(ValueError, cv.get_stop_words)
    stoplist = ['some', 'other', 'words']
    cv.set_params(stop_words=stoplist)
    assert_equal(cv.get_stop_words(), set(stoplist))
Example #5
File: LDA.py Project: edurra/TFM
def initialize(texts_train, texts_test):
    v = CountVectorizer(stop_words="english")
    s = v.get_stop_words()
    stop = {w: 1 for w in s}

    print("Preprocessing training file")
    ps = PorterStemmer()
    texts_train_p = [preprocess(t, stop, ps) for t in texts_train]

    print("Preprocessing test file")
    texts_test_p = [preprocess(t, stop, ps) for t in texts_test]

    print("Generating dictionary")
    dictionary = gensim.corpora.Dictionary(texts_train_p)
    dictionary.filter_extremes(no_below=15, no_above=0.5)

    bow_corpus = [dictionary.doc2bow(doc) for doc in texts_train_p]
    test_corpus = [dictionary.doc2bow(doc) for doc in texts_test_p]

    return dictionary, bow_corpus, test_corpus
Example #6
def count_words(text):
    vectorizer = CountVectorizer(token_pattern=r'\w+')
    corpus = [text]
    X = vectorizer.fit_transform(corpus)
    keys = vectorizer.get_feature_names()
    stops = vectorizer.get_stop_words()
    countList = X.toarray()[0]
    final_dict = {str(keys[i]): countList[i] for i in range(0, len(keys))}
    return final_dict
Example #8
    def Common_Vectorizer_usage():
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(min_df=1)
        corpus = [
            'This is the first document.',
            'This is the second second document.',
            'And the third one.',
            'Is this the first document?',
        ]

        analyze = vectorizer.build_analyzer()
        print(analyze("This is a text document to analyze."))
        print(analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze'])

        X = vectorizer.fit_transform(corpus)
        print(vectorizer.get_feature_names())
        print(vectorizer.vocabulary_)    #.get('document')
        print(vectorizer.transform(['Something completely new.']).toarray())
        print(list(X))
        
        #bigram========================================================
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
        analyze = bigram_vectorizer.build_analyzer()
        print(analyze('Bi-grams are cool!'))
        X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
        print(X_2)

        feature_index = bigram_vectorizer.vocabulary_.get('is this')
        print(X_2[:, feature_index])

        #marui test
        print('\n\nmarui test=====================')
        def t_preprocessor(s):
            return ','.join([x.lower() for x in s.split(' ')])

        stop_words1 = ['is', 'a', 'this']            #is ok: frozenset(['a', 'this', 'is'])
        stop_words2 = {'is': 0, 'a': 1, 'this': 2}   #is ok: converted to frozenset(['a', 'this', 'is'])

        cv = CountVectorizer(preprocessor=t_preprocessor, stop_words=stop_words2)
        params = cv.get_params()
        print('get_params()', type(params), '---------------')
        for k in params:
            print(k, '\t', params[k])
        print('get_params end--------------')
        print('\nget_stop_words=', cv.get_stop_words())

        cv.fit(corpus)
        print(cv.get_feature_names())
        print(cv.transform(corpus).toarray())
        print('\ntest preprocessor, result:\t', cv.build_preprocessor()('this is a document'))
        print('\ntest tokenizer, result', cv.build_tokenizer()('this is a document'))
        print('\ntest tokenizer 2, result', cv.build_tokenizer()('th-is is a document'))
        print('\ntest tokenizer 2, result', cv.build_tokenizer()('th_is is a document'))
        print('\ntest tokenizer 2, result', cv.build_tokenizer()('th&is is a document'))

        """
def test_count_vectorizer_stopwords():
    cv = CountVectorizer(stop_words="english")
    assert "all" in cv.get_stop_words()
    matrix = cv.fit_transform(DOCUMENTS)
    expected_features = ['ate', 'got', 'hens', 'kings', 'men', 'sleep', 'tired', 'went', 'zzz']
    assert cv.get_feature_names() == expected_features
    expected_vals = np.array([
        [0, 0, 0, 1, 1, 0, 0, 0, 0], # "all the kings men"
        [1, 0, 1, 1, 0, 0, 0, 0, 0], # "ate all the kings hens"
        [0, 1, 0, 0, 0, 1, 1, 1, 1] # "until they all got tired and went to sleep zzz"
    ])
    assert np.array_equal(matrix.toarray(), expected_vals)
Example #10
def feature(corpus):
    """
    sklearn's TF-IDF pipeline mainly uses two classes: CountVectorizer() and TfidfTransformer().
    CountVectorizer's fit_transform converts the words in the texts into a term-frequency matrix;
    element weight[i][j] is the frequency of word j in text i, i.e. how many times each word occurs.
    get_feature_names() lists all the keywords, and toarray() shows the term-frequency matrix.
    TfidfTransformer also has a fit_transform method, which computes the tf-idf values.
    """
    vectorizer = CountVectorizer()  # converts the words into a term-frequency matrix; element a[i][j] is the frequency of word j in text class i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tf_idf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    print('%%%%%%%%%')
    print(tf_idf)
    print('%%%%%%%%%')
    word = vectorizer.get_feature_names()  # get all the words in the bag-of-words model
    print('&&&&&&&&&')
    print(word)
    print('&&&&&&&&&')
    print('$$$$$$$$$')
    weight = tf_idf.toarray()  # extract the tf-idf matrix; element w[i][j] is the tf-idf weight of word j in text class i
    print(weight)
    print('$$$$$$$$$')

    train_x, test_x = train_test_split(tf_idf, test_size=0.2)
    # scores = []
    # for i in range(2, 21):
    #     km = KMeans(n_clusters=i)
    #     km.fit(train_x)
    #     label = km.labels_
    #     print(label)
    #     print(km.inertia_)  # used to judge whether the number of clusters is appropriate: the smaller the distance, the better the clustering; pick the elbow point
    #     scores.append({-km.score(test_x): i})
    # decide on the number of clusters
    # return 19

    km = KMeans(n_clusters=19)
    km.fit(train_x)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()
    print(vectorizer.get_stop_words())
    for i in range(19):
        print("Cluster %d:" % i, end='')
        for ind in order_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
Example #11
def test_countvectorizer_stop_words():
    cv = CountVectorizer()
    cv.set_params(stop_words='english')
    assert cv.get_stop_words() == ENGLISH_STOP_WORDS
    cv.set_params(stop_words='_bad_str_stop_')
    with pytest.raises(ValueError):
        cv.get_stop_words()
    cv.set_params(stop_words='_bad_unicode_stop_')
    with pytest.raises(ValueError):
        cv.get_stop_words()
    stoplist = ['some', 'other', 'words']
    cv.set_params(stop_words=stoplist)
    assert cv.get_stop_words() == set(stoplist)
Example #12
def main():
    # load stop words
    vectorizer = CountVectorizer(stop_words='english')
    stop_word_set = vectorizer.get_stop_words()
    stop_word_set = stop_word_set.union(set(stopwords.words('english')))

    D_raw = json.load(open(input_file, "r"))
    # D: [{'entity':entity_string,'abstract':[word]}]
    D = []
    for i, j in enumerate(D_raw):
        if D_raw[i]['abstract']:
            D_raw[i]['abstract'] = re.sub(
                "[^a-z]", " ", D_raw[i]['abstract'].strip().lower()).split()
            if D_raw[i]['abstract']:
                D.append(D_raw[i])
    with open(documents_file, "wb") as f:
        pickle.dump(D, f)

    # make each document a counter
    doc_counter_list = []
    for i in range(len(D)):
        c = Counter()
        for w in D[i]['abstract']:
            if not w in stop_word_set:
                c[w] += 1
        doc_counter_list.append(c)
    with open(doc_counter_list_file, "wb") as f:
        pickle.dump(doc_counter_list, f)

    # for each word, record which documents it appears
    # {'word': [document]}
    word_appear_doc_dict = dict()
    for d in range(len(D)):
        for word in D[d]['abstract']:
            if not word in stop_word_set:
                if word in word_appear_doc_dict:
                    if not d in word_appear_doc_dict[word]:
                        word_appear_doc_dict[word].append(d)
                else:
                    word_appear_doc_dict[word] = [d]

    with open(word_appear_doc_dict_file, "wb") as f:
        pickle.dump(word_appear_doc_dict, f)
Example #13
def BoF(data,
        custom_tokenizer=None,
        tfidf=False,
        tokenize=True,
        strip_accents=None,
        tokenizer=None,
        stop_words=None,
        token_pattern=r'(?u)\b\w\w+\b',
        ngram_range=(1, 1),
        max_df=1.0,
        min_df=1,
        max_features=None):
    if tokenize:
        if custom_tokenizer:
            bag = CountVectorizer(tokenizer=custom_tokenizer,
                                  min_df=min_df,
                                  max_df=max_df,
                                  max_features=max_features,
                                  ngram_range=ngram_range,
                                  stop_words=stop_words).fit(data)
        elif tfidf:
            bag = TfidfVectorizer(min_df=min_df,
                                  max_df=max_df,
                                  max_features=max_features,
                                  ngram_range=ngram_range,
                                  stop_words=stop_words).fit(data)
        else:
            bag = CountVectorizer(min_df=min_df,
                                  max_df=max_df,
                                  max_features=max_features,
                                  ngram_range=ngram_range,
                                  stop_words=stop_words).fit(data)
        X = bag.transform(data)
        print(
            "Vocabulary size: {}, training set size: {} samples * {} features".
            format(len(bag.get_feature_names()), X.shape[0], X.shape[1]))
        print('# of tokens automatically excluded from the vocabulary:',
              len(bag.stop_words_))
        stopwords_eff = bag.get_stop_words()
        if stopwords_eff:
            print('# of stopwords that were effectively excluded :',
                  len(stopwords_eff))
        return X, bag
Example #14
    def tokenize_sentence(self,
                          sentence,
                          pre_process_sentence=None) -> np.ndarray:
        """
        Creates an array that contains all the words that appear on the sentence, after being processed by the
        fit_transform
        The count vectorizer.fit_transform creates a document by term matrix with one document = the sentence
        """
        if pre_process_sentence is None:
            pre_process_method = self.pre_process_corpus
        else:
            pre_process_method = pre_process_sentence

        vectorizer = CountVectorizer(
            token_pattern=self.token_pattern['token_pattern'],
            preprocessor=pre_process_method,
            stop_words='english' if self.args['stopwords'] else None,
        )

        tokenized = list()
        # If there is at least one word that is not a stop word process down below
        if not self.is_all_stop_words(
                sentence, vectorizer.get_stop_words()) and len(sentence) != 0:
            bow = vectorizer.fit_transform([sentence])
            # print(bow)
            words = vectorizer.get_feature_names()
            # I need this print for testing purposes
            # print(pd.DataFrame(bow.toarray(), columns=words))

            i = 0
            for count in bow.data:
                # print(count)
                for ind in range(count.item()):
                    # print(words[i])
                    tokenized.append(words[i])
                i = i + 1

        if len(tokenized) == 0:
            print('tokenized is empty')
            tokenized.append('')

        return np.asarray(tokenized)
Example #15
 def get_label(self, corpus, n_cluster=5):
     """
     Plotting with matplotlib shows that the best number of clusters is 5
     :param corpus
     :param n_cluster:
     :return:
     """
     vectorizer = CountVectorizer()
     transformer = TfidfTransformer()
     tf_idf = transformer.fit_transform(vectorizer.fit_transform(corpus))
     train_x, test_x = train_test_split(tf_idf, test_size=0.2)
     km = KMeans(n_clusters=n_cluster)
     km.fit(train_x)
     order_centroids = km.cluster_centers_.argsort()[:, ::-1]
     terms = vectorizer.get_feature_names()
     print(vectorizer.get_stop_words())
     for i in range(n_cluster):
         print("Cluster %d:" % i, end='')
         for ind in order_centroids[i, :10]:
             print(' %s' % terms[ind], end='')
         print('\n')
>>> X.toarray()[0]
array([1, 1, 1, 1, 1, 0, 1], dtype=int64)
>>> X.toarray()[1,2]
1
>>> from sklearn.datasets import fetch_20newsgroups
>>> categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']
>>> twenty_train = fetch_20newsgroups(subset='train', categories=categories, shuffle=True, random_state=42)
>>> from sklearn.feature_extraction.text import CountVectorizer
>>> vectorizer = CountVectorizer()
>>> train_counts = vectorizer.fit_transform(twenty_train.data)
>>> vectorizer.vocabulary_.get('algorithm')
4690
>>> len(vectorizer.get_feature_names())
35788
>>> vectorizer = CountVectorizer(stop_words='english')
>>> sorted(vectorizer.get_stop_words())[:20]
['a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst']
>>> import nltk
>>> s = nltk.stem.SnowballStemmer('english')
>>> s.stem("cats")
'cat'
>>> nltk.download()
showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml
True
>>> from nltk.tokenize import word_tokenize
>>> text = word_tokenize("And now for something completely different")
>>> text
['And', 'now', 'for', 'something', 'completely', 'different']
>>> nltk.pos_tag(text)
[('And', 'CC'), ('now', 'RB'), ('for', 'IN'), ('something', 'NN'), ('completely', 'RB'), ('different', 'JJ')]
>>> from sklearn.feature_extraction.text import CountVectorizer
Example #17
    def __call__(self, articles):
        # strip e-mail addresses, then lemmatize every token
        articles = re.sub(r'\S*@\S*\s?', '', articles)
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]


categories = [
    'alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'
]
twenty_train = fetch_20newsgroups(subset='train',
                                  categories=categories,
                                  shuffle=True,
                                  random_state=42)
twenty_test = fetch_20newsgroups(subset='test',
                                 categories=categories,
                                 shuffle=True,
                                 random_state=42)

count_vect_1 = CountVectorizer()  # vocabulary 1
count_vect_2 = CountVectorizer(tokenizer=LemmaTokenizer())  # keeps words of 3 or more characters  # vocabulary 2

count_vect_1.fit_transform(twenty_train.data)
count_vect_2.fit_transform(twenty_train.data)
print(len(set(count_vect_1.get_feature_names())))
print(count_vect_1.get_stop_words())
print(len(count_vect_2.get_feature_names()))
capabilities.
=== Post 2 with dist=0.92: Most imaging databases safe images
permanently.
=== Post 3 with dist=0.77: Imaging databases store data.
=== Post 4 with dist=0.77: Imaging databases store data. Imaging
databases store data. Imaging databases store data.
Best post is 3 with dist=0.77
'''


### Removing less important words
# called stop words, appear everywhere, carry little information
vectorizer = CountVectorizer(min_df=1, stop_words='english')

# usual stop words in english
sorted(vectorizer.get_stop_words())[0:20]
'''
['a', 'about', 'above', 'across', 'after', 'afterwards', 'again',
'against', 'all', 'almost', 'alone', 'along', 'already', 'also',
'''

# 18 words now
len(vectorizer.get_feature_names())

best_post(dist_norm)
'''
=== Post 0 with dist=1.41: This is a toy post about machine learning.
Actually, it contains not much interesting stuff.
=== Post 1 with dist=0.86: Imaging databases provide storage
capabilities.
=== Post 2 with dist=0.86: Most imaging databases safe images
    d = dist_norm(post_vec, new_post_vec)

    print(i, d, post)

    if d<best_dist:
        best_dist = d
        best_i = i


print()
print(best_i)
print(best_dist)
print(posts[best_i])

vect_stop = CountVectorizer(min_df=1, stop_words='english')
print(vect_stop.get_stop_words())

eng_stemmer = nltk.stem.SnowballStemmer('english')

class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (eng_stemmer.stem(w) for w in analyzer(doc))

vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
x_train = vectorizer.fit_transform(posts)

print(vectorizer.get_feature_names())
print(x_train.toarray())

class StemmedTfidVectorizer(TfidfVectorizer):
Example #20
for i in range(0, num_samples):
   post = z[i]
   if(post==new_post):
      continue
   post_vec = X_train.getrow(i)
   d = dist_norm(post_vec, new_post_vec)
   print " Post %i with dist= %.2f: %s" % (i, d, post)
   if d < best_dist:
      best_dist = d
      best_i = i
      
print "Best post is %i with dist= %.2f" % (best_i, best_dist)    

vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')

print sorted(vectorizer.get_stop_words())[:20]

# With stemmer
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
   post = z[i]
   if(post==new_post):
      continue
   post_vec = X_train.getrow(i)
   d = dist_norm(post_vec, new_post_vec)
   print " Post %i with dist= %.2f: %s" % (i, d, post)
   if d < best_dist:
      best_dist = d
      best_i = i
####### LDA ########  
    
# Use tf features for LDA.
print("Extracting tf features for LDA...")


#WITH FUNKY STOP WORDS
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=1, max_features=n_features, stop_words = text.ENGLISH_STOP_WORDS.union(['cooper', 'students', 'faculty', 'free', 'value', 'critical', 'thinking', 'education', 'work', 'school', 'skills', 'experience', 'union', 'learning', 'think', 'tuition', 'professors', 'time', 'student', 'learn', 'small', 'community', 'ability', 'learned', 'problem', 'solving', 'life', 'art', 'ideas', 'body', 'institution', 'quality', 'engineering', 'environment', 'career', 'peers', 'strong', 'different', 'debt', 'creative', 'rigor', 'rigorous', 'diverse', 'working', 'classes', 'people', 'exposure', 'focus', 'good', 'helped', 'great', 'class', 'did', 'like', 'world', 'new', 'technical', 'prepared', 'scholarship', 'hard', 'years', 'taught', 'way', 'unique', 'critically', 'freedom', 'program', 'allowed', 'challenging', 'lot', 'able', 'having', 'academic', 'professional', 'valued', 'classmates', 'ethic', 'real', 'field', 'high', 'study', 'architecture', 'undergraduate', 'opportunity', 'valuable', 'problems', 'nyc', 'research', 'design', 'really', 'diversity', 'commitment', 'intelligent', 'intellectual' 'graduate', 'dedication', 'access', 'passionate', 'culture', 'appreciate', 'amazing', 'better', 'experiences', 'understanding', 'opportunities', 'artists', 'foundation', 'major', 'degree', 'course', 'difficult', 'smart', 'institutions', 'graduate', 'intellectual', 'merit', 'city', 'development', 'lab', 'schools', 'pursue', 'teaching', 'job', 'succeed', 'arts', 'values', 'explore', 'communication', 'attended', 'college', 'knowledge', 'practical', 'colleagues', 'teamwork', 'group', 'future', 'resources', 'information', 'provide', 'engaged', 'approach', 'fundamentals', 'practice', 'dr', 'curriculum', 'educational', 'studies', 'artist', 'emphasis', 'tough', 'reputation', 'teachers', 'disciplines', 'engaging', 'talent', 'challenges', 'material', 'dedicated', 'excellent', 'support', 'unparalleled', 'challenged', 'truly', 'important', 'independent', 'best', 'interaction', 'didn', 've', 'talented', 'professor', 'leadership', 'teach', 'courses', 'projects', 'extremely', 'focused', 'helpful', 'independence', 'analytical', 'engagement', 'general', 'challenge', 'presentations', 'humanities', 'cu', 'perspective', 'computer', 'interdisciplinary', 'grad', 'especially', 'generally', 'humanities', 'incredible', 'brilliant', 'don', 'presentation', 'village', 'particularly', 'engineers', 'highly', 'importance', 'staff', 'civic', 'skill', 'demanding', 'artistic', 'atmosphere', 'graduating', 'fostered', 'fact', 'artistic', 'cost', 'writing', 'connections', 'critique', 'studio', 'discourse', 'instilled', 'thinker', 'curiosity', 'graduated', 'long', 'paid', 'coursework', 'background', 'provided', 'received', 'committed', 'higher', 'engineer', 'mentors', 'teacher', 'creativity', 'grateful', 'mission', 'breadth', 'status', 'collaboration', 'paying', 'shop', 'excellence', 'appreciation', 'programs', 'wealth', 'graduation', 'facilities', 'studios', 'undergrad', 'techniques', 'interviews', 'creatively', 'competitive', 'project', 'resume', 'invaluable']))

#NORMAL BORING STOP WORDS
#tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words = text.ENGLISH_STOP_WORDS)


tf = tf_vectorizer.fit_transform(data)
lda_stop_words = tf_vectorizer.get_stop_words()

print("Fitting LDA models with tf features, "
      "n_samples=", n_samples, " and n_features=", n_features)
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
lda.fit(tf)

#outputs
print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
lda_stop_words = tf_vectorizer.get_stop_words()
print_top_words(lda, tf_feature_names, n_top_words)
#print("\nStop words:")
Example #22
"""if all(a[i] in "0123456789" for i in range(len(a))):
    print("The string is an integer.")"""
y="0123456789"
for i in xt:
    for j in range(len(i)):
        if(i[j] not in y):
            s.append(i)
            break
        d = int(i)
        k = num2words(d)
        s.append(k)
        break
print(s)


#4  expanding abbrevation
te='USA and GB are ...'
abbrevs={'USA':'United States','GB':'Great Britain'}
for ab in abbrevs:
    te= te.replace(ab,abbrevs[ab])
print(te)

#remove stop words
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1, stop_words='english')
stop=list(sorted(vectorizer.get_stop_words()))
a="Artificial intelligence is the intelligence exhibited by machine"
pu = " ".join(set([ch for ch in a.split(" ") if ch not in stop]))
print(pu)

#remove punctuation
# split my data into train and test sets
from sklearn.model_selection import train_test_split
train, test = train_test_split(lyrics, test_size=0.2, random_state=42)
print(train.shape); print(test.shape)


# In[79]:

#==============================================================================
# Process description fields of train set
#==============================================================================

# tokenize the text using countvectoriser
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(lowercase=True, stop_words='english', strip_accents='unicode')
print(count_vect.get_stop_words())

# if wanting to use n-grams
count_vect = CountVectorizer(analyzer='word', ngram_range=(1,2), lowercase=True, stop_words='english', strip_accents='unicode')


# In[80]:

train


# In[81]:

# fit the count vectoriser
X_train_counts = count_vect.fit_transform(train.lyrics)
X_train_counts.shape
Example #24
new_post_vec = vectorizer.transform([new_post])

best_dist = sys.maxsize
best_i = None
#below is just a simple loop that picks the best (closest) post
for i in range(0, x_train.shape[0]):
    curr_dist = dist_norm(new_post_vec, x_train.getrow(i))
    print('post_num=%d,post_content=%s,dist=%.2f' % (i, posts[i], curr_dist))
    if curr_dist < best_dist:
        best_dist = curr_dist
        best_i = i
print('best_post_num=%d,post_content=%s,best_dist=%.2f' %
      (best_i, posts[best_i], best_dist))

#PART2
#stop_words: drop the high-frequency words that appear in almost every post; they do little to distinguish posts and should not get the same weight as specific nouns, so just add the option above
#this prints the common English stop words
print(sorted(vectorizer.get_stop_words())[0:20])

#PART3
# related word forms: image and images (and information) get exactly the same treatment as completely separate words above, which is clearly unreasonable
#nltk (Natural Language Toolkit): the package commonly used for natural language processing in Python
#stemming: the stem is the word that the other forms are derived from
s = nltk.stem.SnowballStemmer('english')  #use the Snowball stemming rules (this choice decides how stems are extracted)
print(s.stem('tools'))  #result: tool

#PART4
#how to tune the weights: different words should get different weights, but how do we set them?
#the usual assumption: the more often a word appears in this document and the fewer documents it appears in overall, the more representative it is
#this is the TF-IDF formula; see the notes for the exact expression
from sklearn.feature_extraction.text import TfidfVectorizer  #this inherits from CountVectorizer, so you can use it to compute TF-IDF directly
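
A minimal sketch of the TF-IDF step those comments describe (illustrative only: the tiny posts list below stands in for the real posts/new_post variables used earlier in this snippet):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

posts = ["imaging databases store data", "most imaging databases save images permanently"]  # stand-in data
new_post = "imaging databases"

tfidf_vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
x_train_tfidf = tfidf_vectorizer.fit_transform(posts)       # rows: posts, columns: terms, values: tf-idf weights
new_post_tfidf = tfidf_vectorizer.transform([new_post])

# terms that are frequent in one post but rare across posts now dominate the similarity
print(cosine_similarity(new_post_tfidf, x_train_tfidf))
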
print(hotel.shape)
print(X_train.shape)
print(y_train.shape)

count = CountVectorizer(stop_words='english',
                        tokenizer=None,
                        ngram_range=(1, 2),
                        min_df=1,
                        max_df=0.9)
temp = count.fit_transform(
    X_train['Comment'].values.astype('str'))  # word count for recurrent words

print(count.get_feature_names())

print("Stop Words:")
print(count.get_stop_words())
print(temp.shape)
#print("temp: " + temp)

tdif = TfidfTransformer(norm='l1')
temp2 = tdif.fit_transform(temp)  # Give words different Weights

#print(temp2)

# nb = GaussianNB()
mn = MultinomialNB()

# Must convert to dense matrix for GaussianNB
# X = temp2.todense()
# nb.fit(X, hotel['Rating'])
Example #26
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

vectorizer = CountVectorizer(
    min_df=1,
    stop_words='english')  #stop_words are the stop words: once set, common words like "most", "a", "about" are no longer counted
sorted(vectorizer.get_stop_words())[:50]  # roughly how many are there? about 318
len(vectorizer.get_stop_words())

# merging words with the same meaning; this needs an extra package... apparently not, actually!
# the proper name for this is stemming

from nltk import stem

english_stemmer = stem.SnowballStemmer('english')  # there are several stemmers; for English use Snowball
english_stemmer.stem('imaging')
english_stemmer.stem('image')
english_stemmer.stem('imagine')
english_stemmer.stem('buys')
english_stemmer.stem('buying')
english_stemmer.stem('bought')
Example #27
# - **Why:** They don't tell you much about your text

# show vectorizer options
vect

# - **stop_words:** string {'english'}, list, or None (default)
# - If 'english', a built-in stop word list for English is used.
# - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
# - If None, no stop words will be used. max_df can be set to a value in the range [0.7, 1.0) to automatically detect and filter stop words based on intra corpus document frequency of terms.

# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

# set of stop words
print(vect.get_stop_words())

# ## Part 5: Other CountVectorizer Options
# - **max_features:** int or None, default=None
# - If not None, build a vocabulary that only consider the top max_features ordered by term frequency across the corpus.

# remove English stop words and only keep 100 features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)

# all 100 features
print(vect.get_feature_names())

# include 1-grams and 2-grams, and limit the number of features
vect = CountVectorizer(ngram_range=(1, 2), max_features=100000)
tokenize_test(vect)
from sklearn.feature_extraction.text import CountVectorizer
from termcolor import colored
import os

# -------start part different from i3covert_rawtext_to_bagOfwords.py ------
vectorizer = CountVectorizer(min_df=1, stop_words='english')

print(colored(sorted(vectorizer.get_stop_words())[0:20], 'blue'))

# ------- end part different from i3covert_rawtext_to_bagOfwords.py ------

print('dir(vectorizer)=', dir(vectorizer), '\nvectorizer=', vectorizer, '\n')

content = ["How to format my hard disk", "Hard disk format problems "]

X = vectorizer.fit_transform(content)
print('vectorizer.get_feature_names()=', vectorizer.get_feature_names())
print(X.toarray().transpose())

print(colored('*' * 25, 'red'))

from i2utils import DATA_DIR
TOY_DIR = os.path.join(DATA_DIR, "toy")
posts = [open(os.path.join(TOY_DIR, f)).read() for f in os.listdir(TOY_DIR)]
print('posts:', posts)
X_train = vectorizer.fit_transform(posts)

num_samples, num_features = X_train.shape
print("#samples: %d, #featues: %d" % (num_samples, num_features))
print('vectorizer.get_feature_names()=', vectorizer.get_feature_names())
number_of_features_bigram = len(vectorizer_bigram.get_feature_names())
list_of_features_bigram = vectorizer_bigram.get_feature_names()

X = vectorizer.fit_transform(corpus)

#corpus_array = X.toarray()
number_of_features = len(vectorizer.get_feature_names())
list_of_features = vectorizer.get_feature_names()
number_of_features = number_of_features_unigram + number_of_features_bigram
list_of_features = list_of_features_unigram + list_of_features_bigram
print "list_of_features:"
print list_of_features

print "list of features:%d" % number_of_features
print "#######vectorizer stop words############"
print vectorizer.get_stop_words()
print "#######vocabulary########"
print vectorizer.vocabulary_
transformer_unigram = TfidfTransformer(norm=None, smooth_idf=True)
transformer_bigram = TfidfTransformer(norm=None, smooth_idf=True)

transformer = TfidfTransformer(norm=None, smooth_idf=True)
tfidf = transformer.fit_transform(X.toarray())

tfidf_array = tfidf.toarray()

tfidf_unigram = TfidfTransformer(norm=None, smooth_idf=True)
tfidf_bigram = TfidfTransformer(norm=None, smooth_idf=True)

tfidf_unigram_array = tfidf_unigram.fit_transform(X_unigram)
tfidf_bigram_array = tfidf_bigram.fit_transform(X_bigram)
Example #31
    train_indices = indices[:train_count]
    validate_indices = indices[train_count:]

    train_samples = samples.iloc[train_indices]
    validate_samples = samples.iloc[validate_indices]

    print "train sample count {}, validate sample count {}".format(
        len(train_samples), len(validate_samples))

# get tf-idf vector
print('fitting train samples')
count_vect = CountVectorizer(stop_words='english', max_df=1.0)
count_vect.fit(train_samples['Text'])
print(count_vect.get_feature_names())
print(count_vect.get_stop_words())
X_train_counts = count_vect.transform(train_samples['Text'])
if TEST:
    X_test_counts = count_vect.transform(test_samples['Text'])
else:
    X_validate_counts = count_vect.transform(validate_samples['Text'])

tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
if TEST:
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)
else:
    X_validate_tfidf = tfidf_transformer.transform(X_validate_counts)

print('feature size {}'.format(X_train_tfidf.shape))
Example #32
# vocabulary (word dictionary)
mat = vectorizer.fit(sentences)
print(type(mat))

print(mat.vocabulary_)  # indices are assigned in alphabetical order

print(sorted(mat.vocabulary_.items()))

# tokens
features = vectorizer.get_feature_names()
print(type(features))
print(features)

print('stop words')
print(vectorizer.get_stop_words())

sentence = [sentences[0]]
print('sentence: ', sentence)

myarray = vectorizer.transform(sentence).toarray()
print(type(myarray))
print('myarray: ', myarray)

"""
sentence: ['우리 아버지 여자 친구 이름은 홍길동 홍길동']
단어 사전: {'여자': 0, '이름은': 1, '홍길동': 2}
myarray: [[1 1 2]] 
-> sentence에 단어 사전의 토큰 중 '여자' 1번, '이름은' 1번, '홍길동' 2번 
포함되어 있다는 것을 ndarray로 리턴 
"""
Example #33
import pandas as pd

pd.DataFrame(X.toarray(), columns=vec.get_feature_names())

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
      'This is the first document.',
      'This document is the second document.',
      'And this is the third one.',
      'Is this the first document?',
  ]

corpus


vec = TfidfVectorizer()

X = vec.fit_transform(sample)
vec.get_feature_names()

X =vec.fit_transform(corpus)
vec.get_feature_names()

pd.DataFrame(X.toarray(),columns = vec.get_feature_names())
vec.get_stop_words()

def savePrint():
    print("Hello fro VIM")

class GeneralCountVectorizer(_GeneralBaseVectorizer):
    def __init__(self,
                 clusters={},
                 vocabulary=None,
                 max_features=None,
                 ngram_range=(1, 1),
                 stop_words=None,
                 token_pattern=r"(?u)\b\w[\w']+\b",
                 analyzer='word',
                 max_df=1.0,
                 min_df=1,
                 dim=100):

        # base constructor
        super(GeneralCountVectorizer, self).__init__(clusters, vocabulary, dim)

        # count vectorizer
        self.vec = CountVectorizer(max_features=max_features,
                                   ngram_range=ngram_range,
                                   stop_words=stop_words,
                                   token_pattern=token_pattern,
                                   analyzer=analyzer,
                                   vocabulary=self.vocabulary,
                                   max_df=max_df,
                                   min_df=min_df)

    def __repr__(self):
        class_name = self.__class__.__name__
        return '%s(%s)' % (
            class_name,
            _pprint(
                self.get_params(deep=False),
                offset=len(class_name),
            ),
        )

    def get_params(self, deep=False):
        out = {}
        """out = {**self.base_params,
               **self.vec.get_params(deep=deep)}"""

        return out

    def get_feature_names(self):
        return self.vec.get_feature_names()

    def get_stop_words(self):
        return self.vec.get_stop_words()

    def fit(self, X, y=None):

        X_mapped = self._map(X) if self._is_clusters else X

        self.vec.fit(X_mapped)

        return self

    def transform(self, X):

        if not get_len(X):
            # X is empty, return an empty np.ndarray
            return np.empty((0, self.dim))

        X_mapped = self._map(X) if self._is_clusters else X
        return self.vec.transform(X_mapped)

    def fit_transform(self, X, y=None):
        self.fit(X, y=y)
        return self.transform(X)

    def inverse_transform(self, X):
        return self.vec.inverse_transform(X)
class Tfidf:
	"""Permet de calculer les mesures de similarités entre des speechs et des paragraphes

	Cela se fait en plusieurs étapes :
		-Tokenisation et lemmatisation de tous les documents (speechs et paragraphes)
		-Détermination du vocabulaire, on compte tous les mots
		-Calcule de plusieurs valeurs : df, idf, tf, tfidf
		-Calcule des mesures cosinus entre les tfidf des speechs et des paragraphes
		-Calcule d'informations supplémentaires sur les mesures calculées
	"""
	
	def __init__(self, paragraphe, speech):
		"""Initialise les données

		Entrée :
			-paragraphe : liste des textes des paragraphes
			-speech : liste des textes des speechs

		On initialise set comme l'ensemble des textes
		"""

		self.paragraphe = paragraphe
		self.speech = speech

		self.set = list(self.paragraphe)  # copy
		self.set.extend(self.speech)


	def count(self, traitement=None):
		"""Définit le vocabulaire, et compte le nombre de mot par document (speech et paragraphe)

		Entrée : 
			-traitement : si traitement == "lemmatize", alors on utilise le tokenizer de stem.py, qui lemmatize en même
				sinon, on utilise le tokenizer par défaut de CountVectorizer (de sklearn), qui ne lemmatize pas

		Résultats :
			-self.tfidf_matrix : matrice creuse contenant pour chaque document, pour chaque mot, le nombre d'apparition du mot dans le document
			-self.vocabulary : dictionnaire contenant le vocabulaire
			-self.stop_words : dictionnaire contenant les stop_words
		"""

		if traitement == "lemmatize":
			self.tokenizer = stem.LemmaTokenizer()
		else:
			self.tokenizer = None

		self.tfidf_vectorizer = CountVectorizer(strip_accents='unicode', stop_words='english', tokenizer=self.tokenizer)
		self.tfidf_matrix = self.tfidf_vectorizer.fit_transform(self.set)

		self.vocabulary = self.tfidf_vectorizer.vocabulary_
		self.stop_words = self.tfidf_vectorizer.get_stop_words()


	def do_tf(self, ponderation):
		"""Calcul du tf

		Le tf (term frequency), pour un mot dans un document, est le nombre d'apparition du mot dans le document. C'est donc le contenu de self.tfidf_matrix
		Cette fonction permet cependant une amélioration : la prise en compte du contexte. Ainsi, pour mesure la similarité entre un speech et un paragraphe, on va regarder un peu les speechs et les paragraphes autour.
		Pour cela, on va calculer un tf augmenté, qui va compté les mots dans un document, mais aussi les mots des documents autour avec une certaine pondération.

		Entrée :
			-ponderation : un tableau de ponderation, qui s'applique de manière symétrique autour du document observé (qui lui est pondéré à 1)	
				exemple : ponderation de la forme [0.8,0.2] == 1*tf[i] + 0.8*tf[i+1] + 0.8*tf[i-1] + 0.2tf[i-2] + 0.2tf[i+2], tf[i] tjr pondéré à 1

		Pour ne pas prendre en compte le contexte : ponderation = []
		"""

		self.tf = sp.lil_matrix(self.tfidf_matrix, dtype=float)
	

		id_set = 0
		for s in [self.paragraphe, self.speech]:
			for j in range(len(s)):
				for i in range(len(self.vocabulary)):
					for k,p in enumerate(ponderation):
						if j - (k+1) >= 0:
							self.tf[id_set,i] += p*float(self.tfidf_matrix[id_set-(k+1),i])
					
						if j + (k+1) < len(s):
							self.tf[id_set,i] += p*float(self.tfidf_matrix[id_set+(k+1),i])

				id_set += 1



	#Computing the tfidf

	def do_df(self):
		"""Calcul du df

		Le df (document frequency), pour un mot, correspond au nombre de documents où le mot apparait.
		On vérifie donc, pour chaque document, si le tf du mot dans ce document est non nul
		"""

		self.df = Counter()

		for _, i in self.vocabulary.items():
			for j in range(len(self.set)):
				if self.tfidf_matrix[j,i] != 0:
					self.df[i] += 1


	def do_idf(self): 
		"""Calcul de l'idf

		L'idf (inverse term frequency) = log(nombre de documents / df) pour un mot
		"""

		self.idf = list(map(lambda x : numpy.log10((len(self.set)) / float(x)), self.df.values())) 


	def do_tfidf(self):
		"""Calcul du tfidf

		Le tfidf, pour un mot et un document = tf*idf
		"""

		self.tfidf = []
		for j in range(len(self.set)):
			self.tfidf.append([])
			for i in range(len(self.vocabulary)):
				self.tfidf[j].append(0.)

		for j in range(len(self.set)):
			for k, i in self.vocabulary.items():
				self.tfidf[j][i] = (self.tf[j,i] * self.idf[i])



	#Alternative ways to compute the values

	def do_idf_variante(self): 
		"""Variante du calcul de l'idf

		Dans cette variante : idf = log( (nombre de document + 1) / (df + 1) ) + 1
		C'est cette variante de l'idf qui est utilisé dans sklearn si on ne spécifie pas de norme (paramètre norm=None)
		"""

		self.idf = list(map(lambda x : numpy.log((len(self.set) + 1.0) / float(x + 1.0)) + 1.0, self.df.values()))


	def do_idf_original(self):
		"""Calcul de l'idf directement avec sklearn

		On calcule ici l'idf directement avec les classe de sklearn. On obtient le même résultat que do_idf_variante.
		Calculer nous même l'idf nous permet de mieux contrôler ce que l'on fait, notamment sur la variante utilisée.
		"""
		tfidf_transformer = TfidfTransformer()
		tfidf_transformer.fit(self.tfidf_matrix)

		self.idf = tfidf_transformer.idf_


	def do_tfidf_original(self):
		"""Calcul du tfidf directement avec sklearn

		Sklearn nous permet de calculer directement les valeurs de tfidf en quelques lignes (incluant la tokenisation, le comptage, et les calculs intermédiaires). Le problème est qu'il ne gère pas le contexte, et qu'on a pas le choix des variantes de calcul.
		"""

		tfidf_v = TfidfVectorizer(strip_accents='unicode', stop_words='english', norm=None)
		self.tfidf = tfidf_v.fit_transform(self.set)




	#Cosine measure

	def mesure(self):
		"""Calcul les mesures de similarités entre les speechs et les paragraphes avec une mesure cosinus

		Résultat : 
			-self.similarite : un dictionnaire de la forme : self.similarite[idSpeech][idParagraphe] = valeur_similarite
		"""

		cosine_liste = cosine_similarity(self.tfidf[len(self.paragraphe):], self.tfidf[:len(self.paragraphe)]) #set1 to set2

		self.similarite = {}

		for i, cosine in enumerate(cosine_liste):
			for j, value in enumerate(cosine):
				if i in self.similarite:
					self.similarite[i][j] = value
				else:
					self.similarite[i] = {j : value}


	#Additional information

	def do_infoMesure(self):
		"""Calcul des informations sur les mesures de similarités

		On calcule la moyenne et l'écart-type des similarités, ainsi que le pourcentage de zéro, par speech
		"""

		self.moyenne = {}
		self.ecartType = {}
		self.percentZero = {}

		for id_speech, speech in self.similarite.items():
			somme = 0.
			nbZero = 0

			for v in speech.values():
				somme += v
				if v == 0:
					nbZero += 1

			self.moyenne[id_speech] = somme / float(len(speech))
			self.percentZero[id_speech] = float((nbZero * 100)) / float(len(speech))

			somme = 0.

			for v in speech.values():
				somme += (v - self.moyenne[id_speech])**2

			self.ecartType[id_speech] = numpy.sqrt(somme) / float(len(speech))


	def do_matchingWords(self):
		"""Détermine les mots en commun entre chaque speech et paragraphe (les matching words)

		On regarde les mots en commun dans chaque paire speech/paragraphe (tfidf non nul dans les deux documents) et on calcule leur similarité comme un produit scalaire. Attention, on utilise pas la même méthode pour la similarité entre deux mots, et la similarité entre deux documents.
		"""

		self.matchingWords = {}

		for j,speech in enumerate(self.tfidf[len(self.paragraphe):]):
			self.matchingWords[j] = {}
			for i,paragraphe in enumerate(self.tfidf[:len(self.paragraphe)]):
				self.matchingWords[j][i] = {}
				for w in range(len(self.vocabulary)):
					value = self.tfidf[len(self.paragraphe) + j][w] * self.tfidf[i][w]
					if value > 0.:
						self.matchingWords[j][i][w] = value

	
	def do_match(self, n=None):
		"""Trie les n meilleurs similarités pour chaque speech

		Si n = None, on garde toutes les similarités, triées, stockées dans self.match
		"""
		self.match = {}
		for i,s1 in enumerate(self.speech):
			if n:
				self.match[i] = sorted(self.similarite[i].items(), key=lambda kv: (kv[1], kv[0]))[-n:]
			else:
				self.match[i] = sorted(self.similarite[i].items(), key=lambda kv: (kv[1], kv[0]))



	#Execution

	def go(self, ponderation, n=None, lemmatizer=None):
		"""Calcul les mesures de similarités en appliquant toutes les opérations nécessaire

		Entrée :
			-ponderation : un tableau des pondération pour le contexte (voir do_tf)
			-n : le nombre de paragraphes avec les meilleurs similarités que l'on veut garder par speech (voir do_match)
			-lemmatizer : le tokenizer/lemmatizer utilisé (voir __init__)
		"""

		self.count(lemmatizer)
		#print self.vocabulary
		print()
		print("tf\n")
		self.do_tf(ponderation)
		print("df\n")
		self.do_df()
		print("idf\n")
		self.do_idf()
		print("tfidf\n")
		self.do_tfidf()
		print("cosine\n")
		self.mesure()
		print("info\n")
		self.do_match(n)
		print()
		#print self.match
		self.do_infoMesure()
		self.do_matchingWords()
Example #36
        pass
    else:
        dict_int[idx] = item
        int_dict[item] = idx
        idx += 1

df['int_labels'] = df['label'].map(int_dict)

print(df.loc[0])

y = df['int_labels'].values
count_vectorizer = CountVectorizer(decode_error='ignore',
                                   stop_words='english',
                                   max_df=0.2)
x = count_vectorizer.fit_transform(df['data'])
print(count_vectorizer.get_stop_words())

tfidf = TfidfVectorizer(decode_error='ignore',
                        stop_words='english',
                        max_df=0.8)
x = tfidf.fit_transform(df['data'])

model = MultinomialNB()
model.fit(x, y)

data_vectorizer = CountVectorizer(
    vocabulary=count_vectorizer.get_feature_names())
print(count_vectorizer.get_feature_names())
xml_x = data_vectorizer.fit_transform(txt_array)

data_tfidf = TfidfVectorizer(vocabulary=tfidf.get_feature_names())
Example #37
N_TOPICS = 16
print('reading data...')
dataset = fetch_20newsgroups(shuffle=False,
                             remove=('headers', 'footers', 'quotes'))
data_samples = dataset.data
train_docs, test_docs = train_test_split(data_samples, random_state=42)

print('preparing Count Vectorizer')
tf_vectorizer = CountVectorizer(max_df=1.0, stop_words='english')

X_train = tf_vectorizer.fit_transform(train_docs)
X_test = tf_vectorizer.transform(test_docs)

feature_names = tf_vectorizer.get_feature_names()

tf_vectorizer.get_stop_words()

print('Splitting test documents...')
X_test_train, X_test_test = rowwise_train_test_split(X_test,
                                                     random_seed=114514)

print('Start fitting sk-learn model...')
start = time()
vb_model = LDA_vb(n_components=N_TOPICS)
vb_model.fit(X_train)

phi_vb = vb_model.components_ / \
    vb_model.components_.sum(axis=1)[:, np.newaxis]
end = time()
print('done in {:.2f} seconds'.format((end - start)))
Example #38
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

print(X_train.getrow(3).toarray())
print(X_train.getrow(4).toarray())

def dist_norm(v1, v2):
    v1_normalized = v1/sp.linalg.norm(v1.toarray())
    v2_normalized = v2/sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())

best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post==new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print "=== Post %i with dist=%.2f: %s"%(i, d, post)
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f"%(best_i, best_dist))

vectorizer = CountVectorizer(min_df=1, stop_words='english')
print(sorted(vectorizer.get_stop_words())[0:10])

# - **Why:** They probably don't tell you much about your text

# show vectorizer parameters
vect

# - **stop_words:** string {'english'}, list, or None (default)
#     - If 'english', a built-in stop word list for English is used.
#     - If a list, that list is assumed to contain stop words, all of which will be removed from the resulting tokens.
#     - If None, no stop words will be used.

# remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)

# examine the stop words
print(sorted(vect.get_stop_words()))
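
The 'english' option is exercised above; the other two accepted values can be sketched like this (illustrative, not part of the original notebook; tokenize_test is the notebook's own helper and is deliberately not called here):

from sklearn.feature_extraction.text import CountVectorizer

# a custom stop word list: exactly these tokens are removed
vect = CountVectorizer(stop_words=['nutella', 'pb'])
print(vect.get_stop_words())   # frozenset of the two custom words

# None (the default): no stop word filtering at all
vect = CountVectorizer(stop_words=None)
print(vect.get_stop_words())   # None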

# - **max_df:** float in range [0.0, 1.0] or int, default=1.0
#     - When building the vocabulary, ignore terms that have a document frequency strictly higher than the given threshold (corpus-specific stop words).
#     - If float, the parameter represents a proportion of documents.
#     - If integer, the parameter represents an absolute count.

# ignore terms that appear in more than 50% of the documents
vect = CountVectorizer(max_df=0.5)
tokenize_test(vect)

# - **stop\_words\_:** Terms that were ignored because they either:
#     - occurred in too many documents (max_df)
#     - occurred in too few documents (min_df)
#     - were cut off by feature selection (max_features)
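
A minimal sketch of that distinction (illustrative data, not from the original notebook): get_stop_words() returns the stop list you configured, while the fitted stop_words_ attribute holds the terms the vectorizer itself discarded via max_df, min_df, or max_features:

from sklearn.feature_extraction.text import CountVectorizer

docs = ["the cat sat", "the cat ran", "the dog ran"]
vect = CountVectorizer(stop_words='english', min_df=2)
vect.fit(docs)

print(vect.get_stop_words())   # the configured stop list: sklearn's built-in English frozenset
print(vect.stop_words_)        # terms dropped during fitting because of min_df: {'sat', 'dog'}
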
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()

vectorizer.fit(corpus)

vectorizer.vocabulary_
vectorizer.stop_words_

vectorizer.get_stop_words()
vectorizer.get_feature_names()

X = vectorizer.transform(corpus)
type(X)
X.toarray()

from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
vectorizer.fit(corpus)