Example #1
def q133():
    def split_into_lemmas(message):
        message = message.lower()
        words = TextBlob(message).words
        # for each word, take its "base form" = lemma
        return [word.lemma for word in words]

    bow_transformer = CountVectorizer(analyzer=split_into_lemmas).fit(data['text'])
    rare_feats = bow_transformer.get_feature_names()
    zacs_bow=CountVectorizer(ngram_range=(1, 2)).fit(data['text'])
    zac_bow_words=zacs_bow.get_feature_names()
    intersection = set(rare_feats).intersection(zac_bow_words)
    only_in_zac=list(set(zac_bow_words)-set(intersection))
    only_in_rare=list(set(rare_feats)-set(intersection))
    len_o_z = len(only_in_zac)
    len_o_r = len(only_in_rare)
    sum_o = len_o_z + len_o_r
    per_z = len_o_z/sum_o
    per_r = len_o_r/sum_o
    print("There are %d features in common, between both groups." %len(intersection))
    print("There are %d unique features in Zac's model." % len_o_z)
    print("There are %d unique features in Zac's model." %len_o_r)
    print("Together, both model's have %d features. " %sum_o )
    print("Clearly, Zac's model holds %f of all features. " % per_z)
    print("Clearly, Rare's model holds %f of all features. " % per_r)
    print("Zac's model is much larger, on account of using bigrams and not stemming. ")
Example #2
def find_significant_terms(corpus):
    """
    find words that are more common in one document than in the whole corpus

    # {(word, count)} per list -> and per corpus
    # {(word, freq)} per list = count.list / count.corpus

    :param corpus: [clinton_text, trump_text]
    :return:
    """

    vectorizer = CountVectorizer(min_df=1)

    list_counts = np.array(vectorizer.fit_transform(corpus).toarray())
    # print list_counts
    corpus_counts = np.sum(list_counts, axis=0)  # per-term counts over the whole corpus
    # print corpus_counts
    list_freq = [1.0 * x / corpus_counts for x in list_counts]
    # print map(lambda x: x.tolist(), list_freq)

    sorted_by_freq = [list(reversed(sorted(zip(x.tolist(), vectorizer.get_feature_names())))) for x in list_freq]
    sorted_by_count = [list(reversed(sorted(zip(x.tolist(), vectorizer.get_feature_names())))) for x in list_counts]

    return sorted_by_freq, sorted_by_count

# print find_significant_terms(['ala ma kota', 'ala ma psa'])
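# Building on the commented-out call above, a possible usage sketch (assuming the function
# above is defined and numpy/CountVectorizer are imported as in the snippet):
corpus = ['ala ma kota', 'ala ma psa']
sorted_by_freq, sorted_by_count = find_significant_terms(corpus)
print(sorted_by_freq[0][:3])   # top terms of the first document by corpus-relative frequency
print(sorted_by_count[1][:3])  # top raw counts for the second document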
Example #3
def TFIDF():
    global segcont
    global weight
    global we
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
    word = vectorizer.get_feature_names()  # the vocabulary of all the texts
    weight = tfidf.toarray()  # the corresponding tf-idf matrix
    del segcont

    seg = []
    for i in range(len(weight)):
        enstr = ""
        for j in range(len(word)):
            if weight[i][j] >= 0.1:  # keep only terms whose tf-idf weight is at least 0.1
                enstr = enstr + " " + word[j]
        seg.append(enstr)

    del weight
    vec = CountVectorizer()
    tra = TfidfTransformer()
    tidf = tra.fit_transform(vec.fit_transform(seg))
    wo = vec.get_feature_names()
    we = tidf.toarray()
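# The CountVectorizer + TfidfTransformer pipeline used above is equivalent to a single
# TfidfVectorizer; a minimal sketch with made-up documents:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

docs = ["one document here", "another document here"]
tfidf_two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(docs))
tfidf_one_step = TfidfVectorizer().fit_transform(docs)
print(np.allclose(tfidf_two_step.toarray(), tfidf_one_step.toarray()))   # True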
Example #4
def test_feature_names():
    cv = CountVectorizer(max_df=0.5)

    # test for Value error on unfitted/empty vocabulary
    assert_raises(ValueError, cv.get_feature_names)
    assert_false(cv.fixed_vocabulary_)

    # test for vocabulary learned from data
    X = cv.fit_transform(ALL_FOOD_DOCS)
    n_samples, n_features = X.shape
    assert_equal(len(cv.vocabulary_), n_features)

    feature_names = cv.get_feature_names()
    assert_equal(len(feature_names), n_features)
    assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza',
                        'salad', 'sparkling', 'tomato', 'water'],
                       feature_names)

    for idx, name in enumerate(feature_names):
        assert_equal(idx, cv.vocabulary_.get(name))

    # test for custom vocabulary
    vocab = ['beer', 'burger', 'celeri', 'coke', 'pizza',
             'salad', 'sparkling', 'tomato', 'water']

    cv = CountVectorizer(vocabulary=vocab)
    feature_names = cv.get_feature_names()
    assert_array_equal(['beer', 'burger', 'celeri', 'coke', 'pizza', 'salad',
                        'sparkling', 'tomato', 'water'], feature_names)
    assert_true(cv.fixed_vocabulary_)

    for idx, name in enumerate(feature_names):
        assert_equal(idx, cv.vocabulary_.get(name))
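# As the test above shows, a fixed vocabulary needs no fitting; a minimal sketch of the same
# invariant (made-up vocabulary, on scikit-learn versions that still provide get_feature_names):
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(vocabulary=['beer', 'pizza', 'water'])
names = cv.get_feature_names()          # available without fitting when the vocabulary is fixed
assert [cv.vocabulary_.get(n) for n in names] == [0, 1, 2]
print(names)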
Example #5
def load_dataset(dataset):
    if dataset == ['imdb']:
        #(X_pool, y_pool, X_test, y_test) = load_data()
        #vect = CountVectorizer(min_df=0.005, max_df=1./3, binary=True, ngram_range=(1,1))
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1,1))        
        X_pool, y_pool, X_test, y_test, _, _, = load_imdb(path='./aclImdb/', shuffle=True, vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif isinstance(dataset, list) and len(dataset) == 3 and dataset[0] == '20newsgroups':
        vect = CountVectorizer(min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1))
        X_pool, y_pool, X_test, y_test, _, _ = \
        load_newsgroups(class1=dataset[1], class2=dataset[2], vectorizer=vect)
        return (X_pool, y_pool, X_test, y_test, vect.get_feature_names())
    elif dataset == ['SRAA']:
        X_pool = pickle.load(open('./SRAA_X_train.pickle', 'rb'))
        y_pool = pickle.load(open('./SRAA_y_train.pickle', 'rb'))
        X_test = pickle.load(open('./SRAA_X_test.pickle', 'rb'))
        y_test = pickle.load(open('./SRAA_y_test.pickle', 'rb'))
        feat_names = pickle.load(open('./SRAA_feature_names.pickle', 'rb'))
        return (X_pool, y_pool, X_test, y_test, feat_names)
    elif dataset == ['nova']:
        (X_pool, y_pool, X_test, y_test) = load_nova()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['ibnsina']:
        (X_pool, y_pool, X_test, y_test) = load_ibnsina()
        return (X_pool, y_pool, X_test, y_test, None)
    elif dataset == ['creditg']:
        (X_pool, y_pool, X_test, y_test) = load_creditg()
        return (X_pool, y_pool, X_test, y_test, None)
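# A hypothetical call, assuming the ./aclImdb/ folder and the load_imdb helper referenced
# above are actually available:
X_pool, y_pool, X_test, y_test, feat_names = load_dataset(['imdb'])
print(X_pool.shape, len(feat_names))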
Example #6
def wordMoverDistance(d1, d2):
    ###d1 list
    ###d2 list
    # Rule out words that not in vocabulary
    d1 = " ".join([w for w in d1 if w in vocab_dict])
    d2 = " ".join([w for w in d2 if w in vocab_dict])
    #print d1
    #print d2
    vect = CountVectorizer().fit([d1,d2])
    feature_names = vect.get_feature_names()
    W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]] #Word Matrix
    D_ = euclidean_distances(W_) # Distance Matrix
    D_ = D_.astype(np.double)
    #D_ /= D_.max()  # Normalize for comparison
    v_1, v_2 = vect.transform([d1, d2])
    v_1 = v_1.toarray().ravel()
    v_2 = v_2.toarray().ravel()
    ### EMD
    v_1 = v_1.astype(np.double)
    v_2 = v_2.astype(np.double)
    v_1 /= v_1.sum()
    v_2 /= v_2.sum()
    #print("d(doc_1, doc_2) = {:.2f}".format(emd(v_1, v_2, D_)))
    emd_d = emd(v_1, v_2, D_) ## WMD
    #print emd_d
    return emd_d
def improveVocabulary(positiveDocuments, negativeDocuments):
    countVectPos = CountVectorizer(min_df = 0.1, stop_words = 'english')
    countVectNeg = CountVectorizer(min_df = 0.1, stop_words = 'english')
    positiveCandidates = []
    negativeCandidates = []
    if len(positiveDocuments) > 0:
        try:
            countVectPos.fit_transform(positiveDocuments)
            positiveCandidates = countVectPos.get_feature_names()
        except Exception:
            pass  # CountVectorizer failed (e.g. empty vocabulary after pruning)
    if len(negativeDocuments) > 0:
        try:
            countVectNeg.fit_transform(negativeDocuments)
            negativeCandidates = countVectNeg.get_feature_names()
        except Exception:
            pass  # CountVectorizer failed (e.g. empty vocabulary after pruning)
    global listPos, listNeg, countDictPos, countDictNeg
    #pdb.set_trace()
    for candidate in (positiveCandidates + negativeCandidates):
        score = (getMapOutput(countVectPos.vocabulary_, candidate) - getMapOutput(countVectNeg.vocabulary_, candidate))
        if (score > 0 and  score/getMapOutput(countVectPos.vocabulary_, candidate) >= 0.1):
            insertMap(listPos, candidate)
        elif (score < 0 and  abs(score)/getMapOutput(countVectNeg.vocabulary_, candidate) >= 0.1):
            insertMap(listNeg, candidate)
def number_of_words_in_common(s1, s2):
    vec = CountVectorizer()
    counts = vec.fit_transform([s1, s2]).toarray()
    res = counts[0] * counts[1]
    i = 0
    common_words = []
    for r in range(len(res)):
        if res[r] != 0:
            i += 1
            common_words.append(vec.get_feature_names()[r])
    return i, common_words
def vectorizerDataBigram(data, min_freq):
    vectorizer = CountVectorizer(analyzer='word', min_df = min_freq, lowercase=True,
                stop_words='english',token_pattern='(?u)\\b\\w\\w+\\b', binary=True,
                ngram_range=(2,2))
                
    X = vectorizer.fit_transform(data)
    df = pd.DataFrame(X.toarray(), columns = vectorizer.get_feature_names())

    regex = re.compile(r'\d')
    new_list = [s for s in vectorizer.get_feature_names() if not regex.match(s)]

    return pd.DataFrame(df, columns = new_list)
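# A small usage sketch for vectorizerDataBigram above (assuming pandas, re and
# CountVectorizer are imported as in the snippet; the toy corpus is made up):
docs = ["the quick brown fox jumps", "the quick brown dog sleeps", "a lazy brown dog"]
bigram_df = vectorizerDataBigram(docs, min_freq=1)
print(bigram_df.columns.tolist())   # surviving bigrams, e.g. 'brown dog', 'quick brown', ...
print(bigram_df.shape)              # one row per document, one column per bigram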
Example #10
def Get_unigrams_bigrams(corpus):
	# use CountVectorizer to generate unigrams and bigrams
	unicount_vect = CountVectorizer(ngram_range=(1,1), lowercase = False,  stop_words='english',  token_pattern=r'\b\w+\b', min_df=1)
	unicount = unicount_vect.fit_transform(corpus).toarray() 
	unigrams = unicount_vect.get_feature_names()


	bicount_vect = CountVectorizer(ngram_range=(2,2), lowercase = False, stop_words='english',  token_pattern=r'\b\w+\b', min_df=1)
	bicount = bicount_vect.fit_transform(corpus).toarray() 
	bigrams = bicount_vect.get_feature_names()
	
	return (unigrams, bigrams)
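# A quick usage sketch for Get_unigrams_bigrams (assuming the CountVectorizer import;
# the two sentences are made up):
unigrams, bigrams = Get_unigrams_bigrams(["the cat sat on the mat", "the dog sat"])
print(unigrams)   # stop words such as 'the' and 'on' are dropped
print(bigrams)    # bigrams are built from the remaining tokens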
class NearestNeighborMethod(object):
    def __init__(self,n_results=1,ngram_range=(1,1),tokenizer=SynonymTokenizer()):
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.neighbors import NearestNeighbors
        self.countvec = CountVectorizer(ngram_range=ngram_range,analyzer='word',lowercase=True,\
            token_pattern='[a-zA-Z0-9]+',strip_accents='unicode',tokenizer=tokenizer)
        self.nbrs = NearestNeighbors(n_neighbors=n_results)

    def load_ref_text(self,text_file):
        import re,nltk.data
        from nltk.corpus import wordnet as wn
        from nltk.stem import WordNetLemmatizer 
        textfile = open(text_file,'r')
        lines=textfile.readlines()
        textfile.close()
        sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
        sentences = [ sent_tokenizer.tokenize(line.strip()) for line in lines]
        sentences1 = [item for sublist in sentences for item in sublist] 
        chk2=pd.DataFrame(self.countvec.fit_transform(sentences1).toarray(),columns=self.countvec.get_feature_names())
        chk2[chk2>1]=1
        return chk2,sentences1
        
        #text1 = []
        #for sent in text:
        #        new_words = []
        #        for word,word_type in sent:
        #                synonyms = list({l.name().lower() for s in wn.synsets(word) for l in s.lemmas()})
        #                new_words.append((synonyms,word_type))
        #        text1.append(new_words)
    

    def load_query(self,text):
        #print text
        chk2=pd.DataFrame(self.countvec.transform([text]).toarray(),columns=self.countvec.get_feature_names())
        #print chk2.shape
        chk2[chk2>1]=1
        return chk2

    def get_scores(self,ref_dataframe,ref_query,n_results=1):        
        self.nbrs.fit(ref_dataframe)
        return self.nbrs.kneighbors(ref_query)

        
    def get_results(self,query):
        ref_dataframe,ref_sentences=NearestNeighborMethod.load_ref_text(self,'india.txt')
        #print ref_dataframe.shape,len(ref_sentences)
        ref_query=NearestNeighborMethod.load_query(self,query)
        neighbors_index = NearestNeighborMethod.get_scores(self,ref_dataframe,ref_query)[1]
        #print type(neighbors_index)
        #print neighbors_index[0]
        neighbors = list( ref_sentences[i] for i in neighbors_index[0] )
        print neighbors
def addBagOfWordsFeature(wordproblems):
    vectorizer = CountVectorizer(
        analyzer="word", tokenizer=LemmaTokenizer(), preprocessor=None, stop_words=None, max_features=5000
    )
    train_data_features = vectorizer.fit_transform(wordproblems)
    train_data_features = train_data_features.toarray()
    vocab = vectorizer.get_feature_names()
    vocab_wo_nums = []
    for s in vocab:
        if not any(char.isdigit() for char in s):
            vocab_wo_nums.append(s)

    vectorizer = CountVectorizer(
        analyzer="word",
        tokenizer=LemmaTokenizer(),
        preprocessor=None,
        stop_words=None,
        max_features=5000,
        vocabulary=vocab_wo_nums,
    )

    train_data_features = vectorizer.fit_transform(wordproblems)
    train_data_features = train_data_features.toarray()
    vocab = vectorizer.get_feature_names()

    with open("data/vocab.txt", "w") as f:
        f.write(str(vocab_wo_nums))

    numofnums = []
    numofques = []
    numofpercent = []
    for i in range(0, len(train_data_features)):
        nums = numberOfNumbers(None, wordproblems[i])
        numofnums.append(nums)
        ques = numberOfQuestions(wordproblems[i])
        numofques.append(ques)
        perc = numberOfPercent(wordproblems[i])
        numofpercent.append(perc)
    numofnums = numpy.array(numofnums)
    numofques = numpy.array(numofques)
    numofpercent = numpy.array(numofpercent)

    train_data_features = numpy.hstack((train_data_features, numpy.atleast_2d(numofnums).T))
    train_data_features = numpy.hstack((train_data_features, numpy.atleast_2d(numofques).T))
    train_data_features = numpy.hstack((train_data_features, numpy.atleast_2d(numofpercent).T))

    # print train_data_features
    return (vectorizer, train_data_features)
Example #13
def tfidf(corpus, word_category, file_to_write):
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight = tfidf.toarray()
    sum_weight = np.sum(weight, axis=0)
    word = vectorizer.get_feature_names()
    word_and_weight = []
    for i in range(len(sum_weight)):
        word_and_weight.append([word[i], sum_weight[i]])
    word_and_weight.sort(key=lambda key: key[1], reverse=True)
    f = open(file_to_write, "w+")
    result = []
    for j in range(len(word_and_weight)):
        try:
            f.write(
                word_and_weight[j][0]
                + " "
                + str(word_and_weight[j][1])
                + " "
                + word_category[word_and_weight[j][0]]
                + "\n"
            )
            result.append([word_and_weight[j][0], word_and_weight[j][1], word_category[word_and_weight[j][0]]])
        except:
            continue
    f.close()
    return result
Example #14
def find_common_words(all_words, num_most_frequent_words):
    vectorizer = CountVectorizer(
        stop_words=None, # 'english',
        max_features=num_most_frequent_words,
        binary=True)
    vectorizer.fit(all_words)
    return (vectorizer.vocabulary_, vectorizer.get_feature_names())
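# For illustration, a hypothetical call with a tiny word list (assuming the
# CountVectorizer import):
vocab_map, names = find_common_words(["red apple", "green apple", "red wine"], 2)
print(names)       # the two terms that appear in the most documents
print(vocab_map)   # term -> column index mapping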
Example #15
 def getContextFeature(self):
     import time
     print 'start to get Context Feature'
     start = time.time()
     
     from sklearn.feature_extraction.text import TfidfTransformer
     from sklearn.feature_extraction.text import CountVectorizer
     #when we meet the large corpus, need to input an iteration!
     corpus = self.getIterText()
     #transfer the text into word frequency matrix
     vectorizer = CountVectorizer()
     transformer = TfidfTransformer()
     tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))
     
     print 'get word'
     word=vectorizer.get_feature_names()
     print 'get weight'
     weight=tfidf
     
     print 'weight type:', type(weight)
     #print weight
     end = time.time()
     
     print 'total time: \t', end-start
     return weight,word
def getTfidfData(dataTrain, dataTest, dataHold):
    print dataTrain.target_names
    
    count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2)
    tfidf_transformer = TfidfTransformer(sublinear_tf=True)
    X_counts = count_vect.fit_transform(dataTrain.data)
    X_tfidf = tfidf_transformer.fit_transform(X_counts)
    print X_tfidf.shape
    
    Y_counts = count_vect.transform(dataTest.data)
    Y_tfidf = tfidf_transformer.transform(Y_counts)
    print Y_tfidf.shape
    
    H_counts = count_vect.transform(dataHold.data)
    H_tfidf = tfidf_transformer.transform(H_counts)
    
    print 'feature selection using chi square test', len(dataTrain.target)
    feature_names = count_vect.get_feature_names()
    
    ch2 = SelectKBest(chi2, k='all')
    X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target)
    Y_tfidf = ch2.transform(Y_tfidf)
    H_tfidf = ch2.transform(H_tfidf)
    if feature_names:
        # keep selected feature names
        feature_names = [feature_names[i] for i
                         in ch2.get_support(indices=True)]
        
    if feature_names:
        feature_names = numpy.asarray(feature_names)
        print 'important features'
        print feature_names[:10]
    return X_tfidf, Y_tfidf, H_tfidf
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features.
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data in list. Will only take 30000 reviews for efficiency purposes
    test_data : List[str]
        Test data in list

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix,.., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()

    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    
    vocab = count_vect.get_feature_names()
    
    return (train_tfidf, test_tfidf, vocab)
def get_data(dir):
    titles = []
    titles_label = []
    os.path.walk(dir, visit, [titles, titles_label])
    # Initialize the "CountVectorizer" object, which is scikit-learn's bag of words tool.
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = 5000)
    # fit_transform() does two things: first, it fits the model
    # and learns the vocabulary; second, it transforms our training data
    # into feature vectors. The input to fit_transform should be a list of
    # strings.
    titles_vocab_mat = vectorizer.fit_transform(titles)
    # Numpy arrays are easy to work with, so convert the result to an array
    #print vectorizer.vocabulary_  # a dict, the value is the index
    train_data_features = titles_vocab_mat.toarray()
    print train_data_features.shape
    # Take a look at the words in the vocabulary
    vocab = vectorizer.get_feature_names()
    print '/'.join(vocab)
    # Sum up the counts of each vocabulary word
    dist = np.sum(train_data_features, axis=0)
    total_words = 0
    for i in train_data_features:
        #print sum(i)
        total_words += sum(i)
    print total_words
    weka(vocab, dist, train_data_features, total_words, titles_label)
def bag_of_words_to_list(lines,max_features):
    # Initialize the "CountVectorizer" object, which is scikit-learn's
    # bag of words tool
    # removing stopwords
    vectorizer = CountVectorizer(
        stop_words = 'english'
        ,max_features = max_features
        )
    
    #TfidfVectorizer i need to check this

    print('>> Removing stopwords...')
    # lets remove stopwords
    lines = remove_stopwords(lines,2)

    print('>> Stemming...')
    # lets stem it
    lines =stemming(lines,3)

    print('>> Doing bag of words...')
    #lets do the bag of words
    bag_of_words = vectorizer.fit_transform(lines)



    #uncomment to visualize the words and how many times are used
    #printing_bow(bag_of_words,vectorizer)

    return(vectorizer.get_feature_names(),bag_of_words.toarray())
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation and count vectorizer
    to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i,j] > 1.0/float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print(topicsByGame)

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    return topicsByGame
def token_count_pac(pac_id,           \
                    limit       = 'ALL', \
                    ngram_range = (2,2), \
                    min_df      = 5):
    
    conn    = psql.connect("dbname='keyword-influence'")
    cursor  = conn.cursor()

    cursor.execute("SELECT id, speaking                       \
                    FROM words                                \
                    WHERE id IN (                             \
                        SELECT id                             \
                        FROM words                            \
                        WHERE bioguide_id IN(                 \
                            SELECT bioguide_id                \
                            FROM pac_contrib as pc            \
                            INNER JOIN congress as c          \
                            ON pc.fec_candidate_id = c.fec_id \
                            WHERE pac_id = '"+ pac_id +"'));")
    sql_result = cursor.fetchall()

    counter   = CountVectorizer(stop_words  = corpus.stopwords.words('english'), \
                                ngram_range = ngram_range,                       \
                                min_df      = min_df)
    chunks    = map(lambda x: x[1], sql_result)
    counts    = counter.fit_transform(chunks)
    vocab     = counter.get_feature_names()
    vocab     = dict(zip(range(len(vocab)),vocab))
    
    return [counts, vocab]
Example #22
def tfidf(fileList):
    segPath = sys.path[0] + '/seg_result'
    corpus = []  # holds the word-segmentation result of each document
    for eachFile in fileList:
        fileName = segPath + '/' + eachFile
        f = open(fileName,'r+')
        content = f.read()
        corpus.append(content)
    vectorizer = CountVectorizer()  # converts the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    word = vectorizer.get_feature_names()  # all the words in the bag-of-words model
    weight = tfidf.toarray()  # the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
    # create the tfidf folder to store the tf-idf results
    tfidfFilePath = os.getcwd() + '/tfidfFile'
    if not os.path.exists(tfidfFilePath):
        os.mkdir(tfidfFilePath)
    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", tfidfFilePath + '/' + str(i) + '.txt', "--------"
        name = tfidfFilePath + '/' + string.zfill(i, 5) + '.txt'
        f = open(name,'w+')
        for j in range(len(word)):
            #f.write(word[j] + "    " + str(weight[i][j]) + "\n")
            #f.write(str(weight[i][j]) + "\n")
            f.write(word[j] + "\n")
        f.close()
Example #23
def textExtraction(df, series):
    vectorizer = CountVectorizer(analyzer = text_process, min_df = 0.1)
    df[series] = df[series].replace(np.nan, '', regex=True)
    vectorizer.fit_transform(df[series])
    vocab = vectorizer.get_feature_names()
    
    return vocab
def getCount(artName):
    artLst = []
    #artDict = {}
    for fn in os.listdir(indir):
        if not fn.endswith('.xml'): continue
        if ':' in fn:
            fn = fn.replace(':','/')
        fn = fn.decode('utf-8')
        #fn = unicodedata.normalize("NFC",fn)
        fn_de = unidecode(fn)
        newfn = fn_de[:-4]
        #print 'artName: ',artName, 'eval: ', newfn
        newfn = newfn.lower()
        if newfn == artName:
            # print "found article begin processing"
            #print fn
            if '/' in fn:
                fn = fn.replace('/',':')
            fullname = os.path.join(indir, fn)
            tree = ET.parse(fullname)
            root = tree.getroot()
            page = root.find('{http://www.mediawiki.org/xml/export-0.7/}page')

            revisions = page.findall('{http://www.mediawiki.org/xml/export-0.7/}revision')
            for s in revisions:
                txt = s.find('{http://www.mediawiki.org/xml/export-0.7/}text')
                artLst.append(txt.text)
            artLst = filter(None,[one for one in artLst])
            # print "processing done; begin counting"
            vectorizer = CountVectorizer(min_df=1,token_pattern='([^\[\|\]\s\.\!\=\{\}\;\<\>\?\"\'\#\(\)\,\*]+)')
            X = vectorizer.fit_transform(artLst)
            artDict = dict(zip(vectorizer.get_feature_names(),np.asarray(X.sum(axis=0)).ravel()))
        
            return artDict
    return -1
Example #25
def vectorize_substances(training, testing):
    substances = training.substances.apply(lambda x: re.sub(r'\(|\)|,','',x))
    substances_test = testing.substances.apply(lambda x: re.sub(r'\(|\)|,','',x))
    vec = CountVectorizer(strip_accents="unicode", analyzer="char_wb", ngram_range=(3,3), binary=True)
    x = vec.fit_transform(substances)
    xtest = vec.transform(substances_test)
    return x, xtest, vec.get_feature_names()
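# A hypothetical usage, assuming pandas/re/CountVectorizer imports and DataFrames that
# carry a 'substances' column:
training = pd.DataFrame({'substances': ['(aspirin, caffeine)', '(ibuprofen)']})
testing = pd.DataFrame({'substances': ['(caffeine, ibuprofen)']})
x, xtest, ngrams = vectorize_substances(training, testing)
print(x.shape, xtest.shape, len(ngrams))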
Example #26
def add_issue_columns(messages):
	from sklearn.feature_extraction.text import CountVectorizer
	v = CountVectorizer(binary=True)
	issue_matrix = v.fit_transform([str(x) for x in messages['Q13_issues']]).toarray()
	issues = v.get_feature_names()
	for (i, issue) in enumerate(issues):
		messages[issue] = pd.Series(issue_matrix[:,i])
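# A hypothetical usage with a toy frame (assuming pandas is imported as pd; the indicator
# columns produced depend on the text):
messages = pd.DataFrame({'Q13_issues': ['economy healthcare', 'healthcare', 'economy']})
add_issue_columns(messages)
print(messages[['economy', 'healthcare']])   # one binary column per extracted term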
Example #27
def count_fin_words(lmd,dataset):
    #Modifying the Dictionary  
    lmd=lmd[['Word','Positive','Negative']]
    lmd['Sum']=lmd['Positive']+lmd['Negative']
    lmd=lmd[lmd.Sum != 0]
    lmd=lmd.drop(['Sum'],axis=1)
    lmd.loc[lmd['Positive']>0, 'Positive'] = 1
    lmd.loc[lmd['Negative']>0, 'Negative'] = -1
    lmd['Word']=lmd['Word'].str.lower()
    #Counting the words in the MDA
    tf = CountVectorizer(analyzer='word', min_df = 0, stop_words = 'english')
    tfidf_matrix =  tf.fit_transform(dataset['MDA_Text'].values)
    feature_names = tf.get_feature_names() 
    tfidf_array = tfidf_matrix.toarray()
    tfidf_df = pd.DataFrame(tfidf_array)
    tfidf_df.columns = [i.lower() for i in feature_names] 
    tfidf_df = tfidf_df.T 
    tfidf_df['Word']=tfidf_df.index
    #Merging the results
    result_df = pd.merge(tfidf_df, lmd, how='inner',left_on='Word',right_on='Word')
    col_list=list(result_df)
    result_df_pos=result_df[result_df.Positive==1]
    result_df_neg=result_df[result_df.Negative==-1]
    result_df[col_list[0:len(dataset)]].sum(axis=0)
    #Counting the positive and negative words in a financial context per document
    pos_words_sum=result_df_pos[col_list[0:len(dataset)]].sum(axis=0)
    neg_words_sum=result_df_neg[col_list[0:len(dataset)]].sum(axis=0)
    #Adding new features to the master dataframe
    dataset['Tot_pos']=pos_words_sum.values
    dataset['Tot_neg']=neg_words_sum.values
    return dataset
Example #28
def createDTMat(fileList):
    from sklearn.feature_extraction.text import CountVectorizer
    cvec = CountVectorizer(stop_words = 'english')
    lines_list = readInList(fileList)
    X = cvec.fit_transform(lines_list).toarray()
    vocab = cvec.get_feature_names()
    return (X, vocab)
Example #29
def tf_idf(seg_files):
    seg_path = './segfile/'
    corpus = []
    for file in seg_files:
        fname = seg_path + file
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()

    save_path = './tfidffile'
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    for i in range(len(weight)):
        print('--------Writing all the tf-idf in the', i, ' file into ', save_path + '/' + str(i).zfill(5) + '.txt',
              '--------')
        f = open(save_path + '/' + str(i).zfill(5) + '.txt', 'w+')
        for j in range(len(word)):
            f.write(word[j] + ' ' + str(weight[i][j]) + '\r\n')
        f.close()
Example #30
def text_feature(data,text_var,nfeature,noun=False,silence=False):
    """Calculate the text features for the given data.
    text_var specifies the name of the column that contains the text.
    nfeature specifies the max number of features to be extracted 
    from the text."""
    # First clean and parse the text data
    clean_statuses = []
    nitem = data.shape[0]
    data.index=range(nitem)
    for i in xrange( 0, nitem):
        if (i+1)%1000 == 0 and not silence:
            print "Status %d of %d\n" % ( i+1, nitem)                                                                    
        clean_statuses.append( status_to_words(data[text_var][i],noun))
    
    # Then extract features from the cleaned text
    print "Creating the bag of words...\n"
    vectorizer = CountVectorizer(analyzer = "word",   \
                             tokenizer = None,    \
                             preprocessor = None, \
                             stop_words = None,   \
                             max_features = nfeature) 
    data_features = vectorizer.fit_transform(clean_statuses)
    data_features = data_features.toarray()
    vocab = vectorizer.get_feature_names() 
    # Sum up the counts of each vocabulary word
    counts = np.sum(data_features, axis=0)

    return {'features':data_features,'word':vocab,'counts':counts}
Example #31
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sys import argv
import pandas as pd
from sklearn.feature_extraction import text
#my_stop_words = list(pd.read_csv('stopwords.txt',header=None)[0])
text = list(pd.read_csv(argv[1], sep='\t', header=None)[0].astype('U'))
vectorizer = CountVectorizer(encoding='utf-8',
                             stop_words='english',
                             lowercase=True)
vectorizer.fit_transform(text)
vector = vectorizer.transform(text)
wl = vectorizer.get_feature_names()
np.savetxt(argv[2], wl, delimiter=",", fmt='%s')
np.savetxt(argv[3], vector.toarray(), delimiter=",", fmt='%s')
    import os
    import sys
    from sklearn.feature_extraction.text import CountVectorizer

    if "-" in sys.argv:
        lines = sys.stdin.readlines()
        sources = ['stdin']
    else:
        sources = ([arg for arg in sys.argv[1:] if os.path.exists(arg)]
                   or ["constitution.txt"])
        lines = []
        for s in sources:
            with open(s) as f:
                lines.extend(f.readlines())
    text = "".join(lines)
    cv = CountVectorizer(min_df=1,
                         decode_error="ignore",
                         stop_words="english",
                         max_features=200)
    counts = cv.fit_transform([text]).toarray().ravel()
    words = np.array(cv.get_feature_names())
    # throw away some words, normalize
    words = words[counts > 1]
    counts = counts[counts > 1]
    output_filename = (os.path.splitext(os.path.basename(sources[0]))[0] +
                       "_.png")
    print(output_filename)

    counts = make_wordcloud(words, counts, output_filename)
Example #33
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

from naive_bayes import SentiLexiconNB1
from utils import ReviewPreprocessing

dataset = pd.read_csv('assets/Restaurant_Reviews.tsv', delimiter='\t')

preprocessing = ReviewPreprocessing().fit(dataset['Review'])
corpus = preprocessing.corpus
senti_lexicon = preprocessing.senti_lexicon

vectorizing_regex = r"[-_'a-zA-ZÀ-ÖØ-öø-ÿ0-9]+"
vectorizer = CountVectorizer(ngram_range=(1, 2),
                             analyzer='word',
                             token_pattern=vectorizing_regex)

classifier = SentiLexiconNB1(senti_lexicon)

X = vectorizer.fit_transform(corpus)
y = dataset.iloc[:, 1].values

classifier.fit(X, y, vectorizer.get_feature_names())

while True:
    review = input('Add your review: ')

    preprocessed_review = preprocessing.transform([review])
    X = vectorizer.transform(preprocessed_review)

    sentiment = 'Positive' if classifier.predict(X)[0] else 'Negative'

    print(sentiment)
Example #34
    def most_common_words_by_group(
        self, X, text_col_name, group_col_name, num_examples, num_times_min, min_ngram,
    ):
        """
        Get the most commons phrases for defined groups.

        Parameters
        --------
        X: DataFrame
        text_col_name: str
        group_col_name: str
        num_examples: int
            Number of text examples to include per group
        num_times_min: int
            Minimum number of times word/phrase must appear in texts
        min_ngram: int

        Returns
        --------
        overall_counts_df: DataFrame
            Has groups, top words, and counts

        """
        # Fix for when column name is the same as an ngram column name
        X["group_column"] = X[group_col_name]

        # Remove all other unneeded columns
        X = X[[text_col_name, "group_column"]]

        all_stop_words = (
            set(ENGLISH_STOP_WORDS)
            | set(["-PRON-"])
            | set(string.punctuation)
            | set([" "])
        )

        cv = CountVectorizer(
            stop_words=all_stop_words,
            ngram_range=(min_ngram, 3),
            min_df=num_times_min,
            max_df=0.4,
        )
        vectors = cv.fit_transform(X[text_col_name]).todense()
        words = cv.get_feature_names()
        vectors_df = pd.DataFrame(vectors, columns=words)

        group_plus_vectors = pd.concat([vectors_df, X.reset_index(drop=False)], axis=1)

        count_words = pd.DataFrame(
            group_plus_vectors.groupby("group_column").count()["index"]
        )
        count_words = count_words.loc[:, ~count_words.columns.duplicated()]
        # Fix for when "count" is an ngram column
        count_words.columns = ["count_ngrams"]

        group_plus_vectors = group_plus_vectors.merge(
            count_words, on="group_column", how="left"
        )

        group_plus_vectors["count_ngrams"].fillna(0, inplace=True)

        sums_by_col = (
            group_plus_vectors[
                group_plus_vectors.columns[
                    ~group_plus_vectors.columns.isin([text_col_name, "index",])
                ]
            ]
            .groupby("group_column")
            .sum()
        )

        sums_by_col.sort_values(by="count_ngrams", ascending=False, inplace=True)

        sums_by_col.drop("count_ngrams", axis=1, inplace=True)

        array_sums = np.array(sums_by_col)
        sums_values_descending = -np.sort(-array_sums, axis=1)
        sums_indices_descending = (-array_sums).argsort()

        highest_sum = pd.DataFrame(sums_values_descending[:, 0])
        highest_sum.columns = ["highest_sum"]

        sums_by_col["highest_sum"] = highest_sum["highest_sum"].values

        overall_counts_df = pd.DataFrame(columns=["group_name", "top_words_and_counts"])
        i = 0
        for row in sums_by_col.index:
            dict_scores = {}
            temp_df = pd.DataFrame(columns=["group_name", "top_words_and_counts"])
            temp_df["group_name"] = [row]
            top_columns = sums_by_col.columns[
                sums_indices_descending[i][:num_examples]
            ].values
            top_counts = sums_values_descending[i][:num_examples]
            [dict_scores.update({x: y}) for x, y in zip(top_columns, top_counts)]
            temp_df["top_words_and_counts"] = [dict_scores]
            overall_counts_df = overall_counts_df.append([temp_df])
            print(f"Group Name: {row}\n")
            for k, v in dict_scores.items():
                print(k, v)
            print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
            i += 1

        return overall_counts_df
Example #35
y=msg.labelnum
#splitting the dataset into train and test data
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(X,y)
print(xtest.shape)
print(xtrain.shape)
print(ytest.shape)
print(ytrain.shape)
print("train data")
print(xtrain)
#output of count vectoriser is a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm=count_vect.transform(xtest)
print(count_vect.get_feature_names())
df=pd.DataFrame(xtrain_dtm.toarray(),columns=count_vect.get_feature_names())
print(df)#tabular representation
print(xtrain_dtm) #sparse matrix representation
# Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(xtrain_dtm,ytrain)
predicted = clf.predict(xtest_dtm)
#printing accuracy metrics
from sklearn import metrics
print('Accuracy metrics')
print('Accuracy of the classifier is', metrics.accuracy_score(ytest, predicted))
print('Confusion matrix')
print(metrics.confusion_matrix(ytest, predicted))
print('Recall and Precision')
print(metrics.recall_score(ytest, predicted))
print(metrics.precision_score(ytest, predicted))
for i in xrange(0, num):
    #if( (i+1)%1000 == 0 ):
    print "Review %d of %d\n" % (i + 1, num)
    perfect_words.append(cleaning_words(df["review"][i]))

#print df.head(67)
#print df["Postive rated"].mean()
X_train, X_test, y_train, y_test = train_test_split(perfect_words,
                                                    df['Postive rated'],
                                                    random_state=0)
#print X_train[10]
#print X_train.shape
#print df['Postive rated']

vect = CountVectorizer(min_df=5, ngram_range=(1, 2)).fit(X_train)
print len(vect.get_feature_names())

X_train_vetorised = vect.transform(X_train)
#print X_train_vetorised

print "starting training!!!!!"
model = LogisticRegression()
print "Stage 1 is completed"
model.fit(X_train_vetorised, y_train)
print "Stage 2 is completed"
predictions = model.predict(vect.transform(X_test))
print "Stage 3 is completed"
print("AUC:", roc_auc_score(y_test, predictions))

feature_name = np.array(vect.get_feature_names())
sort_coeff = model.coef_[0].argsort()
Example #37
            # print(line.replace('\xa0',''[:100]).strip())
            segline = segment(line.strip().replace('\xa0',''))
            # print(segline)
            doc = doc+ segline+' '
        corpus.append(doc)
        # print(docs)
        # break
    stoplist = ['了','与','他', '我']
    vectorizer=CountVectorizer( token_pattern='[\u4e00-\u9fa5]+',stop_words=stoplist,ngram_range=(1, 1),min_df=10)  # converts the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer=TfidfTransformer()  # computes the tf-idf weight of every word
    # print(corpus)

    tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus))  # the inner fit_transform builds the term-frequency matrix, the outer one computes tf-idf
    print('--------tfidf---------')
    # print(tfidf)
    word=vectorizer.get_feature_names()  # all the words in the bag-of-words model
    print('--------word---------')
    print(word)
    print(len(word))
    weight=tfidf.toarray()  # the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
    # for i in range(len(weight)):  # print every document's tf-idf weights: the outer loop walks the documents, the inner loop walks the words
    #     print ('------- tf-idf weights of the words in document', i, '------')
    #     for j in range(len(word)):
    #         print(word[j], weight[i][j])



def demo():
    corpus=["我 来到 北京 清华大学",#第一类文本切词后的结果,词之间以空格隔开  
        "他 来到 了 网易 杭研 大厦",#第二类文本的切词结果  
        "小明 硕士 毕业 与 中国 科学院",#第三类文本的切词结果  
Example #38
class TextTopics():
    """
    Text classifier.
    """
    def __init__(self,
                 df: pd.DataFrame,
                 number_topics=50,
                 instance_path=instance_path(),
                 **kwargs):
        self._instance_path = instance_path
        self.number_topics = number_topics
        self.stop_words: List = get_stop_words("fi")
        self._count_vector: CountVectorizer = None
        self._lda: LDA = None
        self.token_cache = {}
        self._tokenizer = None
        self.min_sentence_length = 17

        # `kk` is used in association with time periods.
        self.stop_words += ["kk", "voi", "yms", "mm"]

        self.init(df, **kwargs)

    def init(self, df: pd.DataFrame, generate_visualization=False, lang="fi"):
        """
        :param df: :class:`~pandas.Dataframe` containing text colums
        :param generate_visualization: Generate visualization of LDA results. Slows down
                                       generation notably.
        :param lang: Language for :class:`~Voikko`
        """
        if self._count_vector and self._lda:
            return True

        file_words = self.instance_path() / "word.dat"
        file_lda = self.instance_path() / "lda.dat"
        file_ldavis = self.instance_path() / "ldavis.html"

        try:
            # Try loading saved lda files.
            self._count_vector = joblib.load(file_words)
            self._lda = joblib.load(file_lda)
        except FileNotFoundError as e:
            logger.exception(e)

            texts = [x for x in df.to_numpy().flatten() if x is not np.NaN]

            # Setup word count vector
            self._count_vector = CountVectorizer(tokenizer=self.text_tokenize,
                                                 stop_words=self.stop_words)
            count_data = self._count_vector.fit_transform(texts)

            self._lda = LDA(n_components=self.number_topics, n_jobs=-1)
            self._lda.fit(count_data)

            if generate_visualization:
                logger.debug(
                    "Generating LDA visualization. This might take a while")
                from pyLDAvis import sklearn as sklearn_lda
                import pyLDAvis

                LDAvis_prepared = sklearn_lda.prepare(self._lda, count_data,
                                                      self._count_vector)
                pyLDAvis.save_html(LDAvis_prepared, str(file_ldavis))

            joblib.dump(self._count_vector, file_words)
            joblib.dump(self._lda, file_lda)

    def instance_path(self):
        path = self._instance_path / "lda" / str(self.number_topics)
        path.mkdir(exist_ok=True, parents=True)
        return path

    def tokenizer(self):
        if not self._tokenizer:
            self._tokenizer = VoikkoTokenizer("fi")
        return self._tokenizer

    @cached(LRUCache(maxsize=1024))
    def text_tokenize(self, text):
        """ Cached wrapper for `VoikkoTokenizer.tokenize()` """
        return self.tokenizer().tokenize(text)

    def find_talkingpoint(self, candidate: pd.Series) -> str:
        """ Find most suitable sentence from text """
        texts = tuple(candidate.dropna())
        if len(texts) == 0:
            return None

        x = self._get_topics(texts)
        return self.nearest_sentence(x[1], texts)

    def nearest_sentence(self, topics: List[float], texts: List[str]) -> str:
        """
        Find sentence closest to topic.

        TODO: When joining multiple sentences, it should be checked that they are from same paragraph.
        """
        @cached(LFUCache(maxsize=128))
        def lda(sentences):
            count_data = self._count_vector.transform(sentences)
            _lda = self._lda.transform(count_data)
            return _lda

        # Tokenize into sentences.
        sentences = chain(*[
            re.findall(r"\s*(.+?[\.!?])+", b, re.MULTILINE + re.DOTALL)
            for b in texts if b.strip() != ""
        ])

        # cleanup sentences.
        sentences = tuple(
            set(
                filter(lambda x: len(x) > self.min_sentence_length,
                       map(str.strip, sentences))))
        if len(sentences) == 0:
            return None

        # Find most topical sentence.
        tl_dr = []
        distance = 1.
        prev_sentence = ""
        for current_sentence, m in zip(sentences, lda(sentences)):
            _distance = np.abs(np.mean(topics - m))
            if _distance < distance:
                tl_dr, distance = ([prev_sentence,
                                    current_sentence], _distance)

            # Previous sentence is to provide context to most suitable sentence.
            prev_sentence = current_sentence

        return " ".join(filter(None, tl_dr))

    def compare_series(self, source: pd.Series, target: pd.Series):
        """
        Compare two text sets.

        The first tuple contains a topic word not found in :param:`target`, and the second tuple
        contains a word not found in :param:`source`.

        Note: This result will not be cached. Use :method:`compare_rows()` if possible.
        """
        # Convert them into tuples, so they can be cached.
        _source = tuple(source.dropna())
        _target = tuple(target.dropna())

        return self.compare_count_data(*self._get_topics(_source),
                                       *self._get_topics(_target))

    def compare_rows(self, df: pd.DataFrame, i, l):
        x = self.row_topics(df, i)
        y = self.row_topics(df, l)
        if not x or not y:
            return None

        r = self.compare_count_data(*x, *y)
        return r

    def row_topics(self, df: pd.DataFrame, idx):
        """ Return suitable topics from dataset `df` row :param:`idx` """
        x = tuple(df.loc[idx].dropna())
        if len(x) == 0:
            return None

        return self._get_topics(x)

    @cached(LRUCache(maxsize=512))
    def _get_topics(self, source: List) -> Tuple:

        count_data = self._count_vector.transform(source)
        return (count_data, self._lda.transform(count_data).mean(axis=0))

    def compare_count_data(
            self, counts_data_source, topics_source, counts_data_target,
            topics_target) -> Tuple[Tuple[str, int], Tuple[str, int]]:
        diffs = topics_source - topics_target

        topic_max = np.argmax(diffs)
        topic_min = np.argmin(diffs)

        source_words = self.suggest_topic_word(counts_data_source,
                                               counts_data_target, topic_max)
        target_words = self.suggest_topic_word(counts_data_target,
                                               counts_data_source, topic_min)

        word_for_source = self.suitable_topic_word(source_words) if len(
            source_words) else None
        word_for_target = self.suitable_topic_word(target_words) if len(
            target_words) else None

        return TopicComparision(source=Topic(id=topic_max,
                                             term=word_for_source),
                                target=Topic(id=topic_min,
                                             term=word_for_target))

    def suggest_topic_word(self, A, B,
                           topic_id: int) -> List[Tuple[int, float]]:
        """ Find relevant word for topic.

        Compares the words of :param:`A` and :param:`B`, together with the topic words, to find a
        suitable word with enough difference between `A` and `B`.

        :param A: :class:`csr_matrix` Target to find word for.
        :param B: :class:`csr_matrix` Comparative target for `A`
        :param topic_id: lda topic id number.

        :return: List of tuples in order of prominence.
                 The first element of each tuple is the word vector feature number, and the second is the prominence value.
        """
        # Generate sum of used words
        a_sum = A.toarray().sum(0)
        b_sum = B.toarray().sum(0)

        # Topic word, preferring unique ones.
        λ = self._lda.components_[topic_id] / self._lda.components_.sum(0)

        # Remove words from A that B has used too.
        # Note: Doesn't actually remove.
        complement = a_sum - b_sum

        # Use logarithm, so topic words are preferred.
        prominence = np.log(complement) * λ

        # Generate list of words, ordered by prominence
        r = sorted(
            [(i, prominence[i])
             for i in prominence.argsort() if prominence[i] != 0 and prominence[i] > -np.inf],
            key=lambda x: x[1],
            reverse=True)
        return r

    # sequence list is too volatile to be cached.
    def suitable_topic_word(self, seq: List[List[int, ]]) -> str:
        """
        Find first suitable word from :param:`seq` list.

        :param: 1d matrix of word feature indexes. Only the first column of each row
                is interpreted as a feature number.
        """
        vector_words = self.vector_words()
        """ Find first suitable word from word list """
        for r in seq:
            word = vector_words[r[0]]
            if self._suitable_topic_word(word):
                return word
        return None

    @cached(LFUCache(maxsize=512))
    def _suitable_topic_word(self, word) -> bool:
        """
        Check if word can be used as topic word

        Accepted word classes:
        :nimi:      Names; Words like `Linux` and `Microsoft`, `Kokoomus`
        :nimisana:  Substantives; like `ihminen`, `maahanmuutto`, `koulutus`, `Kokoomus`
        :laatusana: Adjectives; words like `maksuton`
        :nimisana_laatusana: Adjectives, that are not "real", like `rohkea` or `liberaali`
        :lyhenne:   Abbrevations; Words like `EU`
        :paikannimi:Geographical locations, like `Helsinki`
        :sukunimi:  Last names, like `Kekkonen`
        """

        for morph in self.tokenizer().analyze(word):
            _class = morph.get("CLASS")
            if _class in [
                    "nimi", "nimisana", "nimisana_laatusana", "lyhenne",
                    "paikannimi", "sukunimi"
            ]:
                return True
            else:
                logger.debug("Unsuitable word class %s for word %s", _class,
                             word)

        return False

    def vector_words(self) -> List:
        """ Feature names in CountVector """
        return self._count_vector.get_feature_names()
text=[]
for i in range(0,16002):
    texte = word_tokenize(dataset['Review'][i])
    text.append(texte)
    text[i]=nltk.pos_tag(text[i])

count=[]
for i in range(0,16002):
    words, tags=zip(*text[i])
    count.append(tags)
    count[i]=' '.join(count[i])

from sklearn.feature_extraction.text import CountVectorizer
tup=CountVectorizer()
X2=tup.fit_transform(count).toarray()
tup.get_feature_names()

#BIGRAMS
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(ngram_range=(2,2),max_features=100)    
X7 = count_vect.fit_transform(corpus).toarray()
count_vect.get_feature_names()

#UNIGRAMS
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=100)
X1=cv.fit_transform(corpus).toarray()
cv.get_feature_names()

#DEPENDENT VARIABLE