Example #1
0
def compute_information_gain(vectorizer: CountVectorizer, word: str, dataTrain: csr_matrix,
                             targetTrain: List[int]) -> float:
    """Compute the information gain of splitting the training set on the given word."""
    word = word.lower()
    parentEntropy = computeEntropy(targetTrain)
    numRows = dataTrain.get_shape()[0]
    wordYesSplit = {0: 0, 1: 0}
    wordNoSplit = {0: 0, 1: 0}
    for count in range(numRows):
        simpleSentence = vectorizer.inverse_transform(dataTrain[count])[0]
        if word in simpleSentence:
            wordYesSplit[targetTrain[count]] += 1
        else:
            wordNoSplit[targetTrain[count]] += 1
    wordYesArray = wordYesSplit[0] * [0] + wordYesSplit[1] * [1]
    #print("lenYesArr: {}, YesDict: {}".format(len(wordYesArray), wordYesSplit))
    wordNoArray = wordNoSplit[0] * [0] + wordNoSplit[1] * [1]
    #print("lenNoArr: {}, NoDict: {}".format(len(wordNoArray), wordNoSplit))
    yesSplitEntropy = computeEntropy(wordYesArray)
    noSplitEntropy = computeEntropy(wordNoArray)
    probYes = len(wordYesArray) / numRows
    probNo = len(wordNoArray) / numRows
    #print("parEnt: {}, YesEnt: {}, NoEnt: {}".format(parentEntropy, yesSplitEntropy, noSplitEntropy))
    #print("probYes= {}, probNo= {}".format(probYes, probNo))

    return parentEntropy - (yesSplitEntropy * probYes +
                            noSplitEntropy * probNo)
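Note: the example above calls a computeEntropy helper that is not shown. A minimal sketch of what such a helper could look like, assuming plain Shannon entropy over the class labels (the name and signature are taken from the call sites above; everything else is an assumption):

import math
from collections import Counter

def computeEntropy(labels):
    """Shannon entropy (in bits) of a list of class labels."""
    total = len(labels)
    if total == 0:
        return 0.0
    entropy = 0.0
    for count in Counter(labels).values():
        p = count / total
        entropy -= p * math.log2(p)
    return entropy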
Example #2
0
import re

from sklearn.feature_extraction.text import CountVectorizer, FeatureHasher


class FeatFunctions(object):
    """Feature functions (URL, all-caps, n-gram) hashed into a fixed-size feature space."""

    def __init__(self, n_features=None):

        # Define some parameters:
        if not n_features:
            n_features = 100000

            # Initialize the hasher:
        self.hasher = FeatureHasher(n_features=n_features, input_type="string", non_negative=True)

        # Initialize the ngram:
        self.vectorizer = CountVectorizer(binary=True)

        # Feature name-function dictionary:
        self.featName_function = {"url": self.url, "all_caps": self.all_caps, "ngrams": self.ngrams}

    def all_caps(self, x):
        pat = re.compile(r"^[A-Z\d]+$")
        groups = pat.match(x)
        if groups:
            return ["f_all_caps"]

    def url(self, x):
        pat = re.compile("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")
        groups = pat.findall(x)
        if groups:
            return ["f_url"]

    def ngrams(self, x):
        ngram_feats = self.vectorizer.fit_transform([x])
        return self.vectorizer.inverse_transform(ngram_feats)[0].tolist()

    # An observation function that extracts features; x is raw text.
    def getObsFeatures(self, x, feat_list):
        str_feats = []
        for feat in feat_list:
            feat = feat(x)
            if feat:
                str_feats += feat

        return str_feats

    def getYXFeatures(self, y_name, y_idx, obs_feat_list):
        # return y_name+'_'+str(y_idx).join(obs_feat_list)
        # return map(lambda x,y:x+y,y_name+'_'+str(y_idx),obs_feat_list)
        xy_feat = [y_name + str(y_idx) + "_" + xfeat for xfeat in obs_feat_list]
        # print xy_feat

        hashed_feats = self.hasher.transform([xy_feat])
        # return hashed_feats.nonzero()[1]
        return hashed_feats
Example #3
0
def processChunk(chunk):
	# count n-gram frequencies
	v = CountVectorizer(ngram_range=(N,N))
	X = v.fit_transform([chunk])
	ngramfreq = zip(v.inverse_transform(X)[0], X.A[0])

	return ngramfreq
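processChunk assumes a module-level n-gram order N and that CountVectorizer has already been imported, as in the snippet above. A minimal usage sketch under those assumptions (N=2 is only illustrative):

from sklearn.feature_extraction.text import CountVectorizer

N = 2  # assumed module-level n-gram order
for ngram, freq in processChunk("the cat sat on the mat and the cat slept"):
    print(ngram, freq)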
Example #4
0
def generate_stop_words(sentiment, diff=0.33):
    stop_words = []
    p_dict = {}
    n_dict = {}

    from sklearn.feature_extraction.text import CountVectorizer
    count_vect = CountVectorizer()
    trainX = count_vect.fit_transform(sentiment.train_data)
    sentences = count_vect.inverse_transform(trainX)

    for counter in range(0, len(sentiment.train_labels)):
        if sentiment.train_labels[counter] == "POSITIVE":
            for w in sentences[counter]:
                if w in p_dict:
                    p_dict[w] += 1
                else:
                    p_dict[w] = 1
        else:
            for w in sentences[counter]:
                if w in n_dict:
                    n_dict[w] += 1
                else:
                    n_dict[w] = 1

    for w in p_dict:
        if w in n_dict:
            if abs(p_dict[w] - n_dict[w]) / max(p_dict[w], n_dict[w]) < diff:
                stop_words.append(w)

    return stop_words
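generate_stop_words expects a sentiment object exposing train_data and train_labels. A minimal usage sketch with a hypothetical stand-in container (the SimpleNamespace fields simply mirror the attributes the function reads; the data is illustrative):

from types import SimpleNamespace

sentiment = SimpleNamespace(
    train_data=["great movie", "great acting", "bad movie", "bad plot"],
    train_labels=["POSITIVE", "POSITIVE", "NEGATIVE", "NEGATIVE"],
)
# words whose positive and negative counts differ by less than 33% become stop words
print(generate_stop_words(sentiment, diff=0.33))  # e.g. ['movie']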
def preprocess(train, test, type='stop word'):
    if type == 'stop word':
        vectorizer = CountVectorizer(stop_words='english')
        # this may take a while
        tmp = vectorizer.fit_transform(train)
        tmp2 = vectorizer.transform(test)
        # inverse back to normal words
        tmp = vectorizer.inverse_transform(tmp)
        tmp2 = vectorizer.inverse_transform(tmp2)
        return tmp, tmp2
    else:
        # max_features determines the maximum number of words (features) to keep
        tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
        word_tf = tfidf.fit_transform(train)
        word_tf2 = tfidf.transform(test)
        return word_tf, word_tf2, tfidf.vocabulary_
Example #6
0
def plot_frequency_map(df,
                       settings,
                       target_events=list(['lost', 'passed']),
                       plot_name=None):
    """
    Plots frequency histogram and heatmap of users` event count

    :param df: data from BQ or your own (clickstream). Should have at least three columns: `event_name`,
            `event_timestamp` and `user_pseudo_id`
    :param settings: experiment config (can be empty dict here)
    :param target_events: name of event which signalize target function
            (e.g. for prediction of lost users it'll be `lost`)
    :param plot_name: name of file with graph plot
    :return: table with counts of events for users
    :type df: pd.DataFrame
    :type settings: dict
    :type target_events: List[str]
    :type plot_name: str
    :rtype: pd.DataFrame
    """
    users = df.user_pseudo_id[df.event_name.isin(target_events)].unique()
    df = df[df.user_pseudo_id.isin(users)]
    data = prepare_dataset(df, '')
    cv = CountVectorizer()
    x = cv.fit_transform(data.event_name.values).todense()
    cols = cv.inverse_transform(
        np.ones(df.event_name.nunique() - len(target_events)))[0]
    x = pd.DataFrame(x, columns=cols, index=data.user_pseudo_id)
    nodes_hist = df.groupby(
        'event_name',
        as_index=False).event_timestamp.count().sort_values('event_timestamp',
                                                            ascending=False)
    nodes_hist.event_name = nodes_hist.event_name.apply(lambda x: x.lower())
    sorted_cols = nodes_hist.event_name[~nodes_hist.event_name.
                                        isin(target_events)].values
    x = x.loc[:, sorted_cols]
    sns.mpl.pyplot.figure(figsize=[8, 5])
    bar = sns.barplot(nodes_hist.event_name.values,
                      nodes_hist.event_timestamp.values,
                      palette='YlGnBu')
    bar.set_xticklabels(bar.get_xticklabels(), rotation=90)

    settings = _check_folder(settings)
    export_folder = settings['export_folder']
    if plot_name:
        barname = os.path.join(export_folder, 'bar_{}.png'.format(plot_name))
        heatname = os.path.join(export_folder,
                                'countmap_{}.png'.format(plot_name))
    else:
        barname = os.path.join(export_folder,
                               'bar_{}.png'.format(datetime.now()))
        heatname = os.path.join(export_folder,
                                'countmap_{}.png'.format(datetime.now()))
    bar.get_figure().savefig(barname)
    sns.mpl.pyplot.figure(figsize=[10, 15])
    heatmap = sns.heatmap(x.values, cmap="YlGnBu")
    heatmap.get_figure().savefig(heatname)
    return x
Example #7
0
def computeMean(a, thres):
    vectorizer = CountVectorizer()

    bag = vectorizer.fit_transform(a)
    bag = bag.toarray()
    mean_ = np.mean(bag, axis=0)

    # Return the words whose mean count across documents exceeds the threshold.
    return vectorizer.inverse_transform(np.array([mean_ > thres], dtype=int))
Example #8
0
    def _identify_topics(self, strategy='average', max_d=0.75):
        """
        Group keyphrases into topics using the Hierarchical Agglomerative Clustering (HAC) algorithm.
        :param strategy: linkage strategy supported by scipy.cluster.hierarchy.linkage
        :param max_d: max distance for cluster identification using the distance criterion in scipy.cluster.hierarchy.fcluster
        :return: None
        """
        # use term freq to convert phrases to vectors for clustering
        count = CountVectorizer()
        bag = count.fit_transform(list(self.phrases.keys()))

        # apply HAC
        Z = linkage(bag.toarray(), strategy)
        c, coph_dists = cophenet(Z, pdist(bag.toarray()))
        if c < 0.8:
            logger.warning("Cophenetic distances {} < 0.8".format(c))

        # identify clusters
        clusters = fcluster(Z, max_d, criterion='distance')
        cluster_data = defaultdict(list)
        for n, cluster in enumerate(clusters):
            inv = count.inverse_transform(bag.toarray()[n])[0]
            cluster_data[cluster].append(' '.join(sorted(str(i) for i in inv)))
        logger.debug('Found {} keyphrase clusters (topics)'.format(
            len(cluster_data)))
        topic_clusters = [frozenset(i) for i in cluster_data.values()]
        # apply pagerank to find most prominent topics
        # Sergey Brin and Lawrence Page. 1998.
        # The Anatomy of a Large - Scale Hypertextual Web Search Engine.
        # Computer Networks and ISDN Systems 30(1): 107–117
        topic_graph = nx.Graph()
        topic_graph.add_weighted_edges_from([(v, u, self.calc_distance(v, u))
                                             for v in topic_clusters
                                             for u in topic_clusters
                                             if u != v])
        pr = nx.pagerank(topic_graph, weight='weight')

        # sort topic by rank
        self.topics = sorted([(b, list(a)) for a, b in pr.items()],
                             reverse=True)
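The topic graph above is weighted with a calc_distance method that is not shown. A plausible sketch, assuming it returns the Jaccard distance between two keyphrase clusters (frozensets of strings); this is an assumption, not the original implementation:

    def calc_distance(self, a, b):
        # assumed: Jaccard distance between two frozensets of keyphrases
        union = a | b
        if not union:
            return 0.0
        return 1.0 - len(a & b) / len(union)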
Example #9
0
    def cleanup(self, phrase):
        print(phrase)
        from sklearn.feature_extraction.text import CountVectorizer
        count_vect = CountVectorizer(stop_words='english')
        print("Count Vector process started.. ")
        vectr = count_vect.fit_transform([phrase])
        #print(vectr)
        #print(count_vect.vocabulary_)
        #print(count_vect.stop_words_)
        self.result_val = " ".join(count_vect.inverse_transform(vectr)[0])
Example #10
0
def NgramCollocationFinder(words, n, support=10, topK=200): 
    uni_vect = CountVectorizer(ngram_range=(1,1), stop_words=('english'))
    n_vect = CountVectorizer(ngram_range=(n,n), stop_words=('english'))
    X_uni = uni_vect.fit_transform([' '.join(words)])
    X_n = n_vect.fit_transform([' '.join(words)])
    ngrams = zip(n_vect.inverse_transform(X_n)[0], X_n.A[0])
    ngrams = (t for t in ngrams if t[1]>=support)
    unigrams = zip(uni_vect.inverse_transform(X_uni)[0], X_uni.A[0])
    unigrams = (t for t in unigrams if t[1]>=support)
    
    ngram_freq_total = 0
    Ngrams = []
    for t in ngrams:
        ngram_freq_total += t[1]
        Ngrams.append(t)
        #print t

    unigram_freq_total = 0
    Unigrams = []
    for t in unigrams:
        #print t
        unigram_freq_total += t[1]
        Unigrams.append(t)

 
    for i in range(len(Ngrams)):
        I_nominator = math.log((1.0*Ngrams[i][1])/ngram_freq_total, 2)
        count=0
        I_denominator = 0
        for w in Unigrams:
            if count==n:
                count = 0
                break
            if w[0] in Ngrams[i][0]:
                count = count+1
                I_denominator += math.log((1.0*w[1])/unigram_freq_total, 2) 
        Ngrams[i] = Ngrams[i] + (I_nominator-I_denominator,)

    Ngrams = sorted(Ngrams, key=lambda x: x[2], reverse=True)
    
    return Ngrams
Example #11
0
def avg_w2v(w2v, X_train, X_valid, min_df, max_f):
    # Build vocab from training data
    vect_w2v = CountVectorizer(min_df=min_df,
                               max_features=max_f,
                               vocabulary=w2v.index2word)
    vect_w2v.fit(X_train)

    X_train_docs = vect_w2v.inverse_transform(
        vect_w2v.transform(X_train))  # X_train_docs[i].shape = (doc_i_len,)
    X_valid_docs = vect_w2v.inverse_transform(vect_w2v.transform(X_valid))

    # Convert doc to embedding matrix
    # w2v[X_train_docs[i]].shape = (doc_i_len, 200)
    X_train_vec = np.vstack([
        np.mean(w2v[doc], axis=0) for doc in X_train_docs
    ])  # X_train_vec.shape=(6272, 200)
    X_valid_vec = np.vstack([
        np.mean(w2v[doc], axis=0) for doc in X_valid_docs
    ])  # X_valid_vec.shape=(784, 200)

    return X_train_vec, X_valid_vec
Example #12
0
def NgramCollocationFinder(words, n, support=10, topK=200):
    uni_vect = CountVectorizer(ngram_range=(1, 1), stop_words=('english'))
    n_vect = CountVectorizer(ngram_range=(n, n), stop_words=('english'))
    X_uni = uni_vect.fit_transform([' '.join(words)])
    X_n = n_vect.fit_transform([' '.join(words)])
    ngrams = zip(n_vect.inverse_transform(X_n)[0], X_n.A[0])
    ngrams = (t for t in ngrams if t[1] >= support)
    unigrams = zip(uni_vect.inverse_transform(X_uni)[0], X_uni.A[0])
    unigrams = (t for t in unigrams if t[1] >= support)

    ngram_freq_total = 0
    Ngrams = []
    for t in ngrams:
        ngram_freq_total += t[1]
        Ngrams.append(t)
        #print t

    unigram_freq_total = 0
    Unigrams = []
    for t in unigrams:
        #print t
        unigram_freq_total += t[1]
        Unigrams.append(t)

    for i in range(len(Ngrams)):
        I_nominator = math.log((1.0 * Ngrams[i][1]) / ngram_freq_total, 2)
        count = 0
        I_denominator = 0
        for w in Unigrams:
            if count == n:
                count = 0
                break
            if w[0] in Ngrams[i][0]:
                count = count + 1
                I_denominator += math.log((1.0 * w[1]) / unigram_freq_total, 2)
        Ngrams[i] = Ngrams[i] + (I_nominator - I_denominator, )

    Ngrams = sorted(Ngrams, key=lambda x: x[2], reverse=True)

    return Ngrams
def preprocess(train, test):
    # sklearn has default English stop word
    vectorizer = CountVectorizer(stop_words='english')
    # this may take a while
    tmp = vectorizer.fit_transform(train)
    tmp2 = vectorizer.transform(test)
    # return comment without stop words, alternative is using the tfidf_transformer
    tmp = vectorizer.inverse_transform(tmp)
    tmp2 = vectorizer.inverse_transform(tmp2)
    # combine array to string/ sentence
    noStopWord = []
    noStopWord2 = []
    for j in tmp:
        noStopWord.append(' '.join(j))

    for j in tmp2:
        noStopWord2.append(' '.join(j))

    tfidf = TfidfVectorizer(stop_words='english')
    word_tf = tfidf.fit_transform(train)
    word_tf2 = tfidf.transform(test)
    return word_tf, word_tf2
def tokenn(sentence):
    def filter_words(word_list):
        useful_words = [w for w in word_list if w not in sw]
        return useful_words

    def myTokenizer(sentence):
        words = tokenizer.tokenize(sentence.lower())
        return filter_words(words)

    cv = CountVectorizer(tokenizer=myTokenizer, ngram_range=(1, 4))
    vectorized_corpus = cv.fit_transform(sentence)
    vc = vectorized_corpus.toarray()
    fl = list(cv.inverse_transform(vc))
    return fl
def test_vectorizer_inverse_transform():
    vectorizer = CountVectorizer()
    transformed_data = vectorizer.fit_transform(DOCS_GPU)
    inversed_data = vectorizer.inverse_transform(transformed_data)

    sk_vectorizer = SkCountVect()
    sk_transformed_data = sk_vectorizer.fit_transform(DOCS)
    sk_inversed_data = sk_vectorizer.inverse_transform(sk_transformed_data)

    for doc, sk_doc in zip(inversed_data, sk_inversed_data):
        doc = np.sort(doc.to_arrow().to_pylist())
        sk_doc = np.sort(sk_doc)
        if len(doc) + len(sk_doc) == 0:
            continue
        assert_array_equal(doc, sk_doc)
Example #16
0
def trainModel():
    data = []
    dataLabels = []

    positive = open("positive.txt", "r", encoding="utf-8")
    negative = open("negative.txt", "r", encoding="utf-8")
    neutral = open("neutral.txt", "r", encoding="utf-8")
    for line in positive:
        data.append(line.rstrip())
        dataLabels.append('pos')
    for line in negative:
        data.append(line.rstrip())
        dataLabels.append('neg')
#    for line in neutral:
#        data.append(line.rstrip())
#        dataLabels.append('neu')

    vectorizer = CountVectorizer(analyzer='word',
                                 tokenizer=tokenize,
                                 lowercase=False)
    encoder = LabelEncoder()
    x = vectorizer.fit_transform(data)
    xnd = x.toarray()
    y = encoder.fit_transform(dataLabels)
    X_train, X_test, y_train, y_test = train_test_split(xnd,
                                                        y,
                                                        train_size=0.80,
                                                        random_state=84230)

    mnb = MultinomialNB()
    mnb.fit(X_train, y_train)
    y_pred = mnb.predict(X_test)
    y_predicted_labels = encoder.inverse_transform(y_pred)
    y_test_actual = encoder.inverse_transform(y_test)
    x_test_maps = vectorizer.inverse_transform(X_train)

    predictFile = open("predictions.txt", "w", encoding="utf-8")
    for i in range(len(y_predicted_labels)):
        ind = xnd.tolist().index(X_test[i].tolist())
        predictFile.write(
            str(y_predicted_labels[i]) + " - " + str(data[ind].strip()) + "\n")
    print(accuracy_score(y_test, y_pred))
    predictFile.close()

    joblib.dump(mnb, "sentiModel.pkl")
    pickle.dump(vectorizer, open("vector.pkl", "wb"))
    pickle.dump(encoder, open("encoder.pkl", "wb"))
    return
Example #17
0
    def model_fit(self, Xtrain_df, Ytrain_df):
        # Use CountVectorizer to compute word counts, then TF-IDF (via TfidfTransformer) to weight the features.
        import warnings
        from sklearn.feature_extraction.text import CountVectorizer
        from sklearn.feature_extraction.text import TfidfTransformer
        from sklearn.naive_bayes import MultinomialNB
        import pickle   
        import os

        warnings.simplefilter(action='ignore', category=FutureWarning)

        count_vect = CountVectorizer(stop_words='english')
        print("Count Vector process started.. ")
        X_train_count = count_vect.fit_transform(Xtrain_df)
        print(count_vect.inverse_transform(X_train_count))
        print("\n Count Vector process completed.. ")

        tfidf_weight = TfidfTransformer()
        X_train_weight = tfidf_weight.fit_transform(X_train_count)
        print("\n IDF process completed.. ")
        print(X_train_weight.shape)

        # Check.. Shape of the X & Y axis

        print(X_train_weight.shape)
        print(Ytrain_df.shape)

        print("Fitting the model.. Multinomial NB")

        clf = MultinomialNB()
        clf.fit(X_train_weight, Ytrain_df)

        print("Model trained.. Let's pickle the model in and see some metrics.")

        filename = 'model_loc/multiNB_countvec.pkl'
        pickle.dump(count_vect, open(filename, 'wb'))

        filename = 'model_loc/multiNB.pkl'
        pickle.dump(clf, open(filename, 'wb'))

        print("model saved..")

        from sklearn.metrics import accuracy_score

        print(clf.score(X_train_weight, Ytrain_df))
Example #18
0
def popularity_vectorize():
    '''
    INPUT: None
    OUTPUT:
        - vector: CountVectorizer fit to the review corpus
        - pop_dict: dictionary mapping vocabulary elements to popularity rank
        - pop_list: list of popularity vectors, one per review in the corpus
    '''
    # import the review corpus
    movies, corpus, labels = load_data()

    # create a CountVectorizer() instance and fit/transform the corpus
    vector = CountVectorizer(ngram_range=(1, 2))
    corpus_vectors = vector.fit_transform(corpus)

    # Calculate the frequency for all elements in the vocabulary
    frequency = np.array(corpus_vectors.sum(axis=0))[0]

    # get the vocab indices in descending order of frequency
    top_indices = np.argsort(-frequency)

    # create list of all vocabulary elements
    feature_names = np.array(vector.get_feature_names())

    # create list of words where order corresponds to popularity
    words = feature_names[top_indices]

    # create the vocab-popularity map dictionary
    pop_dict = {}
    for n, word in enumerate(words):
        pop_dict[word] = n + 1

    # inverse transform the corpus_vectors into lists of vocab elements
    corpus_words = vector.inverse_transform(corpus_vectors)

    # create the list of popularity vectors for each review in the corpus
    # using the popularity dictionary
    pop_list = []
    for review in corpus_words:
        review_nums = []
        for word in review:
            review_nums.append(pop_dict[word])
        pop_list.append(review_nums)
    return (vector, pop_dict, pop_list)
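A brief usage sketch for popularity_vectorize, assuming the load_data helper used above is available, showing how the returned vectorizer and popularity map could be applied to a new review (the review text is illustrative; terms outside the fitted vocabulary are simply dropped by transform):

vector, pop_dict, pop_list = popularity_vectorize()

new_review = "a surprisingly good movie"
new_vec = vector.transform([new_review])
new_words = vector.inverse_transform(new_vec)[0]
new_ranks = [pop_dict[w] for w in new_words]
print(new_ranks)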
Example #19
0
def count_vectorizer_usercase():
    x = ["some say the world end in fire", "some say in ice"]

    vectorizer = CountVectorizer()
    vectorizer.fit(x)

    print(vectorizer.vocabulary_)

    x_bag_of_words = vectorizer.transform(x)

    print(x_bag_of_words)

    print(x_bag_of_words.shape)

    print(x_bag_of_words.toarray())

    print(vectorizer.get_feature_names())

    print(vectorizer.inverse_transform(x_bag_of_words))
Example #20
0
def test_countvectorizer_custom_vocabulary():
    vocab = {"pizza": 0, "beer": 1}
    terms = set(vocab.keys())

    # Try a few of the supported types.
    for typ in [dict, list, iter, partial(defaultdict, int)]:
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        vect.fit(JUNK_FOOD_DOCS)
        if isinstance(v, Mapping):
            assert vect.vocabulary_ == vocab
        else:
            assert set(vect.vocabulary_) == terms
        X = vect.transform(JUNK_FOOD_DOCS)
        assert X.shape[1] == len(terms)
        v = typ(vocab)
        vect = CountVectorizer(vocabulary=v)
        inv = vect.inverse_transform(X)
        assert len(inv) == X.shape[0]
Example #21
0
def plot_frequency_map(df,
                       settings,
                       target_events=['lost', 'passed'],
                       plot_name=None):
    users = df.user_pseudo_id[df.event_name.isin(target_events)].unique()
    df = df[df.user_pseudo_id.isin(users)]
    data = prepare_dataset(df, '')
    cv = CountVectorizer()
    x = cv.fit_transform(data.event_name.values).todense()
    cols = cv.inverse_transform(
        np.ones(df.event_name.nunique() - len(target_events)))[0]
    x = pd.DataFrame(x, columns=cols, index=data.user_pseudo_id)
    nodes_hist = df.groupby(
        'event_name',
        as_index=False).event_timestamp.count().sort_values('event_timestamp',
                                                            ascending=False)
    nodes_hist.event_name = nodes_hist.event_name.apply(lambda x: x.lower())
    sorted_cols = nodes_hist.event_name[~nodes_hist.event_name.
                                        isin(target_events)].values
    x = x.loc[:, sorted_cols]
    sns.mpl.pyplot.figure(figsize=[8, 5])
    bar = sns.barplot(nodes_hist.event_name.values,
                      nodes_hist.event_timestamp.values,
                      palette='YlGnBu')
    bar.set_xticklabels(bar.get_xticklabels(), rotation=90)

    settings = check_folder(settings)
    export_folder = settings['export_folder']
    if plot_name:
        barname = os.path.join(export_folder, 'bar_{}.png'.format(plot_name))
        heatname = os.path.join(export_folder,
                                'countmap_{}.png'.format(plot_name))
    else:
        barname = os.path.join(export_folder,
                               'bar_{}.png'.format(datetime.now()))
        heatname = os.path.join(export_folder,
                                'countmap_{}.png'.format(datetime.now()))
    bar.get_figure().savefig(barname)
    sns.mpl.pyplot.figure(figsize=[10, 15])
    heatmap = sns.heatmap(x.values, cmap="YlGnBu")
    heatmap.get_figure().savefig(heatname)
    return x
Example #22
0
def run_counts():
    vectorizer = CountVectorizer(ngram_range=(1,MAX_N_GRAMS), min_df=1)
    ngrams_fit = vectorizer.fit_transform(sys.stdin)
    # print ngrams.get_feature_names()
    # print ngrams.vocabulary_.get('nop nop')

    ngrams = []
    for i in range(MAX_N_GRAMS):
        ngrams.append(defaultdict(int))
    for gram, count in zip(vectorizer.inverse_transform(ngrams_fit)[0], ngrams_fit.A[0]):
        ngrams[len(gram.split())-1][gram] = count

    for n in ngrams:
        total = 0
        for count in n.itervalues():
            total += count
        for gram in n:
            n[gram] /= float(total)

    pickle.dump(ngrams, sys.stdout)
Example #23
0
    def action(self, tweets_list):

        corpus = []
        for tweet in tweets_list:
            # Deletes the accents
            tweet_str = tweet["text"].encode("utf-8")
            tweet_str = unicode(tweet_str, 'utf-8')
            tweet_str = unicodedata.normalize('NFD', tweet_str).encode(
                'ascii', 'ignore')
            corpus.append(tweet_str)
            self.corpus_tot.append(tweet_str)

        V = v.vectorize_(corpus)
        V = V.toarray()

        # Adding the new pack of tweets to the old clusters
        A = birch.clf_add_data(self.brc, V)
        self.labels = np.concatenate([self.labels, A])

        print "Nb tweets: %d" % len(self.corpus_tot)
        print "Nb clusters: %d" % len(birch.clf_cluster_centers(self.brc))

        clusters = [0] * len(birch.clf_cluster_centers(self.brc))
        for i in range(len(self.labels)):
            clusters[int(self.labels[i])] = clusters[int(self.labels[i])] + 1

        print "Clusters distribution:"
        print clusters

        C = birch.clf_cluster_centers(self.brc)

        for i in range(len(C)):
            if clusters[i] > 15:
                print "CLUSTER %d" % i
                P = CountVectorizer.inverse_transform(self.vectorizer, C[i])
                for j in range(len(P[0])):
                    print P[0][j],
                    print ",",
                print " "
        print " "
Example #24
0
def clean(corpus, th =0.05):
    th=0.3
    
    print corpus[0]
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    vectorizer = CountVectorizer(token_pattern=r"\b[a-z]\w+\b")
    print 'fitting...'
#     corpus = ["The sky is a blue a22 22a 11 22-33"]
    X=vectorizer.fit_transform(corpus)
    dic=vectorizer.get_feature_names()
#     for i in range(10000):
#         print i,dic[i]
    
    print '#words ',len(vectorizer.get_feature_names())
    print 'computing tfidf ...'
    from sklearn.feature_extraction.text import TfidfTransformer
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(X)
    print tfidf[0]
    exit(1)
    
    
    print 'nnz before cleaning',X.nnz
    nnz= tfidf.nonzero()
    print 'cleaning...'
    for i,j in zip(nnz[0],nnz[1]):
        if tfidf[i,j]<th:
            X[i,j]=0
    X.eliminate_zeros()
    print 'nnz after cleaning',X.nnz
    print 'extracting deleted words...'
    nnz2= X.nonzero()
    import collections
    row=  collections.Counter(nnz[0]).items()
    col=  collections.Counter(nnz[1]).items()
    row2=  collections.Counter(nnz2[0]).items()
    col2=  collections.Counter(nnz2[1]).items()
    print len(col),len(col2)
    print len(row),len(row2)
    for k,v in col2:
        if not v:
            print 'row has deleted'
    d=[]
    for k,v in col:
        if not v:
            print 'col has deleted',k
            d.append(k)
    print '# wordsto be deleted',len(d)
    exit(1)
#         print i,j ,tfidf[i,j]
    import pylab as P
    n, bins, patches = P.hist(t, 50, normed=1, histtype='stepfilled')
    P.figure()
    n, bins, patches = P.hist(t2, 50, normed=1, histtype='stepfilled')
    P.show()
    print len(t), len(t2)
    C=vectorizer.inverse_transform(X2)
    X=vectorizer.fit_transform(C)
    dic=vectorizer.get_feature_names()
    print len(vectorizer.get_feature_names())
    print 'done'
    exit(1)
    return 1
Example #25
0
# <codecell>

venue_vec = CountVectorizer(stop_words="english", min_df=0.001)
desc_vec = CountVectorizer(stop_words="english", min_df=0.001)

venue_data = venue_vec.fit_transform(data.Venue)
desc_data = desc_vec.fit_transform(data.Description)

X = sparse.hstack((venue_data, desc_data))
print X.shape

# <codecell>

print data.Venue[3]
venue_vec.inverse_transform(venue_data)[:10]

# <codecell>

CountVectorizer?

# <headingcell level=1>

# Label Encoding

# <markdowncell>

# We need to encode the strings for our categories as numbers

# <codecell>
Example #26
0
X_bag_of_words = vectorizer.transform(X)

# %%
X_bag_of_words.shape

# %%
X_bag_of_words

# %%
X_bag_of_words.toarray()

# %%
vectorizer.get_feature_names()

# %%
vectorizer.inverse_transform(X_bag_of_words)

# %% [markdown]
# # tf-idf Encoding
# A useful transformation that is often applied to the bag-of-word encoding is the so-called term-frequency inverse-document-frequency (tf-idf) scaling, which is a non-linear transformation of the word counts.
#
# The tf-idf encoding rescales words that are common to have less weight:

# %%
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(X)
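# %% [markdown]
# A minimal follow-on sketch, assuming X is the same small text corpus fitted above: transform the
# documents and note that `inverse_transform` recovers each document's non-zero terms, not their weights.

# %%
X_tfidf = tfidf_vectorizer.transform(X)
print(X_tfidf.toarray())
print(tfidf_vectorizer.inverse_transform(X_tfidf))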

# %%
import numpy as np
npindL = np.array(indL)
#print "top50 most populated clusters, down to size", max(10, int(X.shape[0]*0.0025))
freq_th = max(10, int(X.shape[0]*0.0025))
cluster_score = {}
for clfreq in freqTwCl.most_common(50):
    cl = clfreq[0]
    freq = clfreq[1]
    cluster_score[cl] = 0
    if freq >= freq_th:
        #print "\n(cluster, freq):", clfreq
        clidx = (npindL == cl).nonzero()[0].tolist()
        cluster_centroid = X[clidx].sum(axis=0)
        #print "centroid_array:", cluster_centroid
        try:
            #orig_tweet = window_corpus[map_index_after_cleaning[i]].decode("utf-8")
            cluster_tweet = vectorizer.inverse_transform(cluster_centroid)
            #print orig_tweet, cluster_tweet, urls_window_corpus[map_index_after_cleaning[i]]
            #print orig_tweet
            #print "centroid_tweet:", cluster_tweet
            for term in np.nditer(cluster_tweet):
                #print "term:", term#, wdfVoc[term]
                try:
                    cluster_score[cl] = max(cluster_score[cl], boosted_wdfVoc[str(term).strip()])
                    #cluster_score[cl] += wdfVoc[str(term).strip()] * boost_entity[str(term)] #* boost_term_in_article[str(term)]
                    #cluster_score[cl] = max(cluster_score[cl], wdfVoc[str(term).strip()] * boost_term_in_article[str(term)])
                    #cluster_score[cl] = max(cluster_score[cl], wdfVoc[str(term).strip()] * boost_entity[str(term)])
                    #cluster_score[cl] = max(cluster_score[cl], wdfVoc[str(term).strip()] * boost_entity[str(term)] * boost_term_in_article[str(term)])
                except: pass
        except: pass
        cluster_score[cl] /= freq
    else:
        break
def cluster_tweets(tweets,clusters):
    print "Clustering tweets..."
    vectorizer = CountVectorizer(ngram_range=(1,10),min_df = 2)
    tweet_word_matrix = vectorizer.fit_transform([i['text'] for i in tweets])
    clf = KMeans(n_clusters=clusters)
    return vectorizer.inverse_transform(tweet_word_matrix), clf.fit_predict(tweet_word_matrix)
Example #29
0
        haha[new] += 1
    else:
        haha[new] = 1
# print(haha)

feature_doc = selector.fit_transform(feature_doc, target_doc)
train_x, test_x, train_y, test_y = train_test_split(feature_doc,
                                                    target_doc,
                                                    test_size=0.9)

clf4 = LogisticRegression(C=20.0, class_weight='balanced')
clf4.fit(train_x, train_y)
preds = clf4.predict(test_x)
print(classification_report(test_y, preds, digits=4))

test_x = vec.inverse_transform(test_x)
preds = label.inverse_transform(preds)
test_y = label.inverse_transform(test_y)
power = []
for item in test_x:
    joined = ','.join(item)  # avoid shadowing the built-in str
    power.append(joined)
data = pd.DataFrame(power)
data.columns = ['能力']
data['预测职业'] = pd.DataFrame(preds)
data['真实职业'] = pd.DataFrame(test_y)
data[['能力', '预测职业', '真实职业']].to_csv('positionresult.csv', index=None)
# data[['x','y_preds','y_true']].to_csv('positionresult.csv',index=None)
# print(power)
# print(preds)
Example #30
0
#  [0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]
#  ...,
#  [0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]
#  [0 0 0 ..., 0 0 0]]

# After calling .toarray(), is it a scipy.sparse matrix? False

doc_bag = count.fit_transform(example_doc).toarray()

print("[most frequent vocabularies]")
bag_cnts = np.sum(doc_bag, axis=0)
top = 10
# [::-1] reverses a list since sort is in ascending order
for tok, v in zip(count.inverse_transform(np.ones(bag_cnts.shape[0]))[0][bag_cnts.argsort()[::-1][:top]], 
                  np.sort(bag_cnts)[::-1][:top]):
    print('%s: %d' % (tok, v))

# [most frequent vocabularies]
# 蟋蟀: 98
# 可以: 21
# 就是: 21
# 聲音: 20
# 這樣: 19
# 你們: 17
# 真的: 16
# 還有: 15
# 比較: 15
# 豆油伯: 15
Example #31
0
def main(argv):
	test = ['klteYv1Uv9A_27_33.avi', '5YJaS2Eswg0_22_26.avi', 'UbmZAe5u5FI_132_141.avi', 'JntMAcTlOF0_50_70.avi', 'tJHUH9tpqPg_113_118.avi']
	test_npy = ['klteYv1Uv9A_27_33.avi.npy', '5YJaS2Eswg0_22_26.avi.npy', 'UbmZAe5u5FI_132_141.avi.npy', 'JntMAcTlOF0_50_70.avi.npy', 'tJHUH9tpqPg_113_118.avi.npy']
	#peer~~`
	directory = argv[0] + 'peer_review/feat/'
	peer_filename_list = []
	peer_feature_list = []
	for filename in os.listdir(directory):
		if filename.endswith(".npy"):
			#print(os.path.join(directory, filename))
			peer_feature_list.append(np.vstack((np.load(os.path.join(directory, filename)), np.zeros((50,4096)) ) ) )
			peer_filename_list.append(filename.replace('.npy',''))
		else:
			continue
	peer_feature_arr = np.array(peer_feature_list)


	#for filename in glob.glob(os.path.join(path, '*.txt')):
	directory = argv[0] + 'training_data/feat/'
	filename_list = []
	feature_list = []
	special = []
	for filename in os.listdir(directory):
		if filename.endswith(".npy"):
			#print(os.path.join(directory, filename))
			feature_list.append(np.vstack((np.load(os.path.join(directory, filename)), np.zeros((50,4096)) ) ) )
			filename_list.append(filename.replace('.npy',''))
			if filename == 'YmXCfQm0_CA_50_57.avi.npy':
				special.append(np.vstack((np.load(os.path.join(directory, filename)), np.zeros((50,4096)) ) ) )
		else:
			continue
	special = np.array(special)
	feature_arr = np.array(feature_list)
	directory = argv[0] + 'testing_data/feat/'
	test_feature_list = []
	test_file = []
	testing_npy = []
	for filename in os.listdir(directory):
		if filename.endswith(".npy"):
			#print(os.path.join(directory, filename))
			test_feature_list.append(np.vstack((np.load(os.path.join(directory, filename)), np.zeros((50,4096)) ) ) )
			test_file.append(filename.replace('.npy',''))
			if filename in test_npy:
				testing_npy.append(np.vstack((np.load(os.path.join(directory, filename)), np.zeros((50,4096)) ) ) )
		else:
			continue
	testing_npy = np.array(testing_npy)
	print('testing_npy : ', testing_npy.shape)		
	test_feature_list = np.array(test_feature_list)
	print('feature_arr.shape : ', test_feature_list.shape)
	print('len(filename_list) : ', len(test_file))
	features = (filename_list, feature_arr)
	with open(argv[0] + 'training_label.json') as label_file:
		label_data = json.load(label_file)
	#print(label_data[0]['id'])
	label = {}
	caption_all = []
	for i in range(len(label_data)):
		caption_all.append(label_data[i]['caption'])
		#label[label_data[i]['id']] = label_data[i]['caption']
	print(len(caption_all))
	#print(label[filename_list[0]])
	vectorizer = CountVectorizer(tokenizer=TB().tokenize, min_df = 0.00005)
	vectorizer.fit(list(itertools.chain.from_iterable(caption_all)))
	print(vectorizer.transform(['we']))
	#print(vectorizer.vocabulary_)
	#inv_map = {v: k for k, v in vectorizer.vocabulary_.items()}
	
	inverse = [(value, key) for key, value in vectorizer.vocabulary_.items()]
	max_voc_size = max(inverse)[0]+2
	vectorizer.vocabulary_["unknown"] = max_voc_size -1
	inv_map = {v: k for k, v in vectorizer.vocabulary_.items()}
	print(inverse[0])
	print(max_voc_size)
	print('inv_map 0 :', inv_map[0])
	buf_max = 0
	for i in range(len(label_data)):#len(label_data)
		label_buf = []
		seq_len_buf = []
		for j in range(len(label_data[i]['caption'])):
			dot_buf = label_data[i]['caption'][j].replace('.','')
			buf = TB().tokenize(dot_buf)
			for k in range(len(buf)):
				if not buf[k] in vectorizer.stop_words_:
					buf[k] = buf[k]
				else:
					buf[k] = "unknown"
			
			buf_max = max_value(buf_max, len(buf))
			arr = (vectorizer.transform(buf)).nonzero()[1]
			arr1 = np.append(np.array([max_voc_size]*80), arr)
			seq_len_buf.append(arr1.shape[0])
			#print(arr.shape)
			label_buf.append(np.append(arr1, np.array([max_voc_size]*(50-arr.shape[0]))))
		#print(np.array(label_buf).shape)
		label[label_data[i]['id']] = (np.array(label_buf), seq_len_buf)
	print('filename_list[0] : ', filename_list[0])
	print('len(label[filename_list[0]]) : ', label[filename_list[0]][0].shape)
	print(buf_max)
	#print(vectorizer.stop_words_)
	#print(find(vectorizer.transform(label[filename_list[0]][0]))[1])
	print(vectorizer.inverse_transform(vectorizer.transform(['.']))[0])
	#print(vectorizer.inverse_transform(vectorizer.transform(['A'])[0].todense()[0]))
	print(label_buf[-1])
	for j in range(len(label_buf[-1])):
		if label_buf[-1][j] != max_voc_size and label_buf[-1][j] != max_voc_size+1:
			print(inv_map[label_buf[-1][j]], end=' ')
	print('\n')

	#tf.reset_default_graph()
	#with tf.variable_scope(tf.get_variable_scope()):
	model = Model(130, 4096, max_voc_size+1, False)
	
	saver = tf.train.Saver()
	#tf.reset_default_graph()
	with tf.Session() as sess:
		#a = sess.run(model.bs_embedding)
		#print(a)
		#saver = tf.train.Saver()
		#optimistic_restore(sess,  "model_new/model_new.ckpt")
		#graph = graph
		#model = Model(130, 4096, 6087, True)
		saver.restore(sess, "Model/model_dyn_final.ckpt")
		#b = sess.run(model.bs_embedding)
		#print(b)
		#if a == b :
		#	print('Fuckkkkkkkkkkk')
		#tf.reset_default_graph()
		#prediction = sess.run(model.prediction,feed_dict = {model.xs : test_feature_list[:64,:,:], model.batch_size : int((test_feature_list).shape[0])})
		prediction = sess.run(model.prediction,feed_dict = {model.for_training : False,model.xs : test_feature_list, model.batch_size : int((test_feature_list).shape[0]), model.bos : [max_voc_size+1]*int((test_feature_list).shape[0])})
		print(np.array(prediction))
		peer_prediction = sess.run(model.prediction,feed_dict = {model.for_training : False,model.xs : peer_feature_arr, model.batch_size : int((peer_feature_arr).shape[0]), model.bos : [max_voc_size+1]*int((peer_feature_arr).shape[0])})
		#tt = sess.run([model.prediction],feed_dict = {model.xs : test_feature_list, model.batch_size : int((test_feature_list).shape[0])})
	#for i in range(50):
	#	for j in range(50):
	#		print(prediction[i][j],end = ',')
	#	print('\n')
	file = open(argv[1],'w')
	for i in range(len(test_file)):
		file.write(test_file[i])
		file.write(',')
		buf = []
		for j in range(len(prediction[i])):
			#if prediction[i][j] != 6086 and prediction[i][j] != 12 and prediction[i][j] != 6087:
				#if j == 0:			
			if prediction[i][j] < max_voc_size: 	
				if j == 0:
					file.write(inv_map[prediction[i][j]])
					file.write(' ')
					buf.append(inv_map[prediction[i][j]])
				elif inv_map[prediction[i][j]] != buf[-1] and prediction[i][j]!=max_voc_size-1:
					file.write(inv_map[prediction[i][j]])
					buf.append(inv_map[prediction[i][j]])
					file.write(' ')
		file.write('\n')

	file.close()

	file = open(argv[2],'w')
	for i in range(len(peer_filename_list)):
		file.write(peer_filename_list[i])
		file.write(',')
		buf = []
		for j in range(len(peer_prediction[i])):
			#if prediction[i][j] != 6086 and prediction[i][j] != 12 and prediction[i][j] != 6087:
				#if j == 0:			
			if peer_prediction[i][j] < max_voc_size:	
				if j == 0:
					file.write(inv_map[peer_prediction[i][j]])
					file.write(' ')
					buf.append(inv_map[peer_prediction[i][j]])
				elif inv_map[peer_prediction[i][j]] != buf[-1] and peer_prediction[i][j]!=max_voc_size-1:
					file.write(inv_map[peer_prediction[i][j]])
					buf.append(inv_map[peer_prediction[i][j]])
					file.write(' ')
		file.write('\n')

	file.close()
df = pd.DataFrame(email_list)

df['label'] = df['spam'].map({True: 1, False: 0})
df_x = df['title']
df_y = df['label']
print("** 데이터 **\n", df)

cv = CountVectorizer(binary=True)
x_traincv = cv.fit_transform(df_x)
print('\n** Transformed data representation (emails 1-2) **\n', x_traincv[0:2])

encoded_input = x_traincv.toarray()
print('\n** Binary vector representation **\n', encoded_input)

print('\n** Words corresponding to each vector position **\n', cv.get_feature_names())
print('\n** Words corresponding to the first vector **\n', cv.inverse_transform(encoded_input[0]))

Bnb = BernoulliNB()
y_train = df_y.astype('int')
Bnb.fit(x_traincv, y_train)

test_email_list = [{
    'title': 'free game only today',
    'spam': True
}, {
    'title': 'cheapest flight deal',
    'spam': True
}, {
    'title': 'limited time offer only today only today',
    'spam': True
}, {
Example #33
0
class NewsDataset:
    def __init__(self,
                 n_samples=None,
                 shuffle=True,
                 random_state=0,
                 train_size=0.8):
        input_directory = "../../data/20_news_groups"
        vocabulary_path = os.path.join(input_directory, 'vocabulary.txt')
        with open(vocabulary_path, 'r') as input_voc_stream:
            vocab = []
            for line in input_voc_stream:
                vocab.append(line.strip().lower().split()[0])
        self.vocabulary = list(set(vocab))
        self.shuffle = shuffle
        self.random_state = random_state
        self.train_size = train_size

        dataset = fetch_20newsgroups(shuffle=shuffle,
                                     random_state=self.random_state,
                                     remove=('headers', 'footers', 'quotes'))
        if n_samples is None:
            self.n_samples = len(dataset.target)
        else:
            self.n_samples = n_samples

        self.categories = dataset.target[:n_samples]
        self.categories_names = dataset.target_names[:n_samples]
        self.raw_samples = dataset.data[:n_samples]
        self.targets = dataset.target[:n_samples]
        self.target_names = dataset.target_names[:n_samples]

        self.vectorizer = CountVectorizer(max_df=0.95,
                                          min_df=2,
                                          vocabulary=vocab,
                                          preprocessor=self.preprocessor)
        self.X = self.vectorizer.fit_transform(self.raw_samples)
        self.X_train, self.X_test, self.y_train, self.y_test =\
            train_test_split(self.X, self.targets, train_size=self.train_size, random_state=self.random_state,
                             stratify=self.targets)
        self.docs = [
            " ".join(d) for d in self.vectorizer.inverse_transform(self.X)
        ]
        #print("Number documents", len(self.docs))
        #print("Number of unique words", len(self.vocabulary))
        skf = StratifiedKFold(n_splits=5, random_state=0)
        self.splits = skf.split(self.X, self.targets)
        self.doc_set_train = [
            " ".join(d)
            for d in self.vectorizer.inverse_transform(self.X_train)
        ]
        self.doc_set_test = [
            " ".join(d) for d in self.vectorizer.inverse_transform(self.X_test)
        ]
        self.doc_train_list = [
            d.tolist() for d in self.vectorizer.inverse_transform(self.X_train)
        ]
        self.doc_test_list = [
            d.tolist() for d in self.vectorizer.inverse_transform(self.X_test)
        ]

    @staticmethod
    def preprocessor(doc):
        doc = doc.lower()
        doc = re.sub(r'-', ' ', doc)
        doc = re.sub(r'[^a-z ]', '', doc)
        doc = re.sub(r' +', ' ', doc)
        return doc
Example #34
0
        'Documents can be automatically tokenized'
    ]

    # Create a count vectorizer
    print('Count vectorizer:')
    cv = CountVectorizer()

    vectorized_corpus = cv.fit_transform(corpus)
    print(vectorized_corpus.todense())

    print('CV Vocabulary:')
    print(cv.vocabulary_)

    # Perform an inverse transformation
    vector = [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1]
    print(cv.inverse_transform(vector))

    # Use a complete external tokenizer
    print('CV with external tokenizer:')
    cv = CountVectorizer(tokenizer=tokenizer)
    vectorized_corpus = cv.fit_transform(corpus)
    print(vectorized_corpus.todense())

    # Use an n-gram range equal to (1, 2)
    print('CV with n-gram range (1, 2):')
    cv = CountVectorizer(tokenizer=tokenizer, ngram_range=(1, 2))
    vectorized_corpus = cv.fit_transform(corpus)
    print(vectorized_corpus.todense())

    print('N-gram range (1,2) vocabulary:')
    print(cv.vocabulary_)
Example #35
0
                   header=None,
                   names=['label', 'message'])

df['label'] = df.label.map({'ham': 0, 'spam': 1})
df['message'] = df.message.map(lambda x: x.lower())
df['message'] = df.message.str.replace(r'[^\w\s]', '', regex=True)
df['message'] = df['message'].apply(word_tokenize)
df['message'] = df['message'].apply(lambda x: [stemmer.stem(y) for y in x])

# This converts the list of words into space-separated strings
df['message'] = df['message'].apply(lambda x: ' '.join(x))

print(type(df['message']))
count_vect = CountVectorizer()
counts = count_vect.fit_transform(df['message'])
transformer = TfidfTransformer().fit(counts)
counts = transformer.transform(counts)

X_train, X_test, y_train, y_test = train_test_split(counts,
                                                    df['label'],
                                                    test_size=0.1,
                                                    random_state=69)
model = MultinomialNB().fit(X_train, y_train)

predicted = model.predict(X_test)

org_test = count_vect.inverse_transform(X_test)

q = "\n".join(
    [' '.join(y) for x, y in enumerate(org_test) if predicted[x] == 1])
Example #36
0
# K - means clustering
nclusters = 30
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=nclusters, init='k-means++', random_state=2)
y_kmeans = kmeans.fit_predict(X)

# Automatic cluster name generation
centers = np.array(kmeans.cluster_centers_)  # cluster centroids
for i in range(0, nclusters):
    for j in range(0, 50):
        if centers[i, j] > 0.8:
            centers[i, j] = 1
        else:
            centers[i, j] = 0
clusternames = cv.inverse_transform(
    centers)  # reversing bag of words for centers
glue = ' '
for i in range(0, len(clusternames)):
    clusternames[i] = glue.join(clusternames[i])
    if (len(clusternames[i]) == 0):
        clusternames[i] = 'various'

# Final clustering results and evaluation
# Calculating orders per product category
norders = []
for i in range(0, nclusters):
    norders.append(np.count_nonzero(y_kmeans == i))
# Product category percentage of total orders
percorders = []
for i in range(0, nclusters):
    percorders.append(norders[i] / (len(dataset) * 0.01))
Example #37
0
#!/usr/bin/python3
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# suppose we have a series of (preprocessed) texts
data = ['ciao amico gatto', 'ciao amico cane', 'ciao ciao']

# vectorize them
text_vectorizer = CountVectorizer()
vectorized_data = text_vectorizer.fit_transform(data)

print('sparse representation')
print(vectorized_data)

print('dense representation')
print(vectorized_data.todense())

print('from the vectors back to the text')
print(text_vectorizer.inverse_transform(vectorized_data))
class TopicModelling:
    """
    An abstract class to run topic modelling on a dataset and extract the most recurrent topics. \n
    THIS CLASS CANNOT BE INSTANTIATED DIRECTLY. \n
    To run topic modelling, choose a model between LSA and LDA, then run it by instantiating an LSA or LDA object. \n
    Do not forget to run 'pip install -r requirements.txt' to avoid any missing-package errors \n
    """
    def __init__(self,
                 folder,
                 filename,
                 columns,
                 n_topics,
                 sample_only=True,
                 display=False,
                 report=False,
                 cluster=False):
        """
        Initialize the Topic Modelling object \n

        `folder`: the path of the folder containing the data \n
        `filename`: the name of the file to analyse \n
        `columns`: a list with the name of the column to analyse inside the file, ['all'] to include all the dataset \n
        `n_topics`: the number of topics to extract \n
        `sample_only`: to analyze only a sample of the dataset in case of lack of computational power (limits to
        10K observations) \n
        `display`: to display each plot in a new window \n
        `report`: to generate a pdf report containing all the output and graphs of the topic modelling analysis \n
        NOTE : The reports are located in the folder containing the data, inside the folder "grand_debat_reports" \n
        `cluster`: to create a cluster of the topics --> computationally intensive and very long \n
        `inference`: used to analyse the dataset of all users \n
        """

        self.folder = folder
        self.filename = filename
        self.columns = columns

        now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S_")

        self.logname = now + filename[:len(filename) - 4]

        if self.columns == ['all']:
            self.logname += '_all'
        else:
            all_columns = list(
                pd.read_csv(os.path.join(self.folder, self.filename),
                            nrows=1).columns)
            col_indices = ""
            for colname in self.columns:
                if colname in all_columns:
                    self.logname += "_" + str(all_columns.index(colname))
                else:
                    warnings.warn(colname + " not in the dataset --> Ignoring")
                    self.columns.remove(colname)

        if not os.path.isdir(os.path.join(self.folder, "grand_debat_reports")):
            os.mkdir(os.path.join(self.folder, "grand_debat_reports"))

        self.logname = os.path.join(self.folder, "grand_debat_reports",
                                    self.logname + ".log")

        self.logger = logging.getLogger()
        self.logger.setLevel(logging.INFO)

        self.logger_file_handler = logging.FileHandler(filename=self.logname)
        self.logger_file_handler.setLevel(logging.INFO)
        formatter = logging.Formatter(
            fmt='%(asctime)s-%(msecs)d %(name)s %(levelname)s %(message)s \n',
            datefmt='%Y-%m-%d %H:%M:%S')
        self.logger_file_handler.setFormatter(formatter)
        self.logger.addHandler(self.logger_file_handler)

        self.report = report
        if self.report:
            report_name = self.logname[:len(self.logname) - 4] + "_report.pdf"
            self.pdf = PdfPages(report_name)

        if not os.path.isfile(os.path.join(folder, "clean_" + filename)):
            print("Cleaning file")
            self.logger.info("Cleaning file")
            if filename == "all_user_df.csv":
                dataCleaner = DataCleaner(folder,
                                          'all',
                                          force=False,
                                          user=True,
                                          lemma=False)
            else:
                dataCleaner = DataCleaner(folder,
                                          filename,
                                          force=True,
                                          user=False,
                                          lemma=False)
            dataCleaner.clean()
            print("File Cleaned")
            self.logger.info("File Cleaned")

        self.sample_only = sample_only

        if self.sample_only:
            self.df = pd.read_csv(os.path.join(folder, "clean_" + filename),
                                  nrows=1000)
        else:
            self.df = pd.read_csv(os.path.join(folder, "clean_" + filename))

        self.documents = self.initiate_documents(self.df, self.columns)

        if (self.documents is None) or self.documents.empty is True:
            raise ValueError('Data Not Loaded, check self.documents')

        print('Data Loaded : %d answers to analyse in file : %s' %
              (len(self.documents), filename))
        self.logger.info("Data Loaded : %d answers to analyse in file : %s" %
                         (len(self.documents), filename))

        print('Analysing column : ' + str(columns))
        self.logger.info('Analysing column : ' + str(columns))

        self.n_topics = n_topics

        self._display = display
        self._cluster = cluster

        if not self._display:
            plt.ioff()

        print('Preprocessing the data')
        self.logger.info('Preprocessing the data')
        self.preprocessing()
        print('Preprocessing Done')
        self.logger.info("Preprocessing Done")

        # Variable to be initiated later
        # self.model
        # self.topic_matrix
        # self.keys
        # self.count_vectorizer
        # self.document_term_matrix
        # self.categories
        # self.counts
        # self.top_words
        # self.mean_topic_vectors

    @staticmethod
    def initiate_documents(df, columns):
        """
        `df`: the dataframe \n
        `columns`: the columns to add \n
        `return` a list of documents \n
        """
        # initiating the list of documents
        # if multiple columns, the columns are concatenated

        all_columns = df.columns.tolist()
        if type(columns) is not list:
            raise ValueError('columns must be a list!')
        if columns == ['all']:
            columns_to_load = [
                colname for colname in all_columns
                if colname == 'title' or colname.startswith('QUXV')
            ]
        else:
            columns_to_load = columns
        for col in columns_to_load:
            if col not in all_columns:
                warnings.warn(col + ' is not in the documents --> ignored')
                columns_to_load.remove(col)
        documents = df[columns_to_load[0]].fillna('').map(str)
        for i in range(1, len(columns_to_load)):
            documents = documents + ' ' + df[columns_to_load[i]].fillna(
                '').map(str)
        return documents

    def preprocessing(self):
        """
        Initiate the document term matrix
        """
        self.count_vectorizer = CountVectorizer(stop_words='english')
        self.document_term_matrix = self.count_vectorizer.fit_transform(
            self.documents.astype('U'))

    def get_keys(self, topic_matrix):
        """
        returns an integer list of predicted topic categories for a given topic matrix \n
        For example : topic_matrix[0] = [0.04166669, 0.04166669, 0.04166669, 0.70833314, 0.04166669,
       0.0416667 , 0.0416667 , 0.0416667 ] means that the first document belongs to the topic 3 with the highest probability \n

        `topic_matrix`: a topic_matrix \n
        `return` an integer list \n
        """
        keys = []
        for i in range(topic_matrix.shape[0]):
            keys.append(topic_matrix[i].argmax())

        self.keys = keys

    def keys_to_counts(self):
        """
        the get_keys method has to be executed at least once to run this method \n

        `return` returns a tuple of topic categories and their accompanying magnitudes for the list of keys \n
        """
        count_pairs = Counter(self.keys).items()
        self.categories = [pair[0] for pair in count_pairs]
        self.counts = [pair[1] for pair in count_pairs]

    def get_top_n_words(self, n, keys, document_term_matrix, count_vectorizer):
        """
        `n`: number of top words to compute for each topic \n
        `keys`: an integer list obtained with the method get_keys \n
        `document_term_matrix`: a document/term matrix obtained with a CountVectorizer \n
        `count_vectorizer`: a CountVectorizer object used to create the document term matrix \n
        `return` returns a list of n_topic strings, where each string contains the n most common
        words in a predicted category, in order \n
        """
        top_word_indices = []
        for topic in range(self.n_topics):
            temp_vector_sum = 0
            found = False
            for i in range(len(keys)):
                if keys[i] == topic:
                    temp_vector_sum += document_term_matrix[i]
                    found = True
            if found:
                temp_vector_sum = temp_vector_sum.toarray()
                top_n_word_indices = np.flip(
                    np.argsort(temp_vector_sum)[0][-n:], 0)
                top_word_indices.append(top_n_word_indices)
            else:
                top_word_indices.append([])
        self.top_words = []
        for topic in top_word_indices:
            topic_words = []
            for index in topic:
                temp_word_vector = np.zeros((1, document_term_matrix.shape[1]))
                temp_word_vector[:, index] = 1
                the_word = count_vectorizer.inverse_transform(
                    temp_word_vector)[0][0]
                # encode('ascii') would crash on accented (e.g. French) words;
                # a plain str conversion keeps the word intact
                topic_words.append(str(the_word))
            self.top_words.append(" ".join(topic_words))
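
    # The one-hot inverse_transform trick above recovers a word from its column
    # index. A shorter alternative sketch, assuming scikit-learn >= 1.0 (older
    # releases expose get_feature_names() instead):
    #
    #     vocabulary = count_vectorizer.get_feature_names_out()
    #     the_word = vocabulary[index]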

    def display_top_n_word(self):
        """
        Print the top words of each topic and display a bar chart with the
        number of documents per topic \n
        """
        for i in range(len(self.top_words)):
            print("Topic {}: ".format(i), self.top_words[i])
            self.logger.info("Topic %d: %s" % (i, self.top_words[i]))

        labels = [
            'Topic {}: \n'.format(i) +
            ' '.join(self.top_words[i].split(' ')[0:2])
            for i in self.categories
        ]

        fig, ax = plt.subplots(figsize=(16, 8))
        ax.bar(self.categories, self.counts)
        ax.set_xticks(self.categories)
        ax.set_xticklabels(labels)
        ax.set_title('Topic Category Counts')

        if self.report:
            self.pdf.savefig(fig)

        if self._display:
            plt.show()
        else:
            plt.close(fig)

    @staticmethod
    def dimensional_reduction_tsne(topic_matrix):
        """
        Reduce the topic matrix to 2D with t-SNE so that the clusters can be
        displayed on a graph \n\n
        `topic_matrix`: a topic matrix \n
        `return` the 2D t-SNE vectors of the topic matrix \n
        """
        tsne_model = TSNE(n_components=2,
                          perplexity=50,
                          learning_rate=100,
                          n_iter=2000,
                          verbose=1,
                          random_state=0,
                          angle=0.75)
        tsne_vectors = tsne_model.fit_transform(topic_matrix)
        return tsne_vectors

    def get_mean_topic_vectors(self, keys, two_dim_vectors):
        """
        `keys`: a list giving the topic of each document \n
        `two_dim_vectors`: the 2D vectors produced by t-SNE \n
        `return` a list of centroid vectors, one per predicted topic category \n
        """
        mean_topic_vectors = []
        for t in range(self.n_topics):
            articles_in_that_topic = []
            for i in range(len(keys)):
                if keys[i] == t:
                    articles_in_that_topic.append(two_dim_vectors[i])

            articles_in_that_topic = np.vstack(articles_in_that_topic)
            mean_article_in_that_topic = np.mean(articles_in_that_topic,
                                                 axis=0)
            mean_topic_vectors.append(mean_article_in_that_topic)
        self.mean_topic_vectors = mean_topic_vectors
        return mean_topic_vectors
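
    # Centroid sketch for a single topic (illustrative only):
    #
    #     points = np.array([[0.0, 2.0], [2.0, 0.0]])   # t-SNE points of one topic
    #     centroid = points.mean(axis=0)                # -> array([1., 1.])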

    def display_cluster(self, tsne_vectors):
        # output_notebook()
        colormap = np.array([
            "#1f77b4", "#aec7e8", "#ff7f0e", "#ffbb78", "#2ca02c", "#98df8a",
            "#d62728", "#ff9896", "#9467bd", "#c5b0d5", "#8c564b", "#c49c94",
            "#e377c2", "#f7b6d2", "#7f7f7f", "#c7c7c7", "#bcbd22", "#dbdb8d",
            "#17becf", "#9edae5"
        ])
        colormap = colormap[:self.n_topics]

        plot = figure(title="t-SNE Clustering of {} LSA Topics".format(
            self.n_topics),
                      plot_width=1000,
                      plot_height=1000)
        plot.scatter(x=tsne_vectors[:, 0],
                     y=tsne_vectors[:, 1],
                     color=colormap[self.keys])

        for t in range(self.n_topics):
            label = Label(x=self.mean_topic_vectors[t][0],
                          y=self.mean_topic_vectors[t][1],
                          text=' '.join(self.top_words[t].split(' ')[0:2]),
                          text_color=colormap[t])
            plot.add_layout(label)

        if self.report:
            filename = self.logname[:len(self.logname) - 4] + "_cluster.html"
            output_file(filename)
            save(plot)
            print("Cluster saved to html")
            self.logger.info("Cluster saved to html")
        if self._display:
            show(plot)

    @staticmethod
    def postal_code_preprocessing(df, col='authorZipCode'):
        """
        Preprocess the postal code column to keep only the first two digits
        (three digits for overseas departments) \n
        `df`: the dataframe containing a column of postal codes \n
        `col`: the name of the column containing the postal codes \n
        `return` a Series containing the shortened postal codes \n
        """
        postal_codes = df[col].fillna(0).astype(str)
        # Pad single-digit codes with a leading zero, keep three digits for
        # overseas departments (97x) and '999', otherwise keep the first two digits.
        postal_codes = postal_codes.apply(
            lambda x: '0' + x[:1] if (len(x) == 1 or x[1] == '.') else
            (x if (len(x) <= 2 or x[2] == '.') else
             (x[:3] if (x[:2] == '97' or x[:3] == '999') else x[:2])))
        return postal_codes
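
    # Worked examples of the normalisation above (illustrative only):
    #
    #     '7'      -> '07'   (single digit, padded with a leading zero)
    #     '75011'  -> '75'   (metropolitan France, first two digits)
    #     '97400'  -> '974'  (overseas departments keep three digits)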

    @staticmethod
    def region_preprocessing(df, col='authorZipCode'):
        """
        Convert the postal codes to regions of France \n\n
        `df`: the dataframe containing a column of postal codes \n
        `col`: the name of the column containing the postal codes \n
        `return` the original dataframe with a new column 'region' \n
        """
        postal_codes = TopicModelling.postal_code_preprocessing(df)
        regions = postal_codes.apply(TopicModelling.find_region)
        df['region'] = regions
        return df

    @staticmethod
    def find_region(postal_code):
        """
        Find the region of a postal code \n\n
        `postal_code`: a shortened postal code (two digits, or three for overseas departments) \n
        `return` the region of the postal code \n
        """
        REGIONS = {
            'Auvergne-Rhône-Alpes': [
                '01', '03', '07', '15', '26', '38', '42', '43', '63', '69',
                '73', '74'
            ],
            'Bourgogne-Franche-Comté':
            ['21', '25', '39', '58', '70', '71', '89', '90'],
            'Bretagne': ['35', '22', '56', '29'],
            'Centre-Val de Loire': ['18', '28', '36', '37', '41', '45'],
            'Corse': ['2A', '2B'],
            'Grand Est':
            ['08', '10', '51', '52', '54', '55', '57', '67', '68', '88'],
            'Guadeloupe': ['971'],
            'Guyane': ['973'],
            'Hauts-de-France': ['02', '59', '60', '62', '80'],
            'Île-de-France': ['75', '77', '78', '91', '92', '93', '94', '95'],
            'La Réunion': ['974'],
            'Martinique': ['972'],
            'Normandie': ['14', '27', '50', '61', '76'],
            'Nouvelle-Aquitaine': [
                '16', '17', '19', '23', '24', '33', '40', '47', '64', '79',
                '86', '87'
            ],
            'Occitanie': [
                '09', '11', '12', '30', '31', '32', '34', '46', '48', '65',
                '66', '81', '82'
            ],
            'Pays de la Loire': ['44', '49', '53', '72', '85'],
            'Provence-Alpes-Côte d\'Azur':
            ['04', '05', '06', '13', '83', '84'],
        }
        for key, value in REGIONS.items():
            if postal_code in value:
                return key

        return 'Undefined'
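
    # Illustrative lookups (not part of the class):
    #
    #     TopicModelling.find_region('75')    # -> 'Île-de-France'
    #     TopicModelling.find_region('974')   # -> 'La Réunion'
    #     TopicModelling.find_region('00')    # -> 'Undefined'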

    def topic_postal_code(self, region=False, percentage=False):
        """
        Analyse the importance of each topic per postal code or region \n\n
        `region`: whether to group the postal codes into regions \n
        `percentage`: whether to display percentages instead of occurrence counts \n
        """
        if 'authorZipCode' not in self.df.columns:
            return

        if region:
            postal_codes = TopicModelling.region_preprocessing(
                self.df)['region']
        else:
            postal_codes = TopicModelling.postal_code_preprocessing(self.df)

        postal_codes_index = dict.fromkeys(postal_codes)
        postal_codes_unique = list(postal_codes_index.keys())

        if not region:
            postal_codes_unique = [int(float(x)) for x in postal_codes_unique]
            postal_codes_unique.sort()
            postal_codes_unique = [
                str(x).zfill(2) for x in postal_codes_unique
            ]

        for key in list(postal_codes_index):
            if not region:
                key = str(int(float(key))).zfill(2)
            postal_codes_index[key] = postal_codes_unique.index(key)

        count_topic_postal = np.zeros(
            (len(postal_codes_unique), self.n_topics))

        for i in tqdm(range(len(self.keys))):
            if not region:
                count_topic_postal[postal_codes_index[str(
                    int(float(postal_codes[i]))).zfill(2)]][self.keys[i]] += 1
            else:
                count_topic_postal[postal_codes_index[postal_codes[i]]][
                    self.keys[i]] += 1

        self.df_count_topic_postal = pd.DataFrame(data=count_topic_postal,
                                                  index=postal_codes_unique)
        self.df_count_topic_postal.columns = [
            'Topic {}'.format(i) for i in range(self.n_topics)
        ]

        if percentage:
            self.df_count_topic_postal = self.df_count_topic_postal.div(
                self.df_count_topic_postal.sum(axis=1), axis=0)
            # fillna returns a new dataframe, so the result must be assigned back
            self.df_count_topic_postal = self.df_count_topic_postal.fillna(0)

        fig, ax = plt.subplots(figsize=(14, 10))
        ax = sb.heatmap(self.df_count_topic_postal, cmap="YlGnBu", ax=ax)

        if self.report:
            self.pdf.savefig(fig)

        if self._display:
            plt.show()
        else:
            plt.close(fig)

    def get_topics_words_weigths_counts(self, n_words=50):
        """
        Create a dataframe whose rows contain a word, its topic, its number of
        occurrences inside the topic, and its weight inside the topic \n\n
        `n_words`: the number of words kept per topic \n
        The dataframe is stored in the attribute self.df_topics_words_weigths_counts \n
        """
        self.topics_words_weigths = self.model.components_ / self.model.components_.sum(
            axis=1)[:, np.newaxis]

        topics_words_weigths_counts = []

        documents_by_topics = {}
        words_counts_by_topic = {}

        for i in range(self.n_topics):
            documents_by_topics[i] = []
            words_counts_by_topic[i] = {}

        for i in range(len(self.documents)):
            documents_by_topics[self.keys[i]].append(self.documents[i])

        for topic, documents in documents_by_topics.items():
            count_vectorizer = CountVectorizer()
            try:
                count_vectorizer.fit(documents)
            except Exception as e:
                print("type error: " + str(e))
                print(traceback.format_exc())
                continue
            words_counts_by_topic[topic] = count_vectorizer.vocabulary_
            del count_vectorizer

        for i in tqdm(range(self.topics_words_weigths.shape[0])):
            topic_words_weigths = self.topics_words_weigths[i]
            top_n_words_index = np.argsort(topic_words_weigths)[::-1][:n_words]

            for j in tqdm(range(len(top_n_words_index))):
                index = top_n_words_index[j]
                temp_word_vector = np.zeros(
                    (1, self.document_term_matrix.shape[1]))
                temp_word_vector[:, index] = 1
                the_word = self.count_vectorizer.inverse_transform(
                    temp_word_vector)[0][0]

                if the_word in words_counts_by_topic[i]:
                    topics_words_weigths_counts.append([
                        i, the_word, topic_words_weigths[index],
                        words_counts_by_topic[i][the_word]
                    ])

        self.df_topics_words_weigths_counts = pd.DataFrame(
            data=topics_words_weigths_counts,
            columns=['topic_id', 'word', 'importance', 'word_count'])
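
    # Sketch of the row normalisation applied to model.components_ above
    # (illustrative only):
    #
    #     components = np.array([[2.0, 6.0, 2.0]])
    #     weights = components / components.sum(axis=1)[:, np.newaxis]
    #     # weights -> array([[0.2, 0.6, 0.2]])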

    def display_wordcloud(self, topic_number):
        """
        Display a word cloud of a topic's words, sized by their weights \n
        `topic_number`: the index of the topic \n
        """
        df_text = self.df_topics_words_weigths_counts
        df_text = df_text.loc[df_text['topic_id'] == topic_number]
        words = df_text['word'].values
        importance = df_text['importance'].values
        number = df_text['word_count']

        # Convert each weight into a number of word repetitions proportional to its importance
        importance = 10000 * importance
        importance = importance.astype(int)

        # Build the list of (repeated) words that will be fed to the word cloud
        words_total = np.copy(words)
        for i in range(number.shape[0]):
            w = words[i] + " "
            words_total[i] = w * importance[i]

        # Join everything into a single string
        text = ''.join(words_total)

        # Remove the apostrophe character (optional); replace returns a new
        # string, so assign the result back
        text = text.replace("'", "")

        path_current_directory = os.path.dirname(os.path.abspath(__file__))
        mask = np.array(
            (Image.open(os.path.join(path_current_directory,
                                     'cloud_mask2.png'))))

        # Draw the word cloud
        wordcloud = wc.WordCloud(background_color="white",
                                 collocations=False,
                                 mask=mask).generate(str(text))

        fig, ax = plt.subplots(figsize=(16, 8), sharey='all', dpi=160)
        ax.imshow(wordcloud, interpolation="bilinear")
        ax.set_axis_off()

        plt.title("Word Cloud by weight of Topic " + str(topic_number))

        if self.report:
            self.pdf.savefig(fig)

        if self._display:
            plt.show()
        else:
            plt.close(fig)
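
    # Sketch of the repetition trick used above to bias the word cloud by weight
    # (illustrative only): each word is repeated about int(10000 * importance)
    # times, so WordCloud sizes it roughly proportionally to its topic weight.
    #
    #     'vote ' * 3   # -> 'vote vote vote '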

    def display_topics_words_weights_counts(self, n_words=15):
        """
        Plot the word count and weight of the keywords of each topic \n
        `n_words`: the number of keywords displayed per topic \n
        """
        df = self.df_topics_words_weigths_counts
        cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

        for i in range(self.n_topics):

            data = df.loc[df.topic_id == i, :][:n_words]

            if data.empty:
                continue

            fig, ax = plt.subplots(figsize=(16, 8), sharey='all', dpi=160)

            ax.bar(x='word',
                   height="word_count",
                   data=data,
                   color=cols[i % len(cols)],
                   width=0.5,
                   alpha=0.3,
                   label='Word Count')
            ax_twin = ax.twinx()
            ax_twin.bar(x='word',
                        height="importance",
                        data=data,
                        color=cols[i % len(cols)],
                        width=0.2,
                        label='Weights')
            ax.set_ylabel('Word Count', color=cols[i % len(cols)])
            ax.set_title('Topic: ' + str(i),
                         color=cols[i % len(cols)],
                         fontsize=16)
            ax.tick_params(axis='y', left=False)
            ax.set_xticklabels(data['word'],
                               rotation=30,
                               horizontalalignment='right')
            ax.legend(loc='upper left')
            ax_twin.legend(loc='upper right')

            fig.tight_layout(w_pad=2)
            fig.suptitle('Word Count and Importance of Topic Keywords',
                         fontsize=22,
                         y=1.05)

            if self.report:
                self.pdf.savefig(fig)

            if self._display:
                plt.show()
            else:
                plt.close(fig)

            self.display_wordcloud(i)

    def topic_correlation(self, topic_matrix):
        """
        Analyse the correlation between topics \n\n
        `topic_matrix`: self.topic_matrix \n
        """
        data = pd.DataFrame(data=topic_matrix)
        corr = data.corr()
        fig, ax = plt.subplots(figsize=(16, 8), sharey='all', dpi=160)
        cax = ax.matshow(corr, cmap='coolwarm', vmin=-1, vmax=1)
        fig.colorbar(cax)
        ticks = np.arange(0, len(data.columns), 1)
        ax.set_xticks(ticks)
        plt.xticks(rotation=90)
        ax.set_yticks(ticks)
        ax.set_xticklabels(data.columns)
        ax.set_yticklabels(data.columns)

        fig.suptitle('Topic Correlation')

        if self.report:
            self.pdf.savefig(fig)

        if self._display:
            plt.show()
        else:
            plt.close(fig)

    def summary(self, topic_matrix):
        """
        Run the full topic modelling analysis on the given topic matrix \n\n
        `topic_matrix`: the topic matrix of the model \n
        """
        self.get_keys(topic_matrix)

        self.keys_to_counts()

        print('Computing top words')
        self.logger.info('Computing top words')
        self.get_top_n_words(30, self.keys, self.document_term_matrix,
                             self.count_vectorizer)

        print('Displaying top words')
        self.logger.info('Displaying top words')
        self.display_top_n_word()

        print('Computing topics per postal code by counts')
        self.logger.info('Computing topics per postal code by counts')
        self.topic_postal_code(region=False, percentage=False)

        print('Computing topics per postal code by percentage')
        self.logger.info('Computing topics per postal code by percentage')
        self.topic_postal_code(region=False, percentage=True)

        self.topic_postal_code(region=True, percentage=False)

        self.topic_postal_code(region=True, percentage=True)

        print('Computing word weights per topic')
        self.logger.info('Computing word weights per topic')
        self.get_topics_words_weigths_counts(n_words=50)

        print('Displaying word weights per topic')
        self.logger.info('Displaying word weights per topic')
        self.display_topics_words_weights_counts(n_words=20)

        print('Topic Correlation')
        self.logger.info('Topic Correlation')
        self.topic_correlation(topic_matrix)

        if self.report:
            self.pdf.close()

        if self._cluster:
            print('Dimensional Reduction')
            self.logger.info('Dimensional Reduction')
            tsne_vectors = self.dimensional_reduction_tsne(topic_matrix)

            print('Get Mean Topic Vectors')
            self.logger.info('Get Mean Topic Vectors')
            self.get_mean_topic_vectors(self.keys, tsne_vectors)

            print('Displaying Cluster')
            self.logger.info('Displaying Cluster')
            self.display_cluster(tsne_vectors)

        print('File Analysed !')
        self.logger.info('File Analysed !')

        self.logger.removeHandler(self.logger_file_handler)
        del self.logger, self.logger_file_handler