def calc_lda(tf_matrix, n_topics=10, max_iter=20):
    lda = decomposition.LatentDirichletAllocation(n_components=n_topics,
                                                  max_iter=max_iter,
                                                  learning_method='online',
                                                  learning_offset=50.)
    doctopic = lda.fit_transform(tf_matrix)
    return doctopic, lda
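
# A minimal usage sketch for calc_lda (not part of the original example): the toy
# documents and the CountVectorizer setup below are illustrative assumptions.
from sklearn import decomposition
from sklearn.feature_extraction.text import CountVectorizer

sample_docs = ["the cat sat on the mat",
               "dogs and cats make good pets",
               "stock prices fell sharply on monday",
               "the market rallied after the earnings report"]

# LDA expects raw term counts, so a CountVectorizer builds the tf matrix
vectorizer = CountVectorizer(stop_words='english')
tf_matrix = vectorizer.fit_transform(sample_docs)

doctopic, lda = calc_lda(tf_matrix, n_topics=2, max_iter=10)
print(doctopic.shape)          # (n_documents, n_topics)
print(lda.components_.shape)   # (n_topics, n_terms)
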
Example #2
def summary():
    if 'show_id' in request.args:
        showrun = request.args.get('show_id')
        states = State.query.filter_by(showrun=int(showrun)).order_by(
            State.created_date.desc()).all()
        texts = [s.text for s in states]
    else:
        texts = []
        show_ids = request.args.get('show_ids')
        show_ids = [int(x) for x in show_ids.split(',')]
        for show_id in show_ids:
            states = State.query.filter_by(showrun=show_id).order_by(
                State.created_date.desc()).all()
            t = [s.text for s in states]
            texts.extend(t)
    vectorizer = TfidfVectorizer(stop_words='english',
                                 min_df=1,
                                 tokenizer=tokenize_nltk)
    dtm = vectorizer.fit_transform(texts).toarray()
    vocab = np.array(vectorizer.get_feature_names())
    #Define Topic Model: LatentDirichletAllocation (LDA)
    clf = decomposition.LatentDirichletAllocation(n_components=5, random_state=3)
    num_top_words = 3
    doctopic = clf.fit_transform(dtm)
    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
    merged = list(set(itertools.chain.from_iterable(
        topic_words)))  # use set for unique items
    return jsonify({'data': {'topics': merged}})
Example #3
def runLDA(train, test, dev, numComponents, applianceName):

    # Runs Latent Dirichlet Allocation.
    # Takes train, test and dev datasets made of frequency distributions;
    # numComponents specifies the number of clusters (topics) for the model.

    # Returns the perplexity score on the dev set and the cluster assignments
    # for the train, dev and test sets.

    model = skld.LatentDirichletAllocation(n_components=numComponents,
                                           verbose=0)
    model.fit(train)

    predictions = model.transform(test)
    test_classification = np.argmax(predictions, 1)
    test_classification = test_classification.astype(int)

    predictions2 = model.transform(dev)
    dev_classification = np.argmax(predictions2, 1)
    dev_classification = dev_classification.astype(int)

    predictions3 = model.transform(train)
    train_classification = np.argmax(predictions3, 1)
    train_classification = train_classification.astype(int)

    if numComponents > 10:
        nRows = 3
    else:
        nRows = 2

    perplex = model.perplexity(dev)

    #compare_distributions(test,dev_classification,numComponents,nRows,applianceName)
    #compare_distributions(dev, dev_classification, numComponents, nRows, applianceName)

    return perplex, train_classification, dev_classification, test_classification
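
# A hedged usage sketch for runLDA (not from the original code): the random count
# matrices stand in for the frequency-distribution datasets, and the skld alias used
# inside runLDA is assumed to be sklearn.decomposition.
import numpy as np
import sklearn.decomposition as skld

rng = np.random.RandomState(0)
train = rng.poisson(2.0, size=(200, 30))   # rows = observations, cols = frequency bins
dev = rng.poisson(2.0, size=(50, 30))
test = rng.poisson(2.0, size=(50, 30))

perplex, train_cls, dev_cls, test_cls = runLDA(train, test, dev,
                                               numComponents=5,
                                               applianceName="kettle")
print(perplex)
print(train_cls[:10])
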
Example #4
def latent_dirichlet():
	'''
	Fits a latent dirichlet allocation model to the corpus of organization
	descriptions and/or self-reported IRS designations. Then prints the top
	10 words in each component. Does NOT return the model for further use.
	'''
	n_top_words = 10
	connect = sql.connect("myform/with_coords")
	db = connect.cursor()
	query = '''SELECT text_dump, pp_text FROM mcp'''
	corpus = []
	for all_text in db.execute(query).fetchall():
		writing = ""
		for alpha in all_text:
			if alpha:
				writing += alpha + " "
		corpus.append(writing)
	new_stops = ["chicago", "illinois", "founded", "year"]
	vect = sktext.CountVectorizer(stop_words = STOP_WORDS + new_stops)
	dtm = vect.fit_transform(corpus)
	model = skdecomp.LatentDirichletAllocation(
		n_components=N_CLUST,
		max_iter=5,
        learning_method='online',
        learning_offset=50)
	model.fit(dtm)
	tf_feature_names = vect.get_feature_names()
	print_top_words(model, tf_feature_names, n_top_words)
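
# Several examples in this listing call a print_top_words helper that is not shown
# here (and some call it with slightly different signatures); the function below is
# only a minimal sketch of the usual pattern, not any of the original implementations.
def print_top_words(model, feature_names, n_top_words):
    # For each topic, print the n_top_words terms with the largest weights.
    for topic_idx, topic in enumerate(model.components_):
        top = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d: %s" % (topic_idx, " ".join(feature_names[i] for i in top)))
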
Example #5
 def __init__(self, config):
     self.topics_name = 'X_topics'
     self.name = 'lda_model'
     self.model = decomposition.LatentDirichletAllocation(
         n_components=config['n_components'],
         learning_method=config['learning_method'],
         max_iter=config['max_iter'])
Example #6
def train_lda_model(x_train_count, count_vector):
    """ train a LDA Model """
    lda_model = decomposition.LatentDirichletAllocation(
        n_components=20, learning_method='online', max_iter=20)
    x_topics = lda_model.fit_transform(x_train_count)
    topic_word = lda_model.components_
    vocab = count_vector.get_feature_names()
    return x_topics, topic_word, vocab
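
# Minimal usage sketch for train_lda_model; the toy corpus and CountVectorizer are
# illustrative assumptions. Note get_feature_names() was removed in scikit-learn 1.2,
# where get_feature_names_out() is used instead.
from sklearn.feature_extraction.text import CountVectorizer

texts = ["free shipping on all orders", "order now and save big",
         "match report and final score", "the home team won the match",
         "new phone released this week", "battery life and camera review"]
count_vector = CountVectorizer(stop_words='english')
x_train_count = count_vector.fit_transform(texts)

x_topics, topic_word, vocab = train_lda_model(x_train_count, count_vector)
print(x_topics.shape)    # (n_documents, 20)
print(topic_word.shape)  # (20, n_terms)
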
Example #7
def modeling(docs, alpha, beta, num_words, num_topics):
    learning = sk.LatentDirichletAllocation(n_components=num_topics,
                                            doc_topic_prior=alpha,
                                            topic_word_prior=beta,
                                            learning_method='batch',
                                            max_iter=5000)
    learning.fit(docs)
    return (learning.transform(docs), learning.components_)
 def Linear_discriminant_analysis(self, source):
     min_max_scaler = preprocessing.MinMaxScaler()
     data_source = min_max_scaler.fit_transform(source)
     pca = decomposition.LatentDirichletAllocation(n_components=2)
     # Surprisingly, there is no (linear discriminant) LDA here???
     result = {}
     result['data'] = pca.fit_transform(data_source)
     result['params'] = 0
     return result
Example #9
def lda(
    n_topics: int,
    name: Optional[str] = "lda",
) -> TopicModelingOperation:
    model = skdecomp.LatentDirichletAllocation(n_components=n_topics)
    return TopicModelingOperation(
        model=model,
        name=name,
    )
def extract_topics_lda(data_samples, preprocessor, n_features, n_topics, n_top_words, n_gram_range=(1,1), more_stopwords=None):

    lda = decomposer.LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    
    topics_words = _extract_topics_decomposer(data_samples, preprocessor, lda, n_features, n_topics, n_top_words, n_gram_range, more_stopwords)
    
    return topics_words
def run_latent_dirichlet_allocation(X=None):
    lda = decomposition.LatentDirichletAllocation()
    if X is None:
        X = users_as_real_vectors(users)
    p = lda.fit_transform(X)
    mask = (p[:, 0] * p[:, 0] + p[:, 1] * p[:, 1] < 12)
    p = p[mask]
    plt.figure()
    plt.scatter(p[:, 0], p[:, 1])
    plt.show()
def extract_topics(data_samples, lang, n_features, n_topics, n_top_words, more_stopwords=None):
    
    n_gram_range = (2, 2)
    
    t0 = time()

    preprocessor = None
    if lang in ["en", "english"]:
        preprocessor = TokenHandler.EnTokenHandler(stemming=True, stopword=True)
    if lang in ["tr", "turkish"]:
        preprocessor = TokenHandler.TrTokenHandler(stopword=True, more_stopwords=more_stopwords, 
                                                   stemming=False, 
                                                   remove_numbers=True,
                                                   deasciify=True, remove_punkt=True)

    '''    
    tf_vectorizer = txtfeatext.CountVectorizer(tokenizer=preprocessor, 
                                      ngram_range=(1, 2),
                                      max_features=n_features)  
    tf_matrix = tf_vectorizer.fit_transform(data_samples)
    ''' 
    tfidf_vectorizer = txtfeatext.TfidfVectorizer(tokenizer=preprocessor, 
                                          ngram_range=n_gram_range,
                                          max_features=n_features)
    tfidf_matrix = tfidf_vectorizer.fit_transform(data_samples)
    
    t1 = time()
    print("1- Vectorizing took ", (t1-t0), "sec.")
    
    # apply NMF
    '''
    print("Applying NMF on tf*idf weighted terms, n_samples=%d and n_features=%d..."
          % (n_samples, n_features))
    '''
    nmf = decomposer.NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf_matrix)
    print("\nTopics in NMF model:")
    print_topic_words(nmf, tfidf_vectorizer, n_top_words)
    #nmf_topics = get_topic_words(model, vectorizer, n_top_words)
    
    t2 = time()
    print("NMF took ", t2 - t1, "sec.")
    
    #print("Applying LDA on tf weighted terms, n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = decomposer.LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tfidf_matrix)  
    print("\nTopics in LDA model:")
    print_topic_words(lda, tfidf_vectorizer, n_top_words)
    
    
    t3 = time()
    print("LDA took ", t3 - t2, "sec.")
    
    '''
Example #13
def perform_lda(max_df, min_df, topics, ngram):
    vectorizer = CountVectorizer(stop_words='english',
                                 max_df=max_df,
                                 min_df=min_df,
                                 ngram_range=ngram)
    matrixX = vectorizer.fit_transform(wordX)
    lda = d.LatentDirichletAllocation(n_components=topics,
                                      max_iter=10,
                                      verbose=1)
    lda.fit(matrixX)
    return [lda, vectorizer, max_df, min_df, topics, ngram]
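
# perform_lda reads the module-level wordX corpus and the aliases CountVectorizer and d;
# a hedged usage sketch with wordX defined as a small illustrative corpus.
import sklearn.decomposition as d
from sklearn.feature_extraction.text import CountVectorizer

wordX = ["neural networks learn useful representations",
         "gradient descent minimizes a loss function",
         "the senate passed the spending bill",
         "voters go to the polls next week"]

result = perform_lda(max_df=1.0, min_df=1, topics=2, ngram=(1, 1))
lda_model, vectorizer = result[0], result[1]
print(lda_model.components_.shape)   # (2, n_terms)
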
def LDA(num_topics, num_top_words, deck):
    vectorizer = CountVectorizer(tokenizer=word_tokenize)
    X = vectorizer.fit_transform(deck)
    X_vocab = np.array(vectorizer.get_feature_names())
    lda = decomposition.LatentDirichletAllocation(n_components=num_topics, learning_method='online')
    lda.fit_transform(X)
    lda_topic_words = []
    for topic in lda.components_:
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        lda_topic_words.append([X_vocab[i] for i in word_idx])
    
    return lda_topic_words
def topic_modelling(flag):  #function for tokenization, training etc
    pd.set_option('display.max_colwidth', None)
    X_train, x_test = train_test_split(reviews_datasets,
                                       test_size=0.9,
                                       random_state=111)
    # print(x_test, type(x_test))

    vectorizer_tf = TfidfVectorizer(tokenizer=tokenize,
                                    stop_words='english',
                                    max_df=0.75,
                                    min_df=50,
                                    max_features=10000,
                                    use_idf=False,
                                    norm=None)
    tf_vectors = vectorizer_tf.fit_transform(X_train.text)
    if (flag == 1):
        lda = decomposition.LatentDirichletAllocation(n_components=10,
                                                      max_iter=3,
                                                      learning_method='online',
                                                      learning_offset=50,
                                                      n_jobs=-1,
                                                      random_state=111)
        with open("lda_model.pk", "wb") as f:
            pickle.dump(lda, f)
    else:
        with open("lda_model.pk", "rb") as f:
            lda = pickle.load(f)
    W1 = lda.fit_transform(tf_vectors)
    H1 = lda.components_
    num_words = 15
    vocab = np.array(vectorizer_tf.get_feature_names())
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_words - 1:-1]]
    topic_words = ([top_words(t) for t in H1])
    topics = [' '.join(t) for t in topic_words]
    colnames = ["Topic" + str(i) for i in range(lda.n_components)]
    docnames = ["Doc" + str(i) for i in range(len(X_train.text))]
    df_doc_topic = pd.DataFrame(np.round(W1, 2),
                                columns=colnames,
                                index=docnames)
    topic_important = np.argmax(df_doc_topic.values, axis=1)
    df_doc_topic['most_matched_topic'] = topic_important

    print("Log Likelihood: ", lda.score(tf_vectors))
    print("Perplexity: ", lda.perplexity(tf_vectors))
    return lda, vectorizer_tf, topics
Example #16
def _topics_extraction_with_lda(X):
    tf_vectorizer = TfidfVectorizer(min_df=2,
                                    ngram_range=(1, 2),
                                    stop_words=get_stop_words('nl'))
    data = tf_vectorizer.fit_transform(X)
    #print('Before LDA, sample size: {}'.format(data.shape))
    best_params_ = {
        'n_components': 4,
        'max_iter': 100,
        'n_jobs': -1,
        'learning_method': 'batch'
    }
    lda = decomposition.LatentDirichletAllocation(**best_params_)
    data = lda.fit_transform(data)
    #print('After LDA, sample size: {}'.format(data.shape))
    return data
Example #17
    def construct_model ( self ):
        '''
        Learn a 10-topic LDA model on the wine descriptions provided in
        the database.
        '''
        df = self.df
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                        max_features=2000,
                                        stop_words='english')

        tf_names = tf_vectorizer.fit_transform ( df['description'].dropna() )
        model = decomposition.LatentDirichletAllocation ( )
        model.fit ( tf_names )

        self.model = model
        self.tf_vectorizer = tf_vectorizer
Example #18
def get_top_k_latent_semantics(k, model, gesture_word_matrix):
    if model == "PCA":
        model = decomposition.PCA(n_components=top_k_input)
    if model == "SVD":
        model = decomposition.TruncatedSVD(n_components=top_k_input)
    if model == "NMF":
        model = decomposition.NMF(n_components=top_k_input, max_iter=10000)
    if model == "LDA":
        model = decomposition.LatentDirichletAllocation(n_components=top_k_input)

    model.fit(gesture_word_matrix)

    # take the top-k latent semantics
    top_k_matrix = model.components_

    return top_k_matrix
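
# Hedged usage sketch for get_top_k_latent_semantics; the random non-negative
# gesture-word matrix is illustrative, and k is forwarded to n_components inside
# the function.
import numpy as np
from sklearn import decomposition

rng = np.random.RandomState(0)
gesture_word_matrix = rng.poisson(1.0, size=(30, 40))

top_k_matrix = get_top_k_latent_semantics(5, "LDA", gesture_word_matrix)
print(top_k_matrix.shape)   # (5, 40)
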
Example #19
def get_lda_df():
    lda = decomp.LatentDirichletAllocation(
        n_components=data_config.lda_num_topics,
        doc_topic_prior=1./200,
        topic_word_prior=1./200,
    )
    print "getting sample 1"
    lda_w_2_idx, lda_sample = models.create_lda_sample(p=.1)
    print "partial fit 1"
    lda.partial_fit(lda_sample)
    print "getting sample 2"
    _, lda_sample = models.create_lda_sample(p=.1)
    print "partial fit 2"
    lda.partial_fit(lda_sample)
    lda_mean = {}
    lda_mean_mean = {}
    lda_max = {}
    print("labeling corpus")
    for day, comments in corpus.get_day_preprocessed_comments(p=.25, include_oos=True):
        print(day)
        coo_dict = coll.defaultdict(int)
        for doc_idx, sent_tokens in enumerate(comments):
            for sent in sent_tokens:
                for token in sent:
                    if token not in lda_w_2_idx:
                        continue
                    word_idx = lda_w_2_idx[token]
                    coo_dict[doc_idx, word_idx] += 1
        lda_day = lda.transform(util.d_to_sparse(coo_dict, shape=(doc_idx+1, lda.components_.shape[1])))
        lda_day_mean = (lda_day.T / lda_day.sum(axis=1)).T
        lda_mean[day] = lda_day.mean(axis=0)
        lda_mean_mean[day] = lda_day_mean.mean(axis=0)
        lda_max[day] = lda_day.max(axis=0)

    lda_idx_2_w = {idx: w for w, idx in lda_w_2_idx.items()}

    days, lda_reps = zip(*sorted(lda_mean.items(), key=lambda x:x[0]))
    df_lda_mean = pd.DataFrame(map(tuple, lda_reps), index=days)

    days, lda_reps = zip(*sorted(lda_mean_mean.items(), key=lambda x:x[0]))
    df_lda_mean_mean = pd.DataFrame(map(tuple, lda_reps), index=days)

    days, lda_reps = zip(*sorted(lda_max.items(), key=lambda x:x[0]))
    df_lda_max = pd.DataFrame(map(tuple, lda_reps), index=days)


    return lda, lda_w_2_idx, pd.DataFrame(map(tuple, lda_reps), index=days)
def Dirichlet(documents, vectorizer):
    dtm = vectorizer.fit_transform(documents).toarray()

    lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  max_iter=50,
                                                  learning_method='online',
                                                  learning_offset=50.,
                                                  random_state=0,
                                                  verbose=1,
                                                  evaluate_every=1,
                                                  doc_topic_prior=0.2,
                                                  topic_word_prior=0.6)
    lda.fit(dtm)

    print_top_words(lda, vectorizer)

    return lda
Example #21
def generate_topic(data, num_topic=10):
    vector = CountVectorizer()
    vector.fit(data)
    vocab = vector.vocabulary_
    vector = CountVectorizer(stop_words="english", vocabulary=vocab.keys())
    X = vector.fit_transform(data)
    lda = decomposition.LatentDirichletAllocation(n_components=num_topic,
                                                  learning_method="online")
    for day in range(X.shape[0] - 1, -1, -1):
        lda.partial_fit(X[day, :])
        doc_topic = lda.transform(X[day, :])
        alpha = sum(doc_topic) / len(doc_topic)
        eta = sum(doc_topic) / len(doc_topic)
        lda.set_params(doc_topic_prior=alpha, topic_word_prior=eta)
    doc_topic = lda.transform(X)
    doc_topic = pandas.DataFrame(doc_topic)
    return doc_topic
Example #22
def fit_and_predict_LDA(num_topics, num_top_words, vocab, dtm_train, dtm_test):
    """
    Fit the LDA topic modeling to the training document term matrix.
    Using the generated topics, map test document term matrix to 
    document to topic matrix. Also return topic words.


    Parameters
    ----------
    num_topics: int
        number of topics the LDA decomposition should generate
    num_top_words: int
        number of topic words stored in the topic_words list
    vocab: array-like
        indexable collection of unique terms in the reviews
    dtm_train: scipy sparse matrix
        Data for training (matrix with features, e.g. BOW or tf-idf)
    dtm_test: scipy sparse matrix
        Data for testing and used for 'prediction' (matrix with features, e.g. BOW or tf-idf)

    Returns
    -------
    Tuple(numpy.ndarray, set)
        Returns doctopic, topic_words as a tuple
    """
    lda = decomposition.LatentDirichletAllocation(n_components=num_topics, random_state=1)
    lda.fit(dtm_train)
    doctopic = lda.transform(dtm_test)
    #scale the document-component matrix such that the component values associated with each document sum to one
    doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

    topic_words = []
    for topic in lda.components_: # components is the topic-term matrix
        word_idx = np.argsort(topic)[::-1][0:num_top_words]
        topic_words.append([vocab[i] for i in word_idx])
    
    print_top_5_topics(doctopic, len(lda.components_), topic_words)        
    
#    for t in range(len(topic_words)):
#        print("Topic {}: {}".format(t+1, ' '.join(topic_words[t][:10])))
    
    # I just hard coded the type of tokenization, because I didn't want to over-complicate the arguments to this function
    pickle.dump((doctopic, topic_words), open("pickles/lda-np-"+str(num_topics)+"-doctopic-topic_words.p", "wb"))
    
    return (doctopic, topic_words)
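
# Hedged usage sketch for fit_and_predict_LDA; the corpus and vectorizer are
# illustrative. Note the function also calls a print_top_5_topics helper and writes
# a pickle into a pickles/ directory, both of which are assumed to exist.
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

train_docs = ["great battery life and bright screen",
              "the camera quality is poor",
              "fast delivery and careful packaging",
              "item arrived damaged and late"]
test_docs = ["battery drains too quickly", "delivery was very fast"]

vectorizer = CountVectorizer(stop_words='english')
dtm_train = vectorizer.fit_transform(train_docs)
dtm_test = vectorizer.transform(test_docs)
vocab = np.array(vectorizer.get_feature_names())   # indexable, unlike a plain set

doctopic, topic_words = fit_and_predict_LDA(num_topics=2, num_top_words=5,
                                            vocab=vocab, dtm_train=dtm_train,
                                            dtm_test=dtm_test)
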
Example #23
def process_lda():
    # inputs
    file_lines = []
    lines = []
    with codecs.open(sys.argv[1], encoding='UTF-8') as f:
        file_lines = f.read().splitlines()

    #lines = cleanup(file_lines)
    lines = file_lines

    # params for LDA
    n_feats = 250  #1000
    n_topics = 20
    n_top_words = 50

    # getting a custom stop-words list
    en_stop_words = []
    en_stop_words = stop_words()

    # use tf (raw term count) features for LDA
    tf_vectorizer = text.CountVectorizer(
        max_df=0.95,
        min_df=2,
        #max_features=n_feats, stop_words='english')
        max_features=n_feats,
        stop_words=en_stop_words)
    tf = tf_vectorizer.fit_transform(lines)
    tf_feature_names = tf_vectorizer.get_feature_names()

    # fit an LDA model to the tf feats of the textual data
    lda = decomposition.LatentDirichletAllocation(n_components=n_topics,
                                                  max_iter=10,
                                                  learning_method='online',
                                                  learning_offset=50.,
                                                  random_state=0,
                                                  verbose=1).fit(tf)

    # outputs
    with open(
            sys.argv[2] + "/lda." + sys.argv[1].split("/")[-1:].pop() + ".out",
            "w") as f:
        messages = print_top_words(lda, tf_feature_names, n_top_words)
        for m in messages:
            f.write("{}\n".format(m.encode('utf-8')))
Example #24
def extract_topic(str_arg, num_topics=1, num_top_words=3):
    vectorizer = text.CountVectorizer(input='content',
                                      analyzer='word',
                                      lowercase=True,
                                      stop_words='english')
    dtm = vectorizer.fit_transform(str_arg.split())
    vocab = np.array(vectorizer.get_feature_names())

    #clf = decomposition.NMF(n_components=num_topics, random_state=1) ## topic extraction
    clf = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                                  learning_method='online')
    clf.fit_transform(dtm)

    topic_words = []
    for topic in clf.components_:
        word_idx = np.argsort(topic)[::-1][
            0:num_top_words]  ##[::-1] reverses the list
        topic_words.append([vocab[i] for i in word_idx])
    return topic_words
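
# Minimal usage sketch for extract_topic; the input string is illustrative, and the
# example's own imports (text, np, decomposition) are assumed to be in scope. Note the
# function splits the string and treats every word as its own tiny document.
sample_text = ("machine learning models are trained on labelled data "
               "and evaluated on held out examples")
print(extract_topic(sample_text, num_topics=1, num_top_words=3))
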
def lda(df,
        cat=[
            'Name',
            'Platform',
            'Genre',
            'Developer',
            'Year_of_Release',
            'Rating',
        ]):
    for v in tqdm(itertools.permutations(cat, 2)):
        # co-occurence matrix
        n_comp = 3
        if (v[0] in ['Publisher', 'Name', 'Developer']) & (
                v[1] in ['Publisher', 'Name', 'Developer', 'Rating']):
            continue
        if (v[0] == 'Platform_Genre') & (v[1] in ['Platform', 'Genre']):
            continue
        if (v[1] == 'Platform_Genre') & (v[0] in ['Platform', 'Genre']):
            continue
        if f'lda1_{v[0]}_{v[1]}' not in df.columns.values.tolist():
            print(f'{v[0]} vs {v[1]}')
            agg_df = pd.crosstab(df[v[0]], df[v[1]])

            # lda
            trans = decomposition.LatentDirichletAllocation(
                n_components=n_comp, random_state=42)
            trans2 = decomposition.NMF(n_components=n_comp,
                                       max_iter=8000,
                                       random_state=42)
            trans3 = decomposition.PCA(n_components=n_comp, random_state=42)

            lda_df = add_transformed(agg_df, trans, v, method='lda')
            nmf_df = add_transformed(agg_df, trans2, v, method='nmf')
            pca_df = add_transformed(agg_df, trans3, v, method='pca')

            # merge
            df = df.merge(lda_df, how='left', on=v[0])
            df = df.merge(nmf_df, how='left', on=v[0])
            df = df.merge(pca_df, how='left', on=v[0])
    return df
Example #26
def get_lda_features(features, xtrain_count):

    # create a count vectorizer object
    count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
    count_vect.fit(features)

    # train a LDA Model
    lda_model = decomposition.LatentDirichletAllocation(
        n_components=20, learning_method='online', max_iter=20)
    X_topics = lda_model.fit_transform(xtrain_count)
    topic_word = lda_model.components_
    vocab = count_vect.get_feature_names()

    # view the topic models
    n_top_words = 10
    topic_summaries = []
    for i, topic_dist in enumerate(topic_word):
        topic_words = numpy.array(vocab)[numpy.argsort(
            topic_dist)][:-(n_top_words + 1):-1]
        topic_summaries.append(' '.join(topic_words))

    return topic_summaries
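
# Hedged usage sketch for get_lda_features; the toy corpus is illustrative and
# xtrain_count is assumed to be the count matrix of those same texts (the function's
# own imports of numpy, decomposition and CountVectorizer are assumed to be in scope).
from sklearn.feature_extraction.text import CountVectorizer

features = ["the stock market rose sharply today",
            "rain is forecast across the region",
            "investors cheered the earnings report",
            "a cold front brings snow tomorrow"]
xtrain_count = CountVectorizer(analyzer='word',
                               token_pattern=r'\w{1,}').fit_transform(features)
print(get_lda_features(features, xtrain_count))
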
def clustervis_pipelines(visdim):
    return {
        'PCA': Pipeline([
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.PCA(n_components=visdim)),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),
        'NMF': Pipeline([                
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.NMF(n_components=visdim, random_state=1, alpha=.1, l1_ratio=.5)),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),    
        'LDA': Pipeline([                
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.LatentDirichletAllocation(n_components=visdim, learning_method='online')),
            ('sca2', preprocessing.MaxAbsScaler()),
        ]),
        'SVD': Pipeline([                
            ('sca', preprocessing.MaxAbsScaler()),
            ('clu', decomposition.TruncatedSVD(n_components=visdim)),
            ('sca2', preprocessing.MaxAbsScaler()),
        ])    
    }
 def perform(self):
     lda = skd.LatentDirichletAllocation(n_components=self.n_components, random_state=self.random_state)
     transform = lda.fit_transform(self.data)
     return transform
Example #29
'''
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i),
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind]),
    print

print("\n")
print("Prediction")

prediction = model.predict(valid_x)
print(prediction)'''


# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 10
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))


def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
#dtm = extract_tfidf_dtm(documents, advanced_parsing.extract_np_tokens)

count_vect = pickle.load(open("pickles/np-30000-count-vect.p", "rb"))
tfidf_transformer = pickle.load(open("pickles/np-30000-tfidf-trans.p", "rb"))

dtm = pickle.load(open("pickles/np-30000-dtm.p", "rb"))

import numpy as np  # a conventional alias
import sklearn.feature_extraction.text as text

from sklearn import decomposition

num_topics = 60
num_top_words = 20

lda = decomposition.LatentDirichletAllocation(n_components=num_topics,
                                              random_state=1)

# this next step may take some time
doctopic = lda.fit_transform(dtm)
doctopic = doctopic / np.sum(doctopic, axis=1, keepdims=True)

# print words associated with topics
vocab = np.array(count_vect.get_feature_names())  # vocabulary assumed to come from the pickled count vectorizer
topic_words = []
for topic in lda.components_:
    word_idx = np.argsort(topic)[::-1][0:num_top_words]
    topic_words.append([vocab[i] for i in word_idx])

print(topic_words)
print(word_idx)