Example 1
def plot_perplexity_iter(A_tfidf, num_topics):
    
    print "computing perplexity vs iter..."
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1,max_iter+1):
        lda = LatentDirichletAllocation(n_topics = num_topics, max_iter=sweep, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1)    
        tic = time()
        lda.fit(A_tfidf)  #online VB
        toc = time()
        print "sweep %d, elapsed time: %.4f sec" %(sweep, toc - tic)
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    #end    
    np.save('./data/perplexity_iter.npy', perplexity)
    
    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
Example 2
def plot_perplexity_batch(A_tfidf, num_docs):
    
    print "computing perplexity vs batch size..."
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size),max_iter))
    em_iter = np.zeros((len(batch_size),max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1,max_iter+1)):
            lda = LatentDirichletAllocation(n_topics = num_topics, max_iter=sweep, learning_method='online', batch_size = mini_batch, random_state=0, n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  #online VB
            toc = time()
            print "sweep %d, elapsed time: %.4f sec" %(sweep, toc - tic)
            perplexity[ii,jj] = lda.perplexity(A_tfidf)
            em_iter[ii,jj] = lda.n_batch_iter_
        #end
    #end
    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)    
    
    f = plt.figure()
    for mb in range(len(batch_size)):
        plt.plot(em_iter[mb,:], perplexity[mb,:], color=np.random.rand(3,), marker='o', lw=2.0, label='mini_batch: '+str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
Example 3
def plot_perplexity_topics(A_tfidf):
    
    print "computing perplexity vs K..."
    max_iter = 5    #based on plot_perplexity_iter()
    #num_topics = np.linspace(2,20,5).astype(np.int)
    num_topics = np.logspace(1,2,5).astype(np.int)
    perplexity = []
    em_iter = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_topics = k, max_iter=max_iter, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  #online VB
        toc = time()
        print "K= %d, elapsed time: %.4f sec" %(k, toc - tic)
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    #end
    
    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)    
    
    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
def extractTopicLDA(func_message_dic, store_cloumn):
    if len(func_message_dic) == 0:
        print "func_message_dic is null"
        return False
    try:
        conn=MySQLdb.connect(host='192.168.162.122',user='******',passwd='123456',port=3306)
        cur=conn.cursor()
        cur.execute('set names utf8mb4')
        conn.select_db('codeAnalysis')
        for function in func_message_dic:
            message = func_message_dic[function]
            np_extractor = nlp.semantics_extraction.NPExtractor(message)
            text = np_extractor.extract()
            if len(text) == 0:
                continue
            tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=n_features, stop_words='english')
            tf = tf_vectorizer.fit_transform(text)
            print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
            lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                                                    random_state=0)
            lda.fit(tf)
            tf_feature_names = tf_vectorizer.get_feature_names()
            seprator = " "
            for topic_idx, topic in enumerate(lda.components_):
                keywords = seprator.join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
            sql = "update func_semantic set "+store_cloumn+" = '"+keywords+"' where func_name = '"+function+"'"
            print sql
            cur.execute(sql)
            conn.commit()
        cur.close()
        conn.close()
        return True
    except MySQLdb.Error,e:
        print e
        raise
Example 5
def lda_tuner(ingroup_otu, best_models):

    best_score = -1*np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    for topics in topic_series: 
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter +=1
                X_train, X_test = train_test_split(X, test_size=0.5)
                lda = LatentDirichletAllocation(n_topics=topics, 
                                                doc_topic_prior=dtp, 
                                                topic_word_prior=twp, 
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print "New Max Likelihood: {}".format(best_score)

                print "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(eval_counter, 
                                                                 topics, dtp, twp,
                                                                 this_score, this_perplexity)

                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    eval_counter +=1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_topics=topics, 
                                                    doc_topic_prior=1./topics, 
                                                    topic_word_prior=1./topics, 
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print "New Max Likelihood: {}".format(best_score)

                    print "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(eval_counter, 
                                                                                topics, 
                                                                                (1./topics), 
                                                                                (1./topics),
                                                                                this_score,
                                                                                this_perplexity)

                    best_models.append({'n': topics, 'dtp': (1./topics), 
                                        'twp': (1./topics), 'score': this_score,
                                        'perp': this_perplexity})
    return best_models
Example 6
def fit_lda(tf):
    '''takes in a tf sparse vector and finds the top topics'''
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda_topic_dict = print_top_words(lda, tf_feature_names, n_top_words)
    return lda, lda_topic_dict
Example 7
def LDA(tf,word):
    lda = LatentDirichletAllocation(n_topics=30, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
    lda.fit(tf)
    print_top_words(lda,word,20)
Example 8
    def applyLDA2(self, number_of_clusters, country_specific_tweets):
        train, feature_names = self.extractFeatures(country_specific_tweets,False)
        
        name = "lda"
        if self.results:
            print("Fitting LDA model with tfidf", end= " - ")
        t0 = time()     
        lda = LatentDirichletAllocation(n_topics=number_of_clusters, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)

        lda.fit(train)
        
        if self.results:
            print("done in %0.3fs." % (time() - t0))
        
        parameters = lda.get_params()
        topics = lda.components_
        doc_topic = lda.transform(train)
        top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
        labels = numpy.asarray(labels)
        
        if self.results:
            print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))
        
        return name, parameters, top10, labels
Example 9
def topicmodel( comments ):

    _texts = []
    texts = []

    for c in comments:

        c = c['text']
        _texts.append( c )
        texts.append( c )



    tf_vectorizer = CountVectorizer(
                max_df=.20,
                min_df=10,
                stop_words = stopwords )
    texts = tf_vectorizer.fit_transform( texts )

    ## test between 2 and 9 topics
    topics = {}

    for k in range(2, 10):

        print "Testing", k

        model = LatentDirichletAllocation(
                    n_topics= k ,
                    max_iter=5,
                    learning_method='batch',
                    learning_offset=50.,
                    random_state=0
                )
        model.fit( texts )
        ll = model.score( texts )
        topics[ ll ] = model

    topic = max( topics.keys() )

    ret = collections.defaultdict( list )

    ## ugly, rewrite some day
    model = topics[ topic ]

    ## for debugging, print the chosen model's topics
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join( [feature_names[i].encode('utf8') for i in topic.argsort()[:-5 - 1:-1]])
        print

    for i, topic in enumerate( model.transform( texts ) ):

        topic = numpy.argmax( topic )
        text = _texts[ i ].encode('utf8')

        ret[ topic ].append( text )

    return ret
Example 10
class LDATopics:
	# Constructor
	def __init__(self, filename):
		# Member variables
		self.email_data = []
		self.lda = None
		self.feature_names = None
		self.num_topics = NUM_TOPICS
		self.num_words_per_topic = NUM_WORDS_PER_TOPIC
		self.num_features = NUM_FEATURES

		# Load emails from full path to file
		emails = EmailLoader(filename).get_email_dict_array()

		# Process emails into a list of email body contents
		for email_rec in emails:
			if email_rec['body']:
				# Clean the text and add to list
				cleaner = TextCleaner(email_rec['body'])

				self.email_data.append(" ".join(cleaner.tokenize_str()))

	## Public methods ##
	def process(self, topics=None, features=None):
		# Check if default numbers should be used
		if topics is None:
			topics = self.num_topics
			
		if features is None:
			features = self.num_features

		# Calculate term frequency for LDA
		tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=features, stop_words='english')
		tf = tf_vectorizer.fit_transform(self.email_data)

		# Fit the LDA model to data samples
		self.lda = LatentDirichletAllocation(n_topics=topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)

		self.lda.fit(tf)

		# Set the feature name (words)
		self.feature_names = tf_vectorizer.get_feature_names()

	def print_topics(self, words_per_topic=None):
		# Check if default number of words per topics should be used
		if words_per_topic is None:
			words_per_topic = self.num_words_per_topic

		self._print_topics(self.lda, self.feature_names, words_per_topic)

	## Private methods ##
	def _print_topics(self, model, feature_names, words_per_topic):
	    for topic_idx, topic in enumerate(model.components_):
	        print("Topic #%d:" % topic_idx)
	        print(" ".join([feature_names[i]
	                        for i in topic.argsort()[:-words_per_topic - 1:-1]]))

	    print()
 def perform_analysis(self, stocks, szTimeAxis, n_ahead):
     # load Snowball comment data
     from agares.datasource.snowball_cmt_loader import SnowballCmtLoader
     SBLoader = SnowballCmtLoader()
     date = self.dt_start.date()
     df_cmt_list = []
     while date <= self.dt_end.date():
         df_cmt_list.append(SBLoader.load(str(date)))
         date += timedelta(days=1)
     df_cmt = pd.concat(df_cmt_list, ignore_index=True)
     # Chinese text segmentation
     self.set_jieba()
     df_cmt['RawComment'] = df_cmt['RawComment'].map(jieba.cut)
     # drop stopwords
     self.stopwords = [line.strip() for line in open('stopwords').readlines()]
     self.stopwords.append(' ')
     df_cmt['RawComment'] = df_cmt['RawComment'].map(self.drop_useless_word)
     cmt = df_cmt['RawComment'].tolist()
     # construct tfidf matrix
     tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=0.05)
     tfidf = tfidf_vectorizer.fit_transform(cmt)
     
     # Fit the NMF model
     n_topics = 5
     n_top_words = 20
     print("Fitting the NMF model with tf-idf features..")
     t0 = time()
     nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
     print("done in %0.3fs." % (time() - t0))
     print("\nTopics in NMF model:")
     tfidf_feature_names = tfidf_vectorizer.get_feature_names()
     self.print_top_words(nmf, tfidf_feature_names, n_top_words)
     
     # Fit the LDA model
     print("Fitting LDA models with tf-idf features..")
     lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                     learning_method='online', learning_offset=50.,
                                     random_state=0)
     t0 = time()
     lda.fit(tfidf)
     print("done in %0.3fs." % (time() - t0))
     print("\nTopics in LDA model:")
     self.print_top_words(lda, tfidf_feature_names, n_top_words)
     
     # load sz daily candlestick data
     sz = next(iter(stocks))
     cst_Day = stocks[sz].cst['1Day'] 
     # print close price within the timescope
     date = self.dt_start
     print()
     print("The ShangHai stock Index (close index) within the timescope")
     while date <= self.dt_end:
         ts = pd.to_datetime(date)
         try:
             print("Date: {0:s}, Index: {1:.2f}".format(str(date.date()), cst_Day.at[ts, 'close']))
         except KeyError: # sz candlestick data does not exist at this datetime
             print("Date: {0:s}, Index: (market closed)".format(str(date.date())))
         date += timedelta(days=1)
Example 12
def LDA(matrix,preserve,n_topics=100):

    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                        learning_method='online', learning_offset=50.,
                                        random_state=randint(1,100))
    lda.fit(matrix[preserve])
    topic_model=lda.transform(matrix)

    return topic_model
Example 13
def get_lda():
    lda = LatentDirichletAllocation(
        n_topics=K,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    lda.fit(X)
    tf_feature_names = VECTORIZER.get_feature_names()
    print_top_words(lda, tf_feature_names, 10)
    return lda
Example 14
    def calculate_lda(self, tfidf):
        print("Fitting LDA models with tf features...")
        lda = LatentDirichletAllocation(n_topics=self.num_topics, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)
        t0 = time()
        lda.fit(tfidf)

        print("Topics in LDA model:")
        print_top_words(lda, self.tfidf_feature_names, self.num_words)
        print("done in %0.3fs." % (time() - t0))
def test_perplexity_input_format():
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    lda.fit(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X.toarray())
    assert_almost_equal(perp_1, perp_2)
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
def test_lda_dense_input():
    # Test LDA with dense input.
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_method='batch', random_state=rng)
    lda.fit(X.toarray())

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in lda.components_:
        # Find top 3 words in each LDA component
        top_idx = set(component.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example 18
def test_lda_multi_jobs():
    # Test LDA batch training with multi CPU
    for method in ('online', 'batch'):
        rng = np.random.RandomState(0)
        n_topics, X = _build_sparse_mtx()
        lda = LatentDirichletAllocation(n_topics=n_topics, n_jobs=3,
                                        learning_method=method, random_state=rng)
        lda.fit(X)

        correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
        for c in lda.components_:
            top_idx = set(c.argsort()[-3:][::-1])
            assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example 19
def test_lda_fit_batch():
    # Test LDA batch learning_offset (`fit` method with 'batch' learning)
    rng = np.random.RandomState(0)
    n_topics, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_topics=n_topics, evaluate_every=1,
                                    learning_method='batch', random_state=rng)
    lda.fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for component in lda.components_:
        # Find top 3 words in each LDA component
        top_idx = set(component.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
def LDA_feature_extraction(text_lst, n_samples, n_features, n_topics, n_top_words):
    print "Extracting tf features for LDA..."
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(text_lst)
    print "Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)
    print "\nTopics in LDA model:"
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
    print "*************end LDA****************"
Example 21
def test_lda_multi_jobs(method):
    n_components, X = _build_sparse_mtx()
    # Test LDA batch training with multi CPU
    rng = np.random.RandomState(0)
    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                    learning_method=method,
                                    evaluate_every=1, random_state=rng)
    lda.fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example 22
def main():
	with codecs.open("wallsDB.txt","r",encoding='utf-8') as f:
		walls = f.read().split("\t\n\t")
		vectorizer = CountVectorizer(max_df=0.95, min_df=2)
		F = vectorizer.fit_transform(walls)
		vocab = vectorizer.vocabulary_

		lda = LatentDirichletAllocation(n_topics=1000, max_iter=10,
                                learning_method='online', learning_offset=30.,
                                random_state=777)
		lda.fit(F)
		save_obj(lda, "Phi")
		save_obj(vocab, "vocab")
Example 23
def lauch_lda(featured, n_topics=10, n_top_words=20):
    """ Latent Dirichlet Allocation with online variational Bayes algorithm
    """
    # Use tf (raw term count) features for LDA.
    print "extracting tf features for LDA..."
    tf_vectorizer = CountVectorizer(preprocessor=custom_preprocessor, max_df=0.95, min_df=2) # max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(featured)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    lda.fit(tf)

    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names)
    return load_top_words(lda, tf_feature_names, n_top_words)
Example 24
def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = np.random.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.fit(X)
    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_topics))
    assert_raises_regexp(ValueError, r'Number of samples', lda.perplexity, X, invalid_n_samples)
    # invalid topic number
    invalid_n_topics = rng.randint(4, size=(n_samples, n_topics + 1))
    assert_raises_regexp(ValueError, r'Number of topics', lda.perplexity, X, invalid_n_topics)
Example 25
class LDA():

    def __init__(self, args=None, from_file=None):
        # Initialize LDA model from either arguments or a file. If both are
        # provided, file will be used.
        assert args or from_file, 'Improper initialization of LDA model'
        if from_file is not None:
            with open(from_file, 'rb') as f:
                self.model, self.vectorizer = pickle.load(f, encoding='latin1')
        else:  # training for the first time
            self.vectorizer = TfidfVectorizer(lowercase=False, token_pattern=u'[^;]+')
            self.alpha = args.alpha
            self.beta = args.beta
            self.ntopics = args.ntopics
            self.model = None

    def top_words(self, n):
        features = self.vectorizer.get_feature_names()
        words = [OrderedDict([(features[i], topic[i]) for i in topic.argsort()[:-n - 1:-1]])
                 for topic in self.model.components_]
        return words

    def train(self, docs):
        data = [';'.join(bow) for bow in docs]
        vect = self.vectorizer.fit_transform(data)
        self.alpha = self.alpha if self.alpha is not None else 50./self.ntopics
        self.beta = self.beta if self.beta is not None else 200./len(self.vectorizer.vocabulary_)
        print('{} words in vocabulary'.format(len(self.vectorizer.vocabulary_)))
        print('Training LDA with {} topics, {} alpha, {} beta'.format(self.ntopics, self.alpha, self.beta))
        self.model = LatentDirichletAllocation(self.ntopics,
                                               doc_topic_prior=self.alpha, topic_word_prior=self.beta,
                                               learning_method='batch', max_iter=100,
                                               verbose=1, evaluate_every=1,
                                               max_doc_update_iter=100, mean_change_tol=1e-5)
        self.model.fit(vect)
        # normalizing does not change subsequent inference, provided no further training is done
        self.model.components_ /= self.model.components_.sum(axis=1)[:, np.newaxis]

    def infer(self, docs):
        data = [';'.join(bow) for bow in docs]
        vect = self.vectorizer.transform(data)
        dist = self.model.transform(vect)
        assert vect.shape[0] == dist.shape[0]

        # NOTE: if a document is empty, this method returns a zero topic-dist vector
        samples = [list(doc_topic_dist) if m.nnz > 0 else ([0.] * self.model.n_components)
                   for m, doc_topic_dist in zip(vect, dist)]
        return samples
def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2)
    def run(self):
# Use tf-idf features for NMF.
        with self.input().open('r') as f:
            data = json.loads(f.read())
            data_samples = data['data']
            print("Extracting tf-idf features for NMF...")
            tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, #max_features=n_features,
                                               stop_words='english')
            t0 = time()
            tfidf = tfidf_vectorizer.fit_transform(data_samples)
            print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
            print("Extracting tf features for LDA...")
            tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features,
                                            stop_words='english')
            t0 = time()
            tf = tf_vectorizer.fit_transform(data_samples)
            print("done in %0.3fs." % (time() - t0))

# Fit the NMF model
            print("Fitting the NMF model with tf-idf features,"
                  "n_samples=%d and n_features=%d..."
                  % (n_samples, n_features))
            t0 = time()
            nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
#exit()
            print("done in %0.3fs." % (time() - t0))

            print("\nTopics in NMF model:")
            tfidf_feature_names = tfidf_vectorizer.get_feature_names()
            tw = get_top_words(nmf, tfidf_feature_names, n_top_words)
            print(tw)
            print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..."
                  % (n_samples, n_features))
            lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                            learning_method='online', learning_offset=50.,
                                            random_state=0)
            t0 = time()
            lda.fit(tf)
            print("done in %0.3fs." % (time() - t0))

            print("\nTopics in LDA model:")
            tf_feature_names = tf_vectorizer.get_feature_names()
            tw = get_top_words(lda, tf_feature_names, n_top_words)
            with self.output().open('w') as out_f:
                out_f.write(json.dumps(tw))
Example 28
def topicExtractionLDA():
    output = open("../../result/topic_extraction", "wr")
    conn= MySQLdb.connect(host='localhost', port = 3306, user='******', passwd='wangyu', db ='vccfinder')
    cur = conn.cursor()
    sql = "select cluster from commit_cluster_600 group by cluster"
    cur.execute(sql)
    result = cur.fetchall()
    clusterids = []
    if None != result:
        for item in result:
            clusterids.append(item[0])
    print("finish get cluster ids...")
    for clusterid in clusterids:
        text = []
        sql = "select message from commits, commit_cluster_600 where commits.id = commit_cluster_600.original_id and cluster = " + str(clusterid)
        cur.execute(sql)
        result = cur.fetchall()
        print("finish get messages...")
        if None != result:
            output.writelines("\n====================start " + str(clusterid) + "====================")
            for message in result:
                setence = message[0].replace("\n", "").replace("_", " ").replace("---", " ")
                filtered_setence = ""
                words = setence.split()
                for word in words:
                    word = filter(str.isalnum, str(word))
                    if word != "":
                        filtered_setence += word + " "
                filtered_setence = filtered_setence.rstrip()
                filtered_setence += "."
                #print(filtered_setence)
                text.append(filtered_setence)
        print("finish build text array... then extracting tf features for LDA...")
        tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
        tf = tf_vectorizer.fit_transform(text)
        print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
        lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                                                    random_state=0)
        lda.fit(tf)
        tf_feature_names = tf_vectorizer.get_feature_names()
        print_top_words(lda, tf_feature_names, n_top_words, output)
    output.close()
    cur.close()
    conn.commit()
    conn.close()
Example 29
def calculate_lda_for_chinese_restaurants():
    print 'Calculating LDA...'
    n_features = 1000
    n_topics = 10
    n_top_words = 5

    t0 = time()
    chinese_reviews = get_chinese_restaurants_reviews(get_chinese_restaurants())
    print("done in %0.3fs." % (time() - t0))

    # Use tf-idf features for Non-negative matrix factorization.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(chinese_reviews)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    t0 = time()
    tf = tf_vectorizer.fit_transform(chinese_reviews)
    print("done in %0.3fs." % (time() - t0))

    # Fit LDA model for tf features
    print("Fitting LDA models with tf features, " "n_features=%d..." % n_features)
    lda = LatentDirichletAllocation(n_topics=n_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    print_top_words(lda, tf_feature_names, n_top_words)
def find_topics(df_train, df_test, n_topics):
    
    #http://scikit-learn.org/stable/auto_examples/applications/topics_extraction_with_nmf_lda.html    
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.decomposition import LatentDirichletAllocation    
    
    # Use tf (raw term count) features for LDA.
    print("Extracting character frequency features for topic modeling...")
    
    #Need to create a dtm with combined (train/test) vocabulary in columns
    n_train = df_train.shape[0]    
    df_combined = df_train.copy(deep = True).append(df_test.copy(deep = True))
    vectorizer = CountVectorizer(decode_error = 'strict', analyzer = 'char')
    corpus_combined = df_combined.loc[:,'text_read']
    dtm_combined = vectorizer.fit_transform(corpus_combined) 
    
    #split the train and test data again to ensure we only use test set for
    #supervised cross-validated learning
    dtm_train = dtm_combined[:n_train,:]
    dtm_test = dtm_combined[n_train:,:]
    
    print("Fitting LDA models with character frequency features...")
    #This requires sklearn.__version__ to be 0.17.X or greater    
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_method='batch', 
                                    random_state=0)
    #fit to the training document term matrix
    lda.fit(dtm_train)
    
    #create topic 'names' and columns in dataframe    
    topic_names = []    
    for i in range(0, n_topics):
        name = 't' + str(i+1)        
        topic_names.append(name)
        df_train.loc[:, name] = 0.0
        df_test.loc[:, name] = 0.0
    
    df_train.loc[:, topic_names] = lda.transform(dtm_train)
    df_test.loc[:, topic_names] = lda.transform(dtm_test)
    
    #normalize these topic features
    df_train = normalize_features(df_train, topic_names)    
    df_test = normalize_features(df_test, topic_names)
    
    return df_train
def extract(infile, outfile, dict_keys, stem=False, lemma=False, element="narrative", arg_rebalance=""):
    train = False
    narratives = []
    keywords = []
    
    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys == None:
        train = True

        # Set up the keys for the feature vector
        dict_keys = ["MG_ID", labelname]
        if checklist in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation", "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma","CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)
        #keywords = set([])
        #narrwords = set([])

    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)
    # Extract features
    matrix = []
    for child in root:
        features = {}

        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
            item = child.find(key)
            value = "0"
            if item != None:
                value = item.text
            if key == "AlcoholD" or key == "ApplytobaccoD":
                if value == 'N':
                    value = 9
            features[key] = value
            #print "-- value: " + value
            #if key == "MG_ID":
            #    print "extracting features from: " + value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [s.strip().translate(string.maketrans("",""), string.punctuation) for s in keyword_string.split(',')]
            # Split keyword phrases into individual words
            for word in words:
                w = word.split(' ')
                words.remove(word)
                for wx in w:
                    words.append(wx.strip().strip('–'))
            keywords.append(" ".join(words))
                
        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                if item.text != None:
                    narr_string = item.text.encode("utf-8")
                else:
                    print "warning: empty narrative"
                narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
                text = " ".join(narr_words)

                if stem:
                    narr_string = preprocessing.stem(text)
                elif lemma:
                    narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
                    #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix

    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(min_ngram,max_ngram),stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x,i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))

            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)

            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)

            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x,i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0,num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val

            # TODO: Print LDA topics
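            # (Hypothetical sketch for the TODO above; not from the original source. It
            #  reuses ldaModel, matrix_keys and num_topics from the enclosing scope.)
            #for t in range(0, num_topics):
            #    top = ldaModel.components_[t].argsort()[:-11:-1]
            #    print "lda_topic_" + str(t) + ": " + " ".join([matrix_keys[i] for i in top])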

    # WORD2VEC features
    elif narr_vec in featurenames:
        print "Warning: using word2vec features, ignoring all other features"

        # Create word2vec mapping
        word2vec, dim = load_word2vec(vecfile)

        # Convert words to vectors and add to matrix
        dict_keys.append(narr_vec)
        global max_seq_len
        max_seq_len = 200
        #if train:
            #max_seq_len = 0
        print "word2vec dim: " + str(dim)
        print "initial max_seq_len: " + str(max_seq_len)
        zero_vec = []
        for z in range(0, dim):
            zero_vec.append(0)
        for x in range(len(matrix)):
            narr = narratives[x]
            #print "narr: " + narr
            vectors = []
            vec = zero_vec
            for word in narr.split(' '):
                if len(word) > 0:
                    #if word == "didnt":
                    #    word = "didn't"
                    if word in word2vec:
                        vec = word2vec[word]
                    vectors.append(vec)
            length = len(vectors)
            if length > max_seq_len:
                #if train:
                #    max_seq_len = length
                vectors = vectors[(-1*max_seq_len):]
            (matrix[x])[narr_vec] = vectors

        # Pad the narr_vecs with 0 vectors
        print "padding vectors to reach maxlen " + str(max_seq_len)
        for x in range(len(matrix)):
            length = len(matrix[x][narr_vec])
            matrix[x]['max_seq_len'] = max_seq_len
            if length < max_seq_len:
                for k in range(0, max_seq_len-length):
                    matrix[x][narr_vec].insert(0,zero_vec) # use insert for pre-padding

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0

        sequences = []

        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr, vocab_size, hash_function='md5', filters='\t\n', lower=True, split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences, maxlen=max_seq_len, dtype='int32', padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
tf_truthful = tf_vectorizer_truthful.fit_transform(pos_truthful + neg_truthful)

tf_vectorizer_deceptive = CountVectorizer(max_df=0.95,
                                          min_df=2,
                                          stop_words='english')
tf_deceptive = tf_vectorizer_deceptive.fit_transform(pos_deceptive +
                                                     neg_deceptive)

print("Fitting LDA models with tf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
lda_neg_review = LatentDirichletAllocation(n_components=n_components,
                                           max_iter=20,
                                           learning_method='online',
                                           learning_offset=50.,
                                           random_state=0)
lda_neg_review.fit(tf_pos_review)

# print("\nTopics in LDA model of negative deceptive:")
tf_feature_names = tf_vectorizer_pos_review.get_feature_names()
top_word_list = print_top_words(lda_neg_review, tf_feature_names, n_top_words)
''' Start '''

lda_neg_deceptive = LatentDirichletAllocation(n_components=n_components,
                                              max_iter=20,
                                              learning_method='online',
                                              learning_offset=50.,
                                              random_state=0)
lda_neg_deceptive.fit(tf_pos_deceptive)

print("\nTopics in LDA model of negative deceptive:")
tf_feature_names = tf_vectorizer_pos_deceptive.get_feature_names()
# LDA (latent Dirichlet allocation) is a generative probabilistic model: it represents each
# document as a mixture of topics, and each topic as a distribution over words.

# LDA arrives at each topic as a bag of words by carrying out the three steps described below.
# Step 1: Initialize k topics and assign each word in each document to one of the k topics at random.
# Step 2: Re-assign each word to a new topic based on a) the proportion of words in that document
#    currently assigned to the topic, and b) how widespread the topic is across all documents.
# Step 3: Repeat step 2 until coherent topics result.
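# A toy illustration of the re-assignment loop in the steps above: a minimal collapsed Gibbs
# sampler, shown only to make the steps concrete. The scikit-learn estimator used below is
# fit with variational Bayes instead. `docs` is assumed to be a list of documents, each a
# list of integer word ids.
import numpy as np

def toy_gibbs_lda(docs, k, vocab_size, iters=50, alpha=0.1, beta=0.01, seed=0):
    rng = np.random.RandomState(seed)
    # Step 1: assign every word occurrence to one of the k topics at random
    z = [rng.randint(k, size=len(doc)) for doc in docs]
    ndk = np.zeros((len(docs), k))   # words in document d assigned to topic t
    nkw = np.zeros((k, vocab_size))  # corpus-wide count of word w assigned to topic t
    nk = np.zeros(k)                 # total number of words assigned to topic t
    for d, doc in enumerate(docs):
        for i, w in enumerate(doc):
            ndk[d, z[d][i]] += 1
            nkw[z[d][i], w] += 1
            nk[z[d][i]] += 1
    # Steps 2-3: repeatedly re-assign each word given all the other assignments
    for _ in range(iters):
        for d, doc in enumerate(docs):
            for i, w in enumerate(doc):
                t = z[d][i]
                ndk[d, t] -= 1
                nkw[t, w] -= 1
                nk[t] -= 1
                # a) topic share within this document  x  b) this word's share within each topic
                p = (ndk[d] + alpha) * (nkw[:, w] + beta) / (nk + vocab_size * beta)
                t = rng.choice(k, p=p / p.sum())
                z[d][i] = t
                ndk[d, t] += 1
                nkw[t, w] += 1
                nk[t] += 1
    return ndk, nkw  # document-topic and topic-word count matrices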

from sklearn.decomposition import LatentDirichletAllocation
# continuing with the 20 newsgroup dataset and 3 topics
total_topics = 3
lda = LatentDirichletAllocation(n_topics=total_topics,max_iter=100,learning_method='online',learning_offset=50.,random_state=2017)

lda.fit(X)
feature_names = np.array(vectorizer.get_feature_names())
for topic_idx, topic in enumerate(lda.components_):
	print("Topic #%d:" % topic_idx)
	print(" ".join([feature_names[i] for i in topic.argsort()[:-20 - 1:-1]]))

# Non-negative Matrix Factorization:
# NMF is a decomposition method for multivariate data, given by V ≈ WH, i.e. the matrix V is
# approximated by the product of the matrices W and H. W is a matrix of word weights for the
# features (topics), and H is the coefficient matrix with each row being a feature. All three
# matrices have no negative elements.

from sklearn.decomposition import NMF
nmf = NMF(n_components=total_topics, random_state=2017, alpha=.1, l1_ratio=.5)
nmf.fit(X)
for topic_idx, topic in enumerate(nmf.components_):
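# (The excerpt is cut off at this loop header. A plausible body, mirroring the LDA top-words
#  loop above and reusing its feature_names and 20-word cutoff, would be:)
	print("Topic #%d:" % topic_idx)
	print(" ".join([feature_names[i] for i in topic.argsort()[:-20 - 1:-1]]))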
Example 34
# Remove special characters, stopwords, twitter IDs, and hashtags.

cleanedTweets = [clean_text(tweet) for tweet in tweets]

# Train a topic (LDA) model.

lda = LatentDirichletAllocation(n_topics=5, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

vectorizer = TfidfVectorizer()
tf = vectorizer.fit_transform(cleanedTweets)
feature_names = vectorizer.get_feature_names()
lda.fit(tf)

topic_words = []

for topic in lda.components_:
	word_idx = np.argsort(topic)[::-1][0:1]
	topic_words.append([feature_names[i] for i in word_idx][0])
	
print topic_words

# Construct topic groups.

cameron = [tweet for tweet in cleanedTweets if tweet.find('cameron')>-1]
farage = [tweet for tweet in cleanedTweets if tweet.find('farage')>-1]
claim = [tweet for tweet in cleanedTweets if tweet.find('claim')>-1]
ukip = [tweet for tweet in cleanedTweets if tweet.find('ukip')>-1]
    train_content, train_tag, train_raw, test_content, test_tag, test_raw = divideData(
        rawdialogue, content, tag, 0.2)
    # build the word-document co-occurrence matrix
    vectorizer = CountVectorizer(encoding='unicode',
                                 stop_words='english',
                                 max_features=N_FEATURES)

    train_data = vectorizer.fit_transform(train_content)

    train_tag = np.array(train_tag)

    test_data = vectorizer.fit_transform(
        test_content)  # [n_samples, n_features]

    model = LDA(n_topics=N_TOPICS, max_iter=5, batch_size=128)
    model.fit(train_data)

    train_data_distr = model.transform(train_data)
    pred_tag = train_data_distr.argmax(axis=1)

    # majority vote: assign each topic the most common tag among its documents
    id2class = dict()
    for idx in range(N_TOPICS):
        idxs = np.where(pred_tag == idx)[0]
        # print Counter(train_tag[idxs])
        id2class[idx] = Counter(train_tag[idxs]).most_common(1)[0][0]
    print id2class
    doc_topic_distr = model.transform(test_data)  # [n_samples, n_topics]
    class_id = doc_topic_distr.argmax(axis=1)
    pred = [id2class[each] for each in class_id]
    pred = np.array(pred)
Example 36
    #fit LDA topic model based on tf-idf of term-document matrix
    num_features = dictionary_size
    num_topics = 8  #fixed for LDA

    #fit LDA model
    print "Fitting LDA model..."
    lda_vb = LatentDirichletAllocation(n_topics=num_topics,
                                       max_iter=10,
                                       learning_method='online',
                                       batch_size=512,
                                       random_state=0,
                                       n_jobs=1)

    tic = time()
    lda_vb.fit(A.T)  #online VB
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)
    print "LDA params"
    print lda_vb.get_params()

    print "number of EM iter: %d" % lda_vb.n_batch_iter_
    print "number of dataset sweeps: %d" % lda_vb.n_iter_

    #topic matrix W: K x V
    #components[i,j]: topic i, word j
    #note: here topics correspond to label clusters
    topics = lda_vb.components_

    f = plt.figure()
    plt.matshow(topics, cmap='gray')
Example 37
    doc = doc.lower()
    doc_cleaned = ' '.join(
        lemmatizer.lemmatize(word) for word in doc.split()
        if word.isalpha() and word not in all_names)
    data_cleaned.append(doc_cleaned)

from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer(stop_words="english",
                               max_features=None,
                               max_df=0.5,
                               min_df=2)

data = count_vector.fit_transform(data_cleaned)

from sklearn.decomposition import LatentDirichletAllocation

t = 20
lda = LatentDirichletAllocation(n_components=t,
                                learning_method='batch',
                                random_state=42)

lda.fit(data)

print(lda.components_)

terms = count_vector.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic {}:".format(topic_idx))
    print(" ".join([terms[i] for i in topic.argsort()[-10:]]))
                                   analyzer='char',
                                   stop_words=None,
                                   max_df=0.999)
cv = count_vectorizer.fit_transform(docs)
k = cv.todense()
# lda.fit(cv)

# change alpha
y = list()
x = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for i in range(10):
    lda = LatentDirichletAllocation(n_components=num_topics,
                                    doc_topic_prior=alpha,
                                    topic_word_prior=float((i + 1) * 5),
                                    learning_method='online')
    lda.fit(cv)
    word_dist, topics = print_top_words(lda, word_types, 20)
    word_sampling = 0
    for j in range(3):
        arr = word_dist[j]
        word_sampling = word_sampling + entropy(arr)
    print word_sampling / 3.0
    y.append(word_sampling / 3.0)
    # y.append(entropy(topics))

plt.plot(x, y)
plt.xlabel("Alpha")
plt.ylabel("Entropy")
plt.title("Entropy of Topic Distribution")
plt.show()
# true topic distribution
def build_dimensionality_reduction_model(data,
                                         model_type,
                                         cross_validate=False,
                                         num_iters=10):
    '''
    This function fits a dimensionality reduction model (of the type model_type) to the given features
    
    input:
        training_set: the set of features of the data from which to build our model
        model_type: the scikit-learn model of choice given by user input parameter
        
    output:
        trained model fit to the features of the data
    '''

    # create a model variable
    model = None

    if model_type == 'latent dirichlet allocation':
        # instantiate model with default hyperparameter settings
        model = LatentDirichletAllocation(n_topics=5)

        if cross_validate:
            # create parameter distributions
            param_distro = {}

            # create random grid search object
            model = RandomizedSearchCV(estimator=model,
                                       param_distributions=param_distro,
                                       n_iter=num_iters,
                                       n_jobs=-1,
                                       verbose=True)

            print '\n', '... performing cross-validation', '\n'

            # cross-validate the model
            model.fit(data)
        else:
            # fit the vanilla model to the data
            model.fit(data)

    elif model_type == 'non-negative matrix factorization':
        # instantiate model with default hyperparameter settings
        model = NMF(n_components=5)

        if cross_validate:
            # create parameter distributions
            param_distro = {}

            # create random grid search object
            model = RandomizedSearchCV(estimator=model,
                                       param_distributions=param_distro,
                                       n_iter=num_iters,
                                       n_jobs=-1,
                                       verbose=True)

            print '\n', '... performing cross-validation', '\n'

            # cross-validate the model
            model.fit(data)
        else:
            # fit the vanilla model to the data
            model.fit(data)

    else:
        raise NotImplementedError

    # return the fitted / cross-validated model
    return model
Example 40
others.append("al")
my_stop_words = text.ENGLISH_STOP_WORDS.union(others)
#print(others)
#print(my_stop_words)

count_vectorizer = CountVectorizer(stop_words=my_stop_words)
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(texts)
# Visualise the 10 most common words
#count_data = crossRef(count_data, men)
#plot_10_most_common_words(count_data, count_vectorizer)


# Tweak the two parameters below
number_topics = 11
number_words = 10
# Create and fit the LDA model
lda = LDA(n_components=number_topics, n_jobs=-1)
lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)



LDAvis_data_filepath = os.path.join('./ldavis_prepared_'+str(number_topics))
# # this is a bit time consuming - make the if statement True
# # if you want to execute visualization prep yourself
if 1 == 1:
    LDAvis_prepared = sklearn_lda.prepare(lda, count_data, count_vectorizer)
    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
# load the pre-prepared pyLDAvis data from disk
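# A minimal sketch of that loading step (assumed, not part of the original source):
# read the pickled prepared data back and write the interactive pyLDAvis
# visualisation to an HTML file next to it.
import pickle
import pyLDAvis

with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')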
Esempio n. 41
0
        'Topic_ID': topic_id_list,
        'Topics': feature_names_list
    })
    return topic_df


for i in range(5, 11):
    comments_file = 'data/reddit_tldr/tldr_comments_cleaned_{0}.txt'.format(i)
    model_file = 'data/reddit_tldr/topic_models/tldr_{0}_topics.csv'.format(i)

    with open(comments_file, 'r') as f:
        documents = f.readlines()
    no_features = 1000
    no_topics = 10
    no_top_words = 10

    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    max_features=no_features,
                                    stop_words='english',
                                    min_df=2)
    tf = tf_vectorizer.fit_transform(documents)
    tf_feature_names = tf_vectorizer.get_feature_names()

    lda = LatentDirichletAllocation(n_components=no_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=1)
    lda_new = lda.fit(tf)
    lda_topic_df = display_relevant_topics(lda_new, tf_feature_names,
                                           no_top_words)
    lda_topic_df.to_csv(model_file)
    for index, row in df1.iterrows():
        # build_WordCloud(row['reviews'],row['brand'],row['asin'])
        diction = calculate_word_frequency(row['reviews'])
        s = row['brand'] + row['asin']
        freq_list = {s: diction}
        fp.write(json.dumps(freq_list) + "\n")

print("Execution Time: ", time.clock()-start)


def do_lda(reviews_string):
    vectorizer = CountVectorizer(stop_words='english', lowercase=True, token_pattern=r'\s\w+\s', max_df=0.8)
    vectorized_data = vectorizer.fit_transform(reviews_string)
    lda = LatentDirichletAllocation(n_components=4, max_iter=15)
    lda.fit(vectorized_data)
    components = lda.components_.T
    features = vectorizer.get_feature_names()
    labels = {0: [], 1: [], 2: [], 3: []}
    stop_words = set(stopwords.words('english'))
    for i in range(len(features)):
        label = np.argmax(components[i])
        word = features[i].lower().strip()
        if word not in stop_words:
            labels[label].append(word)
    return labels


def get_frequency_table(documents):
    giant_document = " ".join(documents)
    all_words = giant_document.split()
Esempio n. 43
0
variety_dict = Counter(wine_df['variety'])
most_common = [t[0] for t in variety_dict.most_common(20)]

# vectrize
vect = CountVectorizer(stop_words='english', lowercase=True, min_df=10)
#vect = CountVectorizer(tokenizer = my_tokenizer)
counter = vect.fit_transform(wine_df['description'])

transf = TfidfTransformer(norm='l2',
                          use_idf=True,
                          smooth_idf=True,
                          sublinear_tf=False)
# TfidfTransformer takes the CountVectorizer output and computes the tf-idf
tf_idf = transf.fit_transform(counter)

lda = LatentDirichletAllocation(n_components=20, random_state=0)
lda.fit(counter)
lda.transform(counter)
tf_feature_name = vect.get_feature_names()


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join(
            [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()


print_top_words(lda, tf_feature_name, 10)
class LDAModel(BenchmarkModel):
    def __init__(self,
                 n_components,
                 max_features,
                 max_df,
                 min_df,
                 learning_method="batch",
                 learning_decay=0.7,
                 cores=1,
                 epochs=10):
        super().__init__()
        self.n_components = n_components
        self.cores = cores
        self.epochs = epochs
        self.max_features = max_features
        self.max_df = max_df
        self.min_df = min_df
        self.learning_method = learning_method
        self.learning_decay = learning_decay

    def build_model(self):
        super().build_model()
        self.model = LatentDirichletAllocation(
            n_components=self.n_components,
            learning_method=self.learning_method,
            learning_decay=self.learning_decay,
            n_jobs=self.cores,
            max_iter=self.epochs)
        self.count_vectorizer = CountVectorizer(max_features=self.max_features,
                                                max_df=self.max_df,
                                                min_df=self.min_df,
                                                stop_words='english')

    def train(self, x, y=None):
        logging.info("Building vocabulary on " + self.__class__.__name__)
        t0 = time.time()
        processed_dataset = process_dataset(x)
        processed_dataset = processed_dataset.map(
            lambda x: ' '.join(word for word in x))
        doc_term_matrix = self.count_vectorizer.fit_transform(
            processed_dataset.values.astype('U'))
        self.model.fit(doc_term_matrix)
        elapsed = (time.time() - t0)
        logging.info("Done in %.3fsec" % elapsed)

    def preprocess_data(self, dataset, y_dataset):
        logging.info("Transform data on " + self.__class__.__name__)
        processed_dataset = process_dataset(dataset)
        processed_dataset = processed_dataset.map(
            lambda x: ' '.join(word for word in x))
        doc_term_matrix = self.count_vectorizer.transform(
            processed_dataset.values.astype('U'))
        return self.model.transform(doc_term_matrix)

    def save(self, path):
        logging.info("Saving " + self.__class__.__name__)
        combined_path = os.path.join(path, self.__class__.__name__)
        pickle.dump(self.clf, open(combined_path + "_clf.pickle", 'wb'))
        pickle.dump(self.model, open(combined_path + "_model.pickle", 'wb'))
        pickle.dump(self.count_vectorizer.vocabulary_,
                    open(combined_path + "_vec.pickle", 'wb'))

    def load(self, path):
        logging.info("Loading " + self.__class__.__name__)
        combined_path = os.path.join(path, self.__class__.__name__)
        self.clf = pickle.load(open(combined_path + "_clf.pickle", 'rb'))
        self.model = pickle.load(open(combined_path + "_model.pickle", 'rb'))
        self.count_vectorizer = CountVectorizer(
            vocabulary=pickle.load(open(combined_path + "_vec.pickle", 'rb')))

    def can_load(self, path):
        combined_path = os.path.join(path, self.__class__.__name__)
        return os.path.isfile(combined_path + "_clf.pickle") and \
        os.path.isfile(combined_path + "_model.pickle") and \
        os.path.isfile(combined_path + "_vec.pickle")
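# Hypothetical usage of the LDAModel wrapper above; `docs` is a placeholder for the
# project's text Series, and process_dataset()/BenchmarkModel come from the surrounding
# code base. Note that save() also expects self.clf, which is assumed to be provided by
# the BenchmarkModel base class.
model = LDAModel(n_components=10, max_features=5000, max_df=0.95, min_df=2,
                 cores=-1, epochs=10)
model.build_model()
model.train(docs)                                 # fit CountVectorizer + LDA
doc_topics = model.preprocess_data(docs, None)    # document-topic matrix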
Esempio n. 45
0
    #print model.components_.shape
    #print model.components_


#doc_topic_prior=[0.001, 0.01, 0.05, 0.1, 0.2,0.5]
#topic_word_prior=[0.001, 0.01, 0.05, 0.1, 0.2,0.5]
#topics=[50,100,500,1000]
#iters=[50,100,500,1000]
#plex=[]
lda = LatentDirichletAllocation(n_components=100,
                                max_iter=100,
                                learning_method='batch',
                                doc_topic_prior=0.5,
                                topic_word_prior=0.2)
lda_begin_time = time.time()
lda.fit(X_tfidf_train)
lda_end_time = time.time()
print "LDA training time:%fs" % (lda_end_time - lda_begin_time)
X_tfidf_train = lda.transform(X_tfidf_train)
X_tfidf_test = lda.transform(X_tfidf_test)
X_train = np.concatenate((X_dcr_train, X_tfidf_train), axis=1)
#km=KMeans(n_clusters=30)
#km_begin_time=time.time()
#km.fit(X_train)
#km_end_time=time.time()
#print "KMeans training time:%fs" % (km_end_time-km_begin_time)
#print calinski_harabaz_score(X_train,km.labels_)
#print km.labels_
#ms=MeanShift()
#ms_begin_time=time.time()
#ms.fit(X_train)
Esempio n. 46
0
def compute_lda_model(input_dir,
                      output_file,
                      n_topics=500,
                      format="corenlp",
                      extension="xml",
                      use_lemmas=False,
                      stemmer="porter",
                      language="english"):
    """ Compute a LDA model from a collection of documents. Latent Dirichlet
        Allocation is computed using sklearn module.

        Args:
            input_dir (str): the input directory.
            output_file (str): the output file.
            n_topics (int): number of topics for the LDA model, defaults to 500.
            format (str): the input files format, defaults to corenlp.
            extension (str): file extension for input documents, defaults to
                xml.
            use_lemmas (bool): whether lemmas from stanford corenlp are used
                instead of stems (computed by nltk), defaults to False.
            stemmer (str): the stemmer in nltk to be used (if used), defaults
                to porter.
            language (str): the language of the documents, used for stop_words
                in sklearn CountVectorizer, defaults to 'english'.
    """

    # texts container
    texts = []

    # loop through the documents
    for input_file in glob.glob(input_dir + '/*.' + extension):

        # initialize load file object
        doc = LoadFile(input_file)

        # read the input file
        doc.read_document(format=format,
                          use_lemmas=use_lemmas,
                          stemmer=stemmer,
                          sep='/')

        # container for current document
        text = []

        # loop through sentences
        for sentence in doc.sentences:

            # get the tokens (stems) from the sentence if they are not
            # punctuation marks
            text.extend([ sentence.stems[i] for i in range(sentence.length) \
                          if not re.search('[^A-Z$]', sentence.pos[i]) ])

        # add the document to the texts container
        texts.append(' '.join(text))

    # vectorize dataset
    # get the stoplist from nltk because CountVectorizer only contains english
    # stopwords atm
    tf_vectorizer = CountVectorizer(stop_words=stopwords.words(language))
    tf = tf_vectorizer.fit_transform(texts)

    # extract vocabulary
    vocabulary = tf_vectorizer.get_feature_names()

    # create LDA model and train
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=0,
                                          learning_method='batch')
    lda_model.fit(tf)

    # save all data necessary for later prediction
    saved_model = (vocabulary, lda_model.components_,
                   lda_model.exp_dirichlet_component_,
                   lda_model.doc_topic_prior_)

    # Dump the df container
    logging.info('writing LDA model to ' + output_file)
    with gzip.open(output_file, 'wb') as fp:
        pickle.dump(saved_model, fp)
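# Sketch of how the dumped model could be read back later for prediction (mirrors the
# tuple saved above; the file name is a placeholder):
import gzip
import pickle

with gzip.open('lda-model.pickle.gz', 'rb') as fp:
    vocabulary, topic_word, exp_dirichlet_component, doc_topic_prior = pickle.load(fp)
print(len(vocabulary), topic_word.shape)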
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, plot_confusion_matrix

#create the document term matrix
vectorizer = CountVectorizer(max_df=0.8, min_df=4, stop_words='english')
doc_term_matrix = vectorizer.fit_transform(
    tweets_data['tweettext'].values.astype('U'))

#Generate the LDA model with 4 topics. Use random seed 35.
LDA = LatentDirichletAllocation(n_components=4, random_state=35)
LDA.fit(doc_term_matrix)
#Retrieve words in the first topic, sort the indexes according to probability using argsort()
first_topic = LDA.components_[0]
top_topic_words = first_topic.argsort()[-10:]

for i in top_topic_words:
    print(vectorizer.get_feature_names()[i])

#top 10 words for each topic
for i, topic in enumerate(LDA.components_):
    print(f'Top 10 words for topic #{i}:')
    print([vectorizer.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print('\n')
#1. Add a new column to the dataframe containing the LDA topic number
topic_values = LDA.transform(doc_term_matrix)
topic_values.shape
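# The column assignment the comment above describes (the column name 'topic' is an
# assumption; any name would do): keep the highest-probability topic per tweet.
tweets_data['topic'] = topic_values.argmax(axis=1)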
Esempio n. 48
0
                    configs.append(config)


# ** running

for cfg in configs:
    for itr in range(1,5):

        t1 = time.time()
        ldax = LatentDirichletAllocation(n_jobs = 3,
                                         n_components=cfg['n_components'], 
                                         max_iter = cfg['max_iter'],
                                         doc_topic_prior = cfg['doc_topic_prior'],
                                         topic_word_prior = cfg['topic_word_prior']
        )
        ldax.fit(mat_dict[cfg['mat']])
        t2 = time.time()
        print(cfg, t2-t1)
    
        flnm = cfg['mat'] + "_iter_" + str(cfg['max_iter']) + "_alpha_" + str(cfg['doc_topic_prior']) + "_beta_" + str(cfg['topic_word_prior']) + '_ncomp' + str(cfg['n_components']) + '_it_' + str(itr)

        dump(ldax, diag_dir + flnm)

# mat_edge_smpl: one iterations takes 1.22 secs
# mat_song_smpl: one iteration takes 3.6 secs
# mat_cutofs: 3.9 secs
# on average 2.9 secs
# on average 35 iterations
# total of 135*35 = 4725 iterations
# 4725*2.9 = 13702.5 secs = 3.8 hours
Esempio n. 49
0
        else:
            categories[document["category"]] = k
            docToLabel[str(document['_id'])] = k
            k = k + 1

    labels = np.array(list(docToLabel.values()))

    #instantiate CountVectorizer()
    cv = CountVectorizer(stop_words='english')

    # this steps generates word counts for the words in your docs
    word_count_vector = cv.fit_transform(docs)

    # Create and fit the LDA model
    lda = LDA(n_components=6, n_jobs=-1)
    lda.fit(word_count_vector)  # Print the topics found by the LDA model
    # print("Topics found via LDA:")
    # print_topics(lda, cv, 10)

    documentTopicDistr = lda.transform(word_count_vector)
    documentTopicDistr = np.array(documentTopicDistr)

    lda_labels = np.argmax(documentTopicDistr, axis=1)

    print("Metrici LDA " + stemmer)

    print(metrics.homogeneity_score(labels, lda_labels))
    print(metrics.completeness_score(labels, lda_labels))
    print(metrics.v_measure_score(labels, lda_labels))
    print(metrics.adjusted_rand_score(labels, lda_labels))
    print(metrics.adjusted_mutual_info_score(labels, lda_labels))
Esempio n. 50
0
    print("preprocessing data")
    df = utils.preprocess_data(df, analyzer, tt)
    df.to_csv("data/tesi_US_preprocessed.csv", index=None)
else:
    print("loading preprocessed data")
    df = pd.read_csv("data/tesi_US_preprocessed.csv")

print("training vectorizer")
TDmat = cv.fit_transform(df['preprocessed'])
joblib.dump(cv, "models/cv_{}.pkl".format(n_features))

if isinstance(n_topics, list):
    topic_numbers = n_topics
else:
    topic_numbers = [n_topics]

for num in topic_numbers:
    lda = LatentDirichletAllocation(n_components=num,
                                    max_iter=12,
                                    learning_method='online',
                                    learning_offset=30.,
                                    random_state=0,
                                    n_jobs=6)
    print("training lda with {} topics".format(num))
    lda.fit(cv.transform(df['preprocessed']))
    utils.print_top_words(lda, cv.get_feature_names(), n_top_words)

    joblib.dump(lda, "models/lda_{}_{}.pkl".format(num, n_features))
    utils.visualize_lda(lda, TDmat, cv, True,
                        "html/lda_{}_{}.html".format(num, n_features))
Esempio n. 51
0
        out = out + xy
        #print(out)
    return out


cntVect = CountVectorizer(stop_words=stop_word_list)
cntTf = cntVect.fit_transform(word_cut)

list_numb_topics = [10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30]
list_perplexity = []
for topic_numb in list_numb_topics:
    #topic_numb = 18
    lda = LatentDirichletAllocation(n_components=topic_numb,
                                    max_iter=1000,
                                    learning_method='batch')
    lda.fit(cntTf)

    dic = {}
    out = ""

    n_top_words = 20
    tf_features_names = cntVect.get_feature_names()
    out = foo(lda, tf_features_names, n_top_words, out)
    # print(out)
    wo = r'C:\Users\admin\Desktop\file\n_topic_numb= %d.txt' % topic_numb
    file = open(wo, 'w')
    file.write(out)
    # print(dic)
    doc_topic_dist = lda.transform(cntTf)
    doc_topic_dist = pd.DataFrame(
        doc_topic_dist, columns=['topic_#%d' % i for i in range(topic_numb)])
        listOfCoords = []
        aux = l.split(',')
        for dim in range(len(aux)):
            listOfCoords.append(float(aux[dim]))
        # normalize values
        normalizedListOfCoords = [
            (x - min(listOfCoords)) / (max(listOfCoords) - min(listOfCoords))
            for x in listOfCoords
        ]
        dataset.append(normalizedListOfCoords)

    X = np.array(dataset)

    # Create and fit the LDA model
    lda = LDA(n_components=6, n_jobs=-1)
    lda.fit(X)  # Print the topics found by the LDA model
    # print("Topics found via LDA:")
    # print_topics(lda, cv, 10)

    documentTopicDistr = lda.transform(X)
    documentTopicDistr = np.array(documentTopicDistr)

    lda_labels = np.argmax(documentTopicDistr, axis=1)

    print("Metrici LDA " + file)

    print(metrics.homogeneity_score(labels, lda_labels))
    print(metrics.completeness_score(labels, lda_labels))
    print(metrics.v_measure_score(labels, lda_labels))
    print(metrics.adjusted_rand_score(labels, lda_labels))
    print(metrics.adjusted_mutual_info_score(labels, lda_labels))
# Perform LDA.
from sklearn.decomposition import LatentDirichletAllocation, NMF

# In[28]:

# n_components - number of topics returned.
LDA = LatentDirichletAllocation(n_components=3, random_state=42)

# In[39]:

nmf = NMF(n_components=8, random_state=42)

# In[29]:

# Fit LDA to document term matrix.
LDA.fit(dtm)

# In[48]:

nmf.fit(dtm_ifidf)

# In[30]:

# Grab the vocabulary of words.
import random

random_word_id = random.randint(0, 6924)

cv.get_feature_names()[random_word_id]

# In[31]:
Esempio n. 54
0
df = pd.read_csv('articles.csv', parse_dates=['post_published'])
text = df['processed_text'].values.tolist()
max_features = 5000
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=3,
                                max_features=max_features,
                                stop_words='english')
tf = tf_vectorizer.fit_transform(text)
print("ready")

n_topics = 18
lda_model = LatentDirichletAllocation(n_components=n_topics, max_iter=5,
                                      learning_method='online',
                                      learning_offset=50.,
                                      random_state=0)

lda_model.fit(tf)
# pyLDAvis.enable_notebook()
# pyLDAvis.sklearn.prepare(lda_model,tf, tf_vectorizer, R=20, mds='tsne')

## get the token to topic matrix
word_topic = np.zeros((max_features,n_topics),)
print(n_topics)
lda_model.components_
for topic_idx, topic in enumerate(lda_model.components_):
    word_topic[:,topic_idx] = topic

print("token-topic matrix",word_topic.shape)

## create a matrix of the top words used to define each topic
top_words = 20
tf_feature_names = np.array(tf_vectorizer.get_feature_names())
def run_lda(n_samples, n_features, n_components, n_top_words):
    texts = []
    res = elastic_utils.iterate_search(
        index_name=cfg.twitter_credentials['topic'])
    for i in res:
        texts.append(i['_source']['text'])

    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                       min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    t0 = time()
    tfidf = tfidf_vectorizer.fit_transform(texts)
    print("done in %0.3fs." % (time() - t0))

    # Use tf (raw term count) features for LDA.
    print("Extracting tf features for LDA...")
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=n_features,
                                    stop_words='english')
    t0 = time()
    tf = tf_vectorizer.fit_transform(texts)
    print("done in %0.3fs." % (time() - t0))
    print()

    # Fit the NMF model
    print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
              l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (Frobenius norm):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    # Fit the NMF model
    print(
        "Fitting the NMF model (generalized Kullback-Leibler divergence) with "
        "tf-idf features, n_samples=%d and n_features=%d..." %
        (n_samples, n_features))
    t0 = time()
    nmf = NMF(n_components=n_components,
              random_state=1,
              beta_loss='kullback-leibler',
              solver='mu',
              max_iter=1000,
              alpha=.1,
              l1_ratio=.5).fit(tfidf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in NMF model (generalized Kullback-Leibler divergence):")
    tfidf_feature_names = tfidf_vectorizer.get_feature_names()
    print_top_words(nmf, tfidf_feature_names, n_top_words)

    print("Fitting LDA models with tf features, "
          "n_samples=%d and n_features=%d..." % (n_samples, n_features))
    lda = LatentDirichletAllocation(n_components=n_components,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0)
    t0 = time()
    lda.fit(tf)
    print("done in %0.3fs." % (time() - t0))

    print("\nTopics in LDA model:")
    tf_feature_names = tf_vectorizer.get_feature_names()
    categories = print_top_words(lda, tf_feature_names, n_top_words)
    predict = lda.transform(tf)
    result = {"predictions": predict, "text": texts, "categories": categories}
    return result
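# Hypothetical call of run_lda() above; the argument values are illustrative and assume
# the Elasticsearch index configured in cfg is populated with tweets.
result = run_lda(n_samples=2000, n_features=1000, n_components=10, n_top_words=20)
print(result['predictions'].shape)   # (n_documents, n_components)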
Esempio n. 56
0
                    total_topics=2,
                    num_terms=5,
                    display_weights=True)

from sklearn.decomposition import LatentDirichletAllocation

norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                feature_type='tfidf')
total_topics = 2
lda = LatentDirichletAllocation(n_topics=total_topics,
                                max_iter=1000,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=42)
lda.fit(tfidf_matrix)

feature_names = vectorizer.get_feature_names()
weights = lda.components_

topics = get_topics_terms_weights(weights, feature_names)
print_topics_udf(topics=topics,
                 total_topics=total_topics,
                 num_terms=8,
                 display_weights=True)

from sklearn.decomposition import NMF

norm_corpus = normalize_corpus(toy_corpus)
vectorizer, tfidf_matrix = build_feature_matrix(norm_corpus,
                                                feature_type='tfidf')
Esempio n. 57
0
class TextTopics():
    """
    Text classifier.
    """
    def __init__(self,
                 df: pd.DataFrame,
                 number_topics=50,
                 instance_path=instance_path(),
                 **kwargs):
        self._instance_path = instance_path
        self.number_topics = number_topics
        self.stop_words: List = get_stop_words("fi")
        self._count_vector: CountVectorizer = None
        self._lda: LDA = None
        self.token_cache = {}
        self._tokenizer = None
        self.min_sentence_length = 17

        # `kk` is used in association with time periods.
        self.stop_words += ["kk"]

        self.init(df, **kwargs)

    def init(self, df: pd.DataFrame, generate_visualization=False, lang="fi"):
        """
        :param df: :class:`~pandas.DataFrame` containing text columns
        :param generate_visualization: Generate visualization of LDA results. Slows down
                                       generation notably.
        :param lang: Language for :class:`~Voikko`
        """
        if self._count_vector and self._lda:
            return True

        file_words = self.instance_path() / "word.dat"
        file_lda = self.instance_path() / "lda.dat"
        file_ldavis = self.instance_path() / "ldavis.html"

        try:
            # Try loading saved lda files.
            self._count_vector = joblib.load(file_words)
            self._lda = joblib.load(file_lda)
        except FileNotFoundError as e:
            logger.exception(e)

            texts = [x for x in df.to_numpy().flatten() if x is not np.NaN]

            # Setup word count vector
            self._count_vector = CountVectorizer(
                tokenizer=self.text_tokenize,
                stop_words=self.stop_words
            )
            count_data = self._count_vector.fit_transform(texts)

            self._lda = LDA(n_components=self.number_topics, n_jobs=-1)
            self._lda.fit(count_data)

            if generate_visualization:
                logger.debug("Generating LDA visualization. This might take a while")
                from pyLDAvis import sklearn as sklearn_lda
                import pyLDAvis

                LDAvis_prepared = sklearn_lda.prepare(self._lda, count_data, self._count_vector)
                pyLDAvis.save_html(LDAvis_prepared, str(file_ldavis))

            joblib.dump(self._count_vector, file_words)
            joblib.dump(self._lda, file_lda)

    def instance_path(self):
        path = self._instance_path / "lda" / str(self.number_topics)
        path.mkdir(exist_ok=True, parents=True)
        return path

    def tokenizer(self):
        if not self._tokenizer:
            self._tokenizer = VoikkoTokenizer("fi")
        return self._tokenizer

    @cached(LRUCache(maxsize=1024))
    def text_tokenize(self, text):
        """ Cached wrapper for `VoikkoTokenizer.tokenize()` """
        return self.tokenizer().tokenize(text)

    def find_talkingpoint(self, candidate: pd.Series) -> str:
        """ Find most suitable sentence from text """
        texts = tuple(candidate.dropna())
        if len(texts) == 0:
            return None

        x = self._get_topics(texts)
        return self.nearest_sentence(x[1], texts)

    def nearest_sentence(self, topics: List[float], texts: List[str]) -> str:
        """
        Find the sentence closest to the given topic distribution.

        TODO: When joining multiple sentences, check that they come from the same paragraph.
        """
        @cached(LFUCache(maxsize=128))
        def lda(sentences):
            count_data = self._count_vector.transform(sentences)
            _lda = self._lda.transform(count_data)
            return _lda

        # Tokenize into sentences.
        sentences = chain(*[re.findall(r"\s*(.+?[\.!?])+", b, re.MULTILINE + re.DOTALL) for b in texts if b.strip() != ""])

        # cleanup sentences.
        sentences = tuple(set(filter(lambda x: len(x) > self.min_sentence_length, map(str.strip, sentences))))
        if len(sentences) == 0:
            return None

        # Find most topical sentence.
        tl_dr = []
        distance = 1.
        prev_sentence = ""
        for current_sentence, m in zip(sentences, lda(sentences)):
            _distance = np.abs(np.mean(topics - m))
            if _distance < distance:
                tl_dr, distance = ([prev_sentence, current_sentence], _distance)
            
            # Previous sentence is to provide context to most suitable sentence.
            prev_sentence = current_sentence

        return " ".join(filter(None, tl_dr))

    def compare_series(self, source: pd.Series, target: pd.Series):
        """
        Compare two text sets.

        The `source` entry of the result holds a topic term that distinguishes
        :param:`source` from :param:`target`; the `target` entry holds one that
        distinguishes :param:`target` from :param:`source`.

        Note: this result is not cached. Use :method:`compare_rows()` where possible.
        """
        # Convert them into tuples, so they can be cached.
        _source = tuple(source.dropna())
        _target = tuple(target.dropna())

        return self.compare_count_data(
            *self._get_topics(_source),
            *self._get_topics(_target)
        )

    def compare_rows(self, df: pd.DataFrame, i, l):
        x = self.row_topics(df, i)
        y = self.row_topics(df, l)
        if not x or not y:
            return None

        r = self.compare_count_data(*x, *y)
        return r

    def row_topics(self, df: pd.DataFrame, idx):
        """ Return suitable topics from dataset `df` row :param:`idx` """
        x = tuple(df.loc[idx].dropna())
        if len(x) == 0:
            return None

        return self._get_topics(x)

    @cached(LRUCache(maxsize=512))
    def _get_topics(self, source: List) -> Tuple:

        count_data = self._count_vector.transform(source)
        return (count_data, self._lda.transform(count_data).mean(axis=0))

    def compare_count_data(self, counts_data_source, topics_source, counts_data_target, topics_target) -> Tuple[Tuple[str, int], Tuple[str, int]]:
        diffs = topics_source - topics_target

        topic_max = np.argmax(diffs)
        topic_min = np.argmin(diffs)

        source_words = self.suggest_topic_word(counts_data_source, counts_data_target, topic_max)
        target_words = self.suggest_topic_word(counts_data_target, counts_data_source, topic_min)

        word_for_source = self.suitable_topic_word(source_words) if len(source_words) else None
        word_for_target = self.suitable_topic_word(target_words) if len(target_words) else None

        return TopicComparision(
            source=Topic(id=topic_max, term=word_for_source),
            target=Topic(id=topic_min, term=word_for_target)
        )

    def suggest_topic_word(self, A, B, topic_id: int) -> List[Tuple[int, float]]:
        """ Find relevant word for topic.

        Copares :param:`A` and :param:`B` words, and topic words to find
        suitable word with enough difference between `A` and `B`.

        :param A: :class:`csr_matrix` Target to find word for.
        :param B: :class:`csr_matrix` Comparative target for `A`
        :param topic_id: lda topic id number.

        :return: List of tuples in prominen order.
                 First instance in tuple is word vector feature number, and second is prominence value.
        """
        # Generate sum of used words
        a_sum = A.toarray().sum(0)
        b_sum = B.toarray().sum(0)

        # Topic word weights, normalised across topics, so words specific to this topic score higher.
        λ = self._lda.components_[topic_id] / self._lda.components_.sum(0)

        # Down-weight words from A that B has used too
        # (a count difference, so nothing is actually removed).
        complement = a_sum - b_sum

        # Use a logarithm, so topic words are preferred.
        prominence = np.log(complement) * λ

        # Generate list of words, ordered by prominence; drop zero and non-finite
        # entries (np.log yields NaN/-inf for words B used at least as often as A).
        r = sorted([(i, prominence[i]) for i in prominence.argsort()
                    if np.isfinite(prominence[i]) and prominence[i] != 0],
                   key=lambda x: x[1], reverse=True)
        return r

    # sequence list is too volatile to be cached.
    def suitable_topic_word(self, seq: List[Tuple[int, float]]) -> str:
        """
        Find the first suitable word from the :param:`seq` list.

        :param seq: sequence of (feature index, prominence) tuples; only the first
                    element of each tuple is interpreted as the feature number.
        """
        vector_words = self.vector_words()
        """ Find first suitable word from word list """
        for r in seq:
            word = vector_words[r[0]]
            if self._suitable_topic_word(word):
                return word
        return None

    @cached(LFUCache(maxsize=512))
    def _suitable_topic_word(self, word) -> bool:
        """
        Check whether a word can be used as a topic word.

        Accepted word classes:
        :nimi:      Names; words like `Linux`, `Microsoft`, `Kokoomus`
        :nimisana:  Nouns; words like `ihminen`, `maahanmuutto`, `koulutus`, `Kokoomus`
        :laatusana: Adjectives; words like `maksuton`
        :nimisana_laatusana: Words that act as both noun and adjective, like `rohkea` or `liberaali`
        :lyhenne:   Abbreviations; words like `EU`
        :paikannimi: Geographical locations, like `Helsinki`
        :sukunimi:  Last names, like `Kekkonen`
        """

        for morph in self.tokenizer().analyze(word):
            _class = morph.get("CLASS")
            if _class in ["nimi", "nimisana", "nimisana_laatusana", "lyhenne", "paikannimi", "sukunimi"]:
                return True
            else:
                logger.debug("Unsuitable word class %s for word %s", _class, word)

        return False

    def vector_words(self) -> List:
        """ Feature names in CountVector """
        return self._count_vector.get_feature_names()
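# Hypothetical use of the TextTopics classifier above; `df` is assumed to be a DataFrame
# of free-text answer columns (Finnish), and libvoikko must be available for
# VoikkoTokenizer.
topics = TextTopics(df, number_topics=50)
summary = topics.find_talkingpoint(df.iloc[0])   # most topical sentence(s) of one row
comparison = topics.compare_rows(df, 0, 1)       # topic terms separating two rows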
class LDA(Model):
    def __init__(self, reader, dataset='', topics=50, max_iter=20):

        self.lda = LatentDirichletAllocation(n_components=topics,
                                             max_iter=max_iter,
                                             learning_method='online',
                                             learning_offset=50.,
                                             random_state=0)
        self.reader = reader
        self.dataset = dataset
        self.n_topics = topics

        self.train = np.array([x["doc_tm"] for x in self.reader.train])
        self.valid = np.array([x["doc_tm"] for x in self.reader.valid])
        self.test = np.array([x["doc_tm"] for x in self.reader.test])

        self.lda.fit(self.train)

    def show_topics(self, model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            print(message)
        print()

    def save_topic_distribution(self, save_path, n_top_words):
        model = self.lda
        feature_names = self.reader.idx2word
        str = ""

        for topic_idx, topic in enumerate(model.components_):
            message = "Topic #%d: " % topic_idx
            message += " ".join([
                feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]
            ])
            str += message + "\n"
        str += "\n"
        print(str)
        with open(save_path, "a") as f:
            f.write(str)

        print("saved to", save_path)

    def LDA_recall(self,
                   testset,
                   n_samples=1000,
                   n_recall=3,
                   print_output=True):
        x = testset[:n_samples]
        t = self.lda.transform(x)
        x_hat = np.matmul(t, self.lda.components_)

        recall_tot = []
        for i in range(np.shape(x)[0]):
            x_temp = x[i, :]
            x_hat_temp = x_hat[i, :]
            recall_tot.append(self.recall(x_temp, x_hat_temp, n_recall))

        output = np.sum(recall_tot) / len(recall_tot)
        if print_output:
            print("recall", n_recall, "over", n_samples, ":", output)
        return output

    def get_topic_distribution(self, x):
        return self.lda.transform(x)

    def perplexity(self, testset, n_samples=100, print_errors=False):
        # Topic distribution x word distribution over topics (normalization over components is required)
        n_samples_real = n_samples
        x_hats = np.matmul(self.lda.transform(testset[:n_samples]),
                           (self.lda.components_ /
                            self.lda.components_.sum(axis=1)[:, np.newaxis]))
        perplexities = []

        for i in range(n_samples):
            idxs = np.where(testset[i] > 0)

            x_hat = x_hats[i]
            probs = np.log(np.take(x_hat, idxs))
            if len(probs[0]) == 0:
                n_samples_real -= 1
                if print_errors:
                    print("datapoint", i,
                          "has no length, perplexity is now based on",
                          n_samples_real, "samples.")
                continue

            perplexities.append(sum(probs[0]) / len(probs[0]))

        total_perplexity = np.exp(
            -sum(perplexities) /
            n_samples_real)  #np.exp(- sum(perplexities) / len(perplexities))
        print("LDA perplexity on test_set", total_perplexity)

    def experiments(self, save_location="topics.txt"):
        self.show_topics(self.lda, self.reader.idx2word, 10)
        feature_names = self.reader.idx2word
        n_top_words = 10
        self.LDA_recall(self.test, print_output=True)
        self.perplexity(self.test)
        self.save_topic_distribution(save_location, 10)
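# Hypothetical driver for the LDA wrapper above; `reader` is assumed to be the project's
# dataset reader exposing .train/.valid/.test (lists of dicts with a "doc_tm"
# bag-of-words vector) and .idx2word.
model = LDA(reader, dataset='sample', topics=50, max_iter=20)
model.experiments(save_location='topics.txt')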
df = pd.read_json(path)

df.head()

mask = ~df.loc[:, 'story'].isnull() & (df.loc[:, 'story'] != '')
df = df.loc[mask, :]

##############################################################################
# First model
##############################################################################
x = df.loc[:, 'story']
cv = CountVectorizer(max_df=0.9, min_df=2, stop_words=full_stopwords)
dtm = cv.fit_transform(x)

lda = LatentDirichletAllocation(n_components=10, random_state=42)
lda.fit(dtm)

topic_results = lda.transform(dtm)
df.loc[:, 'topic_id'] = topic_results.argmax(axis=1) + 1

args = [
    lda,
    cv,
    df,
]
create_df_topic_word_lists(*args, verbose=True)

##############################################################################
# Remove other languages
##############################################################################
df.loc[:, 'language'] = df.loc[:, 'story'].apply(lambda x: detect(str(x)))
def process(db_msg):

    logger, log_stream = slog.set_logging('topic_identification',
                                          loglevel=api.config.debug_mode)
    logger.info("Process started")
    time_monitor = tp.progress()

    columns = [c['name'] for c in db_msg.attributes['table']['columns']]
    df = pd.DataFrame(db_msg.body, columns=columns)

    # Language filter
    language_filter = tfp.read_list(api.config.language_filter)
    if language_filter:
        df = df.loc[df["LANGUAGE"].isin(language_filter)]
    else:
        language_filter = list(df['LANGUAGE'].unique())
    logger.info('Languages : {}'.format(language_filter))

    # Word type filter
    word_type_filter = tfp.read_value(api.config.word_type_filter)
    if word_type_filter:
        types = [c for c in word_type_filter]
        df = df.loc[df["TYPE"].isin(types)]
    logger.info('Word restricted to types : {}'.format(word_type_filter))

    # groupby and concatenate words
    gdf = df.groupby(by=['HASH_TEXT', 'LANGUAGE'])['WORD'].apply(
        lambda x: ' '.join(x)).reset_index()

    logger.info('Topic identification: ')
    for lang in language_filter:
        logger.info('Language: {}  #Documents: {}  #Words: {}'.format(lang,gdf.loc[gdf['LANGUAGE']==lang].shape[0],\
                                                                      df.loc[df['LANGUAGE'] == lang].shape[0]))

    api.send(outports[0]['name'], log_stream.getvalue())
    log_stream.seek(0)

    # create document-term matrix - no tokenization or text prep is needed
    tf_vectorizer = CountVectorizer(analyzer='word',
                                    min_df=1,
                                    lowercase=False,
                                    tokenizer=str.split)

    # tf means term-frequency in a document for each language
    date_today = str(date.today())

    # 2-d list with columns TOPIC, LANGUAGE, TYPE, DATE, EXPIRY_DATE, ATTRIBUTE and KEYWORD_1..KEYWORD_<topic_num_words>
    topic_list = list()
    for lang in language_filter:
        logger.info('Process all texts for language: {}'.format(lang))
        lang_gdf = gdf.loc[gdf['LANGUAGE'] == lang]
        dtm_tf = tf_vectorizer.fit_transform(lang_gdf['WORD'])
        # for tf dtm
        lda_tf = LatentDirichletAllocation(n_components=api.config.num_topics,
                                           learning_method='online',
                                           evaluate_every=-1,
                                           n_jobs=-1)
        lda_tf.fit(dtm_tf)
        feature_names = tf_vectorizer.get_feature_names()

        for i, topic in enumerate(lda_tf.components_):
            topic_words = [
                feature_names[f]
                for f in topic.argsort()[:-api.config.topic_num_words - 1:-1]
            ]
            logger.debug('Len: {}  topic_words:{}'.format(
                len(topic_words), topic_words))
            row = [
                date_today + "-" + str(i), lang, 'ALGO', date_today, None, None
            ] + topic_words
            topic_list.append(row)

    attributes = {
        "table": {
            "columns": [{
                "class": "string",
                "name": "TOPIC",
                "nullable": False,
                "size": 80,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "LANGUAGE",
                "nullable": False,
                "size": 2,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "TYPE",
                "nullable": False,
                "size": 10,
                "type": {
                    "hana": "NVARCHAR"
                }
            }, {
                "class": "string",
                "name": "DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "EXPIRY_DATE",
                "nullable": True,
                "type": {
                    "hana": "DATE"
                }
            }, {
                "class": "string",
                "name": "ATTRIBUTE",
                "nullable": True,
                "size": 25,
                "type": {
                    "hana": "NVACHAR"
                }
            }],
            "name":
            "DIPROJECTS.WORD_INDEX",
            "version":
            1
        }
    }
    for i in range(1, api.config.topic_num_words + 1):
        attributes['table']['columns'].append({
            "class": "string",
            "name": "KEYWORD_" + str(i),
            "nullable": True,
            "size": 80,
            "type": {
                "hana": "NVARCHAR"
            }
        })

    msg = api.Message(attributes=attributes, body=topic_list)
    logger.debug('Process ended, topics processed {}'.format(
        time_monitor.elapsed_time()))
    api.send(outports[0]['name'], log_stream.getvalue())
    api.send(outports[1]['name'], msg)