Example #1
    def applyLDA2(self, number_of_clusters, country_specific_tweets):
        train, feature_names = self.extractFeatures(country_specific_tweets,False)
        
        name = "lda"
        if self.results:
            print("Fitting LDA model with tfidf", end= " - ")
        t0 = time()     
        lda = LatentDirichletAllocation(n_topics=number_of_clusters, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)

        lda.fit(train)
        
        if self.results:
            print("done in %0.3fs." % (time() - t0))
        
        parameters = lda.get_params()
        topics = lda.components_
        doc_topic = lda.transform(train)
        top10, labels = self.printTopicCluster(topics, doc_topic, feature_names)
        labels = numpy.asarray(labels)
        
        if self.results:
            print("Silhouette Coefficient {0}: {1}".format(name, metrics.silhouette_score(train, labels)))
        
        return name, parameters, top10, labels
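Note: most of the snippets on this page pass the n_topics= argument to LatentDirichletAllocation. scikit-learn renamed that parameter to n_components in version 0.19 and removed n_topics in 0.21 (the comment in Example #40 below mentions the same DeprecationWarning). A minimal sketch of the same constructor call on a current scikit-learn follows; the toy count matrix X is a hypothetical stand-in for any document-term matrix.

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# hypothetical toy document-term count matrix: 20 documents, 10 terms
X = np.random.RandomState(0).randint(5, size=(20, 10))

# n_components replaces the deprecated n_topics argument (scikit-learn >= 0.19)
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
doc_topic = lda.fit_transform(X)  # each row is a per-document topic distribution summing to 1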
Example #2
def score_lda(src, dst):
	##read sentence pairs to two lists
	b1 = []
	b2 = []
	lines = 0
	with open(src) as p:
		for i, line in enumerate(p):
			s = line.split('\t')
			b1.append(s[0])
			b2.append(s[1][:-1]) #remove \n
			lines = i + 1

	vectorizer = CountVectorizer()
	vectors=vectorizer.fit_transform(b1 + b2)

	lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
	X = lda.fit_transform(vectors)
	print(X.shape)
	b1_v = vectorizer.transform(b1)
	b2_v = vectorizer.transform(b2)
	b1_vecs = lda.transform(b1_v)
	b2_vecs = lda.transform(b2_v)

	res = [round(5*(1 - spatial.distance.cosine(b1_vecs[i], b2_vecs[i])),2) for i in range(lines)]
	with open(dst, 'w') as thefile:
		thefile.write("\n".join(str(i) for i in res))
Example #3
def fit_lda(tf):
    '''takes in a tf sparse vector and finds the top topics'''
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
    lda.fit(tf)
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda_topic_dict = print_top_words(lda, tf_feature_names, n_top_words)
    return lda, lda_topic_dict
Example #4
def LDA(tf,word):
    lda = LatentDirichletAllocation(n_topics=30, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
    lda.fit(tf)
    print_top_words(lda,word,20)
Example #5
def basic_lda(df, n_topics=200, max_df=0.5, min_df=5):
    '''
    Basic LDA model for album recommendations

    Args:
        df: dataframe with Pitchfork reviews
        n_topics: number of lda topics
        max_df: max_df in CountVectorizer
        min_df: min_df in CountVectorizer
    Returns:
        cv: sklearn fitted CountVectorizer
        cv_trans: sparse matrix with count-vectorized data
        lda: sklearn fitted LatentDirichletAllocation
        lda_trans: dense array with lda transformed data

    '''

    X = df['review']
    cv = CountVectorizer(stop_words='english',
                         min_df=min_df,
                         max_df=max_df)
    cv_trans = cv.fit_transform(X)

    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=7)
    lda_trans = lda.fit_transform(cv_trans)

    return cv, cv_trans, lda, lda_trans
Example #6
    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # computes the n most important topics of the bodies. Each topic contains all words, ordered by importance.
        # The more important words of a certain topic a body contains, the higher its value for this topic.
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # use the lda trained for body topics on the headlines => if the headlines and bodies share topics
        # their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
Example #7
def plot_perplexity_iter(A_tfidf, num_topics):
    
    print "computing perplexity vs iter..."
    max_iter = 5
    perplexity = []
    em_iter = []
    for sweep in range(1,max_iter+1):
        lda = LatentDirichletAllocation(n_topics = num_topics, max_iter=sweep, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1)    
        tic = time()
        lda.fit(A_tfidf)  #online VB
        toc = time()
        print "sweep %d, elapsed time: %.4f sec" %(sweep, toc - tic)
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    #end    
    np.save('./data/perplexity_iter.npy', perplexity)
    
    f = plt.figure()
    plt.plot(em_iter, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_iter.png')
Example #8
def plot_perplexity_batch(A_tfidf, num_docs):
    
    print "computing perplexity vs batch size..."
    max_iter = 5
    num_topics = 10
    batch_size = np.logspace(6, 10, 5, base=2).astype(int)
    perplexity = np.zeros((len(batch_size),max_iter))
    em_iter = np.zeros((len(batch_size),max_iter))
    for ii, mini_batch in enumerate(batch_size):
        for jj, sweep in enumerate(range(1,max_iter+1)):
            lda = LatentDirichletAllocation(n_topics = num_topics, max_iter=sweep, learning_method='online', batch_size = mini_batch, random_state=0, n_jobs=-1)
            tic = time()
            lda.fit(A_tfidf)  #online VB
            toc = time()
            print "sweep %d, elapsed time: %.4f sec" %(sweep, toc - tic)
            perplexity[ii,jj] = lda.perplexity(A_tfidf)
            em_iter[ii,jj] = lda.n_batch_iter_
        #end
    #end
    np.save('./data/perplexity.npy', perplexity)
    np.save('./data/em_iter.npy', em_iter)    
    
    f = plt.figure()
    for mb in range(len(batch_size)):
        plt.plot(em_iter[mb,:], perplexity[mb,:], color=np.random.rand(3,), marker='o', lw=2.0, label='mini_batch: '+str(batch_size[mb]))
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('EM iter')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_batch.png')
Example #9
def plot_perplexity_topics(A_tfidf):
    
    print "computing perplexity vs K..."
    max_iter = 5    #based on plot_perplexity_iter()
    #num_topics = np.linspace(2,20,5).astype(np.int)
    num_topics = np.logspace(1,2,5).astype(np.int)
    perplexity = []
    em_iter = []
    for k in num_topics:
        lda = LatentDirichletAllocation(n_topics = k, max_iter=max_iter, learning_method='online', batch_size = 512, random_state=0, n_jobs=-1)
        tic = time()
        lda.fit(A_tfidf)  #online VB
        toc = time()
        print "K= %d, elapsed time: %.4f sec" %(k, toc - tic)
        perplexity.append(lda.perplexity(A_tfidf))
        em_iter.append(lda.n_batch_iter_)
    #end
    
    np.save('./data/perplexity_topics.npy', perplexity)
    np.save('./data/perplexity_topics2.npy', num_topics)    
    
    f = plt.figure()
    plt.plot(num_topics, perplexity, color='b', marker='o', lw=2.0, label='perplexity')
    plt.title('Perplexity (LDA, online VB)')
    plt.xlabel('Number of Topics, K')
    plt.ylabel('Perplexity')
    plt.grid(True)
    plt.legend()
    plt.show()
    f.savefig('./figures/perplexity_topics.png')
Example #10
def lda_tuner(ingroup_otu, best_models):

    best_score = -1*np.inf
    dtp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    twp_series = [0.0001, 0.001, 0.01, 0.1, 0.2]
    topic_series = [3]
    X = ingroup_otu.values
    eval_counter = 0

    for topics in topic_series: 
        for dtp in dtp_series:
            for twp in twp_series:
                eval_counter +=1
                X_train, X_test = train_test_split(X, test_size=0.5)
                lda = LatentDirichletAllocation(n_topics=topics, 
                                                doc_topic_prior=dtp, 
                                                topic_word_prior=twp, 
                                                learning_method='batch',
                                                random_state=42,
                                                max_iter=20)
                lda.fit(X_train)
                this_score = lda.score(X_test)
                this_perplexity = lda.perplexity(X_test)
                if this_score > best_score:
                    best_score = this_score
                    print "New Max Likelihood: {}".format(best_score)

                print "#{}: n:{}, dtp:{}, twp:{}, score:{}, perp:{}".format(eval_counter, 
                                                                 topics, dtp, twp,
                                                                 this_score, this_perplexity)

                best_models.append({'n': topics, 'dtp': dtp, 'twp': twp,
                                    'score': this_score, 'perp': this_perplexity})
                if (dtp == dtp_series[-1]) and (twp == twp_series[-1]):
                    eval_counter +=1
                    X_train, X_test = train_test_split(X, test_size=0.5)
                    lda = LatentDirichletAllocation(n_topics=topics, 
                                                    doc_topic_prior=1./topics, 
                                                    topic_word_prior=1./topics, 
                                                    learning_method='batch',
                                                    random_state=42,
                                                    max_iter=20)
                    lda.fit(X_train)
                    this_score = lda.score(X_test)
                    this_perplexity = lda.perplexity(X_test)
                    if this_score > best_score:
                        best_score = this_score
                        print "New Max Likelihood: {}".format(best_score)

                    print "#{}: n:{}, dtp:{}, twp:{}, score:{} perp: {}".format(eval_counter, 
                                                                                topics, 
                                                                                (1./topics), 
                                                                                (1./topics),
                                                                                this_score,
                                                                                this_perplexity)

                    best_models.append({'n': topics, 'dtp': (1./topics), 
                                        'twp': (1./topics), 'score': this_score,
                                        'perp': this_perplexity})
    return best_models
Example #11
def extractTopicLDA(func_message_dic, store_cloumn):
    if len(func_message_dic) == 0:
        print "func_message_dic is null"
        return False
    try:
        conn=MySQLdb.connect(host='192.168.162.122',user='******',passwd='123456',port=3306)
        cur=conn.cursor()
        cur.execute('set names utf8mb4')
        conn.select_db('codeAnalysis')
        for function in func_message_dic:
            message = func_message_dic[function]
            np_extractor = nlp.semantics_extraction.NPExtractor(message)
            text = np_extractor.extract()
            if len(text) == 0:
                continue
            tf_vectorizer = CountVectorizer(max_df=1.0, min_df=1, max_features=n_features, stop_words='english')
            tf = tf_vectorizer.fit_transform(text)
            print("Fitting LDA models with tf features, n_samples=%d and n_features=%d..." % (n_samples, n_features))
            lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5, learning_method='online', learning_offset=50.,
                                                                    random_state=0)
            lda.fit(tf)
            tf_feature_names = tf_vectorizer.get_feature_names()
            seprator = " "
            for topic_idx, topic in enumerate(lda.components_):
                keywords = seprator.join([tf_feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]])
            sql = "update func_semantic set "+store_cloumn+" = '"+keywords+"' where func_name = '"+function+"'"
            print(sql)
            cur.execute(sql)
            conn.commit()
        cur.close()
        conn.close()
        return True
    except MySQLdb.Error as e:
        print(e)
        raise
Example #12
 def _get_model_LDA(self, corpus):
     #lda = models.LdaModel(corpus, id2word=self.corpus.dictionary, num_topics=5, alpha='auto', eval_every=50)
     lda = LatentDirichletAllocation(n_topics=self.num_of_clusters, max_iter=20,
                                     learning_method='online',
                                     learning_offset=50.,
                                     random_state=1)
     return lda.fit_transform(corpus)
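The commented-out line above refers to gensim's LdaModel. For comparison, here is a minimal, hypothetical sketch of the gensim equivalent; the texts list is a made-up tokenized corpus, not data from this example.

from gensim import corpora, models

# hypothetical tokenized corpus
texts = [["topic", "model", "example"], ["gensim", "lda", "model", "example"]]
dictionary = corpora.Dictionary(texts)
bow_corpus = [dictionary.doc2bow(doc) for doc in texts]

# gensim counterpart of the sklearn call above; alpha='auto' learns the document-topic prior
lda = models.LdaModel(bow_corpus, id2word=dictionary, num_topics=5, alpha='auto', eval_every=50)
doc_topics = [lda.get_document_topics(bow) for bow in bow_corpus]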
Example #13
def produceLDATopics():
    '''
    Takes description of each game and uses sklearn's latent dirichlet allocation and count vectorizer
    to extract topics.
    :return: pandas data frame with topic weights for each game (rows) and topic (columns)
    '''
    data_samples, gameNames = create_game_profile_df(game_path)
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
    tf = tf_vectorizer.fit_transform(data_samples)
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online', learning_offset=50.,
                                    random_state=0)
    topics = lda.fit_transform(tf)
    # for i in range(50):
    #     gameTopics = []
    #     for j in range(len(topics[0])):
    #         if topics[i,j] > 1.0/float(n_topics):
    #             gameTopics.append(j)
    #     print gameNames[i], gameTopics
    topicsByGame = pandas.DataFrame(topics)
    topicsByGame.index = gameNames
    print(topicsByGame)

    tf_feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(lda.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([tf_feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))

    return topicsByGame
Example #14
def topicmodel( comments ):

    _texts = []
    texts = []

    for c in comments:

        c = c['text']
        _texts.append( c )
        texts.append( c )



    tf_vectorizer = CountVectorizer(
                max_df=.20,
                min_df=10,
                stop_words = stopwords )
    texts = tf_vectorizer.fit_transform( texts )

    ## test between 2 and 9 topics
    topics = {}

    for k in range(2, 10):

        print "Testing", k

        model = LatentDirichletAllocation(
                    n_topics= k ,
                    max_iter=5,
                    learning_method='batch',
                    learning_offset=50.,
                    random_state=0
                )
        model.fit( texts )
        ll = model.score( texts )
        topics[ ll ] = model

    topic = max( topics.keys() )

    ret = collections.defaultdict( list )

    ## ugly, rewrite some day
    model = topics[ topic ]

    ## for debugging, print the chosen model's topic words
    feature_names = tf_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print "Topic #%d:" % topic_idx
        print " ".join( [feature_names[i].encode('utf8') for i in topic.argsort()[:-5 - 1:-1]])
        print

    for i, topic in enumerate( model.transform( texts ) ):

        topic = numpy.argmax( topic )
        text = _texts[ i ].encode('utf8')

        ret[ topic ].append( text )

    return ret
Example #15
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=10)
    tf_vectorizer = TfidfVectorizer(max_df=0.99, min_df=1,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    return top_topics(lda, tfidf_feature_names, 10)
Example #16
class LDATopics:
	# Constructor
	def __init__(self, filename):
		# Member variables
		self.email_data = []
		self.lda = None
		self.feature_names = None
		self.num_topics = NUM_TOPICS
		self.num_words_per_topic = NUM_WORDS_PER_TOPIC
		self.num_features = NUM_FEATURES

		# Load emails from full path to file
		emails = EmailLoader(filename).get_email_dict_array()

		# Process emails into a list of email body contents
		for email_rec in emails:
			if email_rec['body']:
				# Clean the text and add to list
				cleaner = TextCleaner(email_rec['body'])

				self.email_data.append(" ".join(cleaner.tokenize_str()))

	## Public methods ##
	def process(self, topics=None, features=None):
		# Check if default numbers should be used
		if topics is None:
			topics = self.num_topics
			
		if features is None:
			features = self.num_features

		# Calculate term frequency for LDA
		tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=features, stop_words='english')
		tf = tf_vectorizer.fit_transform(self.email_data)

		# Fit the LDA model to data samples
		self.lda = LatentDirichletAllocation(n_topics=topics, max_iter=5, learning_method='online', learning_offset=50., random_state=0)

		self.lda.fit(tf)

		# Set the feature name (words)
		self.feature_names = tf_vectorizer.get_feature_names()

	def print_topics(self, words_per_topic=None):
		# Check if default number of words per topics should be used
		if words_per_topic is None:
			words_per_topic = self.num_words_per_topic

		self._print_topics(self.lda, self.feature_names, words_per_topic)

	## Private methods ##
	def _print_topics(self, model, feature_names, words_per_topic):
	    for topic_idx, topic in enumerate(model.components_):
	        print("Topic #%d:" % topic_idx)
	        print(" ".join([feature_names[i]
	                        for i in topic.argsort()[:-words_per_topic - 1:-1]]))

	    print()
Example #17
 def perform_analysis(self, stocks, szTimeAxis, n_ahead):
     # load Snowball comment data
     from agares.datasource.snowball_cmt_loader import SnowballCmtLoader
     SBLoader = SnowballCmtLoader()
     date = self.dt_start.date()
     df_cmt_list = []
     while date <= self.dt_end.date():
         df_cmt_list.append(SBLoader.load(str(date)))
         date += timedelta(days=1)
     df_cmt = pd.concat(df_cmt_list, ignore_index=True)
     # Chinese text segmentation
     self.set_jieba()
     df_cmt['RawComment'] = df_cmt['RawComment'].map(jieba.cut)
     # drop stopwords
     self.stopwords = [line.strip() for line in open('stopwords').readlines()]
     self.stopwords.append(' ')
     df_cmt['RawComment'] = df_cmt['RawComment'].map(self.drop_useless_word)
     cmt = df_cmt['RawComment'].tolist()
     # construct tfidf matrix
     tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_df=0.95, min_df=0.05)
     tfidf = tfidf_vectorizer.fit_transform(cmt)
     
     # Fit the NMF model
     n_topics = 5
     n_top_words = 20
     print("Fitting the NMF model with tf-idf features..")
     t0 = time()
     nmf = NMF(n_components=n_topics, random_state=1, alpha=.1, l1_ratio=.5).fit(tfidf)
     print("done in %0.3fs." % (time() - t0))
     print("\nTopics in NMF model:")
     tfidf_feature_names = tfidf_vectorizer.get_feature_names()
     self.print_top_words(nmf, tfidf_feature_names, n_top_words)
     
     # Fit the LDA model
     print("Fitting LDA models with tf-idf features..")
     lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                     learning_method='online', learning_offset=50.,
                                     random_state=0)
     t0 = time()
     lda.fit(tfidf)
     print("done in %0.3fs." % (time() - t0))
     print("\nTopics in LDA model:")
     self.print_top_words(lda, tfidf_feature_names, n_top_words)
     
     # load sz daily candlestick data
     sz = next(iter(stocks))
     cst_Day = stocks[sz].cst['1Day'] 
     # print close price within the timescope
     date = self.dt_start
     print()
     print("The ShangHai stock Index (close index) within the timescope")
     while date <= self.dt_end:
         ts = pd.to_datetime(date)
         try:
             print("Date: {0:s}, Index: {1:.2f}".format(str(date.date()), cst_Day.at[ts, 'close']))
         except KeyError: # sz candlestick data does not exist at this datetime
             print("Date: {0:s}, Index: (market closed)".format(str(date.date())))
         date += timedelta(days=1)
Example #18
def LDA(matrix,preserve,n_topics=100):

    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                        learning_method='online', learning_offset=50.,
                                        random_state=randint(1,100))
    lda.fit(matrix[preserve])
    topic_model=lda.transform(matrix)

    return topic_model
Example #19
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
Example #20
def test_lda_transform():
    # Test LDA transform.
    # Transform result cannot be negative and should be normalized
    rng = np.random.RandomState(0)
    X = rng.randint(5, size=(20, 10))
    n_topics = 3
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    X_trans = lda.fit_transform(X)
    assert_true((X_trans > 0.0).any())
    assert_array_almost_equal(np.sum(X_trans, axis=1), np.ones(X_trans.shape[0]))
Example #21
def test_lda_fit_transform(method):
    # Test LDA fit_transform & transform
    # fit_transform and transform result should be the same
    rng = np.random.RandomState(0)
    X = rng.randint(10, size=(50, 20))
    lda = LatentDirichletAllocation(n_components=5, learning_method=method,
                                    random_state=rng)
    X_fit = lda.fit_transform(X)
    X_trans = lda.transform(X)
    assert_array_almost_equal(X_fit, X_trans, 4)
Example #22
def test_lda_transform_mismatch():
    # test `n_features` mismatch in partial_fit and transform
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    X_2 = rng.randint(4, size=(10, 8))

    n_topics = rng.randint(3, 6)
    lda = LatentDirichletAllocation(n_topics=n_topics, random_state=rng)
    lda.partial_fit(X)
    assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
Example #23
def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
Example #24
def test_lda_partial_fit_dim_mismatch():
    # test `n_features` mismatch in `partial_fit`
    rng = np.random.RandomState(0)
    n_topics = rng.randint(3, 6)
    n_col = rng.randint(6, 10)
    X_1 = np.random.randint(4, size=(10, n_col))
    X_2 = np.random.randint(4, size=(10, n_col + 1))
    lda = LatentDirichletAllocation(n_topics=n_topics, learning_offset=5.,
                                    total_samples=20, random_state=rng)
    lda.partial_fit(X_1)
    assert_raises_regexp(ValueError, r"^The provided data has", lda.partial_fit, X_2)
Example #25
def test_perplexity_input_format():
    # Test LDA perplexity for sparse and dense input
    # score should be the same for both dense and sparse input
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    lda.fit(X)
    perp_1 = lda.perplexity(X)
    perp_2 = lda.perplexity(X.toarray())
    assert_almost_equal(perp_1, perp_2)
Example #26
def get_lda():
    lda = LatentDirichletAllocation(
        n_topics=K,
        max_iter=5,
        learning_method='online',
        learning_offset=50.,
        random_state=0)
    lda.fit(X)
    tf_feature_names = VECTORIZER.get_feature_names()
    print_top_words(lda, tf_feature_names, 10)
    return lda
Example #27
    def calculate_lda(self, tfidf):
        print("Fitting LDA models with tf features...")
        lda = LatentDirichletAllocation(n_topics=self.num_topics, max_iter=5,
                                        learning_method='online', learning_offset=50.,
                                        random_state=0)
        t0 = time()
        lda.fit(tfidf)

        print("Topics in LDA model:")
        print_top_words(lda, self.tfidf_feature_names, self.num_words)
        print("done in %0.3fs." % (time() - t0))
Example #28
def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
Example #29
def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_topics, X = _build_sparse_mtx()
    prior = 1. / n_topics
    lda_1 = LatentDirichletAllocation(n_topics=n_topics, doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_topics=n_topics, random_state=0)

    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
Example #30
def latdirall(content):
    lda = LatentDirichletAllocation(n_topics=5)
    tf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
    tf = tf_vectorizer.fit_transform(content)
    lolz = lda.fit_transform(tf)
    tfidf_feature_names = tf_vectorizer.get_feature_names()
    tops = top_topics(lda, tfidf_feature_names, 10)
    wordlist = []
    for topic in tops:
        wordlist += topic
    return wordlist
Example #31
transformer = TfidfTransformer()
cntTf = vectorizer.fit_transform(apps)
tfidf = transformer.fit_transform(cntTf)
word = vectorizer.get_feature_names()
weight = tfidf.toarray()
df_weight = pd.DataFrame(weight)
feature = df_weight.columns
df_weight['sum'] = 0
for f in tqdm(feature):
    df_weight['sum'] += df_weight[f]
deviceid_packages['tfidf_sum'] = df_weight['sum']

# In[10]:

lda = LatentDirichletAllocation(n_topics=5,
                                learning_offset=50.,
                                random_state=666)
docres = lda.fit_transform(cntTf)

# In[11]:

deviceid_packages = pd.concat(
    [deviceid_packages, pd.DataFrame(docres)], axis=1)

# In[12]:

temp = deviceid_packages.drop('apps', axis=1)
deviceid_train = pd.merge(deviceid_train, temp, on='device_id', how='left')

# In[13]:
Example #32
############################################# Dimensionality reduction ######################################################
# user_action_ = user_action[user_action['time']<'2017-04-01']
user_action_ = user_action[user_action['a_date']<'2017-04-01']

mapping = {}
for sample in user_action_[['user_id', 'sku_id']].values:
    mapping.setdefault(sample[0], []).append(str(sample[1]))
cate1s = list(mapping.keys())
print(len(cate1s))
cate2_as_sentence = [' '.join(mapping[cate_]) for cate_ in cate1s]
cate2_as_matrix = CountVectorizer(token_pattern='(?u)\\b\\w+\\b', min_df=2).fit_transform(cate2_as_sentence)

lda = LDA(n_components=5,
          learning_method='online',
          batch_size=1000,
          n_jobs=40,
          random_state=520)
topics_of_cate1 = lda.fit_transform(cate2_as_matrix)
topics_of_cate1 = pd.DataFrame(topics_of_cate1,
                               columns=["%s_%s_lda_action" % ('user_sku', i) for i in range(5)]).astype('float32')
topics_of_cate1['user_id'] = cate1s
topics_of_cate1.to_hdf(cache_path + 'p1.hdf', 'w')

nmf = NMF(n_components=5,
          random_state=520,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
          alpha=.1,
          l1_ratio=.5)
Example #33
def main(file, userselection, n_pois, output, desc):

    df_reviews = pd.read_csv(file).drop_duplicates(subset=['Tripadvisor'],
                                                   keep='first')
    #Lower all words
    df_reviews['description'] = df_reviews['description'].str.lower()
    #Numeric to strings
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub(r'\d+', '', x))
    #remove html tags
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: (html.unescape(x)))
    #remove punctuation
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: x.translate(str.maketrans('', '', string.punctuation)))
    #remove accent
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: unidecode.unidecode(x))
    #remove specific characters and words
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("description", '', x))
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("wikipedia", '', x))
    df_reviews['description'] = df_reviews['description'].apply(
        lambda x: re.sub("'s", '', x))
    #stop words
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    df_reviews['tokens'] = df_reviews['description'].apply(lambda x: [
        lemmatizer.lemmatize(word) for word in word_tokenize(x)
        if not word in stop_words and detect_language(word) == 'English'
    ])

    #get corpus
    corpus = get_corpus(df_reviews)

    seg_list = split_to_words(corpus)
    vectorizer_model = CountVectorizer(stop_words=stop_words,
                                       analyzer='word',
                                       max_features=2000)
    vec_docs = vectorizer_model.fit_transform(seg_list)
    tf_feature_names = vectorizer_model.get_feature_names()

    no_topics = 10
    no_top_words = 5

    lda = LatentDirichletAllocation(n_components=no_topics,
                                    max_iter=5,
                                    learning_method='online',
                                    learning_offset=40.,
                                    random_state=0).fit(vec_docs)
    display_topics(lda, tf_feature_names, no_top_words)
    lda_results = lda.fit_transform(vec_docs)

    df_reviews = get_topics(lda_results, df_reviews)

    topic_dict = display_topics(lda, tf_feature_names, no_top_words)

    h = pd.DataFrame.from_dict(topic_dict, orient='index').transpose().melt()

    df_reviews = df_reviews.merge(h,
                                  left_on='topics',
                                  right_on='variable',
                                  how='left')
    df_reviews = df_reviews.drop(columns=['topics', 'variable', 'tokens'])
    df_reviews = pd.get_dummies(df_reviews,
                                prefix=['keyword'],
                                columns=['value']).drop_duplicates()
    cols = [col for col in df_reviews.columns if 'keyword' not in col]

    df_reviews = df_reviews.groupby(cols).sum().reset_index()

    df_reviews = review_rate(df_reviews)

    selection = user_selection(userselection)

    results = best_results(dict_user(df_reviews, selection),
                           df_reviews,
                           n_pois,
                           relevant=userselection)

    if output != "default":
        results.to_csv("{output}_{sufix}.csv".format(output=output,
                                                     sufix=selection),
                       index=False)
    desc_ = pd.read_csv(desc, sep="|")
    results = results.merge(desc_,
                            left_on='Tripadvisor',
                            right_on='name',
                            how='inner')
    return results
Example #34

# #TODO I wanted to show which combinations of tokens the vectorizers used as features.

# ## 6. Topic Modelling
# One way to organize those feature vectors is to search for unsupervised patterns in the data to form topics and then use those topics to classify.

# ### 6.1. Generating topics

# In[42]:


num_topics = 30
number_words = 10
''' Create and fit the LDA model using the count_vectors generated before '''
lda = LDA(n_components=num_topics, max_iter = 20, n_jobs=-1)
topics_vectors = lda.fit_transform(count_vectors)
''' Printing the topics found by the LDA model '''
print("Topics found via LDA:")

words = count_vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    print("\nTopic #%d:" % topic_idx)
    print(" ".join([words[i]
                    for i in topic.argsort()[:-number_words - 1:-1]]))


# Let's see how the topics found are related to each other

# In[49]:
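The notebook cell above is not shown on this page; per the preceding note it examines how the topics relate to each other. A minimal sketch of one common approach, assuming the lda model fitted earlier: compare the rows of lda.components_ (the topic-word weights) with cosine similarity.

from sklearn.metrics.pairwise import cosine_similarity

# normalize each topic's word weights into a distribution, then compare topics pairwise
topic_word = lda.components_ / lda.components_.sum(axis=1)[:, None]
topic_similarity = cosine_similarity(topic_word)  # shape: (num_topics, num_topics)
print(topic_similarity.round(2))  # values close to 1 indicate heavily overlapping topics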
Example #35
    def summarize(self, text, num=100, topic_min=3, judge_topic=None):
        """
        :param text: str
        :param num: int
        :return: list
        """
        # split the text into sentences
        if type(text) == str:
            self.sentences = cut_sentence(text)
        elif type(text) == list:
            self.sentences = text
        else:
            raise RuntimeError("text type must be list or str")
        len_sentences_cut = len(self.sentences)
        # tokenize each sentence
        sentences_cut = [[
            word for word in list(jieba.cut(sentence)) if word.strip()
        ] for sentence in self.sentences]
        # sentences_cut = [[word for word in jieba_cut(extract_chinese(sentence)) if word.strip()] for sentence in self.sentences]
        # remove stopwords, etc.
        self.sentences_cut = [
            list(filter(lambda x: x not in self.stop_words, sc))
            for sc in sentences_cut
        ]
        self.sentences_cut = [" ".join(sc) for sc in self.sentences_cut]
        print(sentences_cut)
        # compute term frequencies for each sentence
        vector_c = CountVectorizer(ngram_range=(1, 2),
                                   stop_words=self.stop_words)
        tf_ngram = vector_c.fit_transform(self.sentences_cut)
        # number of topics, judged from experience
        topic_num = min(topic_min, int(len(sentences_cut) / 2))  # minimum number of topics is set to 3
        print('topic_num', topic_num)
        lda = LatentDirichletAllocation(n_components=topic_num,
                                        max_iter=32,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=2019)
        res_lda_u = lda.fit_transform(tf_ngram.T)
        res_lda_v = lda.components_
        print('res_lda_v', res_lda_v)  # probability assigned to each topic in each document

        if judge_topic:
            ### Option 1: take the top-k sentences of the dominant topic
            ##################################################################################
            topic_t_score = np.sum(res_lda_v, axis=-1)
            print('topic_t_score', topic_t_score)
            # for each column (one sentence with topic_num topics), sort the scores; 0 is the largest
            res_nmf_h_soft = res_lda_v.argsort(axis=0)[-topic_num:][::-1]
            # count how many sentences have each topic as their maximum
            exist = (res_nmf_h_soft <= 0) * 1.0
            factor = np.ones(res_nmf_h_soft.shape[1])
            topic_t_count = np.dot(exist, factor)
            # normalize
            topic_t_count /= np.sum(topic_t_count, axis=-1)
            topic_t_score /= np.sum(topic_t_score, axis=-1)
            # pick the topic with the largest combined share of sentence counts and total topic score
            topic_t_tc = topic_t_count + topic_t_score
            topic_t_tc_argmax = np.argmax(topic_t_tc)
            # the final scores come from that dominant topic
            res_nmf_h_soft_argmax = res_lda_v[topic_t_tc_argmax].tolist()
            res_combine = {}
            for l in range(len_sentences_cut):
                res_combine[self.sentences[l]] = res_nmf_h_soft_argmax[l]
            score_sen = [(rc[1], rc[0]) for rc in sorted(
                res_combine.items(), key=lambda d: d[1], reverse=True)]
            #####################################################################################
        else:
            ### Option 2: take sentences by their maximum topic probability, regardless of topic
            res_combine = {}
            for i in range(len_sentences_cut):
                res_row_i = res_lda_v[:, i]
                res_row_i_argmax = np.argmax(res_row_i)
                res_combine[self.sentences[i]] = res_row_i[res_row_i_argmax]
            score_sen = [(rc[1], rc[0]) for rc in sorted(
                res_combine.items(), key=lambda d: d[1], reverse=True)]
        num_min = min(num, len(self.sentences))
        return score_sen[0:num_min]
Example #36

# In[35]:

feedback = []
for y, santens in enumerate(train_headlines_sentens):
    # Now, we obtain a Counts design matrix, for which we use SKLearn’s CountVectorizer module. The transformation will return a matrix of size (Documents x Features), where the value of a cell is going to be the number of times the feature (word) appears in that document.
    # To reduce the size of the matrix, to speed up computation, we will set the maximum feature size to 5000, which will take the top 5000 best features that can contribute to our model.
    vectorizer = CountVectorizer(analyzer='word',
                                 ngram_range=(2, 3),
                                 max_features=5000)
    x_coints = vectorizer.fit_transform(santens)
    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x_coints)
    xtfidf_norm = normalize(x_tfidf, norm='l1', axis=1)
    model = LatentDirichletAllocation(n_components=num_topics,
                                      learning_method='online')
    model.fit(xtfidf_norm)
    toPik = get_lda_topics(model, 7)
    print('tanggal', data_tgl[y])
    print('\n', 'konsep matrix \n', toPik)
    feedback.append(toPik)

# In[36]:

model.fit(x_coints)

# In[37]:

x_coints.shape

# In[38]:
Example #37
def _lda3(table,
          input_col,
          topic_name='topic',
          num_voca=1000,
          num_topic=3,
          num_topic_word=3,
          max_iter=20,
          learning_method='online',
          learning_offset=10.,
          random_state=None):
    corpus = np.array(table[input_col])
    if isinstance(corpus[0], np.ndarray):
        tf_vectorizer = CountVectorizer(preprocessor=' '.join,
                                        stop_words='english',
                                        max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca)
    else:
        tf_vectorizer = CountVectorizer(max_df=0.95,
                                        min_df=2,
                                        max_features=num_voca,
                                        stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")
    voca_weights_list = []
    for weights in lda_model.components_:
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)

    doc_topic = lda_model.transform(term_count)
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors([{
            '0100':
            "Existing table contains Topic Column Name. Please choose again."
        }])
    out_table[topic_name] = [doc_topic[i].argmax() for i in range(len(corpus))]
    weight_list = []
    for ind in out_table[topic_name]:
        weight_list.append(voca_weights_list[ind])
    out_table['topic_vocabularies'] = weight_list
    return {'out_table': out_table}
Example #38
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

with open("../data/preprocessed_data.pk", 'rb') as fp:
    result = pickle.load(fp)
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=1000,
                                stop_words='english')
result = list(set(result))
tf = tf_vectorizer.fit_transform(result)
tf_feature_names = tf_vectorizer.get_feature_names()

# tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, tokenizer = word_tokenize, stop_words='english')
# result = list(set(result))
# tfidf = tfidf_vectorizer.fit_transform(result)
# tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# Run LDA
no_topics = 3
# nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)
lda = LatentDirichletAllocation(n_components=no_topics,
                                max_iter=10,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf)

with open("../data/lda_data.pk", 'wb') as fp:
    pickle.dump([tf_vectorizer, tf, lda], fp)
print('LDA models secured')
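For later reuse, the pickled artifacts written above can be loaded back in; a minimal sketch, assuming the same ../data/lda_data.pk file and vocabulary as above.

import pickle

with open("../data/lda_data.pk", 'rb') as fp:
    tf_vectorizer, tf, lda = pickle.load(fp)

# score a new document against the previously learned topics
new_doc_topic = lda.transform(tf_vectorizer.transform(["some new review text"]))
print(new_doc_topic)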
Example #39
bow=bow_vectorizer.fit_transform(clean_text)
word_counts=bow.toarray()
tfidf_transformer=TfidfTransformer()
tfidf=tfidf_transformer.fit_transform(word_counts)

#dimension reduction
from sklearn.decomposition import TruncatedSVD
TSVD = TruncatedSVD(n_components=200, algorithm = "randomized", n_iter = 5)
TSVD_fit=TSVD.fit(tfidf)
TSVD_reduced=TSVD.fit_transform(tfidf)

# Latent Dirchlet Allocation
from sklearn.decomposition import LatentDirichletAllocation
lda_ = LatentDirichletAllocation(n_components=50, max_iter=500,
        learning_method='online',
        learning_offset=50.,
        total_samples = len(clean_text),
        random_state=0)
lda_tx=lda_.fit_transform(word_counts) #fit transform 

#save models - especially important for LDA taking so long to run
import pickle
from sklearn.externals import joblib
#joblib.dump(lda_, 'filename.pkl')
#joblib.dump(lda_tx, 'lda_tx.pkl')
# pickle.dump(lda_,open('lda_output.txt','wb'))

m_list1=[' '.join(el) for el in mesh_list] #comes in as list of lists

#MeSH term TF IDF
Mesh_bow_vectorizer=CountVectorizer()
Example #40
with open(my_stop_words_path, errors='ignore') as fr:
    for line in fr.readlines():
        stop_words_dict.append(line.strip())
print('number of stopwords = {}'.format(len(stop_words_dict)))

# 鬼吹灯 text mining, part 4: extracting document topics with LDA (sklearn LatentDirichletAllocation and gensim LdaModel)
# Note: see part 3 of the series for how tfidf_mat was prepared
import pickle
tfidf_mat = pickle.load(open('tfidf_mat.txt', 'rb'))

# 1. Implement the LDA model with sklearn and extract document topics
#      (1) n_topics is the number of topics, max_iter the number of iterations
#      (2) each row of lda_model.components_ represents one topic; each element in a row is the score of the corresponding word for that topic
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 8  # user-defined number of topics  # DeprecationWarning: n_topics has been renamed to n_components in version 0.19 and will be removed in 0.21
lda_model = LatentDirichletAllocation(n_topics=n_topics, max_iter=10)
# fit the LDA model on the TF-IDF matrix
lda_model.fit(tfidf_mat)

# what the fitted model actually contains
print(lda_model.components_.shape)
print(lda_model.components_[:2])


# (8, 1654)
# Out[105]:
# array([[0.30237038, 0.29720752, 0.31504618, ..., 0.33985295, 0.2906448 ,
#         0.3043558 ],
#        [0.29870912, 0.30435234, 0.31793515, ..., 0.3215601 , 0.32073196,
#         0.31859002]])
# (3) argsort() returns the indices of the elements, placing the index of the smallest element first, i.e. sorted by ascending value.
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        print(" ".join(
            [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))


tokens = df["query"].apply(nlp)
tokens = map(lambda text: map(lambda x: x.lemma_, text), tokens)
query_list = [
    " ".join(
        map(lambda x: str(x) if not nlp.vocab[str(x)].is_stop else "", text))
    for text in tokens
]

lda = LDA(n_components=5)
count_vectorizer = CountVectorizer()
count_data = count_vectorizer.fit_transform(query_list)
output = lda.fit(count_data)

print_topics(lda, count_vectorizer, 1)

all_t_lemma_stop = [
    " ".join(
        map(lambda x: str(x) if not nlp.vocab[str(x)].is_stop else "", text))
    for text in tokens
]
all_t = ' '.join(map(str, all_t_lemma_stop))

filtered_words = [word for word in str(all_t).split()]
counted_words = collections.Counter(filtered_words)
Example #42
                                min_df = 0.002) #remove words that appear too frequently or too rarely within the documents

tf = tf_vectorizer.fit_transform(corpus)

print(tf.shape)
print(tf)

#-------------------------  Step 3: LDA analysis  ------------------------
from sklearn.decomposition import LatentDirichletAllocation

# set the number of topics
n_topics = 2

lda = LatentDirichletAllocation(n_components=n_topics,
                                max_iter=100,
                                learning_method='online',
                                learning_offset=50,
                                random_state=0)
lda.fit(tf)

# display the topics (model.topic_word_)
print(lda.components_)
# one row per topic, one column per keyword
print(lda.components_.shape)                         

# compute perplexity
print(u'Perplexity:')
print(lda.perplexity(tf,sub_sampling = False))

# topic-keyword distribution
def print_top_words(model, tf_feature_names, n_top_words):
Example #43
# count tf-idf
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
tfidf = tfidf_vectorizer.fit_transform(tweets)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

print('shape *******')
print(tf.shape)
dist = 1 - cosine_similarity(tfidf)
print()

no_topics = 2

# Start Clustering #
lda = LatentDirichletAllocation(n_topics=no_topics,
                                max_iter=100,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(tf)
nmf = NMF(n_components=no_topics,
          random_state=1,
          alpha=.1,
          l1_ratio=.5,
          init='nndsvd').fit(tfidf)


# print top tf-idf words #
def display_topics(H, W, feature_names, documents, no_top_words,
                   no_top_documents):
    for topic_idx, topic in enumerate(H):
        print "Cluster %d: " % (topic_idx)
        print "".join([
Example #44
# #### Count Vectorizer

from sklearn.feature_extraction.text import CountVectorizer

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words='english')
doc_term_matrix = count_vect.fit_transform(
    reviews_datasets['Text'].values.astype('U'))

doc_term_matrix

# #### Use LDA

from sklearn.decomposition import LatentDirichletAllocation

LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)
"""
Randomly fetches 10 words from our vocabulary
"""

import random

for i in range(10):
    random_id = random.randint(0, len(count_vect.get_feature_names()) - 1)
    print(count_vect.get_feature_names()[random_id])

first_topic = LDA.components_[0]
top_topic_words = first_topic.argsort()[-10:]

for i in top_topic_words:
    print(count_vect.get_feature_names()[i])
Example #45
print(df.head(3))
print(df)

df = df.head(260000)

from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english', max_df=.1, max_features=5000)
X = count.fit_transform(df['review'].values)

# In[3]:

from sklearn.decomposition import LatentDirichletAllocation

lda = LatentDirichletAllocation(n_components=14,
                                random_state=123,
                                learning_method='batch')
X_topics = lda.fit_transform(X)

# In[4]:

lda.components_.shape

# In[5]:

n_top_words = 15
feature_names = count.get_feature_names()

for topic_idx, topic in enumerate(lda.components_):
    print("Topic %d:" % (topic_idx + 1))
    print(" ".join([feature_names[i]
corpus = [
    'bread bread bread bread bread bread bread bread bread bread',
    'milk milk milk milk milk milk milk milk milk milk',
    'pet pet pet pet pet pet pet pet pet pet',
    'bread bread bread bread bread bread bread bread bread bread milk milk milk milk milk milk milk milk milk milk'
]

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
matrix_X = vec.fit_transform(corpus)

from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=2,
                                topic_word_prior=0.1,
                                doc_topic_prior=0.1)
lda.fit(matrix_X)

for topic in lda.components_:
    print([topic[t] for t in topic.argsort()[::-1]])

print(lda.transform(matrix_X))
Example #47
    plt.xticks(x_pos, words, rotation=90) 
    plt.xlabel('words')
    plt.ylabel('counts')
    plt.show()

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(padfpers['text'])
# Visualise the 10 most common words
plot_10_most_common_words(count_data, count_vectorizer)

warnings.simplefilter("ignore", DeprecationWarning)
# Load the LDA model from sk-learn

top_words = list()
# Helper function
def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(model.components_):
        top_words.extend(([words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# Tweak the two parameters below
number_topics = 50
number_words = 100
# Create and fit the LDA model
res = LDA(n_components=number_topics, n_jobs=-1)
res.fit(count_data)
# Print the topics found by the LDA model
print_topics(res, count_vectorizer, number_words)
Example #48
    doc_id = read_doc_list()

    for query in doc_list:
        print(len(doc_list[query]))
        data_samples = build_matrix(doc_list[query])

        tfidf_vectorizer = TfidfVectorizer(
            max_df=0.95,
            min_df=2,  #max_features=n_features,
            stop_words='english')
        tfidf = tfidf_vectorizer.fit_transform(data_samples)
        tfidf_feature_names = tfidf_vectorizer.get_feature_names()

        lda = LatentDirichletAllocation(n_topics=20,
                                        max_iter=5,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0)
        lda.fit(tfidf)

        of = open("./../data/query_result.txt", "a")
        of.write("Query " + str(query) + "\n")
        print_top_words(lda, tfidf_feature_names, of)
        of.write("\n")

        doc_topic = lda.transform(tfidf)
        of2 = open("./../data/doc_result.txt", "a")
        of2.write("Query " + str(query) + "\n")
        print_doc_topic(doc_topic, doc_id, query, of2)
        of2.write("\n")
Example #49
        #--- Test set

        test_set = []
        y_test = []
        u_test = []
        i = -1
        for text in entity:
            i += 1
            if sampling[i] != 3:
                continue
            test_set.append(text)
            y_test.append(Score[i])
            u_test.append(UserId[i])
        X_new_counts = count_vect.transform(test_set)
        model_lda = LatentDirichletAllocation(n_topics=100)
        X_train_lda = model_lda.fit_transform(X_train_counts)
        X_test_lda = model_lda.transform(X_new_counts)
        from sklearn.multiclass import OneVsRestClassifier
        from sklearn.svm import LinearSVC
        from sklearn import linear_model
        ans_simple = OneVsRestClassifier(LinearSVC(random_state=0)).fit(
            X_train_lda, y_train).predict(X_test_lda)
        rmse_simple = 0
        mae_simple = 0
        for i, ans in zip(range(0, len(y_test)), y_test):
            mae_simple += abs(ans_simple[i] - ans)
            rmse_simple += (ans_simple[i] - ans)**2
        print('mae_SVM', mae_simple * 1.0 / (len(y_test)))
        print('rmse_SVM', (rmse_simple * 1.0 / (len(y_test)))**(0.5))
        for name2, algo in zip(['UBR-1', 'UBR-2'], [1, 2]):
Example #50
def _lda(table,
         input_col,
         num_voca=1000,
         num_topic=3,
         num_topic_word=3,
         max_iter=20,
         learning_method='online',
         learning_offset=10.,
         random_state=None):
    corpus = table[input_col]
    tf_vectorizer = CountVectorizer(max_df=0.95,
                                    min_df=2,
                                    max_features=num_voca,
                                    stop_words='english')
    term_count = tf_vectorizer.fit_transform(corpus)
    tf_feature_names = tf_vectorizer.get_feature_names()

    if learning_method == 'online':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            learning_offset=learning_offset,
            random_state=random_state).fit(term_count)
    elif learning_method == 'batch':
        lda_model = LatentDirichletAllocation(
            n_components=num_topic,
            max_iter=max_iter,
            learning_method=learning_method,
            random_state=random_state).fit(term_count)
    else:
        raise_runtime_error("Please check 'learning_method'.")

    topic_model = pd.DataFrame([])
    topic_idx_list = []
    voca_weights_list = []
    for topic_idx, weights in enumerate(lda_model.components_):
        topic_idx_list.append("Topic {}".format(topic_idx))
        pairs = []
        for term_idx, value in enumerate(weights):
            pairs.append((abs(value), tf_feature_names[term_idx]))
        pairs.sort(key=lambda x: x[0], reverse=True)
        voca_weights = []
        for pair in pairs[:num_topic_word]:
            voca_weights.append("{}: {}".format(pair[1], pair[0]))
        voca_weights_list.append(voca_weights)
    topic_model['topic idx'] = topic_idx_list
    topic_model['topic vocabularies'] = voca_weights_list

    doc_topic = lda_model.transform(term_count)

    doc_classification = pd.DataFrame()
    doc_classification['documents'] = [doc for doc in corpus]
    doc_classification['top topic'] = [
        "Topic {}".format(doc_topic[i].argmax()) for i in range(len(corpus))
    ]

    params = {
        'Input Column': input_col,
        'Number of Vocabularies': num_voca,
        'Number of Topics': num_topic,
        'Number of Terminologies': num_topic_word,
        'Iterations': max_iter,
        'Learning Method': learning_method,
    }

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""# Latent Dirichlet Allocation Result"""))
    rb.addMD(
        strip_margin("""
    |
    |### Parameters
    |
    | {display_params}
    |
    |### Topic Model
    |
    |{topic_model}
    |
    |### Documents Classification
    |
    |{doc_classification}
    |
    """.format(display_params=dict2MD(params),
               topic_model=pandasDF2MD(topic_model, num_rows=num_topic + 1),
               doc_classification=pandasDF2MD(doc_classification,
                                              num_rows=len(corpus) + 1))))

    model = _model_dict('lda')
    model['parameter'] = params
    model['topic_model'] = topic_model
    model['documents_classification'] = doc_classification
    model['_repr_brtc_'] = rb.get()

    return {'model': model}
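A minimal usage sketch for the _lda function above, assuming it sits inside the Brightics Studio codebase so that its helper imports (BrtcReprBuilder, pandasDF2MD, dict2MD, _model_dict, raise_runtime_error) resolve; the toy DataFrame, column name, and parameter values here are hypothetical:

# Hypothetical usage of _lda (toy data; assumes the Brightics helpers used above resolve).
import pandas as pd

docs = pd.DataFrame({'text': [
    "the cat chased the mouse around the house",
    "a cat and a mouse lived in the house",
    "stock prices and market indexes fell sharply",
    "the market rallied as stock prices recovered",
]})

res = _lda(docs, input_col='text', num_voca=50, num_topic=2, num_topic_word=3)
print(res['model']['topic_model'])               # top words per topic
print(res['model']['documents_classification'])  # dominant topic per document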
Ejemplo n.º 51
0
# NB: the imports and class header below are reconstructed; the original
# snippet starts in the middle of the LemmaTokenizer class.
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer


class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]


# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.90,
                                min_df=2,
                                max_features=N_FEATURES,
                                stop_words='english',
                                tokenizer=LemmaTokenizer())
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

print("Fitting LDA models with tf features, "
      "N_SAMPLES=%d and N_FEATURES=%d..." % (N_SAMPLES, N_FEATURES))
lda = LatentDirichletAllocation(n_topics=N_TOPICS,
                                max_iter=20,
                                learning_method='batch',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, N_TOP_WORDS)
Ejemplo n.º 52
0
#load in the samples:

data_samples = np.load('./books.npy')


#use tf (raw count) features for LDA:

tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')
tf = tf_vectorizer.fit_transform(data_samples)

for i in range(20):

  # fit the lda model:

  lda = LatentDirichletAllocation(n_components=n_components, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
  lda.fit(tf)
  #print("\nTopics in LDA model:")
  #tf_feature_names = tf_vectorizer.get_feature_names()
  #print_top_words(lda, tf_feature_names, n_top_words)
  score = lda.score(tf)
  theRecord.append(score)
  print("Log likelihood: ", score, "with ", n_components, "topics") #we'd like to maximise this
  print("-->Perplexity: ", lda.perplexity(tf)) #we'd like to minimise this
  n_components += 1

best = np.argmax(theRecord) + 1
print("The best number of topics to use is ", best)
print("\nTopics in the best LDA model:")
lda = LatentDirichletAllocation(n_components=best, max_iter=5, learning_method='online', learning_offset=50., random_state=0)
lda.fit(tf)
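The loop above relies on setup that the excerpt does not show; a minimal, hypothetical sketch of that setup follows (the names are taken from the code, the concrete values are assumptions):

# Hypothetical setup assumed by the topic-count search loop above;
# the names match the snippet, the values are illustrative only.
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

n_features = 1000    # vocabulary cap for CountVectorizer
n_components = 1     # starting topic count, so np.argmax(theRecord) + 1 maps back to it
n_top_words = 10     # only needed if the commented-out print_top_words call is enabled
theRecord = []       # collects lda.score(tf) for each candidate topic count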
Ejemplo n.º 53
0
    mrHeader = next(csvreader)

    # extracting each data row one by one
    for row in csvreader:
        data.append(row)

#sentiments = [["name","content","number of positive words",
#               "number of negative words","positive sentiment", "Negative sentiment",
#               "comment","like","retweet","url"]]
for row in data:
    content.append((row[1]))
    numPosWords.append(float(row[2]))
    numNegWords.append(float(row[3]))
    senti_polarity_pos.append(float(row[4]))
    senti_polarity_neg.append(float(row[5]))
    comment.append(float(row[6]))
    likes.append(float(row[7]))
    retweet.append(float(row[8]))

count_vect = CountVectorizer(max_df=0.8, min_df=2, stop_words="english")
doc_term_matrix = count_vect.fit_transform(content)
LDA = LatentDirichletAllocation(n_components=5, random_state=42)
LDA.fit(doc_term_matrix)

# pyLDAvis.enable_notebook()
# pyLDAvis.sklearn.prepare(LDA, doc_term_matrix, count_vect, mds='tsne')

for topic_idx, topic in enumerate(LDA.components_):
    print("Top 10 words for topic#", topic_idx)
    print([count_vect.get_feature_names()[i] for i in topic.argsort()[-10:]])
    print("\n")
Ejemplo n.º 54
0
CV = lda_cv.fit_transform(file_list)
lda_columns = lda_cv.get_feature_names()
df_corpus_lda = pd.DataFrame(CV.toarray(), columns=lda_columns)
df_corpus_lda.head()

# In[7]:

no_topics = 5
max_iterations = 10
learn_off = 50
random = 0

lda_model = LatentDirichletAllocation(n_components=no_topics,
                                      max_iter=max_iterations,
                                      learning_method='online',
                                      learning_offset=learn_off,
                                      random_state=random)
#lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
LDA_DH_Model = lda_model.fit_transform(CV)

# In[8]:

print(LDA_DH_Model.shape)  # docs | topics
print(LDA_DH_Model[0])

# In[9]:

print_topics(lda_model, lda_cv)

# In[10]:
Ejemplo n.º 55
0
import numpy as np
from time import time

# Data
from preprocessing.read_ap import sparse_docs as W_tr
from preprocessing.dictionary import (dictionary as dic,
                                      inverse_dictionary as inv_dic, terms)

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

# Model
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=100, max_iter=20,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(W_tr)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
print_top_words(lda, terms, 20)
Ejemplo n.º 56
0
def extract_bonus(text):
    """This method extracts the LDA and LSA features, saving them to a file
    (a1_bonus_lda.txt). Then, the method returns the LDA features for use in
    classification, since LDA is empirically shown to beat LSA in general.

    :param text: text to be featurized using LDA or LSA (full comments)
    :return: all LDA features as a matrix
    """
    # tokens arrive tagged (word/TAG); the tag is stripped below before vectorizing
    for use_LDA in [False, True]:
        if use_LDA:
            featurizer = CountVectorizer(stop_words='english')
        else:
            featurizer = TfidfVectorizer(sublinear_tf=True,
                                         stop_words='english')
        data, labels = zip(*[(c['body'], c['cat']) for c in text])
        new_data = []
        for comment in data:
            row = " ".join(
                [word[:word.rfind('/')] for word in comment.split(' ')])
            # the sentence is now just lemmatized, tokenized words separated by
            # spaces, as required by sklearn's CountVectorizer.
            new_data.append(row)
        data = new_data
        labels = [files[lbl][1][0, -1]
                  for lbl in labels]  # transform to integer
        data = featurizer.fit_transform(data)
        n_components = 100
        if use_LDA:
            topic_modeller = LatentDirichletAllocation(
                n_components=n_components, batch_size=100, random_state=2)
        else:
            topic_modeller = TruncatedSVD(n_components=n_components,
                                          n_iter=1,
                                          random_state=2)
        data = topic_modeller.fit_transform(data)
        labels = np.array(labels)[:, np.newaxis]
        data = np.concatenate([data, labels], axis=1)
        with open('a1_bonus_lda.txt', 'w' if not use_LDA else 'a') as outf:
            if use_LDA:
                topic_distribution = topic_modeller.components_ / topic_modeller.components_.sum(
                    axis=1)[:, np.newaxis]
                for i in range(n_components):
                    top_10_indices = np.argpartition(topic_distribution[i],
                                                     -10)[-10:]
                    top_10_words = np.array(
                        featurizer.get_feature_names())[top_10_indices]
                    top_10_probs = topic_distribution[i, top_10_indices]
                    outf.write(
                        f'topic {i} is best described by the 10 words: {top_10_words} with probabilities: {top_10_probs}\n'
                    )
            else:
                outf.write(
                    f"explained variance from total variance is: {topic_modeller.explained_variance_ratio_.sum()}\n"
                )
    with open('a1_bonus_lda.txt', 'a') as outf:
        outf.write(
            'We see that topic 0 likely consists of positive adverbs, similar to what we saw with our KBest feature selection. '
            'Topic 1 is difficult to describe, but could be characterised as how pro-life or thoughtful the text is. '
            'Topic 2 makes a lot of sense, and is related to the religion and country of origin. '
            'Since I printed all 100 topics, I will show a select few more. '
            'Topics 97 and 99 are both related to censorship, which makes sense as they relate generally to rightist views. '
            'Topic 98 relates to compassion and sharing, which could be tied to leftist views. '
            'As we can see, the topics selected by the LDA generally correspond with the ideas associated with the views of '
            'different political texts. These topics enable us to reduce the number of dimensions while keeping the important information. '
            'We could use these topics to perform unsupervised learning on the different clusters of text to understand if there are any class imbalances '
            'in our data, which might result from mis-labelled datapoints (given that our dataset was poorly labelled: texts in a Left channel might have rightist views and just be arguing there).'
        )
    return data
Ejemplo n.º 57
0
    tfidf_vectorizer = TfidfVectorizer(use_idf=True)

    # just send in all your docs here
    tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(df_u_clean)

    # get the first vector out (for the first document)
    first_vector_tfidfvectorizer = tfidf_vectorizer_vectors[0]

    # place tf-idf values in a pandas data frame
    df = pd.DataFrame(first_vector_tfidfvectorizer.T.todense(), index=tfidf_vectorizer.get_feature_names(),
                      columns=["tfidf"])
    df = df.sort_values(by=["tfidf"], ascending=False)  # sort_values returns a new frame

    lda_model = LatentDirichletAllocation(n_components=20,  # Number of topics
                                          learning_method='online',
                                          random_state=0,
                                          n_jobs=-1  # Use all available CPUs
                                          )
    lda_output = lda_model.fit_transform(data_vectorized)

    topic_keywords = show_topics(vectorizer=vectorizer, lda_model=lda_model, n_words=20)

    # print(topic_keywords[1])

    # learn tfidf using TfidfVectorizer from sklean
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, ngram_range=(1,5), stop_words='english', # ngram_range=(1,6)
                                        analyzer = 'word',
                                        min_df = 3,  # minimum required occurences of a word
                                        # min_df = 3
                                        lowercase = True,  # convert all words to lowercase
                                        token_pattern = '[a-zA-Z0-9]{3,}',  # tokens of 3+ chars
Ejemplo n.º 58
0
type = "tf"
n_features = 10

for i in range(0, 10):
    if type == "tfidf":
        vectorizer = TfidfVectorizer(max_df=0.95,
                                     min_df=2,
                                     stop_words='english')
    else:
        vectorizer = CountVectorizer(max_df=0.95,
                                     min_df=2,
                                     stop_words='english')

    vecs = vectorizer.fit_transform(data["abstract"].tolist())
    lda = LatentDirichletAllocation(learning_method="batch").fit(vecs)

    vectorizers.append(vectorizer)
    ldas.append(lda)

for lda, vectorizer in zip(ldas, vectorizers):
    print_top_words(lda, vectorizer.get_feature_names(), n_features)
words = [[
    set(d)
    for d in get_top_words(lda, vectorizer.get_feature_names(), n_features)
] for lda, vectorizer in zip(ldas, vectorizers)]

distances = np.eye(len(words))
intersections = np.eye(len(words), dtype=object)
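The excerpt stops after allocating distances and intersections; a hedged sketch of one way they might be filled, comparing the pooled top-word sets of the runs with Jaccard distance (this continuation is an assumption, not part of the original snippet):

# Hypothetical continuation: pairwise Jaccard distance between the pooled
# top-word sets of the runs, as one possible topic-stability measure.
for a in range(len(words)):
    vocab_a = set().union(*words[a])      # all top words found in run a
    for b in range(len(words)):
        vocab_b = set().union(*words[b])  # all top words found in run b
        common = vocab_a & vocab_b
        union = vocab_a | vocab_b
        intersections[a, b] = common
        distances[a, b] = 1.0 - len(common) / len(union) if union else 0.0

print(distances)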
Ejemplo n.º 59
0
con.row_factory = dict_factory
cur = con.cursor()
cur.execute("select * from headlines")
results = cur.fetchall()

#tf-idf the articles
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3))
X = vectorizer.fit_transform([*map(lambda x: x['text'], results)])

for item in X[0]:
    print(item)

#print(vectorizer.get_feature_names())

svd = TruncatedSVD(n_components=100, n_iter=100)
lda = LatentDirichletAllocation(n_components=10)
L = lda.fit(X)
S = svd.fit(X)
#normalizer = Normalizer(copy=False)
#lsa = make_pipeline(svd, normalizer)
#X = lsa.fit_transform(X)

terms = vectorizer.get_feature_names()
for i, comp in enumerate(S.components_):
    termsInComp = zip(terms, comp)
    sortedTerms = sorted(termsInComp, key=lambda x: x[1], reverse=True)[:20]
    print("Concept %d:" % i)
    for term in sortedTerms:
        print(term[0])
    print(" ")
        tree_repr = unidecode(line.strip())
        if len(tree_repr) == 0:
            continue

        while tree_repr[-1] == ' ':
            tree_repr = tree_repr[:-1]

        test.append(tree_repr)

    vectorizer = CountVectorizer(stop_words=stop_words)
    data = vectorizer.fit_transform(test)

    start_time = time.time()

    lda = LatentDirichletAllocation(n_components=nb_topics, random_state=0)
    lda.fit(data)

    print('\nSklearn LDA exec time: ' + str(time.time() - start_time) + 's')

    #Print the nb_words_topic main words of each topic, for the Sklearn LDA implementation
    print("\nTopics found via Sklearn LDA: ")
    words = vectorizer.get_feature_names()
    for topic_idx, topic in enumerate(
            lda.components_ / lda.components_.sum(axis=1)[:, np.newaxis]):
        print("\nTopic " + str(topic_idx + 1) + ': ' + str(nb_words_topic) +
              ' most important words, with p(w|z):')
        topic_sorted = topic.argsort()[:-nb_words_topic - 1:-1]
        probas = np.sort(np.array(topic))[::-1]
        for i in range(len(topic_sorted)):
            print(words[topic_sorted[i]] + ', ' +