Example #1
                    labelleft=False,
                    left=False,
                    right=False)


digits = load_digits()

images = digits['images']
images = [image.reshape((1, -1)) for image in images]
images = np.concatenate(tuple(images), axis=0)

topicsRange = [(i + 1) * 5 for i in range(10)]
print(topicsRange)

ldaModels = [
    LDA(n_components=numTopics, learning_method='batch')
    for numTopics in topicsRange
]

for lda in ldaModels:
    lda.fit(images)

scores = [lda.perplexity(images) for lda in ldaModels]

plt.plot(topicsRange, scores)
plt.show()

maxLogLikelihoodTopicsNumber = np.argmin(scores)
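# note: np.argmin returns the index of the best (lowest-perplexity) model within topicsRange,
# not the topic count itself; the lowest perplexity corresponds to the highest log-likelihood here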
plotNumbers = [4, 14, 24, 34, 44, 49]

if maxLogLikelihoodTopicsNumber not in plotNumbers:
Example #2
	def train_model(self):
		get_messages_sql = s.sql.text(
                            """
				SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, i.issue_id thread_id,m.msg_text,i.issue_title thread_title,m.msg_id
					FROM augur_data.repo r, augur_data.issues i,
					augur_data.message m, augur_data.issue_message_ref imr
					WHERE r.repo_id=i.repo_id
					AND imr.issue_id=i.issue_id
					AND imr.msg_id=m.msg_id
					UNION
				SELECT r.repo_group_id, r.repo_id, r.repo_git, r.repo_name, pr.pull_request_id thread_id,m.msg_text,pr.pr_src_title thread_title,m.msg_id
					FROM augur_data.repo r, augur_data.pull_requests pr,
					augur_data.message m, augur_data.pull_request_message_ref prmr
					WHERE r.repo_id=pr.repo_id
					AND prmr.pull_request_id=pr.pull_request_id
					AND prmr.msg_id=m.msg_id
				"""
                                )
		msg_df_all = pd.read_sql(get_messages_sql, self.db, params={})
		
		
		#select only highly active repos
		msg_df_all = msg_df_all.groupby("repo_id").filter(lambda x: len(x)>500)
		
		#combining all the messages in a repository to form a single doc
		msg_df = msg_df_all.groupby('repo_id')['msg_text'].apply(','.join)
		msg_df = msg_df.reset_index()
		
		#dataframe summarizing total message count in a repository
		message_desc_df = msg_df_all[["repo_id","repo_git","repo_name","msg_id"]].groupby(["repo_id","repo_git","repo_name"]).agg('count').reset_index()
		message_desc_df.columns = ["repo_id","repo_git", "repo_name", "message_count"]
		self.logger.info(msg_df.head())
		
		tfidf_matrix, features = self.get_tf_idf_matrix(msg_df['msg_text'], self.max_df, self.max_features, self.min_df, self.ngram_range)
		msg_df['cluster'] = self.cluster_and_label(tfidf_matrix, self.num_clusters)
		
		
		
		#visualize_labels_PCA(tfidf_matrix.todense(), msg_df['cluster'], msg_df['repo_id'], 2, "MIN_DF={} and MAX_DF={} and NGRAM_RANGE={}".format(MIN_DF, MAX_DF, NGRAM_RANGE))
		
		
		#LDA - Topic Modeling
		count_vectorizer = CountVectorizer(max_df=self.max_df, max_features=self.max_features, min_df=self.min_df,stop_words="english", tokenizer=self.preprocess_and_tokenize)
		
		#count_matrix = count_vectorizer.fit_transform(msg_df['msg_text'])
		count_transformer = count_vectorizer.fit(msg_df['msg_text'])
		count_matrix = count_transformer.transform(msg_df['msg_text'])
		pickle.dump(count_transformer.vocabulary_, open("vocabulary_count",'wb'))
		feature_names = count_vectorizer.get_feature_names()
			
		lda_model = LDA(n_components=self.num_topics)
		lda_model.fit(count_matrix)
		# each row of lda_model.components_ holds that topic's (unnormalized) word weights
		topic_list = lda_model.components_
		# Getting word probability 
		# word_prob = lda_model.exp_dirichlet_component_
		#word probabilities 
		#lda_model does not have state variable in this library
		# topics_terms = lda_model.state.get_lambda()
		# topics_terms_proba = np.apply_along_axis(lambda x: x/x.sum(),1,topics_terms)
		# word_prob = [lda_model.id2word[i] for i in range(topics_terms_proba.shape[1])]
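		# A minimal sketch (not in the original): per-topic word probabilities can be obtained by
		# row-normalizing components_, e.g. word_prob = topic_list / topic_list.sum(axis=1, keepdims=True)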

		# Site explaining main library used for parsing topics: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html

		# Good site for optimizing: https://medium.com/@yanlinc/how-to-build-a-lda-topic-model-using-from-text-601cdcbfd3a6
		# Another Good Site: https://towardsdatascience.com/an-introduction-to-clustering-algorithms-in-python-123438574097
		# https://machinelearningmastery.com/clustering-algorithms-with-python/

		logging.info("Topic List Created: {}".format(topic_list))
		pickle.dump(lda_model, open("lda_model",'wb'))
		logging.info("pickle dump")

		## Advance Sequence SQL
		
		# key_sequence_words_sql = s.sql.text(
  #                           """
		# 		SELECT nextval('augur_data.topic_words_topic_words_id_seq'::text)
		# 		"""
  #                               )

		# twid = self.db.execute(key_sequence_words_sql)
		# self.logger.info("twid variable is: {}".format(twid)) 
		#insert topic list into database
		topic_id = 1
		for topic in topic_list:
			#twid = self.get_max_id('topic_words', 'topic_words_id') + 1
			#self.logger.info("twid variable is: {}".format(twid))
			for i in topic.argsort()[:-self.num_words_per_topic-1:-1]:
				#twid+=1
				#self.logger.info("in loop incremented twid variable is: {}".format(twid))
				#self.logger.info("twid variable is: {}".format(twid))
				record = {
				  #'topic_words_id': twid,
				  #'word_prob': word_prob[i],
				  'topic_id': int(topic_id),
				  'word': feature_names[i],
				  'tool_source' : self.tool_source,
				  'tool_version' : self.tool_version,
				  'data_source' : self.data_source
				  }
				result = self.db.execute(self.topic_words_table.insert().values(record))
				self.logger.info("Primary key inserted into the topic_words table: {}".format(result.inserted_primary_key))
			topic_id+=1
		
		#insert topic list into database
		
		#save the model and predict on each repo separately
			
		
		prediction = lda_model.transform(count_matrix)

		topic_model_dict_list = []
		for i, prob_vector in enumerate(prediction):
			topic_model_dict = {}
			topic_model_dict['repo_id'] = msg_df.loc[i]['repo_id']
			for topic_num, prob in enumerate(prob_vector):
				topic_model_dict["topic"+str(topic_num+1)] = prob
			topic_model_dict_list.append(topic_model_dict)
		topic_model_df = pd.DataFrame(topic_model_dict_list)

		result_content_df = topic_model_df.set_index('repo_id').join(message_desc_df.set_index('repo_id')).join(msg_df.set_index('repo_id'))
		result_content_df = result_content_df.reset_index()
		self.logger.info(result_content_df)
		
		POS_count_dict = msg_df.apply(lambda row : self.count_func(row['msg_text']), axis = 1)
		msg_df_aug = pd.concat([msg_df,pd.DataFrame.from_records(POS_count_dict)], axis=1)
		self.logger.info(msg_df_aug)
def create_and_fit_lda(data, num_topics):
    lda = LDA(n_components=num_topics, n_jobs=-1)
    lda.fit(data)
    return lda
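# A minimal usage sketch for the helper above (not part of the original snippet); it assumes the
# usual alias `from sklearn.decomposition import LatentDirichletAllocation as LDA` is in scope and
# builds a small hypothetical document-term count matrix with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

toy_docs = ["topic models group words", "words cluster into topics"]  # hypothetical toy corpus
toy_dtm = CountVectorizer().fit_transform(toy_docs)
toy_lda = create_and_fit_lda(toy_dtm, num_topics=2)
print(toy_lda.transform(toy_dtm))  # per-document topic mixtures; each row sums to 1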
Example #4
def main():
    data1 = pandas.read_csv(r"liwc_input.csv")
    data2 = pandas.read_csv(r'liwc_test.csv')
    trainX = data1.iloc[:, 1:99]
    yTrain = data1.iloc[:, 99]
    testX = data2.iloc[:, 1:99]
    yTest = data2.iloc[:, 99]
    runBaseline = True
    #trainX, testX, yTrain, yTest = cross_validation.train_test_split(X, Y, test_size=0.1, random_state=0)  #test train split

    vectorizer = feature_extraction.text.TfidfVectorizer()
    sentiment_scaler = preprocessing.StandardScaler()
    liwc_scaler = preprocessing.StandardScaler()
    unigrams = vectorizer.fit_transform(trainX["text"]).toarray()
    vectorizer1 = feature_extraction.text.TfidfVectorizer()
    #synst=vectorizer1.fit_transform(trainX["synset"].values.astype('U')).toarray()
    tf_vectorizer = feature_extraction.text.CountVectorizer()
    tf = tf_vectorizer.fit_transform(trainX["text"]).toarray()
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda = LDA(n_components=10,
              max_iter=5,
              learning_method='online',
              learning_offset=50.,
              random_state=0).fit(tf)
    lda_train = lda.transform(tf)
    sentiment = sentiment_scaler.fit_transform(trainX.loc[:, "pscore":"obscore"])
    liwc = liwc_scaler.fit_transform(trainX.loc[:, "WC":"OtherP"])
    allf = np.hstack((unigrams, lda_train, liwc, sentiment))

    unigrams_t = vectorizer.transform(testX["text"]).toarray()
    liwc_t = liwc_scaler.transform(testX.loc[:, "WC":"OtherP"])  # transform only, so the test data reuses the training scaling
    tf_t = tf_vectorizer.transform(testX["text"]).toarray()
    lda_test = lda.transform(tf_t)
    sentiment_t = sentiment_scaler.transform(testX.loc[:, "pscore":"obscore"])
    #3synst_t = vectorizer1.transform(testX["synset"].values.astype('U')).toarray()
    allf_t = np.hstack((unigrams_t, lda_test, liwc_t, sentiment_t))

    features = {
        "sentiment": (sentiment, sentiment_t),
        "lda": (lda_train, lda_test),
        'unigrams': (unigrams, unigrams_t),
        "liwc": (liwc, liwc_t),
        "all": (allf, allf_t)
    }

    for f in features:
        xTrain = features[f][0]
        xTest = features[f][1]

        if runBaseline:
            baseline = dummy.DummyClassifier(strategy='most_frequent',
                                             random_state=0)
            baseline.fit(xTrain, yTrain)
            predictions = baseline.predict(xTest)

            print(indent("Baseline: ", 4))
            print(indent("Test Accuracy: ", 4),
                  metrics.accuracy_score(yTest, predictions))
            print(indent(metrics.classification_report(yTest, predictions), 4))
            print()
            runBaseline = False

        print(indent("Features: ", 4), f)
        count = 0
        ac = [0, 0, 0, 0, 0, 0, 0, 0]
        for model, name in zip(models, model_names):
            model.fit(xTrain, yTrain)
            # Simple SVM
            # print('fitting...')
            prediction = model.predict(xTest)
            # Print Accuracy
            print(model)
            print(indent("Test Accuracy: ", 4),
                  metrics.accuracy_score(yTest, prediction))
            print(indent(metrics.classification_report(yTest, prediction), 4))
            print()
            # clf = SVC(C=20.0, gamma=0.00001)
            # clf.fit(X_train, y_train)
            # acc = clf.score(X_test, y_test)

        print()
    print()
Example #5

# Initialise the count vectorizer with the English stop words
count_vectorizer = CountVectorizer(stop_words='english')
# Fit and transform the processed titles
count_data = count_vectorizer.fit_transform(papers)
print(count_data)

plot_10_most_common_words(count_data, count_vectorizer)


def print_topics(model, count_vectorizer, n_top_words):
    words = count_vectorizer.get_feature_names()
    lines = []
    for topic_idx, topic in enumerate(model.components_):
        lines.append("\nTopic #%d:" % topic_idx)
        lines.append(" ".join(
            [words[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    text = "\n".join(lines)
    print(text)
    # also return the assembled text so it can be written to a file further below
    return text


# Tweak the two parameters below
number_topics = 5
number_words = 5
# Create and fit the LDA model
lda = LDA(n_components=number_topics)

lda.fit(count_data)
# Print the topics found by the LDA model
print("Topics found via LDA:")
print_topics(lda, count_vectorizer, number_words)
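# A small follow-on sketch (not in the original snippet): the fitted model's per-document
# topic mixtures can be inspected with transform.
doc_topic_distributions = lda.transform(count_data)  # shape: (n_documents, number_topics); rows sum to 1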
def main():
    warnings.simplefilter('ignore', DeprecationWarning)
    sns.set_style('whitegrid')

    #os.chdir('..')

    #make pd df from allegation summaries
    summary_file = open('data/raw/complaints.json', encoding="utf8")
    summary_data = json.load(summary_file)
    summary_file.close()
    summaries = []

    en_core = spacy.load('en_core_web_lg')

    for dicts in summary_data:
        summaries.append(dicts["summary"])

    summary_df = pd.DataFrame(summaries, columns=['summary'])

    stopwords = set([
        'cook county', 'chicago', 'allegedly', 'po', 'ofcer', 'sergeant',
        'ipra', 'detective', 'accused', 'officer', 'officers', 'reporting',
        'party', 'alleged', 'alleges', 'complainant', 'victim', 'police',
        '-pron-', 'pron', '-PRON-', '[-PRON-]', 'allege', 'accuse', 'report'
    ])

    print('Preprocessing starting')

    #remove punctuation
    summary_df['summary_text_processed1'] = summary_df['summary'].map(
        lambda x: re.sub(r'[,\.!?()]', '', x))

    #convert to lower
    summary_df['summary_text_processed2'] = summary_df[
        'summary_text_processed1'].map(lambda x: x.lower())

    #lemmatize
    summary_df['summary_text_processed3'] = summary_df[
        'summary_text_processed2'].map(
            lambda x: " ".join([y.lemma_ for y in en_core(x)]))

    #remove years
    summary_df['summary_text_processed4'] = summary_df[
        'summary_text_processed3'].map(
            lambda x: re.sub('(19|20)[0-9][0-9]', '', x))

    #remove stopwords
    summary_df['summary_text_processed'] = summary_df[
        'summary_text_processed4'].map(lambda x: " ".join(
            [item for item in x.split() if item not in stopwords]))

    print('Preprocessing finished')

    # Initialise the count vectorizer with the English stop words
    count_vectorizer = CountVectorizer(stop_words='english')

    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(
        summary_df['summary_text_processed'])

    NUM_ITERATIONS = 10
    NUM_TOPICS = 20
    NUM_WORDS = 12

    for iteration in range(NUM_ITERATIONS):
        number_topics = NUM_TOPICS
        number_words = NUM_WORDS

        print('Training model ' + str(iteration))

        lda = LDA(n_components=number_topics, max_iter=80)
        lda.fit(count_data)

        filename = 'lda_models_and_test_files/lda_model_iteration_' + str(
            iteration) + '_topics_' + str(number_topics) + '_words_' + str(
                number_words)

        filename_sav = filename + '.sav'

        filename_txt = filename + '.txt'

        if not os.path.exists('lda_models_and_test_files'):
            os.makedirs('lda_models_and_test_files')

        with open(filename_sav, 'wb') as f:
            pickle.dump(lda, f)

        with open(filename_txt, 'w+') as f:
            f.write('Topics found via LDA:')
            st = print_topics(lda, count_vectorizer, number_words)
            f.write(st)
            st = str(lda.transform(count_data[-10:]))
            f.write(st)
Example #7
                                           stop_words=STOP_WORDS,
                                           ngram_range=(1, 2),
                                           max_df=max_df,
                                           min_df=min_df)
    else:
        count_vectorizer = CountVectorizer(analyzer='word',
                                           stop_words=STOP_WORDS,
                                           max_df=max_df,
                                           min_df=min_df)

    # Fit and transform the processed titles
    count_data = count_vectorizer.fit_transform(messages)

    if alpha and eta:
        lda = LDA(n_components=n_clusters,
                  n_jobs=-1,
                  doc_topic_prior=alpha,
                  topic_word_prior=eta)
    else:
        lda = LDA(n_components=n_clusters, n_jobs=-1)
    X = lda.fit_transform(count_data)

    # Each message is assigned to the topic that best represents it.
    labels = np.argmax(X, axis=1)

    # Save clusters in given folders.
    save_clusters(messages, labels, output_dir)

    with open(os.path.join(output_dir, 'lda.model'), 'wb') as f:
        pickle.dump(lda, f)
    with open(os.path.join(output_dir, 'vect.model'), 'wb') as f:
        pickle.dump(count_vectorizer, f)
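    # A minimal reuse sketch (not part of the original): the pickled vectorizer and LDA model can be
    # reloaded to assign topics to new messages; `new_messages` below is a hypothetical list of strings.
    # with open(os.path.join(output_dir, 'vect.model'), 'rb') as f:
    #     loaded_vect = pickle.load(f)
    # with open(os.path.join(output_dir, 'lda.model'), 'rb') as f:
    #     loaded_lda = pickle.load(f)
    # new_labels = np.argmax(loaded_lda.transform(loaded_vect.transform(new_messages)), axis=1)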
Example #8
def compute_Semantics_1b(method, genre_, k_topics):
	'''Here the data is (genre X actors) with each cell having Tf-IDF values for that genre and actor'''
	print "\n\n\n============================================"
	#All genres
	_genres = MlMovies.objects.values_list('genres', flat=True)
	genres = []
	for genre in _genres:
		genres.extend(genre.split(','))
	genres = [x.strip() for x in genres]
	genres = list(set(genres))

	#All actors
	actorobjs = ImdbActorInfo.objects.values_list('actorid','name')
	actors_dict = {x[0]:x[1] for x in actorobjs}
	actors = ImdbActorInfo.objects.values_list('actorid', flat=True)

	'''Matrix Dataset'''
	V = sp.lil_matrix((len(genres), len(actors)))
	decomposed = []
	'''get tf-idfs vectors for each genre w.r.t actors'''
	for i in range(len(genres)):
		tf_idf = print_genreactor_vector.main(str(genres[i]))
		for j in range(len(actors)):
			V[i, j] = tf_idf[actors[j]]

	if (method.upper() == 'SVD'):
		'''  SVD  Calculation '''
		U, sigma, Vt = svds(V, k=k_topics)
		sigma = np.diag(sigma)
		# print "\n\nSigma = \t",sigma
		print "\n\nU:", len(U), len(U[0]), "Sigma: ", sigma.shape, " V: ", Vt.shape, "\n\n"
		#print U
		print "For genre",genre_,"Latent semantics are:", U[genres.index(genre_)]
		decomposed = U

	if (method.upper() == 'PCA'):
		# standardizing data
		V = sp.csr_matrix(V).todense()
		V_std = StandardScaler().fit_transform(V)
		print "Stdandardized size: ", V_std.shape

		'''PCA::   Using Inbuilt library function'''
		sklearn_pca = PCA(n_components=k_topics)
		pca = sklearn_pca.fit(V_std)
		Vt = pca.components_
		# print Vt
		decomposed = pca.transform(V_std)
		print "For genre",genre_,"Latent semantics are:", decomposed[genres.index(genre_)]

	if (method.upper() == 'LDA'):
		'''TODO: Create a matrix with docs as rows and words as columns, each cell holding a frequency count rather than tf-idf'''
		for i, gen in enumerate(genres):
			tobjects = Task5.objects.filter(genre=gen)
			t_actors_id = tobjects.values_list('actorid', flat=True)
			for j in range(len(actors)):
				aid = actors[j]
				if aid in t_actors_id:
					V[i, j] = tobjects.get(actorid=int(aid)).score
				else:
					V[i, j] = 0.0

		lda = LDA(n_components=k_topics, max_iter=10000, learning_method="batch",evaluate_every=10,perp_tol=1e-12)
		lda.fit(V)
		Vt = lda.components_
		decomposed = lda.transform(V)
		print "For genre",genre_,"Latent semantics are:", decomposed[genres.index(genre_)]


	'''SVD, PCA :: In order to give latent semantics some names: normalize each column of the feature factor matrix
					  and then pick the top 10 actors that best describe that latent semantic '''
	#normed_Vt = normalize(Vt, axis=0, norm='max')
	#normed_Vt = Vt / Vt.sum(axis=0)
	normed_Vt = Vt.copy()

	x = normed_Vt.max(axis=0)
	y = normed_Vt.min(axis=0)
	for i in range(len(normed_Vt)):
		for j in range(len(normed_Vt[0])):
			normed_Vt[i][j] = float(normed_Vt[i][j] - y[j]) / float(x[j] - y[j])

	for i in range(k_topics):
		idx = np.argpartition(-normed_Vt[i], 10)[:10]
		print "Latent Semantic: ", i + 1, " = "
		li = []
		for j in idx:
			li.append(actors_dict[actors[j]])
		print '\t', li, "\n"
	return decomposed
Example #9
def compute_Semantics_1a(method, genre_,k_topics):
	"""Here the data is (genre X tags) with each cell having Tf-IDF values for that genre and tag"""
	print "\n\n\n============================================"
	#All genres
	_genres = MlMovies.objects.values_list('genres', flat=True)
	genres = []
	for genre in _genres:
		genres.extend(genre.split(','))
	genres = [x.strip() for x in genres]
	genres = list(set(genres))
	#All tags
	tagobjs = GenomeTags.objects.values_list('tagid','tag')
	tags_dict = {x[0]:x[1] for x in tagobjs}
	tags = GenomeTags.objects.values_list('tagid', flat=True)

	'''Matrix Dataset'''
	V = sp.lil_matrix((len(genres), len(tags)))
	decomposed = []
	'''get tf-idfs vectors for genre-tag pairs and fill the matrix
		0 if genre-tag doesn't exist'''

	for i in range(len(genres)):
		# tf_idf = compute_tf_idf_movie(cur_movie,"TF-IDF")
		tf_idf = print_genre_vector.main(str(genres[i]), 1)
		for j in range(len(tags)):
			V[i, j] = tf_idf[tags[j]]

	if(method.upper() == 'SVD'):
		'''  SVD  Calculation '''
		U, sigma, Vt = svds(V, k=k_topics)
		sigma = np.diag(sigma)
		# print "\n\nSigma = \t",sigma
		print "\n\nU:", len(U), len(U[0]), "Sigma: ", sigma.shape, " V: ", Vt.shape, "\n\n"
		#print U
		decomposed = U
		print "For genre",genre_,"Latent semantics are:", U[genres.index(genre_)]


	if(method.upper() == 'PCA'):
		# standardizing data
		V = sp.csr_matrix(V).todense()
		V_std = StandardScaler().fit_transform(V)
		#print "Stdandardized size: ", V_std.shape

		'''PCA::   Using Inbuilt library function'''
		sklearn_pca = PCA(n_components=k_topics)
		pca = sklearn_pca.fit(V_std)
		Vt = pca.components_
		#print Vt
		decomposed = pca.transform(V_std)
		print "For genre",genre_,"Latent semantics are:", decomposed[genres.index(genre_)]

	if (method.upper() == 'LDA'):
		'''TODO: Create a matrix with docs as rows and words as columns, each cell holding a frequency count rather than tf-idf'''
		for i, gen in enumerate(genres):
			tobjects = Task2.objects.filter(genre=gen)
			t_tags = tobjects.values_list('tag', flat=True)
			tags_dict1 = deepcopy(tags_dict)
			inv_tags = {v: k for k, v in tags_dict1.iteritems()}
			t_tags_id = [inv_tags[x] for x in inv_tags if x in t_tags]
			for j in range(len(tags)):
				tid = tags[j]
				if tid in t_tags_id:
					V[i, j] = tobjects.get(tag=str(tags_dict[tid])).score
				else:
					V[i, j] = 0.0

		lda = LDA(n_components=k_topics, max_iter=10000, learning_method="batch",evaluate_every=10,perp_tol=1e-12)
		lda.fit(V)
		Vt = lda.components_
		decomposed = lda.transform(V)
		print "For genre",genre_,"Latent semantics are:", decomposed[genres.index(genre_)]

	'''In order to give latent semantics some names: normalize each column of the feature factor matrix
					  and then pick the top 10 tags that best describe that latent semantic '''
	#normalize columns for most discriminating feature finding
	#normed_Vt = Vt/Vt.sum(axis=0)
	normed_Vt = Vt.copy()

	x = normed_Vt.max(axis=0)
	y = normed_Vt.min(axis=0)
	for i in range(len(normed_Vt)):
		for j in range(len(normed_Vt[0])):
			normed_Vt[i][j] = float(normed_Vt[i][j] - y[j]) / float(x[j] - y[j])

	#print "\n\nHo ho!!\n", normed_Vt
	#print tags_dict
	for i in range(k_topics):
		idx = np.argpartition(-normed_Vt[i], 10)[:10]
		# print "What is this?", -np.partition(-normed_Vt[0], 5)[:5]
		#rint idx
		print "Latent Semantics: ", i + 1, " = "
		li = []
		for j in idx:
			li.append(tags_dict[tags[j]])
		print '\t', li, "\n"

	return decomposed
Example #10
count_data = count_vectorizer.fit_transform(gd['pro_text'])
vocab = count_vectorizer.get_feature_names()
word2id = dict((v, idx) for idx, v in enumerate(vocab))
#pickle.dump(vocab,open('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/vocab.pkl','w'))

#pickle.dump(word2id,open('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/word2id.pkl','w'))

#scipy.sparse.save_npz('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/count_data.npz', count_data)

#count_data = scipy.sparse.load_npz('/home/mcorrito/projects/def-mcorrito/mcorrito/HH/temp_data/count_data.npz')

#count_data = count_data.toarray()

# Create and fit the LDA model
lda = LDA(n_components=number_topics,
          n_jobs=-1,
          random_state=182,
          learning_method='online')
fit = lda.fit(count_data)
topic_weights = lda.transform(count_data)
weights = pd.DataFrame(topic_weights)
final = pd.concat([ids, weights], axis=1, ignore_index=True)
final.to_csv('~/projects/def-mcorrito/mcorrito/HH/data/lda_weights.csv',
             index=False,
             header=False)

# Print the topics found by the LDA model
print_topics(lda, count_vectorizer, number_words)

# # Guided LDA with seed topics.
# seed_topic_list = [['innov','experi','experiment','dynam','fast','creativ']]
Example #11
def LDA_modeling(corona_body_all_text):
    corpus=[]
    for text in corona_body_all_text:
        corpus.append(' '.join(text))

    # the number of terms included in the bag of words matrix is restricted to the top 100
    no_features=100
    tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
    corpus_matrix = tf_vectorizer.fit_transform(corpus)
    print(corpus_matrix.shape)

    # the most representative top 100 words (no_features) in corona_body_all_text,
    # returned by get_feature_names() as a plain list of terms
    word_feature_list = tf_vectorizer.get_feature_names()


    """
    this is for the grid search
    """
    search_params = {'n_components': [10, 15, 20, 25, 30], 'learning_decay': [.5, .7, .9]}
    lda = LDA()
    model= GridSearchCV(lda, param_grid=search_params, n_jobs=6)
    model.fit(corpus_matrix)

    best_lda_model=model.best_estimator_

    # Model Parameters
    print("Best Model's Params: ", model.best_params_)

    # Log Likelihood Score
    print("Best Log Likelihood Score: ", model.best_score_)

    # Perplexity
    print("Model Perplexity: ", best_lda_model.perplexity(corpus_matrix))

    # Get Log Likelihoods from Grid Search Output
    n_topics = [10, 15, 20, 25, 30]
    log_likelyhoods_5 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['param_learning_decay']) if
                         gscore == 0.5]
    log_likelyhoods_7 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['param_learning_decay']) if
                         gscore== 0.7]
    log_likelyhoods_9 = [round(model.cv_results_['mean_test_score'][index]) for index, gscore in enumerate(model.cv_results_['param_learning_decay']) if
                         gscore== 0.9]

    # Show graph
    plt.figure(figsize=(12, 8))
    plt.plot(n_topics, log_likelyhoods_5, label='0.5')
    plt.plot(n_topics, log_likelyhoods_7, label='0.7')
    plt.plot(n_topics, log_likelyhoods_9, label='0.9')
    plt.title("Choosing Optimal LDA Model")
    plt.xlabel("Num Topics")
    plt.ylabel("Log Likelyhood Scores")
    plt.legend(title='Learning decay', loc='best')
    plt.savefig('learning_decay.pdf')
    plt.show()

    # Create Document - Topic Matrix
    lda_output = best_lda_model.transform(corpus_matrix)

    # column names
    topicnames = ["Topic" + str(i) for i in range(best_lda_model.n_components)]

    # index names
    docnames = ["Doc" + str(i) for i in range(len(corona_body_all_text))]

    # Make the pandas dataframe
    df_document_topic = pd.DataFrame(np.round(lda_output, 2), columns=topicnames, index=docnames)

    # Get dominant topic for each document
    dominant_topic = np.argmax(df_document_topic.values, axis=1)
    df_document_topic['dominant_topic'] = dominant_topic

    # Styling
    def color_green(val):
        color = 'green' if val > .1 else 'black'
        return 'color: {col}'.format(col=color)

    def make_bold(val):
        weight = 700 if val > .1 else 400
        return 'font-weight: {weight}'.format(weight=weight)

    # Apply Style
    df_document_topics = df_document_topic.head(len(corona_body_all_text)).style.applymap(color_green).applymap(make_bold)
    df_document_topics.to_excel("df_document_topics.xlsx")
    print(df_document_topics)

    # Review topics distribution across documents
    df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
    df_topic_distribution.columns = ['Topic Num', 'Num Documents']
    df_topic_distribution.to_excel("df_topic_distribution.xlsx")
    print(df_topic_distribution)


    panel = pyLDAvis.sklearn.prepare(best_lda_model, corpus_matrix, tf_vectorizer, mds='tsne')
    pyLDAvis.save_html(panel,'LDA_Visualization.html')


    # Topic-Keyword Matrix
    df_topic_keywords = pd.DataFrame(best_lda_model.components_)

    # Assign Column and Index
    df_topic_keywords.columns = tf_vectorizer.get_feature_names()
    df_topic_keywords.index = topicnames

    # View
    df_topic_keywords.head()

    # Show top n keywords for each topic
    def show_topics(vectorizer, lda_model, n_words):
        keywords = np.array(vectorizer.get_feature_names())
        topic_keywords = []
        for topic_weights in lda_model.components_:
            top_keyword_locs = (-topic_weights).argsort()[:n_words]
            topic_keywords.append(keywords.take(top_keyword_locs))
        return topic_keywords

    topic_keywords = show_topics(vectorizer=tf_vectorizer, lda_model=best_lda_model, n_words=15)

    # Topic - Keywords Dataframe
    df_topic_keywords = pd.DataFrame(topic_keywords)
    df_topic_keywords.columns = ['Word ' + str(i) for i in range(df_topic_keywords.shape[1])]
    df_topic_keywords.index = ['Topic ' + str(i) for i in range(df_topic_keywords.shape[0])]
    df_topic_keywords.to_excel("df_topic_keywords.xlsx")
    print(df_topic_keywords)


    """
    this is for printing the content of all the topics
    """
    # no_topics = 20
    # lda = LDA(n_components=no_topics, learning_method='online', learning_offset=50.,
    #                                random_state=0).fit(tf)
    #
    #
    # def display_topics(model, feature_names, no_top_words):
    #
    #     for topic_idx, topic in enumerate(model.components_):
    #         print("\nTopic #%d:" % topic_idx)
    #         print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
    # print()
    # no_top_words = 10
    # display_topics(lda, tf_feature_names, no_top_words)


    sns.set_style('whitegrid')

    # Helper function
    def plot_10_most_common_words(count_data, count_vectorizer):
        words = count_vectorizer.get_feature_names()
        total_counts = np.zeros(len(words))
        # count_data is a sparse document-term matrix; accumulate per-word totals across documents
        for t in count_data:
            total_counts += t.toarray()[0]

        count_dict = (zip(words, total_counts))
        count_dict = sorted(count_dict, key=lambda x: x[1], reverse=True)[0:10]
        words = [w[0] for w in count_dict]
        counts = [w[1] for w in count_dict]
        x_pos = np.arange(len(words))

        plt.figure(2, figsize=(15, 15 / 1.6180))
        plt.subplot(title='10 most common words')
        sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
        sns.barplot(x_pos, counts, palette='husl')
        plt.xticks(x_pos, words, rotation=90)
        plt.xlabel('words')
        plt.ylabel('counts')
        plt.savefig('10_most_common_words.pdf')
        plt.show()


    # Visualise the 10 most common words
    plot_10_most_common_words(corpus_matrix, tf_vectorizer)


    corpus_matrix=corpus_matrix.toarray()
    return corpus_matrix,word_feature_list
Example #12
    def fit(self, X, y=None, features=None):
        """
        Constructs DAG according to `self.dag_method` and learns
        coexpression modules across multiple resolutions.        

        Parameters
        ----------
        X: `numpy.ndarray` or `scipy.sparse.csr_matrix`
            Matrix with rows corresponding to all of the samples that
            define the DAG and columns corresponding to features that
            define the correlation matrices.
        y
            Ignored
        features: `numpy.ndarray` of `str`
            A list of strings with feature labels.
        """
        super(DecomposeDAG, self).fit(X, y, features)

        n_samples, n_features = X.shape

        if self.verbose:
            print('Stacking...')
            sys.stdout.flush()
        X_multi = self.multiresolution_stack(X)

        if self.verbose:
            print('Decomposing...')
            sys.stdout.flush()

        if self.decomp_method == 'nmf':
            #from sklearn.decomposition import NMF
            from nmf import NMF
            decomp = NMF(
                n_components=self.n_components,
                init=None,
                solver='cd',
                beta_loss='frobenius',
                alpha=1e-3,
                l1_ratio=1,
                random_state=69,
                tol=1e-2,
                verbose=self.verbose,
            ).fit(X_multi)
            components = decomp.components_

        elif self.decomp_method == 'lda':
            from sklearn.decomposition import (LatentDirichletAllocation as
                                               LDA)
            decomp = LDA(
                n_components=self.n_components,
                learning_method='online',
                max_iter=20,
                mean_change_tol=1e-2,
                n_jobs=self.n_jobs,
                random_state=69,
                verbose=self.verbose,
            ).fit(X_multi)
            components = decomp.components_

        elif self.decomp_method == 'hdp':
            from bnp.online_hdp import (HierarchicalDirichletProcess as HDP)
            hdp = HDP(
                n_topic_truncate=self.n_components,
                n_doc_truncate=10000,
                learning_method='online',
                n_jobs=self.n_jobs,
                random_state=69,
                verbose=self.verbose,
            ).fit(X_multi)
            components = hdp.lambda_

        else:
            raise ValueError('Invalid decomposition method {}'.format(
                self.decomp_method))

        n_components = components.shape[0]
        self.cluster_components = np.reshape(
            components, (n_components, n_features, len(self.nodes)))

        cc = np.sum(self.cluster_components, axis=1)
        cc /= cc.max()
        assert (cc.shape == (n_components, len(self.nodes)))

        for node_idx, node in enumerate(self.nodes):
            node.viz_value = list(cc[:, node_idx])

        return self
Example #13
# "restaurants and television shows are his ticket. Seb also has a rep for being "
# "one of the hottest catches around.Well from here on out you get the gist of it a"
# "ll. They meet; sparks fly as Lexi doesn't fall at Sebs' feet. He is intrigued to "
# "find a girl who serves him up on a platter cold. No his usual dish so to speak."
# " (I know puns aplenty today) *HA!There is a bit of mystery when notes keep "
# "appearing with a threatening overture. Lexi is attacked and her deliveries are "
# "beginning to not show up.My thoughts on this debut title......4 Stars on the "
# "love connection. It was cute and spunky. They had sizzle and Seb is very much "
# "an alpha male. Always a good thing.2.5 Stars on the mystery/suspense. It was "
# "easy to figure out what was going on from the beginning. I wish the author had"
# " stuck to one plotline with this. There were too many little side twists that "
# "in my opinion wasn't needed and some plots were left unfinished. It is possible"
# " that this will be resolved in later books but it just felt incomplete in this"
# " one.There was a bit of filler in the book as well. I didn't need all the "
# "explanations' on cutting styles for chops and how the kitchen runs. I am sure "
# "some will find it informative but for me it just took away from the story and "
# "well...filler.I do see this series getting better and I did like the writing "
# "style very much. Will I be looking out for the next book? Sure I will :)3 StarsT~"]

x = x.toarray()[0][:50].reshape(1, -1)
print(x)
print(x.shape)
lda = LDA(n_components=20)
lda_res = lda.fit_transform(x)
# lda_res = lda.transform(lda_res)
print(type(lda_res))
a = lda_res[0, :]
b = lda_res[0, :]
print(data.Hellinger_distance(a, b))
# print(lda_res.split(' '))
Example #14
def main():
    if style == 0:
        dataset = loadData()
        random.shuffle(dataset)

        ##      to get the part of dataset
        ##        dataset = dataset[:DATA_NUM]

        ##      to filter data
        for data in dataset:
            data[0] = dataFilter(data)

        if isSetVocabulary == 1:
            save(vocabulary, 'Vocabulary')
            print('building vocabulary is completed\nvocabulary size =',
                  len(vocabulary))

        print('filtering data is completed')

        ##        global vocabulary
        ##        save(vocabulary, 'Vocabulary')
        ##        print('vocabulary size =', len(vocabulary))

        ##        global vocabulary
        ##        vocabulary = load('Vocabulary')

        trainingData = dataset[:DATA_NUM]

        trainingTargetedClassifierFeatures = [
            (buildTargetedClassifierFeatures(data[0], data[1]))
            for data in trainingData
        ]
        save(trainingTargetedClassifierFeatures,
             'trainingTargetedClassifierFeatures')
        print('training targeted classifier features are completed')

        ##      to train the sentiment models

        global classifiers
        print('classifiers training start!')

        ##      to use the naive bayes classifier to train. [ [{}, ''] , ....]
        classifiers.append(
            nltk.NaiveBayesClassifier.train(
                trainingTargetedClassifierFeatures))

        ##    to try other classifier
        ##    MultinomialNB
        classifiers.append(SklearnClassifier(MultinomialNB()))

        ##    BernoulliNB
        classifiers.append(SklearnClassifier(BernoulliNB()))

        ##    LogisticRegression
        classifiers.append(SklearnClassifier(LogisticRegression()))

        ##    SGDClassifier
        classifiers.append(SklearnClassifier(SGDClassifier()))

        ##    SVC
        classifiers.append(SklearnClassifier(SVC()))

        ##    LinearSVC
        classifiers.append(SklearnClassifier(LinearSVC()))

        ##    NuSVC
        classifiers.append(SklearnClassifier(NuSVC()))

        ##    to train the classifier
        length = len(classifiers)
        ##    print(length)

        ##    except naive bayes classifier
        for i in range(1, length):
            classifiers[i].train(trainingSet)

##    to use our vote classifier
        voteClassifier = VoteClassifier(classifiers[0], classifiers[1],
                                        classifiers[2], classifiers[3],
                                        classifiers[4], classifiers[5],
                                        classifiers[6], classifiers[7])
        classifiers.append(voteClassifier)

        print('classifiers training end!')

        ##      to predict
        testData = dataset[DATA_NUM:]
        testClassifierFeatures = [(buildClassifierFeatures(data[0]))
                                  for data in testData]
        save(testClassifierFeatures, 'testClassifierFeatures')
        print('test classifier features are completed')

        print('predicting start')
        predictions = []

        for feature in testClassifierFeatures:
            predictions.append([
                voteClassifier.classify(feature),
                voteClassifier.confidence(feature)
            ])

        for prediction in predictions:
            print("Classification: ", prediction[0], " Confidence: ",
                  prediction[1])

##      to add to training targeted classifier features if confidence > threshold
        newTestClassifierFeatures = []
        newTestData = []
        for i in range(len(predictions)):
            if predictions[i][1] > threshold:
                trainingData.append(testData[i])
                trainingTargetedClassifierFeatures.append(
                    [testClassifierFeatures[i], predictions[i][0]])
            else:
                newTestData.append(testData[i])
                newTestClassifierFeatures.append(testClassifierFeatures[i])

        testData = newTestData
        testClassifierFeatures = newTestClassifierFeatures

        ##      to train the LDA
        zeros = [0 for n in range(len(vocabulary))]
        trainingLDAFeatures = [(buildLDAFeatures(zeros, data[0]))
                               for data in trainingData]

        print('LDA start')
        model = LDA(n_components=2, max_iter=1500, learning_method='online')
        topicDistributions = model.fit_transform(trainingLDAFeatures)
        print('LDA end')

        save(model, 'LDAModel')

        for distribution in topicDistributions:
            print(distribution)

        testFeatures = [
            buildFeatures(zeros, i, testData[i][0])
            for i in range(len(testData))
        ]
        print(model.transform(testFeatures))

        print('ground truth')
        for data in testData:
            print(data[1])


##        topic = model.components_
##        print(topic)

        n_top_words = 8
        for i, topic_dist in enumerate(model.components_):
            topic_words = np.array(vocabulary)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    elif style == 1:
        print('YO')
tfidf_model = tfidf_vectorizer_small.fit(text)
with open("../../4_models/tfidf_50K_influential_reviews_10191994.pickle", "wb") as f:
  pickle.dump(tfidf_model, f)

tfidf_bigram_model = tfidf_bigram_vectorizer_small.fit(text)
with open("../../4_models/tfidf_bigram_50K_influential_reviews_10191994.pickle", "wb") as f:
  pickle.dump(tfidf_bigram_model, f)


tfidf = tfidf_vectorizer.fit_transform(text)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

start1 = time.time()
lda = LDA(n_components=num_topics,
          max_iter=5,
          learning_method="online",
          learning_offset=50,
          random_state=10191994).fit(tf)
# save the LDA model
with open("../../4_models/lda_50K_influential_reviews_10191994.pickle", "wb") as f:
    pickle.dump(lda, f)


end1 = time.time()
print("LDA: {}".format(end1 - start1))

start2 = time.time()
nmf = NMF(n_components=num_topics,
          random_state=10191994,
          alpha=.1,
          l1_ratio=.5,
    labels.append(key.split("_")[0])
# print(data[0][:5])

# data = data[16:]
# labels = labels[16:]

# Compute term-frequency vectors for the documents
fileVector = CountVectorizer(stop_words=stopwords)

fileTfVector = fileVector.fit_transform(data)

print(fileTfVector.shape)

# LDA training: train on the first 16 novels
topic = args.k
model = LDA(n_components=topic, max_iter=50, learning_method='batch')
docres = model.fit_transform(fileTfVector[:16])

# print(docres)
value, indices = torch.max(torch.tensor(docres), 1)
print(indices)
print("{}个主题识别出来了{}个主题".format(topic, len(list(set(indices.tolist())))))
# print(len(model.components_))

res = model.transform(fileTfVector)
assert len(res) == len(labels)
df_labels = pd.DataFrame(labels)
# df_labels.to_excel("labels.xlsx")
df_res = pd.DataFrame(res)
# df_res.to_excel("ldaVector.xlsx")
df = pd.concat([df_labels, df_res], axis=1)
                pb.update()

df.columns=['review','sentiment']
df=df.reindex(np.random.permutation(df.index))
df.to_csv('movie_data.csv',encoding='utf-8',index=False)

df=pd.read_csv('movie_data.csv')
print(df.head(),'\n',len(df.index))
print(df['sentiment'][:5])

from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer
count=CountVectorizer(max_df=0.1,max_features=5000,stop_words='english')
help(CountVectorizer)
help(LDA)
lda=LDA(n_components=10,learning_method='batch',random_state=123)
X=count.fit_transform(df['review'].values)
X_topics=lda.fit_transform(X)
print(X_topics.shape) # (50000, 10)

# Components are matrix of topics X words/features
print(lda.components_.shape) # (10, 5000)

n_top_words=5
features=count.get_feature_names()
print(len(features)) # 5000
for idx,topics in enumerate(lda.components_):
    print('Topic : ',(idx+1))
    print([features[i] for i in topics.argsort()[-n_top_words:]])
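    # argsort()[-n_top_words:] lists the top words in ascending weight order; append [::-1] for descending order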
    
b=np.array([1,3,5,7,2,9,4])
Example #18
h = .02  # step size in the mesh

names = [
    "Nearest Neighbors", "Linear SVM", "RBF SVM", "Decision Tree",
    "Random Forest", "AdaBoost", "Naive Bayes", "LDA", "QDA"
]
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    AdaBoostClassifier(),
    GaussianNB(),
    LDA(),
    QDA()
]

X, y = make_classification(n_features=2,
                           n_redundant=0,
                           n_informative=2,
                           random_state=1,
                           n_clusters_per_class=1)
rng = np.random.RandomState(2)
X += 2 * rng.uniform(size=X.shape)
linearly_separable = (X, y)

datasets = [
    make_moons(noise=0.3, random_state=0),
    make_circles(noise=0.2, factor=0.5, random_state=1), linearly_separable
Example #19
# After obtaining the final distances between each abstract in the data set of papers, we apply a clustering method to sort each paper into just one latent topic. Since we only have distances between documents as opposed to points in space, we apply k-medoids clustering to the distance matrix.
# 
# The following chunks of code calculate distances between the abstracts using LDA, calculate distances between the abstracts using LSA, average the two distance matrices, and apply k-medoids. For LDA we use 7 topics, the number that gave the maximum coherence score (code omitted from this write-up). For LSA, we reduce $w$ to 14 dimensions. Following LDA, we perform k-medoids clustering with 7 clusters.

# In[4]:


# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA
from scipy.spatial import distance
import random

random.seed(123)

numberTopics = 7
lda = LDA(n_components=numberTopics, random_state=0)
ldaFit = lda.fit(w)
topicDistributions = lda.transform(w)
distsLDA = distance.cdist(topicDistributions, topicDistributions, 'euclidean')


# In[5]:


# LSA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

random.seed(234)

n_components = 14
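# The LSA cell is truncated in this excerpt. A minimal sketch (not the author's exact code) of the
# remaining steps described above -- LSA distances, averaging the two distance matrices, and k-medoids --
# assuming cosine distances for LSA and that scikit-learn-extra's KMedoids is available:
# lsa = TruncatedSVD(n_components=n_components, random_state=0).fit_transform(w)
# distsLSA = 1 - cosine_similarity(lsa)
# distsAvg = (distsLDA + distsLSA) / 2
# from sklearn_extra.cluster import KMedoids
# clusterLabels = KMedoids(n_clusters=7, metric='precomputed', random_state=0).fit_predict(distsAvg)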
Example #20
def modeling_lda(x,num_t):
    lda=LDA(n_components=num_t,random_state=0)
    lda.fit(x)
    return lda
Example #21
def main():
    data = pandas.read_csv(r"newsenti1.csv")

    X = data.iloc[:, 1:]
    Y = data.iloc[:, 0]
    runBaseline = True

    trainX, testX, yTrain, yTest = cross_validation.train_test_split(
        X, Y, test_size=0.1, random_state=0)  #test train split

    vectorizer = feature_extraction.text.TfidfVectorizer()
    sentiment_scaler = preprocessing.StandardScaler()
    unigrams = vectorizer.fit_transform(trainX["text"]).toarray()
    vectorizer1 = feature_extraction.text.TfidfVectorizer()
    synst = vectorizer1.fit_transform(
        trainX["synset"].values.astype('U')).toarray()
    tf_vectorizer = feature_extraction.text.CountVectorizer()
    tf = tf_vectorizer.fit_transform(trainX["text"]).toarray()
    tf_feature_names = tf_vectorizer.get_feature_names()
    lda = LDA(n_components=10,
              max_iter=5,
              learning_method='online',
              learning_offset=50.,
              random_state=0).fit(tf)
    lda_train = lda.transform(tf)
    sentiment = sentiment_scaler.fit_transform(trainX.loc[:, "pscore":"obscore"])
    allf = np.hstack((unigrams, lda_train, synst, sentiment))

    unigrams_t = vectorizer.transform(testX["text"]).toarray()
    tf_t = tf_vectorizer.transform(testX["text"]).toarray()
    lda_test = lda.transform(tf_t)
    sentiment_t = sentiment_scaler.transform(testX.loc[:, "pscore":"obscore"])
    synst_t = vectorizer1.transform(
        testX["synset"].values.astype('U')).toarray()
    allf_t = np.hstack((unigrams_t, lda_test, synst_t, sentiment_t))

    features = {"All_f": (allf, allf_t)}

    for f in features:
        xTrain = features[f][0]
        xTest = features[f][1]

        if runBaseline:
            baseline = dummy.DummyClassifier(strategy='most_frequent',
                                             random_state=0)
            baseline.fit(xTrain, yTrain)
            predictions = baseline.predict(xTest)

            print(indent("Baseline: ", 4))
            print(indent("Test Accuracy: ", 4),
                  metrics.accuracy_score(yTest, predictions))
            print(indent(metrics.classification_report(yTest, predictions), 4))
            print()
            runBaseline = False

        print(indent("Features: ", 4), f)

        for m, model in enumerate(models):
            hyp = clf_hyp[m]
            pipe = pipeline.Pipeline([('clf', model)])

            if len(hyp) > 0:
                grid = grid_search.GridSearchCV(
                    pipe, hyp, cv=10,
                    n_jobs=6)  #grid search for best hyperparameters
                grid.fit(xTrain, yTrain)
                predictions = grid.predict(xTest)

                print(indent(type(model).__name__, 6))
                print(indent("Best hyperparameters: ", 8), grid.best_params_)
                print(indent("Validation Accuracy: ", 8), grid.best_score_)
                print(indent("Test Accuracy: ", 8),
                      metrics.accuracy_score(yTest, predictions))
                print(
                    indent(metrics.classification_report(yTest, predictions),
                           8))

            else:
                grid = model
                grid.fit(xTrain, yTrain)
                predictions = grid.predict(xTest)

                print(indent(type(model).__name__, 6))
                print(indent("Test Accuracy: ", 8),
                      metrics.accuracy_score(yTest, predictions))
                print(
                    indent(metrics.classification_report(yTest, predictions),
                           8))

        print()
    print()
Example #22
def lda(X, n_components = 4):
	p = LDA(n_components = n_components)
	return p.fit_transform(X), p
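# Minimal usage sketch (not part of the original): given a document-term count matrix X,
# `X_topics, lda_model = lda(X, n_components=4)` returns the per-document topic mixtures plus the fitted model.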
    'newsletter', 'mit', 'subscribe', 'blockchain', 'karen', 'hao', 'will',
    'knight', 'technologies'
])

len(articles.index)

all_list = []
for i in range(len(articles.index)):
    words = CountVectorizer(max_df=10,
                            min_df=1,
                            max_features=1000,
                            stop_words=stopwords)
    bag_of_words = words.fit_transform(
        split_text_into_paras(articles.iloc[i]['Article ']))
    word_names = words.get_feature_names()
    lda = LDA(n_components=2).fit(bag_of_words)
    all_list.append(([i] + display_topics(lda, word_names, 5)))

all_list

import numpy as np
from os import path
from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS

stopwords = set(STOPWORDS)
stopwords.update([
    'newsletter', 'mit', 'subscribe', 'blockchain', 'karen', 'hao', 'will',
    'knight', 'technologies', 'say', 'people', 'technology', 'algorithm',
    'says', 'one'
])
Example #24
filenames = ['laDuda.txt', 'eljorge.txt', 'llecspier.txt', 'vairon.txt']

vectorizer = CountVectorizer(input='filename',
                             ngram_range=(1, 3),
                             stop_words="english")

dtm = vectorizer.fit_transform(filenames)  # a sparse matrix

tfidf_transformer = TfidfTransformer(norm='l2')
x_tfidf = tfidf_transformer.fit_transform(dtm)
matVoc = x_tfidf.toarray()

vocab = vectorizer.get_feature_names()  # a list
vocab = np.array(vocab)

lda = LDA(n_components=3, random_state=0)

lda_array = lda.fit_transform(matVoc)

labels = [np.argmax(x) for x in lda_array]

print(lda_array)

colores = ["r", "b", "c", "y"]
autores = ["Poema", "Chapman", "Shakespeare", "Lord Byron"]
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

for i, punto in enumerate(lda_array):
    ax.scatter(punto[0],
               punto[1],
Example #25
import matplotlib.pyplot as plt

featurized_data = pd.read_csv('~/capstone_project/data/featurized_data.csv')
test = featurized_data[featurized_data.min_game > 10]
test_players = test[['player_id', 'display_name']]
test.drop(['Unnamed: 0', 'player_id', 'display_name'], inplace=True, axis=1)
features = featurized_data.columns
test_players.set_index('player_id', inplace=True, drop=True)
test.fillna(0, inplace=True)
test = normalize(test)

count_topics = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

lda = LDA(n_components=4,
          max_iter=5,
          learning_method='online',
          learning_offset=50.,
          random_state=0)

lda.fit(test)

# for i in [2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]:
#     KMeans_test = KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300, tol=0.0001, \
#     precompute_distances='auto', verbose=0, random_state=None, copy_x=True, n_jobs=1, algorithm='auto')
#     KMeans_test.fit(players_by_topic)
#     test_labels = KMeans_test.labels_
#     players_clustered = pd.DataFrame(players_by_topic)
#     players_clustered['cluster'] = test_labels
#     players_clustered['player_name'] = player_info['display_name']
#     score = silhouette_score(players_by_topic, test_labels, metric='euclidean',sample_size=None)
#     print score
Example #26
#Get column names for timbre and harmony features
columns = data.columns
timbre_col = [col for col in columns if re.search("Timbre", col)]
harm_col = [col for col in columns if re.search("Harm", col)]

#Create matrices of timbre and harmony features
timbre_features = data[timbre_col]
timbre_features = timbre_features.to_numpy()

harm_features = data[harm_col]
harm_features = harm_features.to_numpy()

#Apply LDA on both separately
lda_timbre = LDA(n_components=t_topics,
                 doc_topic_prior=doc_topic_prior,
                 random_state=0)
lda_timbre.fit(timbre_features)
topics_timbre = lda_timbre.transform(timbre_features)

lda_harm = LDA(n_components=h_topics,
               doc_topic_prior=doc_topic_prior,
               random_state=0)
lda_harm.fit(harm_features)
topics_harm = lda_harm.transform(harm_features)
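#Note: topics_timbre and topics_harm hold per-song topic mixtures; each row returned by transform sums to 1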

#Summary of the top components of topics


def get_top_components(model, feature_names, n):
    #n is the number of top components you want to output
Example #27
path = dph.getDataPath('pressBiTriLemma.json')
df = pd.read_json(path)

dfBigramLemma = df[columnName]

df2 = pd.DataFrame(dfBigramLemma)
df2[columnName] = df2.apply(lambda row: ' '.join(map(str, row[columnName])), axis=1)


vectorizer = TfidfVectorizer(strip_accents='unicode', ngram_range=(1, 2))
xTrainTfidf = vectorizer.fit_transform(df2[columnName])

searchParams = {'n_components': [10], 'learning_decay': [.5]}
if True:
    model = LDA()
    model = GridSearchCV(model, searchParams)
    model.fit(xTrainTfidf)
    model = model.best_estimator_
    if False:
        dph.saveModel(model, 'ldaGrid' + columnName)
else:
    model = dph.loadModel('ldaGrid' + columnName)

# Show the score
print("Model perplexity: ", model.perplexity(xTrainTfidf))

#   Determine the values used by the functions below
featureNames = vectorizer.get_feature_names()
weights = model.components_
Example #28
    start=2,
    limit=40,
    step=6)

limit = 40
start = 2
step = 6
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()
"""***TRANSFORMATION OF THE TEXT USING LDA FROM BOW***"""

lda = LDA(n_components=25, n_jobs=-1)
array_lda_Bow = lda.fit_transform(X_Bow)

X_lda_Bow = pd.DataFrame(array_lda_Bow)
X_lda_Bow
"""***TRANSFORMATION OF THE TEXT USING LDA FROM TFID***"""

lda = LDA(n_components=25, n_jobs=-1)
array_lda_Tfid = lda.fit_transform(X_Tfid)

X_lda_Tfid = pd.DataFrame(array_lda_Tfid)
X_lda_Tfid
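# Note (not in the original): LatentDirichletAllocation is formulated for count data, so applying it to
# TF-IDF features (X_Tfid) is a heuristic; the bag-of-words variant above matches the model's assumptions.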
"""***CONCATENATION OF MODELS***"""

concat_lda = np.concatenate((array_tfid, array_lda_Tfid), axis=1)
        with open(os.path.join(PICKLE_DIR,'clues_df.p'),'rb') as f:
            clues = pickle.load(f)
    except:
        exec(open('./process_puz.py').read())

    try:
        with open(os.path.join(PICKLE_DIR,'lda_fit.p'),'rb') as f:
            lda = pickle.load(f)

    except:
        nclues = clues.shape[0]
        ntext = int(.1*nclues)
        clue_samples = np.random.choice(range(nclues),ntext)

        count_vectorizer = CountVectorizer(stop_words='english')
        data = count_vectorizer.fit_transform(clues.clue_text.iloc[clue_samples])

        number_topics = 4
        lda = LDA(n_components=number_topics, n_jobs=-1)
        lda.fit(data)

        with open(os.path.join(PICKLE_DIR,'lda_fit.p'),'wb') as f:
            pickle.dump(lda,f)

    LDAvis_prepared = sklearn_lda.prepare(lda, data, count_vectorizer)

    with open(os.path.join(PICKLE_DIR,'ldavis.p'), 'wb') as f:
            pickle.dump(LDAvis_prepared, f)

    pyLDAvis.save_html(LDAvis_prepared, './ldavis_prepared_'+ str(number_topics) +'.html')
	if(word != '' and not is_other(word)):
		words_not_other.append(word)


# Convert a collection of words to a matrix of token counts
print_status("Counting ngrams...")
# vectorizer = CountVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 5), binary=True)
vectorized_train_data = vectorizer.fit_transform(X_train)
vectorized_dev_data = vectorizer.transform(words_not_other)


# Create and fit the LDA model
print_status("Training LDA...")
number_topics = 2
lda_model = LDA(n_components=number_topics, max_iter=100, random_state=123)
lda_model.fit(vectorized_train_data)
lda = lda_model.transform(vectorized_dev_data)

# Decide labels that belong to each cluster
cluster_0_label = ''
cluster_1_label = ''
# Get indexes of the words that best represent cluster 0
cluster_0 = lda[:,0]
top_n_words_c0_idx = (-cluster_0).argsort()[:10]
# Check which language these words belong to
count_lang1 = 0
count_lang2 = 0

for i in top_n_words_c0_idx:
	word = words_not_other[i]