# ]
# gensim_lda.fit(docs)
# with open('models/topics/' + gensim_lda.model.named_steps['model'].gensim_model.__class__.__name__ + 'alpha_auto' + '.pkl', 'wb') as fobj:
#     pickle.dump(gensim_lda, fobj)
# lda = gensim_lda.model.named_steps['model'].gensim_model
# print(lda.show_topics())

import pickle

import pyLDAvis
import pyLDAvis.gensim

# Load the pickled LDA pipeline and pull the trained gensim model out of it.
# (PickledCorpusReader is the project's own corpus reader.)
gensim_lda = pickle.load(
    open('path\\to\\models\\topics\\LdaModel.pkl', 'rb'))
lda = gensim_lda.model.named_steps['model'].gensim_model

# Rebuild the bag-of-words corpus by pushing the raw documents back through
# the pipeline's normalizer and the vectorizer's lexicon.
pickled_corpus = PickledCorpusReader(
    'path\\to\\App\\preprocessed_corpus\\basic_corpus')
docs = pickled_corpus.docs()
corpus = [
    gensim_lda.model.named_steps['vect'].lexicon.doc2bow(doc)
    for doc in gensim_lda.model.named_steps['norm'].transform(docs)
]
lexicon = gensim_lda.model.named_steps['vect'].lexicon

# Prepare the interactive topic visualization and write it out as HTML.
data = pyLDAvis.gensim.prepare(lda, corpus, lexicon)
pyLDAvis.save_html(data, 'results/topics_alpha_auto.html')
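# pyLDAvis renamed its gensim helper module to 'gensim_models' in the 3.x
# releases, so 'import pyLDAvis.gensim' fails on current installs. A minimal,
# version-tolerant sketch (the alias 'gensimvis' is an illustrative choice):
try:
    import pyLDAvis.gensim_models as gensimvis  # pyLDAvis >= 3.2
except ImportError:
    import pyLDAvis.gensim as gensimvis         # older pyLDAvis

# The prepare/save calls above would then read:
# data = gensimvis.prepare(lda, corpus, lexicon)
# pyLDAvis.save_html(data, 'results/topics_alpha_auto.html')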
import time
import json

from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


def create_pipeline(estimator, reduction=False):
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', GensimVectorizer(
            path="path\\to\\doc2vec_complete\\doc2vec.d2v"))
    ]
    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=1000)
        ))
    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


labels = ["not_escalated", "BEMS", "RRR", "CAP"]
# PickledCorpusReader and CorpusLoader are the project's own corpus utilities.
reader = PickledCorpusReader('path\\to\\preprocessed_data\\preprocessed_complete')
loader = CorpusLoader(reader, 12, shuffle=True, categories=labels)

# Build one pipeline per candidate classifier, all without SVD reduction.
models = []
for form in (LogisticRegression, SGDClassifier):
    models.append(create_pipeline(form(), False))
models.append(create_pipeline(LinearSVC(), False))


def score_models(models, loader):
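    # The listing breaks off here; the body below is a minimal sketch only.
    # It assumes (my assumption, not the source's) that iterating the
    # CorpusLoader yields (X_train, X_test, y_train, y_test) folds.
    for model in models:
        scores = {'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'time': []}
        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            # average='weighted' handles the four-way label set
            scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))
        yield json.dumps(scores)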
    fig, ax = plt.subplots(figsize=(15, 7))
    ax = dendrogram(linkage_matrix, orientation='left', **kwargs)
    plt.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # Load the custom stopword list, one word per line.
    with open('stopwords.txt', 'r') as fp:
        stopwords = fp.read().split('\n')

    corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\basic_corpus')
    agglomerative_clust = HierarchicalTopics(corpus=corpus,
                                             custom_stopwords=stopwords)
    agglomerative_clust.cluster(corpus)
    # 'models/topics/' needs the trailing slash; without it the file lands in
    # 'models/' under the name 'topicsHierarchicalTopics.pkl'.
    with open('models/topics/' + 'HierarchicalTopics.pkl', 'wb') as fobj:
        pickle.dump(agglomerative_clust, fobj)

    # # visualize
    # corpus = PickledCorpusReader('path\\to\\preprocessed_small')
    # normalizer = TextNormalizer(custom_stopwords=stopwords)
    # labels = []
    # for fileid in corpus.fileids():
    #     terms = []
    #     for term, count in Counter(list(normalizer.normalize(corpus.words(fileid)))).most_common(10):
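# The plotting fragment above assumes a precomputed 'linkage_matrix'. A
# minimal sketch of how one can be built with SciPy; the matrix X is a random
# stand-in for the vectorized documents, and 'ward' is an illustrative
# linkage choice, not taken from the listing:
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

X = np.random.rand(20, 50)                  # stand-in document vectors
linkage_matrix = linkage(X, method='ward')  # agglomerative merge history
dendrogram(linkage_matrix, orientation='left')
plt.show()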
    plt.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
    plt.tight_layout()
    plt.show()


if __name__ == '__main__':
    # Load the custom stopwords and fold them into the shared STOPWORDS set
    # (this assumes STOPWORDS is a mutable set; gensim's STOPWORDS, for
    # instance, is a frozenset and has no update()).
    with open('stopwords.txt', 'r') as fp:
        stopwords = fp.read().split('\n')
    STOPWORDS.update(stopwords)

    # create
    corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\basic_corpus')
    clusterer = HierarchicalTopics(corpus)
    clusterer.cluster(corpus, n_clusters=30)
    with open('models/' + 'HierarchicalTopics_30.pkl', 'wb') as fobj:
        pickle.dump(clusterer, fobj)

    # Print the cluster label assigned to each article file.
    for idx, fileid in enumerate(corpus.fileids(categories=['ArticlesItem'])):
        print(clusterer.labels[idx], fileid)

    # # visualize
    # from collections import Counter
    # corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\subset_corpus')
    # labels = []
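# A minimal round-trip sketch: the pickled clusterer can be loaded back later
# and its stored assignments reused (path as written above; that .labels is
# list-like is my reading of the loop above):
import pickle

with open('models/HierarchicalTopics_30.pkl', 'rb') as fobj:
    clusterer = pickle.load(fobj)
print(clusterer.labels[:10])  # first ten cluster labels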
            yield LabeledSentence(element, [label])

    def to_array(self):
        self.sentences = []
        for line, label in self.sources:
            self.sentences.append(LabeledSentence(line, [label]))
        return self.sentences

    def sentences_perm(self):
        # Reshuffle the sentences, e.g. between doc2vec training epochs.
        shuffle(self.sentences)
        return self.sentences


if __name__ == '__main__':
    labels = ["not_escalated", "BEMS", "RRR", "CAP"]
    reader = PickledCorpusReader(
        'path\\to\\preprocessed_data\\preprocessed_min')
    trg_docs = reader.docs(categories=labels)

    # normalize
    normalizer = TextNormalizer()
    normalized_docs = normalizer.transform(trg_docs)

    # vectorize
    vectorizer = GensimVectorizer(
        path='path\\to\\doc2vec_complete\\doc2vec.d2v')
    vectorizer.fit(normalized_docs)
    vectorized_docs = vectorizer.transform(normalized_docs)
    # vectorized_docs = np.asarray(vectorized_docs)
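# Newer gensim releases (4.x) removed LabeledSentence; TaggedDocument is the
# drop-in replacement with the same (words, tags) fields. A minimal sketch
# with illustrative values:
from gensim.models.doc2vec import TaggedDocument

doc = TaggedDocument(words=['fault', 'report', 'escalated'], tags=['BEMS_0'])
print(doc.words, doc.tags)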
    names = vectorizer.get_feature_names()
    topics = dict()
    for idx, topic in enumerate(model.components_):
        # argsort() sorts ascending, so stepping backwards from the end down
        # to -(n + 1) picks out the indices of the n largest topic weights.
        features = topic.argsort()[:-(n + 1): -1]
        tokens = [names[i] for i in features]
        topics[idx] = tokens
    return topics


if __name__ == '__main__':
    with open('stopwords.txt', 'r') as fp:
        stopwords = fp.read().split('\n')

    corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\basic_corpus')
    lda = SklearnTopicModels(n_topics=30, custom_stopwords=stopwords)
    documents = corpus.docs()
    lda.fit_transform(documents)
    topics = lda.get_topics()

    with open('models/topics/' + 'LDA' + '.pkl', 'wb') as fobj:
        pickle.dump(lda, fobj)

    for topic, terms in topics.items():
        print("Topic #{}:".format(topic + 1))
        print(terms)
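# scikit-learn 1.0 renamed get_feature_names() to get_feature_names_out() and
# 1.2 removed the old name, so the call above breaks on current installs. A
# self-contained, version-tolerant sketch using a stand-in vectorizer:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(['a small stand-in corpus', 'for the feature name lookup'])
names = (vect.get_feature_names_out() if hasattr(vect, 'get_feature_names_out')
         else vect.get_feature_names())
print(names)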