
    # gensim_lda.fit(docs)

    # with open('models/topics/' + gensim_lda.model.named_steps['model'].gensim_model.__class__.__name__+ 'alpha_auto' + '.pkl', 'wb') as fobj:
    #     pickle.dump(gensim_lda, fobj)

    # lda = gensim_lda.model.named_steps['model'].gensim_model
    # print(lda.show_topics())

    import pyLDAvis
    import pyLDAvis.gensim

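    # Reload the pickled LDA pipeline and pull the trained gensim LdaModel out
    # of its 'model' step.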
    with open('path\\to\\models\\topics\\LdaModel.pkl', 'rb') as fobj:
        gensim_lda = pickle.load(fobj)

    lda = gensim_lda.model.named_steps['model'].gensim_model

    pickled_corpus = PickledCorpusReader(
        'path\\to\\App\\preprocessed_corpus\\basic_corpus')
    docs = pickled_corpus.docs()

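    # Rebuild the bag-of-words corpus: run the raw documents through the
    # pipeline's 'norm' step, then map each one through the fitted lexicon.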
    corpus = [
        gensim_lda.model.named_steps['vect'].lexicon.doc2bow(doc)
        for doc in gensim_lda.model.named_steps['norm'].transform(docs)
    ]

    lexicon = gensim_lda.model.named_steps['vect'].lexicon

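    # Prepare the interactive pyLDAvis topic visualization and save it as HTML.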
    data = pyLDAvis.gensim.prepare(lda, corpus, lexicon)
    pyLDAvis.save_html(data, 'results/topics_alpha_auto.html')

def create_pipeline(estimator, reduction=False):
    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', GensimVectorizer(path="path\\to\\doc2vec_complete\\doc2vec.d2v"))
    ]

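    # Optionally insert a TruncatedSVD dimensionality-reduction step ahead of
    # the classifier.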
    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=1000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)


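# Read the pickled corpus and build a cross-validation loader over the four
# escalation categories (the second CorpusLoader argument is presumably the
# number of folds).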
labels = ["not_escalated", "BEMS","RRR", "CAP"]
reader = PickledCorpusReader('path\\to\\preprocessed_data\\preprocessed_complete')
loader = CorpusLoader(reader, 12, shuffle=True, categories=labels)

models = []

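# Wrap each candidate classifier in the shared normalize/vectorize pipeline
# (reduction disabled).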
for form in (LogisticRegression, SGDClassifier):
    models.append(create_pipeline(form(), False))

models.append(create_pipeline(LinearSVC(), False))

import time
import json

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def score_models(models, loader):
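    # Hedged sketch of a fold-by-fold scoring loop: it assumes `loader` yields
    # (X_train, X_test, y_train, y_test) tuples (the CorpusLoader interface is
    # not shown here) and that each model is a Pipeline from create_pipeline.
    for model in models:
        name = model.named_steps['classifier'].__class__.__name__
        scores = {
            'model': name,
            'time': [], 'accuracy': [], 'precision': [], 'recall': [], 'f1': [],
        }

        for X_train, X_test, y_train, y_test in loader:
            start = time.time()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            scores['time'].append(time.time() - start)
            scores['accuracy'].append(accuracy_score(y_test, y_pred))
            scores['precision'].append(precision_score(y_test, y_pred, average='weighted'))
            scores['recall'].append(recall_score(y_test, y_pred, average='weighted'))
            scores['f1'].append(f1_score(y_test, y_pred, average='weighted'))

        yield scores
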
Example No. 3
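        # Render the hierarchical clustering result as a left-oriented
        # dendrogram and hide the x-axis ticks.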
        fig, ax = plt.subplots(figsize=(15, 7))

        ax = dendrogram(linkage_matrix, orientation='left', **kwargs)

        plt.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
        plt.tight_layout()
        plt.show()


if __name__ == '__main__':
    with open('stopwords.txt', 'r') as fp:
        stopwords = fp.read().split('\n')

    corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\basic_corpus')

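    # Cluster the corpus with the custom HierarchicalTopics wrapper and persist
    # the fitted model.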
    agglomerative_clust = HierarchicalTopics(corpus=corpus,
                                             custom_stopwords=stopwords)
    agglomerative_clust.cluster(corpus)

    with open('models/topics/' + 'HierarchicalTopics.pkl', 'wb') as fobj:
        pickle.dump(agglomerative_clust, fobj)

    # # visualize
    # corpus = PickledCorpusReader('path\\to\\preprocessed_small')
    # normalizer = TextNormalizer(custom_stopwords=stopwords)
    # labels=[]
    # for fileid in corpus.fileids():
    #     terms = []
    #     for term, count in Counter(list(normalizer.normalize(corpus.words(fileid)))).most_common(10):
        plt.tick_params(axis='x', bottom=False, top=False, labelbottom=False)
        plt.tight_layout()
        plt.show()


if __name__ == '__main__':
    with open('stopwords.txt', 'r') as fp:
        stopwords = fp.read().split('\n')

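    # Add the custom stopwords to the (presumably imported) STOPWORDS set.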
    STOPWORDS.update(stopwords)

    # create
    corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\basic_corpus')

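    # Cluster the corpus into 30 groups and pickle the fitted clusterer.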
    clusterer = HierarchicalTopics(corpus)
    clusterer.cluster(corpus, n_clusters=30)

    with open('models/' + 'HierarchicalTopics_30.pkl', 'wb') as fobj:
        pickle.dump(clusterer, fobj)

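    # Print the assigned cluster label next to each ArticlesItem file id.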
    for idx, fileid in enumerate(corpus.fileids(categories=['ArticlesItem'])):
        print(clusterer.labels[idx], fileid)

    # # visualize
    # from collections import Counter
    # corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\subset_corpus')

    # labels = []
                yield LabeledSentence(element, [label])

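        # Materialize every (line, label) pair in self.sources as a
        # LabeledSentence and cache the list.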
        def to_array(self):
            self.sentences = []
            for line, label in self.sources:
                self.sentences.append(LabeledSentence(line, [label]))
            return self.sentences

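        # Return the cached sentences in shuffled order (commonly used to
        # reshuffle between doc2vec training epochs).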
        def sentences_perm(self):
            shuffle(self.sentences)
            return self.sentences


if __name__ == '__main__':
    labels = ["not_escalated", "BEMS", "RRR", "CAP"]
    reader = PickledCorpusReader(
        'path\\to\\preprocessed_data\\preprocessed_min')

    trg_docs = reader.docs(categories=labels)

    # normalize
    normalizer = TextNormalizer()
    normalized_docs = normalizer.transform(trg_docs)

    # vectorize
    vectorizer = GensimVectorizer(
        path='path\\to\\doc2vec_complete\\doc2vec.d2v')
    vectorizer.fit(normalized_docs)
    vectorized_docs = vectorizer.transform(normalized_docs)
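    # vectorized_docs presumably now holds one doc2vec vector per training
    # document (GensimVectorizer is a project-specific transformer).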

    # vectorized_docs = np.asarray(vectorized_docs)
Example No. 6
        names = vectorizer.get_feature_names()
        topics = dict()

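        # For each topic, collect the highest-weighted vocabulary terms into a
        # topic -> tokens mapping.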
        for idx, topic in enumerate(model.components_):
            features = topic.argsort()[:-(n - 1): -1]
            tokens = [names[i] for i in features]
            topics[idx] = tokens

        return topics

if __name__ == '__main__':
    with open('stopwords.txt', 'r') as fp:
        stopwords = fp.read().split('\n')

    corpus = PickledCorpusReader('path\\to\\preprocessed_corpus\\basic_corpus')

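    # Fit a 30-topic model over the corpus with the SklearnTopicModels wrapper,
    # pickle it, and print each topic's top terms.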
    lda = SklearnTopicModels(n_topics=30, custom_stopwords=stopwords)
    documents = corpus.docs()

    lda.fit_transform(documents)
    topics = lda.get_topics()

    with open('models/topics/' + 'LDA' + '.pkl', 'wb') as fobj:
        pickle.dump(lda, fobj)

    for topic, terms in topics.items():
        print("Topic #{}:".format(topic+1))
        print(terms)