def get_topics(candidate, day):
    start_time = datetime.strptime(day, "%Y-%m-%d").date()
    start_time = int(start_time.strftime('%s'))*1000
    end_time = start_time + 86399999
    try:
        client = MongoClient()
        tweets = client.fletcher.tweets
        tweets = tweets.aggregate([
            {"$match":{"$text":{"$search":candidate_search[candidate_slugs[candidate]]}}},
            {"$match":{"timestamp_ms":{"$gte":start_time,"$lt":end_time}}}])
        documents = []
        pattern = re.compile("[^a-zA-Z ]")
        for tweet in tweets:
            documents.append(pattern.sub('', tweet['text']))
        stoplist = set(candidate_stop_words[candidate_slugs[candidate]] + stopwords)
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]
        frequency = defaultdict(int)
        for text in texts:
            for token in text:
                frequency[token] += 1
        texts = [[token for token in text if frequency[token] > 1]
                for text in texts]
        dictionary = corpora.Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=3, update_every=1, chunksize=10000, passes=10)
        return lda.print_topics(3)
    except:
        return None
Example #2
def lda_topic_model(data, is_clean=False, num_of_topics=10, num_of_pass=5):
    """do the topic model for the given dataset
    input:
        data: a documents or a list of words
        is_clean: Use this notation to pre-process the data.
        num_of_topics: An LDA model requires the user to determine how many
                        topics should be generated.
        num_of_pass: The greater the number of passes, the more accurate the
                    model will be.
                    A lot of passes can be slow on a very large corpus.
    """
    if not is_clean:
        stops = set(nltk.corpus.stopwords.words("english"))
        texts = prepare_for_lda(data, stops)
    else:
        texts = data
    dictionary = corpora.Dictionary(texts)
    print dictionary
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = LdaModel(corpus, id2word=dictionary, num_topics=num_of_topics, \
                        passes=num_of_pass)
    return ldamodel.print_topics(num_topics=num_of_topics, num_words=10)
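# A minimal usage sketch with hypothetical toy data: passing pre-tokenized
# documents with is_clean=True skips the prepare_for_lda step defined
# elsewhere in this project.
toy_docs = [["human", "machine", "interface", "computer"],
            ["survey", "user", "computer", "system", "response", "time"],
            ["graph", "trees", "minors", "survey"]]
toy_topics = lda_topic_model(toy_docs, is_clean=True, num_of_topics=2, num_of_pass=10)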
def analyze_speeches(filename="1.txt"):
    """Read all speech files, train an LDA model, and print topics and top words.

    Args:
    filename - default speech file name (reassigned for each speech inside the loop)
    """
    dictionary = corpora.dictionary.Dictionary()
    train_documents = list()
    all_words = list()
    for i in xrange(1, GC.N_SPEECHES):
        filename = path_join(GC.SPEECH_FOLDER, str(i) + ".txt")
        with open(filename, "r") as speech_file:
            speech_words = list()
            for line in speech_file:
                words = line.strip().decode("utf8").split()
                words = [word for word in words if valid_word(word)]
                words = " ".join(map(unidecode, words))
                output = words.translate(string.maketrans("", ""), punct)
                speech_words += [word.lower() for word in output.split()
                                 if valid_word(word, True)]
            all_words += speech_words
            dictionary.add_documents([speech_words])
            train_documents.append(speech_words)
    corpus = [dictionary.doc2bow(text) for text in train_documents]
    lda = LdaModel(corpus=corpus,
                   id2word=dictionary,
                   num_topics=GC.N_TOPICS,
                   passes=10,
                   alpha='auto')

    print '{} topics with corresponding top {} words'.format(GC.N_TOPICS, 10)
    pprint(lda.print_topics())

    word_counter = Counter(all_words)
    print 'Top {} words in {} speeches of NaMo'.format(GC.N_TOP_WORDS,
                                                       GC.N_SPEECHES)
    pprint(word_counter.most_common(GC.N_TOP_WORDS))
Example #4
# Creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(clean_headlines)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_headlines]

# build LDA model
lda_model = LdaModel(doc_term_matrix,
                     num_topics=5,
                     id2word=dictionary,
                     iterations=10,
                     random_state=2)

# extract topics for headlines
topics = lda_model.print_topics(num_topics=5, num_words=10)

# pprint topics
pprint.pprint(topics)

# Code ends here

# --------------
# coherence score

coherence_model_lda = CoherenceModel(model=lda_model,
                                     texts=clean_headlines,
                                     dictionary=dictionary,
                                     coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(coherence_lda)
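# Optional extension (a sketch, not part of the original exercise): sweep a few
# topic counts and keep the model with the highest c_v coherence, reusing the
# doc_term_matrix, dictionary and clean_headlines prepared above.
best_k, best_score = None, float('-inf')
for k in range(2, 11, 2):
    candidate_model = LdaModel(doc_term_matrix,
                               num_topics=k,
                               id2word=dictionary,
                               iterations=10,
                               random_state=2)
    score = CoherenceModel(model=candidate_model,
                           texts=clean_headlines,
                           dictionary=dictionary,
                           coherence='c_v').get_coherence()
    if score > best_score:
        best_k, best_score = k, score
print(best_k, best_score)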
Example #5
import pandas as pd
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

df = pd.read_csv('encuesta_internautas_binomiales_menos_desviacion.csv')
columns = list(df.columns.values)
#print(columns)
#print(df.values)
count = 0
texts = []
for i in df.values:
    #print(i)
    count += 1
    words = []
    for j, valor in enumerate(i):
        if (valor == 1):
            words.append(columns[j])
    texts.append(words)
    if count > 1:
        break

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

ldamodel = LdaModel(common_corpus, num_topics=5, id2word=common_dictionary)
print(ldamodel.print_topics(num_topics=5, num_words=5))
Example #6
def fit_model(corpus, id2word, num_topics):
    # train the LDA model
    lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=num_topics)
    lda.print_topics(num_topics)
    return lda
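# Hypothetical end-to-end call: build id2word and the bag-of-words corpus from
# tokenized documents (mirroring the other examples on this page), then train.
from gensim import corpora
from gensim.models.ldamodel import LdaModel

docs = [["user", "interface", "system"],
        ["graph", "trees", "minors"],
        ["system", "response", "time", "user"]]
id2word = corpora.Dictionary(docs)
corpus = [id2word.doc2bow(doc) for doc in docs]
lda = fit_model(corpus, id2word, num_topics=2)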
Example #7
# print(tfidf[vec])
# index = similarities.SparseMatrixSimilarity(tfidf[corpus], num_features=12)
# sims = index[tfidf[vec]]
# print(list(enumerate(sims)))   
corpora.MmCorpus.save_corpus('file.mm', corpus)
#id2word= corpora.Dictionary.load('deerwester.dict')
mmCorpus = corpora.MmCorpus("file.mm")
print mmCorpus
lsi = LsiModel(mmCorpus, id2word=dictionary,num_topics=10)
print "lsi:"
#print(lsi[new_vec])
lsi.print_debug(4, 4)
lsi.print_topics(4,2)
lsi.show_topic(9, 10)

lda = LdaModel(mmCorpus,id2word=dictionary,num_topics=10)
lda.print_topics(4,4)
doc_lda = lda[new_vec]

print "lda:"
#print doc_lda
         
# corpus = [[(0, 1.0), (1, 1.0), (2, 1.0)],
#            [(2, 1.0), (3, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (8, 1.0)],
#            [(1, 1.0), (3, 1.0), (4, 1.0), (7, 1.0)],
#            [(0, 1.0), (4, 2.0), (7, 1.0)],
#            [(3, 1.0), (5, 1.0), (6, 1.0)],
#            [(9, 1.0)],
#            [(9, 1.0), (10, 1.0)],
#            [(9, 1.0), (10, 1.0), (11, 1.0)],
#            [(8, 1.0), (10, 1.0), (11, 1.0)]]
Example #8
def fit_model(data, n_topics, iterations, passes, min_prob, eval_every, n_best,
              min_df, max_df, preserved_words):
    dt = cur_date()
    output_folder = "lda_%stopics_%s" % (n_topics, dt)
    os.makedirs(output_folder, exist_ok=True)
    os.makedirs("%s/separate" % output_folder, exist_ok=True)

    logging.info("creating corpus...")
    dictionary, corpus = make_corpus(list(data.values()), min_df, max_df,
                                     preserved_words, output_folder)
    # generate LDA model
    logging.info("training model...")
    lda = LdaModel(corpus,
                   num_topics=n_topics,
                   id2word=dictionary,
                   iterations=iterations,
                   passes=passes,
                   minimum_probability=min_prob,
                   eval_every=eval_every)
    logging.info("saving model...")
    lda.save('saved/lda_%s_%s.serialized' % (n_topics, dt))
    # print(lda.print_topics(num_topics=n_topics, num_words=4))

    # save all-vs-all pairwise similarities
    logging.info("creating index...")
    index = Similarity('./sim_index',
                       lda[corpus],
                       num_features=n_topics,
                       num_best=n_best + 1)
    paths = list(data.keys())
    logging.info("write all similarities to result file")
    with open('%s/similarities.txt' % output_folder, 'w') as res_file:
        with open('%s/similarities_summary.txt' % output_folder,
                  'w',
                  encoding='utf-8') as res_file_sum:
            for i, similarities in enumerate(index):
                cur_fname = get_filename(paths[i])
                top_similar = [(paths[s[0]], s[1]) for s in similarities
                               if s[0] != i]
                res_file.write('%s: %s\n' %
                               (cur_fname, [(get_filename(p), c)
                                            for (p, c) in top_similar]))

                res_file_sum.write('%s: %s\n' %
                                   (cur_fname, get_title(paths[i])))
                for sim in top_similar:
                    res_file_sum.write(
                        '%s: %s' % (get_filename(sim[0]), get_title(sim[0])))
                res_file_sum.write('-' * 100 + '\n')

                # for each doc we make separate file which containts list of similar docs
                with open(
                        '%s/separate/%s.txt' %
                    (output_folder, cur_fname.split('.')[0]), 'w') as sep_res:
                    for sim in top_similar:
                        sep_res.write('%s\n' % get_filename(sim[0]))

    logging.info("save index")
    index.save('saved/lda_index_%s.index' % dt)

    # save topic - words matrix
    with open("%s/topic_words.txt" % output_folder, 'w',
              encoding='utf-8') as f:
        for topic_words in lda.print_topics(lda.num_topics):
            f.write("#%s: %s\n" % (topic_words[0], topic_words[1]))

    # save document - topics matrix
    with open("%s/document_topics.txt" % output_folder, 'w') as f:
        for i, topics in enumerate(lda[corpus]):
            f.write("#%s: %s\n" % (get_filename(paths[i]), topics))

    # save dictionary
    dictionary.save_as_text("%s/dictionary.txt" % output_folder)
Example #9
class LDA_parser():
    """
    This class implements a wrapper pipeline for text preprocessing and LDA parsing of an input corpus 
    in the form ['str','str','str', ... ]. 
    """
    def __init__(self,
                 corpus='',
                 language='english',
                 preprocessor_type="spacy",
                 tags=["DET", "PUNCT", "NUM", "SYM", "SPACE"],
                 custom_filter=[],
                 lemmatize=False,
                 stem=False,
                 min_len=2,
                 num_topics=10,
                 passes=100):
        """ 
        Parses the input text into a suitable format, then performs all LDA extraction tasks. 
        It expects the input corpus to be a list of texts. If the input is a long string, it will 
        attempt to create documents by sentence-tokenizing it. 
        @ params: 
            @ corpus: Input corpus in str or ['str','str','str', ... ] format, where each entry
                      is a document of type str. Alternatively, a str format input (not recommended).
            @ preprocessor_type: Use the nltk-based or spaCy-based preprocessor 
            @ language: language to use in the preprocessor 
            @ tags: if spaCy is selected, will filter words with input POS tags 
            @ custom_filter: filter words in this input list in the preprocessing step 
            @ lemmatize: use lemmatization in the preprocessing 
            @ stem: use stemming in the preprocessing  
            @ num_topics: maximum number of topics in the LDA algorithm 
            @ passes: number of training epochs in the LDA 
        """

        print("Initializing model...\n")
        if preprocessor_type == "nltk":
            print("NLTK preprocessor selected.")
            self.preprocessor = nltk_preprocessor(language=language)
        if preprocessor_type == "spacy":
            print("spaCy preprocessor selected.")
            self.preprocessor = spacy_preprocessor(language=language)

        self.language = language  # input language
        self.raw_corpus = ""  # simply stores the input if in str type
        self.clean_corpus = [
        ]  # [doc, doc, ..., doc] = [[sent, sent, ...], ... ,[sent, sent, ...]]
        self.dictionary = None  # holds a corpora.Dictionary representation of corpus
        self.doc2bow_corpus = None  # contains doc2bow vector representations of each document in the corpus
        self.lda_model = None  # LDA model trained on the input corpus
        self.topic_mixtures = [
        ]  # contains str representations of mixtures of words with their probabilities
        self.topics = {
        }  # Contains a dictionary of topics with words and respective mix probabilities once "extract topics" is called.
        self.topic_words = {
        }  # As above, but only contains the respective words of the topic

        # empty corpus: nothing to fit
        if corpus == '':
            print("***WARNING***\nNull Corpus")
        # check for raw str corpus format
        elif isinstance(corpus, str):
            print(
                "***WARNING***\nRaw input (str) received. Text will be sentence-tokenized and parsed accordingly."
            )
            print("Make sure this is intended. \n")
            self.raw_corpus = str(corpus)  # transform input to string
            self.fit(corpus,
                     raw=True,
                     language=language,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)  # fit corpus as raw
        # assume input corpus is in the right format
        else:
            self.fit(corpus,
                     language=language,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)

    def fit(self,
            corpus,
            raw=False,
            language='english',
            stem=False,
            lemmatize=False,
            num_topics=10,
            passes=100,
            min_len=2,
            echo_corpus=False):
        """ 
        Assumes input corpus is in the right format. 
        @args: 
            @ corpus = input corpus  
            @ language = input language  
            @ stem/lemmatize = if true, stem or lemmatize input corpus
            @ num_topics = number of topics to choose in the algorithm 
            @ passes = number of epochs of the LDA 
            @ min_len = minimum length of words to consider when preprocessing words
        """

        if echo_corpus:
            print("CORPUS: {}".format(corpus))

        t0 = time.time()

        print("Fitting LDA topic modelling...")
        self.raw_corpus = corpus  # input corpus as is
        self.language = language  # in case initial language changed

        if raw:
            print("Preprocessing corpus...(raw)")
            self.clean_corpus = self.preprocessor.preprocess_str_corpus(
                corpus, stem=stem, lemmatize=lemmatize, min_len=min_len)
        else:
            print("Preprocessing corpus...")
            self.clean_corpus = self.preprocessor.preprocess_texts(
                self.raw_corpus, min_len=2)  # preprocess text list

        print("Creating corpora dictionary...")
        self.dictionary = corpora.Dictionary(
            self.clean_corpus)  # create corpora.Dictionary mapping
        print("Translating doc2bow corpus...")
        self.doc2bow_corpus = [
            self.dictionary.doc2bow(text) for text in self.clean_corpus
        ]  # doc2bow corpus representation
        print("Running LDA...")
        self.lda_model = LdaModel(self.doc2bow_corpus,
                                  num_topics=num_topics,
                                  id2word=self.dictionary,
                                  passes=passes)
        self.topic_mixtures = self.lda_model.show_topics(
            num_topics=-1,
            num_words=10)  # string representation of topics mixtures

        t1 = time.time()
        print("\nDone in {:.3f} seconds.".format(t1 - t0))

    def print_topics(self, words_per_topic=5):
        """
        Displays the topics in string format
        """
        topics = self.lda_model.print_topics(num_words=words_per_topic)
        for topic in topics:
            print(topic)

    def extract_topics(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary, where the key is the topic number
        and the value is a list of (word, probability) tuples: at most
        max_words_per_topic words whose probability is at least threshold.
        @params: 
            @ max_words_per_topic: Maximum topic mixture component words to consider. 
            @ threshold: select words whose density is at least this value
        """
        topics = {}  # to store the topics
        indexes = [tup[0]
                   for tup in self.topic_mixtures]  # topic indexes

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topics = topics  # update attribute

        return topics

    def extract_topic_words(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary, where the key is the topic number
        and the value is a list of at most max_words_per_topic words whose
        probability is at least threshold.
        """
        topics = {}  # to store the topics
        indexes = [tup[0]
                   for tup in self.topic_mixtures]  # topic indexes

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup[0]
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topic_words = topics  # update attribute

        return topics

    def parse_new(self,
                  new_text,
                  top_n_topics=100,
                  top_n_w=30,
                  max_words_per_topic=50,
                  threshold=0.005,
                  verbose=True):
        """
        Parses a new text by obtaining the most likely topics for the new input, 
        as well as the respective words. This function should be used only after 
        the LDA parser has been fitted. 
        @params: 
            @ new_text: new input text 
            @ top_n_topics: top n topics with largest densities p(topic)
            @ top_n_w: top n word with largest densities p(word) = p(word|topic)*p(topic)
            @ verbose: display information
            @ max_words_per_topic: maximum words per topic  
            @ threshold: only consider words with density greater than this threshold 
        @returns: 
            @ n_most_likely_words: the top_n_w most likely words for the document 
            @ top_n_topics: most likely topics for the document 
            @ top_n_words: words associated with the most likely topics 
            @ doc_topics: all topics related to the document 
            @ doc_topic_words: all words from all topics associated with the document 
        """

        self.extract_topic_words(
            max_words_per_topic,
            threshold)  # extract topics to ensure they are there

        new_text_clean = self.preprocessor.preprocess_sentence(
            new_text)  # preprocess input text
        new_doc_bow = self.dictionary.doc2bow(
            new_text_clean)  # convert to doc2bow

        doc_topics = self.lda_model.get_document_topics(
            new_doc_bow)  # obtain topics for input document
        topic_idx = [tup[0] for tup in doc_topics]  # topic indices

        doc_topic_words = [
            word for idx in topic_idx for word in self.topic_words[idx]
        ]  # extract all words from every topic
        top_n_topics = nlargest(top_n_topics,
                                list(doc_topics),
                                key=lambda x: x[1])  # extract top n topics

        top_n_words = list(
            set([
                word for idx in [tup[0] for tup in top_n_topics]
                for word in self.topic_words[idx]
            ]))  # extract the words for the top n topics

        # Currently, we have access to the top n topics and their actual probabilities.
        # We want to collect all the words for those topics, and multiply them with their probabilities

        words_with_probs = [
        ]  # will store words with their actual probabilities:

        for topic_tup in doc_topics:
            topic_idx = topic_tup[0]  # obtain topic index
            topic_prob = topic_tup[1]  # obtain topic probability p(topic)
            for word_tup in self.lda_model.show_topic(topic_idx, topn=10):
                word_probability = word_tup[
                    1] * topic_prob  # p(w) = p(w|topic)p(topic)
                words_with_probs.append(
                    (word_tup[0], word_probability))  # (word, p(w))

        # obtain the n most likely words according to their individual probabilities
        n_most_likely_words = [
            tup[0] for tup in nlargest(
                top_n_w, list(words_with_probs), key=lambda x: x[1])
        ]

        if verbose:
            print("\nLOGS: \n")
            print("*** Most likely topic: ***\n", top_n_topics)
            print("*** Words for most likely topic: ***\n", top_n_words)
            print("*** All topics: ***\n", doc_topics)
            print("*** All topics words: ***\n", doc_topic_words)

        return n_most_likely_words, top_n_topics, top_n_words, doc_topics, doc_topic_words

    def pickle_save(self, savename="full_LDA_parser.pkl"):
        """ 
        Saves the full model object in pkl format
        """
        pickle.dump(self, open(savename, 'wb'))

    def save_model(self, name="LDA_model"):
        """ 
        Saves the LDA model, doc2bow_corpus and dictionary.
        These parameters can be used to instantiate a gensim 
        model, so there is no load in this class. 
        """
        dictionary_name = name + "_dictionary.gensim"
        corpus_name = name + "_doc2bow_corpus.pkl"
        model_name = name + ".gensim"

        pickle.dump(self.doc2bow_corpus, open(corpus_name,
                                              'wb'))  # save the doc2bow_corpus
        self.dictionary.save(dictionary_name)  # save corpus dictionary mapping
        self.lda_model.save(model_name)  # save the full model
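# Illustrative usage sketch: nltk_preprocessor / spacy_preprocessor are
# project-specific classes assumed to be importable alongside LDA_parser,
# and the documents below are made up for the example.
docs = ["The economy grew faster than expected this quarter.",
        "The team won the championship after a dramatic final.",
        "New chip designs promise lower power consumption."]
parser = LDA_parser(docs, language='english', preprocessor_type="spacy",
                    num_topics=3, passes=20)
parser.print_topics(words_per_topic=5)
topic_words = parser.extract_topic_words(max_words_per_topic=20, threshold=0.005)
results = parser.parse_new("Quarterly growth beat analyst forecasts.",
                           top_n_topics=3, top_n_w=10)
parser.save_model(name="news_LDA_model")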
Example #10
    # #part of speech tagging
    for i in stemmed_tokens:
        words = nltk.word_tokenize(i)
        t = nltk.pos_tag(words)
        #Getting only the nouns and adjectives
        if t[0][1] in ('NN', 'NNS', 'NNP', 'NNPS', 'JJ', 'JJR', 'JJS'):
            tagged.append(t[0][0])

    tagged = set(tagged)
    texts.append(tagged) #Contains cleaned, condensed and tagged text

    dictionary = Dictionary(texts)
    new_corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    lda = LdaModel(new_corpus, num_topics=10, id2word = dictionary, passes=20)
    #Store topics in generated_topic variable
    generated_topic = lda.print_topics()
    #Converting to string
    generated_topic = str(generated_topic)
    #To get all the topics from model
    token = RegexpTokenizer(r'"\w+"')
    #To remove quotation marks from topics
    token1 = RegexpTokenizer(r'\w+')
    generated_topic = token.tokenize(generated_topic)
    generated_topic = str(generated_topic)
    generated_topic = token1.tokenize(generated_topic)
    #removing duplicate topics
    generated_topic = set(generated_topic)
    output = list(generated_topic) # 'output' contains final list of topics
def program_clusters(pgms, n_topics, awds, papers):
    #First we need to filter the data by program code. Some grants have multiple program
    #codes, so we first determine which cells contain the program code and then
    #replace the existing program code(s) with the provided one. This ensures there
    #is only one code per award.
    papers = papers
    papers['year'] = pd.to_datetime(papers['year'])
    papers['citations per year'] = papers['citations'].divide([
        ((datetime.datetime.today() - x).days) / 365.2422
        for x in papers['year']
    ])
    num_pubs = papers.groupby('award number')[['publication'
                                               ]].count().reset_index()
    cits_year_mean = papers.groupby('award number')[['citations per year'
                                                     ]].mean().reset_index()

    pgms = [
        '6878', '6880', '6882', '6883', '6884', '6885', '9101', '9102', '6881'
    ]
    awds = awds
    awds = awds[awds['ProgramElementCode(s)'].str.contains('|'.join(pgms))]
    for x in pgms:
        awds['ProgramElementCode(s)'] = np.where(
            awds['ProgramElementCode(s)'].str.contains(x), x,
            awds['ProgramElementCode(s)'])
    awds['StartDate'] = pd.to_datetime(awds['StartDate'])
    awds['EndDate'] = pd.to_datetime(awds['EndDate'])
    awds['AwardedAmountToDate'] = [
        x.replace('$', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = [
        x.replace(',', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = pd.to_numeric(awds['AwardedAmountToDate'])
    awds = pd.merge(awds,
                    num_pubs,
                    left_on='AwardNumber',
                    right_on='award number',
                    how='left')
    awds = pd.merge(awds,
                    cits_year_mean,
                    left_on='AwardNumber',
                    right_on='award number',
                    how='left')
    awds.drop(columns=['award number_x', 'award number_y'], inplace=True)
    awds[['publication', 'citations per year'
          ]] = awds[['publication', 'citations per year']].replace(np.nan, 0)
    awds['pubs per year'] = np.where(
        awds['EndDate'] > datetime.datetime.today(),
        awds['publication'].divide([
            ((datetime.datetime.today() - x).days) / 365.2422
            for x in awds['StartDate']
        ]), awds['publication'].divide(
            (awds['EndDate'] - awds['StartDate']).astype('timedelta64[D]') /
            365.2422))

    abstracts = awds[[
        'ProgramElementCode(s)', 'AwardNumber', 'Abstract',
        'citations per year', 'pubs per year', 'AwardedAmountToDate'
    ]].copy()
    #This is a pretty clean data set, but there are some empty entries, so we
    #filter them out here
    abstracts = abstracts.dropna()

    #The first step in the tokenization process is splitting the abstract text
    #into a list of words.
    abstracts['clean_abstracts'] = [
        doc.lower().split() for doc in abstracts['Abstract']
    ]

    #we want to account for possible bigrams and trigams, which we search for
    #here
    bigram = Phrases(list(abstracts['clean_abstracts']),
                     min_count=5,
                     threshold=20)
    trigram = Phrases(bigram[list(abstracts['clean_abstracts'])], threshold=20)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    #Now we start building our dictionary and creating the cleaned up corpus.
    #We start by creating a list of stop words, punctuation, and other text to remove.
    #we also instantiate a lemmatizer
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    boiler_plate = "This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria"

    #This function applies the bigram and trigram models, lemmatizes the
    #abstracts, and only keeps words that are longer than 2 characters
    def word_mod(doc):
        doc = re.sub('<.*?>', ' ', doc)
        doc = re.sub(boiler_plate, '', doc)
        punct_free = ''.join(ch for ch in doc if ch not in exclude)
        words = punct_free.lower().split()
        bigs = bigram_mod[words]
        tris = trigram_mod[bigs]
        stop_free = " ".join([i for i in tris if i not in stop])
        lemm = " ".join(lemma.lemmatize(word) for word in stop_free.split())
        word_list = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in word_list if len(word) > 2]
        return cleaned

    abstracts['clean_abstracts'] = [
        word_mod(doc) for doc in abstracts['Abstract']
    ]

    # Here we create the dictionary from the corpus of abstracts, where each unique term is assigned an index.
    dictionary = corpora.Dictionary(abstracts['clean_abstracts'])
    # Filter out terms which occur in fewer than 4 abstracts or in more than 45% of the abstracts
    dictionary.filter_extremes(no_below=4, no_above=0.45)
    #This creates a sparse matrix of word frequencies in each abstracts
    abstract_term_matrix = [
        dictionary.doc2bow(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we create and train the LDA model, passing in our term frequency matrix, the number of
    #topics/clusters to be created, and our dictionary
    ldamodel = Lda(abstract_term_matrix,
                   num_topics=n_topics,
                   id2word=dictionary,
                   passes=50,
                   iterations=500)

    # Here we print out the top 10 words for each topic and their weight
    for i, topic in enumerate(
            ldamodel.print_topics(num_topics=n_topics, num_words=10)):
        words = topic[1].split("+")
        print(words, "\n")

    #Next we want to know which topic each abstract belongs to. We pass each abstract
    #into the get_document_topics method, which returns the topics and the
    #probability of the abstract belonging to each topic. We take the one that
    #has the highest probability
    def pred_topic(doc):
        doc_bow = ldamodel.id2word.doc2bow(doc)
        doc_topics = ldamodel.get_document_topics(doc_bow,
                                                  minimum_probability=0.20)
        if doc_topics:
            doc_topics.sort(key=operator.itemgetter(1), reverse=True)
            theme = doc_topics[0][0]
        else:
            theme = np.nan
        return theme

    abstracts['predicted topic'] = [
        pred_topic(doc) for doc in abstracts['clean_abstracts']
    ]

    #Here we do a histogram of how many abstracts/awards fall into each topic
    ab_hist = abstracts.groupby(['predicted topic'])['AwardNumber'].count()
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    cols = cols + cols + cols + cols
    f1, ax = plt.subplots()
    ab_hist.plot.bar(rot=0, color=cols)
    ax.set_xticklabels([x for x in ab_hist.index])
    ax.set_xlabel('Topic Number')
    ax.set_ylabel('Count of Awards in Topic')
    ax.set_title('Distribution of Awards in Derived Topic Areas')
    plt.show()

    #Here we create a word cloud of the top words in each topic. Word size
    #is indicative of weight.
    cloud = WordCloud(stopwords=stopwords.words('english'),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)

    topics = ldamodel.show_topics(formatted=False, num_topics=n_topics)
    fig, axes = plt.subplots(1,
                             n_topics,
                             figsize=(10, 10),
                             sharex=True,
                             sharey=True)
    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()

    #Next we'll do a t-SNE plot clustering the abstracts based off the topic
    #probabilities returned from the model. This creates an array where each
    #column is a topic, each row is an abstract, and each entry is the probability
    #that the abstract belongs to that topic.
    col_ns = range(0, n_topics)
    topic_weights = pd.DataFrame(columns=col_ns)
    for i in range(0, len(ldamodel[abstract_term_matrix])):
        weights = ldamodel[abstract_term_matrix][i]
        entry = pd.DataFrame(columns=col_ns)
        for j in range(0, len(weights)):
            idx = weights[j][0]
            entry.loc[0, idx] = weights[j][1]
        topic_weights = topic_weights.append(entry)
    topic_weights.reset_index(drop=True, inplace=True)

    # Replace any nan entries (because there was zero probability the
    #abstract belonged in that topic) with zero
    arr = pd.DataFrame(topic_weights).fillna(0).values

    # We can limit this to only well separated abstracts as well
    #arr = arr[np.amax(arr, axis=1) > 0.15]

    # This pulls out the highest probability topic for each abstract.  We'll
    #use this for the color scheme in the t-SNE plot.
    topic_num = np.argmax(arr, axis=1)

    # Here we initialize and fit our t-SNE model
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      perplexity=50,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    #Here we plot out the results for the t-SNE transformation

    mycolors = np.array(cols)

    title = "t-SNE Clustering of {} LDA Topics".format(n_topics)
    f = plt.figure()
    plt.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num])
    plt.title(title)
    plt.show()

    fig = plt.figure(figsize=(12, 6))
    ax1 = fig.add_subplot(1, 3, 1)
    ax1.scatter(x=abstracts['AwardedAmountToDate'],
                y=abstracts['citations per year'],
                color=mycolors[abstracts['predicted topic']])
    ax1.set_ylabel('Average Citations per Year')
    ax1.set_xlabel('Award Size [$]')
    ax1.set_title('Average Citations per Year', fontsize=11)
    ax2 = fig.add_subplot(1, 3, 2)
    ax2.scatter(x=abstracts['AwardedAmountToDate'],
                y=abstracts['pubs per year'],
                color=mycolors[abstracts['predicted topic']])
    ax2.set_ylabel('Number Publications per Year')
    ax2.set_xlabel('Award Size [$]')
    ax2.set_title('Number of Publications per Year', fontsize=11)
    ax3 = fig.add_subplot(1, 3, 3)
    ax3.scatter(x=abstracts['pubs per year'],
                y=abstracts['citations per year'],
                color=mycolors[abstracts['predicted topic']])
    ax3.set_xlabel('Number Publications per Year')
    ax3.set_ylabel('Average Citations per Year')
    ax3.set_title('Number Publications vs \nAverage Citation Count',
                  fontsize=11)
    from matplotlib.legend_handler import HandlerPatch

    class HandlerEllipse(HandlerPatch):
        def create_artists(self, legend, orig_handle, xdescent, ydescent,
                           width, height, fontsize, trans):
            center = 0.5 * width - 0.5 * xdescent, 0.5 * height - 0.5 * ydescent
            p = mpatches.Ellipse(xy=center,
                                 width=height + xdescent,
                                 height=height + ydescent)
            self.update_prop(p, orig_handle, legend)
            p.set_transform(trans)
            return [p]

    handles = [
        mpatches.Circle((0.5, 0.5),
                        radius=0.25,
                        facecolor=mycolors[i],
                        edgecolor="none") for i in range(0, n_topics)
    ]
    handles = [
        mpatches.Circle(
            (0.5, 0.5), radius=0.25, facecolor='w', edgecolor="none")
    ] + handles
    legend_labels = list(range(0, n_topics))
    legend_labels = ['Topic'] + legend_labels
    ax3.legend(handles,
               legend_labels,
               bbox_to_anchor=(1, .88),
               bbox_transform=fig.transFigure,
               handler_map={mpatches.Circle: HandlerEllipse()})
    plt.tight_layout()
Example #12
class LDAWDF:
    mysql: mysql.MySQL
    ldamodel: LdaModel
    dictionary = None
    corpus = None

    def __init__(self, mysql):
        self.mysql = mysql
        self.dataFolder = './data/'
        self.saveFile = 'lda_model'
        self.saveFileDict = 'lda_model_dict'

    def trainFromStart(self):
        with self.mysql as db:
            content = db.getContentsText()
        documents = []
        for item in content:
            documents.append(item['content'].split())

        self.dictionary = corpora.Dictionary(documents)

        self.dictionary.filter_extremes(no_below=5, no_above=0.5)

        doc_term_matrix = [self.dictionary.doc2bow(doc) for doc in documents]

        self.corpus = doc_term_matrix

        # Running and Training LDA model on the document term matrix.
        print("Starting to train LDA Model...")
        self.ldamodel = LdaModel(
            doc_term_matrix,
            num_topics=200,
            id2word=self.dictionary,
            passes=100)

    def printTest(self):
        print(self.ldamodel.print_topics(num_topics=10, num_words=5))

    def save(self):
        self.ldamodel.save(self.dataFolder + self.saveFile)
        self.dictionary.save(self.dataFolder + self.saveFileDict)

    def canLoad(self):
        my_file = Path(self.dataFolder + self.saveFile)
        my_file_dict = Path(self.dataFolder + self.saveFileDict)
        return my_file.is_file() and my_file_dict.is_file()

    def update(self, corpus):
        self.ldamodel.update(corpus)

    def load(self, subfolder=None):
        if subfolder:
            sf = subfolder + '/'
        else:
            sf = ''
        self.ldamodel = LdaModel.load(self.dataFolder + sf + self.saveFile)
        self.dictionary = gensim.corpora.Dictionary.load(self.dataFolder + sf + self.saveFileDict)

    def fillDb(self):
        topics = {}
        result = []
        result2 = []
        nbTopics = self.ldamodel.get_topics().shape[0]
        # "Old"
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 3)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            words = []
            for topicTerm in topicTerms:
                words.append(self.dictionary.get(topicTerm[0]))
            topics[topicId] = ' '.join(words)
        with mysql as db:
            contentsText = db.getContentsText()
            for element in contentsText:
                bow = self.dictionary.doc2bow(element['content'].split())
                docTopics = self.ldamodel.get_document_topics(bow, minimum_probability=0.05)
                if len(docTopics) > 0:
                    docTopics.sort(key=lambda x: x[1], reverse=True)
                    result.append((element['url'], topics[docTopics[0][0]]))
                    for docTopic in docTopics:
                        result2.append((element['url'], docTopic[0], str(docTopic[1])))
            db.emptyUrlsTopic()
            db.emptyCurrentUrlsTopic()
            db.emptyCurrentUserTags()
            db.setCurrentUrlsTopic(result2)
            db.setPrecalcTopics()
        # "New"
        terms = []
        for topicId in range(0, nbTopics):
            topicTerms = self.ldamodel.get_topic_terms(topicId, 5)
            topicTerms.sort(key=lambda x: x[1], reverse=True)
            for topicTerm in topicTerms:
                terms.append((topicId, self.dictionary.get(topicTerm[0]), str(topicTerm[1])))
        with mysql as db:
            db.emptyLdaTopics()
            db.setLdaTopics(terms)


    def get_terms_topics(self, keywords):
        bow = self.dictionary.doc2bow(keywords[:30])
        topics = {}
        keywordsResult = {}
        for word in bow:
            wordTopics = self.ldamodel.get_term_topics(word[0], 0.05)
            keywordsResult[word[0]] = {'word': self.dictionary.get(word[0]), 'topics': wordTopics}
            for wordTopic in wordTopics:
                wordTopicId = wordTopic[0]
                if wordTopicId not in topics:
                    topics[wordTopicId] = self.ldamodel.show_topic(wordTopicId)
        return {'topics': topics, 'keywords': keywordsResult}
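# Hypothetical usage, assuming `db` is the project's mysql.MySQL wrapper
# (it must provide getContentsText() and the topic-table helpers used above).
wdf = LDAWDF(db)
if wdf.canLoad():
    wdf.load()
else:
    wdf.trainFromStart()
    wdf.save()
wdf.printTest()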
Example #13

# Convert texts to an LDA-specific corpus (bag-of-words)
lda_corpus = [dct.doc2bow(text) for text in preprocessed_texts]

#set number of topics
num_topics = 8

# fit LDA model
speeches_topics = LdaModel(corpus=lda_corpus,
                           id2word=dct,
                           num_topics=num_topics,
                           passes=5)

# print out first 8 topics
for i, topic in enumerate(speeches_topics.print_topics(8)):
    print (i, topic)


# visualization of topics
#vis_data = gensimvis.prepare(speeches_topics, lda_corpus, dct)
#pyLDAvis.display(vis_data)




# extract all document-topic distributions to a dictionary
document_key = list(speeches.index)
document_topic = {}
for doc_id in range(len(lda_corpus)):
    docbok = lda_corpus[doc_id]
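    # The example is truncated here; a plausible continuation matching the
    # comment above ("extract all document-topic distributions to a dictionary"):
    doc_topics = speeches_topics.get_document_topics(docbok, minimum_probability=0.0)
    document_topic[document_key[doc_id]] = doc_topics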
Example #14
class ModelVanilla:
    """Adjust, train and optimize LDA model

    This class is responsible for training the Topic Model using Gensim's
    LDA.

    Attributes:
        tokens: List of lists containing data index number and tokens
        id2word: Dictionary of the Corpus
        corpus: Term Document frequency
        alpha = Model alpha hyperparameter
        workers = Number of workers spawned while training the model
        prefix = prefix
        optimize_interval = Number of iterations after which to re-evaluate
            hyperparameters
        iterations = Number of iterations
        topic_threshold = topic threshold
        num_topics = Number of topics
        lda_model = Gensim LDA object
    """
    def __init__(self, tokens=None, input_path=None):
        """Inits Model, tokenized data or input path to open saved tokenized
        data.

        NOTE: If both input_path and tokens is given, tokens will always take
        higher preference

        Args:
            input_path: Location of saved preprocessed tokens file
            tokens: tokens of preprocessed data

        Raises:
            IOError: Tokens file not found or not in specified format
            Exception: Not in specified structure
        """
        self.tokens = []

        if tokens is not None:
            # Use tokens list passed as an argument
            print('Using tokens passed as argument')
            try:
                for i, val in enumerate(tokens):
                    self.tokens.append(tokens[i][1:])
            except:
                raise Exception("Tokens list does not follow required " + \
                                                                "structure")

        elif tokens is None and input_path is not None:

            # Read the saved tokens file
            print('Opening tokens file')
            try:
                with codecs.open(input_path, 'r', encoding='utf8') as F:
                    for row in F:
                        token_in_row = row.split(",")
                        for i, val in enumerate(token_in_row):
                            token_in_row[i] = force_unicode(token_in_row[i])
                        self.tokens.append(token_in_row[1:])
            except IOError:
                raise IOError("File not found")
            except:
                raise Exception("Tokens list does not follow required " + \
                                                                "structure")
        elif tokens is None and input_path is None:
            print("Assuming load model from saved file, use Model.load()")
        else:
            print("Missing tokens data")

    def fit(self):
        """Generate the id2word dictionary and term document frequency of
        the given tokens

        NOTE: Should be called only after making sure that the tokens
        have been properly read

        Raises:
            Exception: self.tokens empty or not in required format
        """
        try:
            # Create Dictionary
            self.id2word = corpora.Dictionary(self.tokens)
            # Term Document Frequency
            self.corpus = \
                    [self.id2word.doc2bow(text) for text in self.tokens]
        except:
            raise Exception('tokens not compatible')

    def params(self, alpha='symmetric', num_topics=100, distributed=False, \
                    chunksize=2000, passes=1, update_every=1, iterations=50):
        """Model parameters

        NOTE: These are the same parameters used while training models
        for coherence computation. Call this function to re-initialize
        parameter values in that case

        Args:
            alpha: Can be set to a 1D array of length equal to the number of
            expected topics that expresses our a-priori belief for each
            topic's probability. Alternatively, default prior selecting
            strategies can be employed by supplying a string:

                ’asymmetric’: Uses a fixed normalized asymmetric prior
                    of 1.0 / topicno.
                ’auto’: Learns an asymmetric prior from the corpus
                    (not available if distributed==True).
            num_topics: Number of topics
            distributed: Whether distributed computing should be used to
                accelerate training.
            chunksize: Number of documents to be used in each training chunk.
            passes: Number of passes through the corpus during training.
            update_every: Number of documents to be iterated through for each
                update. Set to 0 for batch learning, > 1 for online iterative
                learning.
            iterations: Number of iterations
        """
        self.alpha = alpha
        self.num_topics = num_topics
        self.distributed = distributed
        self.chunksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.iterations = iterations

    def train(self):
        """Train LDA model using gensim's LDA object
        """
        self.lda_model = LdaModel(corpus=self.corpus, \
                        num_topics=self.num_topics, alpha=self.alpha, \
                        id2word=self.id2word, distributed=self.distributed, \
                        chunksize=self.chunksize, passes=self.passes, \
                        update_every=self.update_every, \
                                                iterations=self.iterations)

    def topics(self, num_topics=100, num_words=10):
        """Return top <num_words> words for the first <num_topics> topics

        Args:
            num_topics: Number of topics to print
            num_words: Number of top words to print for each topic

        Returns:
            List of topics and top words
        """
        return self.lda_model.print_topics(num_topics, num_words)

    def save(self, output_path):
        """Save the lDA model

        Args:
            output_path: Location with filename to save the LDA model
        Raises:
            IOError: Error with output_path / File already exists
        """
        self.lda_model.save(output_path)

    def get_coherence(self):
        """Compute Coherence Score of the model

        NOTE: You cannot compute the coherence score of a saved model

        Returns:
            Float value
        """
        coherence_model_lda = CoherenceModel(model=self.lda_model, \
                texts=self.tokens, dictionary=self.id2word, \
                coherence='c_v')
        coherence_lda = coherence_model_lda.get_coherence()
        return coherence_lda

    def optimum_topic(self, start=10, limit=100, step=11):
        """Compute c_v coherence for various number of topics

        if you want to change the parameters of the model while training,
        call Model.params() first as it uses the same parameters.

        NOTE: You cannot compute the coherence score of a saved model.

        Args:
            start: Starting number of topics
            limit: Limit number of topics
            step: Step size

        Returns:
            Dictionary mapping each num_topics value to its c_v coherence
        """
        coherence_values = []
        model_list = []
        for num_topics in range(start, limit, step):
            model = LdaModel(corpus=self.corpus, \
                        num_topics=num_topics, alpha=self.alpha, \
                        id2word=self.id2word, distributed=self.distributed, \
                        chunksize=self.chunksize, passes=self.passes, \
                        update_every=self.update_every, \
                                                iterations=self.iterations)
            model_list.append(model)
            coherencemodel = CoherenceModel(model=model, \
                texts=self.tokens, dictionary=self.id2word, \
                coherence='c_v')
            coherence_values.append(coherencemodel.get_coherence())
        x = range(start, limit, step)
        out = dict()
        for m, cv in zip(x, coherence_values):
            out[m] = round(cv, 4)
        return out

    def load(self, saved_model):
        """Load a LDA model previously saved

        Args:
            saved_model: Location to saved model
        Raises:
            IOError: File already present or location does not exist
        """
        try:
            self.lda_model = utils.SaveLoad.load(saved_model)
        except IOError:
            raise IOError('File already present or location does not exist')
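# Illustrative pipeline (hypothetical tokens following the documented
# [index, token, token, ...] structure per document):
tokens = [[0, "solar", "panel", "energy", "grid"],
          [1, "battery", "storage", "energy"],
          [2, "football", "league", "season"]]
model = ModelVanilla(tokens=tokens)
model.fit()                            # build id2word and the doc2bow corpus
model.params(num_topics=2, passes=10)  # set hyperparameters before training
model.train()                          # train the gensim LdaModel
print(model.topics(num_topics=2, num_words=5))
print(model.get_coherence())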
# remove the 50 most frequent tokens (filter_n_most_frequent modifies the dictionary in place)
dictionary.filter_n_most_frequent(50)
print dictionary

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]


#Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary, passes=50, iterations=500)
ldafile = open('lda_model_sym_wiki.pkl','wb')
cPickle.dump(ldamodel,ldafile)
ldafile.close()

#Print all the 50 topics
for topic in ldamodel.print_topics(num_topics=50, num_words=10):
    print topic[0]+1, " ", topic[1],"\n"













# Filter out the terms that occur in fewer than 4 articles or in more than 40% of the articles
dictionary.filter_extremes(no_below=4, no_above=0.4)

# List of some words which has to be removed from dictionary as they are content neutral words
stoplist = set(
    'also use make people know many call include part find become like mean often different \
                usually take wikt come give well get since type list say change see refer actually iii \
                aisne kinds pas ask would way something need things want every str'
    .split())
stop_ids = [
    dictionary.token2id[stopword] for stopword in stoplist
    if stopword in dictionary.token2id
]
dictionary.filter_tokens(stop_ids)

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

#Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix,
               num_topics=50,
               id2word=dictionary,
               passes=50,
               iterations=500)
ldafile = open('lda_model_sym_wiki.pkl', 'wb')
cPickle.dump(ldamodel, ldafile)
ldafile.close()

#Print all the 50 topics
for topic in ldamodel.print_topics(num_topics=50, num_words=10):
    print(topic[0] + 1, " ", topic[1], "\n")
Example #17
# removes symbols given a text input
symbols = [',', '.', '-', '@']
def clean(word):
    word = word.lower()
    for symbol in symbols:
        word = word.replace(symbol, '')
    return word

# load dictionary that was generated by calling corpora.Dictionary() on TripAdvisor text
id2word = Dictionary.load('scraper/tripadvisor.dict')
# load the bag-of-words corpus that was serialized with corpora.MmCorpus.serialize()
mm = MmCorpus('scraper/tripadvisor.mm')
# call LDA, set topics to 100
lda = LdaModel(corpus=mm, id2word=id2word, num_topics=100, update_every=1, chunksize=100, passes=1)
topics = lda.print_topics(num_topics=100)

top_words = []

# construct top words list based on topics and words generated by LDA
for topic in topics:
    topic = topic.replace('+', '')
    topic_split = topic.split()
    for item in topic_split:
        item = item.strip().split('*')[1]
        top_words.append(clean(item))

# write top words to a file
f = open('scraper/top_words_lda.txt', 'w')
for word in top_words:
    f.write(word.encode('utf-8') + '\n')
				num_topics = 30
			#ldamodel = Lda(doc_term_matrix, num_topics = num_topics, id2word = dictionary)#, passes=50, iterations=500)
			ldamodel = Lda(doc_term_matrix, num_topics = num_topics, id2word = dictionary, passes=3)
			print("Saving the LDA model")

			filename = model_path + pf + '_' + ccode + '_lda_model.pkl' #/auto/vgapps-cstg02-vapps/analytics/csap/models/files/sr/
			if os.path.exists(filename):
				os.remove(filename)
				print("File Removed!")

			ldafile = open(filename,'wb')
			cPickle.dump(ldamodel, ldafile)
			print("File Created!")
			ldafile.close()

			topics_words = ldamodel.print_topics(num_topics = num_topics, num_words = 10)
			c=0
			doc_topics=[]
			prob=[]
			for doc in doc_term_matrix:
				a = sorted(ldamodel[doc], key=lambda x: x[1])[-1]
				doc_topics.append(a[0])
				prob.append(a[1])
				c=c+1
			df["topic_number"] = doc_topics
			df["topic_probability"] = prob
			#Create a dataframe and append the topic number to that column
			final_df = pd.DataFrame()
			final_df["SR_number"] = df['sr_number']
			final_df["PF"] = pf #asa_df['sr_hw_product_erp_family']
			final_df["topic"] = df['topic_number']
 
print ("Set = ,Original Qty = ")
print (word_list_len,list_len)
print (word_list)
print ('********************************************************************************************************')
print (tweet_clean_fin)
print (len(tweet_clean_fin))
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
dictionary = Dictionary(tweet_clean_fin)
print("\n --- dictionary \n",dictionary)
bow_vectors = [dictionary.doc2bow(text) for text in tweet_clean_fin]

goodLdaModel = LdaModel(corpus=bow_vectors, id2word=dictionary, iterations=50, num_topics=6)
print('\n --- goodLdaModel: all topics in result ordered by significance \n')
all_good_topics = goodLdaModel.print_topics(-1)
print(all_good_topics)
print("\n--- goodLdaModel.print_topics(num_topics=6, num_words=16) \n")
print(goodLdaModel.print_topics(num_topics=6, num_words=16))
%%time
import warnings
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
fig = plt.figure(figsize=(30, 60))
for i in range(6):
    df=pd.DataFrame(goodLdaModel.show_topic(i,16),columns=['term','prob']).set_index('term')
    plt.subplot(6,3,i+1)
    plt.title('topic'+str(i+1))
    sns.barplot(x='prob',y=df.index,data=df,label='Cities',palette='Reds_d')
    def fit(self):
        # Load the IT stop-word list
        stopword = StopWord("./stopwords_it.txt")

        # Load the corpus (from seg_join/corpus.txt)
        print "reading corpus"
        corpus_name = "corpus.dat"
        if not os.path.exists(corpus_name):
            with open(self.proj_name + "/seg_join/corpus.txt",
                      "r") as corpus_file:
                for line in corpus_file:
                    words = line.split()
                    words = [
                        word for word in words
                        if not stopword.is_stop_word(word)
                    ]
                    self.corpus.append(words)
            # Dumper.save(self.corpus, corpus_name)
        else:
            self.corpus = Dumper.load(corpus_name)
        self.doc_num = len(self.corpus)

        # Build the dictionary for the documents; each word maps to an integer index
        print "creating dictionary"
        id2word_name = "id2word.dat"
        if not os.path.exists(id2word_name):
            self.id2word = corpora.Dictionary(self.corpus)
            # Dumper.save(self.id2word, id2word_name)
        else:
            self.id2word = Dumper.load(id2word_name)

        # Optionally drop low-frequency words
        # ignore words that appear in less than 20 documents or more than 10% documents
        # id2word.filter_extremes(no_below=20, no_above=0.1)

        # Count term frequencies and convert each document to a bag-of-words vector
        print "transforming doc to vector"
        corpus_bow_name = "corpus_bow.dat"
        if not os.path.exists(corpus_bow_name):
            self.corpus_bow = [
                self.id2word.doc2bow(doc) for doc in self.corpus
            ]
            # Dumper.save(self.corpus_bow, corpus_bow_name)
        else:
            self.corpus_bow = Dumper.load(corpus_bow_name)

        # Train the LDA model
        print "training lda model"
        lda_model_name = "lda_models/lda.dat"
        if not os.path.exists(lda_model_name):
            lda = LdaModel(corpus=self.corpus_bow,
                           id2word=self.id2word,
                           num_topics=self.topic_num,
                           alpha='auto')
            Dumper.save(lda, lda_model_name)
        else:
            lda = Dumper.load(lda_model_name)

        # Print the discovered topics
        topics = lda.print_topics(num_topics=self.topic_num, num_words=10)
        for topic in topics:
            print "topic %d: %s" % (topic[0], topic[1].encode("utf-8"))
        with open("topics.txt", "w") as topic_file:
            for topic in topics:
                print >> topic_file, "topic %d: %s" % (
                    topic[0], topic[1].encode("utf-8"))
        self.lda = lda
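
# Added illustration (hedged, not part of the original class): assuming fit() above has
# been run, an unseen document could be scored against the trained model roughly like
# this; the helper name and the whitespace tokenisation are assumptions.
def infer_topics(trained, text):
    # tokenise the same way the corpus was built: a simple whitespace split
    words = text.split()
    # map tokens onto the ids learned during fit()
    bow = trained.id2word.doc2bow(words)
    # (topic_id, probability) pairs for this document
    return trained.lda.get_document_topics(bow)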
# Creating the object for LDA model using gensim library
Lda = gensim.models.ldamodel.LdaModel

#Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
ldamodel = Lda(doc_term_matrix,
               num_topics=50,
               id2word=dictionary,
               passes=20,
               iterations=500)
print('used: {:.2f}s'.format(time() - start))

#Save a model
#      ldamodel.save('topic_articles.model')
#print(ldamodel.print_topics(num_topics=2, num_words=4))

ii = ldamodel.print_topics(num_topics=50, num_words=30)
df = pd.DataFrame(ii, columns=['id_topics', 'words']).set_index('id_topics')
df.to_csv('50_topics_on_articles.csv')
df.to_excel('50_topics_on_articles.xlsx')

#MAIN PLOT
viz = pyLDAvis.gensim.prepare(ldamodel, doc_term_matrix, dictionary)
pyLDAvis.save_html(viz, '50t_articles.html')

#Load a model
#ldamodel.load('topic_articles.model')

yo = ldamodel.get_document_topics(
    doc_term_matrix)  #get topics on all 10339 articles
li = []
for i in range(len(comments)):
start = datetime.now()
print 'Building model ...'
documents = wikiDocs()

#build a dictionary which maps between words and index numbers:
dictionary = corpora.Dictionary(documents)
dictionary.save(fileLocation + 'cs_lda6.dict')
corpus = wikiDocBow()

#model = Doc2Vec( documents, size=25, window=8, min_count=5, workers=0)
ldaModel = LdaModel(corpus=corpus, id2word=dictionary,
                    num_topics=100)  # default size = 25
print 'Out of ' + str(len(domainIlinks)) + ' domain pages ' + str(
    MissingFileCount) + ' were missing.'
print 'Topics = ' + str(len(ldaModel.print_topics(num_topics=10,
                                                  num_words=10)))
print 'Showing topics = ' + str(len(ldaModel.show_topics()))

# store the model to mmap-able files
ldaModel.save(fileLocation + 'wiki_model6.ldamodel'
              )  #model.save_word2vec_format('/tmp/my_model.doc2vec')
print('Model made in ' + str(
    ((datetime.now() - start).total_seconds()) / 60) + ' minutes.')

# load the model back
dictionary_loaded = corpora.Dictionary.load(fileLocation + 'cs_lda6.dict')
model_loaded = LdaModel.load(
    fileLocation + 'wiki_model6.ldamodel'
)  #model_loaded = Doc2Vec.load_word2vec_format('/tmp/my_model.doc2vec')
print 'Topics in loaded model = '
print model_loaded.print_topics(num_topics=5, num_words=5)
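
# Added sketch (hedged): with the reloaded dictionary and model, the topic mixture of an
# unseen page could be inferred as below; the token list is made up.
unseen_bow = dictionary_loaded.doc2bow(['algorithm', 'graph', 'complexity'])
print model_loaded.get_document_topics(unseen_bow)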
Beispiel #23
0
from gensim.test.utils import common_texts, common_corpus
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel
import pprint

# Create a corpus from a list of texts
texts = [['褒贬','春秋','看重','德望','孔子'],
        ['慈禧','政治','果断','胆识','眼光','改革'],
        ['历史','兴替','唐朝'],
        ['食堂','饭菜','不好吃','闹肚子', '便宜'],
        ['就餐','挑剔','新鲜','美味','蜜桃','西瓜'],
         ['苹果', '梨子','价格','便宜'],
         ['奥迪', '大众', '汽车', '价格', '昂贵', '买不起']]

# pprint.pprint(common_texts)

dct = Dictionary(texts)
corpus = [dct.doc2bow(text) for text in texts]

lda = LdaModel(corpus, num_topics=3, id2word=dct)
# Print the most contributing words for the 3 topics
res = lda.print_topics(num_topics=3, num_words=6)

pprint.pprint(res)
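
# Added illustration (hedged): the per-document topic mixture of the toy corpus can be
# inspected as well; index 3 (the canteen sentence) is just an example.
doc_topics = lda.get_document_topics(corpus[3])
pprint.pprint(doc_topics)  # list of (topic_id, probability) pairs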
    num_topics = 20
    chunksize = 500
    passes = 20
    iterations = 400
    eval_every = 1

    # Make a index to word dictionary.
    temp = dictionary[0]  # only to "load" the dictionary.
    id2word = dictionary.id2token

    lda_model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                           alpha='auto', eta='auto', \
                           iterations=iterations, num_topics=num_topics, \
                           passes=passes, eval_every=eval_every)
    # Print the keywords in each of the topics
    print(lda_model.print_topics())

    # Compute Coherence Score using c_v
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=docs,
                                         dictionary=dictionary,
                                         coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score: ', coherence_lda)

    # Compute Coherence Score using UMass
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         texts=docs,
                                         dictionary=dictionary,
                                         coherence="u_mass")
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score (u_mass): ', coherence_lda)
corpus = [words.doc2bow(doc) for doc in doc_list]

# Create LDA model
lda = LdaModel(corpus=corpus,
               id2word=words,
               num_topics=3,
               random_state=2,
               update_every=1,
               passes=20,
               alpha='auto',
               per_word_topics=True)

# Pickle that too

# with open('pickles/movie_lda_1.pickle', 'wb') as f:
    # pickle.dump(lda, f)

with open('pickles/movie_lda_1.pickle', 'rb') as f:
    lda = pickle.load(f)

pprint(lda.print_topics(num_words=10))

# Generate wordclouds
for t in range(lda.num_topics):
    wc = WordCloud(width=800, height=400)
    wc.fit_words(dict(lda.show_topic(t, 200)))
    plt.figure()
    plt.imshow(wc)
    plt.axis("off")
    plt.show()
                         chunksize=chunksize,
                         alpha='auto',
                         eta='auto',
                         iterations=iterations,
                         num_topics=num_topics,
                         passes=passes,
                         eval_every=eval_every)

    # Pickle the model for later use
    pickle.dump(
        lda_model,
        open(os.path.join('./results/lda_save_' + str(num_topics) + '.pk'),
             'wb'))

    print('The top 10 keywords in each topic')
    pprint(lda_model.print_topics(num_words=10))

    # Topic coherence https://rare-technologies.com/what-is-topic-coherence/
    top_topics = lda_model.top_topics(corpus)  # , num_words=20)
    avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
    print('Average topic coherence: %.4f.' % avg_topic_coherence)
    print('Top topics and their coherence:')
    pprint(top_topics)

    # Comparing LDA models
    # https://radimrehurek.com/gensim/auto_examples/howtos/run_compare_lda.html

    # LDA Results Visual Analysis
    if visualize:
        #    pyLDAvis.enable_notebook()
        lda_res_path = os.path.join('./results/lda_pyldavis_' +
Beispiel #27
0
# In[9]:

from gensim.corpora import Dictionary
from gensim.models.lsimodel import LsiModel
from gensim.models.ldamodel import LdaModel
from gensim.models.tfidfmodel import TfidfModel

# In[55]:

lda = LdaModel(corpus=corpus,
               id2word=dictionary,
               num_topics=5,
               update_every=1,
               chunksize=10000,
               passes=1)
lda.print_topics(5)

# In[56]:

from gensim import corpora, models, similarities

tfidf = models.TfidfModel(corpus)

# In[57]:

corpus_tfidf = tfidf[corpus]

# In[58]:

lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=5)
Beispiel #28
0
def get_topic(corpus, dictionary):
    model = LdaModel(
        corpus=corpus, id2word=dictionary, num_topics=10
    )  # num_topics is chosen to match the topic count with the highest coherence value
    return model.print_topics()
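
# Added sketch (hedged, not from the original source): one way to find the "highest
# coherence value" mentioned above is a simple sweep over candidate topic counts;
# `texts` is assumed to be the tokenised documents behind `corpus` and `dictionary`,
# and LdaModel is assumed to be imported as in get_topic() above.
from gensim.models.coherencemodel import CoherenceModel

def best_num_topics(corpus, dictionary, texts, candidates=(5, 10, 15, 20)):
    scores = {}
    for k in candidates:
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k)
        cm = CoherenceModel(model=model, texts=texts,
                            dictionary=dictionary, coherence='c_v')
        scores[k] = cm.get_coherence()
    # pick the topic count with the highest c_v coherence
    return max(scores, key=scores.get), scores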
    # Creating the object for LDA model using gensim library & Training LDA model on the document term matrix.
    print("Running the LDA model")
    ldamodel = Lda(doc_term_matrix, num_topics=50, id2word = dictionary)#, passes=50, iterations=500)
    print("Saving the LDA model")

    filename = model_path + pf + '_lda_model.pkl' #/auto/vgapps-cstg02-vapps/analytics/csap/models/files/sr/
    if os.path.exists(filename):
        os.remove(filename)
        print("File Removed!")

    ldafile = open(filename,'wb')
    cPickle.dump(ldamodel, ldafile)
    print("File Created!")
    ldafile.close()

    topics_words = ldamodel.print_topics(num_topics=50, num_words = 200)
    c=0
    doc_topics=[]
    prob=[]
    for doc in doc_term_matrix:
        a = sorted(ldamodel[doc], key=lambda x: x[1])[-1]
        doc_topics.append(a[0])
        prob.append(a[1])
        c=c+1

    print("DataFrame created")
    asr_df["topic_number"] = doc_topics
    asr_df["topic_probability"] = prob

    #asr_df = asr_df.drop(['Unnamed: 0','Unnamed: 0.1'], axis=1)
    #asr_df.to_csv("SR_topic_classification.csv", encoding='utf-8')
# Code starts here
# stopwords list
stop = set(stopwords.words('english'))
# string punctuations
exclude = set(string.punctuation)
# lemmatizer
lemma = WordNetLemmatizer()
# convert headlines to list
headlines = data['headline_text'].tolist()
# cleaned data
clean_headlines = [clean(row).split() for row in headlines]
# Creating the term dictionary of our corpus, where every unique term is assigned an index
dictionary = corpora.Dictionary(clean_headlines)
# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
doc_term_matrix = [dictionary.doc2bow(doc) for doc in clean_headlines]
# build LDA model
lda_model = LdaModel(doc_term_matrix,
                     num_topics=5,
                     id2word=dictionary,
                     random_state=2,
                     iterations=10)
topics = lda_model.print_topics()
pprint.pprint(topics)
Beispiel #31
0
dictionary = corpora.Dictionary(resultlist)
corpus = [dictionary.doc2bow(text) for text in resultlist]

with open('mcdi_word.csv', 'rb') as f:
    reader = csv.reader(f)
    wlist = []
    for row in reader:
        wlist.append(row)

idlist = []

for row in wlist:
    idrow = []
    for key in dictionary.iteritems():
        if key[1].encode('utf-8') in row:
            idrow.append(key[0])
    idlist.append(idrow)

# seed the eta prior: boost the prior weight of each topic's known seed words
a = 0.05
ntopic = 75
eta_arr = ones((ntopic, len(dictionary))) * 0.5
for x in range(0, len(idlist)):
    for id in idlist[x]:
        eta_arr[x, id] *= 1000


# pass the seeded eta (and alpha) in so the priors built above are actually used
lda = LdaModel(id2word = dictionary, num_topics = ntopic, alpha = a, eta = eta_arr)
lda.update(corpus)
topiclist = lda.print_topics(num_topics = 75, num_words = 50)
lda.save('childs_file_75.model')
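
# Added check (hedged, not in the original): the effect of the boosted eta prior can be
# spot-checked after training by listing the top terms of a seeded topic, e.g. topic 0,
# and comparing them against the seed words in idlist[0].
top_terms_topic0 = [(dictionary[wid], round(prob, 4))
                    for wid, prob in lda.get_topic_terms(0, topn=20)]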
Beispiel #32
0
# LDA
from gensim.parsing.preprocessing import preprocess_string

tweets = tweets.text.apply(preprocess_string).tolist()

from gensim import corpora
from gensim.models.ldamodel import LdaModel

dictionary = corpora.Dictionary(tweets)
corpus = [dictionary.doc2bow(text) for text in tweets]

NUM_TOPICS = 5
ldamodel = LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=10)

ldamodel.print_topics(num_words=5)

from gensim.models.coherencemodel import CoherenceModel

def calculate_coherence_score(documents, dictionary, model):
    coherence_model = CoherenceModel(model=model, texts=documents, dictionary=dictionary, coherence='c_v')
    return coherence_model.get_coherence()

def get_coherence_values(start, stop):
    for num_topics in range(start, stop):
        print(f'\nCalculating coherence for {num_topics} topics')
        ldamodel = LdaModel(corpus, num_topics = num_topics, id2word=dictionary, passes=2)
        coherence = calculate_coherence_score(tweets, dictionary, ldamodel)
        yield coherence
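
# Added usage sketch (hedged): the generator above is never called in this snippet;
# one plausible use is to collect the scores and pick the topic count with the
# highest c_v coherence. The 5..15 range is arbitrary.
min_topics, max_topics = 5, 15
coherence_scores = list(get_coherence_values(min_topics, max_topics))
best_k = min_topics + coherence_scores.index(max(coherence_scores))
print(f'Best number of topics by c_v coherence: {best_k}')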

Beispiel #33
0
model_lda.do_estep(chunk, state=None)


# In[84]:


# print keywords in n topics
sorted(model_lda.show_topics(), key=lambda x: x[1])


# In[85]:


# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])


# In[87]:


# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[0])
Beispiel #34
0
dictionary = corpora.Dictionary(doc_clean)
print("len of dictionary2:{}".format(len(dictionary)))
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break
dictionary.filter_extremes(no_below=8, no_above=0.5, keep_n=50000)
path_dictionary = getcwd() + "/data/dictionary"
print("path of dictionary:{}".format(path_dictionary))
dictionary.save(path_dictionary)
print("len of dictionary2 after filter:{}".format(len(dictionary)))

doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]
ldamodel = LdaModel(doc_term_matrix,
                    num_topics=NUM_TOPIC,
                    id2word=dictionary,
                    passes=NUM_PASS)
print(ldamodel.print_topics(num_topics=NUM_TOPIC, num_words=NUM_WORDS))
# Save model to disk.
path_ldamodel = getcwd() + "/data/ldamodel"
print("path of ldamodel:{}".format(path_ldamodel))
# with open("./data/model", 'w') as f:
#     ldamodel.save(f)
ldamodel.save(path_ldamodel)
# Load a potentially pretrained model from disk.
lda_load = LdaModel.load(path_ldamodel)
unseen_doc = dictionary.doc2bow(doc_clean[-1])
vector = lda_load[unseen_doc]
print(vector)
Beispiel #35
0
from gensim import corpora, models
from gensim.models.ldamodel import LdaModel
import pymongo

# put top k word here
top_k_words = [['human', 'graph', 'computer']]

dictionary =  corpora.Dictionary(top_k_words)

print dictionary.token2id

class MyCorpus(object):
	def __iter__(self):
		# change this to fetch documents from MongoDB (a sketch follows below)
		for line in open('mycorpus.txt'):
			yield dictionary.doc2bow(line.lower().split())

corpus = MyCorpus()

lda = LdaModel(corpus, num_topics = 2, id2word = dictionary)

print lda.print_topics(2)
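
# Added sketch (hedged): the comment in MyCorpus says the documents should eventually
# come from MongoDB; using the pymongo import above, a streaming corpus for that might
# look like this. The database/collection/field names ('mydb', 'docs', 'text') are made up.
class MongoCorpus(object):
	def __iter__(self):
		client = pymongo.MongoClient()
		for doc in client['mydb']['docs'].find():
			# stream one bag-of-words vector per stored document
			yield dictionary.doc2bow(doc['text'].lower().split())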



def program_clusters(pgms,n_topics,data):
    #First we need to filter the data by program code. Some grants have multiple program
    #codes, so we first filter through to determine which cells contain the program code
    #then we replace the existing program code(s) with the provided one. This ensures there
    #is only one code per award.
    awds = data
    awds = awds[awds['ProgramElementCode(s)'].str.contains('|'.join(pgms))]
    for x in pgms:
        awds['ProgramElementCode(s)'] = np.where(awds['ProgramElementCode(s)'].str.contains(x), x, awds['ProgramElementCode(s)'] )
        
    abstracts = awds[['ProgramElementCode(s)', 'AwardNumber','Abstract']].copy()
    #This is a pretty clean data set, but there are some empty entries, so we
    #filter them out here
    abstracts = abstracts.dropna()
    
    #Here we start building our dictionary and creating the cleaned-up corpus.
    #We start by removing stop words and punctuation, and stemming or lemmatizing
    #the abstract text
    stop    = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma   = WordNetLemmatizer()
    stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
    # pass the article text as string "doc"
    
    #Here we use a small nested function to pass through each abstract individually
    def clean(doc):
        #here we clean up errant breaks like <br/>
        doc = re.sub('<.*?>', ' ', doc)
        #This creates a long string
        #of words while excluding stop words
        stop_free  = " ".join([i for i in doc.lower().split() if i not in stop])
        #This goes through each character and removes punctuation
        punct_free  = ''.join(ch for ch in stop_free if ch not in exclude)
        words   = punct_free.split()
        return words
        
    
    #Here is where we pass each abstract through the cleaning function
    abstracts['clean_abstracts'] = [clean(doc) for doc in abstracts['Abstract']]
    
    # So we can use bigrams and trigrams, we create new models, running through our
    #cleaned abstracts
    bigram = Phrases(list(abstracts['clean_abstracts']), min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram =Phrases(bigram[list(abstracts['clean_abstracts'])], threshold=100)  
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)  
    
    #This function applies the bigram and trigram models, lemmatizes the
    #abstracts, and only keeps words that are longer than 2 characters
    def word_mod(doc):
        bigs = bigram_mod[doc]
        tris = trigram_mod[bigs]
        lemm = " ".join(lemma.lemmatize(word) for word in tris)
        #stemm    = " ".join(stemmer2.stem(word) for word in punct_free.split())
        words = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in words if len(word) > 2]
        return cleaned
    abstracts['clean_abstracts'] = [word_mod(doc) for doc in abstracts['clean_abstracts']]  
    
    
    # Here we create the dictionary from the corpus of abstracts, where each unique term is assigned an index. 
    dictionary = corpora.Dictionary(abstracts['clean_abstracts'])
    # Filter out terms which occur in fewer than 4 abstracts or in more than 40% of the abstracts 
    dictionary.filter_extremes(no_below=4, no_above=0.4)
    #This creates a sparse matrix of word frequencies in each abstracts
    abstract_term_matrix = [dictionary.doc2bow(doc) for doc in abstracts['clean_abstracts']]   
    
    # Here we create and train the LDA model, passing in our term frequency matrix, the number of
    #topics/clusters to be created, and our dictionary
    ldamodel = Lda(abstract_term_matrix, num_topics= n_topics, id2word = dictionary, passes=15, iterations=500)
              
    # Here we print out the top 10 words for each topic and their weight
    for i,topic in enumerate(ldamodel.print_topics(num_topics=10, num_words=10)):
       words = topic[1].split("+")
       print (words,"\n")
     
     #Next we want to know which topic each abstract belongs to, so we pass each abstract
     #into the get_document_topics method; it returns the topics and the
     #probability of the abstract belonging to each of them. We take the one
     #with the highest probability
    def pred_topic(doc):
        doc_bow = ldamodel.id2word.doc2bow(doc)
        doc_topics = ldamodel.get_document_topics(doc_bow, minimum_probability=0.20)  
        if doc_topics:
            doc_topics.sort(key = operator.itemgetter(1), reverse=True)
            theme = doc_topics[0][0]
        else:
            theme = np.nan
        return theme

    abstracts['predicted topic'] = [pred_topic(doc) for doc in abstracts['clean_abstracts']]
    
    #Here we do a histogram of how many abstracts/awards fall into each topic
    ab_hist = abstracts.groupby(['predicted topic','ProgramElementCode(s)'])['AwardNumber'].count()
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] 
    f1, ax  = plt.subplots()
    ab_hist.plot.bar(rot = 0, color = cols )
    ax.set_xticklabels([x[0] for x in ab_hist.index])
    ax.set_xlabel('Topic Number')
    ax.set_ylabel('Count of Awards in Topic')
    ax.set_title('Distribution of Awards in Derived Topic Areas')
    plt.show()
    
    #Here we create a word cloud of the top words in each topic. Their size 
    #is indicative of their weight.
    cloud = WordCloud(stopwords=stopwords.words('english'),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)
    
    topics = ldamodel.show_topics(formatted=False)
    fig, axes = plt.subplots(1, n_topics, figsize=(10,10), sharex=True, sharey=True)
    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')   
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()
    
        
    #Next we'll do a t-SNE plot clustering the abstracts based off the topic
    #probabilities returned from the model. This creates an array where each
    #column is a topic and each row is an abstract and each entry is the probability
    #that the abstract belongs to that topic.
    col_ns = range(0,n_topics)
    topic_weights = pd.DataFrame(columns = col_ns)
    doc_topic_dists = ldamodel[abstract_term_matrix]
    for i in range(0, len(doc_topic_dists)):
        weights = doc_topic_dists[i]
        # build one row per abstract, with a column for every topic it loads on
        entry = pd.DataFrame(columns = col_ns)
        for j in range(0, len(weights)):
           idx = weights[j][0]
           entry.loc[0,idx] = weights[j][1]
        topic_weights = topic_weights.append(entry)
    topic_weights.reset_index(drop = True, inplace = True)
    
    # Replace any nan entries (because there was zero probability the 
    #abstract belonged in that topic) with zero
    arr = pd.DataFrame(topic_weights).fillna(0).values
    
    # We can limit this to only well separated abstracts as well
    #arr = arr[np.amax(arr, axis=1) > 0.15]
    
    # This pulls out the highest-probability topic for each abstract.  We'll
    #use this for the color scheme in the t-SNE plot.
    topic_num = np.argmax(arr, axis=1)
    
    # Here we initialize and fit our t-SNE model
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)
    
    #Here we plot out the results for the t-SNE transformation
      
    mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
    title ="t-SNE Clustering of {} LDA Topics".format(n_topics)
    f = plt.figure()
    plt.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
    plt.title(title)
    plt.show()
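
# Added usage sketch (hedged): the program codes, topic count, and input file below are
# hypothetical; program_clusters() expects an NSF-awards style dataframe with the
# 'ProgramElementCode(s)', 'AwardNumber' and 'Abstract' columns used above.
if __name__ == '__main__':
    awards_df = pd.read_csv('Awards.csv')             # hypothetical export path
    program_clusters(['7231', '1640'], 5, awards_df)  # hypothetical program codes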
Beispiel #37
0
from util.TextSimilarity import TextSimilarity
from util.TaskReader import TaskReader

from gensim.corpora import Dictionary
from gensim.models.ldamodel import LdaModel

print("LDA Output: ")

first_num = 244

task = TaskReader.read("text.txt")
similarity = TextSimilarity('french')
doc_set = similarity.get_modified_text(task.text)
edu_set = similarity.get_modified_text(task.education)

dictionary = Dictionary([[x for x in i.split()] for i in edu_set])
for i in range(0, len(doc_set)):
    num = i + first_num
    corp = [x for x in doc_set[i].split()]
    corpus = [dictionary.doc2bow(corp)]
    ldamodel = LdaModel(corpus, num_topics=1, id2word=dictionary, passes=50)
    [print("Topic № " + str(num) + " : " + x[1]) for x in ldamodel.print_topics(num_topics=1, num_words=6)]
Beispiel #38
0
for n in NoStop:
    Dummy = []
    Dummy = [wordnet_lemmatizer.lemmatize(t) for t in n]
    Lemma.append(Dummy)

#print(' lemma : ', Lemma, '  ::  ', type(Lemma))

# Create a Dictionary from the articles: dictionary
dictionary = Dictionary(Lemma)

#print(' dict : ', dictionary)

# Create a MmCorpus: corpus
corpus = [dictionary.doc2bow(article) for article in Lemma]

# Print the first 10 word ids with their frequency counts from the fifth document
print(corpus, '  LEN  :  ', len(corpus))

num_topics = 5
num_words = 8
passes = 20

lda = LdaModel(corpus,
               id2word=dictionary,
               num_topics=num_topics,
               passes=passes)

pp = pprint.PrettyPrinter(indent=4)

pp.pprint(lda.print_topics(num_words=num_words))