def main():
    collection_name = "nips"
    years = xrange(2010, 2015)  # 10 ~ 14
    n_topics = 10
    
    corpus_paths = map(lambda y: 
                       "data/{}-{}.dat".format(collection_name, y),
                       years)
    all_corpus = []
    year2corpus = {}
    for year, path in zip(years, corpus_paths):
        corpus = list(load_line_corpus(path))
        all_corpus.append(proc_corpus(corpus))
        year2corpus[year] = corpus

    all_corpus = list(itertools.chain.from_iterable(all_corpus))

    dictionary = Dictionary(all_corpus)
    all_corpus = [dictionary.doc2bow(doc)
                  for doc in all_corpus]

    model = LdaModel(all_corpus, num_topics=n_topics,
                     id2word=dictionary,
                     eval_every=10, passes=100)
    print model.show_topics()
def draw_cluster_key_word(cluster: list):
    """
    抽取一个聚类的关键词
    :param cluster: list of tuple(7),问题二中聚类得到的簇
    :return: list of words,关键词列表
    """
    stop = fetch_default_stop_words()  # 停用词表
    stop.extend(["", " ", "\n", "\t", "*"])  # 附加几个停用词

    sents = [
        jieba.lcut(row[2] + "。" + row[4], cut_all=True) for row in cluster
    ]  # word segmentation
    sents = [[word for word in sent if word not in stop]
             for sent in sents]  # remove stop words

    dictionary = corpora.Dictionary(sents)  # build the dictionary
    doc_term_matrix = [dictionary.doc2bow(doc) for doc in sents]  # document-term frequency matrix

    # train the LDA model
    lda_model = LdaModel(doc_term_matrix,
                         num_topics=1,
                         id2word=dictionary,
                         passes=1)

    # extract the 6 highest-probability words from the topic
    key_words = [
        word
        for index, word in enumerate(lda_model.show_topics()[0][1].split("\""))
        if index in [1, 3, 5, 7, 9, 11]
    ]
    return key_words
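The keyword extraction above parses the formatted string returned by show_topics() with hard-coded indices, which is fragile. A hedged alternative sketch (assuming the same fitted lda_model; the function name draw_cluster_key_word_alt is hypothetical) reads the top words through show_topic(), which already returns (word, probability) pairs:

def draw_cluster_key_word_alt(lda_model, topn=6):
    # show_topic(topicid, topn) returns [(word, probability), ...], so no string parsing is needed
    return [word for word, _prob in lda_model.show_topic(0, topn=topn)]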
Example #3
def train_model(texts, **kwargs):

  # parse args
  filter_stopwords = kwargs.get('filter_stopwords', True)
  normalizer = kwargs.get('normalizer', 'porter')
  tfidf = kwargs.get('tfidf', True)
  num_topics = kwargs.get('num_topics', 20)
  min_freq = kwargs.get('min_freq', 2)
  use_pickle = kwargs.get('use_pickle', True)
  update_pickle = kwargs.get('update_pickle', True)
  report = kwargs.get('report', True)
  distributed = kwargs.get('distributed', False)
  
  # build corpus or read it in from pickle
  if use_pickle:
    print "INFO: loading pickled corpus and word hash"
    corpus = pickle.load( open( "pickles/corpus.p", "rb" ) )
    id2word = pickle.load( open( "pickles/id2word.p", "rb" ) )
            
  else:
    print "INFO: processing text and building corpus..."
    corpus, id2word = process_texts(
      texts = texts, 
      filter_stopwords = filter_stopwords,
      normalizer = normalizer,
      min_freq = min_freq
    )

    if update_pickle:
      # pickle files
      print "INFO: updating pickled coprus and word hash"
      pickle.dump(corpus, open( "pickles/corpus.p", "wb" ) )
      pickle.dump(id2word, open( "pickles/id2word.p", "wb" ) )

  # optional tfidf transformation
  if tfidf:
    print "INFO: applying tfidf transformation..."
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]

  # fit model
  print "INFO: fitting model..."
  lda = LdaModel(
    corpus = corpus, 
    id2word = id2word, 
    num_topics = num_topics,
    distributed = distributed
  )

  # report
  if report:
    perplexity = lda.bound(corpus)
    print "RESULTS:"
    print "\nperplexity: ", perplexity, "\n"
    topics = lda.show_topics(num_topics)
    for i, t in enumerate(topics):
      print "topic %d:" % i
      print t

  return lda, corpus, id2word
Example #4
def gensim_lda(d):
    from gensim import corpora, models
    from gensim.models.ldamodel import LdaModel
    list_doc = []
    for i in range(0,len(d)):
        list_doc = list_doc + d[i]

    dictionary = corpora.Dictionary(list_doc)
    model = LdaModel(num_topics = 20, id2word = dictionary)
    for i in range(0, len(d)):
        print 'Generating corpus and updating model ', i
        corpus = [dictionary.doc2bow(doc) for doc in d[i]]
        model.update(corpus)

    model.save('model_20')
    print model.show_topics(num_topics = 20, num_words = 10)
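This example trains incrementally: the model is created without a corpus and update() is called once per batch before saving. A small follow-up sketch, assuming the 'model_20' file written above, reloads the saved model for later inspection:

from gensim.models.ldamodel import LdaModel

model = LdaModel.load('model_20')  # reload the incrementally trained model
print(model.show_topics(num_topics=20, num_words=10))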
Example #5
 def find_topic(self,condition=None,n_topics=10,n_words=10,topic_model='lda',vec_model='tf',show=True,**kwargs):
     '''Topic model. Compared with the function above, this one should be preferred.
     parameter
     ---------
     condition: boolean mask over the corpus, e.g. to run topic decomposition only on positive/negative reviews
     n_topics: number of topics
     n_words: number of words returned per topic
     vec_model: vectorization method, defaults to tf
     '''
     if condition is not None:
         texts=self.texts_seg[condition]
     else:
         texts=self.texts_seg
     if topic_model in ['lda','LDA']:
         dictionary = corpora.Dictionary([doc.split(' ') for doc in texts])
         corpus = [dictionary.doc2bow(text.split(' ')) for text in texts]
         if vec_model in ['idf','tfidf']:
             tfidf = models.TfidfModel(corpus)
             corpus = tfidf[corpus]
         lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
         topics_keywords=lda.show_topics(num_topics=n_topics, num_words=n_words,formatted=False)
         if show:
             print('\n'.join(['Topic {}: {}'.format(i,' | '.join([k[0] for k in \
             topic[1]])) for i,topic in enumerate(topics_keywords)]))
         return topics_keywords
Example #6
class LDA:
	def __init__(self):
		print("Initializing topic extractor")
		self.topics = []
	"""
	This method takes list of documents in string format and returns a list of tokens
	"""
	def __tokenize(self, docs):
		output = []
		for doc in docs:
			tokenizer = RegexpTokenizer(r'\w\w\w\w\w+')
			output.append(tokenizer.tokenize(doc.lower()))
		return output


	"""
	This method takes list of words and identifies stop words and removes them from the list
	"""
	def __remove_stop_words(self, docs):
		output = []
		for doc in docs:
			en_stop = get_stop_words('en')
			stopped_tokens = [i for i in doc if not i in en_stop]
			output.append(stopped_tokens)
		return output


	"""
	This method takes words in each document and returns its corresponding base word
	"""
	def __lemmatizer(self, docs):
		output = []
		for doc in docs:
			stemmer = PorterStemmer()
			texts = [stemmer.stem(i) for i in doc]
			output.append(texts)
		return output


	"""
	This method takes each lemmatized text and generates a document-term matrix
	"""
	def __dt_matrix(self, terms):
		gen_dict = corpora.Dictionary(terms)
		corpus = [gen_dict.doc2bow(term) for term in terms]
		return [corpus, gen_dict]


	def get_topic(self, doc_set):
		# compile sample documents into a list
		o1 = self.__tokenize(doc_set)
		o2 = self.__remove_stop_words(o1)
		#o3 = self.__lemmatizer(o2)
		o4 = self.__dt_matrix(o2)
		
		self.topics = LdaModel(o4[0], num_topics=1, id2word=o4[1], passes=50)
		output = self.topics.show_topics(num_topics=1, num_words=3, log=False, formatted=True)
		return [x.split("*")[1].replace('"', '') for x in output[0][1].split("+")]
Example #7
class topicExtract_lda:
    
    def __init__(self,docs,nTopic = 20):
        """
            extract topic using LDA
            input:
                list of list of words, each list is a token of a doc
        """
        for i,idoc in enumerate(docs):
            if isinstance(idoc, str):
                docs[i] = word_tokenize(idoc)
        self.wordDict = Dictionary(docs)
        self.corpus_docs = [self.wordDict.doc2bow(doc) for doc in docs]
        corpus_csc = corpus2csc(self.corpus_docs)
        #tfidf_model = models.TfidfModel(self.corpus_docs)
        #tfidf_corpus = tfidf_model[self.corpus_docs]
        self.nmf = NMF(n_components = nTopic, random_state = 42)
        self.W = self.nmf.fit_transform(corpus_csc)
        
        self.topics = {'Topic '+ str(i):' '.join(list(self.get_topic_words(i)[1].values())) for i in range(nTopic)}
        self.lda2 = LdaModel(corpus=self.corpus_docs, id2word=self.wordDict, num_topics=nTopic, update_every=1, chunksize=1000, passes=4, random_state = 24)
        #self.lda2.show_topics(num_topics=-1, num_words=4)
        
            
    
    def get_topic_words(self, component_number):
        """
            NMF topics with a gensim corpus represented by component vectors
        """
        sorted_idx = np.argsort(self.W[:,component_number])[::-1][:5]
        component_words = {self.W[:, component_number][number]:self.wordDict[number] for number in sorted_idx[:5]}
        return sorted_idx, component_words
    
    def get_doc_components(self, doc_number):
        sorted_idx = np.argsort(self.nmf.components_[:,doc_number])[::-1][0:3]
        result = {number: self.nmf.components_[:,doc_number][number] for number in sorted_idx}
        return result
    
    def get_document_details(self, doc_number):
        results = []
        for item, val in self.get_doc_components(doc_number).items():
            print("document is composed of topic %d with weight %.4f" % (item, val))
            result = self.get_topic_words(item)[1]
            results.append(result)
        return results
    
    def show_lda(self, doc_num, threshold = 0.05, nWord = 5):
        topic_list = []
        for topic, weight in self.lda2[self.corpus_docs[doc_num]]:
            if weight > threshold:
                topic_list.append({(topic, weight):self.lda2.show_topic(topic, topn = nWord)})
        return topic_list    

    def showTopics(self, nWord= 4):
        output = self.lda2.show_topics(num_topics=-1, num_words = nWord)
        for i in output:
            print(i)
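A brief usage sketch for the class above, assuming docs is a list of tokenized documents (raw strings are also tokenized in __init__); the variable names here are illustrative only:

te = topicExtract_lda(docs, nTopic=20)
te.showTopics(nWord=5)                  # LDA topics
print(te.get_topic_words(0))            # NMF topic 0: top word indices and weights
print(te.show_lda(0, threshold=0.05))   # LDA topic mixture for the first document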
Example #8
def train_model(texts, **kwargs):

    # parse args
    filter_stopwords = kwargs.get('filter_stopwords', True)
    normalizer = kwargs.get('normalizer', 'porter')
    tfidf = kwargs.get('tfidf', True)
    num_topics = kwargs.get('num_topics', 20)
    min_freq = kwargs.get('min_freq', 2)
    use_pickle = kwargs.get('use_pickle', True)
    update_pickle = kwargs.get('update_pickle', True)
    report = kwargs.get('report', True)
    distributed = kwargs.get('distributed', False)

    # build corpus or read it in from pickle
    if use_pickle:
        print "INFO: loading pickled corpus and word hash"
        corpus = pickle.load(open("pickles/corpus.p", "rb"))
        id2word = pickle.load(open("pickles/id2word.p", "rb"))

    else:
        print "INFO: processing text and building corpus..."
        corpus, id2word = process_texts(texts=texts,
                                        filter_stopwords=filter_stopwords,
                                        normalizer=normalizer,
                                        min_freq=min_freq)

        if update_pickle:
            # pickle files
            print "INFO: updating pickled coprus and word hash"
            pickle.dump(corpus, open("pickles/corpus.p", "wb"))
            pickle.dump(id2word, open("pickles/id2word.p", "wb"))

    # optional tfidf transformation
    if tfidf:
        print "INFO: applying tfidf transformation..."
        tfidf = TfidfModel(corpus)
        corpus = tfidf[corpus]

    # fit model
    print "INFO: fitting model..."
    lda = LdaModel(corpus=corpus,
                   id2word=id2word,
                   num_topics=num_topics,
                   distributed=distributed)

    # report
    if report:
        perplexity = lda.bound(corpus)
        print "RESULTS:"
        print "\nperplexity: ", perplexity, "\n"
        topics = lda.show_topics(num_topics)
        for i, t in enumerate(topics):
            print "topic %d:" % i
            print t

    return lda, corpus, id2word
Example #9
def main(argv):
    if len(argv) < 4:
        print 'python train_lda.py group_id num_topics passes'
        sys.exit(1)
        
    group_id = argv[1]
    num_topics = int(argv[2])
    passes = int(argv[3])
    log.info('Prepare corpus for group: %s' % group_id)

    base_path = 'tables/' + group_id + '/'
    model_base_path = 'ldamodels/' + group_id + '/'
    
    # buid dict and corpus
    #now = datetime.now()
    indicator = 'title-comment'
    source_path = base_path + 'corpus-topic-comment'
    
    corpus_path = model_base_path + 'corpus-'+ indicator + '-' + group_id + '.mm'
    dict_path = model_base_path + 'dict-' + indicator + '-' + group_id + '.dict'
    
    log.info('Building the dict...')
    build_dict_corpus(source_path, corpus_path, dict_path)
    
    log.info('Loading dict from pre-saved file...')
    dictionary = corpora.Dictionary.load(dict_path)
    log.info('Done')
    
    #dictionary.save_as_text(base_path + 'text-dict.txt')
    
    log.info('Build a lda model...')
    log.info('Loading corpus from pre-saved .mm file...')
    mmcorpus = corpora.MmCorpus(corpus_path)
    log.info('Done')
    
    log.info('Training lda model...')
    model = LdaModel(mmcorpus, num_topics=num_topics, id2word = dictionary, passes = passes)
    model_path = model_base_path + indicator + '-' + group_id + '.ldamodel'
    model.save(model_path)
    log.info('Done.')
    
    model = LdaModel.load(model_path)
    model.show_topics(topics=num_topics, topn=10, log=True)
def gensim_lda(texts, n_topics=10, n_words=10, vec_model='tf'):
    dictionary = corpora.Dictionary([doc.split(' ') for doc in texts])
    corpus = [dictionary.doc2bow(text.split(' ')) for text in texts]
    if vec_model in ['idf', 'tfidf']:
        tfidf = models.TfidfModel(corpus)
        corpus = tfidf[corpus]
    lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=n_topics)
    topics_keywords = lda.show_topics(num_topics=n_topics,
                                      num_words=n_words,
                                      formatted=False)
    print('\n'.join([
        'Topic {}: {}'.format(i, ' | '.join([k[0] for k in topic[1]]))
        for i, topic in enumerate(topics_keywords)
    ]))
    return topics_keywords
    def compute_coherence_values(self, kmin, kmax, kstep):
        """
        Compute c_v coherence for various number of topics

        Parameters:
        ----------
        kmin : The minimum number of topics
        kmax : Max num of topics
        kstep : The step size of the topics

        Returns:
        -------
        k_values: The number of topics used. 
        coherence_values : Coherence values corresponding to the LDA model with respective number of topics
        topic_list: The list of topics. 
        """
        dictionary = Dictionary(self.docs)
        dictionary.filter_extremes(no_below=10, no_above=0.2)
        corpus = [dictionary.doc2bow(doc) for doc in self.docs]

        k_values = []
        coherence_values = []
        topic_list = []
        for num_topics in range(kmin, kmax + 1, kstep):
            # The following print line is so that you can visually see it go and don't freak out
            print("num_topics:\t" + str(num_topics))
            model = LdaModel(corpus=corpus,
                             id2word=dictionary,
                             num_topics=num_topics)
            coherencemodel = CoherenceModel(model=model,
                                            texts=self.docs,
                                            dictionary=dictionary,
                                            coherence='c_v')
            coherence_lda = coherencemodel.get_coherence()
            coherence_values.append(coherence_lda)
            topic_list.append(
                model.show_topics(num_topics=num_topics,
                                  num_words=20,
                                  log=False,
                                  formatted=True))
            k_values.append(num_topics)
        return k_values, coherence_values, topic_list
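A typical way to use the returned values, sketched here under the assumption of an instance called tm with self.docs already populated, is to plot coherence against the number of topics and pick the best k:

import matplotlib.pyplot as plt

k_values, coherence_values, topic_list = tm.compute_coherence_values(kmin=5, kmax=40, kstep=5)
plt.plot(k_values, coherence_values, marker="o")
plt.xlabel("num_topics")
plt.ylabel("c_v coherence")
plt.show()

best_k = k_values[coherence_values.index(max(coherence_values))]  # k with the highest coherence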
Example #12
def topic_classification_gensim_train(filename_1, topic_number, top_idf_number):
    """use gensim to perform lda algorithm"""
    common_texts = process_doc(filename_1, top_idf_number)
    common_dictionary = Dictionary(common_texts)
    common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]
    lda = LdaModel(common_corpus, id2word=common_dictionary, iterations=50, num_topics=topic_number,
                   random_state=np.random.RandomState(23455))
    for index, topic in lda.show_topics(formatted=False, num_words=20, num_topics=topic_number):
        print('Topic: {} \nWords: {}'.format(index, [w[0] for w in topic]))
    # print the topic and words
    topic_2 = [0.00 for n in range(topic_number)]
    for seen_doc in common_corpus:
        vector_1 = lda[seen_doc]
        for vec in vector_1:
            topic_2[vec[0]] = topic_2[vec[0]]+vec[1]
        # find the distribution of each topic.
    topic_2 = np.array(topic_2) / np.linalg.norm(topic_2)
    print(filename_1+" word distribution:")
    print(topic_2)
    return topic_2, lda, common_dictionary
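A hedged follow-up sketch showing how the returned model and dictionary might be used to score an unseen document (the filename and parameter values are placeholders):

topic_dist, lda, common_dictionary = topic_classification_gensim_train("docs.txt", 10, 100)
new_bow = common_dictionary.doc2bow(["topic", "model", "inference"])
print(lda.get_document_topics(new_bow))  # [(topic_id, probability), ...]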
Example #13
def lda(documents):

    texts = []
    idx = 0
    for file_docs in documents:
        document = file_docs["documents"]
        words = get_words(document["content"])
        if document["claim"]:
            words = words + get_words(document["claim"])
        stemmer = EnglishStemmer()
        words = [stemmer.stem(word) for word in words]
        texts.append(words)

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    ldamodel = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=20)
    print(ldamodel.show_topics())
    data = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
    pyLDAvis.save_html(data, 'vis.html')
Example #14
def LDA(corpus, dictionary, numTopics):
	"""
	Performs LDA on a list of abstracts represented as a gensim corpus. 
	Returns both the fitted topics and the assigned topic for each abstract
	"""
	print("Fitting LDA model...")
	lda = LdaModel(corpus, num_topics=numTopics, id2word=dictionary)
	# retrieve assigned topics for abstracts in corpus
	lda_corpus = lda[corpus]
	# iterate over each assignment and choose dominant topic 
	assigned_topics = []
	for assignment in lda_corpus:
		# if assigned only one topic, retrieve that topic
		if len(assignment) == 1:
			assigned_topics.append(assignment.pop()[0])
		# if assigned multiple topics, retrieve topic with highest probability
		else:
			main_topic = max(assignment, key=lambda item:item[1])
			assigned_topics.append(main_topic[0])
	return(lda.show_topics(num_topics=-1), assigned_topics)
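A minimal usage sketch for the function above; the two toy token lists stand in for real tokenized abstracts:

from gensim.corpora import Dictionary

abstract_tokens = [["topic", "model", "inference"], ["neural", "network", "training"]]
dictionary = Dictionary(abstract_tokens)
corpus = [dictionary.doc2bow(tokens) for tokens in abstract_tokens]
topics, assigned = LDA(corpus, dictionary, numTopics=2)
print(assigned)  # dominant topic id per abstract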
Example #15
    def topics_by_lda(self, num_topics=20, num_words=10):
        """
        利用 LDA 模型进行语料集分析

        Arguments:
        num_topics -> integer -- 既定的主题数目
        num_words -> integer -- 最终返回的单主题词数目
        """

        # fetch the data if launched from the command line
        if not hasattr(self, "data"):
            logging.info("Dataset not prepared yet, preparing it now!")
            self.fetch_data()

        # build count vectors for the corpus
        vec = CountVectorizer(min_df=10, max_df=80, stop_words='english')

        # vectorize the training data
        X = vec.fit_transform(self.data['train'].data)

        # get the vocabulary
        vocab = vec.get_feature_names()

        # build the LDA model
        lda = LdaModel(matutils.Sparse2Corpus(X, documents_columns=False),
                       num_topics=num_topics,
                       id2word=dict([(i, s) for i, s in enumerate(vocab)]))

        # print and return the topic data
        topics = lda.show_topics(num_topics=num_topics,
                                 num_words=num_words,
                                 formatted=False,
                                 log=False)

        for ti, topic in enumerate(topics):
            print("Topic", ti, ":", " ".join(word[0] for word in topic[1]))

        if __name__ != '__main__':
            return topics
Example #16
def topic_list(text):
    print 'Topic modeling...'

    tokenizer = RegexpTokenizer('\w+')
    document = []
    for token in tokenizer.tokenize(text):
        word = token.lower()
        if word not in stop_words:
            document.append(word)
    documents = [document]

    dic = Dictionary(documents)
    corpus = [dic.doc2bow(doc) for doc in documents]

    lda = LdaModel(corpus, num_topics=5)

    topics = [
        dic[int(id)] for topic in lda.show_topics(formatted=False)
        for prob, id in topic
    ][:5]

    print topics
    return topics
Example #17
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logger = logging.getLogger('LDA_model_builder')
################################################################################################################################################
logger.info('building word_id_map...')
word_id_map = build_word_id_map([trainPosts_loc, testPosts_loc])
pickle(word_id_map, 'word_id_map')
normalize_content_stats()

train_and_test_corpus = MyCorpus([trainPosts_loc, testPosts_loc], word_id_map)
logger.info('training LDA model...')
#id2word is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.
lda = LdaModel(train_and_test_corpus, id2word=word_id_map, num_topics=topic_count, update_every=1, chunksize=10000, passes=1)
pickle(lda, 'lda')

#Print the 'topn' most probable words for (randomly selected) 'topics' number of topics. Set topics=-1 to print all topics.
lda.show_topics(topics=topic_count, topn=10)
################################################################################################################################################
#key = blog + '_' + post_id
#value = a list of (topic_id, topic_probability) 2-tuples
blog_topic_distribution_map = {}

#key = uid (user id)
#value = list of (blog, post_id) tuples
train_user_likes_map = defaultdict(list)

#key = blog
#value = list of post_ids
test_blog_post_map = defaultdict(list)

logger.info('starting LDA prediction for training data...')
for blog, post_id, likes, blog_content_as_list_of_words in MyFilesIterator([trainPosts_loc]).iterate_fields():
Example #18
if len(sys.argv) != 2:
    print 'Usage: {0} rcv1_data_dir'.format(sys.argv[0])
    raise SystemExit(1)

data_dir = sys.argv[1]
mapping_file = data_dir+'/token_id_idf'
dictionary_file = data_dir+'/id_token_df'
token_file = data_dir+'/tokens'
lda_file = data_dir+'/lda_model'

print 'creating dictionary...'
N = 23307  # supplied idfs from rcv1/lyrl2004 were based on 23307 training docs
create_dictionary_file(mapping_file,dictionary_file,23307)
dictionary = Dictionary.load_from_text(dictionary_file)

print 'creating corpus...'
corpus = SimpleLowCorpus(token_file,dictionary)

print 'training model...'
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
lda = LdaModel(corpus,id2word=dictionary,num_topics=200)
print 'done!'
print '\n'*3
print '======final topics======'
topics = lda.show_topics(topics=-1,topn=4)
for i,topic in enumerate(topics):
    print i,topic

print 'saving model...'
lda.save(lda_file)
Example #19
            output_file = output_file_template.format(run_id=run_id)

            # Train and save
            print 'Training...'
            model = LdaModel(corpus,
                             alpha=alpha,
                             eta=beta,
                             id2word=dictionary,
                             num_topics=num_topics,
                             iterations=num_iterations)
            # model = LdaMulticore(corpus,
            # 	alpha=alpha, eta=beta,
            # 	id2word=dictionary, num_topics=num_topics, iterations=num_iterations, workers=2
            # )
            print 'Done training.'
            model.save(output_file)

            # Print top 10 words in topics, if desired
            if print_topics:
                topics = model.show_topics(num_topics=100, formatted=False)
                for topic in topics:
                    for tup in topic[1]:
                        print tup[0] + ": " + str(tup[1])
                    print '\n'

            # Evaluate perplexity
            ll = model.log_perplexity(test_corpus)
            print "LL:   " + str(ll)
            print "Perp: " + str(np.exp2(-ll))
Example #20
    def fit(self):
        # load the IT stop word list
        stopword = StopWord("./stopwords_it.txt")

        # load the corpus (from seg_join/corpus.txt)
        print "reading corpus"
        corpus_name = "corpus.dat"
        if not os.path.exists(corpus_name):
            with open(self.proj_name + "/seg_join/corpus.txt",
                      "r") as corpus_file:
                for line in corpus_file:
                    words = line.split()
                    words = [
                        word for word in words
                        if not stopword.is_stop_word(word)
                    ]
                    self.corpus.append(words)
            # Dumper.save(self.corpus, corpus_name)
        else:
            self.corpus = Dumper.load(corpus_name)
        self.doc_num = len(self.corpus)

        # build the dictionary for the documents; each word maps to an integer index
        print "creating dictionary"
        id2word_name = "id2word.dat"
        if not os.path.exists(id2word_name):
            self.id2word = corpora.Dictionary(self.corpus)
            # Dumper.save(self.id2word, id2word_name)
        else:
            self.id2word = Dumper.load(id2word_name)

        # remove low-frequency words
        # ignore words that appear in less than 20 documents or more than 10% documents
        # id2word.filter_extremes(no_below=20, no_above=0.1)

        # count word frequencies and convert each doc to a bag-of-words vector
        print "transforming doc to vector"
        corpus_bow_name = "corpus_bow.dat"
        if not os.path.exists(corpus_bow_name):
            self.corpus_bow = [
                self.id2word.doc2bow(doc) for doc in self.corpus
            ]
            # Dumper.save(self.corpus_bow, corpus_bow_name)
        else:
            self.corpus_bow = Dumper.load(corpus_bow_name)

        # train the LDA model
        print "training lda model"
        lda_model_name = "lda_models/lda.dat"
        if not os.path.exists(lda_model_name):
            lda = LdaModel(corpus=self.corpus_bow,
                           id2word=self.id2word,
                           num_topics=self.topic_num,
                           alpha='auto')
            Dumper.save(lda, lda_model_name)
        else:
            lda = Dumper.load(lda_model_name)

        # give each topic a name
        topics = lda.show_topics(num_topics=self.topic_num,
                                 num_words=2,
                                 log=False,
                                 formatted=False)
        topic_names = [
            topic[1][0][0] + "+" + topic[1][1][0] for topic in topics
        ]
        for i, topic_name in enumerate(topic_names):
            self.tree.create_node((i, topic_name), i, parent=-1)

        # print the identified topics
        topics = lda.print_topics(num_topics=self.topic_num, num_words=10)
        for topic in topics:
            print "topic %d: %s" % (topic[0], topic[1].encode("utf-8"))
        with open("topics.txt", "w") as topic_file:
            for topic in topics:
                print >> topic_file, "topic %d: %s" % (
                    topic[0], topic[1].encode("utf-8"))
        self.lda = lda
Example #21

model_lda[doc]


# In[83]:


model_lda.do_estep(chunk, state=None)


# In[84]:


# print keywords in n topics
sorted(model_lda.show_topics(), key=lambda x: x[1])


# In[85]:


# print keywords in n topics
sorted(model_lda.print_topics(), key=lambda x: x[1])


Example #22

tfidf_model = models.TfidfModel(corpus)
corpus_tfidf = tfidf_model[corpus]


# In[109]:


lda = LdaModel(corpus=corpus_tfidf,id2word=dictionary,num_topics=3)


# In[118]:


lda.show_topics(9)


# In[120]:


vis_data = pyLDAvis.gensim.prepare(lda,corpus,dictionary)
vis_data


# In[123]:


pyLDAvis.display(vis_data)

Example #23
class GensimLDA:
    def __init__(self, texts):
        self.dictionary = Dictionary(texts)
        self.corpus = [self.dictionary.doc2bow(text) for text in texts]

        self.k_topics = None
        self.model = None

    def fit(self, k_topics, iterations=50):
        ''''''
        self.k_topics = k_topics
        self.model = LdaModel(corpus=self.corpus, id2word=self.dictionary, \
            num_topics=k_topics, iterations=iterations)

    def get_document_topic_matrix(self, X=None):
        '''Returns an n_docs x k_topics array of probabilities
        of a topic in a given document.'''
        if X is None:
            X = self.corpus
        else:
            X = [self.dictionary.doc2bow(text) for text in X]

        n_docs = len(X)
        V = np.zeros((n_docs, self.k_topics))

        # Extract assignments
        some_iterable = self.model.get_document_topics(
            X)  ## equiv: self.model[X]
        for i, doc_topic in enumerate(some_iterable):
            for topic_id, prob in doc_topic:
                V[i, topic_id] = prob
        return V

    def get_topic_term_matrix(self):
        '''Returns an k_topics x m_words array of probabilities
        of a word in a given topic.'''
        return self.model.get_topics()

    def print_topics(self, top_n=10):
        '''Prints the top_n words in a topic'''
        for row in self.get_topic_term_matrix():
            ranking = np.argsort(row)
            ids = np.arange(len(ranking))[ranking]

            for k in ids[:-top_n:-1]:
                weight = row[k]
                word = self.dictionary.id2token[k]
                print(k, word, weight)
            print()

    def print_topic_words(self, topic_num, topn=None):
        '''Prints the top words and probabilities of a given topic in
        descending probability.'''
        for tok_id, prob in self.model.get_topic_terms(topic_num, topn=topn):
            word = self.dictionary.id2token[tok_id]
            print(word, prob)

    def get_topic_bows(self, num_words=10):
        '''Returns a list (for each topic) containing a list of the top num_words'''
        q = self.model.show_topics(num_topics=self.k_topics,
                                   num_words=num_words,
                                   formatted=False)
        topics = []
        for id, topic in q:
            words = []
            for w, p in topic:
                words.append(w)
            topics.append(words)
        return topics
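A short usage sketch for GensimLDA, assuming texts is a list of token lists:

gl = GensimLDA(texts)
gl.fit(k_topics=10, iterations=100)
V = gl.get_document_topic_matrix()   # (n_docs, 10) array of topic probabilities
print(gl.get_topic_bows(num_words=8))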
Example #24
################################################################################################################
print('Topic coherence...')
local_time = time.time()
coherence = CoherenceModel(model=lda, corpus=word_corpus, texts=lda_input, dictionary=word_dictionary, coherence='c_v')
print('coherence:', coherence.get_coherence())
timer(global_time,local_time)

################################################################################################################
# Pick keywords in each clusters
################################################################################################################
print('Picking words in each topic...')
local_time = time.time()
keyword_vocab = []
topics_keywords_map = {}
# pick keywords using show_topics function in gensim
for topic in lda.show_topics(num_topics=cluster_num, num_words=1000, formatted=False):
    topic_id = topic[0]
    keywords = topic[1]
    keyword_weight_map = {}
    for keyword, weight in keywords:
        keyword_weight_map[keyword] = weight
        if keyword not in keyword_vocab:
            keyword_vocab.append(keyword)
    topics_keywords_map[topic_id]=keyword_weight_map
timer(global_time,local_time)

################################################################################################################
# Deal with the problem of word overlapping
################################################################################################################
print('Keeping distinct word in each topic ...')
local_time = time.time()
Example #25
def lda_train_model(filename, output_name):
    output = {}
    output['dataset'] = filename
    output['output_name'] = output_name

    #LEMMATIZATION
    df = pd.DataFrame.from_csv(filename)
    lemmas_list = []

    for lemmas in df['lemmas']:
        lemmas = str(lemmas)
        lemmas = lemmas.replace('[', '')
        lemmas = lemmas.replace(']', '')
        lemmas = lemmas.replace(',', '')
        lemmas_list.append(lemmas.split())

    #print(lemmas_list)

    #CLEAN DOCUMENT
    dictionary = corpora.Dictionary(lemmas_list)
    make_dir('./lda/dicts/')
    dictionary.save('./lda/dicts/%s_corpus.dict' % output_name)
    output['dict'] = '%s_corpus.dict' % output_name
    clean_doc = [dictionary.doc2bow(text) for text in lemmas_list]
    tfidf = models.TfidfModel(clean_doc, normalize=True)

    try:
        #CREATE MODEL
        lda = LdaModel(corpus=tfidf[clean_doc],
                       id2word=dictionary,
                       num_topics=5)
    except ValueError:
        print("Error: Cannot compute LDA over an empty collection (no terms)")
        return output

    make_dir('./lda/models')
    lda.save('./lda/models/yolanda nov 8_%s_model.txt' % output_name)
    output['model'] = '%s_model.txt' % output_name

    #GET TOPICS AND TOPIC DISTRIBUTION TO BE SAVED TO FILE
    topics = lda.show_topics()
    topic_dist = lda[clean_doc]
    make_dir('./lda/topics')
    text_file = open("./lda/topics/yolanda nov 8_%s_topics.txt" % output_name,
                     "w")

    iterr = 0
    text_file.write("TOPICS: \n ----- \n \n")
    for topic in topics:
        text_file.write("TOPIC " + str(iterr) + ": " + str(topic) + "\n")
        iterr += 1

    final_topics = []
    iterr = 0
    text_file.write("-----\n \nTOPIC DISTRIBUTION: \n-----\n \n")
    for topic_dis in topic_dist:
        text_file.write(
            str(iterr) + ": " + str(topic_dis) + "\n" + "--- " +
            str(max(topic_dis, key=itemgetter(1))[0]) + "\n")
        iterr += 1
        final_topics.append(max(topic_dis, key=itemgetter(1))[0])
    text_file.close()

    #CREATE DATASET WITH CORRESPONDING TOPIC NUMBER TO BE PLOTTED
    with open('./lda/dataset_yolanda nov 8_%s.csv' % output_name,
              'r') as csvinput:
        #with open('yolanda nov 8.csv','r') as csvinput:
        with open('./lda/dataset_yolanda nov 8_%s_topics.csv' % output_name,
                  'w') as csvoutput:
            writer = csv.writer(csvoutput, lineterminator='\n')
            reader = csv.reader(csvinput)

            al = []
            row = next(reader)
            row.append('TOPIC')
            al.append(row)

            i = 0
            for row in reader:
                if row[5] != '':
                    row.append(final_topics[i])
                    al.append(row)
                i += 1

            writer.writerows(al)

    print(output)
    return output
term_lists = []
for i in range(len(df)):
    df['msg'][i] = df['msg'][i].lower()
    j = df['msg'][i].find('req')
    if j > -1:
        df['msg'][i] = df['msg'][i][j:] 
        idx.append(i)
        terms = df['msg'][i].split()
        terms = terms[5:]
        filtered_terms = [t for t in terms if len(t) > 0]
        term_lists.append(filtered_terms)

# Merge term lists into the main dataframe    
d = {'terms':term_lists}
term_df = DataFrame(data=d,columns=['terms'],index=df.index[idx])
df = df.join(term_df)

# Create corpus for topic modeling
corpora_dict = Dictionary(term_lists)
corpus = [corpora_dict.doc2bow(msg) for msg in term_lists]

# Perform topic modeling
lda = LdaModel(corpus=corpus,id2word=corpora_dict,num_topics=5)

# Print out top terms for each topic
topics = lda.show_topics()
i = 0
for topic in topics:
    i += 1
    print "Topic %d: %s" % (i,str(topic))
    #Create a dataframe and append the topic number to that column
    final_df = pd.DataFrame()
    final_df["SR_number"] = asr_df['sr_number']
    final_df["PF"] = pf #asr_df['sr_hw_product_erp_family']
    final_df["topic"] = asr_df['topic_number']
    final_df["topic_probability"] = asr_df['topic_probability']

    records = json.loads(final_df.T.to_json()).values()
    print("Ran successfully")
    #db.SR_topic_classification.drop()
    db.SR_topic_classification.insert(records)
    print("Created the collection")
    ####################Running the topics and words collection#####################

    topics_matrix = ldamodel.show_topics(formatted=False,num_words=200, num_topics=50)
    topics_matrix = np.array((topics_matrix),dtype=list)
    topics_df = pd.DataFrame()
    top_probs = []
    top_words = []
    top = []
    for topic in range(0, 50):
        a = topics_matrix[topic]
        for i in range(0,200):
            top.append(topic)
            top_words.append(a[1][i][0])
            top_probs.append(a[1][i][1])

    topics_df['Topic_number'] = top
    topics_df['keyword'] = top_words
    topics_df['probability'] = top_probs
Example #28
pickle(word_id_map, 'word_id_map')
normalize_content_stats()

train_and_test_corpus = MyCorpus([trainPosts_loc, testPosts_loc], word_id_map)
logger.info('training LDA model...')
#id2word is a mapping from word ids (integers) to words (strings). It is used to determine the vocabulary size, as well as for debugging and topic printing.
lda = LdaModel(train_and_test_corpus,
               id2word=word_id_map,
               num_topics=topic_count,
               update_every=1,
               chunksize=10000,
               passes=1)
pickle(lda, 'lda')

#Print the 'topn' most probable words for (randomly selected) 'topics' number of topics. Set topics=-1 to print all topics.
lda.show_topics(topics=topic_count, topn=10)
################################################################################################################################################
#key = blog + '_' + post_id
#value = a list of (topic_id, topic_probability) 2-tuples
blog_topic_distribution_map = {}

#key = uid (user id)
#value = list of (blog, post_id) tuples
train_user_likes_map = defaultdict(list)

#key = blog
#value = list of post_ids
test_blog_post_map = defaultdict(list)

logger.info('starting LDA prediction for training data...')
for blog, post_id, likes, blog_content_as_list_of_words in MyFilesIterator(
print 'Building model ...'
documents = wikiDocs()

#build a dictionary which maps between words and index numbers:
dictionary = corpora.Dictionary(documents)
dictionary.save(fileLocation + 'cs_lda6.dict')
corpus = wikiDocBow()

#model = Doc2Vec( documents, size=25, window=8, min_count=5, workers=0)
ldaModel = LdaModel(corpus=corpus, id2word=dictionary,
                    num_topics=100)  # default size = 25
print 'Out of ' + str(len(domainIlinks)) + ' domain pages ' + str(
    MissingFileCount) + ' were missing.'
print 'Topics = ' + str(len(ldaModel.print_topics(num_topics=10,
                                                  num_words=10)))
print 'Showing topics = ' + str(len(ldaModel.show_topics()))

# store the model to mmap-able files
ldaModel.save(fileLocation + 'wiki_model6.ldamodel'
              )  #model.save_word2vec_format('/tmp/my_model.doc2vec')
print('Model made in ' + str(
    ((datetime.now() - start).total_seconds()) / 60) + ' minutes.')

# load the model back
dictionary_loaded = corpora.Dictionary.load(fileLocation + 'cs_lda6.dict')
model_loaded = LdaModel.load(
    fileLocation + 'wiki_model6.ldamodel'
)  #model_loaded = Doc2Vec.load_word2vec_format('/tmp/my_model.doc2vec')
print 'Topics in loaded model = '
print model_loaded.print_topics(num_topics=5, num_words=5)
Example #30
    def tranform(self):
        # analyze the topic of each article
        db = ArticleDB()
        topic_doc_ids = [[] for i in xrange(self.topic_num)]
        topic_docs = [[] for i in xrange(self.topic_num)]
        for i, doc_bow in enumerate(self.corpus_bow):
            doc_topics = self.lda[doc_bow]
            if len(doc_topics) == 0:
                print "no topics for doc %d " % i + 1
                continue
            topic_item = max(doc_topics, key=lambda topic_item: topic_item[1])
            topic_id = topic_item[0]
            topic_doc_ids[topic_id].append(i + 1)
            topic_docs[topic_id].append(self.corpus[i])
            db.execute("update %s set lda_category1=%d where id = %d" %
                       (self.proj_name, topic_id, i + 1))
        db.commit()
        db.close()

        # run topic analysis again on the articles within each group
        topic_offset = self.topic_num
        for topic_fid in xrange(self.topic_num):
            sub_ids = topic_doc_ids[topic_fid]
            sub_corpus = topic_docs[topic_fid]

            # build the dictionary
            print "creating dictionary"
            sub_id2word = corpora.Dictionary(sub_corpus)

            # remove low-frequency words
            # ignore words that appear in less than 20 documents or more than 10% documents
            # id2word.filter_extremes(no_below=20, no_above=0.1)

            # count word frequencies and convert each doc to a bag-of-words vector
            print "transforming doc to vector"
            sub_corpus_bow = [
                sub_id2word.doc2bow(doc_bow) for doc_bow in sub_corpus
            ]

            # train the LDA model
            print "training lda model"
            sub_lda_model_name = "lda_models/lda_%d.dat" % topic_fid
            if not os.path.exists(sub_lda_model_name):
                sub_lda = LdaModel(corpus=sub_corpus_bow,
                                   id2word=sub_id2word,
                                   num_topics=self.sub_topic_num,
                                   alpha='auto')
                Dumper.save(sub_lda, sub_lda_model_name)
            else:
                sub_lda = Dumper.load(sub_lda_model_name)

            # give each sub-topic a name
            sub_topics = sub_lda.show_topics(num_topics=self.sub_topic_num,
                                             num_words=2,
                                             log=False,
                                             formatted=False)
            sub_topic_names = [
                sub_topic[1][0][0] + "+" + sub_topic[1][1][0]
                for sub_topic in sub_topics
            ]
            for i, sub_topic_name in enumerate(sub_topic_names):
                self.tree.create_node((topic_offset + i, sub_topic_name),
                                      topic_offset + i,
                                      parent=topic_fid)

            # print the identified sub-topics
            sub_topics = sub_lda.print_topics(num_topics=self.sub_topic_num,
                                              num_words=10)
            for sub_topic in sub_topics:
                print "sub topic %d: %s" % (sub_topic[0],
                                            sub_topic[1].encode("utf-8"))
            with open("sub_topics_%d.txt" % topic_fid, "w") as topic_file:
                for sub_topic in sub_topics:
                    print >> topic_file, "topic %d: %s" % (
                        sub_topic[0], sub_topic[1].encode("utf-8"))

            # analyze the topic of each article
            db = ArticleDB()
            for i, doc_bow in enumerate(sub_corpus_bow):
                doc_topics = sub_lda[doc_bow]
                if len(doc_topics) == 0:
                    print "no sub topics for doc %d " % sub_ids[i]
                    continue
                topic_item = max(doc_topics,
                                 key=lambda topic_item: topic_item[1])
                topic_id = topic_item[0]
                db.execute(
                    "update %s set lda_category2=%d where id = %d" %
                    (self.proj_name, topic_offset + topic_id, sub_ids[i]))
            db.commit()
            db.close()
            topic_offset += self.sub_topic_num
Example #31
docs = [lemmatization(text) for text in tqdm(df["text"], total=len(df))]

from gensim.corpora import Dictionary

id2word = corpora.Dictionary(docs)
id2word.filter_extremes(no_below=5, no_above=0.25)
corpus = [id2word.doc2bow(doc) for doc in docs]

from gensim.models.ldamodel import LdaModel

bow = [id2word.doc2bow(text) for text in docs]
lda = LdaModel(corpus=corpus, id2word=id2word, num_topics=10)

print(lda)
for idx, topic in lda.show_topics(num_words=5, formatted=False):
    print(idx, "; ".join([x[0] for x in topic]))

exit()
# orpus = [

X = np.zeros(shape=(len(df), len(id2word)), dtype=int)

for i, text in enumerate(docs):
    for j in id2word.doc2idx(text):
        if j >= 0:
            X[i, j] += 1

import lda

model = lda.LDA(n_topics=5, n_iter=1500, random_state=1)
def program_clusters(pgms, n_topics, awds, papers):
    #First we need to filter the data by program code. Some grants have multiple program
    #codes, so we first filter through to determine which cells contain the program code
    #then we replace the existing program code(s) with the provided one. This ensures there
    #is only one code per award.
    papers = papers
    papers['year'] = pd.to_datetime(papers['year'])
    papers['citations per year'] = papers['citations'].divide([
        ((datetime.datetime.today() - x).days) / 365.2422
        for x in papers['year']
    ])
    num_pubs = papers.groupby('award number')[['publication'
                                               ]].count().reset_index()
    cits_year_mean = papers.groupby('award number')[['citations per year'
                                                     ]].mean().reset_index()

    pgms = [
        '6878', '6880', '6882', '6883', '6884', '6885', '9101', '9102', '6881'
    ]
    awds = awds
    awds = awds[awds['ProgramElementCode(s)'].str.contains('|'.join(pgms))]
    for x in pgms:
        awds['ProgramElementCode(s)'] = np.where(
            awds['ProgramElementCode(s)'].str.contains(x), x,
            awds['ProgramElementCode(s)'])
    awds['StartDate'] = pd.to_datetime(awds['StartDate'])
    awds['EndDate'] = pd.to_datetime(awds['EndDate'])
    awds['AwardedAmountToDate'] = [
        x.replace('$', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = [
        x.replace(',', '') for x in awds['AwardedAmountToDate']
    ]
    awds['AwardedAmountToDate'] = pd.to_numeric(awds['AwardedAmountToDate'])
    awds = pd.merge(awds,
                    num_pubs,
                    left_on='AwardNumber',
                    right_on='award number',
                    how='left')
    awds = pd.merge(awds,
                    cits_year_mean,
                    left_on='AwardNumber',
                    right_on='award number',
                    how='left')
    awds.drop(columns=['award number_x', 'award number_y'], inplace=True)
    awds[['publication', 'citations per year'
          ]] = awds[['publication', 'citations per year']].replace(np.nan, 0)
    awds['pubs per year'] = np.where(
        awds['EndDate'] > datetime.datetime.today(),
        awds['publication'].divide([
            ((datetime.datetime.today() - x).days) / 365.2422
            for x in awds['StartDate']
        ]), awds['publication'].divide(
            (awds['EndDate'] - awds['StartDate']).astype('timedelta64[D]') /
            365.2422))

    abstracts = awds[[
        'ProgramElementCode(s)', 'AwardNumber', 'Abstract',
        'citations per year', 'pubs per year', 'AwardedAmountToDate'
    ]].copy()
    #This is a pretty clean data set, but there are some empty entries, so we
    #filter them out here
    abstracts = abstracts.dropna()

    #The first step in the tokenization process is splitting the abstract text
    #into a list of words.
    abstracts['clean_abstracts'] = [
        doc.lower().split() for doc in abstracts['Abstract']
    ]

    #we want to account for possible bigrams and trigams, which we search for
    #here
    bigram = Phrases(list(abstracts['clean_abstracts']),
                     min_count=5,
                     threshold=20)
    trigram = Phrases(bigram[list(abstracts['clean_abstracts'])], threshold=20)
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)

    #Now we start building our dictionary and creating the cleaned up corpus.
    #We start by creating a list of stop words, punctuation, and other text to remove.
    #we also instantiate a lemmatizer
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation)
    lemma = WordNetLemmatizer()
    boiler_plate = "This award reflects NSF's statutory mission and has been deemed worthy of support through evaluation using the Foundation's intellectual merit and broader impacts review criteria"

    #This function applies the bigram and trigram functions, lemmatizes the
    #abstracts, and only keeps words that are greater than 2 characters
    def word_mod(doc):
        doc = re.sub('<.*?>', ' ', doc)
        doc = re.sub(boiler_plate, '', doc)
        punct_free = ''.join(ch for ch in doc if ch not in exclude)
        words = punct_free.lower().split()
        bigs = bigram_mod[words]
        tris = trigram_mod[bigs]
        stop_free = " ".join([i for i in tris if i not in stop])
        lemm = " ".join(lemma.lemmatize(word) for word in stop_free.split())
        word_list = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in word_list if len(word) > 2]
        return cleaned

    abstracts['clean_abstracts'] = [
        word_mod(doc) for doc in abstracts['Abstract']
    ]

    # Here we create the dictionary from the corpus of abstracts, where each unique term is assigned an index.
    dictionary = corpora.Dictionary(abstracts['clean_abstracts'])
    # Filter terms which occurs in less than 4 articles & more than 40% of the abstracts
    dictionary.filter_extremes(no_below=4, no_above=0.45)
    #This creates a sparse matrix of word frequencies in each abstracts
    abstract_term_matrix = [
        dictionary.doc2bow(doc) for doc in abstracts['clean_abstracts']
    ]

    # Here we create and train the LDA model, passing in our term frequency matrix, the number of
    #topics/clusters to be created, and our dictionary
    ldamodel = Lda(abstract_term_matrix,
                   num_topics=n_topics,
                   id2word=dictionary,
                   passes=50,
                   iterations=500)

    # Here we print out the top 10 words for each topic and their weight
    for i, topic in enumerate(
            ldamodel.print_topics(num_topics=n_topics, num_words=10)):
        words = topic[1].split("+")
        print(words, "\n")

    #Next we want to know what topic each abstract belongs to we pass each abstract
    #into the get_document_topics method and it returns the topic and the
    #probability of the abstract beloning to a that topic. We take the one that
    #has the highest probability
    def pred_topic(doc):
        doc_bow = ldamodel.id2word.doc2bow(doc)
        doc_topics = ldamodel.get_document_topics(doc_bow,
                                                  minimum_probability=0.20)
        if doc_topics:
            doc_topics.sort(key=operator.itemgetter(1), reverse=True)
            theme = doc_topics[0][0]
        else:
            theme = np.nan
        return theme

    abstracts['predicted topic'] = [
        pred_topic(doc) for doc in abstracts['clean_abstracts']
    ]

    #Here we do a histogram of how many abstracts/awards fall into each topic
    ab_hist = abstracts.groupby(['predicted topic'])['AwardNumber'].count()
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]
    cols = cols + cols + cols + cols
    f1, ax = plt.subplots()
    ab_hist.plot.bar(rot=0, color=cols)
    ax.set_xticklabels([x for x in ab_hist.index])
    ax.set_xlabel('Topic Number')
    ax.set_ylabel('Count of Awards in Topic')
    ax.set_title('Distribution of Awards in Derived Topic Areas')
    plt.show()

    #Here we create a word cloud for each of the top words in the topic. Their size
    #is indicative of their weight.
    cloud = WordCloud(stopwords=stopwords.words('english'),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)

    topics = ldamodel.show_topics(formatted=False, num_topics=n_topics)
    fig, axes = plt.subplots(1,
                             n_topics,
                             figsize=(10, 10),
                             sharex=True,
                             sharey=True)
    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()

    #Next we'll do a t-SNE plot clustering the abstracts based off the topic
    #probabilities returned from the model. This creates a array where each
    #column is a topic and each row is an abstract and each entry is the probability
    #that the abstract belongs to that topic.
    col_ns = range(0, n_topics)
    topic_weights = pd.DataFrame(columns=col_ns)
    for i in range(0, len(ldamodel[abstract_term_matrix])):
        weights = ldamodel[abstract_term_matrix][i]
        for j in range(0, len(weights)):
            entry = pd.DataFrame(columns=col_ns)
            idx = weights[j][0]
            entry.loc[0, idx] = weights[j][1]
        topic_weights = topic_weights.append(entry)
    topic_weights.reset_index(drop=True, inplace=True)

    # Replace any nan entries (because there was zero probability the
    #abstract belonged in that topic) with zero
    arr = pd.DataFrame(topic_weights).fillna(0).values

    # We can limit this to only well separated abstracts as well
    #arr = arr[np.amax(arr, axis=1) > 0.15]

    # This pulls out the highest probability topic for each abstract.  We'll
    #use this for the color scheme in the t-SNE plot.
    topic_num = np.argmax(arr, axis=1)

    # Here we initialize and fit our t-SNE model
    tsne_model = TSNE(n_components=2,
                      verbose=1,
                      random_state=0,
                      perplexity=50,
                      init='pca')
    tsne_lda = tsne_model.fit_transform(arr)

    #Here we plot out the results for the t-SNE transformation

    mycolors = np.array(cols)

    title = "t-SNE Clustering of {} LDA Topics".format(n_topics)
    f = plt.figure()
    plt.scatter(x=tsne_lda[:, 0], y=tsne_lda[:, 1], color=mycolors[topic_num])
    plt.title(title)
    plt.show()

    fig = plt.figure(figsize=(12, 6))
    ax1 = fig.add_subplot(1, 3, 1)
    ax1.scatter(x=abstracts['AwardedAmountToDate'],
                y=abstracts['citations per year'],
                color=mycolors[abstracts['predicted topic']])
    ax1.set_ylabel('Average Citations per Year')
    ax1.set_xlabel('Award Size [$]')
    ax1.set_title('Average Citations per Year', fontsize=11)
    ax2 = fig.add_subplot(1, 3, 2)
    ax2.scatter(x=abstracts['AwardedAmountToDate'],
                y=abstracts['pubs per year'],
                color=mycolors[abstracts['predicted topic']])
    ax2.set_ylabel('Number Publications per Year')
    ax2.set_xlabel('Award Size [$]')
    ax2.set_title('Number of Publications per Year', fontsize=11)
    ax3 = fig.add_subplot(1, 3, 3)
    ax3.scatter(x=abstracts['pubs per year'],
                y=abstracts['citations per year'],
                color=mycolors[abstracts['predicted topic']])
    ax3.set_xlabel('Number Publications per Year')
    ax3.set_ylabel('Average Citations per Year')
    ax3.set_title('Number Publications vs \nAverage Citation Count',
                  fontsize=11)
    from matplotlib.legend_handler import HandlerPatch

    class HandlerEllipse(HandlerPatch):
        def create_artists(self, legend, orig_handle, xdescent, ydescent,
                           width, height, fontsize, trans):
            center = 0.5 * width - 0.5 * xdescent, 0.5 * height - 0.5 * ydescent
            p = mpatches.Ellipse(xy=center,
                                 width=height + xdescent,
                                 height=height + ydescent)
            self.update_prop(p, orig_handle, legend)
            p.set_transform(trans)
            return [p]

    handles = [
        mpatches.Circle((0.5, 0.5),
                        radius=0.25,
                        facecolor=mycolors[i],
                        edgecolor="none") for i in range(0, n_topics)
    ]
    handles = [
        mpatches.Circle(
            (0.5, 0.5), radius=0.25, facecolor='w', edgecolor="none")
    ] + handles
    legend_labels = list(range(0, n_topics))
    legend_labels = ['Topic'] + legend_labels
    ax3.legend(handles,
               legend_labels,
               bbox_to_anchor=(1, .88),
               bbox_transform=fig.transFigure,
               handler_map={mpatches.Circle: HandlerEllipse()})
    plt.tight_layout()
id2word = corpora.Dictionary(documents)

corpus = [id2word.doc2bow(doc) for doc in documents]

onehot_labels = onehot_enc.transform(labels)

print("starting LDA model")
# plug into LDA model.
# this can take a while with larger number of documents
lda = LdaModel(num_topics=20,
               id2word=id2word,
               corpus=corpus,
               passes=50,
               eval_every=1)
print("topics:")
for topic in lda.show_topics(num_topics=20, num_words=20):  #print_topics():
    print(topic)

#print("getting topics for testing document")
#topic_prediction = lda.get_document_topics(bow=corpus[0])

#print(testing_text_raw)
#print(topic_prediction)

print("")
print(
    "starting setup to train a classifier based on LDA topics for each document"
)

topic_vecs = []
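# A hedged, illustrative sketch only: one way to build the per-document topic
# vectors announced above and feed them to a classifier. LogisticRegression and
# the names n_lda_topics/clf/dense are assumptions, not part of the original snippet.
from sklearn.linear_model import LogisticRegression

n_lda_topics = 20  # same value as num_topics used for the LdaModel above
for bow in corpus:
    dense = [0.0] * n_lda_topics
    # minimum_probability=0.0 makes gensim return essentially every topic,
    # so each document yields a feature vector of the same length
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0.0):
        dense[topic_id] = prob
    topic_vecs.append(dense)

clf = LogisticRegression(max_iter=1000)
clf.fit(topic_vecs, labels)  # 'labels' assumed to be the raw class labels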
Example #34
0
class LDA_parser():
    """
    This class implements a wrapper pipeline for text preprocessing and LDA parsing of an input corpus 
    in the form ['str','str','str', ... ]. 
    """
    def __init__(self,
                 corpus='',
                 language='english',
                 preprocessor_type="spacy",
                 tags=["DET", "PUNCT", "NUM", "SYM", "SPACE"],
                 custom_filter=[],
                 lemmatize=False,
                 stem=False,
                 min_len=2,
                 num_topics=10,
                 passes=100):
        """ 
        Parses the input text into a suitable format, then performs all LDA extraction tasks. 
        It expects the input corpus to be a list of texts. If the input is a long string, it will 
        attempt to create documents by splitting it into sentences. 
        @ params: 
            @ corpus: Input corpus in str or ['str','str','str', ... ] format, where each entry
                      is a document of type str. Alternatively, a str format input (not recommended).
            @ preprocessor_type: Use an nltk-based or spaCy-based preprocessor 
            @ language: language to use in the preprocessor 
            @ tags: if spaCy is selected, will filter words with input POS tags 
            @ custom_filter: filter words in this input list in the preprocessing step 
            @ lemmatize: use lemmatization in the preprocessing 
            @ stem: use stemming in the preprocessing  
            @ num_topics: maximum number of topics in the LDA algorithm 
            @ passes: number of training epochs in the LDA 
        """

        print("Initializing model...\n")
        if preprocessor_type == "nltk":
            print("NLTK preprocessor selected.")
            self.preprocessor = nltk_preprocessor(language=language)
        if preprocessor_type == "spacy":
            print("spaCy preprocessor selected.")
            self.preprocessor = spacy_preprocessor(language=language)

        self.language = language  # input language
        self.raw_corpus = ""  # simply stores the input if in str type
        self.clean_corpus = [
        ]  # [doc, doc, ..., doc] = [[sent, sent, ...], ... ,[sent, sent, ...]]
        self.dictionary = None  # holds a corpora.Dictionary representation of corpus
        self.doc2bow_corpus = None  # contains doc2bow vector representations of each document in the corpus
        self.lda_model = None  # LDA model trained on the input corpus
        self.topic_mixtures = [
        ]  # contains str representations of mixtures of words with their probabilities
        self.topics = {
        }  # Contains a dictionary of topics with words and respective mix probabilities once "extract topics" is called.
        self.topic_words = {
        }  # As above, but only contains the respective words of the topic

        # check for raw str corpus format
        if isinstance(corpus, str):
            print(
                "***WARNING***\nRaw input (str) received. Text will be sentence-tokenized and parsed accordingly."
            )
            print("Make sure this is intended. \n")
            self.raw_corpus = str(corpus)  # transform input to string
            self.fit(corpus,
                     raw=True,
                     language=language,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)  # fit corpus as raw

        elif corpus == '':
            print("***WARNING***\nNull Corpus")
        # assume input corpus is in the right format
        else:
            self.fit(corpus,
                     language=language,
                     num_topics=num_topics,
                     passes=passes,
                     min_len=min_len)

    def fit(self,
            corpus,
            raw=False,
            language='english',
            stem=False,
            lemmatize=False,
            num_topics=10,
            passes=100,
            min_len=2,
            echo_corpus=False):
        """ 
        Assumes input corpus is in the right format. 
        @args: 
            @ corpus = input corpus  
            @ language = input language  
            @ stem/lemmatize = if true, stem or lemmatize input corpus
            @ num_topics = number of topics to choose in the algorithm 
            @ passes = number of epochs of the LDA 
            @ min_len = minimum length of words to consider when preprocessing words
        """

        if echo_corpus:
            print("CORPUS: {}".format(corpus))

        t0 = time.time()

        print("Fitting LDA topic modelling...")
        self.raw_corpus = corpus  # input corpus as is
        self.language = language  # in case initial language changed

        if raw:
            print("Preprocessing corpus...(raw)")
            self.clean_corpus = self.preprocessor.preprocess_str_corpus(
                corpus, stem=stem, lemmatize=lemmatize, min_len=min_len)
        else:
            print("Preprocessing corpus...")
            self.clean_corpus = self.preprocessor.preprocess_texts(
                self.raw_corpus, min_len=min_len)  # preprocess text list

        print("Creating corpora dictionary...")
        self.dictionary = corpora.Dictionary(
            self.clean_corpus)  # create corpora.Dictionary mapping
        print("Translating doc2bow corpus...")
        self.doc2bow_corpus = [
            self.dictionary.doc2bow(text) for text in self.clean_corpus
        ]  # doc2bow corpus representation
        print("Running LDA...")
        self.lda_model = LdaModel(self.doc2bow_corpus,
                                  num_topics=num_topics,
                                  id2word=self.dictionary,
                                  passes=passes)
        self.topic_mixtures = self.lda_model.show_topics(
            num_topics=-1,
            num_words=10)  # string representation of topics mixtures

        t1 = time.time()
        print("\nDone in {:.3f} seconds.".format(t1 - t0))

    def print_topics(self, words_per_topic=5):
        """
        Displays the topics in string format
        """
        topics = self.lda_model.print_topics(num_words=words_per_topic)
        for topic in topics:
            print(topic)

    def extract_topics(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary, where the key is the topic number and the 
        value is a list of (word, probability) tuples containing at most 
        max_words_per_topic words whose probability is at least threshold. 
        @params: 
            @ max_words_per_topic: Maximum topic mixture component words to consider. 
            @ threshold: select words whose density is at least this value
        """
        topics = {}  # to store the topics
        indexes = [tup[0]
                   for tup in self.topic_mixtures]  # topic indices from the mixtures

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topics = topics  # update attribute

        return topics

    def extract_topic_words(self, max_words_per_topic=50, threshold=0.005):
        """
        Returns all topics as a dictionary, where the key is the topic number and the 
        value is a list of at most max_words_per_topic words whose probability is at 
        least threshold. 
        """
        topics = {}  # to store the topics
        indexes = [tup[0]
                   for tup in self.topic_mixtures]  # topic indices from the mixtures

        # assign the topics mixtures
        for i in indexes:
            topics[i] = [
                tup[0]
                for tup in self.lda_model.show_topic(i,
                                                     topn=max_words_per_topic)
                if tup[1] >= threshold
            ]  # extract most probable words for topic i

        self.topic_words = topics  # update attribute

        return topics

    def parse_new(self,
                  new_text,
                  top_n_topics=100,
                  top_n_w=30,
                  max_words_per_topic=50,
                  threshold=0.005,
                  verbose=True):
        """
        Parses a new text by obtaining the most likely topics for the new input, 
        as well as the respective words. This function should be used only after 
        the LDA parser has been fitted. 
        @params: 
            @ new_text: new input text 
            @ top_n_topics: top n topics with largest densities p(topic)
            @ top_n_w: top n words with largest densities p(word) = p(word|topic)*p(topic)
            @ verbose: display information
            @ max_words_per_topic: maximum words per topic  
            @ threshold: only consider words with density greater than threshold 
        @returns: 
            @ max_topic: most likely topic for the document 
            @ doc_max_topic_words: words associated with the most likely topic 
            @ doc_topics: all topics related to the document 
            @ doc_topic_words: all words from all topics associated with the document 
        """

        self.extract_topic_words(
            max_words_per_topic,
            threshold)  # extract topics to ensure they are there

        new_text_clean = self.preprocessor.preprocess_sentence(
            new_text)  # preprocess input text
        new_doc_bow = self.dictionary.doc2bow(
            new_text_clean)  # convert to doc2bow

        doc_topics = self.lda_model.get_document_topics(
            new_doc_bow)  # obtain topics for input document
        topic_idx = [tup[0] for tup in doc_topics]  # topic indices

        doc_topic_words = [
            word for idx in topic_idx for word in self.topic_words[idx]
        ]  # extract all words from every topic
        top_n_topics = nlargest(top_n_topics,
                                list(doc_topics),
                                key=lambda x: x[1])  # extract top n topics

        top_n_words = list(
            set([
                word for idx in [tup[0] for tup in top_n_topics]
                for word in self.topic_words[idx]
            ]))  # extract the words from the top n topics

        # Currently, we have access to the top n topics and their actual probabilities.
        # We want to collect all the words for those topics, and multiply them with their probabilities

        words_with_probs = [
        ]  # will store words with their actual probabilities:

        for topic_tup in doc_topics:
            topic_idx = topic_tup[0]  # obtain topic index
            topic_prob = topic_tup[1]  # obtain topic probability p(topic)
            for word_tup in self.lda_model.show_topic(topic_idx, topn=10):
                word_probability = word_tup[
                    1] * topic_prob  # p(w) = p(w|topic)p(topic)
                words_with_probs.append(
                    (word_tup[0], word_probability))  # (word, p(w))

        # obtain the n most likely words according to their individual probabilities
        n_most_likely_words = [
            tup[0] for tup in nlargest(
                top_n_w, list(words_with_probs), key=lambda x: x[1])
        ]

        if verbose:
            print("\nLOGS: \n")
            print("*** Most likely topic: ***\n", top_n_topics)
            print("*** Words for most likely topic: ***\n", top_n_words)
            print("*** All topics: ***\n", doc_topics)
            print("*** All topics words: ***\n", doc_topic_words)

        return n_most_likely_words, top_n_topics, top_n_words, doc_topics, doc_topic_words

    def pickle_save(self, savename="full_LDA_parser.pkl"):
        """ 
        Saves the full model object in pkl format
        """
        pickle.dump(self, open(savename, 'wb'))

    def save_model(self, name="LDA_model"):
        """ 
        Saves the LDA model, doc2bow_corpus and dictionary.
        These parameters can be used to instantiate a gensim 
        model, so there is no load in this class. 
        """
        dictionary_name = name + "_dictionary.gensim"
        corpus_name = name + "_doc2bow_corpus.pkl"
        model_name = name + ".gensim"

        pickle.dump(self.doc2bow_corpus, open(corpus_name,
                                              'wb'))  # save the doc2bow_corpus
        self.dictionary.save(dictionary_name)  # save corpus dictionary mapping
        self.lda_model.save(model_name)  # save the full model
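# A hedged usage sketch for the LDA_parser class above; the toy documents and the
# save name are invented for illustration and assume the spacy_preprocessor
# dependency referenced in __init__ is available.
docs = [
    "Topic models describe documents as mixtures of latent topics.",
    "Neural networks learn distributed representations of words.",
    "Gradient descent minimises a loss over model parameters.",
]
parser = LDA_parser(docs,
                    language='english',
                    preprocessor_type="spacy",
                    num_topics=2,
                    passes=10)
parser.print_topics(words_per_topic=5)
topic_words = parser.extract_topic_words(max_words_per_topic=20, threshold=0.005)
results = parser.parse_new("Stochastic optimisation of deep models.")
parser.save_model(name="demo_LDA_model")  # hypothetical output name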
Example #35
0
			run_id = "ldaU_K{K}_a{alpha_frac}-K_b{beta}_iter{iter}.gensim".format(K=num_topics, alpha_frac=alpha_frac, beta=beta, iter=num_iterations)
			print run_id

			output_file = output_file_template.format(run_id=run_id)

			# Train and save
			print 'Training...'
			model = LdaModel(corpus, 
				alpha=alpha, eta=beta,
				id2word=dictionary, num_topics=num_topics, iterations=num_iterations
			)
			# model = LdaMulticore(corpus, 
			# 	alpha=alpha, eta=beta,
			# 	id2word=dictionary, num_topics=num_topics, iterations=num_iterations, workers=2
			# )
			print 'Done training.'
			model.save(output_file)

			# Print top 10 words in topics, if desired
			if print_topics:
				topics = model.show_topics(num_topics=100, formatted=False)
				for topic in topics:
					for tup in topic[1]:
						print tup[0] + ": " + str(tup[1])
					print '\n'

			# Evaluate perplexity
			ll = model.log_perplexity(test_corpus)
			print "LL:   "+str(ll)
			print "Perp: "+str(np.exp2(-ll))
Example #36
0
class CustomLda(object):
    def __init__(self, data=None, dictionary=None):
        """ initialize, data should be provided, only when unpickling class object it is not needed!"""
        self.data = data
        self.model = None
        self.num_topics = None
        self.iterations = None
        self.random_state = None
        self.dictionary = dictionary
        if self.data is not None:
            if self.dictionary is None:
                self.dictionary = Dictionary(self.data)
            self.corpus = [self.dictionary.doc2bow(text) for text in self.data]
        else:
            self.dictionary = None
            self.corpus = None
        self.distributed = None
        self.chuncksize = None
        self.passes = None
        self.update_every = None
        self.alpha = None
        self.eta = None
        self.decay = None
        self.offset = None
        self.eval_every = None
        self.gamma_threshold = None
        self.minimum_probability = None
        self.ns_conf = None
        self.minimum_phi_value = None
        self.per_word_topics = None
        self.coherence_model = None
        self.coherence = None
        self.coherence_type = None

    def train(self,
              num_topics,
              iterations=1500,
              random_state=1,
              distributed=False,
              chunksize=2000,
              passes=1,
              update_every=1,
              alpha='symmetric',
              eta=None,
              decay=0.5,
              offset=1.0,
              eval_every=10,
              gamma_threshold=0.001,
              minimum_probability=0.01,
              ns_conf=None,
              minimum_phi_value=0.01,
              per_word_topics=False,
              workers=1):
        """train lda model. If workers >1, goes multicore"""

        self.distributed = distributed
        self.chuncksize = chunksize
        self.passes = passes
        self.update_every = update_every
        self.alpha = alpha
        self.eta = eta
        self.decay = decay
        self.offset = offset
        self.eval_every = eval_every
        self.gamma_threshold = gamma_threshold
        self.minimum_probability = minimum_probability
        self.ns_conf = ns_conf
        self.minimum_phi_value = minimum_phi_value
        self.per_word_topics = per_word_topics
        self.num_topics = num_topics
        self.iterations = iterations
        self.random_state = random_state
        self.workers = workers

        if self.workers > 1:
            self.model = LdaMulticore(
                workers=self.workers,  # use the requested number of workers
                corpus=self.corpus,
                id2word=self.dictionary,
                iterations=self.iterations,
                num_topics=self.num_topics,
                random_state=self.
                random_state,  # distributed=self.distributed,
                chunksize=self.chuncksize,
                passes=self.passes,  # update_every= self.update_every,
                alpha=self.alpha,
                eta=self.eta,
                decay=self.decay,
                offset=self.offset,
                eval_every=self.eval_every,
                gamma_threshold=self.gamma_threshold,
                minimum_probability=self.
                minimum_probability,  # ns_conf=self.ns_conf,
                minimum_phi_value=self.minimum_phi_value,
                per_word_topics=self.per_word_topics)
        else:
            self.model = LdaModel(corpus=self.corpus,
                                  id2word=self.dictionary,
                                  iterations=self.iterations,
                                  num_topics=self.num_topics,
                                  random_state=self.random_state,
                                  distributed=self.distributed,
                                  chunksize=self.chuncksize,
                                  passes=self.passes,
                                  update_every=self.update_every,
                                  alpha=self.alpha,
                                  eta=self.eta,
                                  decay=self.decay,
                                  offset=self.offset,
                                  eval_every=self.eval_every,
                                  gamma_threshold=self.gamma_threshold,
                                  minimum_probability=self.minimum_probability,
                                  ns_conf=self.ns_conf,
                                  minimum_phi_value=self.minimum_phi_value,
                                  per_word_topics=self.per_word_topics)
        print('Trained!')

    def _train_coherence_model(self, coherence_type='u_mass'):
        """could be made on top of model to get coherence, type could be 'u_mass' or 'c_v'"""
        self.coherence_model = CoherenceModel(model=self.model,
                                              texts=self.data,
                                              dictionary=self.dictionary,
                                              coherence=coherence_type)

    def _calculate_coherence(self, coherence_type='u_mass'):
        self._train_coherence_model(coherence_type=coherence_type)
        self.coherence = self.coherence_model.get_coherence()

    def get_coherence(self, coherence_type='u_mass'):
        if coherence_type != self.coherence_type:
            self._calculate_coherence(coherence_type=coherence_type)
            self.coherence_type = coherence_type  # remember the type so repeated calls reuse the cached value
        return self.coherence

    def get_topic_terms(self, num, topn=10):
        return self.model.get_topic_terms(num, topn=topn)

    def get_perplexity(self):
        """return the per-word likelihood bound from gensim's log_perplexity on the training corpus"""
        return self.model.log_perplexity(self.corpus)

    def get_topics(self, num):
        return self.model.show_topics(num)

    def _make_visualization(self):
        """prepare visualisation for display/saving"""
        return pyLDAvis.gensim.prepare(self.model,
                                       self.corpus,
                                       self.dictionary,
                                       sort_topics=False)

    def display(self):
        """display LDAvis in notebook"""
        visualisation = self._make_visualization()
        return pyLDAvis.display(visualisation)

    def save_ldavis(self, filename='topic.html'):
        """save LDAvis to .html"""
        ldavis = self._make_visualization()
        pyLDAvis.save_html(ldavis, filename)

    def save_lda(self, filename):
        """save lda model only"""
        self.model.save(filename)

    def pickle(self, filename):
        """save class instance to file"""
        f = open(filename, 'wb')
        pickle.dump(self, f, pickle.HIGHEST_PROTOCOL)
        f.close()

    @staticmethod
    def unpickle(filename):
        """read class instance from file"""
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def predict_topic(self, doc_list):
        """predict topic of document list (consists of strings"""
        topic_list = []
        for doc in doc_list:
            bow = self.dictionary.doc2bow(str(doc).split())
            topics_probs = self.model.get_document_topics(bow)
            topics_probs.sort(key=lambda tup: tup[1], reverse=True)
            topic_list.append(topics_probs)
        return topic_list
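# A hedged usage sketch for CustomLda above; the toy tokenised documents and
# file names are invented for illustration.
toy_docs = [["topic", "model", "lda", "corpus"],
            ["word", "frequency", "matrix", "corpus"],
            ["topic", "word", "distribution", "model"]]
custom_lda = CustomLda(data=toy_docs)
custom_lda.train(num_topics=2, iterations=100, passes=5, workers=1)
print(custom_lda.get_coherence('u_mass'))
print(custom_lda.get_topic_terms(0, topn=5))
print(custom_lda.predict_topic(["topic model lda"]))
custom_lda.save_lda("custom_lda.gensim")      # hypothetical file name
custom_lda.pickle("custom_lda_wrapper.pkl")   # hypothetical file name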
def program_clusters(pgms,n_topics,data):
    #First we need to filter the data by program code. Some grants have multiple program
    #codes, so we first determine which cells contain the program code and then
    #replace the existing program code(s) with the provided one. This ensures there
    #is only one code per award.
    awds = data
    awds = awds[awds['ProgramElementCode(s)'].str.contains('|'.join(pgms))]
    for x in pgms:
        awds['ProgramElementCode(s)'] = np.where(awds['ProgramElementCode(s)'].str.contains(x), x, awds['ProgramElementCode(s)'] )
        
    abstracts = awds[['ProgramElementCode(s)', 'AwardNumber','Abstract']].copy()
    #This is a pretty clean data set, but there are some empty entries, so we
    #filter them out here
    abstracts = abstracts.dropna()
    
    #Here we start building our dictionary and creating the cleaned-up corpus.
    #We start by removing stop words and punctuation, and stemming or lemmatizing
    #the abstract text
    stop    = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma   = WordNetLemmatizer()
    stemmer2 = SnowballStemmer("english", ignore_stopwords=True)
    # pass the article text as string "doc"
    
    #Here we use a small nested function to pass through each abstract individually
    def clean(doc):
        #here we clean up errant breaks like <br/>
        doc = re.sub('<.*?>', ' ', doc)
        #This creates a long string
        #of words while excluding stop words
        stop_free  = " ".join([i for i in doc.lower().split() if i not in stop])
        #This goes through each character and removes punctuation
        punct_free  = ''.join(ch for ch in stop_free if ch not in exclude)
        words   = punct_free.split()
        return words
        
    
    #Here is where we pass each abstract through the cleaning function
    abstracts['clean_abstracts'] = [clean(doc) for doc in abstracts['Abstract']]
    
    # So we can use bigrams and trigrams, we create new models, running through our
    #cleaned abstracts
    bigram = Phrases(list(abstracts['clean_abstracts']), min_count=5, threshold=100) # higher threshold fewer phrases.
    trigram =Phrases(bigram[list(abstracts['clean_abstracts'])], threshold=100)  
    bigram_mod = Phraser(bigram)
    trigram_mod = Phraser(trigram)  
    
    #This function applies the bigram and trigram models, lemmatizes the
    #abstracts, and only keeps words that are greater than 2 characters
    def word_mod(doc):
        bigs = bigram_mod[doc]
        tris = trigram_mod[bigs]
        lemm = " ".join(lemma.lemmatize(word) for word in tris)
        #stemm    = " ".join(stemmer2.stem(word) for word in punct_free.split())
        words = lemm.split()
        # only take words which are greater than 2 characters
        cleaned = [word for word in words if len(word) > 2]
        return cleaned
    abstracts['clean_abstracts'] = [word_mod(doc) for doc in abstracts['clean_abstracts']]  
    
    
    # Here we create the dictionary from the corpus of abstracts, where each unique term is assigned an index. 
    dictionary = corpora.Dictionary(abstracts['clean_abstracts'])
    # Filter terms which occur in fewer than 4 abstracts or in more than 40% of the abstracts 
    dictionary.filter_extremes(no_below=4, no_above=0.4)
    #This creates a sparse matrix of word frequencies in each abstracts
    abstract_term_matrix = [dictionary.doc2bow(doc) for doc in abstracts['clean_abstracts']]   
    
    # Here we create and train the LDA model, passing in our term frequency matrix, the number of
    #topics/clusters to be created, and our dictionary
    ldamodel = Lda(abstract_term_matrix, num_topics= n_topics, id2word = dictionary, passes=15, iterations=500)
              
    # Here we print out the top 10 words for each topic and their weight
    for i,topic in enumerate(ldamodel.print_topics(num_topics=10, num_words=10)):
       words = topic[1].split("+")
       print (words,"\n")
     
     #Next we want to know which topic each abstract belongs to. We pass each abstract
     #into the get_document_topics method, which returns the topics and the 
     #probability of the abstract belonging to each topic. We take the one that
     #has the highest probability
    def pred_topic(doc):
        doc_bow = ldamodel.id2word.doc2bow(doc)
        doc_topics = ldamodel.get_document_topics(doc_bow, minimum_probability=0.20)  
        if doc_topics:
            doc_topics.sort(key = operator.itemgetter(1), reverse=True)
            theme = doc_topics[0][0]
        else:
            theme = np.nan
        return theme

    abstracts['predicted topic'] = [pred_topic(doc) for doc in abstracts['clean_abstracts']]
    
    #Here we do a histogram of how many abstracts/awards fall into each topic
    ab_hist = abstracts.groupby(['predicted topic','ProgramElementCode(s)'])['AwardNumber'].count()
    cols = [color for name, color in mcolors.TABLEAU_COLORS.items()] 
    f1, ax  = plt.subplots()
    ab_hist.plot.bar(rot = 0, color = cols )
    ax.set_xticklabels([x[0] for x in ab_hist.index])
    ax.set_xlabel('Topic Number')
    ax.set_ylabel('Count of Awards in Topic')
    ax.set_title('Distribution of Awards in Derived Topic Areas')
    plt.show()
    
    #Here we create a word cloud for each of the top words in the topic. Their size 
    #is indicative of their weight.
    cloud = WordCloud(stopwords=stopwords.words('english'),
                      background_color='white',
                      width=2500,
                      height=1800,
                      max_words=10,
                      colormap='tab10',
                      color_func=lambda *args, **kwargs: cols[i],
                      prefer_horizontal=1.0)
    
    topics = ldamodel.show_topics(formatted=False)
    fig, axes = plt.subplots(1, n_topics, figsize=(10,10), sharex=True, sharey=True)
    for i, ax in enumerate(axes.flatten()):
        fig.add_subplot(ax)
        topic_words = dict(topics[i][1])
        cloud.generate_from_frequencies(topic_words, max_font_size=300)
        plt.gca().imshow(cloud)
        plt.gca().set_title('Topic ' + str(i), fontdict=dict(size=16))
        plt.gca().axis('off')   
    plt.subplots_adjust(wspace=0, hspace=0)
    plt.axis('off')
    plt.margins(x=0, y=0)
    plt.tight_layout()
    plt.show()
    
        
    #Next we'll do a t-SNE plot clustering the abstracts based on the topic
    #probabilities returned from the model. This creates an array where each
    #column is a topic, each row is an abstract, and each entry is the probability
    #that the abstract belongs to that topic.
    col_ns = range(0,n_topics)
    topic_weights = pd.DataFrame(columns = col_ns)
    for i in range(0, len(ldamodel[abstract_term_matrix])):
        weights = ldamodel[abstract_term_matrix][i]
        entry = pd.DataFrame(columns=col_ns)  # one row of topic weights per abstract
        for j in range(0, len(weights)):
            idx = weights[j][0]
            entry.loc[0, idx] = weights[j][1]
        topic_weights = topic_weights.append(entry)
    topic_weights.reset_index(drop = True, inplace = True)
    
    # Replace any nan entries (because there was zero probability the 
    #abstract belonged in that topic) with zero
    arr = pd.DataFrame(topic_weights).fillna(0).values
    
    # We can limit this to only well separated abstracts as well
    #arr = arr[np.amax(arr, axis=1) > 0.15]
    
    # This pulls out the highest-probability topic for each abstract.  We'll
    #use this for the color scheme in the t-SNE plot.
    topic_num = np.argmax(arr, axis=1)
    
    # Here we initialize and fit our t-SNE model
    tsne_model = TSNE(n_components=2, verbose=1, random_state=0, angle=.99, init='pca')
    tsne_lda = tsne_model.fit_transform(arr)
    
    #Here we plot out the results for the t-SNE transformation
      
    mycolors = np.array([color for name, color in mcolors.TABLEAU_COLORS.items()])
    title ="t-SNE Clustering of {} LDA Topics".format(n_topics)
    f = plt.figure()
    plt.scatter(x=tsne_lda[:,0], y=tsne_lda[:,1], color=mycolors[topic_num])
    plt.title(title)
    plt.show()
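# A hedged usage sketch for program_clusters; the CSV path and the program
# element codes below are placeholders, not from the original. The data is
# assumed to contain 'ProgramElementCode(s)', 'AwardNumber' and 'Abstract'
# columns, as used inside the function.
awards = pd.read_csv("nsf_awards_example.csv")  # hypothetical input file
program_clusters(['7479', '1385'], n_topics=5, data=awards)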