def train_model(data_filename, num_topics=10, num_passes=1, topic_x_word=True, email_x_topic=True, binarize=False):
    """Fit a gensim LDA model to the bag-of-words data stored in a file.

    Args:
        data_filename: Path handed to ``eio.load_bow`` to load the data.
        num_topics: Number of topics passed to ``LdaModel``.
        num_passes: Value passed as ``passes`` to ``LdaModel``.
        topic_x_word: When true, also return the result of
            ``extract_topic_word_dist(model)``.
        email_x_topic: When true, also return the result of
            ``run_inference(model, corpus)``.
        binarize: When true, every non-zero entry of the loaded data is
            overwritten with 1 before training.

    Returns:
        The trained model alone when both output flags are false; otherwise
        a tuple that starts with the model, followed by the email-by-topic
        result (if requested) and then the topic-by-word result (if
        requested).
    """
    print("Loading data...")
    bow = eio.load_bow(data_filename)
    if binarize:
        nonzero_idx = np.nonzero(bow)
        bow[nonzero_idx] = 1

    # Converting to gensim corpus (documents_columns=False is passed, so
    # documents are taken from the rows of the loaded data).
    corpus = gensim.matutils.Dense2Corpus(bow, documents_columns=False)

    print("Beginning training...")
    lda = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=num_topics,
        passes=num_passes,
        alpha=0.01,
        eta=0.01,
    )
    print("Done")

    # Collect the optional extras in the fixed order callers unpack them in:
    # model first, then email-by-topic, then topic-by-word.
    outputs = [lda]
    if email_x_topic:
        outputs.append(run_inference(lda, corpus))
    if topic_x_word:
        outputs.append(extract_topic_word_dist(lda))

    if len(outputs) == 1:
        return lda
    return tuple(outputs)
def run_inference_over_file(data_filename, model_filename, save=False, num_topics=None, output_prefix=None):
    """Run a saved model's inference over bag-of-words data loaded from a file.

    Args:
        data_filename: Path handed to ``eio.load_bow`` to load the data.
        model_filename: Path handed to ``mi.load_model`` to load the model.
        save: When true, the inference result is also written to the
            ``'email_x_topic'`` path produced by ``output_filenames``.
        num_topics: Required when ``save`` is true; forwarded to
            ``output_filenames``.
        output_prefix: Required when ``save`` is true; forwarded to
            ``output_filenames``.

    Returns:
        The value returned by ``run_inference(model, corpus)``.

    Raises:
        ValueError: If ``save`` is true but ``num_topics`` or
            ``output_prefix`` is ``None``.
    """
    # Validate the save arguments before any loading or inference, so a
    # misconfigured call fails immediately instead of after the whole run.
    # Explicit raises (rather than ``assert``) keep the check under ``-O``.
    if save:
        if num_topics is None:
            raise ValueError("num_topics is required when save=True")
        if output_prefix is None:
            raise ValueError("output_prefix is required when save=True")

    print("Loading data...")
    data = eio.load_bow(data_filename)

    # Converting to gensim corpus (documents_columns=False is passed, so
    # documents are taken from the rows of the loaded data).
    corpus = gensim.matutils.Dense2Corpus(data, documents_columns=False)

    print("Loading model...")
    model = mi.load_model(model_filename)

    ext = run_inference(model, corpus)

    if save:
        # NOTE(review): ext is used via .shape and .tofile, so it is
        # presumably a numpy array with one row per email -- confirm
        # against run_inference.
        num_emails = ext.shape[0]
        filenames = output_filenames(output_prefix, num_emails, num_topics)
        ext.tofile(filenames['email_x_topic'])

    return ext