# module-level imports used below; the helpers load_topic_model,
# read_articles, read_slack_msgs, get_rank, ascii_text and the path
# data_home are defined elsewhere in this module
import os
import time
import pickle

import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


def __init__(self, model_name, func_tokenizer, func_stemmer):
    '''
    - Pre-condition: pickle files or mongodb
        data_home/ : 'dict_articles.pkl', 'df_articles.pkl',
                     self.model_name + 'W_articles.pkl',
                     self.model_name + 'X_articles.csv'
    - INPUT:
        model_name      str
        func_tokenizer  tokenizer used by model_name
        func_stemmer    stemmer used by model_name
    - Post-condition: self.df_articles, self.W_articles, self.X_articles
    '''
    if model_name == '':
        model_name = 'v2_2'
    self.top_k_recommend = 5
    self.max_rank = 50
    self.top_k_topics = 5
    self.max_cosine_sim_tfidf = 0.5
    self.method = None  # content or rating
    self.max_len_body_text = 600  # 3000
    self.model_name = model_name  # 'v2_2'

    t0 = time.time()  # time it
    self.topic_model = load_topic_model(
        model_name, func_tokenizer, func_stemmer)
    t1 = time.time()  # time it
    print "finished in %4.4f min %s" % ((t1 - t0) / 60, 'loading model\n')

    # load all articles; these include the newest articles, which may not
    # have been used to fit model_name and its related H matrix
    t0 = t1
    df_article_fname = data_home + 'df_articles.pkl'
    dict_article_fname = data_home + 'dict_articles.pkl'
    W_article_fname = data_home + self.model_name + 'W_articles.pkl'
    X_article_fname = data_home + self.model_name + 'X_articles.csv'
    if os.path.exists(df_article_fname):
        print 'found pickle files %s' % df_article_fname
        self.load_articles_from_pickle(
            df_article_fname, W_article_fname, X_article_fname,
            dict_article_fname)
    else:
        print 'no pickle files %s; reading from mongodb' % df_article_fname
        self.df_articles = read_articles()
        self.W_articles, tokenized_articles, self.X_articles = \
            self.topic_model.transform_bodytext2topics(
                self.df_articles.body_text, 1)
        # cache to binary pickles so the next run can skip mongodb
        with open(df_article_fname, 'wb') as out_fh:
            pickle.dump(self.df_articles, out_fh)
        with open(W_article_fname, 'wb') as out_fh:
            pickle.dump(self.W_articles, out_fh)
        with open(X_article_fname, 'wb') as out_fh:
            pickle.dump(self.X_articles, out_fh)
        with open(dict_article_fname, 'wb') as out_fh:
            pickle.dump(self.df_articles.to_dict(), out_fh)

    # print topic_model.sorted_topics_for_articles(W_articles[:1, :])
    self.sorted_topics_articles = self.topic_model.sorted_topics_for_articles(
        self.W_articles)
    t1 = time.time()  # time it
    print 'topics for articles:'
    print "finished in %4.4f min for %s" % ((t1 - t0) / 60,
                                            'topics of articles\n')
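# Usage sketch (illustrative, not from the original source): the __init__
# above belongs to the recommender class, whose name is not shown in this
# excerpt, so `Recommender` below is a hypothetical stand-in. It also
# assumes model 'v2_2' was fit with an english TfidfVectorizer tokenizer
# and an nltk SnowballStemmer.
#
#     from sklearn.feature_extraction.text import TfidfVectorizer
#     from nltk.stem.snowball import SnowballStemmer
#
#     func_tokenizer = TfidfVectorizer(stop_words='english').build_tokenizer()
#     func_stemmer = SnowballStemmer('english')
#     recommender = Recommender('v2_2', func_tokenizer, func_stemmer)
#     # the first run reads mongodb and writes the pickle cache under
#     # data_home; later runs load the pickles directly
#     print recommender.sorted_topics_articles[0]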
def make_recommendation(fname, model_name='v2_2'):
    '''
    test-run content-based recommendation on the content in fname
    (command-line entry point)
    - INPUT:
        fname str   input file name (in folder data/)
    - OUTPUT: prints recommendations; saves similarity plots next to fname
    '''
    # load model
    t0 = time.time()
    func_tokenizer = TfidfVectorizer(stop_words='english').build_tokenizer()
    # model_name = 'v2_2'
    topic_model = load_topic_model(model_name, func_tokenizer)
    t1 = time.time()  # time it
    print "finished in %4.4f min %s" % ((t1 - t0) / 60, 'loading model\n')

    t0 = t1
    print 'fname: %s' % fname
    # read in input
    cleaned_slack = read_slack_msgs(func_tokenizer, fname=fname)
    # print type(cleaned_slack)
    W, tokenized_slacks2, test_X2 = topic_model.transform_bodytext2topics(
        [cleaned_slack], 1)
    print 'topics for slack messages'
    print topic_model.sorted_topics_for_articles(W)
    t1 = time.time()  # time it
    print "finished in %4.4f min %s" % ((t1 - t0) / 60,
                                        'topics of slack message\n')

    # load articles
    t0 = t1
    df_articles = read_articles()
    W_articles, tokenized_articles, X_articles = \
        topic_model.transform_bodytext2topics(df_articles.body_text, 1)
    # print topic_model.sorted_topics_for_articles(W_articles[:1, :])
    sorted_topics_articles = topic_model.sorted_topics_for_articles(W_articles)
    # print sorted_topics_articles[:1]
    t1 = time.time()  # time it
    print '%i articles processed' % df_articles.shape[0]
    print "finished in %4.4f min for %s" % ((t1 - t0) / 60,
                                            'topics of articles\n')

    # test_X2, tokenized_slacks2 = transform_tfidf(vectorizer, [cleaned_slack])
    # test_X2 = test_X2.getA().flatten()

    # summary of input
    top_n = 50
    print "top %i features by tfidf weight in input %s" % (top_n, fname)
    sorted_feature_indexes = np.argsort(test_X2, axis=1)
    # print test_X2[desc_feature_indexes[:top_n]]
    features = topic_model.vectorizer.get_feature_names()
    i_article = 0
    desc_feature_indexes = sorted_feature_indexes[
        i_article, :].getA().flatten()[::-1]
    txt_list = []
    for i in desc_feature_indexes[:top_n]:
        txt_list.append('%s (%.2f)' % (features[i], test_X2[i_article, i]))
    print ', '.join(txt_list)

    # calculate similarity to all articles
    t0 = time.time()
    cosine_similarities = linear_kernel(
        X_articles, test_X2[i_article, :]).flatten()
    cosin_simi_latent_topics = linear_kernel(
        W_articles, W[i_article, :]).flatten()
    cosine_similarities_rank = get_rank(cosine_similarities)
    cosin_simi_latent_topics_rank = get_rank(cosin_simi_latent_topics)
    t1 = time.time()  # time it
    print "finished in %4.4f min for %s" % ((t1 - t0) / 60,
                                            'calculate cosine similarity\n')

    # diagnostic similarity plots
    fig, ax = plt.subplots(1, 2, figsize=(10, 6))
    ax[0].scatter(cosine_similarities, cosin_simi_latent_topics, alpha=0.2)
    ax[0].set_title('cosine similarity: tfidf vs. latent topic')
    ax[1].scatter(
        cosine_similarities_rank, cosin_simi_latent_topics_rank, alpha=0.2)
    ax[1].set_title('rank cosine similarity: tfidf vs. latent topic')
    # plt.show()
    fig.savefig(fname.replace('.txt', '') + '_similarities.png')
    plt.close(fig)

    fig = plt.figure()
    plt.hist(cosine_similarities, bins=30, alpha=0.2, label='tfidf')
    plt.hist(cosin_simi_latent_topics, bins=30, alpha=0.2, label='topics')
    plt.title('cosine similarity to all articles')
    plt.legend()
    fig.savefig(fname.replace('.txt', '') + '_similarity_hist.png')
    # plt.show()

    # recommendations: top tfidf matches that also rank high on latent topics
    print '--------------- recommendations --------------'
    desc_sim_indexes = np.argsort(cosine_similarities)[::-1]
    # cosine_similarities[desc_sim_indexes[:20]]  # debug: peek at top scores
    i_print = 0
    top_k = 5
    for i in desc_sim_indexes[:50]:
        if cosin_simi_latent_topics_rank[i] < 50 and i_print < top_k:
            url = df_articles.iloc[i].url
            print sorted_topics_articles[i][0:2]
            print cosine_similarities[i], cosine_similarities_rank[i], \
                '***' + df_articles.iloc[i].title + '***'
            print cosin_simi_latent_topics[i], cosin_simi_latent_topics_rank[i]
            print url
            body_cleaned = ascii_text(df_articles.iloc[i].body_text[:300])
            print body_cleaned
            print
            i_print = i_print + 1
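# Command-line usage sketch (assumption: this module is run as a script;
# 'data/slack_sample.txt' is a hypothetical input file of slack messages):
#
#     if __name__ == '__main__':
#         import sys
#         fname = sys.argv[1] if len(sys.argv) > 1 else 'data/slack_sample.txt'
#         make_recommendation(fname, model_name='v2_2')
#
# This saves the diagnostic plots next to the input as
# <fname>_similarities.png and <fname>_similarity_hist.png, and prints the
# top-5 recommendations: articles ranked by tfidf cosine similarity that
# also fall in the top 50 by latent-topic similarity.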