## Assumed imports for this module (`p` is the project's own pipeline
## helpers module; its real name isn't shown in this snippet).
import collections
import itertools
import pickle

import numpy as np
import pandas as pd


def save_data_for_frontend(model, vectorizer, df):
    ## per topic, the ids and probabilities of its most probable documents
    ## (the [-5:-1] slice keeps ranks 5..2, excluding the single top doc).
    doc_ids = np.argsort(model.doc_topic_, axis=0)[-5:-1, :].T
    doc_probs = np.sort(model.doc_topic_, axis=0)[-5:-1, :].T
    topic_total_probs = np.sum(doc_probs, axis=1)  ## currently unused

    ## extract and prepare most probable words:
    ## split bigrams and take the unique set of the resulting word list.
    w = p.most_probable_words(model, vectorizer.get_feature_names(), 10)
    word_data = collections.defaultdict(list)
    for topic, g in w.groupby('topic'):
        tokens = itertools.chain(
            *g.sort_values('prob', ascending=False)['word'].str.split(' ').values)
        word_data[topic] = ', '.join(t.capitalize() for t in p.unique(tokens))

    with open('frontend/app/word_data.pkl', 'wb') as f:
        pickle.dump(word_data, f)

    ## reshape ids and probabilities to long format and join them, giving
    ## one (topic, rank, key, prob) row per highly ranked document.
    di = pd.DataFrame(doc_ids)
    di['topic'] = di.index
    di = pd.melt(di, id_vars='topic')
    di.columns = ['topic', 'rank', 'key']

    dp = pd.DataFrame(doc_probs)
    dp['topic'] = dp.index
    dp = pd.melt(dp, id_vars='topic')
    dp.columns = ['topic', 'rank', 'prob']

    dd = pd.merge(di, dp)

    ## merge in document data for the most probable documents.
    df['topic'] = np.argmax(model.doc_topic_, axis=1)
    df['topic_prob'] = np.max(model.doc_topic_, axis=1)
    df['key'] = df.index
    most_probable_docs = pd.merge(df, dd)

    ## TODO: do the decoding here.
    ## split ingredient text into lines, dropping empty lines per document.
    most_probable_docs['ingredient_txt'] = [
        [line for line in lines if line]
        for lines in most_probable_docs['ingredient_txt'].str.split('\n')]

    doc_data = collections.defaultdict(list)
    for topic, g in most_probable_docs.groupby('topic'):
        rows = g.sort_values('prob')[
            ['ingredient_txt', 'image', 'url', 'title', 'key']].values
        doc_data[topic] = [
            dict(zip(['ingredient', 'image', 'url', 'title', 'key'], r))
            for r in rows]

    with open('frontend/app/doc_data.pkl', 'wb') as f:
        pickle.dump(doc_data, f)

    ## persist the enriched recipe table.
    engine = p.make_engine()
    df.to_sql('clean_recipes', engine, if_exists='replace')
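## A minimal sketch of reading these artifacts back (hypothetical usage;
## `load_frontend_data` is not part of the project, and the real frontend
## loader isn't shown here). The shapes in the comments follow from the
## dumps above.
def load_frontend_data():
    with open('frontend/app/word_data.pkl', 'rb') as f:
        word_data = pickle.load(f)   ## {topic: "Word, Word, ..."}
    with open('frontend/app/doc_data.pkl', 'rb') as f:
        doc_data = pickle.load(f)    ## {topic: [{'ingredient': [...], 'image': ...,
                                     ##           'url': ..., 'title': ..., 'key': ...}, ...]}
    return word_data, doc_data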
## exploratory scratch work against the trained models.

## word2vec: nearest neighbours of a seed phrase (assumes `w2v` is a
## gensim word2vec model trained elsewhere).
w2v.most_similar(positive=['chicken', 'thighs'])

df = p.clean_formatting(df)
assert isinstance(df, pd.DataFrame), "%r is not a DataFrame." % df
assert df.shape == (16526, 7), "Has the wrong shape."

vectorizer, features = p.extract_features(df, title=True)

## trying cosine similarity between topics and a search-term vector.
m = p.run_model(features, n_topics=40, random_state=0, n_iter=100)

## get topics as words:
topics = p.most_probable_words(m, vectorizer.get_feature_names(), 100)

## word to vec: peek at a raw ingredient document.
df['ingredient_txt'].values[0]

## search string:
raw = 'chicken lemon'

## using gensim similarities.
from gensim import corpora, models, similarities
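## A minimal sketch of the similarity lookup the import above sets up (an
## assumption about where this was heading, not the project's finished
## code): treat each topic's top words as a document, build a TF-IDF index
## over them, and rank topics by cosine similarity to the search string.
## The 'topic'/'word' columns come from p.most_probable_words above.
topic_docs = [g['word'].tolist() for _, g in topics.groupby('topic')]
dictionary = corpora.Dictionary(topic_docs)
bows = [dictionary.doc2bow(doc) for doc in topic_docs]
tfidf = models.TfidfModel(bows)
index = similarities.MatrixSimilarity(tfidf[bows], num_features=len(dictionary))
query_bow = dictionary.doc2bow(raw.lower().split())
sims = index[tfidf[query_bow]]     ## cosine similarity of the query to each topic
print(np.argsort(sims)[::-1][:5])  ## five closest topics to 'chicken lemon'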