def test_phrase_search(self):
    """phrase_search annotates each topic term with its ES match and scores."""
    mock_es_conn = MagicMock()
    patches = {
        'RangeFilter': DEFAULT,
        'ESRange': DEFAULT,
        'FilteredQuery': DEFAULT,
        'TermsFilter': DEFAULT,
        'MatchAllQuery': DEFAULT,
        'ANDFilter': DEFAULT,
        'get_es_connection': MagicMock(return_value=mock_es_conn),
    }
    my_dict = {'bibleverse': 'test 1:1'}

    def getitem(name):
        return my_dict[name]

    # Defined for symmetry with getitem; only __getitem__ is exercised below.
    def setitem(name, val):
        my_dict[name] = val

    with patch.multiple('topic_analysis.topic_extraction', **patches):
        # Fake ES document: attribute access for text/score, item access
        # (via __getitem__) for stored fields such as 'bibleverse'.
        mock_es_doc = MagicMock()
        mock_es_doc.text = "love and awesome peace on earth! homey!"
        mock_es_doc._meta.score = 99
        mock_es_doc.__getitem__.side_effect = getitem
        mock_es_conn.search.return_value = [mock_es_doc]

        ret = topic_extraction.phrase_search(
            [[{'text': 'love peace', 'weight': 1}]],
            ['john 3:16'],
            date(2014, 1, 1),   # plain ints: 01/07 are octal literals and a SyntaxError on Python 3
            date(2014, 1, 7))

        # one topic returned
        self.assertEqual(1, len(ret))
        ret = ret[0]
        # one topic term
        self.assertEqual(1, len(ret))
        # phrase result
        phrase_ret = ret[0]
        expected = {
            'weight': 1,
            'text': 'love peace',
            'es_score': 99,
            'final_score': 99,
            'bibleverse': 'test 1:1',
            'es_phrase': 'love and awesome peace on earth',
            'tweet_text': 'love and awesome peace on earth! homey!',
        }
        self.assertEqual(expected, phrase_ret)
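
# A minimal, self-contained sketch of the mocking pattern used in the test
# above: MagicMock supports magic methods, so routing __getitem__ through a
# side_effect makes a mock behave like a subscriptable ES document. This
# helper is illustrative only and is not part of the original suite.
def _make_subscriptable_mock(backing):
    """Return a MagicMock whose item access is served by the `backing` dict."""
    m = MagicMock()
    m.__getitem__.side_effect = backing.__getitem__
    return m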
def main(_dt, top_n=3, n_clusters=6, num_days=15):
    # TODO: For some reason the date needs to be incremented by one day
    # to capture all the tweets and bibleverse counts. Probably due to
    # UTC/GMT shenanigans.
    et = _dt + timedelta(days=1)
    st = et - timedelta(days=num_days)
    valid_bv_set = set(BIBLEVERSE_LIST)

    # build a dict mapping created-at date -> bibleverse counter
    data = {}
    for created_at_date, _counter in topic_clustering.get_data_from_store(
            st=st, et=et, valid_bv_set=valid_bv_set):
        data[created_at_date] = _counter

    # filter for the most common bibleverses; returns a DataFrame
    df = topic_clustering.get_most_common_df(data, num=top_n)

    # get bibleverse counts and max counts
    top_df = topic_clustering.get_count_features_df(df)

    # perform clustering
    cluster_data = topic_clustering.build_clusters(top_df, n_clusters=n_clusters)
    cluster_data['dates'] = list(data.keys())

    saved_cluster_data = []
    for label in cluster_data['clusters']:
        # per-cluster record (named `cluster` to avoid shadowing `data` above)
        cluster = {}
        cluster['label'] = int(label)
        cluster['points'] = []
        cluster['bibleverses'] = []
        cluster['cluster_size'] = len(cluster_data['clusters'][label])
        for bibleverse in cluster_data['clusters'][label]:
            cluster['points'].append(
                (df["count_entries"][bibleverse], df["max"][bibleverse]))
            cluster['bibleverses'].append(bibleverse)

        # topic analysis
        bv_tokens, corpus = topic_extraction.build_corpus(
            st, et, cluster_data['clusters'][label])
        topics = topic_extraction.nmf_topic_extraction(corpus, bv_tokens, data=cluster)
        if topics:
            cluster['topics'] = topic_extraction.phrase_search(
                topics, cluster['bibleverses'], st, et)
        saved_cluster_data.append(cluster)

    doc = {
        'date': _dt.strftime("%Y-%m-%d"),
        'start_date': st.strftime("%Y-%m-%d"),
        'end_date': et.strftime("%Y-%m-%d"),
        'num_days': num_days,
        'n_clusters': n_clusters,
        'top_n': top_n,
        'cluster_topics': saved_cluster_data,
    }
    topic_extraction.save_topic_clusters(doc)
    topic_extraction.rank_phrases_and_store(doc)
    logger.debug(json.dumps(doc, indent=2))
    return doc
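
# A minimal sketch of a command-line entry point for main(). This guard is
# an assumption, not part of the original module: it runs the pipeline for
# "yesterday" with the default top_n / n_clusters / num_days parameters.
if __name__ == "__main__":
    from datetime import date, timedelta  # imported here so the sketch is self-contained
    main(date.today() - timedelta(days=1))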