def test_transaction(self):
    from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends as T_ChannelTopicTrends
    #
    # class T_ChannelTopicTrends(ChannelTopicTrends, Transactional):
    #     def upsert(self, w=1):
    #         return Transactional.upsert(self)

    time_slot = datetime_to_timeslot(now(), 'month')
    topic = 'laptop'

    from itertools import cycle
    colliding_topics = ["oldie", "bt subscribers", "pisces woman", "layman"]
    gen_colliding_topics = cycle(colliding_topics)
    status = 0

    def incr_task(topic):
        T_ChannelTopicTrends.increment(channel=self.channel,
                                       time_slot=time_slot,
                                       topic=topic,
                                       status=status,
                                       intention_ids=[1],
                                       inc_dict={'topic_count': 1})
        return True

    # get_var('_TEST_TRANSACTION_FAILURE') = True
    settings.DEBUG = True
    settings.USE_CELERY = False

    from multiprocessing import Process
    proc_num = 100
    processes = [Process(target=incr_task, args=(gen_colliding_topics.next(),))
                 for i in range(proc_num)]
    for proc in processes:
        proc.start()
    for proc in processes:
        proc.join()

    for topic in colliding_topics:
        colliding_topics.index(topic)
        doc = T_ChannelTopicTrends(channel=self.channel,
                                   time_slot=time_slot,
                                   topic=topic,
                                   status=status)
        doc.reload()
        self.assertEqual(doc.filter(intention=1, is_leaf=True)[0].topic_count,
                         proc_num / len(colliding_topics))
        self.assertTrue(doc.version > 1)
def test_duplicate_handle_diff_channels(self):
    channel3 = TwitterChannel.objects.create_by_user(
        self.user,
        title='TestChannel3',
        type='twitter',
        intention_types=SA_TYPES)
    duplicate_post = self._create_db_post(
        channels=[self.channel2, channel3],
        content=self.duplicate_content,
        url=self.url,
        twitter={
            "created_at": "Wed, 06 Aug 2014 18:38:47 +0000",
            "id": "497089420017676290"
        })
    self.assertEqual(len(duplicate_post.channels), 3)

    time_slot = datetime_to_timeslot(now(), 'day')
    ht_stat = ChannelHotTopics.objects.by_time_span(
        channel=self.channel2,
        from_ts=datetime_to_timeslot(None, 'day'),
    )
    tt_stat = ChannelTopicTrends(channel=self.channel2,
                                 time_slot=time_slot,
                                 topic=self.topic,
                                 status=0)
    self.assertEqual(ht_stat, self.hot_topic_stat)
    self.assertEqual(tt_stat, self.topic_trends_stat)
def purge_outdated_trends_stats(coll, channel, level, delta):
    initial_timedelta_arg_name = {"hour": "days", "day": "months"}[level]
    timedelta_arg_name = {"hour": "hours", "day": "days"}[level]
    start_dt = now() - relativedelta(**{initial_timedelta_arg_name: delta})
    current_dt = start_dt
    time_step = relativedelta(**{timedelta_arg_name: 1})
    ts = datetime_to_timeslot(current_dt, level)
    zero_counts = 0
    total_records_removed = 0
    EMPTY_SLOTS_NUMBER = 10

    while zero_counts <= EMPTY_SLOTS_NUMBER:
        t0 = datetime.now()
        channel_ts_val = ChannelTopicTrends.make_channel_ts(channel, ts)
        res = coll.objects.coll.remove(coll.objects.get_query(time_slot=ts))
        if res['n'] == 0:
            zero_counts += 1
        current_dt = current_dt - time_step
        total_records_removed += res['n']
        ts = datetime_to_timeslot(current_dt, level)
        LOGGER.info(
            "purging Q:: collection: %s; func: %s; timedelta: %s; date: %s; level: %s; records removed: %s",
            coll.__name__, inspect.stack()[0][3], datetime.now() - t0,
            current_dt, level, res['n'])
    return total_records_removed
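# A minimal usage sketch (an assumption, not taken from the original source):
# purge_outdated_trends_stats walks backwards one slot at a time from the
# retention boundary, so a caller would typically invoke it once per
# resolution with the matching retention setting. The settings names come
# from the tests in this module; the wrapper name _purge_channel_trends and
# the calling pattern are hypothetical.
def _purge_channel_trends(channel):
    removed = 0
    removed += purge_outdated_trends_stats(
        ChannelTopicTrends, channel, "hour",
        get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS'))   # hour slots kept for N days
    removed += purge_outdated_trends_stats(
        ChannelTopicTrends, channel, "day",
        get_var('TOPIC_TRENDS_DAY_STATS_KEEP_MONTHS'))  # day slots kept for N months
    return removed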
def extend_trends(channel):
    from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends
    from solariat_bottle.utils.id_encoder import (pack_components,
                                                  CHANNEL_WIDTH,
                                                  TIMESLOT_WIDTH,
                                                  BIGGEST_STATUS_VALUE,
                                                  BIGGEST_TOPIC_VALUE,
                                                  BIGGEST_TIMESOLT_VALUE)

    logger.info("------------------------")
    if channel.is_migrated:
        logger.info("SKIPPING CHANNEL: %s" % channel.title)
        return

    lower_bound = ChannelTopicTrends.make_id(channel, 0, 0, 0)
    upper_bound = ChannelTopicTrends.make_id(channel,
                                             BIGGEST_TIMESOLT_VALUE,
                                             BIGGEST_TOPIC_VALUE,
                                             BIGGEST_STATUS_VALUE)
    count = ChannelTopicTrends.objects(id__gte=lower_bound, id__lte=upper_bound).count()
    logger.info("CHANNEL START: %s (%s)" % (channel.title, count))

    from solariat.db.fields import BytesField
    l = BytesField().to_mongo

    limit = 100
    offset = 0
    while offset <= count:
        logger.info("--> channel: %s offset %s of %s" % (channel.title, offset, count))
        query = ChannelTopicTrends.objects(id__gte=lower_bound, id__lte=upper_bound)
        query = query.skip(offset).limit(100)
        for trend in query:
            channel_num, topic_hash, status, time_slot = trend.unpacked
            channel_ts = pack_components(
                (channel_num, CHANNEL_WIDTH),
                (time_slot, TIMESLOT_WIDTH),
            )
            ChannelTopicTrends.objects.coll.update(
                {"_id": l(trend.id)},
                {"$set": {"ct": l(channel_ts)}},
                upsert=False)
        offset += limit

    channel.is_migrated = True
    channel.save()
    logger.info("CHANNEL END: %s (%s)" % (channel.title, count))
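# A hypothetical driver for the migration above (not part of the original
# source): iterate over channels and extend each one's trend documents.
# extend_trends already skips channels flagged is_migrated, so re-running the
# driver is safe. The Channel import path below is an assumption based on the
# other solariat_bottle imports in this module.
def migrate_all_channels():
    from solariat_bottle.db.channel.base import Channel  # assumed import path
    for channel in Channel.objects():
        extend_trends(channel)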
def print_db_records():
    # print "Topics:"
    # for row in ChannelHotTopics.objects():
    #     print "{0: ^14s} | {1: ^4s}".format(row.topic, decode_timeslot(row.time_slot))
    # print

    print "Trends:"
    for row in ChannelTopicTrends.objects():
        print u"{0: ^14s} | {1: ^4s}".format(row.topic, decode_timeslot(row.time_slot))
    print
    print
def test_number_of_leafs(self):
    """ Note: leaf means it is a stat record for a specific topic (max tri-gram),
        not a smaller part of the topic
    """
    content = "I need a mac laptop"
    self._create_db_post(content)
    leaf_stats = [s for s in ChannelTopicTrends.objects() if s.filter(is_leaf=True)]
    # ("mac laptop") x (hour + day); NO __ALL__, it is not a leaf
    self.assertEqual(len(leaf_stats), 2)
def test_number_of_stats_intention_id(self):
    content = "I need a mac laptop"
    # topics: "mac laptop"
    # terms:  "mac laptop", "laptop"
    post = self._create_db_post(content)
    intention_title = post.speech_acts[0]['intention_type']
    intention_id = get_sa_type_id(intention_title)

    stats = [s for s in ChannelTopicTrends.objects()
             if s.filter(intention=int(intention_id))]
    self.assertEqual(len(stats), (2 + 1) * 2)

    needs_count = sum(
        s.filter(intention=int(intention_id), is_leaf=False)[0].topic_count
        for s in ChannelTopicTrends.objects() if s.topic != ALL_TOPICS)
    self.assertEqual(needs_count, 2 * 2)

    stats = [s for s in ChannelTopicTrends.objects() if s.filter(intention=15)]
    self.assertEqual(len(stats), 0)
def test_number_of_nodes(self):
    """ Note: node means it is a stat record for a smaller part of a bigger topic,
        not a topic itself
    """
    content = "I need a mac laptop"
    self._create_db_post(content)
    node_stats = [True for s in ChannelTopicTrends.objects() if s.filter(is_leaf=False)]
    # ("mac laptop", "laptop", "__ALL__") x (hour + day)
    self.assertEqual(len(node_stats), (2 + 1) * 2)
def trends_mark_to_remove(time_slot, channel_or_tag, counter):
    channel_ts_val = ChannelTopicTrends.make_channel_ts(channel_or_tag, time_slot)
    # import ipdb; ipdb.set_trace()
    t0 = datetime.now()
    res = ChannelTopicTrends.objects.coll.update(
        {FT("channel_ts"): to_binary(channel_ts_val)},
        {'$set': {FT('gc_counter'): counter}},
        multi=True)
    LOGGER.info(
        "purging Q:: channel: %s; collection: ChannelTopicTrends; func: %s; timedelta: %s" %
        (channel_or_tag.title, inspect.stack()[0][3], datetime.now() - t0))
    return res
def _store_existing_data(self):
    # Keep track of what was in the database when this was called
    self.ctt = {}
    self.ctt_bk = {}
    self.cht = {}
    self.ct = {}
    self.ctt_count = ChannelTopicTrends.objects.count()
    self.cht_count = ChannelHotTopics.objects.count()
    self.ct_count = ChannelTrends.objects.count()
    for ctt in ChannelTopicTrends.objects():
        self.ctt_bk[ctt.data['_id']] = ctt.data
        self.ctt[ctt.data['_id']] = self._process_es(ctt)
    for cht in ChannelHotTopics.objects():
        self.cht[cht.data['_id']] = self._process_es(cht)
    for ct in ChannelTrends.objects():
        self.ct[ct.data['_id']] = self._process_es(ct)
def test_outdated_trends4(self):
    """ all existing hour stats should be kept """
    date_now = now()
    date_old = now() - relativedelta(
        days=get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS') - 1, hours=23)
    self._make_laptops_and_icecream(_created=date_old)
    total_trends = ChannelTopicTrends.objects().count()
    hour_trends = total_trends / 2
    day_trends = total_trends / 2
    stats = purge_stats(self.channel)
    self.assertEqual(day_trends, 6)
    self.assertEqual(hour_trends, 6)
    self.assertEqual(stats['discard_junk_stats']['trends_day_count'], 0)
    self.assertEqual(stats['discard_junk_stats']['trends_hour_count'], 0)
def _compare_existing_data(self):
    # Compare what is currently in the database with what we have stored
    for ctt in ChannelTopicTrends.objects():
        for data in ctt.data['es']:
            keys = tuple(sorted(data.keys()))
            values = tuple(sorted(data.values()))
            self.assertTrue((keys, values) in self.ctt[ctt.data['_id']])
    for cht in ChannelHotTopics.objects():
        for data in cht.data['es']:
            keys = tuple(sorted(data.keys()))
            values = tuple(sorted(data.values()))
            self.assertTrue((keys, values) in self.cht[cht.data['_id']])
    for ct in ChannelTrends.objects():
        for data in ct.data['es']:
            keys = tuple(sorted(data.keys()))
            values = tuple(sorted(data.values()))
            self.assertTrue((keys, values) in self.ct[ct.data['_id']])
def test_outdated_trends2(self):
    """ all existing stats should be kept, because they are not too old """
    date_now = now()
    date_old = now() - relativedelta(
        months=get_var('TOPIC_TRENDS_DAY_STATS_KEEP_MONTHS'))
    self._make_laptops_and_icecream(_created=date_now)
    total_trends = ChannelTopicTrends.objects().count()
    hour_trends = total_trends / 2
    day_trends = total_trends / 2
    stats = purge_stats(self.channel)
    self.assertEqual(day_trends, 6)
    self.assertEqual(hour_trends, 6)
    self.assertEqual(stats['discard_junk_stats']['trends_day_count'], 0)
    self.assertEqual(stats['discard_junk_stats']['trends_hour_count'], 0)
def trends_mark_to_keep(time_slot, channel_or_tag, topics):
    channel_ts_val = ChannelTopicTrends.make_channel_ts(channel_or_tag, time_slot)
    t0 = datetime.now()
    res = ChannelTopicTrends.objects.coll.update(
        {
            FT("channel_ts"): to_binary(channel_ts_val),
            FT('topic'): {"$in": topics + ["__ALL__"]}
        },
        {'$set': {FT('gc_counter'): MARKED_TO_KEEP}},
        multi=True)
    LOGGER.info(
        "purging Q:: channel: %s; collection: ChannelTopicTrends; func: %s; timedelta: %s" %
        (channel_or_tag.title, inspect.stack()[0][3], datetime.now() - t0))
    return res
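# A minimal sketch (an assumption, not from the original source) of how the two
# marking helpers above could be combined into a mark-and-sweep purge for one
# time slot: stamp every ChannelTopicTrends row in the slot with a disposable
# GC counter, re-mark the topics worth keeping with MARKED_TO_KEEP, then delete
# whatever still carries the stale counter. The helper name _purge_slot and the
# final remove query are hypothetical.
def _purge_slot(time_slot, channel_or_tag, topics_to_keep, gc_counter):
    trends_mark_to_remove(time_slot, channel_or_tag, gc_counter)
    trends_mark_to_keep(time_slot, channel_or_tag, topics_to_keep)
    # Remove everything in this channel/slot still marked with the disposable counter.
    channel_ts_val = ChannelTopicTrends.make_channel_ts(channel_or_tag, time_slot)
    return ChannelTopicTrends.objects.coll.remove(
        {FT("channel_ts"): to_binary(channel_ts_val),
         FT('gc_counter'): gc_counter})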
def test_outdated_trends3(self):
    """ all existing hour stats should be removed, because they are too old """
    date_now = now()
    date_old = now() - relativedelta(
        days=get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS'), hours=1)
    LOGGER.info("11111111, %s, %s, %s" % (
        date_now, date_old, get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS')))
    self._make_laptops_and_icecream(_created=date_old)
    total_trends = ChannelTopicTrends.objects().count()
    hour_trends = total_trends / 2
    day_trends = total_trends / 2
    stats = purge_stats(self.channel)
    self.assertEqual(day_trends, 6)
    self.assertEqual(hour_trends, 6)
    self.assertEqual(stats['discard_junk_stats']['trends_day_count'], 0)
    self.assertEqual(stats['discard_junk_stats']['trends_hour_count'], 6)
def setUp(self):
    super(TestDuplicatePostProcessing, self).setUp()
    self.created = now()
    self.url = '%s/posts/%s' % (get_var('HOST_DOMAIN'), str(ObjectId()))
    self.content = "I'm so much want to buy a new laptop"
    self.duplicate_content = "I'm so much want to find a laptop"
    self.channel2 = TwitterChannel.objects.create_by_user(
        self.user,
        title='TestChannel2',
        type='twitter',
        intention_types=SA_TYPES)
    self.post = self._create_db_post(
        channels=[self.channel, self.channel2],
        content=self.content,
        url=self.url,
        twitter={
            "created_at": "Wed, 06 Aug 2014 18:38:47 +0000",
            "id": "497089420017676290"
        })
    time_slot = datetime_to_timeslot(now(), 'day')
    self.topic = "laptop"
    self.hot_topic_stat = ChannelHotTopics.objects.by_time_span(
        channel=self.channel2,
        from_ts=datetime_to_timeslot(None, 'day'),
    )
    self.topic_trends_stat = ChannelTopicTrends(channel=self.channel2,
                                                time_slot=time_slot,
                                                topic=self.topic,
                                                status=0)
def _update_monthly_cht_values(channel, from_date_end, to_date_end, topics):
    """ Do upsert on monthly values based on the daily values. """
    from solariat.utils.timeslot import datetime_to_timeslot
    from solariat_bottle.utils.id_encoder import get_topic_hash
    from solariat_nlp.utils.topics import get_subtopics
    from solariat_bottle.db.speech_act import SpeechActMap
    from solariat_bottle.db.channel_hot_topics import ChannelHotTopics
    from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends
    from solariat_bottle.db.channel_stats_base import CountDict, batch_insert

    start_time = datetime.now()
    statuses = SpeechActMap.STATUS_NAME_MAP.keys()
    insertable_values = {}
    if not topics:
        logger.warning("No topics found for channel %s." % (channel.title,))
        return

    month_intervals = _generate_day_level_ranges(from_date_end, to_date_end)
    for topic in topics:
        for from_date, to_date in month_intervals:
            or_query = []  # $match query
            for topic, status in product([topic], statuses):
                from_id = ChannelTopicTrends.make_id(
                    channel, datetime_to_timeslot(from_date, 'day'), topic, status)
                to_id = ChannelTopicTrends.make_id(
                    channel, datetime_to_timeslot(to_date, 'day'), topic, status)
                or_query.append({"_id": {"$gte": from_id, "$lte": to_id}})
            if len(or_query) == 1:
                match_query = or_query[0]
            else:
                match_query = {"$or": or_query}

            pipeline = [
                {"$match": match_query},
                {"$unwind": '$es'},
                {'$group': {
                    '_id': {
                        'grp_at': '$es.at',
                        'grp_if': '$es.if',
                        'grp_in': '$es.in',
                        'grp_le': '$es.le',
                        'grp_tc': '$tc',
                        'grp_ss': '$ss'
                    },
                    'count': {'$sum': '$es.tt'}
                }}
            ]
            month_level_counts = {}
            agreggation_result = ChannelHotTopics.objects.coll.aggregate(pipeline)
            if agreggation_result['ok']:
                for aggregated_count in agreggation_result['result']:
                    month_id = ChannelHotTopics.make_id(
                        channel=channel,
                        time_slot=datetime_to_timeslot(from_date, 'month'),
                        topic=aggregated_count['_id']['grp_tc'],
                        status=aggregated_count['_id']['grp_ss'])
                    if month_id in month_level_counts:
                        month_doc = month_level_counts[month_id]
                    else:
                        hashed_parents = map(
                            get_topic_hash,
                            get_subtopics(aggregated_count['_id']['grp_tc']))
                        month_doc = ChannelHotTopics(
                            channel=channel,
                            hashed_parents=hashed_parents,
                            time_slot=datetime_to_timeslot(from_date, 'month'),
                            topic=aggregated_count['_id']['grp_tc'],
                            status=aggregated_count['_id']['grp_ss'])
                        month_doc.version = 0
                        month_doc.embedded_dict = {}
                        month_level_counts[month_id] = month_doc
                    es_key = (aggregated_count['_id']['grp_at'],
                              aggregated_count['_id']['grp_if'],
                              aggregated_count['_id']['grp_in'],
                              aggregated_count['_id']['grp_le'])
                    # Default increment for all existing stats; we will add to this later.
                    month_doc.embedded_dict[es_key] = CountDict(
                        {'topic_count': aggregated_count['count']})
                for key in month_level_counts:
                    insertable_values[key] = month_level_counts[key]
            else:
                logger.warning("Pipeline failed. Returned %s." % agreggation_result)

    if insertable_values:
        ChannelHotTopics.objects.coll.remove(
            {'_id': {'$in': insertable_values.keys()}})
        batch_insert(insertable_values.values())
    logger.info("Integrating monthly level topics took: " + str(datetime.now() - start_time))
def test_multi_post(self):
    contents = [
        'Any recommendations for a basketball scholarship? I need a basketball scholarship.',
        'Any recommendations for a basketball scholarship? I need a basketball scholarship.',
        'I love my display!',
        'My display is just not working out for me :-(',
        'Any recommendations for a display?',
        'I like my display'
    ]
    for content in contents:
        post = self._create_db_post(content, channel=self.channel)

    from solariat_bottle.db.speech_act import SpeechActMap
    from solariat_bottle.db.post.base import Post

    # Calculating stats by iterating through SAM
    stats_by_topic_intention = {}
    for post in Post.objects(channels__in=[self.channel.id]):
        for sa in post.speech_acts:
            topics = sa['intention_topics']
            int_id = sa['intention_type_id']
            topics.append('__ALL__')
            for topic in topics:
                if topic in stats_by_topic_intention:
                    if str(int_id) in stats_by_topic_intention[topic]:
                        stats_by_topic_intention[topic][str(int_id)] += 1
                    else:
                        stats_by_topic_intention[topic][str(int_id)] = 1
                else:
                    stats_by_topic_intention[topic] = {str(int_id): 1}

    expected_stats_from_sam = {
        u'basketball scholarship': {'1': 2, '2': 2},
        u'display': {'1': 1, '3': 1, '4': 2},
        '__ALL__': {'1': 3, '3': 1, '2': 2, '4': 2}
    }
    self.assertDictEqual(stats_by_topic_intention, expected_stats_from_sam)

    time_slot = datetime_to_timeslot(
        Post.objects(channels__in=[self.channel.id]).limit(1)[0].created_at, 'hour')
    status = SpeechActMap.ACTIONABLE

    # Now verify SAM stats correspond to ChannelTopicTrends stats
    for topic, sa_stats in stats_by_topic_intention.iteritems():
        if topic == '__ALL__':
            continue
        stat = ChannelTopicTrends(channel=self.channel,
                                  time_slot=time_slot,
                                  topic=topic,
                                  status=status)
        stat.reload()
        ctt_by_int = {}
        filtered = stat.filter(is_leaf=True, intention__ne=0)
        for s in filtered:
            ctt_by_int[str(s.intention)] = s.topic_count
        self.assertDictEqual(ctt_by_int, sa_stats)
def test_stat_update(self):
    time_slot = datetime_to_timeslot(now(), 'hour')
    topic = 'laptop'
    agent_id = 12345
    stat = ChannelTopicTrends(channel=self.channel,
                              time_slot=time_slot,
                              topic=topic,
                              status=0)
    stat.compute_increments(is_leaf=True,
                            intention_ids=JUNK,
                            agent=None,
                            inc_dict={'topic_count': 1},
                            n=1)
    stat.compute_increments(is_leaf=False,
                            intention_ids=HELP,
                            agent=None,
                            inc_dict={'topic_count': 1},
                            n=1)
    stat.upsert()

    stat = ChannelTopicTrends.objects.get(id=stat.id)
    stat.compute_increments(is_leaf=True,
                            intention_ids=JUNK,
                            agent=agent_id,
                            inc_dict={'topic_count': 2},
                            n=1)
    stat.compute_increments(is_leaf=False,
                            intention_ids=HELP,
                            agent=None,
                            lang_id=EN,
                            inc_dict={'topic_count': 2},
                            n=1)
    stat.upsert()
    stat.reload()

    expected_stats = [
        (ALL_AGENTS, Term,  ALL_INTENTIONS_INT, LALL, 1 + 2),  # +2 for EN
        (ALL_AGENTS, Term,  HELP,               LALL, 1 + 2),
        (ALL_AGENTS, Term,  ALL_INTENTIONS_INT, EN,   2),
        (ALL_AGENTS, Term,  HELP,               EN,   2),
        (ALL_AGENTS, Topic, ALL_INTENTIONS_INT, LALL, 1 + 2),  # +2 from specific agent
        (ALL_AGENTS, Topic, JUNK,               LALL, 1 + 2),
        (agent_id,   Topic, ALL_INTENTIONS_INT, LALL, 2),
        (agent_id,   Topic, JUNK,               LALL, 2)
    ]
    self.assert_stats(stat, expected_stats)
    self.assertFalse(stat.filter(agent=0, is_leaf=True, intention=10))  # no such combination