    def test_transaction(self):
        from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends as T_ChannelTopicTrends
        #
        # class T_ChannelTopicTrends(ChannelTopicTrends, Transactional):
        #     def upsert(self, w=1):
        #         return Transactional.upsert(self)

        time_slot = datetime_to_timeslot(now(), 'month')
        topic = 'laptop'
        from itertools import cycle
        colliding_topics = [
            "oldie", "bt subscribers",
            "pisces woman", "layman"]
        gen_colliding_topics = cycle(colliding_topics)

        status = 0

        def incr_task(topic):
            T_ChannelTopicTrends.increment(channel=self.channel,
                                           time_slot=time_slot,
                                           topic=topic,
                                           status=status,
                                           intention_ids=[1],
                                           inc_dict={'topic_count': 1})
            return True

        # get_var('_TEST_TRANSACTION_FAILURE') = True
        settings.DEBUG      = True
        settings.USE_CELERY = False

        from multiprocessing import Process

        proc_num = 100
        processes = [Process(target=incr_task,
                             args=(gen_colliding_topics.next(),))
                     for _ in range(proc_num)]
        for proc in processes:
            proc.start()
        for proc in processes:
            proc.join()

        for topic in colliding_topics:
            doc = T_ChannelTopicTrends(
                channel   = self.channel,
                time_slot = time_slot,
                topic     = topic,
                status    = status
            )
            doc.reload()
            self.assertEqual(doc.filter(intention=1, is_leaf=True)[0].topic_count, proc_num / len(colliding_topics))
            self.assertTrue(doc.version > 1)
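The test above fires 100 concurrent processes at four topics whose hashes collide, then checks that every increment landed and the document version advanced. A minimal sketch of the optimistic-locking retry this presumably exercises (VersionedDoc and try_upsert are invented for illustration; the real logic lives in ChannelTopicTrends.increment/upsert, not shown here):

import threading

class VersionedDoc(object):
    # Stand-in for a ChannelTopicTrends row; the lock plays the role of
    # MongoDB's atomic compare-and-swap on the version field.
    def __init__(self):
        self._lock = threading.Lock()
        self.version = 0
        self.topic_count = 0

    def try_upsert(self, expected_version, new_count):
        with self._lock:
            if self.version != expected_version:
                return False  # lost the race; the caller must retry
            self.topic_count = new_count
            self.version += 1
            return True

def increment(doc):
    while True:  # retry until our compare-and-swap wins
        version, count = doc.version, doc.topic_count
        if doc.try_upsert(version, count + 1):
            return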
Example #3
    def test_duplicate_handle_diff_channels(self):

        channel3 = TwitterChannel.objects.create_by_user(
            self.user,
            title='TestChannel3',
            type='twitter',
            intention_types=SA_TYPES)

        duplicate_post = self._create_db_post(
            channels=[self.channel2, channel3],
            content=self.duplicate_content,
            url=self.url,
            twitter={
                "created_at": "Wed, 06 Aug 2014 18:38:47 +0000",
                "id": "497089420017676290"
            })

        self.assertEqual(len(duplicate_post.channels), 3)

        time_slot = datetime_to_timeslot(now(), 'day')
        ht_stat = ChannelHotTopics.objects.by_time_span(
            channel=self.channel2,
            from_ts=datetime_to_timeslot(None, 'day'),
        )

        tt_stat = ChannelTopicTrends(channel=self.channel2,
                                     time_slot=time_slot,
                                     topic=self.topic,
                                     status=0)

        self.assertEqual(ht_stat, self.hot_topic_stat)
        self.assertEqual(tt_stat, self.topic_trends_stat)
Example #4
def purge_outdated_trends_stats(coll, channel, level, delta):
    initial_timedelta_arg_name = {"hour": "days", "day": "months"}[level]
    timedelta_arg_name = {"hour": "hours", "day": "days"}[level]
    start_dt = now() - relativedelta(**{initial_timedelta_arg_name: delta})
    current_dt = start_dt
    time_step = relativedelta(**{timedelta_arg_name: 1})
    ts = datetime_to_timeslot(current_dt, level)
    zero_counts = 0
    total_records_removed = 0
    EMPTY_SLOTS_NUMBER = 10
    while zero_counts <= EMPTY_SLOTS_NUMBER:
        t0 = datetime.now()
        res = coll.objects.coll.remove(coll.objects.get_query(time_slot=ts))
        if res['n'] == 0:
            zero_counts += 1
        current_dt = current_dt - time_step
        total_records_removed += res['n']
        ts = datetime_to_timeslot(current_dt, level)
        LOGGER.info(
            "purging Q:: collection: %s; func: %s; timedelta: %s; date: %s; level: %s; records removed: %s",
            coll.__name__,
            inspect.stack()[0][3],
            datetime.now() - t0, current_dt, level, res['n'])
    return total_records_removed
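A hypothetical invocation (parameter values are assumptions, not taken from the source): walk hour slots backwards starting TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS days ago, stopping after ten consecutive empty slots.

removed = purge_outdated_trends_stats(
    ChannelTopicTrends, channel, level='hour',
    delta=get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS'))
LOGGER.info("removed %s outdated hour-level trend records", removed)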
Example #5
def extend_trends(channel):
    from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends
    from solariat_bottle.utils.id_encoder import (
        pack_components, CHANNEL_WIDTH, TIMESLOT_WIDTH, BIGGEST_STATUS_VALUE,
        BIGGEST_TOPIC_VALUE, BIGGEST_TIMESOLT_VALUE)

    logger.info("------------------------")
    if channel.is_migrated:
        logger.info("SKIPPING CHANNEL: %s" % channel.title)
        return

    lower_bound = ChannelTopicTrends.make_id(channel, 0, 0, 0)
    upper_bound = ChannelTopicTrends.make_id(channel, BIGGEST_TIMESOLT_VALUE,
                                             BIGGEST_TOPIC_VALUE,
                                             BIGGEST_STATUS_VALUE)
    count = ChannelTopicTrends.objects(id__gte=lower_bound,
                                       id__lte=upper_bound).count()
    logger.info("CHANNEL START: %s (%s)" % (channel.title, count))

    from solariat.db.fields import BytesField
    to_mongo = BytesField().to_mongo  # serializes packed ids to their stored form
    limit = 100
    offset = 0

    while offset <= count:
        logger.info("--> channel: %s offset %s of %s" %
                    (channel.title, offset, count))
        query = ChannelTopicTrends.objects(id__gte=lower_bound,
                                           id__lte=upper_bound)
        query = query.skip(offset).limit(limit)
        for trend in query:
            channel_num, topic_hash, status, time_slot = trend.unpacked
            channel_ts = pack_components(
                (channel_num, CHANNEL_WIDTH),
                (time_slot, TIMESLOT_WIDTH),
            )
            ChannelTopicTrends.objects.coll.update(
                {"_id": to_mongo(trend.id)},
                {"$set": {"ct": to_mongo(channel_ts)}},
                upsert=False)
        offset += limit

    channel.is_migrated = True
    channel.save()

    logger.info("CHANNEL END: %s (%s)" % (channel.title, count))
Example #6
def print_db_records():
    # print "Topics:"
    # for row in ChannelHotTopics.objects():
    #     print "{0: ^14s} | {1: ^4s}".format(row.topic, decode_timeslot(row.time_slot))
    # print
    print "Trends:"
    for row in ChannelTopicTrends.objects():
        print u"{0: ^14s} | {1: ^4s}".format(row.topic,
                                             decode_timeslot(row.time_slot))
    print
    print
Example #7
    def test_number_of_leafs(self):
        """ Note: leaf means it is a stat record for a specific topic (max tri-gram),
                  not a smaller part of the topic
        """
        content = "I need a mac laptop"

        self._create_db_post(content)
        leaf_stats = [
            s for s in ChannelTopicTrends.objects() if s.filter(is_leaf=True)
        ]
        self.assertEqual(
            len(leaf_stats),
            2)  # ("mac laptop") x (hour + day)  #NO __ALL__, it is not a leaf
    def test_number_of_stats_intention_id(self):
        content = "I need a mac laptop"
        # topics: "mac laptop"
        # terms: "mac laptop", "laptop"
        post = self._create_db_post(content)

        intention_title = post.speech_acts[0]['intention_type']
        intention_id = get_sa_type_id(intention_title)
        stats = [
            s for s in ChannelTopicTrends.objects()
            if s.filter(intention=int(intention_id))
        ]
        self.assertEqual(len(stats), (2 + 1) * 2)

        needs_count = sum(
            s.filter(intention=int(intention_id), is_leaf=False)[0].topic_count
            for s in ChannelTopicTrends.objects() if s.topic != ALL_TOPICS)
        self.assertEqual(needs_count, 2 * 2)

        stats = [
            s for s in ChannelTopicTrends.objects() if s.filter(intention=15)
        ]
        self.assertEqual(len(stats), 0)
Example #9
    def test_number_of_nodes(self):
        """ Note: node means it is a stat record for a smaller part of a bigger topic,
                  not a topic itself
        """
        content = "I need a mac laptop"

        self._create_db_post(content)

        node_stats = [
            True for s in ChannelTopicTrends.objects()
            if s.filter(is_leaf=False)
        ]
        self.assertEqual(
            len(node_stats),
            (2 + 1) * 2)  # ("mac laptop", "laptop", "__ALL__") x (hour + day)
Example #10
def trends_mark_to_remove(time_slot, channel_or_tag, counter):
    channel_ts_val = ChannelTopicTrends.make_channel_ts(
        channel_or_tag, time_slot)
    t0 = datetime.now()
    res = ChannelTopicTrends.objects.coll.update(
        {FT("channel_ts"): to_binary(channel_ts_val)},
        {'$set': {
            FT('gc_counter'): counter
        }},
        multi=True)
    LOGGER.info(
        "purging Q:: channel: %s; collection: ChannelTopicTrends; func: %s; timedelta: %s"
        % (channel_or_tag.title, inspect.stack()[0][3], datetime.now() - t0))
    return res
Example #11
    def _store_existing_data(self):
        # Keep track of what was in the database when this was called
        self.ctt = {}
        self.ctt_bk = {}
        self.cht = {}
        self.ct = {}
        self.ctt_count = ChannelTopicTrends.objects.count()
        self.cht_count = ChannelHotTopics.objects.count()
        self.ct_count = ChannelTrends.objects.count()
        for ctt in ChannelTopicTrends.objects():
            self.ctt_bk[ctt.data['_id']] = ctt.data
            self.ctt[ctt.data['_id']] = self._process_es(ctt)
        for cht in ChannelHotTopics.objects():
            self.cht[cht.data['_id']] = self._process_es(cht)
        for ct in ChannelTrends.objects():
            self.ct[ct.data['_id']] = self._process_es(ct)
Example #12
    def test_outdated_trends4(self):
        """
        all existing hour stats should be kept
        """
        date_old = now() - relativedelta(
            days=get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS') - 1, hours=23)
        self._make_laptops_and_icecream(_created=date_old)
        total_trends = ChannelTopicTrends.objects().count()
        hour_trends = total_trends / 2
        day_trends = total_trends / 2

        stats = purge_stats(self.channel)
        self.assertEqual(day_trends, 6)
        self.assertEqual(hour_trends, 6)
        self.assertEqual(stats['discard_junk_stats']['trends_day_count'], 0)
        self.assertEqual(stats['discard_junk_stats']['trends_hour_count'], 0)
Example #13
    def _compare_existing_data(self):
        # Compare what is currently in the database with what we have stored
        for ctt in ChannelTopicTrends.objects():
            for data in ctt.data['es']:
                keys = tuple(sorted(data.keys()))
                values = tuple(sorted(data.values()))
                self.assertTrue((keys, values) in self.ctt[ctt.data['_id']])
        for cht in ChannelHotTopics.objects():
            for data in cht.data['es']:
                keys = tuple(sorted(data.keys()))
                values = tuple(sorted(data.values()))
                self.assertTrue((keys, values) in self.cht[cht.data['_id']])
        for ct in ChannelTrends.objects():
            for data in ct.data['es']:
                keys = tuple(sorted(data.keys()))
                values = tuple(sorted(data.values()))
                self.assertTrue((keys, values) in self.ct[ct.data['_id']])
Example #14
    def test_outdated_trends2(self):
        """
        all existing stats should be kept, cause it's not too old
        """
        date_now = now()

        self._make_laptops_and_icecream(_created=date_now)
        total_trends = ChannelTopicTrends.objects().count()
        hour_trends = total_trends / 2
        day_trends = total_trends / 2

        stats = purge_stats(self.channel)
        self.assertEqual(day_trends, 6)
        self.assertEqual(hour_trends, 6)
        self.assertEqual(stats['discard_junk_stats']['trends_day_count'], 0)
        self.assertEqual(stats['discard_junk_stats']['trends_hour_count'], 0)
Example #15
def trends_mark_to_keep(time_slot, channel_or_tag, topics):
    channel_ts_val = ChannelTopicTrends.make_channel_ts(
        channel_or_tag, time_slot)
    t0 = datetime.now()
    res = ChannelTopicTrends.objects.coll.update(
        {
            FT("channel_ts"): to_binary(channel_ts_val),
            FT('topic'): {
                "$in": topics + ["__ALL__"]
            }
        }, {'$set': {
            FT('gc_counter'): MARKED_TO_KEEP
        }},
        multi=True)
    LOGGER.info(
        "purging Q:: channel: %s; collection: ChannelTopicTrends; func: %s; timedelta: %s"
        % (channel_or_tag.title, inspect.stack()[0][3], datetime.now() - t0))
    return res
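Together with trends_mark_to_remove (Example #10), this is the mark phase of a mark-and-sweep purge: rows for surviving topics get gc_counter=MARKED_TO_KEEP while the rest keep the to-remove counter. A hedged sketch of the sweep that presumably follows (trends_sweep is a hypothetical name; the actual sweep is not shown here):

def trends_sweep(counter):
    # Delete every row still carrying the to-remove counter.
    return ChannelTopicTrends.objects.coll.remove(
        {FT('gc_counter'): counter})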
Example #16
    def test_outdated_trends3(self):
        """
        all existing hour stats should be removed, cause it's too old
        """
        date_now = now()
        date_old = now() - relativedelta(
            days=get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS'), hours=1)

        LOGGER.info(
            "now=%s old=%s keep_days=%s" %
            (date_now, date_old, get_var('TOPIC_TRENDS_HOUR_STATS_KEEP_DAYS')))
        self._make_laptops_and_icecream(_created=date_old)
        total_trends = ChannelTopicTrends.objects().count()
        hour_trends = total_trends / 2
        day_trends = total_trends / 2

        stats = purge_stats(self.channel)
        self.assertEqual(day_trends, 6)
        self.assertEqual(hour_trends, 6)
        self.assertEqual(stats['discard_junk_stats']['trends_day_count'], 0)
        self.assertEqual(stats['discard_junk_stats']['trends_hour_count'], 6)
Example #17
    def setUp(self):

        super(TestDuplicatePostProcessing, self).setUp()

        self.created = now()
        self.url = '%s/posts/%s' % (get_var('HOST_DOMAIN'), str(ObjectId()))
        self.content = "I'm so much want to buy a new laptop"
        self.duplicate_content = "I'm so much want to find a laptop"

        self.channel2 = TwitterChannel.objects.create_by_user(
            self.user,
            title='TestChannel2',
            type='twitter',
            intention_types=SA_TYPES)

        self.post = self._create_db_post(
            channels=[self.channel, self.channel2],
            content=self.content,
            url=self.url,
            twitter={
                "created_at": "Wed, 06 Aug 2014 18:38:47 +0000",
                "id": "497089420017676290"
            })

        time_slot = datetime_to_timeslot(now(), 'day')
        self.topic = "laptop"

        self.hot_topic_stat = ChannelHotTopics.objects.by_time_span(
            channel=self.channel2,
            from_ts=datetime_to_timeslot(None, 'day'),
        )

        self.topic_trends_stat = ChannelTopicTrends(channel=self.channel2,
                                                    time_slot=time_slot,
                                                    topic=self.topic,
                                                    status=0)
Example #18
def _update_monthly_cht_values(channel, from_date_end, to_date_end, topics):
    """ Do upsert on monthly values based on the daily values.
    """
    from solariat.utils.timeslot import datetime_to_timeslot
    from solariat_bottle.utils.id_encoder import get_topic_hash
    from solariat_nlp.utils.topics import get_subtopics

    from solariat_bottle.db.speech_act import SpeechActMap
    from solariat_bottle.db.channel_hot_topics import ChannelHotTopics
    from solariat_bottle.db.channel_topic_trends import ChannelTopicTrends
    from solariat_bottle.db.channel_stats_base import CountDict, batch_insert

    start_time = datetime.now()
    statuses = SpeechActMap.STATUS_NAME_MAP.keys()
    insertable_values = {}

    if not topics:
        logger.warning("No topics found for channel %s." % (channel.title, ))
        return

    month_intervals = _generate_day_level_ranges(from_date_end, to_date_end)
    for topic in topics:
        for from_date, to_date in month_intervals:
            or_query = []
            # $match query: one id range per status for this topic
            for status in statuses:
                from_id = ChannelTopicTrends.make_id(
                    channel, datetime_to_timeslot(from_date, 'day'), topic,
                    status)
                to_id = ChannelTopicTrends.make_id(
                    channel, datetime_to_timeslot(to_date, 'day'), topic,
                    status)
                or_query.append({"_id": {"$gte": from_id, "$lte": to_id}})

            if len(or_query) == 1:
                match_query = or_query[0]
            else:
                match_query = {"$or": or_query}

            pipeline = [{
                "$match": match_query
            }, {
                "$unwind": '$es'
            }, {
                '$group': {
                    '_id': {
                        'grp_at': '$es.at',
                        'grp_if': '$es.if',
                        'grp_in': '$es.in',
                        'grp_le': '$es.le',
                        'grp_tc': '$tc',
                        'grp_ss': '$ss'
                    },
                    'count': {
                        '$sum': '$es.tt'
                    }
                }
            }]
            month_level_counts = {}
            aggregation_result = ChannelHotTopics.objects.coll.aggregate(
                pipeline)
            if aggregation_result['ok']:
                for aggregated_count in aggregation_result['result']:
                    month_id = ChannelHotTopics.make_id(
                        channel=channel,
                        time_slot=datetime_to_timeslot(from_date, 'month'),
                        topic=aggregated_count['_id']['grp_tc'],
                        status=aggregated_count['_id']['grp_ss'])
                    if month_id in month_level_counts:
                        month_doc = month_level_counts[month_id]
                    else:
                        hashed_parents = map(
                            get_topic_hash,
                            get_subtopics(aggregated_count['_id']['grp_tc']))
                        month_doc = ChannelHotTopics(
                            channel=channel,
                            hashed_parents=hashed_parents,
                            time_slot=datetime_to_timeslot(from_date, 'month'),
                            topic=aggregated_count['_id']['grp_tc'],
                            status=aggregated_count['_id']['grp_ss'])
                        month_doc.version = 0
                        month_doc.embedded_dict = {}
                        month_level_counts[month_id] = month_doc

                    es_key = (aggregated_count['_id']['grp_at'],
                              aggregated_count['_id']['grp_if'],
                              aggregated_count['_id']['grp_in'],
                              aggregated_count['_id']['grp_le'])
                    # Default increment for all existing stats to 0; we will add to this later.
                    month_doc.embedded_dict[es_key] = CountDict(
                        {'topic_count': aggregated_count['count']})
                for key in month_level_counts:
                    insertable_values[key] = month_level_counts[key]
            else:
                logger.warning("Pipeline failed. Returned %s." %
                               agreggation_result)

    if insertable_values:
        ChannelHotTopics.objects.coll.remove(
            {'_id': {
                '$in': insertable_values.keys()
            }})
    batch_insert(insertable_values.values())
    logger.info("Integrating monthly level topics took: " +
                str(datetime.now() - start_time))
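_generate_day_level_ranges is referenced above but not shown; a plausible sketch, assuming it yields (start, end) day-level pairs, one per calendar month, covering [from_date_end, to_date_end]:

from datetime import datetime
from dateutil.relativedelta import relativedelta

def _generate_day_level_ranges(from_date, to_date):
    ranges = []
    current = datetime(from_date.year, from_date.month, 1)
    while current <= to_date:
        month_end = current + relativedelta(months=1, days=-1)
        ranges.append((max(current, from_date), min(month_end, to_date)))
        current += relativedelta(months=1)
    return ranges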
Example #19
    def test_multi_post(self):
        contents = [
            'Any recommendations for a basketball scholarship? I need a basketball scholarship.',
            'Any recommendations for a basketball scholarship? I need a basketball scholarship.',
            'I love my display!',
            'My display is just not working out for me :-(',
            'Any recommendations for a display?', 'I like my display'
        ]

        for content in contents:
            post = self._create_db_post(content, channel=self.channel)

        from solariat_bottle.db.speech_act import SpeechActMap

        stats_by_topic_intention = {}

        # Calculate stats by iterating through SAM
        from solariat_bottle.db.post.base import Post
        for post in Post.objects(channels__in=[self.channel.id]):
            for sa in post.speech_acts:
                topics = sa['intention_topics']
                int_id = sa['intention_type_id']
                topics.append('__ALL__')
                for topic in topics:
                    if topic in stats_by_topic_intention:
                        if str(int_id) in stats_by_topic_intention[topic]:
                            stats_by_topic_intention[topic][str(int_id)] += 1
                        else:
                            stats_by_topic_intention[topic][str(int_id)] = 1
                    else:
                        stats_by_topic_intention[topic] = {str(int_id): 1}

        expected_stats_from_sam = {
            u'basketball scholarship': {
                '1': 2,
                '2': 2
            },
            u'display': {
                '1': 1,
                '3': 1,
                '4': 2
            },
            '__ALL__': {
                '1': 3,
                '3': 1,
                '2': 2,
                '4': 2
            }
        }

        self.assertDictEqual(stats_by_topic_intention, expected_stats_from_sam)

        time_slot = datetime_to_timeslot(
            Post.objects(
                channels__in=[self.channel.id]).limit(1)[0].created_at, 'hour')
        status = SpeechActMap.ACTIONABLE

        # Now verify SAM stats correspond to ChannelTopicTrends stats
        for topic, sa_stats in stats_by_topic_intention.iteritems():
            if topic == '__ALL__':
                continue

            stat = ChannelTopicTrends(channel=self.channel,
                                      time_slot=time_slot,
                                      topic=topic,
                                      status=status)
            stat.reload()
            ctt_by_int = {}
            filtered = stat.filter(is_leaf=True, intention__ne=0)

            for s in filtered:
                ctt_by_int[str(s.intention)] = s.topic_count
            self.assertDictEqual(ctt_by_int, sa_stats)
Example #20
    def test_stat_update(self):
        time_slot = datetime_to_timeslot(now(), 'hour')

        topic = 'laptop'
        agent_id = 12345

        stat = ChannelTopicTrends(channel=self.channel,
                                  time_slot=time_slot,
                                  topic=topic,
                                  status=0)

        stat.compute_increments(is_leaf=True,
                                intention_ids=JUNK,
                                agent=None,
                                inc_dict={'topic_count': 1},
                                n=1)
        stat.compute_increments(is_leaf=False,
                                intention_ids=HELP,
                                agent=None,
                                inc_dict={'topic_count': 1},
                                n=1)
        stat.upsert()

        stat = ChannelTopicTrends.objects.get(id=stat.id)

        stat.compute_increments(is_leaf=True,
                                intention_ids=JUNK,
                                agent=agent_id,
                                inc_dict={'topic_count': 2},
                                n=1)

        stat.compute_increments(is_leaf=False,
                                intention_ids=HELP,
                                agent=None,
                                lang_id=EN,
                                inc_dict={'topic_count': 2},
                                n=1)
        stat.upsert()

        stat.reload()

        expected_stats = [
            (ALL_AGENTS, Term, ALL_INTENTIONS_INT, LALL, 1 + 2),  # +2 for EN
            (ALL_AGENTS, Term, HELP, LALL, 1 + 2),
            (ALL_AGENTS, Term, ALL_INTENTIONS_INT, EN, 2),
            (ALL_AGENTS, Term, HELP, EN, 2),
            (ALL_AGENTS, Topic, ALL_INTENTIONS_INT, LALL,
             1 + 2),  # +2 from specific agent
            (ALL_AGENTS, Topic, JUNK, LALL, 1 + 2),
            (agent_id, Topic, ALL_INTENTIONS_INT, LALL, 2),
            (agent_id, Topic, JUNK, LALL, 2)
        ]

        self.assert_stats(stat, expected_stats)

        self.assertFalse(stat.filter(agent=0, is_leaf=True,
                                     intention=10))  # no such combination