Example no. 1
    def compute(self, item_id):
        # refresh tracked terms (words and sets) if they were updated
        if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type(
                'word'):
            self.list_tracked_words = Term.get_tracked_words_list()
            self.last_refresh_word = time.time()
            self.redis_logger.debug('Tracked word refreshed')
            print('Tracked word refreshed')

        if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type(
                'set'):
            self.set_tracked_words_list = Term.get_set_tracked_words_list()
            self.last_refresh_set = time.time()
            self.redis_logger.debug('Tracked set refreshed')
            print('Tracked set refreshed')

        # Cast message as Item
        item = Item(item_id)
        item_date = item.get_date()
        item_content = item.get_content()

        signal.alarm(self.max_execution_time)

        dict_words_freq = None
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            self.redis_logger.warning(f"{item.get_id()} processing timeout")
        else:
            signal.alarm(0)

        if dict_words_freq:
            # create token statistics
            # for word in dict_words_freq:
            #    Term.create_token_statistics(item_date, word, dict_words_freq[word])
            item_source = item.get_source()

            # check solo words
            ####### # TODO: check if source needed #######
            for word in self.list_tracked_words:
                if word in dict_words_freq:
                    self.new_term_found(word, 'word', item.get_id(),
                                        item_source)

            # check words set
            for elem in self.set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]
                nb_uniq_word = 0

                for word in list_words:
                    if word in dict_words_freq:
                        nb_uniq_word += 1
                if nb_uniq_word >= nb_words_threshold:
                    self.new_term_found(word_set, 'set', item.get_id(),
                                        item_source)
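
Both examples guard the word-frequency computation with signal.alarm() and catch a TimeoutException, but the handler that turns SIGALRM into that exception is not shown in either excerpt. Below is a minimal sketch of how such a guard is typically wired up; the exception class and handler names are illustrative assumptions, not the module's actual definitions.

    import signal

    class TimeoutException(Exception):
        # Illustrative stand-in for the exception caught in the examples above.
        pass

    def timeout_handler(signum, frame):
        # Invoked by the OS when the alarm set via signal.alarm() expires.
        raise TimeoutException

    # Register the handler once (e.g. at module setup). After this,
    # signal.alarm(n) raises TimeoutException if processing exceeds n seconds,
    # and signal.alarm(0) cancels the pending alarm.
    signal.signal(signal.SIGALRM, timeout_handler)
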
Example no. 2
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            signal.alarm(max_execution_time)
            try:
                dict_words_freq = Term.get_text_word_frequency(item_content)
            except TimeoutException:
                print("{0} processing timeout".format(paste.p_rel_path))
                continue
            else:
                signal.alarm(0)

            # create token statistics
            for word in dict_words_freq:
                Term.create_token_statistics(item_date, word,
                                             dict_words_freq[word])

            # check solo words
            for word in list_tracked_words:
                if word in dict_words_freq:
                    new_term_found(word, 'word', item_id, item_date)
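
Both examples rely on Term.get_text_word_frequency(item_content) returning a mapping from token to occurrence count, which the tracking loops then probe with membership tests. A rough, self-contained stand-in is sketched below, assuming simple lower-cased regex tokenization; the real AIL implementation may tokenize differently.

    import re
    from collections import Counter

    def get_text_word_frequency_sketch(content):
        # Hypothetical stand-in for Term.get_text_word_frequency():
        # lower-case the text, split it into word tokens, count each token.
        words = re.findall(r"\w+", content.lower())
        return dict(Counter(words))

With a dict of this shape, the membership checks above (if word in dict_words_freq) and the word-set threshold count in Example no. 1 work unchanged.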