Example #1
    def compute(self, message):

        obj_id = Item.get_item_id(message)

        # Extract info from message
        content = Item.get_item_content(obj_id)
        date = Item.get_item_date(obj_id)

        for decoder in self.decoder_order:  # add threshold and size limit
            # max execution time on regex
            signal.alarm(decoder['max_execution_time'])

            try:
                encoded_list = decoder['regex'].findall(content)
            except TimeoutException:
                encoded_list = []
                self.process.incr_module_timeout_statistic()  # add encoder type
                self.redis_logger.debug(f"{obj_id} processing timeout")
                continue
            else:
                signal.alarm(0)

                if (len(encoded_list) > 0):
                    content = self.decode_string(content, message, date,
                                                 encoded_list, decoder['name'],
                                                 decoder['encoded_min_size'])
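
All of these excerpts interrupt slow regex and parsing steps with signal.alarm and catch a TimeoutException, but the handler wiring itself is not shown. A minimal sketch of the SIGALRM setup they assume (the handler name is hypothetical):

import signal

class TimeoutException(Exception):
    """Raised when a step exceeds its time budget."""
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# Installed once at start-up; signal.alarm(n) then aborts any step that
# runs longer than n seconds by raising TimeoutException in this process.
signal.signal(signal.SIGALRM, timeout_handler)
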
Example #2
    def compute(self, item_id):
        # refresh Tracked term
        if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type(
                'word'):
            self.list_tracked_words = Term.get_tracked_words_list()
            self.last_refresh_word = time.time()
            self.redis_logger.debug('Tracked word refreshed')

        if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type(
                'set'):
            self.set_tracked_words_list = Term.get_set_tracked_words_list()
            self.last_refresh_set = time.time()
            self.redis_logger.debug('Tracked set refreshed')

        # Cast message as Item
        item_date = Item.get_item_date(item_id)
        item_content = Item.get_item_content(item_id)

        signal.alarm(self.max_execution_time)

        dict_words_freq = None
        try:
            dict_words_freq = Term.get_text_word_frequency(item_content)
        except TimeoutException:
            self.redis_logger.warning("{0} processing timeout".format(item_id))
        else:
            signal.alarm(0)

        if dict_words_freq:
            # create token statistics
            #for word in dict_words_freq:
            #    Term.create_token_statistics(item_date, word, dict_words_freq[word])

            # check solo words
            for word in self.list_tracked_words:
                if word in dict_words_freq:
                    self.new_term_found(word, 'word', item_id, item_date)

            # check words set
            for elem in self.set_tracked_words_list:
                list_words = elem[0]
                nb_words_threshold = elem[1]
                word_set = elem[2]
                nb_uniq_word = 0

                # a set matches when at least nb_words_threshold of its
                # words appear in the item
                for word in list_words:
                    if word in dict_words_freq:
                        nb_uniq_word += 1
                if nb_uniq_word >= nb_words_threshold:
                    self.new_term_found(word_set, 'set', item_id, item_date)
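
The set check above counts how many distinct words of a tracked set occur in the item and reports the set once the threshold is reached. A standalone illustration of that logic with hypothetical data:

# Hypothetical word-frequency dict and tracked set: (word list, number of
# distinct matches required, set label).
dict_words_freq = {'bitcoin': 3, 'wallet': 1, 'invoice': 2}
tracked_set = (['bitcoin', 'wallet', 'seed'], 2, 'crypto-theft')

list_words, nb_words_threshold, word_set = tracked_set
nb_uniq_word = sum(1 for word in list_words if word in dict_words_freq)
if nb_uniq_word >= nb_words_threshold:
    print(f"tracked set '{word_set}' matched ({nb_uniq_word} distinct words)")
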
Example #3
    publisher.info("Script TermTrackerMod started")

    config_section = 'TermTrackerMod'
    p = Process(config_section)
    max_execution_time = p.config.getint(config_section, "max_execution_time")

    full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url

    while True:

        item_id = p.get_from_set()

        if item_id is not None:

            item_date = Item.get_item_date(item_id)
            item_content = Item.get_item_content(item_id)

            signal.alarm(max_execution_time)
            try:
                dict_words_freq = Term.get_text_word_frequency(item_content)
            except TimeoutException:
                print("{0} processing timeout".format(item_id))
                continue
            else:
                signal.alarm(0)

            # create token statistics
            for word in dict_words_freq:
                Term.create_token_statistics(item_date, word,
                                             dict_words_freq[word])
Example #4
    #iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
    iban_regex = re.compile(
        r'\b([A-Za-z]{2}[ \-]?[0-9]{2})(?=(?:[ \-]?[A-Za-z0-9]){9,30})((?:[ \-]?[A-Za-z0-9]{3,5}){2,6})([ \-]?[A-Za-z0-9]{1,3})\b'
    )
    iban_regex_verify = re.compile(r'^([A-Z]{2})([0-9]{2})([A-Z0-9]{9,30})$')

    while True:

        message = p.get_from_set()

        if message is not None:

            obj_id = Item.get_item_id(message)

            content = Item.get_item_content(obj_id)

            signal.alarm(max_execution_time)
            try:
                l_iban = iban_regex.findall(content)
            except TimeoutException:
                print("{0} processing timeout".format(obj_id))
                continue
            else:
                signal.alarm(0)

            if (len(l_iban) > 0):
                check_all_iban(l_iban, obj_id)

        else:
            publisher.debug("Script BankAccount is Idling 10s")
            # wait before polling the queue again
            time.sleep(10)
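
check_all_iban is not included in these excerpts. Candidates that match iban_regex are typically normalised and then validated with the ISO 7064 mod-97 check; a sketch of such a helper (the function name is an assumption, not part of the module above):

import re

iban_regex_verify = re.compile(r'^([A-Z]{2})([0-9]{2})([A-Z0-9]{9,30})$')

def is_valid_iban(iban):
    # Strip the separators tolerated by iban_regex and normalise the case.
    iban = iban.replace(' ', '').replace('-', '').upper()
    if not iban_regex_verify.match(iban):
        return False
    # Mod-97 check: move the country code and check digits to the end,
    # map letters to 10..35, and the resulting number must be 1 mod 97.
    rearranged = iban[4:] + iban[:4]
    digits = ''.join(str(int(c, 36)) for c in rearranged)
    return int(digits) % 97 == 1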