def compute(self, message):
    obj_id = Item.get_item_id(message)

    # Extract info from message
    content = Item.get_item_content(obj_id)
    date = Item.get_item_date(obj_id)

    for decoder in self.decoder_order:  # add threshold and size limit
        # Cap regex execution time: SIGALRM raises TimeoutException inside findall()
        signal.alarm(decoder['max_execution_time'])
        try:
            encoded_list = decoder['regex'].findall(content)
        except TimeoutException:
            encoded_list = []
            self.process.incr_module_timeout_statistic()  # add encoder type
            self.redis_logger.debug(f"{obj_id} processing timeout")
            continue
        else:
            signal.alarm(0)  # cancel the pending alarm

        if encoded_list:
            content = self.decode_string(content, message, date, encoded_list,
                                         decoder['name'], decoder['encoded_min_size'])
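# TimeoutException is used throughout these snippets but never defined in this
# section. A minimal, self-contained sketch of the timeout plumbing the modules
# above assume (handler and exception names are illustrative, not necessarily
# the exact AIL definitions):

import re
import signal

class TimeoutException(Exception):
    """Raised when a guarded call exceeds its time budget."""
    pass

def timeout_handler(signum, frame):
    raise TimeoutException

# Install the handler once at startup (Unix only). Every signal.alarm(n) call
# above then turns into a TimeoutException raised inside the guarded block
# after n seconds of wall-clock time.
signal.signal(signal.SIGALRM, timeout_handler)

# Usage sketch: guard a regex prone to catastrophic backtracking
signal.alarm(5)
try:
    matches = re.findall(r'(a+)+b', 'a' * 30)
except TimeoutException:
    matches = []
else:
    signal.alarm(0)  # cancel the alarm on success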
def compute(self, item_id):
    # Refresh tracked terms if they changed since the last refresh
    if self.last_refresh_word < Term.get_tracked_term_last_updated_by_type('word'):
        self.list_tracked_words = Term.get_tracked_words_list()
        self.last_refresh_word = time.time()
        self.redis_logger.debug('Tracked word refreshed')

    if self.last_refresh_set < Term.get_tracked_term_last_updated_by_type('set'):
        self.set_tracked_words_list = Term.get_set_tracked_words_list()
        self.last_refresh_set = time.time()
        self.redis_logger.debug('Tracked set refreshed')

    # Cast message as Item
    item_date = Item.get_item_date(item_id)
    item_content = Item.get_item_content(item_id)

    # Cap tokenization time: SIGALRM raises TimeoutException
    signal.alarm(self.max_execution_time)
    dict_words_freq = None
    try:
        dict_words_freq = Term.get_text_word_frequency(item_content)
    except TimeoutException:
        self.redis_logger.warning("{0} processing timeout".format(item_id))
    else:
        signal.alarm(0)

    if dict_words_freq:
        # create token statistics
        # for word in dict_words_freq:
        #     Term.create_token_statistics(item_date, word, dict_words_freq[word])

        # Check solo words
        for word in self.list_tracked_words:
            if word in dict_words_freq:
                self.new_term_found(word, 'word', item_id, item_date)

        # Check word sets: a set fires when at least nb_words_threshold of its
        # distinct words appear in the item
        for list_words, nb_words_threshold, word_set in self.set_tracked_words_list:
            nb_uniq_word = 0
            for word in list_words:
                if word in dict_words_freq:
                    nb_uniq_word += 1
            if nb_uniq_word >= nb_words_threshold:
                self.new_term_found(word_set, 'set', item_id, item_date)
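# A standalone illustration of the set-matching rule above. The tracked-set
# structure (list_words, nb_words_threshold, set_name) is taken from the
# unpacking in compute(); Term.get_text_word_frequency is assumed to return a
# {word: count} mapping, for which a simplified stand-in is used here.

from collections import Counter

def text_word_frequency(text):
    # Simplified stand-in for Term.get_text_word_frequency
    return Counter(text.lower().split())

def matching_sets(tracked_sets, words_freq):
    # A tracked set fires when at least nb_words_threshold of its distinct
    # words occur in the item, regardless of individual frequencies.
    matches = []
    for list_words, nb_words_threshold, set_name in tracked_sets:
        nb_uniq_word = sum(1 for word in list_words if word in words_freq)
        if nb_uniq_word >= nb_words_threshold:
            matches.append(set_name)
    return matches

tracked = [(['bitcoin', 'ransom', 'decrypt'], 2, 'ransomware-set')]
freq = text_word_frequency("pay the ransom in bitcoin within 48 hours")
assert matching_sets(tracked, freq) == ['ransomware-set']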
publisher.info("Script TermTrackerMod started") config_section = 'TermTrackerMod' p = Process(config_section) max_execution_time = p.config.getint(config_section, "max_execution_time") full_item_url = p.config.get("Notifications", "ail_domain") + full_item_url while True: item_id = p.get_from_set() if item_id is not None: item_date = Item.get_item_date(item_id) item_content = Item.get_item_content(item_id) signal.alarm(max_execution_time) try: dict_words_freq = Term.get_text_word_frequency(item_content) except TimeoutException: print("{0} processing timeout".format(paste.p_rel_path)) continue else: signal.alarm(0) # create token statistics for word in dict_words_freq: Term.create_token_statistics(item_date, word, dict_words_freq[word])
# iban_regex = re.compile(r'\b[A-Za-z]{2}[0-9]{2}(?:[ ]?[0-9]{4}){4}(?:[ ]?[0-9]{1,2})?\b')
iban_regex = re.compile(r'\b([A-Za-z]{2}[ \-]?[0-9]{2})(?=(?:[ \-]?[A-Za-z0-9]){9,30})((?:[ \-]?[A-Za-z0-9]{3,5}){2,6})([ \-]?[A-Za-z0-9]{1,3})\b')
iban_regex_verify = re.compile(r'^([A-Z]{2})([0-9]{2})([A-Z0-9]{9,30})$')

while True:
    message = p.get_from_set()
    if message is not None:
        obj_id = Item.get_item_id(message)
        content = Item.get_item_content(obj_id)

        # Cap regex execution time: SIGALRM raises TimeoutException
        signal.alarm(max_execution_time)
        try:
            l_iban = iban_regex.findall(content)
        except TimeoutException:
            print("{0} processing timeout".format(obj_id))
            continue
        else:
            signal.alarm(0)

        if l_iban:
            check_all_iban(l_iban, obj_id)
    else:
        publisher.debug("Script BankAccount is Idling 10s")
        time.sleep(10)  # matches the log message; avoids busy-waiting on an empty queue
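# check_all_iban is not shown in this section. Candidates that survive
# iban_regex_verify are typically validated with the ISO 7064 mod-97 check
# from ISO 13616: move the first four characters to the end, map letters to
# numbers (A=10 ... Z=35), and the resulting integer must be congruent to
# 1 mod 97. A self-contained sketch of that check (not the AIL implementation):

def is_valid_iban(iban):
    # Strip the separators the capture regex tolerates
    iban = iban.replace(' ', '').replace('-', '').upper()
    if not iban_regex_verify.match(iban):
        return False
    # Rearrange and convert: 'GB82WEST...' -> 'WEST...GB82' -> decimal digits
    # int(ch, 36) maps '0'-'9' to 0-9 and 'A'-'Z' to 10-35
    rearranged = iban[4:] + iban[:4]
    digits = ''.join(str(int(ch, 36)) for ch in rearranged)
    return int(digits) % 97 == 1

assert is_valid_iban('GB82 WEST 1234 5698 7654 32')
assert not is_valid_iban('GB82 WEST 1234 5698 7654 33')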