Esempio n. 1
0
    def __op_new(self, data_loaded, connection):
        """Handle a newly announced item from the extractor inbox.

        The item named by ``data_loaded['id']`` is loaded, the database is
        asked for the entries related to the item's layout hash, and then
        either the extractor named by the highest-valued entry is run
        immediately, or the item is stored in the pending queue under the
        "unknown" rule.

        :param data_loaded: mapping holding an ``'id'`` convertible to int.
        :param connection: not used by this handler.
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)

        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)

        if len(maps) > 0:
            # Pair of `maps` with the largest value; max() keeps the first
            # maximal pair it meets.
            # NOTE(review): values look like per-extractor counts - confirm
            # against db.get_url_with_same_layout_hash.
            best_key, best_value = max(
                maps.iteritems(), key=lambda pair: pair[1])
            log.info(float(best_value) / len(maps))
            if best_value > config.extractor_same_layout_number:
                chosen = tool.str2extractor(best_key)
                self.__extract(item, chosen)
                return

        # Nothing cleared the threshold: park the item in the pending queue.
        extractor = config.const_RULE_UNKNOW
        pending = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "decision": item['is_target'],
            "extractor": extractor,
        }
        self.__ext_queue[item_id] = pending

        log.info("[%s]: # %s " % (item_id, extractor))
Esempio n. 2
0
    def __op_new(self, data_loaded, connection):
        """Register a new inbox item, extracting it at once when possible.

        Steps:
          1. Load the item identified by ``data_loaded['id']`` from the
             extractor inbox.
          2. Fetch ``(count, maps)`` for the item's layout hash and log both.
          3. If the highest value in ``maps`` exceeds
             ``config.extractor_same_layout_number``, run the matching
             extractor and stop.
          4. Otherwise record the item in the pending queue with the
             "unknown" rule.

        :param data_loaded: mapping with an ``'id'`` entry (int-convertible).
        :param connection: accepted but not used here.
        """
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            id=item_id,
            file_path=config.path_extractor_inbox,
        )

        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)

        if len(maps) > 0:
            # Highest-valued (key, value) pair of `maps`.
            winner = max(maps.iteritems(), key=lambda kv: kv[1])
            winner_key = winner[0]
            winner_value = winner[1]
            log.info(float(winner_value) / len(maps))
            if winner_value > config.extractor_same_layout_number:
                extractor = tool.str2extractor(winner_key)
                self.__extract(item, extractor)
                return

        # Fall through: no entry was above the threshold, so queue the item.
        extractor = config.const_RULE_UNKNOW
        self.__ext_queue[item_id] = dict([
            ("title", item['title']),
            ("url", item['url']),
            ("filename", item.filename()),
            ("decision", item['is_target']),
            ("extractor", extractor),
        ])

        log.info("[%s]: # %s " % (item_id, extractor))
Esempio n. 3
0
 def __op_refresh(self, data_loaded, connection):
     """Re-examine every queued item and extract those that now qualify.

     For each key of the pending queue the item is reloaded from the
     extractor inbox and the layout-hash lookup is repeated. When the
     highest value in the returned mapping exceeds
     ``config.extractor_same_layout_number``, the matching extractor is
     run and the item is removed from the queue afterwards.

     :param data_loaded: not used by this handler.
     :param connection: not used by this handler.
     """
     finished = []
     for queue_key, _entry in self.__ext_queue.iteritems():
         item_id = int(queue_key)
         item = UrlItem.load_with_content(
             id=item_id,
             file_path=config.path_extractor_inbox,
         )

         count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
         log.info(str(maps))
         log.info(count)

         if len(maps) > 0:
             # Highest-valued (key, value) pair of `maps`.
             top_key, top_value = max(
                 maps.iteritems(), key=lambda pair: pair[1])
             log.info(float(top_value) / len(maps))
             if top_value > config.extractor_same_layout_number:
                 extractor = tool.str2extractor(top_key)
                 self.__extract(item, extractor)
                 finished.append(item_id)

     # Removal is deferred so the queue is not mutated while iterating it.
     for done_id in finished:
         del self.__ext_queue[done_id]
Esempio n. 4
0
 def __op_refresh(self, data_loaded, connection):
     """Re-check queued items, dropping stale ones and extracting ready ones.

     An entry is scheduled for removal from the pending queue when either
     its ``<id>.html`` file is no longer present in the extractor inbox, or
     the highest value returned by the layout-hash lookup exceeds
     ``config.extractor_same_layout_number`` (in which case the matching
     extractor is run first).

     :param data_loaded: not used by this handler.
     :param connection: not used by this handler.
     """
     to_remove = []
     for queue_key, _entry in self.__ext_queue.iteritems():
         item_id = int(queue_key)

         # Skip (and forget) entries whose HTML file is gone from the inbox.
         html_path = config.path_extractor_inbox+"/"+str(item_id)+".html"
         if not os.path.isfile(html_path):
             to_remove.append(item_id)
             continue

         item = UrlItem.load_with_content(
             id=item_id,
             file_path=config.path_extractor_inbox,
         )
         count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
         log.info(str(maps))
         log.info(count)

         if len(maps) > 0:
             # Highest-valued (key, value) pair of `maps`.
             top = max(maps.iteritems(), key=lambda kv: kv[1])
             log.info(float(top[1]) / len(maps))
             if top[1] > config.extractor_same_layout_number:
                 extractor = tool.str2extractor(top[0])
                 self.__extract(item, extractor)
                 to_remove.append(item_id)

     # Removal is deferred so the queue is not mutated while iterating it.
     for stale_id in to_remove:
         del self.__ext_queue[stale_id]