def parse_item(response):
    """Wrap a crawler *response* in a UrlItem and yield it downstream.

    Every response reaching this callback is either new or freshly
    updated; the database itself is not modified at this stage.
    """
    page = UrlItem.load_with_content(url=response.url, response=response)
    logging.info("PC get page [%s]:- %s" % (page['id'], page['url']))
    yield page
def __op_new(self, data_loaded, connection):
    """Handle a new page arriving at the extractor.

    Looks up how previously seen pages sharing this page's layout hash
    were extracted.  If one extractor clearly dominates (used more than
    ``config.extractor_same_layout_number`` times), extract
    automatically with it; otherwise park the page in ``__ext_queue``
    so a human can assign an extractor.

    :param data_loaded: decoded message; ``'id'`` identifies the
        UrlItem cached in the extractor inbox.
    :param connection: client connection (not used by this handler).
    """
    import operator  # hoisted from the branch body; stdlib, cheap

    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(
        id=item_id, file_path=config.path_extractor_inbox)
    # maps: extractor-name -> usage count among same-layout pages.
    count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
    log.info(str(maps))
    log.info(count)
    if maps:  # idiomatic truthiness instead of len(maps) > 0
        # Most frequently used extractor for this layout.
        tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
        log.info(float(tar_ext[1]) / len(maps))
        if tar_ext[1] > config.extractor_same_layout_number:
            # Enough agreement: extract automatically and stop here.
            extractor = tool.str2extractor(tar_ext[0])
            self.__extract(item, extractor)
            return
    # No confident extractor: queue the page for manual assignment.
    extractor = config.const_RULE_UNKNOW
    self.__ext_queue[item_id] = {
        "title": item['title'],
        "url": item['url'],
        "filename": item.filename(),
        "decision": item['is_target'],
        "extractor": extractor,
    }
    log.info("[%s]: # %s " % (item_id, extractor))
def parse_item(response):
    """Build a UrlItem from a crawler *response* and yield it.

    Responses reaching this callback are all updated or new pages; the
    database is not changed here.  (Original note asked "is this url in
    URL_LIB" — presumably membership is handled by the caller; TODO
    confirm.)
    """
    item = UrlItem.load_with_content(url=response.url, response=response)
    logging.info("PC get page [%s]:- %s" % (item['id'], item['url']))
    yield item
def __op_new(self, data_loaded, connection):
    """Judge a newly crawled page as target / not-target.

    If the message carries no ``'decision'``, the trained classifier
    decides and reports a confidence.  Otherwise the decision came back
    from the Extractor side (confidence fixed at 100) and is also fed
    back to the classifier as a new training sample.

    High-confidence verdicts are persisted immediately: target pages
    are forwarded to the extractor, non-targets have their cached inbox
    file deleted.  Low-confidence verdicts are parked in
    ``__judge_queue`` for a human decision.

    :param data_loaded: decoded message; must contain ``'id'``, may
        contain ``'decision'``.
    :param connection: client connection (not used by this handler).
    """
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id, file_path=config.path_judge_inbox)
    # Feature vector the judge classifier operates on.
    feature = self.__fe.extract_item(item)
    if 'decision' not in data_loaded.keys():
        # No decision supplied: let the classifier judge.
        decision, confidence = self.__auto_judge(feature)
        log.info("[%s]: [%s] # %s # %s%%" % (item_id, FeatureExtract.str_feature(feature), decision, confidence))
    else:
        # Decision returned from the Extractor: trust it fully and
        # re-learn from it.
        decision, confidence = data_loaded['decision'], 100
        log.info("[%s]: back from Extractor # %s # %s%%" % (item_id, decision, confidence))
        self.__relearn_clf(feature, decision)
    if confidence > config.const_CONFIDENCE_THRESHOLD:
        # Confident enough: record the verdict on the item itself.
        item['is_target'] = decision
        item.save()
        if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE, config.const_IS_TARGET_SIGNLE]:
            # Page is a target (single or multiple): hand to extractor.
            self.__send_to_extractor(item)
        else:
            # Not a target: the cached inbox file is no longer needed.
            os.remove(config.path_judge_inbox + "/%s" % item.filename())
    else:
        # Unsure: mark unknown and queue for a human decision.
        item['is_target'] = config.const_IS_TARGET_UNKNOW
        item.save()
        self.__judge_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "confidence": round(confidence, 2),
            "decision": decision,
            "feature": feature
        }
    pass
def __op_new(self, data_loaded, connection):
    """Handle a new page arriving at the extractor.

    Checks how previously seen pages with the same layout hash were
    extracted; if one extractor dominates (used more than
    ``config.extractor_same_layout_number`` times), extract
    automatically with it, otherwise queue the page in ``__ext_queue``
    for a human to assign an extractor.

    :param data_loaded: decoded message; ``'id'`` identifies the
        UrlItem cached in the extractor inbox.
    :param connection: client connection (not used by this handler).
    """
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id, file_path=config.path_extractor_inbox)
    # maps: extractor-name -> usage count among same-layout pages.
    count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
    log.info(str(maps))
    log.info(count)
    if len(maps) > 0:
        import operator
        # Most frequently used extractor for this layout.
        tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
        log.info(float(tar_ext[1]) / len(maps))
        if tar_ext[1] > config.extractor_same_layout_number:
            # Enough agreement: extract automatically and stop here.
            extractor = tool.str2extractor(tar_ext[0])
            self.__extract(item, extractor)
            return
    # No confident extractor: queue the page for manual assignment.
    extractor = config.const_RULE_UNKNOW
    self.__ext_queue[item_id] = {
        "title": item['title'],
        "url": item['url'],
        "filename": item.filename(),
        "decision": item['is_target'],
        "extractor": extractor
    }
    log.info("[%s]: # %s " % (item_id, extractor))
    pass
def __op_test_rule(self, data_loaded, connection):
    """Run one extraction rule against one attribute of a cached page
    and send the raw result back to the client (rule-editor preview).
    """
    page = UrlItem.load_with_content(
        id=int(data_loaded['id']),
        file_path=config.path_extractor_inbox)
    extracted = self.__ie.extract_attr(
        page,
        rule_id_or_dict=data_loaded['rule'],
        attr_id=int(data_loaded['attrid']))
    tool.send_msg(connection, extracted)
def __op_add_extractor(self, data_loaded, connection):
    """Bind a user-chosen extractor to a queued page, extract it, and
    drop the page from the pending queue.

    :param data_loaded: decoded message with ``'id'`` (queued item) and
        ``'extractor'`` (the extractor to apply).
    :param connection: client connection; "0" is sent back as an ack.
    """
    item_id = int(data_loaded['id'])
    extractor = data_loaded['extractor']
    item = UrlItem.load_with_content(item_id, file_path=config.path_extractor_inbox)
    self.__extract(item, extractor)
    # BUGFIX: delete with the int key the queue was populated with
    # (entries are stored under int(item_id) elsewhere); item['id']
    # may be a string and would then raise KeyError.
    del self.__ext_queue[item_id]
    tool.send_msg(connection, "0")
def __op_test_rule(self, data_loaded, connection):
    """Run one extraction rule against one attribute of a cached page
    and send the raw result back over *connection* (rule-editor
    preview helper).
    """
    item_id = int(data_loaded['id'])
    rule = data_loaded['rule']           # rule id or inline rule dict
    attrid = int(data_loaded['attrid'])  # ontology attribute id
    item = UrlItem.load_with_content(
        id=item_id, file_path=config.path_extractor_inbox)
    tool.send_msg(
        connection,
        self.__ie.extract_attr(item, rule_id_or_dict=rule, attr_id=attrid)
    )
    pass
def __op_add_extractor(self, data_loaded, connection):
    """Apply a user-chosen extractor to a queued page, then drop the
    page from the pending queue and acknowledge with "0".

    NOTE(review): the queue appears to be populated with int keys
    elsewhere, but deletion here uses item['id'] — confirm both are
    the same type, otherwise this del can raise KeyError.
    """
    item_id = int(data_loaded['id'])
    extractor = data_loaded['extractor']
    item = UrlItem.load_with_content(
        item_id, file_path=config.path_extractor_inbox)
    self.__extract(item, extractor)
    del self.__ext_queue[item['id']]
    # "0" acknowledges success to the client.
    tool.send_msg(
        connection,
        "0"
    )
    pass
def __op_preview(self, data_loaded, connection):
    """Send the client a pickled preview of every attribute extracted
    from a page with the requested extractor.

    When no extractor is chosen yet (``const_RULE_UNKNOW``), an empty
    value is shown for each attribute instead of extracting.

    :param data_loaded: decoded message with ``'extractor'`` and,
        when a real extractor is named, ``'id'`` of the cached page.
    :param connection: client connection receiving the pickled list.
    """
    log.info(data_loaded['extractor'])
    if data_loaded['extractor'] == config.const_RULE_UNKNOW:
        # No extractor selected: blank value per attribute id.
        result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
    else:
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            item_id, file_path=config.path_extractor_inbox)
        extractor = data_loaded['extractor']
        result = self.__ie.extract(item, extractor)
    preview = list()
    # BUGFIX: loop variable was named `str`, shadowing the builtin;
    # renamed to `value` (behavior unchanged).
    for att, value in result.iteritems():
        preview.insert(att, dict(name=self.__ie.name(att), value=value))
    log.info(preview)
    tool.send_msg(connection, pickle.dumps(preview, -1))
def __op_preview(self, data_loaded, connection):
    """Send the client a pickled preview of every attribute extracted
    from a page with the requested extractor.

    When no extractor is chosen yet (``const_RULE_UNKNOW``), an empty
    value is shown for each attribute instead of extracting.

    :param data_loaded: decoded message with ``'extractor'`` and,
        when a real extractor is named, ``'id'`` of the cached page.
    :param connection: client connection receiving the pickled list.
    """
    log.info(data_loaded['extractor'])
    if data_loaded['extractor'] == config.const_RULE_UNKNOW:
        # No extractor selected: blank value per attribute id.
        result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
    else:
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            item_id, file_path=config.path_extractor_inbox)
        extractor = data_loaded['extractor']
        result = self.__ie.extract(item, extractor)
    preview = list()
    # BUGFIX: loop variable was named `str`, shadowing the builtin;
    # renamed to `value` (behavior unchanged).
    for att, value in result.iteritems():
        preview.insert(att, dict(name=self.__ie.name(att), value=value))
    log.info(preview)
    tool.send_msg(connection, pickle.dumps(preview, -1))
def __op_new(self, data_loaded, connection):
    """Judge a newly crawled page as target / not-target.

    Without a supplied ``'decision'`` the classifier judges the page
    and reports a confidence; with one (coming back from the
    Extractor, confidence 100) the verdict is also used to re-train
    the classifier.  Confident verdicts are persisted and routed
    (extract or delete the inbox file); uncertain ones go to
    ``__judge_queue`` for a human.

    :param data_loaded: decoded message; must contain ``'id'``, may
        contain ``'decision'``.
    :param connection: client connection (not used by this handler).
    """
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id, file_path=config.path_judge_inbox)
    # Feature vector the judge classifier operates on.
    feature = self.__fe.extract_item(item)
    if 'decision' not in data_loaded.keys():
        # No decision supplied: let the classifier judge.
        decision, confidence = self.__auto_judge(feature)
        log.info("[%s]: [%s] # %s # %s%%" % (item_id, FeatureExtract.str_feature(feature), decision, confidence))
    else:
        # Decision returned from the Extractor: trust it fully and
        # feed it back as a training sample.
        decision, confidence = data_loaded['decision'], 100
        log.info("[%s]: back from Extractor # %s # %s%%" % (item_id, decision, confidence))
        self.__relearn_clf(feature, decision)
    if confidence > config.const_CONFIDENCE_THRESHOLD:
        # pretty sure, save to db, and pass to extract
        item['is_target'] = decision
        item.save()
        if int(item['is_target']) in [
            config.const_IS_TARGET_MULTIPLE,
            config.const_IS_TARGET_SIGNLE
        ]:
            # item is target
            self.__send_to_extractor(item)
        else:
            # item is not target: discard the cached inbox file
            os.remove(config.path_judge_inbox + "/%s" % item.filename())
    else:
        # not sure, put it in queue, involving human-being
        item['is_target'] = config.const_IS_TARGET_UNKNOW
        item.save()
        self.__judge_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "confidence": round(confidence, 2),
            "decision": decision,
            "feature": feature
        }
    pass
def __op_refresh(self, data_loaded, connection):
    """Sweep the pending extractor queue: drop stale entries and
    auto-extract any page whose layout now has a dominant extractor.

    :param data_loaded: decoded message (unused by this handler).
    :param connection: client connection (unused by this handler).
    """
    import os  # local import keeps this handler self-contained

    delete_ids = []
    for key, ent in self.__ext_queue.iteritems():
        item_id = int(key)
        # ROBUSTNESS: skip (and forget) entries whose cached HTML has
        # disappeared from the inbox — matches the guarded sibling
        # implementation of this handler; loading a missing file would
        # otherwise fail mid-sweep.
        if not os.path.isfile(
                config.path_extractor_inbox + "/" + str(item_id) + ".html"):
            delete_ids.append(item_id)
            continue
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)
        # maps: extractor-name -> usage count among same-layout pages.
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if maps:
            import operator
            # Most frequently used extractor for this layout.
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                # Dominant extractor found: extract and dequeue.
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                delete_ids.append(item_id)
    # Deletion deferred so the dict is not mutated while iterating.
    for ent_id in delete_ids:
        del self.__ext_queue[ent_id]
def __op_refresh(self, data_loaded, connection):
    """Sweep the pending extractor queue: drop entries whose inbox
    file vanished and auto-extract any page whose layout now has a
    dominant extractor among previously processed pages.

    :param data_loaded: decoded message (unused by this handler).
    :param connection: client connection (unused by this handler).
    """
    delete_ids = []
    for key, ent in self.__ext_queue.iteritems():
        item_id = int(key)
        # Entry is stale if its cached HTML no longer exists in the
        # extractor inbox; schedule removal and move on.
        if not os.path.isfile(
                config.path_extractor_inbox + "/" + str(item_id) + ".html"):
            delete_ids.append(item_id)
            continue
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)
        # maps: extractor-name -> usage count among same-layout pages.
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if len(maps) > 0:
            import operator
            # Most frequently used extractor for this layout.
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                # Dominant extractor found: extract and dequeue.
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                delete_ids.append(item_id)
    # Deletion deferred so the dict is not mutated while iterating.
    for ent_id in delete_ids:
        del self.__ext_queue[ent_id]
# Manual smoke-test script for InfoExtractor.extract_attr: load cached
# page id=1 from the judge inbox and run one hand-written rule on it.
__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json

# Extractor driven by the seminar ontology definition.
ie = InfoExtractor(config.path_extract_onto + "/seminar.xml", config.path_extract_onto)
item = UrlItem.load_with_content(id=1, file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')

# Ad-hoc rule: take the text of "section#visible-body .logo" from the
# page content, apply action 2, then keep the substring after "H".
rule = {
    "on": "content",
    "scope": {
        "sel": "section#visible-body .logo",
        "target": "text"
    },
    "description": "url",
    "actions": [2],
    "substring": {
        "after": "H",
        "before": ""
    },
}
print ie.extract_attr(item, rule_id_or_dict=rule)
# print json.dumps(ie.map(1), indent=2)
# Manual smoke-test script for InfoExtractor.extract_attr: load cached
# page id=1 from the judge inbox and run one hand-written rule on it.
__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json

# Extractor driven by the seminar ontology definition.
ie = InfoExtractor(config.path_extract_onto + "/seminar.xml", config.path_extract_onto)
item = UrlItem.load_with_content(id=1, file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')

# Ad-hoc rule: take the text of "section#visible-body .logo" from the
# page content, apply action 2, then keep the substring after "H".
rule = {
    "on": "content",
    "scope": {
        "sel": "section#visible-body .logo",
        "target": "text"
    },
    "description": "url",
    "actions": [2],
    "substring": {
        "after": "H",
        "before": ""
    },
}
print ie.extract_attr(item, rule_id_or_dict=rule)
def parse(self, response):
    """Yield a UrlItem built from *response* for the update pipeline."""
    fetched = UrlItem.load_with_content(url=response.url, response=response)
    logging.debug("Updater get page [%s]:- %s" % (fetched['id'], fetched['url']))
    yield fetched