def parse_item(response):
    """Wrap a crawler *response* in a UrlItem and yield it downstream.

    Every response reaching this callback is either new or freshly
    updated; the database itself is not modified at this stage.
    """
    page = UrlItem.load_with_content(url=response.url, response=response)
    logging.info("PC get page [%s]:- %s" % (page['id'], page['url']))
    yield page
def __op_new(self, data_loaded, connection):
    """Handle a new page arriving at the extractor.

    Looks up how previously seen pages sharing this page's layout hash
    were extracted.  If one extractor clearly dominates (used more than
    ``config.extractor_same_layout_number`` times), extract
    automatically with it; otherwise park the page in ``__ext_queue``
    so a human can assign an extractor.

    :param data_loaded: decoded message; ``'id'`` identifies the
        UrlItem cached in the extractor inbox.
    :param connection: client connection (not used by this handler).
    """
    import operator  # hoisted from the branch body; stdlib, cheap

    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(
        id=item_id, file_path=config.path_extractor_inbox)
    # maps: extractor-name -> usage count among same-layout pages.
    count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
    log.info(str(maps))
    log.info(count)
    if maps:  # idiomatic truthiness instead of len(maps) > 0
        # Most frequently used extractor for this layout.
        tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
        log.info(float(tar_ext[1]) / len(maps))
        if tar_ext[1] > config.extractor_same_layout_number:
            # Enough agreement: extract automatically and stop here.
            extractor = tool.str2extractor(tar_ext[0])
            self.__extract(item, extractor)
            return
    # No confident extractor: queue the page for manual assignment.
    extractor = config.const_RULE_UNKNOW
    self.__ext_queue[item_id] = {
        "title": item['title'],
        "url": item['url'],
        "filename": item.filename(),
        "decision": item['is_target'],
        "extractor": extractor,
    }
    log.info("[%s]: # %s " % (item_id, extractor))
def parse_item(response):
    """Build a UrlItem from a crawler *response* and yield it.

    Responses reaching this callback are all updated or new pages; the
    database is not changed here.  (Original note asked "is this url in
    URL_LIB" — presumably membership is handled by the caller; TODO
    confirm.)
    """
    item = UrlItem.load_with_content(url=response.url, response=response)
    logging.info("PC get page [%s]:- %s" % (item['id'], item['url']))
    yield item
def __op_new(self, data_loaded, connection):
    """Judge a newly crawled page as target / not-target.

    If the message carries no ``'decision'``, the trained classifier
    decides and reports a confidence.  Otherwise the decision came back
    from the Extractor side (confidence fixed at 100) and is also fed
    back to the classifier as a new training sample.

    High-confidence verdicts are persisted immediately: target pages
    are forwarded to the extractor, non-targets have their cached inbox
    file deleted.  Low-confidence verdicts are parked in
    ``__judge_queue`` for a human decision.

    :param data_loaded: decoded message; must contain ``'id'``, may
        contain ``'decision'``.
    :param connection: client connection (not used by this handler).
    """
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id, file_path=config.path_judge_inbox)
    # Feature vector the judge classifier operates on.
    feature = self.__fe.extract_item(item)
    if 'decision' not in data_loaded.keys():
        # No decision supplied: let the classifier judge.
        decision, confidence = self.__auto_judge(feature)
        log.info("[%s]: [%s] # %s # %s%%" % (item_id, FeatureExtract.str_feature(feature), decision, confidence))
    else:
        # Decision returned from the Extractor: trust it fully and
        # re-learn from it.
        decision, confidence = data_loaded['decision'], 100
        log.info("[%s]: back from Extractor # %s # %s%%" % (item_id, decision, confidence))
        self.__relearn_clf(feature, decision)
    if confidence > config.const_CONFIDENCE_THRESHOLD:
        # Confident enough: record the verdict on the item itself.
        item['is_target'] = decision
        item.save()
        if int(item['is_target']) in [config.const_IS_TARGET_MULTIPLE, config.const_IS_TARGET_SIGNLE]:
            # Page is a target (single or multiple): hand to extractor.
            self.__send_to_extractor(item)
        else:
            # Not a target: the cached inbox file is no longer needed.
            os.remove(config.path_judge_inbox + "/%s" % item.filename())
    else:
        # Unsure: mark unknown and queue for a human decision.
        item['is_target'] = config.const_IS_TARGET_UNKNOW
        item.save()
        self.__judge_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "confidence": round(confidence, 2),
            "decision": decision,
            "feature": feature
        }
    pass
def __op_new(self, data_loaded, connection):
    """Handle a new page arriving at the extractor.

    Checks how previously seen pages with the same layout hash were
    extracted; if one extractor dominates (used more than
    ``config.extractor_same_layout_number`` times), extract
    automatically with it, otherwise queue the page in ``__ext_queue``
    for a human to assign an extractor.

    :param data_loaded: decoded message; ``'id'`` identifies the
        UrlItem cached in the extractor inbox.
    :param connection: client connection (not used by this handler).
    """
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id, file_path=config.path_extractor_inbox)
    # maps: extractor-name -> usage count among same-layout pages.
    count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
    log.info(str(maps))
    log.info(count)
    if len(maps) > 0:
        import operator
        # Most frequently used extractor for this layout.
        tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
        log.info(float(tar_ext[1]) / len(maps))
        if tar_ext[1] > config.extractor_same_layout_number:
            # Enough agreement: extract automatically and stop here.
            extractor = tool.str2extractor(tar_ext[0])
            self.__extract(item, extractor)
            return
    # No confident extractor: queue the page for manual assignment.
    extractor = config.const_RULE_UNKNOW
    self.__ext_queue[item_id] = {
        "title": item['title'],
        "url": item['url'],
        "filename": item.filename(),
        "decision": item['is_target'],
        "extractor": extractor
    }
    log.info("[%s]: # %s " % (item_id, extractor))
    pass
def __op_test_rule(self, data_loaded, connection):
    """Run one extraction rule against one attribute of a cached page
    and send the raw result back to the client (rule-editor preview).
    """
    page = UrlItem.load_with_content(
        id=int(data_loaded['id']),
        file_path=config.path_extractor_inbox)
    extracted = self.__ie.extract_attr(
        page,
        rule_id_or_dict=data_loaded['rule'],
        attr_id=int(data_loaded['attrid']))
    tool.send_msg(connection, extracted)
def __op_add_extractor(self, data_loaded, connection):
    """Bind a user-chosen extractor to a queued page, extract it, and
    drop the page from the pending queue.

    :param data_loaded: decoded message with ``'id'`` (queued item) and
        ``'extractor'`` (the extractor to apply).
    :param connection: client connection; "0" is sent back as an ack.
    """
    item_id = int(data_loaded['id'])
    extractor = data_loaded['extractor']
    item = UrlItem.load_with_content(item_id, file_path=config.path_extractor_inbox)
    self.__extract(item, extractor)
    # BUGFIX: delete with the int key the queue was populated with
    # (entries are stored under int(item_id) elsewhere); item['id']
    # may be a string and would then raise KeyError.
    del self.__ext_queue[item_id]
    tool.send_msg(connection, "0")
def __op_test_rule(self, data_loaded, connection):
    """Run one extraction rule against one attribute of a cached page
    and send the raw result back over *connection* (rule-editor
    preview helper).
    """
    item_id = int(data_loaded['id'])
    rule = data_loaded['rule']           # rule id or inline rule dict
    attrid = int(data_loaded['attrid'])  # ontology attribute id
    item = UrlItem.load_with_content(
        id=item_id, file_path=config.path_extractor_inbox)
    tool.send_msg(
        connection,
        self.__ie.extract_attr(item, rule_id_or_dict=rule, attr_id=attrid)
    )
    pass
def __op_add_extractor(self, data_loaded, connection):
    """Apply a user-chosen extractor to a queued page, then drop the
    page from the pending queue and acknowledge with "0".

    NOTE(review): the queue appears to be populated with int keys
    elsewhere, but deletion here uses item['id'] — confirm both are
    the same type, otherwise this del can raise KeyError.
    """
    item_id = int(data_loaded['id'])
    extractor = data_loaded['extractor']
    item = UrlItem.load_with_content(
        item_id, file_path=config.path_extractor_inbox)
    self.__extract(item, extractor)
    del self.__ext_queue[item['id']]
    # "0" acknowledges success to the client.
    tool.send_msg(
        connection,
        "0"
    )
    pass
def __op_preview(self, data_loaded, connection):
    """Send the client a pickled preview of every attribute extracted
    from a page with the requested extractor.

    When no extractor is chosen yet (``const_RULE_UNKNOW``), an empty
    value is shown for each attribute instead of extracting.

    :param data_loaded: decoded message with ``'extractor'`` and,
        when a real extractor is named, ``'id'`` of the cached page.
    :param connection: client connection receiving the pickled list.
    """
    log.info(data_loaded['extractor'])
    if data_loaded['extractor'] == config.const_RULE_UNKNOW:
        # No extractor selected: blank value per attribute id.
        result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
    else:
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            item_id, file_path=config.path_extractor_inbox)
        extractor = data_loaded['extractor']
        result = self.__ie.extract(item, extractor)
    preview = list()
    # BUGFIX: loop variable was named `str`, shadowing the builtin;
    # renamed to `value` (behavior unchanged).
    for att, value in result.iteritems():
        preview.insert(att, dict(name=self.__ie.name(att), value=value))
    log.info(preview)
    tool.send_msg(connection, pickle.dumps(preview, -1))
def __op_preview(self, data_loaded, connection):
    """Send the client a pickled preview of every attribute extracted
    from a page with the requested extractor.

    When no extractor is chosen yet (``const_RULE_UNKNOW``), an empty
    value is shown for each attribute instead of extracting.

    :param data_loaded: decoded message with ``'extractor'`` and,
        when a real extractor is named, ``'id'`` of the cached page.
    :param connection: client connection receiving the pickled list.
    """
    log.info(data_loaded['extractor'])
    if data_loaded['extractor'] == config.const_RULE_UNKNOW:
        # No extractor selected: blank value per attribute id.
        result = {x: "" for x in xrange(1, self.__ie.num_attr() + 1)}
    else:
        item_id = int(data_loaded['id'])
        item = UrlItem.load_with_content(
            item_id, file_path=config.path_extractor_inbox)
        extractor = data_loaded['extractor']
        result = self.__ie.extract(item, extractor)
    preview = list()
    # BUGFIX: loop variable was named `str`, shadowing the builtin;
    # renamed to `value` (behavior unchanged).
    for att, value in result.iteritems():
        preview.insert(att, dict(name=self.__ie.name(att), value=value))
    log.info(preview)
    tool.send_msg(connection, pickle.dumps(preview, -1))
def __op_new(self, data_loaded, connection):
    """Judge a newly crawled page as target / not-target.

    Without a supplied ``'decision'`` the classifier judges the page
    and reports a confidence; with one (coming back from the
    Extractor, confidence 100) the verdict is also used to re-train
    the classifier.  Confident verdicts are persisted and routed
    (extract or delete the inbox file); uncertain ones go to
    ``__judge_queue`` for a human.

    :param data_loaded: decoded message; must contain ``'id'``, may
        contain ``'decision'``.
    :param connection: client connection (not used by this handler).
    """
    item_id = int(data_loaded['id'])
    item = UrlItem.load_with_content(id=item_id, file_path=config.path_judge_inbox)
    # Feature vector the judge classifier operates on.
    feature = self.__fe.extract_item(item)
    if 'decision' not in data_loaded.keys():
        # No decision supplied: let the classifier judge.
        decision, confidence = self.__auto_judge(feature)
        log.info("[%s]: [%s] # %s # %s%%" % (item_id, FeatureExtract.str_feature(feature), decision, confidence))
    else:
        # Decision returned from the Extractor: trust it fully and
        # feed it back as a training sample.
        decision, confidence = data_loaded['decision'], 100
        log.info("[%s]: back from Extractor # %s # %s%%" % (item_id, decision, confidence))
        self.__relearn_clf(feature, decision)
    if confidence > config.const_CONFIDENCE_THRESHOLD:
        # pretty sure, save to db, and pass to extract
        item['is_target'] = decision
        item.save()
        if int(item['is_target']) in [
            config.const_IS_TARGET_MULTIPLE,
            config.const_IS_TARGET_SIGNLE
        ]:
            # item is target
            self.__send_to_extractor(item)
        else:
            # item is not target: discard the cached inbox file
            os.remove(config.path_judge_inbox + "/%s" % item.filename())
    else:
        # not sure, put it in queue, involving human-being
        item['is_target'] = config.const_IS_TARGET_UNKNOW
        item.save()
        self.__judge_queue[item_id] = {
            "title": item['title'],
            "url": item['url'],
            "filename": item.filename(),
            "confidence": round(confidence, 2),
            "decision": decision,
            "feature": feature
        }
    pass
def __op_refresh(self, data_loaded, connection):
    """Sweep the pending extractor queue: drop stale entries and
    auto-extract any page whose layout now has a dominant extractor.

    :param data_loaded: decoded message (unused by this handler).
    :param connection: client connection (unused by this handler).
    """
    import os  # local import keeps this handler self-contained

    delete_ids = []
    for key, ent in self.__ext_queue.iteritems():
        item_id = int(key)
        # ROBUSTNESS: skip (and forget) entries whose cached HTML has
        # disappeared from the inbox — matches the guarded sibling
        # implementation of this handler; loading a missing file would
        # otherwise fail mid-sweep.
        if not os.path.isfile(
                config.path_extractor_inbox + "/" + str(item_id) + ".html"):
            delete_ids.append(item_id)
            continue
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)
        # maps: extractor-name -> usage count among same-layout pages.
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if maps:
            import operator
            # Most frequently used extractor for this layout.
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                # Dominant extractor found: extract and dequeue.
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                delete_ids.append(item_id)
    # Deletion deferred so the dict is not mutated while iterating.
    for ent_id in delete_ids:
        del self.__ext_queue[ent_id]
def __op_refresh(self, data_loaded, connection):
    """Sweep the pending extractor queue: drop entries whose inbox
    file vanished and auto-extract any page whose layout now has a
    dominant extractor among previously processed pages.

    :param data_loaded: decoded message (unused by this handler).
    :param connection: client connection (unused by this handler).
    """
    delete_ids = []
    for key, ent in self.__ext_queue.iteritems():
        item_id = int(key)
        # Entry is stale if its cached HTML no longer exists in the
        # extractor inbox; schedule removal and move on.
        if not os.path.isfile(
                config.path_extractor_inbox + "/" + str(item_id) + ".html"):
            delete_ids.append(item_id)
            continue
        item = UrlItem.load_with_content(
            id=item_id, file_path=config.path_extractor_inbox)
        # maps: extractor-name -> usage count among same-layout pages.
        count, maps = db.get_url_with_same_layout_hash(item['layout_hash'])
        log.info(str(maps))
        log.info(count)
        if len(maps) > 0:
            import operator
            # Most frequently used extractor for this layout.
            tar_ext = max(maps.iteritems(), key=operator.itemgetter(1))
            log.info(float(tar_ext[1]) / len(maps))
            if tar_ext[1] > config.extractor_same_layout_number:
                # Dominant extractor found: extract and dequeue.
                extractor = tool.str2extractor(tar_ext[0])
                self.__extract(item, extractor)
                delete_ids.append(item_id)
    # Deletion deferred so the dict is not mutated while iterating.
    for ent_id in delete_ids:
        del self.__ext_queue[ent_id]
# Manual smoke-test script for InfoExtractor.extract_attr: load cached
# page id=1 from the judge inbox and run one hand-written rule on it.
__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json

# Extractor driven by the seminar ontology definition.
ie = InfoExtractor(config.path_extract_onto + "/seminar.xml", config.path_extract_onto)
item = UrlItem.load_with_content(id=1, file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')

# Ad-hoc rule: take the text of "section#visible-body .logo" from the
# page content, apply action 2, then keep the substring after "H".
rule = {
    "on": "content",
    "scope": {
        "sel": "section#visible-body .logo",
        "target": "text"
    },
    "description": "url",
    "actions": [2],
    "substring": {
        "after": "H",
        "before": ""
    },
}
print ie.extract_attr(item, rule_id_or_dict=rule)
# print json.dumps(ie.map(1), indent=2)
# Manual smoke-test script for InfoExtractor.extract_attr: load cached
# page id=1 from the judge inbox and run one hand-written rule on it.
__author__ = 'LeoDong'
import requests
from util import config
from bs4 import BeautifulSoup
from extractor.InfoExtractor import InfoExtractor
from SAECrawlers.items import UrlItem
import json

# Extractor driven by the seminar ontology definition.
ie = InfoExtractor(config.path_extract_onto + "/seminar.xml", config.path_extract_onto)
item = UrlItem.load_with_content(id=1, file_path=config.path_judge_inbox)
# print item.get_part('soup').prettify()
# print item.get_part('content')

# Ad-hoc rule: take the text of "section#visible-body .logo" from the
# page content, apply action 2, then keep the substring after "H".
rule = {
    "on": "content",
    "scope": {
        "sel": "section#visible-body .logo",
        "target": "text"
    },
    "description": "url",
    "actions": [2],
    "substring": {
        "after": "H",
        "before": ""
    },
}
print ie.extract_attr(item, rule_id_or_dict=rule)
def parse(self, response):
    """Yield a UrlItem built from *response* for the update pipeline."""
    fetched = UrlItem.load_with_content(url=response.url, response=response)
    logging.debug("Updater get page [%s]:- %s" % (fetched['id'], fetched['url']))
    yield fetched