def handle_simple(self, obj):
    if self.flowcontrol.is_active(obj):
        html_header = obj.json_data
        html_body = obj.str_data()
        extracted = {}
        try:
            for field in abf_extract_body_fields(html_body):
                extracted[field[0]] = field[1]
        except Exception:
            # extraction failure on a single page should not abort the run
            pass
        jd = obj.json_data
        jd['url'] = html_header.get("url")
        jd['extraction_id'] = self.sentence_id
        jd['extraction_module'] = 'abf'
        jd['extraction_kindtags'] = obj.kindtags
        jd['extraction_fields'] = extracted
        # print("ABF-EXTRACTING:\n" + json.dumps(jd, indent=' '))
        return [
            engine.LwObject(self.kindtags_default,
                            {'Content-Type': 'text/html',
                             'encoding': 'utf-8'},
                            "", None, jd, obj.sentence),
        ]
    else:
        return []
def handle_entity(self, obj, do_compare):
    oid = obj.oid
    kindtags = obj.kindtags
    kind = kindtags['kind']
    if kind == "book_top1000_seed":
        fields = obj.json_data
        self.rsrc.seed = fields
    elif kind == "book_top1000_xpaths":
        fields = obj.json_data
        self.rsrc.xpath = fields
    else:
        raise Exception('Should not happen:-)')
    if (self.rsrc.seed is not None) and (self.rsrc.xpath is not None):
        json_data = {}
        json_data['url'] = self.rsrc.seed['url']
        json_data['xpath'] = self.rsrc.seed['xpath']
        json_data['crawl_descr'] = self.rsrc.xpath
        result = [
            engine.LwObject(self.kindtags_default,
                            {'Content-Type': 'text/html',
                             'encoding': 'utf-8'},
                            self.rsrc.seed['url'], None, json_data,
                            obj.sentence)
        ]
    else:
        result = []
    self.new_activation([obj], result, [self.rsrc, ], [self.rsrc, ])
def load_configuration(self, configuration):
    logging.info("Create new JSON Bearings Schedule")
    with open(configuration) as data_file:
        for d in json.load(data_file):
            (k, v) = d.popitem()
            if k == 'create_context':
                context = self.db.add_context(v['name'], v['description'])
            elif k == 'create_job':
                context = self.db.get_context(v['context'])
                job = self.db.add_job(context, v['name'], v['description'])
                for a in v['activities']:
                    activity = a['activity']
                    job.add_activity(activity['module'], activity['args'],
                                     activity['kindtags_in'],
                                     activity['kindtags_out'])
                seedobj = []
                for o in v['seed_data']['objects']:
                    obj = o['object']
                    seedobj.append(engine.LwObject(obj['kindtags'],
                                                   obj['metadata'],
                                                   obj['str_data'],
                                                   obj['bytes_data'],
                                                   obj['json_data'],
                                                   obj['sentence']))
                job.add_seed_data(seedobj)
                job.start()
            else:
                raise ValueError('unknown JSON object: ' + k)
    self.db.commit()
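# A minimal configuration sketch matching what load_configuration() above
# expects: a JSON array of single-key objects, processed in order. Shown
# here as the equivalent Python literal; only the key names are taken
# from the loader code, all values are invented for illustration.
EXAMPLE_CONFIGURATION = [
    {"create_context": {"name": "books",
                        "description": "book crawling context"}},
    {"create_job": {
        "context": "books",
        "name": "top1000",
        "description": "crawl a top-1000 book list",
        "activities": [
            {"activity": {"module": "modules.fetch",
                          "args": {},
                          "kindtags_in": {"kind": "seed", "tags": []},
                          "kindtags_out": {"kind": "page", "tags": []}}},
        ],
        "seed_data": {"objects": [
            {"object": {"kindtags": {"kind": "seed", "tags": []},
                        "metadata": {},
                        "str_data": "http://www.example.com/",
                        "bytes_data": None,
                        "json_data": {},
                        "sentence": None}},
        ]},
    }},
]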
def handle_simple(self, obj):
    # activation.input(obj)
    # read the seeds from the file in the start seed file
    result = []
    pdb = []
    crawl_json = obj.json_data
    for book in crawl_json['examples']:
        print("Training on book: " + json.dumps(book, indent=' '))
        url = book.get('url')
        cur_pdb = {}
        for ex in crawl_json['tags']:
            cur_pdb[ex] = book.get(ex)
            cur_pdb[ex + "_xp"] = set()
        pdb.append(cur_pdb)
        page = modules.fetch.get_webpage(self.scheduler.db, url)
        tree = etree.fromstring(page, parser=etree.HTMLParser())
        for el in tree.iter():
            if el.text is not None:
                base = str(el.text)
                for ex in crawl_json['tags']:
                    if base == book[ex]:
                        xp = el.getroottree().getpath(el)
                        cur_pdb[ex + '_xp'].add(xp)
                        print("xpath['" + ex + "']#" +
                              str(el.text) + "(100%)#" + xp)
    crawl_json['xpath'] = {}
    print("\n")
    for ex in crawl_json['tags']:
        crawl_json['xpath'][ex] = sub_intersect(pdb, ex + '_xp')
        print("Training xpath for '" + ex + "': " +
              str(crawl_json['xpath'][ex]))
    # note: the emitted object reuses the url of the last training example
    result = [
        engine.LwObject(self.kindtags_default,
                        {'Content-Type': 'text/html',
                         'encoding': 'utf-8'},
                        url, None, crawl_json, obj.sentence)
    ]
    return result
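# sub_intersect() is called above but not defined in this section; a
# minimal sketch, assuming it intersects the per-book xpath sets so that
# only xpaths which located the tag on every training example survive:
def sub_intersect(pdb, key):
    xpaths = None
    for cur_pdb in pdb:
        # intersect the candidate xpath sets across all example books
        xpaths = cur_pdb[key] if xpaths is None else (xpaths & cur_pdb[key])
    return xpaths if xpaths is not None else set()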
def handle_simple(self, obj):
    # activation.input(obj)
    result = []
    json_data = obj.json_data
    json_data['detail_url_id'] = self.sentence_id
    rfc_item = json_data['seed'][2]
    # print('RFC_ITEM' + str(rfc_item))
    for detail_url in get_product_urls(rfc_item):
        # print("DETAIL_URL: " + detail_url)
        if detail_url not in self.url_dict:
            newobj = engine.LwObject(self.kindtags_default,
                                     {'Content-Type': 'text/html',
                                      'encoding': 'utf-8'},
                                     detail_url, None, json_data,
                                     obj.sentence)
            self.url_dict[detail_url] = newobj.delayed_oid_container()
            result.append(newobj)
        else:
            # the activation shares its output with the first
            # print("DUPLICATE_URL: " + detail_url)
            delayed_oid = self.url_dict[detail_url][0]
            if delayed_oid > 0:
                # otherwise the duplicate is a duplicate within provenance
                newobj = self.get_object(delayed_oid)
                result.append(newobj)
        if False:
            break
    return result
def handle_simple(self, obj):
    # activation.input(obj)
    result = []
    json_data = obj.json_data
    json_data['detail_url_id'] = self.sentence_id
    rfc_item = json_data['seed'][2]
    query_page = abf_search_page(rfc_item)
    for p in query_page['Products']:
        bag = dict()
        for t in p:
            # Assuming only a single value per key:
            bag[t['Key']] = t['Value']
        detail_url = abf_build_detail_url(bag)
        # print("DETAIL_URL: " + detail_url)
        if detail_url not in self.url_dict:
            newobj = engine.LwObject(self.kindtags_default,
                                     {'Content-Type': 'text/html',
                                      'encoding': 'utf-8'},
                                     detail_url, None, json_data,
                                     obj.sentence)
            self.url_dict[detail_url] = newobj.delayed_oid_container()
            result.append(newobj)
        else:
            # the activation shares its output with the first
            # print("DUPLICATE_URL: " + detail_url)
            delayed_oid = self.url_dict[detail_url][0]
            if delayed_oid > 0:
                # otherwise the duplicate is a duplicate within provenance
                newobj = self.get_object(delayed_oid)
                result.append(newobj)
        # TODO: implement minimality testing through env
        if True:
            break
    return result
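# The two handlers above share a de-duplication pattern: on first sight
# of a detail URL a fresh LwObject is emitted and its delayed-oid
# container (a one-element, list-like handle the engine fills in once
# the object is stored) is remembered; later sightings re-emit the
# stored object so both activations share one output. A minimal sketch
# with the engine reduced to plain callables; emit_or_share, make_object
# and get_object are stand-ins for illustration, not the real API:
def emit_or_share(url_dict, detail_url, make_object, get_object):
    if detail_url not in url_dict:
        newobj = make_object(detail_url)      # first sighting: new object
        url_dict[detail_url] = [newobj.oid]   # stand-in for delayed_oid_container()
        return newobj
    delayed_oid = url_dict[detail_url][0]
    if delayed_oid > 0:                       # oid already resolved by the engine
        return get_object(delayed_oid)        # duplicate shares the first output
    return None                               # duplicate within provenance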
def handle_simple(self, obj):
    detail_url = obj.str_data()
    detail_domain = detail_url.split("//")[-1].split("/")[0]
    # print("FETCH: " + detail_url)
    throttle.wait_for(detail_domain)
    response = requests.get(detail_url)
    response.raise_for_status()
    response.encoding = "utf-8"
    metadata = {"status": response.status_code,
                "headers": dict(response.headers)}
    # print(str(obj.json_data))
    for k, v in obj.json_data.items():
        metadata[k] = v  # copy
    metadata['url'] = detail_url  # overwrite
    text = json.dumps(metadata, indent=' ') + '\n--\n' + response.text
    result = [
        engine.LwObject(self.kindtags_default,
                        {'Content-Type': 'text/html',
                         'encoding': 'utf-8'},
                        text, None, metadata, obj.sentence),
    ]
    if True:
        with open("./cache/fetch" + str(obj.oid), "w") as cache_file:
            cache_file.write(text)
    return result
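# throttle.wait_for() above comes from an external module not shown in
# this section; a minimal per-domain politeness throttle sketch, with
# the state layout and delay value being assumptions for illustration:
import time

_last_request = {}   # domain -> timestamp of the most recent request
_MIN_DELAY = 1.0     # assumed minimum delay between hits on one domain

def wait_for(domain):
    # sleep until at least _MIN_DELAY seconds have passed since the last
    # request to this domain, then record the new request time
    elapsed = time.time() - _last_request.get(domain, 0.0)
    if elapsed < _MIN_DELAY:
        time.sleep(_MIN_DELAY - elapsed)
    _last_request[domain] = time.time()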
def handle_simple(self, obj):
    # activation.input(obj)
    # read the seeds from the file in the start seed file
    print("top1000_json: " + json.dumps(obj.json_data, indent=' '))
    url = obj.json_data.get('url')
    return [
        engine.LwObject(self.kindtags_default,
                        {'Content-Type': 'text/html',
                         'encoding': 'utf-8'},
                        url, None, obj.json_data, obj.sentence)
    ]
def stop_object(extraction_id, extraction_kindtags):
    return [
        engine.LwObject({'kind': "extract_notify", 'tags': []},
                        None, "", None,
                        {'notify': 'stop',
                         'extraction_id': extraction_id,
                         'extraction_kindtags': extraction_kindtags},
                        None),
    ]
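# Hypothetical use of stop_object(): an extractor that has collected
# enough results appends the stop notification to its regular output, so
# the flow control layer (cf. flowcontrol.is_active() in the handlers in
# this section) can deactivate further objects with these kindtags. The
# handler body, self.extract(), and self.done() are invented for
# illustration only.
def handle_simple(self, obj):
    result = self.extract(obj)
    if self.done():
        result += stop_object(self.sentence_id, obj.kindtags)
    return result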
def handle_simple(self, obj):
    html_header = obj.json_data
    html_body = obj.str_data()
    fields = {}
    for field in btshop_extract_body_fields(html_body):
        fields[field[0]] = field[1]
    fields['url'] = html_header.get("url")
    print("BTF_EXTRACTING:\n" + json.dumps(fields, indent=' '))
    return [
        engine.LwObject(self.kindtags_default,
                        {'Content-Type': 'text/html',
                         'encoding': 'utf-8'},
                        "", None, fields, obj.sentence),
    ]
def handle_simple(self, obj):
    result = []
    html_header = obj.json_data
    html_body = obj.str_data()
    tree = etree.fromstring(html_body, parser=etree.HTMLParser())
    xp = html_header['xpath']
    # xp = '//table[@class=\'worksinseries\']/descendant::tr/descendant::a[1]/@href'
    for item in tree.xpath(xp):
        work_field = str(item)
        url = "http://www.librarything.com%s" % (work_field)
        result.append(
            engine.LwObject(self.kindtags_default,
                            {'Content-Type': 'text/html',
                             'encoding': 'utf-8'},
                            url, None, obj.json_data, obj.sentence))
        if len(result) > 5:
            break
    return result
def compare(self, rfc_oid, e_rfc, abf_oid, e_abf):
    rfc_in = self.get_object(rfc_oid)
    abf_in = self.get_object(abf_oid)
    if True:
        print("COMPARE TODO:")
        print(str(e_rfc))
        print("<<<>>>")
        print(str(e_abf))
    out_obj = engine.LwObject({'kind': "rfc_x_abf", 'tags': []},
                              {'Content-Type': 'text/html',
                               'encoding': 'utf-8'},
                              "INCOMPLETE", None, {}, None)
    self.new_activation([rfc_in, abf_in], [out_obj, ], [self.rsrc, ], [])
def handle_simple(self, obj):
    if self.flowcontrol.is_active(obj):
        html_header = obj.json_data
        html_body = obj.str_data()
        extracted = {}
        tree = etree.fromstring(html_body, parser=etree.HTMLParser())
        findSimple(tree, self.single_words, extracted)
        findSiblings(tree, self.sibling_words, extracted)
        jd = obj.json_data
        jd['url'] = html_header.get("url")
        jd['extraction_id'] = self.sentence_id
        jd['extraction_module'] = 'simple'
        jd['extraction_kindtags'] = obj.kindtags
        jd['extraction_fields'] = extracted
        # print("BTSHOP_JSON_DATA:\n" + json.dumps(jd, indent=' '))
        return [
            engine.LwObject(self.kindtags_default,
                            {'Content-Type': 'text/html',
                             'encoding': 'utf-8'},
                            "", None, jd, obj.sentence),
        ]
    else:
        return []