Beispiel #1
0
 def handle_simple(self, obj):
     """Extract ABF body fields from ``obj`` and emit one result object.

     Nothing is emitted when ``self.flowcontrol.is_active(obj)`` is false.
     Otherwise the (name, value) pairs yielded by
     ``abf_extract_body_fields`` for ``obj.str_data()`` are collected into a
     dict, ``obj.json_data`` is annotated in place with the extraction
     bookkeeping keys, and a single ``engine.LwObject`` carrying that dict
     is returned.

     Extraction is best-effort: if the extractor raises part-way through,
     the fields gathered so far are kept and the failure is logged.

     Args:
         obj: input object providing ``json_data``, ``str_data()``,
             ``kindtags`` and ``sentence``.

     Returns:
         A one-element list with the new object, or ``[]`` when flow
         control reports the object as inactive.
     """
     # Function-scope import: this block cannot assume the module already
     # imports logging.
     import logging

     if not self.flowcontrol.is_active(obj):
         return []

     html_header = obj.json_data
     html_body = obj.str_data()
     extracted = {}
     try:
         for field in abf_extract_body_fields(html_body):
             extracted[field[0]] = field[1]
     except Exception:
         # Deliberately non-fatal, but no longer silent; unlike a bare
         # ``except`` this lets KeyboardInterrupt/SystemExit propagate.
         logging.exception(
             "ABF field extraction failed; keeping %d field(s)",
             len(extracted))

     # The annotations are written into obj.json_data itself (no copy).
     jd = obj.json_data
     jd['url'] = html_header.get("url")
     jd['extraction_id'] = self.sentence_id
     jd['extraction_module'] = 'abf'
     jd['extraction_kindtags'] = obj.kindtags
     jd['extraction_fields'] = extracted
     return [
         engine.LwObject(self.kindtags_default, {
             'Content-Type': 'text/html',
             'encoding': 'utf-8'
         }, "", None, jd, obj.sentence),
     ]
Beispiel #2
0
 def handle_entity(self, obj, do_compare):
     """Store a seed or xpath entity and emit a crawl object once both exist.

     The entity's ``json_data`` is stored on ``self.rsrc.seed`` or
     ``self.rsrc.xpath`` depending on ``obj.kindtags['kind']``. When both
     slots are set, one ``engine.LwObject`` combining the seed's ``url`` and
     ``xpath`` with the stored xpath description is produced. In every
     non-error case ``self.new_activation`` is called with ``[obj]`` as
     input, the (possibly empty) result list, and ``self.rsrc`` in both
     resource lists.

     Args:
         obj: entity providing ``kindtags``, ``json_data`` and ``sentence``.
         do_compare: not used by this method.

     Raises:
         Exception: if the kind is neither ``"book_top1000_seed"`` nor
             ``"book_top1000_xpaths"``.
     """
     kind = obj.kindtags['kind']
     if kind == "book_top1000_seed":
         self.rsrc.seed = obj.json_data
     elif kind == "book_top1000_xpaths":
         self.rsrc.xpath = obj.json_data
     else:
         raise Exception('Should not happen:-)')

     seed = self.rsrc.seed
     crawl_descr = self.rsrc.xpath
     if (seed is not None) and (crawl_descr is not None):
         json_data = {
             'url': seed['url'],
             'xpath': seed['xpath'],
             'crawl_descr': crawl_descr,
         }
         result = [
             engine.LwObject(self.kindtags_default, {
                 'Content-Type': 'text/html',
                 'encoding': 'utf-8'
             }, seed['url'], None, json_data, obj.sentence)
         ]
     else:
         # Still waiting for the other half; register an activation with
         # no output.
         result = []
     self.new_activation([obj], result, [
         self.rsrc,
     ], [
         self.rsrc,
     ])
Beispiel #3
0
 def load_configuration(self, configuration):
     """Load a JSON schedule file and apply its entries to ``self.db``.

     The file must contain a JSON array of objects. One (key, value) pair
     is popped from each object and dispatched on the key:

     * ``'create_context'``: calls ``self.db.add_context`` with the value's
       ``name`` and ``description``.
     * ``'create_job'``: looks up the context, adds a job, registers every
       activity, builds the seed ``engine.LwObject`` instances, attaches
       them with ``add_seed_data`` and starts the job.

     ``self.db.commit()`` runs once after all entries were processed; it is
     not reached if an entry raises.

     Args:
         configuration: path of the JSON file to read.

     Raises:
         ValueError: if an entry uses an unknown key.
     """
     logging.info("Create new JSON Bearings Schedule")
     # Decode explicitly as UTF-8 so the result does not depend on the
     # platform's locale default encoding.
     with open(configuration, encoding="utf-8") as data_file:
         for d in json.load(data_file):
             (k, v) = d.popitem()
             if k == 'create_context':
                 self.db.add_context(v['name'], v['description'])
             elif k == 'create_job':
                 context = self.db.get_context(v['context'])
                 job = self.db.add_job(context, v['name'], v['description'])
                 for a in v['activities']:
                     activity = a['activity']
                     job.add_activity(activity['module'], activity['args'],
                                      activity['kindtags_in'],
                                      activity['kindtags_out'])
                 seedobj = []
                 for o in v['seed_data']['objects']:
                     obj = o['object']
                     seedobj.append(engine.LwObject(obj['kindtags'],
                                    obj['metadata'], obj['str_data'],
                                    obj['bytes_data'], obj['json_data'],
                                    obj['sentence']))
                 job.add_seed_data(seedobj)
                 job.start()
             else:
                 raise ValueError('unknown JSON object: ' + k)
     self.db.commit()
Beispiel #4
0
 def handle_simple(self, obj):
     """Learn an xpath per tag from example pages and emit the result.

     For every entry in ``obj.json_data['examples']`` the page at the
     entry's ``'url'`` is fetched and parsed as HTML. Each element whose
     text equals the example's value for a tag contributes its xpath to
     that example's ``<tag>_xp`` set. Afterwards ``sub_intersect`` combines
     the per-example sets for each tag and the outcome is stored under
     ``obj.json_data['xpath']`` (the input dict is modified in place).

     Args:
         obj: input object providing ``json_data`` (with ``'examples'`` and
             ``'tags'``) and ``sentence``.

     Returns:
         A one-element list with an ``engine.LwObject`` whose string data
         is the URL of the last example and whose JSON data is the updated
         crawl description.
     """
     pdb = []
     crawl_json = obj.json_data
     tags = crawl_json['tags']
     for book in crawl_json['examples']:
         print("Training on book: "+json.dumps(book, indent='   '))
         url = book.get('url')
         cur_pdb = {}
         for ex in tags:
             cur_pdb[ex] = book.get(ex)
             cur_pdb[ex+"_xp"] = set()
         pdb.append(cur_pdb)
         page = modules.fetch.get_webpage(self.scheduler.db, url)
         tree = etree.fromstring(page, parser=etree.HTMLParser())
         for el in tree.iter():
             if el.text is None:
                 continue
             base = str(el.text)
             for ex in tags:
                 # Use .get() as above: an example lacking a tag yields None,
                 # which never equals element text, instead of a KeyError.
                 if base == book.get(ex):
                     xp = el.getroottree().getpath(el)
                     cur_pdb[ex+'_xp'].add(xp)
                     print("xpath[\'"+ex+"\']#"+ str(el.text)+"(100%)#"+xp)
     crawl_json['xpath'] = { }
     print("\n")
     for ex in tags:
         crawl_json['xpath'][ex] = sub_intersect(pdb, ex+'_xp')
         print("Traing xpath for \'"+ex+"\': "+str(crawl_json['xpath'][ex]))
     # NOTE(review): ``url`` is only bound inside the loop, so an empty
     # 'examples' list fails here with UnboundLocalError - confirm whether
     # callers can ever pass one.
     return [ engine.LwObject(self.kindtags_default, {'Content-Type': 'text/html', 'encoding': 'utf-8'}, url, None, crawl_json, obj.sentence) ]
Beispiel #5
0
 def handle_simple(self, obj):
     """Emit one object per product detail URL, reusing known URLs.

     ``obj.json_data`` is stamped in place with ``'detail_url_id'`` and the
     third element of its ``'seed'`` entry is handed to
     ``get_product_urls``. A URL not yet in ``self.url_dict`` produces a new
     ``engine.LwObject`` and its ``delayed_oid_container()`` is remembered
     under that URL. For a URL already present, the first element of the
     stored container is read; if it is positive the object with that oid
     is fetched via ``self.get_object`` and emitted, so the activation
     shares its output with the first occurrence.

     Returns:
         The list of new or re-used objects, in URL order.
     """
     payload = obj.json_data
     payload['detail_url_id'] = self.sentence_id
     seed_item = payload['seed'][2]

     # Debug switch kept from the original: when true, only the first URL
     # is processed.
     stop_after_first = False

     emitted = []
     for detail_url in get_product_urls(seed_item):
         if detail_url in self.url_dict:
             known_oid = self.url_dict[detail_url][0]
             # A non-positive value means the duplicate is a duplicate
             # within provenance; nothing is emitted for it.
             if known_oid > 0:
                 emitted.append(self.get_object(known_oid))
         else:
             fresh = engine.LwObject(
                 self.kindtags_default,
                 {'Content-Type': 'text/html', 'encoding': 'utf-8'},
                 detail_url, None, payload, obj.sentence)
             self.url_dict[detail_url] = fresh.delayed_oid_container()
             emitted.append(fresh)
         if stop_after_first:
             break
     return emitted
Beispiel #6
0
 def handle_simple(self, obj):
     """Emit an object for the detail URL of an ABF search result.

     ``obj.json_data`` is stamped in place with ``'detail_url_id'`` and the
     third element of its ``'seed'`` entry is passed to
     ``abf_search_page``. Each product in the returned ``'Products'`` list
     is a sequence of ``{'Key': ..., 'Value': ...}`` entries that is
     flattened into a dict and turned into a URL by
     ``abf_build_detail_url``. New URLs produce a fresh ``engine.LwObject``
     and are registered in ``self.url_dict``; known URLs re-emit the object
     found through the stored oid when that oid is positive.

     The loop currently stops after the first product.

     Returns:
         The list of new or re-used objects.
     """
     payload = obj.json_data
     payload['detail_url_id'] = self.sentence_id
     seed_item = payload['seed'][2]
     search_result = abf_search_page(seed_item)

     # TODO: implement minimality testing through env
     stop_after_first = True

     emitted = []
     for product in search_result['Products']:
         # Assuming only a single value per key; a repeated key keeps the
         # last value.
         bag = {entry['Key']: entry['Value'] for entry in product}
         detail_url = abf_build_detail_url(bag)
         if detail_url in self.url_dict:
             # The activation shares its output with the first occurrence.
             known_oid = self.url_dict[detail_url][0]
             # Otherwise the duplicate is a duplicate within provenance.
             if known_oid > 0:
                 emitted.append(self.get_object(known_oid))
         else:
             fresh = engine.LwObject(
                 self.kindtags_default,
                 {'Content-Type': 'text/html', 'encoding': 'utf-8'},
                 detail_url, None, payload, obj.sentence)
             self.url_dict[detail_url] = fresh.delayed_oid_container()
             emitted.append(fresh)
         if stop_after_first:
             break
     return emitted
Beispiel #7
0
 def handle_simple(self, obj):
     """Fetch the URL held in ``obj`` and emit the page with its metadata.

     The URL is ``obj.str_data()``. Its host part is passed to
     ``throttle.wait_for`` before the GET request is issued, and
     ``raise_for_status()`` is called on the response. The response status
     and headers, every entry of ``obj.json_data`` and the URL (which
     overrides any ``'url'`` entry) form the metadata. The emitted text is
     the metadata as indented JSON, a ``--`` separator line and the
     response body decoded as UTF-8. The same text is also written to
     ``./cache/fetch<oid>``.

     Args:
         obj: input object providing ``str_data()``, ``json_data``, ``oid``
             and ``sentence``.

     Returns:
         A one-element list with the new ``engine.LwObject``.
     """
     detail_url = obj.str_data()
     # Host part: text after the last "//" up to the next "/".
     detail_domain = detail_url.split("//")[-1].split("/")[0]
     throttle.wait_for(detail_domain)
     response = requests.get(detail_url)
     response.raise_for_status()
     response.encoding = "utf-8"
     metadata = {
         "status": response.status_code,
         "headers": dict(response.headers)
     }
     metadata.update(obj.json_data)  # copy the incoming JSON fields
     metadata['url'] = detail_url  # overwrite
     text = json.dumps(metadata, indent='   ') + '\n--\n' + response.text
     result = [
         engine.LwObject(self.kindtags_default, {
             'Content-Type': 'text/html',
             'encoding': 'utf-8'
         }, text, None, metadata, obj.sentence),
     ]
     # The context manager closes the handle even if the write fails, and
     # the explicit encoding matches the UTF-8 declared for the text.
     with open("./cache/fetch" + str(obj.oid), "w",
               encoding="utf-8") as cache_file:
         cache_file.write(text)
     return result
Beispiel #8
0
 def handle_simple(self, obj):
     """Print the incoming JSON data and re-emit it keyed by its URL.

     Returns:
         A one-element list with an ``engine.LwObject`` whose string data
         is ``obj.json_data.get('url')`` and whose JSON data is
         ``obj.json_data`` itself.
     """
     payload = obj.json_data
     print("top1000_json: "+json.dumps(payload, indent='   '))
     target_url = payload.get('url')
     metadata = {'Content-Type': 'text/html', 'encoding': 'utf-8'}
     emitted = engine.LwObject(
         self.kindtags_default, metadata, target_url, None, payload,
         obj.sentence)
     return [emitted]
Beispiel #9
0
def stop_object(extraction_id, extraction_kindtags):
    """Build the "stop" notification object for an extraction.

    Args:
        extraction_id: value stored under ``'extraction_id'``.
        extraction_kindtags: value stored under ``'extraction_kindtags'``.

    Returns:
        A one-element list holding an ``engine.LwObject`` of kind
        ``"extract_notify"`` with empty string data, no metadata, no bytes
        data and no sentence.
    """
    kindtags = {'kind': "extract_notify", 'tags': []}
    notification = {
        'notify': 'stop',
        'extraction_id': extraction_id,
        'extraction_kindtags': extraction_kindtags,
    }
    return [engine.LwObject(kindtags, None, "", None, notification, None)]
Beispiel #10
0
 def handle_simple(self, obj):
     """Extract btshop body fields from ``obj`` and emit them as JSON data.

     The (name, value) pairs produced by ``btshop_extract_body_fields`` for
     ``obj.str_data()`` are collected into a dict, the page URL from
     ``obj.json_data`` is added under ``'url'``, and the dict is printed.

     Returns:
         A one-element list with an ``engine.LwObject`` carrying the field
         dict as its JSON data and an empty string as string data.
     """
     page_info = obj.json_data
     page_body = obj.str_data()
     fields = {
         pair[0]: pair[1] for pair in btshop_extract_body_fields(page_body)
     }
     fields['url'] = page_info.get("url")
     print("BTF_EXTRACTING:\n" + json.dumps(fields, indent='   '))
     metadata = {'Content-Type': 'text/html', 'encoding': 'utf-8'}
     emitted = engine.LwObject(
         self.kindtags_default, metadata, "", None, fields, obj.sentence)
     return [emitted]
Beispiel #11
0
 def handle_simple(self, obj):
     """Turn xpath matches on a page into librarything URL objects.

     ``obj.str_data()`` is parsed as HTML and the xpath stored under
     ``'xpath'`` in ``obj.json_data`` is evaluated on it. Each match is
     converted to a string and appended to ``http://www.librarything.com``;
     one ``engine.LwObject`` is emitted per resulting URL.

     Returns:
         The emitted objects, at most six.
     """
     max_links = 6
     page_info = obj.json_data
     page_body = obj.str_data()
     tree = etree.fromstring(page_body, parser=etree.HTMLParser())
     link_xpath = page_info['xpath']

     emitted = []
     for match in tree.xpath(link_xpath):
         url = "http://www.librarything.com%s" % (str(match))
         metadata = {'Content-Type': 'text/html', 'encoding': 'utf-8'}
         emitted.append(engine.LwObject(
             self.kindtags_default, metadata, url, None, obj.json_data,
             obj.sentence))
         # Stop as soon as the cap is reached.
         if len(emitted) >= max_links:
             break
     return emitted
Beispiel #12
0
 def compare(self, rfc_oid, e_rfc, abf_oid, e_abf):
     """Register a placeholder comparison of an RFC and an ABF object.

     Both input objects are looked up by oid, the two payloads ``e_rfc``
     and ``e_abf`` are printed for inspection, and an activation is created
     whose single output is an ``"rfc_x_abf"`` object with the string data
     ``"INCOMPLETE"`` and empty JSON data. The actual comparison is not
     implemented yet.

     Args:
         rfc_oid: oid of the RFC-side input object.
         e_rfc: RFC-side payload, only printed.
         abf_oid: oid of the ABF-side input object.
         e_abf: ABF-side payload, only printed.
     """
     rfc_in = self.get_object(rfc_oid)
     abf_in = self.get_object(abf_oid)

     print("COMPARE TODO:")
     print(str(e_rfc))
     print("<<<>>>")
     print(str(e_abf))

     kindtags = {'kind': "rfc_x_abf", 'tags': []}
     metadata = {'Content-Type': 'text/html', 'encoding': 'utf-8'}
     placeholder = engine.LwObject(
         kindtags, metadata, "INCOMPLETE", None, {}, None)
     self.new_activation(
         [rfc_in, abf_in], [placeholder], [self.rsrc], [])
Beispiel #13
0
 def handle_simple(self, obj):
     """Run the word-based extraction on ``obj`` and emit one result object.

     Nothing is emitted when ``self.flowcontrol.is_active(obj)`` is false.
     Otherwise ``obj.str_data()`` is parsed as HTML, ``findSimple`` and
     ``findSiblings`` fill a shared dict using ``self.single_words`` and
     ``self.sibling_words``, and ``obj.json_data`` is annotated in place
     with the extraction bookkeeping keys.

     Returns:
         A one-element list with the new ``engine.LwObject``, or ``[]``
         when flow control reports the object as inactive.
     """
     if not self.flowcontrol.is_active(obj):
         return []

     page_info = obj.json_data
     page_body = obj.str_data()
     extracted = {}
     tree = etree.fromstring(page_body, parser=etree.HTMLParser())
     findSimple(tree, self.single_words, extracted)
     findSiblings(tree, self.sibling_words, extracted)

     # The annotations are written into obj.json_data itself (no copy).
     annotated = obj.json_data
     annotated['url'] = page_info.get("url")
     annotated['extraction_id'] = self.sentence_id
     annotated['extraction_module'] = 'simple'
     annotated['extraction_kindtags'] = obj.kindtags
     annotated['extraction_fields'] = extracted

     metadata = {'Content-Type': 'text/html', 'encoding': 'utf-8'}
     emitted = engine.LwObject(
         self.kindtags_default, metadata, "", None, annotated, obj.sentence)
     return [emitted]