def main():
    try:
        client = ThriftEntityExtractor(port=12500)
        with open('extractors/owx/宝应.txt', 'r') as f:
            html_table = f.read()
        print "client - start"

        extractor_info = {"topic_id": 155, "target_dir_name": "test", "extractor_name": "测试"}
        primary_keys = json.dumps([["title"]])
        schema = json.dumps({
            "type": "object",
            "title": "百度搜索",
            "description": "百度搜索",
            "properties": {
                "keyword": {"type": "string", "title": "搜索公司"},
                "title": {"type": "string", "title": "标题"},
                "url": {"type": "string", "title": "来源url"},
                "abstract": {"type": "string", "title": "抽象"}
            }
        })
        topic_info = {"id": 155, "name": "测试动态加载解析器", "schema": schema,
                      "primary_keys": primary_keys, "table_name": "test"}
        topic_info = json.dumps(topic_info)

        # register topic 155 and its extractor, then reload so the service picks them up
        resp = client.add_topic(topic_info)
        # print resp.msg
        # extractor_info = json.dumps(extractor_info)
        resp = client.add_extractor(extractor_info)
        resp = client.reload(155)

        # build a minimal parse request and run entity extraction
        extract_data = {}
        extract_data = json.dumps(extract_data)
        base_info = BaseInfo(url="", site_id=1)
        extract_info = ExtractInfo(ex_status=2, extract_data=extract_data, topic_id=155)
        crawl_info = CrawlInfo(download_time=1474547589)
        req = PageParseInfo(base_info=base_info, crawl_info=crawl_info, extract_info=extract_info,
                            scheduler="a", parse_extends="b", data_extends="c")
        resp = client.entity_extract(req)
        print resp.entity_data_list
    # catch Thrift exceptions
    except Thrift.TException, ex:
        print "%s" % (ex.message)
def main(obj):
    from pymongo import MongoClient
    import traceback

    host = '101.201.102.37'
    port = 28019
    database = 'final_data'
    coll = 'baidu_news'
    client = MongoClient(host, port)
    db = client[database][coll]
    cursor = db.find()
    num = 0
    for item in cursor:
        try:
            num += 1
            item.pop('_id')
            src_url = item.get('_src')[0]['url']
            extract_data = item
            data = json.dumps(extract_data)
            extract_info = ExtractInfo(ex_status=2, extract_data=data)
            base_info = BaseInfo(url=src_url)
            crawl_info = CrawlInfo(download_time=1474547589)
            parser_info = PageParseInfo(base_info=base_info, extract_info=extract_info,
                                        crawl_info=crawl_info)
            data = obj.do_merge(parser_info, item)
            print src_url
            for key, value in data.items():
                if isinstance(value, list):
                    for i in value:
                        print key, ":", i
                elif isinstance(value, dict):
                    for key2, value2 in value.items():
                        print key2, ":", value2
                else:
                    print key, ":", value
            if num % 100 == 0:
                break
        except:
            print traceback.format_exc()
def get_entity_extractor_info(self, company, base_info_url, in_time, model, topic, year=None):
    # remove None values from the model before storing
    store_model = util.del_none(model)
    base_url = base_info_url.encode('utf-8')
    # normalize full-width parentheses in the company name to half-width (assumed direction)
    replace_company = company.replace(u'（', '(').replace(u'）', ')')
    if year is None:
        record = '|' + replace_company
    else:
        record = '|' + replace_company + '|' + str(year)
    _site_record_id = tools.get_md5(record)
    if year is None:
        self.log.info('company = {company} record_id = {record_id} url = {url}'.format(
            company=company, record_id=_site_record_id, url=base_url))
    else:
        self.log.info('company = {company} year = {year} record_id = {record_id} url = {url}'.format(
            company=company, record_id=_site_record_id, url=base_url, year=year))

    store_model['_src'] = [{'url': base_url, 'site': self.host, 'download_time': in_time}]
    store_model['_site_record_id'] = _site_record_id

    extract_info = ExtractInfo()
    extract_info.ex_status = ExStatus.kEsSuccess
    extract_info.extract_data = json.dumps(store_model)
    extract_info.topic_id = topic

    crawl_info = CrawlInfo()
    crawl_info.content = ""
    crawl_info.download_time = in_time

    url_info = get_url_info(base_url)
    base_info = BaseInfo()
    base_info.site = url_info.get('site', '')
    base_info.url = url_info.get('url', '')
    base_info.site_id = url_info.get('site_id', 0)
    base_info.url_id = url_info.get('url_id', 0)

    return PageParseInfo(extract_info=extract_info, crawl_info=crawl_info, base_info=base_info)
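# Usage sketch (assumption, not part of the original code): the PageParseInfo returned by
# get_entity_extractor_info can be fed straight to entity_extract. `parser` stands for an
# instance of the class defining get_entity_extractor_info, `client` for the
# ThriftEntityExtractor from the first snippet; company, URL, timestamp and model values
# below are made-up test data.
model = {'company': u'广发证券股份有限公司', 'stock_code': '300599', 'list_date': None}
req = parser.get_entity_extractor_info(company=u'广发证券股份有限公司',
                                        base_info_url=u'http://www.example.com/ipo/300599',
                                        in_time=1474547589,
                                        model=model,
                                        topic=155,
                                        year=2016)
resp = client.entity_extract(req)
print resp.entity_data_list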
"key": "主承销商", "value": "广发证券股份有限公司" }, { "key": "上市推荐人", "value": "" }, { "key": "保荐机构", "value": "广发证券股份有限公司" }, { "key": "股票代码", "value": "300599" }] } data = json.dumps(extract_data) extract_info = ExtractInfo(ex_status=2, extract_data=data) base_info = BaseInfo(url=src_url) parser_info = PageParseInfo(base_info=base_info, extract_info=extract_info) entity_data = obj.entity_extract(parser_info, extract_data) entity_data = obj.after_extract(base_info.url, entity_data, extract_data) for key, value in entity_data.items(): if isinstance(value, list): for i in value: print key, ":", i elif isinstance(value, dict): for key2, value2 in value.items(): print key2, ":", value2 else: print key, ":", value
def get_page_parseinfo(self):
    url = 'http://www.baidu.com'
    url_id = 0
    site = 'www.baidu.com'
    site_id = 0
    domain = None
    domain_id = 0
    segment_id = 0
    src_type = 'test src_type'
    base_info = BaseInfo(url=url, url_id=url_id, site=site, site_id=site_id, domain=domain,
                         domain_id=domain_id, segment_id=segment_id, src_type=src_type)

    status_code = 0
    http_code = 0
    download_time = 0
    redirect_url = 'test redirect_url'
    elapsed = 0
    content_type = 'test content_type'
    content = 'test content1'
    page_size = 0
    crawl_info = CrawlInfoOld(status_code=status_code, http_code=http_code,
                              download_time=download_time, redirect_url=redirect_url,
                              elapsed=elapsed, content_type=content_type, content=content,
                              page_size=page_size)

    ex_status = ExStatus.kEsSuccess
    extract_error = ExFailErrorCode.KExFailPageTranscoding
    redirect_url = 'test redirect_url'
    next_page_type = True
    struct_type = 0
    compose_type = 0
    content_type = 0
    topic_id = 0
    extracted_body_time = 0
    content_time = 0
    html_tag_title = 'test html_tag_title'
    analyse_title = 'test analyse_title3'
    zone = 'test zone'
    page_text = 'test page_text'
    content_language = 'test content_language'
    second_navigate = 'test second_navigate'
    valid_pic_url = 'test valid_pic_url'
    digest = 'test digest'
    finger_feature = 'test finger_feature'
    content_finger = 0
    simhash_finger = 0
    link_finger = 0
    link1 = Link(url='http://www.baidu.com/url3', type=0)
    link2 = Link(url='http://www.baidu.com/url2')
    link3 = Link(url='http://www.baidusdf.com/url5', type=0)
    link4 = Link(url='http://www.baidusdf.com/url6', type=2)
    links = [link1, link2, link3, link4]
    extract_data = 'test extract_data'
    extract_info = ExtractInfo(ex_status=ex_status, extract_error=extract_error,
                               redirect_url=redirect_url, next_page_type=next_page_type,
                               struct_type=struct_type, compose_type=compose_type,
                               content_type=content_type, topic_id=topic_id,
                               extracted_body_time=extracted_body_time, content_time=content_time,
                               html_tag_title=html_tag_title, analyse_title=analyse_title,
                               zone=zone, page_text=page_text, content_language=content_language,
                               second_navigate=second_navigate, valid_pic_url=valid_pic_url,
                               digest=digest, finger_feature=finger_feature,
                               content_finger=content_finger, simhash_finger=simhash_finger,
                               link_finger=link_finger, links=links, extract_data=extract_data)

    parse_extends = 'b'
    data_extends = 'c'
    scheduler = 'd'
    page_parseinfo = PageParseInfo(base_info=base_info, crawl_info=crawl_info,
                                   extract_info=extract_info, parse_extends=parse_extends,
                                   data_extends=data_extends, scheduler=scheduler)
    return page_parseinfo
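# Usage sketch (assumption, not part of the original code): round-trip the PageParseInfo
# built above through Thrift binary serialization, e.g. inside a unit test. `helper`
# stands for an instance of the class that defines get_page_parseinfo.
from thrift.TSerialization import serialize, deserialize
from thrift.protocol import TBinaryProtocol

factory = TBinaryProtocol.TBinaryProtocolFactory()
blob = serialize(helper.get_page_parseinfo(), factory)
restored = deserialize(PageParseInfo(), blob, factory)
assert restored.base_info.url == 'http://www.baidu.com'
assert len(restored.extract_info.links) == 4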