def __init__(self, conf):
    """Wire up the extractor's collaborators from ``conf``.

    :param conf: configuration object; stored on the instance and passed
        unchanged to ``EntityExtractorRoute``, ``TopicManager`` and
        ``ValidateManager``.
    """
    self.conf = conf
    # NOTE(review): ``log`` and ``parser_tool`` are not parameters of this
    # method; they are looked up in an outer (presumably module-level)
    # scope -- confirm they are defined before this class is instantiated.
    self.log = log
    self.parser_tool = parser_tool
    self.route = EntityExtractorRoute(conf)
    # The validate manager is handed the very same topic manager instance
    # that is kept on ``self``.
    topics = TopicManager(conf)
    self.topic_manager = topics
    self.validate_manager = ValidateManager(topics, conf, 'all')
    self.count = 0
if __name__ == '__main__': import pytoml import sys sys.path.append('../../') from conf import get_config with open('../../entity.toml', 'rb') as config: config = pytoml.load(config) conf = get_config(config) import common topic_id = 32 from entity_extractor_route import EntityExtractorRoute route = EntityExtractorRoute() topic_info = route.all_topics.get(topic_id, None) obj = ListingEventsExtractor(topic_info, common.log) extract_data = { "_site_record_id": "http://www.pedata.cn/ipo/321436101.html", "accounting_firm": "大华会所", "enterprise_full_name": "广东芳源环保股份有限公司", "equity": "31,000,000", "exchanges": "全国中小企业股份转让系统(新三板)", "industry": "互联网 电商", "law_firm": "广东华商律所", "lead_underwriter": "华创证券", "market_date": "2016-10-21", "site_url": "http://www.pedata.cn/ipo/321436101.html", "source_site": "私募通",
import pytoml import sys sys.path.append('../../') from conf import get_config from bdp.i_crawler.i_extractor.ttypes import BaseInfo, CrawlInfo, ExtractInfo, PageParseInfo with open('../../entity.toml', 'rb') as config: config = pytoml.load(config) conf = get_config(config) import common topic_id = 102 from entity_extractor_route import EntityExtractorRoute route = EntityExtractorRoute(conf) topic_info = route.all_topics.get(topic_id, None) obj = SsggCaibaoExtractor(topic_info, common.log) src_url = "" extract_data = { "code": "szcn300599", "info": [{ "key": "公司全称", "value": "广东雄塑科技集团股份有限公司" }, { "key": "英文名称", "value": "Guangdong Xiongsu Technology Group Co., Ltd." }, { "key": "注册地址", "value": "广东省佛山市南海区九江镇龙高路敦根路段雄塑工业园"
"sex_female": female_rate, "province_rank": extract_data.get("province_rank"), } return entity_data if __name__ == "__main__": import conf from bdp.i_crawler.i_extractor.ttypes import BaseInfo, CrawlInfo, ExtractInfo, PageParseInfo from common_parser_lib.parser_tool import ParserTool from entity_extractor_route import EntityExtractorRoute topic_id = 56 parser_tool = ParserTool(conf) route = EntityExtractorRoute(conf, parser_tool) topic_info = route.all_topics.get(topic_id, None) parser_tool = ParserTool(conf) obj = BaiDuIndexExtractor(conf.log, topic_info, parser_tool) extract_data = {} src_url = "www.baidu.com" data = json.dumps({}) extract_info = ExtractInfo(ex_status=2, extract_data=data) base_info = BaseInfo(url=src_url) parser_info = PageParseInfo(base_info=base_info, extract_info=extract_info) data = obj.entity_extract(parser_info, extract_data) print src_url for key, value in data.items(): if isinstance(value, list): for i in value:
class EntityExtractor(object):
    """Dispatch JSON records to per-topic extractors and collect the results.

    The extractor for a topic is obtained from ``self.route``; topic metadata
    is held by ``self.topic_manager`` and reloaded whenever the route changes.
    """

    validator_used = ['pk', 'required_attr', 'jsonschema']
    all_validators = ['meta', 'pk', 'jsonschema']

    def __init__(self, conf):
        """Build the route, topic manager and validate manager from ``conf``.

        :param conf: configuration object; stored on the instance and passed
            unchanged to the collaborators created here.
        """
        self.conf = conf
        # NOTE(review): ``log`` and ``parser_tool`` are not parameters; they
        # are resolved from an outer (presumably module-level) scope --
        # confirm they are defined before instantiation.
        self.log = log
        self.parser_tool = parser_tool
        self.route = EntityExtractorRoute(conf)
        self.topic_manager = TopicManager(conf)
        self.validate_manager = ValidateManager(self.topic_manager, conf, 'all')
        self.count = 0

    def reload(self):
        """Ask the topic manager to reload (argument ``-1``); return ``True``."""
        self.topic_manager.reload(-1)
        return True

    # TODO: need to change extractor route to use the common topic manager, this is temporary code
    def add_topic(self, topic_info):
        """Register ``topic_info`` with the route, then reload the topic manager.

        :return: whatever ``self.route.add_topic`` returned.
        """
        resp = self.route.add_topic(topic_info)
        self.topic_manager.reload(-1)
        return resp

    def insert_extractor(self, extractor_info):
        """Insert ``extractor_info`` into the route, then reload the topic manager.

        :return: whatever ``self.route.insert_extractor`` returned.
        """
        resp = self.route.insert_extractor(extractor_info)
        self.topic_manager.reload(-1)
        return resp

    def process_json(self, j, topic_id):
        """Message-queue handler: run the topic's extractor over one record.

        :param j: json data
        :param topic_id: topic_id
        :return: ``[{"topic_id": topic id (int), "data": {parsed result (json)}}]``.
            ``None`` is returned when the extractor's ``process_json`` yields
            ``None``.  If anything raises, the traceback is logged and the
            entries collected up to that point are returned.
        """
        result_list = []
        try:
            extractor = self.route.get_extractor(topic_id)
            formatted_json = extractor.process_json(j, topic_id)
            if formatted_json is None:
                return None

            # The extractor may return one entity or a list of entities;
            # normalise to a list so both shapes share one loop.
            if isinstance(formatted_json, list):
                entities = formatted_json
            else:
                entities = [formatted_json]

            for entity_data in entities:
                after_process_json = extractor.after_process(entity_data)
                # ``None`` from after_process means the entity is dropped.
                if after_process_json is not None:
                    result_list.append({
                        "topic_id": topic_id,
                        "data": after_process_json
                    })
        except Exception:
            # Best-effort boundary: log the full traceback and fall through
            # so the caller still receives whatever was extracted.
            self.log.error("extract_error\tmsg:%s" % (traceback.format_exc()))
        return result_list