class NewsGraph(BaseGraph):
    """Graph builder that links enterprises to their news items.

    Source documents are the ones whose ``metaModel`` is '公司新闻'; the
    news entries are read from ``content['新闻舆情']``.
    """

    def __init__(self, **kwargs):
        """Initialise the base graph and create the ``BaseModel`` used
        by every query in this class (``tn='cq_all'``)."""
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities used
        here. A unique key is indexed automatically, so it does not need
        a separate index.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        used_entity = [
            'News',
        ]
        constraint = {}
        index = {}
        for l in used_entity:
            constraint[l] = [entities(l).primarykey]
            idx = entities(l).index
            if len(idx):
                index[l] = idx
        self.add_index_and_constraint(index, constraint)

    def create_all_relationship(self):
        """
        1.enterprise -[have or x]->x

        Walk the '公司新闻' documents, resolve (or create) the enterprise
        node of each one and merge ``Have`` relationships to its news
        nodes into the graph in batches of more than 1000.
        :return:
        """
        ops = self.base.query(
            sql={'metaModel': '公司新闻'},
            # limit=10,
            # NOTE(review): the first 2020 documents are skipped -- looks
            # like a resume offset; confirm it is still wanted.
            skip=2020,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        # etp = Enterprise()
        s_t = time.time()
        for o in ops:
            k += 1
            # if k < 43500:
            #     continue
            # TODO(leung): note that the url found in modules other than
            #  the basic info cannot identify the company
            etp_n = self.match_node(
                *legal,
                cypher='_.NAME = "{}"'.format(o['name'])
            )
            if etp_n is None:
                # The company is not in the graph yet, so create it.
                _ = self.base.query_one(
                    sql={'metaModel': '基本信息', 'name': o['name']}
                )
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating judicial relationships also creates
                    # enterprises missing from the database, but not their
                    # basic relationships, so those are added here.
                    relationships += eg.create_relationship_from_enterprise_baseinfo(_)
                else:
                    # No basic info for this company: create a company
                    # node with incomplete information.
                    etp = Related(**{'名称': o['name'], '链接': o['url']})
                    # etp['NAME'] = o['name']
                    # etp['URL'] = o['url']
                    etp_n = self.get_neo_node(etp)
                if etp_n is None:
                    continue
            if '新闻舆情' in o['content'].keys():
                data = self.get_format_dict(o['content']['新闻舆情'])
                ns = News.create_from_dict(data)
                for n in ns:
                    n_ = n.pop('news')
                    n_n = self.get_neo_node(n_)
                    if n_n is not None:
                        relationships.append(
                            Have(etp_n, n_n, **n).get_relationship()
                        )
            if len(relationships) > 1000:
                i += 1
                sp = int(time.time() - s_t)
                s_t = time.time()
                self.graph_merge_relationships(relationships)
                if not self.index_and_constraint_statue:
                    self.create_index_and_constraint()
                print(SuccessMessage(
                    '{}:success merge relationships to database '
                    'round {} and deal {}/{} enterprise and spend {} '
                    'seconds,and merge {} relationships.'.format(
                        dt.datetime.now(), i, k, etp_count, sp,
                        len(relationships)
                    )))
                relationships.clear()
                # return
        # Flush whatever is left after the last full batch.
        if len(relationships):
            i += 1
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(SuccessMessage(
                '{}:success merge relationships to database '
                'round {} and deal {}/{} enterprise,and'
                ' merge {} relationships.'.format(
                    dt.datetime.now(), i, k, etp_count, len(relationships)
                )))
            relationships.clear()

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Collect the nodes and ``Have`` relationships of one document.

        :param etp: document with 'url', 'name' and 'content' keys.
        :return: ``(nodes, relationships)``; two empty lists when the
            enterprise node cannot be obtained from ``get_neo_node``.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        if '新闻舆情' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['新闻舆情'])
            ns = News.create_from_dict(data)
            for n in ns:
                n_ = n.pop('news')
                n_n = self.get_neo_node(n_)
                if n_n is not None:
                    nodes.append(n_n)
                    relationships.append(
                        Have(etp_n, n_n, **n)
                    )
        return nodes, relationships

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """Collect nodes and relationships of every '公司新闻' document,
        grouped by label.

        :param save_folder: when not None, the grouped data is handed to
            ``self.save_graph`` every 10000 documents and once at the end,
            and the in-memory dicts are cleared after each save.
        :param kwargs: forwarded to ``self.save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (both
            empty when ``save_folder`` is given, since they are cleared
            after the final save).
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '公司新闻',
                # 'name': '重庆轩烽建材有限公司'
            },
            # limit=10000,
            # skip=100000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' in a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            found = unique_code_pattern.search(ep['url'])
            if found is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + found.group(0) + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                # _nds_ = _nds_.to_dict()
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes,
                        relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                # Elapsed time is "now - start"; the operands used to be
                # reversed, which logged a negative duration.
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes,
                relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships
class DvpGraph(BaseGraph):
    """Graph builder for the '企业发展' documents (competitor info read
    from ``content['竞品信息']``)."""

    def __init__(self, **kwargs):
        """Initialise the base graph and create the ``BaseModel`` used
        by every query in this class (``tn='cq_all'``)."""
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities used
        here. A unique key is indexed automatically, so it does not need
        a separate index. Both mappings are currently empty.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        constraint = {
            # 'News': [News.primarykey],
            # 'Possession': [Possession.primarykey],
            # 'Involveder': ['HASH_ID'],
        }
        index = {
            # 'Enterprise': [('NAME',)]
        }
        self.add_index_and_constraint(index, constraint)

    def create_all_relationship(self):
        """
        1.enterprise -[compete]->enterprise

        Walk the '企业发展' documents, resolve (or create) each enterprise
        node and its competitors, and merge ``Compete`` relationships into
        the graph in batches of more than 1000.
        :return:
        """
        ops = self.base.query(sql={'metaModel': '企业发展'},
                              field={
                                  'name': 1,
                                  'url': 1,
                                  'content.竞品信息': 1
                              },
                              limit=1000,
                              no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        for o in ops:
            k += 1
            # if k < 41321:
            #     continue
            # TODO(leung): note that the url found in modules other than
            #  the basic info cannot identify the company
            etp_n = self.match_node(*legal,
                                    cypher='_.NAME = "{}"'.format(o['name']))
            if etp_n is None:
                # The company is not in the graph yet, so create it.
                _ = self.base.query_one(sql={
                    'metaModel': '基本信息',
                    'name': o['name']
                })
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating judicial relationships also creates
                    # enterprises missing from the database, but not their
                    # basic relationships, so those are added here.
                    relationships += eg.create_relationship_from_enterprise_baseinfo(
                        _)
                else:
                    # No basic info for this company: create a company
                    # node with incomplete information.
                    # etp = Enterprise({'name': o['name'], 'url': o['url']})
                    etp = Related()
                    etp['NAME'] = o['name']
                    etp['URL'] = o['url']
                    etp_n = self.get_neo_node(etp)
                if etp_n is None:
                    # Same policy as NewsGraph: without a start node no
                    # relationship can be built for this document.
                    continue
            if '竞品信息' in o['content'].keys():
                data = self.get_format_dict(o['content']['竞品信息'])
                for d in data:
                    etp_2 = d.pop('关联企业')
                    if etp_2['名称'] is None or len(etp_2['名称']) <= 1:
                        continue
                    # Use the class-level parser like the sibling method
                    # does; `etp` may be a Related instance here.
                    etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
                    etp_n_2 = self.match_node(*legal,
                                              cypher='_.URL = "{}"'.format(
                                                  etp_2['链接']))
                    # The former extra test `etp_2['名称'] > 1` compared a
                    # str with an int (TypeError on Python 3); the name
                    # length is already checked above.
                    if etp_n_2 is None:
                        _ = {
                            'URL': etp_2['链接'],
                            'NAME': etp_2['名称'],
                            '简介': d.pop('产品介绍'),
                            '成立日期': d.pop('成立日期'),
                            '融资信息': d.pop('融资信息'),
                            '所属地': d.pop('所属地'),
                        }
                        etp_n_2 = Related(**_)
                        etp_n_2 = self.get_neo_node(etp_n_2)
                    if etp_n_2 is None:
                        continue
                    relationships.append(
                        Compete(etp_n, etp_n_2, **d).get_relationship())
            if len(relationships) > 1000:
                i += 1
                self.graph_merge_relationships(relationships)
                if not self.index_and_constraint_statue:
                    self.create_index_and_constraint()
                print(
                    SuccessMessage(
                        '{}:success merge relationships to database '
                        'round {} and deal {}/{} enterprise,and'
                        ' merge {} relationships.'.format(
                            dt.datetime.now(), i, k, etp_count,
                            len(relationships))))
                relationships.clear()
                # return
        # Flush whatever is left after the last full batch.
        if len(relationships):
            i += 1
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(
                SuccessMessage('{}:success merge relationships to database '
                               'round {} and deal {}/{} enterprise,and'
                               ' merge {} relationships.'.format(
                                   dt.datetime.now(), i, k, etp_count,
                                   len(relationships))))
            relationships.clear()

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Collect product/competitor nodes and relationships of one
        document.

        For every product: enterprise -[Compete]-> product and, when the
        related enterprise has a usable name, related -[Produce]-> product.

        :param etp: document with 'url', 'name' and 'content' keys.
        :return: ``(nodes, relationships)``; two empty lists when the
            enterprise node cannot be obtained from ``get_neo_node``.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        if '竞品信息' in etp['content'].keys():
            data = self.get_format_dict(etp['content']['竞品信息'])
            data = Product.create_from_dict(data)
            for d in data:
                p = d.pop('product')
                p_n = self.get_neo_node(p)
                if p_n is None:
                    continue
                nodes.append(p_n)
                relationships.append(Compete(etp_n, p_n))
                etp_2 = d.pop('关联企业')
                etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
                if etp_2['名称'] is None or len(etp_2['名称']) <= 1:
                    continue
                # etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
                etp_n_2 = self.match_node(*legal,
                                          cypher='_.URL = "{}"'.format(
                                              etp_2['链接']))
                if etp_n_2 is None:
                    etp_n_2 = Enterprise(**etp_2)
                    if not etp_n_2.isEnterprise():
                        # NOTE(review): this dict is built but not used
                        # (the Related(**_) call is commented out); the
                        # pops still remove the keys from `d`.
                        _ = {
                            'URL': etp_2['链接'],
                            'NAME': etp_2['名称'],
                            '简介': d.pop('产品介绍'),
                            '成立日期': d.pop('成立日期'),
                            '融资信息': d.pop('融资信息'),
                            '所属地': d.pop('所属地'),
                        }
                        etp_n_2 = Related(**{
                            '链接': etp_2['链接'],
                            '名称': etp_2['名称']
                        })
                        # etp_n_2 = Related(**_)
                    etp_n_2 = self.get_neo_node(etp_n_2)
                if etp_n_2 is None:
                    # No node could be obtained: do not emit a Produce
                    # relationship with a missing start node.
                    continue
                nodes.append(etp_n_2)
                relationships.append(Produce(etp_n_2, p_n))
        return nodes, relationships

    def get_all_nodes_and_relationships(self, save_folder=None, **kwargs):
        """Collect nodes and relationships of every '企业发展' document,
        grouped by label.

        :param save_folder: when not None, the grouped data is handed to
            ``self.save_graph`` every 10000 documents and once at the end,
            and the in-memory dicts are cleared after each save.
        :param kwargs: forwarded to ``self.save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (both
            empty when ``save_folder`` is given, since they are cleared
            after the final save).
        """
        enterprises = self.base.query(
            sql={'metaModel': '企业发展'},
            field={
                'name': 1,
                'url': 1,
                'content.竞品信息': 1
            },
            # limit=100000,
            # skip=2000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' in a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            found = unique_code_pattern.search(ep['url'])
            if found is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + found.group(0) + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                # _nds_ = _nds_.to_dict()
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(save_folder, nodes,
                                                 relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(
                    SuccessMessage('success trans data to csv round {} and '
                                   'deal {}/{} enterprise spend {} seconds.'
                                   ''.format(j, i, etp_count,
                                             int(time.time() - _st_))))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships,
                                         **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships
class OptGraph(BaseGraph):
    """Graph builder for the '经营状况' (operating status) documents:
    licenses, biddings, checks, tax credit, import/export credit,
    recruitment, clients, suppliers, credit ratings and land transfers."""

    def __init__(self, **kwargs):
        """Initialise the base graph and create the ``BaseModel`` used
        by every query in this class (``tn='cq_all'``)."""
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities used
        here. A unique key is indexed automatically, so it does not need
        a separate index.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        # Entity objects that are used
        used_entity = [
            'License',
            'Bidding',
            'Check',
            'RandomCheck',
            'TaxCredit',
            'IAE',
            'Position',
            # 'Client',
            # 'Supplier',
            # 'Possession',
            'Plot'
        ]
        constraint = {}
        index = {}
        for l in used_entity:
            constraint[l] = [entities(l).primarykey]
            idx = entities(l).index
            if len(idx):
                index[l] = idx
        self.add_index_and_constraint(index, constraint)

    def _resolve_land_party(self, party, etp, etp_n):
        """Return the graph node of one side of a land transfer.

        The party is the current enterprise when its name or link equals
        the document's; otherwise it is matched by URL and, failing that,
        created as Enterprise, Person or Related (first one that fits).
        May return None when ``get_neo_node`` yields nothing.
        """
        if party['名称'] == etp['name'] or party['链接'] == etp['url']:
            return etp_n
        # might be a person
        node = self.match_node(*legal,
                               cypher='_.URL = "{}"'.format(party['链接']))
        if node is None:
            entity = Enterprise(**party)
            if not entity.isEnterprise():
                entity = Person(**party)
                if not entity.isPerson():
                    entity = Related(**party)
            node = self.get_neo_node(entity)
        return node

    def get_all_nodes_from_enterprise(self, etp):
        """Build the entity objects (not graph nodes) found in one
        document.

        :param etp: document with 'url', 'name' and 'content' keys.
        :return: list starting with the ``Enterprise`` itself, followed by
            the entities of every section present in ``etp['content']``.
        """
        content = etp['content']
        nodes = [Enterprise(URL=etp['url'], NAME=etp['name'])]
        if '产权交易' in content:
            # data = self.get_format_dict(etp['content']['产权交易'])
            # for d in data:
            #     bd = d.pop('标的')
            #     bd_n =
            pass
        if '行政许可' in content:
            data = content['行政许可']
            if '工商局' in data:
                d1 = self.get_format_dict(data['工商局'])
                for l in License.create_from_dict(d1, '工商局'):
                    nodes.append(l.pop('license'))
            if '信用中国' in data:
                d2 = self.get_format_dict(data['信用中国'])
                for l in License.create_from_dict(d2, '信用中国'):
                    nodes.append(l.pop('license'))
        if '招投标信息' in content:
            # Published bidding info is usually the result; co-bidders
            # generally cannot be found unless they won jointly.
            data = self.get_format_dict(content['招投标信息'])
            for b in Bidding.create_from_dict(data):
                nodes.append(b.pop('bidding'))
        if '抽查检查' in content:
            data = self.get_format_dict(content['抽查检查'])
            for c in Check.create_from_dict(data):
                nodes.append(c.pop('check'))
        if '双随机抽查' in content:
            data = self.get_format_dict(content['双随机抽查'])
            # rcs_n = self.get_neo_node(rcs)
            for rc in RandomCheck.create_from_dict(data):
                # TODO(leung): random checks have no result
                nodes.append(rc.pop('check'))
        if '税务信用' in content:
            data = self.get_format_dict(content['税务信用'])
            # ts_n = self.get_neo_node(ts)
            for t in TaxCredit.create_from_dict(data):
                nodes.append(t.pop('TaxCredit'))
        if '进出口信用' in content:
            data = self.get_format_dict(content['进出口信用'])
            # ies_n = self.get_neo_node(ies)
            for ie in IAE.create_from_dict(data):
                nodes.append(ie.pop('iae'))
        if '招聘' in content:
            data = self.get_format_dict(content['招聘'])
            for r in Position.create_from_dict(data):
                nodes.append(r.pop('position'))
        if '客户' in content:
            data = self.get_format_dict(content['客户'])
            for c in Client.create_from_dict(data):
                nodes.append(c.pop('client'))
        if '供应商' in content:
            data = self.get_format_dict(content['供应商'])
            for s in Supplier.create_from_dict(data):
                nodes.append(s.pop('supplier'))
        if '信用评级' in content:
            data = self.get_format_dict(content['信用评级'])
            for d in data:
                # The rating company arrives as a raw mapping; wrap it in
                # an Enterprise (as the relationship methods do) so that
                # get_all_nodes can call .to_dict() on it.
                # NOTE(review): assumes the mapping is a plain dict.
                nodes.append(Enterprise(**d.pop('评级公司')))
        if '土地转让' in content:
            data = self.get_format_dict(content['土地转让'])
            for d in data:
                # The two parties are removed so that only the plot's own
                # fields are passed to Plot.
                d.pop('原土地使用权人')
                d.pop('现有土地使用权人')
                nodes.append(Plot(**d))
        return nodes

    def get_all_nodes(self):
        """Collect the entities of the '经营状况' documents (query limited
        to 1000), as dicts grouped by their 'label'.

        :return: ``{label: [entity_dict, ...]}``.
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营状况',
                # 'name': '重庆轩烽建材有限公司'
            },
            limit=1000,
            # skip=2000,
            no_cursor_timeout=True)
        i, j = 0, 0
        etp_count = enterprises.count()
        nodes = {}
        # Raw string: '\w' in a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        for ep in enterprises:
            i += 1
            found = unique_code_pattern.search(ep['url'])
            if found is None:
                continue
            ep['url'] = '/firm_' + found.group(0) + '.html'
            for _nds_ in self.get_all_nodes_from_enterprise(ep):
                if _nds_ is None:
                    continue
                _nds_ = _nds_.to_dict()
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            if i % 1000 == 0:
                j += 1
                # Round number first, then processed/total (the two
                # counters used to be passed in the wrong order).
                print(
                    SuccessMessage('{}:success merge nodes to database '
                                   'round {} and deal {}/{} enterprise'
                                   ''.format(dt.datetime.now(), j, i,
                                             etp_count)))
        return nodes

    def get_all_relationships_from_enterprise(self, etp):
        """Build the relationship objects of one document.

        :param etp: document with 'url', 'name' and 'content' keys.
        :return: list of relationship objects; empty when the enterprise
            node cannot be obtained from ``get_neo_node``.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return []
        content = etp['content']
        relationships = []
        if '产权交易' in content:
            # data = self.get_format_dict(etp['content']['产权交易'])
            # for d in data:
            #     bd = d.pop('标的')
            #     bd_n =
            pass
        if '行政许可' in content:
            data = content['行政许可']
            if '工商局' in data:
                d1 = self.get_format_dict(data['工商局'])
                for l in License.create_from_dict(d1, '工商局'):
                    l_ = l.pop('license')
                    l_n = self.get_neo_node(l_)
                    if l_n is None:
                        continue
                    relationships.append(Have(etp_n, l_n, **l))
            if '信用中国' in data:
                d2 = self.get_format_dict(data['信用中国'])
                for l in License.create_from_dict(d2, '信用中国'):
                    l_ = l.pop('license')
                    l_n = self.get_neo_node(l_)
                    if l_n is None:
                        continue
                    relationships.append(Have(etp_n, l_n, **l))
        if '招投标信息' in content:
            # Published bidding info is usually the result; co-bidders
            # generally cannot be found unless they won jointly.
            data = self.get_format_dict(content['招投标信息'])
            for b in Bidding.create_from_dict(data):
                _ = b.pop('bidding')
                b_n = self.get_neo_node(_)
                if b_n is None:
                    continue
                # TODO(leung): the project category is used as the
                #  bidding result
                relationships.append(
                    TakePartIn(etp_n, b_n,
                               **dict(b, **{'RESULT': b_n['TYPE']})))
        if '抽查检查' in content:
            data = self.get_format_dict(content['抽查检查'])
            for c in Check.create_from_dict(data):
                _ = c.pop('check')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(
                    Have(etp_n, n, **dict(c, **{'RESULT': n['RESULT']})))
        if '双随机抽查' in content:
            data = self.get_format_dict(content['双随机抽查'])
            # rcs_n = self.get_neo_node(rcs)
            for rc in RandomCheck.create_from_dict(data):
                # TODO(leung): random checks have no result
                _ = rc.pop('check')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(Have(etp_n, n, **rc))
        if '税务信用' in content:
            data = self.get_format_dict(content['税务信用'])
            # ts_n = self.get_neo_node(ts)
            for t in TaxCredit.create_from_dict(data):
                _ = t.pop('TaxCredit')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                # TODO(leung): the tax credit grade is used as the tax
                #  credit rating result
                relationships.append(
                    Have(etp_n, n, **dict(RESULT=n['GRADE'], **t)))
        if '进出口信用' in content:
            data = self.get_format_dict(content['进出口信用'])
            # ies_n = self.get_neo_node(ies)
            for ie in IAE.create_from_dict(data):
                _ = ie.pop('iae')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(Have(etp_n, n, **ie))
        if '招聘' in content:
            data = self.get_format_dict(content['招聘'])
            for r in Position.create_from_dict(data):
                _ = r.pop('position')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(Recruit(etp_n, n, **r))
        if '客户' in content:
            data = self.get_format_dict(content['客户'])
            for c in Client.create_from_dict(data):
                _ = c.pop('client')
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['URL'], _['NAME']))
                if n is None:
                    n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(SellTo(etp_n, n, **c))
        if '供应商' in content:
            data = self.get_format_dict(content['供应商'])
            for s in Supplier.create_from_dict(data):
                _ = s.pop('supplier')
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['URL'], _['NAME']))
                if n is None:
                    n = self.get_neo_node(_)
                if n is None:
                    continue
                relationships.append(BuyFrom(etp_n, n, **s))
        if '信用评级' in content:
            data = self.get_format_dict(content['信用评级'])
            for d in data:
                _ = d.pop('评级公司')
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['链接'], _['名称']))
                if n is None:
                    n = Enterprise(**_)
                    n = self.get_neo_node(n)
                if n is None:
                    continue
                __ = d.pop('内容')
                d['评级内容'] = __['内容']
                d['评级链接'] = __['链接']
                relationships.append(Appraise(n, etp_n, **d))
        if '土地转让' in content:
            data = self.get_format_dict(content['土地转让'])
            for d in data:
                e1 = d.pop('原土地使用权人')
                e2 = d.pop('现有土地使用权人')
                p = Plot(**d)
                p_n = self.get_neo_node(p)
                if p_n is None:
                    continue
                n1 = self._resolve_land_party(e1, etp, etp_n)
                if n1 is not None:
                    relationships.append(Sell(n1, p_n))
                n2 = self._resolve_land_party(e2, etp, etp_n)
                if n2 is not None:
                    relationships.append(Buy(n2, p_n))
        return relationships

    def get_all_relationships(self):
        """Collect the relationships of the '经营状况' documents (query
        limited to 1000), as dicts grouped by their 'label'.

        :return: ``{label: [relationship_dict, ...]}``.
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营状况',
                # 'name': '重庆轩烽建材有限公司'
            },
            limit=1000,
            # skip=2000,
            no_cursor_timeout=True)
        i, j = 0, 0
        etp_count = enterprises.count()
        relationships = {}
        # Raw string: '\w' in a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        for ep in enterprises:
            i += 1
            found = unique_code_pattern.search(ep['url'])
            if found is None:
                continue
            ep['url'] = '/firm_' + found.group(0) + '.html'
            for _rps_ in self.get_all_relationships_from_enterprise(ep):
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 1000 == 0:
                j += 1
                # Round number first, then processed/total (the two
                # counters used to be passed in the wrong order).
                print(
                    SuccessMessage('{}:success merge relationship to database '
                                   'round {} and deal {}/{} enterprise'
                                   ''.format(dt.datetime.now(), j, i,
                                             etp_count)))
        return relationships

    def create_all_relationship(self):
        """
        1.enterprise -[have or x]->x

        Walk the '经营状况' documents (query limited to 1000), resolve or
        create each enterprise node, build the relationships of every
        section in the document and merge them into the graph in batches
        of more than 1000.
        :return:
        """
        ops = self.base.query(
            sql={
                'metaModel': '经营状况',
                # 'name': '重庆轩烽建材有限公司'
            },
            limit=1000,
            # skip=2000,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        # etp = Enterprise()
        for o in ops:
            k += 1
            # TODO(leung): note that the url found in modules other than
            #  the basic info cannot identify the company
            etp_n = self.match_node(*legal,
                                    cypher='_.NAME = "{}"'.format(o['name']))
            if etp_n is None:
                # The company is not in the graph yet, so create it.
                _ = self.base.query_one(sql={
                    'metaModel': '基本信息',
                    'name': o['name']
                })
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating judicial relationships also creates
                    # enterprises missing from the database, but not their
                    # basic relationships, so those are added here.
                    relationships += eg.create_relationship_from_enterprise_baseinfo(
                        _)
                else:
                    # No basic info for this company: create a company
                    # node with incomplete information. A company stored
                    # in neo4j with only name and url therefore has no
                    # "basic info".
                    etp = Related()
                    etp['NAME'] = o['name']
                    etp['URL'] = o['url']
                    etp_n = self.get_neo_node(etp)
                if etp_n is None:
                    # Same policy as NewsGraph: without a start node no
                    # relationship can be built for this document.
                    continue
            # The sections live on the document `o`. The code used to read
            # `etp['content']`, but `etp` is an entity object and is not
            # even bound when the enterprise was matched directly.
            content = o['content']
            if '产权交易' in content:
                # data = self.get_format_dict(etp['content']['产权交易'])
                # for d in data:
                #     bd = d.pop('标的')
                #     bd_n =
                pass
            if '行政许可' in content:
                data = content['行政许可']
                if '工商局' in data:
                    d1 = self.get_format_dict(data['工商局'])
                    for l in License.create_from_dict(d1, '工商局'):
                        l_ = l.pop('license')
                        l_n = self.get_neo_node(l_)
                        if l_n is None:
                            continue
                        relationships.append(
                            Have(etp_n, l_n, **l).get_relationship())
                if '信用中国' in data:
                    d2 = self.get_format_dict(data['信用中国'])
                    for l in License.create_from_dict(d2, '信用中国'):
                        l_ = l.pop('license')
                        l_n = self.get_neo_node(l_)
                        if l_n is None:
                            continue
                        relationships.append(
                            Have(etp_n, l_n, **l).get_relationship())
            if '招投标信息' in content:
                # Published bidding info is usually the result; co-bidders
                # generally cannot be found unless they won jointly.
                data = self.get_format_dict(content['招投标信息'])
                for b in Bidding.create_from_dict(data):
                    _ = b.pop('bidding')
                    b_n = self.get_neo_node(_)
                    if b_n is None:
                        continue
                    # TODO(leung): the project category is used as the
                    #  bidding result
                    relationships.append(
                        TakePartIn(etp_n, b_n,
                                   **dict(b, **{'RESULT': b_n['TYPE']})
                                   ).get_relationship())
            if '抽查检查' in content:
                data = self.get_format_dict(content['抽查检查'])
                for c in Check.create_from_dict(data):
                    _ = c.pop('check')
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                    relationships.append(
                        Have(etp_n, n,
                             **dict(c, **{'RESULT': n['RESULT']})
                             ).get_relationship())
            if '双随机抽查' in content:
                data = self.get_format_dict(content['双随机抽查'])
                # rcs_n = self.get_neo_node(rcs)
                for rc in RandomCheck.create_from_dict(data):
                    # TODO(leung): random checks have no result
                    _ = rc.pop('check')
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                    relationships.append(
                        Have(etp_n, n, **rc).get_relationship())
            if '税务信用' in content:
                data = self.get_format_dict(content['税务信用'])
                # ts_n = self.get_neo_node(ts)
                for t in TaxCredit.create_from_dict(data):
                    _ = t.pop('TaxCredit')
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                    # TODO(leung): the tax credit grade is used as the
                    #  tax credit rating result
                    relationships.append(
                        Have(etp_n, n,
                             **dict(RESULT=n['GRADE'], **t)
                             ).get_relationship())
            if '进出口信用' in content:
                data = self.get_format_dict(content['进出口信用'])
                # ies_n = self.get_neo_node(ies)
                for ie in IAE.create_from_dict(data):
                    _ = ie.pop('iae')
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                    relationships.append(
                        Have(etp_n, n, **ie).get_relationship())
            if '招聘' in content:
                data = self.get_format_dict(content['招聘'])
                for r in Position.create_from_dict(data):
                    _ = r.pop('position')
                    n = self.get_neo_node(_)
                    if n is None:
                        continue
                    relationships.append(
                        Recruit(etp_n, n, **r).get_relationship())
            if '客户' in content:
                data = self.get_format_dict(content['客户'])
                for c in Client.create_from_dict(data):
                    _ = c.pop('client')
                    n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            _['URL'], _['NAME']))
                    if n is None:
                        n = self.get_neo_node(_)
                    if n is None:
                        continue
                    relationships.append(
                        SellTo(etp_n, n, **c).get_relationship())
            if '供应商' in content:
                data = self.get_format_dict(content['供应商'])
                for s in Supplier.create_from_dict(data):
                    _ = s.pop('supplier')
                    n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            _['URL'], _['NAME']))
                    if n is None:
                        n = self.get_neo_node(_)
                    if n is None:
                        continue
                    relationships.append(
                        BuyFrom(etp_n, n, **s).get_relationship())
            if '信用评级' in content:
                data = self.get_format_dict(content['信用评级'])
                for d in data:
                    _ = d.pop('评级公司')
                    n = self.match_node(
                        *legal,
                        cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                            _['链接'], _['名称']))
                    if n is None:
                        n = Related()
                        n['NAME'] = _['名称']
                        n['URL'] = _['链接']
                        n = self.get_neo_node(n)
                    if n is None:
                        continue
                    __ = d.pop('内容')
                    d['评级内容'] = __['内容']
                    d['评级链接'] = __['链接']
                    relationships.append(
                        Appraise(n, etp_n, **d).get_relationship())
            if '土地转让' in content:
                data = self.get_format_dict(content['土地转让'])
                for d in data:
                    e1 = d.pop('原土地使用权人')
                    e2 = d.pop('现有土地使用权人')
                    p = Plot(**d)
                    p_n = self.get_neo_node(p)
                    if p_n is None:
                        continue
                    if e1['名称'] == o['name'] or e1['链接'] == o['url']:
                        n1 = etp_n
                    else:
                        # might be a person
                        n1 = self.match_node(*legal,
                                             cypher='_.URL = "{}"'.format(
                                                 e1['链接']))
                        if n1 is None:
                            n1 = Related(**e1)
                            n1 = self.get_neo_node(n1)
                    if n1 is not None:
                        relationships.append(Sell(n1, p_n).get_relationship())
                    if e2['名称'] == o['name'] or e2['链接'] == o['url']:
                        n2 = etp_n
                    else:
                        n2 = self.match_node(*legal,
                                             cypher='_.URL = "{}"'.format(
                                                 e2['链接']))
                        if n2 is None:
                            n2 = Related(**e2)
                            n2 = self.get_neo_node(n2)
                    if n2 is not None:
                        relationships.append(Buy(n2, p_n).get_relationship())
            if len(relationships) > 1000:
                i += 1
                self.graph_merge_relationships(relationships)
                if not self.index_and_constraint_statue:
                    self.create_index_and_constraint()
                print(
                    SuccessMessage(
                        '{}:success merge relationships to database '
                        'round {} and deal {}/{} enterprise,and'
                        ' merge {} relationships.'.format(
                            dt.datetime.now(), i, k, etp_count,
                            len(relationships))))
                relationships.clear()
                # return
        # Flush whatever is left after the last full batch.
        if len(relationships):
            i += 1
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(
                SuccessMessage('{}:success merge relationships to database '
                               'round {} and deal {}/{} enterprise,and'
                               ' merge {} relationships.'.format(
                                   dt.datetime.now(), i, k, etp_count,
                                   len(relationships))))
            relationships.clear()

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Collect the graph nodes and relationship objects of one
        document.

        :param etp: document with 'url', 'name' and 'content' keys.
        :return: ``(nodes, relationships)``; two empty lists when the
            enterprise node cannot be obtained from ``get_neo_node``.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        content = etp['content']
        nodes, relationships = [etp_n], []
        if '产权交易' in content:
            # data = self.get_format_dict(etp['content']['产权交易'])
            # for d in data:
            #     bd = d.pop('标的')
            #     bd_n =
            pass
        if '行政许可' in content:
            data = content['行政许可']
            if '工商局' in data:
                d1 = self.get_format_dict(data['工商局'])
                for l in License.create_from_dict(d1, '工商局'):
                    l_ = l.pop('license')
                    l_n = self.get_neo_node(l_)
                    if l_n is None:
                        continue
                    nodes.append(l_n)
                    relationships.append(Have(etp_n, l_n, **l))
            if '信用中国' in data:
                d2 = self.get_format_dict(data['信用中国'])
                for l in License.create_from_dict(d2, '信用中国'):
                    l_ = l.pop('license')
                    l_n = self.get_neo_node(l_)
                    if l_n is None:
                        continue
                    nodes.append(l_n)
                    relationships.append(Have(etp_n, l_n, **l))
        if '招投标信息' in content:
            # Published bidding info is usually the result; co-bidders
            # generally cannot be found unless they won jointly.
            data = self.get_format_dict(content['招投标信息'])
            for b in Bidding.create_from_dict(data):
                _ = b.pop('bidding')
                b_n = self.get_neo_node(_)
                if b_n is None:
                    continue
                # TODO(leung): the project category is used as the
                #  bidding result
                nodes.append(b_n)
                relationships.append(
                    TakePartIn(etp_n, b_n,
                               **dict(b, **{'RESULT': b_n['TYPE']})))
        if '抽查检查' in content:
            data = self.get_format_dict(content['抽查检查'])
            for c in Check.create_from_dict(data):
                _ = c.pop('check')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                nodes.append(n)
                relationships.append(
                    Have(etp_n, n, **dict(c, **{'RESULT': n['RESULT']})))
        if '双随机抽查' in content:
            data = self.get_format_dict(content['双随机抽查'])
            # rcs_n = self.get_neo_node(rcs)
            for rc in RandomCheck.create_from_dict(data):
                # TODO(leung): random checks have no result
                _ = rc.pop('check')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                nodes.append(n)
                relationships.append(Have(etp_n, n, **rc))
        if '税务信用' in content:
            data = self.get_format_dict(content['税务信用'])
            # ts_n = self.get_neo_node(ts)
            for t in TaxCredit.create_from_dict(data):
                _ = t.pop('TaxCredit')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                # TODO(leung): the tax credit grade is used as the tax
                #  credit rating result
                nodes.append(n)
                relationships.append(
                    Have(etp_n, n, **dict(RESULT=n['GRADE'], **t)))
        if '进出口信用' in content:
            data = self.get_format_dict(content['进出口信用'])
            # ies_n = self.get_neo_node(ies)
            for ie in IAE.create_from_dict(data):
                _ = ie.pop('iae')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                nodes.append(n)
                relationships.append(Have(etp_n, n, **ie))
        if '招聘' in content:
            data = self.get_format_dict(content['招聘'])
            for r in Position.create_from_dict(data):
                _ = r.pop('position')
                n = self.get_neo_node(_)
                if n is None:
                    continue
                nodes.append(n)
                relationships.append(Recruit(etp_n, n, **r))
        if '客户' in content:
            data = self.get_format_dict(content['客户'])
            for c in Client.create_from_dict(data):
                cli = c.pop('client')
                cli_n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        cli['URL'], cli['NAME']))
                if cli_n is None:
                    if cli.isEnterprise():
                        cli = Enterprise(**cli.to_dict(with_label=False))
                    cli_n = self.get_neo_node(cli)
                if cli_n is None:
                    continue
                nodes.append(cli_n)
                relationships.append(SellTo(etp_n, cli_n, **c))
        if '供应商' in content:
            data = self.get_format_dict(content['供应商'])
            for s in Supplier.create_from_dict(data):
                sup = s.pop('supplier')
                sup_n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        sup['URL'], sup['NAME']))
                if sup_n is None:
                    if sup.isEnterprise():
                        sup = Enterprise(**sup.to_dict(with_label=False))
                    sup_n = self.get_neo_node(sup)
                if sup_n is None:
                    continue
                nodes.append(sup_n)
                relationships.append(BuyFrom(etp_n, sup_n, **s))
        if '信用评级' in content:
            data = self.get_format_dict(content['信用评级'])
            for d in data:
                _ = d.pop('评级公司')
                _['链接'] = Enterprise.parser_url(_['链接'])
                n = self.match_node(
                    *legal,
                    cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                        _['链接'], _['名称']))
                if n is None:
                    n = Enterprise(**_)
                    n = self.get_neo_node(n)
                if n is None:
                    continue
                __ = d.pop('内容')
                d['评级内容'] = __['内容']
                d['评级链接'] = __['链接']
                nodes.append(n)
                relationships.append(Appraise(n, etp_n, **d))
        if '土地转让' in content:
            data = self.get_format_dict(content['土地转让'])
            for d in data:
                e1 = d.pop('原土地使用权人')
                e2 = d.pop('现有土地使用权人')
                p = Plot(**d)
                p_n = self.get_neo_node(p)
                if p_n is None:
                    continue
                e1['链接'] = Enterprise.parser_url(e1['链接'])
                n1 = self._resolve_land_party(e1, etp, etp_n)
                if n1 is not None:
                    nodes.append(n1)
                    nodes.append(p_n)
                    relationships.append(Sell(n1, p_n))
                e2['链接'] = Enterprise.parser_url(e2['链接'])
                n2 = self._resolve_land_party(e2, etp, etp_n)
                if n2 is not None:
                    nodes.append(n2)
                    nodes.append(p_n)
                    relationships.append(Buy(n2, p_n))
        return nodes, relationships

    def get_all_nodes_and_relationships(self, save_folder=None, **kwargs):
        """Collect nodes and relationships of every '经营状况' document,
        grouped by label.

        :param save_folder: when not None, the grouped data is handed to
            ``self.save_graph`` every 10000 documents and once at the end,
            and the in-memory dicts are cleared after each save.
        :param kwargs: forwarded to ``self.save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (both
            empty when ``save_folder`` is given, since they are cleared
            after the final save).
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营状况',
                # 'name': '重庆轩烽建材有限公司'
            },
            # limit=100000,
            # skip=10000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' in a plain literal is an invalid escape.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            found = unique_code_pattern.search(ep['url'])
            if found is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + found.group(0) + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                # _nds_ = _nds_.to_dict()
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(save_folder, nodes,
                                                 relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(
                    SuccessMessage('success trans data to csv round {} and '
                                   'deal {}/{} enterprise spend {} seconds.'
                                   ''.format(j, i, etp_count,
                                             int(time.time() - _st_))))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships,
                                         **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships
class IndGraph(BaseGraph):
    """Build Enterprise -> Industry graph data from the ``cq_api`` collection.

    Each source document is expected to carry ``value.Result`` with the
    keys ``Name``, ``KeyNo`` and ``IndustryV3`` (these are the only fields
    this class projects and reads).
    """

    # Projection used by every query issued from this class.  A copy is
    # handed to the model on each call so the shared dict is never mutated.
    _RESULT_FIELDS = {
        '_id': 0,
        'value.Result.Name': 1,
        'value.Result.KeyNo': 1,
        'value.Result.IndustryV3': 1,
    }

    # (name key, code key, level tag) ordered from the broadest level to
    # the most specific one.
    _INDUSTRY_LEVELS = (
        ('Industry', 'IndustryCode', '一级'),
        ('SubIndustry', 'SubIndustryCode', '二级'),
        ('MiddleCategory', 'MiddleCategoryCode', '三级'),
        ('SmallCategory', 'SmallCategoryCode', '四级'),
    )

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_api',
            # tn='relationsDetail.1.0',
            # location='gcxy',
            # dbname='data'
        )

    def _prepare_api_record(self, document):
        """Normalise one raw document into the dict the builders consume.

        Renames ``Name`` to ``name`` and derives ``url`` from ``KeyNo``.

        :param document: raw document holding ``value.Result``
        :return: the (mutated) ``Result`` dict, or ``None`` when the
            document has no usable ``Result`` or no ``KeyNo``
        """
        result = (document.get('value') or {}).get('Result')
        if not result:
            # A projection omits fields that are absent in the stored
            # document, so a missing Result is skipped instead of raising.
            self.logger.info('document without value.Result skipped')
            return None
        result['name'] = result.pop('Name', None)
        key_no = result.get('KeyNo')
        if key_no is None:
            self.logger.info('{}:mismatch url'.format(result['name']))
            return None
        result['url'] = '/firm_' + key_no + '.html'
        return result

    def get_all_nodes_and_relationships_from_api(self, etp):
        """Collect the enterprise node, its industry nodes and their links.

        The industry entities are created from the data passed in rather
        than from the enterprise base information, because industry may be
        studied as a relatively independent domain and may not match the
        industry recorded in the base information exactly.

        :param etp: dict with ``name``, ``url`` and optionally
            ``IndustryV3``
        :return: ``(nodes, relationships)``; both empty when no enterprise
            node could be obtained
        """
        etp_n = self.match_node(
            'Enterprise',
            cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                Enterprise.parser_url(etp['url']), etp['name'])
        )
        if etp_n is None:
            etp_n = self.get_neo_node(
                Enterprise(URL=etp['url'], NAME=etp['name']))
            if etp_n is None:
                return [], []

        nodes, relationships = [etp_n], []
        ind = etp.get('IndustryV3')
        if not ind:
            return nodes, relationships

        # Industry nodes are created broadest-first, as before.
        level_nodes = [
            self.get_neo_node(Industry(**{
                'name': ind.get(name_key),
                'code': ind.get(code_key),
                '类别': level,
            }))
            for name_key, code_key, level in self._INDUSTRY_LEVELS
        ]
        # Most specific level first; levels that produced no node are
        # skipped so the chain links the remaining neighbours directly.
        ids = [node for node in reversed(level_nodes) if node is not None]
        nodes.extend(ids)
        if ids:
            relationships.append(Belong(etp_n, ids[0]))
        for child, parent in zip(ids, ids[1:]):
            relationships.append(Belong(child, parent))
        return nodes, relationships

    def merge_all_nodes_and_relationships(self):
        """Walk up to 10000 documents and build their industry graph data.

        NOTE(review): as in the previous implementation, the nodes and
        relationships built for each enterprise are discarded here; nothing
        is merged or returned by this method itself.
        """
        enterprises = self.base.query(
            # sql={'metaModel': '企业发展'},
            field=dict(self._RESULT_FIELDS),
            limit=10000,
            # skip=2000,
            no_cursor_timeout=True)
        for document in enterprises:
            ep = self._prepare_api_record(document)
            if ep is None:
                continue
            self.get_all_nodes_and_relationships_from_api(ep)

    def get_all_nodes_and_relationships(
            self, save_folder=None, enterprises=None, **kwargs):
        """Group industry graph data by label and optionally save it.

        :param save_folder: when given, accumulated data is passed to
            ``save_graph`` every 10000 processed items and once at the end,
            after which the accumulators are cleared
        :param enterprises: optional sized iterable of dicts with a
            ``name`` key; each one is looked up by ``value.Result.Name``.
            When omitted, the first 10000 documents are used.
        :param kwargs: forwarded to ``save_graph``
        :return: ``(nodes, relationships)`` dicts keyed by label (empty
            when ``save_folder`` is given, since they are cleared after
            saving)
        """
        if enterprises is None:
            enterprises_data = self.base.query(
                # sql={'metaModel': '企业发展'},
                field=dict(self._RESULT_FIELDS),
                limit=10000,
                # skip=2000,
                no_cursor_timeout=True)
            etp_count = 10000
            # etp_count = enterprises_data.count()
        else:
            enterprises_data = enterprises
            etp_count = len(enterprises)

        i, j = 0, 0
        nc, rc = 0, 0
        nodes, relationships = {}, {}
        _st_ = time.time()
        for ep in enterprises_data:
            i += 1
            if enterprises is not None:
                ep = self.base.query_one(
                    sql={'value.Result.Name': ep['name']},
                    field=dict(self._RESULT_FIELDS),
                )
                if ep is None:
                    continue
            ep = self._prepare_api_record(ep)
            if ep is None:
                continue

            nds, rps = self.get_all_nodes_and_relationships_from_api(ep)
            for node in nds:
                if node is None:
                    continue
                label = list(node.labels)[0]
                nodes.setdefault(label, []).append(dict(label=label, **node))
            for rel in rps:
                rel = rel.to_dict()
                relationships.setdefault(rel['label'], []).append(rel)

            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes, relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()

        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes, relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info(' {} nodes'.format(nc))
            self.logger.info(' {} relationships'.format(rc))
        return nodes, relationships
class OptRiskGraph(BaseGraph):
    """Build operating-risk graph data from the ``cq_all`` collection.

    Sections of a '经营风险' document handled here: chattel mortgages
    (动产抵押), public summons for lost notes (公示催告), administrative and
    environmental punishments (行政处罚 / 环保处罚), equity pledges (股权出质),
    bankruptcy applications (破产重组) and land mortgages (土地抵押).
    """

    # Issuers found under the '行政处罚' section, in processing order.
    _PUNISHMENT_SOURCES = ('工商局', '税务局', '信用中国', '其他')

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """Create uniqueness constraints and indexes for the entities used.

        A unique key is indexed automatically, so it needs no separate
        index.

        :return: None
        """
        # TODO(leung): keep these labels in sync with the entity classes.
        used_entity = [
            'Punishment',
            'Possession',
        ]
        constraint = {}
        index = {}
        for label in used_entity:
            constraint[label] = [entities(label).primarykey]
            idx = entities(label).index
            if len(idx):
                index[label] = idx
        self.add_index_and_constraint(index, constraint)

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _classify_party(info):
        """Wrap ``info`` as Enterprise, else Person, else Related."""
        entity = Enterprise(**info)
        if not entity.isEnterprise():
            entity = Person(**info)
            if not entity.isPerson():
                entity = Related(**info)
        return entity

    @staticmethod
    def _enterprise_or_possession(info):
        """Wrap ``info`` as Enterprise, falling back to Possession."""
        entity = Enterprise(**info)
        if not entity.isEnterprise():
            entity = Possession(**info)
        return entity

    def _resolve_party(self, party, own_name, own_url, own_node, factory,
                       labels=None, by_name=True, person_by_url=False,
                       min_name_len=None):
        """Return the graph node for one party of a record, or ``None``.

        :param party: dict holding at least '名称' and '链接'
        :param own_name: name of the enterprise being processed
        :param own_url: url of the enterprise being processed
        :param own_node: node returned when the party is that enterprise
        :param factory: callable building an entity from ``**party`` when
            no existing node matches
        :param labels: labels searched first (defaults to ``legal``)
        :param by_name: match on URL or NAME when true, on URL only
            otherwise
        :param person_by_url: additionally look for a 'Person' by URL;
            names are not used there because people often share a name
        :param min_name_len: when set, a new node is only created if the
            party name is longer than this
        """
        if party['名称'] == own_name or party['链接'] == own_url:
            return own_node
        if by_name:
            cypher = '_.URL = "{}" OR _.NAME = "{}"'.format(
                party['链接'], party['名称'])
        else:
            cypher = '_.URL = "{}"'.format(party['链接'])
        node = self.match_node(
            *(legal if labels is None else labels), cypher=cypher)
        if node is None and person_by_url:
            # TODO(leung): searching more entity types is expensive.
            node = self.match_node(
                'Person', cypher='_.URL = "{}"'.format(party['链接']))
        if node is None and (
                min_name_len is None or len(party['名称']) > min_name_len):
            node = self.get_neo_node(factory(**party))
        return node

    def _iter_punishments(self, content):
        """Yield ``(node, properties)`` for each punishment in ``content``.

        Records whose node cannot be obtained are skipped.
        """
        batches = []
        if '行政处罚' in content:
            data = content['行政处罚']
            batches.extend(
                (data[source], source) for source in self._PUNISHMENT_SOURCES)
        if '环保处罚' in content:
            batches.append((content['环保处罚'], '环保局'))
        for raw, source in batches:
            records = self.get_format_dict(raw)
            for props in Punishment.create_from_dict(records, source):
                node = self.get_neo_node(props.pop('punishment'))
                if node is not None:
                    yield node, props

    def _merge_batch(self, relationships, round_no, handled, total):
        """Merge ``relationships`` into the database, report, then clear."""
        self.graph_merge_relationships(relationships)
        if not self.index_and_constraint_statue:
            self.create_index_and_constraint()
        print(SuccessMessage(
            '{}:success merge relationships to database '
            'round {} and deal {}/{} enterprise,and'
            ' merge {} relationships.'.format(
                dt.datetime.now(), round_no, handled, total,
                len(relationships)
            )))
        relationships.clear()

    # ------------------------------------------------------------------
    # direct merge into the database
    # ------------------------------------------------------------------
    def create_all_relationship(self):
        """Merge the operating-risk relationships of every enterprise.

        1. enterprise -[have]-> punishment, plus the debt, banknote,
        pledge, bankruptcy and land-mortgage relationships of each record.
        Relationships are merged in batches once more than 1000 have
        accumulated, and once more after the last document.

        :return: None
        """
        ors = self.base.query(
            sql={
                'metaModel': '经营风险',
                # 'name': '重庆铭悦机械设备有限公司'
            },
            limit=1000,
            # skip=2000,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ors.count()
        relationships = []
        for j in ors:
            # Everything listed under an enterprise's operating risks
            # belongs to that enterprise.
            k += 1
            # TODO(leung): outside the base-info module the url cannot
            #  identify the enterprise, so it is matched by name.
            etp_n = self.match_node(
                *legal,
                cypher='_.NAME = "{}"'.format(j['name'])
            )
            if etp_n is None:
                # Not in the graph yet: create it from its base info.
                base_info = self.base.query_one(
                    sql={'metaModel': '基本信息', 'name': j['name']}
                )
                if base_info is not None:
                    etp_n = self.get_neo_node(Enterprise(base_info))
                    # The enterprise's basic relationships are not created
                    # anywhere else on this path, so add them here.
                    relationships += \
                        eg.create_relationship_from_enterprise_baseinfo(
                            base_info)
                else:
                    # No base info: treat the enterprise as a plain
                    # party related to the risk records.
                    placeholder = Related()
                    placeholder['NAME'] = j['name']
                    placeholder['URL'] = j['url']
                    etp_n = self.get_neo_node(placeholder)
            if etp_n is None:
                # Without an anchor node nothing below can be linked.
                continue

            content = j['content']
            name, url = j['name'], j['url']
            own_url = etp_n['URL']

            if '动产抵押' in content:
                for d in self.get_format_dict(content['动产抵押']):
                    amount = d.pop('被担保主债权数额')
                    debt_n = self.get_neo_node(Debt(**{
                        '债务(金额)': amount['金额'],
                        '债务(单位)': amount['单位'],
                        '履行期限': d.pop('债务人履行债务的期限'),
                    }))
                    parties = (
                        ('抵押权人', d.pop('抵押权人')),
                        ('债务人', d.pop('债务人')),
                        ('所有权或使用权人', d.pop('所有权或使用权归属')),
                    )
                    if debt_n is None:
                        continue
                    for role, party in parties:
                        node = self._resolve_party(
                            party, name, url, etp_n, Related,
                            min_name_len=1)
                        if node is not None:
                            relationships.append(Have(
                                node, debt_n, **dict(角色=role, **d)
                            ).get_relationship())

            if '公示催告' in content:
                for d in self.get_format_dict(content['公示催告']):
                    face = d.pop('票面金额')
                    bn_n = self.get_neo_node(Banknote(**{
                        '票据号': d.pop('票据号'),
                        '票据类型': d.pop('票据类型'),
                        '票面金额(金额)': face['金额'],
                        '票面金额(单位)': face['单位'],
                    }))
                    parties = (
                        ('申请人', d.pop('申请人')),
                        ('持票人', d.pop('持票人')),
                    )
                    if bn_n is None:
                        continue
                    for role, party in parties:
                        node = self._resolve_party(
                            party, name, url, etp_n, Related)
                        if node is not None:
                            relationships.append(Have(
                                node, bn_n, **dict(角色=role, **d)
                            ).get_relationship())
                    relationships.append(Have(
                        etp_n, bn_n, **dict(角色='出票人', **d)
                    ).get_relationship())

            for node, props in self._iter_punishments(content):
                relationships.append(
                    Have(etp_n, node, **props).get_relationship())

            if '股权出质' in content:
                for sh in self.get_format_dict(content['股权出质']):
                    sh = dict(sh, **self.get_format_amount(
                        '出质数额', sh.pop('出质数额')
                    ))
                    # pledgor
                    cz = sh.pop('出质人')
                    cz['链接'] = Enterprise.parser_url(cz['链接'])
                    cz_n = self._resolve_party(
                        cz, name, own_url, etp_n, Involveder,
                        person_by_url=True, min_name_len=1)
                    # pledgee
                    zq = sh.pop('质权人')
                    zq['链接'] = Enterprise.parser_url(zq['链接'])
                    zq_n = self._resolve_party(
                        zq, name, own_url, etp_n, Involveder,
                        person_by_url=True, min_name_len=1)
                    # enterprise whose equity is pledged
                    bd = sh.pop('标的企业')
                    bd['链接'] = Enterprise.parser_url(bd['链接'])
                    bd_n = self._resolve_party(
                        bd, name, own_url, etp_n, Possession,
                        person_by_url=True, min_name_len=1)
                    # 1. guaranty
                    if cz_n is not None and bd_n is not None:
                        relationships.append(
                            Guaranty(cz_n, bd_n, **sh).get_relationship()
                        )
                    # 2. pledge right
                    if zq_n is not None and bd_n is not None:
                        relationships.append(
                            Have(zq_n, bd_n, **sh).get_relationship()
                        )

            if '破产重组' in content:
                # 'Person' matches the label spelling used by the other
                # lookups in this file.
                person_or_legal = ['Person'] + legal
                for d in self.get_format_dict(content['破产重组']):
                    sq = d.pop('申请人')
                    sq_n = self._resolve_party(
                        sq, name, own_url, etp_n, Involveder,
                        labels=person_or_legal, by_name=False)
                    # the respondent is usually a legal person
                    bsq = d.pop('被申请人')
                    bsq_n = self._resolve_party(
                        bsq, name, own_url, etp_n, Involveder,
                        labels=person_or_legal, by_name=False)
                    if sq_n is not None and bsq_n is not None:
                        relationships.append(
                            Relationship(sq_n, '申请破产', bsq_n, **d)
                        )

            if '土地抵押' in content:
                for d in self.get_format_dict(content['土地抵押']):
                    area = d.pop('抵押面积')
                    p_n = self.get_neo_node(Plot(**{
                        '位置': d.pop('位置'),
                        '面积(数量)': area['数额'],
                        '面积(单位)': area['单位'],
                    }))
                    d = dict(d, **self.get_format_amount(
                        '抵押金额', d.pop('抵押金额')
                    ))
                    dy = d.pop('抵押人')
                    dyq = d.pop('抵押权人')
                    if p_n is None:
                        continue
                    dy_n = self._resolve_party(
                        dy, name, own_url, etp_n, Related)
                    if dy_n is not None:
                        relationships.append(
                            Guaranty(dy_n, p_n, **d).get_relationship()
                        )
                    dyq_n = self._resolve_party(
                        dyq, name, own_url, etp_n, Related)
                    if dyq_n is not None:
                        relationships.append(
                            Have(dyq_n, p_n, **d).get_relationship()
                        )

            if len(relationships) > 1000:
                i += 1
                self._merge_batch(relationships, i, k, etp_count)

        if len(relationships):
            i += 1
            self._merge_batch(relationships, i, k, etp_count)

    # ------------------------------------------------------------------
    # export path (nodes / relationships returned to the caller)
    # ------------------------------------------------------------------
    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Collect the risk nodes and relationships of one enterprise.

        :param etp: document with ``name``, ``url`` and ``content``
        :return: ``(nodes, relationships)`` lists; both empty when the
            enterprise node cannot be obtained
        """
        etp_n = self.get_neo_node(
            Enterprise(URL=etp['url'], NAME=etp['name']))
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        content = etp['content']
        name, url = etp['name'], etp['url']
        own_url = etp_n['URL']

        if '动产抵押' in content:
            for d in self.get_format_dict(content['动产抵押']):
                amount = d.pop('被担保主债权数额')
                debt_n = self.get_neo_node(Debt(**{
                    '债务(金额)': amount['金额'],
                    '债务(单位)': amount['单位'],
                    '履行期限': d.pop('债务人履行债务的期限'),
                }))
                # (role, party, minimum name length for creation); the
                # mortgagee is created regardless of its name length.
                parties = (
                    ('抵押权人', d.pop('抵押权人'), None),
                    ('债务人', d.pop('债务人'), 1),
                    ('所有权或使用权人', d.pop('所有权或使用权归属'), 1),
                )
                for _role, party, _min_len in parties:
                    party['链接'] = Enterprise.parser_url(party['链接'])
                if debt_n is None:
                    continue
                nodes.append(debt_n)
                for role, party, min_len in parties:
                    node = self._resolve_party(
                        party, name, url, etp_n, self._classify_party,
                        min_name_len=min_len)
                    if node is not None:
                        nodes.append(node)
                        relationships.append(Have(
                            node, debt_n, **dict(角色=role, **d)
                        ))

        if '公示催告' in content:
            for d in self.get_format_dict(content['公示催告']):
                face = d.pop('票面金额')
                bn_n = self.get_neo_node(Banknote(**{
                    '票据号': d.pop('票据号'),
                    '票据类型': d.pop('票据类型'),
                    '票面金额(金额)': face['金额'],
                    '票面金额(单位)': face['单位'],
                }))
                parties = (
                    ('申请人', d.pop('申请人')),
                    ('持票人', d.pop('持票人')),
                )
                for _role, party in parties:
                    party['链接'] = Enterprise.parser_url(party['链接'])
                if bn_n is None:
                    continue
                nodes.append(bn_n)
                for role, party in parties:
                    node = self._resolve_party(
                        party, name, url, etp_n, self._classify_party)
                    if node is not None:
                        nodes.append(node)
                        relationships.append(Have(
                            node, bn_n, **dict(角色=role, **d)
                        ))
                relationships.append(Have(
                    etp_n, bn_n, **dict(角色='出票人', **d)
                ))

        for node, props in self._iter_punishments(content):
            nodes.append(node)
            relationships.append(Have(etp_n, node, **props))

        if '股权出质' in content:
            for sh in self.get_format_dict(content['股权出质']):
                sh = dict(sh, **self.get_format_amount(
                    '出质数额', sh.pop('出质数额')
                ))
                # pledgor
                cz = sh.pop('出质人')
                cz['链接'] = Enterprise.parser_url(cz['链接'])
                cz_n = self._resolve_party(
                    cz, name, own_url, etp_n, self._classify_party,
                    person_by_url=True, min_name_len=1)
                # pledgee
                zq = sh.pop('质权人')
                zq['链接'] = Enterprise.parser_url(zq['链接'])
                zq_n = self._resolve_party(
                    zq, name, own_url, etp_n, self._classify_party,
                    person_by_url=True, min_name_len=1)
                # pledged enterprise: searched among legal entities only,
                # it is not expected to be a person
                bd = sh.pop('标的企业')
                bd['链接'] = Enterprise.parser_url(bd['链接'])
                bd_n = self._resolve_party(
                    bd, name, own_url, etp_n,
                    self._enterprise_or_possession, min_name_len=1)
                if bd_n is None:
                    continue
                nodes.append(bd_n)
                # 1. guaranty
                if cz_n is not None:
                    nodes.append(cz_n)
                    relationships.append(Guaranty(cz_n, bd_n, **sh))
                # 2. pledge right
                if zq_n is not None:
                    nodes.append(zq_n)
                    relationships.append(Have(zq_n, bd_n, **sh))

        if '破产重组' in content:
            # 'Person' matches the label spelling used by the other
            # lookups in this file.
            person_or_legal = ['Person'] + legal
            for d in self.get_format_dict(content['破产重组']):
                sq = d.pop('申请人')
                sq['链接'] = Enterprise.parser_url(sq['链接'])
                sq_n = self._resolve_party(
                    sq, name, own_url, etp_n, self._classify_party,
                    labels=person_or_legal, by_name=False)
                # the respondent is usually a legal person
                bsq = d.pop('被申请人')
                bsq['链接'] = Enterprise.parser_url(bsq['链接'])
                bsq_n = self._resolve_party(
                    bsq, name, own_url, etp_n, self._classify_party,
                    labels=person_or_legal, by_name=False)
                if sq_n is not None and bsq_n is not None:
                    nodes += [sq_n, bsq_n]
                    relationships.append(ApplyBankrupt(sq_n, bsq_n, **d))

        if '土地抵押' in content:
            for d in self.get_format_dict(content['土地抵押']):
                area = d.pop('抵押面积')
                p_n = self.get_neo_node(Plot(**{
                    '位置': d.pop('位置'),
                    '面积(数量)': area['数额'],
                    '面积(单位)': area['单位'],
                }))
                d = dict(d, **self.get_format_amount(
                    '抵押金额', d.pop('抵押金额')
                ))
                dy = d.pop('抵押人')
                dyq = d.pop('抵押权人')
                dy['链接'] = Enterprise.parser_url(dy['链接'])
                dyq['链接'] = Enterprise.parser_url(dyq['链接'])
                if p_n is None:
                    continue
                nodes.append(p_n)
                dy_n = self._resolve_party(
                    dy, name, own_url, etp_n, self._classify_party)
                if dy_n is not None:
                    nodes.append(dy_n)
                    relationships.append(Guaranty(dy_n, p_n, **d))
                dyq_n = self._resolve_party(
                    dyq, name, own_url, etp_n, self._classify_party)
                if dyq_n is not None:
                    nodes.append(dyq_n)
                    relationships.append(Have(dyq_n, p_n, **d))

        return nodes, relationships

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """Group the risk graph data of every enterprise by label.

        :param save_folder: when given, accumulated data is passed to
            ``save_graph`` every 10000 documents and once at the end, after
            which the accumulators are cleared
        :param kwargs: forwarded to ``save_graph``
        :return: ``(nodes, relationships)`` dicts keyed by label (empty
            when ``save_folder`` is given, since they are cleared after
            saving)
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '经营风险',
                # 'name': '重庆轩烽建材有限公司'
            },
            # limit=100000,
            # skip=90000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            matched = unique_code_pattern.search(ep['url'])
            if matched is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + matched.group(0) + '.html'

            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for node in nds:
                if node is None:
                    continue
                label = list(node.labels)[0]
                nodes.setdefault(label, []).append(dict(label=label, **node))
            for rel in rps:
                rel = rel.to_dict()
                relationships.setdefault(rel['label'], []).append(rel)

            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes, relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()

        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes, relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info(' {} nodes'.format(nc))
            self.logger.info(' {} relationships'.format(rc))
        return nodes, relationships
class JusGraph(BaseGraph):
    """Graph builder for the litigation ('法律诉讼') documents of enterprises.

    Two export paths are provided:

    * ``create_all_relationship`` merges relationships directly into the
      graph database in batches.
    * ``get_all_nodes_and_relationships`` groups nodes and relationships by
      label and passes them to ``save_graph``.
    """

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """Create uniqueness constraints and indexes for the entities used.

        A unique key already carries an index, so only the additional
        indexes declared by each entity are requested separately.

        :return: None
        """
        # TODO(leung): keep these labels in sync with the entity definitions
        used_entity = [
            # 'JusticeCase',
            'Ruling',
            'Involveder',
            'Executed',
            'SXExecuted',
            'LimitOrder',
            'StockFreeze'
        ]
        constraint = {}
        index = {}
        for label in used_entity:
            entity = entities(label)
            constraint[label] = [entity.primarykey]
            idx = entity.index
            if len(idx):
                index[label] = idx
        self.add_index_and_constraint(index, constraint)

    def create_relationship_from_justice_case(
            self, suspect, justice_case, **kwargs):
        """Build ``suspect -[InvolveCase]-> justice case`` relationships.

        :param suspect: start node (enterprise or person)
        :param justice_case: iterable of justice-case entities
        :param kwargs: extra relationship properties; '案件身份' is always
            overwritten with the case's ``CASE_IDENTITY``
        :return: list of relationships for the cases whose node was created
        """
        rps = []
        for jc in justice_case:
            props = dict(kwargs, **{'案件身份': jc.CASE_IDENTITY})
            jc_n = jc.get_neo_node(primarykey=jc.primarykey)
            if jc_n is None:
                self.to_logs('filed initialize justice case Neo node',
                             'ERROR')
            else:
                rps.append(
                    InvolveCase(suspect, jc_n, **props).get_relationship()
                )
        return rps

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------
    def _merge_batch(self, relationships, round_no, done, total):
        """Merge one batch into the database, report it and empty the list."""
        self.graph_merge_relationships(relationships)
        if not self.index_and_constraint_statue:
            self.create_index_and_constraint()
        print(SuccessMessage('{}:success merge relationships to database '
                             'round {} and deal {}/{} enterprise,and'
                             ' merge {} relationships.'.format(
                                 dt.datetime.now(), round_no, done, total,
                                 len(relationships)
                             )))
        relationships.clear()

    def _match_or_create_involveder(self, party, owner_name, owner_node,
                                    labels):
        """Resolve a case party for the database path.

        Returns ``owner_node`` when the party is the enterprise being
        processed, otherwise the node matched by URL among ``labels``,
        otherwise a node built from a new ``Involveder``.
        """
        if party['名称'] == owner_name or party['链接'] == owner_node['URL']:
            return owner_node
        # NOTE(review): the value is interpolated into the Cypher predicate
        # unescaped; a '"' inside it would break the query - confirm how
        # match_node consumes the string before changing it.
        node = self.match_node(
            *labels,
            cypher='_.URL = "{}"'.format(party['链接'])
        )
        if node is None:
            node = self.get_neo_node(Involveder(**party))
        return node

    def _match_or_create_party(self, party, owner_name, owner_url,
                               owner_node, labels):
        """Resolve a case party for the export path.

        Same lookup order as ``_match_or_create_involveder`` but an unknown
        party is created as ``Enterprise``, falling back to ``Person`` and
        finally ``Related`` when the entity does not classify itself.
        """
        if party['名称'] == owner_name or party['链接'] == owner_url:
            return owner_node
        # NOTE(review): unescaped interpolation, see
        # _match_or_create_involveder.
        node = self.match_node(
            *labels,
            cypher='_.URL = "{}"'.format(party['链接'])
        )
        if node is None:
            entity = Enterprise(**party)
            if not entity.isEnterprise():
                entity = Person(**party)
                if not entity.isPerson():
                    entity = Related(**party)
            node = self.get_neo_node(entity)
        return node

    def _collect_case_parties(self, etp, etp_n, section, factory, case_key,
                              nodes, relationships):
        """Collect one announcement/case section with its two party lists.

        ``factory.create_from_dict`` yields dicts holding the case entity
        under ``case_key`` plus 'defendant' and 'plaintiff' party lists;
        defendants are processed before plaintiffs.
        """
        data = self.get_format_dict(etp['content'][section])
        labels = ['Person'] + legal
        for ca in factory.create_from_dict(data):
            case_n = self.get_neo_node(ca.pop(case_key))
            if case_n is None:
                continue
            nodes.append(case_n)
            for role_key, role in (('defendant', '被告'),
                                   ('plaintiff', '原告')):
                for party in ca.pop(role_key):
                    party['链接'] = Enterprise.parser_url(party['链接'])
                    party_n = self._match_or_create_party(
                        party, etp['name'], etp_n['URL'], etp_n, labels)
                    if party_n is not None:
                        nodes.append(party_n)
                        relationships.append(
                            InvolveCase(party_n, case_n,
                                        **{'案件身份': role})
                        )

    # ------------------------------------------------------------------
    # database path
    # ------------------------------------------------------------------
    def create_all_relationship(self):
        """Merge all litigation relationships into the graph database.

        1.enterprise or person -[involve_case]->case

        :return: None
        """
        # NOTE(review): limit=100 restricts the run to the first 100
        # documents - looks like a debugging leftover, confirm.
        justices = self.base.query(
            sql={
                'metaModel': '法律诉讼',
                # 'name': '重庆思途科技有限公司'
            },
            limit=100,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = justices.count()
        relationships = []
        for j in justices:
            # Every record under a company's litigation document is
            # related to that company.
            k += 1
            # TODO(leung): the url stored in the litigation module cannot
            #  identify the company, so the company is matched by name.
            etp_n = self.match_node(
                *legal,
                cypher='_.NAME = "{}"'.format(j['name'])
            )
            if etp_n is None:
                # The company is not in the graph yet: create it.
                _ = self.base.query_one(
                    sql={'metaModel': '基本信息', 'name': j['name']}
                )
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating the node alone does not create the
                    # company's basic relationships, so add them here.
                    relationships += \
                        eg.create_relationship_from_enterprise_baseinfo(_)
                else:
                    # No basic information available: treat the company
                    # as a plain case participant.
                    etp = Involveder(**{'名称': j['name'], '链接': j['url']})
                    etp_n = self.get_neo_node(etp)
                if etp_n is None:
                    continue

            content = j['content']
            person_or_legal = ['Person'] + legal

            if '被执行人' in content:
                data = self.get_format_dict(content['被执行人'])
                for ep in Enforcement.create_from_dict(data):
                    e_n = self.get_neo_node(ep.pop('executed'))
                    if e_n is not None:
                        relationships.append(
                            InvolveCase(etp_n, e_n, **ep).get_relationship()
                        )

            # NOTE: the '司法案件' section is intentionally not processed
            # here; create_relationship_from_justice_case exists for it.

            if '裁判文书' in content:
                data = self.get_format_dict(content['裁判文书'])
                # create_from_dict returns [[Ruling, involved parties], ...]
                for ruling, involve in Judgment.create_from_dict(data):
                    rul_n = self.get_neo_node(ruling)
                    if rul_n is None:
                        continue
                    for inv in involve:
                        # inv = (case identity, name, url)
                        if j['name'] == inv[1] or j['url'] == inv[2]:
                            # The party is the current enterprise.
                            inv_n = etp_n
                        else:
                            # Match enterprises and natural persons first.
                            inv_n = self.match_node(
                                *person_or_legal,
                                cypher='_.URL = "{}"'.format(inv[2])
                            )
                            if inv_n is None:
                                # Nothing matched: create the participant.
                                ivl = Involveder()
                                ivl['NAME'] = inv[1]
                                ivl['URL'] = inv[2]
                                inv_n = self.get_neo_node(ivl)
                        if inv_n is not None:
                            relationships.append(
                                InvolveCase(
                                    inv_n, rul_n, **{'案件身份': inv[0]}
                                ).get_relationship()
                            )

            if '失信被执行人' in content:
                data = self.get_format_dict(content['失信被执行人'])
                for ep in SXEnforcement.create_from_dict(data):
                    e_n = self.get_neo_node(ep.pop('sxexecuted'))
                    if e_n is not None:
                        relationships.append(
                            InvolveCase(etp_n, e_n, **ep).get_relationship()
                        )

            if '限制高消费' in content:
                data = self.get_format_dict(content['限制高消费'])
                for d in data:
                    sq = d.pop('申请人')
                    lh = d.pop('限消令对象')
                    xg = d.pop('关联对象')
                    _ = d.pop('案号')
                    lo = dict(案号=_['名称'], 案号链接=_['链接'], **d)
                    lo_n = self.get_neo_node(LimitOrder(**lo))
                    if lo_n is None:
                        continue
                    # The applicant points at the order; the order points
                    # at the restricted and the associated parties.
                    sq_n = self._match_or_create_involveder(
                        sq, j['name'], etp_n, person_or_legal)
                    if sq_n is not None:
                        relationships.append(
                            InvolveCase(sq_n, lo_n, **{'案件身份': '申请人'}
                                        ).get_relationship()
                        )
                    lh_n = self._match_or_create_involveder(
                        lh, j['name'], etp_n, person_or_legal)
                    if lh_n is not None:
                        relationships.append(
                            InvolveCase(lo_n, lh_n, **{'案件身份': '限制对象'}
                                        ).get_relationship()
                        )
                    xg_n = self._match_or_create_involveder(
                        xg, j['name'], etp_n, person_or_legal)
                    if xg_n is not None:
                        relationships.append(
                            InvolveCase(lo_n, xg_n, **{'案件身份': '关联对象'}
                                        ).get_relationship()
                        )

            if '股权冻结' in content:
                data = self.get_format_dict(content['股权冻结'])
                for d in data:
                    bd = d.pop('标的企业')
                    zx = d.pop('被执行人')
                    _1 = d.pop('股权数额')
                    _2 = d.pop('类型|状态').split('|')
                    sf = dict(冻结数额=_1['金额'],
                              金额单位=_1['单位'],
                              类型=_2[0],
                              状态=_2[1] if len(_2) > 1 else None,
                              **d
                              )
                    sf_n = self.get_neo_node(StockFreeze(**sf))
                    if sf_n is None:
                        continue
                    # The target company is matched among legal entities
                    # only; the executed party may also be a person.
                    bd_n = self._match_or_create_involveder(
                        bd, j['name'], etp_n, legal)
                    if bd_n is not None:
                        relationships.append(
                            InvolveCase(sf_n, bd_n, **{'案件身份': '标的企业'}
                                        ).get_relationship()
                        )
                    zx_n = self._match_or_create_involveder(
                        zx, j['name'], etp_n, person_or_legal)
                    if zx_n is not None:
                        relationships.append(
                            InvolveCase(sf_n, zx_n, **{'案件身份': '被执行人'}
                                        ).get_relationship()
                        )

            if len(relationships) > 1000:
                i += 1
                self._merge_batch(relationships, i, k, etp_count)

        if len(relationships):
            i += 1
            self._merge_batch(relationships, i, k, etp_count)

    # ------------------------------------------------------------------
    # export path
    # ------------------------------------------------------------------
    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Collect the litigation nodes and relationships of one enterprise.

        :param etp: document with 'url', 'name' and 'content' keys
        :return: ``(nodes, relationships)``; both empty when the enterprise
            node itself cannot be built. ``nodes`` may contain the same
            node more than once.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        nodes, relationships = [], []
        nodes.append(etp_n)
        content = etp['content']
        person_or_legal = ['Person'] + legal

        if '法院公告' in content:
            self._collect_case_parties(
                etp, etp_n, '法院公告', CourtAnnounce, 'announce',
                nodes, relationships)
        if '开庭公告' in content:
            self._collect_case_parties(
                etp, etp_n, '开庭公告', OpenAnnounce, 'announce',
                nodes, relationships)
        if '送达公告' in content:
            self._collect_case_parties(
                etp, etp_n, '送达公告', DeliveryAnnounce, 'announce',
                nodes, relationships)
        if '立案信息' in content:
            self._collect_case_parties(
                etp, etp_n, '立案信息', RegisterCase, 'case',
                nodes, relationships)

        if '终本案件' in content:
            data = self.get_format_dict(content['终本案件'])
            for ca in FinalCase.create_from_dict(data):
                c_n = self.get_neo_node(ca.pop('case'))
                if c_n is None:
                    continue
                nodes.append(c_n)
                relationships.append(
                    InvolveCase(etp_n, c_n)
                )

        if '裁判文书' in content:
            data = self.get_format_dict(content['裁判文书'])
            # create_from_dict returns [[Ruling, involved parties], ...]
            for ruling, involve in Judgment.create_from_dict(data):
                rul_n = self.get_neo_node(ruling)
                if rul_n is None:
                    continue
                nodes.append(rul_n)
                for inv in involve:
                    # inv = [case identity, name, url]
                    inv[2] = Enterprise.parser_url(inv[2])
                    # This section compares against the document url, not
                    # the node's URL property.
                    inv_n = self._match_or_create_party(
                        {'名称': inv[1], '链接': inv[2]},
                        etp['name'], etp['url'], etp_n, person_or_legal)
                    if inv_n is not None:
                        nodes.append(inv_n)
                        relationships.append(
                            InvolveCase(
                                inv_n, rul_n, **{'案件身份': inv[0]}
                            )
                        )

        if '被执行人' in content:
            data = self.get_format_dict(content['被执行人'])
            for ep in Enforcement.create_from_dict(data):
                e_n = self.get_neo_node(ep.pop('executed'))
                if e_n is not None:
                    nodes.append(e_n)
                    relationships.append(
                        InvolveCase(etp_n, e_n, **ep)
                    )

        if '失信被执行人' in content:
            data = self.get_format_dict(content['失信被执行人'])
            for ep in SXEnforcement.create_from_dict(data):
                e_n = self.get_neo_node(ep.pop('sxexecuted'))
                if e_n is not None:
                    nodes.append(e_n)
                    relationships.append(
                        InvolveCase(etp_n, e_n, **ep)
                    )

        if '限制高消费' in content:
            data = self.get_format_dict(content['限制高消费'])
            for d in data:
                sq = d.pop('申请人')
                lh = d.pop('限消令对象')
                xg = d.pop('关联对象')
                sq['链接'] = Enterprise.parser_url(sq['链接'])
                lh['链接'] = Enterprise.parser_url(lh['链接'])
                xg['链接'] = Enterprise.parser_url(xg['链接'])
                _ = d.pop('案号')
                lo = dict(案号=_['名称'], 案号链接=_['链接'], **d)
                lo_n = self.get_neo_node(LimitOrder(**lo))
                if lo_n is None:
                    continue
                nodes.append(lo_n)
                sq_n = self._match_or_create_party(
                    sq, etp['name'], etp_n['URL'], etp_n, person_or_legal)
                if sq_n is not None:
                    nodes.append(sq_n)
                    relationships.append(
                        InvolveCase(sq_n, lo_n, **{'案件身份': '申请人'})
                    )
                lh_n = self._match_or_create_party(
                    lh, etp['name'], etp_n['URL'], etp_n, person_or_legal)
                if lh_n is not None:
                    nodes.append(lh_n)
                    relationships.append(
                        InvolveCase(lo_n, lh_n, **{'案件身份': '限制对象'})
                    )
                xg_n = self._match_or_create_party(
                    xg, etp['name'], etp_n['URL'], etp_n, person_or_legal)
                if xg_n is not None:
                    nodes.append(xg_n)
                    relationships.append(
                        InvolveCase(lo_n, xg_n, **{'案件身份': '关联对象'})
                    )

        if '股权冻结' in content:
            data = self.get_format_dict(content['股权冻结'])
            for d in data:
                bd = d.pop('标的企业')
                zx = d.pop('被执行人')
                bd['链接'] = Enterprise.parser_url(bd['链接'])
                zx['链接'] = Enterprise.parser_url(zx['链接'])
                _1 = d.pop('股权数额')
                _2 = d.pop('类型|状态').split('|')
                sf = dict(冻结数额=_1['金额'],
                          金额单位=_1['单位'],
                          类型=_2[0],
                          状态=_2[1] if len(_2) > 1 else None,
                          **d
                          )
                sf_n = self.get_neo_node(StockFreeze(**sf))
                if sf_n is None:
                    continue
                nodes.append(sf_n)
                # The target company is matched among legal entities only.
                bd_n = self._match_or_create_party(
                    bd, etp['name'], etp_n['URL'], etp_n, legal)
                if bd_n is not None:
                    nodes.append(bd_n)
                    relationships.append(
                        InvolveCase(sf_n, bd_n, **{'案件身份': '标的企业'})
                    )
                zx_n = self._match_or_create_party(
                    zx, etp['name'], etp_n['URL'], etp_n, person_or_legal)
                if zx_n is not None:
                    nodes.append(zx_n)
                    relationships.append(
                        InvolveCase(sf_n, zx_n, **{'案件身份': '被执行人'})
                    )
        return nodes, relationships

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """Collect the litigation graph of every enterprise, grouped by label.

        :param save_folder: when given, the collected data is passed to
            ``save_graph`` every 10000 documents and once at the end, and
            the in-memory dicts are emptied after each save
        :param kwargs: forwarded to ``save_graph``
        :return: ``(nodes, relationships)`` dicts keyed by label (empty
            when everything was flushed to ``save_folder``)
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '法律诉讼',
                # 'name': '重庆合文贸易有限公司'
            },
            # limit=10000,
            # skip=2000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            found = unique_code_pattern.search(ep['url'])
            if found is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            ep['url'] = '/firm_' + found.group(0) + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(_nds_['label'], []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes,
                        relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                # Elapsed time is "now - start"; the reversed subtraction
                # used before logged a negative number of seconds.
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes,
                relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info(' {} nodes'.format(nc))
            self.logger.info(' {} relationships'.format(rc))
        return nodes, relationships
class Timeline:
    """Chronological list of events of one company.

    Every entry of ``self.timeline`` is ``[date, module name, description]``.
    The entries are gathered from the company's documents (one document per
    ``metaModel``) by ``f1`` .. ``f7``; a document or a section that does
    not exist simply contributes no entries.
    """

    def __init__(self, name):
        """Load and sort the timeline of the company called ``name``."""
        self.bm = BaseModel(tn='qcc', location='gcxy', dbname='data')
        self.name = name
        self.timeline = []
        self.getTimeline()
        self.timeline.sort(key=lambda x: x[0], reverse=False)

    # ------------------------------------------------------------------
    # helpers
    # ------------------------------------------------------------------
    def _content(self, meta):
        """Return the 'content' of this company's ``meta`` document or None."""
        d = self.bm.query_one(sql={'name': self.name, 'metaModel': meta})
        if d is None:
            return None
        return d['content']

    @staticmethod
    def _rows(content, *path):
        """Return the records stored under ``content[path[0]][path[1]]...``.

        An empty list is returned when any key of ``path`` is missing or
        the value found is not a dict, so an absent section is skipped
        instead of raising ``KeyError``.
        """
        node = content
        for key in path:
            if not isinstance(node, dict) or key not in node:
                return []
            node = node[key]
        if not isinstance(node, dict):
            return []
        return list(node.values())

    def _add(self, date, meta, text):
        """Append one ``[date, meta, text]`` entry to the timeline."""
        self.timeline.append([date, meta, text])

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------
    def getTimeline(self):
        """Collect the entries of all modules and drop the undated ones."""
        self.f1()
        self.f2()
        self.f3()
        self.f4()
        self.f5()
        self.f6()
        self.f7()
        self.timeline = [t for t in self.timeline if t[0] is not None]

    def to_excel(self, path):
        """Write the timeline into the workbook template and save to ``path``.

        Entries are written from row 3 on, columns 1-3 (date, module,
        description).
        """
        import os

        # os.path.join instead of a hard-coded '\\' so the template is
        # also found on non-Windows systems.
        wb = load_workbook(os.path.join(project_dir, 'xxx公司发展历程.xlsx'))
        sh = wb['公司历程']
        sh['A1'] = self.name
        for i, entry in enumerate(self.timeline):
            sh.cell(i + 3, 1, entry[0])
            sh.cell(i + 3, 2, entry[1])
            sh.cell(i + 3, 3, entry[2])
        wb.save(path)

    def f1(self):
        """Collect the events of the '基本信息' document."""
        meta = '基本信息'
        d = self._content(meta)
        if d is None:
            return

        business = d.get('工商信息') if isinstance(d, dict) else None
        if isinstance(business, dict) and '#1' in business:
            info = business['#1']
            self._add(
                info['成立日期'], meta,
                '公司注册成立,注册资本{}{}'.format(info['注册资本']['金额'],
                                          info['注册资本']['单位'])
            )

        for b in self._rows(d, '变更记录'):
            self._add(
                b['变更日期'], meta,
                "公司发生{}:\n变更前:{}\n变更后:{}".format(
                    b['变更项目'],
                    b['变更前']['内容'],
                    b['变更后']['内容'],
                )
            )

        for row in self._rows(d, '股东信息'):
            if '认缴出资日期' in row.keys():
                self._add(
                    row['认缴出资日期'], meta,
                    '股东{}认缴出资{}{}'.format(row['股东']['名称'],
                                          row['认缴出资额']['金额'],
                                          row['认缴出资额']['单位'])
                )
            if '实缴出资日期' in row.keys():
                self._add(
                    row['实缴出资日期'], meta,
                    '股东{}实缴出资{}{}'.format(row['股东']['名称'],
                                          row['实缴出资额']['金额'],
                                          row['实缴出资额']['单位'])
                )

        for row in self._rows(d, '对外投资'):
            if '融资日期' in row.keys():
                self._add(
                    row['融资日期'], meta,
                    '投资{}{}{},所占比例{}'.format(row['被投资企业']['名称'],
                                            row['投资数额']['金额'],
                                            row['投资数额']['单位'],
                                            row['投资比例'])
                )

        for row in self._rows(d, '建筑资质资格'):
            self._add(row['发证日期'], meta,
                      '获得建筑资质资格,证书名称{}'.format(row['资质名称']))

        for row in self._rows(d, '股权变更'):
            self._add(
                row['公示日期'], meta,
                '股东,股权比例由{}变更为{}'.format(row['变更前股权比例'],
                                          row['变更后股权比例'])
            )

    def f2(self):
        """Collect the events of the '经营状况' document."""
        meta = '经营状况'
        d = self._content(meta)
        if d is None:
            return

        for row in self._rows(d, '产权交易'):
            self._add(
                row['交易日期'], meta,
                '以{}{}转让{}给{}'.format(row['转让价格']['金额'],
                                      row['转让价格']['单位'],
                                      row['标的企业']['名称'],
                                      row['转让方']['名称'])
            )

        for row in self._rows(d, '抽查检查'):
            self._add(
                row['日期'], meta,
                '{}对公司进行{}检查,检查结果为{}'.format(row['实施机关'],
                                              row['类型'], row['结果'])
            )

        for row in self._rows(d, '购地信息'):
            self._add(
                row['合同签订日期'], meta,
                '购入位于{},共{}{}土地,土地用途为{},供地方式为{}'.format(
                    row['项目位置']['位置'], row['面积']['数量'],
                    row['面积']['单位'], row['土地用途'], row['供地方式'])
            )

        # Each licence record may carry data from either or both sources.
        for v in self._rows(d, '行政许可'):
            if '工商局' in v.keys():
                row = v['工商局']
                self._add(
                    row['有效期自'], meta,
                    '由{}颁布行政许可"{}"'.format(row['许可机关'],
                                           row['许可内容'])
                )
            if '信用中国' in v.keys():
                row = v['信用中国']
                self._add(
                    row['决定日期'], meta,
                    '由{}颁布行政许可"{}"'.format(row['许可机关'],
                                           row['决定文书号'])
                )

        for row in self._rows(d, '进出口信用'):
            self._add(row['注册日期'], meta,
                      '在{}注册{}进出口信息'.format(row['注册海关'],
                                            row['经营类别']))

        for row in self._rows(d, '双随机抽查'):
            self._add(row['完成日期'], meta,
                      '{}完成对公司的“{}”'.format(row['抽查机关'],
                                           row['任务名称']))

        for row in self._rows(d, '招聘'):
            self._add(
                row['发布日期'], meta,
                '发布招聘职位:{},薪资:{}'.format(row['职位']['职位'],
                                          row['月薪'])
            )

        for row in self._rows(d, '招投标信息'):
            self._add(row['发布日期'], meta,
                      '{}:{}'.format(row['项目分类'], row['描述']['描述']))

        for row in self._rows(d, '信用评级'):
            self._add(
                row['评级日期'], meta,
                '公司被{}信用评级为{}'.format(row['评级公司']['名称'],
                                       row['主体评级'])
            )

    def f3(self):
        """Collect the events of the '经营风险' document."""
        meta = '经营风险'
        d = self._content(meta)
        if d is None:
            return

        for row in self._rows(d, '动产抵押'):
            self._add(
                row['登记日期'], meta,
                '涉及动产抵押,抵押权人:{},债务人:{},所有权或使用权归属{},涉及金额{}{}'.format(
                    row['抵押权人']['名称'], row['债务人']['名称'],
                    row['所有权或使用权归属']['名称'],
                    row['被担保主债权数额']['金额'],
                    row['被担保主债权数额']['单位'])
            )

        for row in self._rows(d, '公示催告'):
            self._add(
                row['公告日期'], meta,
                '{}申请{}票据承兑,票面金额{}{},持票人:{}'.format(
                    row['申请人']['名称'],
                    row['票据类型'],
                    row['票面金额']['金额'],
                    row['票面金额']['单位'],
                    row['持票人']['名称'],
                )
            )

        for row in self._rows(d, '股权出质'):
            self._add(
                row['登记日期'], meta,
                '{}将{}出质给{},涉及金额{}{}'.format(
                    row['出质人']['名称'],
                    row['标的企业']['名称'],
                    row['质权人']['名称'],
                    row['出质数额']['金额'],
                    row['出质数额']['单位'],
                )
            )

        for row in self._rows(d, '行政处罚', '工商局'):
            # Fall back to the decision date when no publication date exists.
            self._add(
                row['公示日期'] if row['公示日期'] is not None
                else row['决定日期'],
                meta,
                '公司因{},{}对公司实施{}'.format(
                    row['违法行为类型'],
                    row['决定机关'],
                    row['处罚内容'],
                )
            )

        for row in self._rows(d, '行政处罚', '税务局'):
            self._add(row['处罚决定日期'], meta,
                      '公司因{},税务局对公司实施行政处罚'.format(row['处罚事由'], ))

        for row in self._rows(d, '行政处罚', '其他'):
            self._add(
                row['处罚日期'], meta,
                '公司因{},{}对公司实施行政处罚'.format(
                    row['处罚事由'],
                    row['处罚单位'],
                )
            )

        for row in self._rows(d, '行政处罚', '信用中国'):
            self._add(
                row['处罚日期'], meta,
                '公司因{},{}对公司实施行政处罚'.format(
                    row['处罚事由'],
                    row['处罚机关'],
                )
            )

        for row in self._rows(d, '环保处罚'):
            self._add(
                row['处罚日期'], meta,
                '公司因{},{}对公司实施行政处罚'.format(
                    row['违法类型'],
                    row['处罚单位'],
                )
            )

        for row in self._rows(d, '简易注销'):
            self._add(row['公告申请日期'], meta, '申请简易注销')

        for row in self._rows(d, '经营异常'):
            self._add(row['列入日期'], meta,
                      '因{},被列入经营异常名单'.format(row['列入原因']))
            if '移出日期' in row.keys():
                self._add(row['移出日期'], meta,
                          '因{},被移出经营异常名单'.format(row['移出原因']))

        for row in self._rows(d, '破产重组'):
            self._add(
                row['公开日期'], meta,
                '{}申请对{}进行破产重组'.format(row['申请人']['名称'],
                                        row['被申请人']['名称'])
            )

        for row in self._rows(d, '欠税公告'):
            self._add(
                row['发布日期'], meta,
                '公司涉及{}{}的欠税'.format(row['欠税余额']['金额'],
                                      row['欠税余额']['单位'])
            )

        for row in self._rows(d, '税收违法'):
            self._add(row['发布日期'], meta,
                      '公司涉税收违法,案件性质:{}'.format(row['案件性质'], ))

        for row in self._rows(d, '司法拍卖'):
            self._add(row['拍卖时间'], meta, '{}'.format(row['标题'], ))

        for row in self._rows(d, '土地抵押'):
            # The period reads "<start>至<end>"; the start date is used.
            self._add(
                row['抵押起止日期'].split('至')[0].strip().replace('\n', ''),
                meta,
                '{}将{}{}土地抵押给{},抵押金额{}{},土地坐落于{}'.format(
                    row['抵押人']['名称'], row['抵押面积']['数额'],
                    row['抵押面积']['单位'], row['抵押权人']['名称'],
                    row['抵押金额']['金额'], row['抵押金额']['单位'],
                    row['位置'])
            )

        for row in self._rows(d, '询价评估'):
            self._add(
                row['发布日期'], meta,
                '公司就{}发布询价评估,询价结果{}{}'.format(
                    row['标的物']['名称'],
                    row['询价结果']['金额'],
                    row['询价结果']['单位'],
                )
            )

        for row in self._rows(d, '严重违法'):
            self._add(row['列入日期'], meta,
                      '因{},被列入严重违法名单'.format(row['列入原因']))
            if '移出日期' in row.keys():
                self._add(row['移出日期'], meta,
                          '因{},被移出严重违法名单'.format(row['移出原因']))

        cancel = d.get('注销备案') if isinstance(d, dict) else None
        if isinstance(cancel, dict) and '清算组备案信息' in cancel.keys():
            row = cancel['清算组备案信息']
            self._add(row['清算组备案日期'], meta,
                      '公司因{}成立清算组'.format(row['注销原因'], ))

    def f4(self):
        """Collect the events of the '企业发展' document."""
        meta = '企业发展'
        d = self._content(meta)
        if d is None:
            return

        for row in self._rows(d, '企业业务'):
            self._add(row['成立日期'], meta,
                      '公司开发或生产{}产品'.format(row['产品名']['名称'], ))

        for row in self._rows(d, '融资信息'):
            self._add(
                row['日期'], meta,
                '公司向{}融资{}{}'.format(
                    row['投资方']['名称'],
                    row['金额']['金额'],
                    row['金额']['单位'],
                )
            )

    def f5(self):
        """Collect the events of the '知识产权' document."""
        meta = '知识产权'
        d = self._content(meta)
        if d is None:
            return

        for row in self._rows(d, '软件著作权'):
            # Fall back to the approval date when no release date exists.
            self._add(
                row['发布日期'] if row['发布日期'] is not None
                else row['登记批准日期'],
                meta,
                '公司申请了{}的软件著作权'.format(row['软件名称'], )
            )

        for row in self._rows(d, '商标信息'):
            self._add(row['申请日期'], meta,
                      '公司申请了商标:{}'.format(row['商标']['名称'], ))

        for row in self._rows(d, '网站信息'):
            self._add(row['审核日期'], meta,
                      '公司申请了网站备案:{}'.format(row['名称'], ))

        for row in self._rows(d, '证书信息'):
            self._add(row['发证日期'], meta,
                      '公司获得了{}证书'.format(row['证书']['名称'], ))

        for row in self._rows(d, '专利信息'):
            self._add(row['公开日期'], meta,
                      '公司申请了{}专利'.format(row['专利']['名称'], ))

        for row in self._rows(d, '作品著作权'):
            self._add(row['首次发表日期'], meta,
                      '公司申请了{}的作品著作权'.format(row['作品名称'], ))

    def f6(self):
        """Collect the events of the '法律诉讼' document."""
        meta = '法律诉讼'
        d = self._content(meta)
        if d is None:
            return

        # Best effort, as before: a malformed record stops this section
        # only and is reported through ExceptionInfo.
        try:
            for row in self._rows(d, '被执行人'):
                self._add(
                    row['立案日期'], meta,
                    '公司被列为“被执行人”,执行金额{}{}'.format(
                        row['执行标的']['金额'],
                        row['执行标的']['单位'],
                    )
                )
        except Exception as e:
            ExceptionInfo(e)

        for row in self._rows(d, '裁判文书'):
            self._add(
                row['发布日期'], meta,
                '公司涉及的“{}”案件:{},裁决结果:{}'.format(
                    row['案由'],
                    row['案号'],
                    row['裁判文书']['标题'],
                )
            )

        for row in self._rows(d, '法院公告'):
            self._add(
                row['刊登日期'], meta,
                '法院公告:{}与{}的“{}”'.format(
                    row['被告人/被告/被上诉人/被申请人']['名称'],
                    row['公诉人/原告/上诉人/申请人']['名称'], row['案由'])
            )

        # NOTE: the '股权冻结' section is not exported to the timeline.

        for row in self._rows(d, '开庭公告'):
            self._add(
                row['开庭时间'], meta,
                '开庭公告:{}与{}的“{}”'.format(
                    row['被告人/被告/被上诉人/被申请人']['名称'],
                    row['公诉人/原告/上诉人/申请人']['名称'], row['案由'])
            )

        for row in self._rows(d, '立案信息'):
            self._add(
                row['立案日期'], meta,
                '立案信息:{}与{}的“{}”'.format(
                    row['被告人/被告/被上诉人/被申请人']['名称'],
                    row['公诉人/原告/上诉人/申请人']['名称'], row['案由'])
            )

        for row in self._rows(d, '失信被执行人'):
            self._add(row['发布日期'], meta,
                      '公司被列为失信被执行人,履行情况:{}'.format(row['履行情况']))

        for row in self._rows(d, '送达公告'):
            self._add(
                row['发布日期'], meta,
                '送达公告:{}与{}的“{}”'.format(
                    row['被告人/被告/被上诉人/被申请人']['名称'],
                    row['公诉人/原告/上诉人/申请人']['名称'], row['案由'])
            )

        for row in self._rows(d, '限制高消费'):
            self._add(
                row['发布日期'], meta,
                '因“{}”,{}申请对{}实施限制高消费'.format(
                    row['案号']['名称'],
                    row['申请人']['名称'],
                    row['限消令对象']['名称'],
                )
            )

        for row in self._rows(d, '终本案件'):
            self._add(
                row['终本日期'], meta,
                '终本案件:执行标的{}{},未履行{}{}'.format(
                    row['执行标的']['金额'],
                    row['执行标的']['单位'],
                    row['未履行金额']['金额'],
                    row['未履行金额']['单位'],
                )
            )

    def f7(self):
        """Collect the events of the '公司新闻' document."""
        meta = '公司新闻'
        d = self._content(meta)
        if d is None:
            return

        for row in self._rows(d, '企业公告'):
            self._add(row['日期'], meta,
                      '发布企业公告:{}'.format(row['标题'], ))

        for row in self._rows(d, '相关公告'):
            self._add(row['日期'], meta, '相关公告:{}'.format(row['标题'], ))

        for row in self._rows(d, '新闻舆情'):
            self._add(row['发布时间'], meta,
                      '公司新闻:{}'.format(row['新闻标题'], ))
class RightsGraph(BaseGraph):
    """Graph builder for the intellectual-property ('知识产权') module.

    Records whose ``metaModel`` is '知识产权' are read through ``self.base``
    and every enterprise is linked with ``Have`` relationships to the
    websites, certificates, patents, trademarks, copyrights, Weibo accounts,
    WeChat official accounts, applets and apps found in its ``content``.
    """

    def __init__(self, **kwargs):
        """Initialise the base graph and create the model used for queries.

        :param kwargs: forwarded unchanged to ``BaseGraph.__init__``.
        """
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(
            tn='cq_all',
            # tn='qcc.1.1',
            # location='gcxy',
            # dbname='data'
        )

    def create_index_and_constraint(self):
        """Create uniqueness constraints and indexes for the entities used.

        A unique key is indexed automatically, so no separate index has to
        be created for it.

        :return: None
        """
        # Entity labels handled by this graph.
        used_entity = [
            'Website',
            'Certificate',
            'Patent',
            'Trademark',
            'App',
            'WorkCopyRight',
            'SoftCopyRight',
            'Weibo',
            'OfficialAccount',
            'Applets',
        ]
        constraint = {}
        index = {}
        for label in used_entity:
            entity = entities(label)
            constraint[label] = [entity.primarykey]
            idx = entity.index
            # Only register labels that actually declare extra indexes.
            if len(idx):
                index[label] = idx
        self.add_index_and_constraint(index, constraint)

    def _rights_sections(self):
        """Return the ``(content key, entity class, popped key)`` table.

        The table is built at call time so the entity classes are resolved
        exactly when the original inline code resolved them. The order is
        the order in which the sections were processed before.
        """
        return (
            ('网站信息', Website, 'website'),
            ('证书信息', Certificate, 'certificate'),
            ('专利信息', Patent, 'patent'),
            ('商标信息', Trademark, 'trademark'),
            ('软件著作权', SoftCopyRight, 'softcopyright'),
            ('作品著作权', WorkCopyRight, 'workcopyright'),
            ('微博', Weibo, 'weibo'),
            ('微信公众号', OfficialAccount, 'WeChat'),
            ('小程序', Applets, 'applets'),
            ('APP', App, 'app'),
        )

    def _iter_rights(self, etp_n, content):
        """Yield ``(node, Have)`` pairs for every right found in *content*.

        :param etp_n: enterprise node used as the start of each ``Have``.
        :param content: the ``content`` mapping of one '知识产权' record.
        :return: generator of ``(neo node, Have relationship object)``;
            entities for which ``get_neo_node`` returns None are skipped.
        """
        for section, entity_cls, node_key in self._rights_sections():
            if section not in content:
                continue
            data = self.get_format_dict(content[section])
            for props in entity_cls.create_from_dict(data):
                # The entity itself travels under ``node_key``; whatever is
                # left in ``props`` becomes the relationship's properties.
                entity = props.pop(node_key)
                node = self.get_neo_node(entity)
                if node is not None:
                    yield node, Have(etp_n, node, **props)

    def create_all_relationship(self, skip=79175 + 7909):
        """Merge ``enterprise -[have]-> x`` relationships into the database.

        Relationships are collected per enterprise and merged in batches
        once more than 1000 are pending; a final merge flushes the rest.

        :param skip: number of source records to skip. The default is the
            offset that used to be hard-coded here.
            NOTE(review): it looks like a resume point of an interrupted
            import -- pass ``skip=0`` to process every record; confirm.
        :return: None
        """
        rts = self.base.query(
            sql={'metaModel': '知识产权'},
            # limit=100,
            skip=skip,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = rts.count()
        relationships = []
        s_t = time.time()
        for r in rts:
            k += 1
            # TODO(leung): note that the url found in modules other than
            #  the basic info cannot identify the company.
            # NOTE(review): the name is interpolated unescaped; a name
            #  containing '"' would alter the filter -- confirm how
            #  match_node treats ``cypher``.
            etp_n = self.match_node(
                *legal, cypher='_.NAME = "{}"'.format(r['name']))
            if etp_n is None:
                # The company is not in the graph yet, so create it.
                _ = self.base.query_one(sql={
                    'metaModel': '基本信息',
                    'name': r['name']
                })
                if _ is not None:
                    etp = Enterprise(_)
                    etp_n = self.get_neo_node(etp)
                    # Creating the enterprise node alone does not create
                    # its basic relationships, so add them here.
                    relationships += \
                        eg.create_relationship_from_enterprise_baseinfo(_)
                else:
                    # No basic info available: create a company node with
                    # incomplete information.
                    etp = Related(**{'名称': r['name'], '链接': r['url']})
                    etp_n = self.get_neo_node(etp)
            if etp_n is None:
                # Still no node: skip instead of building Have(None, ...),
                # as the news graph does.
                continue

            for _node, have in self._iter_rights(etp_n, r['content']):
                relationships.append(have.get_relationship())

            if len(relationships) > 1000:
                i += 1
                sp = int(time.time() - s_t)
                s_t = time.time()
                self.graph_merge_relationships(relationships)
                if not self.index_and_constraint_statue:
                    self.create_index_and_constraint()
                print(
                    SuccessMessage(
                        '{}:success merge relationships to database '
                        'round {} and deal {}/{} enterprise and spend {} '
                        'seconds,and merge {} relationships.'.format(
                            dt.datetime.now(), i, k, etp_count, sp,
                            len(relationships))))
                relationships.clear()
        # Flush whatever is left after the last full batch.
        if len(relationships):
            i += 1
            self.graph_merge_relationships(relationships)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(
                SuccessMessage('{}:success merge relationships to database '
                               'round {} and deal {}/{} enterprise,and'
                               ' merge {} relationships.'.format(
                                   dt.datetime.now(), i, k, etp_count,
                                   len(relationships))))
            relationships.clear()

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Collect the nodes and ``Have`` relationships of one enterprise.

        :param etp: one '知识产权' record; its 'url', 'name' and 'content'
            entries are read.
        :return: ``(nodes, relationships)`` lists; both are empty when no
            node can be obtained for the enterprise. The first node is the
            enterprise itself.
        """
        etp_n = Enterprise(URL=etp['url'], NAME=etp['name'])
        etp_n = self.get_neo_node(etp_n)
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        for node, have in self._iter_rights(etp_n, etp['content']):
            nodes.append(node)
            relationships.append(have)
        return nodes, relationships

    def get_all_nodes_and_relationships(self, save_folder=None, **kwargs):
        """Group all nodes/relationships by label and optionally save them.

        :param save_folder: when not None, the grouped data is handed to
            ``self.save_graph`` every 10000 records and once at the end,
            and the in-memory groups are cleared after each save.
        :param kwargs: forwarded unchanged to ``self.save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label; they are
            empty when ``save_folder`` was given because they are cleared
            after the final save.
        """
        enterprises = self.base.query(
            sql={
                'metaModel': '知识产权',
                # 'name': '重庆轩烽建材有限公司'
            },
            # limit=100000,
            # skip=2000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        # Raw string: '\w' in a plain literal is an invalid escape sequence.
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')

        def getUniqueCode(url):
            """Return the 32-character code after 'unique=' or None."""
            found = unique_code_pattern.search(url)
            return found.group(0) if found is not None else None

        _st_ = time.time()
        for ep in enterprises:
            i += 1
            uc = getUniqueCode(ep['url'])
            if uc is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
                continue
            # Rebuild the url from the code extracted above.
            ep['url'] = '/firm_' + uc + '.html'
            nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep)
            for _nds_ in nds:
                if _nds_ is None:
                    continue
                label = list(_nds_.labels)[0]
                _nds_ = dict(label=label, **_nds_)
                nodes.setdefault(label, []).append(_nds_)
            for _rps_ in rps:
                _rps_ = _rps_.to_dict()
                relationships.setdefault(_rps_['label'], []).append(_rps_)
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(save_folder, nodes,
                                                 relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                    # Elapsed time is now - start (was reversed, which
                    # always reported a negative duration).
                    self.logger.info(
                        SuccessMessage(
                            'success trans data to csv round {} and '
                            'deal {}/{} enterprise spend {} seconds.'
                            ''.format(j, i, etp_count,
                                      int(time.time() - _st_))))
                    _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships,
                                         **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships