def run():
    """Run every metaModel's handler over its raw records.

    For each metaModel, up to 1000 documents are queried from the
    ``qcc_original`` model and passed, together with the ``qcc_format``
    model, to the matching handler's ``run`` method.
    """
    source = BaseModel(tn='qcc_original')
    target = BaseModel(tn='qcc_format')

    # Insertion order of this dict is the processing order.
    handlers = {
        '基本信息': Enterprise(),
        '企业发展': Develop(),
        '法律诉讼': Judicature(),
        '经营风险': Risk(),
        '经营状况': Operating(),
        '公司新闻': News(),
        '知识产权': Right(),
    }

    for meta_model, handler in handlers.items():
        records = source.query(sql={'metaModel': meta_model}, limit=1000)
        print('\ndeal metaModel({})...'.format(meta_model))
        handler.run(records, target)
def __init__(self, name):
    """Bind the 'qcc' model, store *name* and prepare the timeline.

    ``getTimeline()`` is called right after ``self.timeline`` is reset to an
    empty list (presumably it fills the list -- its body is not visible
    here); the list is then sorted ascending by each entry's first element.
    """
    self.bm = BaseModel(tn='qcc', location='gcxy', dbname='data')
    self.name = name
    self.timeline = []
    self.getTimeline()
    self.timeline.sort(key=lambda entry: entry[0])
def __init__(self, **kwargs):
    """Initialise the base graph, then attach the 'cq_all' model as ``self.base``."""
    BaseGraph.__init__(self, **kwargs)
    self.base = BaseModel(tn='cq_all')
def __init__(self, **kwargs):
    """Initialise the base graph, then attach the 'cq_api' model as ``self.base``."""
    BaseGraph.__init__(self, **kwargs)
    self.base = BaseModel(tn='cq_api')
def f1():
    """Count how often each (key, value) pair occurs in '基本信息' documents.

    Every document is flattened with ``get_keys(..., return_value=True)``;
    each returned string is split on ':' and its first two segments are
    taken as key and value. Pairs are aggregated in chunks of 1000
    documents to bound memory, and the partial counts are summed at the end.

    :return: a DataFrame with columns ``k`` (key), ``v`` (value) and ``f``
        (number of occurrences).
    """
    bm = BaseModel(tn='qcc_spider_all_4_14', location='server', dbname='prod')
    enterprises = bm.query(sql={'metaModel': '基本信息'},
                           no_cursor_timeout=True)

    # Keys that are skipped while flattening a document.
    filtered = [
        '_id', 'metaModel', 'source', 'url', 'headers', 'get',
        'date', '序号', '日期', '链接', '时间'
    ]
    # Optional whitelist of key fragments; empty means "keep every key".
    keep = []

    def count_pairs(rows):
        """Collapse [key, value] rows into one row per pair with its count."""
        frame = pd.DataFrame(rows, columns=['k', 'v'])
        frame['f'] = 1
        return frame.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})

    partials = []
    rows = []
    try:
        for seen, etp in enumerate(enterprises, start=1):
            cs = get_keys(etp, '基本信息', return_value=True,
                          filter_key=list(filtered))
            for c in cs:
                parts = c.split(':')
                if len(parts) < 2:
                    # No separator: there is no value to count, skip the
                    # entry instead of failing with IndexError.
                    continue
                if keep and not any(kp in parts[0] for kp in keep):
                    continue
                rows.append([parts[0], parts[1]])
            if seen % 1000 == 0:
                partials.append(count_pairs(rows))
                rows.clear()
    finally:
        # A no_cursor_timeout cursor is not reaped by the server, so it has
        # to be closed explicitly.
        enterprises.close()

    partials.append(count_pairs(rows))
    ds = pd.concat(partials)
    ds = ds.groupby(['k', 'v'], as_index=False).agg({'f': 'sum'})
    return ds
def insert():
    """Bulk-load every JSON file of the 2020-04-23 dump into 'qcc_original'.

    Files are listed with ``File.get_all_file``, read with ``read_json`` and
    written with ``insert_batch``. A file whose batch insert fails is
    reported and skipped so the remaining files are still loaded.
    """
    bm = BaseModel(tn='qcc_original')
    # Backslashes are escaped explicitly; the resulting path is unchanged,
    # but the literal no longer relies on invalid escape sequences.
    fs = File.get_all_file('D:\\graph_data\\data\\qcc_20200423\\')
    for f in fs:
        js = read_json(f)
        try:
            bm.insert_batch(js)
        except Exception as e:
            # Best effort: keep going, but say which file was skipped.
            # (A bare ``except:`` would also swallow KeyboardInterrupt.)
            print('skip {}: {}'.format(f, e))
            continue
def duplication():
    """Export the ``name`` field of every '基本信息' document to qcc_names.csv.

    The aggregation matches documents by metaModel and projects only the
    ``name`` field; the result is written to 'qcc_names.csv' without index.
    """
    import pandas as pd

    model = BaseModel(tn='qcc')
    meta_models = ['基本信息']

    for meta_model in meta_models:
        pipeline = [
            {'$match': {'metaModel': meta_model}},
            {'$project': {'_id': 0, 'name': 1}},
        ]
        cursor = model.aggregate(pipeline=pipeline)
        names = pd.DataFrame(list(cursor))
        names.to_csv('qcc_names.csv', index=False)
def get_old_keys():
    """Collect the distinct flattened field paths of '公司新闻' documents.

    Each document (without its ``name``) is flattened by ``dictToDim2`` using
    '$' as separator. The distinct results are written, one per line with
    the non-empty path segments joined by commas, to
    ``<workspace>公司新闻\\字段.csv`` (gbk encoded).
    """
    bm = BaseModel(tn='qcc_original')
    metaModel = '公司新闻'
    enterprises = bm.query(
        sql={'metaModel': metaModel},
        field={'content': 1, '_id': 0, 'name': 1},
        no_cursor_timeout=True)

    exit_filed = set()
    try:
        for etp in enterprises:
            name = etp.pop('name')
            try:
                cs = dictToDim2(etp, metaModel, '$')
            except Exception as e:
                print(e)
                print(name)
                # Nothing was produced for this document; without this the
                # loop below would use an unbound or stale ``cs``.
                continue
            for c in cs:
                exit_filed.add(c)
    finally:
        # no_cursor_timeout cursors must be closed explicitly.
        enterprises.close()

    data = []
    for s in exit_filed:
        segments = [seg for seg in s.split('$') if len(seg)]
        data.append(','.join(segments) + '\n')

    fp = workspace + '{}\\'.format(metaModel)
    File.check_file(fp)
    with open(fp + '字段.csv', 'w', encoding='gbk') as f:
        f.writelines(data)
def aggregate(self, table_name, pipeline):
    """Run an aggregation pipeline and return the result as a DataFrame.

    :param table_name: name of the table to aggregate on
    :param pipeline: a list whose elements are pipeline stages; common
        stages are matching ('$match') and field selection ('$project')
    :return: DataFrame of the results; an empty DataFrame if anything fails
        (the error is passed to ``ExceptionInfo``)
    """
    try:
        model = BaseModel(table_name, self.location, self.dbname)
        cursor = model.aggregate(pipeline)
        frame = pd.DataFrame(list(cursor))
        cursor.close()
    except Exception as err:
        ExceptionInfo(err)
        return pd.DataFrame()
    return frame
def read_data(cls, table_name, field=None, **kw):
    """A simple data-reading interface.

    :param table_name: name of the table to read
    :param field: field projection forwarded to ``query``
    :param kw: equality conditions forwarded to ``query`` as the filter
    :return: DataFrame with the matching documents; an empty DataFrame when
        nothing matches or when the read fails (the error is passed to
        ``ExceptionInfo``)
    """
    cursor = None
    # Pre-bound so the failure path can still return a value.
    data = pd.DataFrame()
    try:
        cursor = BaseModel(table_name, cls.location, cls.dbname).query(kw, field)
        # An empty result already yields an empty DataFrame, so no separate
        # count round-trip is needed.
        data = pd.DataFrame(list(cursor))
    except Exception as e:
        ExceptionInfo(e)
    finally:
        # The cursor only exists if the query call itself succeeded.
        if cursor is not None:
            cursor.close()
    return data
def remove_data(cls, table_name, **kw):
    """Delete data.

    :param table_name: name of the table to delete from
    :param kw: conditions passed to ``remove`` as a dict
    :return: None
    :raises MongoIOError: if anything goes wrong while deleting
    """
    try:
        model = BaseModel(table_name, cls.location, cls.dbname)
        model.remove(kw)
    except Exception:
        raise MongoIOError('Failed with delete data by MongoDB')
def remove_data(self, table_name, **kw):
    """Delete data and hand back whatever ``remove`` returns.

    :param table_name: name of the table to delete from
    :param kw: conditions passed to ``remove`` as a dict
    :return: the result of the underlying ``remove`` call
    :raises MongoIOError: if anything goes wrong while deleting
    """
    try:
        model = BaseModel(table_name, self.location, self.dbname)
        return model.remove(kw)
    except Exception:
        raise MongoIOError('Failed with delete data by MongoDB')
def field(cls, table_name, field_name):
    """Return the distinct values of one field of a table.

    :param table_name: the database's table name
    :param field_name: the table's field name
    :return: the result of ``distinct(field_name)`` on that table
    :raises MongoIOError: if the lookup fails for any reason
    """
    try:
        model = BaseModel(table_name, cls.location, cls.dbname)
        values = model.distinct(field_name)
    except Exception:
        raise MongoIOError('query the field raise a error')
    return values
def insert_data(cls, table_name, data):
    """A simple data-insertion interface.

    :param table_name: name of the table to insert into
    :param data: object supporting ``len()`` and
        ``to_dict(orient='records')``; nothing is written when it is empty
    :return: None
    :raises MongoIOError: if anything goes wrong while inserting
    """
    try:
        if not len(data):
            return
        records = data.to_dict(orient='records')
        BaseModel(table_name, cls.location, cls.dbname).insert_batch(records)
    except Exception:
        raise MongoIOError('Failed with insert data by MongoDB')
def update_date(cls, table_name, condition, **kw):
    """Update the rows of ``table_name`` that satisfy ``condition``.

    :param table_name: name of the table to update
    :param condition: a dict such as {'date': datetime.datetime(2018, 1, 1)}
    :param kw: the new values, given as keyword arguments such as close=0
    :return: None
    :raises MongoIOError: if anything goes wrong while updating
    """
    try:
        model = BaseModel(table_name, cls.location, cls.dbname)
        model.update_batch(condition, kw)
    except Exception:
        raise MongoIOError('Failed with update by MongoDB')
def min(cls, table_name, field='_id', **kw):
    """Find the minimum of column ``field`` among rows satisfying ``kw``.

    :param table_name: name of the table to search
    :param field: name of the field to minimise (must be a str)
    :param kw: equality conditions used as the query filter
    :return: the minimum value, or None when no row matches
    :raises TypeError: if ``field`` is not a str
    """
    if not isinstance(field, str):
        raise TypeError('field must be an instance of str')
    cursor = BaseModel(table_name, cls.location, cls.dbname).query(
        sql=kw, field={field: True})
    try:
        docs = list(cursor)
    finally:
        # Close the cursor even when reading from it fails.
        cursor.close()
    if not docs:
        return None
    d = pd.DataFrame(docs)
    return d.loc[:, [field]].min()[field]
def insert_one(self, table_name, data, add_id=False):
    """Insert a single record.

    :param table_name: name of the table to insert into
    :param data: a dict; when ``add_id`` is true it is modified in place
    :param add_id: if true, store a fresh ``ObjectId`` under '_id' first
    :return: None
    :raises MongoIOError: if anything goes wrong while inserting
    """
    try:
        if add_id:
            data['_id'] = ObjectId()
        model = BaseModel(table_name, self.location, self.dbname)
        model.insert(data)
    except Exception:
        raise MongoIOError('Failed with insert data by MongoDB')
def lasted_ticker(cls, code, date, table_name='ticker'):
    """Return, per stock code, the newest tick in the minute ending at *date*.

    :param code: a stock code (str) or a list of stock codes
    :param date: a ``datetime``; ticks are matched on its calendar day and on
        ``datetime`` within [date - 1 minute, date]
    :param table_name: table to read from
    :return: DataFrame with one row per stock code (the latest by
        ``datetime``); an empty DataFrame when nothing matches or when any
        error occurs -- including the argument type errors raised below,
        which are caught by the same handler
    """
    try:
        if isinstance(code, str):
            code_filter = code
        elif isinstance(code, list):
            code_filter = {'$in': code}
        else:
            raise TypeError("'code' must be str or list of str")

        if not isinstance(date, dt.datetime):
            raise TypeError("this 'date' must be datetime")
        day = dt.datetime(date.year, date.month, date.day)
        window = {'$gte': date - dt.timedelta(minutes=1), '$lte': date}

        pipeline = [
            {'$match': {'stock_code': code_filter, 'date': day}},
            {'$match': {'datetime': window}},
        ]
        cursor = BaseModel(table_name, cls.location, cls.dbname).aggregate(pipeline)
        ticks = pd.DataFrame(list(cursor))
        if len(ticks):
            # Newest first, then keep the first row of each stock code.
            ticks = ticks.sort_values(['stock_code', 'datetime'], ascending=False)
            ticks = ticks.drop_duplicates(['stock_code'], keep='first')
            ticks = ticks.reset_index(drop=True)
        cursor.close()
        return ticks
    except Exception as err:
        ExceptionInfo(err)
        return pd.DataFrame()
def update_data(self, table_name, condition, **kw):
    """Update the rows of ``table_name`` that satisfy ``condition``.

    :param table_name: name of the table to update
    :param condition: a dict such as {'date': datetime.datetime(2018, 1, 1)}
    :param kw: the new values, given as keyword arguments such as close=0
    :return: the result of the underlying ``update_batch`` call
    :raises MongoIOError: if anything goes wrong (the original error is
        passed to ``ExceptionInfo`` first)
    """
    try:
        model = BaseModel(table_name, self.location, self.dbname)
        return model.update_batch(condition, kw)
    except Exception as err:
        ExceptionInfo(err)
        raise MongoIOError('Failed with update by MongoDB')
def read_one(self, table_name, field=None, **kw):
    """Read a single record; lighter than ``read_data`` when one row is enough.

    :param table_name: name of the table to read
    :param field: field projection forwarded to ``query_one``
    :param kw: equality conditions forwarded to ``query_one`` as the filter
    :return: a dict, or None when nothing matches or the read fails (the
        error is passed to ``ExceptionInfo``)
    """
    # No ``return`` inside ``finally``: that would also swallow
    # KeyboardInterrupt/SystemExit and could read an unbound local.
    try:
        return BaseModel(table_name, self.location, self.dbname).query_one(kw, field)
    except Exception as e:
        ExceptionInfo(e)
        return None
def insert_data(self, table_name, data, add_id=False):
    """A simple data-insertion interface.

    :param table_name: name of the table to insert into
    :param data: object with ``index``, ``len()`` and
        ``to_dict(orient='records')``; nothing is written when it is empty
    :param add_id: if true, an '_id' column holding a fresh ``ObjectId`` per
        row is assigned on ``data`` itself before inserting
    :return: None
    :raises MongoIOError: if anything goes wrong while inserting
    """
    try:
        if add_id:
            data['_id'] = data.index.map(lambda _: ObjectId())
        if len(data):
            records = data.to_dict(orient='records')
            model = BaseModel(table_name, self.location, self.dbname)
            model.insert_batch(records)
    except Exception:
        raise MongoIOError('Failed with insert data by MongoDB')
def read_data(cls, code, start_date, end_date, field=None, timemerge=False, **kw):
    """Read 'kline_tick' rows of one stock code within a date range.

    :param code: value matched against ``stock_code``
    :param start_date: inclusive lower bound on ``date``
    :param end_date: inclusive upper bound on ``date``
    :param field: field projection forwarded to ``query``
    :param timemerge: if true, the result is passed through ``cls.merge_time``
    :param kw: extra conditions merged into the filter (they take precedence
        over the generated ones on key collisions)
    :return: DataFrame of matching rows; an empty DataFrame when nothing
        matches or when any error occurs
    """
    try:
        condition = dict(stock_code=code,
                         date={'$gte': start_date, '$lte': end_date})
        condition = dict(condition, **kw)
        cursor = BaseModel('kline_tick', cls.location, cls.dbname).query(condition, field)
        if not cursor.count():
            cursor.close()
            return pd.DataFrame()
        frame = pd.DataFrame(list(cursor))
        if timemerge:
            frame = cls.merge_time(frame)
        cursor.close()
        return frame
    except Exception as err:
        ExceptionInfo(err)
        return pd.DataFrame()
class EtpGraph(BaseGraph):
    """Builds graph nodes and relationships from enterprise base-info records.

    Source documents (``metaModel == '基本信息'``) are read from the 'cq_all'
    model held in ``self.base`` and wrapped in ``Enterprise`` objects, from
    which the related entities are derived.
    """

    def __init__(self, **kwargs):
        """Initialise the base graph and attach the 'cq_all' model."""
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(tn='cq_all')

    # ------------------------------------------------------------------ #
    # private helpers
    # ------------------------------------------------------------------ #
    def _match_legal_entity(self, entity):
        """Look up an existing node among the ``legal`` labels by URL or NAME."""
        # NOTE(review): values are interpolated into the Cypher text as-is; a
        # NAME/URL containing a double quote would break the query.
        return self.match_node(
            *legal,
            cypher='_.URL = "{}" OR _.NAME = "{}"'.format(
                entity['URL'], entity['NAME'])
        )

    def _report_node_batch(self, nodes, rnd, dealt, total):
        """Print the progress line for one batch of nodes, then empty it."""
        # NOTE: the actual merge into the graph is disabled in this method's
        # callers; only the progress is reported.
        print(SuccessMessage(
            '{}:success merge nodes to database '
            'round {} and deal {}/{} enterprise,and'
            ' merge {} nodes.'.format(
                dt.datetime.now(), rnd, dealt, total, len(nodes))))
        nodes.clear()

    def _flush_relationship_batch(self, relationships, rnd, dealt, total):
        """Merge one batch of relationships, report progress, empty the batch."""
        self.graph_merge_relationships(relationships)
        if not self.index_and_constraint_statue:
            self.create_index_and_constraint()
        print(SuccessMessage(
            '{}:success merge relationships to database '
            'round {} and deal {}/{} enterprise,and'
            ' merge {} relationships.'.format(
                dt.datetime.now(), rnd, dealt, total, len(relationships))))
        relationships.clear()

    # ------------------------------------------------------------------ #
    # schema
    # ------------------------------------------------------------------ #
    def create_index_and_constraint(self):
        """Create uniqueness constraints and indexes for the entities used.

        A unique key is indexed automatically, so it needs no separate
        index.
        """
        # Entity types that are used by this graph.
        used_entity = [
            'Enterprise',
            'Person',
            'Telephone',
            'Address',
            'Email',
            'ConstructionProject',
            'Certificate'
        ]
        constraint = {}
        index = {}
        for label in used_entity:
            entity = entities(label)
            constraint[label] = [entity.primarykey]
            idx = entity.index
            if len(idx):
                index[label] = idx
        self.add_index_and_constraint(index, constraint)

    # ------------------------------------------------------------------ #
    # nodes
    # ------------------------------------------------------------------ #
    def create_nodes_from_enterprise_baseinfo(self, eb):
        """Create the nodes derived from one enterprise base-info document.

        Nodes are created for: 1. the enterprise, 2. its legal
        representative, 3. its managers, 4. its address. More entities can
        be derived from the base info, but those are created later together
        with their relationships.

        :param eb: raw base-info document (must provide 'name')
        :return: list of Neo nodes, or None when the enterprise node itself
            cannot be initialised
        """
        nodes = []
        etp = Enterprise(eb)
        etp_n = etp.get_neo_node(primarykey=etp.primarykey)
        if etp_n is None:
            self.to_logs('filed initialize enterprise Neo node',
                         'ERROR', eb['name'])
            return None
        nodes.append(etp_n)

        try:
            lr = etp.get_legal_representative()
            lr_n = lr.get_neo_node(primarykey=lr.primarykey)
            if lr_n is None:
                self.to_logs('filed initialize legal representative '
                             'Neo node', 'ERROR', eb['name'])
            else:
                nodes.append(lr_n)
        except Exception as e:
            self.to_logs('deal legal representative raise ({})'.format(e),
                         'EXCEPTION', eb['name'])

        try:
            ms = etp.get_manager()
            if len(ms):
                for m in ms:
                    m_n = m['person']
                    m_n = m_n.get_neo_node(primarykey=m_n.primarykey)
                    if m_n is None:
                        self.to_logs('filed initialize major manager '
                                     'Neo node', 'ERROR', eb['name'])
                    else:
                        nodes.append(m_n)
        except Exception as e:
            self.to_logs('deal major managers raise ({})'.format(e),
                         'EXCEPTION', eb['name'])

        try:
            dz = etp.get_address()
            dz_n = dz.get_neo_node(primarykey=dz.primarykey)
            if dz_n is None:
                self.to_logs('filed initialize address Neo node',
                             'ERROR', eb['name'])
            else:
                nodes.append(dz_n)
        except Exception as e:
            self.to_logs('deal address raise ({})'.format(e),
                         'EXCEPTION', eb['name'])
        return nodes

    def get_all_nodes_from_enterprise(self, etp):
        """Collect the entity objects reachable from one ``Enterprise``.

        Each source (legal representative, managers, address, telephone,
        email, construction projects/certificates, share holders, branch and
        head-company principals) is handled independently: a failure in one
        is logged and the others are still collected.

        :param etp: an ``Enterprise`` object
        :return: list of entity objects, starting with ``etp`` itself; it may
            contain None entries (callers skip them)
        """
        nodes = [etp]
        try:
            lr = etp.get_legal_representative()
            if lr.isPerson():
                nodes.append(lr)
        except Exception as e:
            self.to_logs('deal legal representative raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            ms = etp.get_manager()
            if len(ms):
                nodes += [m['person'] for m in ms]
        except Exception as e:
            self.to_logs('deal major managers raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            nodes.append(etp.get_address())
        except Exception as e:
            self.to_logs('deal address raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            nodes.append(etp.get_telephone_number())
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal telephone number raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            nodes.append(etp.get_email())
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal email raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            cps = etp.get_construction_project()
            if len(cps):
                nodes += [c.pop('project') for c in cps]
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction project raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            ccs = etp.get_construction_certificate()
            nodes += [c.pop('ctf') for c in ccs]
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction certificate raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            sh = etp.get_share_holder()
            if len(sh):
                # Collected first so a failure half-way adds nothing.
                _nds_ = []
                for s in sh:
                    _s_ = s.pop('share_holder')
                    if _s_.isPerson():
                        _nds_.append(_s_)
                nodes += _nds_
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal share holder raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            brs = etp.get_branch()
            if len(brs):
                _nds_ = []
                for b in brs:
                    _p_ = b['principal']
                    if _p_.isPerson():
                        _nds_.append(_p_)
                nodes += _nds_
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal branch raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            hcs = etp.get_head_company()
            if len(hcs):
                _nds_ = []
                for h in hcs:
                    _p_ = h['principal']
                    if _p_.isPerson():
                        _nds_.append(_p_)
                nodes += _nds_
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal head company raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        return nodes

    def get_all_nodes(self):
        """Collect entity dicts of up to 10000 enterprises, grouped by label.

        :return: dict mapping each label to a list of ``to_dict()`` results
        """
        enterprises = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=10000,
            no_cursor_timeout=True)
        i, j = 0, 0
        # Nominal total used only in the progress message.
        etp_count = 1000
        nodes = dict()
        try:
            for ep in enterprises:
                i += 1
                etp = Enterprise(ep)
                for _nds_ in self.get_all_nodes_from_enterprise(etp):
                    if _nds_ is None:
                        continue
                    _nds_ = _nds_.to_dict()
                    nodes.setdefault(_nds_['label'], []).append(_nds_)
                if i % 1000 == 0:
                    j += 1
                    print(SuccessMessage(
                        '{}:success merge nodes to database '
                        'round {} and deal {}/{} enterprise'
                        ''.format(dt.datetime.now(), j, i, etp_count)
                    ))
        finally:
            # no_cursor_timeout cursors must be closed explicitly.
            enterprises.close()
        return nodes

    def create_all_nodes(self):
        """Create all nodes derived from enterprise base info (up to 1000 docs).

        Nodes are gathered in batches of more than 1000 and a progress line
        is printed per batch.
        """
        enterprises = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=1000,
            no_cursor_timeout=True)
        i, j = 0, 0
        nodes = []
        try:
            etp_count = enterprises.count()
            for e in enterprises:
                j += 1
                nds = self.create_nodes_from_enterprise_baseinfo(e)
                if nds is None:
                    # The enterprise node could not be built (already
                    # logged); ``nodes += None`` would raise TypeError.
                    continue
                nodes += nds
                if len(nodes) > 1000:
                    i += 1
                    self._report_node_batch(nodes, i, j, etp_count)
            if len(nodes):
                i += 1
                self._report_node_batch(nodes, i, j, etp_count)
        finally:
            enterprises.close()

    # ------------------------------------------------------------------ #
    # relationships
    # ------------------------------------------------------------------ #
    def get_all_relationships_from_enterprise(self, etp):
        """Build the relationships visible in one enterprise's base info.

        1. person-[lr]->enterprise
        2. person-[be_in_office]->enterprise
        3. enterprise-[located]->address
        4. person|enterprise-[holding]->enterprise
        5. enterprise-[have]->telephone
        6. enterprise-[have]->email
        plus investments, branches, head companies, construction projects
        and construction certificates.

        :param etp: an ``Enterprise`` object
        :return: list of relationship objects (empty when the enterprise
            node cannot be initialised)
        """
        # If a node on a relationship does not exist yet, the database will
        # create it as well -- this matters.
        rps = []
        etp_n = etp.get_neo_node(primarykey=etp.primarykey)
        if etp_n is None:
            self.to_logs('filed initialize enterprise Neo node',
                         'ERROR', etp['NAME'])
            return rps

        try:
            lr = etp.get_legal_representative()
            # The legal representative may be any of these kinds of object.
            lr_n = self.match_node(
                *['Person'] + legal,
                cypher='_.URL = "{}"'.format(lr['URL'])
            )
            if lr_n is None:
                lr_n = lr.get_neo_node(primarykey=lr.primarykey)
            if lr_n is None:
                self.to_logs('filed initialize legal representative Neo node',
                             'ERROR', etp['NAME'])
            else:
                rps.append(LegalRep(lr_n, etp_n))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal legal representative raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            ms = etp.get_manager()
            if len(ms):
                for m in ms:
                    # Major managers are always persons.
                    m_n = m.pop('person')
                    m_n = m_n.get_neo_node(primarykey=m_n.primarykey)
                    if m_n is None:
                        self.to_logs('filed initialize major manager Neo node',
                                     'ERROR', etp['NAME'])
                    else:
                        rps.append(BeInOffice(m_n, etp_n, **m))
        except Exception as e:
            self.to_logs('deal major managers raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            dz = etp.get_address()
            dz_n = dz.get_neo_node(primarykey=dz.primarykey)
            if dz_n is None:
                self.to_logs('filed initialize address Neo node',
                             'ERROR', etp['NAME'])
            else:
                rps.append(Located(etp_n, dz_n))
        except Exception as e:
            self.to_logs('deal address raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            sh = etp.get_share_holder()
            if len(sh):
                for s in sh:
                    s_ = s.pop('share_holder')
                    # A share holder may be a person or a legal entity.
                    sh_n = self.match_node(
                        'Person',
                        cypher='_.URL = "{}"'.format(s_['URL'])
                    )
                    if sh_n is None:
                        sh_n = self._match_legal_entity(s_)
                    if sh_n is None:
                        # Not found among existing objects: create this
                        # unexpected share holder.
                        sh_n = s_.get_neo_node(primarykey=s_.primarykey)
                    if sh_n is None:
                        self.to_logs('filed initialize unexpected share '
                                     'holder Neo node', 'ERROR', etp['NAME'])
                    if sh_n is not None:
                        rps.append(Share(sh_n, etp_n, **s))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal share holder raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            tel = etp.get_telephone_number()
            if tel is not None:
                tel_n = tel.get_neo_node(primarykey=tel.primarykey)
                if tel_n is None:
                    self.to_logs('filed initialize telephone Neo node',
                                 'ERROR', etp['NAME'])
                else:
                    rps.append(Have(etp_n, tel_n))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal telephone number raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            eml = etp.get_email()
            if eml is not None:
                eml_n = eml.get_neo_node(primarykey=eml.primarykey)
                if eml_n is None:
                    self.to_logs('filed initialize email Neo node',
                                 'ERROR', etp['NAME'])
                else:
                    rps.append(Have(etp_n, eml_n))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal email raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            ivs = etp.get_invest_outer()
            if len(ivs):
                for iv in ivs:
                    iv_ = iv.pop('invested')
                    # The invested enterprise may already exist.
                    iv_n = self._match_legal_entity(iv_)
                    if iv_n is None:
                        iv_n = iv_.get_neo_node(primarykey=iv_.primarykey)
                    if iv_n is None:
                        self.to_logs('filed initialize unexpected invested '
                                     'Neo node', 'ERROR', etp['NAME'])
                        continue
                    rps.append(Investing(etp_n, iv_n, **iv))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal invest raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            brs = etp.get_branch()
            if len(brs):
                for b in brs:
                    b_ = b.pop('branch')
                    # The branch may already exist.
                    b_n = self._match_legal_entity(b_)
                    if b_n is None:
                        b_n = b_.get_neo_node(primarykey=b_.primarykey)
                    if b_n is None:
                        self.to_logs('filed initialize unexpected branch '
                                     'Neo node', 'ERROR', etp['NAME'])
                        continue
                    # Removed up front so it never leaks into the
                    # BranchAgency properties.
                    p_ = b.pop('principal')
                    p_n = p_.get_neo_node(primarykey=p_.primarykey)
                    if p_n is not None:
                        rps.append(Principal(p_n, b_n))
                    rps.append(BranchAgency(etp_n, b_n, **b))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal branch raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            hcs = etp.get_head_company()
            if len(hcs):
                for h in hcs:
                    h_ = h.pop('head')
                    # The head company may already exist.
                    h_n = self._match_legal_entity(h_)
                    if h_n is None:
                        h_n = h_.get_neo_node(primarykey=h_.primarykey)
                    if h_n is None:
                        self.to_logs('filed initialize unexpected head '
                                     'company Neo node', 'ERROR', etp['NAME'])
                        continue
                    p_ = h.pop('principal')
                    p_n = p_.get_neo_node(primarykey=p_.primarykey)
                    if p_n is not None:
                        rps.append(Principal(p_n, h_n))
                    rps.append(SuperiorAgency(etp_n, h_n, **h))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal head company raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            cps = etp.get_construction_project()
            if len(cps):
                for c in cps:
                    c_ = c.pop('project')
                    c_n = c_.get_neo_node(primarykey=c_.primarykey)
                    if c_n is None:
                        self.to_logs('filed initialize unexpected construction '
                                     'project Neo node', 'ERROR', etp['NAME'])
                        continue
                    jsdw = c.pop('jsdw')
                    # Check whether this construction agency already exists.
                    j_n = self._match_legal_entity(jsdw)
                    if j_n is None:
                        j_n = jsdw.get_neo_node(primarykey=jsdw.primarykey)
                    if j_n is None:
                        self.to_logs('filed initialize unexpected construction '
                                     'agency Neo node', 'ERROR', etp['NAME'])
                        continue
                    # TODO(lj): consider modelling "contractor" and
                    # "construction agency" as relationships of their own.
                    rps.append(Have(
                        etp_n, c_n, **dict(角色='承建单位', **c)
                    ))
                    rps.append(Have(
                        j_n, c_n, **dict(角色='建设单位', **c)
                    ))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction project raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])

        try:
            ccs = etp.get_construction_certificate()
            if len(ccs):
                for c in ccs:
                    c_ = c.pop('ctf')
                    c_n = c_.get_neo_node(primarykey=c_.primarykey)
                    if c_n is None:
                        self.to_logs('filed initialize unexpected construction '
                                     'certificate Neo node', 'ERROR', etp['NAME'])
                        continue
                    rps.append(Have(etp_n, c_n, **c))
        except Exception as e:
            ExceptionInfo(e)
            self.to_logs('deal construction certificate raise ({})'.format(e),
                         'EXCEPTION', etp['NAME'])
        return rps

    def create_all_relationship(self):
        """Merge the base-info relationships of up to 1000 enterprises.

        Relationships are merged into the graph in batches of more than
        1000; indexes/constraints are created after a merge when
        ``index_and_constraint_statue`` is false.
        """
        enterprises = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=1000,
            no_cursor_timeout=True)
        i, j = 0, 0
        relationships = []
        try:
            etp_count = enterprises.count()
            for _ in enterprises:
                j += 1
                etp = Enterprise(_)
                # The original call targeted ``get_relationship_from_enterprise``,
                # which this class does not define; the builder that exists
                # here is ``get_all_relationships_from_enterprise``.
                rps = self.get_all_relationships_from_enterprise(etp)
                relationships += rps
                if len(relationships) > 1000:
                    i += 1
                    self._flush_relationship_batch(
                        relationships, i, j, etp_count)
            if len(relationships):
                i += 1
                self._flush_relationship_batch(relationships, i, j, etp_count)
        finally:
            enterprises.close()

    def get_all_relationships(self):
        """Collect relationship dicts of up to 10000 enterprises, by label.

        :return: dict mapping each label to a list of ``to_dict()`` results
        """
        enterprises = self.base.query(
            sql={'metaModel': '基本信息'},
            limit=10000,
            no_cursor_timeout=True)
        i, j = 0, 0
        relationships = {}
        try:
            etp_count = enterprises.count()
            for ep in enterprises:
                i += 1
                etp = Enterprise(ep)
                for _rps_ in self.get_all_relationships_from_enterprise(etp):
                    _rps_ = _rps_.to_dict()
                    relationships.setdefault(_rps_['label'], []).append(_rps_)
                if i % 1000 == 0:
                    j += 1
                    print(SuccessMessage(
                        '{}:success merge nodes to database '
                        'round {} and deal {}/{} enterprise'
                        ''.format(dt.datetime.now(), j, i, etp_count)
                    ))
        finally:
            enterprises.close()
        return relationships

    # ------------------------------------------------------------------ #
    # nodes + relationships in one pass
    # ------------------------------------------------------------------ #
    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Build both the nodes and the relationships of one enterprise.

        Covers the same relationships as
        ``get_all_relationships_from_enterprise`` and additionally returns
        every node that takes part in them. Node objects are obtained via
        ``self.get_neo_node``; problems are reported through ``self.logger``.

        :param etp: an ``Enterprise`` object
        :return: tuple ``(nodes, relationships)``; both empty when the
            enterprise node cannot be initialised
        """
        # If a node on a relationship does not exist yet, the database will
        # create it as well -- this matters.
        nodes, rps = [], []
        name = etp['NAME']
        etp_n = self.get_neo_node(etp)
        if etp_n is None:
            self.logger.debug('{} filed initialize enterprise '
                              'Neo node'.format(name))
            return nodes, rps
        nodes.append(etp_n)

        try:
            lr = etp.get_legal_representative()
            # The legal representative may be any of these kinds of object.
            lr_n = self.match_node(
                *['Person'] + legal,
                cypher='_.URL = "{}"'.format(lr['URL'])
            )
            if lr_n is None:
                lr_n = self.get_neo_node(lr)
            if lr_n is None:
                self.logger.debug('{} filed initialize legal representative '
                                  'Neo node'.format(name))
            else:
                nodes.append(lr_n)
                rps.append(LegalRep(lr_n, etp_n))
        except Exception as e:
            ExceptionInfo(e)
            self.logger.error('{} deal legal representative raise '
                              '({})'.format(name, e), exc_info=True)

        try:
            ms = etp.get_manager()
            if len(ms):
                for m in ms:
                    # Major managers are always persons.
                    m_n = m.pop('person')
                    m_n = self.get_neo_node(m_n)
                    if m_n is None:
                        self.logger.debug('{} filed initialize major manager '
                                          'Neo node'.format(name))
                    else:
                        nodes.append(m_n)
                        rps.append(BeInOffice(m_n, etp_n, **m))
        except Exception as e:
            self.logger.error('{} deal major managers raise '
                              '({})'.format(name, e), exc_info=True)

        try:
            dz = etp.get_address()
            dz_n = self.get_neo_node(dz)
            if dz_n is None:
                self.logger.debug('{} filed initialize address '
                                  'Neo node'.format(name))
            else:
                nodes.append(dz_n)
                rps.append(Located(etp_n, dz_n))
        except Exception as e:
            self.logger.error('{} deal address raise '
                              '({})'.format(name, e), exc_info=True)

        try:
            sh = etp.get_share_holder()
            if len(sh):
                for s in sh:
                    s_ = s.pop('share_holder')
                    # A share holder may be a person or a legal entity.
                    sh_n = self.match_node(
                        'Person',
                        cypher='_.URL = "{}"'.format(s_['URL'])
                    )
                    if sh_n is None:
                        sh_n = self._match_legal_entity(s_)
                    if sh_n is None:
                        # Not found among existing objects: create this
                        # unexpected share holder.
                        sh_n = self.get_neo_node(s_)
                    if sh_n is None:
                        self.logger.debug('{} filed initialize unexpected share '
                                          'holder Neo node'.format(name))
                    if sh_n is not None:
                        nodes.append(sh_n)
                        # NOTE(review): the direction here (enterprise ->
                        # holder) is the reverse of
                        # get_all_relationships_from_enterprise, which builds
                        # Share(holder, enterprise). Left unchanged -- confirm
                        # which one is intended.
                        rps.append(Share(etp_n, sh_n, **s))
        except Exception as e:
            self.logger.error('{} deal share holder raise '
                              '({})'.format(name, e), exc_info=True)

        try:
            tel = etp.get_telephone_number()
            if tel is not None:
                tel_n = self.get_neo_node(tel)
                if tel_n is None:
                    self.logger.debug('{} filed initialize telephone '
                                      'Neo node'.format(name))
                else:
                    nodes.append(tel_n)
                    rps.append(Have(etp_n, tel_n))
        except Exception as e:
            self.logger.error('{} deal telephone number raise '
                              '({})'.format(name, e), exc_info=True)

        try:
            eml = etp.get_email()
            if eml is not None:
                eml_n = self.get_neo_node(eml)
                if eml_n is None:
                    self.logger.debug('{} filed initialize email '
                                      'Neo node'.format(name))
                else:
                    nodes.append(eml_n)
                    rps.append(Have(etp_n, eml_n))
        except Exception as e:
            self.logger.debug('{} deal email raise ({})'
                              ''.format(name, e), exc_info=True)

        try:
            ivs = etp.get_invest_outer()
            if len(ivs):
                for iv in ivs:
                    iv_ = iv.pop('invested')
                    # The invested enterprise may already exist.
                    iv_n = self._match_legal_entity(iv_)
                    if iv_n is None:
                        iv_n = self.get_neo_node(iv_)
                    if iv_n is None:
                        self.logger.debug('{} filed initialize unexpected invested '
                                          'Neo node'.format(name))
                        continue
                    nodes.append(iv_n)
                    rps.append(Investing(etp_n, iv_n, **iv))
        except Exception as e:
            self.logger.error('{} deal invest raise ({})'
                              ''.format(name, e), exc_info=True)

        try:
            brs = etp.get_branch()
            if len(brs):
                for b in brs:
                    b_ = b.pop('branch')
                    # The branch may already exist.
                    b_n = self._match_legal_entity(b_)
                    if b_n is None:
                        b_n = self.get_neo_node(b_)
                    if b_n is None:
                        self.logger.debug('{} filed initialize unexpected branch '
                                          'Neo node'.format(name))
                        continue
                    # Removed up front so it never leaks into the
                    # BranchAgency properties.
                    p_ = b.pop('principal')
                    p_n = self.get_neo_node(p_)
                    if p_n is not None:
                        nodes.append(p_n)
                        rps.append(Principal(p_n, b_n))
                    nodes.append(b_n)
                    rps.append(BranchAgency(etp_n, b_n, **b))
        except Exception as e:
            self.logger.error('{} deal branch raise ({})'
                              ''.format(name, e), exc_info=True)

        try:
            hcs = etp.get_head_company()
            if len(hcs):
                for h in hcs:
                    h_ = h.pop('head')
                    # The head company may already exist.
                    h_n = self._match_legal_entity(h_)
                    if h_n is None:
                        h_n = self.get_neo_node(h_)
                    if h_n is None:
                        # The original message had no placeholder, so the
                        # enterprise name was silently dropped.
                        self.logger.debug('{} filed initialize unexpected head '
                                          'company Neo node'.format(name))
                        continue
                    p_ = h.pop('principal')
                    p_n = self.get_neo_node(p_)
                    if p_n is not None:
                        nodes.append(p_n)
                        rps.append(Principal(p_n, h_n))
                    nodes.append(h_n)
                    rps.append(SuperiorAgency(etp_n, h_n, **h))
        except Exception as e:
            self.logger.error('{} deal head company raise ({})'
                              ''.format(name, e), exc_info=True)

        try:
            cps = etp.get_construction_project()
            if len(cps):
                for c in cps:
                    c_ = c.pop('project')
                    c_n = self.get_neo_node(c_)
                    if c_n is None:
                        self.logger.debug('{} filed initialize unexpected construction '
                                          'project Neo node'.format(name))
                        continue
                    jsdw = c.pop('jsdw')
                    # Check whether this construction agency already exists.
                    j_n = self._match_legal_entity(jsdw)
                    if j_n is None:
                        j_n = self.get_neo_node(jsdw)
                    if j_n is None:
                        self.logger.debug('{} filed initialize unexpected construction '
                                          'agency Neo node'.format(name))
                        continue
                    # TODO(lj): consider modelling "contractor" and
                    # "construction agency" as relationships of their own.
                    nodes.append(c_n)
                    rps.append(Have(
                        etp_n, c_n, **dict(角色='承建单位', **c)
                    ))
                    nodes.append(j_n)
                    rps.append(Have(
                        j_n, c_n, **dict(角色='建设单位', **c)
                    ))
        except Exception as e:
            self.logger.error('{} deal construction project raise ({})'
                              ''.format(name, e), exc_info=True)

        try:
            ccs = etp.get_construction_certificate()
            if len(ccs):
                for c in ccs:
                    c_ = c.pop('ctf')
                    c_n = self.get_neo_node(c_)
                    if c_n is None:
                        self.logger.debug('{} filed initialize unexpected construction '
                                          'certificate Neo node'.format(name))
                        continue
                    nodes.append(c_n)
                    rps.append(Have(etp_n, c_n, **c))
        except Exception as e:
            ExceptionInfo(e)
            # The original single-placeholder message printed the name where
            # the exception was meant to appear.
            self.logger.error('{} deal construction certificate raise ({})'
                              ''.format(name, e), exc_info=True)
        return nodes, rps

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """Build nodes and relationships for every base-info document.

        Results are grouped by label. When ``save_folder`` is given, the
        accumulated data is written with ``self.save_graph`` every 10000
        documents (and once more at the end) and the in-memory groups are
        emptied after each save.

        :param save_folder: target passed to ``save_graph``; None keeps
            everything in memory
        :param kwargs: forwarded to ``save_graph``
        :return: tuple ``(nodes, relationships)`` of label -> list-of-dict
            mappings (empty when the data was saved to ``save_folder``)
        """
        enterprises = self.base.query(
            sql={'metaModel': '基本信息'},
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        nodes, relationships = {}, {}
        try:
            etp_count = enterprises.count()
            _st_ = time.time()
            for ep in enterprises:
                try:
                    i += 1
                    etp = Enterprise(ep)
                    nds, rps = \
                        self.get_all_nodes_and_relationships_from_enterprise(etp)
                    for _nds_ in nds:
                        if _nds_ is None:
                            continue
                        label = list(_nds_.labels)[0]
                        _nds_ = dict(label=label, **_nds_)
                        nodes.setdefault(_nds_['label'], []).append(_nds_)
                    for _rps_ in rps:
                        _rps_ = _rps_.to_dict()
                        relationships.setdefault(
                            _rps_['label'], []).append(_rps_)
                except Exception as e:
                    self.logger.error('{} {}'.format(e, ep['name']),
                                      exc_info=True)
                    continue
                if i % 10000 == 0:
                    j += 1
                    if save_folder is not None:
                        _nc_, _rc_ = self.save_graph(
                            save_folder, nodes,
                            relationships, **kwargs)
                        nc += _nc_
                        rc += _rc_
                        nodes.clear()
                        relationships.clear()
                    self.logger.info(SuccessMessage(
                        'success trans data to csv round {} and '
                        'deal {}/{} enterprise spend {} seconds.'
                        ''.format(j, i, etp_count, int(time.time() - _st_))
                    ))
                    _st_ = time.time()
        finally:
            # no_cursor_timeout cursors must be closed explicitly.
            enterprises.close()

        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes,
                relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info('   {} nodes'.format(nc))
            self.logger.info('   {} relationships'.format(rc))
        return nodes, relationships
class IndGraph(BaseGraph):
    """Export the industry classification of enterprises as graph data.

    Documents are read from the ``cq_api`` collection.  Each enterprise is
    linked with a ``Belong`` relationship to its deepest available industry
    level, and every industry level is linked to the next broader one.
    """

    # Projection used for every read of the API collection.
    _API_FIELDS = {
        '_id': 0,
        'value.Result.Name': 1,
        'value.Result.KeyNo': 1,
        'value.Result.IndustryV3': 1
    }

    # (name key, code key, level tag), ordered from the first level down to
    # the fourth level.
    _INDUSTRY_LEVELS = (
        ('Industry', 'IndustryCode', '一级'),
        ('SubIndustry', 'SubIndustryCode', '二级'),
        ('MiddleCategory', 'MiddleCategoryCode', '三级'),
        ('SmallCategory', 'SmallCategoryCode', '四级'),
    )

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(tn='cq_api')

    def get_all_nodes_and_relationships_from_api(self, etp):
        """
        Create all industry entities for one enterprise record.

        The entity object is passed in from outside because industry may be
        studied as a relatively independent domain and may not match exactly
        the industry stored in the enterprise's basic information.

        :param etp: dict with ``url``, ``name`` and (optionally)
            ``IndustryV3`` keys.
        :return: ``(nodes, relationships)``; both lists are empty when no
            enterprise node could be obtained.
        """
        etp_n = self.match_node(
            'Enterprise',
            cypher='_.URL = "{}" OR _.NAME = "{}"'
                   ''.format(Enterprise.parser_url(etp['url']), etp['name']))
        if etp_n is None:
            etp_n = self.get_neo_node(
                Enterprise(URL=etp['url'], NAME=etp['name']))
            if etp_n is None:
                return [], []
        nodes, relationships = [etp_n], []
        # A Mongo projection omits fields the document does not have, so a
        # missing key must be treated like an explicit None.
        ind = etp.get('IndustryV3')
        if ind is None:
            return nodes, relationships
        levels = [
            self.get_neo_node(Industry(**{
                'name': ind[name_key],
                'code': ind[code_key],
                '类别': level
            }))
            for name_key, code_key, level in self._INDUSTRY_LEVELS
        ]
        # Deepest level first; levels that produced no node are dropped.
        chain = [n for n in reversed(levels) if n is not None]
        nodes.extend(chain)
        if chain:
            relationships.append(Belong(etp_n, chain[0]))
            relationships.extend(
                Belong(child, parent)
                for child, parent in zip(chain, chain[1:]))
        return nodes, relationships

    def _prepare_api_record(self, doc):
        """Flatten a raw API document into the dict consumed by
        :meth:`get_all_nodes_and_relationships_from_api`.

        The ``Result`` sub-document is modified in place: ``Name`` is moved
        to ``name`` and ``url`` is derived from ``KeyNo``.

        :return: the prepared dict, or ``None`` when the document has no
            ``Result`` or no ``KeyNo``.
        """
        if doc is None:
            return None
        result = (doc.get('value') or {}).get('Result')
        if not result:
            return None
        result['name'] = result.pop('Name', None)
        key_no = result.get('KeyNo')
        if key_no is None:
            self.logger.info('{}:mismatch url'.format(result['name']))
            return None
        result['url'] = '/firm_' + key_no + '.html'
        return result

    @staticmethod
    def _group_by_label(nds, rps, nodes, relationships):
        """Append nodes and relationships to their per-label buckets."""
        for node in nds:
            if node is None:
                continue
            label = list(node.labels)[0]
            nodes.setdefault(label, []).append(dict(label=label, **node))
        for rel in rps:
            rel = rel.to_dict()
            relationships.setdefault(rel['label'], []).append(rel)

    def merge_all_nodes_and_relationships(self):
        """Build the industry nodes/relationships of up to 10000 documents.

        NOTE(review): the per-enterprise results are computed but not
        collected or merged anywhere; only the side effects of the node
        lookups remain. This looks unfinished; confirm the intended sink.
        """
        enterprises = self.base.query(
            field=dict(self._API_FIELDS),
            limit=10000,
            no_cursor_timeout=True)
        for ep in enterprises:
            ep = self._prepare_api_record(ep)
            if ep is None:
                continue
            self.get_all_nodes_and_relationships_from_api(ep)

    def get_all_nodes_and_relationships(
            self, save_folder=None, enterprises=None, **kwargs):
        """Collect industry nodes and relationships grouped by label.

        :param save_folder: when given, buckets are written through
            ``save_graph`` every 10000 records and once at the end, and are
            cleared after each write.
        :param enterprises: optional sized iterable of dicts with a ``name``
            key; each name is looked up in the API collection. When omitted,
            the first 10000 documents of the collection are used.
        :param kwargs: forwarded to ``save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (empty when
            ``save_folder`` is given, because they were flushed).
        """
        if enterprises is None:
            enterprises_data = self.base.query(
                field=dict(self._API_FIELDS),
                limit=10000,
                no_cursor_timeout=True)
            etp_count = 10000  # mirrors the query limit above
        else:
            enterprises_data = enterprises
            etp_count = len(enterprises)
        i, j = 0, 0
        nc, rc = 0, 0
        nodes, relationships = {}, {}
        _st_ = time.time()
        for ep in enterprises_data:
            i += 1
            if enterprises is not None:
                ep = self.base.query_one(
                    sql={'value.Result.Name': ep['name']},
                    field=dict(self._API_FIELDS))
            ep = self._prepare_api_record(ep)
            if ep is not None:
                nds, rps = self.get_all_nodes_and_relationships_from_api(ep)
                self._group_by_label(nds, rps, nodes, relationships)
            # Checked for every record (including skipped ones) so that a
            # skipped record on the boundary cannot postpone the flush.
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes, relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes, relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info(' {} nodes'.format(nc))
            self.logger.info(' {} relationships'.format(rc))
        return nodes, relationships
class JusRulingTextGraph(BaseGraph):
    """Link ruling nodes to their ruling-text nodes."""

    def __init__(self):
        BaseGraph.__init__(self)
        self.base = BaseModel(tn='重庆裁决文书(内容)')

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities involved.
        A unique key is indexed automatically, so no separate index is
        needed for it.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        unique_keys = {'RulingText': ['CASE_NUM']}
        extra_indexes = {}
        self.add_index_and_constraint(extra_indexes, unique_keys)

    def create_all_relationship(self):
        """
        1.ruling -[have]->ruling_text
        :return:
        """
        cursor = self.base.query(
            sql={'metaModel': '裁判文书'},
            no_cursor_timeout=True)
        rounds, seen = 0, 0
        total = cursor.count()
        pending = []
        ruling = Ruling()

        def flush():
            # Merge the buffered relationships, make sure the constraints
            # exist, report progress and empty the buffer.
            self.graph_merge_relationships(pending)
            if not self.index_and_constraint_statue:
                self.create_index_and_constraint()
            print(SuccessMessage('{}:success merge relationships to database '
                                 'round {} and deal {}/{} enterprise,and'
                                 ' merge {} relationships.'.format(
                                     dt.datetime.now(), rounds, seen, total,
                                     len(pending))))
            pending.clear()

        for doc in cursor:
            seen += 1
            text = RulingText.create_from_original_text(
                doc['content'], **{'链接': doc['url']})
            candidates = self.NodeMatcher.match(ruling.label)
            ruling_node = candidates.where(
                '_.CASE_NUM="{}"'.format(text.BaseAttributes['CASE_NUM'])
            ).first()
            if ruling_node is None:
                # No ruling with this case number exists in the graph.
                continue
            text_node = text.get_neo_node(primarykey=text.primarykey)
            pending.append(Have(ruling_node, text_node).get_relationship())
            if len(pending) > 1000:
                rounds += 1
                flush()
        if pending:
            rounds += 1
            flush()


# rtg = JusRulingTextGraph()
# rtg.create_all_relationship()
def check():
    """Clean up exported node CSV files found under ``import_path``.

    Only ``func2`` (the ``Related`` clean-up) is run; ``func1`` is kept
    available but is not invoked.
    """
    fps = File.get_all_file(import_path)
    n_fps = [p for p in fps if 'nodes' in p]
    # Membership test on the path; a bare string literal is always truthy
    # and would select every file.
    r_fps = [p for p in fps if 'relationships' in p]  # noqa: F841 (not consumed yet)

    from Calf.data import BaseModel
    base = BaseModel(tn='cq_all')

    def func1():
        """Keep only the Enterprise nodes that have no basic-info record.

        Handles Enterprise files exported by modules other than the
        basic-info one (paths containing ``EtpGraph`` are excluded).
        """
        # os.path.dirname is separator-agnostic, unlike splitting on '\\'.
        folders = {
            os.path.dirname(p) for p in n_fps
            if 'Enterprise' in p and 'EtpGraph' not in p
        }
        if not folders:
            # Nothing exported: pd.concat would raise on an empty list.
            return
        etp = entities('Enterprise')
        etp_data = pd.concat([etp.read_csv(fd, fd) for fd in folders])
        etp_data.drop_duplicates(['URL:ID(Enterprise)'], inplace=True)
        etp_data.reset_index(drop=True, inplace=True)
        total = len(etp_data)
        etp_data['exist'] = False
        for i, r in etp_data.iterrows():
            try:
                found = base.query_one(
                    sql={'name': r['NAME'], 'metaModel': '基本信息'},
                    field={'name': 1, '_id': 0})
                if found is not None:
                    etp_data.loc[i, 'exist'] = True
                if i % 100 == 0:
                    progress_bar(total, i, 'check')
            except Exception as e:
                # Best effort: a failed lookup leaves the row marked as
                # not existing.
                print(e)
        etp_data = etp_data[~etp_data['exist']]
        etp.to_csv(etp_data, import_path, split_header=True)

    # func1()

    def func2():
        """Drop suspicious Related nodes and rewrite the merged CSV.

        An id is dropped when it occurs more than 3 times and its first
        NAME is shorter than 4 characters.
        """
        folders = {os.path.dirname(p) for p in n_fps if 'Related' in p}
        if not folders:
            # Nothing exported: pd.concat would raise on an empty list.
            return
        rel = entities('Related')
        rel_data = pd.concat([rel.read_csv(fd, fd) for fd in folders])
        # Work on a copy so adding a column does not touch rel_data.
        stats = rel_data.loc[:, ['URL:ID(Related)', 'NAME']].copy()
        stats['count'] = 1
        stats = stats.groupby(['URL:ID(Related)'], as_index=False).agg({
            'count': 'count',
            'NAME': 'first'
        })
        stats = stats[(stats['count'] > 3) & (stats['NAME'].str.len() < 4)]
        drop = stats['URL:ID(Related)']
        if len(drop):
            rel_data = rel_data[~rel_data['URL:ID(Related)'].isin(drop)]
        rel.to_csv(rel_data, import_path, split_header=True)

    func2()
def __init__(self):
    """Initialise the graph base class and bind the ruling-text collection."""
    BaseGraph.__init__(self)
    collection = '重庆裁决文书(内容)'
    self.base = BaseModel(tn=collection)
class DvpGraph(BaseGraph):
    """Graph builder for the '企业发展' documents of the ``cq_all`` collection.

    Only the '竞品信息' section of each document is used.
    """

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(tn='cq_all')

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities involved.
        A unique key is indexed automatically, so no separate index is
        needed for it.  No constraint or index is currently declared.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        constraint = {}
        index = {}
        self.add_index_and_constraint(index, constraint)

    def _merge_pending(self, relationships, i, k, etp_count):
        """Merge the buffered relationships, ensure the constraints exist,
        print a progress line and empty the buffer."""
        self.graph_merge_relationships(relationships)
        if not self.index_and_constraint_statue:
            self.create_index_and_constraint()
        print(
            SuccessMessage(
                '{}:success merge relationships to database '
                'round {} and deal {}/{} enterprise,and'
                ' merge {} relationships.'.format(
                    dt.datetime.now(), i, k, etp_count,
                    len(relationships))))
        relationships.clear()

    def create_all_relationship(self):
        """
        1.enterprise -[compete]->enterprise

        Relationships are merged into the database in batches of more than
        1000, with a final merge for the remainder.
        :return:
        """
        ops = self.base.query(
            sql={'metaModel': '企业发展'},
            field={'name': 1, 'url': 1, 'content.竞品信息': 1},
            limit=1000,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        for o in ops:
            k += 1
            # TODO(leung): URLs in modules other than the basic-info one
            # cannot identify the company, so match by name.
            etp_n = self.match_node(
                *legal, cypher='_.NAME = "{}"'.format(o['name']))
            if etp_n is None:
                # The company is not in the graph yet: create it.
                base_info = self.base.query_one(sql={
                    'metaModel': '基本信息',
                    'name': o['name']
                })
                if base_info is not None:
                    etp_n = self.get_neo_node(Enterprise(base_info))
                    # Creating judicial relationships also creates missing
                    # enterprises but not their basic relationships, so
                    # those are added here.
                    relationships += \
                        eg.create_relationship_from_enterprise_baseinfo(
                            base_info)
                else:
                    # No basic info available: create an incomplete company.
                    related = Related()
                    related['NAME'] = o['name']
                    related['URL'] = o['url']
                    etp_n = self.get_neo_node(related)
            # The projection omits 'content' when the section is absent.
            content = o.get('content') or {}
            if etp_n is not None and '竞品信息' in content:
                for d in self.get_format_dict(content['竞品信息']):
                    etp_2 = d.pop('关联企业')
                    name_2 = etp_2['名称']
                    if name_2 is None or len(name_2) <= 1:
                        continue
                    # Call through the class: no Enterprise instance is
                    # guaranteed to exist at this point.
                    etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
                    etp_n_2 = self.match_node(
                        *legal,
                        cypher='_.URL = "{}"'.format(etp_2['链接']))
                    if etp_n_2 is None:
                        # Descriptive fields move from the relationship to
                        # the newly created node.
                        etp_n_2 = self.get_neo_node(Related(**{
                            'URL': etp_2['链接'],
                            'NAME': name_2,
                            '简介': d.pop('产品介绍', None),
                            '成立日期': d.pop('成立日期', None),
                            '融资信息': d.pop('融资信息', None),
                            '所属地': d.pop('所属地', None),
                        }))
                    if etp_n_2 is None:
                        continue
                    relationships.append(
                        Compete(etp_n, etp_n_2, **d).get_relationship())
            if len(relationships) > 1000:
                i += 1
                self._merge_pending(relationships, i, k, etp_count)
        if len(relationships):
            i += 1
            self._merge_pending(relationships, i, k, etp_count)

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Build the nodes and relationships of one '企业发展' document.

        For every competing product: enterprise -[Compete]-> product and,
        when the product's owning company has a usable name,
        company -[Produce]-> product.

        :param etp: document dict with ``url``, ``name`` and ``content``.
        :return: ``(nodes, relationships)``; both empty when the enterprise
            node could not be created.
        """
        etp_n = self.get_neo_node(
            Enterprise(URL=etp['url'], NAME=etp['name']))
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        content = etp.get('content') or {}
        if '竞品信息' not in content:
            return nodes, relationships
        data = self.get_format_dict(content['竞品信息'])
        for d in Product.create_from_dict(data):
            p_n = self.get_neo_node(d.pop('product'))
            if p_n is None:
                continue
            nodes.append(p_n)
            relationships.append(Compete(etp_n, p_n))
            etp_2 = d.pop('关联企业')
            etp_2['链接'] = Enterprise.parser_url(etp_2['链接'])
            name_2 = etp_2['名称']
            if name_2 is None or len(name_2) <= 1:
                continue
            etp_n_2 = self.match_node(
                *legal, cypher='_.URL = "{}"'.format(etp_2['链接']))
            if etp_n_2 is None:
                candidate = Enterprise(**etp_2)
                if not candidate.isEnterprise():
                    candidate = Related(**{
                        '链接': etp_2['链接'],
                        '名称': name_2
                    })
                etp_n_2 = self.get_neo_node(candidate)
            if etp_n_2 is not None:
                nodes.append(etp_n_2)
                relationships.append(Produce(etp_n_2, p_n))
        return nodes, relationships

    def get_all_nodes_and_relationships(self, save_folder=None, **kwargs):
        """Collect nodes and relationships of all '企业发展' documents.

        :param save_folder: when given, buckets are written through
            ``save_graph`` every 10000 documents and once at the end, and
            are cleared after each write.
        :param kwargs: forwarded to ``save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (empty when
            ``save_folder`` is given, because they were flushed).
        """
        enterprises = self.base.query(
            sql={'metaModel': '企业发展'},
            field={'name': 1, 'url': 1, 'content.竞品信息': 1},
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')
        _st_ = time.time()
        for ep in enterprises:
            i += 1
            uc = unique_code_pattern.search(ep['url'])
            if uc is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
            else:
                ep['url'] = '/firm_' + uc.group(0) + '.html'
                nds, rps = \
                    self.get_all_nodes_and_relationships_from_enterprise(ep)
                for node in nds:
                    if node is None:
                        continue
                    label = list(node.labels)[0]
                    nodes.setdefault(label, []).append(
                        dict(label=label, **node))
                for rel in rps:
                    rel = rel.to_dict()
                    relationships.setdefault(rel['label'], []).append(rel)
            # Checked for every document (including skipped ones) so that a
            # skipped document on the boundary cannot postpone the flush.
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes, relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                self.logger.info(
                    SuccessMessage('success trans data to csv round {} and '
                                   'deal {}/{} enterprise spend {} seconds.'
                                   ''.format(j, i, etp_count,
                                             int(time.time() - _st_))))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes, relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info(' {} nodes'.format(nc))
            self.logger.info(' {} relationships'.format(rc))
        return nodes, relationships
class NewsGraph(BaseGraph):
    """Graph builder for the '公司新闻' documents of the ``cq_all`` collection.

    Only the '新闻舆情' section of each document is used.
    """

    def __init__(self, **kwargs):
        BaseGraph.__init__(self, **kwargs)
        self.base = BaseModel(tn='cq_all')

    def create_index_and_constraint(self):
        """
        Create uniqueness constraints and indexes for the entities involved.
        A unique key is indexed automatically, so no separate index is
        needed for it.
        :return:
        """
        # TODO(leung): keep the labels accurate at all times
        used_entity = ['News']
        constraint = {}
        index = {}
        for label in used_entity:
            constraint[label] = [entities(label).primarykey]
            idx = entities(label).index
            if len(idx):
                index[label] = idx
        self.add_index_and_constraint(index, constraint)

    def _merge_pending(self, relationships, i, k, etp_count, spend=None):
        """Merge the buffered relationships, ensure the constraints exist,
        print a progress line and empty the buffer.

        :param spend: seconds to report in the progress line; the shorter
            message without timing is printed when it is ``None``.
        """
        self.graph_merge_relationships(relationships)
        if not self.index_and_constraint_statue:
            self.create_index_and_constraint()
        if spend is None:
            message = ('{}:success merge relationships to database '
                       'round {} and deal {}/{} enterprise,and'
                       ' merge {} relationships.'.format(
                           dt.datetime.now(), i, k, etp_count,
                           len(relationships)))
        else:
            message = ('{}:success merge relationships to database '
                       'round {} and deal {}/{} enterprise and spend {} '
                       'seconds,and merge {} relationships.'.format(
                           dt.datetime.now(), i, k, etp_count, spend,
                           len(relationships)))
        print(SuccessMessage(message))
        relationships.clear()

    def create_all_relationship(self):
        """
        1.enterprise -[have or x]->x

        Relationships are merged into the database in batches of more than
        1000, with a final merge for the remainder.
        :return:
        """
        ops = self.base.query(
            sql={'metaModel': '公司新闻'},
            # NOTE(review): hard-coded offset, presumably a resume point
            # left over from an interrupted run; confirm before reuse.
            skip=2020,
            no_cursor_timeout=True)
        i, k = 0, 0
        eg = EtpGraph()
        etp_count = ops.count()
        relationships = []
        s_t = time.time()
        for o in ops:
            k += 1
            # TODO(leung): URLs in modules other than the basic-info one
            # cannot identify the company, so match by name.
            etp_n = self.match_node(
                *legal, cypher='_.NAME = "{}"'.format(o['name']))
            if etp_n is None:
                # The company is not in the graph yet: create it.
                base_info = self.base.query_one(
                    sql={'metaModel': '基本信息', 'name': o['name']})
                if base_info is not None:
                    etp_n = self.get_neo_node(Enterprise(base_info))
                    # Creating judicial relationships also creates missing
                    # enterprises but not their basic relationships, so
                    # those are added here.
                    relationships += \
                        eg.create_relationship_from_enterprise_baseinfo(
                            base_info)
                else:
                    # No basic info available: create an incomplete company.
                    etp_n = self.get_neo_node(
                        Related(**{'名称': o['name'], '链接': o['url']}))
            content = o.get('content') or {}
            if etp_n is not None and '新闻舆情' in content:
                data = self.get_format_dict(content['新闻舆情'])
                for n in News.create_from_dict(data):
                    n_n = self.get_neo_node(n.pop('news'))
                    if n_n is not None:
                        relationships.append(
                            Have(etp_n, n_n, **n).get_relationship())
            if len(relationships) > 1000:
                i += 1
                sp = int(time.time() - s_t)
                s_t = time.time()
                self._merge_pending(relationships, i, k, etp_count, spend=sp)
        if len(relationships):
            i += 1
            self._merge_pending(relationships, i, k, etp_count)

    def get_all_nodes_and_relationships_from_enterprise(self, etp):
        """Build the nodes and relationships of one '公司新闻' document.

        :param etp: document dict with ``url``, ``name`` and ``content``.
        :return: ``(nodes, relationships)`` with one ``Have`` relationship
            per news node; both empty when the enterprise node could not be
            created.
        """
        etp_n = self.get_neo_node(
            Enterprise(URL=etp['url'], NAME=etp['name']))
        if etp_n is None:
            return [], []
        nodes, relationships = [etp_n], []
        content = etp.get('content') or {}
        if '新闻舆情' in content:
            data = self.get_format_dict(content['新闻舆情'])
            for n in News.create_from_dict(data):
                n_n = self.get_neo_node(n.pop('news'))
                if n_n is not None:
                    nodes.append(n_n)
                    relationships.append(Have(etp_n, n_n, **n))
        return nodes, relationships

    def get_all_nodes_and_relationships(
            self, save_folder=None, **kwargs):
        """Collect nodes and relationships of all '公司新闻' documents.

        :param save_folder: when given, buckets are written through
            ``save_graph`` every 10000 documents and once at the end, and
            are cleared after each write.
        :param kwargs: forwarded to ``save_graph``.
        :return: ``(nodes, relationships)`` dicts keyed by label (empty when
            ``save_folder`` is given, because they were flushed).
        """
        enterprises = self.base.query(
            sql={'metaModel': '公司新闻'},
            no_cursor_timeout=True)
        i, j = 0, 0
        nc, rc = 0, 0
        etp_count = enterprises.count()
        nodes, relationships = {}, {}
        unique_code_pattern = re.compile(r'(?<=unique=)\w{32}')
        _st_ = time.time()
        for ep in enterprises:
            i += 1
            uc = unique_code_pattern.search(ep['url'])
            if uc is None:
                self.logger.info('{}:mismatch url'.format(ep['name']))
            else:
                ep['url'] = '/firm_' + uc.group(0) + '.html'
                nds, rps = \
                    self.get_all_nodes_and_relationships_from_enterprise(ep)
                for node in nds:
                    if node is None:
                        continue
                    label = list(node.labels)[0]
                    nodes.setdefault(label, []).append(
                        dict(label=label, **node))
                for rel in rps:
                    rel = rel.to_dict()
                    relationships.setdefault(rel['label'], []).append(rel)
            # Checked for every document (including skipped ones) so that a
            # skipped document on the boundary cannot postpone the flush.
            if i % 10000 == 0:
                j += 1
                if save_folder is not None:
                    _nc_, _rc_ = self.save_graph(
                        save_folder, nodes, relationships, **kwargs)
                    nc += _nc_
                    rc += _rc_
                    nodes.clear()
                    relationships.clear()
                # Elapsed time is "now - start"; the reverse order yields a
                # negative duration.
                self.logger.info(SuccessMessage(
                    'success trans data to csv round {} and '
                    'deal {}/{} enterprise spend {} seconds.'
                    ''.format(j, i, etp_count, int(time.time() - _st_))
                ))
                _st_ = time.time()
        if save_folder is not None:
            _nc_, _rc_ = self.save_graph(
                save_folder, nodes, relationships, **kwargs)
            nc += _nc_
            rc += _rc_
            nodes.clear()
            relationships.clear()
            self.logger.info('Summary:')
            self.logger.info(' save graph data:')
            self.logger.info(' {} nodes'.format(nc))
            self.logger.info(' {} relationships'.format(rc))
        return nodes, relationships
class OptGraph(BaseGraph): def __init__(self, **kwargs): BaseGraph.__init__(self, **kwargs) self.base = BaseModel( tn='cq_all', # tn='qcc.1.1', # location='gcxy', # dbname='data' ) pass def create_index_and_constraint(self): """ 为涉及到的实体创建唯一性约束跟索引,唯一键自动带有索引 不比再单独创建索引 :return: """ # TODO(leung): 要随时确保label的准确性 # 用到是实体对象 used_entity = [ 'License', 'Bidding', 'Check', 'RandomCheck', 'TaxCredit', 'IAE', 'Position', # 'Client', # 'Supplier', # 'Possession', 'Plot' ] constraint = {} index = {} for l in used_entity: constraint[l] = [entities(l).primarykey] idx = entities(l).index if len(idx): index[l] = idx self.add_index_and_constraint(index, constraint) pass def get_all_nodes_from_enterprise(self, etp): nodes = [Enterprise(URL=etp['url'], NAME=etp['name'])] if '产权交易' in etp['content'].keys(): # data = self.get_format_dict(etp['content']['产权交易']) # for d in data: # bd = d.pop('标的') # bd_n = pass if '行政许可' in etp['content'].keys(): data = etp['content']['行政许可'] if '工商局' in data.keys(): d1 = self.get_format_dict(data['工商局']) ls = License.create_from_dict(d1, '工商局') for l in ls: nodes.append(l.pop('license')) pass if '信用中国' in data.keys(): d2 = self.get_format_dict(data['信用中国']) ls = License.create_from_dict(d2, '信用中国') for l in ls: nodes.append(l.pop('license')) pass pass if '招投标信息' in etp['content'].keys(): # 公示的招投标信息一般都是结果,一般情况下是找不到 # 共同投标的单位,除非是共同中标 data = self.get_format_dict(etp['content']['招投标信息']) bs = Bidding.create_from_dict(data) for b in bs: nodes.append(b.pop('bidding')) pass if '抽查检查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['抽查检查']) cs = Check.create_from_dict(data) for c in cs: nodes.append(c.pop('check')) pass if '双随机抽查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['双随机抽查']) rcs = RandomCheck.create_from_dict(data) # rcs_n = self.get_neo_node(rcs) for rc in rcs: # TODO(leung):随机抽查没有结果 nodes.append(rc.pop('check')) pass if '税务信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['税务信用']) 
ts = TaxCredit.create_from_dict(data) # ts_n = self.get_neo_node(ts) for t in ts: nodes.append(t.pop('TaxCredit')) pass if '进出口信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['进出口信用']) ies = IAE.create_from_dict(data) # ies_n = self.get_neo_node(ies) for ie in ies: nodes.append(ie.pop('iae')) pass if '招聘' in etp['content'].keys(): data = self.get_format_dict(etp['content']['招聘']) rs = Position.create_from_dict(data) for r in rs: nodes.append(r.pop('position')) pass if '客户' in etp['content'].keys(): data = self.get_format_dict(etp['content']['客户']) cs = Client.create_from_dict(data) for c in cs: nodes.append(c.pop('client')) pass if '供应商' in etp['content'].keys(): data = self.get_format_dict(etp['content']['供应商']) ss = Supplier.create_from_dict(data) for s in ss: nodes.append(s.pop('supplier')) pass if '信用评级' in etp['content'].keys(): data = self.get_format_dict(etp['content']['信用评级']) for d in data: nodes.append(d.pop('评级公司')) pass if '土地转让' in etp['content'].keys(): data = self.get_format_dict(etp['content']['土地转让']) for d in data: e1 = d.pop('原土地使用权人') e2 = d.pop('现有土地使用权人') p = Plot(**d) nodes.append(p) pass return nodes pass def get_all_nodes(self): enterprises = self.base.query( sql={ 'metaModel': '经营状况', # 'name': '重庆轩烽建材有限公司' }, limit=1000, # skip=2000, no_cursor_timeout=True) i, j = 0, 0 etp_count = enterprises.count() nodes = {} unique_code_pattern = re.compile('(?<=unique=)\w{32}') def getUniqueCode(url): _uc_ = re.search(unique_code_pattern, url) if _uc_ is not None: return _uc_.group(0) else: return None for ep in enterprises: i += 1 uc = getUniqueCode(ep['url']) if uc is None: continue ep['url'] = '/firm_' + uc + '.html' nds = self.get_all_nodes_from_enterprise(ep) for _nds_ in nds: if _nds_ is None: continue _nds_ = _nds_.to_dict() if _nds_['label'] in nodes.keys(): nodes[_nds_['label']].append(_nds_) else: nodes[_nds_['label']] = [_nds_] pass if i % 1000 == 0: j += 1 print( SuccessMessage('{}:success merge nodes to database ' 
'round {} and deal {}/{} enterprise' ''.format(dt.datetime.now(), i, j, etp_count))) pass return nodes def get_all_relationships_from_enterprise(self, etp): etp_n = Enterprise(URL=etp['url'], NAME=etp['name']) etp_n = self.get_neo_node(etp_n) if etp_n is None: return [] relationships = [] if '产权交易' in etp['content'].keys(): # data = self.get_format_dict(etp['content']['产权交易']) # for d in data: # bd = d.pop('标的') # bd_n = pass if '行政许可' in etp['content'].keys(): data = etp['content']['行政许可'] if '工商局' in data.keys(): d1 = self.get_format_dict(data['工商局']) ls = License.create_from_dict(d1, '工商局') for l in ls: l_ = l.pop('license') l_n = self.get_neo_node(l_) if l_n is None: continue relationships.append(Have(etp_n, l_n, **l)) pass if '信用中国' in data.keys(): d2 = self.get_format_dict(data['信用中国']) ls = License.create_from_dict(d2, '信用中国') for l in ls: l_ = l.pop('license') l_n = self.get_neo_node(l_) if l_n is None: continue relationships.append(Have(etp_n, l_n, **l)) pass pass if '招投标信息' in etp['content'].keys(): # 公示的招投标信息一般都是结果,一般情况下是找不到 # 共同投标的单位,除非是共同中标 data = self.get_format_dict(etp['content']['招投标信息']) bs = Bidding.create_from_dict(data) for b in bs: _ = b.pop('bidding') b_n = self.get_neo_node(_) if b_n is None: continue # TODO(leung):项目分类用作了招投标结果 relationships.append( TakePartIn(etp_n, b_n, **dict(b, **{'RESULT': b_n['TYPE']}))) pass if '抽查检查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['抽查检查']) cs = Check.create_from_dict(data) for c in cs: _ = c.pop('check') n = self.get_neo_node(_) if n is None: continue relationships.append( Have(etp_n, n, **dict(c, **{'RESULT': n['RESULT']}))) pass if '双随机抽查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['双随机抽查']) rcs = RandomCheck.create_from_dict(data) # rcs_n = self.get_neo_node(rcs) for rc in rcs: # TODO(leung):随机抽查没有结果 _ = rc.pop('check') n = self.get_neo_node(_) if n is None: continue relationships.append(Have(etp_n, n, **rc)) pass if '税务信用' in etp['content'].keys(): 
data = self.get_format_dict(etp['content']['税务信用']) ts = TaxCredit.create_from_dict(data) # ts_n = self.get_neo_node(ts) for t in ts: _ = t.pop('TaxCredit') n = self.get_neo_node(_) if n is None: continue # TODO(leung):纳税信用等级作为税务信用评级结果 relationships.append( Have(etp_n, n, **dict(RESULT=n['GRADE'], **t))) pass if '进出口信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['进出口信用']) ies = IAE.create_from_dict(data) # ies_n = self.get_neo_node(ies) for ie in ies: _ = ie.pop('iae') n = self.get_neo_node(_) if n is None: continue relationships.append(Have(etp_n, n, **ie)) pass if '招聘' in etp['content'].keys(): data = self.get_format_dict(etp['content']['招聘']) rs = Position.create_from_dict(data) for r in rs: _ = r.pop('position') n = self.get_neo_node(_) if n is None: continue relationships.append(Recruit(etp_n, n, **r)) pass if '客户' in etp['content'].keys(): data = self.get_format_dict(etp['content']['客户']) cs = Client.create_from_dict(data) for c in cs: _ = c.pop('client') n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['URL'], _['NAME'])) if n is None: n = self.get_neo_node(_) if n is None: continue relationships.append(SellTo(etp_n, n, **c)) pass if '供应商' in etp['content'].keys(): data = self.get_format_dict(etp['content']['供应商']) ss = Supplier.create_from_dict(data) for s in ss: _ = s.pop('supplier') n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['URL'], _['NAME'])) if n is None: n = self.get_neo_node(_) if n is None: continue relationships.append(BuyFrom(etp_n, n, **s)) pass if '信用评级' in etp['content'].keys(): data = self.get_format_dict(etp['content']['信用评级']) for d in data: _ = d.pop('评级公司') n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['链接'], _['名称'])) if n is None: n = Enterprise(**_) n = self.get_neo_node(n) if n is None: continue __ = d.pop('内容') d['评级内容'] = __['内容'] d['评级链接'] = __['链接'] relationships.append(Appraise(n, etp_n, **d)) pass if '土地转让' in 
etp['content'].keys(): data = self.get_format_dict(etp['content']['土地转让']) for d in data: e1 = d.pop('原土地使用权人') e2 = d.pop('现有土地使用权人') p = Plot(**d) p_n = self.get_neo_node(p) if p_n is None: continue if e1['名称'] == etp['name'] or e1['链接'] == etp['url']: n1 = etp_n else: # 有可能是人 n1 = self.match_node(*legal, cypher='_.URL = "{}"'.format( e1['链接'])) if n1 is None: n1 = Enterprise(**e1) if not n1.isEnterprise(): n1 = Person(**e1) if not n1.isPerson(): n1 = Related(**e1) n1 = self.get_neo_node(n1) if n1 is not None: relationships.append(Sell(n1, p_n)) if e2['名称'] == etp['name'] or e2['链接'] == etp['url']: n2 = etp_n else: n2 = self.match_node(*legal, cypher='_.URL = "{}"'.format( e2['链接'])) if n2 is None: n2 = Enterprise(**e2) if not n2.isEnterprise(): n2 = Person(**e2) if not n2.isPerson(): n2 = Related(**e2) n2 = self.get_neo_node(n2) if n2 is not None: relationships.append(Buy(n2, p_n)) pass return relationships def get_all_relationships(self): enterprises = self.base.query( sql={ 'metaModel': '经营状况', # 'name': '重庆轩烽建材有限公司' }, limit=1000, # skip=2000, no_cursor_timeout=True) i, j = 0, 0 etp_count = enterprises.count() relationships = {} unique_code_pattern = re.compile('(?<=unique=)\w{32}') def getUniqueCode(url): _uc_ = re.search(unique_code_pattern, url) if _uc_ is not None: return _uc_.group(0) else: return None for ep in enterprises: i += 1 uc = getUniqueCode(ep['url']) if uc is None: continue ep['url'] = '/firm_' + uc + '.html' rps = self.get_all_relationships_from_enterprise(ep) for _rps_ in rps: _rps_ = _rps_.to_dict() if _rps_['label'] in relationships.keys(): relationships[_rps_['label']].append(_rps_) else: relationships[_rps_['label']] = [_rps_] pass if i % 1000 == 0: j += 1 print( SuccessMessage('{}:success merge relationship to database ' 'round {} and deal {}/{} enterprise' ''.format(dt.datetime.now(), i, j, etp_count))) pass return relationships def create_all_relationship(self): """ 1.enterprise -[have or x]->x :return: """ ops = self.base.query( 
sql={ 'metaModel': '经营状况', # 'name': '重庆轩烽建材有限公司' }, limit=1000, # skip=2000, no_cursor_timeout=True) i, k = 0, 0 eg = EtpGraph() etp_count = ops.count() relationships = [] # etp = Enterprise() for o in ops: k += 1 # TODO(leung): 这里要注意,基本信息以外的模块中的url确定不了公司 etp_n = self.match_node(*legal, cypher='_.NAME = "{}"'.format(o['name'])) if etp_n is None: # 如果这个公司还没在数据库里面,那么应该创建这个公司 _ = self.base.query_one(sql={ 'metaModel': '基本信息', 'name': o['name'] }) if _ is not None: etp = Enterprise(_) etp_n = self.get_neo_node(etp) # 虽然在创建司法关系的时候会创建未在库中的企业,但不会创建 # 这个企业的基本关系,因此需要添加其基本关系 relationships += eg.create_relationship_from_enterprise_baseinfo( _) pass else: # 没有这个公司的信息,那就创建一个信息不全的公司 # 如果在neo4j里面存着只有name,url的公司,意味着 # 这家公司没有“基本信息” etp = Related() etp['NAME'] = o['name'] etp['URL'] = o['url'] etp_n = self.get_neo_node(etp) pass if '产权交易' in etp['content'].keys(): # data = self.get_format_dict(etp['content']['产权交易']) # for d in data: # bd = d.pop('标的') # bd_n = pass if '行政许可' in etp['content'].keys(): data = etp['content']['行政许可'] if '工商局' in data.keys(): d1 = self.get_format_dict(data['工商局']) ls = License.create_from_dict(d1, '工商局') for l in ls: l_ = l.pop('license') l_n = self.get_neo_node(l_) if l_n is None: continue relationships.append( Have(etp_n, l_n, **l).get_relationship()) pass if '信用中国' in data.keys(): d2 = self.get_format_dict(data['信用中国']) ls = License.create_from_dict(d2, '信用中国') for l in ls: l_ = l.pop('license') l_n = self.get_neo_node(l_) if l_n is None: continue relationships.append( Have(etp_n, l_n, **l).get_relationship()) pass pass if '招投标信息' in etp['content'].keys(): # 公示的招投标信息一般都是结果,一般情况下是找不到 # 共同投标的单位,除非是共同中标 data = self.get_format_dict(etp['content']['招投标信息']) bs = Bidding.create_from_dict(data) for b in bs: _ = b.pop('bidding') b_n = self.get_neo_node(_) if b_n is None: continue # TODO(leung):项目分类用作了招投标结果 relationships.append( TakePartIn(etp_n, b_n, **dict(b, **{'RESULT': b_n['TYPE']})).get_relationship()) pass if '抽查检查' in etp['content'].keys(): data = 
self.get_format_dict(etp['content']['抽查检查']) cs = Check.create_from_dict(data) for c in cs: _ = c.pop('check') n = self.get_neo_node(_) if n is None: continue relationships.append( Have(etp_n, n, **dict(c, **{'RESULT': n['RESULT']})).get_relationship()) pass if '双随机抽查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['双随机抽查']) rcs = RandomCheck.create_from_dict(data) # rcs_n = self.get_neo_node(rcs) for rc in rcs: # TODO(leung):随机抽查没有结果 _ = rc.pop('check') n = self.get_neo_node(_) if n is None: continue relationships.append( Have(etp_n, n, **rc).get_relationship()) pass if '税务信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['税务信用']) ts = TaxCredit.create_from_dict(data) # ts_n = self.get_neo_node(ts) for t in ts: _ = t.pop('TaxCredit') n = self.get_neo_node(_) if n is None: continue # TODO(leung):纳税信用等级作为税务信用评级结果 relationships.append( Have(etp_n, n, **dict(RESULT=n['GRADE'], **t)).get_relationship()) pass if '进出口信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['进出口信用']) ies = IAE.create_from_dict(data) # ies_n = self.get_neo_node(ies) for ie in ies: _ = ie.pop('iae') n = self.get_neo_node(_) if n is None: continue relationships.append( Have(etp_n, n, **ie).get_relationship()) pass if '招聘' in etp['content'].keys(): data = self.get_format_dict(etp['content']['招聘']) rs = Position.create_from_dict(data) for r in rs: _ = r.pop('position') n = self.get_neo_node(_) if n is None: continue relationships.append( Recruit(etp_n, n, **r).get_relationship()) pass if '客户' in etp['content'].keys(): data = self.get_format_dict(etp['content']['客户']) cs = Client.create_from_dict(data) for c in cs: _ = c.pop('client') n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['URL'], _['NAME'])) if n is None: n = self.get_neo_node(_) if n is None: continue relationships.append( SellTo(etp_n, n, **c).get_relationship()) pass if '供应商' in etp['content'].keys(): data = 
self.get_format_dict(etp['content']['供应商']) ss = Supplier.create_from_dict(data) for s in ss: _ = s.pop('supplier') n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['URL'], _['NAME'])) if n is None: n = self.get_neo_node(_) if n is None: continue relationships.append( BuyFrom(etp_n, n, **s).get_relationship()) pass if '信用评级' in etp['content'].keys(): data = self.get_format_dict(etp['content']['信用评级']) for d in data: _ = d.pop('评级公司') n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['链接'], _['名称'])) if n is None: n = Related() n['NAME'] = _['名称'] n['URL'] = _['链接'] n = self.get_neo_node(n) if n is None: continue __ = d.pop('内容') d['评级内容'] = __['内容'] d['评级链接'] = __['链接'] relationships.append( Appraise(n, etp_n, **d).get_relationship()) pass if '土地转让' in etp['content'].keys(): data = self.get_format_dict(etp['content']['土地转让']) for d in data: e1 = d.pop('原土地使用权人') e2 = d.pop('现有土地使用权人') p = Plot(**d) p_n = self.get_neo_node(p) if p_n is None: continue if e1['名称'] == o['name'] or e1['链接'] == o['url']: n1 = etp_n else: # 有可能是人 n1 = self.match_node(*legal, cypher='_.URL = "{}"'.format( e1['链接'])) if n1 is None: n1 = Related(**e1) n1 = self.get_neo_node(n1) if n1 is not None: relationships.append(Sell(n1, p_n).get_relationship()) if e2['名称'] == o['name'] or e2['链接'] == o['url']: n2 = etp_n else: n2 = self.match_node(*legal, cypher='_.URL = "{}"'.format( e2['链接'])) if n2 is None: n2 = Related(**e2) n2 = self.get_neo_node(n2) if n2 is not None: relationships.append(Buy(n2, p_n).get_relationship()) pass if len(relationships) > 1000: i += 1 self.graph_merge_relationships(relationships) if not self.index_and_constraint_statue: self.create_index_and_constraint() print( SuccessMessage( '{}:success merge relationships to database ' 'round {} and deal {}/{} enterprise,and' ' merge {} relationships.'.format( dt.datetime.now(), i, k, etp_count, len(relationships)))) relationships.clear() # return if len(relationships): i += 1 
self.graph_merge_relationships(relationships) if not self.index_and_constraint_statue: self.create_index_and_constraint() print( SuccessMessage('{}:success merge relationships to database ' 'round {} and deal {}/{} enterprise,and' ' merge {} relationships.'.format( dt.datetime.now(), i, k, etp_count, len(relationships)))) relationships.clear() pass def get_all_nodes_and_relationships_from_enterprise(self, etp): etp_n = Enterprise(URL=etp['url'], NAME=etp['name']) etp_n = self.get_neo_node(etp_n) if etp_n is None: return [], [] nodes, relationships = [], [] nodes.append(etp_n) if '产权交易' in etp['content'].keys(): # data = self.get_format_dict(etp['content']['产权交易']) # for d in data: # bd = d.pop('标的') # bd_n = pass if '行政许可' in etp['content'].keys(): data = etp['content']['行政许可'] if '工商局' in data.keys(): d1 = self.get_format_dict(data['工商局']) ls = License.create_from_dict(d1, '工商局') for l in ls: l_ = l.pop('license') l_n = self.get_neo_node(l_) if l_n is None: continue nodes.append(l_n) relationships.append(Have(etp_n, l_n, **l)) pass if '信用中国' in data.keys(): d2 = self.get_format_dict(data['信用中国']) ls = License.create_from_dict(d2, '信用中国') for l in ls: l_ = l.pop('license') l_n = self.get_neo_node(l_) if l_n is None: continue nodes.append(l_n) relationships.append(Have(etp_n, l_n, **l)) pass pass if '招投标信息' in etp['content'].keys(): # 公示的招投标信息一般都是结果,一般情况下是找不到 # 共同投标的单位,除非是共同中标 data = self.get_format_dict(etp['content']['招投标信息']) bs = Bidding.create_from_dict(data) for b in bs: _ = b.pop('bidding') b_n = self.get_neo_node(_) if b_n is None: continue # TODO(leung):项目分类用作了招投标结果 nodes.append(b_n) relationships.append( TakePartIn(etp_n, b_n, **dict(b, **{'RESULT': b_n['TYPE']}))) pass if '抽查检查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['抽查检查']) cs = Check.create_from_dict(data) for c in cs: _ = c.pop('check') n = self.get_neo_node(_) if n is None: continue nodes.append(n) relationships.append( Have(etp_n, n, **dict(c, **{'RESULT': 
n['RESULT']}))) pass if '双随机抽查' in etp['content'].keys(): data = self.get_format_dict(etp['content']['双随机抽查']) rcs = RandomCheck.create_from_dict(data) # rcs_n = self.get_neo_node(rcs) for rc in rcs: # TODO(leung):随机抽查没有结果 _ = rc.pop('check') n = self.get_neo_node(_) if n is None: continue nodes.append(n) relationships.append(Have(etp_n, n, **rc)) pass if '税务信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['税务信用']) ts = TaxCredit.create_from_dict(data) # ts_n = self.get_neo_node(ts) for t in ts: _ = t.pop('TaxCredit') n = self.get_neo_node(_) if n is None: continue # TODO(leung):纳税信用等级作为税务信用评级结果 nodes.append(n) relationships.append( Have(etp_n, n, **dict(RESULT=n['GRADE'], **t))) pass if '进出口信用' in etp['content'].keys(): data = self.get_format_dict(etp['content']['进出口信用']) ies = IAE.create_from_dict(data) # ies_n = self.get_neo_node(ies) for ie in ies: _ = ie.pop('iae') n = self.get_neo_node(_) if n is None: continue nodes.append(n) relationships.append(Have(etp_n, n, **ie)) pass if '招聘' in etp['content'].keys(): data = self.get_format_dict(etp['content']['招聘']) rs = Position.create_from_dict(data) for r in rs: _ = r.pop('position') n = self.get_neo_node(_) if n is None: continue nodes.append(n) relationships.append(Recruit(etp_n, n, **r)) pass if '客户' in etp['content'].keys(): data = self.get_format_dict(etp['content']['客户']) cs = Client.create_from_dict(data) for c in cs: cli = c.pop('client') cli_n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( cli['URL'], cli['NAME'])) if cli_n is None: if cli.isEnterprise(): cli = Enterprise(**cli.to_dict(with_label=False)) cli_n = self.get_neo_node(cli) if cli_n is None: continue nodes.append(cli_n) relationships.append(SellTo(etp_n, cli_n, **c)) pass if '供应商' in etp['content'].keys(): data = self.get_format_dict(etp['content']['供应商']) ss = Supplier.create_from_dict(data) for s in ss: sup = s.pop('supplier') sup_n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = 
"{}"'.format( sup['URL'], sup['NAME'])) if sup_n is None: if sup.isEnterprise(): sup = Enterprise(**sup.to_dict(with_label=False)) sup_n = self.get_neo_node(sup) if sup_n is None: continue nodes.append(sup_n) relationships.append(BuyFrom(etp_n, sup_n, **s)) pass if '信用评级' in etp['content'].keys(): data = self.get_format_dict(etp['content']['信用评级']) for d in data: _ = d.pop('评级公司') _['链接'] = Enterprise.parser_url(_['链接']) n = self.match_node( *legal, cypher='_.URL = "{}" OR _.NAME = "{}"'.format( _['链接'], _['名称'])) if n is None: n = Enterprise(**_) n = self.get_neo_node(n) if n is None: continue __ = d.pop('内容') d['评级内容'] = __['内容'] d['评级链接'] = __['链接'] nodes.append(n) relationships.append(Appraise(n, etp_n, **d)) pass if '土地转让' in etp['content'].keys(): data = self.get_format_dict(etp['content']['土地转让']) for d in data: e1 = d.pop('原土地使用权人') e2 = d.pop('现有土地使用权人') p = Plot(**d) p_n = self.get_neo_node(p) if p_n is None: continue e1['链接'] = Enterprise.parser_url(e1['链接']) if e1['名称'] == etp['name'] or e1['链接'] == etp['url']: n1 = etp_n else: # 有可能是人 n1 = self.match_node(*legal, cypher='_.URL = "{}"'.format( e1['链接'])) if n1 is None: n1 = Enterprise(**e1) if not n1.isEnterprise(): n1 = Person(**e1) if not n1.isPerson(): n1 = Related(**e1) n1 = self.get_neo_node(n1) if n1 is not None: nodes.append(n1) nodes.append(p_n) relationships.append(Sell(n1, p_n)) e2['链接'] = Enterprise.parser_url(e2['链接']) if e2['名称'] == etp['name'] or e2['链接'] == etp['url']: n2 = etp_n else: n2 = self.match_node(*legal, cypher='_.URL = "{}"'.format( e2['链接'])) if n2 is None: n2 = Enterprise(**e2) if not n2.isEnterprise(): n2 = Person(**e2) if not n2.isPerson(): n2 = Related(**e2) n2 = self.get_neo_node(n2) if n2 is not None: nodes.append(n2) nodes.append(p_n) relationships.append(Buy(n2, p_n)) pass return nodes, relationships def get_all_nodes_and_relationships(self, save_folder=None, **kwargs): enterprises = self.base.query( sql={ 'metaModel': '经营状况', # 'name': '重庆轩烽建材有限公司' }, # limit=100000, 
# skip=10000, no_cursor_timeout=True) i, j = 0, 0 nc, rc = 0, 0 etp_count = enterprises.count() nodes, relationships = {}, {} unique_code_pattern = re.compile('(?<=unique=)\w{32}') def getUniqueCode(url): _uc_ = re.search(unique_code_pattern, url) if _uc_ is not None: return _uc_.group(0) else: return None _st_ = time.time() for ep in enterprises: i += 1 uc = getUniqueCode(ep['url']) if uc is None: self.logger.info('{}:mismatch url'.format(ep['name'])) continue ep['url'] = '/firm_' + uc + '.html' nds, rps = self.get_all_nodes_and_relationships_from_enterprise(ep) for _nds_ in nds: if _nds_ is None: continue # _nds_ = _nds_.to_dict() label = list(_nds_.labels)[0] _nds_ = dict(label=label, **_nds_) if _nds_['label'] in nodes.keys(): nodes[_nds_['label']].append(_nds_) else: nodes[_nds_['label']] = [_nds_] pass for _rps_ in rps: _rps_ = _rps_.to_dict() if _rps_['label'] in relationships.keys(): relationships[_rps_['label']].append(_rps_) else: relationships[_rps_['label']] = [_rps_] pass if i % 10000 == 0: j += 1 if save_folder is not None: _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships, **kwargs) nc += _nc_ rc += _rc_ nodes.clear() relationships.clear() self.logger.info( SuccessMessage('success trans data to csv round {} and ' 'deal {}/{} enterprise spend {} seconds.' ''.format(j, i, etp_count, int(time.time() - _st_)))) _st_ = time.time() pass if save_folder is not None: _nc_, _rc_ = self.save_graph(save_folder, nodes, relationships, **kwargs) nc += _nc_ rc += _rc_ nodes.clear() relationships.clear() self.logger.info('Summary:') self.logger.info(' save graph data:') self.logger.info(' {} nodes'.format(nc)) self.logger.info(' {} relationships'.format(rc)) pass return nodes, relationships