def graph_merge_nodes(self, nodes=None, toleration=10):
    """Merge a batch of nodes into the database, tolerating a few bad ones.

    Bulk merges can fail entirely because of a handful of malformed
    nodes.  When the whole-batch merge fails, the batch is split into
    roughly ten chunks and each chunk is merged independently; a chunk
    that still fails is recursively split again, until a failing batch
    is smaller than ``toleration``, at which point it is discarded so
    the rest of the data can proceed.

    :param nodes: list of Node objects to merge (None is a no-op)
    :param toleration: failing batches smaller than this are abandoned
        instead of being split further
    :return: None
    """
    if nodes is None:
        return
    try:
        tx = self.graph.begin()
        tx.merge(Subgraph(nodes=nodes))
        tx.commit()
    except Exception as e:
        self.logger.debug('commit subgraph to database '
                          'raise ({})'.format(e))
        count = len(nodes)
        if count < toleration:
            # Too few nodes left to be worth splitting: drop them.
            return
        chunk = count // 10 + 1  # size of each sub-batch
        for i in range(0, 11):
            nds = nodes[i * chunk:(i + 1) * chunk]
            if not nds:
                continue
            try:
                tx = self.graph.begin()
                tx.merge(Subgraph(nodes=nds))
                tx.commit()
            except Exception as e:
                self.logger.debug(
                    'commit subgraph to database raise ({}) on '
                    '[{}:{}]'.format(e, i * chunk, (i + 1) * chunk))
                # Recurse only on the chunk that failed.
                self.graph_merge_nodes(nds, toleration)
def import_company_relation():
    """Import company-to-company relationships from the CSV data files.

    Loads the supplier (供应商), guarantee (担保) and customer (客户)
    relation files; for every row whose two ids resolve to existing
    ``company`` nodes, a relationship of the matching type is created.
    """

    def _import(csv_path, rel_type, done_msg, skip_self_loop):
        # Shared worker: read (eid1, eid2) pairs and link existing nodes.
        df = pd.read_csv(csv_path)
        matcher = NodeMatcher(graph)
        data = list(zip(df['eid1'].values, df['eid2'].values))
        relations = []
        for e1, e2 in tqdm(data):
            if pd.notna(e1) and pd.notna(e2) and (not skip_self_loop or e1 != e2):
                company1 = matcher.match('company', eid=e1).first()
                company2 = matcher.match('company', eid=e2).first()
                if company1 is not None and company2 is not None:
                    relations.append(Relationship(company1, rel_type, company2))
        graph.create(Subgraph(relationships=relations))
        print(done_msg)

    _import('company_data/公司-供应商.csv', '供应商',
            'import company-supplier relation succeeded', True)
    _import('company_data/公司-担保.csv', '担保',
            'import company-guarantee relation succeeded', True)
    # NOTE: the original customer import did not skip self-loops
    # (e1 == e2); that behavior is preserved here.
    _import('company_data/公司-客户.csv', '客户',
            'import company-customer relation succeeded', False)
def get_shortest_path_to_name_in_subgraph(self, start_id, end_node_name,
                                          max_degree=6, limit=3,
                                          max_end_node_number=3):
    """Union of shortest paths from *start_id* to every entity with a name.

    Looks up at most *max_end_node_number* candidate end nodes by name,
    fetches the shortest-path subgraph to each one, and merges the
    results into a single Subgraph.

    :param start_id: internal id of the start node
    :param end_node_name: name property to search end nodes by
    :param max_degree: maximum hops allowed per path
    :param limit: maximum number of paths per end node
    :param max_end_node_number: maximum candidate end nodes to try
    :return: a Subgraph, or None when no path was found
    """
    candidates = self.find_by_name_property("entity", name=end_node_name,
                                            limit=max_end_node_number)
    all_nodes, all_relations = [], []
    for candidate in candidates:
        path_graph = self.get_shortest_path_in_subgraph(
            start_id=start_id,
            end_id=self.get_id_for_node(candidate),
            max_degree=max_degree,
            limit=limit)
        if path_graph:
            all_nodes.extend(path_graph.nodes())
            all_relations.extend(path_graph.relationships())
    return Subgraph(all_nodes, all_relations) if all_nodes else None
def import_company_r_guaranty(data_path):
    """Import company-guarantee (担保) relationships from a CSV file.

    For each (eid1, eid2) pair whose companies both exist and are not
    the same node, a 担保 relationship is created unless one already
    exists between them.

    :param data_path: path to the CSV with 'eid1' and 'eid2' columns
    """
    df = pd.read_csv(data_path)
    logger.info(f'''处理公司-担保关系, 数据文件:{data_path}''')
    r_total = len(df.index)
    data = list(zip(df['eid1'].values, df['eid2'].values))
    c_g_relations = []
    exists_r = 0
    c_r_g = "担保"
    for c_id, guaranty_id in tqdm(data):
        if c_id == guaranty_id:
            continue  # a company cannot guarantee itself
        company = matcher.match("company").where(
            f'''_.cid=\'{c_id}\'''').first()
        guaranty = matcher.match("company").where(
            f'''_.cid=\'{guaranty_id}\'''').first()
        if company is not None and guaranty is not None:
            # Materialize the relationship match once; the original
            # listed it twice, re-running the query against the DB.
            matched = len(list(r_matcher.match(nodes=(company, guaranty),
                                               r_type=c_r_g)))
            exists_r += matched
            if matched == 0:
                c_g_relations.append(Relationship(company, c_r_g, guaranty))
    effective_relations = len(c_g_relations)
    if effective_relations != 0:
        graph.create(Subgraph(relationships=c_g_relations))
    # Fixed: the summary previously said 供应商 (supplier) although this
    # function imports 担保 (guarantee) relations.
    logger.info(
        f'''导入公司-担保关系完成, 共需导入: {r_total} 个, 有效关系: {effective_relations} 个, 已存在关系:{exists_r}个'''
    )
def _author_to_neo(info, graph):
    """Merge a single Author node into Neo4j and push its attributes.

    :param info: dict with at least 'CellID'; any other keys present in
        ``author_fields`` are copied onto the node after the merge
    :param graph: py2neo Graph to write to
    """
    print(info)
    tx = graph.begin()
    author = Node("Author", CellID=info['CellID'])
    # Only the author node itself is merged here; the related
    # Paper/PaperAuthor/Affiliation import was removed from this path.
    sg = Subgraph([author], None)
    tx.merge(sg, primary_label=None, primary_key='CellID')
    tx.commit()
    # Copy the whitelisted plain attributes onto the node and push them.
    direct_info = keyfilter(author_fields.__contains__, info)
    author.update(**direct_info)
    graph.push(author)
def CreateNode(elements):
    """Build and persist Node objects for each entity column.

    Columns '主体代码' and '证券代码' are treated as id columns: they do
    not become nodes but are folded into other nodes' properties.

    :param elements: mapping of column name -> list of values
    :return: dict mapping column name -> list of created Node objects
    """
    nodes = {}
    for key, values in elements.items():
        if key in ('主体代码', '证券代码'):
            continue  # pure id columns, no nodes of their own
        if key == '主体名称':
            nodes[key] = [Node(key, name=value, code=elements['主体代码'][0])
                          for value in values]
        elif key == '证券产品':
            nodes[key] = [Node(key,
                               name=elements['证券产品'][idx],
                               code=elements['证券代码'][idx],
                               belonging=elements['主体名称'][0])
                          for idx in range(len(values))]
        else:
            nodes[key] = [Node(key, name=value,
                               belonging=elements['主体名称'][0])
                          for value in values]
    for node_list in nodes.values():
        graph.create(Subgraph(node_list))
    return nodes
def get_shortest_path_in_subgraph(self, start_id, end_id, max_degree=6,
                                  limit=3):
    """Run a shortestPath Cypher query between two internal node ids.

    :param start_id: internal id of the start node
    :param end_id: internal id of the end node
    :param max_degree: maximum number of hops allowed in a path
    :param limit: maximum number of distinct paths to collect
    :return: a Subgraph with the paths' nodes and relationships, or
        None when no path exists or an error occurs
    """
    try:
        cypher = 'Match path = shortestPath((n:entity)-[*..{max_degree}]-(m:entity)) where ID(n)={start_id} and ID(m)={end_id} RETURN distinct path limit {limit}'.format(
            start_id=start_id, end_id=end_id,
            max_degree=max_degree, limit=limit)
        nodes, relations = [], []
        for record in self.graph.run(cypher):
            path = record["path"]
            nodes.extend(path.nodes())
            relations.extend(path.relationships())
        return Subgraph(nodes, relations) if nodes else None
    except Exception:
        _logger.exception("----------")
        return None
def search_nodes_by_keyword(self, keyword, label='api', top_number=10,
                            index_name="api"):
    """Full-text search via apoc.index.search, filtered to one label.

    Each whitespace-separated token gets a trailing '*' wildcard;
    parentheses are stripped because they are Lucene syntax.

    NOTE(review): index_name/name/label are interpolated straight into
    the Cypher string — if *keyword* can be untrusted this is
    injectable; consider query parameters. Behavior kept as-is.

    :return: Subgraph of matching nodes, or None on error
    """
    try:
        name = "".join(token + "* " for token in keyword.split())
        name = name.replace("(", " ").replace(")", " ")
        query = "call apoc.index.search('{index_name}', '{name}') YIELD node match (node:`{label}`) return node limit {top_number}".format(
            index_name=index_name, name=name,
            top_number=top_number, label=label)
        nodes = [record['node'] for record in self.graph.run(query)]
        return Subgraph(nodes=nodes)
    except Exception:
        _logger.exception("-----------")
        return None
def get_relations_between_two_nodes_in_subgraph(self, start_id, end_id):
    """Collect the relations between two nodes (both directions) as a Subgraph.

    :param start_id: internal id of the first node
    :param end_id: internal id of the second node
    :return: a Subgraph of the connecting relationships and their
        endpoint nodes, or None when there are none / on error
    """
    try:
        result = self.get_relations_between_two_nodes(start_id, end_id)
        if not result:
            return None
        nodes = []
        relations = []
        for record in result:
            relations.append(record["r"])
            nodes.append(record["n"])
            nodes.append(record["m"])
        if nodes:
            return Subgraph(nodes, relations)
        return None
    except Exception:
        # Fixed: original used the Python 2 form `except Exception, error`,
        # which is a SyntaxError under Python 3 (the rest of this code
        # targets Python 3).
        _logger.exception("-----------")
        return None
def import_company_r_person(data_path):
    """Import company-person relationships from a CSV file.

    Each row supplies a company id, a person id and a post (职位), which
    becomes the relationship type. A relationship is only created when
    both nodes exist and no relationship of that type already links them.

    :param data_path: path to the CSV with 'eid', 'pid', 'post' columns
    """
    df = pd.read_csv(data_path)
    logger.info(f'''处理公司-人物关系, 数据文件:{data_path}''')
    r_total = len(df.index)
    data = list(zip(df['eid'].values, df['pid'].values, df['post'].values))
    c_p_relations = []
    exists_r = 0
    for c_id, p_id, c_r_p in tqdm(data):
        person = matcher.match("person").where(f'''_.pid=\'{p_id}\'''').first()
        company = matcher.match("company").where(
            f'''_.cid=\'{c_id}\'''').first()
        if person is not None and company is not None:
            # Materialize the relationship match once; the original
            # listed it twice, re-running the query against the DB.
            matched = len(list(r_matcher.match(nodes=(company, person),
                                               r_type=c_r_p)))
            exists_r += matched
            if matched == 0:
                c_p_relations.append(Relationship(company, c_r_p, person))
    effective_relations = len(c_p_relations)
    if effective_relations != 0:
        graph.create(Subgraph(relationships=c_p_relations))
    logger.info(
        f'''导入公司-人物关系完成, 共需导入: {r_total} 个, 有效关系: {effective_relations} 个, 已存在关系:{exists_r}个'''
    )
def import_company(data_path):
    """Import company nodes from a CSV file, skipping existing ones.

    :param data_path: path to the CSV with 'companyname', 'eid' and
        'dishonesty' columns
    """
    df = pd.read_csv(data_path)
    c_total = len(df.index)
    logger.info(f'''处理公司节点入库, 数据文件:{data_path}''')
    data = list(zip(df['eid'].values, df['companyname'].values,
                    df['dishonesty'].values))
    nodes = []
    exists_n = 0
    # Renamed loop variable: the original used `id`, shadowing the builtin.
    for cid, name, dishonesty in tqdm(data):
        # 收入 — synthetic profit figure attached to each company
        profit = np.random.randint(100000, 10000000, 1)[0]
        c_match = matcher.match("company").where(f'''_.cid=\'{cid}\'''').first()
        if c_match is None:
            node = Node("company",
                        name=name,
                        cid=str(cid),
                        dishonesty=int(dishonesty),
                        profit=int(profit))
            nodes.append(node)
        else:
            exists_n += 1
    effective_node = len(nodes)
    if effective_node != 0:
        graph.create(Subgraph(nodes))
    logger.info(
        f'''共需导入公司节点: {c_total} 个, 有效节点: {effective_node} 个, 已存在节点:{exists_n}个'''
    )
def expand_node(self, node_id, limit=40):
    """Fetch up to *limit* directly adjacent entity nodes as a Subgraph.

    :param node_id: internal id of the node to expand
    :param limit: maximum number of (r, n, m) rows to pull
    :return: Subgraph of the node, its neighbours and the connecting
        relationships, or None when there are none / on error
    """
    cypher = "Match (n:entity)-[r]-(m:entity) where ID(n)={start_id} return distinct r,n,m limit {limit}".format(
        start_id=node_id, limit=limit)
    try:
        found_nodes, found_rels = [], []
        for row in self.graph.run(cypher):
            found_rels.append(row["r"])
            found_nodes.append(row["n"])
            found_nodes.append(row["m"])
        return Subgraph(found_nodes, found_rels) if found_nodes else None
    except Exception:
        traceback.print_exc()
        return None
def batch_relations(self, rela, relations):
    """Create missing relationships of type *rela* between named entity pairs.

    For each entry, both endpoint nodes are looked up by name; a new
    relationship (with score=100) is created only when no relationship
    of that type already links the pair.

    :param rela: relationship type to create
    :param relations: iterable of dicts with 'entityname1'/'entityname2'
    """
    tx = self.graph.begin()
    # Hoisted: the original rebuilt both matchers on every iteration.
    node_matcher = NodeMatcher(self.graph)
    rel_matcher = RelationshipMatcher(self.graph)
    new_relationships = []
    for data in relations:
        node1 = node_matcher.match(name=data["entityname1"]).first()
        node2 = node_matcher.match(name=data["entityname2"]).first()
        # Fixed: the original built Relationship(None, ...) and crashed
        # when either entity was missing from the graph; skip such pairs.
        if node1 is None or node2 is None:
            continue
        old_relationship = rel_matcher.match([node1, node2],
                                             r_type=rela).first()
        print("-------old_relationship", old_relationship)
        if old_relationship is None:
            relationship = Relationship(node1, rela, node2, score=100)
            print("-------relationship", relationship)
            new_relationships.append(relationship)
    if len(new_relationships) > 0:
        print("new_relationships--------", new_relationships)
        tx.create(Subgraph(relationships=new_relationships))
    tx.commit()
def deleteRelationships(self, rtype):
    """Detach every relationship of the given type from the graph.

    :param rtype: relationship type to remove
    """
    try:
        matched = self.graph.relationships.match(r_type=rtype)
        self.graph.separate(Subgraph(relationships=matched))
    except ValueError as e:
        print(e)
def to_subgraph(self):
    """ Convert a RecordList into a Subgraph. """
    # Keep only graph entities (nodes and paths) from the records.
    entities = [value
                for record in self.records
                for value in record
                if isinstance(value, (Node, Path))]
    return Subgraph(*entities)
def import_harmonized_attribute(self, harmonized_attribute):
    """Import a single harmonized attribute into the graph.

    Creates a HarmonizedAttribute node (skipping entirely if one
    already exists), attaches an empty code set via HAS_MEANING, and
    links any mapped node attributes with MAPS_TO relationships, all
    committed as one subgraph in a single transaction.

    :param harmonized_attribute: dict with keys 'system', 'entity',
        'attribute', 'definition', and optionally 'node_attributes'
        (a list of "<SYSTEM>:<ENTITY>.<ATTRIBUTE>" strings).
    """
    system = harmonized_attribute['system']
    entity = harmonized_attribute['entity']
    attribute = harmonized_attribute['attribute']
    logger.info(
        f'Importing HarmonizedAttribute {system}.{entity}.{attribute} ...')
    ha_node = self.mdr_graph.get_harmonized_attribute(
        system, entity, attribute)
    if ha_node is not None:  # already exists. Skip
        # TODO: Update
        return
    ha_node = self.mdr_graph.create_harmonized_attribute(
        system, entity, attribute)
    ha_node['definition'] = harmonized_attribute['definition']
    subgraph = Subgraph([ha_node])
    # to-do: What's created here is empty. Is it updated later anywhere? If not,
    # ...is this just something that Dazhi never got to? - joeflack4 2021/11/30
    cs_node = self.mdr_graph.create_code_set()
    # Subgraph union (|=) accumulates nodes/relationships to commit together.
    subgraph |= cs_node
    subgraph |= Relationship(ha_node, 'HAS_MEANING', cs_node)
    # node_attributes: Looks like will only be mappings, of the format:
    # ...<MODEL>:<ENTITY>.<ATTR> - joeflack4 2021/11/19
    if 'node_attributes' in harmonized_attribute:
        # TODO: Shouldn't exact_mapping be nested within node_attributes or
        # ...node_attributes/mappings?= instead? (updated here and in
        # ...CRDCH.import_harmonized_attributes()
        for node_attribute in harmonized_attribute['node_attributes']:
            try:
                # NOTE: this deliberately rebinds system/entity/attribute
                # for the mapped source attribute.
                system, entity_attribute = node_attribute.split(':')
                entity, attribute = entity_attribute.split('.')
            except ValueError as e:
                logger.error(
                    f'Failed to parse the mapping attribute name: {node_attribute}'
                )
                logger.error(e)
                continue
            na_node = self.mdr_graph.get_node_attribute(
                system, entity, attribute)
            if na_node is None:
                logger.warning(node_attribute + ' not found in database')
            else:
                subgraph |= Relationship(na_node, 'MAPS_TO', ha_node)
    tx = self.graph.begin()
    tx.create(subgraph)
    self.graph.commit(tx)
    logger.info(
        f'Importing HarmonizedAttribute {system}.{entity}.{attribute} was successful'
    )
def import_assign():
    """Create one 'assign' node per dividend scheme type in 分红.csv."""
    df = pd.read_csv('company_data/分红.csv')
    nodes = [Node('assign', name=scheme)
             for scheme in tqdm(df['schemetype'].values)]
    graph.create(Subgraph(nodes))
def import_violations():
    """Create one 'violations' node per violation type in 违规类型.csv."""
    df = pd.read_csv('company_data/违规类型.csv')
    nodes = [Node('violations', name=vtype)
             for vtype in tqdm(df['gooltype'].values)]
    graph.create(Subgraph(nodes))
def import_industry():
    """Create one 'industry' node per organisation type in 行业.csv."""
    df = pd.read_csv('company_data/行业.csv')
    nodes = [Node('industry', name=org)
             for org in tqdm(df['orgtype'].values)]
    graph.create(Subgraph(nodes))
def import_bond():
    """Create one 'bond' node per security type in 债券类型.csv."""
    df = pd.read_csv('company_data/债券类型.csv')
    nodes = [Node('bond', name=sec)
             for sec in tqdm(df['securitytype'].values)]
    graph.create(Subgraph(nodes))
def create_node(self, names, label):
    """Create a node labelled *label* for every name not yet in the graph.

    :param names: iterable of node names to ensure
    :param label: label for each new node
    """
    fresh = []
    for name in tqdm(names):
        # Only create a node when no node with this label+name exists.
        if not list(self.node_match.match(label, name=name)):
            fresh.append(Node(label, name=name))
    if fresh:
        self.graf.create(Subgraph(fresh))
        print('完成创建节点{}个'.format(len(fresh)))
def add_boons(cls, id_, boons=None):
    """Attach HAS_BOON relationships from a guide to the given boons.

    :param id_: id of the guide to look up
    :param boons: list of boon ids to link (defaults to none)
    """
    if boons is None:
        boons = []
    _, _, guide = Guide.find_by_id(id_)
    matched = list(
        NodeMatcher(graph_).match("Boon").where(f"_.id in {boons}"))
    links = [Relationship(guide, "HAS_BOON", boon) for boon in matched]
    graph_.create(Subgraph(matched + [guide], links))
def __call__(self, data, indexes=None, *args, **kwargs):
    """Instantiate the node template once per datum and create the batch.

    :param data: list of attribute dicts, one per node to create
    :param indexes: passed through to the underlying _create call
    """
    assert isinstance(
        data, list), "except data to be list, but got %s" % type(data)
    built = []
    for datum in data:
        # Each node starts as a fresh copy of the template, then gets
        # the datum's attributes applied on top.
        node = deepcopy(self.node_template)
        for attr in datum:
            node[attr] = datum[attr]
        built.append(node)
    self._create(Subgraph(built), indexes)
def __simplify_graph(self, graph: Subgraph) -> Subgraph:
    """Collapse parallel edges between each node pair into a single edge.

    Edges sharing the same (source name, target name) are merged into
    one relationship whose type is the comma-joined list of the
    originals' types; a DENY flag on any original is carried over.
    Duplicates are tagged with '__drop__' and filtered out of the
    returned graph.

    :param graph: subgraph whose relationships may contain duplicates
    :return: new Subgraph with the same nodes and de-duplicated edges
    """
    # List edges
    edge_dict = dict()   # (source+target) -> list of distinct edge types seen
    edge_deny = dict()   # (source+target) -> True if any parallel edge had DENY
    for edge in graph.relationships():
        source = edge.start_node()['name']
        target = edge.end_node()['name']
        edge_dict.setdefault(source + target, [])
        if not edge.type() in edge_dict[source + target]:
            edge_dict[source + target].append(edge.type())
        else:
            # Exact-type duplicate: mark for removal.
            edge['__drop__'] = 1
        edge_deny.setdefault(source + target, False)
        edge_deny[source + target] |= bool(edge['DENY'])
    # Group edges
    new_edges = []
    for edge in graph.relationships():
        source = edge.start_node()['name']
        target = edge.end_node()['name']
        tags = edge_dict.get(source + target, [])
        if len(tags) > 1:
            # Multiple types between this pair: drop the originals and
            # emit one merged edge (only once per pair, guarded by the
            # '__done__' sentinel appended below).
            edge['__drop__'] = 1
            if '__done__' not in tags:
                new_rel = Relationship(edge.start_node(), ','.join(tags),
                                       edge.end_node())
                if edge_deny.get(source + target):
                    new_rel['DENY'] = 1
                edge_dict[source + target].append('__done__')
                new_edges.append(new_rel)
    # Build the simplified graph
    new_graph = Subgraph(
        nodes=[x for x in graph.nodes()],
        relationships=[
            x for x in graph.relationships() if not x['__drop__']
        ] + new_edges)
    return new_graph
def import_company():
    """Create a 'company' node per row of 公司.csv with a random profit."""
    df = pd.read_csv('company_data/公司.csv')
    rows = list(zip(df['eid'].values, df['companyname'].values))
    nodes = []
    for company_eid, company_name in tqdm(rows):
        # Synthetic profit figure attached to each company.
        profit = int(np.random.randint(100000, 100000000, 1)[0])
        nodes.append(Node('company', name=company_name, profit=profit,
                          eid=company_eid))
    graph.create(Subgraph(nodes))
def import_person():
    """Create a 'person' node per row of 人物.csv with a random age."""
    df = pd.read_csv('company_data/人物.csv')
    rows = list(zip(df['personcode'].values, df['personname'].values))
    nodes = []
    # Loop variable renamed: the original called the person code `eid`.
    for person_code, person_name in tqdm(rows):
        age = int(np.random.randint(20, 70, 1)[0])
        nodes.append(Node('person', name=person_name, age=age,
                          pid=str(person_code)))
    graph.create(Subgraph(nodes))
def expand_node_for_adjacent_nodes_to_subgraph(self, node_id, limit=40):
    """Fetch a node's direct neighbours across three quality tiers.

    Neighbours are queried per target label — wikidata (high quality),
    api (medium), entity (low) — and merged into one Subgraph.

    :param node_id: internal id of the node to expand
    :param limit: maximum (r, n, m) rows per tier
    :return: Subgraph of neighbours and connecting relationships, or
        None when there are none / on error
    """
    # De-duplicated: the original repeated the query/collect loop three
    # times verbatim; iterate over the query templates instead.
    query_templates = (
        "Match (n:entity)-[r]-(m:wikidata) where ID(n)={start_id} return distinct r,n,m limit {limit}",
        "Match (n:entity)-[r]-(m:api) where ID(n)={start_id} return distinct r,n,m limit {limit}",
        "Match (n:entity)-[r]-(m:entity) where ID(n)={start_id} return distinct r,n,m limit {limit}",
    )
    try:
        nodes = []
        relationships = []
        # todo speed up this by multiple thread
        for template in query_templates:
            query = template.format(start_id=node_id, limit=limit)
            for record in self.graph.run(query):
                relationships.append(record["r"])
                nodes.append(record["n"])
                nodes.append(record["m"])
        if nodes:
            return Subgraph(nodes, relationships)
        return None
    except Exception:
        traceback.print_exc()
        return None
def add_edges(self, edges: List[tuple]):
    """
    Add a list of edges to the graph and synchronize them to the
    remote database in a single chunk.

    :param edges: tuples of (start_node, relationship_type, end_node,
        properties)
    """
    es = []
    # Since we have to synchronize changes as a single chunk, it's not as
    # simple as calling add_edge() for every element of `edges`.
    for e in edges:
        u, rel_type, v, props = e
        es.append(Relationship(u, rel_type, v, props))
    # Fixed: the relationships were previously passed positionally into
    # Subgraph's *nodes* slot; pass them via the relationships keyword,
    # consistent with the rest of this codebase.
    self._graph.create(Subgraph(relationships=es))
def bacth_node_label(self, label, entity_labes):
    """Ensure every name in *entity_labes* exists as a node with *label*.

    Missing names are created as new labelled nodes; existing nodes get
    the label added and are pushed back to the database.

    :param label: label to apply
    :param entity_labes: iterable of node names
    """
    tx = self.graph.begin()
    created, updated = [], []
    matcher = NodeMatcher(self.graph)
    for name in entity_labes:
        existing = matcher.match(name=name).first()
        if existing is None:
            fresh = Node()
            fresh.add_label(label=label)
            fresh["name"] = name
            created.append(fresh)
        else:
            existing.add_label(label=label)
            updated.append(existing)
    if len(created) > 0:
        print("newnodelist----", created)
        tx.create(Subgraph(created))
    if len(updated) > 0:
        print("oldnodelist----", updated)
        tx.push(Subgraph(updated))
    tx.commit()
def batch_node(self, entitys_items):
    """Create a bare name-only node for each item not already in the graph.

    :param entitys_items: iterable of node names to ensure exist
    """
    tx = self.graph.begin()
    matcher = NodeMatcher(self.graph)
    fresh_nodes = []
    for name in entitys_items:
        if matcher.match(name=name).first() is None:
            node = Node()
            node["name"] = name
            fresh_nodes.append(node)
    if len(fresh_nodes) > 0:
        print("newnodes---------", fresh_nodes)
        tx.create(Subgraph(fresh_nodes))
    tx.commit()