def insert_relationships_into_graph(graph: Graph,
                                    serialized_groups: dict,
                                    batch_size: int = DEFAULT_BATCH_SIZE):
    """
    Create and insert relationships (MEMBER_OF) into the neo4j database
    using py2neo bulk functions.

    :param graph: represents connection to neo4j backend
    :param serialized_groups: scraper's output
    :param batch_size: batch size to upload items in bulk to DB
    """
    # Build (start_key, properties, end_key) triples for bulk insertion:
    # one MEMBER_OF edge per (participant, group) pair.
    relationships_data = [
        (phone, {}, group_id)
        for group_id, props in serialized_groups.items()
        if 'participants' in props
        for phone in props['participants']
    ]

    # Bulk insert all relationships into DB, one batch at a time.
    print(f"Inserting {len(relationships_data)} relationships into DB...")
    batches = list(divide_into_batches(relationships_data,
                                       batch_size=batch_size))
    for batch in tqdm(batches):
        create_relationships(graph.auto(),
                             data=batch,
                             rel_type="MEMBER_OF",
                             start_node_key=("Participant", "phone_number"),
                             end_node_key=("Group", "group_id"))
class DoNeo(object):
    """Load JSON-serialized graph records into neo4j via py2neo bulk helpers.

    Each record in the input file describes either a node or a relationship;
    records are created or merged depending on their ``DeDuplication`` flag.
    """

    def __init__(self, exec_default=False):
        # NOTE(review): `url` and `file` are read from module scope here —
        # they are not parameters of __init__; confirm both are defined at
        # import time before calling with exec_default=True.
        if exec_default:
            self.connect_db(url)
            self.load_data(file)
            self.populate_db()

    def connect_db(self, url=None):
        """Connect to neo4j; fall back to py2neo's default URI when none given."""
        if url:
            self.g = Graph(url)
        else:
            self.g = Graph()

    def load_data(self, file):
        """Read the JSON input file into ``self.data``."""
        with open(file) as f:
            self.data = json.load(f)

    def populate_db(self):
        """Insert every record in ``self.data`` as a node or relationship.

        A record with ``DeDuplication`` of None is created only when no
        element with the same ``IdUnique`` already exists in the database;
        otherwise it is merged on ``IdUnique``.
        """
        for d in self.data:
            # Carry the unique id into the stored properties so later runs
            # can match on it.
            d['Property']['IdUnique'] = d['IdUnique']
            if d['Kind'] == 'node':
                # BUGFIX: compare against None with `is`, not `==` (PEP 8).
                if d['DeDuplication'] is None and self.g.nodes.match(
                        IdUnique=d['IdUnique']).count() == 0:
                    create_nodes(self.g.auto(), [d['Property']],
                                 labels={*d['Label']})
                else:
                    merge_nodes(self.g.auto(), [d['Property']],
                                (tuple(d['Label']), 'IdUnique'),
                                labels={*d['Label']})
            else:
                # (start_key, properties, end_key) triple for the bulk API;
                # the original wrapped the endpoints in redundant parentheses.
                da = (d['FromIdMaster'], d['Property'], d['ToIdMaster'])
                # BUGFIX: `is None` instead of `== None` here as well.
                if d['DeDuplication'] is None and self.g.relationships.match(
                        IdUnique=d['IdUnique']).count() == 0:
                    create_relationships(
                        self.g.auto(), [da], d['Type'],
                        start_node_key=(d['FromLabel'], 'IdMaster'),
                        end_node_key=(d['ToLabel'], 'IdMaster'))
                else:
                    merge_relationships(
                        self.g.auto(), [da], (d['Type'], 'IdUnique'),
                        start_node_key=(d['FromLabel'], 'IdMaster'),
                        end_node_key=(d['ToLabel'], 'IdMaster'))
def insert_nodes_into_graph(graph: Graph,
                            serialized_groups: dict,
                            contacts: Optional[Dict[str, str]],
                            batch_size: int = DEFAULT_BATCH_SIZE):
    """
    Create and insert nodes (Participant/Group) into the neo4j database
    using py2neo bulk functions.

    :param graph: represents connection to neo4j backend
    :param serialized_groups: scraper's output
    :param contacts: a dictionary converting from phone number to name
    :param batch_size: batch size to upload items in bulk to DB
    """
    # Map phone number -> node properties. A plain dict suffices here; the
    # original nested defaultdict(lambda: defaultdict(dict)) added nothing,
    # since only flat string keys are ever assigned.
    participants: Dict[str, dict] = {}
    groups = []
    for group_id, group_properties in serialized_groups.items():
        # Skip groups whose participant list was not scraped.
        if 'participants' not in group_properties:
            continue
        group_name = group_properties.get('group_name', None)
        for participant_number in group_properties['participants']:
            # First sighting of a number wins; later groups reuse the node.
            if participant_number not in participants:
                record = {'phone_number': participant_number}
                if contacts is not None:
                    record['name'] = contacts.get(participant_number, None)
                participants[participant_number] = record
        # Prepare group data for bulk insertion
        groups.append(dict(name=group_name, group_id=group_id))

    # Create participants nodes
    participants_list = participants.values()
    print(f"Inserting {len(participants_list)} participants into DB...")
    for batch_participants in tqdm(
            list(divide_into_batches(list(participants_list),
                                     batch_size=batch_size))):
        create_nodes(graph.auto(), batch_participants, labels={"Participant"})

    # Create groups nodes
    print(f"Inserting {len(groups)} groups into DB...")
    for batch_groups in tqdm(
            list(divide_into_batches(groups, batch_size=batch_size))):
        create_nodes(graph.auto(), batch_groups, labels={"Group"})
def load(
    url: str = "http://download.geofabrik.de/europe/monaco-latest.osm.pbf",
    node_batch: int = 25000,
    relation_batch: int = 100,
    clear: bool = False,
):
    """load osm file into neo4j database

    Args:
        url (str, optional): location of osm file, could be in the cloud.
            Defaults to "http://download.geofabrik.de/europe/monaco-latest.osm.pbf".
        node_batch (int, optional): size of batches for nodes. Defaults to 25000.
        relation_batch (int, optional): size of batches for relations.
            Defaults to 100.
        clear (bool, optional): should database be cleared first. Defaults to False.
    """
    # BUGFIX (docs): the docstring previously claimed relation_batch defaults
    # to 50, contradicting the signature's 100.
    start_time = time()
    # Connection settings come from the environment:
    # CON_STRING / CON_USER / CON_PASS.
    db = Graph(getenv("CON_STRING"), auth=(getenv("CON_USER"), getenv("CON_PASS")))
    print("connected to db")
    if clear:
        clear_db(db)

    # Parse the OSM file; NeoHandler accumulates nodes/edges in memory.
    neo_handler = NeoHandler()
    osm_handler = FileHandler(neo_handler)
    osm_handler.apply_file(url, locations=True)
    num_nodes = len(neo_handler.nodes)
    num_edges = len(neo_handler.edges)
    print(f"read file {url}, nodes: {num_nodes}, relations: {num_edges}")

    # Lazily convert parsed nodes into property dicts for the bulk API.
    nodes = map(
        lambda n: {"node_id": n.node_id, "lat": n.lat, "long": n.long},
        neo_handler.nodes,
    )
    print(f"starting nodes with batch size {node_batch}")
    for batch in tqdm(
        batchify(nodes, batch_size=node_batch),
        total=ceil(num_nodes / node_batch),
    ):
        create_nodes(db.auto(), batch, labels={"Node"})

    # (start_id, properties, end_id) triples for merge_relationships.
    edges = map(
        lambda e: (
            e.start_node_id,
            {"distance": e.distance, "rating": e.rating, "cost": e.cost},
            e.end_node_id,
        ),
        neo_handler.edges,
    )
    print(f"starting relations with batch size {relation_batch}")
    for batch in tqdm(
        batchify(edges, batch_size=relation_batch),
        total=ceil(num_edges / relation_batch),
    ):
        merge_relationships(
            db.auto(),
            batch,
            # BUGFIX (clarity): ("Route") was just the string "Route" wrapped
            # in parentheses, not a tuple — written plainly to avoid confusion.
            merge_key="Route",
            start_node_key=("Node", "node_id"),
            end_node_key=("Node", "node_id"),
        )

    print("creating gds graph")
    # NOTE(review): gds.graph.create is deprecated in newer GDS releases in
    # favor of gds.graph.project — confirm against the deployed GDS version
    # before changing the query string.
    db.run(
        "CALL gds.graph.create( 'nodesGraph', 'Node', 'Route', { relationshipProperties: ['rating', 'cost', 'distance'] } )"
    )
    print(f"total time: {(time() - start_time)/60:.2f} minutes")