def get_top_ppr(ppr_seed, max_num=10, lang="en"):
    '''
    Get nodes based on Personalized PageRank (PPR).

    :param ppr_seed: PPR seed node ID.
    :param max_num: maximum number of records to retrieve.
    :param lang: language to return
    :return: list type, list of the records output by Neo4j query
    '''
    # English properties carry no suffix; other languages use "_<lang>".
    if lang == "en":
        lang = ""
    else:
        lang = "_" + lang
    results = []
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        tx = session.begin_transaction()
        # Seed and limit are bound as query parameters instead of being
        # interpolated into the Cypher text (avoids injection and typing
        # issues). The property-name suffix cannot be parameterized in
        # Cypher, so it remains string-concatenated.
        ppr_result = tx.run(
            "MATCH (p) WHERE ID(p) = $ppr_seed "
            "WITH p CALL gds.pageRank.stream('memex-graph', { maxIterations: 10, dampingFactor: 0.85, sourceNodes: [p] }) YIELD nodeId, score AS pageRank "
            "RETURN gds.util.asNode(nodeId).wid, gds.util.asNode(nodeId).label" + lang +
            ", gds.util.asNode(nodeId).description" + lang +
            ", gds.util.asNode(nodeId).guid "
            "ORDER BY pageRank DESC "
            "LIMIT $max_num",
            ppr_seed=ppr_seed, max_num=max_num)
        for record in ppr_result:
            results.append(record)
    return results
def get_node_details(node_id, lang="en"):
    '''
    Get node main details.

    :param node_id: the ID of the source node to get its main details.
    :param lang: language to return
    :return: node main details, or db_conn._error_status when not found
    '''
    # English properties carry no suffix; other languages use "_<lang>".
    if lang == "en":
        lang = ""
    else:
        lang = "_" + lang
    results = []
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        tx = session.begin_transaction()
        # node_id is bound as a query parameter rather than interpolated
        # into the Cypher string (prevents injection); the property-name
        # suffix cannot be parameterized, so it stays concatenated.
        result = tx.run(
            "MATCH (p) "
            "WHERE p.wid = $node_id "
            "RETURN p.wid, p.label" + lang + ", p.description" + lang +
            ", p.guid, p.image, p.location.x, p.location.y",
            node_id=node_id)
        for record in result:
            results.append(record)
    if results:
        return results[0]
    else:
        return db_conn._error_status
def get_top_closest(tx, n, long, lat, meters, lang="en"):
    '''
    Search for entites using location information.

    :param tx: transaction object reference (unused — a fresh transaction is
        opened internally; kept for interface compatibility)
    :param n: integer type, number of records
    :param long: float type, longitude coordinates
    :param lat: float type, latitude coordinates
    :param meters: integer type, radial distance from point in meters
    :param lang: language to return
    :return: list type, list of the records output by Neo4j query
    '''
    # English properties carry no suffix; other languages use "_<lang>".
    if lang == "en":
        lang = ""
    else:
        lang = "_" + lang
    results = []
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        tx = session.begin_transaction()
        # BUGFIX: the label property was previously built as
        # "p.label_" + lang, producing "p.label_" for English and
        # "p.label__xx" for other languages (lang already carries the
        # underscore). Concatenate onto "p.label" directly, consistent
        # with the other query helpers in this module.
        result = tx.run(
            "MATCH (p) "
            "WHERE distance(point({latitude:$lat, longitude:$long}),p.location) < $meters "
            "RETURN p.label" + lang + ",p.wid,distance(point({latitude:$lat, longitude:$long}),p.location) AS dist, "
            " id(p) "
            "ORDER BY dist ASC LIMIT $n",
            lat=lat, long=long, meters=meters, n=n)
        for record in result:
            results.append(record)
    return results
def get_top_similar_closest(search_mode, sentence, n, long, lat, meters):
    '''
    Search for entites using text and location information based on the
    different search modes (semantic embeddings or fulltext).

    :param search_mode: str type, search mode semantic or fulltext (exact search)
    :param sentence: str type, textual description
    :param n: integer type, subset the top ranked results
    :param long: float type, longitude coordinates
    :param lat: float type, latitude coordinates
    :param meters: integer type, radial distance from point in meters
    :return: list of top n closest points by distance and similarity
    '''
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        if search_mode == "semantic":
            # Embed the query sentence, then rank by vector similarity
            # within the radius.
            embedded = [float(v) for v in get_emb_vect(sentence)]
            matches = session.read_transaction(
                get_within_radius_w2v,
                long=long, lat=lat, meters=meters, query=embedded)
        else:
            # Exact (fulltext) matching within the radius.
            matches = session.read_transaction(
                get_within_radius_fulltext,
                long=long, lat=lat, meters=meters, query=sentence)
    return matches[:n]
def get_node_neighbors(node_id, lang="en"):
    '''
    Get the 1-hop neighbors of a node and their relation to the source node.

    :param node_id: the ID of the source node to get its neighbors.
    :param lang: language to return
    :return: list of relation type, neighbor id and neighbor label
    '''
    # Relations expose localized names as ".<lang>" (including ".en");
    # node properties use "_<lang>" with no suffix for English.
    if lang == "en":
        lang_rel = ".en"
        lang = ""
    else:
        lang_rel = "." + lang
        lang = "_" + lang
    results = []
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        tx = session.begin_transaction()
        # node_id is bound as a query parameter rather than interpolated
        # into the Cypher string (prevents injection); property suffixes
        # cannot be parameterized, so they stay concatenated.
        result = tx.run(
            "MATCH (n)-[r]->(b) "
            "WHERE n.wid = $node_id "
            "RETURN r" + lang_rel + ", b.wid, b.label" + lang,
            node_id=node_id)
        for record in result:
            results.append(record)
    return results
def get_top_visually_similar_closest(image_emb, n, long, lat, meters, lang="en"):
    '''
    Search for entites using image and location information.

    :param image_emb: float array
    :param n: integer type, subset the top ranked results
    :param long: float type, longitude coordinates
    :param lat: float type, latitude coordinates
    :param meters: integer type, radial distance from point in meters
    :param lang: language to return
    :return: list of top n closest points by distance and similarity
    '''
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        # Rank candidates within the radius by visual similarity.
        hits = session.read_transaction(
            get_within_radius_visual,
            long=long, lat=lat, meters=meters, query=image_emb, lang=lang)
    return hits[:n]
def update_fulltext_index(lang="en"):
    '''
    Drop the search index if exists and create a new one over the entity
    labels and descriptions.

    :param lang: language to return
    '''
    # English properties carry no suffix; other languages use "_<lang>".
    if lang == "en":
        lang = ""
    else:
        lang = "_" + lang
    db_conn = db_connection.DB_Connection()
    # Best-effort drop: the index may not exist yet. Narrowed from a bare
    # "except:" so KeyboardInterrupt/SystemExit are not swallowed.
    with db_conn._driver.session() as session:
        try:
            tx = session.begin_transaction()
            tx.run("CALL db.index.fulltext.drop('labelsAndDescriptions')")
            tx.commit()
        except Exception:
            pass
    # Best-effort create over the language-specific label/description fields.
    with db_conn._driver.session() as session:
        try:
            tx = session.begin_transaction()
            tx.run(
                "CALL db.index.fulltext.createNodeIndex('labelsAndDescriptions', ['Knowledge', 'Place'], ['label" + lang + "', 'description" + lang + "'])")
            tx.commit()
        except Exception:
            pass
def europeana_ingest_pilots(cities, link_to_nodes=False, annotation_threshold=0.1):
    """
    Reads the list of desired cities to ingest (Default: 'list_cities.txt')
    Retrieves the list of europeana items and adds to Neo4j.

    :param cities: list of cities to retrieve
    :param link_to_nodes: a flag for linking the ingested entities to existing
        entities in the graph
    :param annotation_threshold: TagMe annotation threshold
    """
    # NOTE(review): removed an unused `wikidata_found` flag and a large block
    # of commented-out TagMe/Wikidata-linking code; `eip.query_only_id_bboxes`
    # now appears to handle insertion/linking internally — confirm.
    db_conn = db.DB_Connection()
    for city in cities:
        print("Europeana ingestion...", city)
        eip.query_only_id_bboxes(
            city,
            db_conn=db_conn,
            link_to_nodes=link_to_nodes,
            annotation_threshold=annotation_threshold)
    db_conn.close()
def mapillary_ingest_pilots(cities):
    """
    Reads the list of desired cities to ingest (Default: 'list_cities.txt')
    Retrieves the list of mapillary image items and adds to Neo4j.

    :param cities: list of cities to retrieve
    """
    db_conn = db.DB_Connection()
    for city in cities:
        print("Mapillary ingestion...", city)
        # Queue every retrieved image item for insertion into the graph.
        for node in mip.query_only_id_bboxes(city):
            db_conn.queue_insert_node(node)
    db_conn.close()
def get_top_visually_similar(image_emb, n, lang="en"):
    '''
    Search for entites using image information.

    :param image_emb: float array
    :param n: integer type, subset the top ranked results
    :param lang: language to return
    :return: list of top n closest points and visual similarity
    '''
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        # Rank all candidates by visual similarity to the query embedding.
        hits = session.read_transaction(get_visual, query=image_emb, lang=lang)
    return hits[:n]
def delete_story(story_id):
    """
    Delete story entity from the KG.

    :return: JSON response with the success status True/False
    """
    db_conn = db.DB_Connection()
    # Verify the story exists before attempting deletion.
    if db_conn.get_story(story_id) == -1:
        return jsonify(success=False), 500
    # A zero status code from the DB layer signals successful deletion.
    if db_conn.delete_story(story_id) == 0:
        return jsonify(success=True)
    return jsonify(success=False), 500
def get_stories_by_location():
    """
    Retrieves stories by a given location.

    :return: JSON response with the list of stories
    """
    req_data = request.get_json()
    # BUGFIX: latitude/longitude were converted with int(), which truncated
    # real-valued coordinates (and raised ValueError on strings like "41.5").
    # Geographic coordinates must be parsed as floats.
    if 'latitude' in req_data:
        latitude = float(req_data["latitude"])
    else:
        return jsonify({'message': 'Missing latitude field.'}), 400
    if 'longitude' in req_data:
        longitude = float(req_data["longitude"])
    else:
        return jsonify({'message': 'Missing longitude field.'}), 400
    if 'meters' in req_data:
        meters = int(req_data["meters"])
    else:
        return jsonify({'message': 'Missing meters field.'}), 400
    if 'max_num' in req_data:
        max_num = int(req_data["max_num"])
    else:
        max_num = 10
    db_conn = db.DB_Connection()
    result = db_conn.get_stories_by_location(latitude, longitude, meters, max_num)
    # Shape the DB rows into an index-keyed dict of story records.
    res = {}
    for i in range(len(result)):
        temp = {}
        temp['story_id'] = result[i][0]
        temp['label'] = result[i][1]
        temp['description'] = result[i][2]
        temp['image'] = result[i][3]
        temp['audio'] = result[i][4]
        temp['video'] = result[i][5]
        temp['latitude'] = result[i][6]
        temp['longitude'] = result[i][7]
        temp['dist'] = result[i][8]
        res[i] = temp
    return res
def get_wids(overwrite=True, with_image=False):
    '''
    Get the wids and labels of all the entites in the KG.

    :param overwrite: flag if true return entity wids regardless of having
        descriptions or not
    :param with_image: filters for only nodes that have images
    :return: list of entity wids and labels (and image when with_image)
    '''
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        # Pick the Cypher query matching the requested filters: overwrite
        # ignores existing descriptions/embeddings, with_image restricts to
        # nodes carrying an image property.
        if overwrite:
            if with_image:
                query = ("MATCH (p) "
                         "WHERE EXISTS (p.wid) AND EXISTS (p.image) "
                         "RETURN p.wid, p.label, p.image")
            else:
                query = ("MATCH (p) "
                         "WHERE EXISTS (p.wid) "
                         "RETURN p.wid, p.label")
        else:
            if with_image:
                query = ("MATCH (p) "
                         "WHERE EXISTS (p.wid) AND EXISTS (p.image) "
                         "AND (NOT EXISTS (p.description) OR NOT EXISTS (p.image_emb)) "
                         "RETURN p.wid, p.label, p.image")
            else:
                query = ("MATCH (p) "
                         "WHERE EXISTS (p.wid) AND NOT EXISTS (p.description) "
                         "RETURN p.wid, p.label")
        records = session.run(query)
        width = 3 if with_image else 2
        rows = [[rec[col] for col in range(width)] for rec in records]
    db_conn.close()
    return rows
def update_desc_emb(nodes, lang='en', wikipedia_sentences=2):
    '''
    Get node descriptions from Wikidata enriched with Wikipedia descriptions,
    generate embeddings based on the labels and descriptions, and set the
    embeddings as node property.

    :param nodes: dict, node ids and labels
    :param lang: language (default 'en')
    :param wikipedia_sentences: number of sentences to get from wikipedia
        (default 2)
    :return: a confirmation message
    '''
    db_conn = db_connection.DB_Connection()
    # Session is now managed by a context manager so it is always closed
    # (the original leaked it on exceptions).
    with db_conn._driver.session() as session:
        print(len(nodes))
        for wid, label in tqdm(nodes):
            # Best-effort Wikipedia summary; missing/ambiguous pages are
            # simply skipped. Narrowed from bare "except:" so
            # KeyboardInterrupt is not swallowed.
            wikipedia_descr = ""
            try:
                wikipedia_descr = wikipedia.summary(
                    label, sentences=wikipedia_sentences)
            except Exception:
                pass
            # Best-effort Wikidata description in the requested language.
            desc = ""
            try:
                entity = client.get(wid, load=True)
                if entity.description:
                    descriptions = entity.description.texts
                    if descriptions and lang in descriptions:
                        desc = descriptions[lang]
            except Exception:
                pass
            # Embed label + descriptions and persist on the node.
            text = label + "\n" + desc + "\n" + wikipedia_descr
            emb = [float(x) for x in avg_feature_vector(text)]
            set_embedding_w2v(session, wid, desc, emb, wikipedia_descr)
    return "Desc and embeddings updated successfully"
def wikidata_ingest_pilots(hops, cities):
    """
    Reads the list of desired cities to ingest (Default: 'list_cities.txt')
    Retrieves the list of wikidata entity ids for each city, and calls
    download_n_hops_rec function in order to start recursive retrieval.

    :param hops: number of recursive hops
    :param cities: list of cities to retrieve
    """
    db_conn = db.DB_Connection()
    for city in cities:
        print("Wikidata ingestion...", city)
        rows = wip.query_only_id_bboxes(city)
        # Each row's 'place' holds a full entity URI; keep only the trailing
        # "Q*" identifier as the crawl seed.
        seeds = [row['place'].split("/")[-1] for row in rows]
        rhi.download_n_hops_rec(seeds, db_conn, hops)
    db_conn.close()
def get_top_similar(search_mode, sentence, n):
    '''
    Search for entites using text based on the different search modes
    (semantic embeddings or fulltext).

    :param search_mode: str type, search mode
    :param sentence: str type, textual description
    :param n: integer type, subset the top ranked results
    :return: list of top n closest points by distance and similarity
    '''
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        if search_mode == "semantic":
            # Embed the sentence, then rank by vector similarity.
            embedded = [float(v) for v in get_emb_vect(sentence)]
            matches = session.read_transaction(get_sim_w2v, query=embedded)
        else:
            # Exact (fulltext) matching.
            matches = session.read_transaction(get_sim_fulltext, query=sentence)
    return matches[:n]
def get_story(story_id):
    """
    Retrieves story details based on a given story ID.

    :return: JSON response with the story details
    """
    db_conn = db.DB_Connection()
    result = db_conn.get_story(story_id)
    if result == -1:
        return jsonify(success=False), 500
    # Map the positional DB row onto named response fields.
    fields = ('label', 'description', 'image', 'audio', 'video',
              'latitude', 'longitude')
    return {name: result[idx] for idx, name in enumerate(fields)}
def update_named_graph():
    '''
    Drop the named graph if exists and create a new one over "Place" and
    "Knowledge" nodes.
    '''
    db_conn = db_connection.DB_Connection()
    # Best-effort drop: the named graph may not exist yet. Narrowed from a
    # bare "except:" so KeyboardInterrupt/SystemExit are not swallowed.
    with db_conn._driver.session() as session:
        try:
            tx = session.begin_transaction()
            tx.run("CALL gds.graph.drop('memex-graph')")
            tx.commit()
        except Exception:
            pass
    # Best-effort (re)create over all relationship types ('*').
    with db_conn._driver.session() as session:
        try:
            tx = session.begin_transaction()
            tx.run(
                "CALL gds.graph.create('memex-graph',['Place', 'Knowledge'],'*')"
            )
            tx.commit()
        except Exception:
            pass
def get_similar_stories():
    """
    Retrieves stories similar to a given story.

    :return: JSON response with the list of stories
    """
    req_data = request.get_json()
    # story_id is mandatory; it is wrapped in single quotes as expected by
    # the DB layer.
    if 'story_id' not in req_data:
        return jsonify({'message': 'Missing story_id field.'}), 400
    story_id = "'" + str(req_data["story_id"]) + "'"
    db_conn = db.DB_Connection()
    if db_conn.get_story(story_id) == -1:
        return jsonify(success=False), 500
    max_num = int(req_data["max_num"]) if 'max_num' in req_data else 10
    rows = db_conn.get_similar_stories(story_id, max_num)
    # Map each positional DB row onto named fields, keyed by row index.
    fields = ('story_id', 'label', 'description', 'image', 'audio', 'video',
              'latitude', 'longitude', 'sim')
    resp = {}
    for i, row in enumerate(rows):
        resp[i] = {name: row[j] for j, name in enumerate(fields)}
    return resp
def update_visual_desc_emb(nodes, model='ResNet18', temp_folder='data/images/', keep_images=False, throttle=2):
    '''
    Fetches the image creates a visual embedding using the described model

    :param nodes: array of arrays [[node ids, label, image url]]
    :param model: name of the visual model to use (default 'ResNet18')
    :param temp_folder: the folder the images will be downloaded
        (temporarily if keep_images = false)
    :param keep_images: whether to not delete the image
    :param throttle: seconds to sleep between downloads (rate limiting)
    :return: True on success
    '''
    from db.models import prepare_image_from_file, get_visual_model, get_visal_embedding
    import time
    os.makedirs(temp_folder, exist_ok=True)
    db_conn = db_connection.DB_Connection()
    session = db_conn._driver.session()
    model = get_visual_model(model)
    for wid, label, image_url in tqdm(nodes):
        # Encode only the first image (most common scenario)
        if isinstance(image_url, list):
            image_url = image_url[0]
        # NOTE(review): assumes a 3-character extension (jpg/png) — confirm.
        ext = image_url[-3:]
        filepath = temp_folder + wid + '.' + ext
        wget.download(image_url, filepath)
        I = prepare_image_from_file(filepath)
        if I is None:
            # BUGFIX: previously the downloaded file leaked on this path
            # when keep_images was False.
            if not keep_images:
                os.remove(filepath)
            continue
        emb = [float(x) for x in get_visal_embedding(model, I)]
        set_embedding_visual(session, wid, emb)
        if not keep_images:
            os.remove(filepath)
        # Throttle successive downloads to be polite to the image hosts.
        time.sleep(throttle)
    return True
def get_ppr_subgraph(ppr_seed, max_num=10):
    '''
    Get a PPR based subgraph.

    :param ppr_seed: PPR seed node ID.
    :param max_num: maximum number of records to retrieve.
    :return: the Neo4j result object for the subgraph query
    '''
    db_conn = db_connection.DB_Connection()
    with db_conn._driver.session() as session:
        tx = session.begin_transaction()
        # Seed and limit are bound as query parameters instead of being
        # interpolated into the Cypher text (avoids injection and typing
        # issues). The original "+10" LIMIT padding is computed in Python.
        ppr_result = tx.run(
            "MATCH (p) WHERE ID(p) = $ppr_seed "
            "WITH p CALL gds.pageRank.stream('memex-graph', { maxIterations: 10, dampingFactor: 0.85, sourceNodes: [p] }) YIELD nodeId, score AS pageRank "
            "WITH gds.util.asNode(nodeId) AS n, pageRank "
            "MATCH (n)-[r]-(b) "
            "WHERE EXISTS(b.wid) "
            "RETURN * "
            "ORDER BY pageRank DESC "
            "LIMIT $row_limit",
            ppr_seed=ppr_seed, row_limit=max_num + 10)
        # Materialize the graph view while the session is still open.
        ppr_result.graph()
        # NOTE(review): the result object is returned after the session
        # closes; callers must not lazily consume further records — confirm.
        return ppr_result
def get_story_connections():
    """
    Retrieves the KG entities connected to a given story.

    :return: JSON response with the list of KG entities
    """
    req_data = request.get_json()
    # story_id is mandatory; it is wrapped in single quotes as expected by
    # the DB layer.
    if 'story_id' not in req_data:
        return jsonify({'message': 'Missing story_id field.'}), 400
    story_id = "'" + str(req_data["story_id"]) + "'"
    lang = str(req_data["language"]) if 'language' in req_data else "en"
    db_conn = db.DB_Connection()
    if db_conn.get_story(story_id) == -1:
        return jsonify(success=False), 500
    rows = db_conn.get_story_connections(story_id, lang)
    # Map each positional DB row onto named fields, keyed by row index.
    fields = ('wid', 'label', 'description', 'image', 'latitude', 'longitude')
    resp = {}
    for i, row in enumerate(rows):
        resp[i] = {name: row[j] for j, name in enumerate(fields)}
    return resp
def add_story():
    """
    Add story entity to the KG.

    :return: JSON response with the success status True/False
    """
    req_data = request.get_json()
    properties = []
    values = []
    # story_id is the only mandatory field.
    if 'story_id' in req_data:
        story_id = str(req_data["story_id"])
        properties.append('story_id')
        values.append(story_id)
    else:
        return jsonify({'message': 'Missing story_id field.'}), 400
    if 'label' in req_data:
        properties.append('label')
        values.append(req_data['label'])
    if 'description' in req_data:
        properties.append('description')
        values.append(req_data['description'])
    if 'keywords' in req_data:
        properties.append('keywords')
        values.append(req_data['keywords'] + [' '])
    if 'image' in req_data:
        properties.append('image')
        values.append(req_data['image'])
    if 'audio' in req_data:
        properties.append('audio')
        values.append(req_data['audio'])
    if 'video' in req_data:
        properties.append('video')
        values.append(req_data['video'])
    # BUGFIX: the original condition `'latitude' and 'longitude' in req_data`
    # only tested 'longitude' (the string 'latitude' is always truthy), so a
    # request with longitude but no latitude raised KeyError. Both keys must
    # be present.
    if 'latitude' in req_data and 'longitude' in req_data:
        properties.append('coordinate_location')
        values.append([req_data['latitude'], req_data['longitude']])
    data = [properties, values]
    db_conn = db.DB_Connection()
    result = db_conn.add_story(data)
    if result == 0:
        if 'language' in req_data:
            lang = str(req_data["language"])
        else:
            lang = "en"
        # Link the new story to similar entities and, optionally, to the
        # explicitly requested target nodes.
        db_conn.connect_story_by_textual_sim("'" + story_id + "'", "Knowledge", lang)
        db_conn.connect_story_by_textual_sim("'" + story_id + "'", "Place", lang)
        if 'connections' in req_data:
            connections = req_data['connections']
            db_conn.connect_story_to_target_nodes("'" + story_id + "'", connections)
        db_conn.link_stories_together()
        return jsonify(success=True)
    else:
        return jsonify(success=False)
def edit_story(story_id):
    """
    Edit story entity in the KG.

    :return: JSON response with the success status True/False
    """
    req_data = request.get_json()
    properties = []
    values = []
    if 'label' in req_data:
        properties.append('label')
        values.append(req_data['label'])
    if 'description' in req_data:
        properties.append('description')
        values.append(req_data['description'])
    if 'keywords' in req_data:
        properties.append('keywords')
        values.append(req_data['keywords'] + [' '])
    if 'image' in req_data:
        properties.append('image')
        values.append(req_data['image'])
    if 'audio' in req_data:
        properties.append('audio')
        values.append(req_data['audio'])
    if 'video' in req_data:
        properties.append('video')
        values.append(req_data['video'])
    # BUGFIX: the original condition `'latitude' and 'longitude' in req_data`
    # only tested 'longitude' (the string 'latitude' is always truthy), so a
    # request with longitude but no latitude raised KeyError. Both keys must
    # be present.
    if 'latitude' in req_data and 'longitude' in req_data:
        properties.append('coordinate_location')
        values.append([req_data['latitude'], req_data['longitude']])
    data = [properties, values]
    db_conn = db.DB_Connection()
    result = db_conn.get_story(story_id)
    if result == -1:
        return jsonify(success=False), 500
    else:
        result = db_conn.edit_story(str(story_id), data)
        # Rebuild the story's links from scratch after editing.
        db_conn.remove_story_connections(str(story_id))
        if 'language' in req_data:
            lang = str(req_data["language"])
        else:
            lang = "en"
        db_conn.connect_story_by_textual_sim(str(story_id), "Knowledge", lang)
        db_conn.connect_story_by_textual_sim(str(story_id), "Place", lang)
        if 'connections' in req_data:
            connections = req_data['connections']
            db_conn.connect_story_to_target_nodes("'" + story_id + "'", connections)
        db_conn.link_stories_together()
        if result == 0:
            return jsonify(success=True)
        else:
            return jsonify(success=False), 500
type=int, help='[Mode 8] Get the top K') parser.add_argument('--query', metavar='text query', type=str, help='[Mode 8] Text query') parser.add_argument('--meters', default=5000, metavar='text query', type=int, help='[Mode 8] Radial distance in meters') parser.add_argument('--verbose', type=str2bool, default=False) args = parser.parse_args() # Check if Community db_conn = db.DB_Connection() neo4j_version = db_conn.version() db_conn.close() if args.verbose: print("[INFO] Neo4j Currently running on", neo4j_version) #if 'community' not in neo4j_version: #print('not community') #db_conn = db.DB_Connection() #db_conn.initalise_neo4j() #db_conn.close() if args.mode == 0: # Wikidata ingestion tool if args.cities: f = open(args.cities, 'r') cities = []