def add_object_data(objects, image_id, all_nodes, all_edges, wn2label, wn2image):
    """Add nodes/edges for the objects (and their attributes) of one image.

    Appends rows to ``all_nodes`` / ``all_edges`` in place, and records per
    WordNet synset the attribute labels (``wn2label``) and image ids
    (``wn2image``) seen for it.  Returns the updated collections plus
    ``obj2names``: a map from the VG object id (as a string) to the object's
    stripped, non-empty names.  Relies on module-level names
    (``create_uri``, ``attr_synsets``, ``data_source``, ``weight``, the
    ``*_REL`` relations and the ``VG_NS``/``WORDNET_NS`` namespaces).
    """
    obj2names = {}
    # Node representing the image itself; every object is linked to it below.
    image_node = create_uri(VG_NS, 'I' + image_id)
    # image_metadata={'image_ids': [image_id]}
    image_metadata = {}
    for o in objects:
        names = []
        for name in o['names']:
            name = name.strip()
            if not name:
                continue  # skip blank/whitespace-only names
            # Object node id derives from the underscored name, so equal
            # names across images collapse into a single node.
            o_id = create_uri(VG_NS, name.replace(' ', '_'))
            o_pos = ''
            if 'attributes' in o.keys():
                for attr in o['attributes']:
                    a_id = create_uri(VG_NS, attr.replace(' ', '_'))
                    a_pos = ''
                    if attr in attr_synsets:
                        a_synset = attr_synsets[attr]
                        # edge from the attribute to its WordNet sense
                        attr_wn_edge = [
                            a_id, WORDNET_SENSE_REL,
                            create_uri(WORDNET_NS, a_synset), data_source,
                            weight, image_metadata
                        ]
                        all_edges.append(attr_wn_edge)
                        # save wordnet data for an attribute
                        wn2label[a_synset].add(attr)
                        wn2image[a_synset].add(image_id)
                        # POS is the second-to-last dotted field of the synset
                        # name (e.g. 'red.a.01' -> 'a').
                        a_pos = a_synset.split('.')[-2]
                        # NOTE(review): the object inherits the POS of the
                        # *last* attribute that had a synset — confirm intended.
                        o_pos = a_pos
                    # attribute node
                    a_label, a_aliases = add_lowercase_labels([attr])
                    attr_node = [
                        a_id, a_label, ','.join(a_aliases), a_pos,
                        data_source, image_metadata
                    ]
                    all_nodes.append(attr_node)
                    # edge from object to an attribute
                    obj_attr_edge = [
                        o_id, RELATEDTO_REL, a_id, data_source, weight,
                        image_metadata
                    ]
                    all_edges.append(obj_attr_edge)
            obj_node = [o_id, name, '', o_pos, data_source, image_metadata]
            all_nodes.append(obj_node)
            names.append(name)
            # edge placing the object in this image
            obj_img_edge = [
                o_id, INIMAGE_REL, image_node, data_source, weight,
                image_metadata
            ]
            all_edges.append(obj_img_edge)
        obj2names[str(o['object_id'])] = names
    return all_nodes, all_edges, wn2label, wn2image, obj2names
def add_utensil():
    """Add a utensil built from the JSON request body to the RDF store."""
    store_file = STORE['utensils']
    payload = request.json
    utensil_label = payload['label']
    # Resolve the referenced action URIs before touching the store.
    action_uris = [create_uri(raw_uri) for raw_uri in payload['actions']]
    utensil_uri = create_uri(BASE_URI_UTENSIL + sanitize(utensil_label))

    load_rdf_file(store_file)
    Utensil(resUri=utensil_uri, label=utensil_label, actions=action_uris)
    save_rdf_file(store_file)

    return jsonify({'uri': utensil_uri})
def add_relationships_data(rels, image_id, all_nodes, all_edges, wn2label, wn2image):
    """Add one node and its subject/object/WordNet edges per VG relationship.

    Mutates ``all_nodes``/``all_edges`` in place, records synset labels and
    image ids in ``wn2label``/``wn2image``, and returns the four collections.
    """
    for relationship in rels:
        relationship_uri = create_uri(VG_NS, 'R' + str(relationship['relationship_id']))
        subject_uri = create_uri(VG_NS, 'O' + str(relationship['subject_id']))
        object_uri = create_uri(VG_NS, 'O' + str(relationship['object_id']))
        predicate = relationship['predicate']

        # Link the relationship node to its subject, then to its object.
        for relation, target in ((SUBJECT_REL, subject_uri), (OBJECT_REL, object_uri)):
            all_edges.append([
                relationship_uri, relation, target, data_source, weight,
                {'image_id': image_id}
            ])

        pos_tag = ''
        for synset in relationship['synsets']:
            # Edge from the relationship to its WordNet sense.
            all_edges.append([
                relationship_uri, WORDNET_SENSE_REL,
                create_uri(WORDNET_NS, synset), data_source, weight,
                {'image_id': image_id}
            ])
            wn2label[synset].add(predicate)
            wn2image[synset].add(image_id)
            # POS is the second-to-last dotted field, e.g. 'hold.v.01' -> 'v';
            # the last synset seen wins.
            pos_tag = synset.split('.')[-2]
            all_rel_synsets.append(synset)

        # CREATE relationship node
        node_label, node_aliases = add_lowercase_labels([predicate])
        all_nodes.append([
            relationship_uri, node_label, ','.join(node_aliases), pos_tag,
            data_source, {'image_id': image_id}
        ])
    return all_nodes, all_edges, wn2label, wn2image
def add_attr_data(attrs, image_id, all_nodes, all_edges, wn2label, wn2image, attr_id):
    """Add one node plus edges for every attribute string in ``attrs``.

    ``attr_id`` is a running counter used to mint unique attribute node ids;
    the incremented value is returned so the caller can carry it forward.

    NOTE(review): ``obj_id`` (the attribute's owning object) is neither a
    parameter nor a local here — this only works if the caller sets a
    module-level ``obj_id`` before each call; otherwise it raises NameError.
    Verify and consider passing it in explicitly.
    """
    for a in attrs:
        a_id = create_uri(VG_NS, 'A' + str(attr_id))
        attr_id += 1
        # attribute-related edges
        obj_attr_edge = [
            obj_id, HAS_PROPERTY_REL, a_id, data_source, weight, {
                'image_id': image_id
            }
        ]
        all_edges.append(obj_attr_edge)
        a_pos = ''
        if a in attr_synsets:
            a_synset = attr_synsets[a]
            # edge from the attribute to its WordNet sense
            attr_wn_edge = [
                a_id, WORDNET_SENSE_REL,
                create_uri(WORDNET_NS, a_synset), data_source, weight, {
                    'image_id': image_id
                }
            ]
            all_edges.append(attr_wn_edge)
            # save wordnet data for an attribute
            wn2label[a_synset].add(a)
            wn2image[a_synset].add(image_id)
            # POS is the second-to-last dotted field of the synset name.
            a_pos = a_synset.split('.')[-2]
            all_attr_synsets.append(a_synset)
        # attribute node
        a_label, a_aliases = add_lowercase_labels([a])
        attr_node = [
            a_id, a_label, ','.join(a_aliases), a_pos, data_source, {
                'image_id': image_id
            }
        ]
        all_nodes.append(attr_node)
    return all_nodes, all_edges, wn2label, wn2image, attr_id
def add_action():
    """Add an action built from the JSON request body to the RDF store."""
    store_file = STORE['actions']
    action_label = request.json['label']
    action_uri = create_uri(BASE_URI_ACTION + sanitize(action_label))

    load_rdf_file(store_file)
    Action(resUri=action_uri, label=action_label)
    save_rdf_file(store_file)

    # A JSON body is enough to answer with HTTP 200.
    return jsonify({'uri': action_uri})
# INPUT FILES: node tables from the three graphs plus the PWN ILI mapping tabs.
cn_nodes_file = f'../output_v{VERSION}/conceptnet/nodes_v{VERSION}.csv'
wn_nodes_file = f'../output_v{VERSION}/wordnet/nodes_v{VERSION}.csv'
vg_nodes_file = f'../output_v{VERSION}/visualgenome/nodes_v{VERSION}.csv'
wordnet30_ili_file = '../input/mappings/ili-map-pwn30.tab'
wordnet31_ili_file = '../input/mappings/ili-map-pwn31.tab'

# OUTPUT FILE
output_dir = f'../output_v{VERSION}/mappings'
edges_file = f'{output_dir}/wn_wn_mappings.csv'

MOWGLI_NS = config.mowgli_ns
WORDNET_NS = config.wordnet_ns
SAMEAS_REL = create_uri(MOWGLI_NS, config.sameas)

os.makedirs(output_dir, exist_ok=True)

### Load the CN filtered data in pandas ###
# Tab-separated, no header; column 4 holds JSON metadata.
df = pd.read_csv(cnfile, sep='\t', header=None, converters={4: json.loads})
df.columns = ['assertion', 'rel', 'subj', 'obj', 'metadata']
# FIX: DataFrame.drop is not in-place — the original discarded the result,
# so the 'assertion' column was never actually removed.
df = df.drop(columns=['assertion'])
print('size of df with external URLs', len(df))

# Keep only /r/ExternalURL rows whose object points at a WordNet URL.
df_wordnet = df.loc[(df['rel'] == '/r/ExternalURL')
                    & (df['obj'].str.contains(r'http://wordnet-'))]
print('size of df with wordnet external links', len(df_wordnet))
    # NOTE(review): this excerpt starts mid-expression — the call these
    # arguments close (presumably building the absolute data directory from
    # this module's location plus DATA_DIR) begins above this chunk.
    os.path.split(os.path.abspath(__file__))[0], DATA_DIR)
# Full path to the dictionary file: <abs data dir>/<LANG>/<FILENAME>.<EXT>
DICTIONARY_FILE = concat(ABS_DATA_DIR, LANG, concat(FILENAME, EXT, sep='.'), sep='/')

if __name__ == '__main__':
    # SQLAlchemy column types for the dictionary table: a word id, the word
    # itself, and up to twelve category columns.
    cols = {
        'word_id': Integer,
        'word': String(35),
        'category_1': String(20),
        'category_2': String(20),
        'category_3': String(20),
        'category_4': String(20),
        'category_5': String(20),
        'category_6': String(20),
        'category_7': String(20),
        'category_8': String(20),
        'category_9': String(20),
        'category_10': String(20),
        'category_11': String(20),
        'category_12': String(20)
    }
    # Build the database from the tab-separated dictionary file.
    builder = Builder(filepath=DICTIONARY_FILE, sep='\t', db_uri=create_uri())
    builder.primary_col_index = 0  # 'word_id' is the primary key column
    builder.build('C', cols, LANG)
    # builder.resume('C', cols, LANG, 0, 4000)
    # print(builder.read('X', row_num = 150))
    del builder
# Provenance tag and uniform weight applied to every mapping edge.
data_source = config.mw_ds
weight = 1.0

VERSION = config.VERSION
EDGE_COLS = config.edges_cols

# INPUT FILE: curated WordNet -> Wikidata correspondences.
mapping_file = '../input/mappings/Edges_WordNet2Wikidata_New.csv'

# OUTPUT FILE
output_dir = f'../output_v{VERSION}/mappings'
edges_file = f'{output_dir}/wn_wdt_mappings.csv'

MOWGLI_NS = config.mowgli_ns
WORDNET_NS = config.wordnet_ns
SAMEAS_REL = utils.create_uri(MOWGLI_NS, config.sameas)

os.makedirs(output_dir, exist_ok=True)

# Load the mapping table (tab-separated, header row, JSON metadata in
# column 5), then persist it sorted by (subject, predicate, object).
edges_df = pd.read_csv(mapping_file, sep='\t', header=0,
                       converters={5: json.loads})
sorted_edges = edges_df.sort_values(by=['subject', 'predicate', 'object'])
sorted_edges.to_csv(edges_file, index=False, sep='\t')
print(len(edges_df), 'edges stored')
all_edges.append(edge_data) #### a. Prepare and store nodes #### all_nodes = [] for n, datasets in node_datasets.items(): label = cn.uri_to_label(n) aliases_list = [] aliases = ','.join(aliases_list) mapped_pos, raw_pos = get_cn_pos_tag(n, MOWGLI_NS, POS_MAPPING) other = {'datasets': list(datasets)} col = [n, label, aliases, raw_pos, data_source, other] all_nodes.append(col) for raw_pos, mapped_pos in POS_MAPPING.items(): mowgli_pos = create_uri(MOWGLI_NS, mapped_pos) col = [ mowgli_pos, raw_pos, mapped_pos, '', '', { "datasets": [CUSTOM_DATASET] } ] all_nodes.append(col) nodes_df = pd.DataFrame(all_nodes, columns=NODE_COLS) nodes_df.sort_values('id').to_csv(nodes_file, index=False, sep='\t') print('unique POS tags', nodes_df['pos'].unique()) print(len(nodes_df), 'nodes') #### b. Enrich and store edges #### all_edges_enriched = copy.deepcopy(all_edges)
def add_relationships_data(rels, obj2names, image_id, all_nodes, all_edges, wn2label, wn2image):
    """Add predicate nodes and subject/object/WordNet edges for one image.

    Name-based variant: subject/object node ids come from the object *names*
    recorded in ``obj2names`` (see add_object_data), so one VG relationship
    fans out over every subject-name x object-name pair.  Mutates
    ``all_nodes``/``all_edges`` in place and returns the four collections.
    """
    # Node representing the image; the predicate is linked to it at the end.
    image_node = create_uri(VG_NS, 'I' + image_id)
    image_metadata = {}
    for rel in rels:
        synsets = rel['synsets']
        pred = rel['predicate']
        # Predicate node id derives from the underscored predicate text.
        pred_id = create_uri(VG_NS, pred.replace(' ', '_'))
        sub_names = obj2names[str(rel['subject_id'])]
        obj_names = obj2names[str(rel['object_id'])]
        for sub_name in sub_names:
            sub_id = create_uri(VG_NS, sub_name.replace(' ', '_'))
            for obj_name in obj_names:
                obj_id = create_uri(VG_NS, obj_name.replace(' ', '_'))
                # Four edges per (subject, object) pair:
                # predicate -> subject
                rel_edge = [
                    pred_id, create_uri(VG_NS, 'subject'), sub_id,
                    data_source, weight, image_metadata
                ]
                all_edges.append(rel_edge)
                # predicate -> object
                rel_edge = [
                    pred_id, create_uri(VG_NS, 'object'), obj_id,
                    data_source, weight, image_metadata
                ]
                all_edges.append(rel_edge)
                # and a symmetric relatedness pair between subject and object
                rel_edge = [
                    sub_id, RELATEDTO_REL, obj_id, data_source, weight,
                    image_metadata
                ]
                all_edges.append(rel_edge)
                rel_edge = [
                    obj_id, RELATEDTO_REL, sub_id, data_source, weight,
                    image_metadata
                ]
                all_edges.append(rel_edge)
        pos = ''
        for s in synsets:
            # edge from the predicate to its WordNet sense
            rel_syn_col = [
                pred_id, WORDNET_SENSE_REL, create_uri(WORDNET_NS, s),
                data_source, weight, image_metadata
            ]
            all_edges.append(rel_syn_col)
            wn2label[s].add(pred)
            wn2image[s].add(image_id)
            # POS is the second-to-last dotted field; last synset wins.
            pos = s.split('.')[-2]
            all_rel_synsets.append(s)
        # CREATE relationship node
        label, aliases = add_lowercase_labels([pred])
        rel_node = [
            pred_id, label, ','.join(aliases), pos, data_source,
            image_metadata
        ]
        all_nodes.append(rel_node)
        # edge placing the predicate in this image
        rel_img_edge = [
            pred_id, INIMAGE_REL, image_node, data_source, weight,
            image_metadata
        ]
        all_edges.append(rel_img_edge)
    return all_nodes, all_edges, wn2label, wn2image
# Load the Visual Genome scene graphs and the attribute -> synset mapping.
with open(vg_scene_path, 'r') as f:
    images_data = json.load(f)
with open(attr_synsets_path, 'r') as f:
    attr_synsets = json.load(f)
print('num images', len(images_data))

# Column layouts and namespaces from the shared config module.
NODE_COLS = config.nodes_cols
EDGE_COLS = config.edges_cols
MOWGLI_NS = config.mowgli_ns
WORDNET_NS = config.wordnet_ns
VG_NS = config.visualgenome_ns

# Relation URIs used when emitting edges.
WORDNET_SENSE_REL = create_uri(VG_NS, config.pwordnet_sense)
SUBJECT_REL = create_uri(VG_NS, config.subject)
OBJECT_REL = create_uri(VG_NS, config.objct)
INIMAGE_REL = create_uri(VG_NS, config.in_image)
RELATEDTO_REL = config.related_to
CUSTOM_DATASET = config.custom_dataset

data_source = config.vg_ds
weight = 1.0  # uniform weight for every emitted edge
debug = False

### Load the data into two tables: nodes (from objects with attributes) and edges (from relationships) WITH deduplication ###
#### Process edges first ####
# Load the Visual Genome scene graphs and the attribute -> synset mapping.
with open(vg_scene_path, 'r') as f:
    images_data = json.load(f)
with open(attr_synsets_path, 'r') as f:
    attr_synsets = json.load(f)
print('num images', len(images_data))

# Column layouts and namespaces from the shared config module.
NODE_COLS = config.nodes_cols
EDGE_COLS = config.edges_cols
MOWGLI_NS = config.mowgli_ns
WORDNET_NS = config.wordnet_ns
VG_NS = config.visualgenome_ns

# Relation URIs used when emitting edges.
WORDNET_SENSE_REL = create_uri(VG_NS, config.pwordnet_sense)
SUBJECT_REL = create_uri(VG_NS, config.subject)
OBJECT_REL = create_uri(VG_NS, config.objct)
INIMAGE_REL = create_uri(VG_NS, config.in_image)
HAS_PROPERTY_REL = config.has_prop
CUSTOM_DATASET = config.custom_dataset

data_source = config.vg_ds
# FIX: weight was the string "1.0"; the sibling VG setup uses the float 1.0
# and every edge-building routine treats weight as a uniform numeric value,
# so the weight column mixed types depending on which script produced it.
weight = 1.0

### Load the data into two tables: nodes (from objects with attributes) and edges (from relationships) WITH deduplication ###
#### Process edges first ####
all_edges = []