def generate():
    """Dump every rdfs:subClassOf triple of an HDT file to a CSV file.

    The data set is chosen by the first command-line argument: ``'lod'``
    opens ``PATH_LOD`` and writes ``all_lod_subClassOf.csv``; any other value
    opens ``PATH_DBpedia`` and writes ``all_dbpedia_subClassOf.csv``.

    Each output row holds the subject ID, subject term, object ID and object
    term of one triple. The number of rows written is printed at the end.

    Raises:
        IndexError: if no command-line argument was given.
    """
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
        output_filename = 'all_lod_subClassOf.csv'
    else:
        hdt_file = HDTDocument(PATH_DBpedia)
        output_filename = 'all_dbpedia_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"

    count = 0
    with open(output_filename, 'w', newline='') as file:
        # The cardinality returned next to the iterator is not needed here.
        (subclass_triples, _) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        for (s, _, o) in subclass_triples:
            # IDs depend on the position of the term, so convert each side
            # with its own position.
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            writer.writerow([s_id, s, o_id, o])
            count += 1
    print('total entries = ', count)
class SubM:
    """Directed graph of rdfs:subClassOf relations read from an HDT file.

    Nodes of ``self.graph`` are HDT term IDs and an edge ``(s, o)`` stands
    for the triple ``s rdfs:subClassOf o``. The methods prune the graph
    (leaf classes, reflexive edges, edges duplicated by equivalence
    statements) and search what is left for cycles.
    """

    # Initializer
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        """Open the HDT file and resolve the predicate IDs used in queries.

        Args:
            path_hdt: path of the HDT file to open.
            path_eq: kept for the equivalence-class manager, which is
                currently disabled (``equi_graph_manager`` stays ``None``).
        """
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            self.subClassOf, IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            self.equivalent, IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()
        self.equi_graph_manager = None  # equiClassManager(path_eq)
        self.diagnosed_relations = []  # the result: (sub, sup, comment)
        # from the manual decision and Joe's sameAs data: (s_id, o_id, sug)
        self.suggestion_on_relations = []
        self.leaf_classes = set()

    def setup_graph(self):
        """Load every triple with rdfs:subClassOf as predicate into the graph."""
        (subclass_triple_ids, _) = self.enquiry(
            query=(0, self.id_subClassOf, 0), mode="default")
        self.graph.add_edges_from(
            (s_id, o_id) for (s_id, _, o_id) in subclass_triple_ids)

    def filter_leaf_classes(self):
        """Remove every class that has no subclass.

        For the sake of efficiency the leaf nodes are dropped: a class
        without subclasses cannot participate in any cycle.
        """
        for c in self.graph.nodes:
            # A node is a leaf when nothing is declared a subclass of it.
            (_, cardi) = self.enquiry(
                query=(0, self.id_subClassOf, c), mode="default")
            if cardi == 0:
                self.leaf_classes.add(c)
        print('there are a total of', len(self.leaf_classes),
              'leaf nodes removed')
        for c in self.leaf_classes:
            self.remove_class(c)

    def remove_class(self, c, comment='remove'):
        """Remove node ``c`` and all its edges, if present (cf. networkx).

        ``comment`` is accepted for symmetry with ``remove_relation`` but is
        not recorded anywhere.
        """
        if self.graph.has_node(c):
            self.graph.remove_node(c)  # also removes all related edges

    def remove_class_from(self, cs, comment='remove'):
        """Remove every node in ``cs`` (cf. networkx ``remove_nodes_from``)."""
        for c in cs:
            self.remove_class(c, comment)

    def enquiry(self, query, mode="subm"):
        """Run an ID-based triple pattern query against the HDT file.

        This is for future use of the SUBMASSIVE system. Only
        ``mode="default"`` is implemented; any other mode returns ``None``.

        Args:
            query: ``(s, p, o)`` tuple of term IDs, with 0 as wildcard.
            mode: ``"default"`` to query the HDT file directly.

        Returns:
            ``(iterator, cardinality)`` as returned by
            ``HDTDocument.search_triples_ids``, or ``None``.
        """
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        # examine the filtered part first (not implemented yet)
        return None

    def remove_relation(self, sub, sup, comment='remove'):
        """Remove the edge ``sub -> sup`` and record it as diagnosed.

        The removed relation is appended to ``self.diagnosed_relations`` as
        ``(sub, sup, comment)``. The original code called a
        ``diagnose_relations`` method that is not defined on this class.
        """
        if self.graph.has_edge(sub, sup):
            self.graph.remove_edge(sub, sup)
            self.diagnosed_relations.append((sub, sup, comment))

    def remove_relation_from(self, relation_list, comment='remove'):
        """Remove every ``(sub, sup)`` edge listed in ``relation_list``."""
        for (sub, sup) in relation_list:
            self.remove_relation(sub, sup, comment)

    def convert_to_id(self, term):
        """Return the subject-position ID of ``term``.

        There is only one term that has a different ID when retrieved as
        subject or as object; it is mapped to a fixed ID.
        """
        if term == "akt742:Intangible-Thing":
            return 2601100675
        return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        """Return the term for ``id`` (inverse of ``convert_to_id``)."""
        if id == 2601100675:
            return "akt742:Intangible-Thing"
        return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def filter_reflexsive(self):
        """Remove the reflexive edges and log them to ``reflexive.csv``."""
        to_remove = {(l, r) for (l, r) in self.graph.edges() if l == r}
        print('Number of removed reflexive relations', len(to_remove))
        # The context manager guarantees the log is flushed and closed.
        with open('reflexive.csv', 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            for (l, r) in to_remove:
                l_term = self.convert_to_term(l)
                r_term = self.convert_to_term(r)
                writer.writerow([l, l_term, r, r_term, 'remove', 'o'])
        self.graph.remove_edges_from(list(to_remove))

    def print_graph_info(self):
        """Print the number of nodes and edges of the graph."""
        print('there are ', len(self.graph.nodes()), ' nodes')
        print('there are ', len(self.graph.edges()), ' edges')

    def obtain_unnecessary_relations(self):
        """Remove subClassOf edges that duplicate an equivalence statement.

        Each edge is compared against the owl:equivalentClass triples of the
        HDT file (label ``'i'``) and, when an equivalence manager is
        configured, against its sameAs data (label ``'s'``). Labelled edges
        are written to ``equivalent-unnecessary-relations.csv`` and removed
        from the graph.
        """
        to_remove = set()
        count_i = 0
        count_s = 0
        with open('equivalent-unnecessary-relations.csv', 'w',
                  newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            for e in self.graph.edges():
                label = ''
                (l, r) = e
                l_term = self.convert_to_term(l)
                r_term = self.convert_to_term(r)
                # Step 1: equivalentClass declared in either direction
                (_, cardinality1) = self.enquiry(
                    query=(l, self.id_equivalentClass, r), mode="default")
                (_, cardinality2) = self.enquiry(
                    query=(r, self.id_equivalentClass, l), mode="default")
                if cardinality1 == 1 or cardinality2 == 1:
                    label = 'i'
                    count_i += 1
                # Step 2: owl:sameAs. The manager is disabled in __init__,
                # so skip this step instead of failing on None.
                if (self.equi_graph_manager is not None
                        and self.equi_graph_manager.test_equivalent(
                            l_term, r_term)):
                    label += 's'
                    count_s += 1
                if label != '':
                    to_remove.add(e)
                    writer.writerow([l, l_term, r, r_term, 'remove', label])
        print('count_s = ', count_s)
        print('count_i = ', count_i)
        print('Number of removed unnecessary relations', len(to_remove))
        self.graph.remove_edges_from(list(to_remove))

    def load_unnecessary_relations(self):
        """Load ``equivalent-unnecessary-relations.csv`` into the suggestions.

        For the sake of memory efficiency these relations can be loaded
        directly instead of being recomputed, because the sameAs data is
        very big. Rows are appended to ``self.suggestion_on_relations``.
        """
        with open('equivalent-unnecessary-relations.csv', 'r') as eq_file:
            for row in csv.DictReader(eq_file):
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]  # should be remove
                self.suggestion_on_relations.append((s_id, o_id, sug))
        print(len(self.suggestion_on_relations), ' total relations loaded')

    def load_manually_decided_relations(self):
        """Load the manual decisions on size-two cycles.

        Rows of ``lod-two-cycle.csv`` are appended to
        ``self.suggestion_on_relations``.

        Returns:
            List of all subject and object IDs seen, in file order.
        """
        coll_nodes = []
        with open('lod-two-cycle.csv', 'r') as man_file:
            for row in csv.DictReader(man_file):
                s_id = int(row["SUBJECT_ID"])
                o_id = int(row["OBJECT_ID"])
                sug = row["SUGGESTION"]
                coll_nodes.append(s_id)
                coll_nodes.append(o_id)
                self.suggestion_on_relations.append((s_id, o_id, sug))
        print(len(self.suggestion_on_relations), ' total relations loaded')
        return coll_nodes

    def find_nodes_in_cycles(self, hint_nodes, max, found_min):
        """Collect nodes that take part in at least one cycle.

        Works on a copy of the graph: a cycle is searched (first from the
        hint nodes, then anywhere), one randomly chosen edge of it is
        removed, and the search repeats until no cycle is left or until at
        least ``max`` nodes and ``found_min`` cycles have been found.

        Args:
            hint_nodes: list of nodes to start the search from.
            max: stop once this many distinct nodes were collected ...
            found_min: ... and at least this many cycles were found.

        Returns:
            List of the collected node IDs.
        """
        tmp_graph = self.graph.copy()
        nodes = set()
        count_found_cycles = 0
        while True:
            try:
                try:
                    c = nx.find_cycle(tmp_graph, hint_nodes)
                except Exception:
                    # Deliberately broad: the hints may lead to no cycle or
                    # may no longer be usable; fall back to a global search.
                    c = nx.find_cycle(tmp_graph)
            except nx.NetworkXNoCycle as e:
                # There is no cycle anymore.
                print(e)
                break
            count_found_cycles += 1
            print('Found Cyclce ', count_found_cycles, ' is: ', c)
            c_nodes = [x for (x, y) in c]
            # Break the cycle so the next search finds a different one.
            (l_tmp, r_tmp) = random.choice(c)
            tmp_graph.remove_edge(l_tmp, r_tmp)
            nodes.update(c_nodes)
            if len(nodes) >= max and count_found_cycles >= found_min:
                print('total nodes = ', len(nodes))
                break
            hint_nodes = c_nodes + list(hint_nodes)
        nodes = list(nodes)
        print('there are in total ', len(nodes),
              ' nodes that participate in cycles')
        print(nodes)
        return nodes

    def get_cycles_from_nodes(self, nodes):
        """Return the simple cycles among ``nodes`` as lists of edges.

        Args:
            nodes: node IDs inducing the subgraph to examine.

        Returns:
            A list (not a set) with one entry per simple cycle; each entry
            is the list of ``(source, target)`` edges of that cycle, with
            the last edge closing it back to the first node.
        """
        subg = self.graph.subgraph(nodes)
        simp_c = list(nx.simple_cycles(subg))
        print(' and these nodes has ', len(simp_c),
              ' simple cycles among them')
        coll_cycles = []
        for c in simp_c:
            # Pair each node with its successor; the modulo wraps the last
            # node around to the first one.
            size = len(c)
            coll_cycles.append(
                [(c[i], c[(i + 1) % size]) for i in range(size)])
        return coll_cycles  # get ready for encoding
def generate_reduced():
    """Print subject/object ID information for hard-coded cycle members.

    The HDT file is chosen by the first command-line argument (``'lod'``
    opens ``PATH_LOD``, anything else ``PATH_DBpedia``). For every ID in two
    hard-coded groups of size-three cycles the function prints the ID, the
    term it resolves to as a subject, and the IDs that term has in subject
    and object position.

    Raises:
        IndexError: if no command-line argument was given.
    """
    if sys.argv[1] == 'lod':
        hdt_file = HDTDocument(PATH_LOD)
    else:
        hdt_file = HDTDocument(PATH_DBpedia)

    def describe_cycles(cycles, heading):
        # Print the ID round trip (ID -> term -> subject/object ID) for
        # each member of each cycle.
        for c in cycles:
            print(heading, c)
            for n in c:
                print('id =', n)
                name = hdt_file.convert_id(int(n), IdentifierPosition.Subject)
                print('name = ', name)
                s_id = hdt_file.convert_term(name, IdentifierPosition.Subject)
                print('when its subject = ', s_id)
                o_id = hdt_file.convert_term(name, IdentifierPosition.Object)
                print('when its object = ', o_id)

    first_group = [
        ['196338233', '196338418', '196338419'],
        ['196338233', '196338325', '196338412'],
        ['196337995', '196338014', '196338013'],
        ['196338014', '196338063', '196338410'],
    ]
    describe_cycles(first_group, '\n\n this cycle = ')

    print('==================================')

    second_group = [
        ['1193056652', '1193056593', '1193056657'],
        ['1146303708', '1146299369', '1146331327'],
        ['196338400', '196338312', '196338288'],
        ['196338013', '196337995', '196338014'],
        ['196338242', '196338410', '196337957'],
        ['196338418', '196338419', '196338233'],
        ['196338233', '196338325', '196338412'],
        ['196338014', '196338063', '196338410'],
        ['196338014', '196337975', '196338007'],
        ['196338050', '196338049', '196337975'],
        ['196338197', '196338462', '196338406'],
        ['196338220', '196338217', '196338034'],
        ['196338145', '196338152', '196338419'],
        ['196338288', '196338116', '196337978'],
        ['196338070', '196338360', '196338241'],
        ['114657709', '114657713', '125181834'],
    ]
    describe_cycles(second_group, '\n\n that cycle = ')
# NOTE(review): the line below begins with `return u"", exception.end`, the tail of a function whose `def` is not visible in this chunk (presumably the `strict_handler` registered right after it -- confirm). It is followed by statements that register that handler for the "strict" codec error mode, define data paths, open the LOD-a-lot HDT file, resolve predicate IDs and print document statistics. Because the line breaks were lost, Python treats everything after the first `#` on this line as a comment. TODO: restore the original line breaks.
return u"", exception.end codecs.register_error("strict", strict_handler) PATH_LOD = "/scratch/wbeek/data/LOD-a-lot/data.hdt" PATH_SAMEAS_NETWORK = "/home/jraad/ssd/data/identity-data/" PATH_ID2TERMS_099 = "/home/jraad/ssd/data/identity-data-0_99/id2terms_0-99.csv" PATH_TERM2ID_099 = "/home/jraad/ssd/data/identity-data-0_99/term2id_0-99.csv" # load the LOD-a-lot HDT file hdt_lod = HDTDocument(PATH_LOD) # these identifiers will be used later to query the HDT file using their IDs id_type = hdt_lod.convert_term( "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", IdentifierPosition.Predicate) id_sameAs = hdt_lod.convert_term("http://www.w3.org/2002/07/owl#sameAs", IdentifierPosition.Predicate) id_subClassOf = hdt_lod.convert_term( "http://www.w3.org/2000/01/rdf-schema#subClassOf", IdentifierPosition.Predicate) id_equivalentClass = hdt_lod.convert_term( "http://www.w3.org/2002/07/owl#equivalentClass", IdentifierPosition.Predicate) # output some stats of LOD-a-lot # we can query the HDT file using the term IDs (e.g. rdf:type and equivalentClass) or the URIs (e.g. subClassOf and sameAs) print("# subjects:", "{:,}".format(hdt_lod.nb_subjects)) print("# predicates:", "{:,}".format(hdt_lod.nb_predicates)) print("# objects:", "{:,}".format(hdt_lod.nb_objects))
def generate_reduced():
    """Reduce the rdfs:subClassOf graph of LOD-a-lot and export what is left.

    Step 1 scans all subClassOf triples, writing reflexive statements to
    ``lod_reflexive_classes.csv`` and leaf classes (classes without any
    subclass) to ``lod_leaf_classes.csv``.
    Step 2 marks "near-leaf" superclasses, i.e. classes whose subclasses
    are all leaves, as removed intermediate nodes and queues their
    superclasses for exploration.
    Step 3 repeatedly pops a queued class and removes it when all of its
    subclasses are leaves or already removed; it stops when the queue is
    empty or no progress was made during 10000 iterations.
    Finally every non-reflexive triple whose subject was neither a leaf nor
    removed is written to ``further_reduced_lod_subClassOf.csv``.

    ``reduced_lod_subClassOf.csv`` only receives a header row; the code
    that filled it is disabled.
    """
    hdt_file = HDTDocument(PATH_LOD)
    # The assignment of output_filename had been commented out although the
    # file is still opened below, which raised a NameError.
    output_filename = 'reduced_lod_subClassOf.csv'
    output_selfloopClass_filename = 'lod_reflexive_classes.csv'
    output_leafClass_filename = 'lod_leaf_classes.csv'
    output_intermediateClass_filename = 'further_reduced_lod_subClassOf.csv'

    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
    id_subClassOf = hdt_file.convert_term(subClassOf,
                                          IdentifierPosition.Predicate)

    count = 0
    count_selfloop = 0
    count_leaf = 0
    count_left = 0  # never incremented: the code that used it is disabled
    count_output_after_further_reduced = 0  # left after further reduction

    (subclass_triples, _) = hdt_file.search_triples("", subClassOf, "")
    to_explore_ids = set()  # to iterate through
    leaf_ids = set()
    removed_intermediate_ids = set()  # removed intermediate nodes
    all_ids = set()

    def ratio(part, whole):
        # Avoid ZeroDivisionError when the HDT file has no subClassOf triple.
        return part / whole if whole else 0.0

    with open(output_filename, 'w', newline='') as file, \
            open(output_intermediateClass_filename, 'w',
                 newline='') as inter_file:
        writer = csv.writer(file)
        writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
        writer_inter = csv.writer(inter_file)
        writer_inter.writerow(
            ["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])

        # Step 1: record self-loops and leaf nodes
        with open(output_selfloopClass_filename, 'w',
                  newline='') as selfloop_file, \
                open(output_leafClass_filename, 'w',
                     newline='') as leaf_file:
            writer_selfloop = csv.writer(selfloop_file)
            writer_selfloop.writerow(["ID", "URI"])
            writer_leaf = csv.writer(leaf_file)
            writer_leaf.writerow(["ID", "URI"])

            for (s, _, o) in subclass_triples:
                s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
                o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
                all_ids.add(s_id)
                all_ids.add(o_id)
                count += 1
                if s == o:  # self loop
                    count_selfloop += 1
                    writer_selfloop.writerow([s_id, s])
                    continue
                # A class is a leaf when nothing is a subclass of it.
                (_, leaf_cardinality) = hdt_file.search_triples(
                    "", subClassOf, s)
                if leaf_cardinality == 0:
                    writer_leaf.writerow([s_id, s])
                    leaf_ids.add(s_id)
                    count_leaf += 1

        print('count leaf statements = ', count_leaf)
        print('count leaf (as set) = ', len(leaf_ids))
        print('count total statements = ', count)
        print('count_total nodes (as set) = ', len(all_ids))
        print('NOW Part 2: Further Reduce ')

        # Step 2: further reduce via the superclasses of the leaves
        visited_sup = set()
        count_one = 0
        count_loop = 0
        for l_id in leaf_ids:
            count_loop += 1
            (leaf_iter, _) = hdt_file.search_triples_ids(
                l_id, id_subClassOf, 0)
            # Materialise the iterator: the original consumed its only
            # element with .next() and then iterated the exhausted
            # iterator, so a sole superclass was never examined.
            leaf_triples = list(leaf_iter)
            finished_this_leaf = False
            if len(leaf_triples) == 1:
                (_, _, sup_id) = leaf_triples[0]
                (_, sub_cardinality) = hdt_file.search_triples_ids(
                    0, id_subClassOf, sup_id)
                if sub_cardinality == 1:
                    # The superclass has this leaf as its only subclass:
                    # remove the superclass.
                    count_one += 1
                    removed_intermediate_ids.add(sup_id)
                    visited_sup.add(sup_id)
                    (supsup_triples, _) = hdt_file.search_triples_ids(
                        sup_id, id_subClassOf, 0)
                    for (_, _, supsup_id) in supsup_triples:
                        to_explore_ids.add(supsup_id)
                    finished_this_leaf = True

            # normal process
            if not finished_this_leaf:
                for (_, _, sup_id) in leaf_triples:
                    if sup_id in visited_sup:
                        continue
                    (sup_triples, cardinality_back) = \
                        hdt_file.search_triples_ids(0, id_subClassOf, sup_id)
                    # True if this superclass only has leaf nodes as
                    # children. The original assigned to a misspelled
                    # `sup_flag`, so the test could never fail.
                    supflag = cardinality_back == 1 or all(
                        child_id in leaf_ids
                        for (child_id, _, _) in sup_triples)
                    if supflag:
                        removed_intermediate_ids.add(sup_id)
                        (supsup_triples, _) = hdt_file.search_triples_ids(
                            sup_id, id_subClassOf, 0)
                        for (_, _, supsup_id) in supsup_triples:
                            to_explore_ids.add(supsup_id)
                    else:
                        to_explore_ids.add(sup_id)
                    visited_sup.add(sup_id)

            if count_loop % 100000 == 0:
                print('leaf nodes processed:', count_loop)

        print('count one = ', count_one)
        print('near-leaf nodes = ', len(removed_intermediate_ids))
        print('total visited nodes = ', len(visited_sup))
        print('non-near-leaf nodes = ',
              len(visited_sup) - len(removed_intermediate_ids))
        print('to explore = ', len(to_explore_ids))

        print('*********** after this data processing, we have only ',
              len(to_explore_ids), ' to explore for the next step')

        # Step 3: finished data pre-processing, explore the queue
        record_to_explore_size = len(to_explore_ids)
        record_iteration = 0
        continue_flag = True
        while to_explore_ids and continue_flag:
            record_iteration += 1
            n_id = to_explore_ids.pop()
            (triples_id, _) = hdt_file.search_triples_ids(
                0, id_subClassOf, n_id)
            # Removable when every subclass is a leaf or an already removed
            # intermediate node.
            removable = all(
                ns_id in leaf_ids or ns_id in removed_intermediate_ids
                for (ns_id, _, _) in triples_id)
            if removable:
                removed_intermediate_ids.add(n_id)
            else:
                to_explore_ids.add(n_id)  # add back :(

            if record_iteration == 10000:
                if record_to_explore_size != len(to_explore_ids):
                    print('total leaf nodes = ', len(leaf_ids))
                    print('accummulated removed intermediate = ',
                          len(removed_intermediate_ids))
                    print('still to explore = ', len(to_explore_ids))
                    print('record to explore = ', record_to_explore_size)
                    print('changed = ',
                          record_to_explore_size - len(to_explore_ids))
                    record_iteration = 0
                    record_to_explore_size = len(to_explore_ids)
                else:
                    # No progress during the last 10000 iterations: stop.
                    continue_flag = False

        # write down what is left after removing the intermediate nodes
        print('*****size of leaf:', len(leaf_ids))
        print('*****size of removed intermediate node :',
              len(removed_intermediate_ids))

        (subclass_triples, _) = hdt_file.search_triples("", subClassOf, "")
        for (s, _, o) in subclass_triples:
            if s == o:
                continue
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            # keep s only if it is neither a leaf nor a removed node
            if (s_id not in leaf_ids
                    and s_id not in removed_intermediate_ids):
                count_output_after_further_reduced += 1
                writer_inter.writerow([s_id, s, o_id, o])

    print('total entries = ', count)
    print('total self-loops = ', count_selfloop)
    print('total leaf nodes/classes = ', count_leaf)
    print('total left = ', count_left)
    print('perfectage of reduction: ', ratio(count_left, count))
    print('=====AFTER FURTHER REDUCTION ======')
    print('There are only ', count_output_after_further_reduced)
    print('perfectage of reduction: ',
          ratio(count_output_after_further_reduced, count))
class SubP:
    """Directed graph of rdfs:subPropertyOf relations read from an HDT file.

    Nodes of ``self.graph`` are HDT term IDs and an edge ``(s, o)`` stands
    for the triple ``s rdfs:subPropertyOf o``. The class exports the cycles
    of that graph for review, applies recorded removals and serialises the
    remaining graph as N-Triples.
    """

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD, path_eq=PATH_EQ):
        """Open the HDT file and resolve the predicate IDs used in queries.

        Args:
            path_hdt: path of the HDT file to open.
            path_eq: kept for the equivalence-class manager, which is
                currently disabled (``equi_graph_manager`` stays ``None``).
        """
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            self.subClassOf, IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            self.equivalent, IdentifierPosition.Predicate)
        self.subPropertyOf = "http://www.w3.org/2000/01/rdf-schema#subPropertyOf"
        self.id_subPropertyOf = self.hdt.convert_term(
            self.subPropertyOf, IdentifierPosition.Predicate)
        self.equivalentProperty = "http://www.w3.org/2002/07/owl#equivalentProperty"
        self.id_equivalentProperty = self.hdt.convert_term(
            self.equivalentProperty, IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()
        self.equi_graph_manager = None  # equiClassManager(path_eq)
        print('set up the equivalence class manager')
        self.diagnosed_relations = []  # the result
        # from the manual decision and Joe's sameAs data. Triple
        self.suggestion_on_relations = []
        self.leaf_classes = set()
        print('finished initialization')

    def setup_graph(self):
        """Load every rdfs:subPropertyOf triple into the graph."""
        print('set up the graph')
        (subprop_triple_ids, _) = self.enquiry(
            query=(0, self.id_subPropertyOf, 0), mode="default")
        collect_pairs = [(s_id, o_id) for (s_id, _, o_id) in subprop_triple_ids]
        print('there are ', len(collect_pairs), 'edges')
        self.graph.add_edges_from(collect_pairs)

    def convert_to_id(self, term):
        """Return the subject-position ID of ``term``.

        One term has different IDs as subject and as object; it is mapped
        to a fixed ID.
        """
        if term == "akt742:Intangible-Thing":
            return 2601100675
        return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        """Return the term for ``id`` (inverse of ``convert_to_id``)."""
        if id == 2601100675:
            return "akt742:Intangible-Thing"
        return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def enquiry(self, query, mode="subp"):
        """Run an ID-based triple pattern query against the HDT file.

        Only ``mode="default"`` is implemented; any other mode returns
        ``None``.

        Args:
            query: ``(s, p, o)`` tuple of term IDs, with 0 as wildcard.
            mode: ``"default"`` to query the HDT file directly.
        """
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        # examine the filtered part first (not implemented yet)
        return None

    def print_info(self, sbj, obj):
        """Print the labels and comments of two terms to support a decision.

        Args:
            sbj: subject term (URI string).
            obj: object term (URI string).
        """
        predicate_names = [
            "http://sw.cyc.com/CycAnnotations_v1#label",
            "http://www.w3.org/2000/01/rdf-schema#comment",
            "http://www.w3.org/2000/01/rdf-schema#label"
        ]
        for (heading, term) in (('SUBJECT: ', sbj), ('OBJECT: ', obj)):
            print(heading, term)
            for predicate in predicate_names:
                (triples, _) = self.hdt.search_triples(term, predicate, "")
                for (_, p, o) in triples:
                    print('\tPREDICATE: ', p)
                    print('\t\t Comments/labels :', o, '\n')
        print('\n\n========================\n\n')

    def export_cycle(self, file_name='pre-subP.csv'):
        """Classify all simple cycles of the graph and export their edges.

        Cycles are split into reflexive ones (decision ``'o'``), size-two
        cycles backed by an owl:equivalentProperty triple (``'e'``), other
        size-two cycles (``'2'``, both directions, with labels printed for
        manual review) and bigger cycles (``'x'``, every edge of the cycle).

        Args:
            file_name: CSV file to write. The original body referenced an
                undefined ``file_name``; the default matches the file read
                by ``load_removed``.
        """
        simp_c = list(nx.simple_cycles(self.graph))
        print('find simple cycle in graph')
        print('there are ', len(simp_c), ' simple cycles')
        count_sameas = 0  # the sameAs check is disabled, so this stays 0
        collect_self_loop = []
        collect_eq = []
        collect_others = []
        collect_bigger = []
        count_bigger = 0
        for c in simp_c:
            size = len(c)
            if size == 1:
                collect_self_loop.append(c)
            elif size == 2:
                (_, cardinality) = self.enquiry(
                    query=(c[0], self.id_equivalentProperty, c[1]),
                    mode="default")
                if cardinality > 0:
                    print('There is a owl:equivalentProperty in between')
                    collect_eq.append(c)
                else:
                    collect_others.append(c)
            else:
                count_bigger += 1
                # Export every edge of the cycle. The original only used
                # c[0..2], which for cycles longer than three nodes wrote a
                # closing edge that is not part of the graph.
                collect_bigger.extend(
                    (c[i], c[(i + 1) % size]) for i in range(size))
        print('there are ', len(collect_self_loop), ' reflexive cycles')
        print('there are ', count_sameas, ' sameAs relations')
        print('there are ', len(collect_eq), ' eqProp relations')
        print('there are ', len(collect_others), ' size-two cycles')
        print('there are ', count_bigger, ' bigger cycles')

        with open(file_name, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow([
                "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
                "DECISION"
            ])
            for [s_id] in collect_self_loop:
                s_term = self.convert_to_term(s_id)
                # removed from automatic method
                writer.writerow([s_id, s_term, s_id, s_term, 'remove', 'o'])
            for (s_id, o_id) in collect_eq:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # removed from automatic method
                writer.writerow([s_id, s_term, o_id, o_term, 'remove', 'e'])
            for (s_id, o_id) in collect_others:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                self.print_info(s_term, o_term)
                # removed from manual step, both directions
                writer.writerow([s_id, s_term, o_id, o_term, 'remove', '2'])
                writer.writerow([o_id, o_term, s_id, s_term, 'remove', '2'])
            for (s_id, o_id) in collect_bigger:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # removed from manual step
                writer.writerow([s_id, s_term, o_id, o_term, 'remove', 'x'])

    def load_removed(self):
        """Remove from the graph every edge marked 'remove' in pre-subP.csv."""
        coll_removed = []
        with open('pre-subP.csv', 'r') as subp_file:
            for row in csv.DictReader(subp_file):
                if row["SUGGESTION"] == 'remove':
                    coll_removed.append(
                        (int(row["SUBJECT_ID"]), int(row["OBJECT_ID"])))
        print('number of removed edges:', len(coll_removed))
        self.graph.remove_edges_from(coll_removed)

    def test_cycle(self):
        """Print one cycle of the graph, or 'no cycle' if there is none."""
        try:
            c = nx.find_cycle(self.graph)
        except nx.NetworkXNoCycle:
            # Only the "no cycle" signal is expected; other errors surface.
            print('no cycle')
        else:
            print('cycle = ', c)

    def export_graph_nt(self, name):
        """Serialise the graph as rdfs:subPropertyOf triples in N-Triples.

        Args:
            name: destination file of the serialisation.
        """
        g = Graph()
        sub_property_of = URIRef(
            "http://www.w3.org/2000/01/rdf-schema#subPropertyOf")
        for (s_id, o_id) in self.graph.edges:
            s_term = self.convert_to_term(s_id)
            o_term = self.convert_to_term(o_id)
            g.add((URIRef(s_term), sub_property_of, URIRef(o_term)))
        g.serialize(destination=name, format='nt')
def generate():
    """Interactively decide what to do with size-two subClassOf cycles.

    Every pair of classes that are declared subclasses of each other, and
    that is not already listed in ``equivalent-unnecessary-relations.csv``,
    is shown to the user together with its domains and labels. The user
    answers with one of:

    * ``x`` - meaningless, remove both directions
    * ``l`` - remove ``s -> o``, keep ``o -> s``
    * ``r`` - keep ``s -> o``, remove ``o -> s``
    * ``e`` / ``s`` - equivalent classes, remove both directions
    * ``u`` - unknown, left for the automatic approach

    Decisions are written to ``lod-two-cycle.csv`` (two rows per pair) and
    summary counts per decision and per domain are printed at the end.
    """
    hdt_file = HDTDocument(PATH_LOD)
    output_filename = 'lod-two-cycle.csv'
    subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"

    # decision -> (suggestion for s->o, suggestion for o->s, recorded code)
    suggestions = {
        'x': ('remove', 'remove', 'x'),
        'l': ('remove', 'remain', 'l'),
        'r': ('remain', 'remove', 'r'),
        'e': ('remove', 'remove', 'e'),
        's': ('remove', 'remove', 'e'),
        'u': ('unknown', 'unknown', 'u'),
    }

    count = 0
    decision_totals = Counter()  # number of pairs per recorded code
    cnt_removed = Counter()
    cnt_left = Counter()
    cnt_right = Counter()
    cnt_both = Counter()
    cnt_sameas = Counter()
    cnt_unknown = Counter()
    # recorded code -> per-domain counters to bump for that decision
    domain_counters = {
        'x': (cnt_removed,),
        'l': (cnt_left, cnt_both),
        'r': (cnt_right, cnt_both),
        'e': (cnt_sameas,),
        'u': (cnt_unknown,),
    }

    # IDs are converted to int so they can match the integer IDs returned
    # by convert_term below; as strings the membership test never matched.
    with open('equivalent-unnecessary-relations.csv', 'r') as eq_file:
        eq_pair_ids = {(int(row["SUBJECT_ID"]), int(row["OBJECT_ID"]))
                       for row in csv.DictReader(eq_file)}

    visited_pairs = set()

    with open(output_filename, 'w', newline='') as file:
        (subclass_triples, _) = hdt_file.search_triples("", subClassOf, "")
        writer = csv.writer(file)
        writer.writerow([
            "SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT", "SUGGESTION",
            "DECISION"
        ])
        for (s, _, o) in subclass_triples:
            if s == o:  # a self-loop, not a size-two cycle
                continue
            s_id = hdt_file.convert_term(s, IdentifierPosition.Subject)
            o_id = hdt_file.convert_term(o, IdentifierPosition.Object)
            (reverse_subclass_triples, reverse_cardinality) = \
                hdt_file.search_triples(o, subClassOf, s)

            if reverse_cardinality > 1:
                print('ERROR: there are multiple rdfs:subClassOf edges: ',
                      reverse_cardinality)
                print(s, '\t and \t', o)
                for (s_tmp, p_tmp, o_tmp) in reverse_subclass_triples:
                    print('s = ', s_tmp)
                    print('p = ', p_tmp)
                    print('o = ', o_tmp)
                continue
            if reverse_cardinality != 1:
                continue  # no link back, so no size-two cycle

            # there is a reverse link back
            if (s_id, o_id) in eq_pair_ids or (o_id, s_id) in eq_pair_ids:
                print('this is in the equivalence pair, skip it')
                print(s, '\n', o, '\n\n\n')
                continue
            if (s, o) in visited_pairs or (o, s) in visited_pairs:
                continue

            # ask the user to deal with it
            print('sbj=\t', s)
            print('obj=\t', o)
            s_domain = tldextract.extract(s).domain
            o_domain = tldextract.extract(o).domain
            print('s_domain = ', s_domain)
            print('o_domain = ', o_domain)
            print_ino(s, hdt_file)
            print_ino(o, hdt_file)
            decision = input()
            count += 1

            if decision in suggestions:
                (forward, backward, code) = suggestions[decision]
                writer.writerow([s_id, s, o_id, o, forward, code])
                writer.writerow([o_id, o, s_id, s, backward, code])
                decision_totals[code] += 1
                for counter in domain_counters[code]:
                    counter[s_domain] += 1
                    counter[o_domain] += 1
            else:
                print('user input error')
            visited_pairs.add((s, o))
            visited_pairs.add((o, s))

    print('count total pairs = ', count)
    print('count removed = ', decision_totals['x'])
    print(cnt_removed)
    print('count left = ', decision_totals['l'])
    print(cnt_left)
    print('count right = ', decision_totals['r'])
    print(cnt_right)
    print('===both====')
    print(cnt_both)
    print('===both====')
    print('count equivalent class = ', decision_totals['e'])
    print(cnt_sameas)
    print('count undecided/unknown', decision_totals['u'])
    print(cnt_unknown)
class HdtExecutor(Executor):
    """Executor that answers triple-pattern queries against an HDT document.

    Every public query method is wrapped by the ``cached`` decorator and
    returns fully materialised lists rather than lazy iterators.
    """

    def __init__(self, hdt_path: Optional[str] = None, graph: Optional[HDTDocument] = None,
                 redis_client: Optional[redis.Redis] = None):
        """Wrap an already opened HDT document or open one from *hdt_path*.

        Args:
            hdt_path: Path of the HDT file; only used when *graph* is not given.
            graph: An already opened ``HDTDocument`` to query.
            redis_client: Stored as ``self.cache``.
                NOTE(review): presumably consumed by the ``cached`` decorator;
                confirm against its definition.
        """
        self.cache = redis_client
        # Test against None explicitly so that a supplied document is never
        # discarded just because it evaluates as falsy.
        if graph is not None:
            self.graph = graph
        else:
            self.graph = HDTDocument(hdt_path, map=False, progress=True)

    @cached
    def triples(self, subject: Optional[str] = '', predicate: Optional[str] = '',
                object: Optional[str] = '') -> Tuple[List[Tuple[str, str, str]], int]:
        """Return the triples matching the given pattern together with their count.

        ``''`` and ``None`` both act as a wildcard for the respective position.

        Returns:
            A ``(triples, count)`` pair where ``triples`` is a list built from
            the search result and ``count`` is the number reported by the
            underlying ``search_triples`` call.
        """
        # The convenience wrappers below default their arguments to None,
        # while the HDT search uses the empty string as the wildcard.
        result_iter, count = self.graph.search_triples(
            subject or '', predicate or '', object or '')
        return list(result_iter), count

    @cached
    def join(self, patterns: List[Tuple[str, str, str]], outvar: Optional[str] = None) -> Iterable:
        """Join a list of basic graph patterns.

        Args:
            patterns: Triple patterns; terms starting with ``?`` are variables.
            outvar: When given, only the values bound to this variable are
                returned.

        Returns:
            The list of values bound to *outvar* if it is given, otherwise the
            list of join results as produced by ``search_join``.
        """
        patterns = self._verify_uris(patterns)
        result_iter = self.graph.search_join(patterns)
        if outvar:
            return [uri
                    for join_set in result_iter
                    for var, uri in join_set
                    if var == outvar]
        return list(result_iter)

    @cached
    def subjects(self, predicate=None, object=None) -> Iterable[str]:
        """Return the list of subjects with the given predicate and object."""
        matches, _ = self.triples(predicate=predicate, object=object)
        return [s for s, p, o in matches]

    @cached
    def predicates(self, subject=None, object=None) -> Iterable[str]:
        """Return the list of predicates with the given subject and object."""
        matches, _ = self.triples(subject=subject, object=object)
        return [p for s, p, o in matches]

    @cached
    def objects(self, subject=None, predicate=None) -> Iterable[str]:
        """Return the list of objects with the given subject and predicate."""
        matches, _ = self.triples(subject=subject, predicate=predicate)
        return [o for s, p, o in matches]

    @cached
    def subject_predicates(self, object=None) -> Iterable[Tuple[str, str]]:
        """Return the list of (subject, predicate) tuples for the given object."""
        matches, _ = self.triples(object=object)
        return [(s, p) for s, p, o in matches]

    @cached
    def subject_objects(self, predicate=None) -> Iterable[Tuple[str, str]]:
        """Return the list of (subject, object) tuples for the given predicate."""
        matches, _ = self.triples(predicate=predicate)
        return [(s, o) for s, p, o in matches]

    @cached
    def predicate_objects(self, subject=None) -> Iterable[Tuple[str, str]]:
        """Return the list of (predicate, object) tuples for the given subject."""
        matches, _ = self.triples(subject=subject)
        return [(p, o) for s, p, o in matches]

    def _verify_uris(self, pattern: List[Tuple[str, str, str]]) -> List[Tuple[str, str, str]]:
        """Run ``_verify_uri`` on the subject and object of every pattern.

        Predicates are passed through unchanged.
        """
        return [(self._verify_uri(s, IdentifierPosition.Subject),
                 p,
                 self._verify_uri(o, IdentifierPosition.Object))
                for s, p, o in pattern]

    def _verify_uri(self, uri: str, position: IdentifierPosition) -> Optional[str]:
        """Return a spelling of *uri* that the document can resolve.

        Variables (terms starting with ``?``) are returned untouched. Other
        terms have single quotes stripped and are looked up at *position*;
        if that fails, a second lookup is tried with the UTF-8 bytes rendered
        as ASCII escapes whose ``\\x`` prefix is reduced to ``x``.

        Returns:
            The spelling that resolved to a non-zero id, or ``None`` when
            neither spelling is found.
        """
        # startswith() also copes with an empty string, where uri[0] would
        # raise IndexError.
        if uri.startswith('?'):
            return uri
        uri = uri.replace("'", "")
        sub_id = self.graph.convert_term(uri, position)
        if not sub_id:
            # ascii(b'...') yields "b'...'"; [2:-1] strips the b'' wrapper.
            uri = ascii(uri.encode())[2:-1].replace("\\x", "x")
            sub_id = self.graph.convert_term(uri, position)
        return uri if sub_id else None
class SubM:
    """Maintains the rdfs:subClassOf graph of an HDT document and refines it.

    The graph is a ``networkx.DiGraph`` whose nodes are HDT term ids and whose
    edges point from subclass to superclass. Removals are logged in
    ``diagnosed_relations`` (edges) and ``diagnosed_classes`` (nodes).
    """

    # Initializer / Instance Attributes
    def __init__(self, path_hdt=PATH_LOD):
        """Open the HDT file at *path_hdt* and resolve the predicate ids used."""
        self.hdt = HDTDocument(path_hdt)
        self.subClassOf = "http://www.w3.org/2000/01/rdf-schema#subClassOf"
        self.id_subClassOf = self.hdt.convert_term(
            "http://www.w3.org/2000/01/rdf-schema#subClassOf",
            IdentifierPosition.Predicate)
        self.equivalent = "http://www.w3.org/2002/07/owl#equivalentClass"
        self.id_equivalentClass = self.hdt.convert_term(
            "http://www.w3.org/2002/07/owl#equivalentClass",
            IdentifierPosition.Predicate)
        self.graph = nx.DiGraph()  # subclass id -> superclass id
        self.equi_graph = nx.Graph()
        self.diagnosed_relations = {}  # (sub id, sup id) -> comment
        self.diagnosed_classes = {}  # class id -> comment
        self.leaf_classes = set()  # ids collected by filter_leaf_classes

    def setup_graph(self):
        """Add one edge (subject id, object id) per rdfs:subClassOf triple."""
        (subclass_triple_ids, _) = self.enquiry(
            query=(0, self.id_subClassOf, 0), mode="default")
        self.graph.add_edges_from(
            (s_id, o_id) for (s_id, _, o_id) in subclass_triple_ids)

    def remove_unnecessary_relations(self):
        """Remove subClassOf edges between classes declared owl:equivalentClass.

        For every node both directions of the equivalence predicate are
        queried; a pair is only handled when the other end is also a node of
        the graph.
        """
        for n in self.graph.nodes():
            # n owl:equivalentClass m
            (eq_triple_ids, _) = self.enquiry(
                query=(n, self.id_equivalentClass, 0), mode="default")
            for (_, _, m) in eq_triple_ids:
                if m in self.graph.nodes():
                    self.remove_relation(n, m, 'equivalence')
            # m owl:equivalentClass n
            (eq_triple_ids, _) = self.enquiry(
                query=(0, self.id_equivalentClass, n), mode="default")
            for (m, _, _) in eq_triple_ids:
                if m in self.graph.nodes():
                    self.remove_relation(m, n, 'equivalence')
        print('total relations diagnosed:', len(self.diagnosed_relations))

    def export_graph(self, export_file=None):
        """Write every edge of the graph to *export_file* as CSV.

        Nothing is written when *export_file* is ``None``. Each row holds the
        subject id, subject term, object id and object term, in the order
        announced by the header row.
        """
        if export_file is None:
            return
        # The context manager guarantees the file is flushed and closed.
        with open(export_file, 'w', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(["SUBJECT_ID", "SUBJECT", "OBJECT_ID", "OBJECT"])
            for (s_id, o_id) in self.graph.edges:
                s_term = self.convert_to_term(s_id)
                o_term = self.convert_to_term(o_id)
                # Ids go under the *_ID columns, terms under the others.
                writer.writerow([s_id, s_term, o_id, o_term])

    # define a function subgraph, edges
    # G.edges
    # https://networkx.github.io/documentation/stable/reference/classes/generated/networkx.DiGraph.edges.html#networkx.DiGraph.edges
    def remove_all_two_cycles(self):
        """Remove every edge whose reverse edge is also present in the graph."""
        to_remove = set()
        for (left, right) in self.graph.edges():
            if self.graph.has_edge(right, left):
                to_remove.add((left, right))
                to_remove.add((right, left))
        print('there are in total', len(to_remove), ' two-cycle edges removed')
        self.graph.remove_edges_from(to_remove)

    def load_manual_decisions(self, file, mode="remove"):
        """Apply the manual decisions stored in the CSV file *file*.

        The file must provide the columns ``SUBJECT_ID``, ``OBJECT_ID`` and
        ``SUGGESTION``. Rows whose pair is already diagnosed are skipped. A
        ``'remove'`` suggestion removes the relation; any other suggestion is
        only recorded in ``diagnosed_relations``.

        Args:
            file: Path of the CSV file to read.
            mode: Intended to select how 'unknown' rows are treated ('ignore'
                or 'remove'). It is currently not acted upon: such rows are
                recorded and left in the graph.
        """
        removed = 0
        with open(file, newline='') as two:
            for row in csv.DictReader(two):
                # csv yields strings, whereas graph nodes are the integer ids
                # delivered by the HDT id search.
                sub = int(row['SUBJECT_ID'])
                sup = int(row['OBJECT_ID'])
                if (sub, sup) in self.diagnosed_relations:
                    continue
                suggestion = row['SUGGESTION']
                if suggestion == 'remove':
                    if self.graph.has_edge(sub, sup):
                        removed += 1
                    self.remove_relation(sub, sup, comment='remove')
                else:
                    self.diagnosed_relations[(sub, sup)] = suggestion
        print('there are in total ', removed,
              ' relations removed from mannual decisions')

    def enquiry(self, query, mode="subm"):
        """Look up the id triple pattern *query* ``(s, p, o)``.

        In ``"default"`` mode the pattern is forwarded to the HDT document and
        its result is returned. Any other mode is not implemented yet and
        yields ``None``.
        """
        (s, p, o) = query
        if mode == "default":
            return self.hdt.search_triples_ids(s, p, o)
        # examine the filtered part first
        return None

    def convert_to_id(self, term):
        """Return the id of *term*, looked up in subject position."""
        if term == "akt742:Intangible-Thing":
            # this is the only class that has two different ids (as subject and object)
            return 2601100675
        return self.hdt.convert_term(term, IdentifierPosition.Subject)

    def convert_to_term(self, id):
        """Return the term for *id*, looked up in subject position."""
        if id == 2601100675:
            # this is the only one that has two different ids (as subject and object)
            return "akt742:Intangible-Thing"
        return self.hdt.convert_id(id, IdentifierPosition.Subject)

    def remove_relation(self, sub, sup, comment='remove'):
        """Drop the edge sub -> sup if present and record *comment* for it.

        The diagnosis is recorded even when the edge is not in the graph.
        """
        if self.graph.has_edge(sub, sup):
            self.graph.remove_edge(sub, sup)
        self.diagnose_relations(sub, sup, comment)

    def remove_relation_from(self, relation_list, comment='remove'):
        """Call ``remove_relation`` for every (sub, sup) pair of *relation_list*."""
        for (sub, sup) in relation_list:
            self.remove_relation(sub, sup, comment)

    def diagnose_relations(self, sub, sup, comment='default'):
        """Record *comment* as the diagnosis of the relation sub -> sup."""
        self.diagnosed_relations[(sub, sup)] = comment

    def diagnose_class(self, c, comment='default'):
        """Record *comment* as the diagnosis of class *c*."""
        # The attribute created in __init__ is `diagnosed_classes`.
        self.diagnosed_classes[c] = comment

    # TODO, split the cases of removal and comment
    def remove_class(self, c, comment='remove'):
        """Drop node *c* (and with it all its edges) and record *comment*."""
        if self.graph.has_node(c):
            self.graph.remove_node(c)  # this also removes all the edges related
        self.diagnosed_classes[c] = comment

    def remove_class_from(self, cs, comment='remove'):
        """Call ``remove_class`` for every class of *cs*."""
        for c in cs:
            self.remove_class(c, comment)

    def filter_leaf_classes(self):
        """Remove the classes for which the HDT document lists no subclass."""
        count = len(self.diagnosed_classes)
        for c in self.graph.nodes:
            # a class is a leaf when nothing is declared a subclass of it
            (_, cardi) = self.enquiry(
                query=(0, self.id_subClassOf, c), mode="default")
            if cardi == 0:
                self.leaf_classes.add(c)
        # Removal happens in a second pass so that the node view is not
        # modified while it is being iterated.
        for c in self.leaf_classes:
            self.remove_class(c)
        print('there are a total of', len(self.diagnosed_classes) - count,
              'leaf nodes removed')

    def get_domain_from_id(self, id):
        """Return the registered domain of the term behind *id*."""
        t = self.convert_to_term(id)
        return tldextract.extract(t).domain

    def filter_domain_classes(self, domain):
        """Remove every class whose term belongs to *domain*."""
        filtered = {c for c in self.graph.nodes
                    if self.get_domain_from_id(c) == domain}
        print('a total of ', len(filtered), ' removed w.r.t. domain ', domain)
        self.remove_class_from(list(filtered))

    def filter_reflexsive(self):
        """Remove every edge that leads from a node to itself."""
        to_remove = {(left, right) for (left, right) in self.graph.edges()
                     if left == right}
        print('removed reflexive relations', len(to_remove))
        self.graph.remove_edges_from(list(to_remove))

    def print_cycles(self):
        """Repeatedly find a cycle, print it and remove its edges.

        For each cycle the domain of the first edge's source node is printed
        as well. The loop ends once the graph has no cycle left.
        """
        while True:
            try:
                cycle = nx.find_cycle(self.graph)
            except nx.NetworkXNoCycle as e:
                # find_cycle signals "no cycle left" with this exception;
                # anything else is a genuine error and is not swallowed.
                print(e)
                break
            print('find cycle', cycle)
            (left, _) = cycle[0]
            print(self.get_domain_from_id(left))
            self.graph.remove_edges_from(cycle)