def composition_lens_name(specs):

    specs[St.lens_operation] = Ns.lensOpt
    src_name = get_uri_local_name(specs[St.subjectsTarget])
    trg_name = get_uri_local_name(specs[St.objectsTarget])
    specs[St.lens] = "{}comp_{}_{}".format(Ns.lens, src_name, trg_name)

    if len(specs[St.lens]) > 255:
        specs[St.lens] = "{}comp_{}_{}".format(Ns.lens, Ut.hash_it(src_name), Ut.hash_it(trg_name))

    update_specification(specs)
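
# A minimal usage sketch (all values hypothetical; Ns.lens is assumed to be a
# namespace string such as "http://risis.eu/lens/"). With subjects/objects
# targets whose local names are "linkset_A" and "linkset_B", the function
# composes the lens URI from the two local names and only falls back to their
# hashes when the result would exceed 255 characters:
#
#   specs = {St.subjectsTarget: "http://risis.eu/linkset/linkset_A",
#            St.objectsTarget:  "http://risis.eu/linkset/linkset_B"}
#   composition_lens_name(specs)
#   # specs[St.lens] -> e.g. "http://risis.eu/lens/comp_linkset_A_linkset_B"
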
def diff_lens_name(specs):

    specs[St.lens_operation] = Ns.lensOpd

    # THE NAMES ARE HASHED AS THEY APPEAR TO BE TOO LONG FOR A FILE NAME.
    # THIS IS AN EXAMPLE:
    # print len("diff_eter_2014_orgreg_20170718_nearbyGeoSim1Kilometer_University_LatitudeLongitude_P871330770"
    #           "_refined_eter_2014_orgreg_20170718_nearbyGeoSim1Kilometer_University_LatitudeLongitude_P871330770"
    #           "_approxStrSim_English_Institution_Name_P255977302-Metadata-20180107.t")
    src_name = Ut.hash_it(get_uri_local_name(specs[St.subjectsTarget]))
    trg_name = Ut.hash_it(get_uri_local_name(specs[St.objectsTarget]))
    specs[St.lens] = "{}diff_{}_{}".format(Ns.lens, src_name, trg_name)

    update_specification(specs)
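
# A self-contained sketch of the shortening idea above. hashlib.md5 stands in
# for Ut.hash_it (an assumption about its behaviour, not its actual digest);
# the point is only that the digest has a fixed length, however long the
# concatenated local names grow:
def _sketch_short_name(local_name):
    import hashlib
    # A 32-CHARACTER DIGEST REGARDLESS OF THE INPUT SIZE
    return hashlib.md5(local_name.encode("utf-8")).hexdigest()
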
def set_refined_name(specs):

    reducer = ""
    intermediate = ""
    threshold = ""
    extended_graph = ""
    delta = ""

    # THE REDUCER
    if St.reducer in specs[St.source]:
        reducer += specs[St.source][St.reducer]
    if St.reducer in specs[St.target]:
        reducer += specs[St.target][St.reducer]

    # THE EXTENDED GRAPH
    if St.extended_graph in specs[St.source]:
        extended_graph += str(specs[St.source][St.extended_graph])
    if St.extended_graph in specs[St.target]:
        extended_graph += str(specs[St.target][St.extended_graph])

    # THE INTERMEDIATE GRAPH
    if St.intermediate_graph in specs:
        intermediate = specs[St.intermediate_graph]

    if St.threshold in specs:
        threshold += str(specs[St.threshold])

    # THE NUMERIC DELTA
    if St.delta in specs:
        delta += str(specs[St.delta])

    hashed = hash(reducer + extended_graph + intermediate + threshold + delta +
                  specs[St.source][St.aligns_name] + specs[St.target][St.aligns_name] +
                  specs[St.linkset_name])
    append = str(hashed).replace("-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)

    specs[St.refined_name] = "refined_{}_{}_{}_{}".format(
        specs[St.linkset_name], specs[St.mechanism], specs[St.source][St.aligns_name], append)

    dir_name = DIRECTORY
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.refined_name], date)
    singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
    future_path = os.path.join(DIRECTORY, singleton_metadata_output)
    future_path = future_path.replace("\\", "/").replace("//", "/")

    if len(future_path) > 255:
        full_hashed = Ut.hash_it(specs[St.refined_name])
        specs[St.refined_name] = "refined_{}_{}".format(specs[St.mechanism], full_hashed)

    specs[St.refined] = specs[St.linkset].replace(specs[St.linkset_name], specs[St.refined_name])
    specs[St.refined] = specs[St.refined].replace("/lens/", "/linkset/")
    print "\t- specs[St.refined]", specs[St.refined]
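
# The "N"/"P" suffix logic above recurs throughout this module: Python's
# built-in hash() may return a negative number, and the sign character "-"
# is undesirable in graph and file names. A minimal stand-alone sketch of
# the same encoding:
def _sketch_hash_suffix(text):
    hashed = hash(text)
    # NEGATIVE HASH: ENCODE THE SIGN AS "N"; POSITIVE HASH: PREFIX WITH "P"
    return str(hashed).replace("-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)
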
def target_datatype_properties(model, label, linkset_label):

    main_tabs = "\t\t\t"
    tabs = "{}\t\t\t\t\t\t\t\t\t\t\t\t".format(main_tabs)

    # ALIGNMENT COMBINATION: LIST OF DICTIONARIES
    alignment_targets = ""
    property_list_bind = ""
    count = 0

    for item in model:

        count += 1
        target = item[St.graph]
        data = item[St.data]

        # LIST OF DICTIONARIES
        for n in range(0, len(data)):

            code = "llTarget:{}_{}".format(label, Ut.hash_it(target + str(data[n])))
            datatype = data[n][St.entity_datatype]
            properties = data[n][St.properties]
            property_list = ""

            # LIST OF PROPERTIES
            for i in range(0, len(properties)):

                i_property = properties[i] if Ut.is_nt_format(properties[i]) \
                    else "<{}>".format(properties[i])
                property_list += "?property_{}_{}_{} ".format(count, n, i) if i == 0 \
                    else ",\n{}?property_{}_{}_{} ".format(tabs, count, n, i)

                if i == 0 and count == 1:
                    property_list_bind += """BIND( IRI("{}") AS ?property_{}_{}_{})""".format(
                        i_property, count, n, i)
                else:
                    property_list_bind += """\n{}BIND( IRI("{}") AS ?property_{}_{}_{})""".format(
                        main_tabs, i_property, count, n, i)

            triples = """
{5}linkset:{4} ll:hasAlignmentTarget {0} .
{5}{0} ll:hasTarget <{1}> .
{5}{0} ll:hasDatatype <{2}> .
{5}{0} ll:aligns {3}.
""".format(code, target, datatype, property_list, linkset_label, main_tabs)

            # print triples
            alignment_targets += triples

    return {"list": alignment_targets, "binds": property_list_bind}
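
# For a model with a single target and one property, the function above emits
# triples of the following shape (URIs hypothetical), plus the matching BIND:
#
#   linkset:<linkset_label> ll:hasAlignmentTarget llTarget:<label>_<hash> .
#   llTarget:<label>_<hash> ll:hasTarget <http://example.org/graph> .
#   llTarget:<label>_<hash> ll:hasDatatype <http://example.org/Datatype> .
#   llTarget:<label>_<hash> ll:aligns ?property_1_0_0 .
#
#   BIND( IRI("<the property URI>") AS ?property_1_0_0)
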
def set_subset_name(specs, inverse=False):

    if inverse is False:

        h_name = specs[St.mechanism] + \
            specs[St.source][St.graph_name] + specs[St.source][St.link_old_name] + \
            specs[St.target][St.graph_name] + specs[St.source][St.entity_datatype] + \
            specs[St.target][St.entity_datatype]
        hashed = hash(h_name)
        append = str(hashed).replace("-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)

        specs[St.linkset_name] = "subset_{}_{}_{}_{}_{}_{}".format(
            specs[St.source][St.graph_name], specs[St.target][St.graph_name], specs[St.mechanism],
            specs[St.source][St.entity_name], specs[St.source][St.link_old_name], append)

        dir_name = DIRECTORY
        date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")

        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "{}_{}_{}".format(
                specs[St.source][St.graph_name], specs[St.mechanism], full_hashed)

        # if len(specs[St.linkset_name]) > 255:
        #     specs[St.linkset_name] = Ut.hash_it(specs[St.linkset_name])

        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])
        # print specs[St.linkset]
        return specs[St.linkset]
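
# A self-contained sketch of the 255-character path guard used above and in
# the naming functions below. The "(SingletonMetadata)" suffix mirrors the
# real code; the date placeholder and fallback formatting are simplified for
# illustration:
def _sketch_guard_path_length(name, directory, mechanism, graph_name):
    import os
    future_path = os.path.join(directory, "{}(SingletonMetadata)-YYYYMMDD.trig".format(name))
    future_path = future_path.replace("\\", "/").replace("//", "/")
    if len(future_path) > 255:
        # THE FULL NAME NO LONGER FITS ON DISC: KEEP GRAPH, MECHANISM AND A HASH
        return "{}_{}_{}".format(graph_name, mechanism, Ut.hash_it(name))
    return name
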
def set_linkset_name(specs, inverse=False):

    src_aligns = ""
    trg_aligns = ""
    reducer = ""
    intermediate = ""
    threshold = ""
    delta = ""
    geo = ""
    unit = ""
    source = specs[St.source]
    target = specs[St.target]

    if St.reducer in source:
        reducer += source[St.reducer]

    # GEO DATA
    unit_value = ""
    if St.longitude in source:
        geo += source[St.longitude]
    if St.latitude in source:
        geo += source[St.latitude]
    if St.longitude in target:
        geo += target[St.longitude]
    if St.latitude in target:
        geo += target[St.latitude]
    if St.unit in specs:
        geo += str(specs[St.unit])
        unit = Ut.get_uri_local_name(str(specs[St.unit]))
    if St.unit_value in specs:
        geo += str(specs[St.unit_value])
        unit_value = str(specs[St.unit_value])

    if St.reducer in specs[St.target]:
        reducer += target[St.reducer]

    if St.intermediate_graph in specs:
        intermediate = str(specs[St.intermediate_graph])

    if St.threshold in specs:
        threshold += str(specs[St.threshold])

    if St.delta in specs:
        delta += str(specs[St.delta])

    if St.aligns_name in source:
        src_aligns += source[St.aligns_name]
    elif St.latitude_name in source:
        # src_aligns += source[St.latitude_name]
        src_aligns += "Latitude"
    if St.longitude_name in source:
        # src_aligns += source[St.longitude_name]
        src_aligns += "Longitude"

    if St.aligns_name in target:
        trg_aligns += target[St.aligns_name]
    elif St.latitude_name in target:
        # trg_aligns += target[St.latitude_name]
        trg_aligns += "Latitude"
    if St.longitude_name in target:
        # trg_aligns += target[St.longitude_name]
        trg_aligns += "Longitude"

    dir_name = DIRECTORY
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')

    if inverse is False:

        h_name = specs[St.mechanism] + \
            source[St.graph_name] + src_aligns + \
            target[St.graph_name] + trg_aligns + \
            source[St.entity_datatype] + target[St.entity_datatype] + "-" + \
            reducer + intermediate + threshold + delta + geo
        hashed = hash(h_name)
        append = str(hashed).replace("-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)

        specs[St.linkset_name] = "{}_{}_{}{}{}_{}_{}_{}".format(
            source[St.graph_name], target[St.graph_name], specs[St.mechanism],
            unit_value, unit, source[St.entity_name], src_aligns, append)

        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")

        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "{}_{}_{}".format(
                source[St.graph_name], specs[St.mechanism], full_hashed)

        # if len(specs[St.linkset_name]) > 255:
        #     specs[St.linkset_name] = Ut.hash_it(specs[St.linkset_name])

        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])
        return specs[St.linkset]

    else:

        h_name = specs[St.mechanism] + \
            target[St.graph_name] + trg_aligns + \
            source[St.graph_name] + src_aligns + \
            target[St.entity_datatype] + source[St.entity_datatype] + "-" + \
            reducer + intermediate + threshold + delta + geo
        hashed = hash(h_name)
        append = str(hashed).replace("-", "N") if str(hashed).__contains__("-") else "P{}".format(hashed)

        specs[St.linkset_name] = "{}_{}_{}{}{}_{}_{}_{}".format(
            target[St.graph_name], source[St.graph_name], specs[St.mechanism],
            unit_value, unit, target[St.entity_name], trg_aligns, append)

        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")

        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "{}_{}_{}".format(
                target[St.graph_name], specs[St.mechanism], full_hashed)

        # if len(specs[St.linkset_name]) > 255:
        #     specs[St.linkset_name] = Ut.hash_it(specs[St.linkset_name])

        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])
        print "\t- specs[St.linkset]", specs[St.linkset]
        return specs[St.linkset]
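
# Usage sketch (hypothetical graph names): with source graph "eter_2014" and
# target graph "orgreg_20170718", the two directions produce distinct,
# deterministic names because both the hash input and the name template swap
# source and target:
#
#   set_linkset_name(specs)                # -> ".../linkset/eter_2014_orgreg_20170718_..._P<hash>"
#   set_linkset_name(specs, inverse=True)  # -> ".../linkset/orgreg_20170718_eter_2014_..._P<hash>"
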
def set_linkset_expands_name(specs):

    unique = ""
    source = specs[St.source]
    target = specs[St.target]

    if St.reducer in source:
        unique += source[St.reducer]

    # GEO DATA
    # unit_value = ""
    if St.longitude in source:
        unique += source[St.longitude]
    if St.latitude in source:
        unique += source[St.latitude]
    if St.longitude in target:
        unique += target[St.longitude]
    if St.latitude in target:
        unique += target[St.latitude]
    if St.unit in specs:
        unique += str(specs[St.unit])
        unit = Ut.get_uri_local_name(str(specs[St.unit]))
        unique += unit
    if St.unit_value in specs:
        unique += str(specs[St.unit_value])
        unit_value = str(specs[St.unit_value])
        unique += unit_value

    if St.reducer in specs[St.target]:
        unique += target[St.reducer]

    if St.intermediate_graph in specs:
        unique += str(specs[St.intermediate_graph])

    if St.threshold in specs:
        unique += str(specs[St.threshold])

    if St.delta in specs:
        unique += str(specs[St.delta])

    if St.aligns_name in source:
        unique += source[St.aligns_name]
    elif St.latitude_name in source:
        # src_aligns += source[St.latitude_name]
        unique += "Latitude"
    if St.longitude_name in source:
        # src_aligns += source[St.longitude_name]
        unique += "Longitude"

    if St.aligns_name in target:
        unique += target[St.aligns_name]
    elif St.latitude_name in target:
        # trg_aligns += target[St.latitude_name]
        unique += "Latitude"
    if St.longitude_name in target:
        # trg_aligns += target[St.longitude_name]
        unique += "Longitude"

    dir_name = DIRECTORY
    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    unique = Ut.hash_it(specs[St.mechanism] + source[St.graph_name] + target[St.graph_name] +
                        source[St.entity_datatype] + target[St.entity_datatype] + unique)

    if St.expands in specs:

        specs[St.linkset_name] = "expands_{}_{}".format(specs[St.expands_name], unique)
        specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])

        singleton_metadata_file = "{}(SingletonMetadata)-{}.trig".format(specs[St.linkset_name], date)
        singleton_metadata_output = "{}/{}".format(dir_name, singleton_metadata_file)
        future_path = os.path.join(DIRECTORY, singleton_metadata_output)
        future_path = future_path.replace("\\", "/").replace("//", "/")

        if len(future_path) > 255:
            full_hashed = Ut.hash_it(specs[St.linkset_name])
            specs[St.linkset_name] = "expands_{}_{}_{}".format(
                source[St.graph_name], specs[St.mechanism], full_hashed)
            specs[St.linkset] = "{}{}".format(Ns.linkset, specs[St.linkset_name])

        print "\t- specs[St.linkset]", specs[St.linkset]
        return specs[St.linkset]

    else:
        return set_linkset_name(specs)
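
# Usage sketch (values hypothetical): when specs carries St.expands, the name
# is prefixed with "expands_" and bound to the expanded linkset's name; every
# spec detail accumulated in [unique] is hashed so that two different
# expansions of the same linkset cannot collide:
#
#   specs[St.expands_name] = "eter_2014_orgreg_20170718_approxStrSim_..."
#   set_linkset_expands_name(specs)
#   # -> ".../linkset/expands_eter_2014_orgreg_20170718_approxStrSim_..._<hash>"
#
# Without St.expands the function simply defers to set_linkset_name(specs).
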
def lens_refine_name(specs, lens_type):

    extra = ""
    source = specs[St.source]
    target = specs[St.target]

    if St.reducer in source:
        extra += source[St.reducer]

    # GEO DATA
    unit_value = ""
    if St.longitude in source:
        extra += source[St.longitude]
    if St.latitude in source:
        extra += source[St.latitude]
    if St.longitude in target:
        extra += target[St.longitude]
    if St.latitude in target:
        extra += target[St.latitude]
    if St.unit in specs:
        extra += str(specs[St.unit])
        unit = Ut.get_uri_local_name(str(specs[St.unit]))
    if St.unit_value in specs:
        extra += str(specs[St.unit_value])
        unit_value = str(specs[St.unit_value])

    if St.reducer in specs[St.target]:
        extra += target[St.reducer]

    if St.intermediate_graph in specs:
        extra += str(specs[St.intermediate_graph])

    if St.threshold in specs:
        extra += str(specs[St.threshold])

    if St.delta in specs:
        extra += str(specs[St.delta])

    if St.aligns_name in source:
        extra += source[St.aligns_name]
    elif St.latitude_name in source:
        # src_aligns += source[St.latitude_name]
        extra += "Latitude"
    if St.longitude_name in source:
        # src_aligns += source[St.longitude_name]
        extra += "Longitude"

    if St.aligns_name in target:
        extra += target[St.aligns_name]
    elif St.latitude_name in target:
        # trg_aligns += target[St.latitude_name]
        extra += "Latitude"
    if St.longitude_name in target:
        # trg_aligns += target[St.longitude_name]
        extra += "Longitude"

    unique = Ut.hash_it(extra)
    specs[St.lens] = u"{}refine_{}_{}".format(Ns.lens, specs[St.refined_name], unique)
    update_specification(specs)
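
# The [extra] string acts as a fingerprint of every spec detail that can
# change the refined lens (reducers, geo fields, thresholds, deltas, aligned
# properties); hashing it keeps the lens URI short and deterministic:
#
#   lens_refine_name(specs, lens_type)
#   # specs[St.lens] -> "<Ns.lens>refine_<refined_name>_<hash-of-extra>"
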
def cluster_d_test(linkset, network_size=3, network_size_max=3, targets=None,
                   constraint_targets=None, constraint_text="", directory=None,
                   greater_equal=True, print_it=False, limit=None, only_good=False, activated=False):

    # FOR CONSTRAINTS TO WORK, [constraint_targets] SHOULD NOT BE NONE

    network = []
    print "\nLINK NETWORK INVESTIGATION"

    if activated is False:
        print "\tTHE FUNCTION IS NOT ACTIVATED"
        return ""

    elif network_size > network_size_max and greater_equal is False:
        print "\t[network_size] SHOULD BE SMALLER THAN [network_size_max]"
        return ""

    date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
    linkset_name = Ut.get_uri_local_name(linkset)
    linkset = linkset.strip()

    if network_size_max - network_size == 0:
        greater_equal = False

    check = False

    # RUN THE CLUSTER
    clusters_0 = Cls.links_clustering(linkset, limit)

    if greater_equal is True:
        temp_size = 0
        for cluster, cluster_val in clusters_0.items():
            new_size = len(list(cluster_val["nodes"]))
            if new_size > temp_size:
                temp_size = new_size
        network_size_max = temp_size
        print "THE BIGGEST NETWORK'S SIZE: {}".format(network_size_max)

    def check_constraint():

        # NOTE: READS [resources] AND [linkset] FROM THE ENCLOSING SCOPE AT CALL TIME
        text = constraint_text.lower()
        text = text.split(",")

        # CONSTRAINT BUILDER
        c_builder = Buffer.StringIO()

        if constraint_targets is not None:
            for dictionary in constraint_targets:

                graph = dictionary[St.graph]
                data_list = dictionary[St.data]
                properties = data_list[0][St.properties]
                prop = properties[0] if Ut.is_nt_format(properties[0]) else "<{}>".format(properties[0])

                # WRITING THE CONSTRAINT ON THE GRAPH
                graph_q = """
        {{ GRAPH <{0}>
            {{
                ?lookup {1} ?constraint .
            }}
        }}
                """.format(graph, prop)
                c_builder.write(graph_q) if len(c_builder.getvalue()) == 0 else \
                    c_builder.write("UNION {}".format(graph_q))

            # WRITING THE FILTER
            if len(c_builder.getvalue()) > 0:
                for i in range(0, len(text)):
                    if i == 0:
                        c_builder.write("""
        FILTER (LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
                    else:
                        c_builder.write("""
        || LCASE(STR(?constraint)) = "{}" """.format(text[i].strip()))
                c_builder.write(")")

        # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
        query = Qry.cluster_rsc_strengths_query(resources, linkset)
        query = query.replace("# CONSTRAINTS IF ANY", c_builder.getvalue())
        # print query
        response = Qry.sparql_xml_to_matrix(query)
        if response[St.result] is None:
            return False

        return True

    for index in range(network_size, network_size_max + 1):

        count_1 = 0
        count_2 = 0
        curr_network_size = index
        print "\nCLUSTERS OF SIZE {}".format(index)
        sheet_builder = Buffer.StringIO()
        analysis_builder = Buffer.StringIO()
        sheet_builder.write("Count ID STRUCTURE E-STRUCTURE-SIZE A. NETWORK QUALITY"
                            " M. NETWORK QUALITY REFERENCE\n")

        for cluster, cluster_val in clusters_0.items():

            # network = []
            resources = ""
            uri_size = 0
            count_1 += 1
            children = list(cluster_val["nodes"])
            strengths = cluster_val["strengths"]
            cluster_size = len(children)
            # if "<http://www.grid.ac/institutes/grid.10493.3f>" not in children:
            #     continue
            check = cluster_size >= curr_network_size if greater_equal else cluster_size == curr_network_size

            # NETWORK OF A PARTICULAR SIZE
            if check:

                # file_name = i_cluster[0]

                # 2: FETCHING THE CORRESPONDENTS
                smallest_hash = float('inf')
                child_list = ""
                for child in children:

                    # CREATE THE HASHED ID AS THE CLUSTER NAME
                    hashed = hash(child)
                    if hashed <= smallest_hash:
                        smallest_hash = hashed

                    # GENERAL INFO 1: RESOURCES INVOLVED
                    child_list += "\t{}\n".format(child)

                    # LIST OF RESOURCES IN THE CLUSTER
                    use = "<{}>".format(child) if Ut.is_nt_format(child) is not True else child
                    resources += "\n\t\t\t\t{}".format(use)
                    if len(child) > uri_size:
                        uri_size = len(child)

                # MAKE SURE THE FILE NAME OF THE CLUSTER IS ALWAYS THE SAME
                file_name = "{}".format(str(smallest_hash).replace("-", "N")) \
                    if str(smallest_hash).startswith("-") else "P{}".format(smallest_hash)

                if constraint_targets is not None and check_constraint() is False:
                    continue

                count_2 += 1

                # THE RESULT OF THE QUERY ABOUT THE LINKED RESOURCES
                query = Qry.cluster_rsc_strengths_query(resources, linkset)
                response = Qry.sparql_xml_to_matrix(query)

                # GENERAL INFO 2:
                info = "SIZE {} \nCLUSTER {} \nNAME {}\n".format(cluster_size, count_1, file_name)
                info2 = "CLUSTER [{}] NAME [{}] SIZE [{}]".format(count_1, file_name, cluster_size)
                analysis_builder.write("{}\n".format(info))
                analysis_builder.write("RESOURCES INVOLVED\n")
                analysis_builder.write(child_list)
                analysis_builder.write("\nCORRESPONDENT FOUND ")
                analysis_builder.write(
                    Qry.display_matrix(response, spacing=uri_size, output=True, line_feed='.', is_activated=True))

                # INFO TYPE 3: PROPERTY-VALUES OF THE RESOURCES INVOLVED
                analysis_builder.write("\n\nDISAMBIGUATION HELPER ")
                if targets is None:
                    analysis_builder.write(Cls.disambiguate_network(linkset, children))
                else:
                    report = Cls.disambiguate_network_2(children, targets)
                    if report is not None:
                        analysis_builder.write(report)

                # GENERATING THE NETWORK AS A LIST OF TUPLES,
                # WHERE A TUPLE REPRESENTS TWO RESOURCES IN A RELATIONSHIP :-)
                network = []
                link_count = 0
                for link in cluster_val["links"]:
                    link_count += 1
                    name_1 = "{}-{}".format(Ut.hash_it(link[0]), Ut.get_uri_local_name(link[0]))
                    name_2 = "{}-{}".format(Ut.hash_it(link[1]), Ut.get_uri_local_name(link[1]))
                    network += [(name_1, name_2)]

                # GET THE AUTOMATED FLAG
                if print_it:
                    print ""
                    print analysis_builder.getvalue()

                # SETTING THE DIRECTORY
                if directory:

                    if network:
                        automated_decision = metric(network)["AUTOMATED_DECISION"]
                        if only_good is True and automated_decision.startswith("GOOD") is not True:
                            count_2 -= 1
                            continue

                        print "{:>5} {}".format(count_2, info2)
                        eval_sheet(targets, count_2, "{}_{}".format(cluster_size, file_name),
                                   sheet_builder, linkset, children, automated_decision)
                    else:
                        print network

                    # linkset_name = Ut.get_uri_local_name(linkset)
                    # date = datetime.date.isoformat(datetime.date.today()).replace('-', '')
                    temp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\{}_{}\\".format(
                        curr_network_size, date, linkset_name, cluster_size, file_name))
                    if not os.path.exists(temp_directory):
                        os.makedirs(temp_directory)

                    # PLOTTING
                    # FIRE THE DRAWING. Supported formats: eps, pdf, pgf, png, ps, raw, rgba, svg, svgz.
                    analysis_builder.write(
                        draw_graph(graph=network,
                                   file_path="{}{}.{}".format(
                                       temp_directory, "cluster_{}".format(file_name), "pdf"),
                                   show_image=False))

                    # WRITING TO DISC
                    Ut.write_2_disc(file_directory=temp_directory, file_name="cluster_{}".format(file_name),
                                    data=analysis_builder.getvalue(), extension="txt")
                    analysis_builder = Buffer.StringIO()

        if directory:

            # if len(sheet_builder.getvalue()) > 150 and count_2 == 2:
            if len(sheet_builder.getvalue()) > 150 and len(clusters_0) == count_1:

                tmp_directory = "{}{}".format(directory, "\{}_Analysis_{}\{}\\".format(
                    curr_network_size, date, linkset_name))

                # WRITING THE CLUSTER SHEET TO DISC
                print "\n\tWRITING CLUSTER SHEET AT\n\t{}".format(tmp_directory)
                Ut.write_2_disc(file_directory=tmp_directory, file_name="{}_ClusterSheet".format(cluster_size),
                                data=sheet_builder.getvalue(), extension="txt")

            # if count_2 == 2:
            #     break

        if greater_equal is True:
            # NO NEED TO CONTINUE: ALL NETWORKS GREATER THAN OR EQUAL TO [network_size] WERE ALREADY COVERED
            break

        print "\t>>> FOUND: {} CLUSTERS OF SIZE {}".format(count_2, curr_network_size)

    if directory is None:
        return "{}\t{}".format(curr_network_size, count_2)
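
# A hedged invocation sketch (all argument values hypothetical). The function
# refuses to run unless activated=True, and it only writes the per-cluster
# analysis files and the per-size cluster sheet when a directory is supplied;
# without a directory it returns a "size<TAB>count" summary string instead:
#
#   cluster_d_test("http://risis.eu/linkset/eter_2014_orgreg_20170718_...",
#                  network_size=3, network_size_max=10, directory="C:\\analysis",
#                  greater_equal=False, print_it=False, limit=None,
#                  only_good=True, activated=True)
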