def graph_characteristics_csv(pattern_path, output_path, predicate): csv_folder_summary = os.path.join(output_path, 'graph_characteristics_csv') if not os.path.exists(csv_folder_summary): os.makedirs(csv_folder_summary) batch_number = pattern_path.split("/")[-1] file = csv_folder_summary + "/" + batch_number + '_results_final_limit.csv' print "Making file: ", file b = open(file, 'w') if predicate != None: field_names = [ 'pattern_name', 'nr_randvar_values', 'nr_targets', 'has_cycles', 'max_degree', 'average_degree', 'predicate' + predicate, 'invalid' ] else: field_names = [ 'pattern_name', 'nr_randvar_values', 'nr_targets', 'has_cycles', 'max_degree', 'average_degree', 'invalid' ] writer = csv.DictWriter(b, fieldnames=field_names) writer.writeheader() pattern_file_gml = None print "Number of patterns: ", len( sorted(os.listdir(pattern_path), key=lambda x: x[:-5])) for patt in sorted(os.listdir(pattern_path), key=lambda x: x[:-5]): #print os.path.join(pattern_path,patt) if (os.path.isfile(os.path.join(pattern_path, patt))): continue pattern_file_gml = os.path.join(pattern_path, patt, patt + ".gml") #if patt.endswith(".gml"): # pattern_file_gml=os.path.join(pattern_path,patt) #print "Pattern file gml",pattern_file_gml if pattern_file_gml != None: pattern = nx.read_gml(pattern_file_gml) else: continue pattern_file_name = patt #some general pattern charactersitics nr_randvar_values = man.count_nr_randvars_in_graph(pattern) cycles = man.is_there_cycle_in_graph(pattern) max_degree = man.get_maximum_node_degree(pattern) average_degree = man.get_average_node_degree(pattern) n_target_nodes = man.get_nr_target_nodes_other_than_head(pattern) contains_target_predicate = contains_predicate(pattern, predicate) row = {} row['pattern_name'] = pattern_file_name row['nr_randvar_values'] = str(nr_randvar_values) row['nr_targets'] = str(n_target_nodes) row['has_cycles'] = str(cycles) row['max_degree'] = str(max_degree) row['average_degree'] = str(average_degree) row['invalid'] = 
str(is_invalid(os.path.join(pattern_path, patt))) if predicate != None: row['predicate' + predicate] = str(contains_target_predicate) writer.writerow(row) print "Finished writing csv ...to", file return file
def makecsv_file_for_final_limits(pattern_path, output_path, redo): print "Output path: ", output_path print "Pattern path: ", pattern_path csv_folder_summary = os.path.join(output_path, 'csv_results') batch_number = pattern_path.split("/")[-1] print batch_number print "Does exist csv folder summary", csv_folder_summary, os.path.exists( csv_folder_summary) if not os.path.exists(csv_folder_summary): os.makedirs(csv_folder_summary) file = csv_folder_summary + "/" + batch_number + '_results_final_limit.csv' if os.path.exists(file) and redo == False: print "Results for this batch already exist" return b = open(file, 'w') field_names = [ 'pattern_name', 'selected', 'nr_randvar_values', 'nr_targets', 'has_cycles', 'exh_emb', 'rnd_emb', 'furer_emb', 'ffurer_emb', 'limit16_rnd_emb', 'limit16_fur_emb', 'limit16_ff_emb', 'rnd_KLD_16', 'furer_KLD_16', 'ff_KLD_16', 'exh_rt', 'rnd_avgRT_16', 'furer_avgRT_16', 'ff_avgRT_16' ] writer = csv.DictWriter(b, fieldnames=field_names) writer.writeheader() print "Number of patterns: ", len( sorted(os.listdir(pattern_path), key=lambda x: x[:-5])) counter = 1 nr_patterns = len(os.listdir(pattern_path)) for patt in sorted(os.listdir(pattern_path), key=lambda x: x[:-5]): print "CSV processing :", nr_patterns, " th pattern" nr_patterns -= 1 if (os.path.isfile(os.path.join(pattern_path, patt))): continue pattern_file_gml = None print "Path", os.path.join(pattern_path, patt) if os.path.exists( os.path.join(pattern_path, patt, "results_furer", "input_pattern.gml")): pattern_file_gml = os.path.join(pattern_path, patt, "results_furer", "input_pattern.gml") elif os.path.exists( os.path.join(pattern_path, patt, 'exhaustive_approach', 'input_pattern.gml')): pattern_file_gml = os.path.join(pattern_path, patt, 'exhaustive_approach', 'input_pattern.gml') elif patt.endswith(".gml"): pattern_file_gml = os.path.join(pattern_path, patt) print "Pattern file gml", pattern_file_gml if pattern_file_gml != None: pattern = nx.read_gml(pattern_file_gml) else: 
continue pattern_file_name = patt #PICKLES RESULTS PATH exhaustive_file_result = os.path.join( pattern_path, patt, 'exhaustive_approach', 'results_' + str(pattern_file_name) + ".res") random_dict_result = os.path.join(pattern_path, patt, 'random_vertex_approach', 'rndicts.pickle') furer_dict_result = os.path.join(pattern_path, patt, 'results_furer', 'fudicts.pickle') false_furer_dict_result = os.path.join(pattern_path, patt, 'results_false_furer', 'fudicts.pickle') #NLIMITS RESULTS PATH random_nlimits_result = os.path.join(pattern_path, patt, 'random_vertex_approach', 'n_limits') furer_nlimits_result = os.path.join(pattern_path, patt, 'results_furer', 'n_limits') false_furer_nlimits_result = os.path.join(pattern_path, patt, 'results_false_furer', 'n_limits') #some general pattern charactersitics nr_randvar_values = man.count_nr_randvars_in_graph(pattern) cycles = man.is_there_cycle_in_graph(pattern) max_degree = man.get_maximum_node_degree(pattern) average_degree = man.get_average_node_degree(pattern) n_target_nodes = man.get_nr_target_nodes_other_than_head(pattern) nr_embeddings_exhaustive, exhaustive_running_time = extract_number_of_embeddings_and_rt_exhaustive( exhaustive_file_result) nr_embeddings_random_final_limit = [] nr_embeddings_furer_final_limit = [] nr_embeddings_false_furer_final_limit = [] nr_embeddings_furer_final = -1 nr_embeddings_false_furer_final = -1 nr_embeddings_random_final = -1 furer_klds = [] furer_SSTDs = [] false_furer_kld = [] false_furer_SSTDs = [] random_klds = [] random_SSTDs = [] furer_avg_rt = [] false_furer_avg_rt = [] random_avg_rt = [] #FIRST CHECK IF EXPPERIMENTS WERE RUN IN SEQUENTIALL OR PARALLEL MODE. 
IF IT'S PARALLEL MODE RESULTS HAVE TO BE #EXTRACTED FROM RUN DIRECTORIES #EXTRACT RANDOM VERTEX RESULTS if os.path.exists( os.path.join(pattern_path, patt, 'random_vertex_approach')): files_random = sorted([ f for f in os.listdir( os.path.join(pattern_path, patt, 'random_vertex_approach')) if re.match('run_*', f) ]) if (len(files_random) != 0): random_klds, random_SSTDs, random_avg_rt = extract_KLD_sampling_approach_parallel_run( os.path.join(pattern_path, patt, 'random_vertex_approach'), files_random) nr_embeddings_random_final = extract_final_number_of_embeddings_sampling_approach( os.path.join(pattern_path, patt, 'random_vertex_approach')) else: random_klds, random_SSTDs, random_avg_rt = extract_KLD_sampling_approach( random_nlimits_result) #nr_embeddings_random_final_limit=extract_number_of_embeddings_sampling_approach(random_dict_result) nr_embeddings_random_final = extract_final_number_of_embeddings_sampling_approach( os.path.join(pattern_path, patt, 'random_vertex_approach')) #EXTRACT FURER RESULTS if os.path.exists(os.path.join(pattern_path, patt, 'results_furer')): print "FURER" files_furer = sorted([ f for f in os.listdir( os.path.join(pattern_path, patt, 'results_furer')) if re.match('run_*', f) ]) if (len(files_furer) != 0): furer_klds, furer_SSTDs, furer_avg_rt = extract_KLD_sampling_approach_parallel_run( os.path.join(pattern_path, patt, 'results_furer'), files_furer) nr_embeddings_furer_final = extract_final_number_of_embeddings_sampling_approach( os.path.join(pattern_path, patt, 'results_furer')) else: furer_klds, furer_SSTDs, furer_avg_rt = extract_KLD_sampling_approach( furer_nlimits_result) #nr_embeddings_furer_final_limit=extract_number_of_embeddings_sampling_approach(furer_dict_result) nr_embeddings_furer_final = extract_final_number_of_embeddings_sampling_approach( os.path.join(pattern_path, patt, 'results_furer')) #EXTRACT FALSE FURER TIMES if os.path.exists( os.path.join(pattern_path, patt, 'results_false_furer')): files_false_furer = 
sorted([ f for f in os.listdir( os.path.join(pattern_path, patt, 'results_false_furer')) if re.match('run_*', f) ]) if (len(files_false_furer) != 0): false_furer_kld, false_furer_SSTDs, false_furer_avg_rt = extract_KLD_sampling_approach_parallel_run( os.path.join(pattern_path, patt, 'results_furer'), files_false_furer) nr_embeddings_false_furer_final_limit = extract_number_of_embeddings_sampling_approach( random_dict_result) else: false_furer_kld, false_furer_SSTDs, false_furer_avg_rt = extract_KLD_sampling_approach( false_furer_nlimits_result) #nr_embeddings_false_furer_final_limit=extract_number_of_embeddings_sampling_approach(false_furer_dict_result) nr_embeddings_false_furer_final = extract_final_number_of_embeddings_sampling_approach( os.path.join(pattern_path, patt, 'results_false_furer')) print "PATH ", os.path.join(pattern_path, patt, 'selected.info'), os.path.exists( os.path.join(pattern_path, patt, 'selected.info')) selected = False #check if pattern selected if os.path.exists(os.path.join(pattern_path, patt, 'selected.info')): selected = True row = {} row['pattern_name'] = pattern_file_name row['selected'] = selected row['nr_randvar_values'] = str(nr_randvar_values) row['nr_targets'] = str(n_target_nodes) row['has_cycles'] = str(cycles) if (nr_embeddings_exhaustive == 'NC'): row['exh_emb'] = 'NC' else: row['exh_emb'] = nr_embeddings_exhaustive row['rnd_emb'] = nr_embeddings_random_final row['furer_emb'] = nr_embeddings_furer_final row['ffurer_emb'] = nr_embeddings_false_furer_final row['limit16_rnd_emb'] = "None" row['limit16_fur_emb'] = "None" row['limit16_ff_emb'] = "None" row['rnd_KLD_16'] = str( str(getNTH_limit_value(15, random_klds)) + " +- " + str(getNTH_limit_value(15, random_SSTDs))) row['furer_KLD_16'] = str( str(getNTH_limit_value(15, furer_klds)) + " +- " + str(getNTH_limit_value(15, furer_SSTDs))) row['ff_KLD_16'] = str( str(getNTH_limit_value(15, false_furer_kld)) + " +- " + str(getNTH_limit_value(15, false_furer_SSTDs))) row['exh_rt'] = 
exhaustive_running_time row['rnd_avgRT_16'] = getNTH_limit_value(15, random_avg_rt) row['furer_avgRT_16'] = getNTH_limit_value(15, furer_avg_rt) row['ff_avgRT_16'] = getNTH_limit_value(15, false_furer_avg_rt) writer.writerow(row) counter += 1 #return path tocreated csv file print "Finished writing csv ...to", file return file
nodes[users[1]] = id_user id_user += 1 if users[0] in edges_map: edges_map[users[0]].append(users[1]) else: edges_map[users[0]] = [users[1]] nr_edges += 1 print nr_edges print len(edges_map.keys()) G = nx.Graph() for n in nodes.keys(): G.add_node(nodes[n], id=nodes[n], predicate='user') for e in edges_map.keys(): for e1 in edges_map[e]: G.add_edge(nodes[e], nodes[e1]) pickle.dump(G, open(FILE_NAME, 'wb')) data = nx.read_gpickle(FILE_NAME) print "Nr nodes TWITTER: ", len(data.nodes()) print "Nr edges TWITTER: ", len(data.edges()) print "Max degree TWITTER: ", an.get_maximum_node_degree(data) print "Density TWITTER: ", nx.density(data) print "INFO TWITTER:", nx.info(data) vis.visualize_graph_standard(data)
edges_map[edge1] = [edge2] nr_edges += 1 print nr_edges print len(edges_map.keys()) G = nx.Graph() for n in nodes.keys(): G.add_node(nodes[n], id=nodes[n], predicate='user') for e in edges_map.keys(): for e1 in edges_map[e]: G.add_edge(nodes[e], nodes[e1]) pickle.dump(G, open(FILE_NAME, 'wb')) data = nx.read_gpickle(FILE_NAME) print "Nr nodes AMAZON: ", len(data.nodes()) print "Nr edges AMAZON: ", len(data.edges()) print "Max degree AMAZON: ", an.get_maximum_node_degree(data) print "Density AMAZON: ", nx.density(data) print "INFO AMAZON:", nx.info(data) #print an.get_maximum_node_degree(graph) number_of_pages = 0 for node in data.nodes(): if data.node[node]['predicate'] == 'page': number_of_pages += 1 print "NUMBER OF PAGES: ", number_of_pages vis.visualize_graph_standard(data)
def get_row_exhaustive(general_path, pattern_result, pattern_path): row = {} print "Pattern exhaustive ", pattern_result print "Pattern path: ", pattern_path pattern = nx.read_gml(os.path.join(general_path, 'input_pattern.gml')) nr_randvar_values = man.count_nr_randvars_in_graph(pattern) cycles = man.is_there_cycle_in_graph(pattern) max_degree = man.get_maximum_node_degree(pattern) average_degree = man.get_average_node_degree(pattern) n_target_nodes = man.get_nr_target_nodes_other_than_head(pattern) parent_id = get_parent_id(os.path.join(pattern_path)) #get nr embeddings of exhaustive nr_emb = None time = None print general_path.split('/') pattern_name = general_path.split('/')[-1] if pattern_name == "": pattern_name = general_path.split('/')[-2] nr_obs = None print "Exists? ", os.path.join( general_path, 'exhaustive_approach', 'results_' + pattern_name + '.res'), os.path.exists( os.path.join(general_path, 'exhaustive_approach', 'results_' + pattern_name + '.res')) if os.path.exists( os.path.join(general_path, 'exhaustive_approach', 'results_' + pattern_name + '.res')): nr_emb, time, nr_obs = extract_nr_embeddings( os.path.join(general_path, 'exhaustive_approach', 'results_' + pattern_name + '.res')) #get the results if os.path.exists(os.path.join(pattern_result, 'monitoring')): embeddings, stdev, klds = get_stat( os.path.join(pattern_result, 'monitoring'), 'exhaustive') else: embeddings = [None] * 120 klds = [None] * 120 is_timeout = False if os.path.exists( os.path.join(general_path, 'exhaustive_approach', 'timeout.info')): is_timeout = True print "Nr of records for embeddings: ", len(embeddings) nodes, edges = man.get_readable_text_format(pattern) row['pattern_name'] = pattern_result row['parent_id'] = parent_id row['nr_randvar_values'] = int(nr_randvar_values) row['nodes'] = nodes row['edges'] = edges row['has_cycles'] = cycles row['density'] = nx.density(pattern) row['max_degree'] = float(max_degree) row['avg_deg'] = float(average_degree) row['nr_targets'] = 
int(n_target_nodes) if nr_emb: row['exh_emb'] = float(nr_emb) else: row['exh_emb'] = nr_emb row['time'] = time row['timeout'] = is_timeout row['nr_observations'] = nr_obs for i in xrange(1, len(embeddings) + 1): if embeddings[i - 1] == None: row["emb_" + str(i)] = None else: row["emb_" + str(i)] = float(embeddings[i - 1]) return row
def get_row_NS(general_path, pattern_result, experiment_name):
    """Assemble the result row for a Not-Selected pattern.

    Reads 'input_pattern.gml' under general_path and summarises the
    pattern's structural characteristics together with the embedding
    count stored in 'not_selected.info' and the OBD decomposition from
    'results_furer/OBDDecomp.info' (each when present).  When the gml
    file is missing, every column is reported as "NC" (not computable).

    :returns: dict keyed by the CSV column names
    """
    gml_file = os.path.join(general_path, 'input_pattern.gml')
    if not os.path.exists(gml_file):
        # pattern file is gone: flag every characteristic as not computable
        nc_row = dict.fromkeys(
            ['nr_randvar_values', 'nodes', 'edges', 'has_cycles', 'density',
             'shape', 'max_degree', 'avg_deg', 'nr_targets', 'nr_emb',
             'has_obd', 'unequal_size_warn', 'OBD'], "NC")
        nc_row['pattern_name'] = pattern_result
        return nc_row

    pattern = nx.read_gml(gml_file)

    # embedding count recorded when the pattern was rejected
    nr_emb = None
    not_selected_file = os.path.join(general_path, 'not_selected.info')
    if os.path.exists(not_selected_file):
        nr_emb = extract_nr_embeddings_NS(not_selected_file)

    # computed but currently not emitted in the row (the corresponding
    # assignments are disabled) -- kept for parity with the original
    has_obd = not os.path.exists(
        os.path.join(pattern_result, 'no_obdecomp.info'))
    unequal_size_warning = os.path.exists(
        os.path.join(general_path, 'results_furer', 'unequal_size.warning'))

    obd_file = os.path.join(general_path, 'results_furer', 'OBDDecomp.info')
    OBD = getOBDecomp(obd_file) if os.path.exists(obd_file) else None

    nodes, edges = man.get_readable_text_format(pattern)
    return {
        'pattern_name': pattern_result,
        'nr_randvar_values': man.count_nr_randvars_in_graph(pattern),
        'nodes': nodes,
        'edges': edges,
        'has_cycles': man.is_there_cycle_in_graph(pattern),
        'density': nx.density(pattern),
        'shape': man.get_graph_shape(pattern),
        'max_degree': man.get_maximum_node_degree(pattern),
        'avg_deg': man.get_average_node_degree(pattern),
        'nr_targets': man.get_nr_target_nodes_other_than_head(pattern),
        'nr_emb': nr_emb,
        'OBD': OBD,
    }
def get_row(general_path, pattern_result, experiment_name, pattern_path):
    """Build one result CSV row for a sampling experiment on a pattern.

    Combines structural characteristics of 'input_pattern.gml' with the
    exhaustive embedding count (when its .res file exists), the count from
    'selected.info', the OBD decomposition and the per-interval monitoring
    series ('emb_<i>', 'std_<i>', 'KLD_<i>' columns).

    :param general_path: directory holding 'input_pattern.gml' and the
        'exhaustive_approach' results
    :param pattern_result: experiment results directory (may contain
        'monitoring' and 'no_obdecomp.info'); used as the row's pattern_name
    :param experiment_name: experiment label passed through to get_stat
    :param pattern_path: directory whose parent id is recorded
    :returns: dict keyed by the CSV column names
    """
    row = {}
    pattern = nx.read_gml(os.path.join(general_path, 'input_pattern.gml'))
    parent_id = get_parent_id(os.path.join(pattern_path))
    # structural characteristics of the pattern graph
    nr_randvar_values = man.count_nr_randvars_in_graph(pattern)
    cycles = man.is_there_cycle_in_graph(pattern)
    max_degree = man.get_maximum_node_degree(pattern)
    average_degree = man.get_average_node_degree(pattern)
    n_target_nodes = man.get_nr_target_nodes_other_than_head(pattern)
    # exhaustive / selected embedding counts (None when files are absent)
    nr_emb = None
    sel_emb = None
    has_obd = True
    emb_stds = []  # stays empty when there is no monitoring data -> no std_ columns
    if os.path.exists(os.path.join(pattern_result, 'no_obdecomp.info')):
        has_obd = False
    # 'selected.info' lives next to (one level above) the results directory
    if os.path.exists(
            os.path.join(os.path.dirname(pattern_result), "selected.info")):
        sel_emb = extract_nr_embeddings_NS(
            os.path.join(os.path.dirname(pattern_result), "selected.info"))
    print "General path: ", general_path
    print os.path.join(
        general_path, 'exhaustive_approach',
        'results_' + general_path.split('/')[-1] +
        '.res'), "exists?", os.path.exists(
            os.path.join(general_path, 'exhaustive_approach',
                         'results_' + general_path.split('/')[-1] + '.res'))
    # pattern name = last non-empty component of general_path
    pattern_name = None
    print general_path.split('/')
    if general_path.split('/')[-1] == "":
        pattern_name = general_path.split('/')[-2]
    else:
        pattern_name = general_path.split('/')[-1]
    print pattern_name
    if os.path.exists(
            os.path.join(general_path, 'exhaustive_approach',
                         'results_' + pattern_name + '.res')):
        nr_emb, time, nr_obs = extract_nr_embeddings(
            os.path.join(general_path, 'exhaustive_approach',
                         'results_' + pattern_name + '.res'))
    # per-interval monitoring series for this experiment
    if os.path.exists(os.path.join(pattern_result, 'monitoring')):
        embeddings, emb_stds, klds = get_stat(
            os.path.join(pattern_result, 'monitoring'), experiment_name)
    else:
        # no monitoring data: pad the expected 120 observation slots
        embeddings = [None] * 120
        klds = [None] * 120
    print "EMBEDDINGS: ", embeddings
    unequal_size_warning = False  # NOTE(review): computed nowhere, column disabled below
    OBD = None
    if os.path.exists(
            os.path.join(general_path, 'results_furer', 'OBDDecomp.info')):
        OBD = getOBDecomp(
            os.path.join(general_path, 'results_furer', 'OBDDecomp.info'))
    nodes, edges = man.get_readable_text_format(pattern)
    print "PATTERN NAME: ", pattern_result
    row['pattern_name'] = pattern_result
    row['parent_id'] = parent_id
    row['nr_randvar_values'] = int(nr_randvar_values)
    row['nodes'] = nodes
    row['edges'] = edges
    row['has_cycles'] = cycles
    row['density'] = float(nx.density(pattern))
    row['shape'] = man.get_graph_shape(pattern)
    row['max_degree'] = float(max_degree)
    row['avg_deg'] = float(average_degree)
    row['nr_targets'] = n_target_nodes
    # NOTE(review): falsy checks -- a count of 0 (or None) is stored
    # unconverted instead of as float; confirm whether that is intended
    if sel_emb:
        row['sel_emb'] = float(sel_emb)
    else:
        row['sel_emb'] = sel_emb
    if nr_emb:
        row['exh_emb'] = float(nr_emb)
    else:
        row['exh_emb'] = nr_emb
    row['has_obd'] = has_obd
    #row['unequal_size_warn']=unequal_size_warning
    row['OBD'] = OBD
    print "Nr embeddingS: ", len(embeddings)
    # monitoring series spread over numbered columns (1-based)
    for i in xrange(0, len(embeddings)):
        row["emb_" + str(i + 1)] = embeddings[i]
    for i in xrange(0, len(emb_stds)):
        row["std_" + str(i + 1)] = emb_stds[i]
    for i in xrange(0, len(klds)):
        row["KLD_" + str(i + 1)] = klds[i]
    return row
links=0 edges_pairs=[] for user in edges_map.keys(): for user2 in edges_map[user]: #if not str(user_ids[user])+","+str(user_ids[user2]) in edges_pairs and not str(user_ids[user2])+","+str(user_ids[user]) in edges_pairs: print "edge: ",user_ids[user],user_ids[user2] G.add_edge(user_ids[user],user_ids[user2]) edges_pairs.append(str(user_ids[user])+","+str(user_ids[user2])) links+=1 print "nr nodes before pickling: ",len(G.nodes()) number_of_users=0 print "NUMBER OF USERS: ",number_of_users pickle.dump(G, open(FILE_NAME,'wb')) data=nx.read_gpickle(FILE_NAME) print "Nr nodes FACEBOOK: ",len(data.nodes()) print "Nr edges FACEBOOK: ",len(data.edges()) print "Max degree FACEBOOK: ",an.get_maximum_node_degree(data) print "Density FACEBOOK: ",nx.density(data) print "INFO FACEBOOK:",nx.info(data) print "Nr user links: ",links #print an.get_maximum_node_degree(graph) print "Number of nodes after pickling",len(data.nodes()) #for n in data.nodes(): # print data.node[n] # if data.node[n]['predicate']=='user': # number_of_users+=1 #vis.visualize_graph_standard(data)
print "course->faculty: ", course_faculty print "course->course: ", course_course print "----------------------------------" print "faculty->student: ", faculty_student print "faculty->staff: ", faculty_staff print "faculty->project: ", faculty_project print "faculty->department: ", faculty_department print "faculty->faculty: ", faculty_faculty print "faculty->course: ", faculty_course print "----------------------------------" print "faculty-> faculty: ", faculty_faculty print "project-> course: ", project_course data = nx.read_gml(FILE_NAME) print "Number of directed links: ", number_of_links print "Nr nodes WEBKB: ", len(data.nodes()) print "Nr edges WEBKB: ", len(data.edges()) print "Max degree WEBKB: ", an.get_maximum_node_degree(data) print "Density WEBKB: ", nx.density(data) print "INFO WEBKB:", nx.info(data) #print an.get_maximum_node_degree(graph) number_of_pages = 0 for node in data.nodes(): if data.node[node]['predicate'] == 'page': number_of_pages += 1 print "NUMBER OF PAGES: ", number_of_pages #vis.visualize_graph_standard(data)