def induced_subgraph(G, H):
    """Search G for an induced subgraph that is isomorphic to H.

    Every vertex subset produced by ``create_permutations(n, k)`` (with
    ``n = len(G)`` and ``k = len(H)``) is turned into an induced subgraph
    of G and compared against H; the search stops at the first match.

    Parameters:
        G: the graph to search (networkx)
        H: the pattern graph (networkx)

    Returns:
        The first induced subgraph of G that is isomorphic to H, or None
        when G has fewer vertices than H or no candidate matches.

    Note:
        Not solved in polynomial time (only use for small cases).
    """
    host_size, pattern_size = len(G), len(H)
    if host_size < pattern_size:
        return None

    for vertex_subset in create_permutations(host_size, pattern_size):
        candidate = G.subgraph(vertex_subset)
        # Quick screen first; the exact test only runs on survivors.
        if not nx.faster_could_be_isomorphic(candidate, H):
            continue
        if nx.is_isomorphic(candidate, H):
            return candidate  # only want to find one
    return None
def compare_trees(T1, T2):
    """Return True if the two trees are IDENTICAL, and False otherwise.

    Isomorphism is necessary but not sufficient. Two trees are identical
    iff they have:
        1) the same set of nodes
        2) the same set of branches (edges)
        3) the same branch params (node params are not compared)

    The branch params are the edge attribute dictionaries, which are
    compared with plain ``==``.

    Raises:
        TypeError: if either argument is not a NetworkX DiGraph.
    """
    if not (isinstance(T1, nx.DiGraph) and isinstance(T2, nx.DiGraph)):
        raise TypeError("This function only accepts NetworkX DiGraphs.")

    # Quick rejection before any set building.
    if not nx.faster_could_be_isomorphic(T1, T2):
        return False

    # Check 1: node sets must match.
    if set(T1) != set(T2):
        return False

    # Check 2: branch sets must match.
    branches_1 = set(T1.edges())
    branches_2 = set(T2.edges())
    if branches_1 != branches_2:
        return False

    # Check 3: every shared branch must carry equal parameters (final test).
    for up, down in branches_1:
        if not (T1.edges[up, down] == T2.edges[up, down]):
            return False
    return True
def iso_json(string1, string2):
    """Decode two node-link JSON strings and run the quick isomorphism screen.

    Each string is parsed with ``json.loads`` and converted to a graph with
    ``json_graph.node_link_graph``.

    NOTE: the result comes from ``nx.faster_could_be_isomorphic`` only; the
    exact ``nx.is_isomorphic`` check is deliberately not used here.
    """
    graphs = [
        json_graph.node_link_graph(json.loads(payload))
        for payload in (string1, string2)
    ]
    # return nx.is_isomorphic(*graphs)
    return nx.faster_could_be_isomorphic(*graphs)
def check_isomorphy(letters, rotors, notch_distance=None):
    """Check that consecutive notch placements give could-be-isomorphic graphs.

    One directed graph is built per notch placement from
    ``make_cycle_graph(notches, letters)``; each graph is compared with the
    previous one using ``nx.faster_could_be_isomorphic``. Progress is
    printed as the check runs.

    :param letters: number of letters; placements are drawn from
        ``range(letters)``
    :param rotors: number of positions chosen per placement
    :param notch_distance: optional per-rotor lists of offsets; when given,
        each notch ``p`` of rotor ``r`` is expanded to
        ``[(p + d) % letters for d in [0] + notch_distance[r]]``
    :return: True if no consecutive pair failed the check (also when there
        are fewer than two placements), otherwise False
    """
    # Materialise the placements up front: the progress percentage needs
    # len(), which a lazy permutations iterator does not support.
    all_notches = list(permutations(list(range(letters)), rotors))

    # update with more notches
    if notch_distance is not None:
        all_notches = [
            [[(notch + d) % letters for d in [0] + notch_distance[r]]
             for r, notch in enumerate(notches)]
            for notches in all_notches
        ]

    total = len(all_notches)
    pG = None  # graph of the previous placement
    trigger = True
    for n, notches in enumerate(all_notches):
        print("\rcheck_multiple_notch_isomorphy(5,3,2)rG%d: notching " % n, notches, end='')
        G = nx.DiGraph()
        G.add_edges_from(make_cycle_graph(notches, letters))
        if n > 0:
            # total >= 2 whenever n > 0, so the divisor is never zero.
            print(" --- comparing graph %d and %d [%3d%%]" % (n - 1, n, round(100 * n / (total - 1))), end='')
            if not nx.faster_could_be_isomorphic(G, pG):
                print("non-isomorphic graphs found!")
                trigger = False
        pG = G
    return trigger
def automorphism_groups2(g, graphs):
    """Group node sets of ``g`` by the class of their induced subgraphs.

    Each entry of ``graphs`` is a collection of nodes of ``g``. The induced
    subgraph of every entry is compared with the first member of each
    existing group; a match appends the entry to that group, otherwise the
    entry starts a new group.

    For the degree sequences ``[1, 2, 2, 2, 3]`` and ``[2, 2, 2, 3, 3]``
    the exact ``nx.is_isomorphic`` test is used; for all other degree
    sequences the quick ``nx.faster_could_be_isomorphic`` screen is used.

    :param g: the host graph (networkx)
    :param graphs: sequence of node collections
    :return: list of groups, each a list of entries from ``graphs``;
        an empty list when ``graphs`` is empty
    """
    print("len(graphs)", len(graphs))
    if not graphs:
        # Nothing to group (indexing graphs[0] below would raise).
        return []

    # Degree sequences that require the exact isomorphism test.
    exact_test_sequences = ([1, 2, 2, 2, 3], [2, 2, 2, 3, 3])

    unique_graphs = [[graphs[0]]]
    for i in range(1, len(graphs)):
        nodes = graphs[i]
        # The candidate subgraph and its degree sequence do not depend on
        # the group being compared, so build them once per candidate.
        g1 = nx.subgraph(g, nodes)
        degree_sequence = sorted(d for _, d in g1.degree())
        use_exact_test = degree_sequence in exact_test_sequences

        add = True
        for group in unique_graphs:
            g2 = nx.subgraph(g, group[0])
            if use_exact_test:
                # use isomorphism
                matched = nx.is_isomorphic(g1, g2)
            else:
                # use degree sequence
                matched = nx.faster_could_be_isomorphic(g1, g2)
            if matched:
                add = False
                group.append(nodes)
        if add:
            unique_graphs.append([nodes])
    return unique_graphs
def isomorphism(pattern, target, nx_structures):
    """
    Test two molecules for isomorphism with the NetworkX graph matcher.

    Graphs missing from ``nx_structures`` are created with
    ``create_nx_graph`` and stored there. ``faster_could_be_isomorphic`` is
    used to discount pairs that cannot be isomorphic before the matcher,
    which compares vertex 'label' and edge 'type' attributes, is run.

    :param pattern: a molecule object which is to be tested for isomorphism
    :param target: a molecule object which pattern graph is to be compared against
    :param nx_structures: mapping from molecule to its NetworkX graph; updated in place
    :return: None if the graphs are not isomorphic
    :return: a dictionary which maps the indices of the two NetworkX graphs together if they are isomorphic
    """
    # Build and cache any graph that has not been converted yet.
    for molecule in (pattern, target):
        if molecule not in nx_structures:
            nx_structures[molecule] = create_nx_graph(molecule, nx_structures)

    pattern_graph = nx_structures[pattern]
    target_graph = nx_structures[target]

    if not nx.faster_could_be_isomorphic(pattern_graph, target_graph):
        # Graphs are definitely not isomorphic
        return None

    # Ensures the isomorphism considers the vertex label and edge type
    matcher = iso.GraphMatcher(
        pattern_graph,
        target_graph,
        node_match=iso.categorical_node_match('label', 'C'),
        edge_match=iso.categorical_edge_match('type', 'single'))
    return matcher.mapping if matcher.is_isomorphic() else None
def count_motifs(network, motifs):
    """
    Returns a dictionary of motif counts, keyed on names and valued on
    occurrences. [string, int]

    For every node group returned by ``generate_triads_2(network)`` the
    induced subgraph is screened against each motif with
    ``nx.faster_could_be_isomorphic``. If exactly one motif survives the
    screen it is counted directly; otherwise the survivors are tested with
    ``nx.is_isomorphic`` and the first exact match is counted.

    :param network: the graph whose triads are counted (networkx)
    :param motifs: mapping from motif name to motif graph
    :return: dict mapping every motif name to its count
    """
    m_count = {motif_name: 0 for motif_name in motifs}
    sub_graphs = generate_triads_2(network)

    count = 0
    if LOG:
        print("\t\t Counting triads")

    for nodes in sub_graphs:
        sub_graph = network.subgraph(nodes)
        possibles = []
        for motif_name in motifs:
            count += 1
            if LOG and count % 5000 == 0:
                # Fraction of (triad, motif) comparisons done so far.
                print("\t\t\t" + str(count / float(len(sub_graphs) * len(motifs))))
            if nx.faster_could_be_isomorphic(sub_graph, motifs[motif_name]):
                possibles.append(motif_name)

        # Only one possibility? Add it
        if len(possibles) == 1:
            m_count[possibles[0]] += 1
        else:
            for possible_name in possibles:
                if nx.is_isomorphic(sub_graph, motifs[possible_name]):
                    m_count[possible_name] += 1
                    break
    return m_count
def not_isomorphic(graph_a, graph_b):
    """
    Run the quick NetworkX isomorphism screen on two wrapped graphs.

    NOTE(review): despite the function name, the value returned is the
    un-negated result of ``nx.faster_could_be_isomorphic`` -- confirm
    against the callers whether a negation is expected.

    :param graph_a: object exposing its NetworkX graph as ``.g``
    :param graph_b: object exposing its NetworkX graph as ``.g``
    :return: the result of ``nx.faster_could_be_isomorphic`` for the two graphs
    """
    left, right = graph_a.g, graph_b.g
    return nx.faster_could_be_isomorphic(left, right)
def check_all_pairs(L):
    """Cross-check ``are_trees_isomorphic`` against NetworkX on every pair in L.

    For each unordered pair the NetworkX answer (quick screen followed by
    the exact test) and the answer of ``are_trees_isomorphic`` are timed
    separately and asserted to be equal. A progress report is printed every
    1000 pairs and a summary at the end.

    :param L: sequence of trees
    :return: True (a disagreement raises AssertionError instead)
    """
    n_trees = len(L)
    n_pairs = (n_trees * (n_trees - 1)) / 2

    nx_time = 0
    my_time = 0
    true_tests = 0
    false_tests = 0
    proc_pairs = 0

    for i in range(n_trees):
        for j in range(i + 1, n_trees):
            first, second = L[i], L[j]
            proc_pairs += 1

            # Reference answer from NetworkX.
            started = time.time()
            if nx.faster_could_be_isomorphic(first, second):
                nx_iso = nx.is_isomorphic(first, second)
            else:
                nx_iso = False
            nx_time += time.time() - started

            # Answer from the implementation under test.
            started = time.time()
            my_iso = are_trees_isomorphic(first, second)
            my_time += time.time() - started

            assert (my_iso == nx_iso)

            if my_iso:
                true_tests += 1
            else:
                false_tests += 1

            if proc_pairs % 1000 != 0:
                continue
            print("Processed: %.3f" % (proc_pairs / n_pairs * 100), "%")
            print(" Time spent so far:")
            print(" NetworkX: %.3f seconds" % nx_time)
            print(" My test: %.3f seconds" % my_time)
            print(" Amount of tests where trees were")
            print(" Isomorphic:", true_tests)
            print(" Non-Isomorphic:", false_tests)
            print("")

    print("--------------------------")
    print("")
    print("All tests agreed")
    print(" Tests returning true:", true_tests)
    print(" Tests returning false:", false_tests)
    print(" Networkx takes: ", nx_time, "s")
    print(" My function takes:", my_time, "s")
    print("")
    print("--------------------------")
    print("")
    return True
def add_to_hist_by_subgraph(subg, motifsHist, motifs_veriations):
    """Credit every node of ``subg`` with the motif variation it matches.

    Nothing happens when ``subg`` has a different number of nodes than the
    first variation. Otherwise the variations are tried in order; for the
    first one that is isomorphic to ``subg``, ``motifsHist[v][i]`` is
    incremented for every node ``v`` of ``subg`` (``i`` being the index of
    that variation) and the function returns.

    :param subg: the subgraph to classify (networkx)
    :param motifsHist: per-node histogram, indexed as ``motifsHist[node][i]``; updated in place
    :param motifs_veriations: sequence of motif variation graphs
    """
    if len(subg.nodes()) != len(motifs_veriations[0].nodes()):
        # print 'error'
        return

    for index, variation in enumerate(motifs_veriations):
        # Quick screen first, exact test only when it passes.
        if not nx.faster_could_be_isomorphic(subg, variation):
            continue
        if not nx.is_isomorphic(subg, variation):
            continue
        for v in subg:
            motifsHist[v][index] = motifsHist[v][index] + 1
        return
def isomorphic_check(prules, name):
    """Merge production rules with equal lhs and matching rhs, then save them.

    ``prules`` is loaded into a DataFrame with the columns
    ``rnbr, lhs, rhs, pr``. Rows are scanned in order: a row whose ``lhs``
    has been seen before is compared (via ``rhs_tomultigraph`` and
    ``nx.faster_could_be_isomorphic``) with the FIRST rule recorded for that
    ``lhs``. On a match, the row is recorded as a duplicate of that first
    rule. Afterwards the ``pr`` values of the duplicates are added to their
    representative, the duplicate rows are dropped, and the frame is written
    to ``./ProdRules/<name>_prules.bz2`` (tab separated, no header, no index).

    NOTE(review): this block uses Python 2 ``print`` statements and
    ``DataFrame.set_value``, which is not available in current pandas
    releases -- confirm the target interpreter/pandas version.

    :param prules: rule records accepted by ``pd.DataFrame`` (four columns)
    :param name: base name used for the output file
    """
    print '-' * 20
    print 'Isomorphic rules check (within file)'
    # for f in files:
    # df1 = pd.read_csv(f, index_col=0, compression='bz2', dtype=dtyps)
    df1 = pd.DataFrame(prules)
    df1.columns = ['rnbr', 'lhs', 'rhs', 'pr']
    # Trailing comma: the reduced shape printed near the end continues this line.
    print '... rules', df1.shape, 'reduced to',
    # lhs -> rule numbers kept for that lhs (the first one is the representative)
    seen_rules = defaultdict(list)
    # representative rule number -> rule numbers whose pr must be folded into it
    ruleprob2sum = defaultdict(list)
    cnrules = []
    cntr = 0
    for r in df1.iterrows():
        if DBG: print r[1]['rnbr'],
        if r[1]['lhs'] not in seen_rules.keys():
            seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
            cnrules.append(r[1]['rnbr'])
            if DBG: print "+"
            cntr += 1
        else:  # lhs already seen
            # print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
            # check the current rhs if the lhs matches to something already seen and check for an isomorphic match
            # rhs1 = listify_rhs(r[1]['rhs'])
            rhs1 = r[1]['rhs']
            # Only the first rule recorded for this lhs is used for comparison.
            rhs2 = df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values[0]
            G1 = rhs_tomultigraph(rhs1)
            G2 = rhs_tomultigraph(rhs2)
            # if nx.is_isomorphic(G1, G2, edge_match=label_match):
            if nx.faster_could_be_isomorphic(G1, G2):
                # print ' ',r[1]['rnbr'], r[1]['rhs'], '::', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values
                if DBG: print ' <-curr', seen_rules[r[1]['lhs']][0], ':', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]][
                    'rnbr'].values
                ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
            else:
                seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
                cnrules.append(r[1]['rnbr'])
                if DBG: print "+"
                cntr += 1
    for k in ruleprob2sum.keys():
        if DBG: print k
        if DBG: print " ", ruleprob2sum[k]
        if DBG: print " ", df1[df1['rnbr'] == k]['pr'].values+ sum(df1[df1['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
        # df1[df1['rnbr'] == k]['pr'] += sum(df1[df1['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
        # New probability of the representative: its own pr plus the pr of each duplicate.
        c_val = df1[df1['rnbr'] == k]['pr'].values + sum(df1[df1['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
        df1.set_value(df1[df1['rnbr'] == k].index, 'pr', c_val)
        # Drop the duplicate rows now that their pr has been folded in.
        for r in ruleprob2sum[k]:
            df1 = df1[df1.rnbr != r]
    print df1.shape
    # cnrules contains the rules we need to reduce df1 by
    # and ruleprob2sum will give us the new key for which pr will change.
    df1.to_csv("./ProdRules/"+name+"_prules.bz2",sep="\t", header=False, index=False, compression="bz2")
def isIsomorphicDuplicate(hcL, hc):
    """Check whether hc is isomorphic to any of the graphs saved in hcL.

    Each saved graph is first screened with the quick could-be-isomorphic
    test; the exact isomorphism test only runs when the screen passes.

    Returns True as soon as one isomorphic graph is found in hcL.
    Returns False if none is found (including when hcL is empty).
    """
    return any(
        nx.faster_could_be_isomorphic(saved_hc, hc)
        and nx.is_isomorphic(saved_hc, hc)
        for saved_hc in hcL
    )
def is_isomorph_nx(graph1, graph2):
    """
    graph1, graph2: networkx graphs whose isomorphism is checked

    Node 'label' attributes (default 'C') and edge 'weight'/'label'
    attributes (defaults 1 and '-') are taken into account by the exact
    test, which only runs when the quick screen passes.

    return: True if the graphs are isomorphic, otherwise False
    """
    if not nx.faster_could_be_isomorphic(graph1, graph2):
        return False

    return iso.is_isomorphic(
        graph1,
        graph2,
        node_match=iso.categorical_node_match('label', 'C'),
        edge_match=iso.categorical_edge_match(['weight', 'label'], [1, '-']),
    )
def getIsomorphs(self, subset=None):
    """Get all proteins whose supernetwork is isomorphic to this one.

    The candidates come from ``subset`` when given, otherwise from
    ``self.database.extractAllSuperNetworks(pdbref=self.pdbref)``. Each
    candidate is either a dict (with 'data' and 'pdbref' keys) or a
    SuperNetwork / SuperNetworkNullModel instance.

    Returns a list of identifiers: the 'pdbref' for dicts; for supernetwork
    objects the ``pdbref``, suffixed with ``_<chainref>`` when a chainref
    is set.

    Raises ValueError when there are no candidates and TypeError for a
    candidate of any other type.
    """

    def build_graph(edges):
        # One weighted edge per (i, j, weight) triple.
        graph = nx.Graph()
        for i, j, weight in edges:
            graph.add_edge(i, j, weight=weight)
        return graph

    # Generate the NetworkX graph for the supernetwork
    G = build_graph(self.data)

    def matches(other):
        return nx.faster_could_be_isomorphic(G, other) and nx.is_isomorphic(
            G, other)

    # Get a cursor for all supernetworks in the database
    if subset is None:
        proteins = self.database.extractAllSuperNetworks(
            pdbref=self.pdbref)
        if proteins.count() == 0:
            raise ValueError("no protein supernetworks in database!")
    else:
        proteins = subset
        if len(proteins) == 0:
            raise ValueError("no protein supernetworks in subset ")

    isomorphs = []
    for protein in proteins:
        kind = type(protein)
        if kind is dict:
            if matches(build_graph(protein['data'])):
                isomorphs.append(protein['pdbref'])
        elif kind is SuperNetwork or kind is SuperNetworkNullModel:
            if matches(build_graph(protein.data)):
                isomorphicProtein = protein.pdbref
                if protein.chainref is not None:
                    isomorphicProtein += "_{}".format(protein.chainref)
                isomorphs.append(isomorphicProtein)
        else:
            raise TypeError("either a dict or a supernetwork must be provided!")
    return isomorphs
def __eq__(self, g2):
    """Two objects are equal when their flags match and their graphs are isomorphic.

    Inexpensive attribute and size comparisons are tried first, then the
    quick could-be-isomorphic screen, and finally the exact isomorphism
    test using ``self._node_match_function`` to compare nodes.
    """
    # Scalar attributes first.
    if self.isFinal != g2.isFinal:
        return False
    if self.maxLevel != g2.maxLevel:
        return False

    # Graph sizes next.
    if self.graph.number_of_nodes() != g2.graph.number_of_nodes():
        return False
    if self.graph.number_of_edges() != g2.graph.number_of_edges():
        return False

    # Counts of initial / final nodes.
    if utilGraph.numInitials(self.graph) != utilGraph.numInitials(g2.graph):
        return False
    if utilGraph.numFinals(self.graph) != utilGraph.numFinals(g2.graph):
        return False

    if not nx.faster_could_be_isomorphic(self.graph, g2.graph):
        return False

    return nx.is_isomorphic(self.graph, g2.graph,
                            node_match=self._node_match_function)
def faster_check(blueprint, physical):
    """Run the quick isomorphism screen on two graphs stored as edge files.

    Each file is read line by line; every line is split on whitespace and
    converted to a tuple of ints. All lines except the last one are added
    as edges of an undirected graph.

    :param blueprint: path of the first edge file
    :param physical: path of the second edge file
    :return: result of ``nx.faster_could_be_isomorphic`` for the two graphs
    """

    def load_graph(path):
        graph = nx.Graph()
        with open(path) as handle:
            rows = [tuple(int(token) for token in line.split())
                    for line in handle]
        # The last line of the file is not treated as an edge.
        graph.add_edges_from(rows[:-1])
        return graph

    g_blue = load_graph(blueprint)
    g_phy = load_graph(physical)
    return nx.faster_could_be_isomorphic(g_blue, g_phy)
def jacc_dist_for_pair_dfrms(df1, df2):
    """Score the rule overlap between two rule DataFrames.

    The two frames are concatenated and scanned row by row. A row whose
    ``lhs`` was seen before is compared (via ``rhs_tomultigraph`` and
    ``nx.faster_could_be_isomorphic``) with the FIRST rule recorded for that
    ``lhs``; on a match it is recorded under that rule in ``ruleprob2sum``.

    returns: (number of keys in ruleprob2sum + number of rules recorded
    under them) / (len(df1) + len(df2)). Division by zero occurs when both
    frames are empty.

    NOTE(review): uses Python 2 ``print`` statements; the summary line is
    printed when DBG is false -- confirm that is intended.
    """
    slen = len(df1)  # not used below
    tlen = len(df2)  # not used below
    # +++
    conc_df = pd.concat([df1, df2])
    # ---
    # lhs -> rule numbers kept for that lhs (the first one is the representative)
    seen_rules = defaultdict(list)
    # representative rule number -> rule numbers that matched it
    ruleprob2sum = defaultdict(list)
    cnrules = []
    cntr = 0
    for r in conc_df.iterrows():
        if DBG: print r[1]['rnbr'],
        if r[1]['lhs'] not in seen_rules.keys():
            seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
            cnrules.append(r[1]['rnbr'])
            if DBG: print "+"
            cntr += 1
        else:  # lhs already seen
            # print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
            # check the current rhs if the lhs matches to something already seen and check for an isomorphic match
            # rhs1 = listify_rhs(r[1]['rhs'])
            rhs1 = r[1]['rhs']
            # Only the first rule recorded for this lhs is used for comparison.
            rhs2 = conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values[0]
            G1 = rhs_tomultigraph(rhs1)
            G2 = rhs_tomultigraph(rhs2)
            if nx.faster_could_be_isomorphic(G1, G2):
                # print ' ',r[1]['rnbr'], r[1]['rhs'], '::', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values
                if DBG: print ' <-curr', seen_rules[r[1]['lhs']][0], ':', conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rnbr'].values, conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['cate'].values
                ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
            else:
                seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
                cnrules.append(r[1]['rnbr'])
                if DBG: print "+"
                cntr += 1
    if DBG: print "len(ruleprob2sum)", len(ruleprob2sum)
    if DBG: print dumps(ruleprob2sum, indent=4, sort_keys=True)
    if not DBG: print "Overlapping rules ", len(ruleprob2sum.keys()), sum([len(x) for x in ruleprob2sum.values()])
    if DBG: print "Jaccard Sim:\t", (len(ruleprob2sum.keys())+sum([len(x) for x in ruleprob2sum.values()]))/ float(len(df1) + len(df2))
    # Representatives plus their matches, relative to the total number of rows.
    return (len(ruleprob2sum.keys())+sum([len(x) for x in ruleprob2sum.values()]))/ float(len(df1) + len(df2))
def get_mapper(gml):
    """Map the node labels of a GraphML file to consecutive integers.

    The graph is read from ``gml`` and relabelled with
    ``nx.convert_node_labels_to_integers``. If ``cat_hier_int.txt`` does not
    yet exist next to ``gml``, the relabelled graph is written there as an
    edge list and the file is rewritten with the ``{'weight': 1}`` and
    ``{}`` attribute dictionaries removed from every line.

    :param gml: path of the GraphML file
    :return: dict mapping each original node label to its position in the
        node iteration order of the graph that was read
    :raises AssertionError: if the node counts differ after relabelling
    """
    # os.path.join keeps the path relative when gml has no directory part
    # (string formatting with an empty dirname pointed at the root).
    int_gpath = os.path.join(os.path.dirname(gml), "cat_hier_int.txt")

    read_g = nx.read_graphml(gml)
    str_nodes = list(read_g.nodes())
    int_g = nx.convert_node_labels_to_integers(read_g)
    assert len(str_nodes) == len(int_g), "str to int conversion incorrect"

    flag = nx.faster_could_be_isomorphic(read_g, int_g)
    logging.info("Isomorphic check: {}".format(flag))

    mapper = {node: i for i, node in enumerate(str_nodes)}

    if not os.path.isfile(int_gpath):
        nx.write_edgelist(int_g, int_gpath)
        with open(int_gpath, "r") as fmain:
            reader = fmain.readlines()

        cleaned = []
        for raw in reader:
            # Chain both replacements on the same string; restarting from
            # the raw line for the second one would undo the first.
            line = raw.strip().replace("{'weight': 1}", "").replace("{}", "")
            cleaned.append("{}\n".format(line))

        with open(int_gpath, "w") as fmain:
            fmain.write("".join(cleaned))
    return mapper
def _nx_isomorphism(self, pattern, target):
    """
    Test two molecules for isomorphism with the NetworkX graph matcher.

    Graphs missing from ``self.structure_nx`` are created first through
    ``self._create_nx_graph``. ``faster_could_be_isomorphic`` is used to
    discount pairs that cannot be isomorphic before the matcher, which
    compares vertex 'label' and edge 'type' attributes, is run.

    :param pattern: a molecule object which is to be tested for isomorphism
    :param target: a molecule object which pattern graph is to be compared against
    :return: None if the graphs are not isomorphic
    :return: a dictionary which maps the indices of the two NetworkX graphs together if they are isomorphic
    """
    # Make sure both molecules have a graph available.
    for molecule in (pattern, target):
        if molecule not in self.structure_nx:
            self._create_nx_graph(molecule)

    pattern_graph = self.structure_nx[pattern]
    target_graph = self.structure_nx[target]

    if not nx.faster_could_be_isomorphic(pattern_graph, target_graph):
        # Graphs are definitely not isomorphic
        return None

    # Ensures the isomorphism considers the vertex label and edge type
    matcher = iso.GraphMatcher(
        pattern_graph,
        target_graph,
        node_match=iso.categorical_node_match('label', 'C'),
        edge_match=iso.categorical_edge_match('type', 'single'))
    return matcher.mapping if matcher.is_isomorphic() else None
def getIsomorphicGroups(self, gs):
    """Group graph indices that pass the quick isomorphism screen together.

    Indices are visited in order. An index that was already placed in an
    earlier group is skipped; every other index ``i`` is compared with all
    later indices ``j`` using ``nx.faster_could_be_isomorphic``, and each
    passing ``j`` joins the group of ``i``.

    The mapping is handed to ``self.updateNonIsomorphs(groups, gs)`` before
    it is returned.

    :param gs: sequence of graphs (networkx)
    :return: defaultdict(list) mapping the first index of each group to the
        list of member indices (the first index included); an index with no
        partner gets no entry from this method
    """
    taken = {}  # member index -> index of the group it was placed in
    groups = defaultdict(list)

    for i in range(len(gs)):
        if i in taken:
            continue
        for j in range(i + 1, len(gs)):
            if nx.faster_could_be_isomorphic(gs[i], gs[j]):
                # Membership test (not indexing) so that an empty entry is
                # never created as a side effect of the check.
                if i not in groups:
                    groups[i].append(i)
                groups[i].append(j)
                taken[j] = i

    self.updateNonIsomorphs(groups, gs)
    return groups
def is_isomorphic(g1, g2):
    """Return whether g1 and g2 are isomorphic.

    The quick could-be-isomorphic screen runs first; the exact test is
    only evaluated when the screen passes.
    """
    # TODO node_match : callable
    if not nx.faster_could_be_isomorphic(g1, g2):
        return False
    return nx.is_isomorphic(g1, g2)
def test_faster_could_be_isomorphic(self):
    """The fixtures G3 and G2 must pass the quick could-be-isomorphic screen."""
    could_match = nx.faster_could_be_isomorphic(self.G3, self.G2)
    assert_true(could_match)
def jaccard_coeff_isomorphic_rules_check(dfrm, headers_d):
    """Fill a square matrix with pairwise scores between rule categories.

    The columns of ``dfrm`` are renamed to ``rnbr, lhs, rhs, pr, cate`` (in
    place) and the rows are grouped by ``cate``. For every pair of category
    names (in sorted order) the score returned by
    ``jaccard_coeff_isomorphic_rules_check_forfilepair(pair, dfrm)`` is
    stored at ``[i, j]``, where ``j`` / ``i`` are the ``headers_d`` indices
    of the first / second name. The lookup key of a name is the last
    ``_``-separated token of the text before ``"_dimacs"``.

    The matrix is printed before it is returned. Code that used to follow
    the return statement could never run and has been removed.

    :param dfrm: DataFrame with five columns; modified in place
    :param headers_d: mapping from category key to matrix index
    :return: ``len(headers_d)`` x ``len(headers_d)`` numpy array, or None
        when ``dfrm`` is empty
    """
    if dfrm.empty:
        return

    dfrm.columns = ['rnbr', 'lhs', 'rhs', 'pr', 'cate']
    gb = dfrm.groupby(['cate']).groups
    if DBG:
        print(gb.keys())

    sqr_mtrx = np.zeros(shape=(len(headers_d), len(headers_d)))
    for p in combinations(sorted(gb.keys()), 2):
        if DBG:
            # Single formatted argument so the output is one line under
            # both Python 2 and Python 3.
            print("%s %s" % ([x.split("_")[1] for x in p],
                             [headers_d[x.split("_")[1]] for x in p]))
        j = headers_d[p[0].split("_dimacs")[0].split("_")[-1]]
        i = headers_d[p[1].split("_dimacs")[0].split("_")[-1]]
        sqr_mtrx[i, j] = jaccard_coeff_isomorphic_rules_check_forfilepair(p, dfrm)

    Log_Info()
    print(sqr_mtrx)
    return sqr_mtrx
def jacc_dist_for_pair_dfrms(df1, df2):
    """
    df1 and df2 are each a dataframe (sets) to use for comparison

    The two frames are concatenated and scanned row by row. A row whose
    ``lhs`` was seen before is compared (via ``rhs_tomultigraph`` and
    ``nx.faster_could_be_isomorphic``) with the FIRST rule recorded for that
    ``lhs``; on a match it is recorded under that rule in ``ruleprob2sum``.

    returns: jaccard similarity score, computed as (number of keys in
    ruleprob2sum + number of rules recorded under them) /
    (len(df1) + len(df2)). Division by zero occurs when both frames are
    empty.

    NOTE(review): uses Python 2 ``print`` statements and ``.keys()[0]``
    indexing -- confirm the target interpreter.
    """
    slen = len(df1)  # not used below
    tlen = len(df2)  # not used below
    # +++
    conc_df = pd.concat([df1, df2])
    # print ">>>", conc_df.shape
    # ---
    # lhs -> rule numbers recorded for that lhs (the first one is the representative)
    seen_rules = defaultdict(list)
    # representative rule number -> rule numbers that matched it
    ruleprob2sum = defaultdict(list)
    cnrules = []
    cntr = 0
    # DBG = True
    for r in conc_df.iterrows():  # /* for each rule in the stack */
        if r[1]['lhs'] not in seen_rules.keys():
            # print r[1]['rnbr'],
            seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
            cnrules.append(r[1]['rnbr'])
            if DBG: print "+"
            cntr += 1
        else:  # lhs already seen
            # print r[1]['rnbr'],
            # print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
            # check the current rhs if the lhs matches to something already seen and check for an isomorphic match
            # rhs1 = listify_rhs(r[1]['rhs'])
            rhs1 = r[1]['rhs']
            # Only the first rule recorded for this lhs is used for comparison.
            rhs2 = conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']]
                           [0]]['rhs'].values[0]
            # rhs2 = conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs']
            G1 = rhs_tomultigraph(rhs1)
            G2 = rhs_tomultigraph(rhs2)
            # for rl in rhs2.values:
            # G2 = rhs_tomultigraph(rl)
            #
            # if nx.is_isomorphic(G1, G2, edge_match=label_match):
            if nx.faster_could_be_isomorphic(G1, G2):
                if DBG: print ' <-curr', seen_rules[r[1]['lhs']][0], ':', \
                    conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rnbr'].values, \
                    conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['cate'].values
                ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
                # The matching rule is also recorded under its lhs.
                seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
            else:
                seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
                cnrules.append(r[1]['rnbr'])
                if DBG: print "+"
                cntr += 1
    if DBG: print "len(ruleprob2sum)", len(ruleprob2sum)
    from json import dumps
    if DBG: print dumps(ruleprob2sum, indent=4, sort_keys=True)
    # print ruleprob2sum
    if DBG: print " Overlapping rules ", len(ruleprob2sum.keys()), sum(
        [len(x) for x in ruleprob2sum.values()])
    if DBG: print " Jaccard Sim:\t", (len(ruleprob2sum.keys()) + sum(
        [len(x) for x in ruleprob2sum.values()])) / float(len(df1) + len(df2))
    # Prints the first category name of each frame; the trailing comma keeps
    # the output on the same line.
    # NOTE(review): rstrip('_prules') strips any trailing run of those
    # characters rather than the literal suffix (df1 uses split instead) --
    # confirm which is intended.
    print df1.groupby(['cate'
                       ]).groups.keys()[0].split('_prules')[0], df2.groupby(
                           ['cate']).groups.keys()[0].rstrip('_prules'),
    # Representatives plus their matches, relative to the total number of rows.
    return (len(ruleprob2sum.keys()) + sum(
        [len(x) for x in ruleprob2sum.values()])) / float(len(df1) + len(df2))