def induced_subgraph(G, H):
    """
    Search G for an induced subgraph that is isomorphic to H.

    Parameters:
        G: the graph to search (networkx)
        H: the pattern graph to look for (networkx)
    Returns:
        The first matching subgraph of G (as produced by ``G.subgraph``),
        or None when G has fewer nodes than H or nothing matches.
    Method:
        Each node selection yielded by ``create_permutations(len(G), len(H))``
        is turned into a subgraph of G. The cheap
        ``nx.faster_could_be_isomorphic`` filter runs first; only candidates
        that survive it go through the exact ``nx.is_isomorphic`` test.
    Note:
        Not solved in polynomial time (only use for small cases).
    """
    g_size = len(G)
    h_size = len(H)
    if g_size < h_size:
        return None

    for nodes in create_permutations(g_size, h_size):
        candidate = G.subgraph(nodes)
        if not nx.faster_could_be_isomorphic(candidate, H):
            continue
        if nx.is_isomorphic(candidate, H):
            # One match is all that is wanted, so stop at the first hit.
            return candidate
    return None
Exemple #2
0
    def compare_trees(T1, T2):
        """
        Tell whether two trees are IDENTICAL (isomorphism alone is not enough).

        Two trees count as identical when they have:
            1) the same set of nodes
            2) the same set of branches (edges)
            3) the same attribute dict on every branch
        Node attributes are not compared.

        Raises TypeError when either argument is not a NetworkX DiGraph.
        Returns True for identical trees and False otherwise.

        Branch attribute dicts are compared with ``==``, which is how plain
        dictionaries are tested for equality.
        """
        for tree in (T1, T2):
            if not isinstance(tree, nx.DiGraph):
                raise TypeError("This function only accepts NetworkX DiGraphs.")

        # Cheap rejection before the set comparisons below.
        if not nx.faster_could_be_isomorphic(T1, T2):
            return False

        # Check 1: node sets
        if set(T1) != set(T2):
            return False

        # Check 2: branch sets
        branches = set(T1.edges())
        if branches != set(T2.edges()):
            return False

        # Check 3: branch parameters (every branch is inspected)
        mismatched = [
            branch for branch in branches
            if T1.edges[branch] != T2.edges[branch]
        ]
        return not mismatched
 def iso_json(string1,string2):
     """
     Decode two node-link JSON strings into graphs and run the quick
     isomorphism pre-check on them.

     Returns the result of ``nx.faster_could_be_isomorphic``; a True result
     is therefore not a proof of isomorphism (the exact ``nx.is_isomorphic``
     call is intentionally not used here).
     """
     decoded = [
         json_graph.node_link_graph(json.loads(text))
         for text in (string1, string2)
     ]
     first_graph, second_graph = decoded
     return nx.faster_could_be_isomorphic(first_graph, second_graph)
Exemple #4
0
def check_isomorphy(letters, rotors, notch_distance=None):
    """
    Check that the graphs built from every notch arrangement could all be
    isomorphic to one another.

    For each arrangement a DiGraph is built from
    ``make_cycle_graph(notches, letters)`` and compared with the graph of the
    previous arrangement using ``nx.faster_could_be_isomorphic``.

    Parameters:
        letters: size of the alphabet; notch positions are drawn from
            ``range(letters)``
        rotors: number of positions per arrangement (permutation length)
        notch_distance: optional; when given, ``notch_distance[r]`` is a list
            of offsets and rotor ``r`` gets the notches ``(notch + d) % letters``
            for ``d`` in ``[0] + notch_distance[r]``
    Returns:
        True when no consecutive pair failed the pre-check, False otherwise.
        Because only the quick pre-check is used, True does not prove that
        the graphs are isomorphic.
    """
    # Materialise the arrangements: permutations() may be a lazy iterator,
    # and len() is needed below for the progress percentage.
    all_notches = list(permutations(list(range(letters)), rotors))
    # update with more notches
    if notch_distance is not None:
        all_notches = [[[(notch + d) % letters
                         for d in [0] + notch_distance[r]]
                        for r, notch in enumerate(notches)]
                       for notches in all_notches]
    total = len(all_notches)
    previous = None
    trigger = True
    for n, notches in enumerate(all_notches):
        print("\rcheck_multiple_notch_isomorphy(5,3,2)rG%d: notching " % n,
              notches,
              end='')
        G = nx.DiGraph()
        G.add_edges_from(make_cycle_graph(notches, letters))
        if n > 0:
            # n > 0 implies total >= 2, so the divisor is never zero.
            print(" --- comparing graph %d and %d [%3d%%]" %
                  (n - 1, n, round(100 * n / (total - 1))),
                  end='')
            if not nx.faster_could_be_isomorphic(G, previous):
                print("non-isomorphic graphs found!")
                # Keep going so every failing pair is reported.
                trigger = False
        previous = G
    return trigger
def automorphism_groups2(g, graphs):
    """
    Partition node collections into groups whose induced subgraphs of `g`
    look alike.

    Parameters:
        g: the host graph (networkx)
        graphs: sequence of node collections; each one is turned into a
            subgraph of `g` with ``nx.subgraph``
    Returns:
        A list of groups. Each group is a list of entries from `graphs`;
        the first entry of a group is its representative. An empty input
        yields an empty list.
    Method:
        A candidate whose sorted degree sequence is [1, 2, 2, 2, 3] or
        [2, 2, 2, 3, 3] is compared with the exact ``nx.is_isomorphic``
        test; every other candidate is compared with the quick
        ``nx.faster_could_be_isomorphic`` pre-check only.
    """
    print("len(graphs)", len(graphs))
    if len(graphs) == 0:
        return []

    # Degree sequences for which the quick pre-check is replaced by the
    # exact isomorphism test.
    exact_sequences = ([1, 2, 2, 2, 3], [2, 2, 2, 3, 3])

    unique_graphs = [[graphs[0]]]
    for nodes in graphs[1:]:
        # The candidate subgraph and its degree sequence do not depend on
        # the group being inspected, so build them once per candidate.
        candidate = nx.subgraph(g, nodes)
        degrees = sorted(d for _, d in candidate.degree())
        if degrees in exact_sequences:
            alike = nx.is_isomorphic
        else:
            alike = nx.faster_could_be_isomorphic

        for group in unique_graphs:
            representative = nx.subgraph(g, group[0])
            if alike(candidate, representative):
                group.append(nodes)
                # Both tests are equivalence relations and representatives
                # are pairwise distinct under them, so no other group can
                # match; stop here.
                break
        else:
            unique_graphs.append([nodes])

    return unique_graphs
Exemple #6
0
def isomorphism(pattern, target, nx_structures):
    """
    Test two molecules for isomorphism with NetworkX's graph matcher.

    Missing graphs are built with ``create_nx_graph`` and stored in
    `nx_structures`. ``nx.faster_could_be_isomorphic`` is used first to
    reject pairs that cannot be isomorphic. The full match compares the
    node attribute 'label' (default 'C') and the edge attribute 'type'
    (default 'single').

    :param pattern: a molecule object which is to be tested for isomorphism
    :param target: a molecule object which pattern graph is to be compared against
    :param nx_structures: mapping from molecule to its NetworkX graph; updated in place
    :return: None if the graphs are not isomorphic
    :return: the matcher's mapping between the two graphs if they are isomorphic
    """
    for molecule in (pattern, target):
        if molecule not in nx_structures:
            nx_structures[molecule] = create_nx_graph(molecule, nx_structures)

    pattern_graph = nx_structures[pattern]
    target_graph = nx_structures[target]

    if not nx.faster_could_be_isomorphic(pattern_graph, target_graph):
        # Graphs are definitely not isomorphic
        return None

    # Ensures the isomorphism considers the vertex label and edge type
    label_match = iso.categorical_node_match('label', 'C')
    type_match = iso.categorical_edge_match('type', 'single')
    matcher = iso.GraphMatcher(pattern_graph,
                               target_graph,
                               node_match=label_match,
                               edge_match=type_match)
    if not matcher.is_isomorphic():
        return None
    return matcher.mapping
def count_motifs(network, motifs):
	"""
	Count how often each motif occurs among the triads of `network`.

	Parameters:
		network: graph whose triads (from ``generate_triads_2``) are examined
		motifs: dict mapping motif name -> motif graph
	Returns:
		dict mapping motif name -> number of occurrences. [string, int]
	Method:
		Every triad subgraph is screened against every motif with
		``nx.faster_could_be_isomorphic``. If exactly one motif survives the
		screen it is counted without further verification; otherwise the
		survivors are checked with ``nx.is_isomorphic`` and the first exact
		match is counted.
	"""
	m_count = dict.fromkeys(motifs, 0)
	sub_graphs = generate_triads_2(network)
	# Total number of (triad, motif) comparisons, used for the progress output.
	total = float(len(sub_graphs) * len(motifs))
	count = 0

	if LOG:
		print("\t\t Counting triads")
	for nodes in sub_graphs:
		sub_graph = network.subgraph(nodes)
		# Names of the motifs that pass the quick screen, in motif order.
		possibles = []
		for motif_name in motifs:
			count += 1
			if LOG and count % 5000 == 0:
				print("\t\t\t" + str(count / total))
			if nx.faster_could_be_isomorphic(sub_graph, motifs[motif_name]):
				possibles.append(motif_name)

		# Only one possibility? Add it
		if len(possibles) == 1:
			m_count[possibles[0]] += 1
		else:
			for possible_name in possibles:
				if nx.is_isomorphic(sub_graph, motifs[possible_name]):
					m_count[possible_name] += 1
					break

	return m_count
def not_isomorphic(graph_a, graph_b):
    """
    Run NetworkX's quick isomorphism pre-check on two wrapped graphs.

    NOTE(review): despite the function's name, the value returned is the
    result of ``nx.faster_could_be_isomorphic`` itself, not its negation --
    confirm against the callers which meaning they expect.

    :param graph_a: object whose ``g`` attribute is the first graph
    :param graph_b: object whose ``g`` attribute is the second graph
    :return: the result of ``nx.faster_could_be_isomorphic`` for the pair
    """
    first_graph = graph_a.g
    second_graph = graph_b.g
    return nx.faster_could_be_isomorphic(first_graph, second_graph)
Exemple #9
0
def check_all_pairs(L):
    """
    Cross-check ``are_trees_isomorphic`` against NetworkX on every unordered
    pair of trees in L.

    For each pair the NetworkX answer (quick pre-check followed by
    ``nx.is_isomorphic``) and the answer of ``are_trees_isomorphic`` are both
    timed and asserted to be equal. A progress report is printed every 1000
    pairs and a summary at the end.

    Returns True once all pairs have been processed; a disagreement raises
    AssertionError.
    """
    tree_total = len(L)
    pair_total = (tree_total * (tree_total - 1)) / 2

    nx_seconds = 0
    own_seconds = 0

    iso_hits = 0
    iso_misses = 0

    handled = 0
    for a in range(0, tree_total):
        left = L[a]
        for b in range(a + 1, tree_total):
            right = L[b]
            handled += 1

            # Reference answer from NetworkX.
            started = time.time()
            expected = (nx.is_isomorphic(left, right)
                        if nx.faster_could_be_isomorphic(left, right)
                        else False)
            nx_seconds += time.time() - started

            # Answer from the implementation under test.
            started = time.time()
            actual = are_trees_isomorphic(left, right)
            own_seconds += time.time() - started

            assert (actual == expected)

            if actual:
                iso_hits += 1
            else:
                iso_misses += 1

            if handled % 1000 != 0:
                continue
            print("Processed: %.3f" % (handled / pair_total * 100), "%")
            print("    Time spent so far:")
            print("        NetworkX: %.3f seconds" % nx_seconds)
            print("        My test: %.3f seconds" % own_seconds)
            print("    Amount of tests where trees were")
            print("        Isomorphic:", iso_hits)
            print("        Non-Isomorphic:", iso_misses)

    separator = "--------------------------"

    print("")
    print(separator)
    print("")

    print("All tests agreed")
    print("    Tests returning true:", iso_hits)
    print("    Tests returning false:", iso_misses)
    print("    Networkx takes:   ", nx_seconds, "s")
    print("    My function takes:", own_seconds, "s")

    print("")
    print(separator)
    print("")
    return True
Exemple #10
0
def add_to_hist_by_subgraph(subg, motifsHist, motifs_veriations):
    """
    Credit every node of `subg` with the first motif variation it matches.

    Nothing happens when `subg` has a different number of nodes than the
    first variation. Otherwise the variations are scanned in order (quick
    pre-check, then exact ``nx.is_isomorphic``); on the first match at
    index i, ``motifsHist[v][i]`` is incremented for every node v of `subg`
    and the scan stops.

    Returns None in all cases; `motifsHist` is updated in place.
    """
    if len(subg.nodes()) != len(motifs_veriations[0].nodes()):
        return

    for index, variation in enumerate(motifs_veriations):
        if not nx.faster_could_be_isomorphic(subg, variation):
            continue
        if not nx.is_isomorphic(subg, variation):
            continue
        for node in subg:
            motifsHist[node][index] += 1
        return
def isomorphic_check(prules, name):
	"""
	Merge production rules that share a left-hand side and whose right-hand
	sides pass the quick isomorphism pre-check, then write the reduced table.

	`prules` is loaded into a DataFrame whose four columns are named
	'rnbr', 'lhs', 'rhs' and 'pr'. The first rule seen for a given 'lhs' is
	the reference for that lhs; a later rule with the same lhs is compared
	against that reference (both 'rhs' values are converted with
	`rhs_tomultigraph`) using `nx.faster_could_be_isomorphic`, which is only
	a necessary condition for isomorphism. Matching rules have their 'pr'
	added to the reference rule and are then dropped.

	The result is written, tab separated and bz2 compressed, to
	"./ProdRules/<name>_prules.bz2". Nothing is returned.
	"""
	print '-' * 20
	print 'Isomorphic rules check (within file)'
	# for f in files:
	#	 df1 = pd.read_csv(f, index_col=0, compression='bz2', dtype=dtyps)
	df1 = pd.DataFrame(prules)
	df1.columns = ['rnbr', 'lhs', 'rhs', 'pr']
	print '... rules', df1.shape, 'reduced to',
	# lhs -> rule numbers registered for that lhs (index 0 is the reference rule)
	seen_rules = defaultdict(list)
	# reference rule number -> rule numbers that are to be merged into it
	ruleprob2sum = defaultdict(list)
	# cnrules / cntr are only written in this function, never read
	cnrules = []
	cntr = 0
	# iterrows() yields (index, row) pairs, hence the r[1][...] lookups
	for r in df1.iterrows():
		if DBG: print r[1]['rnbr'],
		if r[1]['lhs'] not in seen_rules.keys():
			seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
			cnrules.append(r[1]['rnbr'])
			if DBG: print "+"
			cntr += 1
		else:	# lhs already seen
			# print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
			# check the current rhs if the lhs matches to something already seen and check for an isomorphic match
			# rhs1 = listify_rhs(r[1]['rhs'])
			rhs1 = r[1]['rhs']
			# only the FIRST rule registered for this lhs is used as the comparison target
			rhs2 = df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values[0]
			G1 = rhs_tomultigraph(rhs1)
			G2 = rhs_tomultigraph(rhs2)
			# if nx.is_isomorphic(G1, G2, edge_match=label_match):
			if nx.faster_could_be_isomorphic(G1, G2):
				# print ' ',r[1]['rnbr'], r[1]['rhs'], '::', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values
				if DBG: print ' <-curr', seen_rules[r[1]['lhs']][0], ':', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]][
					'rnbr'].values
				ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
			else:
				seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
				cnrules.append(r[1]['rnbr'])
				if DBG: print "+"
				cntr += 1
	# Fold the probability of every merged rule into its reference rule,
	# then remove the merged rows from the table.
	for k in ruleprob2sum.keys():
		if DBG: print k
		if DBG: print "	", ruleprob2sum[k]
		if DBG: print "	", df1[df1['rnbr'] == k]['pr'].values+ sum(df1[df1['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
		# df1[df1['rnbr'] == k]['pr'] += sum(df1[df1['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
		c_val = df1[df1['rnbr'] == k]['pr'].values	+ sum(df1[df1['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
		# NOTE(review): DataFrame.set_value is not available in recent pandas releases -- confirm the pandas version in use
		df1.set_value(df1[df1['rnbr'] == k].index, 'pr', c_val)
		for r in ruleprob2sum[k]:
			df1 = df1[df1.rnbr != r]
	print df1.shape

	# cnrules contains the rules we need to reduce df1 by
	# and ruleprob2sum will give us the new key for which pr will change.
	df1.to_csv("./ProdRules/"+name+"_prules.bz2",sep="\t", header=False, index=False, compression="bz2")
Exemple #12
0
def isIsomorphicDuplicate(hcL, hc):
    """Report whether hcL already holds a graph isomorphic to hc.

    Each saved graph is screened with the quick
    ``nx.faster_could_be_isomorphic`` pre-check and only confirmed with
    ``nx.is_isomorphic`` when the screen passes.

    Returns True as soon as one isomorphic graph is found, False when every
    saved graph has been ruled out (including when hcL is empty)."""
    return any(
        nx.faster_could_be_isomorphic(known, hc) and nx.is_isomorphic(known, hc)
        for known in hcL
    )
Exemple #13
0
def is_isomorph_nx(graph1, graph2):
    """
    graph1, graph2: networkx graphs to be tested for isomorphism
    return: True if the graphs are isomorphic, otherwise False

    Pairs rejected by the quick ``nx.faster_could_be_isomorphic`` pre-check
    return False immediately. The full test compares the node attribute
    'label' (default 'C') and the edge attributes 'weight' and 'label'
    (defaults 1 and '-').
    """
    if not nx.faster_could_be_isomorphic(graph1, graph2):
        return False

    same_node = iso.categorical_node_match('label', 'C')
    same_edge = iso.categorical_edge_match(['weight', 'label'], [1, '-'])
    return iso.is_isomorphic(graph1,
                             graph2,
                             node_match=same_node,
                             edge_match=same_edge)
Exemple #14
0
def isIsomorphicDuplicate(hcL, hc):
    """Check whether any graph stored in hcL is isomorphic to hc.

    Returns True if hcL contains an isomorphism of hc.
    Returns False if none is found (also for an empty hcL)."""
    for known in hcL:
        # Quick screen first: skip graphs that cannot be isomorphic.
        if not nx.faster_could_be_isomorphic(known, hc):
            continue
        # The screen passed, so run the exact test.
        if nx.is_isomorphic(known, hc):
            return True
    # Every comparison has been made without a match.
    return False
Exemple #15
0
 def getIsomorphs(self, subset=None):
     """Collect the references of all proteins whose supernetwork is
     isomorphic to this one.

     The graph of this object is built from ``self.data``, read as
     (i, j, weight) triples. Candidates come from `subset` when given,
     otherwise from ``self.database.extractAllSuperNetworks``. Each
     candidate is either a dict (edges under 'data', reference under
     'pdbref') or a SuperNetwork / SuperNetworkNullModel instance (edges in
     ``.data``, reference in ``.pdbref``, with "_<chainref>" appended when
     ``.chainref`` is not None).

     Raises ValueError when there are no candidates, and TypeError for a
     candidate of any other type.
     Returns the list of references of the isomorphic candidates.
     """
     # Generate the NetworkX graph for the supernetwork
     reference = nx.Graph()
     for u, v, w in self.data:
         reference.add_edge(u, v, weight=w)

     # Pick the candidates: the explicit subset, or everything in the database
     if subset is not None:
         proteins = subset
         if len(proteins) == 0:
             raise ValueError("no protein supernetworks in subset ")
     else:
         proteins = self.database.extractAllSuperNetworks(
             pdbref=self.pdbref)
         if proteins.count() == 0:
             raise ValueError("no protein supernetworks in database!")

     isomorphs = []
     for protein in proteins:
         kind = type(protein)
         if kind is dict:
             edges = protein['data']
         elif kind is SuperNetwork or kind is SuperNetworkNullModel:
             edges = protein.data
         else:
             raise TypeError("either a dict or a supernetwork must be provided!")

         candidate = nx.Graph()
         for u, v, w in edges:
             candidate.add_edge(u, v, weight=w)

         if not (nx.faster_could_be_isomorphic(reference, candidate)
                 and nx.is_isomorphic(reference, candidate)):
             continue

         if kind is dict:
             isomorphs.append(protein['pdbref'])
         else:
             label = protein.pdbref
             if protein.chainref is not None:
                 label += "_{}".format(protein.chainref)
             isomorphs.append(label)
     return isomorphs
Exemple #16
0
    def __eq__(self, g2):
        """Compare two objects by their flags and by graph isomorphism.

        The cheap attributes are compared first, in this order: isFinal,
        maxLevel, node count, edge count, number of initials, number of
        finals. The first difference yields False. The quick
        ``nx.faster_could_be_isomorphic`` pre-check follows, and the final
        answer comes from ``nx.is_isomorphic`` with
        ``self._node_match_function`` as the node matcher.
        """
        if self.isFinal != g2.isFinal:
            return False
        if self.maxLevel != g2.maxLevel:
            return False

        mine, theirs = self.graph, g2.graph
        if mine.number_of_nodes() != theirs.number_of_nodes():
            return False
        if mine.number_of_edges() != theirs.number_of_edges():
            return False
        if utilGraph.numInitials(mine) != utilGraph.numInitials(theirs):
            return False
        if utilGraph.numFinals(mine) != utilGraph.numFinals(theirs):
            return False

        if not nx.faster_could_be_isomorphic(self.graph, g2.graph):
            return False

        return nx.is_isomorphic(self.graph,
                                g2.graph,
                                node_match=self._node_match_function)
Exemple #17
0
def faster_check(blueprint, physical):
    """
    Run the quick isomorphism pre-check on two graphs stored as edge files.

    Each file is read line by line; the whitespace-separated integers on a
    line form one edge tuple. The last line of each file is left out of the
    graph.

    Parameters:
        blueprint: path of the first edge file
        physical: path of the second edge file
    Returns:
        The result of ``nx.faster_could_be_isomorphic`` for the two graphs.
    """
    g_blue = nx.Graph()
    g_phy = nx.Graph()

    for path, graph in ((blueprint, g_blue), (physical, g_phy)):
        with open(path) as handle:
            rows = [tuple([int(token) for token in line.split()])
                    for line in handle]
            # every row except the last one becomes an edge
            graph.add_edges_from(rows[:-1])

    return nx.faster_could_be_isomorphic(g_blue, g_phy)
def jacc_dist_for_pair_dfrms(df1, df2):
	"""
	Score the overlap between two production-rule dataframes.

	The two frames are concatenated and scanned row by row. The first rule
	seen for a given 'lhs' becomes the reference for that lhs; a later rule
	with the same lhs is compared against the reference (both 'rhs' values
	are converted with `rhs_tomultigraph`) using
	`nx.faster_could_be_isomorphic`, which is only a necessary condition for
	isomorphism. Rules that pass are recorded under the reference rule number.

	Returns (number of reference rules with at least one match + number of
	matched rules) / (len(df1) + len(df2)) as a float.
	"""
	# slen / tlen are assigned but not used afterwards
	slen = len(df1)
	tlen = len(df2)
	# +++
	conc_df = pd.concat([df1, df2])
	# ---
	# lhs -> rule numbers registered for that lhs (index 0 is the reference rule)
	seen_rules = defaultdict(list)
	# reference rule number -> rule numbers that matched it
	ruleprob2sum = defaultdict(list)
	# cnrules / cntr are only written in this function, never read
	cnrules = []
	cntr = 0
	# iterrows() yields (index, row) pairs, hence the r[1][...] lookups
	for r in conc_df.iterrows():
		if DBG: print r[1]['rnbr'],
		if r[1]['lhs'] not in seen_rules.keys():
			seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
			cnrules.append(r[1]['rnbr'])
			if DBG: print "+"
			cntr += 1
		else:	# lhs already seen
			# print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
			# check the current rhs if the lhs matches to something already seen and check for an isomorphic match
			# rhs1 = listify_rhs(r[1]['rhs'])
			rhs1 = r[1]['rhs']
			# only the FIRST rule registered for this lhs is used as the comparison target
			rhs2 = conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values[0]
			G1 = rhs_tomultigraph(rhs1)
			G2 = rhs_tomultigraph(rhs2)
			if nx.faster_could_be_isomorphic(G1, G2):
				# print ' ',r[1]['rnbr'], r[1]['rhs'], '::', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values
				if DBG: print ' <-curr', seen_rules[r[1]['lhs']][0], ':', conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rnbr'].values, conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['cate'].values
				ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
			else:
				seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
				cnrules.append(r[1]['rnbr'])
				if DBG: print "+"
				cntr += 1
	
	if DBG: print "len(ruleprob2sum)", len(ruleprob2sum)
	if DBG: print	dumps(ruleprob2sum, indent=4, sort_keys=True)

	# note: this summary line is printed only when DBG is off
	if not DBG: print "Overlapping rules	", len(ruleprob2sum.keys()), sum([len(x) for x in ruleprob2sum.values()])
	if DBG: print "Jaccard Sim:\t", (len(ruleprob2sum.keys())+sum([len(x) for x in ruleprob2sum.values()]))/ float(len(df1) + len(df2))
	return (len(ruleprob2sum.keys())+sum([len(x) for x in ruleprob2sum.values()]))/ float(len(df1) + len(df2))
Exemple #19
0
def get_mapper(gml):
    """
    Build a label -> integer mapping for the nodes of a GraphML file and
    make sure an integer edge list exists next to that file.

    Parameters:
        gml: path of the GraphML file to read
    Returns:
        dict mapping each original node label to its position in the node
        iteration order of the graph.
    Side effects:
        Logs the result of the quick isomorphism pre-check between the
        original and the integer-relabelled graph. If "cat_hier_int.txt"
        does not exist in the directory of `gml`, the relabelled graph is
        written there as an edge list and the attribute dictionaries
        ("{'weight': 1}" and "{}") are stripped from every line.
    """
    # os.path.join keeps the path relative when `gml` has no directory part
    # (plain string formatting would yield "/cat_hier_int.txt").
    int_gpath = os.path.join(os.path.dirname(gml), "cat_hier_int.txt")

    read_g = nx.read_graphml(gml)

    str_nodes = list(read_g.nodes())
    int_g = nx.convert_node_labels_to_integers(read_g)

    assert len(str_nodes) == len(int_g), "str to int conversion incorrect"

    flag = nx.faster_could_be_isomorphic(read_g, int_g)
    logging.info("Isomorphic check: {}".format(flag))

    # presumably matches the default ordering used by
    # convert_node_labels_to_integers -- verify if a non-default ordering
    # is ever passed above
    mapper = {node: i for i, node in enumerate(str_nodes)}

    if not os.path.isfile(int_gpath):
        nx.write_edgelist(int_g, int_gpath)

        with open(int_gpath, "r") as fmain:
            reader = fmain.readlines()

        # Both replacements must be applied to the same string; restarting
        # from the raw line for the second one would undo the first.
        cleaned = [
            raw.strip().replace("{'weight': 1}", "").replace("{}", "")
            for raw in reader
        ]

        with open(int_gpath, "w") as fmain:
            fmain.write("".join("{}\n".format(line) for line in cleaned))

    return mapper
    def _nx_isomorphism(self, pattern, target):
        """
        Test two molecules for isomorphism with NetworkX's graph matcher.

        Graphs missing from ``self.structure_nx`` are created through
        ``self._create_nx_graph``. ``nx.faster_could_be_isomorphic`` is used
        first to reject pairs that cannot be isomorphic. The full match
        compares the node attribute 'label' (default 'C') and the edge
        attribute 'type' (default 'single').

        :param pattern: a molecule object which is to be tested for isomorphism
        :param target: a molecule object which pattern graph is to be compared against
        :return: None if the graphs are not isomorphic
        :return: the matcher's mapping between the two graphs if they are isomorphic
        """
        for molecule in (pattern, target):
            if molecule not in self.structure_nx:
                self._create_nx_graph(molecule)

        pattern_graph = self.structure_nx[pattern]
        target_graph = self.structure_nx[target]
        if not nx.faster_could_be_isomorphic(pattern_graph, target_graph):
            # Graphs are definitely not isomorphic
            return None

        # Ensures the isomorphism considers the vertex label and edge type
        label_match = iso.categorical_node_match('label', 'C')
        type_match = iso.categorical_edge_match('type', 'single')
        matcher = iso.GraphMatcher(self.structure_nx[pattern], self.structure_nx[target],
                                   node_match=label_match,
                                   edge_match=type_match)
        if not matcher.is_isomorphic():
            return None
        return matcher.mapping
    def getIsomorphicGroups(self, gs):
        """
        Group the graphs in `gs` by the quick isomorphism pre-check.

        Graph i becomes the head of a group when no earlier graph has
        claimed it. Every later graph j for which
        ``nx.faster_could_be_isomorphic(gs[i], gs[j])`` holds is added to
        that group. The pre-check is only a necessary condition, so members
        of a group are candidates for isomorphism, not proven isomorphs.

        :param gs: sequence of NetworkX graphs
        :return: defaultdict mapping the head index to the list of member
            indices (the head itself comes first); graphs without any match
            do not appear. The same mapping is passed to
            ``self.updateNonIsomorphs`` before it is returned.
        """
        taken = {}
        groups = defaultdict(list)
        total = len(gs)
        for i in range(total):
            # self.printdebug(gs, i)
            if i in taken:
                # already a member of an earlier group
                continue
            for j in range(i + 1, total):
                # self.printdebug(gs, j)
                if nx.faster_could_be_isomorphic(gs[i], gs[j]):
                    if i not in groups:
                        groups[i].append(i)
                    groups[i].append(j)
                    taken[j] = i

        self.updateNonIsomorphs(groups, gs)

        return groups
    def getIsomorphicGroups(self, gs):
        """
        Group the graphs in `gs` by the quick isomorphism pre-check.

        Graph i becomes the head of a group when no earlier graph has
        claimed it. Every later graph j for which
        ``nx.faster_could_be_isomorphic(gs[i], gs[j])`` holds is added to
        that group. The pre-check is only a necessary condition, so members
        of a group are candidates for isomorphism, not proven isomorphs.

        :param gs: sequence of NetworkX graphs
        :return: defaultdict mapping the head index to the list of member
            indices (the head itself comes first); graphs without any match
            do not appear. The same mapping is passed to
            ``self.updateNonIsomorphs`` before it is returned.
        """
        taken = {}
        groups = defaultdict(list)
        total = len(gs)
        for i in range(total):
            # self.printdebug(gs, i)
            if i in taken:
                # already a member of an earlier group
                continue
            for j in range(i + 1, total):
                # self.printdebug(gs, j)
                if nx.faster_could_be_isomorphic(gs[i], gs[j]):
                    if i not in groups:
                        groups[i].append(i)
                    groups[i].append(j)
                    taken[j] = i

        self.updateNonIsomorphs(groups, gs)

        return groups
Exemple #23
0
 def is_isomorphic(g1, g2):
     """Return whether g1 and g2 are isomorphic.

     The quick ``nx.faster_could_be_isomorphic`` pre-check rejects pairs
     early; only pairs that pass it go through ``nx.is_isomorphic``.
     """
     # TODO node_match : callable
     if not nx.faster_could_be_isomorphic(g1, g2):
         return False
     return nx.is_isomorphic(g1, g2)
Exemple #24
0
 def test_faster_could_be_isomorphic(self):
     """The quick pre-check must accept the fixture pair G3 / G2."""
     outcome = nx.faster_could_be_isomorphic(self.G3, self.G2)
     assert_true(outcome)
def jaccard_coeff_isomorphic_rules_check(dfrm, headers_d):
	if dfrm.empty: return

	dfrm.columns = ['rnbr', 'lhs', 'rhs', 'pr', 'cate']
	gb = dfrm.groupby(['cate']).groups
	if DBG: print gb.keys()
	sqr_mtrx = np.zeros(shape=(len(headers_d),len(headers_d)))

	for p in combinations(sorted(gb.keys()), 2):
		if DBG: print [x.split("_")[1] for x in p],
		if DBG: print [headers_d[x.split("_")[1]] for x in p] #[0].split("_")[-1]
		j = headers_d[p[0].split("_dimacs")[0].split("_")[-1]]
		i = headers_d[p[1].split("_dimacs")[0].split("_")[-1]]
		sqr_mtrx[i,j] = jaccard_coeff_isomorphic_rules_check_forfilepair(p, dfrm)
		# break
		# sqr_mtrx[[headers_d[x.split("_")[1]] for x in p]] = jaccard_coeff_isomorphic_rules_check_forfilepair(p, dfrm)
	Log_Info()
	print sqr_mtrx
	return sqr_mtrx # numpy.savetxt("foo.csv", a, delimiter=",")

	exit()

	seen_rules = defaultdict(list)
	ruleprob2sum = defaultdict(list)
	cnrules = []
	cntr = 0

	for r in dfrm.iterrows():
		if DBG: print r[1]['rnbr'],
		if r[1]['lhs'] not in seen_rules.keys():
			seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
			cnrules.append(r[1]['rnbr'])
			if DBG: print "+"
			cntr += 1
		else:	# lhs already seen
			# print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
			# check the current rhs if the lhs matches to something already seen and check for an isomorphic match
			# rhs1 = listify_rhs(r[1]['rhs'])
			rhs1 = r[1]['rhs']
			rhs2 = dfrm[dfrm['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values[0]
			G1 = rhs_tomultigraph(rhs1)
			G2 = rhs_tomultigraph(rhs2)
			if nx.faster_could_be_isomorphic(G1, G2):
				# print ' ',r[1]['rnbr'], r[1]['rhs'], '::', df1[df1['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs'].values
				if DBG: print ' <-curr', seen_rules[r[1]['lhs']][0], ':', dfrm[dfrm['rnbr'] == seen_rules[r[1]['lhs']][0]]['rnbr'].values, dfrm[dfrm['rnbr'] == seen_rules[r[1]['lhs']][0]]['cate'].values
				ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
			else:
				seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
				cnrules.append(r[1]['rnbr'])
				if DBG: print "+"
				cntr += 1

#	for k in ruleprob2sum.keys():
#		if DBG: print k
#		if DBG: print "	", ruleprob2sum[k]
#		if DBG: print "	", dfrm[dfrm['rnbr'] == k]['pr'].values+ sum(dfrm[dfrm['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
#		# dfrm[dfrm['rnbr'] == k]['pr'] += sum(dfrm[dfrm['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
#		c_val = dfrm[dfrm['rnbr'] == k]['pr'].values	+ sum(dfrm[dfrm['rnbr'] == r]['pr'].values for r in ruleprob2sum[k])
#		dfrm.set_value(dfrm[dfrm['rnbr'] == k].index, 'pr', c_val)
#		for r in ruleprob2sum[k]:
#			dfrm = dfrm[dfrm.rnbr != r]
#	print dfrm.shape

	# cnrules contains the rules we need to reduce df1 by
	# and ruleprob2sum will give us the new key for which pr will change.
	#	df1.to_csv("./ProdRules/"+name+"_prules.bz2",sep="\t", header="False", index=False, compression="bz2")
	return True
Exemple #26
0
def jacc_dist_for_pair_dfrms(df1, df2):
    """
	df1 and df2 are each a dataframe (sets) to use for comparison
	returns: jaccard similarity score
	"""
    slen = len(df1)
    tlen = len(df2)
    # +++
    conc_df = pd.concat([df1, df2])
    #	print ">>>", conc_df.shape
    # ---
    seen_rules = defaultdict(list)
    ruleprob2sum = defaultdict(list)
    cnrules = []
    cntr = 0
    #	DBG = True
    for r in conc_df.iterrows():  # /* for each rule in the stack */
        if r[1]['lhs'] not in seen_rules.keys():
            #			print r[1]['rnbr'],
            seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
            cnrules.append(r[1]['rnbr'])
            if DBG: print "+"
            cntr += 1
        else:  # lhs already seen
            #			print r[1]['rnbr'],
            # print df1[df1['rnbr']==seen_rules[r[1]['lhs']][0]]['rhs'].values
            # check the current rhs if the lhs matches to something already seen and check for an isomorphic match
            # rhs1 = listify_rhs(r[1]['rhs'])
            rhs1 = r[1]['rhs']
            rhs2 = conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']]
                           [0]]['rhs'].values[0]
            #			rhs2 = conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rhs']
            G1 = rhs_tomultigraph(rhs1)
            G2 = rhs_tomultigraph(rhs2)
            #			for rl in rhs2.values:
            #				G2 = rhs_tomultigraph(rl)
            #
            # if nx.is_isomorphic(G1, G2, edge_match=label_match):
            if nx.faster_could_be_isomorphic(G1, G2):
                if DBG:                    print ' <-curr', seen_rules[r[1]['lhs']][0], ':', \
            conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['rnbr'].values, \
            conc_df[conc_df['rnbr'] == seen_rules[r[1]['lhs']][0]]['cate'].values
                ruleprob2sum[seen_rules[r[1]['lhs']][0]].append(r[1]['rnbr'])
                seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
            else:
                seen_rules[r[1]['lhs']].append(r[1]['rnbr'])
                cnrules.append(r[1]['rnbr'])
                if DBG: print "+"
                cntr += 1

    if DBG: print "len(ruleprob2sum)", len(ruleprob2sum)
    from json import dumps
    if DBG: print dumps(ruleprob2sum, indent=4, sort_keys=True)
    # print ruleprob2sum
    if DBG:
        print "  Overlapping rules	", len(ruleprob2sum.keys()), sum(
            [len(x) for x in ruleprob2sum.values()])
    if DBG:
        print "  Jaccard Sim:\t", (len(ruleprob2sum.keys()) + sum(
            [len(x)
             for x in ruleprob2sum.values()])) / float(len(df1) + len(df2))

    print df1.groupby(['cate'
                       ]).groups.keys()[0].split('_prules')[0], df2.groupby(
                           ['cate']).groups.keys()[0].rstrip('_prules'),

    return (len(ruleprob2sum.keys()) + sum(
        [len(x) for x in ruleprob2sum.values()])) / float(len(df1) + len(df2))
Exemple #27
0
 def is_isomorphic(g1, g2):
     """Return whether g1 and g2 are isomorphic.

     Pairs that fail the quick ``nx.faster_could_be_isomorphic`` pre-check
     are reported as not isomorphic without running ``nx.is_isomorphic``.
     """
     # TODO node_match : callable
     passed_screen = nx.faster_could_be_isomorphic(g1, g2)
     return nx.is_isomorphic(g1, g2) if passed_screen else False