def parse_nobal_ilp_sol(filename, genome_a, genome_b):
    """Parse the solution of the ILP that was built without balancing edges.

    The breakpoint graph is rebuilt from the matching read from the solution
    file plus the adjacencies of both genomes.  Components with two
    degree-one vertices are classified by the genome of those two ends
    (same genome: AA-/BB-, different genomes: AB-) and used to build an
    auxiliary graph whose connected components are counted per genome.

    :param filename: path of the solution file.  The log file path is derived
        from it by replacing its last "sol" with "log".
    :param genome_a: genome "A"; passed to ``add_capping_genes`` (presumably
        modified in place - confirm against that helper).
    :param genome_b: genome "B"; same remark as for ``genome_a``.
    :return: dict with the keys "dcj_distance", "dup_a", "dup_b", "time"
        and "gap".
    """
    # build the master graph with matching and adjacency edges:
    # 1st add capping genes:
    add_capping_genes(genome_a, genome_b)

    master_graph = nx.Graph()
    # matching edges:
    obj, edges = obj_and_matching_from_sol(filename)
    for gene, c_a, c_b in edges:
        for ext in (Ext.HEAD, Ext.TAIL):
            master_graph.add_edge(("A", gene, c_a, ext), ("B", gene, c_b, ext))

    # adjacency edges:
    for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]:
        for (g_i, copy_a, e_i), (g_j, copy_b, e_j) in genome.adjacency_iter_with_copies():
            master_graph.add_edge((genome_name, g_i, copy_a, e_i), (genome_name, g_j, copy_b, e_j))

    # now, graph should be a collection of cycles and paths, where path
    # extremities are balancing extremities.  We need to check those
    # extremities to classify paths in AB-, AA- and BB-.
    G = nx.Graph()
    ab_components = []
    for comp in connected_components(master_graph):
        # sorted, so that in an AB- path the "A" end always comes first.
        degree_one = sorted([v for v in comp if master_graph.degree(v) == 1])
        if len(degree_one) > 1:
            assert len(degree_one) == 2
            if degree_one[0][0] == degree_one[1][0]:
                # AA- or BB-: add the balancing edge ...
                G.add_edge(*degree_one)
                # ... and the gene edges, from head to tail:
                for genome_name, gene, copy_idx, _ext in degree_one:
                    G.add_edge((genome_name, gene, copy_idx, Ext.HEAD), (genome_name, gene, copy_idx, Ext.TAIL))
            else:
                # AB- component, save for later;
                ab_components.append(degree_one)

    # deal with AB pairing 2-by-2 arbitrarily; zipping the same iterator with
    # itself yields consecutive pairs and drops an unpaired last component.
    for ab_i, ab_j in zip(*[iter(ab_components)] * 2):
        G.add_edge(ab_i[0], ab_j[0])
        G.add_edge(ab_i[1], ab_j[1])
        # add also the gene edges, from head to tail:
        for genome_name, gene, copy_idx, _ext in [ab_i[0], ab_i[1], ab_j[0], ab_j[1]]:
            G.add_edge((genome_name, gene, copy_idx, Ext.HEAD), (genome_name, gene, copy_idx, Ext.TAIL))

    # Find indels as connected components in A and B:
    indels = {"A": 0, "B": 0}
    for comp in connected_components(G):
        # any vertex of the component tells which genome it belongs to.
        genome_name = next(iter(comp))[0]
        indels[genome_name] += 1

    # log file: swap only the LAST "sol" of the path, so that a "sol"
    # appearing earlier (e.g. in a directory name) is left untouched.
    log_filename = "log".join(filename.rsplit("sol", 1))
    gap, time = parse_log_file(log_filename)

    # correct objective function, since each AB-component is counted as a
    # full cycle in the ILP, but should only be half-cycle.
    # NOTE(review): "/" floors on Python 2 and is true division on Python 3 -
    # confirm which one is intended for an odd number of AB-components.
    obj += len(ab_components) / 2

    return {"dcj_distance": obj, "dup_a": indels["A"], "dup_b": indels["B"], "time": time, "gap": gap}
def parse_ilp_sol(filename):
    """Parse an ILP solution file (model with balancing edges).

    Every line is tested against ``obj_regexp`` (objective value) and
    ``balancing_regexp`` (balancing edge variable).  Each balancing edge whose
    value is at least 0.9 is added to a graph together with the head-tail
    edge of both genes it touches; the connected components of that graph
    are then counted per genome.

    :param filename: path of the solution file.  The log file path is derived
        from it by replacing its last "sol" with "log".
    :return: dict with the keys "dcj_distance", "dup_a", "dup_b", "time"
        and "gap".
    """
    G = nx.Graph()
    obj = 0
    with open(filename) as f:
        for raw_line in f:
            line = raw_line.strip()

            m = obj_regexp.match(line)
            if m is not None:
                obj = int(round(float(m.group(1)), 0))

            m = balancing_regexp.match(line)
            if m is None:
                continue
            genome_1, gene_1, copy_1, ext_1, genome_2, gene_2, copy_2, ext_2, val = m.groups()
            # sometimes variables with values like 1e-12, 0.99999 or
            # 1.000000001 appear, hence the threshold instead of "== 1".
            if float(val) >= 0.9:
                # connect head and tail:
                G.add_edge((genome_1, gene_1, copy_1, Ext.HEAD), (genome_1, gene_1, copy_1, Ext.TAIL))
                G.add_edge((genome_2, gene_2, copy_2, Ext.HEAD), (genome_2, gene_2, copy_2, Ext.TAIL))
                # balancing edge:
                G.add_edge((genome_1, gene_1, copy_1, ext_1), (genome_2, gene_2, copy_2, ext_2))

    # Find indels as connected components in A and B:
    indels = {"A": 0, "B": 0}
    for comp in connected_components(G):
        # any vertex of the component tells which genome it belongs to.
        genome_name = next(iter(comp))[0]
        indels[genome_name] += 1

    # log file: swap only the LAST "sol" of the path, so that a "sol"
    # appearing earlier (e.g. in a directory name) is left untouched.
    log_filename = "log".join(filename.rsplit("sol", 1))
    gap, time = parse_log_file(log_filename)

    return {"dcj_distance": obj, "dup_a": indels["A"], "dup_b": indels["B"], "time": time, "gap": gap}
def __init__(self, width, height):
    """Create an empty board of ``width`` x ``height`` cells.

    Builds the grid graph, clears every per-cell/per-edge attribute, and
    initialises the (empty) sets of walls, boxes, players and enemies, the
    bomb bookkeeping and the boost state.
    """
    self.width, self.height = width, height

    # Grid graph: no cell is a barrier yet and every edge starts at min_int.
    self.g = grid_2d_graph(width, height)
    for _cell, cell_attrs in self.g.nodes(data=True):
        cell_attrs['barrier'] = False
    for _u, _v, link_attrs in self.g.edges(data=True):
        link_attrs['weight'] = min_int

    # Board content.
    self.walls, self.boxes = set(), set()
    self.bombs = dict()  # bomb -> rounds
    self.players, self.enemies = set(), set()

    # Boost state; the radius bound is the distance between opposite corners.
    self.boost_remain = self.boost_renew = 0
    far_corner = (width - 1, height - 1)
    max_radius = distance((0, 0), far_corner)
    self.in_boost_sequence = [False] * max_radius
    self.radii = range(max_radius)

    # Every cell starts with "no bomb in sight" (max_int rounds).
    self.points = [(x, y) for x in range(width) for y in range(height)]
    self.point_bomb_rounds = dict.fromkeys(self.points, max_int)

    # Graph of bomb ranges and its (initially empty) connected components.
    self.bomb_ranges_graph = Graph()
    self.bomb_ranges_sub_graphs = connected_components(self.bomb_ranges_graph)
def calculate_features(queue,g_file, pairs):
    """Compute the feature dict of every pair and push it onto ``queue``.

    :param queue: object with a ``put`` method; receives one dict per pair
        and finally the string 'done'.
    :param g_file: path of the GraphML file to load.
    :param pairs: sized iterable of 2-element pairs of node ids.  The ids are
        concatenated with ``+`` and compared with ``<`` (presumably strings -
        confirm against the caller).
    """
    print("Started!")
    # Load the graph this worker was asked to process (the parameter, not a
    # module-level global that may not match it).
    G = nx.read_graphml(g_file)
    # NOTE(review): next() takes the FIRST component yielded; this is the
    # largest one only if connected_components yields them largest-first -
    # confirm for the installed networkx version.
    gen = nxa.connected_components(G)
    mainLst = next(gen)
    G = G.subgraph(mainLst)
    f = Featurator(G)
    print("File: "+g_file)
    print("Pairs: "+str(len(pairs)))

    count = 0
    for pair in pairs:
        count += 1
        h1, h2 = pair[0], pair[1]
        res = f.get_feature_dict(h1,h2)
        # Canonical order: the smaller id always comes first in the output.
        low, high = (h1, h2) if h1 < h2 else (h2, h1)
        res['pair'] = low+high
        res['h1'] = low
        res['h2'] = high
        # PUT PAIR IN QUEUE!
        queue.put(res)

    print("Calculated everyone! - "+str(count))
    queue.put('done')
def calculate_features(queue, g_file, pairs):
    """Compute the feature dict of every pair and push it onto ``queue``.

    :param queue: object with a ``put`` method; receives one dict per pair
        and finally the string 'done'.
    :param g_file: path of the GraphML file to load.
    :param pairs: sized iterable of 2-element pairs of node ids.  The ids are
        concatenated with ``+`` and compared with ``<`` (presumably strings -
        confirm against the caller).
    """
    print("Started!")
    # Load the graph this worker was asked to process (the parameter, not a
    # module-level global that may not match it).
    G = nx.read_graphml(g_file)
    # NOTE(review): next() takes the FIRST component yielded; this is the
    # largest one only if connected_components yields them largest-first -
    # confirm for the installed networkx version.
    gen = nxa.connected_components(G)
    mainLst = next(gen)
    G = G.subgraph(mainLst)
    f = Featurator(G)
    print("File: " + g_file)
    print("Pairs: " + str(len(pairs)))

    count = 0
    for pair in pairs:
        count += 1
        h1, h2 = pair[0], pair[1]
        res = f.get_feature_dict(h1, h2)
        # Canonical order: the smaller id always comes first in the output.
        low, high = (h1, h2) if h1 < h2 else (h2, h1)
        res['pair'] = low + high
        res['h1'] = low
        res['h2'] = high
        # PUT PAIR IN QUEUE!
        queue.put(res)

    print("Calculated everyone! - " + str(count))
    queue.put('done')
def update_bomb_rounds(self):
    """Recompute, for every point in a bomb-range component, its rounds value.

    All points of one connected component of ``bomb_ranges_graph`` receive
    the smallest ``bombs`` value found among the component's points, or
    ``max_int`` when none of them is smaller.
    """
    components = connected_components(self.bomb_ranges_graph)
    self.bomb_ranges_sub_graphs = components
    for component in components:
        # max_int goes first so that it wins unless a bomb is strictly sooner.
        candidates = [max_int]
        candidates.extend(self.bombs[point] for point in component if point in self.bombs)
        soonest = min(candidates)
        self.point_bomb_rounds.update(dict.fromkeys(component, soonest))
import csv
import itertools
import sys

# Command line: graph_file output_file [#processes]
if len(sys.argv) < 3:
    print("Usage: ./generate_csv.py graph_file output_file [#processes]")
    # A usage error is a failure: report it through a non-zero exit status.
    sys.exit(1)

graph_file = sys.argv[1]
output_file = sys.argv[2]
processes = 1 if len(sys.argv) < 4 else int(sys.argv[3])

print("Loading graph file...")
G = nx.read_graphml(graph_file)

print("Obtaining largest connected component...")
# NOTE(review): next() takes the FIRST component yielded; this is the largest
# one only if connected_components yields them largest-first - confirm for the
# installed networkx version (and keep it in sync with the worker function).
gen = nxa.connected_components(G)
mainLst = next(gen)
G = G.subgraph(mainLst)

f = Featurator(G)
csv_fields = ['pair', 'h1', 'h2'] + f.feature_list()
# Left open on purpose: the writer is still referenced after this point.
csvfile = open(output_file, 'w')
writer = csv.DictWriter(csvfile, fieldnames=csv_fields)
writer.writeheader()

count = 0
# Every unordered pair of distinct nodes, in node-iteration order (same pairs
# and same order as a nested "j > i" loop, without the discarded half).
pairs = list(itertools.combinations(G.nodes(), 2))
import csv
import itertools
import sys

# Command line: graph_file output_file [#processes]
if len(sys.argv) < 3:
    print("Usage: ./generate_csv.py graph_file output_file [#processes]")
    # A usage error is a failure: report it through a non-zero exit status.
    sys.exit(1)

graph_file = sys.argv[1]
output_file = sys.argv[2]
processes = 1 if len(sys.argv) < 4 else int(sys.argv[3])

print("Loading graph file...")
G = nx.read_graphml(graph_file)

print("Obtaining largest connected component...")
# NOTE(review): next() takes the FIRST component yielded; this is the largest
# one only if connected_components yields them largest-first - confirm for the
# installed networkx version (and keep it in sync with the worker function).
gen = nxa.connected_components(G)
mainLst = next(gen)
G = G.subgraph(mainLst)

f = Featurator(G)
csv_fields = ['pair','h1','h2'] + f.feature_list()
# Left open on purpose: the writer is still referenced after this point.
csvfile = open(output_file,'w')
writer = csv.DictWriter(csvfile,fieldnames=csv_fields)
writer.writeheader()

count = 0
# Every unordered pair of distinct nodes, in node-iteration order (same pairs
# and same order as a nested "j > i" loop, without the discarded half).
pairs = list(itertools.combinations(G.nodes(), 2))
def dcj_dupindel_ilp(genome_a, genome_b, output):
    """Write to ``output`` an ILP model built from the two genomes.

    Steps, in order:

    1. Work on deep copies of both genomes; linear chromosomes get a gene
       ``0`` appended and are marked circular, and ``[0]`` circular
       chromosomes are added until both genomes have the same number.
    2. Build a graph with the matching edges of single-candidate genes and
       all adjacency edges, then repeatedly scan its connected components to
       fix variables: cycles fix ``y``/``z`` labels, AA-/BB- paths between
       balancing vertices fix balancing edges, and A-B paths ending in the
       same gene fix a matching edge.
    3. Emit constraints, bounds and variable declarations and write them
       under the headers "Minimize", "Subject to", "Bounds", "Binary" and
       "General".

    This is Python 2 code (``iteritems``, ``xrange``, ``print`` statement).

    :param genome_a: genome "A"; not modified (a deep copy is used).
    :param genome_b: genome "B"; not modified (a deep copy is used).
    :param output: path of the file to write.
    :return: None.  The number of matching and balancing edges is printed.
    """
    # copy genomes to possibly make some changes:
    genome_a = copy.deepcopy(genome_a)
    genome_b = copy.deepcopy(genome_b)
    max_chromosomes = max(genome_a.n_chromosomes(), genome_b.n_chromosomes())

    # add capping genes:
    for genome in [genome_a, genome_b]:
        for c in genome.chromosomes:
            if not c.circular:
                c.gene_order.append(0)
                c.circular = True
        # pad with [0] chromosomes so both genomes have max_chromosomes.
        for i in range(genome.n_chromosomes(), max_chromosomes):
            genome.add_chromosome(DupChromosome([0], circular=True))

    # count of each gene on each genome
    # NOTE(review): gene_count[...][g] is indexed below with genes taken from
    # the union of both genomes; presumably gene_count() returns a mapping
    # that yields 0 for missing genes - confirm.
    gene_count = {"A": genome_a.gene_count(), "B": genome_b.gene_count()}

    # for all genes, the total "balanced" count:
    total_gene_count = {g: max(gene_count["A"][g], gene_count["B"][g]) for g in
                        set(gene_count["A"].keys()).union(set(gene_count["B"].keys()))}

    # define the y labels -> integer 1..n
    y_label = define_y_label(total_gene_count)

    # list of possible edges for each vertex:
    # (gene, copy in A) -> set of candidate copies in B.
    edges = {}
    for gene, copies in total_gene_count.iteritems():
        for i in xrange(1, copies + 1):
            edges[(gene, i)] = set(range(1, copies + 1))

    # try to fix variables:
    # Build the BP graph of fixed elements to try to find more variables to fix:
    master_graph = nx.Graph()
    # fixed vars:
    y_fix = {}
    z_fix = {}
    balancing_fix = {"A": {}, "B": {}}

    # add matching edges of genes with single copy:
    for (gene, copy_a), set_y in edges.iteritems():
        if len(set_y) == 1:
            copy_b = list(set_y)[0]
            for ext in [Ext.HEAD, Ext.TAIL]:
                master_graph.add_edge(("A", gene, copy_a, ext), ("B", gene, copy_b, ext))

    # add adjacency edges:
    for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]:
        for (g_i, copy_a, e_i), (g_j, copy_b, e_j) in adjacency_list(genome, total_gene_count):
            master_graph.add_edge((genome_name, g_i, copy_a, e_i), (genome_name, g_j, copy_b, e_j))

    # Search components to fix:
    rescan = True
    edges_to_add = []
    vertices_to_remove = []
    while rescan:
        rescan = False
        # apply the graph changes collected during the previous scan; they are
        # deferred so the graph is not modified while its components are read.
        master_graph.add_edges_from(edges_to_add)
        master_graph.remove_nodes_from(vertices_to_remove)
        edges_to_add = []
        vertices_to_remove = []
        # check each connected component:
        for comp in connected_components(master_graph):
            # get degree-1 vertices:
            degree_one = [v for v in comp if master_graph.degree(v) == 1]

            # if two degree one vertices, it is a path;
            if len(degree_one) == 2:
                genome_i, g_i, copy_a, e_i = degree_one[0]
                genome_j, g_j, copy_b, e_j = degree_one[1]
                # 1 - check if nodes are balancing, to find AA-, BB- and AB- paths that can be fixed.
                # a copy index above the genome's own count marks a balancing copy.
                i_is_balancing = g_i != 0 and copy_a > gene_count[genome_i][g_i]
                j_is_balancing = g_j != 0 and copy_b > gene_count[genome_j][g_j]
                if i_is_balancing and j_is_balancing:
                    if genome_i == genome_j:
                        # AA- or BB-path, close it (stored in both directions)
                        balancing_fix[genome_i][degree_one[0][1:]] = degree_one[1][1:]
                        balancing_fix[genome_i][degree_one[1][1:]] = degree_one[0][1:]
                        # NOTE(review): this value is not read again in this iteration.
                        degree_one = []
                    else:
                        # TODO: deal with AB-components;
                        pass
                # if the path has homologous genes at the ends, I can join:
                elif genome_i != genome_j and g_i == g_j:
                    # invert to put genome A always in variables _i :
                    if genome_j == "A":
                        genome_i, g_i, copy_a, e_i, genome_j, g_j, copy_b, e_j = genome_j, g_j, copy_b, e_j, genome_i, g_i, copy_a, e_i
                    # check conflict, only add edge if ok:
                    if copy_b in edges[(g_i, copy_a)]:
                        edges[(g_i, copy_a)] = {copy_b}
                        # save edges to add to graph:
                        for ext in [Ext.HEAD, Ext.TAIL]:
                            edges_to_add.append((("A", g_i, copy_a, ext), ("B", g_i, copy_b, ext)))
                        # new edges, re-scan:
                        rescan = True
                        # remove possible edges from other copies:
                        for idx in xrange(1, total_gene_count[g_i] + 1):
                            if idx == copy_a:
                                continue
                            try:
                                # set.remove raises KeyError if copy_b is not there
                                # already; that is ok.  (A missing edges key would
                                # be silenced by the same handler.)
                                edges[(g_i, idx)].remove(copy_b)
                                # Add new edges to graph, if the removal created degree 1 vertices:
                                if len(edges[(g_i, idx)]) == 1:
                                    idx_c = list(edges[(g_i, idx)])[0]
                                    for ext in [Ext.HEAD, Ext.TAIL]:
                                        edges_to_add.append((("A", g_i, idx, ext), ("B", g_i, idx_c, ext)))
                            except KeyError:
                                pass
            # if no degree one vertices, it is a cycle, I can fix the y_i:
            elif len(degree_one) == 0:
                # get indexes of the y_i:
                indexes = [(v, y_label[vertex_name(*v)]) for v in comp]
                min_label = min([x[1] for x in indexes])
                # every label of the cycle is tied to the smallest one, and only
                # that smallest label gets z = 1.
                for v, label in indexes:
                    y_fix[label] = min_label
                    z_fix[label] = 0
                z_fix[min_label] = 1
                vertices_to_remove.extend(comp)

    # DRAW?
    # nx.draw_circular(master_graph, font_size=8, width=0.5, node_shape="8", node_size=1, with_labels=True)
    # nx.draw_spring(master_graph, font_size=8, width=0.5, node_shape="8", node_size=20, with_labels=True)
    # nx.draw_spectral(master_graph, font_size=8, width=0.5, node_shape="8", node_size=20, with_labels=True)
    # nx.draw_graphviz(master_graph, font_size=8, width=0.5, node_shape="8", node_size=20, with_labels=True)
    # plt.savefig('graph.pdf', bbox_inches='tight')

    # all fixed, generate ILP:
    # entries starting with a backslash are emitted as comment lines of the model.
    constraints = []

    # consistency and matching 1-to-1
    constraints.append("\ Matching and consistency constraints")
    # sorting just to make it nicer looking:
    for (gene, copy_a) in sorted(edges):
        copy_set_b = edges[(gene, copy_a)]
        if len(copy_set_b) > 1:
            # head and tail variables of the same candidate edge must agree.
            for copy_b in copy_set_b:
                constraints.append("%s - %s = 0" % (
                    matching_edge_name(gene, copy_a, copy_b, Ext.TAIL),
                    matching_edge_name(gene, copy_a, copy_b, Ext.HEAD)))
            # exactly one candidate is chosen for this copy of A.
            constraints.append(
                " + ".join([matching_edge_name(gene, copy_a, copy_b, Ext.TAIL) for copy_b in copy_set_b]) + " = 1")

    constraints.append("\ Balancing:")
    # gene -> copy indexes that exist only to balance the counts of both genomes.
    balancing_genes_A = {g: range(gene_count["A"][g] + 1, gene_count["B"][g] + 1) for g in
                         total_gene_count.iterkeys() if gene_count["A"][g] < gene_count["B"][g]}
    balancing_genes_B = {g: range(gene_count["B"][g] + 1, gene_count["A"][g] + 1) for g in
                         total_gene_count.iterkeys() if gene_count["B"][g] < gene_count["A"][g]}

    for genome, balancing in [("A", balancing_genes_A), ("B", balancing_genes_B)]:
        constraints.append("\ Genome %s" % genome)
        for gene_i, copy_i, ext_i in balancing_extremities(balancing):
            # check if fixed:
            if (gene_i, copy_i, ext_i) in balancing_fix[genome]:
                gene_j, copy_j, ext_j = balancing_fix[genome][(gene_i, copy_i, ext_i)]
                # the fix is stored in both directions; emit it once.
                if (gene_i, copy_i, ext_i) < (gene_j, copy_j, ext_j):
                    constraints.append(
                        "%s = 1" % balancing_edge_name(genome, gene_i, copy_i, ext_i, gene_j, copy_j, ext_j))
            # if not, matching 1-to-1:
            else:
                constraints.append(
                    " + ".join([balancing_edge_name(genome, gene_i, copy_i, ext_i, gene_j, copy_j, ext_j)
                                for gene_j, copy_j, ext_j in
                                balancing_extremities(balancing, exclude=balancing_fix[genome].keys())
                                if (gene_i, copy_i, ext_i) != (gene_j, copy_j, ext_j)]) + " = 1")

    constraints.append("\ Labelling")
    # for each adjacency, fix label:
    constraints.append("\\ Adjacency have the same label:")
    for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]:
        for i, j in adjacency_list(genome, total_gene_count):
            v_i = vertex_name(genome_name, *i)
            v_j = vertex_name(genome_name, *j)
            # if already fixed, skip
            if y_label[v_i] in y_fix and y_label[v_j] in y_fix:
                continue
            # if the edge is 0 for sure, also skip:
            constraints.append("y_%s - y_%s = 0 \\ %s <-> %s " % (y_label[v_i], y_label[v_j], v_i, v_j))

    constraints.append("\\ Matching edges with the same label:")
    for (gene, copy_a) in sorted(edges):
        copy_set_b = edges[(gene, copy_a)]
        for ext in [Ext.HEAD, Ext.TAIL]:
            y_i = y_label[vertex_name("A", gene, copy_a, ext)]
            # if edge is set, just make the y_i's equal;
            if len(copy_set_b) == 1:
                y_j = y_label[vertex_name("B", gene, list(copy_set_b)[0], ext)]
                # skip if this y_i's are already fixed
                if y_i in y_fix and y_j in y_fix:
                    continue
                constraints.append("y_%s - y_%s = 0 " % (y_i, y_j))
            else:
                # if edge not set, add both ineqs.
                for copy_b in copy_set_b:
                    y_j = y_label[vertex_name("B", gene, copy_b, ext)]
                    constraints.append(
                        "y_%s - y_%s + %s %s <= %d" % (
                            y_i, y_j, y_i, matching_edge_name(gene, copy_a, copy_b, ext), y_i))
                    constraints.append(
                        "y_%s - y_%s + %s %s <= %d" % (
                            y_j, y_i, y_j, matching_edge_name(gene, copy_a, copy_b, ext), y_j))

    constraints.append("\\ Balancing edges with same label:")
    for genome, balancing in [("A", balancing_genes_A), ("B", balancing_genes_B)]:
        constraints.append("\\ Genome %s" % genome)
        for gene_i, copy_i, ext_i in balancing_extremities(balancing, exclude=balancing_fix[genome].keys()):
            for gene_j, copy_j, ext_j in balancing_extremities(balancing, exclude=balancing_fix[genome].keys()):
                # each unordered pair is handled once.
                if (gene_i, copy_i, ext_i) >= (gene_j, copy_j, ext_j):
                    continue
                y_i = y_label[vertex_name(genome, gene_i, copy_i, ext_i)]
                y_j = y_label[vertex_name(genome, gene_j, copy_j, ext_j)]
                # should not have someone here if I'm excluding fixed edges:
                if y_i in y_fix and y_j in y_fix:
                    continue
                constraints.append("y_%s - y_%s + %s %s <= %d" % (
                    y_i, y_j, y_i, balancing_edge_name(genome, gene_i, copy_i, ext_i, gene_j, copy_j, ext_j), y_i))
                constraints.append("y_%s - y_%s + %s %s <= %d" % (
                    y_j, y_i, y_j, balancing_edge_name(genome, gene_i, copy_i, ext_i, gene_j, copy_j, ext_j), y_j))

    # z variables: since all cycles have to contains vertices from both genomes, we only add z variables
    # for genome A, that have smallest labels, so a genome B z variable will never be =1.
    constraints.append("\\ Z variables")
    for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)):
        # presumably the vertex name starts with the genome letter - confirm
        # against vertex_name().
        if vertex[0] == "A":
            # if i in z_fix and z_fix[i] == 0:
            #     continue
            # if i in z_fix and z_fix[i] == 1:
            #     constraints.append("z_%s = 1" % i)
            if i not in z_fix:
                constraints.append("%d z_%s - y_%s <= 0" % (i, i, i))

    # number of genes, to fix distance:
    constraints.append("n = %d" % (sum(total_gene_count.itervalues())))
    # number of fixed cycles
    constraints.append("c = %d" % (sum(z_fix.itervalues())))

    # for g in sorted(total_gene_count):
    #     print g,total_gene_count[g]

    # bounds:
    bounds = []
    for i in sorted(y_label.itervalues()):
        if i not in y_fix:
            bounds.append("y_%d <= %d" % (i, i))

    # variables:
    binary = []

    # matching edges, skipping fixed pairs.
    matching = ["\ match"]
    # for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)):
    for (gene, copy_a), copy_set_b in sorted(edges.items(), key=operator.itemgetter(0)):
        if len(copy_set_b) > 1:
            for copy_b in copy_set_b:
                for ext in [Ext.HEAD, Ext.TAIL]:
                    matching.append(matching_edge_name(gene, copy_a, copy_b, ext))
    # NOTE(review): the printed count includes the "\ match" header entry.
    print "%d matching edges" % len(matching)
    # print "Potentially %d matching edges" % sum([2*x ** 2 for x in gene_count.itervalues()])
    binary.extend(matching)

    # balancing edges:
    balancing_edges = [balancing_edge_name(genome, gene_i, copy_i, ext_i, gene_j, copy_j, ext_j)
                       for genome, balancing in [("A", balancing_genes_A), ("B", balancing_genes_B)]
                       for gene_i, copy_i, ext_i in
                       balancing_extremities(balancing, exclude=balancing_fix[genome].keys())
                       for gene_j, copy_j, ext_j in
                       balancing_extremities(balancing, exclude=balancing_fix[genome].keys())
                       if (gene_i, copy_i, ext_i) < (gene_j, copy_j, ext_j)]
    print "%d balancing edges" % len(balancing_edges)
    binary.extend(balancing_edges)

    # z cycles:
    for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)):
        if i in z_fix:  # and z_fix[i] == 0:
            continue
        if vertex[0] == "B":
            continue
        binary.append("z_%d" % i)

    # Y label are general:
    # TODO: remove unused y' and z's from model. If y=1, it can be removed, just set z=1.
    general = []
    for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)):
        if i not in y_fix:
            general.append("y_%d" % i)

    # number of genes and fixed cycles:
    general.append("n")
    general.append("c")

    # objective function:
    objective = ["obj: n - c - " + " - ".join(
        ["z_%d" % i for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1))
         if vertex[0] == "A" and i not in z_fix])]

    # write: one header line per section, followed by its lines.
    with open(output, "w") as f:
        for header, lines in [("Minimize", objective), ("Subject to", constraints),
                              ("Bounds", bounds), ("Binary", binary), ("General", general)]:
            print >> f, header
            print >> f, "\n".join(lines)
def solve_ilp(timelimit=60):
    """Solve the ILP stored in ``filename`` with Gurobi and report on it.

    ``filename``, ``genome_a``, ``genome_b`` and ``skip_balancing`` are not
    parameters; they are read from the enclosing scope.

    Unless the model is infeasible, the solution is written to
    ``filename + '.sol'``, the matching is compared with the one returned by
    ``build_correct_matching`` and the components of the resulting graph are
    classified as cycles, AA-, BB- or AB- paths.  For an infeasible model the
    IIS is computed, written to "unfeasible.lp" and its constraints printed.

    :param timelimit: solver time limit, in seconds.
    :return: the Gurobi model.
    :raises ValueError: if a component of the solution graph has a number of
        degree-one vertices other than 0 or 2.
    """
    # import here, so only if actually solving we will need gurobi.
    from gurobipy import read, GRB
    # pycharm complains of gurobi commands, cannot see them from the import
    model = read(filename)

    # set some options:
    # time limit in seconds:
    model.params.timeLimit = timelimit

    # not verbose:
    # model.setParam('OutputFlag', False)
    # MIP focus, from 0 to 3:
    model.params.MIPFocus = 1  # best solutions, less focus on bounds.
    model.optimize()

    if model.status != GRB.Status.INFEASIBLE:
        print('FINISHED: Best objective: %g' % model.objVal)
        print('Optimization ended with status %d' % model.status)
        model.write(filename + '.sol')

    if model.status == GRB.INFEASIBLE:
        model.computeIIS()
        model.write("unfeasible.lp")
        print('\nThe following constraint(s) cannot be satisfied:')
        for constr in model.getConstrs():
            if constr.IISConstr:
                print('%s' % constr.constrName)
    else:
        z = n = c = 0
        solution_matching = collections.defaultdict(list)
        # only the head ("h") variable of each matching edge is inspected.
        matching_regexp = re.compile(r"x_A(\d+)_(\d+)h,B(\d+)_(\d+)h")
        # get basic vars and matching:
        for v in model.getVars():
            if v.varName == "n":
                n = v.x
            elif v.varName == "c":
                c = v.x
            elif v.varName.startswith("z") and v.x >= 0.9:
                z += 1
            else:
                m = matching_regexp.match(v.varName)
                # the solver returns floats (0.99999, 1.000000001, ...), so a
                # binary variable is "set" above a threshold, not when == 1.
                if m is not None and v.x >= 0.9:
                    g_a, c_a, g_b, c_b = map(int, m.groups())
                    solution_matching[g_a].append((c_a, c_b))

        from parse_orthology import build_correct_matching, parse_orthology_quality
        correct_matching = build_correct_matching(genome_a, genome_b)
        tp, fp, fn = parse_orthology_quality(solution_matching, correct_matching)

        print("N: %d cycles:%d (%d fixed, %d from opt)" % (n, z + c, c, z))
        print("Orthology. TP:%d FP:%d FN:%d" % (len(tp), len(fp), len(fn)))

        # Now, analyse the BP graph, for the incomplete matching model, to find
        # AA-, BB- and AB- components:
        master_graph = nx.Graph()
        # matching edges taken from the solution:
        for gene, pair_list in solution_matching.items():
            for copy_a, copy_b in pair_list:
                for ext in [Ext.HEAD, Ext.TAIL]:
                    master_graph.add_edge(("A", gene, copy_a, ext), ("B", gene, copy_b, ext))

        # add adjacency edges:
        for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]:
            for (g_i, copy_a, e_i), (g_j, copy_j, e_j) in genome.adjacency_iter_with_copies():
                master_graph.add_edge((genome_name, g_i, copy_a, e_i), (genome_name, g_j, copy_j, e_j))

        count = {"A": 0, "B": 0, "AB": 0}
        for comp in connected_components(master_graph):
            degree_one = [v for v in comp if master_graph.degree(v) == 1]
            if len(degree_one) == 0:
                # a cycle: nothing to classify.
                continue
            if len(degree_one) != 2:
                # neither a cycle nor a path: the solution graph is malformed.
                raise ValueError(
                    "component with %d degree-one vertices (expected 0 or 2): %r"
                    % (len(degree_one), degree_one))
            if degree_one[0][0] == degree_one[1][0]:
                # both ends in the same genome: AA- or BB- path.
                count[degree_one[0][0]] += 1
            else:
                count["AB"] += 1
        print(count)
        if skip_balancing:
            print("Corrected distance: %d" % (model.objVal + count["AB"] / 2))

    return model
def dcj_dupindel_ilp(genome_a, genome_b, output, skip_balancing=False, fix_vars=True, solve=False, all_vs_all=False): def solve_ilp(timelimit=60): # import here, so only if actually solving we will need gurobi. from gurobipy import read, GRB # pycharm complains of gurobi commands, cannot see them from the import model = read(filename) # set some options: # time limit in seconds: model.params.timeLimit = timelimit # not verbose: # model.setParam('OutputFlag', False) # MIP focus, from 0 to 3: model.params.MIPFocus = 1 # best solutions, less focus on bounds. model.optimize() if model.status != GRB.Status.INFEASIBLE: print('FINISHED: Best objective: %g' % model.objVal) print('Optimization ended with status %d' % model.status) model.write(filename + '.sol') if model.status == GRB.INFEASIBLE: model.computeIIS() model.write("unfeasible.lp") print('\nThe following constraint(s) cannot be satisfied:') for c in model.getConstrs(): if c.IISConstr: print('%s' % c.constrName) else: z = n = c = 0 solution_matching = collections.defaultdict(list) matching_regexp = re.compile("x_A(\d+)_(\d+)h,B(\d+)_(\d+)h") # get basic vars and matching: for v in model.getVars(): if v.varName == "n": n = v.x elif v.varName == "c": c = v.x elif v.varName.startswith("z") and v.x >= 0.9: z += 1 else: m = matching_regexp.match(v.varName) if m is not None and v.x == 1: g_a, c_a, g_b, c_b = map(int, m.groups()) solution_matching[g_a].append((c_a, c_b)) from parse_orthology import build_correct_matching, parse_orthology_quality correct_matching = build_correct_matching(genome_a, genome_b) tp, fp, fn = parse_orthology_quality(solution_matching, correct_matching) print "N: %d cycles:%d (%d fixed, %d from opt)" % (n, z + c, c, z) print "Orthology. 
TP:%d FP:%d FN:%d" % (len(tp), len(fp), len(fn)) # print match_edges # Now, analyse the BP graph, for the incomplete matching model, to find AA-, BB- and AB- components: master_graph = nx.Graph() # fixed vars: # add matching edges of genes with single copy: # for (gene, copy_a), copy_j in match_edges.iteritems(): for gene, pair_list in solution_matching.iteritems(): for copy_a, copy_b in pair_list: for ext in [Ext.HEAD, Ext.TAIL]: master_graph.add_edge(("A", gene, copy_a, ext), ("B", gene, copy_b, ext)) # add adjacency edges: for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]: for (g_i, copy_a, e_i), (g_j, copy_j, e_j) in genome.adjacency_iter_with_copies(): master_graph.add_edge((genome_name, g_i, copy_a, e_i), (genome_name, g_j, copy_j, e_j)) count = {"A": 0, "B": 0, "AB": 0} c = 0 # print "C:", len([x for x in connected_components(master_graph)]) for comp in connected_components(master_graph): degree_one = [v for v in comp if master_graph.degree(v) == 1] if len(degree_one) == 0: c += 1 else: if len(degree_one) != 2: import ipdb; ipdb.set_trace() if degree_one[0][0] == degree_one[1][0]: count[degree_one[0][0]] += 1 else: count["AB"] += 1 print count if skip_balancing: print "Corrected distance: %d" % (model.objVal + count["AB"] / 2) return model # copy genomes to possibly make some changes: genome_a = copy.deepcopy(genome_a) genome_b = copy.deepcopy(genome_b) add_capping_genes(genome_a, genome_b) # since the gene set might be different for each genome, find all genes: all_genes = genome_a.gene_set().union(genome_b.gene_set()) # find all gene copies gene_copies = build_gene_copies_dict(all_genes, genome_a, genome_b) # count balancing genes: bal = { g: sum([len([c for c in gene_copies[g][gene].itervalues() if c == CopyType.BALANCING]) for gene in all_genes]) for g in ["A", "B"]} print "Balancing genes:A=%(A)d, B=%(B)d" % bal # define the y labels (vertex = genome,gene,copy,ext) -> integer 1..n y_label = define_y_label(gene_copies) # store all possible 
matchings (edges) from each family: fixed_matching = {} possible_matching = {} for gene in all_genes: # if only 1 copy, matching is fixed: if len(gene_copies["A"][gene]) == 1: # fix the matching, then remove from the available copies copy_a, type_a = gene_copies["A"][gene].items()[0] copy_j, type_b = gene_copies["B"][gene].items()[0] fixed_matching[(gene, copy_a)] = copy_j else: possible_matching[gene] = {"A": {copy_i for copy_i, type_i in gene_copies["A"][gene].items()}, "B": {copy_i for copy_i, type_i in gene_copies["B"][gene].items()}} # Build the BP graph of fixed matchings to try to find more variables to fix: y_fix = {} z_fix = {} balancing_fix = {"A": {}, "B": {}} if fix_vars: master_graph = nx.Graph() # fixed vars: # add matching edges of genes with single copy: for (gene, copy_a), copy_j in fixed_matching.iteritems(): for ext in [Ext.HEAD, Ext.TAIL]: master_graph.add_edge(("A", gene, copy_a, ext), ("B", gene, copy_j, ext)) # add adjacency edges: for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]: for (g_i, copy_a, e_i), (g_j, copy_j, e_j) in genome.adjacency_iter_with_copies(): master_graph.add_edge((genome_name, g_i, copy_a, e_i), (genome_name, g_j, copy_j, e_j)) # Search components to fix: rescan = True edges_to_add = [] vertices_to_remove = [] ab_components = set() while rescan: rescan = False # Pre-scan: # add and remove edges detected from previous rounds: master_graph.add_edges_from(edges_to_add) master_graph.remove_nodes_from(vertices_to_remove) edges_to_add = [] vertices_to_remove = [] # fix AB-components; while I have at least 2, join pairs arbitrarily: while len(ab_components) > 1: a_i, b_i = ab_components.pop() a_j, b_j = ab_components.pop() master_graph.add_edge(a_i, a_j) balancing_fix["A"][a_i[1:]] = a_j[1:] balancing_fix["A"][a_j[1:]] = a_i[1:] master_graph.add_edge(b_i, b_j) balancing_fix["B"][b_i[1:]] = b_j[1:] balancing_fix["B"][b_j[1:]] = b_i[1:] # Now I search for vertices that have only balancing vertices as matching # 
candidates. If that is the case, I can fix them arbitrarly. fix_only_bal = True if fix_only_bal: for gene in sorted(possible_matching): set_a = possible_matching[gene]["A"] set_b = possible_matching[gene]["B"] if all([gene_copies["A"][gene][copy_a] == CopyType.BALANCING for copy_a in set_a]) or all( [gene_copies["B"][gene][copy_b] == CopyType.BALANCING for copy_b in set_b]): for copy_a, copy_b in zip(set_a, set_b): fixed_matching[(gene, copy_a)] = copy_b # save edges to add to graph: for ext in [Ext.HEAD, Ext.TAIL]: # edges_to_add.append((("A", gene, copy_a, ext), ("B", gene, copy_b, ext))) master_graph.add_edge(("A", gene, copy_a, ext), ("B", gene, copy_b, ext)) rescan = True # remove from possible matching: del possible_matching[gene] # now loop for each connected component, fixing cycles and trying to close paths to cycles when possible. for comp in connected_components(master_graph): # can only consider even components; if len(comp) % 2 != 0: continue # get degree-1 vertices: degree_one = [v for v in comp if master_graph.degree(v) == 1] # if two degree one vertices, it is a path; if len(degree_one) == 2: genome_i, g_i, copy_a, e_i = degree_one[0] genome_j, g_j, copy_j, e_j = degree_one[1] # 1 - check if both nodes are balancing, to find AA-, BB- and AB- paths that can be fixed. i_is_balancing = g_i != 0 and gene_copies[genome_i][g_i][copy_a] == CopyType.BALANCING j_is_balancing = g_j != 0 and gene_copies[genome_j][g_j][copy_j] == CopyType.BALANCING if i_is_balancing and j_is_balancing: # open-path, both ends are balancing. # If AA- or BB-path, close it to a cycle: if genome_i == genome_j: # fix the cycle: fix_cycle_y_z(comp, y_label, y_fix, z_fix, vertices_to_remove) # fix the balancing variables if we have them: if not skip_balancing: balancing_fix[genome_i][degree_one[0][1:]] = degree_one[1][1:] balancing_fix[genome_i][degree_one[1][1:]] = degree_one[0][1:] else: # If not, it is AB-, add to the list to try to make pairs. 
if skip_balancing: # if not using balancing edges, I can fix the AB directly, instead of # doing the merge in pairs; fix_cycle_y_z(comp, y_label, y_fix, z_fix, vertices_to_remove) else: # merge in pairs: ab_components.add(tuple(sorted(degree_one))) if len(ab_components) > 1: rescan = True # Not open path; then, check if the path has homologous extremities at both ends, so I can close # to a path: elif genome_i != genome_j and g_i == g_j and e_i == e_j: # invert to put genome A always in variables _i : if genome_j == "A": genome_i, g_i, copy_a, e_i, genome_j, g_j, copy_j, e_j = genome_j, g_j, copy_j, e_j, genome_i, g_i, copy_a, e_i # check conflict, only add edge if it's in the allowed edges: if g_i in possible_matching and copy_a in possible_matching[g_i]["A"] and copy_j in \ possible_matching[g_i]["B"]: # new edges, re-scan: rescan = True fix_new_matching(fixed_matching, edges_to_add, possible_matching, g_i, copy_a, copy_j) # if there are no degree one vertices, it is a cycle; I can fix the y_i and z_i for this cycle: elif len(degree_one) == 0: fix_cycle_y_z(comp, y_label, y_fix, z_fix, vertices_to_remove) rescan = True # DRAW: draw_bp = False if draw_bp: plot_bp('graph.pdf', master_graph, gene_copies, possible_matching) # all fixed, generate ILP # to make it easier to find the matching edges, specially when limiting edges from balancing genes, # I will build a gene connections graph; gene_connection = nx.DiGraph() # make it directed, so the vertex of A is always 1st on the edge tuple. 
for gene in possible_matching.iterkeys(): set_a = possible_matching[gene]["A"] set_b = possible_matching[gene]["B"] # All vs all model: if all_vs_all: for copy_a in set_a: for copy_b in set_b: gene_connection.add_edge(("A", gene, copy_a), ("B", gene, copy_b)) else: # try to minimise needed matching edges for balancing nodes: real_a = [cp for cp in set_a if gene_copies["A"][gene][cp] == CopyType.REAL] real_b = [cp for cp in set_b if gene_copies["B"][gene][cp] == CopyType.REAL] # all real, then all-vs-all: if len(real_a) == len(real_b): for copy_a in set_a: for copy_b in set_b: gene_connection.add_edge(("A", gene, copy_a), ("B", gene, copy_b)) # a has balancing: if len(real_a) < len(real_b): balancing_a = [cp for cp in set_a if gene_copies["A"][gene][cp] == CopyType.BALANCING] # the real in A match the real in B (which are all) for copy_a in real_a: for copy_b in set_b: gene_connection.add_edge(("A", gene, copy_a), ("B", gene, copy_b)) # then, the balancing in A have len(real_a)+1 incident edges list_b = list(set_b) for idx, copy_a in enumerate(balancing_a): for j in range(len(real_a) + 1): gene_connection.add_edge(("A", gene, copy_a), ("B", gene, list_b[idx + j])) # b has balancing: else: balancing_b = [cp for cp in set_b if gene_copies["B"][gene][cp] == CopyType.BALANCING] # the real in B match the real in A (which are all) for copy_b in real_b: for copy_a in set_a: gene_connection.add_edge(("A", gene, copy_a), ("B", gene, copy_b)) # then, the balancing in B have len(real_b)+1 incident edges list_a = list(set_a) for idx, copy_b in enumerate(balancing_b): for j in range(len(real_b) + 1): gene_connection.add_edge(("A", gene, list_a[idx + j]), ("B", gene, copy_b)) # Start building constraints: constraints = [] # consistency and matching 1-to-1 # Fixed matching: # sorting just to make it nicer looking: constraints.append("\ Fixed matching:") for (gene, copy_a), copy_b in sorted(fixed_matching.items(), key=lambda pair: pair[0]): constraints.append("%s = 1" % 
matching_edge_name(gene, copy_a, copy_b, Ext.TAIL)) constraints.append("%s = 1" % matching_edge_name(gene, copy_a, copy_b, Ext.HEAD)) # HEAD TAIL consistency: constraints.append("\ Head/Tail consistency:") for (_, gene_a, copy_a), (_, gene_b, copy_b) in gene_connection.edges_iter(): constraints.append("%s - %s = 0" % ( matching_edge_name(gene_a, copy_a, copy_b, Ext.TAIL), matching_edge_name(gene_a, copy_a, copy_b, Ext.HEAD))) # 1 Matching per node : constraints.append("\ Degree 1 per node (Matching):") # for all vertices: for v in gene_connection.nodes_iter(): # find the incident edges: if v[0] == "A": edges = gene_connection.out_edges_iter else: edges = gene_connection.in_edges_iter incident = [matching_edge_name(gene_a, copy_a, copy_b, Ext.TAIL) for (_, gene_a, copy_a), (_, gene_b, copy_b) in edges(v)] # sum of incidents is 1: constraints.append("%s = 1" % (" + ".join(incident))) if not skip_balancing: constraints.append("\ Balancing:") for genome in ["A", "B"]: constraints.append("\ Genome %s" % genome) for gene_i, copy_a, ext_i in balancing_extremities(gene_copies[genome]): # check if fixed: if (gene_i, copy_a, ext_i) in balancing_fix[genome]: gene_j, copy_j, ext_j = balancing_fix[genome][(gene_i, copy_a, ext_i)] if (gene_i, copy_a, ext_i) < (gene_j, copy_j, ext_j): constraints.append( "%s = 1" % balancing_edge_name(genome, gene_i, copy_a, ext_i, gene_j, copy_j, ext_j)) # if not, matching 1-to-1: else: constraints.append( " + ".join([balancing_edge_name(genome, gene_i, copy_a, ext_i, gene_j, copy_j, ext_j) for gene_j, copy_j, ext_j in balancing_extremities(gene_copies[genome], exclude=balancing_fix[genome].keys()) if (gene_i, copy_a, ext_i) != (gene_j, copy_j, ext_j)]) + " = 1") constraints.append("\ Labelling") # for each adjacency, fix the label of adjacent genes: constraints.append("\\ Adjacent nodes have the same label:") for genome, genome_name in [(genome_a, "A"), (genome_b, "B")]: for (g_i, copy_a, ext_i), (g_j, copy_j, ext_j) in 
genome.adjacency_iter_with_copies(): v_i = genome_name, g_i, copy_a, ext_i v_j = genome_name, g_j, copy_j, ext_j # if already fixed, skip if y_label[v_i] in y_fix and y_label[v_j] in y_fix: continue # if the edge is 0 for sure, also skip: constraints.append("y_%s - y_%s = 0 \\ %s <-> %s " % (y_label[v_i], y_label[v_j], v_i, v_j)) # constraints.append("\\ Matching extremities have the same label:") # if extremities are matched, but I don't know the y_i (cycle was not closed in the fixing phase), # then I know that the y_i's of these extremities are equal: constraints.append("\\ Fixed matching without fixed y_i:") for (gene, copy_a) in sorted(fixed_matching): copy_j = fixed_matching[(gene, copy_a)] for ext in [Ext.HEAD, Ext.TAIL]: y_i = y_label[("A", gene, copy_a, ext)] y_j = y_label[("B", gene, copy_j, ext)] # only add if this y_i's aren't already fixed if y_i not in y_fix and y_j not in y_fix: constraints.append("y_%s - y_%s = 0 " % (y_i, y_j)) # for the "open" matching, for each edge I add the "y fixing" restrictions, that force the y_i's # to be equal whenever the edge variable is set to 1. 
constraints.append("\\ Open matching:") for (_, gene_a, copy_a), (_, gene_b, copy_b) in gene_connection.edges_iter(): for ext in [Ext.HEAD, Ext.TAIL]: y_a = y_label[("A", gene_a, copy_a, ext)] y_b = y_label[("B", gene_b, copy_b, ext)] constraints.append( "y_%s - y_%s + %s %s <= %d" % ( y_a, y_b, y_a, matching_edge_name(gene_a, copy_a, copy_b, ext), y_a)) constraints.append( "y_%s - y_%s + %s %s <= %d" % ( y_b, y_a, y_b, matching_edge_name(gene_a, copy_a, copy_b, ext), y_b)) if not skip_balancing: constraints.append("\\ Balancing edges have same label:") for genome in ["A", "B"]: constraints.append("\\ Genome %s" % genome) for gene_i, copy_a, ext_i in balancing_extremities(gene_copies[genome], exclude=balancing_fix[genome].keys()): for gene_j, copy_j, ext_j in balancing_extremities(gene_copies[genome], exclude=balancing_fix[genome].keys()): if (gene_i, copy_a, ext_i) >= (gene_j, copy_j, ext_j): continue y_i = y_label[(genome, gene_i, copy_a, ext_i)] y_j = y_label[(genome, gene_j, copy_j, ext_j)] # should not have someone here if I'm excluding fixed edges: if y_i in y_fix and y_j in y_fix: continue constraints.append("y_%s - y_%s + %s %s <= %d" % ( y_i, y_j, y_i, balancing_edge_name(genome, gene_i, copy_a, ext_i, gene_j, copy_j, ext_j), y_i)) constraints.append("y_%s - y_%s + %s %s <= %d" % ( y_j, y_i, y_j, balancing_edge_name(genome, gene_i, copy_a, ext_i, gene_j, copy_j, ext_j), y_j)) # z variables: since all cycles have to contains vertices from both genomes, we only add z variables # for genome A, that have smallest labels, so a genome B z variable will never be =1. 
constraints.append("\\ Z variables") for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)): if vertex[0] == "A": if i not in z_fix: constraints.append("%d z_%s - y_%s <= 0" % (i, i, i)) # # # number of genes, to fix distance: n_genes = sum([len(copies) for copies in gene_copies["A"].itervalues()]) constraints.append("n = %d" % n_genes) # # number of fixed cycles constraints.append("c = %d" % (sum(z_fix.itervalues()))) # # # bounds: bounds = [] for i in sorted(y_label.itervalues()): if i not in y_fix: bounds.append("y_%d <= %d" % (i, i)) # # # variables: binary = [] # # # matching edges matching = ["\ Fixed matching:"] for (gene, copy_a), copy_b in fixed_matching.iteritems(): matching.append(matching_edge_name(gene, copy_a, copy_b, Ext.TAIL)) matching.append(matching_edge_name(gene, copy_a, copy_b, Ext.HEAD)) matching.append("\ Open matching:") for (_, gene_a, copy_a), (_, gene_b, copy_b) in gene_connection.edges_iter(): for ext in [Ext.HEAD, Ext.TAIL]: matching.append(matching_edge_name(gene_a, copy_a, copy_b, ext)) print "%d fixed matching edges" % (len(fixed_matching) * 2) print "%d open matching edges" % (len(gene_connection.edges()) * 2) binary.extend(matching) if not skip_balancing: # balancing edges: balancing_edges = [balancing_edge_name(genome, gene_i, copy_a, ext_i, gene_j, copy_j, ext_j) for genome in ["A", "B"] for gene_i, copy_a, ext_i in balancing_extremities(gene_copies[genome], exclude=balancing_fix[genome].keys()) for gene_j, copy_j, ext_j in balancing_extremities(gene_copies[genome], exclude=balancing_fix[genome].keys()) if (gene_i, copy_a, ext_i) < (gene_j, copy_j, ext_j)] print "%d balancing edges" % len(balancing_edges) binary.extend(balancing_edges) # # z cycles: for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)): if i in z_fix: # and z_fix[i] == 0: continue if vertex[0] == "B": continue binary.append("z_%d" % i) # # # Y label are general: # TODO: remove unused y' and z's from model. 
If y=1, it can be removed, just set z=1. general = [] for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)): if i not in y_fix: general.append("y_%d" % i) # # # number of genes and fixed cycles: general.append("n") general.append("c") # # objective function: z_obj = " - ".join(["z_%d" % i for vertex, i in sorted(y_label.items(), key=operator.itemgetter(1)) if vertex[0] == "A" and i not in z_fix]) objective = ["obj: n - c %s" % ("- " + z_obj if len(z_obj) > 0 else "")] # write ILP: with open(output, "w") as f: for header, lines in [("Minimize", objective), ("Subject to", constraints), ("Bounds", bounds), ("Binary", binary), ("General", general)]: print >> f, header print >> f, "\n".join(lines) if solve: model = solve_ilp(timelimit=60) return model
def plot_bp(filename, master_graph, gene_copies, possible_matching, simplified=True):
    """Draw the breakpoint graph and save the picture to ``filename``.

    :param filename: output path handed to ``plt.savefig``.
    :param master_graph: graph whose vertices are
        ``(genome, gene, copy, extremity)`` tuples.  NOTE: balancing
        extremities are added to this object in place before drawing.
    :param gene_copies: mapping ``genome -> ...`` passed to
        ``balancing_extremities`` to enumerate the balancing extremities.
    :param possible_matching: mapping ``gene -> genome -> copies``; only
        balancing extremities whose gene/copy appear here are added.
    :param simplified: when true, every component with more than one vertex
        is collapsed into a single edge joining its degree-one vertices.
    :return: None; the figure is written to ``filename``.
    """
    genome_names = ["A", "B"]

    # Balancing extremities that can still be matched are shown as vertices
    # even if no edge touches them yet.
    for genome in genome_names:
        for gene, cp, ext in balancing_extremities(gene_copies[genome]):
            if gene not in possible_matching:
                continue
            if cp not in possible_matching[gene][genome]:
                continue
            master_graph.add_node((genome, gene, cp, ext))

    if simplified:
        # Keep lone vertices as they are; replace every larger component by
        # the tuple of its degree-one vertices, used as an edge.
        # NOTE(review): a component without degree-one vertices (a cycle)
        # would contribute an empty tuple here -- confirm cycles are always
        # removed from the graph before this function is called.
        lone_vertices = []
        collapsed_edges = []
        for component in connected_components(master_graph):
            if len(component) == 1:
                lone_vertices.append(component.pop())
            else:
                collapsed_edges.append(
                    tuple(node for node in component if master_graph.degree(node) == 1))
        master_graph = nx.Graph()
        master_graph.add_edges_from(collapsed_edges)
        master_graph.add_nodes_from(lone_vertices)

    # Replace the tuple vertices by math-text labels and remember which labels
    # belong to balancing extremities, so they can get their own colour.
    balancing_by_genome = {genome: list(balancing_extremities(gene_copies[genome]))
                           for genome in genome_names}
    labels = {}
    normal_labels = []
    balancing_labels = []
    for node in master_graph.nodes():
        genome, gene, cp, ext = node
        label = "$%s%s_{(%s)}^%s$" % node
        labels[node] = label
        if (gene, cp, ext) in balancing_by_genome[genome]:
            balancing_labels.append(label)
        else:
            normal_labels.append(label)
    master_graph = nx.relabel_nodes(master_graph, labels)

    # Hand-made layout: one row per genome, components placed left to right,
    # largest first.  Graphviz alternative kept for reference:
    # pos = nx.nx_agraph.graphviz_layout(master_graph, prog="fdp")
    column = 0
    row_of = {"A": 1, "B": 0}
    pos = {}
    ordered_components = sorted(connected_components(master_graph),
                                key=lambda c: (-len(c), min(c)))
    for component in ordered_components:
        previous_genome = None
        # presumably sort_component yields the labels in path order -- verify
        # against its definition.
        for label in sort_component(master_graph, component, fmt=False):
            # label[0] is "$", so label[1] is the genome letter.
            genome = label[1]
            if previous_genome == genome:
                # two consecutive vertices on the same genome: next column.
                column += 1
            previous_genome = genome
            pos[label] = (column, row_of[genome])
        column += 1
        if column > 7:
            # row is full: restart at the left, two units further down.
            column = 0
            row_of["A"] -= 2
            row_of["B"] -= 2

    # Two drawing passes over the same graph, one colour per vertex class.
    for nodelist, color in [(normal_labels, "lightgray"), (balancing_labels, "lightblue")]:
        nx.draw(master_graph, pos, font_size=5, nodelist=nodelist, node_color=color,
                linewidths=0.1, width=0.5, node_size=400, with_labels=True)
    plt.savefig(filename, bbox_inches='tight')
def getConnectedComponents(self, G):
    """Return the result of ``nalgos.connected_components`` for graph ``G``.

    The instance itself is not consulted; the call is forwarded unchanged.
    """
    components = nalgos.connected_components(G)
    return components
from networkx import read_edgelist
import networkx as nx

# Load the graph from an edge-list file and report its size.
G = read_edgelist('hartford_drug.edgelist')
print(G.number_of_nodes())
print(G.number_of_edges())

import matplotlib.pyplot as plt

# Draw the whole graph and display it.
nx.draw(G)
plt.show()

# Find communities / connected subgraphs
from networkx.algorithms import number_connected_components, connected_components

print(number_connected_components(G))
for subG in connected_components(G):
    print(subG)

# Get the graph structure of each connected subgraph
try:
    from networkx.algorithms import connected_component_subgraphs
except ImportError:
    # connected_component_subgraphs was removed in NetworkX 2.4.  Provide an
    # equivalent under the same name so the rest of the script keeps working
    # on both old and new NetworkX releases.
    def connected_component_subgraphs(G, copy=True):
        """Yield one subgraph of ``G`` per connected component."""
        for nodes in connected_components(G):
            sub = G.subgraph(nodes)
            yield sub.copy() if copy else sub

for i, subG in enumerate(connected_component_subgraphs(G)):
    print('G%s' % i, subG.number_of_nodes(), subG.number_of_edges())

# Strengthen community detection with triangle counting
# Triangle counts and the clustering coefficient measure how tightly knit
# a community / subgraph is
from networkx.algorithms import triangles, transitivity, average_clustering

# Triangle counts
print(triangles(G))
# Average triangle count