def match(GA_orig, GB_orig, order=3, max_depth=10, complexity=4):
    # work on copies; the smaller graph is always used as reference A
    if len(GA_orig) > len(GB_orig):
        GA, GB = GB_orig.copy(), GA_orig.copy()
        logging.warning('Warning: reference graph is B not A')
    else:
        GA, GB = GA_orig.copy(), GB_orig.copy()
    # logging.warning('Matching graph A (%d nodes) to graph B (%d nodes)' % (len(GA_orig), len(GB_orig)))
    GA, GB = make_same_size(GA, GB)
    # first pass: discrete, normalized vertex vectors
    M = vertex_vectorize([GA, GB], complexity=complexity,
                         normalization=True, inner_normalization=True)
    MA, MB = M[0], M[1]
    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)
    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)
    # mark BFS values in the 'vec' attribute
    GA, GB = init_vec(GA), init_vec(GB)
    for k in range(order):
        ds = d[:, 0]
        id_max_A = np.argsort(ds)[k]
        id_max_B = AprefB[id_max_A][0]
        GA = annotate_with_bfs(GA, id_max_A, max_depth=max_depth)
        GB = annotate_with_bfs(GB, id_max_B, max_depth=max_depth)
    # draw_graph_set([GA, GB], n_graphs_per_line=2, size=9, secondary_vertex_label='vec')
    # vectorize a 2nd time, with real values this time
    M = vertex_vectorize([GA, GB], complexity=complexity, discrete=False,
                         normalization=False, inner_normalization=False)
    MA, MB = M[0], M[1]
    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)
    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)
    # build mutual preference rankings and solve the stable marriage problem
    A = ['A%d' % (i + 1) for i in range(len(GA))]
    B = ['B%d' % (i + 1) for i in range(len(GB))]
    Arankings = dict(((A[i], j + 1), B[AprefB[i, j]])
                     for i, j in product(range(len(GA)), range(len(GA))))
    Brankings = dict(((B[i], j + 1), A[BprefA[i, j]])
                     for i, j in product(range(len(GB)), range(len(GB))))
    rankings = Arankings
    rankings.update(Brankings)
    pairings = stable(rankings, A, B)
    # remove dummy node pairings
    npairings = trim_pairings(pairings, GA_orig, GB_orig)
    orderA, orderB = list(zip(*sorted(npairings)))
    return orderB

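# Hedged usage sketch (not part of the original module): matching two toy
# networkx graphs. It assumes vertex_vectorize expects a 'label' attribute on
# every node and edge, as EDeN does; labels and graph sizes are illustrative.
def _match_demo():
    import networkx as nx
    GA, GB = nx.path_graph(4), nx.path_graph(6)
    for g in (GA, GB):
        nx.set_node_attributes(g, 'C', 'label')
        nx.set_edge_attributes(g, '-', 'label')
    # returns one 'B%d' identifier per node of GA, in GA node order
    return match(GA, GB)
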
def vertex_vectorizer(self, exgraph):
    # TODO: vertex vectorization should be part of the vectorizer class;
    # however, this abstraction also needs to be done in graphlearn.
    return eg.vertex_vectorize([exgraph], d=self.eden_d, r=self.eden_r,
                               normalization=False, nbits=16,
                               inner_normalization=False)[0]

def main(args):
    print("Load input data ...")
    records = loadRecords(args.input, order="sequence,structure,reactivity")
    data = {}
    for name in records.keys():
        data[name] = [records[name]["reactivity"],
                      records[name]["sequence"],
                      records[name]["structure"]]
        # replace NaN reactivities with None so they can be masked later
        reactivity = []
        for x in data[name][0]:
            if np.isnan(x):
                reactivity.append(None)
            else:
                reactivity.append(x)
        data[name][0] = reactivity
    print("Done .")
    print("Train SHAKER model ...")
    fperformance = open(args.performance, "w") if args.performance != "-" else sys.stdout
    fperformance.write("\t".join(["name", "spearmanr", "p-value",
                                  "AUROC-observed-reactivity",
                                  "AUROC-predicted-reactivity",
                                  "RMSE"]) + "\n")
    fout = open(args.reactivity, "w")
    # leave-one-out evaluation: train on all other entries, predict the held-out one
    for name in data.keys():
        print(name)
        keys = set(data.keys())
        keys.remove(name)
        # data[name][0] reactivity
        # data[name][1] sequence
        # data[name][2] structure
        model = sim.make_model(data, list(keys))
        graph = util.sequence_dotbracket_to_graph(data[name][1], data[name][2])
        embedding = eg.vertex_vectorize([graph])[0]
        reactivity_pred = model.predict(embedding).reshape(-1)
        fout.write(">" + name + "\n")
        fout.write(",".join(np.round(reactivity_pred, 3).astype(str)) + "\n")
        reactivity = np.array(data[name][0]).astype(float)
        structure = data[name][2]
        auc = AUC(structure, reactivity)
        auc_pred = AUC(structure, reactivity_pred)
        nan_mask = np.isnan(reactivity)
        reactivity = reactivity[~nan_mask]
        reactivity_pred = reactivity_pred[~nan_mask]
        corr, p = spearmanr(reactivity_pred, reactivity)
        rmse = RMSE(reactivity_pred, reactivity)
        fperformance.write("\t".join([name, str(corr), str(p), str(auc),
                                      str(auc_pred), str(rmse)]) + "\n")
    fperformance.close()
    fout.close()

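# Hedged invocation sketch (not in the original script): the attribute names
# are inferred from how main() reads its args; the file paths are hypothetical.
def _main_demo():
    import argparse
    args = argparse.Namespace(input="data/records.fa",
                              performance="-",
                              reactivity="predicted_reactivity.fa")
    main(args)
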
def predict(model, sequence, seq_to_db_function=rnasubopt):
    db_list = seq_to_db_function(sequence)
    if len(db_list) == 1:
        graph = eden_rna.sequence_dotbracket_to_graph(sequence, db_list[0])
        return model.predict(eg.vertex_vectorize([graph])[0])
    # get the probability of each suboptimal structure
    struct_proba = probabilities_of_structures(sequence, db_list)
    structures, weights = zip(*struct_proba)
    # edenize and predict reactivity per structure
    graphs = [getgraph(sequence, x) for x in structures]
    vecs = list(eg.vertex_vectorize(graphs, r=3, d=3))
    predictions_all_structures = [model.predict(blob) for blob in vecs]
    # mix reactivities according to the structure probabilities
    return weighted_average(weights, predictions_all_structures)

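# Hedged usage sketch (not in the original source): predict() with the default
# rnasubopt backend; the toy sequence is illustrative and the model is assumed
# to be a regressor trained on EDeN vertex vectors (e.g. via make_model).
def _predict_demo(model):
    sequence = 'GGGAAAUCCCGGGAAAUCCC'  # hypothetical input
    return predict(model, sequence)
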
def vec_vertex(graph, param=None):
    # param, when given, overrides the default number of hash bits
    n_bits = param if param is not None else 5
    X = vertex_vectorize([graph], complexity=2, nbits=n_bits)
    x = X[0].A  # dense per-vertex feature matrix
    values = [list(xx[1:]) for xx in x]
    return [list(values)]

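# Hedged usage sketch: nbits=10 is an arbitrary illustrative choice; the graph
# is assumed to carry the 'label' attributes EDeN's vectorizer requires.
def _vec_vertex_demo(graph):
    default_vecs = vec_vertex(graph)         # uses the n_bits=5 default
    wide_vecs = vec_vertex(graph, param=10)  # larger hashed feature space
    return default_vecs, wide_vecs
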
def getXY(data, keys):
    '''Takes the entries of data listed in keys; returns X, Y for the
    regression task.'''
    # data maps name -> (react, sequence, dotbracket)
    # we first make some graphs
    react, sequence, stru = zip(*[data[k] for k in keys])
    graphs = list(map(getgraph, sequence, stru))
    # then we edenize
    x = vstack(eg.vertex_vectorize(graphs, r=3, d=3))
    y = np.array([y for reactlist in react for y in reactlist])
    # then mask out missing reactivities and we are done
    return mask(x, y)

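# Hedged sketch (not in the original file): the typical consumer of getXY(),
# fitting a scikit-learn regressor on the masked vertex vectors; the choice of
# SGDRegressor is illustrative only.
def _getxy_demo(data):
    from sklearn.linear_model import SGDRegressor
    X, y = getXY(data, list(data.keys()))
    return SGDRegressor().fit(X, y)
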
def vertex_vect_PCA(graphs, n_bits, pca_n_components, complexity=2):
    X = vertex_vectorize(graphs, complexity=complexity, nbits=n_bits)
    X = vstack(X).A
    # reduce the hashed vertex features with a truncated SVD
    pca = TruncatedSVD(n_components=pca_n_components)
    X_res = pca.fit_transform(X)
    # append the reduced components to each node's existing 'vec' attribute
    counter = 0
    for g in graphs:
        for node in g.nodes():
            new_vec = list(g.nodes[node]['vec'])
            new_vec.extend(X_res[counter])
            g.nodes[node]['vec'] = new_vec
            counter += 1
    return graphs

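# Hedged usage sketch: the bit width and component count are illustrative; the
# graphs must already carry a list-valued 'vec' attribute on every node
# (e.g. produced by init_vec above).
def _pca_demo(graphs):
    return vertex_vect_PCA(graphs, n_bits=10, pca_n_components=8)
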
    '-m', help="Trained model for simulation",
    default="data/reactivity/shaker-model.pkl")
args = parser.parse_args()

print("Load model ...")
with open(args.model, 'rb') as fmdl:
    model = pickle.load(fmdl)
print("Done .")

fout = open(args.output, "w")
with open(args.input) as fin:
    for line in fin:
        line = line.strip()
        if line.startswith(">"):
            name = line.replace(">", "")
            print("Processing {} ...".format(name))
            line = next(fin)
            sequence = line.strip()
            line = next(fin)
            dbn = line.split(" ")[0].strip()
            # graph from sequence + dot-bracket, then per-vertex embedding
            graph = util.sequence_dotbracket_to_graph(sequence, dbn)
            embedding = eg.vertex_vectorize([graph])[0]
            reactivity = model.predict(embedding).reshape(-1)
            data = [name] + list(reactivity.astype(str))
            fout.write("\t".join(data) + "\n")
        else:
            continue
fout.close()
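
# Expected input format, inferred from the parsing loop above: a FASTA-like
# file where each record is a '>name' header line, a sequence line, and a
# dot-bracket line (anything after the first space on that line is ignored).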