Example #1
import logging
from itertools import product

import numpy as np
from sklearn.neighbors import NearestNeighbors
# vertex_vectorize ships with EDeN; make_same_size, init_vec, annotate_with_bfs,
# stable and trim_pairings are project helpers not shown in this snippet.
from eden.graph import vertex_vectorize


def match(GA_orig, GB_orig, order=3, max_depth=10, complexity=4):
    if len(GA_orig) > len(GB_orig):
        GA, GB = GB_orig.copy(), GA_orig.copy()
        logging.warning('reference graph is B not A')
    else:
        GA, GB = GA_orig.copy(), GB_orig.copy()
    # logging.warning('Matching graph A (%d nodes) to graph B (%d nodes)' % (len(GA_orig), len(GB_orig)))

    GA, GB = make_same_size(GA, GB)

    M = vertex_vectorize([GA, GB], complexity=complexity, normalization=True, inner_normalization=True)
    MA, MB = M[0], M[1]

    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)

    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)

    # seed BFS annotations in the 'vec' attribute from the best-matched vertices
    GA, GB = init_vec(GA), init_vec(GB)
    for k in range(order):
        ds = d[:, 0]                    # distance of each A vertex to its nearest B vertex
        id_max_A = np.argsort(ds)[k]    # k-th best-matched vertex of A
        id_max_B = AprefB[id_max_A][0]  # its preferred counterpart in B

        GA = annotate_with_bfs(GA, id_max_A, max_depth=max_depth)
        GB = annotate_with_bfs(GB, id_max_B, max_depth=max_depth)
    # draw_graph_set([GA,GB],n_graphs_per_line=2, size=9, secondary_vertex_label='vec')

    # vectorize a second time, now with real-valued (non-discrete) features
    M = vertex_vectorize([GA, GB], complexity=complexity, discrete=False, normalization=False, inner_normalization=False)
    MA, MB = M[0], M[1]

    nnA = NearestNeighbors(n_neighbors=len(GA)).fit(MA)
    d, BprefA = nnA.kneighbors(MB)

    nnB = NearestNeighbors(n_neighbors=len(GB)).fit(MB)
    d, AprefB = nnB.kneighbors(MA)

    A = ['A%d' % (i + 1) for i in range(len(GA))]
    B = ['B%d' % (i + 1) for i in range(len(GB))]

    Arankings = dict(((A[i], j + 1), B[AprefB[i, j]]) for i, j in product(range(len(GA)), range(len(GA))))
    Brankings = dict(((B[i], j + 1), A[BprefA[i, j]]) for i, j in product(range(len(GB)), range(len(GB))))

    rankings = Arankings
    rankings.update(Brankings)
    pairings = stable(rankings, A, B)

    # remove dummy node pairings
    npairings = trim_pairings(pairings, GA_orig, GB_orig)
    orderA, orderB = zip(*sorted(npairings))
    return orderB
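
The seeding step above can be read in isolation: every vertex of A is ranked against every vertex of B by feature distance, and the tightest match seeds the BFS annotation. A minimal self-contained sketch of that cross nearest-neighbor query, with random vectors standing in for the EDeN per-vertex features:

import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
MA = rng.rand(5, 8)  # stand-in for the per-vertex features of graph A
MB = rng.rand(5, 8)  # stand-in for the per-vertex features of graph B

# for every vertex of A, rank all vertices of B by feature distance
nnB = NearestNeighbors(n_neighbors=len(MB)).fit(MB)
d, AprefB = nnB.kneighbors(MA)

# the A vertex whose best B match is closest seeds the BFS annotation
best_a = np.argsort(d[:, 0])[0]
best_b = AprefB[best_a][0]
print(best_a, best_b)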
Example #3
def vertex_vectorizer(self, exgraph):
    # TODO vertex vectorization should be part of the vectorizer class;
    # however, this abstraction also needs to be done in graphlearn.
    return eg.vertex_vectorize([exgraph],
                               d=self.eden_d,
                               r=self.eden_r,
                               normalization=False,
                               nbits=16,
                               inner_normalization=False)[0]
Example #4
def main(args):
    print("Load input data ...")
    records = loadRecords(args.input, order="sequence,structure,reactivity")
    data = {}
    for name in records.keys():
        data[name] = [
            records[name]["reactivity"], records[name]["sequence"],
            records[name]["structure"]
        ]
        reactivity = []
        for x in data[name][0]:
            if np.isnan(x):
                reactivity.append(None)
            else:
                reactivity.append(x)
        data[name][0] = reactivity
    print "Done ."
    print "Train SHAKER model ..."

    fperformance = open(args.performance,
                        "w") if args.performance != "-" else sys.stdout
    fperformance.write("\t".join([
        "name", "spearmanr", "p-value", "AUROC-observed-reactivity",
        "AUROC-predicted-reactivity", "RMSE"
    ]) + "\n")

    fout = open(args.reactivity, "w")

    for name in data.keys():
        print(name)
        keys = set(data.keys())
        keys.remove(name)
        # data[name][0] reactivity
        # data[name][1] sequence
        # data[name][2] structure
        model = sim.make_model(data, list(keys))
        graph = util.sequence_dotbracket_to_graph(data[name][1], data[name][2])
        embedding = eg.vertex_vectorize([graph])[0]
        reactivity_pred = model.predict(embedding).reshape(-1)
        fout.write(">" + name + "\n")
        fout.write(",".join(np.round(reactivity_pred, 3).astype(str)) + "\n")
        reactivity = np.array(data[name][0]).astype(float)
        structure = data[name][2]
        auc = AUC(structure, reactivity)
        auc_pred = AUC(structure, reactivity_pred)
        nan_mask = np.isnan(reactivity)
        reactivity = reactivity[~nan_mask]
        reactivity_pred = reactivity_pred[~nan_mask]
        corr, p = spearmanr(reactivity_pred, reactivity)
        rmse = RMSE(reactivity_pred, reactivity)
        fperformance.write("\t".join(
            [name, str(corr),
             str(p),
             str(auc),
             str(auc_pred),
             str(rmse)]) + "\n")
    fperformance.close()
    fout.close()
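
For reference, the per-sequence metrics in the loop reduce to a few lines. AUC and RMSE are project helpers not shown in the snippet, so the sketch below assumes RMSE is the usual root-mean-square error and skips unmeasured (NaN) positions the same way the loop does:

import numpy as np
from scipy.stats import spearmanr

observed = np.array([0.1, np.nan, 0.8, 0.3])
predicted = np.array([0.2, 0.5, 0.7, 0.4])

keep = ~np.isnan(observed)  # drop unmeasured positions
corr, p = spearmanr(predicted[keep], observed[keep])
rmse = np.sqrt(np.mean((predicted[keep] - observed[keep]) ** 2))
print(corr, p, rmse)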
Example #5
def predict(model, sequence, seq_to_db_function=rnasubopt):
    db_list = seq_to_db_function(sequence)

    if len(db_list) == 1:
        graph = eden_rna.sequence_dotbracket_to_graph(sequence, db_list[0])
        return model.predict(eg.vertex_vectorize([graph])[0])

    # get probability for each structure
    struct_proba = probabilities_of_structures(sequence, db_list)
    structures, weights = zip(*struct_proba)

    # edenize and predict reactivity
    graphs = map(lambda x: getgraph(sequence, x), structures)
    vecs = list(eg.vertex_vectorize(graphs, r=3, d=3))
    predictions_all_structures = [model.predict(blob) for blob in vecs]

    # mix reactivity with probabilities
    return weighted_average(weights, predictions_all_structures)
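
weighted_average is not shown in the snippet; assuming it computes a probability-weighted mean of the per-structure predictions, the mixing step amounts to:

import numpy as np

weights = [0.7, 0.3]  # structure probabilities
predictions = [np.array([0.1, 0.9, 0.4]),  # reactivity under structure 1
               np.array([0.3, 0.5, 0.2])]  # reactivity under structure 2

mixed = np.average(np.vstack(predictions), axis=0, weights=weights)
print(mixed)  # [0.16 0.78 0.34]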
Example #6
def vec_vertex(graph, param=None):
    n_bits = param if param is not None else 5
    X = vertex_vectorize([graph], complexity=2, nbits=n_bits)
    x = X[0].A  # densify the sparse per-vertex feature matrix

    values = [list(xx[1:]) for xx in x]
    return [values]
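
A toy call, assuming vertex_vectorize comes from eden.graph and that, as EDeN expects, every node and edge carries a 'label' attribute:

import networkx as nx
from eden.graph import vertex_vectorize  # assumed import for the snippet above

g = nx.path_graph(4)
for n in g.nodes():
    g.nodes[n]['label'] = 'C'
for u, v in g.edges():
    g.edges[u, v]['label'] = '-'

features = vec_vertex(g, param=8)
print(len(features[0]))  # one feature list per vertex -> 4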
Example #7
def getXY(data, keys):
    '''takes entries in data that are in the list keys; returns X, Y for the regression task'''
    # data is name -> (reactivity, sequence, dotbracket)
    # we first make some graphs
    react, sequence, stru = zip(*[data[k] for k in keys])
    graphs = map(getgraph, sequence, stru)

    # then we edenize
    x = vstack(eg.vertex_vectorize(graphs, r=3, d=3))
    y = [y for reactlist in react for y in reactlist]
    y = np.array(y)
    # drop positions without measured reactivity
    return mask(x, y)
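
mask is a project helper not shown here; it plausibly drops positions whose reactivity is missing, along the lines of this hypothetical sketch (x is assumed row-indexable, e.g. an ndarray or CSR matrix):

import numpy as np

def mask(x, y):
    # keep only rows whose target is a real measurement
    keep = np.array([v is not None and not np.isnan(v) for v in y])
    return x[keep], np.asarray(y, dtype=object)[keep].astype(float)

X, Y = mask(np.eye(3), [0.5, None, np.nan])
print(X.shape, Y)  # (1, 3) [0.5]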
Example #8
def vertex_vect_PCA(graphs, n_bits, pca_n_components, complexity=2):
    X = vertex_vectorize(graphs, complexity=complexity, nbits=n_bits)
    X = vstack(X)
    X = X.A  # densify: one row per vertex across all graphs

    # TruncatedSVD stands in for PCA (it skips mean-centering)
    pca = TruncatedSVD(n_components=pca_n_components)
    X_res = pca.fit_transform(X)

    # append the reduced components to each node's existing 'vec' attribute
    counter = 0
    for g in graphs:
        for node in g.nodes():
            g.nodes[node]['vec'].extend(X_res[counter])
            counter += 1

    return graphs
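
TruncatedSVD plays the role of PCA here because, unlike plain PCA, it does not require mean-centering and can also consume the sparse matrices that vertex_vectorize produces directly. A quick standalone check of the reduction step:

from scipy.sparse import random as sparse_random
from sklearn.decomposition import TruncatedSVD

X = sparse_random(20, 1 << 10, density=0.01, random_state=0, format='csr')
svd = TruncatedSVD(n_components=4, random_state=0)
X_res = svd.fit_transform(X)
print(X_res.shape)  # (20, 4): one compact vector per vertex row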
Example #9
# hypothetical reconstruction of the truncated parser setup at the top of
# this snippet (flags for args.input and args.output are not shown)
import argparse, pickle
parser = argparse.ArgumentParser()
parser.add_argument('-m', '--model',
                    help="Trained model for simulation",
                    default="data/reactivity/shaker-model.pkl")
args = parser.parse_args()

print("Load model ...")
with open(args.model, 'rb') as fmdl:
    model = pickle.load(fmdl)
print("Done .")
fout = open(args.output, "w")

with open(args.input) as fin:
    for line in fin:
        line = line.strip()
        if line.startswith(">"):
            name = line.replace(">", "")
            print("Processing {} ...".format(name))
            line = next(fin)
            sequence = line.strip()
            line = next(fin)
            dbn = line.split(" ")[0].strip()
            graph = util.sequence_dotbracket_to_graph(sequence, dbn)
            embedding = eg.vertex_vectorize([graph])[0]
            reactivity = model.predict(embedding).reshape(-1)
            data = [name] + list(reactivity.astype(str))
            fout.write("\t".join(data) + "\n")
        else:
            continue
fout.close()
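
The parsing loop implies an input layout (an assumption read off the code, not documented in the snippet): a '>' header line, the sequence on the next line, then a line whose first whitespace-separated field is the dot-bracket structure. A minimal file that would pass through it:

sample = """>hairpin
GGGAAACCC
(((...))) -3.2
"""
with open("sample_input.txt", "w") as f:
    f.write(sample)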