Ejemplo n.º 1
0
def load_image_data(pdb,load_all=False):
    """Build a labelled set of contact feature vectors for one protein.

    Loads the GREMLIN matrix, the sequence and the native contact map for
    ``pdb`` through ``utils``, passes the GREMLIN matrix through ``APC_L2``
    and splits the index pairs produced by ``generate_matrix_IDX`` into
    positives (``NATIVE[i, j]`` is truthy) and negatives. The negatives are
    shuffled with ``np.random.shuffle`` and, unless ``load_all`` is set,
    truncated to ``int(ratio_TP_to_TN * number_of_positives)`` entries.

    Reads the module-level names ``kernel_window`` and ``ratio_TP_to_TN``.
    Prints one status line: pdb, positive count, number of negatives
    requested, and the negative/positive ratio before truncation.

    Args:
        pdb: Identifier handed to the ``utils`` loaders.
        load_all: When true, keep every negative pair instead of a subset.

    Returns:
        Tuple ``(X, Y)`` of numpy arrays. ``X`` holds the feature vectors of
        the positives followed by those of the kept negatives; ``Y`` holds
        the matching labels (1 for positives, 0 for negatives).

    Raises:
        AssertionError: If the sequence length differs from the first
            dimension of the GREMLIN matrix or of the native contact map.
    """
    g = utils.load_GREMLIN_dataset(pdb)
    fasta, seq = utils.load_seq(pdb)
    NATIVE = utils.load_contact_map(pdb)

    # Sanity checks: all three inputs must describe the same chain length
    assert(len(seq) == g.shape[0])
    assert(len(seq) == NATIVE.shape[0])

    # APC corrections and whiten
    g = APC_L2(g)

    N = g.shape[0]
    IDX = generate_matrix_IDX(N, kernel_window)

    # Partition the candidate pairs by their native-contact label
    idx_true_pos = set()
    idx_true_neg = set()
    for i, j in IDX:
        if NATIVE[i, j]:
            idx_true_pos.add((i, j))
        else:
            idx_true_neg.add((i, j))

    # Shuffle the negatives so that a truncated subset is a random sample
    idx_true_neg = list(idx_true_neg)
    np.random.shuffle(idx_true_neg)

    n_pos = len(idx_true_pos)
    n_neg = len(idx_true_neg)

    # If we are only loading a subset of TN, truncate here
    if load_all:
        FP_choosen = n_neg
    else:
        FP_choosen = int(ratio_TP_to_TN * n_pos)

    # The ratio is only reported, never used; a protein without any positive
    # pair must not abort the load with a ZeroDivisionError.
    ratio = float(n_neg) / n_pos if n_pos else float("nan")

    status_str = "{} {:5d} {:5d} {:0.4f}"
    # Single-argument call form prints identically on Python 2 and 3
    print(status_str.format(pdb, n_pos, FP_choosen, ratio))

    idx_true_neg = idx_true_neg[:FP_choosen]

    X0 = generate_feature_vectors(g, seq, idx_true_pos, kernel_window)
    Y0 = [1] * len(X0)

    X1 = generate_feature_vectors(g, seq, idx_true_neg, kernel_window)
    Y1 = [0] * len(X1)

    # Concatenate the two samples (positives first) and make them numpy arrays
    X = np.array(X0 + X1)
    Y = np.array(Y0 + Y1)

    return X, Y
Ejemplo n.º 2
0
def compute_predictions(pdb, clf):
    """Return a matrix of classifier scores for the candidate pairs of ``pdb``.

    The GREMLIN matrix for ``pdb`` is loaded, passed through ``APC_L2`` and
    turned into feature vectors for every index pair produced by
    ``generate_matrix_IDX`` (using the module-level ``kernel_window``).
    Column 1 of ``clf.predict_proba`` is then written into a zero matrix of
    the same shape as the GREMLIN matrix, at both ``[i, j]`` and ``[j, i]``.

    Args:
        pdb: Identifier handed to the ``utils`` loaders.
        clf: Fitted classifier exposing ``predict_proba``.

    Returns:
        numpy array shaped like the GREMLIN matrix; entries for pairs not in
        the index list stay at zero.

    Raises:
        AssertionError: If the sequence length differs from the first
            dimension of the GREMLIN matrix.
    """
    # Load the GREMLIN data and the matching sequence
    gremlin = utils.load_GREMLIN_dataset(pdb)
    fasta, seq = utils.load_seq(pdb)

    # Sanity check
    assert len(seq) == gremlin.shape[0]

    # APC corrections and whiten
    gremlin = APC_L2(gremlin)
    n_residues = gremlin.shape[0]

    pairs = generate_matrix_IDX(n_residues, kernel_window)

    # Score every candidate pair; column 1 is presumably the probability of
    # the positive (contact) class -- depends on how clf was trained.
    features = generate_feature_vectors(gremlin, seq, pairs, kernel_window)
    scores = clf.predict_proba(features)[:, 1]

    # Scatter the scores into a matrix, mirroring each one across the diagonal
    predicted = np.zeros(gremlin.shape)
    for pair, score in zip(pairs, scores):
        row, col = pair
        predicted[row, col] = score
        predicted[col, row] = score

    return predicted
Ejemplo n.º 3
0
# Every GREMLIN result file under G2/ names one model to evaluate.
known_models = sorted(glob.glob("G2/*.gremlin"))
# The model identifier is the file name up to its first dot.
PDB = [os.path.basename(f).split('.')[0] for f in known_models]
kernel_window = 2

# Single-argument call form prints identically on Python 2 and 3
print("Found {} known models to predict.".format(len(PDB)))

# Create the output directory without shelling out to `mkdir -p`, which is
# not portable and whose failure os.system() would silently ignore.
if not os.path.isdir('predictions'):
    os.makedirs('predictions')

def score_ordering(IDX, A):
    """Return the index pairs of ``IDX`` sorted by descending score in ``A``.

    Args:
        IDX: Sequence of index tuples; each one is used directly as an
            index into ``A`` (``A[idx]``).
        A: Array of scores indexable by the entries of ``IDX``.

    Returns:
        numpy array with one row per entry of ``IDX``, reordered so that the
        entry with the largest ``A[idx]`` comes first.
    """
    scores = np.array([A[idx] for idx in IDX])
    # argsort is ascending; reverse it to rank the highest score first
    order = np.argsort(scores)[::-1]
    # Build a real list: on Python 3 ``map`` returns an iterator, which
    # np.array would wrap as a 0-d object array that cannot be reordered.
    pairs = np.array([list(idx) for idx in IDX])
    return pairs[order]

for pdb in PDB:
    G1   = utils.load_GREMLIN_dataset(pdb)
    G2  = utils.load_improved_GREMLIN_dataset(pdb)
    NATIVE = utils.load_contact_map(pdb)

    N = NATIVE.shape[0]
    IDX = utils.generate_matrix_IDX(N,kernel_window)

    sidx1 = score_ordering(IDX, G1)
    sidx2 = score_ordering(IDX, G2)

    args = {"model" : "GREMLIN", "pdb": pdb}
    f_save = "predictions/{pdb}_{model}_{L:0.2f}.txt"

    for L in L_SET:
        args["L"] = L
        cut_idx = int(N*args["L"])