def compute_cmap(f_rank):
    """Build contact maps for one ranked-model file at a series of cuts.

    The file name of ``f_rank`` (up to its first ``.``) is used as the PDB
    identifier, and the name of the directory that directly contains the
    file selects the score loader:

    * ``"G2"``  -> ``utils.load_improved_GREMLIN_dataset``
    * ``"APC"`` -> ``utils.load_GREMLIN_dataset``

    Reads the module-level settings ``kernel_window`` and
    ``cut_iterations``.

    Parameters
    ----------
    f_rank : str
        Path of the form ``<model>/<pdb>.<ext>``.  Any directories in
        front of ``<model>`` are ignored.

    Returns
    -------
    tuple
        ``(CUT_IDX, data, NATIVE_MATRIX)`` where ``CUT_IDX`` is
        ``np.linspace(1, 5*N, cut_iterations)`` with ``N = G.shape[0]``,
        ``data`` maps ``cut_idx / N`` to the value returned by
        ``build_cmap_from_index`` for that cut, and ``NATIVE_MATRIX`` is
        the value returned by ``utils.load_contact_map(pdb)``.

    Raises
    ------
    KeyError
        If the containing directory is not one of the supported models.
    """
    loaders = {
        "G2": utils.load_improved_GREMLIN_dataset,
        "APC": utils.load_GREMLIN_dataset,
    }

    pdb = os.path.basename(f_rank).split('.')[0]

    # Only the innermost directory names the model, so both "G2/x.txt" and
    # "some/prefix/G2/x.txt" resolve to "G2".
    model_name = os.path.basename(os.path.dirname(f_rank))
    if model_name not in loaders:
        raise KeyError(
            "Unknown model {!r} for {!r}; expected one of {}".format(
                model_name, f_rank, sorted(loaders)))

    G = loaders[model_name](pdb)
    NATIVE_MATRIX = utils.load_contact_map(pdb)

    N = G.shape[0]
    # Materialized because the index list is traversed several times below.
    IDX = list(generate_matrix_IDX(N, kernel_window))

    # Pull the native-map and score values at every candidate index.
    native = np.array([NATIVE_MATRIX[idx] for idx in IDX])
    g = np.array([G[idx] for idx in IDX])

    data = {}
    CUT_IDX = np.linspace(1, 5*N, cut_iterations)

    for cut_idx in CUT_IDX:
        contacts = fixedL_cut(g, native, cut_idx, index_only=True)
        # Keyed by the cut expressed as a multiple of N.
        data[cut_idx/N] = build_cmap_from_index(N, IDX, contacts)

    return CUT_IDX, data, NATIVE_MATRIX
# Example #2
# 0
def load_image_data(pdb,load_all=False):
    """Assemble labelled feature vectors for one PDB entry.

    Loads the GREMLIN scores, the sequence and the native contact map for
    ``pdb``, splits the index pairs produced by ``generate_matrix_IDX``
    into those where ``NATIVE[i,j]`` is truthy (label 1) and the rest
    (label 0), optionally subsamples the label-0 pairs, and builds one
    feature vector per retained pair with ``generate_feature_vectors``.

    Reads the module-level settings ``kernel_window`` and
    ``ratio_TP_to_TN``.  Prints one status line per call:
    pdb, number of label-1 pairs, number of label-0 pairs kept, and the
    label-0 / label-1 ratio before truncation.

    Parameters
    ----------
    pdb : str
        Identifier handed to the ``utils`` loaders.
    load_all : bool, optional
        If true keep every label-0 pair; otherwise keep
        ``int(ratio_TP_to_TN * <number of label-1 pairs>)`` of them,
        chosen by a ``np.random.shuffle``.

    Returns
    -------
    tuple
        ``(X, Y)`` numpy arrays: the feature vectors for the label-1 pairs
        followed by those for the retained label-0 pairs, and the matching
        labels.
    """
    g   = utils.load_GREMLIN_dataset(pdb)
    fasta,seq = utils.load_seq(pdb)
    NATIVE = utils.load_contact_map(pdb)

    # Sanity checks: all three inputs must describe the same length.
    assert len(seq) == g.shape[0], "sequence / GREMLIN size mismatch"
    assert len(seq) == NATIVE.shape[0], "sequence / contact map size mismatch"

    # Apply APC_L2 to the raw scores (original note: "APC corrections and
    # whiten").
    g = APC_L2(g)

    N = g.shape[0]
    IDX = generate_matrix_IDX(N,kernel_window)

    idx_true_pos = set()
    idx_true_neg = set()

    # Partition the candidate pairs by the native contact map.
    for i,j in IDX:
        if NATIVE[i,j]:
            idx_true_pos.add((i,j))
        else:
            idx_true_neg.add((i,j))

    # Shuffle the negatives so the truncation below takes a random subset.
    idx_true_neg = list(idx_true_neg)
    np.random.shuffle(idx_true_neg)

    n_pos = len(idx_true_pos)

    # If we are only loading a subset of the negatives, size it here.
    if load_all:
        n_neg_kept = len(idx_true_neg)
    else:
        n_neg_kept = int(ratio_TP_to_TN*n_pos)

    # The ratio is reported only; a map without any native contact must
    # not abort the load with a ZeroDivisionError.
    if n_pos:
        ratio = float(len(idx_true_neg)) / n_pos
    else:
        ratio = float('nan')

    status_str = "{} {:5d} {:5d} {:0.4f}"
    # Single-argument call form prints identically on Python 2 and 3.
    print(status_str.format(pdb, n_pos, n_neg_kept, ratio))

    idx_true_neg = idx_true_neg[:n_neg_kept]

    X0 = generate_feature_vectors(g,seq,idx_true_pos,kernel_window)
    Y0 = [1,]*len(X0)

    X1 = generate_feature_vectors(g,seq,idx_true_neg,kernel_window)
    Y1 = [0,]*len(X1)

    # Concatenate the two samples and make them numpy arrays.
    X = np.array(X0+X1)
    Y = np.array(Y0+Y1)

    return X,Y
# Example #3
# 0
# PDB identifiers: file name of every known model, up to its first '.'.
PDB = [os.path.basename(f).split('.')[0] for f in known_models]
# Window setting passed to the index-generation helper further down.
kernel_window = 2

# Single-argument call form prints identically on Python 2 and 3.
print("Found {} known models to predict.".format(len(PDB)))

# Create the output directory directly instead of shelling out to
# `mkdir -p`, so this also works where no POSIX shell is available.
if not os.path.isdir('predictions'):
    os.makedirs('predictions')

def score_ordering(IDX, A):
    """Return the entries of ``IDX`` ordered by descending value in ``A``.

    Parameters
    ----------
    IDX : iterable
        Index keys (e.g. ``(i, j)`` tuples), each valid as ``A[idx]``.
        Any iterable is accepted, including a generator, which is
        consumed by this call.
    A : indexable
        Object giving a score for every key in ``IDX``.

    Returns
    -------
    numpy.ndarray
        One row per key (each key converted to a list), highest score
        first.
    """
    # IDX is read twice below; materialize it so a one-shot iterator is
    # not already exhausted on the second pass.
    IDX = list(IDX)

    scores = np.array([A[idx] for idx in IDX])
    order = np.argsort(scores)[::-1]

    # A list comprehension rather than map(): np.array() needs a real
    # sequence, which map() only returns on Python 2.
    return np.array([list(idx) for idx in IDX])[order]

for pdb in PDB:
    G1   = utils.load_GREMLIN_dataset(pdb)
    G2  = utils.load_improved_GREMLIN_dataset(pdb)
    NATIVE = utils.load_contact_map(pdb)

    N = NATIVE.shape[0]
    IDX = utils.generate_matrix_IDX(N,kernel_window)

    sidx1 = score_ordering(IDX, G1)
    sidx2 = score_ordering(IDX, G2)

    args = {"model" : "GREMLIN", "pdb": pdb}
    f_save = "predictions/{pdb}_{model}_{L:0.2f}.txt"

    for L in L_SET:
        args["L"] = L
        cut_idx = int(N*args["L"])
        contacts = sidx1[:cut_idx]