def compute_cmap(f_rank):
    """Build contact maps over a sweep of cut values for one ranked file.

    The directory part of ``f_rank`` selects the score loader (``"G2"`` or
    ``"APC"``) and the basename up to its first ``.`` is used as the pdb
    identifier handed to the loaders.

    Returns a tuple ``(cuts, cmaps, native_matrix)``:

    * ``cuts`` -- the values of ``np.linspace(1, 5*N, cut_iterations)``,
      where ``N`` is the first dimension of the loaded score matrix;
    * ``cmaps`` -- dict keyed by ``cut / N`` holding whatever
      ``build_cmap_from_index`` returns for that cut;
    * ``native_matrix`` -- the result of ``utils.load_contact_map``.

    Raises ``KeyError`` if the directory part of ``f_rank`` is neither
    ``"G2"`` nor ``"APC"``.
    """
    model_name, file_name = os.path.split(f_rank)
    pdb = file_name.partition('.')[0]

    # Dispatch table: model directory name -> score loader.
    loaders = {
        "G2": utils.load_improved_GREMLIN_dataset,
        "APC": utils.load_GREMLIN_dataset,
    }
    load_scores = loaders[model_name]
    scores = load_scores(pdb)
    native_matrix = utils.load_contact_map(pdb)

    size = scores.shape[0]
    pairs = list(generate_matrix_IDX(size, kernel_window))

    # Flatten both matrices along the same index ordering.
    native_flat = np.array([native_matrix[pair] for pair in pairs])
    score_flat = np.array([scores[pair] for pair in pairs])

    def cmap_at(cut):
        # Select entries for this cut, then rebuild a map from them.
        picked = fixedL_cut(score_flat, native_flat, cut, index_only=True)
        return build_cmap_from_index(size, pairs, picked)

    cuts = np.linspace(1, 5 * size, cut_iterations)
    cmaps = {cut / size: cmap_at(cut) for cut in cuts}
    return cuts, cmaps, native_matrix
def load_image_data(pdb, load_all=False):
    """Assemble a labelled feature matrix of contact / non-contact pairs.

    Loads the GREMLIN scores, the sequence and the native contact map for
    ``pdb``, applies ``APC_L2`` to the scores, and splits the index pairs
    produced by ``generate_matrix_IDX`` into those where the native map is
    truthy (positives) and the rest (negatives).

    Parameters:
        pdb: identifier passed unchanged to the ``utils`` loaders.
        load_all: when true, keep every negative pair; otherwise keep at
            most ``int(ratio_TP_to_TN * number_of_positives)`` negatives,
            taken from the front of a shuffled list.

    Returns:
        ``(X, Y)`` as NumPy arrays: feature vectors of the positives
        followed by those of the kept negatives, and the matching labels
        (``1`` for positives, ``0`` for negatives).

    Side effects:
        Prints one status line (pdb, positives, negatives kept,
        negatives/positives ratio) and shuffles with ``np.random``, so the
        negative subset depends on the global NumPy random state.

    Raises:
        AssertionError: if the sequence length differs from the first
            dimension of the score matrix or of the native contact map.
    """
    g = utils.load_GREMLIN_dataset(pdb)
    _, seq = utils.load_seq(pdb)
    NATIVE = utils.load_contact_map(pdb)

    # Sanity checks: all three inputs must describe the same length.
    assert len(seq) == g.shape[0]
    assert len(seq) == NATIVE.shape[0]

    # APC corrections and whiten (original author's note on APC_L2).
    g = APC_L2(g)
    N = g.shape[0]

    # Partition the candidate pairs by the native contact map.
    idx_true_pos = set()
    idx_true_neg = set()
    for i, j in generate_matrix_IDX(N, kernel_window):
        if NATIVE[i, j]:
            idx_true_pos.add((i, j))
        else:
            idx_true_neg.add((i, j))

    # Shuffle the negatives so that truncation below takes a random subset.
    idx_true_neg = list(idx_true_neg)
    np.random.shuffle(idx_true_neg)

    n_pos = len(idx_true_pos)
    n_neg = len(idx_true_neg)

    # If we are only loading a subset of the negatives, truncate here.
    # Clamp to n_neg so the reported count matches what the slice keeps.
    if load_all:
        FP_choosen = n_neg
    else:
        FP_choosen = min(n_neg, int(ratio_TP_to_TN * n_pos))

    # With no positives the ratio is undefined; report NaN rather than
    # dying with ZeroDivisionError.
    ratio = float(n_neg) / n_pos if n_pos else float("nan")

    status_str = "{} {:5d} {:5d} {:0.4f}"
    # Single-argument call form prints identically on Python 2 and 3.
    print(status_str.format(pdb, n_pos, FP_choosen, ratio))

    idx_true_neg = idx_true_neg[:FP_choosen]

    X0 = generate_feature_vectors(g, seq, idx_true_pos, kernel_window)
    Y0 = [1] * len(X0)

    X1 = generate_feature_vectors(g, seq, idx_true_neg, kernel_window)
    Y1 = [0] * len(X1)

    # Concatenate the two samples and make them NumPy arrays.
    # NOTE(review): `X0 + X1` assumes generate_feature_vectors returns
    # lists -- confirm against its definition.
    X = np.array(X0 + X1)
    Y = np.array(Y0 + Y1)
    return X, Y
def compute_predictions(pdb, clf):
    """Return a symmetric matrix of classifier scores for ``pdb``.

    Loads the GREMLIN scores and sequence for ``pdb``, applies ``APC_L2``,
    builds one feature vector per index pair from ``generate_matrix_IDX``
    and writes column 1 of ``clf.predict_proba`` into both ``[i, j]`` and
    ``[j, i]`` of a zero matrix shaped like the score matrix.

    Parameters:
        pdb: identifier passed unchanged to the ``utils`` loaders.
        clf: object exposing ``predict_proba(X)`` whose result supports
            ``[:, 1]`` indexing.

    Returns:
        NumPy array with the shape of the score matrix; entries not covered
        by an index pair remain ``0``.

    Raises:
        AssertionError: if the sequence length differs from the first
            dimension of the score matrix.
    """
    # Load the GREMLIN data.
    g = utils.load_GREMLIN_dataset(pdb)
    _, seq = utils.load_seq(pdb)

    # Sanity check: sequence and score matrix must agree in length.
    assert len(seq) == g.shape[0]

    # APC corrections and whiten (original author's note on APC_L2).
    g = APC_L2(g)
    N = g.shape[0]

    # The pairs are traversed twice below (feature generation, then the
    # fill loop). Materialise them so a one-shot iterator is not exhausted
    # by the first pass, which would silently leave g2 all zeros.
    IDX = list(generate_matrix_IDX(N, kernel_window))

    X = generate_feature_vectors(g, seq, IDX, kernel_window)
    Yp = clf.predict_proba(X)[:, 1]

    # Mirror each prediction across the diagonal.
    g2 = np.zeros(g.shape)
    for (i, j), y in zip(IDX, Yp):
        g2[i, j] = g2[j, i] = y

    return g2
# Single-argument call form prints identically on Python 2 and 3.
print("Found {} known models to predict.".format(len(PDB)))

# Create the output directory without shelling out.
if not os.path.isdir('predictions'):
    os.makedirs('predictions')


def score_ordering(IDX, A):
    """Return the entries of ``IDX`` sorted by descending ``A[idx]``.

    Parameters:
        IDX: iterable of index objects usable as ``A[idx]``.
        A: indexable container of scores.

    Returns:
        NumPy array whose rows are ``list(idx)`` for each ``idx`` in
        ``IDX``, ordered from highest to lowest ``A[idx]``.
    """
    # Materialise once: the indices are traversed twice, which would leave
    # nothing for the second pass if IDX were a one-shot iterator.
    pairs = list(IDX)
    scores = np.array([A[idx] for idx in pairs])
    order = np.argsort(scores)[::-1]
    # Explicit list instead of map(): on Python 3 map() returns an
    # iterator, which np.array would wrap as a 0-d object array.
    return np.array([list(idx) for idx in pairs])[order]


for pdb in PDB:

    G1 = utils.load_GREMLIN_dataset(pdb)
    G2 = utils.load_improved_GREMLIN_dataset(pdb)
    NATIVE = utils.load_contact_map(pdb)
    N = NATIVE.shape[0]

    # Materialised because both score_ordering calls below consume it.
    IDX = list(utils.generate_matrix_IDX(N, kernel_window))

    sidx1 = score_ordering(IDX, G1)
    sidx2 = score_ordering(IDX, G2)

    args = {"model" : "GREMLIN", "pdb": pdb}
    # NOTE(review): f_save and sidx2 are not used in the visible part of
    # this loop -- presumably consumed further down; verify.
    f_save = "predictions/{pdb}_{model}_{L:0.2f}.txt"

    for L in L_SET:
        args["L"] = L
        # Keep the top int(N * L) ranked pairs.
        cut_idx = int(N*args["L"])
        contacts = sidx1[:cut_idx]

        # Append the transposed (j, i) pairs so both orientations of each
        # selected pair are present.
        upper_diag_contacts = np.array([contacts[:,1],contacts[:,0]]).T
        contacts = np.vstack([contacts, upper_diag_contacts])