def getNOEData(noe_files, ss_seq):
    """
    Build a symmetric residue-by-residue NOE contact matrix from an NOE file.

    Every non-blank line must split into exactly five whitespace-separated
    fields: res1, atom1, res2, atom2, noe. Only the two residue numbers are
    used; they index the matrix directly, which is why the matrix has one
    more row/column than len(ss_seq).

    :param noe_files: passed unchanged to io.readFile (presumably a path
        to a single NOE file that is returned as a list of lines - confirm
        against io.readFile)
    :param ss_seq: sequence whose length fixes the matrix order
    :return: (len(ss_seq) + 1) x (len(ss_seq) + 1) array holding 1.0 at
        [res1, res2] and [res2, res1] for every parsed line, 0.0 elsewhere
    :raises ValueError: if a non-blank line does not have five fields or a
        residue number is not an integer
    """
    # Single-argument print(...) emits the same text under Python 2 and 3.
    print(noe_files)
    noe_lines = io.readFile(noe_files)

    order = len(ss_seq) + 1
    noe_matrix = zeros((order, order))

    for noel in noe_lines:
        # Lines of at most one character were always skipped. Whitespace-only
        # lines (e.g. "\r\n" or trailing blanks) are skipped as well, since
        # they would otherwise break the five-way unpack below.
        if len(noel) <= 1 or not noel.strip():
            continue
        res1, _atm1, res2, _atm2, _noe = noel.split()
        first, second = int(res1), int(res2)
        noe_matrix[first, second] = 1.
        noe_matrix[second, first] = 1.
    return noe_matrix
def FormatRdc(seqlen, rdcfile):
    """
    Parse RDC data from a file (the original note calls it a .npc file).

    Every non-blank line must split into six whitespace-separated fields,
    for example
        ['179', 'H', '179', 'N', '16.042', '0.0']
    i.e. res1, atom1, res2, atom2, rdc, tolerance. The tolerance column is
    read but not stored.

    :param seqlen: not used by this function; kept so existing callers work
    :param rdcfile: passed unchanged to io_util.readFile
    :return: dict mapping int(res1) to a list of
        [int(res1), atom1, int(res2), atom2, float(rdc)] entries, in file
        order
    :raises ValueError: if a non-blank line does not have six fields or a
        numeric field cannot be converted
    """
    import io_util as io

    rdcs = {}
    for record in io.readFile(rdcfile):
        fields = record.split()
        if not fields:
            # Blank / whitespace-only lines (typically a trailing newline)
            # carry no restraint; skip them instead of failing the unpack.
            continue
        r1, v1, r2, v2, rdc, _tol = fields
        rdcs.setdefault(int(r1), []).append(
            [int(r1), v1, int(r2), v2, float(rdc)])
    return rdcs
def parseSS(filename):
    """
    Read the amino acid sequence and the secondary structure sequence from
    a tab-separated secondary structure assignment file.

    The first record is skipped. From each following record the second
    tab-separated column is appended to the amino acid sequence and the
    third column to the secondary structure sequence.

    :param filename: passed unchanged to io.readFile
    :return: aa_seq, ss_seq
    """
    # columns: up_index, up_residue, ss_pred ss_conf msa_index, msa_cons%, msa_cons, in_construct
    # example: ['232', 'I', 'H', '3', '232', '2', '~', '*\n']
    records = io.readFile(filename)

    residues = []
    states = []
    for record in records[1:]:
        columns = record.split('\t')
        residues.append(columns[1])
        states.append(columns[2])

    return ''.join(residues), ''.join(states)
def parseContacts(filename, ss_combi, ss_def, nor, cutoff_score):
    """
    Parse the ev_couplings generated using plm method into contact data
    arrays.

    The input is a list of all-by-all residue pairings and the score
    computed by the chosen method. MI_DI column headers:
        - 1stResidueNum
        - 1stResidueCode
        - 2ndResidueNum
        - 2ndResidueCode
        - mutual information score
        - DI score
    PLM columns are the same, replacing DI score with PLM score, and
    omitting MI scores (always 0).

    A pairing is kept when its sixth column, rounded to two decimals, is
    strictly greater than cutoff_score. The value stored for a kept pairing
    is the fifth column, not the sixth; the original note marks this as a
    modification for the Cell paper dataset only.

    Progress lines are printed to stdout: each consecutive ss_def pair, and
    each (j, k) contact found between them.

    :param filename: coupling file, passed unchanged to io.readFile
    :param ss_combi: mapping whose values are sequences of SSE definitions;
        items 4 and 5 of a definition are used as inclusive first / last
        residue numbers. Every pair of distinct keys is compared.
    :param ss_def: sequence of SSE definitions; items 3 and 4 of a
        definition are used as inclusive first / last residue numbers. Only
        consecutive definitions are compared.
    :param nor: all matrices are (nor + 1) x (nor + 1) so residue numbers
        can be used as indices unchanged (presumably the number of
        residues - confirm with the caller)
    :param cutoff_score: threshold applied to the rounded sixth column
    :return: contact_matrix (1.0 where a kept pairing links two SSEs from
        different ss_combi keys), plm_score_matrix (stored value at those
        same cells), contact_ss_matrix (1.0 where a kept pairing links
        consecutive ss_def entries); all three are filled symmetrically
    :raises ValueError: if a non-blank line does not have six fields or a
        numeric field cannot be converted
    """
    from collections import defaultdict
    from itertools import combinations
    from operator import itemgetter

    plm_contacts = defaultdict(list)
    for line in io.readFile(filename):
        fields = line.split()
        if not fields:
            # Blank / whitespace-only lines carry no pairing; skip them
            # instead of failing the six-way unpack.
            continue
        r1, _a1, r2, _a2, pl, score = fields
        if round(float(score), 2) > cutoff_score:
            plm_contacts[int(r1)].append([int(r2), float(pl)])

    # Strongest partner first for every residue (stable sort, as before).
    for partners in plm_contacts.values():
        partners.sort(key=itemgetter(1), reverse=True)

    shape = (nor + 1, nor + 1)  # +1 keeps residue numbering as it is
    contact_matrix = zeros(shape)
    plm_score_matrix = zeros(shape)
    contact_ss_matrix = zeros(shape)

    for key1, key2 in combinations(ss_combi, 2):
        for sse1 in ss_combi[key1]:
            for sse2 in ss_combi[key2]:
                first, last = sse2[4], sse2[5]
                for k in range(sse1[4], sse1[5] + 1):
                    # .get() avoids inserting an empty list into the
                    # defaultdict for every residue that is only probed.
                    # Testing each partner against the second SSE's range
                    # replaces a rescan of all partners per candidate
                    # residue; writes to any one cell still happen in
                    # partner order, so the final values are unchanged.
                    for partner, value in plm_contacts.get(k, ()):
                        if first <= partner <= last:
                            contact_matrix[k, partner] = 1.0
                            contact_matrix[partner, k] = 1.0
                            plm_score_matrix[k, partner] = value
                            plm_score_matrix[partner, k] = value

    for current, following in zip(ss_def, ss_def[1:]):
        # Formatted single-argument print(...) yields the same text under
        # Python 2 and 3 as the former two-item print statement.
        print("%s %s" % (current, following))
        for j in range(current[3], current[4] + 1):
            partners = plm_contacts.get(j, ())
            # Loop order is kept here so the (j, k) lines are printed in
            # the same order as before.
            for k in range(following[3], following[4] + 1):
                for entry in partners:
                    if entry[0] == k:
                        print("%s %s" % (j, k))
                        contact_ss_matrix[j, k] = 1.0
                        contact_ss_matrix[k, j] = 1.0

    return contact_matrix, plm_score_matrix, contact_ss_matrix