def get_design_data():
    """
    Build the feature data frame for all designed coh-doc pairs.

    Goes over all designed coh-doc pairs and returns a DF similar to the one
    used to create the decision tree. The cohesin paired with a dockerin is
    looked up in the cohesin FASTA by the dockerin's name.

    :return: data frame with the module-level ``columns``, one row per designed
        dockerin, labelled from 1. Each row holds the two names, two
        placeholder zeros, the encoded interface features and a trailing None.
    """
    dsn_cohs = read_multi_fastas(design_data_root + 'all_designed_cohs.fasta', suffix_to_remove='.')
    dsn_docs = read_multi_fastas(design_data_root + 'all_designed_docs.fasta', suffix_to_remove='.')
    doc_seqs = list(dsn_docs.values())
    # Rows are written per dockerin with labels 1..n, so the index has to cover
    # exactly that range (it used to be sized by the cohesins and stop one short).
    df_ = pd.DataFrame(columns=columns, index=range(1, len(doc_seqs) + 1))
    interface_positions = parse_interface_positions()
    for row_label, doc_seq in enumerate(doc_seqs, start=1):
        coh_seq = dsn_cohs[doc_seq.name]
        # the dockerin model name is the token that follows 'A_' in the design name
        doc_model = doc_seq.name.split('A_')[1].split('_')[0]
        coh_identities = {typ: coh_seq[pos] for typ, pos in interface_positions['coh']['1ohz'].items()}
        doc_identities = {typ: doc_seq[pos] for typ, pos in interface_positions['doc'][doc_model].items()}

        # every encoder returns an iterable of values per position; flatten them in order
        coh_core_list = [a for v in ordered_positions['coh'] if 'core' in v
                         for a in core_res_to_identity(coh_identities[v], 'coh')]
        coh_rim_list = [a for v in ordered_positions['coh'] if 'rim' in v
                        for a in rim_res_to_type_binary(coh_identities[v])]
        doc_core_list = [a for v in ordered_positions['doc'] if 'core' in v
                         for a in core_res_to_identity(doc_identities[v], 'doc')]
        doc_rim_list = [a for v in ordered_positions['doc'] if 'rim' in v
                        for a in rim_res_to_type_binary(doc_identities[v])]

        df_.loc[row_label] = [coh_seq.name, doc_seq.name, 0, 0] + coh_core_list + coh_rim_list + \
            doc_core_list + doc_rim_list + [None]
    return df_
def parse_binding_data() -> pd.DataFrame:
    """
    Collect the binding observations into one table.

    Rows come, in this order, from ``binding_data()``, from
    ``parse_vered_binding()`` (a pair counts as binding when its value equals 1)
    and from a fixed list of names that are paired with themselves and marked
    as binders.

    :return: data frame 'coh_name', 'doc_name', 'coh_seq', 'doc_seq', 'binders' for Rachel's data
    """
    from _binding_data import binding_data

    rachel_root = '/home/labs/fleishman/jonathaw/decision_tree/'
    cohs = read_multi_fastas(rachel_root + 'cohesins_from_rachel_and_vered.fasta_aln',
                             suffix_to_remove='/', lower=True)
    docs = read_multi_fastas(rachel_root + 'dockerins_from_rachel_and_vered.fasta_aln',
                             suffix_to_remove='/', lower=True)
    rachel_bind = binding_data()
    vered_bind = parse_vered_binding()
    self_binding_names = ['1ohz', '2b59', '2ozn', '2vn5', '2y3n', '3ul4', '4fl4', '4fl5', '4dh2', '4uyp', '5new']

    result = pd.DataFrame(columns=['coh_name', 'doc_name', 'coh_seq', 'doc_seq', 'binders'])
    n_rows = 0  # rows are labelled 1, 2, 3, ... in insertion order

    for coh, partners in rachel_bind.items():
        for doc, binds in partners.items():
            n_rows += 1
            result.loc[n_rows] = [coh, doc, cohs[coh], docs[doc], binds]

    for coh, partners in vered_bind.items():
        for doc, value in partners.items():
            n_rows += 1
            result.loc[n_rows] = [coh, doc, cohs[coh], docs[doc], value == 1]

    for name in self_binding_names:
        n_rows += 1
        result.loc[n_rows] = [name, name, cohs[name], docs[name], True]

    print('there are %i rows in the data' % n_rows)
    return result
# analyse_cliques(cliques): prints pairwise comparisons between the members of each
# clique (coh switch, doc switch, symmetric doc switch and doc backbone cluster),
# prints the cohs/docs that appear in the analysed cliques, and finally shows one
# prediction heat map per clique via show_prediction_heat_map.
#
# - Cliques are bucketed by length into clqs_by_len, whose keys are fixed to 1..7,
#   so a clique of any other length raises KeyError at the append.
# - The analysis loop walks lengths 10, 9, 8, 7 and skips lengths that are not keys,
#   so only the length-7 bucket is analysed; the heat-map loop indexes
#   clqs_by_len with max(keys), which is likewise always 7.
# - Each clique member is indexed as mem[0] (key into coh_seqs) and mem[1] (key into doc_seqs).
# - cliques_by_charges is assigned but not used afterwards in this function.
#
# NOTE(review): a bare `N` token follows the 'coh diff' print; it looks like a stray
# character -- confirm and remove.
# NOTE(review): the indentation of this block has been lost, so the nesting of the
# print statements cannot be read off the text -- restore it from version control
# before changing the logic.
def analyse_cliques(cliques): """ :param cliques: list of cliques :return: prints an anlysis """ coh_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/reclique_18Nov/stabilisation/all_stabilised/all_j_st_cohs.fasta', '_st.A') doc_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/reclique_18Nov/stabilisation/all_stabilised/all_j_st_docs.fasta', '_st.B') cliques_by_charges = parse_cliques_lists('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/reclique_18Nov/stabilisation/all_stabilised/cliques_2_1.txt', remove='_st') original_names = parse_name_translation('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/clique_6_pdbs/mini_diagonal_11Nov/minidiagonal_pdbs/translate_names.txt') clqs_by_len = {k: [] for k in range(1, 8)} for clq in cliques: clqs_by_len[len(clq)].append(clq) designs_in_all_clqs = [] for length in range(10, 6, -1): if length not in clqs_by_len.keys(): continue for clq in clqs_by_len[length]: coh_diffs, doc_diffs, doc_diffs_symm, doc_bb_diffs = [], [], [], [] print('\n\n\nfor clq', clq) for mem1 in clq: designs_in_all_clqs.append(mem1) wt_doc = original_names[mem1[1]+'.pdb.gz'][10:14] res1 = Result(mem1, coh_seqs[mem1[0]], doc_seqs[mem1[1]], 1, j=True, doc_wt=wt_doc) res1_doc_symm = switch_symm_changer(res1.doc_switch) for mem2 in clq: if mem1 != mem2: wt_doc = original_names[mem2[1]+'.pdb.gz'][10:14] res2 = Result(mem2, coh_seqs[mem2[0]], doc_seqs[mem2[1]], 1, j=True, doc_wt=wt_doc) doc_bb_diffs.append(are_docs_from_diff_clusters(res1.doc_wt, res2.doc_wt)) coh_diffs.append(switches_differ({}, res1.coh_switch, res2.coh_switch)) doc_diffs.append(switches_differ({}, res1.doc_switch, res2.doc_switch)) doc_diffs_symm.append(switches_differ({}, res1_doc_symm, res2.doc_switch)) print('results', res1) print('results', res2) print('docs diff', switches_differ({}, res1.doc_switch, res2.doc_switch)) print('doc symm diff', 
switches_differ({}, res1_doc_symm, res2.doc_switch)) print('doc BB dif', are_docs_from_diff_clusters(res1.doc_wt, res2.doc_wt)) print('coh diff', switches_differ({}, res1.coh_switch, res2.coh_switch)) N print('for clq %r found the following results:' % clq) print('doc_bb_diffs', doc_bb_diffs) print('doc_diffs', doc_diffs) print('doc_diffs_symm', doc_diffs_symm) print('coh_diffs', coh_diffs) print('total', sum([1 for a in doc_bb_diffs if a] + doc_diffs + doc_diffs_symm + coh_diffs)) all_cohs = list(set([a[0] for a in designs_in_all_clqs])) all_docs = list(set([a[1] for a in designs_in_all_clqs])) print('these are all the cohs i need: %s, total %i' % (', '.join(all_cohs), len(all_cohs))) print('these are all the docs i need: %s, total %i' % (', '.join(all_docs), len(all_docs))) print('LONGEST CLIQUES FOUND ARE %i' % max([len(clq) for clq in cliques])) coh_doc_purples = creat_coh_doc_purples() for clq in clqs_by_len[max(list(clqs_by_len.keys()))]: print('clq', clq) cohs = [a[0] for a in clq] docs = [a[1] for a in clq] df = pd.DataFrame(index=docs, columns=cohs) for coh in cohs: for doc in docs: df[coh][doc] = coh_doc_purples[coh][doc] show_prediction_heat_map(df)
def retrive_relevant_poses() -> (dict, dict):
    """
    Reduce every aligned cohesin and dockerin to the relevant positions only.

    The positions in ``coh_poses_1ohz`` / ``doc_poses_1ohz`` are translated through
    the '1OHZ' entry of each alignment and then extracted from every sequence.

    :return: seq dicts for cohs and docs (name -> AASeq), holding only the relevant
             positions, determined by 1OHZ
    """
    def _load(file_name):
        # both alignments live under root_path and share the same name clean-up
        return read_multi_fastas(root_path + file_name, suffix_to_remove='/')

    def _keep_positions(seqs, poses):
        # build a new AASeq per entry from the characters found at poses
        return {name: AASeq(string=''.join(seq.get_aligned_positions(poses)), name=name)
                for name, seq in seqs.items()}

    cohs_old = _load('cohesins_from_rachel.fasta_aln')
    docs_old = _load('dockerins_from_rachel.fasta_aln')

    coh_reference = cohs_old['1OHZ']
    coh_poses = [coh_reference.non_aligned_position_at_aligned(p) for p in coh_poses_1ohz]
    doc_reference = docs_old['1OHZ']
    doc_poses = [doc_reference.non_aligned_position_at_aligned(p) for p in doc_poses_1ohz]

    cohs_new = _keep_positions(cohs_old, coh_poses)
    docs_new = _keep_positions(docs_old, doc_poses)
    return cohs_new, docs_new
def post_pred_cliques(args):
    """
    Find and print the largest cliques of mutually different designs.

    Results are built from the ``*.score`` files in the working directory and
    cached in ``./all_data.obj``; a design is kept when at least
    ``args['purples_threshold']`` of its entries pass the run filters.
    Two designs are connected when their cohesin switches differ by at least
    ``args['diff_by']`` and their dockerin switches (plain and symmetric, each
    plus one if the dockerins come from different clusters) differ by at least
    ``args['doc_diff_by']``.

    :param args: dict with keys 'purples_threshold', 'diff_by' and 'doc_diff_by';
        it is also forwarded to all_who_pass_run_filters
    :return: None; prints the edge decisions and the maximal-size cliques
    """
    run_filters = generate_run_filters(args={'ddg': 25.0, 'sasa': 1400, 'shape': 0.6, 'packstat': 0.6,
                                             'buried_2': 3})
    if not os.path.isfile('./all_data.obj'):
        sc_files = [a for a in os.listdir('./') if '.score' in a]
        cohs_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/all_cohs.fasta')
        docs_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/all_docs.fasta')
        results = []
        for sc_file in sc_files:
            seq_name = '_'.join(sc_file.split('_')[1:8])
            coh_name = seq_name + '.pdb.gz.A'
            doc_name = seq_name + '.pdb.gz.B'
            sc_dict = score2dict(sc_file)
            # ynum itself is unused, but the lookup is kept: it raises on a score
            # file whose name carries no 'y###' tag, exactly as before
            ynum = re.search('y[0-9]{3}', sc_file).group(0)
            passed, failed = all_who_pass_run_filters(args, sc_dict, run_filters)
            if len(passed) >= args['purples_threshold']:
                results.append(Result(seq_name, cohs_seqs[coh_name], docs_seqs[doc_name], len(passed)))
        with open('./all_data.obj', 'wb') as fout:
            pickle.dump(results, fout)
    else:
        with open('./all_data.obj', 'rb') as fin:
            results = pickle.load(fin)

    # NOTE(review): ./graph.obj is only tested for existence; this function neither
    # writes nor reads it, so when the file exists nothing below runs -- confirm.
    if not os.path.isfile('./graph.obj'):
        result_dict = {i + 1: r for i, r in enumerate(results)}
        G = nx.Graph()
        G.add_nodes_from(result_dict.keys())
        # Graph.nodes_iter() was removed in NetworkX 2.0; nodes() exists in 1.x and 2.x
        nodes = list(G.nodes())
        for n1 in nodes:
            for n2 in nodes:
                if n1 == n2:
                    continue
                coh_sw_1, coh_sw_2 = result_dict[n1].coh_switch, result_dict[n2].coh_switch
                doc_sw_1, doc_sw_2 = result_dict[n1].doc_switch, result_dict[n2].doc_switch
                doc_wt_1, doc_wt_2 = result_dict[n1].doc_wt, result_dict[n2].doc_wt
                doc_diff = 1 if are_docs_from_diff_clusters(doc_wt_1, doc_wt_2) else 0
                symm_switch = switch_symm_changer(doc_sw_2)
                if switches_differ({'diff_by': args['diff_by']}, coh_sw_1, coh_sw_2) >= args['diff_by'] and \
                        switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, doc_sw_2) + doc_diff >= args['doc_diff_by'] and \
                        switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, symm_switch) + doc_diff >= args['doc_diff_by']:
                    G.add_edge(n1, n2)
                    print('adding edge\n', result_dict[n1], '\n', result_dict[n2])
                else:
                    print('NOT\n', result_dict[n1], '\n', result_dict[n2])
        cliques = [a for a in nx.find_cliques(G)]
        max_len = max([len(a) for a in cliques])
        max_cliques = [a for a in cliques if len(a) == max_len]
        for clq in max_cliques:
            print(clq, '\n', '\n'.join([str(result_dict[a]) for a in clq]))
def main():
    """
    Command-line entry point: write a ``<query_name>.gssm`` report.

    For every position pair returned by ``parse_gremlin`` the file gets one
    header line with the two positions and the probability, followed by the
    string form of the identity-frequency table of that pair.
    """
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)

    parser = argparse.ArgumentParser()
    cli_options = (
        ('-gremlin_file', str, 'full path to Gremlin output'),
        ('-MSA', str, 'full path the MSA file'),
        ('-probability_threshold', float, 'threshold above which probability is considered'),
        ('-query_name', str, 'query name as it is written in the MSA'),
    )
    for flag, value_type, help_text in cli_options:
        parser.add_argument(flag, type=value_type, help=help_text)
    args = vars(parser.parse_args())

    query = args['query_name']
    gremlin = parse_gremlin(args['gremlin_file'], args['probability_threshold'])
    msa = read_multi_fastas(args['MSA'], add_aligned=True)

    with open(query + '.gssm', 'w+') as fout:
        for pair, probability in gremlin.items():
            iden_frq = create_identitiy_frequency_df(pair, msa, query)
            fout.write('pos_1 %i pos_2 %i probability %f\n' % (pair[0], pair[1], probability))
            fout.write(str(iden_frq) + '\n')
def validate(args):
    """
    Report every DNA sequence that does not comply with Gen9 standards.

    :param args: dict with key 'DNA_seqs_file', the path of a multi-FASTA file
    :return: None; prints one line per non-compliant sequence
    """
    # original_seqs = read_multi_fastas(args['original_seqs_file'], suffix_to_remove='_')
    DNA_seqs = read_multi_fastas(args['DNA_seqs_file'], suffix_to_remove='.')
    for k, v in DNA_seqs.items():
        # assert original_seqs[k].get_seq() in DNA2AA(v.get_seq())
        # get_seq is a method and must be called: passing the bound method handed
        # gen9_standards a callable instead of the sequence itself
        if not gen9_standards(v.get_seq()):
            print('seq name %s does not comply with Gen9 standards' % k)
def main():
    """
    Command-line entry point: compare molecular weights of coh-doc combinations.

    Every cohesin gets the 'cbm' flank and every dockerin the 'xyn' flank; the
    combined weight of each cohesin-dockerin combination is tabulated and
    printed. Combinations that share neither cohesin nor dockerin and whose
    weights differ by at most ``-threshold`` are reported, and the distribution
    of all such differences is shown as a box plot.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-coh_files', nargs='+')
    parser.add_argument('-doc_files', nargs='+')
    parser.add_argument('-threshold', type=float, default=30.0)
    args = vars(parser.parse_args())

    cohs = OrderedDict({k: v for a in args['coh_files'] for k, v in read_multi_fastas(a).items()})
    docs = OrderedDict({k: v for a in args['doc_files'] for k, v in read_multi_fastas(a).items()})

    # Flank and weigh every dockerin exactly once. This used to happen inside the
    # cohesin loop, which re-applied the prefix to the same objects per cohesin.
    # (assumes add_prefix changes the sequence in place -- its result was never used)
    doc_weights = []
    for doc in docs.values():
        doc.add_prefix(flanks['doc']['xyn'])
        doc_weights.append(doc.calc_molecular_weight())

    mw_df = pd.DataFrame(columns=['coh_seq', 'coh_weight'] + list(docs.keys()))
    for coh in cohs.values():
        # call the method; assigning to it replaced add_prefix with a string and
        # left the cohesin without its flank
        coh.add_prefix(flanks['coh']['cbm'])
        coh_weight = coh.calc_molecular_weight()
        print(coh.get_seq(), coh_weight)
        mw_df.loc[coh.name] = [coh, coh_weight] + [coh_weight + w for w in doc_weights]
    print(mw_df)

    diffs = []
    for coh1 in cohs.keys():
        for doc1 in docs.keys():
            coh_doc_1 = mw_df[doc1][coh1]
            for coh2 in cohs.keys():
                for doc2 in docs.keys():
                    if coh1 != coh2 and doc1 != doc2:
                        diff = abs(coh_doc_1 - mw_df[doc2][coh2])
                        diffs.append(diff)
                        if diff <= args['threshold']:
                            print('%s %s and %s %s have a weight difference of only %f' %
                                  (coh1, doc1, coh2, doc2, diff))
    plt.boxplot(diffs)
    plt.show()
def retrive_relevant_poses() -> (dict, dict):
    """
    Cut all aligned cohesins and dockerins down to the relevant positions.

    :return: seq dicts for cohs and docs, holding only the relevant positions, determined by 1OHZ
    """
    aligned_cohs = read_multi_fastas(root_path + 'cohesins_from_rachel.fasta_aln', suffix_to_remove='/')
    aligned_docs = read_multi_fastas(root_path + 'dockerins_from_rachel.fasta_aln', suffix_to_remove='/')

    # translate the 1OHZ-based position lists through the '1OHZ' entry of each alignment
    reference_coh = aligned_cohs['1OHZ']
    coh_poses = list(map(reference_coh.non_aligned_position_at_aligned, coh_poses_1ohz))
    reference_doc = aligned_docs['1OHZ']
    doc_poses = list(map(reference_doc.non_aligned_position_at_aligned, doc_poses_1ohz))

    cohs_new = {
        name: AASeq(string=''.join(seq.get_aligned_positions(coh_poses)), name=name)
        for name, seq in aligned_cohs.items()
    }
    docs_new = {
        name: AASeq(string=''.join(seq.get_aligned_positions(doc_poses)), name=name)
        for name, seq in aligned_docs.items()
    }
    return cohs_new, docs_new
def main():
    """
    Command-line entry point: compare molecular weights of coh-doc combinations.

    Cohesins are flanked with 'cbm' and dockerins with 'xyn'. The combined
    weight of every cohesin-dockerin combination is stored in a data frame
    (rows: cohesin names, columns: dockerin names) and printed. For every two
    combinations with a different cohesin and a different dockerin the absolute
    weight difference is collected; differences up to ``-threshold`` are
    printed and all differences are drawn as a box plot.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-coh_files', nargs='+')
    parser.add_argument('-doc_files', nargs='+')
    parser.add_argument('-threshold', type=float, default=30.0)
    args = vars(parser.parse_args())

    cohs = OrderedDict({k: v for a in args['coh_files'] for k, v in read_multi_fastas(a).items()})
    docs = OrderedDict({k: v for a in args['doc_files'] for k, v in read_multi_fastas(a).items()})
    doc_names = list(docs.keys())

    # The dockerin flank is added once, up front. Doing it inside the cohesin
    # loop applied add_prefix to the same dockerin again for every cohesin.
    # (assumes add_prefix modifies the object in place -- its result was discarded)
    for doc in docs.values():
        doc.add_prefix(flanks['doc']['xyn'])
    doc_weights = [doc.calc_molecular_weight() for doc in docs.values()]

    mw_df = pd.DataFrame(columns=['coh_seq', 'coh_weight'] + doc_names)
    for coh in cohs.values():
        # add_prefix has to be called; the former assignment overwrote the method
        # with the flank string and never flanked the cohesin
        coh.add_prefix(flanks['coh']['cbm'])
        coh_weight = coh.calc_molecular_weight()
        print(coh.get_seq(), coh_weight)
        weights_combined = [coh_weight + doc_weight for doc_weight in doc_weights]
        mw_df.loc[coh.name] = [coh, coh_weight] + weights_combined
    print(mw_df)

    diffs = []
    for coh1 in cohs.keys():
        for doc1 in doc_names:
            coh_doc_1 = mw_df[doc1][coh1]
            for coh2 in cohs.keys():
                for doc2 in doc_names:
                    if coh1 != coh2 and doc1 != doc2:
                        diff = abs(coh_doc_1 - mw_df[doc2][coh2])
                        diffs.append(diff)
                        if diff <= args['threshold']:
                            print('%s %s and %s %s have a weight difference of only %f' %
                                  (coh1, doc1, coh2, doc2, diff))
    plt.boxplot(diffs)
    plt.show()
def main():
    """
    Command-line entry point: dump Gremlin position pairs to ``<query_name>.gssm``.

    Each pair returned by ``parse_gremlin`` is written as a 'pos_1 .. pos_2 ..
    probability ..' line followed by the text of its identity-frequency table.
    """
    for option, value in (('display.max_columns', 500), ('display.width', 1000)):
        pd.set_option(option, value)

    parser = argparse.ArgumentParser()
    parser.add_argument('-gremlin_file', type=str,
                        help='full path to Gremlin output')
    parser.add_argument('-MSA', type=str,
                        help='full path the MSA file')
    parser.add_argument('-probability_threshold', type=float,
                        help='threshold above which probability is considered')
    parser.add_argument('-query_name', type=str,
                        help='query name as it is written in the MSA')
    args = vars(parser.parse_args())

    gremlin = parse_gremlin(args['gremlin_file'], args['probability_threshold'])
    msa = read_multi_fastas(args['MSA'], add_aligned=True)
    out_path = args['query_name'] + '.gssm'

    with open(out_path, 'w+') as fout:
        for positions, probability in gremlin.items():
            pos_1, pos_2 = positions[0], positions[1]
            iden_frq = create_identitiy_frequency_df(positions, msa, args['query_name'])
            fout.write('pos_1 %i pos_2 %i probability %f\n' % (pos_1, pos_2, probability))
            fout.write(str(iden_frq) + '\n')
parser.add_argument('-coh_seqs_file') parser.add_argument('-doc_seqs_file') parser.add_argument('-mode') parser.add_argument('-n', type=int, default=1) parser.add_argument('-diff_by', type=int, default=2) parser.add_argument('-doc_diff_by', type=int, default=1) parser.add_argument('-score_dir', type=str, default='./') parser.add_argument('-purples_threshold', type=int, default=50) args = vars(parser.parse_args()) if args['mode'] != 'bins_diagonal' and args[ 'mode'] != 'post_pred_cliques' and args[ 'mode'] != 'minidiagonal_cliques': scores = score2dict(args['score_file']) run_filters = '' #generate_run_filters() coh_seq_dict = read_multi_fastas(args['coh_seqs_file'], suffix_to_remove='.pdb.gz') doc_seq_dict = read_multi_fastas(args['doc_seqs_file'], suffix_to_remove='.pdb.gz') if args['mode'] == 'switches_n_cliques': switches, num_bins = make_switches(args, scores, run_filters, coh_seq_dict) max_cliques = best_cliques(args, list(switches.keys())) with open('switches.obj', 'wb') as sw_file: pickle.dump(switches, sw_file) with open('max_cliques.obj', 'wb') as clq_file: pickle.dump(max_cliques, clq_file) elif args['mode'] == 'choose_by_identity': with open('switches.obj', 'rb') as sw_in:
def add_primers_to_all(args):
    """
    Print every DNA sequence with primers added, in FASTA layout.

    :param args: dict with keys 'DNA_seqs_file' (multi-FASTA path) and 'type'
        (forwarded to add_primers as its second argument)
    :return: None; prints a '>name' line followed by the primed sequence
    """
    DNA_seqs = read_multi_fastas(args['DNA_seqs_file'])
    for k, v in DNA_seqs.items():
        print('>%s' % k)
        # call get_seq(): the bound method itself was being passed to add_primers
        # NOTE(review): assumes get_seq is a method, as it is called elsewhere
        # in this code base (get_seq()) -- confirm
        print(add_primers(v.get_seq(), args['type']))
def validate_data_frame(data_df: pd.DataFrame, prepared_df: pd.DataFrame) -> None:
    """
    Cross-check the binary data frame against the binding data frame.

    For every row the coh/doc names of both frames must agree (otherwise the
    interpreter exits), the interface identities are extracted through the most
    similar crystal sequence, and every feature value of the prepared row must
    be 0 or 1.

    :param data_df: binding data frame
    :param prepared_df: binary data frame
    :return: prints if there is something wrong...
    """
    rachel_root = '/home/labs/fleishman/jonathaw/decision_tree/'
    cohs = read_multi_fastas(rachel_root + 'cohesins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/')
    docs = read_multi_fastas(rachel_root + 'dockerins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/')
    crystal_names = ['1ohz', '2b59', '2ozn', '2vn5', '2y3n', '3ul4', '4fl4', '4fl5', '4dh2', '4uyp', '5new']
    coh_crys_seqs = [c for c in cohs.values() if c.name in crystal_names]
    doc_crys_seqs = [d for d in docs.values() if d.name in crystal_names]
    # feature columns: everything between the four leading columns and the last one
    features = list(prepared_df.columns[4:-1])
    interface_positions = parse_interface_positions()
    coh_poses = {coh: {typ: cohs[coh].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()}
                 for coh, typos in interface_positions['coh'].items()}
    doc_poses = {doc: {typ: docs[doc].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()}
                 for doc, typos in interface_positions['doc'].items()}

    # Rows are addressed by the labels 1..n; the upper bound is n + 1 so that the
    # last row is validated as well (range(1, n) silently skipped it).
    for i in range(1, len(data_df.index) + 1):
        print('i is %i' % i)
        print(data_df.loc[i])
        if data_df.loc[i]['coh_name'] != prepared_df.loc[i]['coh_name'] or \
                data_df.loc[i]['doc_name'] != prepared_df.loc[i]['doc_name']:
            print('not the same names', data_df.loc[i]['doc_name'], prepared_df.loc[i]['doc_name'])
            sys.exit()
        coh_seq = data_df.loc[i]['coh_seq']
        doc_seq = data_df.loc[i]['doc_seq']
        # The values computed below are not compared against anything yet; the
        # calls are kept so that a row they cannot process still raises here.
        prepared_row = row_to_dict(prepared_df.loc[i])
        similar_coh, coh_iden = highest_seq_similarity(coh_crys_seqs, coh_seq)
        similar_doc, doc_iden = highest_seq_similarity(doc_crys_seqs, doc_seq)
        coh_identities = {typ: coh_seq.get_aligned_positions([pos])[0]
                          for typ, pos in coh_poses[similar_coh.name].items()}
        doc_identities = {typ: doc_seq.get_aligned_positions([pos])[0]
                          for typ, pos in doc_poses[similar_doc.name].items()}
        for fea in features:
            if prepared_df.loc[i][fea] not in [0, 1]:
                print('found problem at row', i, prepared_df.loc[i][fea])
    print('your df is validated')
def minidiagonal_cliques(args):
    """
    Bin designs by their switches, find the largest cliques of bins and print them.

    Each stage is cached in a pickle in the working directory and is recomputed
    only when its file is missing: results and bins ('all_results.obj',
    'all_bins.obj'), the graph and the maximal-size cliques (both keyed by
    diff_by / doc_diff_by in the file name). A bin is named
    '<coh_switch>-<doc_switch>-<doc_wt>'; two bins are connected under the same
    switch-difference criteria that are evaluated in the pairwise loop below.

    :param args: dict with keys 'coh_seqs_file', 'doc_seqs_file', 'diff_by' and
        'doc_diff_by'
    :return: None; prints the cliques, the design names of every bin and, last,
        how often each design name occurs, in ascending order
    """
    original_names = parse_name_translation('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/clique_6_pdbs/mini_diagonal_11Nov/minidiagonal_pdbs/translate_names.txt')
    coh_seqs = read_multi_fastas(args['coh_seqs_file'], suffix_to_remove='.A')
    doc_seqs = read_multi_fastas(args['doc_seqs_file'], suffix_to_remove='.B')
    diff_tag = (args['diff_by'], args['doc_diff_by'])
    graph_file = 'graph_%i_%i.obj' % diff_tag
    cliques_file = 'max_cliques_%i_%i.obj' % diff_tag

    if not os.path.isfile('all_results.obj') or not os.path.isfile('all_bins.obj'):
        print('creating bins and results')
        all_results, bins = {}, {}
        for design in coh_seqs.keys():
            r = Result(design, coh_seqs[design], doc_seqs[design], 0, j=True, originals=original_names)
            if not 2 < r.coh_switch.count('n') + r.coh_switch.count('p') < 8 and \
                    not 2 <= r.coh_switch.count('p') <= 3:
                continue
            all_results[design] = r
            d_sw = r.coh_switch + '-' + r.doc_switch + '-' + r.doc_wt
            bins.setdefault(d_sw, []).append(r)
        with open('all_results.obj', 'wb') as w_obj:
            pickle.dump(all_results, w_obj)
        with open('all_bins.obj', 'wb') as w_obj:
            pickle.dump(bins, w_obj)
    else:
        print('reading results')
        with open('all_results.obj', 'rb') as r_obj:
            all_results = pickle.load(r_obj)
        with open('all_bins.obj', 'rb') as r_obj:
            bins = pickle.load(r_obj)
    print('found %i bins' % len(bins))

    if not os.path.isfile(graph_file):
        print('creating graph')
        G = nx.Graph()
        G.add_nodes_from(bins.keys())
        print('found %i nodes' % G.number_of_nodes())
        # Graph.nodes_iter() was removed in NetworkX 2.0; nodes() exists in 1.x and 2.x
        nodes = list(G.nodes())
        for n1 in nodes:
            for n2 in nodes:
                if n1 == n2:
                    continue
                coh_sw_1, doc_sw_1, doc_wt_1 = n1.split('-')[0], n1.split('-')[1], n1.split('-')[2]
                coh_sw_2, doc_sw_2, doc_wt_2 = n2.split('-')[0], n2.split('-')[1], n2.split('-')[2]
                doc_diff = 1 if are_docs_from_diff_clusters(doc_wt_1, doc_wt_2) else 0
                symm_switch = switch_symm_changer(doc_sw_2)
                if switches_differ({'diff_by': args['diff_by']}, coh_sw_1, coh_sw_2) >= args['diff_by'] and \
                        switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, doc_sw_2) + doc_diff >= args['doc_diff_by'] and \
                        switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, symm_switch) + doc_diff >= args['doc_diff_by']:
                    G.add_edge(n1, n2)
        with open(graph_file, 'wb') as w_obj:
            pickle.dump(G, w_obj)
    else:
        print('reading graph')
        with open(graph_file, 'rb') as r_obj:
            G = pickle.load(r_obj)

    if not os.path.isfile(cliques_file):
        cliques = [a for a in nx.find_cliques(G)]
        max_len = max([len(a) for a in cliques])
        max_cliques = [a for a in cliques if len(a) == max_len]
        print('there are %i cliques with %i structures in each for diff_by=%i doc_diff_by=%i' %
              (len(max_cliques), max_len, args['diff_by'], args['doc_diff_by']))
        with open(cliques_file, 'wb') as w_obj:
            pickle.dump(max_cliques, w_obj)
    else:
        print('reading cliques')
        with open(cliques_file, 'rb') as r_obj:
            max_cliques = pickle.load(r_obj)

    # count, per design name, how many (clique, bin) memberships it has
    occurences = {a.name: 0 for clq in max_cliques for k in clq for a in bins[k]}
    for clq in max_cliques:
        print('in clq', clq)
        for k in clq:
            print('\n'.join(set([a.name for a in bins[k]])))
            for a in bins[k]:
                occurences[a.name] += 1
    for w in sorted(occurences, key=occurences.get, reverse=False):
        print(w, occurences[w])
parser = argparse.ArgumentParser() parser.add_argument('-score_file') parser.add_argument('-coh_seqs_file') parser.add_argument('-doc_seqs_file') parser.add_argument('-mode') parser.add_argument('-n', type=int, default=1) parser.add_argument('-diff_by', type=int, default=2) parser.add_argument('-doc_diff_by', type=int, default=1) parser.add_argument('-score_dir', type=str, default='./') parser.add_argument('-purples_threshold', type=int, default=50) args = vars(parser.parse_args()) if args['mode'] != 'bins_diagonal' and args['mode'] != 'post_pred_cliques' and args['mode'] != 'minidiagonal_cliques': scores = score2dict(args['score_file']) run_filters = ''#generate_run_filters() coh_seq_dict = read_multi_fastas(args['coh_seqs_file'], suffix_to_remove='.pdb.gz') doc_seq_dict = read_multi_fastas(args['doc_seqs_file'], suffix_to_remove='.pdb.gz') if args['mode'] == 'switches_n_cliques': switches, num_bins = make_switches(args, scores, run_filters, coh_seq_dict) max_cliques = best_cliques(args, list(switches.keys())) with open('switches.obj', 'wb') as sw_file: pickle.dump(switches, sw_file) with open('max_cliques.obj', 'wb') as clq_file: pickle.dump(max_cliques, clq_file) elif args['mode'] == 'choose_by_identity': with open('switches.obj', 'rb') as sw_in: switches = pickle.load(sw_in) with open('max_cliques.obj', 'rb') as clq_file:
def prepare_data(in_df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    Turn the binding data frame into the binary feature frame and an identities frame.

    For every row (labels 1..n) the crystal cohesin and dockerin most similar to
    the row's sequences are chosen, the interface positions of those crystals
    are read from the row's sequences, and the residues found there are encoded
    with ``core_res_to_identity`` / ``rim_res_to_type_binary``.

    :param in_df: data frame with 'coh_name', 'doc_name', 'coh_seq', 'doc_seq'
        and 'binders', with rows labelled from 1
    :return: (out_df, identities_df); out_df uses the module-level ``columns``
        and ends each row with 1/0 for 'binders', identities_df holds the raw
        residues found at the interface positions
    :rtype: (pd.DataFrame, pd.DataFrame)
    """
    rachel_root = '/home/labs/fleishman/jonathaw/decision_tree/'
    cohs_non_aln = read_multi_fastas(rachel_root + 'cohesins_from_rachel_and_vered.fasta',
                                     suffix_to_remove='/', lower=True)
    docs_non_aln = read_multi_fastas(rachel_root + 'dockerins_from_rachel_and_vered.fasta',
                                     suffix_to_remove='/', lower=True)
    cohs = read_multi_fastas(rachel_root + 'cohesins_from_rachel_and_vered.fasta_aln',
                             suffix_to_remove='/', lower=True)
    docs = read_multi_fastas(rachel_root + 'dockerins_from_rachel_and_vered.fasta_aln',
                             suffix_to_remove='/', lower=True)

    interface_positions = parse_interface_positions()
    coh_poses = {coh: {typ: cohs[coh].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()}
                 for coh, typos in interface_positions['coh'].items()}
    doc_poses = {doc: {typ: docs[doc].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()}
                 for doc, typos in interface_positions['doc'].items()}
    validate_aligned_non_aligned_interface_positions(interface_positions['coh'], cohs, cohs_non_aln)
    validate_aligned_non_aligned_interface_positions(interface_positions['doc'], docs, docs_non_aln)

    crystal_names = ['1ohz', '2b59', '2ozn', '2vn5', '2y3n', '3ul4', '4fl4', '4fl5', '4dh2', '4uyp', '5new']
    coh_crys_seqs = [c for c in cohs.values() if c.name in crystal_names]
    doc_crys_seqs = [d for d in docs.values() if d.name in crystal_names]

    # The loop below fills the labels 1..n, so both frames are pre-allocated for
    # exactly those labels (the index used to stop at n - 1).
    row_labels = range(1, len(in_df.index) + 1)
    out_df = pd.DataFrame(index=row_labels, columns=columns)
    id_columns = ['coh_name', 'doc_name', 'coh_seq', 'doc_seq'] + ordered_positions['coh'] + ordered_positions['doc']
    identities_df = pd.DataFrame(index=row_labels, columns=id_columns)

    for i in row_labels:
        row = in_df.loc[i]
        # find which crystal coh+doc are most similar
        similar_coh, coh_iden = highest_seq_similarity(coh_crys_seqs, row['coh_seq'])
        similar_doc, doc_iden = highest_seq_similarity(doc_crys_seqs, row['doc_seq'])

        # get aligned positions according to interface_positions
        coh_identities = {typ: row['coh_seq'].get_aligned_positions([pos])[0]
                          for typ, pos in coh_poses[similar_coh.name].items()}
        doc_identities = {typ: row['doc_seq'].get_aligned_positions([pos])[0]
                          for typ, pos in doc_poses[similar_doc.name].items()}

        # every encoder returns an iterable of values per position; flatten them in order
        coh_core_list = [a for v in ordered_positions['coh'] if 'core' in v
                         for a in core_res_to_identity(coh_identities[v], 'coh')]
        coh_rim_list = [a for v in ordered_positions['coh'] if 'rim' in v
                        for a in rim_res_to_type_binary(coh_identities[v])]
        doc_core_list = [a for v in ordered_positions['doc'] if 'core' in v
                         for a in core_res_to_identity(doc_identities[v], 'doc')]
        doc_rim_list = [a for v in ordered_positions['doc'] if 'rim' in v
                        for a in rim_res_to_type_binary(doc_identities[v])]

        out_df.loc[i] = [row['coh_name'], row['doc_name'], 0, 0] + coh_core_list + coh_rim_list + \
            doc_core_list + doc_rim_list + [1 if row['binders'] else 0]
        # NOTE(review): the values are ordered core-then-rim while id_columns follows
        # ordered_positions as is -- confirm both orders agree.
        identities_df.loc[i] = [row['coh_name'], row['doc_name'], 0, 0] + \
            [coh_identities[v] for v in ordered_positions['coh'] if 'core' in v] + \
            [coh_identities[v] for v in ordered_positions['coh'] if 'rim' in v] + \
            [doc_identities[v] for v in ordered_positions['doc'] if 'core' in v] + \
            [doc_identities[v] for v in ordered_positions['doc'] if 'rim' in v]
    return out_df, identities_df
def post_pred_cliques(args):
    """
    Build a graph of sufficiently different designs and print its largest cliques.

    The designs come from the ``*.score`` files of the working directory (cached
    in ``./all_data.obj``); only designs with at least
    ``args['purples_threshold']`` passing entries are kept. An edge joins two
    designs when the cohesin switches differ by at least ``args['diff_by']`` and
    both dockerin comparisons (plain and symmetric, each raised by one when the
    dockerins belong to different clusters) reach ``args['doc_diff_by']``.

    :param args: dict with keys 'purples_threshold', 'diff_by' and 'doc_diff_by';
        it is also passed on to all_who_pass_run_filters
    :return: None; prints each edge decision and the maximal-size cliques
    """
    run_filters = generate_run_filters(args={
        'ddg': 25.0,
        'sasa': 1400,
        'shape': 0.6,
        'packstat': 0.6,
        'buried_2': 3
    })
    if not os.path.isfile('./all_data.obj'):
        sc_files = [a for a in os.listdir('./') if '.score' in a]
        cohs_seqs = read_multi_fastas(
            '/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/all_cohs.fasta'
        )
        docs_seqs = read_multi_fastas(
            '/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/all_docs.fasta'
        )
        results = []
        for sc_file in sc_files:
            seq_name = '_'.join(sc_file.split('_')[1:8])
            coh_name = seq_name + '.pdb.gz.A'
            doc_name = seq_name + '.pdb.gz.B'
            sc_dict = score2dict(sc_file)
            # the value is unused, but the lookup still raises for a score file
            # whose name has no 'y###' tag, as it did before
            ynum = re.search('y[0-9]{3}', sc_file).group(0)
            passed, failed = all_who_pass_run_filters(args, sc_dict, run_filters)
            if len(passed) >= args['purples_threshold']:
                r = Result(seq_name, cohs_seqs[coh_name], docs_seqs[doc_name], len(passed))
                results.append(r)
        with open('./all_data.obj', 'wb') as fout:
            pickle.dump(results, fout)
    else:
        with open('./all_data.obj', 'rb') as fin:
            results = pickle.load(fin)

    # NOTE(review): ./graph.obj is never written or read here, only tested for
    # existence; when it exists the function stops after loading the results.
    if not os.path.isfile('./graph.obj'):
        result_dict = {i + 1: r for i, r in enumerate(results)}
        G = nx.Graph()
        G.add_nodes_from(result_dict.keys())
        # nodes_iter() no longer exists in NetworkX >= 2.0; nodes() works in 1.x and 2.x
        node_ids = list(G.nodes())
        for n1 in node_ids:
            for n2 in node_ids:
                if n1 == n2:
                    continue
                first, second = result_dict[n1], result_dict[n2]
                doc_diff = 1 if are_docs_from_diff_clusters(first.doc_wt, second.doc_wt) else 0
                symm_switch = switch_symm_changer(second.doc_switch)
                if switches_differ({'diff_by': args['diff_by']}, first.coh_switch, second.coh_switch) >= args['diff_by'] and \
                        switches_differ({'diff_by': args['doc_diff_by']}, first.doc_switch, second.doc_switch) + doc_diff >= args['doc_diff_by'] and \
                        switches_differ({'diff_by': args['doc_diff_by']}, first.doc_switch, symm_switch) + doc_diff >= args['doc_diff_by']:
                    G.add_edge(n1, n2)
                    print('adding edge\n', result_dict[n1], '\n', result_dict[n2])
                else:
                    print('NOT\n', result_dict[n1], '\n', result_dict[n2])
        cliques = [a for a in nx.find_cliques(G)]
        max_len = max([len(a) for a in cliques])
        max_cliques = [a for a in cliques if len(a) == max_len]
        for clq in max_cliques:
            print(clq, '\n', '\n'.join([str(result_dict[a]) for a in clq]))
def _str2bool(value):
    """
    argparse converter for boolean options. plain bool() treats every non-empty string,
    including 'False', as True.

    :param value: the raw command line string (or an actual bool)
    :return: the parsed boolean
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError('expected a boolean value, got %r' % value)


def _pickle_to(obj, path):
    """ pickles obj into path, closing the file afterwards """
    with open(path, 'wb') as fout:
        pickle.dump(obj, fout)


def _unpickle_from(path):
    """ :return: the object pickled in path. the file is closed afterwards """
    with open(path, 'rb') as fin:
        return pickle.load(fin)


def _untruncated_str(obj):
    """
    :return: str(obj) for a pandas object with row truncation disabled. the display.max_rows
             option is reset to its default afterwards
    """
    pd.set_option('display.max_rows', None)
    try:
        return str(obj)
    finally:
        pd.reset_option('display.max_rows')


def main():
    """
    command line entry point.
    -make_dt   if true, parses the binding data, builds the decision tree and pickles it (with its
               features and data frames) under decision_tree_root. otherwise the objects pickled
               under the time_to_use stamp are read
    -mode      one of k_fold, validate_df, analyse_identities_df, create_dt, follow,
               predict_all_designs_diagonal, pickle_design_sequences, predict_by_coh
    -coh_name  cohesin name, required by mode predict_by_coh
    :return: None
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-mode')
    parser.add_argument('-make_dt', type=_str2bool, default=False)
    parser.add_argument('-coh_name', type=str)
    args = vars(parser.parse_args())

    if args['make_dt']:
        print('making decision tree')
        data_df = parse_binding_data()
        prepared_df, identities_df = prepare_data(data_df)
        decision_tree, features = create_decision_tree(prepared_df)
        stamp = time.strftime("%d.%0-m")
        _pickle_to(decision_tree, decision_tree_root+'decision_tree_%s.obj' % stamp)
        _pickle_to(features, decision_tree_root+'features_%s.obj' % stamp)
        _pickle_to(prepared_df, decision_tree_root+'prepared_df_%s.obj' % stamp)
        _pickle_to(identities_df, decision_tree_root+'identities_df_%s.obj' % stamp)
    else:
        print('reading decision tree')
        data_df = None  # the raw binding data is not pickled, it is parsed on demand below
        decision_tree = _unpickle_from(decision_tree_root+'decision_tree_%s.obj' % time_to_use)
        features = _unpickle_from(decision_tree_root+'features_%s.obj' % time_to_use)
        prepared_df = _unpickle_from(decision_tree_root+'prepared_df_%s.obj' % time_to_use)
        identities_df = _unpickle_from(decision_tree_root+'identities_df_%s.obj' % time_to_use)

    mode = args['mode']

    # these modes need the raw binding data, which is otherwise only available after -make_dt
    if data_df is None and mode in ('validate_df', 'create_dt'):
        data_df = parse_binding_data()

    if mode == 'k_fold':
        k_fold_test(prepared_df)

    elif mode == 'validate_df':
        validate_data_frame(data_df, prepared_df)

    elif mode == 'analyse_identities_df':
        analyse_identity_df(identities_df)

    elif mode == 'create_dt':
        with open('decision_tree.dot', 'w') as fout:
            print('creating decision tree.dot')
            export_graphviz(decision_tree, out_file=fout, feature_names=features)
        compare_observed_to_predicted(decision_tree, data_df, prepared_df[features])

    elif mode == 'follow':
        seq_to_follow(prepared_df, '2b59', '2b59')

    elif mode == 'predict_all_designs_diagonal':
        print('getting design sequences')
        design_df = get_design_data()
        print('making prediction')
        design_df['predict'] = decision_tree.predict(design_df[features])
        with open(design_data_root+'diagonal_prediciton.txt', 'w+') as fout:
            fout.write(_untruncated_str(design_df.loc[design_df['predict'] == 1]['coh_name']))

    elif mode == 'pickle_design_sequences':
        dsn_cohs = read_multi_fastas(design_data_root+'all_designed_cohs.fasta', suffix_to_remove='.')
        dsn_docs = read_multi_fastas(design_data_root+'all_designed_docs.fasta', suffix_to_remove='.')
        stamp = time.strftime("%d.%0-m")
        _pickle_to(dsn_cohs, design_data_root+'dsn_cohs_%s.obj' % stamp)
        _pickle_to(dsn_docs, design_data_root+'dsn_docs_%s.obj' % stamp)

    elif mode == 'predict_by_coh':
        if args['coh_name'] is None:
            parser.error('-coh_name is required for mode predict_by_coh')
        design_df = get_design_data_coh_vs_all(args['coh_name'])
        print('predicting!!')
        design_df['predict'] = decision_tree.predict(design_df[features])
        with open(design_data_root+'all_vs_all_decision_tree_6Jan/' + args['coh_name']+'.txt', 'w+') as fout:
            # first two columns and the last one ('predict'), selected by position.
            # label based design_df[[0, 1, -1]] raises KeyError on current pandas
            fout.write(_untruncated_str(design_df.iloc[:, [0, 1, -1]])+'\n')

    else:
        print('no mode found')
79, 81, 82, 83, 85, 87, 115, 116, 118, 119, 121, 123, 125, 127, ] fastas = read_multi_fastas("/home/labs/fleishman/jonathaw/data/pssm/cohs/making/1ohz_passed_thresholds.fasta") positives, negatives, neutrals, totals = [], [], [], [] for k, v in fastas.items(): charge_config = extract_charge_configuration(v, positions) positives.append(charge_config.count("p")) negatives.append(charge_config.count("n")) neutrals.append(charge_config.count("c")) totals.append(charge_config.count("p") + charge_config.count("n")) if k == "1ohz": print("at 1ohz found %i %s" % (charge_config.count("p"), "positives")) print("at 1ohz found %i %s" % (charge_config.count("n"), "negatives")) print("at 1ohz found %i %s" % (charge_config.count("c"), "neutrals")) print("at 1ohz found %i %s" % (charge_config.count("p") + charge_config.count("n"), "totals")) bins = range(max(positives + negatives + neutrals) + 1) plt.hist(positives, bins=bins, color="b", label="positives") plt.hist(negatives, bins=bins, color="r", label="negatives")
def extract_charge_configuration(seq: AASeq, positions: list):
    """
    :param seq: sequence object, queried through its get_positions method
    :param positions: the positions to look at
    :return: list with the res2charge entry of every residue returned for positions.
             residues that are not in res2charge are marked 'c'
    """
    return [res2charge.get(res, 'c') for res in seq.get_positions(positions)]


if __name__ == '__main__':
    positions = [
        32, 33, 35, 37, 39, 63, 66, 68, 70, 73, 75, 77, 79,
        81, 82, 83, 85, 87, 115, 116, 118, 119, 121, 123, 125, 127,
    ]
    fastas = read_multi_fastas(
        '/home/labs/fleishman/jonathaw/data/pssm/cohs/making/1ohz_passed_thresholds.fasta'
    )
    positives, negatives, neutrals, totals = [], [], [], []
    for k, v in fastas.items():
        charge_config = extract_charge_configuration(v, positions)
        # count every label once and reuse the numbers for the lists and the report
        n_pos = charge_config.count('p')
        n_neg = charge_config.count('n')
        n_neut = charge_config.count('c')
        positives.append(n_pos)
        negatives.append(n_neg)
        neutrals.append(n_neut)
        totals.append(n_pos + n_neg)
        if k == '1ohz':
            for n_found, kind in ((n_pos, 'positives'), (n_neg, 'negatives'), (n_neut, 'neutrals')):
                print('at 1ohz found %i %s' % (n_found, kind))
def minidiagonal_cliques(args):
    """
    Bin the designs by their 'coh_switch-doc_switch-doc_wt' string, connect every pair of bins whose
    switches differ enough, find the largest cliques of bins and print, for every design, in how many
    of these cliques it takes part.

    Every stage is cached in the working directory and read back if its file exists:
    all_results.obj + all_bins.obj, graph_<diff_by>_<doc_diff_by>.obj and
    max_cliques_<diff_by>_<doc_diff_by>.obj.

    :param args: run arguments dict. reads 'diff_by' and 'doc_diff_by', and, only when the results
                 cache has to be built, 'coh_seqs_file' and 'doc_seqs_file'
    :return: None, the report is printed to stdout
    """
    if not os.path.isfile('all_results.obj') or not os.path.isfile('all_bins.obj'):
        print('creating bins and results')
        # the sequences and name translation are needed only for building the results
        original_names = parse_name_translation(
            '/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/clique_6_pdbs/mini_diagonal_11Nov/minidiagonal_pdbs/translate_names.txt'
        )
        coh_seqs = read_multi_fastas(args['coh_seqs_file'], suffix_to_remove='.A')
        doc_seqs = read_multi_fastas(args['doc_seqs_file'], suffix_to_remove='.B')
        all_results, bins = {}, {}
        for design in coh_seqs:
            r = Result(design, coh_seqs[design], doc_seqs[design], 0, j=True, originals=original_names)
            n_pos = r.coh_switch.count('p')
            n_charged = r.coh_switch.count('n') + n_pos
            # NOTE(review): a design is dropped only when BOTH ranges are violated; confirm that
            # 'and' (rather than 'or') is the intended combination
            if not 2 < n_charged < 8 and not 2 <= n_pos <= 3:
                continue
            all_results[design] = r
            d_sw = r.coh_switch + '-' + r.doc_switch + '-' + r.doc_wt
            bins.setdefault(d_sw, []).append(r)
        with open('all_results.obj', 'wb') as w_obj:
            pickle.dump(all_results, w_obj)
        with open('all_bins.obj', 'wb') as w_obj:
            pickle.dump(bins, w_obj)
    else:
        print('reading results')
        with open('all_results.obj', 'rb') as r_obj:
            all_results = pickle.load(r_obj)
        with open('all_bins.obj', 'rb') as r_obj:
            bins = pickle.load(r_obj)
    print('found %i bins' % len(bins))

    graph_file = 'graph_%i_%i.obj' % (args['diff_by'], args['doc_diff_by'])
    cliques_file = 'max_cliques_%i_%i.obj' % (args['diff_by'], args['doc_diff_by'])

    if not os.path.isfile(graph_file):
        print('creating graph')
        G = nx.Graph()
        G.add_nodes_from(bins)
        print('found %i nodes' % G.number_of_nodes())
        nodes = list(G.nodes())
        # both (n1, n2) and (n2, n1) are examined on purpose: only the second dockerin switch is
        # passed through switch_symm_changer, so the test is not obviously symmetric
        for n1 in nodes:
            coh_sw_1, doc_sw_1, doc_wt_1 = n1.split('-')[:3]
            for n2 in nodes:
                if n1 == n2:
                    continue
                coh_sw_2, doc_sw_2, doc_wt_2 = n2.split('-')[:3]
                doc_diff = 1 if are_docs_from_diff_clusters(doc_wt_1, doc_wt_2) else 0
                symm_switch = switch_symm_changer(doc_sw_2)
                if (switches_differ({'diff_by': args['diff_by']}, coh_sw_1, coh_sw_2) >= args['diff_by']
                        and switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, doc_sw_2) + doc_diff
                        >= args['doc_diff_by']
                        and switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, symm_switch) + doc_diff
                        >= args['doc_diff_by']):
                    G.add_edge(n1, n2)
        with open(graph_file, 'wb') as w_obj:
            pickle.dump(G, w_obj)
    else:
        print('reading graph')
        with open(graph_file, 'rb') as r_obj:
            G = pickle.load(r_obj)

    if not os.path.isfile(cliques_file):
        cliques = list(nx.find_cliques(G))
        # default=0 keeps an empty graph from raising ValueError in max()
        max_len = max((len(a) for a in cliques), default=0)
        max_cliques = [a for a in cliques if len(a) == max_len]
        print(
            'there are %i cliques with %i structures in each for diff_by=%i doc_diff_by=%i'
            % (len(max_cliques), max_len, args['diff_by'], args['doc_diff_by']))
        with open(cliques_file, 'wb') as w_obj:
            pickle.dump(max_cliques, w_obj)
    else:
        print('reading cliques')
        with open(cliques_file, 'rb') as r_obj:
            max_cliques = pickle.load(r_obj)

    # design name -> number of (clique, bin) memberships over all maximum cliques
    occurrences = {a.name: 0 for clq in max_cliques for k in clq for a in bins[k]}
    for clq in max_cliques:
        print('in clq', clq)
        for k in clq:
            # sorted so the listing is identical between runs (set order is not)
            print('\n'.join(sorted({a.name for a in bins[k]})))
            for a in bins[k]:
                occurrences[a.name] += 1
    for w in sorted(occurrences, key=occurrences.get):
        print(w, occurrences[w])