def get_design_data():
    """
    :return: goes over all designed coh-doc pairs and returns a DF similar to the one used to create the decision tree
    """
    dsn_cohs = read_multi_fastas(design_data_root+'all_designed_cohs.fasta', suffix_to_remove='.')
    dsn_docs = read_multi_fastas(design_data_root+'all_designed_docs.fasta', suffix_to_remove='.')
    df_ = pd.DataFrame(columns=columns, index=range(1, len(dsn_cohs) + 1))

    interface_positions = parse_interface_positions()

    for i, doc_seq in enumerate(dsn_docs.values()):
        coh_seq = dsn_cohs[doc_seq.name]
        doc_model = doc_seq.name.split('A_')[1].split('_')[0]
        coh_identities = {typ: coh_seq[pos] for typ, pos in interface_positions['coh']['1ohz'].items()}
        doc_identities = {typ: doc_seq[pos] for typ, pos in interface_positions['doc'][doc_model].items()}

        coh_core = [core_res_to_identity(coh_identities[v], 'coh') for v in ordered_positions['coh'] if 'core' in v]
        coh_rim = [rim_res_to_type_binary(coh_identities[v]) for v in ordered_positions['coh'] if 'rim' in v]
        doc_core = [core_res_to_identity(doc_identities[v], 'doc') for v in ordered_positions['doc'] if 'core' in v]
        doc_rim = [rim_res_to_type_binary(doc_identities[v]) for v in ordered_positions['doc'] if 'rim' in v]

        coh_core_list = [a for b in coh_core for a in b]
        coh_rim_list = [a for b in coh_rim for a in b]

        doc_core_list = [a for b in doc_core for a in b]
        doc_rim_list = [a for b in doc_rim for a in b]

        df_.loc[i+1] = [coh_seq.name, doc_seq.name, 0, 0] + coh_core_list + coh_rim_list + \
                        doc_core_list + doc_rim_list + [None]

    return df_
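
# The nested-list flattening above (one encoded list per interface position,
# concatenated into a single flat feature vector) as a minimal, self-contained
# sketch; `encoded` is mock data, not the real encoder output.
from itertools import chain

encoded = [[1, 0, 0], [0, 1, 0]]            # e.g. one one-hot vector per position
flat = list(chain.from_iterable(encoded))   # -> [1, 0, 0, 0, 1, 0]
assert flat == [1, 0, 0, 0, 1, 0]
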
def parse_binding_data() -> pd.DataFrame:
    """
    :return: data frame 'coh_name', 'doc_name', 'coh_seq', 'doc_seq', 'binders' for Rachel's data
    """
    from _binding_data import binding_data
    rachel_root = '/home/labs/fleishman/jonathaw/decision_tree/'
    cohs = read_multi_fastas(rachel_root+'cohesins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/', lower=True)
    docs = read_multi_fastas(rachel_root+'dockerins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/', lower=True)
    rachel_bind = binding_data()
    vered_bind = parse_vered_binding()
    result = pd.DataFrame(columns=['coh_name', 'doc_name', 'coh_seq', 'doc_seq', 'binders'])
    i = 1
    for coh, docs_dict in rachel_bind.items():
        for doc, res in docs_dict.items():
            result.loc[i] = [coh, doc, cohs[coh], docs[doc], res]
            i += 1
    for coh, docs_dict in vered_bind.items():
        for doc, res in docs_dict.items():
            result.loc[i] = [coh, doc, cohs[coh], docs[doc], res == 1]
            i += 1
    for name in ['1ohz', '2b59', '2ozn', '2vn5', '2y3n', '3ul4', '4fl4', '4fl5', '4dh2', '4uyp', '5new']:
        result.loc[i] = [name, name, cohs[name], docs[name], True]
        i += 1
    print('there are %i rows in the data' % (i-1))
    return result
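
# A hedged sketch of the row-by-row DataFrame construction used in
# parse_binding_data(), on toy data (the names below are illustrative).
import pandas as pd

toy = pd.DataFrame(columns=['coh_name', 'doc_name', 'binders'])
for i, (coh, doc, bound) in enumerate([('cohA', 'docA', True), ('cohB', 'docB', False)], start=1):
    toy.loc[i] = [coh, doc, bound]
print(toy)  # two rows, indexed from 1 as in the function above
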
def analyse_cliques(cliques):
    """
    :param cliques: list of cliques
    :return: prints an analysis
    """
    coh_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/reclique_18Nov/stabilisation/all_stabilised/all_j_st_cohs.fasta', '_st.A')
    doc_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/reclique_18Nov/stabilisation/all_stabilised/all_j_st_docs.fasta', '_st.B')
    cliques_by_charges = parse_cliques_lists('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/reclique_18Nov/stabilisation/all_stabilised/cliques_2_1.txt', remove='_st')
    original_names = parse_name_translation('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/clique_6_pdbs/mini_diagonal_11Nov/minidiagonal_pdbs/translate_names.txt')
    clqs_by_len = {}
    for clq in cliques:
        clqs_by_len.setdefault(len(clq), []).append(clq)
    designs_in_all_clqs = []
    for length in range(10, 6, -1):  # only analyse the longest cliques (lengths 7-10)
        if length not in clqs_by_len.keys():
            continue
        for clq in clqs_by_len[length]:
            coh_diffs, doc_diffs, doc_diffs_symm, doc_bb_diffs = [], [], [], []
            print('\n\n\nfor clq', clq)
            for mem1 in clq:
                designs_in_all_clqs.append(mem1)
                wt_doc = original_names[mem1[1]+'.pdb.gz'][10:14]
                res1 = Result(mem1, coh_seqs[mem1[0]], doc_seqs[mem1[1]], 1, j=True, doc_wt=wt_doc)
                res1_doc_symm = switch_symm_changer(res1.doc_switch)
                for mem2 in clq:
                    if mem1 != mem2:
                        wt_doc = original_names[mem2[1]+'.pdb.gz'][10:14]
                        res2 = Result(mem2, coh_seqs[mem2[0]], doc_seqs[mem2[1]], 1, j=True, doc_wt=wt_doc)
                        doc_bb_diffs.append(are_docs_from_diff_clusters(res1.doc_wt, res2.doc_wt))
                        coh_diffs.append(switches_differ({}, res1.coh_switch, res2.coh_switch))
                        doc_diffs.append(switches_differ({}, res1.doc_switch, res2.doc_switch))
                        doc_diffs_symm.append(switches_differ({}, res1_doc_symm, res2.doc_switch))
                        print('results', res1)
                        print('results', res2)
                        print('docs diff', switches_differ({}, res1.doc_switch, res2.doc_switch))
                        print('doc symm diff', switches_differ({}, res1_doc_symm, res2.doc_switch))
                        print('doc BB dif', are_docs_from_diff_clusters(res1.doc_wt, res2.doc_wt))
                        print('coh diff', switches_differ({}, res1.coh_switch, res2.coh_switch))
            print('for clq %r found the following results:' % clq)
            print('doc_bb_diffs', doc_bb_diffs)
            print('doc_diffs', doc_diffs)
            print('doc_diffs_symm', doc_diffs_symm)
            print('coh_diffs', coh_diffs)
            print('total', sum([1 for a in doc_bb_diffs if a] + doc_diffs + doc_diffs_symm + coh_diffs))
    all_cohs = list(set([a[0] for a in designs_in_all_clqs]))
    all_docs = list(set([a[1] for a in designs_in_all_clqs]))
    print('these are all the cohs I need: %s, total %i' % (', '.join(all_cohs), len(all_cohs)))
    print('these are all the docs I need: %s, total %i' % (', '.join(all_docs), len(all_docs)))
    print('LONGEST CLIQUES FOUND ARE %i' % max([len(clq) for clq in cliques]))
    coh_doc_purples = creat_coh_doc_purples()
    for clq in clqs_by_len[max(list(clqs_by_len.keys()))]:
        print('clq', clq)
        cohs = [a[0] for a in clq]
        docs = [a[1] for a in clq]
        df = pd.DataFrame(index=docs, columns=cohs)
        for coh in cohs:
            for doc in docs:
                df.loc[doc, coh] = coh_doc_purples[coh][doc]
        show_prediction_heat_map(df)
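
# The length-bucketing at the top of analyse_cliques(), sketched with
# defaultdict so no lengths need to be pre-seeded; `cliques` here is toy data.
from collections import defaultdict

cliques = [['a'], ['b', 'c'], ['d', 'e']]
by_len = defaultdict(list)
for clq in cliques:
    by_len[len(clq)].append(clq)
assert by_len[2] == [['b', 'c'], ['d', 'e']]
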
def retrive_relevant_poses() -> (dict, dict):
    """
    :return: seq dicts for cohs and docs, holding only the relevqant positions, determined by 1OHZ
    """
    cohs_old = read_multi_fastas(root_path + 'cohesins_from_rachel.fasta_aln',
                                 suffix_to_remove='/')
    docs_old = read_multi_fastas(root_path + 'dockerins_from_rachel.fasta_aln',
                                 suffix_to_remove='/')

    coh_1ohz = cohs_old['1OHZ']
    coh_poses = [
        coh_1ohz.non_aligned_position_at_aligned(p) for p in coh_poses_1ohz
    ]
    doc_1ohz = docs_old['1OHZ']
    doc_poses = [
        doc_1ohz.non_aligned_position_at_aligned(p) for p in doc_poses_1ohz
    ]

    cohs_new, docs_new = {}, {}

    for coh, res in cohs_old.items():
        cohs_new[coh] = AASeq(string=''.join(
            res.get_aligned_positions(coh_poses)),
                              name=coh)
    for doc, res in docs_old.items():
        docs_new[doc] = AASeq(string=''.join(
            res.get_aligned_positions(doc_poses)),
                              name=doc)
    return cohs_new, docs_new
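
# Illustrative sketch of mapping an aligned (gapped) column to its ungapped
# residue index, which is what AASeq.non_aligned_position_at_aligned() is
# assumed to do above; the real AASeq API may differ in details.
def non_aligned_at_aligned(aligned_seq: str, aligned_pos: int) -> int:
    """1-based ungapped index of the residue in aligned column aligned_pos."""
    return len(aligned_seq[:aligned_pos].replace('-', ''))

assert non_aligned_at_aligned('A-CD', 3) == 2  # column 3 holds 'C', the 2nd residue
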
def post_pred_cliques(args):

    run_filters = generate_run_filters(args={'ddg': 25.0, 'sasa': 1400, 'shape': 0.6, 'packstat': 0.6, 'buried_2': 3})

    if not os.path.isfile('./all_data.obj'):
        sc_files = [a for a in os.listdir('./') if '.score' in a]
        cohs_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/all_cohs.fasta')
        docs_seqs = read_multi_fastas('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/all_docs.fasta')
        results = []
        for sc_file in sc_files:
            seq_name = '_'.join(sc_file.split('_')[1:8])
            coh_name = seq_name+'.pdb.gz.A'
            doc_name = seq_name+'.pdb.gz.B'
            sc_dict = score2dict(sc_file)
            ynum = re.search('y[0-9]{3}', sc_file).group(0)
            passed, failed = all_who_pass_run_filters(args, sc_dict, run_filters)
            if len(passed) >= args['purples_threshold']:
                r = Result(seq_name, cohs_seqs[coh_name], docs_seqs[doc_name], len(passed))
                results.append(r)
        with open('./all_data.obj', 'wb') as fout:
            pickle.dump(results, fout)
    else:
        with open('./all_data.obj', 'rb') as fin:
            results = pickle.load(fin)

    if not os.path.isfile('./graph.obj'):
        result_dict = {i+1: r for i, r in enumerate(results)}
        G = nx.Graph()
        G.add_nodes_from(result_dict.keys())
        for n1 in G.nodes():  # nodes_iter() was removed in networkx 2.x; nodes() iterates in both
            for n2 in G.nodes():
                if n1 != n2:
                    coh_sw_1, coh_sw_2 = result_dict[n1].coh_switch, result_dict[n2].coh_switch
                    doc_sw_1, doc_sw_2 = result_dict[n1].doc_switch, result_dict[n2].doc_switch
                    doc_wt_1, doc_wt_2 = result_dict[n1].doc_wt, result_dict[n2].doc_wt
                    doc_diff = 1 if are_docs_from_diff_clusters(doc_wt_1, doc_wt_2) else 0
                    symm_switch = switch_symm_changer(doc_sw_2)
                    if switches_differ({'diff_by': args['diff_by']}, coh_sw_1, coh_sw_2) >= args['diff_by'] and \
                            switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, doc_sw_2) + doc_diff >= args['doc_diff_by'] and \
                            switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, symm_switch) + doc_diff >= args['doc_diff_by']:
                        G.add_edge(n1, n2)
                        print('adding edge\n', result_dict[n1], '\n', result_dict[n2])
                    else:
                        print('NOT\n', result_dict[n1], '\n', result_dict[n2])
        cliques = list(nx.find_cliques(G))
        max_len = max(len(a) for a in cliques)
        max_cliques = [a for a in cliques if len(a) == max_len]
        for clq in max_cliques:
            print(clq, '\n', '\n'.join([str(result_dict[a]) for a in clq]))
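
# Minimal networkx sketch of the compatibility-graph / maximum-clique pattern
# used in post_pred_cliques(); the parity test stands in for the real
# switch-difference criteria.
import itertools
import networkx as nx

G = nx.Graph()
G.add_nodes_from([1, 2, 3, 4])
for n1, n2 in itertools.combinations(G.nodes(), 2):
    if (n1 + n2) % 2 == 1:  # stand-in compatibility test
        G.add_edge(n1, n2)
cliques = list(nx.find_cliques(G))
max_len = max(len(c) for c in cliques)
print([c for c in cliques if len(c) == max_len])
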
def main():
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    parser = argparse.ArgumentParser()
    parser.add_argument('-gremlin_file',
                        type=str,
                        help='full path to Gremlin output')
    parser.add_argument('-MSA', type=str, help='full path to the MSA file')
    parser.add_argument('-probability_threshold',
                        type=float,
                        help='threshold above which probability is considered')
    parser.add_argument('-query_name',
                        type=str,
                        help='query name as it is written in the MSA')
    args = vars(parser.parse_args())

    gremlin = parse_gremlin(args['gremlin_file'],
                            args['probability_threshold'])
    msa = read_multi_fastas(args['MSA'], add_aligned=True)

    with open(args['query_name'] + '.gssm', 'w+') as fout:
        for k, v in gremlin.items():
            iden_frq = create_identitiy_frequency_df(k, msa,
                                                     args['query_name'])
            fout.write('pos_1 %i pos_2 %i probability %f\n' % (k[0], k[1], v))
            fout.write(str(iden_frq) + '\n')
def validate(args):
    # original_seqs = read_multi_fastas(args['original_seqs_file'], suffix_to_remove='_')
    DNA_seqs = read_multi_fastas(args['DNA_seqs_file'], suffix_to_remove='.')

    for k, v in DNA_seqs.items():
        # assert original_seqs[k].get_seq() in DNA2AA(v.get_seq())
        if not gen9_standards(v.get_seq()):
            print('seq name %s does not comply with Gen9 standards' % k)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-coh_files', nargs='+')
    parser.add_argument('-doc_files', nargs='+')
    parser.add_argument('-threshold', type=float, default=30.0)
    args = vars(parser.parse_args())

    cohs = OrderedDict({
        k: v
        for a in args['coh_files'] for k, v in read_multi_fastas(a).items()
    })
    docs = OrderedDict({
        k: v
        for a in args['doc_files'] for k, v in read_multi_fastas(a).items()
    })

    mw_df = pd.DataFrame(columns=['coh_seq', 'coh_weight'] + list(docs.keys()))
    # add the Xyn flank to each doc once, outside the coh loop, so it is not
    # appended repeatedly on every iteration
    for doc in docs.values():
        doc.add_prefix(flanks['doc']['xyn'])
    for coh in cohs.values():
        coh.add_prefix(flanks['coh']['cbm'])  # call the method rather than overwrite it
        coh_weight = coh.calc_molecular_weight()
        print(coh.get_seq(), coh_weight)
        weights_combined = [coh_weight + doc.calc_molecular_weight() for doc in docs.values()]
        mw_df.loc[coh.name] = [coh, coh_weight] + weights_combined
    print(mw_df)

    diffs = []
    for coh1 in cohs.keys():
        for doc1 in docs.keys():
            coh_doc_1 = mw_df.loc[coh1, doc1]
            for coh2 in cohs.keys():
                for doc2 in docs.keys():
                    if coh1 != coh2 and doc1 != doc2:
                        diff = abs(coh_doc_1 - mw_df.loc[coh2, doc2])
                        diffs.append(diff)
                        if diff <= args['threshold']:
                            print('%s %s and %s %s have a weight difference of only %f' % (coh1, doc1, coh2, doc2, diff))

    plt.boxplot(diffs)
    plt.show()
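
# Sketch of the all-vs-all weight comparison above with itertools.product on
# toy weights, instead of the mw_df lookups.
from itertools import product

weights = {('c1', 'd1'): 30000.0, ('c2', 'd2'): 30010.0}
for (pair1, w1), (pair2, w2) in product(weights.items(), repeat=2):
    if pair1[0] != pair2[0] and pair1[1] != pair2[1]:
        print(pair1, pair2, abs(w1 - w2))
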
def add_primers_to_all(args):
    DNA_seqs = read_multi_fastas(args['DNA_seqs_file'])
    for k, v in DNA_seqs.items():
        print('>%s' % k)
        print(add_primers(v.get_seq(), args['type']))
def validate_data_frame(data_df: pd.DataFrame, prepared_df: pd.DataFrame) -> None:
    """
    :param data_df: binding data frame
    :param prepared_df: binary data frame
    :return: prints if there is something wrong...
    """
    rachel_root = '/home/labs/fleishman/jonathaw/decision_tree/'
    cohs = read_multi_fastas(rachel_root+'cohesins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/')
    docs = read_multi_fastas(rachel_root+'dockerins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/')
    # coh_1ohz = cohs['1OHZ']
    # doc_1ohz = docs['1OHZ']
    crystal_names = ['1ohz', '2b59', '2ozn', '2vn5', '2y3n', '3ul4', '4fl4', '4fl5', '4dh2', '4uyp', '5new']
    coh_crys_seqs = [c for c in cohs.values() if c.name in crystal_names]
    doc_crys_seqs = [d for d in docs.values() if d.name in crystal_names]
    # coh_poses = [coh_1ohz.non_aligned_position_at_aligned(p) for p in coh_poses_1ohz]
    # doc_poses = [doc_1ohz.non_aligned_position_at_aligned(p) for p in doc_poses_1ohz]
    features = list(prepared_df.columns[4:-1])

    interface_positions = parse_interface_positions()
    coh_poses = {coh: {typ: cohs[coh].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()} for coh, typos
                 in interface_positions['coh'].items()}
    doc_poses = {doc: {typ: docs[doc].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()} for doc, typos
                 in interface_positions['doc'].items()}

    for i in range(1, len(data_df.index) + 1):  # rows are indexed from 1
        # i = len(data_df.index)
        print('i is %i' % i)
        print(data_df.loc[i])
        if data_df.loc[i]['coh_name'] != prepared_df.loc[i]['coh_name'] or \
                data_df.loc[i]['doc_name'] != prepared_df.loc[i]['doc_name']:
            print('not the same names', data_df.loc[i]['doc_name'], prepared_df.loc[i]['doc_name'])
            sys.exit()
        coh_seq = data_df.loc[i]['coh_seq']
        # coh_q_poses = coh_seq.get_aligned_positions(coh_poses)

        doc_seq = data_df.loc[i]['doc_seq']
        # doc_q_poses = doc_seq.get_aligned_positions(doc_poses)

        prepared_row = row_to_dict(prepared_df.loc[i])

        similar_coh, coh_iden = highest_seq_similarity(coh_crys_seqs, data_df.loc[i]['coh_seq'])
        similar_doc, doc_iden = highest_seq_similarity(doc_crys_seqs, data_df.loc[i]['doc_seq'])
        coh_identities = {typ: data_df.loc[i]['coh_seq'].get_aligned_positions([pos])[0] for typ, pos in
                          coh_poses[similar_coh.name].items()}
        doc_identities = {typ: data_df.loc[i]['doc_seq'].get_aligned_positions([pos])[0] for typ, pos in
                          doc_poses[similar_doc.name].items()}

        # for pos in positions['core_coh']:
        #     if coh_q_poses[pos] != prepared_row['coh_core_%i' % pos]:
        #         print('not the same coh query pos differs from row', pos, coh_q_poses[pos], prepared_row['coh_core_%i' % pos])
        #         sys.exit()

        # for pos in positions['core_doc']:
        #     if doc_q_poses[pos] != prepared_row['doc_core_%i' % pos]:
        #         print('not the same doc query pos differs from row')
        #         sys.exit()

        # for pos in positions['rim_coh']:
        #     if [k for k, v in type_to_res.items() if coh_q_poses[pos] in v][0] != prepared_row['coh_rim_%i' % pos] and \
        #             not ([k for k, v in type_to_res.items() if coh_q_poses[pos] in v][0] == 'NA' and
        #                          prepared_row['coh_rim_%i' % pos] == '-'):
        #         print('breaking', [k for k, v in type_to_res.items() if coh_q_poses[pos] in v][0],
        #               prepared_row['coh_rim_%i' % pos])
        #         sys.exit()

        for fea in features:
            if prepared_df.loc[i][fea] not in [0, 1]:
                print('found problem at row', i, prepared_df.loc[i][fea])

        # break

    print('your df is validated')
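
# Hedged sketch of what highest_seq_similarity() is taken to compute above:
# the crystal sequence with the greatest fraction of identical aligned
# positions to the query (the real helper may treat gaps differently).
def seq_identity(a: str, b: str) -> float:
    assert len(a) == len(b)
    return sum(x == y for x, y in zip(a, b)) / len(a)

def highest_identity(candidates: dict, query: str) -> tuple:
    name = max(candidates, key=lambda n: seq_identity(candidates[n], query))
    return name, seq_identity(candidates[name], query)

assert highest_identity({'1ohz': 'AKD', '2b59': 'AAD'}, 'AKD') == ('1ohz', 1.0)
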
def minidiagonal_cliques(args):
    original_names = parse_name_translation('/home/labs/fleishman/jonathaw/no_backup/designs/multi_docs_15Oct/recliques_4Nov/clique_6_pdbs/mini_diagonal_11Nov/minidiagonal_pdbs/translate_names.txt')
    coh_seqs = read_multi_fastas(args['coh_seqs_file'], suffix_to_remove='.A')
    doc_seqs = read_multi_fastas(args['doc_seqs_file'], suffix_to_remove='.B')
    if not os.path.isfile('all_results.obj') or not os.path.isfile('all_bins.obj'):
        print('creating bins and results')
        all_results, bins = {}, {}
        for design in coh_seqs.keys():
            r = Result(design, coh_seqs[design], doc_seqs[design], 0, j=True, originals=original_names)
            # keep only designs whose coh switch has 3-7 charged positions or 2-3 positives
            if not 2 < r.coh_switch.count('n') + r.coh_switch.count('p') < 8 and not 2 <= r.coh_switch.count('p') <= 3:
                continue
            all_results[design] = r
            d_sw = r.coh_switch+'-'+r.doc_switch+'-'+r.doc_wt
            if d_sw not in bins.keys():
                bins[d_sw] = []
            bins[d_sw].append(r)
        with open('all_results.obj', 'wb') as w_obj:
            pickle.dump(all_results, w_obj)
        with open('all_bins.obj', 'wb') as w_obj:
            pickle.dump(bins, w_obj)
    else:
        print('reading results')
        with open('all_results.obj', 'rb') as r_obj:
            all_results = pickle.load(r_obj)
        with open('all_bins.obj', 'rb') as r_obj:
            bins = pickle.load(r_obj)

    print('found %i bins' % len(bins))

    if not os.path.isfile('graph_%i_%i.obj' % (args['diff_by'], args['doc_diff_by'])):
        print('creating graph')
        G = nx.Graph()
        G.add_nodes_from(bins.keys())
        print('found %i nodes' % G.number_of_nodes())
        for n1 in G.nodes():  # nodes_iter() was removed in networkx 2.x; nodes() iterates in both
            for n2 in G.nodes():
                if n1 != n2:
                    coh_sw_1, coh_sw_2 = n1.split('-')[0], n2.split('-')[0]
                    doc_sw_1, doc_sw_2 = n1.split('-')[1], n2.split('-')[1]
                    doc_wt_1, doc_wt_2 = n1.split('-')[2], n2.split('-')[2]
                    doc_diff = 1 if are_docs_from_diff_clusters(doc_wt_1, doc_wt_2) else 0
                    symm_switch = switch_symm_changer(doc_sw_2)
                    if switches_differ({'diff_by': args['diff_by']}, coh_sw_1, coh_sw_2) >= args['diff_by'] and \
                            switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, doc_sw_2) + doc_diff >= args['doc_diff_by'] and \
                            switches_differ({'diff_by': args['doc_diff_by']}, doc_sw_1, symm_switch) + doc_diff >= args['doc_diff_by']:
                        G.add_edge(n1, n2)
        with open('graph_%i_%i.obj' % (args['diff_by'], args['doc_diff_by']), 'wb') as w_obj:
            pickle.dump(G, w_obj)
    else:
        print('reading graph')
        with open('graph_%i_%i.obj' % (args['diff_by'], args['doc_diff_by']), 'rb') as r_obj:
            G = pickle.load(r_obj)

    if not os.path.isfile('max_cliques_%i_%i.obj' % (args['diff_by'], args['doc_diff_by'])):
        cliques = list(nx.find_cliques(G))
        max_len = max(len(a) for a in cliques)
        max_cliques = [a for a in cliques if len(a) == max_len]
        print('there are %i cliques with %i structures in each for diff_by=%i doc_diff_by=%i' %
              (len(max_cliques), max_len, args['diff_by'], args['doc_diff_by']))
        with open('max_cliques_%i_%i.obj' % (args['diff_by'], args['doc_diff_by']), 'wb') as w_obj:
            pickle.dump(max_cliques, w_obj)
    else:
        print('reading cliques')
        with open('max_cliques_%i_%i.obj' % (args['diff_by'], args['doc_diff_by']), 'rb') as r_obj:
            max_cliques = pickle.load(r_obj)

    occurrences = {a.name: 0 for clq in max_cliques for k in clq for a in bins[k]}
    for clq in max_cliques:
        print('in clq', clq)
        for k in clq:
            # print(bins[k][0].name)
            print('\n'.join(set([a.name for a in bins[k]])))
            for a in bins[k]:
                occurrences[a.name] += 1
    for w in sorted(occurrences, key=occurrences.get, reverse=False):
        print(w, occurrences[w])
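
# The switch-string binning in minidiagonal_cliques(), sketched with
# dict.setdefault instead of the explicit membership test; the tuples are toy
# (coh_switch, doc_switch, doc_wt) triples.
bins_sketch = {}
for r in [('nnp', 'ppn', '1ohz'), ('nnp', 'ppn', '1ohz'), ('ppp', 'nnn', '2b59')]:
    bins_sketch.setdefault('-'.join(r), []).append(r)
print({k: len(v) for k, v in bins_sketch.items()})  # {'nnp-ppn-1ohz': 2, 'ppp-nnn-2b59': 1}
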


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-score_file')
    parser.add_argument('-coh_seqs_file')
    parser.add_argument('-doc_seqs_file')
    parser.add_argument('-mode')
    parser.add_argument('-n', type=int, default=1)
    parser.add_argument('-diff_by', type=int, default=2)
    parser.add_argument('-doc_diff_by', type=int, default=1)
    parser.add_argument('-score_dir', type=str, default='./')
    parser.add_argument('-purples_threshold', type=int, default=50)
    args = vars(parser.parse_args())

    if args['mode'] != 'bins_diagonal' and args['mode'] != 'post_pred_cliques' and args['mode'] != 'minidiagonal_cliques':
        scores = score2dict(args['score_file'])
        run_filters = ''#generate_run_filters()
        coh_seq_dict = read_multi_fastas(args['coh_seqs_file'], suffix_to_remove='.pdb.gz')
        doc_seq_dict = read_multi_fastas(args['doc_seqs_file'], suffix_to_remove='.pdb.gz')

    if args['mode'] == 'switches_n_cliques':
        switches, num_bins = make_switches(args, scores, run_filters, coh_seq_dict)
        max_cliques = best_cliques(args, list(switches.keys()))

        with open('switches.obj', 'wb') as sw_file:
            pickle.dump(switches, sw_file)
        with open('max_cliques.obj', 'wb') as clq_file:
            pickle.dump(max_cliques, clq_file)

    elif args['mode'] == 'choose_by_identity':
        with open('switches.obj', 'rb') as sw_in:
            switches = pickle.load(sw_in)
        with open('max_cliques.obj', 'rb') as clq_file:
            max_cliques = pickle.load(clq_file)
def prepare_data(in_df: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame):
    """
    :rtype: (pd.DataFrame, pd.DataFrame)
    """
    rachel_root = '/home/labs/fleishman/jonathaw/decision_tree/'
    cohs_non_aln = read_multi_fastas(rachel_root+'cohesins_from_rachel_and_vered.fasta', suffix_to_remove='/', lower=True)
    docs_non_aln = read_multi_fastas(rachel_root+'dockerins_from_rachel_and_vered.fasta', suffix_to_remove='/', lower=True)

    cohs = read_multi_fastas(rachel_root+'cohesins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/', lower=True)
    docs = read_multi_fastas(rachel_root+'dockerins_from_rachel_and_vered.fasta_aln', suffix_to_remove='/', lower=True)
    interface_positions = parse_interface_positions()
    coh_poses = {coh: {typ: cohs[coh].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()} for coh, typos
                 in interface_positions['coh'].items()}
    doc_poses = {doc: {typ: docs[doc].non_aligned_position_at_aligned(pos) for typ, pos in typos.items()} for doc, typos
                 in interface_positions['doc'].items()}

    validate_aligned_non_aligned_interface_positions(interface_positions['coh'], cohs, cohs_non_aln)
    validate_aligned_non_aligned_interface_positions(interface_positions['doc'], docs, docs_non_aln)

    crystal_names = ['1ohz', '2b59', '2ozn', '2vn5', '2y3n', '3ul4', '4fl4', '4fl5', '4dh2', '4uyp', '5new']
    coh_crys_seqs = [c for c in cohs.values() if c.name in crystal_names]
    doc_crys_seqs = [d for d in docs.values() if d.name in crystal_names]
    # columns = ['coh_name', 'doc_name', 'coh_seq', 'doc_seq'] + \
    #           ['%s_%s' % (typ, aa) for typ in ordered_positions['coh'] if 'core' in typ for aa in aas] + \
    #           ['%s_%s' % (typ, aa) for typ in ordered_positions['coh'] if 'rim' in typ for aa in types] + \
    #           ['%s_%s' % (typ, aa) for typ in ordered_positions['doc'] if 'core' in typ for aa in aas] + \
    #           ['%s_%s' % (typ, aa) for typ in ordered_positions['doc'] if 'rim' in typ for aa in types] + ['binders']

    out_df = pd.DataFrame(index=range(1, len(in_df.index) + 1), columns=columns)

    id_columns = ['coh_name', 'doc_name', 'coh_seq', 'doc_seq'] + ordered_positions['coh'] + ordered_positions['doc']
    identities_df = pd.DataFrame(index=range(1, len(in_df.index) + 1), columns=id_columns)

    for i in range(1, len(in_df.index)+1):
        # find which crystal coh+doc are most similar
        similar_coh, coh_iden = highest_seq_similarity(coh_crys_seqs, in_df.loc[i]['coh_seq'])
        similar_doc, doc_iden = highest_seq_similarity(doc_crys_seqs, in_df.loc[i]['doc_seq'])

        # get aligned positions according to interface_positions
        coh_identities = {typ: in_df.loc[i]['coh_seq'].get_aligned_positions([pos])[0] for typ, pos in
                          coh_poses[similar_coh.name].items()}
        doc_identities = {typ: in_df.loc[i]['doc_seq'].get_aligned_positions([pos])[0] for typ, pos in
                          doc_poses[similar_doc.name].items()}
        # coh_ = in_df.loc[i]['coh_seq'].get_aligned_positions(coh_poses[similar_coh])
        # doc_ = in_df.loc[i]['doc_seq'].get_aligned_positions(doc_poses[similar_doc])

        coh_core = [core_res_to_identity(coh_identities[v], 'coh') for v in ordered_positions['coh'] if 'core' in v]
        coh_rim = [rim_res_to_type_binary(coh_identities[v]) for v in ordered_positions['coh'] if 'rim' in v]
        doc_core = [core_res_to_identity(doc_identities[v], 'doc') for v in ordered_positions['doc'] if 'core' in v]
        doc_rim = [rim_res_to_type_binary(doc_identities[v]) for v in ordered_positions['doc'] if 'rim' in v]

        coh_core_list = [a for b in coh_core for a in b]
        coh_rim_list = [a for b in coh_rim for a in b]

        doc_core_list = [a for b in doc_core for a in b]
        doc_rim_list = [a for b in doc_rim for a in b]

        out_df.loc[i] = [in_df.loc[i]['coh_name'], in_df.loc[i]['doc_name'], 0, 0] + coh_core_list + coh_rim_list + \
                        doc_core_list + doc_rim_list + [1 if in_df.loc[i]['binders'] else 0]
        identities_df.loc[i] = [in_df.loc[i]['coh_name'], in_df.loc[i]['doc_name'], 0, 0] + \
                               [coh_identities[v] for v in ordered_positions['coh'] if 'core' in v] + \
                               [coh_identities[v] for v in ordered_positions['coh'] if 'rim' in v] + \
                               [doc_identities[v] for v in ordered_positions['doc'] if 'core' in v] + \
                               [doc_identities[v] for v in ordered_positions['doc'] if 'rim' in v]
    return out_df, identities_df
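
# Toy sketch of the per-position one-hot encoding that core_res_to_identity()
# and rim_res_to_type_binary() are assumed to produce (the real alphabets and
# type categories differ).
AAS_SKETCH = ['A', 'D', 'K']  # illustrative alphabet

def one_hot(residue: str) -> list:
    return [1 if aa == residue else 0 for aa in AAS_SKETCH]

assert one_hot('D') == [0, 1, 0]
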
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-mode')
    parser.add_argument('-make_dt', action='store_true', default=False)  # type=bool would treat any string as True
    parser.add_argument('-coh_name', type=str)
    args = vars(parser.parse_args())

    if args['make_dt']:
        print('making decision tree')
        data_df = parse_binding_data()
        prepared_df, identities_df = prepare_data(data_df)
        decision_tree, features = create_decision_tree(prepared_df)
        pickle.dump(decision_tree, open(decision_tree_root+'decision_tree_%s.obj' % time.strftime("%d.%0-m"), 'wb'))
        pickle.dump(features, open(decision_tree_root+'features_%s.obj' % time.strftime("%d.%0-m"), 'wb'))
        pickle.dump(prepared_df, open(decision_tree_root+'prepared_df_%s.obj' % time.strftime("%d.%0-m"), 'wb'))
        pickle.dump(identities_df, open(decision_tree_root+'identities_df_%s.obj' % time.strftime("%d.%0-m"), 'wb'))
    else:
        print('reading decision tree')
        decision_tree = pickle.load(open(decision_tree_root+'decision_tree_%s.obj' % time_to_use, 'rb'))
        features = pickle.load(open(decision_tree_root+'features_%s.obj' % time_to_use, 'rb'))
        prepared_df = pickle.load(open(decision_tree_root+'prepared_df_%s.obj' % time_to_use, 'rb'))
        identities_df = pickle.load(open(decision_tree_root+'identities_df_%s.obj' % time_to_use, 'rb'))

    if args['mode'] == 'k_fold':
        k_fold_test(prepared_df)

    elif args['mode'] == 'validate_df':
        # note: data_df is only defined above when -make_dt is set
        validate_data_frame(data_df, prepared_df)

    elif args['mode'] == 'analyse_identities_df':
        analyse_identity_df(identities_df)

    elif args['mode'] == 'create_dt':
        with open('decision_tree.dot', 'w') as fout:
            print('creating decision tree.dot')
            export_graphviz(decision_tree, out_file=fout, feature_names=features)
            compare_observed_to_predicted(decision_tree, data_df, prepared_df[features])

    elif args['mode'] == 'follow':
        seq_to_follow(prepared_df, '2b59', '2b59')

    elif args['mode'] == 'predict_all_designs_diagonal':
        print('getting design sequences')
        design_df = get_design_data()
        print('making prediction')
        design_df['predict'] = decision_tree.predict(design_df[features])
        with open(design_data_root+'diagonal_prediciton.txt', 'w+') as fout:
            pd.set_option('display.max_rows', None)  # show all rows
            fout.write(str(design_df.loc[design_df['predict'] == 1]['coh_name']))

    elif args['mode'] == 'pickle_design_sequences':
        dsn_cohs = read_multi_fastas(design_data_root+'all_designed_cohs.fasta', suffix_to_remove='.')
        dsn_docs = read_multi_fastas(design_data_root+'all_designed_docs.fasta', suffix_to_remove='.')
        pickle.dump(dsn_cohs, open(design_data_root+'dsn_cohs_%s.obj' % time.strftime("%d.%0-m"), 'wb'))
        pickle.dump(dsn_docs, open(design_data_root+'dsn_docs_%s.obj' % time.strftime("%d.%0-m"), 'wb'))

    elif args['mode'] == 'predict_by_coh':
        design_df = get_design_data_coh_vs_all(args['coh_name'])
        print('predicting!!')
        design_df['predict'] = decision_tree.predict(design_df[features])
        with open(design_data_root+'all_vs_all_decision_tree_6Jan/'+args['coh_name']+'.txt', 'w+') as fout:
            pd.set_option('display.max_rows', len(design_df))
            fout.write(str(design_df.iloc[:, [0, 1, -1]])+'\n')  # name, name and prediction columns
            pd.reset_option('display.max_rows')

    else:
        print('no mode found')
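
# Minimal sklearn sketch of the fit/predict/export cycle the modes above rely
# on; the data and feature names here are placeholders.
from sklearn.tree import DecisionTreeClassifier, export_graphviz

X = [[0, 1], [1, 0], [1, 1], [0, 0]]
y = [1, 0, 1, 0]
clf = DecisionTreeClassifier().fit(X, y)
print(clf.predict([[0, 1]]))  # -> [1]
export_graphviz(clf, out_file='toy_tree.dot', feature_names=['f1', 'f2'])
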
def extract_charge_configuration(seq: AASeq, positions: list):
    res_in_poses = seq.get_positions(positions)
    return [res2charge.get(a, 'c') for a in res_in_poses]
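
# Plain-string sketch of the charge classification above, assuming res2charge
# maps K/R -> 'p' and D/E -> 'n' (anything else counts as neutral 'c').
res2charge_sketch = {'K': 'p', 'R': 'p', 'D': 'n', 'E': 'n'}

def charges(seq: str, positions: list) -> list:
    return [res2charge_sketch.get(seq[p - 1], 'c') for p in positions]  # 1-based positions

assert charges('MKDE', [2, 3, 1]) == ['p', 'n', 'c']
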


if __name__ == '__main__':
    positions = [
        32, 33, 35, 37, 39, 63, 66, 68, 70, 73, 75, 77, 79, 81, 82, 83, 85, 87,
        115, 116, 118, 119, 121, 123, 125, 127
    ]
    fastas = read_multi_fastas(
        '/home/labs/fleishman/jonathaw/data/pssm/cohs/making/1ohz_passed_thresholds.fasta'
    )
    positives, negatives, neutrals, totals = [], [], [], []
    for k, v in fastas.items():
        charge_config = extract_charge_configuration(v, positions)
        positives.append(charge_config.count('p'))
        negatives.append(charge_config.count('n'))
        neutrals.append(charge_config.count('c'))
        totals.append(charge_config.count('p') + charge_config.count('n'))
        if k == '1ohz':
            print('at 1ohz found %i positives' % charge_config.count('p'))
            print('at 1ohz found %i negatives' % charge_config.count('n'))
            print('at 1ohz found %i neutrals' % charge_config.count('c'))
            print('at 1ohz found %i charged in total' %
                  (charge_config.count('p') + charge_config.count('n')))
    bins = range(max(positives + negatives + neutrals) + 1)
    plt.hist(positives, bins=bins, color='b', label='positives')
    plt.hist(negatives, bins=bins, color='r', label='negatives')