Example #1
def assemble_pysb(stmts, data_genes, out_file):
    """Return an assembled PySB model."""
    base_file, _ = os.path.splitext(out_file)
    #stmts = ac.load_statements('%s.pkl' % base_file)
    stmts = preprocess_stmts(stmts, data_genes)

    # This is the "final" set of statements going into the assembler so it
    # makes sense to cache these.
    # This is also the point where index cards can be generated
    ac.dump_statements(stmts, '%s_before_pa.pkl' % base_file)
    assemble_index_cards(stmts, 'output/index_cards')

    # Save a version of statements with no evidence for faster loading
    for s in stmts:
        s.evidence = []
        for ss in s.supports + s.supported_by:
            ss.evidence = []
    ac.dump_statements(stmts, '%s_no_evidence.pkl' % base_file)

    # Assemble model
    pa = PysbAssembler()
    pa.add_statements(stmts)
    pa.make_model(reverse_effects=False)
    #ac.dump_statements(pa.statements, '%s_after_pa.pkl' % base_file)
    # Set context
    set_context(pa)
    # Add observables
    add_observables(pa.model)
    pa.save_model(out_file)
    with open('korkut_pysb.pkl', 'wb') as fh:
        pickle.dump(pa.model, fh)
    #pa.export_model('kappa', '%s.ka' % base_file)
    return pa.model
Example #2
def filter(stmts, cutoff, filename):
    stmts = ac.filter_belief(stmts, cutoff)
    stmts = ac.filter_top_level(stmts)
    stmts = ac.filter_direct(stmts)
    #stmts = ac.filter_enzyme_kinase(stmts)
    ac.dump_statements(stmts, filename)
    return stmts
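A minimal usage sketch for the filter function above; the input pickle name and belief cutoff are illustrative assumptions, not values from the original:

# Hedged sketch: load previously assembled statements (path is illustrative),
# then keep only believable, top-level, direct statements and cache the result.
stmts = ac.load_statements('preassembled_stmts.pkl')
filtered = filter(stmts, cutoff=0.90, filename='filtered_stmts.pkl')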
Example #3
def get_indradb_pa_stmts():
    """Get preassembled INDRA Stmts for PMC articles from INDRA DB.

    DEPRECATED. Get Raw Statements instead.
    """
    # Get the list of all PMCIDs from the corpus metadata
    pmcids = get_ids('pmcid')
    paper_refs = [('pmcid', p) for p in pmcids]
    stmt_jsons = []
    batch_size = 1000
    start = time.time()
    for batch_ix, paper_batch in enumerate(batch_iter(paper_refs, batch_size)):
        if batch_ix <= 5:
            continue
        papers = list(paper_batch)
        print("Querying DB for statements for %d papers" % batch_size)
        batch_start = time.time()
        result = get_statement_jsons_from_papers(papers)
        batch_elapsed = time.time() - batch_start
        batch_jsons = [
            stmt_json for stmt_hash, stmt_json in result['statements'].items()
        ]
        print("Returned %d stmts in %f sec" %
              (len(batch_jsons), batch_elapsed))
        batch_stmts = stmts_from_json(batch_jsons)
        ac.dump_statements(batch_stmts, 'batch_%02d.pkl' % batch_ix)
        stmt_jsons += batch_jsons
    elapsed = time.time() - start
    print("Total time: %f sec, %d papers" % (elapsed, len(paper_refs)))
    stmts = stmts_from_json(stmt_jsons)
    ac.dump_statements(stmts, 'cord19_pmc_stmts.pkl')
    return stmt_jsons
Example #4
def run_assembly(stmts, filename):
    stmts = ac.map_grounding(stmts)
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.filter_human_only(stmts)
    #stmts = ac.expand_families(stmts)
    stmts = ac.filter_gene_list(stmts, gene_names, 'one', allow_families=True)
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts, return_toplevel=False, poolsize=4)
    ac.dump_statements(stmts, filename)
    return stmts
Example #5
def combine_all_stmts(pkl_list, output_file):
    all_stmts = []
    for pkl_file in pkl_list:
        all_stmts.extend(ac.load_statements(pkl_file))
    ac.dump_statements(all_stmts, output_file)
    stmt_json = stmts_to_json(all_stmts)
    output_json = f"{output_file.rsplit('.', maxsplit=1)[0]}.json"
    with open(output_json, 'wt') as f:
        json.dump(stmt_json, f, indent=2)
    return all_stmts
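A hedged usage sketch for combine_all_stmts; the pickle file names are illustrative:

# Combine statements from two illustrative source pickles. A JSON dump with
# the same base name ('combined_stmts.json') is written alongside the pickle.
all_stmts = combine_all_stmts(['reach_stmts.pkl', 'trips_stmts.pkl'],
                              'combined_stmts.pkl')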
Example #6
def assemble_ras_pathway(fname, reader):
    # Make original pathway map
    with open(fname, 'rt') as fh:
        txt = fh.read()
    if reader == 'reach':
        stmts = process_reach(txt)
    elif reader == 'trips':
        stmts = process_trips(txt, reread=True)
    ac.dump_statements(stmts, 'ras_pathway.pkl')
    draw_graph(stmts, 'ras_pathway')
Example #7
def read_extra_sources(out_file):
    sparser_stmts = process_sparser.read_stmts(process_sparser.base_folder)
    #sparser_stmts += \
    #    process_sparser.read_stmts(process_sparser.sentences_folder)
    r3_stmts = process_r3.read_stmts(process_r3.active_forms_files[0])
    trips_stmts = process_trips.read_stmts(process_trips.base_folder)
    phosphosite_stmts = \
        read_phosphosite.read_phosphosite_owl(read_phosphosite.phosphosite_owl_file)
    stmts = trips_stmts + sparser_stmts + r3_stmts + phosphosite_stmts
    ac.dump_statements(stmts, out_file)
    return stmts
Example #8
def assemble_extension(fname_orig, fname, reader):
    with open(fname_orig, 'rt') as fh:
        orig_txt = fh.read()
    with open(fname, 'rt') as fh:
        extension_txt = fh.read()
    txt = '\n'.join([orig_txt, extension_txt])
    if reader == 'reach':
        stmts = process_reach(txt)
    elif reader == 'trips':
        stmts = process_trips(txt, reread=True)
    ac.dump_statements(stmts, 'ras_pathway_extension.pkl')
    draw_graph(stmts, 'ras_pathway_extension')
Example #9
def run_assembly(stmts, save_file):
    stmts = ac.map_grounding(stmts)
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.filter_human_only(stmts)
    stmts = ac.expand_families(stmts)
    stmts = ac.filter_gene_list(stmts, gene_names, 'one')
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts, return_toplevel=False)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_enzyme_kinase(stmts)
    ac.dump_statements(stmts, save_file)
    return stmts
Example #10
def make_cyjs_network(results, model, stmts):
    path_stmts = get_path_stmts(results, model, stmts)
    path_genes = get_path_genes(path_stmts)
    # Get UUIDs to use as filter
    path_uuids = [list(path.keys()) for path in path_stmts]
    all_path_uuids = []
    for p in path_uuids:
        all_path_uuids += p
    #filtered_stmts = ac.filter_gene_list(stmts, path_genes, 'one')
    filtered_stmts = ac.filter_uuid_list(stmts, all_path_uuids)
    ac.dump_statements(filtered_stmts, 'output/korkut_cyjs_model.pkl')
    ca = CyJSAssembler(filtered_stmts)
    cm = ca.make_model()
    ca.set_CCLE_context(['SKMEL28_SKIN'])
    ca.save_json('output/korkut_model')
Example #11
def get_indra_phos_stmts():
    stmts = by_gene_role_type(stmt_type='Phosphorylation')
    stmts += by_gene_role_type(stmt_type='Dephosphorylation')
    stmts = ac.map_grounding(stmts)
    # Expand families before site mapping
    stmts = ac.expand_families(stmts)
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.map_sequence(stmts)
    ac.dump_statements(stmts, 'sources/indra_phos_sitemap.pkl')
    stmts = ac.run_preassembly(stmts,
                               poolsize=4,
                               save='sources/indra_phos_stmts_pre.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    ac.dump_statements(stmts, 'sources/indra_phos_stmts.pkl')
    return stmts
Example #12
def run_preassembly(statements, hierarchies):
    print('%d total statements' % len(statements))
    # Filter to grounded only
    statements = map_onto(statements)
    ac.dump_statements(statements, 'pi_mtg_demo_unfiltered.pkl')
    statements = ac.filter_grounded_only(statements, score_threshold=0.7)

    #statements = ac.filter_by_db_refs(statements, 'UN',
    #    ['conflict', 'food_security', 'precipitation'], policy='one',
    #    match_suffix=True)
    statements = ac.filter_by_db_refs(
        statements,
        'UN', [
            'conflict', 'food_security', 'flooding', 'food_production',
            'human_migration', 'drought', 'food_availability', 'market',
            'food_insecurity'
        ],
        policy='all',
        match_suffix=True)
    assume_polarity(statements)
    statements = filter_has_polarity(statements)

    # Make a Preassembler with the Eidos and TRIPS ontology
    pa = Preassembler(hierarchies, statements)
    # Make a BeliefEngine and run combine duplicates
    be = BeliefEngine()
    unique_stmts = pa.combine_duplicates()
    print('%d unique statements' % len(unique_stmts))
    be.set_prior_probs(unique_stmts)
    # Run combine related
    related_stmts = pa.combine_related(return_toplevel=False)
    be.set_hierarchy_probs(related_stmts)
    #related_stmts = ac.filter_belief(related_stmts, 0.8)
    # Filter to top-level Statements
    top_stmts = ac.filter_top_level(related_stmts)

    pa.stmts = top_stmts
    print('%d top-level statements' % len(top_stmts))
    conflicts = pa.find_contradicts()
    top_stmts = remove_contradicts(top_stmts, conflicts)

    ac.dump_statements(top_stmts, 'pi_mtg_demo.pkl')

    return top_stmts
Example #13
def get_indra_reg_act_stmts():
    try:
        stmts = ac.load_statements('sources/indra_reg_act_stmts.pkl')
        return stmts
    except Exception:
        pass
    stmts = []
    for stmt_type in ('Activation', 'Inhibition', 'ActiveForm'):
        print("Getting %s statements from INDRA DB" % stmt_type)
        stmts += by_gene_role_type(stmt_type=stmt_type)
    stmts = ac.map_grounding(stmts, save='sources/indra_reg_act_gmap.pkl')
    stmts = ac.filter_grounded_only(stmts)
    stmts = ac.run_preassembly(stmts,
                               poolsize=4,
                               save='sources/indra_reg_act_pre.pkl')
    stmts = ac.filter_human_only(stmts)
    stmts = ac.filter_genes_only(stmts, specific_only=True)
    ac.dump_statements(stmts, 'sources/indra_reg_act_stmts.pkl')
    return stmts
Example #14
def assemble_pysb(stmts, data_genes, out_file):
    """Return an assembled PySB model."""
    base_file, _ = os.path.splitext(out_file)
    #stmts = ac.load_statements('%s.pkl' % base_file)
    stmts = preprocess_stmts(stmts, data_genes)

    # Make a SIF model equivalent to the PySB model
    # Useful for making direct comparisons in pathfinding
    sa = SifAssembler(stmts)
    sa.make_model(use_name_as_key=True,
                  include_mods=True,
                  include_complexes=True)
    sif_str = sa.print_model(include_unsigned_edges=True)
    with open('%s_pysb.sif' % base_file, 'wt') as f:
        f.write(sif_str)

    # This is the "final" set of statements going into the assembler so it
    # makes sense to cache these.
    # This is also the point where index cards can be generated
    ac.dump_statements(stmts, '%s_before_pa.pkl' % base_file)
    assemble_index_cards(stmts, 'output/index_cards')

    # Save a version of statements with no evidence for faster loading
    for s in stmts:
        s.evidence = []
        for ss in s.supports + s.supported_by:
            ss.evidence = []
    ac.dump_statements(stmts, '%s_no_evidence.pkl' % base_file)

    # Assemble model
    pa = PysbAssembler()
    pa.add_statements(stmts)
    pa.make_model(reverse_effects=False)
    #ac.dump_statements(pa.statements, '%s_after_pa.pkl' % base_file)
    # Set context
    set_context(pa)
    # Add observables
    add_observables(pa.model)
    pa.save_model(out_file)
    with open('korkut_pysb.pkl', 'wb') as fh:
        pickle.dump(pa.model, fh)
    #pa.export_model('kappa', '%s.ka' % base_file)
    return pa.model
Example #15
def dump_raw_stmts(tr_dicts, stmt_file):
    """Dump all raw stmts in INDRA DB for a given set of TextRef IDs.

    Parameters
    ----------
    tr_dicts : dict of text ref information
        Keys are text ref IDs (ints) mapped to dictionaries of text ref
        metadata.
    stmt_file : str
        Path to file to dump pickled raw statements.

    Returns
    -------
    list of stmts
        Raw INDRA Statements retrieved from the INDRA DB.
    """
    # Get the INDRA Statement JSON for the Statement IDs
    stmts_flat = get_raw_stmts(tr_dicts)
    ac.dump_statements(stmts_flat, stmt_file)
    return stmts_flat
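A hedged usage sketch based on the docstring above; the TextRef IDs and metadata fields are illustrative assumptions:

# Map (hypothetical) TextRef IDs to text ref metadata, then dump the raw
# statements retrieved from the INDRA DB to a pickle file.
tr_dicts = {
    12345: {'pmid': '31000001', 'pmcid': 'PMC7000001'},
    67890: {'pmid': '31000002', 'pmcid': 'PMC7000002'},
}
raw_stmts = dump_raw_stmts(tr_dicts, 'raw_stmts.pkl')
print('%d raw statements dumped' % len(raw_stmts))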
Example #16
def preprocess_db_stmts(stmts, output_file, filter_stmt_site):
    """Take the statements from the database and grounding map them; """
    print("Mapping grounding")
    gmap_stmts = ac.map_grounding(stmts)
    #ac.dump_statements(gmap_stmts, prefix + '_gmap.pkl')
    print("Sorting and filtering")
    # Next, eliminate exact duplicates
    stmts_by_deep_hash = [(s.get_hash(shallow=False), s) for s in gmap_stmts]
    stmts_by_deep_hash.sort(key=lambda x: x[0])
    uniq_stmts = []
    for k, group in itertools.groupby(stmts_by_deep_hash, key=lambda x: x[0]):
        uniq_stmts.append(list(group)[0][1])
    if filter_stmt_site:
        # Filter to statements with residue and position
        site_stmts = [s for s in uniq_stmts if s.residue and s.position]
    else:
        site_stmts = uniq_stmts
    # Organize into a dictionary indexed by site
    ac.dump_statements(site_stmts, output_file)
    return site_stmts
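A hedged usage sketch for preprocess_db_stmts; the input pickle name is illustrative:

# Ground-map database statements, drop exact duplicates, and keep only
# statements that have both a residue and a position.
db_stmts = ac.load_statements('db_raw_stmts.pkl')
site_stmts = preprocess_db_stmts(db_stmts, 'site_stmts.pkl',
                                 filter_stmt_site=True)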
Example #17
def assemble_correction(fname_orig, fname, reader):
    # Read correction
    with open(fname_orig, 'rt') as fh:
        orig_txt = [ln.strip() for ln in fh.readlines()]
    with open(fname, 'rt') as fh:
        correct_txt = [ln.strip() for ln in fh.readlines()]
    for ln in correct_txt:
        if ln.startswith('<'):
            remove_line = ln[2:]
            orig_txt.remove(remove_line)
        elif ln.startswith('>'):
            add_line = ln[2:]
            orig_txt.append(add_line)
    txt = '\n'.join(orig_txt)
    if reader == 'reach':
        stmts = process_reach(txt)
    elif reader == 'trips':
        stmts = process_trips(txt, reread=True)
    ac.dump_statements(stmts, 'ras_pathway_correction.pkl')
    draw_graph(stmts, 'ras_pathway_correction')
Example #18
    def get_statements(self, output_file=None):
        """Get the full set of model statements including extra statements.

        Optionally dumps a pickle of statements to given output file.

        Parameters
        ----------
        output_file : str
            File to save the statements.

        Returns
        -------
        list of INDRA Statements
        """
        stmts_by_group = self.get_stmts_by_group()
        self.statements = [
            s for stmts_by_line in stmts_by_group.values()
            for stmt_list in stmts_by_line.values() for s in stmt_list
        ]
        # Dump the statements
        if output_file is not None:
            ac.dump_statements(self.statements, output_file)
        return self.statements
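A hedged usage sketch, assuming reader_obj is an instance of the (unnamed) class that defines the get_statements method above:

# Collect all statements across groups and dump them to a pickle in one call.
stmts = reader_obj.get_statements(output_file='model_statements.pkl')
print('%d statements collected' % len(stmts))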
Example #19
def build_prior(gene_names):
    """Build a corpus of prior Statements from PC and BEL."""
    gn = GeneNetwork(gene_names, basen)
    # Read BEL Statements
    bel_stmts = gn.get_bel_stmts(filter=False)
    ac.dump_statements(bel_stmts, prefixed_pkl('bel'))
    # Read Pathway Commons Statements
    database_filter = ['reactome', 'kegg', 'pid']
    biopax_stmts = gn.get_biopax_stmts(database_filter=database_filter)
    # Eliminate blacklisted interactions
    tmp_stmts = []
    for stmt in biopax_stmts:
        source_ids = [ev.source_id for ev in stmt.evidence]
        if set(source_ids) & set(biopax_blacklist):
            continue
        tmp_stmts.append(stmt)
    biopax_stmts = tmp_stmts
    ac.dump_statements(biopax_stmts, prefixed_pkl('biopax'))
    # Read Phosphosite Statements
    phosphosite_stmts = read_phosphosite_owl(phosphosite_owl_file)
    ac.dump_statements(phosphosite_stmts, prefixed_pkl('phosphosite'))
Example #20
def test_dump_stmts():
    ac.dump_statements([st1], '_test.pkl')
    st_loaded = ac.load_statements('_test.pkl')
    assert (len(st_loaded) == 1)
    assert (st_loaded[0].equals(st1))
Example #21
def test_dump_stmts():
    ac.dump_statements([st1], '_test.pkl')
    st_loaded = ac.load_statements('_test.pkl')
    assert len(st_loaded) == 1
    assert st_loaded[0].equals(st1)
Example #22
def main(args):
    uniq_pairs, all_hgnc_ids, fsort_corrs = \
            get_correlations(args.ceres_file, args.geneset_file,
                             args.corr_file, args.strict,
                             args.outbasename, args.recalc, args.ll, args.ul)

    # Get statements from file or from database that contain any gene from
    # provided list as set
    if args.statements_in:  # Get statements from file
        stmts_all = set(ac.load_statements(args.statements_in))
    else:  # Use api to get statements. NOT the same as querying for each ID
        if args.geneset_file:
            stmts_all = dnf.dbc_load_statements(gene_filter_list)
        else:
            # if there is no gene set file, restrict to gene ids in
            # correlation data
            stmts_all = dnf.dbc_load_statements(list(all_hgnc_ids))

    # Dump statements to pickle file if output name has been given
    if args.statements_out:
        ac.dump_statements(stmts=stmts_all, fname=args.statements_out)

    # Get nested dicts from statements
    nested_dict_statements = dnf.nested_dict_gen(stmts_all)

    # Loop through the unique pairs
    dir_conn_pairs = []
    dir_neg_conn_pairs = []
    unexplained = []
    npairs = len(uniq_pairs)

    f_con = open(args.outbasename + '_connections_latex.tex', 'w')

    f_neg_c = open(args.outbasename + '_neg_conn_latex.tex', 'w')

    logger.info('Looking for connections between %i pairs' % npairs)
    for pair in uniq_pairs:
        pl = list(pair)
        for li in pl:
            if _is_float(li):
                correlation = li
                fmt_corr = '{0:.04}'.format(correlation)
                break
        pl.remove(correlation)
        id1, id2 = pl

        forward_fail = False
        backward_fail = False

        if (nested_dict_statements.get(id1) and
                nested_dict_statements.get(id1).get(id2)) or \
                (nested_dict_statements.get(id2) and
                 nested_dict_statements.get(id2).get(id1)):
            new_pair = r'\section{{{}, {}: {}}}'.format(id1, id2, fmt_corr) \
                 +'\n'+ \
                 r'See correlation plot \href{{' \
                 r'https://depmap.org/portal/interactive/?xDataset=Avana' \
                 r'&xFeature={}&yDataset=Avana&yFeature={}&colorDataset=' \
                 r'lineage&colorFeature=all&filterDataset=context' \
                 r'&filterFeature=&regressionLine=false&statisticsTable=false' \
                 r'&associationTable=true&plotOnly=false}}{{here}}'.format(
                     id1, id2) + '\n\n'
            f_con.write(new_pair)
            if correlation < 0:
                f_neg_c.write(new_pair)

        # nested_dict_statements.get(id1).get(id2) raises AttributeError
        # if nested_dict_statements.get(id1) returns {}

        ev_fltr = 0

        # Checks subj=id1, obj=id2
        if nested_dict_statements.get(id1) and \
                nested_dict_statements.get(id1).get(id2):
            stmts = nested_dict_statements[id1][id2]
            logger.info('Found connection between %s and %s' % (id1, id2))
            dir_conn_pairs.append((id1, id2, correlation, stmts))
            output = dnf.latex_output(subj=id1,
                                      obj=id2,
                                      corr=correlation,
                                      ev_len_fltr=ev_fltr,
                                      stmts=stmts,
                                      ignore_str='parent')
            f_con.write(output)

            if correlation < 0:
                dir_neg_conn_pairs.append((id1, id2, correlation, stmts))
                f_neg_c.write(output)
        else:
            forward_fail = True

        # Checks subj=id2, obj=id1
        if nested_dict_statements.get(id2) and \
                nested_dict_statements.get(id2).get(id1):
            stmts = nested_dict_statements[id2][id1]
            logger.info('Found connection between %s and %s' % (id2, id1))
            dir_conn_pairs.append((id2, id1, correlation, stmts))
            output = dnf.latex_output(subj=id2,
                                      obj=id1,
                                      corr=correlation,
                                      ev_len_fltr=ev_fltr,
                                      stmts=stmts,
                                      ignore_str='parent')
            f_con.write(output)

            if correlation < 0:
                dir_neg_conn_pairs.append((id2, id1, correlation, stmts))
                f_neg_c.write(output)

        else:
            backward_fail = True

        # If both failed, count as unexplained
        if forward_fail and backward_fail:
            unexplained.append([id1, id2, correlation])

    with open(args.outbasename + '_connections.csv', 'w', newline='') as csvf:
        wrtr = csv.writer(csvf, delimiter=',')
        wrtr.writerows(dir_conn_pairs)

    with open(args.outbasename + '_neg_conn.csv', 'w', newline='') as csvf:
        wrtr = csv.writer(csvf, delimiter=',')
        wrtr.writerows(dir_neg_conn_pairs)

    with open(args.outbasename + '_unexplained.csv', 'w', newline='') as csvf:
        wrtr = csv.writer(csvf, delimiter=',')
        wrtr.writerows(unexplained)

    f_con.close()
    f_neg_c.close()
Example #23
from os.path import abspath, dirname, join
from indra.tools import assemble_corpus as ac
from indra.databases import hgnc_client
from indra.assemblers.indranet import IndraNetAssembler
from indra.sources import indra_db_rest as idr

if __name__ == '__main__':
    stmts_path = join(dirname(abspath(__file__)), '..', '..', '..', 'covid-19',
                      'stmts')
    gordon_stmts_path = join(stmts_path, 'gordon_ndex_stmts.pkl')

    gordon_stmts = ac.load_statements(gordon_stmts_path)

    # Get human interactors of viral proteins from Gordon et al.
    hgnc_ids = [
        ag.db_refs['HGNC'] for stmt in gordon_stmts
        for ag in stmt.agent_list() if ag is not None and 'HGNC' in ag.db_refs
    ]
    hgnc_names = [hgnc_client.get_hgnc_name(id) for id in hgnc_ids]

    stmts = []
    for gene in hgnc_names:
        idrp = idr.get_statements(agents=[gene])
        stmts.extend(idrp.statements)

    ac.dump_statements(stmts, 'gordon_ppi_stmts.pkl')
Example #24
def build_prior(genes, out_file):
    gn = GeneNetwork(genes)
    stmts = gn.get_statements(filter=False)
    ac.dump_statements(stmts, out_file)
    return stmts
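A hedged usage sketch for build_prior with an illustrative gene list:

# Build a prior network for a small set of genes and cache it as a pickle.
prior_stmts = build_prior(['BRAF', 'MAP2K1', 'MAPK1'], 'prior_stmts.pkl')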
Example #25
    # The file in which the preassembled statements will be saved
    pre_stmts_file = prefixed_pkl('preassembled')
    if reassemble:
        # Load various files that were previously produced
        sources = [
            'indradb', 'trips', 'bel', 'biopax', 'phosphosite', 'r3', 'sparser'
        ]
        stmts = []
        for source in sources:
            stmts += ac.load_statements(prefixed_pkl(source))
        stmts = ac.filter_no_hypothesis(stmts)
        # Fix grounding and filter to grounded entities and for proteins,
        # filter to the human ones
        stmts = ac.map_grounding(stmts)
        stmts = ac.filter_grounded_only(stmts)
        stmts = ac.filter_human_only(stmts)
        # Combinatorially expand protein families
        stmts = ac.expand_families(stmts)
        # Apply a strict filter to statements based on the gene names
        stmts = ac.filter_gene_list(stmts, gene_names, 'all')
        # Fix errors in references to protein sequences
        stmts = ac.map_sequence(stmts)
        # Run preassembly and save result
        stmts = ac.run_preassembly(stmts, return_toplevel=False)
        ac.dump_statements(stmts, pre_stmts_file)

    # Load the preassembled statements
    stmts = ac.load_statements(pre_stmts_file)
    # Run assembly into a PySB model
    assemble_pysb.assemble_pysb(stmts, gene_names, contextualize=True)
Example #26
def main(args):

    global any_expl, any_expl_not_sr, common_parent, ab_expl_count, \
        directed_im_expl_count, both_im_dir_expl_count, \
        any_axb_non_sr_expl_count, sr_expl_count, \
        shared_regulator_only_expl_count, explanations_of_pairs, unexplained, \
        explained_nested_dict, id1, id2, nested_dict_statements, dataset_dict, \
        avg_corr, dir_node_set, nx_dir_graph, explained_set, part_of_explained,\
        sr_explanations, any_expl_ign_sr

    if args.cell_line_filter and not len(args.cell_line_filter) > 2:
        logger.info('Filtering to provided cell lines in correlation '
                    'calculations.')
        cell_lines = _parse_cell_filter(*args.cell_line_filter)
        assert len(cell_lines) > 0
    elif args.cell_line_filter and len(args.cell_line_filter) > 2:
        sys.exit('Argument --cell-line-filter only takes one or two arguments')
    # No cell line dictionary and rnai data and filtering is requested
    elif args.cell_line_filter and len(args.cell_line_filter) == 1 and \
            args.rnai_data_file:
        sys.exit('Need a translation dictionary if RNAi data is provided and '
                 'filter is requested')
    else:
        # Should be empty only when --cell-line-filter is not provided
        logger.info('No cell line filter provided. Using all cell lines in '
                    'correlation calculations.')
        cell_lines = []

    # Parse "explained genes"
    if args.explained_set and len(args.explained_set) == 2:
        explained_set = _parse_explained_genes(
            gene_set_file=args.explained_set[0],
            check_column=args.explained_set[1])
        logger.info('Loading "explained pairs."')
    elif args.explained_set and len(args.explained_set) != 2:
        sys.exit('Argument --explained-set takes exactly two arguments: '
                 '--explained-set <file> <column name>')

    # Check if belief dict is provided
    if not args.belief_score_dict and not args.nested_dict_in:
        logger.error('Belief dict must be provided through the `-b ('
                     '--belief-score-dict)` argument if no nested dict '
                     'of statements with belief score is provided through the '
                     '`-ndi (--nested-dict-in)` argument.')
        raise FileNotFoundError

    # Get dict of {hash: belief score}
    belief_dict = None  # ToDo use api to query belief scores if not loaded
    if args.belief_score_dict:
        if args.belief_score_dict.endswith('.json'):
            belief_dict = _json_open(args.belief_score_dict)
        elif args.belief_score_dict.endswith('.pkl'):
            belief_dict = _pickle_open(args.belief_score_dict)

    args_dict = _arg_dict(args)
    npairs = 0

    filter_settings = {
        'gene_set_filter':
        args.gene_set_filter,
        'strict':
        args.strict,
        'cell_line_filter':
        cell_lines,
        'cell_line_translation_dict':
        _pickle_open(args.cell_line_filter[1])
        if args.cell_line_filter and len(args.cell_line_filter) == 2 else None,
        'margin':
        args.margin,
        'filter_type': (args.filter_type if args.filter_type else None)
    }

    output_settings = {
        'dump_unique_pairs': args.dump_unique_pairs,
        'outbasename': args.outbasename
    }

    # Parse CRISPR and/or RNAi data
    if args_dict.get('crispr') or args_dict.get('rnai'):
        if not filter_settings['filter_type'] and \
            args.crispr_data_file and \
                args.rnai_data_file:
            logger.info('No merge filter set. Output will be intersection of '
                        'the two data sets.')
        elif filter_settings.get('filter_type'):
            logger.info('Using filter type "%s"' %
                        filter_settings['filter_type'])
        master_corr_dict, all_hgnc_ids, stats_dict = \
            dnf.get_combined_correlations(dict_of_data_sets=args_dict,
                                          filter_settings=filter_settings,
                                          output_settings=output_settings)

        # Count pairs in merged correlation dict and dump it
        npairs = dnf._dump_master_corr_dict_to_pairs_in_csv(
            fname=args.outbasename + '_merged_corr_pairs.csv',
            nest_dict=master_corr_dict)

        if args.gene_set_filter:
            gene_filter_list = None
            if args_dict.get('crispr') and not args_dict.get('rnai'):
                gene_filter_list = dnf._read_gene_set_file(
                    gf=filter_settings['gene_set_filter'],
                    data=pd.read_csv(args_dict['crispr']['data'],
                                     index_col=0,
                                     header=0))
            elif args_dict.get('rnai') and not args_dict.get('crispr'):
                gene_filter_list = dnf._read_gene_set_file(
                    gf=filter_settings['gene_set_filter'],
                    data=pd.read_csv(args_dict['rnai']['data'],
                                     index_col=0,
                                     header=0))
            elif args_dict.get('crispr') and args_dict.get('rnai'):
                gene_filter_list = \
                    set(dnf._read_gene_set_file(
                        gf=filter_settings['gene_set_filter'],
                        data=pd.read_csv(args_dict['crispr']['data'],
                                         index_col=0, header=0))) & \
                    set(dnf._read_gene_set_file(
                        gf=filter_settings['gene_set_filter'],
                        data=pd.read_csv(args_dict['rnai']['data'],
                                         index_col=0, header=0)))
            assert gene_filter_list is not None

        else:
            gene_filter_list = None
    else:
        stats_dict = None

    # LOADING INDRA STATEMENTS
    # Get statements from file or from database that contain any gene from
    # provided list as set unless you're already loading a pre-calculated
    # nested dict and/or precalculated directed graph.

    if not (args.light_weight_stmts or args.nested_dict_in):
        if args.statements_in:  # Get statements from file
            stmts_all = set(ac.load_statements(args.statements_in))
        # Use api to get statements. _NOT_ the same as querying for each ID
        else:
            if args.gene_set_filter:
                stmts_all = dnf.dbc_load_statements(gene_filter_list)
            else:
                # if there is no gene set file, restrict to gene ids in
                # input data
                stmts_all = dnf.dbc_load_statements(list(all_hgnc_ids))

        # Dump statements to pickle file if output name has been given
        if args.statements_out:
            logger.info('Dumping read raw statements')
            ac.dump_statements(stmts=stmts_all, fname=args.statements_out)

    # Get nested dicts from statements
    if args.light_weight_stmts:
        hash_df = pd.read_csv(args.light_weight_stmts, delimiter='\t')
        nested_dict_statements = dnf.nested_hash_dict_from_pd_dataframe(
            hash_df)
    elif args.nested_dict_in:
        nested_dict_statements = _pickle_open(args.nested_dict_in)
    else:
        nested_dict_statements = dnf.dedupl_nested_dict_gen(
            stmts_all, belief_dict)
        if args.nested_dict_out:
            _dump_it_to_pickle(fname=args.nested_dict_out,
                               pyobj=nested_dict_statements)

    # Get directed simple graph
    if args.directed_graph_in:
        with open(args.directed_graph_in, 'rb') as rpkl:
            nx_dir_graph = pkl.load(rpkl)
    else:
        # Create directed graph from statement dict
        nx_dir_graph = dnf.nx_directed_graph_from_nested_dict_2layer(
            nest_d=nested_dict_statements, belief_dict=belief_dict)
        # Save as pickle file
        if args.directed_graph_out:
            _dump_it_to_pickle(fname=args.directed_graph_out,
                               pyobj=nx_dir_graph)
    dir_node_set = set(nx_dir_graph.nodes)

    # LOOP THROUGH THE UNIQUE CORRELATION PAIRS, MATCH WITH INDRA NETWORK
    any_expl = 0  # Count if any explanation per (A,B) correlation found
    any_expl_not_sr = 0  # Count any explanation, excluding when shared
    # regulator is the only explanation
    any_expl_ign_sr = 0  # Count any explanation, ignoring shared regulator
    # explanations
    common_parent = 0  # Count if common parent found per set(A,B)
    part_of_explained = 0  # Count pairs that are part of the "explained set"
    ab_expl_count = 0  # Count A-B/B-A as one per set(A,B)
    directed_im_expl_count = 0  # Count any A->X->B,B->X->A as one per set(A,B)
    any_axb_non_sr_expl_count = 0  # Count if shared target found per set(A,B)
    sr_expl_count = 0  # Count if shared regulator found per set(A,B)
    shared_regulator_only_expl_count = 0  # Count if only shared regulator found
    explanations_of_pairs = []  # Saves all non shared regulator explanations
    sr_explanations = []  # Saves all shared regulator explanations
    unexplained = []  # Unexplained correlations
    skipped = 0

    # The explained nested dict: (1st key = subj, 2nd key = obj, 3rd key =
    # connection type or correlation).
    #
    # directed: any A->B or B->A
    # undirected: any of complex, selfmodification, parent
    # x_is_intermediary: A->X->B or B->X->A
    # x_is_downstream: A->X<-B
    # x_is_upstream: A<-X->B
    #
    # d[subj][obj] = {correlation: {gene_set1: corr, gene_set2: corr, ...},
    #                 directed: [(stmt/stmt hash, belief score)],
    #                 undirected: [(stmt/stmt hash, belief score)],
    #                 common_parents: [list of parents]
    #                 x_is_intermediary: [(X, belief rank)],
    #                 x_is_downstream: [(X, belief rank)],
    #                 x_is_upstream: [(X, belief rank)]}
    #
    # Then in javascript you can for example do:
    # if SUBJ_is_subj_dict.obj.direct.length <-- should return zero if []
    #
    # Used to get: directed graph
    # 1. all nodes of directed graph -> 1st dropdown
    # 2. dir -> undir graph -> jsons to check all corr neighbors -> 2nd dropdown
    # 3. jsons to check if connection is direct or intermediary

    # Using the following loop structure for counter variables:
    # a = 2
    # def for_loop_body():
    #     global a
    #     a += 1
    # # Then loop like:
    # if dict:
    #     for pairs in dict:
    #         for_loop_body(args)
    # elif random:
    #     for random pair:
    #         for_loop_body(args)

    explained_nested_dict = dnf.create_nested_dict()

    # Loop rnai and/or crispr only
    if (args_dict.get('rnai') or args_dict.get('crispr')) and \
            not args.brca_dependencies:
        logger.info('Gene pairs generated from DepMap knockout screening data '
                    'sets')
        logger.info('Looking for connections between %i pairs' %
                    (npairs if npairs > 0 else args.max_pairs))
        for outer_id, do in master_corr_dict.items():
            for inner_id, dataset_dict in do.items():
                if len(dataset_dict.keys()) == 0:
                    skipped += 1
                    if args.verbosity:
                        logger.info('Skipped outer_id=%s and inner_id=%s' %
                                    (outer_id, inner_id))
                    continue

                id1, id2 = outer_id, inner_id
                loop_body(args)

    # Loop rnai and/or crispr AND BRCA cell line dependencies
    elif (args_dict.get('rnai') or args_dict.get('crispr')) and \
            args.brca_dependencies:
        logger.info('Gene pairs generated from combined knockout screens. '
                    'Output data will include BRCA cell line dependency\n'
                    'data as well as correlation data from knockout screens.')
        logger.info('Looking for connections between %i pairs' %
                    (npairs if npairs > 0 else args.max_pairs))

        # Load BRCA dependency data
        brca_data_set = pd.read_csv(args.brca_dependencies, header=0)
        depend_in_breast_genes = brca_data_set.drop(
            axis=1, labels=['Url Label',
                            'Type'])[brca_data_set['Type'] == 'gene']
        genes = set(depend_in_breast_genes['Gene/Compound'].values)

        for outer_id, do in master_corr_dict.items():
            for inner_id, knockout_dict in do.items():
                if len(knockout_dict.keys()) == 0:
                    skipped += 1
                    if args.verbosity:
                        logger.info('Skipped outer_id=%s and inner_id=%s' %
                                    (outer_id, inner_id))
                    continue

                id1, id2 = outer_id, inner_id
                dataset_dict = {}
                gene1_data = []
                gene2_data = []

                # Get BRCA dep data
                if id1 in genes:
                    for row in depend_in_breast_genes[
                            depend_in_breast_genes['Gene/Compound'] ==
                            id1].iterrows():
                        gene1_data.append(
                            (row[1]['Dataset'], row[1]['T-Statistic'],
                             row[1]['P-Value']))
                if id2 in genes:
                    for row in depend_in_breast_genes[
                            depend_in_breast_genes['Gene/Compound'] ==
                            id2].iterrows():
                        gene2_data.append(
                            (row[1]['Dataset'], row[1]['T-Statistic'],
                             row[1]['P-Value']))

                dataset_dict[id1] = gene1_data
                dataset_dict[id2] = gene2_data

                dataset_dict['crispr'] = (knockout_dict['crispr']
                                          if knockout_dict.get('crispr') else
                                          None)
                dataset_dict['rnai'] = (knockout_dict['rnai']
                                        if knockout_dict.get('rnai') else None)

                if id1 not in genes and id2 not in genes:
                    dataset_dict = knockout_dict

                # Run loop body
                loop_body(args)

    # loop brca dependency ONLY
    elif args.brca_dependencies and not \
            (args_dict.get('rnai') or args_dict.get('crispr')):
        logger.info(
            'Gene pairs generated from BRCA gene enrichment data only.')
        brca_data_set = pd.read_csv(args.brca_dependencies, header=0)
        depend_in_breast_genes = brca_data_set.drop(
            axis=1, labels=['Url Label',
                            'Type'])[brca_data_set['Type'] == 'gene']
        genes = set(depend_in_breast_genes['Gene/Compound'].values)
        npairs = len(list(itt.combinations(genes, 2)))
        logger.info('Looking for connections between %i pairs' %
                    (npairs if npairs > 0 else args.max_pairs))
        for id1, id2 in itt.combinations(genes, 2):
            gene1_data = []
            gene2_data = []
            # For each non-diagonal pair in file, insert in dataset_dict:
            # geneA, geneB,
            # dataset for A, dataset for B,
            # T-stat for A, T-stat for B,
            # P-value for A, P-value for B
            for row in depend_in_breast_genes[
                    depend_in_breast_genes['Gene/Compound'] == id1].iterrows():
                gene1_data.append((row[1]['Dataset'], row[1]['T-Statistic'],
                                   row[1]['P-Value']))

            for row in depend_in_breast_genes[
                    depend_in_breast_genes['Gene/Compound'] == id2].iterrows():
                gene2_data.append((row[1]['Dataset'], row[1]['T-Statistic'],
                                   row[1]['P-Value']))
            # dataset_dict = {id1:
            #                 [(dataset1, T-stat1, P-value1),
            #                  (dataset2, T-stat2, P-value2)],
            #                 id2:
            #                  [(..., ...)],
            #                  ...}
            dataset_dict = {id1: gene1_data, id2: gene2_data}
            loop_body(args)

    # loop random pairs from data set
    elif args_dict.get('sampling_gene_file'):
        logger.info('Gene pairs generated at random from %s' %
                    args_dict['sampling_gene_file'])
        with open(args_dict['sampling_gene_file'], 'r') as fi:
            rnd_gene_set = [l.strip() for l in fi.readlines()]

        npairs = args.max_pairs
        dataset_dict = None
        logger.info('Looking for connections between %i pairs' %
                    (npairs if npairs > 0 else args.max_pairs))
        for _ in range(npairs):
            id1, id2 = _rnd_pair_gen(rnd_gene_set)
            assert not isinstance(id1, list)
            loop_body(args)

    long_string = ''
    long_string += '-' * 63 + '\n'
    long_string += 'Summary for matching INDRA network to correlation pairs:'\
                   + '\n\n'
    long_string += '> Total number of correlation pairs checked: %i' % npairs\
                   + '\n'
    if args.verbosity:
        long_string += '> Skipped %i empty doublets in corr dict\n' % skipped

    long_string += '> Total correlations unexplained: %i' % len(unexplained)\
                   + '\n'
    long_string += '> Total correlations explained: %i' % any_expl + '\n'
    long_string += '> Total correlations explained, ignoring shared ' \
                   'regulator: %i' % any_expl_ign_sr + '\n'
    long_string += '> Total correlations explained, excluding shared ' \
                   'regulator (total - shared only): %i' % \
                   (any_expl - shared_regulator_only_expl_count) + '\n'
    long_string += '>    %i correlations have an explanation involving a ' \
                   'common parent' % common_parent + '\n'
    if args.explained_set:
        long_string += '>    %i gene pairs were considered explained as part ' \
                       'of the "explained set"' % part_of_explained + '\n'
    long_string += '>    %i explanations involving direct connection or ' \
                   'complex' % ab_expl_count + '\n'
    long_string += '>    %i correlations have a directed explanation ' \
                   'involving an intermediate node (A->X->B/A<-X<-B)' \
                   % directed_im_expl_count + '\n'
    long_string += '>    %i correlations have an explanation involving an ' \
                   'intermediate node excluding shared regulators' % \
                   any_axb_non_sr_expl_count + '\n'
    long_string += '>    %i correlations have an explanation involving a ' \
                   'shared regulator (A<-X->B)' % sr_expl_count + '\n'
    long_string += '>    %i correlations have shared regulator as only ' \
                   'explanation' % shared_regulator_only_expl_count + '\n\n'

    if stats_dict and (stats_dict.get('rnai') or stats_dict.get('crispr')):
        long_string += 'Statistics of input data:' + '\n\n'
    if stats_dict and stats_dict.get('rnai'):
        long_string += '  RNAi data ' + '\n'
        long_string += ' -----------' + '\n'
        long_string += '> mean: %f\n' % stats_dict['rnai']['mean']
        long_string += '> SD: %f\n' % stats_dict['rnai']['sigma']
        long_string += '> lower bound: %.3f*SD = %.4f\n' % (
            args_dict['rnai']['ll'],
            args_dict['rnai']['ll'] * stats_dict['rnai']['sigma'])
        if args_dict['rnai']['ul']:
            long_string += '> upper bound: %.3f*SD = %.4f\n\n' % (
                args_dict['rnai']['ul'],
                args_dict['rnai']['ul'] * stats_dict['rnai']['sigma'])
    if stats_dict and stats_dict.get('crispr'):
        long_string += '  CRISPR data ' + '\n'
        long_string += ' -------------' + '\n'
        long_string += '> mean: %f\n' % stats_dict['crispr']['mean']
        long_string += '> SD: %f\n' % stats_dict['crispr']['sigma']
        long_string += '> lower bound: %.3f*SD = %.4f\n' % (
            args_dict['crispr']['ll'],
            args_dict['crispr']['ll'] * stats_dict['crispr']['sigma'])
        if args_dict['crispr']['ul']:
            long_string += '> upper bound: %.3f*SD = %.4f\n\n' % (
                args_dict['crispr']['ul'],
                args_dict['crispr']['ul'] * stats_dict['crispr']['sigma'])
    long_string += '-' * 63 + '\n\n'

    logger.info('\n' + long_string)

    # Here create directed graph from explained nested dict
    nx_expl_dir_graph = dnf.nx_directed_graph_from_nested_dict_3layer(
        nest_d=explained_nested_dict)

    if not args.no_web_files:
        # 'explained_nodes' are used to produce first drop down
        explained_nodes = list(nx_expl_dir_graph.nodes)
        logger.info('Dumping json "explainable_ids.json" for first dropdown.')
        _dump_it_to_json(args.outbasename + '_explainable_ids.json',
                         explained_nodes)

        # Get undir graph and save each neighbor lookup as json for 2nd dropdown
        nx_expl_undir_graph = nx_expl_dir_graph.to_undirected()
        dnf.nx_undir_to_neighbor_lookup_json(
            expl_undir_graph=nx_expl_undir_graph, outbasename=args.outbasename)

    # Easiest way to check if pairs are explained or not is to loop explained
    # dict. Skip shared regulators.
    _dump_nest_dict_to_csv(fname=args.outbasename +
                           '_explained_correlations.csv',
                           nested_dict=explained_nested_dict,
                           header=['gene1', 'gene2', 'meta_data'],
                           excl_sr=True)

    _dump_it_to_pickle(fname=args.outbasename + '_explained_nest_dict.pkl',
                       pyobj=explained_nested_dict)
    headers = ['subj', 'obj', 'type', 'X', 'meta_data']
    _dump_it_to_csv(fname=args.outbasename + '_explanations_of_pairs.csv',
                    pyobj=explanations_of_pairs,
                    header=headers)
    _dump_it_to_csv(fname=args.outbasename +
                    '_explanations_of_shared_regulators.csv',
                    pyobj=sr_explanations,
                    header=headers)
    _dump_it_to_csv(fname=args.outbasename + '_unexpl_correlations.csv',
                    pyobj=unexplained,
                    header=headers[:-2])
    with open(args.outbasename + '_script_summary.txt', 'w') as fo:
        fo.write(long_string)
    return 0
Example #27
            site = '%s_%s%s' % (stmt.sub.name, stmt.residue, stmt.position)
            regulons[kinase].add(site)
    rows = []
    for kinase, sites in regulons.items():
        rows.append([kinase, 'Description'] + [s for s in sites])
    with open(filename, 'wt') as f:
        csvwriter = csv.writer(f, delimiter='\t')
        csvwriter.writerows(rows)


if __name__ == '__main__':
    reload = False
    if reload:
        phos_stmts = \
                get_phosphorylation_stmts('../work/gsea_sites.rnk')
        ac.dump_statements(phos_stmts, '../work/phospho_stmts.pkl')
    else:
        phos_stmts = ac.load_statements('../work/phospho_stmts.pkl')

    regulons_from_stmts(phos_stmts, '../work/kinase_regulons.gmt')

    #kinases = get_kinase_counts(phos_stmts)

    target_list = get_stmt_subject_object(phos_stmts, 'SUBJECT')

    # Get all Tubulin child nodes as the source list
    source_list = [('FPLX', 'Tubulin')]
    tubulin_ag = Agent('Tubulin', db_refs={'FPLX': 'Tubulin'})
    ex = Expander(bio_ontology)
    for ag_ns, ag_id in ex.get_children(tubulin_ag, ns_filter=None):
        #if ag_ns == 'HGNC':
Example #28
def build_prior(genes, out_file):
    gn = GeneNetwork(genes)
    stmts = gn.get_statements(filter=False)
    ac.dump_statements(stmts, out_file)
    return stmts
Example #29
                        '--ctd_stmts',
                        help='Path to CTD statements pkl file',
                        required=True)
    parser.add_argument('-f',
                        '--output_file',
                        help='Output file for combined pkl',
                        required=True)
    args = parser.parse_args()

    # Load everything
    logger.info('Loading statements from pickle files')
    with open(args.old_mm, 'rb') as f:
        old_mm_emmaa_stmts = pickle.load(f)
        old_mm_stmts = [es.stmt for es in old_mm_emmaa_stmts]
    if args.new_cord:
        new_cord_stmts = ac.load_statements(args.new_cord)
    else:
        new_cord_stmts = None
    drug_stmts = ac.load_statements(args.drug_stmts)
    gordon_stmts = ac.load_statements(args.gordon_stmts)
    virhostnet_stmts = ac.load_statements(args.virhostnet_stmts)
    ctd_stmts = ac.load_statements(args.ctd_stmts)

    other_stmts = drug_stmts + gordon_stmts + virhostnet_stmts + ctd_stmts

    combined_stmts = make_model_stmts(old_mm_stmts, other_stmts,
                                      new_cord_stmts)

    # Dump new pickle
    ac.dump_statements(combined_stmts, args.output_file)
Example #30
def build_prior(genes, out_file):
    gn = GeneNetwork(genes, 'dna_damage_prior')
    #stmts = gn.get_statements(filter=False)
    stmts = gn.get_biopax_stmts(filter=False)
    ac.dump_statements(stmts, out_file)
    return stmts
Example #31
def build_prior(genes, file_prefix):
    gn = GeneNetwork(genes, file_prefix)
    #stmts = gn.get_statements(filter=False)
    stmts = gn.get_biopax_stmts(filter=False)
    ac.dump_statements(stmts, '%s.pkl' % file_prefix)
    return stmts
Example #32
def assemble_pysb(stmts, data_genes, contextualize=False):
    # Filter the INDRA Statements to be put into the model
    stmts = ac.filter_by_type(stmts, Complex, invert=True)
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    # Strip the extraneous supports/supported by here
    strip_supports(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    stmts = ac.filter_enzyme_kinase(stmts)
    stmts = ac.filter_mod_nokinase(stmts)
    stmts = ac.filter_transcription_factor(stmts)
    # Simplify activity types
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.reduce_activities()
    ml.gather_modifications()
    ml.reduce_modifications()
    stmts = normalize_active_forms(ml.statements)
    # Replace activations when possible
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.replace_activations()
    # Require active forms
    ml.require_active_forms()
    num_stmts = len(ml.statements)
    while True:
        # Remove inconsequential PTMs
        ml.statements = ac.filter_inconsequential_mods(ml.statements,
                                                       get_mod_whitelist())
        ml.statements = ac.filter_inconsequential_acts(ml.statements,
                                                       get_mod_whitelist())
        if num_stmts <= len(ml.statements):
            break
        num_stmts = len(ml.statements)
    stmts = ml.statements
    # Save the Statements here
    ac.dump_statements(stmts, prefixed_pkl('pysb_stmts'))


    # Add drug target Statements
    drug_target_stmts = get_drug_target_statements()
    stmts += drug_target_stmts

    # Just generate the generic model
    pa = PysbAssembler()
    pa.add_statements(stmts)
    model = pa.make_model()
    with open(prefixed_pkl('pysb_model'), 'wb') as f:
        pickle.dump(model, f)

    # Run this extra part only if contextualize is set to True
    if not contextualize:
        return

    cell_lines_no_data = ['COLO858', 'K2', 'MMACSF', 'MZ7MEL', 'WM1552C']
    for cell_line in cell_lines:
        if cell_line not in cell_lines_no_data:
            stmtsc = contextualize_stmts(stmts, cell_line, data_genes)
        else:
            stmtsc = stmts
        pa = PysbAssembler()
        pa.add_statements(stmtsc)
        model = pa.make_model()
        if cell_line not in cell_lines_no_data:
            contextualize_model(model, cell_line, data_genes)
        ac.dump_statements(stmtsc, prefixed_pkl('pysb_stmts_%s' % cell_line))
        with open(prefixed_pkl('pysb_model_%s' % cell_line), 'wb') as f:
            pickle.dump(model, f)