def test_filter_gene_list_one():
    """Check filter_gene_list policies, bound conditions and remove_bound."""
    # (gene list, policy, expected number of kept statements) on st1/st2.
    # The last case passes an unrecognized policy and expects both
    # statements to come back.
    simple_cases = [
        (['a'], 'one', 2),
        (['a'], 'all', 0),
        (['a', 'b'], 'all', 1),
        (['a', 'b'], 'invalid', 2),
    ]
    for genes, policy, expected in simple_cases:
        kept = ac.filter_gene_list([st1, st2], genes, policy)
        assert len(kept) == expected

    # Can a statement be excluded because of a bound condition agent that
    # is not on the filter list?
    bound_cases = [
        # All genes, including the bound condition, are in the list
        (['a', 'b', 'd'], 'all', 1),
        # Bound condition for sub not in list
        (['a', 'b'], 'all', 0),
        # Bound condition for sub not in list but one match is enough
        (['a', 'b'], 'one', 1),
        # Only the bound condition is in the filter list
        (['d'], 'one', 1),
    ]
    for genes, policy, expected in bound_cases:
        kept = ac.filter_gene_list([st18], genes, policy)
        assert len(kept) == expected

    # Can bound conditions that are not in the filter list be removed?
    for genes, n_bound in ((['a', 'b', 'd'], 1), (['a', 'b'], 0)):
        kept = ac.filter_gene_list([st18], genes, 'all',
                                   remove_bound=True)
        assert len(kept[0].sub.bound_conditions) == n_bound
def test_filter_gene_list_families():
    """Check the effect of allow_families on filtering by 'MAPK1'."""
    candidates = [st16, st17]
    gene_filter = ['MAPK1']

    # Without family matching only st16 is expected to be kept
    strict = ac.filter_gene_list(candidates, gene_filter, 'one',
                                 allow_families=False)
    assert len(strict) == 1
    assert strict[0] == st16

    # With family matching both statements are expected to be kept
    relaxed = ac.filter_gene_list(candidates, gene_filter, 'one',
                                  allow_families=True)
    assert len(relaxed) == 2
    assert st16 in relaxed
    assert st17 in relaxed
Beispiel #3
0
def test_filter_gene_list_families():
    """Filter st16/st17 by 'MAPK1' with and without family matching."""
    def filter_mapk1(allow_families):
        # Same query each time; only the allow_families flag varies
        return ac.filter_gene_list([st16, st17], ['MAPK1'], 'one',
                                   allow_families=allow_families)

    # Family matching disabled: only st16 is expected
    exact_only = filter_mapk1(False)
    assert len(exact_only) == 1
    assert exact_only[0] == st16

    # Family matching enabled: both statements are expected
    with_families = filter_mapk1(True)
    assert len(with_families) == 2
    assert st16 in with_families
    assert st17 in with_families
def test_filter_gene_list_invert():
    """Check filter_gene_list with invert=True for 'one' and 'all'."""
    def inverted(genes, policy):
        # Always filter the same two statements with inversion enabled
        return ac.filter_gene_list([st1, st2], genes, policy, invert=True)

    assert len(inverted(['a'], 'one')) == 0

    remaining = inverted(['d'], 'one')
    assert len(remaining) == 1
    assert remaining[0].sub.name == 'b'

    remaining = inverted(['a', 'd'], 'all')
    assert len(remaining) == 1
    assert remaining[0].sub.name == 'b'

    assert len(inverted(['a', 'b', 'd'], 'all')) == 0
def test_filter_gene_list_invert():
    """Run inverted gene-list filtering over a table of expectations."""
    # (gene list, policy, expected substrate names of the kept statements)
    cases = [
        (['a'], 'one', []),
        (['d'], 'one', ['b']),
        (['a', 'd'], 'all', ['b']),
        (['a', 'b', 'd'], 'all', []),
    ]
    for genes, policy, expected_subs in cases:
        kept = ac.filter_gene_list([st1, st2], genes, policy, invert=True)
        assert len(kept) == len(expected_subs)
        for stmt, sub_name in zip(kept, expected_subs):
            assert stmt.sub.name == sub_name
Beispiel #6
0
    def get_bel_stmts(self, filter=False):
        """Collect statements from the BEL large corpus for the gene list.

        One NDEx neighborhood query is run per gene in :py:attr:`gene_list`
        and the statements of all queries are concatenated. When `basename`
        is set, the unfiltered result is cached in the pickle file
        `<basename>_bel_stmts.pkl`; if that file already exists it is loaded
        instead of running the queries.

        Parameters
        ----------
        filter : bool
            If True, the statements are passed through
            ``ac.filter_gene_list`` together with :py:attr:`gene_list`,
            using policy 'one' when there is more than one gene and 'all'
            otherwise. Default is False. The cache always holds the full
            (unfiltered) list.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            List of INDRA statements extracted from the BEL large corpus.
        """
        caching = self.basename is not None
        cache_path = '%s_bel_stmts.pkl' % self.basename if caching else None
        if caching and os.path.isfile(cache_path):
            # Cache hit: reuse the previously pickled statements
            logger.info("Loading BEL statements from %s" % cache_path)
            with open(cache_path, 'rb') as fh:
                bel_statements = pickle.load(fh)
        else:
            # Cache miss (or caching disabled): query gene by gene
            bel_statements = []
            for gene in self.gene_list:
                logger.info("Getting BEL statements for gene %s" % gene)
                processor = bel.process_ndex_neighborhood([gene])
                if processor is None:
                    continue
                bel_statements += processor.statements
            if caching:
                with open(cache_path, 'wb') as fh:
                    pickle.dump(bel_statements, fh, protocol=2)
        if not filter:
            return bel_statements
        policy = 'one' if len(self.gene_list) > 1 else 'all'
        return ac.filter_gene_list(bel_statements, self.gene_list, policy)
Beispiel #7
0
    def get_bel_stmts(self, filter=False):
        """Get relevant statements from the BEL large corpus.

        Runs a single PyBEL neighborhood query for :py:attr:`gene_list` and
        returns the statements of the resulting processor. When `basename`
        is set, the statements are also written to the pickle file
        `<basename>_bel_stmts.pkl`. This method only writes that file; it
        does not read it back, so the query is run on every call.

        Parameters
        ----------
        filter : bool
            If True and :py:attr:`gene_list` has more than one gene, the
            statements are passed through ``ac.filter_gene_list`` with
            policy 'all'. With a single gene (or none) no filtering is
            applied. Default is False. Note that the full (unfiltered) set
            of statements is what gets pickled.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            List of INDRA statements extracted from the BEL large corpus.
        """
        processor = bel.process_pybel_neighborhood(self.gene_list)
        bel_statements = processor.statements
        if self.basename is not None:
            # Persist the unfiltered statements
            cache_path = '%s_bel_stmts.pkl' % self.basename
            with open(cache_path, 'wb') as fh:
                pickle.dump(bel_statements, fh)
        if filter and len(self.gene_list) > 1:
            return ac.filter_gene_list(bel_statements, self.gene_list,
                                       'all')
        return bel_statements
Beispiel #8
0
def get_stmts_for_gene_list(gene_list, other_entities):
    """Query Statements gene by gene and keep those within the entity set.

    Parameters
    ----------
    gene_list : list[str]
        A list of HGNC symbols for genes to query.
    other_entities : list[str]
        Additional entity names that are allowed alongside the genes when
        the collected Statements are filtered.

    Returns
    -------
    list[indra.statements.Statement]
        The Statements collected for all genes, filtered with
        ``ac.filter_gene_list`` using policy 'all' against
        ``gene_list + other_entities``.
    """
    collected = []
    for gene in gene_list:
        logger.info(f'Querying {gene}')
        gene_stmts = get_stmts_for_gene(gene)
        logger.info(f'Got {len(gene_stmts)} statements for {gene}')
        collected.extend(gene_stmts)
    allowed_names = gene_list + other_entities
    return ac.filter_gene_list(collected, allowed_names, policy='all')
Beispiel #9
0
def run_assembly(stmts, filename):
    """Run the assembly pipeline on stmts, dump and return the result.

    The steps are, in order: grounding mapping, grounded-only filter,
    human-only filter, gene-list filter against the module-level
    `gene_names` (policy 'one'), sequence mapping and preassembly.
    The final statements are written to `filename` with
    ``ac.dump_statements`` and returned.
    """
    grounded = ac.filter_grounded_only(ac.map_grounding(stmts))
    human = ac.filter_human_only(grounded)
    # Family expansion is not applied here; allow_families=True is passed
    # to the gene-list filter instead.
    relevant = ac.filter_gene_list(human, gene_names, 'one',
                                   allow_families=True)
    sequence_mapped = ac.map_sequence(relevant)
    assembled = ac.run_preassembly(sequence_mapped, return_toplevel=False,
                                   poolsize=4)
    ac.dump_statements(assembled, filename)
    return assembled
Beispiel #10
0
def run_assembly(stmts, save_file):
    """Run the full assembly and filtering pipeline and dump the result.

    The statements go through grounding mapping, grounded/human filters,
    family expansion, a gene-list filter against the module-level
    `gene_names` (policy 'one'), sequence mapping, preassembly, a belief
    filter with cutoff 0.95 and the top-level, direct and enzyme-kinase
    filters. The result is written to `save_file` and returned.
    """
    # Steps that take only the statement list are applied in sequence
    pre_steps = (ac.map_grounding, ac.filter_grounded_only,
                 ac.filter_human_only, ac.expand_families)
    for step in pre_steps:
        stmts = step(stmts)

    stmts = ac.filter_gene_list(stmts, gene_names, 'one')
    stmts = ac.map_sequence(stmts)
    stmts = ac.run_preassembly(stmts, return_toplevel=False)
    stmts = ac.filter_belief(stmts, 0.95)

    post_steps = (ac.filter_top_level, ac.filter_direct,
                  ac.filter_enzyme_kinase)
    for step in post_steps:
        stmts = step(stmts)

    ac.dump_statements(stmts, save_file)
    return stmts
Beispiel #11
0
def assemble_pysb(stmts, data_genes, out_file):
    """Return an assembled PySB model.

    The statements are filtered (direct, belief cutoff 0.95, top-level,
    gene list with policy 'all'), activities are reduced, and a PySB model
    is assembled from the result. A fixed set of phospho-site observables
    is added, the 'SKMEL28_SKIN' context is set, and the model is saved
    both via the assembler and as a Kappa export.

    Parameters
    ----------
    stmts : list
        Statements to assemble; passed through the ``ac`` filters above.
    data_genes : list
        Gene names handed to ``ac.filter_gene_list``.
    out_file : str
        Path handed to ``PysbAssembler.save_model``. The Kappa export is
        written next to it, with the extension replaced by ``.ka``.

    Returns
    -------
    The model object produced by ``PysbAssembler.make_model``.
    """
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    stmts = ac.reduce_activities(stmts)
    pa = PysbAssembler()
    pa.add_statements(stmts)
    model = pa.make_model()

    # Add observables: (observable name, monomer name, site conditions).
    # Names and sites are kept exactly as before, including 'ELK1' without
    # a 'p' suffix and the generic 'S' site on AKT3.
    observables = [
        ('MAPK1p', 'MAPK1', {'T185': 'p', 'Y187': 'p'}),
        ('MAPK3p', 'MAPK3', {'T202': 'p', 'Y204': 'p'}),
        ('GSK3Ap', 'GSK3A', {'S21': 'p'}),
        ('GSK3Bp', 'GSK3B', {'S9': 'p'}),
        ('RPS6p', 'RPS6', {'S235': 'p'}),
        ('EIF4EBP1p', 'EIF4EBP1', {'S65': 'p'}),
        ('JUNp', 'JUN', {'S73': 'p'}),
        ('FOXO3p', 'FOXO3', {'S315': 'p'}),
        ('AKT1p', 'AKT1', {'S473': 'p'}),
        ('AKT2p', 'AKT2', {'S474': 'p'}),
        ('AKT3p', 'AKT3', {'S': 'p'}),
        ('ELK1', 'ELK1', {'S383': 'p'}),
    ]
    for obs_name, monomer_name, sites in observables:
        pattern = model.monomers[monomer_name](**sites)
        model.add_component(Observable(obs_name, pattern))

    # Set context
    pa.set_context('SKMEL28_SKIN')
    pa.save_model(out_file)

    # The base name must be computed before it is used to build the Kappa
    # file name; previously it was assigned inside the ``with`` block,
    # after the ``open`` call had already read it (UnboundLocalError).
    base_file, _ = os.path.splitext(out_file)
    ke = KappaExporter(model)
    with open('%s.ka' % base_file, 'wb') as fh:
        fh.write(ke.export().encode('utf-8'))

    return model
Beispiel #12
0
    def get_bel_stmts(self, filter=False):
        """Get relevant statements from the BEL large corpus.

        Runs a PyBEL neighborhood query for :py:attr:`gene_list` against
        the network file given by `bel_corpus`. Because the query can take
        a long time, the resulting statements are cached in the pickle file
        `<basename>_bel_stmts.pkl` when `basename` is set. If that file is
        present it is loaded instead of running the query.

        Parameters
        ----------
        filter : bool
            If True and :py:attr:`gene_list` has more than one gene, the
            statements are passed through ``ac.filter_gene_list`` with
            policy 'all'. Default is False. Note that the full (unfiltered)
            set of statements is what gets cached.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            List of INDRA statements extracted from the BEL large corpus.
        """
        caching = self.basename is not None
        if caching:
            cache_path = '%s_bel_stmts.pkl' % self.basename
        if caching and os.path.isfile(cache_path):
            # Cached statements are available; skip the query
            logger.info("Loading BEL statements from %s" % cache_path)
            with open(cache_path, 'rb') as fh:
                bel_statements = pickle.load(fh)
        else:
            processor = bel.process_pybel_neighborhood(
                self.gene_list, network_file=self.bel_corpus)
            bel_statements = processor.statements
            if caching:
                with open(cache_path, 'wb') as fh:
                    pickle.dump(bel_statements, fh)
        # Filtering is only applied when there is more than one gene
        if filter and len(self.gene_list) > 1:
            bel_statements = ac.filter_gene_list(bel_statements,
                                                 self.gene_list, 'all')
        return bel_statements
Beispiel #13
0
def preprocess_stmts(stmts, data_genes):
    """Filter and simplify the INDRA Statements to be put into the model.

    The statements are filtered (mutation status, non-Complex, direct,
    belief cutoff 0.95, top-level, gene list with policy 'all',
    enzyme-kinase, mod-nokinase, transcription factor), then run through
    two MechLinker passes, and finally pruned of inconsequential
    modifications and activities until the statement count stops
    decreasing.

    Returns the resulting list of statements (``ml.statements`` of the
    second MechLinker).
    """
    mutations = {'BRAF': [('V', '600', 'E')]}
    deletions = ['PTEN']
    stmts = ac.filter_mutation_status(stmts, mutations, deletions)
    stmts = ac.filter_by_type(stmts, Complex, invert=True)
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    for single_arg_filter in (ac.filter_enzyme_kinase,
                              ac.filter_mod_nokinase,
                              ac.filter_transcription_factor):
        stmts = single_arg_filter(stmts)

    # First linker pass: simplify activity types and modifications
    linker = MechLinker(stmts)
    linker.gather_explicit_activities()
    linker.reduce_activities()
    linker.gather_modifications()
    linker.reduce_modifications()
    # Only the ActiveForm statements go through preassembly
    active_forms = ac.filter_by_type(linker.statements, ActiveForm)
    other_stmts = ac.filter_by_type(linker.statements, ActiveForm,
                                    invert=True)
    active_forms = ac.run_preassembly(active_forms)

    # Second linker pass: replace activations when possible and require
    # active forms
    linker = MechLinker(active_forms + other_stmts)
    linker.gather_explicit_activities()
    linker.replace_activations()
    linker.require_active_forms()

    # Remove inconsequential PTMs and activities repeatedly; stop as soon
    # as a round does not reduce the number of statements
    previous_count = len(linker.statements)
    while True:
        linker.statements = ac.filter_inconsequential_mods(
            linker.statements, get_mod_whitelist())
        linker.statements = ac.filter_inconsequential_acts(
            linker.statements, get_mod_whitelist())
        remaining = len(linker.statements)
        if remaining >= previous_count:
            break
        previous_count = remaining
    return linker.statements
Beispiel #14
0
def assemble_sif(stmts, data, out_file):
    """Return an assembled SIF.

    The statements are restricted to high-belief, top-level Activation and
    Inhibition statements among the Ras227 genes (minus YAP1), extended
    with drug-target Inhibition statements, rewritten in terms of
    antibodies and filtered to edges between antibody/drug nodes. The SIF
    string is written to `out_file` (UTF-8 encoded) and returned.
    """
    def drug_inhibitions():
        # One Inhibition per (drug, target) pair; the drug Agent is shared
        # by all statements of the same drug
        inhibitions = []
        for drug_name, target_names in \
                process_data.get_drug_targets().items():
            drug_agent = Agent(drug_name + ':Drugs')
            for target_name in target_names:
                inhibitions.append(Inhibition(drug_agent,
                                              Agent(target_name)))
        return inhibitions

    def escape_and_in_names(statements):
        # Because of a bug in CNO, node names containing AND need to be
        # replaced (done in place on the agents)
        for stmt in statements:
            for agent in stmt.agent_list():
                if agent is not None and 'AND' in agent.name:
                    agent.name = agent.name.replace('AND', 'A_ND')

    def keep_antibody_edges(statements, policy='all'):
        # 'all': every agent must be an antibody ('_p') or drug node;
        # 'one': at least one agent must be an antibody node;
        # any other policy keeps nothing
        kept = []
        for stmt in statements:
            agents = (ag for ag in stmt.agent_list() if ag is not None)
            if policy == 'all':
                keep = all('_p' in ag.name or 'Drugs' in ag.name
                           for ag in agents)
            elif policy == 'one':
                keep = any('_p' in ag.name for ag in agents)
            else:
                keep = False
            if keep:
                kept.append(stmt)
        return kept

    def antibody_names(statements):
        # Sorted, de-duplicated names of all antibody ('_p') agents
        return sorted({ag.name
                       for stmt in statements
                       for ag in stmt.agent_list()
                       if ag is not None and '_p' in ag.name})

    # Filter for high-belief statements
    stmts = ac.filter_top_level(ac.filter_belief(stmts, 0.99))
    # Filter for Activation / Inhibition
    activations = ac.filter_by_type(stmts, Activation)
    inhibitions = ac.filter_by_type(stmts, Inhibition)
    stmts = activations + inhibitions
    # Get Ras227 and filter statements
    excluded_genes = ['YAP1']
    ras_genes = [gene for gene in process_data.get_ras227_genes()
                 if gene not in excluded_genes]
    stmts = ac.filter_gene_list(stmts, ras_genes, 'all')
    # Add the drugs inhibiting their targets as INDRA statements
    stmts = stmts + drug_inhibitions()
    escape_and_in_names(stmts)
    # Rewrite statements to replace genes with their corresponding
    # antibodies when possible
    stmts = rewrite_ab_stmts(stmts, data)
    stmts = keep_antibody_edges(stmts, 'all')
    # Get a list of the AB names that end up being covered in the prior
    # network. This is important because other ABs will need to be taken
    # out of the MIDAS file to work.
    pkn_abs = antibody_names(stmts)
    print('Boolean PKN contains these antibodies: %s' % ', '.join(pkn_abs))
    # Make the SIF model
    assembler = SifAssembler(stmts)
    assembler.make_model(use_name_as_key=True)
    sif_str = assembler.print_model()
    with open(out_file, 'wb') as fh:
        fh.write(sif_str.encode('utf-8'))
    # Make the MIDAS data file used for training the model; the return
    # value is not used here
    midas_data = process_data.get_midas_data(data, pkn_abs)
    return sif_str
Beispiel #15
0
        #stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
    else:
        #prior_stmts = build_prior(data_genes, pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.load_statements(pjoin(outf, 'prior.pkl'))
        prior_stmts = ac.map_grounding(prior_stmts,
                                       save=pjoin(outf, 'gmapped_prior.pkl'))
        reading_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
        reading_stmts = ac.map_grounding(reading_stmts,
                                    save=pjoin(outf, 'gmapped_reading.pkl'))
        stmts = prior_stmts + reading_stmts

        stmts = ac.filter_grounded_only(stmts)
        stmts = ac.filter_genes_only(stmts, specific_only=False)
        stmts = ac.filter_human_only(stmts)
        stmts = ac.expand_families(stmts)
        stmts = ac.filter_gene_list(stmts, data_genes, 'one')
        stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
        stmts = ac.run_preassembly(stmts, return_toplevel=False,
                                   save=pjoin(outf, 'preassembled.pkl'))

    assemble_models = []
    assemble_models.append('sif')
    assemble_models.append('pysb')
    assemble_models.append('cx')

    ### PySB assembly
    if 'pysb' in assemble_models:
        pysb_model = assemble_pysb(stmts, data_genes,
                                   pjoin(outf, 'korkut_model_pysb.py'))
    ### SIF assembly
    if 'sif' in assemble_models:
Beispiel #16
0
    def get_biopax_stmts(self,
                         filter=False,
                         query='pathsbetween',
                         database_filter=None):
        """Get relevant statements from Pathway Commons.

        Queries Pathway Commons for the genes in :py:attr:`gene_list` and
        uses the results to build statements. When `basename` is set, two
        files are cached: the list of statements, in
        `<basename>_biopax_stmts.pkl`, and the OWL model, in
        `<basename>_pc_pathsbetween.owl`. If the statement pickle exists,
        its content is returned as is (no query and no filtering); if only
        the OWL file exists, it is processed instead of querying Pathway
        Commons again.

        Parameters
        ----------
        filter : Optional[bool]
            If True, the statements are passed through
            ``ac.filter_gene_list`` with :py:attr:`gene_list`, using policy
            'one' when there is more than one gene and 'all' otherwise.
            Not applied to statements loaded from the statement pickle.
            Default is False.
        query : Optional[str]
            Defines what type of query is executed. The two options are
            'pathsbetween' which finds paths between the given list of genes
            and only works if more than 1 gene is given, and 'neighborhood'
            which searches the immediate neighborhood of each given gene.
            With fewer than 2 genes a 'pathsbetween' query is replaced by a
            'neighborhood' query. Note that for pathsbetween queries with
            more than 60 genes, a block size of 60 is passed to the query.
        database_filter: Optional[list[str]]
            A list of PathwayCommons databases to include in the query.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            List of INDRA statements extracted from Pathway Commons. An
            empty list is returned for an invalid `query` value.
        """
        use_cache = self.basename is not None
        if use_cache:
            stmt_cache = '%s_biopax_stmts.pkl' % self.basename
            owl_cache = '%s_pc_pathsbetween.owl' % self.basename
        # A cached statement pickle short-circuits everything else
        if use_cache and os.path.exists(stmt_cache):
            logger.info("Loading Biopax statements from %s" % stmt_cache)
            with open(stmt_cache, 'rb') as fh:
                cached_stmts = pickle.load(fh)
            return cached_stmts
        if use_cache and os.path.exists(owl_cache):
            # Reuse the cached OWL file instead of querying the Web API
            logger.info("Loading Biopax from OWL file %s" %
                        owl_cache)
            bp = biopax.process_owl(owl_cache)
        else:
            if (len(self.gene_list) < 2) and (query == 'pathsbetween'):
                logger.warning('Using neighborhood query for one gene.')
                query = 'neighborhood'
            if query == 'pathsbetween':
                block_size = 60 if len(self.gene_list) > 60 else None
                bp = biopax.process_pc_pathsbetween(
                    self.gene_list,
                    database_filter=database_filter,
                    block_size=block_size)
            elif query == 'neighborhood':
                bp = biopax.process_pc_neighborhood(
                    self.gene_list, database_filter=database_filter)
            else:
                logger.error('Invalid query type: %s' % query)
                return []
            if use_cache:
                bp.save_model(owl_cache)
        # The pickle always receives the unfiltered statements
        if use_cache:
            with open(stmt_cache, 'wb') as fh:
                pickle.dump(bp.statements, fh)
        if not filter:
            return bp.statements
        policy = 'one' if len(self.gene_list) > 1 else 'all'
        return ac.filter_gene_list(bp.statements, self.gene_list, policy)
# Script entry point: for each target gene, collect statements, render them
# to an HTML page, upload the page to S3, and finally build a drug list
# from all collected statements.
if __name__ == '__main__':
    # NOTE(review): db_curations and tp are not referenced again in this
    # block; presumably helpers such as get_statements read them as module
    # globals - confirm before renaming or removing.
    db = get_db('primary')
    db_curations = get_curations(db=db)
    tp = tas.process_from_web()
    # Alternative target list, currently disabled:
    #targets = ['TMPRSS2', 'ACE2', 'FURIN', 'CTSB', 'CTSL']
    targets = [
        'PIKFYVE', 'INPP5E', 'PIK3C2A', 'PIK3C2B', 'PIK3C2G', 'PI4K2A',
        'PI4K2B', 'PI4KB', 'EHD3', 'PIK3C3'
    ]
    all_stmts = []
    # Passed to make_drug_list below; nothing in this block adds entries
    all_ev_counts = {}
    # Load the pickled statements once and pass them through filter_neg
    with open('ctd_drugbank_tas_pikfyve.pkl', 'rb') as f:
        all_ctd_stmts = pickle.load(f)
        all_ctd_stmts = filter_neg(all_ctd_stmts)
    for target in targets:
        stmts = get_statements(target)
        fname = '%s.html' % target
        # Add the pickled statements selected for this target (policy 'one')
        ctd_stmts = ac.filter_gene_list(all_ctd_stmts, [target], policy='one')
        stmts += ctd_stmts
        all_stmts += stmts
        make_html(stmts, fname)
        # A new S3 client is created on every iteration
        s3_client = boto3.client('s3')
        # Read back the HTML file written for this target and upload it
        with open(fname, 'r') as fh:
            html_str = fh.read()
            s3_client.put_object(Bucket='indra-covid19',
                                 Key='drugs_for_target/%s' % fname,
                                 Body=html_str.encode('utf-8'),
                                 ContentType='text/html',
                                 ACL='public-read')
    make_drug_list(all_stmts, all_ev_counts)
Beispiel #18
0
    # The file in which the preassembled statements will be saved
    pre_stmts_file = prefixed_pkl('preassembled')
    if reassemble:
        # Load various files that were previously produced
        sources = [
            'indradb', 'trips', 'bel', 'biopax', 'phosphosite', 'r3', 'sparser'
        ]
        stmts = []
        for source in sources:
            stmts += ac.load_statements(prefixed_pkl(source))
        stmts = ac.filter_no_hypothesis(stmts)
        # Fix grounding and filter to grounded entities and for proteins,
        # filter to the human ones
        stmts = ac.map_grounding(stmts)
        stmts = ac.filter_grounded_only(stmts)
        stmts = ac.filter_human_only(stmts)
        # Combinatorially expand protein families
        stmts = ac.expand_families(stmts)
        # Apply a strict filter to statements based on the gene names
        stmts = ac.filter_gene_list(stmts, gene_names, 'all')
        # Fix errors in references to protein sequences
        stmts = ac.map_sequence(stmts)
        # Run preassembly and save result
        stmts = ac.run_preassembly(stmts, return_toplevel=False)
        ac.dump_statements(stmts, pre_stmts_file)

    # Load the preassembled statements
    stmts = ac.load_statements(pre_stmts_file)
    # Run assembly into a PySB model
    assemble_pysb.assemble_pysb(stmts, gene_names, contextualize=True)
Beispiel #19
0
                                       save=pjoin(outf, 'gmapped_prior.pkl'))
        reach_stmts = ac.load_statements(pjoin(outf, 'phase3_stmts.pkl'))
        reach_stmts = ac.filter_no_hypothesis(reach_stmts)
        #extra_stmts = ac.load_statements(pjoin(outf, 'extra_stmts.pkl'))
        extra_stmts = read_extra_sources(pjoin(outf, 'extra_stmts.pkl'))
        reading_stmts = reach_stmts + extra_stmts
        reading_stmts = ac.map_grounding(reading_stmts,
                                         save=pjoin(outf,
                                                    'gmapped_reading.pkl'))
        stmts = prior_stmts + reading_stmts + extra_stmts

        stmts = ac.filter_grounded_only(stmts)
        stmts = ac.filter_genes_only(stmts, specific_only=False)
        stmts = ac.filter_human_only(stmts)
        stmts = ac.expand_families(stmts)
        stmts = ac.filter_gene_list(stmts, data_genes, 'one')
        stmts = ac.map_sequence(stmts, save=pjoin(outf, 'smapped.pkl'))
        #stmts = ac.load_statements(pjoin(outf, 'smapped.pkl'))
        stmts = ac.run_preassembly(stmts,
                                   return_toplevel=False,
                                   save=pjoin(outf, 'preassembled.pkl'),
                                   poolsize=4)

    ### PySB assembly
    if 'pysb' in assemble_models:
        pysb_model = assemble_pysb(stmts, data_genes,
                                   pjoin(outf, 'korkut_model_pysb.py'))
    ### SIF assembly
    if 'sif' in assemble_models:
        sif_str = assemble_sif(stmts, data, pjoin(outf,
                                                  'PKN-korkut_all_ab.sif'))
Beispiel #20
0
    def get_biopax_stmts(self, filter=False, query='pathsbetween',
                         database_filter=None):
        """Get relevant statements from Pathway Commons.

        Queries Pathway Commons for the genes in :py:attr:`gene_list` and
        builds statements from the result. When `basename` is set, this
        function caches two files: the list of statements, in
        `<basename>_biopax_stmts.pkl`, and the OWL model, in
        `<basename>_pc_pathsbetween.owl`. An existing statement pickle is
        returned directly (without querying or filtering); an existing OWL
        file is processed instead of querying Pathway Commons again.

        Parameters
        ----------
        filter : Optional[bool]
            If True, the statements are passed through
            ``ac.filter_gene_list`` with :py:attr:`gene_list`, using policy
            'one' when there is more than one gene and 'all' otherwise.
            Not applied to statements loaded from the statement pickle.
            Default is False.
        query : Optional[str]
            Defines what type of query is executed. The two options are
            'pathsbetween' which finds paths between the given list of genes
            and only works if more than 1 gene is given, and 'neighborhood'
            which searches the immediate neighborhood of each given gene.
            With fewer than 2 genes a 'pathsbetween' query falls back to
            'neighborhood'. Note that for pathsbetween queries with more
            than 60 genes, a block size of 60 is passed to the query.
        database_filter: Optional[list[str]]
            A list of PathwayCommons databases to include in the query.

        Returns
        -------
        list of :py:class:`indra.statements.Statement`
            List of INDRA statements extracted from Pathway Commons. An
            empty list is returned for an invalid `query` value.
        """
        has_basename = self.basename is not None
        # Cache file names only exist when a basename was given
        if has_basename:
            pkl_path = '%s_biopax_stmts.pkl' % self.basename
            owl_path = '%s_pc_pathsbetween.owl' % self.basename

        # 1) Statement pickle present: return its content unchanged
        if has_basename and os.path.isfile(pkl_path):
            logger.info("Loading Biopax statements from %s" % pkl_path)
            with open(pkl_path, 'rb') as fh:
                loaded = pickle.load(fh)
            return loaded

        # 2) OWL file present: process it; otherwise query the Web API
        if has_basename and os.path.isfile(owl_path):
            logger.info("Loading Biopax from OWL file %s" % owl_path)
            bp = biopax.process_owl(owl_path)
        else:
            if (len(self.gene_list) < 2) and (query == 'pathsbetween'):
                logger.warning('Using neighborhood query for one gene.')
                query = 'neighborhood'
            if query == 'pathsbetween':
                block_size = None
                if len(self.gene_list) > 60:
                    block_size = 60
                bp = biopax.process_pc_pathsbetween(
                    self.gene_list, database_filter=database_filter,
                    block_size=block_size)
            elif query == 'neighborhood':
                bp = biopax.process_pc_neighborhood(
                    self.gene_list, database_filter=database_filter)
            else:
                logger.error('Invalid query type: %s' % query)
                return []
            if has_basename:
                bp.save_model(owl_path)

        # 3) Cache the unfiltered statements
        if has_basename:
            with open(pkl_path, 'wb') as fh:
                pickle.dump(bp.statements, fh)

        # 4) Optionally filter by the gene list
        if filter:
            policy = 'all' if len(self.gene_list) <= 1 else 'one'
            return ac.filter_gene_list(bp.statements, self.gene_list,
                                       policy)
        return bp.statements
Beispiel #21
0
def assemble_pysb(stmts, data_genes, contextualize=False):
    """Assemble PySB models from stmts and pickle them to prefixed files.

    The statements are filtered and simplified, dumped as 'pysb_stmts',
    extended with drug-target statements and assembled into a generic
    model that is pickled as 'pysb_model'. If `contextualize` is True, a
    further statement dump and model pickle are written per cell line.

    Parameters
    ----------
    stmts : list
        Statements to assemble; passed through the ``ac`` filters below.
    data_genes : list
        Gene names handed to ``ac.filter_gene_list`` (policy 'all') and to
        the contextualization helpers.
    contextualize : bool
        If True, also build one model per entry of `cell_lines`.
        Default is False.

    Returns
    -------
    None
        All results are written to files; nothing is returned.
    """
    # Filter the INDRA Statements to be put into the model
    stmts = ac.filter_by_type(stmts, Complex, invert=True)
    stmts = ac.filter_direct(stmts)
    stmts = ac.filter_belief(stmts, 0.95)
    stmts = ac.filter_top_level(stmts)
    # Strip the extraneous supports/supported by here
    # (return value is not used; presumably modifies stmts in place - confirm)
    strip_supports(stmts)
    stmts = ac.filter_gene_list(stmts, data_genes, 'all')
    stmts = ac.filter_enzyme_kinase(stmts)
    stmts = ac.filter_mod_nokinase(stmts)
    stmts = ac.filter_transcription_factor(stmts)
    # Simplify activity types
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.reduce_activities()
    ml.gather_modifications()
    ml.reduce_modifications()
    stmts = normalize_active_forms(ml.statements)
    # Replace activations when possible
    ml = MechLinker(stmts)
    ml.gather_explicit_activities()
    ml.replace_activations()
    # Require active forms
    ml.require_active_forms()
    num_stmts = len(ml.statements)
    # Repeat the pruning until a round no longer reduces the statement count
    while True:
        # Remove inconsequential PTMs
        ml.statements = ac.filter_inconsequential_mods(ml.statements,
                                                       get_mod_whitelist())
        ml.statements = ac.filter_inconsequential_acts(ml.statements,
                                                       get_mod_whitelist())
        if num_stmts <= len(ml.statements):
            break
        num_stmts = len(ml.statements)
    stmts = ml.statements
    # Save the Statements here (before the drug-target Statements are added)
    ac.dump_statements(stmts, prefixed_pkl('pysb_stmts'))


    # Add drug target Statements
    # NOTE: stmts is the same object as ml.statements at this point, so if
    # it is a list this += extends ml.statements in place as well.
    drug_target_stmts = get_drug_target_statements()
    stmts += drug_target_stmts

    # Just generate the generic model
    pa = PysbAssembler()
    pa.add_statements(stmts)
    model = pa.make_model()
    with open(prefixed_pkl('pysb_model'), 'wb') as f:
        pickle.dump(model, f)

    # Run this extra part only if contextualize is set to True
    if not contextualize:
        return

    # Cell lines listed here skip both contextualization steps below and
    # use the generic statements as is
    cell_lines_no_data = ['COLO858', 'K2', 'MMACSF', 'MZ7MEL', 'WM1552C']
    # cell_lines is not defined in this function; it is expected to be
    # available at module level
    for cell_line in cell_lines:
        if cell_line not in cell_lines_no_data:
            stmtsc = contextualize_stmts(stmts, cell_line, data_genes)
        else:
            stmtsc = stmts
        pa = PysbAssembler()
        pa.add_statements(stmtsc)
        model = pa.make_model()
        if cell_line not in cell_lines_no_data:
            contextualize_model(model, cell_line, data_genes)
        # One statement dump and one model pickle per cell line
        ac.dump_statements(stmtsc, prefixed_pkl('pysb_stmts_%s' % cell_line))
        with open(prefixed_pkl('pysb_model_%s' % cell_line), 'wb') as f:
            pickle.dump(model, f)