Example #1
from collections import defaultdict

import pandas

from indra.resources import get_resource_path
from indra.util import write_unicode_csv


def update_lspci():
    # We first create a dict of LSPCIs and their members but only for ones
    # that actually have TAS statements corresponding to them
    from indra.sources import tas
    tp = tas.process_from_web(affinity_class_limit=10)
    lspci_members = defaultdict(set)
    for stmt in tp.statements:
        if 'LSPCI' not in stmt.subj.db_refs:
            continue
        for k, v in stmt.subj.db_refs.items():
            if k in {'TEXT', 'LSPCI'}:
                continue
            lspci_members[stmt.subj.db_refs.get('LSPCI')].add((k, v))

    # We then process the names table in a way that we always prioritize the
    # first row for each LSPCI since the table is pre-sorted by priority
    df = pandas.read_csv('lsp_compound_names.csv', dtype={'lspci_id': str})
    lspcid_names = {}
    for _, row in df.iterrows():
        if row['lspci_id'] not in lspcid_names:
            lspcid_names[row['lspci_id']] = row['name']

    # We can now combine the two sources filtering to only entries that have
    # names
    rows = [['lspcid', 'name', 'members']]
    for lspcid, members in lspci_members.items():
        if lspcid not in lspcid_names:
            continue
        row = [lspcid, lspcid_names[lspcid],
               '|'.join(sorted(['%s:%s' % member for member in members]))]
        rows.append(row)
    write_unicode_csv(get_resource_path('lspci.tsv'), rows, delimiter='\t')
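
A minimal consumer sketch for the file written above; load_lspci_members is a hypothetical helper, but the column layout and pipe-joined members field match what update_lspci writes.

import csv

def load_lspci_members():
    mapping = {}
    with open(get_resource_path('lspci.tsv'), 'r') as fh:
        reader = csv.reader(fh, delimiter='\t')
        next(reader)  # skip the ['lspcid', 'name', 'members'] header
        for lspcid, name, members in reader:
            # members holds pipe-joined NS:ID pairs collected from TAS
            mapping[lspcid] = (name, set(members.split('|')))
    return mapping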
Example #2
from indra.sources import tas


def find_drugs_for_genes(search_terms, drug_gene_stmts=None):
    """Return a list of drugs targeting at least one gene from a list of genes.

    Parameters
    ----------
    search_terms : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms for genes.
    drug_gene_stmts : list of indra.statements.Statement, optional
        INDRA Statements linking drugs to their gene targets. If not
        provided, statements are loaded from the TAS resource via
        tas.process_from_web().

    Returns
    -------
    drug_terms : list of :py:class:`emmaa.priors.SearchTerm`
        List of search terms of drugs targeting at least one of the input
        genes.
    """
    if not drug_gene_stmts:
        drug_gene_stmts = tas.process_from_web().statements
    drug_terms = []
    already_added = set()
    for search_term in search_terms:
        if search_term.type == 'gene':
            hgnc_id = search_term.db_refs['HGNC']
            drugs = get_drugs_for_gene(drug_gene_stmts, hgnc_id)
            for drug in drugs:
                if drug.name not in already_added:
                    drug_terms.append(drug)
                    already_added.add(drug.name)
    return sorted(drug_terms, key=lambda x: x.name)
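
A minimal usage sketch, assuming the emmaa.priors.SearchTerm constructor as used in Example #6 below; the HGNC and UniProt identifiers shown are EGFR's.

from emmaa.priors import SearchTerm

# Build one gene search term and find drugs that target it; TAS statements
# are downloaded automatically because drug_gene_stmts is omitted
egfr_term = SearchTerm(type='gene', name='EGFR', search_term='"EGFR"',
                       db_refs={'HGNC': '3236', 'UP': 'P00533'})
drug_terms = find_drugs_for_genes([egfr_term])
print([term.name for term in drug_terms])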
Example #3
from indra.sources.tas import process_from_web


def test_processor():
    tp = process_from_web(affinity_class_limit=10)
    assert tp
    assert tp.statements
    num_stmts = len(tp.statements)
    # This is the total number of statements about human genes
    assert num_stmts == 1123724, num_stmts
    assert all(len(s.evidence) >= 1 for s in tp.statements), \
        'Some statements lack any evidence'
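
Along the same lines, a quick exploration sketch that tallies the 'class_min' affinity class annotation (the same evidence annotation Example #9 filters on below) across the processed statements:

from collections import Counter

from indra.sources.tas import process_from_web

tp = process_from_web(affinity_class_limit=10)
class_counts = Counter(ev.annotations['class_min']
                       for stmt in tp.statements
                       for ev in stmt.evidence)
# Most common affinity classes first, e.g. 'Kd < 100nM'
print(class_counts.most_common())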
Example #4
from indra.sources import tas


def find_drugs_for_genes(node_list):
    """Return a list of drugs targeting gene nodes."""
    tas_statements = tas.process_from_web().statements
    already_added = set()
    drug_terms = []
    for node in node_list:
        if node.startswith('HGNC:'):
            hgnc_id = node.split(':')[1]
            drugs = get_drugs_for_gene(tas_statements, hgnc_id)
            for drug in drugs:
                if drug.name not in already_added:
                    drug_terms.append(drug)
                    already_added.add(drug.name)
    return sorted(drug_terms, key=lambda x: x.name)
Example #5
    def _get_statements(self):
        from indra.sources import tas
        # The settings we use here are justified as follows:
        # - only affinities that indicate binding are included
        # - only agents that have some kind of name available are
        #   included; ones whose name is just an ID are excluded
        # - we do not require full standardization, thereby allowing a
        #   set of drugs to be extracted for which we have a name from
        #   ChEMBL, HMS-LINCS, or DrugBank
        logger.info('Processing TAS from web')
        tp = tas.process_from_web(affinity_class_limit=2,
                                  named_only=True,
                                  standardized_only=False)
        logger.info('Expanding evidences and deduplicating')
        filtered_stmts = list(_expanded(tp.statements))
        unique_stmts, _ = extract_duplicates(filtered_stmts,
                                             KeyFunc.mk_and_one_ev_src)
        return unique_stmts
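
For intuition, a rough sketch of what the deduplication step accomplishes, using only the standard INDRA Statement API; extract_duplicates with KeyFunc.mk_and_one_ev_src additionally keys on each statement retaining at least one evidence source.

from collections import defaultdict

def deduplicate_by_hash(stmts):
    # Group statements by their matches-key hash and keep one
    # representative per group
    groups = defaultdict(list)
    for stmt in stmts:
        groups[stmt.get_hash()].append(stmt)
    return [group[0] for group in groups.values()]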
Example #6
    def make_search_terms(self, drug_gene_stmts=None):
        """Generate search terms from the gene list."""
        if not drug_gene_stmts:
            drug_gene_stmts = tas.process_from_web().statements
        already_added = set()
        terms = []
        for gene in self.gene_list:
            # Gene search term
            agent = agent_from_gene_name(gene)
            term = SearchTerm(type='gene', name=agent.name,
                              search_term=f'"{agent.name}"',
                              db_refs={'HGNC': agent.db_refs['HGNC'],
                                       'UP': agent.db_refs['UP']})
            terms.append(term)

            # Drug search term
            drug_terms = get_drugs_for_gene(drug_gene_stmts,
                                            agent.db_refs['HGNC'])
            for drug_term in drug_terms:
                if drug_term.name not in already_added:
                    terms.append(drug_term)
                    already_added.add(drug_term.name)
        self.search_terms = terms
        return terms
Example #7
def dump_drug_list(drug_list, fh):
    # Hypothetical name and signature: the original definition line was
    # cut off in this snippet. It writes each compound entry as a TSV row
    # tagged with its source.
    for compound in drug_list:
            fh.write(
                '%s\t%s\t%s\n' %
                (compound[0], compound[1], 'INDRA (text mining/databases)'))


misgrounding_map = {
    'CTSL': ['MEP'],
    'CTSB': ['APPs'],
    'FURIN': ['pace', 'Fur']
}
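
# A small sketch of how a map like this might be applied;
# filter_misgroundings is a hypothetical helper that drops statements
# whose agent text is listed as a misgrounding for the gene it was
# mapped to.
def filter_misgroundings(stmts, misgrounding_map):
    filtered = []
    for stmt in stmts:
        # Keep the statement only if no agent's original text is a known
        # misgrounding for the gene it was mapped to
        bad = any(
            agent is not None and
            agent.db_refs.get('TEXT') in misgrounding_map.get(agent.name, [])
            for agent in stmt.agent_list()
        )
        if not bad:
            filtered.append(stmt)
    return filtered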

if __name__ == '__main__':
    db = get_db('primary')
    db_curations = get_curations(db=db)
    tp = tas.process_from_web()
    #targets = ['TMPRSS2', 'ACE2', 'FURIN', 'CTSB', 'CTSL']
    targets = [
        'PIKFYVE', 'INPP5E', 'PIK3C2A', 'PIK3C2B', 'PIK3C2G', 'PI4K2A',
        'PI4K2B', 'PI4KB', 'EHD3', 'PIK3C3'
    ]
    all_stmts = []
    all_ev_counts = {}
    with open('ctd_drugbank_tas_pikfyve.pkl', 'rb') as f:
        all_ctd_stmts = pickle.load(f)
        all_ctd_stmts = filter_neg(all_ctd_stmts)
    for target in targets:
        stmts = get_statements(target)
        fname = '%s.html' % target
        ctd_stmts = ac.filter_gene_list(all_ctd_stmts, [target], policy='one')
        stmts += ctd_stmts
        for stmt in stmts:
            normalize_drug(stmt.subj)
Example #8
def choose_best_stmt(stmt_group):
    stmts = sorted(stmt_group,
                   key=lambda x: (len(score_drug(x.subj)),
                                  len(x.subj.name)))
    if len(stmt_group) > 1:
        print('Choosing: %s (%s) from' %
              (stmts[0].subj, score_drug(stmts[0].subj)))
        for stmt in stmts:
            print(stmt.subj, score_drug(stmt.subj))
        print()
    return stmts[0]


if __name__ == '__main__':
    tp = tas.process_from_web(affinity_class_limit=2, named_only=True,
                              standardized_only=False)
    grouped = defaultdict(list)
    for stmt in tp.statements:
        grouped[(stmt.subj.db_refs['LSPCI'], stmt.obj.name)].append(stmt)

    opt_stmts = []
    for (lspci, obj_name), stmts in grouped.items():
        opt_stmt = choose_best_stmt(stmts)
        opt_stmts.append(opt_stmt)

    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         os.pardir, 'resources', 'tas_stmts_filtered.pkl')
    with open(fname, 'wb') as fh:
        pickle.dump(opt_stmts, fh)
Example #9
def remove_source_evidence(stmts, sources):
    # Hypothetical name and signature: the original definition line was
    # cut off in this snippet. The function strips evidence from the given
    # source APIs and drops statements left without any evidence.
    new_stmts = []
    for stmt in stmts:
        new_ev = [e for e in stmt.evidence if e.source_api not in sources]
        if not new_ev:
            continue
        stmt.evidence = new_ev
        new_stmts.append(stmt)
    return new_stmts


if __name__ == '__main__':

    # Loading preliminary data structures
    db = get_db('primary')
    db_curations = get_curations(db=db)

    tas_processor = tas.process_from_web()
    # List of entities that are not of interest for getting INDRA
    # Statements, e.g., ATP, oxygen
    with open('black_list.txt', 'r') as fh:
        black_list = {line.strip() for line in fh.readlines()}

    with open('minerva_disease_map_indra_ids.csv', 'r') as fh:
        groundings = [line.strip().split(',') for line in fh.readlines()]

    with open('../../grounding_map.json', 'r') as fh:
        grounding_map = json.load(fh)
    #####################

    # Querying for and assembling statements
    all_stmts = []
    for db_ns, db_id, name in groundings:
Example #10
def get_channel_regulators(stmts_by_channel, channel, reg_agent):
    # Hypothetical name and signature: the original definition line is
    # missing from this snippet. The body collects Inhibition and
    # DecreaseAmount statements whose subject matches the regulator agent.
    reg_stmts = []
    for stmt in stmts_by_channel[channel]:
        if isinstance(stmt, (Inhibition, DecreaseAmount)):
            if stmt.subj.name == reg_agent.name:
                reg_stmts.append(stmt)
    return reg_stmts


def assemble_html(stmts, fname_key):
    ha = HtmlAssembler(stmts)
    ha.make_model()
    ha.save_model('%s.html' % fname_key)


if __name__ == '__main__':
    tp = tas.process_from_web(affinity_class_limit=10)

    neg_regs = defaultdict(dict)
    non_neg_regs = defaultdict(dict)
    for channel, (stmts, _, _) in stmts_by_channel.items():
        stmts = [
            s for s in stmts if isinstance(s, (Inhibition, DecreaseAmount))
        ]
        stmts = [s for s in stmts if s.obj.name == channel]
        for stmt in stmts:
            neg_regs[channel][get_key(stmt.subj)] = stmt.subj

    for stmt in tp.statements:
        if stmt.obj.name in neg_regs:
            if stmt.evidence[0].annotations['class_min'] in \
                    {'100nM < Kd < 1uM', 'Kd < 100nM'}: