def add_hms_lincs_xrefs(self):
    """Add xref edges between HMS-LINCS small molecules and their other IDs.

    For every small molecule ID known to the LINCS client, the references
    returned by ``get_small_molecule_refs`` are turned into a pair of
    directed edges (one in each direction) between the HMS-LINCS node,
    labeled by the part of the ID before the first dash, and the node of
    the referenced namespace/ID. All edges carry ``type='xref'`` and
    ``source='hms-lincs'`` and are handed to ``self.add_edges_from`` in a
    single call.

    IDs whose second dash-separated component is ``'999'`` are skipped.
    """
    from indra.databases.lincs_client import LincsClient

    lc = LincsClient()
    edges = []
    # Only the IDs are needed; the references are looked up via the client.
    for hmsl_id in lc._sm_data:
        # Index into the split result instead of unpacking into exactly two
        # names, so an ID with more than one dash cannot raise ValueError.
        id_parts = hmsl_id.split('-')
        hmsl_base_id = id_parts[0]
        suffix = id_parts[1] if len(id_parts) > 1 else None
        if suffix == '999':
            continue
        refs = lc.get_small_molecule_refs(hmsl_id)
        for ref_ns, ref_id in refs.items():
            hmsl_node = self.label('HMS-LINCS', hmsl_base_id)
            ref_node = self.label(ref_ns, ref_id)
            # The refs may include the HMS-LINCS ID itself; an xref from a
            # node to itself carries no information, so do not add it.
            if ref_node == hmsl_node:
                continue
            edges.append((hmsl_node, ref_node,
                          {'type': 'xref', 'source': 'hms-lincs'}))
            edges.append((ref_node, hmsl_node,
                          {'type': 'xref', 'source': 'hms-lincs'}))
    self.add_edges_from(edges)
def __init__(self, lincs_data):
    """Keep the input rows, create a LINCS client and process every row.

    Parameters
    ----------
    lincs_data : iterable
        The rows to process; each one is passed to ``_process_line``.
    """
    self._data = lincs_data
    self._lc = LincsClient()
    self.statements = []
    # Every entry of the input is processed; nothing is dropped here.
    for entry in self._data:
        self._process_line(entry)
class TasProcessor(object):
    """A processor for the Target Affinity Spectrum data table.

    Parameters
    ----------
    data : iterable
        Rows of the table. Each row is indexed by the keys 'class_min',
        'hms_id', 'approved_symbol' and 'gene_id'.
    affinity_class_limit : int
        Rows whose integer 'class_min' is greater than this are ignored.

    Attributes
    ----------
    statements : list
        One Inhibition statement per processed row whose target has an
        HGNC reference.
    """

    def __init__(self, data, affinity_class_limit):
        self._data = data
        self._lc = LincsClient()
        self.affinity_class_limit = affinity_class_limit
        self.statements = []
        for entry in data:
            # Rows above the affinity class limit are not processed
            if int(entry['class_min']) > affinity_class_limit:
                continue
            self._process_row(entry)

    def _process_row(self, row):
        """Turn one table row into an Inhibition statement, if possible."""
        agent_drug = self._extract_drug(row['hms_id'])
        agent_target = self._extract_protein(row['approved_symbol'],
                                             row['gene_id'])
        evidence = self._make_evidence(row['class_min'])
        # NOTE: there are several entries in this data set that refer to
        # non-human Entrez genes, e.g.
        # https://www.ncbi.nlm.nih.gov/gene/3283880
        # We skip these for now because resources for Entrez-based
        # mappings for non-human genes are not integrated, and would cause
        # pre-assembly issues.
        if 'HGNC' in agent_target.db_refs:
            stmt = Inhibition(agent_drug, agent_target, evidence=evidence)
            self.statements.append(stmt)

    def _extract_drug(self, hms_id):
        """Return an Agent for the small molecule with the given HMS ID."""
        db_refs = self._lc.get_small_molecule_refs(hms_id)
        drug_name = self._lc.get_small_molecule_name(hms_id)
        if 'PUBCHEM' in db_refs:
            # Add a ChEBI reference when PubChem maps to one
            mapped = chebi_client.get_chebi_id_from_pubchem(
                db_refs['PUBCHEM'])
            if mapped:
                db_refs['CHEBI'] = 'CHEBI:%s' % mapped
        return Agent(drug_name, db_refs=db_refs)

    def _extract_protein(self, name, gene_id):
        """Return an Agent for the target given its name and Entrez ID."""
        db_refs = {'EGID': gene_id}
        hgnc_id = hgnc_client.get_hgnc_from_entrez(gene_id)
        if hgnc_id is None:
            # No HGNC mapping: keep the name from the table as it is
            return Agent(name, db_refs=db_refs)
        db_refs['HGNC'] = hgnc_id
        uniprot_id = hgnc_client.get_uniprot_id(hgnc_id)
        if uniprot_id:
            db_refs['UP'] = uniprot_id
        # With an HGNC ID available, the standard gene name is used
        standard_name = hgnc_client.get_hgnc_name(hgnc_id)
        return Agent(standard_name, db_refs=db_refs)

    def _make_evidence(self, class_min):
        """Return an Evidence annotated with the mapped affinity class."""
        return Evidence(source_api='tas',
                        epistemics={'direct': True},
                        annotations={'class_min': CLASS_MAP[class_min]})
def __init__(self, data, affinity_class_limit):
    """Store the inputs, create a LINCS client and process eligible rows.

    Parameters
    ----------
    data : iterable
        Rows to process; each row is indexed by the key 'class_min'.
    affinity_class_limit : int
        Rows whose integer 'class_min' is greater than this are ignored.
    """
    self._data = data
    self._lc = LincsClient()
    self.affinity_class_limit = affinity_class_limit
    self.statements = []
    for entry in data:
        # Rows above the affinity class limit are not processed
        if int(entry['class_min']) > affinity_class_limit:
            continue
        self._process_row(entry)
def add_hms_lincs_nodes(self):
    """Add one node per HMS-LINCS small molecule known to the LINCS client.

    Each node is labeled by the part of the small molecule ID before the
    first dash and carries the entry's ``'Name'`` value as its ``name``
    attribute. All nodes are handed to ``self.add_nodes_from`` in a single
    call.

    IDs whose second dash-separated component is ``'999'`` are skipped.
    """
    from indra.databases.lincs_client import LincsClient

    lc = LincsClient()
    nodes = []
    for hmsl_id, data in lc._sm_data.items():
        # Index into the split result instead of unpacking into exactly two
        # names, so an ID with more than one dash cannot raise ValueError.
        id_parts = hmsl_id.split('-')
        hmsl_base_id = id_parts[0]
        suffix = id_parts[1] if len(id_parts) > 1 else None
        if suffix == '999':
            continue
        nodes.append((self.label('HMS-LINCS', hmsl_base_id),
                      {'name': data['Name']}))
    self.add_nodes_from(nodes)
class LincsProcessor(object):
    """Processor for the HMS LINCS drug target dataset.

    Parameters
    ----------
    lincs_data : list[dict]
        A list of dicts with keys set by the header of the csv, and values
        from the data in the csv.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of indra statements extracted from the CSV file.
    """

    # Compiled once for the class. A raw string is used so that ``\s`` and
    # ``\d`` reach the regex engine without being invalid string escapes.
    _PMID_PATTERN = re.compile(r'(?:pmid|pubmed\s+id):\s+(\d+)',
                               re.IGNORECASE)

    def __init__(self, lincs_data):
        self._data = lincs_data
        self._lc = LincsClient()

        # Process every entry that was passed in
        self.statements = []
        for line in self._data:
            self._process_line(line)

    def _process_line(self, line):
        """Add an Inhibition statement for a line that has a usable target."""
        drug = self._extract_drug(line)
        prot = self._extract_protein(line)
        if prot is None:
            return
        evidence = self._make_evidence(line)
        self.statements.append(Inhibition(drug, prot, evidence=evidence))

    def _extract_drug(self, line):
        """Return an Agent for the small molecule described by the line."""
        drug_name = line['Small Molecule Name']
        lincs_id = line['Small Molecule HMS LINCS ID']
        refs = self._lc.get_small_molecule_refs(lincs_id)
        if 'PUBCHEM' in refs:
            chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
            if chebi_id:
                refs['CHEBI'] = chebi_id
        return Agent(drug_name, db_refs=refs)

    def _extract_protein(self, line):
        """Return an Agent for the target protein, or None without a UP ID."""
        # Extract key information from the lines.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Get available db-refs.
        db_refs = {}
        if prot_id:
            db_refs.update(self._lc.get_protein_refs(prot_id))
        # Since the resource only gives us an UP ID (not HGNC), we
        # try to get that and standardize the name to the gene name
        up_id = db_refs.get('UP')
        if not up_id:
            # In some cases lines are missing protein information in which
            # case we return None
            return None

        hgnc_id = uniprot_client.get_hgnc_id(up_id)
        if hgnc_id:
            db_refs['HGNC'] = hgnc_id
            prot_name = hgnc_client.get_hgnc_name(hgnc_id)
        else:
            gene_name = uniprot_client.get_gene_name(up_id)
            if gene_name:
                prot_name = gene_name

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)

    def _make_evidence(self, line):
        """Return one Evidence per ';'-separated entry of 'Key References'."""
        ev_list = []
        key_refs = line['Key References'].split(';')
        generic_notes = {
            'is_nominal': line['Is Nominal'],
            'effective_concentration': line['Effective Concentration']
        }
        for ref in key_refs:
            # Only extracting pmids, but there is generally more info
            # available.
            m = self._PMID_PATTERN.search(ref)
            pmid = None if m is None else m.group(1)
            annotations = {'reference': ref}
            annotations.update(generic_notes)
            ev = Evidence('lincs_drug', pmid=pmid, annotations=annotations,
                          epistemics={'direct': True})
            ev_list.append(ev)
        return ev_list
from __future__ import absolute_import, print_function, unicode_literals import unittest from nose.plugins.attrib import attr from indra.databases.lincs_client import get_drug_target_data, LincsClient lc = LincsClient() @attr('webservice') @unittest.skip('LINCS web service very unreliable.') def test_get_drug_target_data(): data_list = get_drug_target_data() assert len(data_list) > 100, len(data_list) def test_get_protein_refs(): prot_refs = lc.get_protein_refs('200020') assert prot_refs.get('UP') == 'P00519' assert prot_refs.get('EGID') == '25' assert prot_refs.get('HMS-LINCS') == '200020' def test_get_sm_name(): sm_name = lc.get_small_molecule_name('10001') assert sm_name == 'Seliciclib', sm_name def test_get_sm_refs(): sm_refs = lc.get_small_molecule_refs('10001') assert sm_refs.get('HMS-LINCS') == '10001', sm_refs
class LincsProcessor(object):
    """Processor for the HMS LINCS drug target dataset.

    Parameters
    ----------
    lincs_data : list[dict]
        A list of dicts with keys set by the header of the csv, and values
        from the data in the csv.

    Attributes
    ----------
    statements : list[indra.statements.Statement]
        A list of indra statements extracted from the CSV file.
    """

    # Compiled once for the class. A raw string is used so that ``\s`` and
    # ``\d`` reach the regex engine without being invalid string escapes.
    _PMID_PATTERN = re.compile(r'(?:pmid|pubmed\s+id):\s+(\d+)',
                               re.IGNORECASE)

    def __init__(self, lincs_data):
        self._data = lincs_data
        self._lc = LincsClient()

        # Process every entry that was passed in
        self.statements = []
        for line in self._data:
            self._process_line(line)

    def _process_line(self, line):
        """Add an Inhibition statement for a line that has a usable target."""
        drug = self._extract_drug(line)
        prot = self._extract_protein(line)
        if prot is None:
            return
        evidence = self._make_evidence(line)
        self.statements.append(Inhibition(drug, prot, evidence=evidence))

    def _extract_drug(self, line):
        """Return an Agent for the small molecule described by the line."""
        drug_name = line['Small Molecule Name']
        lincs_id = line['Small Molecule HMS LINCS ID']
        refs = self._lc.get_small_molecule_refs(lincs_id)
        if 'PUBCHEM' in refs:
            chebi_id = chebi_client.get_chebi_id_from_pubchem(refs['PUBCHEM'])
            if chebi_id:
                refs['CHEBI'] = 'CHEBI:%s' % chebi_id
        return Agent(drug_name, db_refs=refs)

    def _extract_protein(self, line):
        """Return an Agent for the target protein, or None without a UP ID."""
        # Extract key information from the lines.
        prot_name = line['Protein Name']
        prot_id = line['Protein HMS LINCS ID']

        # Get available db-refs.
        db_refs = {}
        if prot_id:
            db_refs.update(self._lc.get_protein_refs(prot_id))
        # Since the resource only gives us an UP ID (not HGNC), we
        # try to get that and standardize the name to the gene name
        up_id = db_refs.get('UP')
        if not up_id:
            # In some cases lines are missing protein information in which
            # case we return None
            return None

        gene_name = uniprot_client.get_gene_name(up_id)
        if gene_name:
            prot_name = gene_name
            hgnc_id = hgnc_client.get_hgnc_id(gene_name)
            if hgnc_id:
                db_refs['HGNC'] = hgnc_id

        # Create the agent.
        return Agent(prot_name, db_refs=db_refs)

    def _make_evidence(self, line):
        """Return one Evidence per ';'-separated entry of 'Key References'."""
        ev_list = []
        key_refs = line['Key References'].split(';')
        generic_notes = {
            'is_nominal': line['Is Nominal'],
            'effective_concentration': line['Effective Concentration']
        }
        for ref in key_refs:
            # Only extracting pmids, but there is generally more info
            # available.
            m = self._PMID_PATTERN.search(ref)
            pmid = None if m is None else m.group(1)
            annotations = {'reference': ref}
            annotations.update(generic_notes)
            ev = Evidence('lincs_drug', pmid=pmid, annotations=annotations,
                          epistemics={'direct': True})
            ev_list.append(ev)
        return ev_list