Example #1
def save_sentences(twg, stmts, filename, agent_limit=300):
    """Write evidence sentences for stmts with ungrounded agents to csv file.

    Parameters
    ----------
    twg : list of tuple
        List of (agent_text, count) tuples for ungrounded agent texts,
        where the count is the number of times the text is mentioned in
        the list of statements. Should be sorted in descending order by
        count, as output by the function ungrounded_texts.

    stmts : list of :py:class:`indra.statements.Statement`

    filename : str
        Path to output file

    agent_limit : Optional[int]
        Number of agents to include in output file. Takes the top agents
        by count.
    """
    sentences = []
    unmapped_texts = [t[0] for t in twg]
    counter = 0
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    for text in unmapped_texts:
        agent_sentences = get_sentences_for_agent(text, stmts)
        sentences += map(lambda tup: (text,) + tup, agent_sentences)
        counter += 1
        if counter >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
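A minimal usage sketch of the twg input this function expects; the agent texts below are invented, and the save_sentences call is left commented out since it needs a real list of INDRA Statements:

from collections import Counter

# Hypothetical ungrounded agent texts tallied from a statement corpus.
agent_texts = ['XYZ-1', 'p38 inhibitor', 'XYZ-1', 'unknown factor']
# twg has the shape save_sentences expects: (agent_text, count) tuples,
# sorted in descending order by count.
twg = Counter(agent_texts).most_common()
# With real statements in stmts, the call would look like:
# save_sentences(twg, stmts, 'ungrounded_sentences.csv', agent_limit=100)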
Example #2
def update_pubchem_mesh_map():
    url = 'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-MeSH'
    res = requests.get(url)

    # We first get mapping pairs from the table
    mappings = []
    for line in res.text.split('\n'):
        parts = line.split('\t')
        for part in parts[1:]:
            mappings.append((parts[0], part))
    # The table has (1) rows with multiple MeSH terms separated by tabs,
    # (2) multiple rows with the same PubChem CID and (3) multiple rows
    # with the same MeSH term. We retain only one-to-one mappings here.
    pc_count = Counter([m[0] for m in mappings])
    mesh_count = Counter([m[1] for m in mappings])
    unique_mappings = [m for m in mappings if pc_count[m[0]] == 1
                       and mesh_count[m[1]] == 1]

    # The mappings table is given using MeSH term names so we need
    # to convert these to IDs. Lookups can fail for several reasons:
    # some entries are simply not valid MeSH names, others are not
    # yet included in the INDRA MeSH resources/ontology.
    unique_with_id = []
    for pcid, meshname in unique_mappings:
        mesh_ns_id_tuple = bio_ontology.get_id_from_name('MESH', meshname)
        if mesh_ns_id_tuple:
            unique_with_id.append((pcid, mesh_ns_id_tuple[1]))

    fname = os.path.join(path, 'pubchem_mesh_map.tsv')
    logger.info('Saving into %s' % fname)
    write_unicode_csv(fname, unique_with_id, delimiter='\t')
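The one-to-one filtering step above, shown in isolation on a handful of invented (CID, MeSH name) pairs:

from collections import Counter

# Toy pairs: CID '1' maps to two names, and name 'Y' maps to two CIDs.
mappings = [('1', 'X'), ('1', 'Y'), ('2', 'Y'), ('3', 'Z')]
pc_count = Counter(m[0] for m in mappings)
mesh_count = Counter(m[1] for m in mappings)
unique_mappings = [m for m in mappings
                   if pc_count[m[0]] == 1 and mesh_count[m[1]] == 1]
print(unique_mappings)  # [('3', 'Z')], the only one-to-one pair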
Example #3
def save_base_map(filename, grouped_by_text):
    """Dump a list of agents along with groundings and counts into a csv file

    Parameters
    ----------
    filename : str
        Filepath for output file
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding
    """
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, db_id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(db_id)
            else:
                name = ''
            row = [text_string, db, db_id, count, name]
            rows.append(row)

    write_unicode_csv(filename,
                      rows,
                      delimiter=',',
                      quotechar='"',
                      quoting=csv.QUOTE_MINIMAL,
                      lineterminator='\r\n')
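A sketch of the grouped_by_text structure this function iterates over; the agent text, groundings and counts are hypothetical, and the call is commented out because the UP branch requires INDRA's uniprot_client:

# Hypothetical entry: (agent_text, [(db_ns, db_id, count), ...]).
grouped_by_text = [
    ('ERK', [('FPLX', 'ERK', 12), ('UP', 'P28482', 3)]),
]
# save_base_map('erk_groundings.csv', grouped_by_text)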
Example #4
def update_hmdb_chebi_map():
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    logger.info('Downloading %s' % url)
    urlretrieve(url, fname)
    mappings = []
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                #print(elem.tag)
                if event == 'start' and \
                        elem.tag == '{%s}metabolite' % ns['hmdb']:
                    hmdb_id = None
                    chebi_id = None
                # Important: we only look at accession if there's no HMDB
                # ID yet, otherwise we pick up secondary accession tags
                elif event == 'start' and \
                        elem.tag == '{%s}accession' % ns['hmdb'] and \
                        not hmdb_id:
                    hmdb_id = elem.text
                elif event == 'start' and \
                        elem.tag == '{%s}chebi_id' % ns['hmdb']:
                    chebi_id = elem.text
                elif event == 'end' and \
                        elem.tag == '{%s}metabolite' % ns['hmdb']:
                    if hmdb_id and chebi_id:
                        mappings.append([hmdb_id, chebi_id])
                elem.clear()
    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + sorted(mappings, key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')
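A sketch of the first-accession rule noted above, run on a tiny hand-written metabolite record; unlike the function, it reads text on 'end' events, where iterparse guarantees the text is available:

import xml.etree.ElementTree as ET
from io import BytesIO

ns = 'http://www.hmdb.ca'
xml = ('<metabolite xmlns="%s">'
       '<accession>HMDB0000001</accession>'
       '<accession>HMDB00001</accession>'
       '<chebi_id>15414</chebi_id>'
       '</metabolite>' % ns).encode()
hmdb_id = None
for event, elem in ET.iterparse(BytesIO(xml), events=('end',)):
    # Keep only the first accession; later ones are secondary accessions.
    if elem.tag == '{%s}accession' % ns and not hmdb_id:
        hmdb_id = elem.text
print(hmdb_id)  # HMDB0000001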
Example #5
def update_mesh_names():
    url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/desc2018.xml'
    urlretrieve(url, 'desc2018.xml')
    # Process the XML and find descriptor records
    et = ET.parse('desc2018.xml')
    records = et.findall('DescriptorRecord')
    rows = []
    for record in records:
        # We first get the ID and the name
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        # We then need to look for additional terms related to the
        # preferred concept to get additional names
        concepts = record.findall('ConceptList/Concept')
        all_term_names = []
        for concept in concepts:
            # We only look at the preferred concept here
            if concept.attrib['PreferredConceptYN'] == 'Y':
                terms = concept.findall('TermList/Term')
                for term in terms:
                    term_name = term.find('String').text
                    if term_name != name:
                        all_term_names.append(term_name)
        # Append a list of term names separated by pipes to the table
        term_name_str = '|'.join(all_term_names)
        rows.append((uid, name, term_name_str))
    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
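For illustration, the preferred-concept term extraction applied to a single trimmed, hand-written DescriptorRecord standing in for a desc2018.xml entry:

import xml.etree.ElementTree as ET

xml = """
<DescriptorRecordSet>
  <DescriptorRecord>
    <DescriptorUI>D015179</DescriptorUI>
    <DescriptorName><String>Colorectal Neoplasms</String></DescriptorName>
    <ConceptList>
      <Concept PreferredConceptYN="Y">
        <TermList>
          <Term><String>Colorectal Neoplasms</String></Term>
          <Term><String>Colorectal Tumors</String></Term>
        </TermList>
      </Concept>
    </ConceptList>
  </DescriptorRecord>
</DescriptorRecordSet>
"""
record = ET.fromstring(xml).find('DescriptorRecord')
uid = record.find('DescriptorUI').text
name = record.find('DescriptorName/String').text
terms = record.findall(
    'ConceptList/Concept[@PreferredConceptYN="Y"]/TermList/Term/String')
synonyms = [t.text for t in terms if t.text != name]
print(uid, name, '|'.join(synonyms))  # D015179 Colorectal Neoplasms Colorectal Tumors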
Example #6
def update_mesh_supplementary_names():
    """Update MeSH ID to name mappings for supplementary terms."""
    supp_url = ('ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/'
                'xmlmesh/supp2021.gz')
    supp_path = os.path.join(path, 'mesh_supp2021.gz')
    if not os.path.exists(supp_path):
        logging.info('Download MeSH supplement from %s', supp_url)
        urlretrieve(supp_url, supp_path)
        logging.info('Done downloading MeSH supplement')
    with gzip.open(supp_path) as supp_file:
        logging.info('Parsing MeSH supplement')
        supp_et = ET.parse(supp_file)
    supp_rows = []
    for record in supp_et.iterfind('SupplementalRecord'):
        uid = record.find('SupplementalRecordUI').text
        name = record.find('SupplementalRecordName/String').text
        mapped_to_terms = record.findall('HeadingMappedToList/HeadingMappedTo/'
                                         'DescriptorReferredTo/DescriptorUI')
        mapped_to = ','.join([term.text.replace('*', '')
                              for term in mapped_to_terms])
        term_name_str = _get_term_name_str(record, name)
        supp_rows.append((uid, name, term_name_str, mapped_to))

    fname = os.path.join(path, 'mesh_supp_id_label_mappings.tsv')
    write_unicode_csv(fname, supp_rows, delimiter='\t')
Example #7
def update_mesh_names():
    """Update Mesh ID to name and tree number mappings."""
    url = ('ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/'
           'xmlmesh/desc2021.gz')
    desc_path = os.path.join(path, 'mesh_desc2021.gz')
    if not os.path.exists(desc_path):
        logging.info('Download MeSH descriptors from %s', url)
        urlretrieve(url, desc_path)
        logging.info('Done downloading MeSH descriptors')
    # Process the XML and find descriptor records
    with gzip.open(desc_path) as desc_file:
        logging.info('Parsing MeSH descriptors')
        et = ET.parse(desc_file)
    rows = []
    for record in et.iterfind('DescriptorRecord'):
        # We first get the ID and the name
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        term_name_str = _get_term_name_str(record, name)
        tree_numbers = record.findall('TreeNumberList/TreeNumber')
        tree_numbers_str = '|'.join([t.text for t in tree_numbers])
        rows.append((uid, name, term_name_str, tree_numbers_str))

    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
Example #8
def update_mesh_names():
    url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/desc2018.xml'
    urlretrieve(url, 'desc2018.xml')
    # Process the XML and find descriptor records
    et = ET.parse('desc2018.xml')
    records = et.findall('DescriptorRecord')
    rows = []
    for record in records:
        # We first get the ID and the name
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        # We then need to look for additional terms related to the
        # preferred concept to get additional names
        concepts = record.findall('ConceptList/Concept')
        all_term_names = []
        for concept in concepts:
            # We only look at the preferred concept here
            if concept.attrib['PreferredConceptYN'] == 'Y':
                terms = concept.findall('TermList/Term')
                for term in terms:
                    term_name = term.find('String').text
                    if term_name != name:
                        all_term_names.append(term_name)
        # Append a list of term names separated by pipes to the table
        term_name_str = '|'.join(all_term_names)
        rows.append((uid, name, term_name_str))
    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
Example #9
def update_drugbank_mappings():
    """Update mappings from DrugBank to CHEBI/CHEMBL"""
    # Note that for this to work, PyOBO (https://github.com/pyobo/pyobo) has
    # to be installed and the DrugBank download
    # (https://www.drugbank.ca/releases/latest) put into ~/.obo/drugbank/
    # Note that the DrugBank download requires signing up for an account and
    # waiting for approval.
    import pyobo
    drugbank_chembl = pyobo.get_filtered_xrefs('drugbank', 'chembl.compound')
    drugbank_chebi = pyobo.get_filtered_xrefs('drugbank', 'chebi')
    chebi_drugbank = pyobo.get_filtered_xrefs('chebi', 'drugbank')
    drugbank_names = pyobo.get_id_name_mapping('drugbank')
    rows = []
    for drugbank_id, chembl_id in drugbank_chembl.items():
        rows.append([drugbank_id, 'CHEMBL', chembl_id, 'drugbank'])
    for drugbank_id, chebi_id in drugbank_chebi.items():
        rows.append([drugbank_id, 'CHEBI', chebi_id, 'drugbank'])
    for chebi_id, drugbank_id in chebi_drugbank.items():
        rows.append([drugbank_id, 'CHEBI', chebi_id, 'chebi'])
    for drugbank_id, name in drugbank_names.items():
        rows.append([drugbank_id, 'NAME', name, 'drugbank'])
    fname = os.path.join(path, 'drugbank_mappings.tsv')
    header = ['DRUGBANK_ID', 'NAMESPACE', 'ID', 'SOURCE']
    rows = [header] + sorted(rows)
    write_unicode_csv(fname, rows, delimiter='\t')
Example #10
def map_statements(stmts, source, outfile=None):
    """Tabulate valid, invalid, and mapped sites from a set of Statements."""
    # Look for errors in database statements
    sm = SiteMapper(default_site_map)
    valid_stmts, mapped_stmts = sm.map_sites(stmts)
    # Collect stats from SiteMapper itself
    sites = []
    for site_key, mapping in sm._cache.items():
        gene, res, pos = site_key
        freq = sm._sitecount[site_key]
        if mapping == 'VALID':
            valid, mapped, mapped_res, mapped_pos, explanation = \
                                                      (1, 0, None, None, None)
        else:
            valid = 0
            # Not mapped
            if mapping is None:
                mapped, mapped_res, mapped_pos, explanation = \
                                                    (0, None, None, None)
            # Mapped!
            else:
                mapped_res, mapped_pos, explanation = mapping
                mapped = 1 if mapped_pos else 0
        si = SiteInfo(gene, res, pos, valid, mapped, mapped_res, mapped_pos,
                      explanation, freq, source)
        sites.append(si)
    # Write to CSV file
    if outfile:
        header = [[field.upper() for field in si._asdict().keys()]]
        rows = header + replace_nones(sites)
        write_unicode_csv(outfile, rows)
    return sites
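SiteInfo is only used here through its fields and _asdict(); the stand-in namedtuple below (field names and example values are assumptions inferred from the constructor calls) shows how the header row is derived:

from collections import namedtuple

# Assumed field set, matching the positional arguments used above.
SiteInfo = namedtuple('SiteInfo', ['gene', 'res', 'pos', 'valid', 'mapped',
                                   'mapped_res', 'mapped_pos', 'explanation',
                                   'freq', 'source'])
si = SiteInfo('MAPK1', 'T', '183', 0, 1, 'T', '185',
              'site found in curated site map', 5, 'reading')
header = [[field.upper() for field in si._asdict().keys()]]
print(header[0][:3])  # ['GENE', 'RES', 'POS']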
Example #11
def update_lspci():
    # We first create a dict of LSPCIs and their members but only for ones
    # that actually have TAS statements corresponding to them
    from indra.sources import tas
    tp = tas.process_from_web(affinity_class_limit=10)
    lspci_members = defaultdict(set)
    for stmt in tp.statements:
        if 'LSPCI' not in stmt.subj.db_refs:
            continue
        for k, v in stmt.subj.db_refs.items():
            if k in {'TEXT', 'LSPCI'}:
                continue
            lspci_members[stmt.subj.db_refs.get('LSPCI')].add((k, v))

    # We then process the names table in a way that we always prioritize the
    # first row for each LSPCI since the table is pre-sorted by priority
    df = pandas.read_csv('lsp_compound_names.csv', dtype={'lspci_id': str})
    lspcid_names = {}
    for _, row in df.iterrows():
        if row['lspci_id'] not in lspcid_names:
            lspcid_names[row['lspci_id']] = row['name']

    # We can now combine the two sources filtering to only entries that have
    # names
    rows = [['lspcid', 'name', 'members']]
    for lspcid, members in lspci_members.items():
        if lspcid not in lspcid_names:
            continue
        row = [lspcid, lspcid_names[lspcid],
               '|'.join(sorted(['%s:%s' % member for member in members]))]
        rows.append(row)
    write_unicode_csv(get_resource_path('lspci.tsv'), rows, delimiter='\t')
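The first-row-wins prioritization described above, demonstrated on a toy names table with placeholder values; pandas drop_duplicates with keep='first' gives the same result as the dict loop:

import pandas

df = pandas.DataFrame({'lspci_id': ['1', '1', '2'],
                       'name': ['name A (top priority)', 'name B', 'name C']})
# Equivalent to the loop above: keep only the first row per lspci_id.
lspcid_names = dict(df.drop_duplicates('lspci_id', keep='first')
                    .set_index('lspci_id')['name'])
print(lspcid_names)  # {'1': 'name A (top priority)', '2': 'name C'}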
Example #12
def get_stmt_sif(stmts, fname):
    rows = []
    for stmt in stmts:
        agent_names = [a.name for a in stmt.agent_list() if a is not None]
        if len(agent_names) != 2:
            continue
        rows.append((agent_names[0], stmt.uuid, agent_names[1]))
    write_unicode_csv(fname, rows)
Example #13
def update_famplex_map():
    logger.info('--Updating FamPlex map----')
    # Currently this is a trivial "copy" of the FamPlex equivalences.csv
    # file. Later, name spaces may need to be adapted and other format changes
    # may be needed.
    fname_in = os.path.join(path, 'famplex/equivalences.csv')
    fname_out = os.path.join(path, 'famplex_map.tsv')
    rows = read_unicode_csv(fname_in)
    write_unicode_csv(fname_out, rows, delimiter='\t')
Example #14
def print_stmts(stmts, file_name):
    rows = []
    for s in stmts:
        agents = s.agent_list()
        db_refs = [('%s(%s)' % (a.name, a.db_refs)) 
                    for a in agents if a is not None]
        db_refs_str = (', '.join(db_refs))
        rows.append([str(s), db_refs_str, s.evidence[0].text])
    write_unicode_csv(file_name, rows, delimiter='\t')
Example #15
def update_chebi_entries():
    logger.info('--Updating ChEBI entries----')
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
        'Flat_file_tab_delimited/reference.tsv.gz'
    fname = os.path.join(path, 'reference.tsv.gz')
    urlretrieve(url, fname)
    with gzip.open(fname, 'rb') as fh:
        logger.info('Loading %s' % fname)
        df = pandas.read_csv(fh,
                             sep='\t',
                             index_col=None,
                             parse_dates=True,
                             encoding='latin-1')
    # Save PubChem mapping
    fname = os.path.join(path, 'chebi_to_pubchem.tsv')
    logger.info('Saving into %s' % fname)
    df_pubchem = df[df['REFERENCE_DB_NAME'] == 'PubChem']
    df_pubchem.sort_values(['COMPOUND_ID', 'REFERENCE_ID'],
                           ascending=True,
                           inplace=True)
    df_pubchem.to_csv(fname,
                      sep='\t',
                      columns=['COMPOUND_ID', 'REFERENCE_ID'],
                      header=['CHEBI', 'PUBCHEM'],
                      index=False)

    # Process PubChem mapping to eliminate SID rows and strip CID: prefix
    # If the second column of the row starts with SID:, ignore the row
    # If the second column of the row starts with CID:, strip out the CID prefix
    # Otherwise, include the row unchanged
    original_rows = read_unicode_csv(fname, '\t')
    new_rows = []
    for original_row in original_rows:
        if original_row[1].startswith('CID:'):
            new_row = original_row
            new_row[1] = new_row[1][5:]  # Strip out CID:
            new_rows.append(new_row)
        elif original_row[1].startswith('SID:'):
            # Skip SID rows
            continue
        else:
            # Include other rows unchanged
            new_rows.append(original_row)
    write_unicode_csv(fname, new_rows, '\t')

    # Save ChEMBL mapping
    fname = os.path.join(path, 'chebi_to_chembl.tsv')
    logger.info('Saving into %s' % fname)
    df_chembl = df[df['REFERENCE_DB_NAME'] == 'ChEMBL']
    df_chembl.sort_values(['COMPOUND_ID', 'REFERENCE_ID'],
                          ascending=True,
                          inplace=True)
    df_chembl.to_csv(fname,
                     sep='\t',
                     columns=['COMPOUND_ID', 'REFERENCE_ID'],
                     header=['CHEBI', 'CHEMBL'],
                     index=False)
Example #16
def print_stmts(stmts, file_name):
    rows = []
    for s in stmts:
        agents = s.agent_list()
        db_refs = [('%s(%s)' % (a.name, a.db_refs)) for a in agents
                   if a is not None]
        db_refs_str = (', '.join(db_refs))
        rows.append([str(s), db_refs_str, s.evidence[0].text])
    write_unicode_csv(file_name, rows, delimiter='\t')
Example #17
def update_chebi_entries():
    term_entries = _get_chebi_obo_terms()
    # Make the name and secondary table
    fname = os.path.join(path, 'chebi_entries.tsv')
    rows = [['CHEBI_ID', 'NAME', 'SECONDARIES']]
    for term_id, name, secondaries, parents in term_entries:
        rows.append([term_id, name, ','.join(secondaries)])
    write_unicode_csv(fname, rows, '\t')
Example #18
def update_bioentities_map():
    logger.info('--Updating Bioentities map----')
    # Currently this is a trivial "copy" of the Bioentities equivalences.csv
    # file. Later, name spaces may need to be adapted and other format changes
    # may be needed.
    fname_in = os.path.join(path, '../../bioentities/equivalences.csv')
    fname_out = os.path.join(path, 'bioentities_map.tsv')
    rows = read_unicode_csv(fname_in)
    write_unicode_csv(fname_out, rows, delimiter='\t')
Example #19
    def make_model(self, output_file, add_curation_cols=False, up_only=False):
        """Export the statements into a tab-separated text file.

        Parameters
        ----------
        output_file : str
            Name of the output file.
        add_curation_cols : bool
            Whether to add columns to facilitate statement curation. Default
            is False (no additional columns).
        up_only : bool
            Whether to include identifiers.org links *only* for the Uniprot
            grounding of an agent when one is available. Because most
            spreadsheets allow only a single hyperlink per cell, this makes
            it easier to link to Uniprot information pages for curation
            purposes. Default is False.
        """
        stmt_header = [
            'INDEX', 'UUID', 'TYPE', 'STR', 'AG_A_TEXT', 'AG_A_LINKS',
            'AG_A_STR', 'AG_B_TEXT', 'AG_B_LINKS', 'AG_B_STR', 'PMID', 'TEXT',
            'IS_HYP', 'IS_DIRECT'
        ]
        if add_curation_cols:
            stmt_header = stmt_header + \
                          ['AG_A_IDS_CORRECT', 'AG_A_STATE_CORRECT',
                           'AG_B_IDS_CORRECT', 'AG_B_STATE_CORRECT',
                           'EVENT_CORRECT',
                           'RES_CORRECT', 'POS_CORRECT', 'SUBJ_ACT_CORRECT',
                           'OBJ_ACT_CORRECT', 'HYP_CORRECT', 'DIRECT_CORRECT']
        rows = [stmt_header]

        for ix, stmt in enumerate(self.statements):
            # Complexes
            if len(stmt.agent_list()) > 2:
                logger.info(
                    "Skipping statement with more than two members: %s" % stmt)
                continue
            # Self-modifications, ActiveForms
            elif len(stmt.agent_list()) == 1:
                ag_a = stmt.agent_list()[0]
                ag_b = None
            # All others
            else:
                (ag_a, ag_b) = stmt.agent_list()
            # Put together the data row
            row = [ix+1, stmt.uuid, stmt.__class__.__name__, str(stmt)] + \
                  _format_agent_entries(ag_a, up_only) + \
                  _format_agent_entries(ag_b, up_only) + \
                  [stmt.evidence[0].pmid, stmt.evidence[0].text,
                   stmt.evidence[0].epistemics.get('hypothesis', ''),
                   stmt.evidence[0].epistemics.get('direct', '')]
            if add_curation_cols:
                row = row + ([''] * 11)
            rows.append(row)
        # Write to file
        write_unicode_csv(output_file, rows, delimiter='\t')
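A hedged usage sketch: it assumes this method lives on an assembler class constructed with a list of INDRA Statements (the TsvAssembler name in the commented call is an assumption) and builds one toy statement to show the expected inputs:

from indra.statements import Agent, Evidence, Phosphorylation

stmt = Phosphorylation(
    Agent('MAP2K1', db_refs={'HGNC': '6840', 'TEXT': 'MEK1'}),
    Agent('MAPK1', db_refs={'HGNC': '6871', 'TEXT': 'ERK2'}),
    evidence=[Evidence(pmid='12345', text='MEK1 phosphorylates ERK2.')])
# assembler = TsvAssembler([stmt])      # hypothetical constructor
# assembler.make_model('statements.tsv', add_curation_cols=True)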
Example #20
def update_grounding_map():
    famplex_gmap = os.path.join(path, 'famplex', 'grounding_map.csv')
    covid_gmap = os.path.join(path, 'grounding', 'covid_grounding.csv')
    famplex_rows = list(read_unicode_csv(famplex_gmap))
    row_len = len(famplex_rows[0])
    covid_rows = list(read_unicode_csv(covid_gmap))
    covid_rows = [r + [''] * (row_len - len(r)) for r in covid_rows]
    all_rows = famplex_rows + covid_rows
    grounding_map = os.path.join(path, 'grounding', 'grounding_map.csv')
    write_unicode_csv(grounding_map, all_rows)
Example #21
    def make_model(self, output_file, add_curation_cols=False, up_only=False):
        """Export the statements into a tab-separated text file.

        Parameters
        ----------
        output_file : str
            Name of the output file.
        add_curation_cols : bool
            Whether to add columns to facilitate statement curation. Default
            is False (no additional columns).
        up_only : bool
            Whether to include identifiers.org links *only* for the Uniprot
            grounding of an agent when one is available. Because most
            spreadsheets allow only a single hyperlink per cell, this makes
            it easier to link to Uniprot information pages for curation
            purposes. Default is False.
        """
        stmt_header = ['INDEX', 'UUID', 'TYPE', 'STR',
                       'AG_A_TEXT', 'AG_A_LINKS', 'AG_A_STR',
                       'AG_B_TEXT', 'AG_B_LINKS', 'AG_B_STR',
                       'PMID', 'TEXT', 'IS_HYP', 'IS_DIRECT']
        if add_curation_cols:
            stmt_header = stmt_header + \
                          ['AG_A_IDS_CORRECT', 'AG_A_STATE_CORRECT',
                           'AG_B_IDS_CORRECT', 'AG_B_STATE_CORRECT',
                           'EVENT_CORRECT',
                           'RES_CORRECT', 'POS_CORRECT', 'SUBJ_ACT_CORRECT',
                           'OBJ_ACT_CORRECT', 'HYP_CORRECT', 'DIRECT_CORRECT']
        rows = [stmt_header]

        for ix, stmt in enumerate(self.statements):
            # Complexes
            if len(stmt.agent_list()) > 2:
                logger.info("Skipping statement with more than two members: %s"
                            % stmt)
                continue
            # Self-modifications, ActiveForms
            elif len(stmt.agent_list()) == 1:
                ag_a = stmt.agent_list()[0]
                ag_b = None
            # All others
            else:
                (ag_a, ag_b) = stmt.agent_list()
            # Put together the data row
            row = [ix+1, stmt.uuid, stmt.__class__.__name__, str(stmt)] + \
                  _format_agent_entries(ag_a, up_only) + \
                  _format_agent_entries(ag_b, up_only) + \
                  [stmt.evidence[0].pmid, stmt.evidence[0].text,
                   stmt.evidence[0].epistemics.get('hypothesis', ''),
                   stmt.evidence[0].epistemics.get('direct', '')]
            if add_curation_cols:
                row = row + ([''] * 11)
            rows.append(row)
        # Write to file
        write_unicode_csv(output_file, rows, delimiter='\t')
Example #22
def save_indra_db_stmts(stmts):
    csv_rows = [('KINASE', 'KINASE_TEXT', 'SUBSTRATE', 'SUBSTRATE_TEXT',
                 'RESIDUE', 'POSITION', 'SOURCE', 'DIRECT', 'PMID', 'SENTENCE')
                ]
    for s in stmts:
        for e in s.evidence:
            is_direct = 'True' if e.epistemics.get('direct') else 'False'
            csv_rows.append((s.enz.name, s.enz.db_refs.get('TEXT'), s.sub.name,
                             s.sub.db_refs.get('TEXT'), s.residue, s.position,
                             e.source_api, is_direct, e.pmid, e.text))
    write_unicode_csv('indra_phosphosites.csv', csv_rows)
Example #23
def dump_table(text_grounding_cnt, ev_text_for_agent_text, fname):
    # Dump the results into a TSV file
    rows = [[
        'text', 'grounding', 'standard_name', 'url', 'gilda_grounding',
        'count', 'pmid', 'ev_text'
    ]]
    for data, count in text_grounding_cnt.most_common():
        pmid, ev_text = ev_text_for_agent_text[data[0]]
        row = list(data) + [str(count), pmid, ev_text]
        rows.append(row)
    write_unicode_csv(fname, rows, delimiter='\t')
Example #24
def save_base_map(filename, grouped_by_text):
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(id)
            else:
                name = ''
            row = [text_string, db, id, count, name]
            rows.append(row)

    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Example #25
def save_base_map(filename, grouped_by_text):
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(id)
            else:
                name = ''
            row = [text_string, db, id, count, name]
            rows.append(row)

    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Example #26
def save_sentences(twg, stmts, filename, agent_limit=300):
    sentences = []
    unmapped_texts = [t[0] for t in twg]
    counter = 0
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    for text in unmapped_texts:
        agent_sentences = get_sentences_for_agent(text, stmts)
        sentences += map(lambda tup: (text,) + tup, agent_sentences)
        counter += 1
        if counter >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Example #27
def save_sentences(twg, stmts, filename, agent_limit=300):
    sentences = []
    unmapped_texts = [t[0] for t in twg]
    counter = 0
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    for text in unmapped_texts:
        agent_sentences = get_sentences_for_agent(text, stmts)
        sentences += map(lambda tup: (text,) + tup, agent_sentences)
        counter += 1
        if counter >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Example #28
def update_selventa_entries():
    fname = os.path.join(path, 'selventa_entries.tsv')

    xref_mappings = {
        'CHEBI': 'CHEBI',
        'MESHC': 'MESH',
        'MESHD': 'MESH',
        'MESHPP': 'MESH',
        'GOBP': 'GO',
        'GOCC': 'GO',
        'DO': 'DOID',
        }

    def process_selventa_xref(xref):
        if pandas.isna(xref):
            return ''
        db_refs = {}
        for xref_part in xref.split('|'):
            prefix, db_id = xref_part.split(':', maxsplit=1)
            ns = xref_mappings.get(prefix)
            if not ns:
                logger.info('Unknown namespace: %s' % prefix)
                continue
            db_id = ensure_prefix_if_needed(ns, db_id)
            db_refs[ns] = db_id
        assert_valid_db_refs(db_refs)
        db_refs_str = '|'.join(['%s:%s' % (k, v)
                                for k, v in sorted(db_refs.items())])
        return db_refs_str

    resources = {
        'SCHEM': 'selventa-legacy-chemical-names',
        'SDIS': 'selventa-legacy-diseases',
        'SCOMP': 'selventa-named-complexes',
        'SFAM': 'selventa-protein-families'
    }
    base_url = ('https://raw.githubusercontent.com/OpenBEL/resource-generator'
                '/master/datasets/')
    rows = []
    for ns, resource in resources.items():
        url = base_url + resource + '.txt'
        df = pandas.read_csv(url, sep='\t', comment='#')
        for _, df_row in df.iterrows():
            row = [ns, df_row['ID'], df_row['LABEL'],
                   process_selventa_xref(df_row['XREF'])]
            rows.append(row)
    write_unicode_csv(fname, sorted(rows))
Example #29
def update_chebi_entries():
    logger.info('--Updating ChEBI entries----')
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
        'Flat_file_tab_delimited/reference.tsv.gz'
    fname = os.path.join(path, 'reference.tsv.gz')
    urlretrieve(url, fname)
    with gzip.open(fname, 'rb') as fh:
        logger.info('Loading %s' % fname)
        df = pandas.read_csv(fh, sep='\t', index_col=None,
                             parse_dates=True, encoding='latin-1')
    # Save PubChem mapping
    fname = os.path.join(path, 'chebi_to_pubchem.tsv')
    logger.info('Saving into %s' % fname)
    df_pubchem = df[df['REFERENCE_DB_NAME']=='PubChem']
    df_pubchem.sort_values(['COMPOUND_ID', 'REFERENCE_ID'], ascending=True,
                           inplace=True)
    df_pubchem.to_csv(fname, sep='\t', columns=['COMPOUND_ID', 'REFERENCE_ID'],
                      header=['CHEBI', 'PUBCHEM'], index=False)

    # Process PubChem mapping to eliminate SID rows and strip CID: prefix
    # If the second column of the row starts with SID:, ignore the row
    # If the second column of the row starts with CID:, strip out the CID prefix
    # Otherwise, include the row unchanged
    original_rows = read_unicode_csv(fname, '\t')
    new_rows = []
    for original_row in original_rows:
        if original_row[1].startswith('CID:'):
            new_row = original_row
            new_row[1] = new_row[1][5:] # Strip out CID:
            new_rows.append(new_row)
        elif original_row[1].startswith('SID:'):
            # Skip SID rows
            continue
        else:
            # Include other rows unchanged
            new_rows.append(original_row)
    write_unicode_csv(fname, new_rows, '\t')

    # Save ChEMBL mapping
    fname = os.path.join(path, 'chebi_to_chembl.tsv')
    logger.info('Saving into %s' % fname)
    df_chembl = df[df['REFERENCE_DB_NAME']=='ChEMBL']
    df_chembl.sort_values(['COMPOUND_ID', 'REFERENCE_ID'], ascending=True,
                          inplace=True)
    df_chembl.to_csv(fname, sep='\t', columns=['COMPOUND_ID', 'REFERENCE_ID'],
                      header=['CHEBI', 'CHEMBL'], index=False)
Example #30
def map_agents(mod_agents_file, sm, source, save_csv=True):
    """Tabulate valid, invalid, and mapped sites from a set of Agents."""
    # Load the agents
    with open(mod_agents_file, 'rb') as f:
        mod_agents = pickle.load(f)
    print("Mapping %s" % mod_agents_file)
    sites = []
    for ag_ix, ag in enumerate(mod_agents):
        #if ag_ix % 1000 == 0:
        #    print('%d of %d' % (ag_ix, len(mod_agents)))
        invalid_sites = sm._check_agent_mod(ag, ag.mods, True, True, True)
        # Valid
        if not invalid_sites:
            valid, mapped, mapped_res, mapped_pos, explanation = \
                                                      (1, 0, None, None, None)
        else:
            assert len(invalid_sites) == 1
            mapping = invalid_sites[0][1]
            valid = 0
            # Not mapped
            if mapping is None:
                mapped, mapped_res, mapped_pos, explanation = \
                                                    (0, None, None, None)
            # Mapped!
            else:
                mapped_res, mapped_pos, explanation = mapping
                mapped = 1 if mapped_pos else 0
        si = SiteInfo(ag.name, ag.mods[0].residue, ag.mods[0].position, valid,
                      mapped, mapped_res, mapped_pos, explanation, None,
                      source)
        sites.append(si)
    # Now that we've collected a list of all the sites, tabulate frequencies
    site_counter = Counter(sites)
    sites_with_freq = []
    for site, site_freq in site_counter.items():
        site_args = site._asdict()
        site_args.update({'freq': site_freq})
        si = SiteInfo(**site_args)
        sites_with_freq.append(si)
    # Write to CSV file
    if save_csv:
        header = [[field.upper() for field in si._asdict().keys()]]
        rows = header + replace_nones(sites_with_freq)
        write_unicode_csv(mod_agents_file.split('.')[0] + '.csv', rows)
    return sites_with_freq
Example #31
def update_grounding_map():
    famplex_gmap = os.path.join(path, 'famplex', 'grounding_map.csv')
    famplex_rows = list(read_unicode_csv(famplex_gmap))
    row_len = len(famplex_rows[0])
    extra_rows = []
    # read in json file containing filenames for non-famplex grounding maps
    with open(os.path.join(path, 'grounding', 'extra_gmap_files.json')) as f:
        extra_gm_files = json.load(f)
    # Add non-famplex grounding map rows, adding blank values to synchronize
    # the number of columns with the number in the famplex grounding map
    for gm_filename in extra_gm_files:
        gmap = os.path.join(path, 'grounding', gm_filename)
        new_rows = list(read_unicode_csv(gmap))
        new_rows = [r + [''] * (row_len - len(r)) for r in new_rows]
        extra_rows.extend(new_rows)
    all_rows = famplex_rows + extra_rows
    grounding_map = os.path.join(path, 'grounding', 'grounding_map.csv')
    write_unicode_csv(grounding_map, all_rows)
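The row-padding step is easiest to see on placeholder rows: extra grounding-map rows with fewer columns are padded with empty strings to match the FamPlex row width.

famplex_rows = [['text A', 'NS1', 'ID1', 'NS2', 'ID2']]
row_len = len(famplex_rows[0])  # 5 columns
extra_rows = [['text B', 'NS1', 'ID3']]
extra_rows = [r + [''] * (row_len - len(r)) for r in extra_rows]
print(extra_rows)  # [['text B', 'NS1', 'ID3', '', '']]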
Example #32
def update_mesh_supplementary_names():
    supp_url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/supp2018.gz'
    supp_path = os.path.join(path, 'mesh_supp2018.gz')
    if not os.path.exists(supp_path):
        logging.info('Download MeSH supplement from %s', supp_url)
        urlretrieve(supp_url, supp_path)
        logging.info('Done downloading MeSH supplement')
    with gzip.open(supp_path) as supp_file:
        logging.info('Parsing MeSH supplement')
        supp_et = ET.parse(supp_file)
    supp_rows = []
    for record in supp_et.iterfind('SupplementalRecord'):
        uid = record.find('SupplementalRecordUI').text
        name = record.find('SupplementalRecordName/String').text
        term_name_str = _get_term_name_str(record, name)
        supp_rows.append((uid, name, term_name_str))

    fname = os.path.join(path, 'mesh_supp_id_label_mappings.tsv')
    write_unicode_csv(fname, supp_rows, delimiter='\t')
Example #33
def update_uniprot_subcell_loc():
    logger.info('--Updating UniProt subcellular location--')
    url = 'https://www.uniprot.org/docs/subcell.txt'
    res = requests.get(url)
    res.raise_for_status()
    header, entry_block = res.text.split('_' * 75)
    entries = entry_block.split('//')
    mappings = []
    for entry in entries:
        slid = None
        goid = None
        lines = entry.split('\n')
        for line in lines:
            if line.startswith('AC'):
                slid = line[5:].strip()
            if line.startswith('GO'):
                goid = line[5:].split(';')[0]
        if slid and goid:
            mappings.append((slid, goid))
    fname = os.path.join(path, 'uniprot_subcell_loc.tsv')
    write_unicode_csv(fname, mappings, delimiter='\t')
Example #34
def update_mesh_names():
    url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/desc2018.gz'
    desc_path = os.path.join(path, 'mesh_desc2018.gz')
    if not os.path.exists(desc_path):
        logging.info('Download MeSH descriptors from %s', url)
        urlretrieve(url, desc_path)
        logging.info('Done downloading MeSH descriptors')
    # Process the XML and find descriptor records
    with gzip.open(desc_path) as desc_file:
        logging.info('Parsing MeSH descriptors')
        et = ET.parse(desc_file)
    rows = []
    for record in et.iterfind('DescriptorRecord'):
        # We first get the ID and the name
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        term_name_str = _get_term_name_str(record, name)
        rows.append((uid, name, term_name_str))

    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
Example #35
def update_uniprot_subcell_loc():
    logger.info('--Updating UniProt subcellular location--')
    url = 'https://www.uniprot.org/docs/subcell.txt'
    res = requests.get(url)
    res.raise_for_status()
    header, entry_block = res.text.split('_' * 75)
    entries = entry_block.split('//')
    mappings = []
    for entry in entries:
        slid = None
        goid = None
        lines = entry.split('\n')
        for line in lines:
            if line.startswith('AC'):
                slid = line[5:].strip()
            if line.startswith('GO'):
                goid = line[5:].split(';')[0]
        if slid and goid:
            mappings.append((slid, goid))
    fname = os.path.join(path, 'uniprot_subcell_loc.tsv')
    write_unicode_csv(fname, mappings, delimiter='\t')
Example #36
def update_secondary_mappings(g):
    """Compile all secondary ID->primary ID mappings and save to a TSV file.

    Parameters
    ----------
    g : rdflib.Graph
        RDF graph containing GO data.
    """
    query = _prefixes + """
        SELECT ?id ?secid
        WHERE {
            ?class oboInOwl:id ?id .
            ?class oboInOwl:hasAlternativeId ?secid
        }
    """
    logger.info("Querying for GO secondary ID mappings")
    res = g.query(query)
    mappings = []
    for id_lit, sec_id_lit in sorted(res, key=lambda x: x[0]):
        mappings.append((sec_id_lit.value, id_lit.value))
    # Write to file
    write_unicode_csv(secondary_mappings_file, mappings, delimiter='\t')
Example #37
def update_id_mappings(g):
    """Compile all ID->label mappings and save to a TSV file.

    Parameters
    ----------
    g : rdflib.Graph
        RDF graph containing GO data.
    """
    query = _prefixes + """
        SELECT ?id ?label
        WHERE {
            ?class oboInOwl:id ?id .
            ?class rdfs:label ?label
        }
    """
    logger.info("Querying for GO ID mappings")
    res = g.query(query)
    mappings = []
    for id_lit, label_lit in sorted(res, key=lambda x: x[0]):
        mappings.append((id_lit.value, label_lit.value))
    # Write to file
    write_unicode_csv(go_mappings_file, mappings, delimiter='\t')
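The same query pattern on a two-triple toy graph standing in for the parsed GO OWL file (assumes rdflib is installed):

import rdflib

g = rdflib.Graph()
ttl = """
@prefix oboInOwl: <http://www.geneontology.org/formats/oboInOwl#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
<http://purl.obolibrary.org/obo/GO_0008150>
    oboInOwl:id "GO:0008150" ;
    rdfs:label "biological_process" .
"""
g.parse(data=ttl, format='turtle')
query = """
    PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT ?id ?label
    WHERE { ?class oboInOwl:id ?id . ?class rdfs:label ?label }
"""
for id_lit, label_lit in g.query(query):
    print(id_lit.value, label_lit.value)  # GO:0008150 biological_process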
Example #38
def create_site_csv():
    all_sites = []
    # Load Biopax sites
    with open(BIOPAX_SITES_BY_DB, 'rb') as f:
        pc_sites = pickle.load(f)
    for db, sites in pc_sites.items():
        for ms, freq in sites:
            all_sites.append(ms_to_si(db, freq, ms))
    # Load BEL sites
    with open(BEL_SITES, 'rb') as f:
        bel_sites = pickle.load(f)
    for ms, freq in bel_sites:
        all_sites.append(ms_to_si('bel', freq, ms))
    # Load sites from reading
    with open(READER_SITES, 'rb') as f:
        sites_by_reader = pickle.load(f)
    for reader, sites in sites_by_reader.items():
        for ms, freq in sites.items():
            all_sites.append(ms_to_si(reader, freq, ms))
    header = [[field.upper() for field in all_sites[0]._asdict().keys()]]
    rows = header + replace_nones(all_sites)
    write_unicode_csv(ALL_SITES_CSV, rows)
Example #39
def save_sentences(twg, stmts, filename, agent_limit=300):
    """Write evidence sentences for stmts with ungrounded agents to csv file.

    Parameters
    ----------
    twg : list of tuple
        List of (agent_text, count) tuples for ungrounded agent texts,
        where the count is the number of times the text is mentioned in
        the list of statements. Should be sorted in descending order by
        count, as output by the function ungrounded_texts.

    stmts : list of :py:class:`indra.statements.Statement`

    filename : str
        Path to output file

    agent_limit : Optional[int]
        Number of agents to include in output file. Takes the top agents
        by count.
    """
    sentences = []
    unmapped_texts = [t[0] for t in twg]
    counter = 0
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    for text in unmapped_texts:
        agent_sentences = get_sentences_for_agent(text, stmts)
        sentences += map(lambda tup: (text, ) + tup, agent_sentences)
        counter += 1
        if counter >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename,
                      sentences,
                      delimiter=',',
                      quotechar='"',
                      quoting=csv.QUOTE_MINIMAL,
                      lineterminator='\r\n')
Example #40
def save_base_map(filename, grouped_by_text):
    """Dump a list of agents along with groundings and counts into a csv file

    Parameters
    ----------
    filename : str
        Filepath for output file
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding
    """
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, db_id, count in group[1]:
            if db == 'UP':
                name = uniprot_client.get_mnemonic(db_id)
            else:
                name = ''
            row = [text_string, db, db_id, count, name]
            rows.append(row)

    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
Example #41
def update_id_mappings(g):
    """Compile all ID->label mappings and save to a TSV file.

    Parameters
    ----------
    g : rdflib.Graph
        RDF graph containing GO data.
    """
    g = load_go_graph(go_owl_path)

    query = _prefixes + """
        SELECT ?id ?label
        WHERE {
            ?class oboInOwl:id ?id .
            ?class rdfs:label ?label
        }
    """
    logger.info("Querying for GO ID mappings")
    res = g.query(query)
    mappings = []
    for id_lit, label_lit in sorted(res, key=lambda x: x[0]):
        mappings.append((id_lit.value, label_lit.value))
    # Write to file
    write_unicode_csv(go_mappings_file, mappings, delimiter='\t')
Example #42
def analyze(filename):
    results = load_file(filename)

    all_stmts = [stmt for paper_stmts in results.values()
                      for stmt in paper_stmts]

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    # Get complexes
    complexes = [s for s in pa.unique_stmts if isinstance(s, Complex)]
    # Get HGNC grounding
    protein_complexes = [s for s in complexes
                         if all('HGNC' in ag.db_refs
                                for ag in s.agent_list())]

    logger.info('Mapping gene IDs to gene symbols')
    gene_ids = list(set([ag.db_refs['HGNC'] for stmt in protein_complexes
                                            for ag in stmt.members]))
    genes = [hgnc_client.get_hgnc_name(id) for id in gene_ids]

    # Get complexes from BioGrid and combine duplicates
    num_genes_per_query = 50
    start_indices = range(0, len(genes), num_genes_per_query)
    end_indices = [i + num_genes_per_query
                   if i + num_genes_per_query < len(genes) else len(genes)
                   for i in start_indices]
    bg_complexes = []
    for i in range(len(start_indices)):
        logger.info("Querying biogrid for %s" %
                    str(genes[start_indices[i]:end_indices[i]]))
        bg_complexes += (bg.get_statements(
                                genes[start_indices[i]:end_indices[i]]))

    # Filter out Biogrid statements not involving genes in the gene list
    # (this will make duplicate removal more efficient)
    bg_filt = []
    for stmt in bg_complexes:
        if stmt.members[0].name in genes and \
           stmt.members[1].name in genes:
            bg_filt.append(stmt)
    # Might as well free up some memory
    del bg_complexes

    logger.info("Combining duplicates with biogrid...")
    pa = Preassembler(hierarchies, bg_filt + protein_complexes)
    pa.combine_duplicates()

    indra_only = []
    bg_only = []
    indra_and_bg = []
    for stmt in pa.unique_stmts:
        evidence_source_list = set([])
        for e in stmt.evidence:
            evidence_source_list.add(e.source_api)
        if 'reach' in evidence_source_list and \
           'biogrid' in evidence_source_list:
            indra_and_bg.append(stmt)
        elif 'reach' in evidence_source_list and \
             'biogrid' not in evidence_source_list:
            indra_only.append(stmt)
        elif 'reach' not in evidence_source_list and \
             'biogrid' in evidence_source_list:
            bg_only.append(stmt)

    rows = []
    for stmt in indra_only:
        rows.append([stmt.members[0].name, stmt.members[1].name,
                     str(len(stmt.evidence))])
    write_unicode_csv('unmatched_complexes.tsv', rows, delimiter='\t')

    return {'indra_only': indra_only,
            'bg_only': bg_only,
            'indra_and_bg': indra_and_bg}
            # Convert HGNC ids to names
            if 'HGNC' in db_refs and string_is_integer(db_refs['HGNC']):
                db_refs['HGNC'] = get_hgnc_name(db_refs['HGNC'])

            if len(db_refs.keys()) > 0:
                text_to_refs[text] = db_refs
        counter = counter + 1

        progress = math.floor(100.0 * float(counter)
                              / float(len(statement_list)))
        if progress > percent_done:
            percent_done = progress
            elapsed_min = (time.time()-start_time) / 60.0
            logger.info(('%d%% done with processing statements '
                         '(%f minutes elapsed)')
                        % (percent_done, elapsed_min))
    logger.info('\tDone!')

    # Convert into a list of lists
    logger.info('Writing grounding map to file')
    refs_list = []
    for text in text_to_refs.keys():
        row = [text]
        for (db, ref) in text_to_refs[text].items():
            row.append(db)
            row.append(ref)
        refs_list.append(row)
    write_unicode_csv(args.output_file, refs_list)
    logger.info('\tDone!')
    stmts_by_rule = {}
    for paper, stmts in stmts_by_paper.items():
        for stmt in stmts:
            found_by_rule = stmt.evidence[0].annotations['found_by']
            stmt_list = stmts_by_rule.get(found_by_rule)
            if stmt_list is None:
                stmts_by_rule[found_by_rule] = [stmt]
            else:
                stmt_list.append(stmt)

    with open('reach_stmts_by_rule.pkl', 'wb') as f:
        pickle.dump(stmts_by_rule, f, protocol=2)

    frequencies = [(k, len(v)) for k, v in stmts_by_rule.items()]
    frequencies.sort(key=lambda x: x[1], reverse=True)
    write_unicode_csv('reach_rule_frequencies.tsv', frequencies,
                      delimiter='\t')

    sample_rows = []
    max_sample_size = 20
    for rule, freq in frequencies:
        stmts = stmts_by_rule[rule]
        if max_sample_size < len(stmts):
            sample_stmts = np.random.choice(stmts,
                                            max_sample_size, replace=False)
        else:
            sample_stmts = stmts
        for stmt in sample_stmts:
            for ag in stmt.agent_list():
                if ag is not None:
                    ag.name = ag.db_refs.get('TEXT')
            is_hypothesis = stmt.evidence[0].epistemics.get('hypothesis', '')