def save_sentences(twg, stmts, filename, agent_limit=300):
    """Write evidence sentences for ungrounded agent texts to a CSV file.

    Parameters
    ----------
    twg : list of tuple
        Tuples whose first element is an ungrounded agent text. Only that
        first element is used here. The list should be sorted by descending
        mention count so that the most frequent texts come first.
    stmts : list of :py:class:`indra.statements.Statement`
        Statements handed to ``get_sentences_for_agent`` for each text.
    filename : str
        Path to the output file.
    agent_limit : Optional[int]
        Number of agent texts, taken from the front of `twg`, to include in
        the output file. Default is 300.
    """
    agent_texts = [entry[0] for entry in twg]
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    sentences = []
    for num_done, agent_text in enumerate(agent_texts, start=1):
        # Each output row is the agent text followed by the sentence tuple
        for sentence_tup in get_sentences_for_agent(agent_text, stmts):
            sentences.append((agent_text,) + sentence_tup)
        # The limit is checked after processing an agent text
        if num_done >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def update_pubchem_mesh_map():
    """Update the one-to-one PubChem CID to MeSH ID mapping table.

    Downloads the PubChem CID-MeSH table, keeps only pairs in which both
    the PubChem CID and the MeSH term name occur exactly once, converts
    MeSH names to IDs via ``bio_ontology.get_id_from_name`` and writes the
    result to ``pubchem_mesh_map.tsv`` in the resource folder.

    Raises
    ------
    requests.HTTPError
        If the download returns an HTTP error status.
    """
    url = 'https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Extras/CID-MeSH'
    res = requests.get(url)
    # Fail loudly on an HTTP error rather than parsing an error page and
    # overwriting the existing resource file with its content
    res.raise_for_status()
    # We first get mapping pairs from the table. Each row is a PubChem CID
    # followed by one or more tab-separated MeSH term names.
    mappings = []
    for line in res.text.split('\n'):
        parts = line.split('\t')
        for part in parts[1:]:
            mappings.append((parts[0], part))
    # The table has (1) rows with multiple MeSH terms separated by tabs,
    # (2) multiple rows with the same PubChem CID and (3) multiple rows
    # with the same MeSH term. We retain only one-to-one mappings here.
    pc_count = Counter(pcid for pcid, _ in mappings)
    mesh_count = Counter(meshname for _, meshname in mappings)
    unique_mappings = [(pcid, meshname) for pcid, meshname in mappings
                       if pc_count[pcid] == 1 and mesh_count[meshname] == 1]
    # The mappings table is given using MeSH term names so we need
    # to convert these to IDs. Lookups can fail for several reasons:
    # some entries are simply not valid MeSH names, others are not
    # yet included in the INDRA MeSH resources/ontology.
    unique_with_id = []
    for pcid, meshname in unique_mappings:
        mesh_ns_id_tuple = bio_ontology.get_id_from_name('MESH', meshname)
        if mesh_ns_id_tuple:
            # The second element of the returned tuple is used as the ID
            unique_with_id.append((pcid, mesh_ns_id_tuple[1]))
    fname = os.path.join(path, 'pubchem_mesh_map.tsv')
    logger.info('Saving into %s' % fname)
    write_unicode_csv(fname, unique_with_id, delimiter='\t')
def save_base_map(filename, grouped_by_text):
    """Dump agent texts with their groundings and counts into a CSV file.

    Parameters
    ----------
    filename : str
        Filepath for the output file.
    grouped_by_text : list of tuple
        Tuples of the form output by agent_texts_with_grounding: the first
        element is the agent text and the second is an iterable of
        (db, db_id, count) triples.
    """
    rows = []
    for group in grouped_by_text:
        agent_text = group[0]
        groundings = group[1]
        # A mnemonic is looked up only for UniProt ('UP') groundings; all
        # other namespaces get an empty name column.
        rows.extend(
            [agent_text, db, db_id, count,
             uniprot_client.get_mnemonic(db_id) if db == 'UP' else '']
            for db, db_id, count in groundings
        )
    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def update_hmdb_chebi_map():
    """Update the HMDB ID to ChEBI ID mapping table.

    Streams through ``hmdb_metabolites.xml`` inside the HMDB metabolites
    zip archive in the resource folder, collects the first accession and
    the ChEBI ID of each metabolite, and writes the pairs, sorted by HMDB
    ID and preceded by a header row, to ``hmdb_to_chebi.tsv``.

    The archive is downloaded only if it is not already present in the
    resource folder.
    """
    logger.info('--Updating HMDB to ChEBI entries----')
    ns = {'hmdb': 'http://www.hmdb.ca'}
    url = 'http://www.hmdb.ca/system/downloads/current/hmdb_metabolites.zip'
    fname = os.path.join(path, 'hmdb_metabolites.zip')
    # Reuse an existing local archive; otherwise fetch it so that opening
    # the zip file below does not fail on a fresh checkout.
    if not os.path.exists(fname):
        logger.info('Downloading %s' % url)
        urlretrieve(url, fname)

    # Fully qualified tag names, computed once outside the parsing loop
    metabolite_tag = '{%s}metabolite' % ns['hmdb']
    accession_tag = '{%s}accession' % ns['hmdb']
    chebi_tag = '{%s}chebi_id' % ns['hmdb']

    mappings = []
    hmdb_id = None
    chebi_id = None
    with ZipFile(fname) as input_zip:
        with input_zip.open('hmdb_metabolites.xml') as fh:
            for event, elem in ET.iterparse(fh, events=('start', 'end')):
                if event == 'start':
                    # A new metabolite record begins: reset collected IDs
                    if elem.tag == metabolite_tag:
                        hmdb_id = None
                        chebi_id = None
                    continue
                # Element text is only guaranteed to be complete on 'end'
                # events, so all text is read here rather than on 'start'.
                if elem.tag == accession_tag:
                    # Important: we only look at accession if there's no
                    # HMDB ID yet, otherwise we pick up secondary
                    # accession tags
                    if not hmdb_id:
                        hmdb_id = elem.text
                elif elem.tag == chebi_tag:
                    chebi_id = elem.text
                elif elem.tag == metabolite_tag:
                    if hmdb_id and chebi_id:
                        mappings.append([hmdb_id, chebi_id])
                    # Free the processed record to keep memory bounded
                    elem.clear()
    fname = os.path.join(path, 'hmdb_to_chebi.tsv')
    mappings = [['HMDB_ID', 'CHEBI_ID']] + sorted(mappings,
                                                  key=lambda x: x[0])
    write_unicode_csv(fname, mappings, delimiter='\t')
def update_mesh_names():
    """Update the MeSH descriptor ID to name mapping table.

    Downloads ``desc2018.xml`` into the current working directory, extracts
    for each descriptor record its ID, its name and the other term names of
    its preferred concept, and writes one row per descriptor to
    ``mesh_id_label_mappings.tsv`` in the resource folder.
    """
    url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/desc2018.xml'
    xml_fname = 'desc2018.xml'
    urlretrieve(url, xml_fname)
    # Process the XML and find descriptor records
    tree = ET.parse(xml_fname)
    rows = []
    for record in tree.findall('DescriptorRecord'):
        # The ID and the main name of the descriptor
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        # Additional names come from the terms of the preferred concept
        synonyms = []
        for concept in record.findall('ConceptList/Concept'):
            if concept.attrib['PreferredConceptYN'] != 'Y':
                continue
            term_names = (term.find('String').text
                          for term in concept.findall('TermList/Term'))
            # The main name itself is not repeated among the synonyms
            synonyms.extend(tn for tn in term_names if tn != name)
        # Synonyms go into a single pipe-separated column
        rows.append((uid, name, '|'.join(synonyms)))
    out_fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(out_fname, rows, delimiter='\t')
def update_mesh_supplementary_names():
    """Update MeSH ID to name mappings for supplementary terms.

    Uses the 2021 MeSH supplement, downloading it into the resource folder
    only if it is not already there. For each supplemental record the
    output row holds the ID, the name, the term name string produced by
    ``_get_term_name_str`` and a comma-separated list of the descriptor IDs
    the record is mapped to (with ``*`` characters removed). Rows are
    written to ``mesh_supp_id_label_mappings.tsv``.
    """
    supp_url = ('ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/'
                'xmlmesh/supp2021.gz')
    supp_path = os.path.join(path, 'mesh_supp2021.gz')
    already_downloaded = os.path.exists(supp_path)
    if not already_downloaded:
        logging.info('Download MeSH supplement from %s', supp_url)
        urlretrieve(supp_url, supp_path)
        logging.info('Done downloading MeSH supplement')
    with gzip.open(supp_path) as supp_file:
        logging.info('Parsing MeSH supplement')
        supp_et = ET.parse(supp_file)

    # Location of the descriptor IDs a supplemental record maps to
    mapped_to_xpath = ('HeadingMappedToList/HeadingMappedTo/'
                       'DescriptorReferredTo/DescriptorUI')
    supp_rows = []
    for record in supp_et.iterfind('SupplementalRecord'):
        uid = record.find('SupplementalRecordUI').text
        name = record.find('SupplementalRecordName/String').text
        descriptor_uis = [elem.text.replace('*', '')
                          for elem in record.findall(mapped_to_xpath)]
        mapped_to = ','.join(descriptor_uis)
        term_name_str = _get_term_name_str(record, name)
        supp_rows.append((uid, name, term_name_str, mapped_to))

    fname = os.path.join(path, 'mesh_supp_id_label_mappings.tsv')
    write_unicode_csv(fname, supp_rows, delimiter='\t')
def update_mesh_names():
    """Update Mesh ID to name and tree number mappings.

    Uses the 2021 MeSH descriptor file, downloading it into the resource
    folder only if it is not already there. Each output row holds the
    descriptor ID, its name, the term name string produced by
    ``_get_term_name_str`` and a pipe-separated list of tree numbers. Rows
    are written to ``mesh_id_label_mappings.tsv``.
    """
    url = ('ftp://nlmpubs.nlm.nih.gov/online/mesh/MESH_FILES/'
           'xmlmesh/desc2021.gz')
    desc_path = os.path.join(path, 'mesh_desc2021.gz')
    if not os.path.exists(desc_path):
        logging.info('Download MeSH descriptors from %s', url)
        urlretrieve(url, desc_path)
        logging.info('Done downloading MeSH descriptors')

    # Process the XML and find descriptor records
    with gzip.open(desc_path) as desc_file:
        logging.info('Parsing MeSH descriptors')
        et = ET.parse(desc_file)

    def _record_to_row(record):
        # The ID and the name come first, then the derived columns
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        term_name_str = _get_term_name_str(record, name)
        tree_number_elems = record.findall('TreeNumberList/TreeNumber')
        tree_numbers_str = '|'.join(elem.text for elem in tree_number_elems)
        return uid, name, term_name_str, tree_numbers_str

    rows = [_record_to_row(record)
            for record in et.iterfind('DescriptorRecord')]

    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
def update_mesh_names():
    """Update the MeSH descriptor ID to name mapping table.

    The 2018 descriptor XML is downloaded to ``desc2018.xml`` in the
    current working directory and parsed. Each output row holds the
    descriptor ID, the descriptor name and a pipe-separated list of the
    other term names of the preferred concept. Rows are written to
    ``mesh_id_label_mappings.tsv`` in the resource folder.
    """
    url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/desc2018.xml'
    urlretrieve(url, 'desc2018.xml')

    def _other_term_names(record, name):
        # Collect the names of all terms of the preferred concept(s),
        # leaving out the descriptor's own name
        collected = []
        for concept in record.findall('ConceptList/Concept'):
            if concept.attrib['PreferredConceptYN'] == 'Y':
                for term in concept.findall('TermList/Term'):
                    term_name = term.find('String').text
                    if term_name != name:
                        collected.append(term_name)
        return collected

    # Process the XML and find descriptor records
    et = ET.parse('desc2018.xml')
    rows = []
    for record in et.findall('DescriptorRecord'):
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        term_name_str = '|'.join(_other_term_names(record, name))
        rows.append((uid, name, term_name_str))

    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
def update_drugbank_mappings():
    """Update mappings from DrugBank to CHEBI/CHEMBL

    Collects DrugBank->ChEMBL, DrugBank->ChEBI and ChEBI->DrugBank cross
    references as well as DrugBank names through PyOBO and writes them,
    sorted and preceded by a header row, to ``drugbank_mappings.tsv``.
    The last column of each row names the resource the entry came from.
    """
    # Note that for this to work, PyOBO (https://github.com/pyobo/pyobo) has
    # to be installed and the DrugBank download
    # (https://www.drugbank.ca/releases/latest) put into ~/.obo/drugbank/
    # Note that the DrugBank download requires signing up for an account and
    # waiting for approval.
    import pyobo
    drugbank_chembl = pyobo.get_filtered_xrefs('drugbank', 'chembl.compound')
    drugbank_chebi = pyobo.get_filtered_xrefs('drugbank', 'chebi')
    chebi_drugbank = pyobo.get_filtered_xrefs('chebi', 'drugbank')
    drugbank_names = pyobo.get_id_name_mapping('drugbank')

    rows = [[db_id, 'CHEMBL', chembl_id, 'drugbank']
            for db_id, chembl_id in drugbank_chembl.items()]
    rows += [[db_id, 'CHEBI', chebi_id, 'drugbank']
             for db_id, chebi_id in drugbank_chebi.items()]
    # Here the mapping is keyed by ChEBI ID but the row still leads with
    # the DrugBank ID
    rows += [[db_id, 'CHEBI', chebi_id, 'chebi']
             for chebi_id, db_id in chebi_drugbank.items()]
    rows += [[db_id, 'NAME', db_name, 'drugbank']
             for db_id, db_name in drugbank_names.items()]

    header = ['DRUGBANK_ID', 'NAMESPACE', 'ID', 'SOURCE']
    fname = os.path.join(path, 'drugbank_mappings.tsv')
    write_unicode_csv(fname, [header] + sorted(rows), delimiter='\t')
def map_statements(stmts, source, outfile=None):
    """Tabulate valid, invalid, and mapped sites from a set of Statements.

    Parameters
    ----------
    stmts : list
        Statements passed to ``SiteMapper.map_sites``.
    source : str
        Value stored in the last field of every generated SiteInfo.
    outfile : Optional[str]
        If given, the tabulated sites are also written to this CSV file
        with an upper-cased header row.

    Returns
    -------
    list of SiteInfo
        One entry per site key found in the site mapper's cache.
    """
    # Look for errors in database statements. The returned statements are
    # not needed here, only the statistics collected on the mapper.
    sm = SiteMapper(default_site_map)
    sm.map_sites(stmts)
    # Collect stats from SiteMapper itself
    sites = []
    for site_key, mapping in sm._cache.items():
        gene, res, pos = site_key
        freq = sm._sitecount[site_key]
        if mapping == 'VALID':
            valid, mapped, mapped_res, mapped_pos, explanation = \
                                                  (1, 0, None, None, None)
        else:
            valid = 0
            # Not mapped
            if mapping is None:
                mapped, mapped_res, mapped_pos, explanation = \
                                                    (0, None, None, None)
            # Mapped!
            else:
                mapped_res, mapped_pos, explanation = mapping
                # Only count as mapped if a mapped position is available
                mapped = 1 if mapped_pos else 0
        si = SiteInfo(gene, res, pos, valid, mapped, mapped_res, mapped_pos,
                      explanation, freq, source)
        sites.append(si)
    # Write to CSV file
    if outfile:
        # Take the field names from the SiteInfo type itself rather than
        # from the last loop variable, which does not exist if no sites
        # were cached. NOTE(review): assumes SiteInfo is a namedtuple (it
        # is used via _asdict elsewhere) -- confirm.
        header = [[field.upper() for field in SiteInfo._fields]]
        rows = header + replace_nones(sites)
        write_unicode_csv(outfile, rows)
    return sites
def update_lspci():
    """Update the LSPCI resource table.

    Combines the groundings attached to subjects of TAS statements (grouped
    by their LSPCI ID) with names read from ``lsp_compound_names.csv`` in
    the current working directory, and writes one row per LSPCI ID with a
    name to the ``lspci.tsv`` resource file. Members are written as sorted,
    pipe-separated ``namespace:id`` strings.
    """
    # We first create a dict of LSPCIs and their members but only for ones
    # that actually have TAS statements corresponding to them
    from indra.sources import tas
    tp = tas.process_from_web(affinity_class_limit=10)
    lspci_members = defaultdict(set)
    for stmt in tp.statements:
        subj_refs = stmt.subj.db_refs
        if 'LSPCI' not in subj_refs:
            continue
        lspci = subj_refs.get('LSPCI')
        for ref_ns, ref_id in subj_refs.items():
            # The text and the LSPCI ID itself are not members
            if ref_ns not in {'TEXT', 'LSPCI'}:
                lspci_members[lspci].add((ref_ns, ref_id))

    # We then process the names table in a way that we always prioritize the
    # first row for each LSPCI since the table is pre-sorted by priority
    df = pandas.read_csv('lsp_compound_names.csv', dtype={'lspci_id': str})
    lspcid_names = {}
    for _, name_row in df.iterrows():
        lspcid_names.setdefault(name_row['lspci_id'], name_row['name'])

    # We can now combine the two sources filtering to only entries that have
    # names
    rows = [['lspcid', 'name', 'members']]
    for lspcid, members in lspci_members.items():
        if lspcid in lspcid_names:
            member_strs = sorted('%s:%s' % member for member in members)
            rows.append([lspcid, lspcid_names[lspcid],
                         '|'.join(member_strs)])
    write_unicode_csv(get_resource_path('lspci.tsv'), rows, delimiter='\t')
def get_stmt_sif(stmts, fname):
    """Write a simple interaction table for two-agent statements.

    Parameters
    ----------
    stmts : list
        Statements to export. Statements that do not have exactly two
        non-None agents are left out.
    fname : str
        Path of the output file. Each row holds the first agent's name, the
        statement UUID and the second agent's name.
    """
    rows = []
    for stmt in stmts:
        names = [agent.name for agent in stmt.agent_list()
                 if agent is not None]
        if len(names) == 2:
            first_name, second_name = names
            rows.append((first_name, stmt.uuid, second_name))
    write_unicode_csv(fname, rows)
def update_famplex_map():
    """Regenerate ``famplex_map.tsv`` from the FamPlex equivalences file.

    The rows of ``famplex/equivalences.csv`` are copied unchanged into a
    tab-separated file in the resource folder.
    """
    logger.info('--Updating FamPlex map----')
    # Currently this is a trivial "copy" of the FamPlex equivalences.csv
    # file. Later, name spaces may need to be adapted and other format
    # changes may be needed.
    source_csv = os.path.join(path, 'famplex/equivalences.csv')
    target_tsv = os.path.join(path, 'famplex_map.tsv')
    equivalence_rows = read_unicode_csv(source_csv)
    write_unicode_csv(target_tsv, equivalence_rows, delimiter='\t')
def print_stmts(stmts, file_name):
    """Write statements with agent groundings and evidence text to a TSV.

    Parameters
    ----------
    stmts : list
        Statements to export; each must have at least one evidence.
    file_name : str
        Path of the tab-separated output file. Each row holds the string
        form of the statement, a comma-separated list of ``name(db_refs)``
        entries for its non-None agents, and the text of its first
        evidence.
    """
    rows = []
    for stmt in stmts:
        agent_strs = ['%s(%s)' % (agent.name, agent.db_refs)
                      for agent in stmt.agent_list() if agent is not None]
        rows.append([str(stmt), ', '.join(agent_strs),
                     stmt.evidence[0].text])
    write_unicode_csv(file_name, rows, delimiter='\t')
def update_chebi_entries():
    """Update the ChEBI to PubChem and ChEBI to ChEMBL mapping tables.

    Downloads the ChEBI ``reference.tsv.gz`` table into the resource
    folder and derives two tab-separated files from it:
    ``chebi_to_pubchem.tsv`` (post-processed to drop SID rows and strip the
    CID prefix) and ``chebi_to_chembl.tsv``. Both are sorted by compound ID
    and reference ID.
    """
    logger.info('--Updating ChEBI entries----')
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
        'Flat_file_tab_delimited/reference.tsv.gz'
    fname = os.path.join(path, 'reference.tsv.gz')
    urlretrieve(url, fname)
    with gzip.open(fname, 'rb') as fh:
        logger.info('Loading %s' % fname)
        df = pandas.read_csv(fh, sep='\t', index_col=None,
                             parse_dates=True, encoding='latin-1')
    sort_cols = ['COMPOUND_ID', 'REFERENCE_ID']
    # Save PubChem mapping
    fname = os.path.join(path, 'chebi_to_pubchem.tsv')
    logger.info('Saving into %s' % fname)
    # Sort into a new frame instead of sorting the filtered slice in place,
    # which triggers pandas' SettingWithCopyWarning
    df_pubchem = df[df['REFERENCE_DB_NAME'] == 'PubChem'].sort_values(
        sort_cols, ascending=True)
    df_pubchem.to_csv(fname, sep='\t', columns=sort_cols,
                      header=['CHEBI', 'PUBCHEM'], index=False)

    # Process PubChem mapping to eliminate SID rows and strip CID: prefix
    # If the second column of the row starts with SID:, ignore the row
    # If the second column of the row starts with CID:, strip out the CID
    # prefix. Otherwise, include the row unchanged (this covers the header)
    original_rows = read_unicode_csv(fname, '\t')
    new_rows = []
    for original_row in original_rows:
        reference_id = original_row[1]
        if reference_id.startswith('CID:'):
            # Five characters are dropped, one more than 'CID:' itself.
            # NOTE(review): presumably the prefix is followed by a space
            # in the source table -- confirm.
            original_row[1] = reference_id[5:]
            new_rows.append(original_row)
        elif reference_id.startswith('SID:'):
            # Skip SID rows
            continue
        else:
            # Include other rows unchanged
            new_rows.append(original_row)
    write_unicode_csv(fname, new_rows, '\t')

    # Save ChEMBL mapping
    fname = os.path.join(path, 'chebi_to_chembl.tsv')
    logger.info('Saving into %s' % fname)
    df_chembl = df[df['REFERENCE_DB_NAME'] == 'ChEMBL'].sort_values(
        sort_cols, ascending=True)
    df_chembl.to_csv(fname, sep='\t', columns=sort_cols,
                     header=['CHEBI', 'CHEMBL'], index=False)
def print_stmts(stmts, file_name):
    """Dump statements, their agents' groundings and a sentence to a TSV.

    Parameters
    ----------
    stmts : list
        Statements to export; the first evidence of each one is used.
    file_name : str
        Path of the tab-separated output file.
    """
    def _agent_summary(stmt):
        # 'name(db_refs)' for every agent slot that is filled
        summaries = []
        for agent in stmt.agent_list():
            if agent is None:
                continue
            summaries.append('%s(%s)' % (agent.name, agent.db_refs))
        return ', '.join(summaries)

    rows = []
    for stmt in stmts:
        db_refs_str = _agent_summary(stmt)
        rows.append([str(stmt), db_refs_str, stmt.evidence[0].text])
    write_unicode_csv(file_name, rows, delimiter='\t')
def update_chebi_entries():
    """Update the table of ChEBI IDs, names and secondary IDs.

    Takes the terms returned by ``_get_chebi_obo_terms`` and writes one row
    per term (ID, name, comma-separated secondary IDs), preceded by a
    header row, to ``chebi_entries.tsv`` in the resource folder.
    """
    term_entries = _get_chebi_obo_terms()
    # Make the name and secondary table
    fname = os.path.join(path, 'chebi_entries.tsv')
    rows = [['CHEBI_ID', 'NAME', 'SECONDARIES']]
    # The fourth element of each entry (parents) is not part of this table
    for term_id, name, secondaries, _ in term_entries:
        rows.append([term_id, name, ','.join(secondaries)])
    # write_unicode_csv is given the file name and not a file handle, so no
    # separate handle on the same path is opened here.
    write_unicode_csv(fname, rows, '\t')
def update_bioentities_map():
    """Regenerate ``bioentities_map.tsv`` from the Bioentities equivalences.

    The rows of ``equivalences.csv`` in the Bioentities folder (located
    relative to the resource folder) are copied unchanged into a
    tab-separated file in the resource folder.
    """
    logger.info('--Updating Bioentities map----')
    # Currently this is a trivial "copy" of the Bioentities equivalences.csv
    # file. Later, name spaces may need to be adapted and other format
    # changes may be needed.
    source_csv = os.path.join(path, '../../bioentities/equivalences.csv')
    target_tsv = os.path.join(path, 'bioentities_map.tsv')
    equivalence_rows = read_unicode_csv(source_csv)
    write_unicode_csv(target_tsv, equivalence_rows, delimiter='\t')
def make_model(self, output_file, add_curation_cols=False, up_only=False):
    """Export the statements into a tab-separated text file.

    Statements with more than two agents are skipped (and logged);
    statements with a single agent leave the second agent columns to be
    filled from None. Only the first evidence of each statement is
    exported.

    Parameters
    ----------
    output_file : str
        Name of the output file.
    add_curation_cols : bool
        Whether to add columns to facilitate statement curation. Default
        is False (no additional columns).
    up_only : bool
        Whether to include identifiers.org links *only* for the Uniprot
        grounding of an agent when one is available. Because most
        spreadsheets allow only a single hyperlink per cell, this can make
        it easier to link to Uniprot information pages for curation
        purposes. Default is False.
    """
    base_cols = ['INDEX', 'UUID', 'TYPE', 'STR',
                 'AG_A_TEXT', 'AG_A_LINKS', 'AG_A_STR',
                 'AG_B_TEXT', 'AG_B_LINKS', 'AG_B_STR',
                 'PMID', 'TEXT', 'IS_HYP', 'IS_DIRECT']
    curation_cols = ['AG_A_IDS_CORRECT', 'AG_A_STATE_CORRECT',
                     'AG_B_IDS_CORRECT', 'AG_B_STATE_CORRECT',
                     'EVENT_CORRECT',
                     'RES_CORRECT', 'POS_CORRECT', 'SUBJ_ACT_CORRECT',
                     'OBJ_ACT_CORRECT', 'HYP_CORRECT', 'DIRECT_CORRECT']
    stmt_header = base_cols + curation_cols if add_curation_cols \
        else base_cols
    rows = [stmt_header]

    for ix, stmt in enumerate(self.statements):
        agents = stmt.agent_list()
        # Complexes
        if len(agents) > 2:
            logger.info("Skipping statement with more than two members: %s"
                        % stmt)
            continue
        # Self-modifications, ActiveForms
        if len(agents) == 1:
            ag_a, ag_b = agents[0], None
        # All others
        else:
            ag_a, ag_b = agents
        first_ev = stmt.evidence[0]
        # Put together the data row; the index is 1-based and follows the
        # position in self.statements, so skipped statements leave gaps
        row = [ix + 1, stmt.uuid, stmt.__class__.__name__, str(stmt)]
        row += _format_agent_entries(ag_a, up_only)
        row += _format_agent_entries(ag_b, up_only)
        row += [first_ev.pmid, first_ev.text,
                first_ev.epistemics.get('hypothesis', ''),
                first_ev.epistemics.get('direct', '')]
        if add_curation_cols:
            # One empty cell per curation column
            row += [''] * len(curation_cols)
        rows.append(row)
    # Write to file
    write_unicode_csv(output_file, rows, delimiter='\t')
def update_grounding_map():
    """Rebuild the combined grounding map from FamPlex and COVID maps.

    The rows of the COVID grounding map are padded with empty strings to
    the column count of the first FamPlex row, appended to the FamPlex
    rows, and the result is written to ``grounding/grounding_map.csv`` in
    the resource folder.
    """
    famplex_gmap = os.path.join(path, 'famplex', 'grounding_map.csv')
    covid_gmap = os.path.join(path, 'grounding', 'covid_grounding.csv')
    famplex_rows = list(read_unicode_csv(famplex_gmap))
    # The first FamPlex row defines the target number of columns
    row_len = len(famplex_rows[0])
    padded_covid_rows = []
    for covid_row in list(read_unicode_csv(covid_gmap)):
        num_missing = row_len - len(covid_row)
        padded_covid_rows.append(covid_row + [''] * num_missing)
    grounding_map = os.path.join(path, 'grounding', 'grounding_map.csv')
    write_unicode_csv(grounding_map, famplex_rows + padded_covid_rows)
def make_model(self, output_file, add_curation_cols=False, up_only=False):
    """Export the statements into a tab-separated text file.

    One row is written per statement with one or two agents, using the
    statement's first evidence. Statements with more than two agents are
    logged and left out.

    Parameters
    ----------
    output_file : str
        Name of the output file.
    add_curation_cols : bool
        Whether to add columns to facilitate statement curation. Default
        is False (no additional columns).
    up_only : bool
        Whether to include identifiers.org links *only* for the Uniprot
        grounding of an agent when one is available. Because most
        spreadsheets allow only a single hyperlink per cell, this can make
        it easier to link to Uniprot information pages for curation
        purposes. Default is False.
    """
    stmt_header = ['INDEX', 'UUID', 'TYPE', 'STR',
                   'AG_A_TEXT', 'AG_A_LINKS', 'AG_A_STR',
                   'AG_B_TEXT', 'AG_B_LINKS', 'AG_B_STR',
                   'PMID', 'TEXT', 'IS_HYP', 'IS_DIRECT']
    if add_curation_cols:
        stmt_header = stmt_header + [
            'AG_A_IDS_CORRECT', 'AG_A_STATE_CORRECT',
            'AG_B_IDS_CORRECT', 'AG_B_STATE_CORRECT',
            'EVENT_CORRECT',
            'RES_CORRECT', 'POS_CORRECT', 'SUBJ_ACT_CORRECT',
            'OBJ_ACT_CORRECT', 'HYP_CORRECT', 'DIRECT_CORRECT',
        ]
    # Empty cells appended to each row, one per curation column
    curation_padding = [''] * 11 if add_curation_cols else []

    rows = [stmt_header]
    for position, stmt in enumerate(self.statements, start=1):
        agent_list = stmt.agent_list()
        num_agents = len(agent_list)
        # Complexes
        if num_agents > 2:
            logger.info("Skipping statement with more than two members: %s"
                        % stmt)
            continue
        if num_agents == 1:
            # Self-modifications, ActiveForms
            ag_a = agent_list[0]
            ag_b = None
        else:
            # All others
            (ag_a, ag_b) = agent_list
        evidence = stmt.evidence[0]
        epistemics = evidence.epistemics
        # Put together the data row
        row = [position, stmt.uuid, stmt.__class__.__name__, str(stmt)] + \
            _format_agent_entries(ag_a, up_only) + \
            _format_agent_entries(ag_b, up_only) + \
            [evidence.pmid, evidence.text,
             epistemics.get('hypothesis', ''),
             epistemics.get('direct', '')]
        rows.append(row + curation_padding)
    # Write to file
    write_unicode_csv(output_file, rows, delimiter='\t')
def save_indra_db_stmts(stmts, fname='indra_phosphosites.csv'):
    """Write one CSV row per evidence of the given statements.

    Parameters
    ----------
    stmts : list
        Statements with ``enz``, ``sub``, ``residue``, ``position`` and
        ``evidence`` attributes.
    fname : Optional[str]
        Path of the output CSV file. Defaults to 'indra_phosphosites.csv'
        in the current working directory, the location that was previously
        hard-coded.
    """
    csv_rows = [('KINASE', 'KINASE_TEXT', 'SUBSTRATE', 'SUBSTRATE_TEXT',
                 'RESIDUE', 'POSITION', 'SOURCE', 'DIRECT', 'PMID',
                 'SENTENCE')]
    for stmt in stmts:
        enz, sub = stmt.enz, stmt.sub
        for ev in stmt.evidence:
            # A missing or falsy 'direct' epistemic is reported as 'False'
            is_direct = 'True' if ev.epistemics.get('direct') else 'False'
            csv_rows.append((enz.name, enz.db_refs.get('TEXT'),
                             sub.name, sub.db_refs.get('TEXT'),
                             stmt.residue, stmt.position,
                             ev.source_api, is_direct, ev.pmid, ev.text))
    write_unicode_csv(fname, csv_rows)
def dump_table(text_grounding_cnt, ev_text_for_agent_text, fname):
    """Dump grounding counts with an example evidence into a TSV file.

    Parameters
    ----------
    text_grounding_cnt : collections.Counter
        Counter whose keys are sequences starting with the agent text; the
        key's elements fill the leading columns of each row. Rows are
        written in ``most_common`` order.
    ev_text_for_agent_text : dict
        Maps an agent text to a (pmid, evidence text) pair.
    fname : str
        Path of the tab-separated output file.
    """
    header = ['text', 'grounding', 'standard_name', 'url',
              'gilda_grounding', 'count', 'pmid', 'ev_text']
    rows = [header]
    for data, count in text_grounding_cnt.most_common():
        agent_text = data[0]
        pmid, ev_text = ev_text_for_agent_text[agent_text]
        row = list(data)
        row.extend([str(count), pmid, ev_text])
        rows.append(row)
    write_unicode_csv(fname, rows, delimiter='\t')
def save_base_map(filename, grouped_by_text):
    """Dump agent texts along with groundings and counts into a CSV file.

    Parameters
    ----------
    filename : str
        Filepath for the output file.
    grouped_by_text : list of tuple
        Each tuple holds an agent text followed by an iterable of
        (db, id, count) triples.
    """
    rows = []
    for group in grouped_by_text:
        text_string = group[0]
        for db, db_id, count in group[1]:
            # Only UniProt groundings get a mnemonic in the name column
            name = uniprot_client.get_mnemonic(db_id) if db == 'UP' else ''
            rows.append([text_string, db, db_id, count, name])
    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def save_base_map(filename, grouped_by_text):
    """Write a CSV of agent texts with their groundings and counts.

    Parameters
    ----------
    filename : str
        Filepath for the output file.
    grouped_by_text : list of tuple
        Each tuple holds an agent text followed by an iterable of
        (db, id, count) triples.
    """
    def _name_for(db, db_id):
        # A name is looked up for UniProt entries only
        if db != 'UP':
            return ''
        return uniprot_client.get_mnemonic(db_id)

    rows = []
    for group in grouped_by_text:
        agent_text = group[0]
        for db, db_id, count in group[1]:
            rows.append([agent_text, db, db_id, count, _name_for(db, db_id)])
    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def save_sentences(twg, stmts, filename, agent_limit=300):
    """Write sentences for the top ungrounded agent texts to a CSV file.

    Parameters
    ----------
    twg : list of tuple
        Tuples whose first element is an agent text; texts are processed
        in the order given.
    stmts : list
        Statements handed to ``get_sentences_for_agent`` for each text.
    filename : str
        Path to the output file.
    agent_limit : Optional[int]
        Number of agent texts from the front of `twg` to process. Default
        is 300.
    """
    unmapped_texts = [entry[0] for entry in twg]
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    sentences = []
    num_processed = 0
    for agent_text in unmapped_texts:
        # Prefix every sentence tuple with the agent text it belongs to
        sentences.extend((agent_text,) + sentence_tup
                         for sentence_tup in
                         get_sentences_for_agent(agent_text, stmts))
        num_processed += 1
        if num_processed >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def save_sentences(twg, stmts, filename, agent_limit=300):
    """Save sentences for the most frequent ungrounded agent texts as CSV.

    Parameters
    ----------
    twg : list of tuple
        Tuples whose first element is an agent text; only that element is
        used and texts are handled in list order.
    stmts : list
        Statements handed to ``get_sentences_for_agent`` for each text.
    filename : str
        Path to the output file.
    agent_limit : Optional[int]
        Number of agent texts from the front of `twg` to handle. Default
        is 300.
    """
    agent_texts = [item[0] for item in twg]
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    sentences = []
    for idx, agent_text in enumerate(agent_texts):
        agent_sentences = get_sentences_for_agent(agent_text, stmts)
        sentences += [(agent_text,) + tup for tup in agent_sentences]
        # idx is zero-based, so idx + 1 texts have been handled so far
        if idx + 1 >= agent_limit:
            break
    # Write sentences to CSV file
    write_unicode_csv(filename, sentences, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def update_selventa_entries():
    """Update the table of Selventa entries with their cross references.

    Reads the four Selventa resource tables from the OpenBEL
    resource-generator repository, converts each entry's XREF column into a
    sorted, pipe-separated ``namespace:id`` string using INDRA namespaces,
    and writes the sorted rows (namespace, ID, label, xrefs) to
    ``selventa_entries.tsv`` in the resource folder.
    """
    fname = os.path.join(path, 'selventa_entries.tsv')
    # Selventa cross-reference prefix -> INDRA namespace
    xref_mappings = {
        'CHEBI': 'CHEBI',
        'MESHC': 'MESH',
        'MESHD': 'MESH',
        'MESHPP': 'MESH',
        'GOBP': 'GO',
        'GOCC': 'GO',
        'DO': 'DOID',
    }

    def process_selventa_xref(xref):
        # Missing cross references become an empty string
        if pandas.isna(xref):
            return ''
        db_refs = {}
        for xref_part in xref.split('|'):
            prefix, raw_id = xref_part.split(':', maxsplit=1)
            indra_ns = xref_mappings.get(prefix)
            if indra_ns:
                db_refs[indra_ns] = ensure_prefix_if_needed(indra_ns, raw_id)
            else:
                logger.info('Unknown namespace: %s' % prefix)
        assert_valid_db_refs(db_refs)
        ref_strs = ['%s:%s' % (ref_ns, ref_id)
                    for ref_ns, ref_id in sorted(db_refs.items())]
        return '|'.join(ref_strs)

    # Selventa namespace -> name of the resource file (without extension)
    resources = {
        'SCHEM': 'selventa-legacy-chemical-names',
        'SDIS': 'selventa-legacy-diseases',
        'SCOMP': 'selventa-named-complexes',
        'SFAM': 'selventa-protein-families'
    }
    base_url = ('https://raw.githubusercontent.com/OpenBEL/resource-generator'
                '/master/datasets/')
    rows = []
    for ns, resource in resources.items():
        df = pandas.read_csv(base_url + resource + '.txt', sep='\t',
                             comment='#')
        rows.extend(
            [ns, df_row['ID'], df_row['LABEL'],
             process_selventa_xref(df_row['XREF'])]
            for _, df_row in df.iterrows()
        )
    write_unicode_csv(fname, sorted(rows))
def update_chebi_entries():
    """Update the ChEBI to PubChem and ChEBI to ChEMBL mapping tables.

    Downloads the ChEBI ``reference.tsv.gz`` table into the resource
    folder and derives two tab-separated files from it:
    ``chebi_to_pubchem.tsv`` (post-processed to drop SID rows and strip the
    CID prefix) and ``chebi_to_chembl.tsv``. Both are sorted by compound ID
    and reference ID.
    """
    logger.info('--Updating ChEBI entries----')
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
        'Flat_file_tab_delimited/reference.tsv.gz'
    fname = os.path.join(path, 'reference.tsv.gz')
    urlretrieve(url, fname)
    with gzip.open(fname, 'rb') as fh:
        logger.info('Loading %s' % fname)
        df = pandas.read_csv(fh, sep='\t', index_col=None,
                             parse_dates=True, encoding='latin-1')

    def _save_mapping(db_name, out_fname, header):
        # Select the rows for one reference database and write them sorted.
        # Sorting returns a new frame; sorting the filtered slice in place
        # triggers pandas' SettingWithCopyWarning.
        df_db = df[df['REFERENCE_DB_NAME'] == db_name].sort_values(
            ['COMPOUND_ID', 'REFERENCE_ID'], ascending=True)
        df_db.to_csv(out_fname, sep='\t',
                     columns=['COMPOUND_ID', 'REFERENCE_ID'],
                     header=header, index=False)

    # Save PubChem mapping
    fname = os.path.join(path, 'chebi_to_pubchem.tsv')
    logger.info('Saving into %s' % fname)
    _save_mapping('PubChem', fname, ['CHEBI', 'PUBCHEM'])

    # Process PubChem mapping to eliminate SID rows and strip CID: prefix
    # If the second column of the row starts with SID:, ignore the row
    # If the second column of the row starts with CID:, strip out the CID
    # prefix. Otherwise, include the row unchanged (this covers the header)
    new_rows = []
    for row in read_unicode_csv(fname, '\t'):
        if row[1].startswith('SID:'):
            # Skip SID rows
            continue
        if row[1].startswith('CID:'):
            # Five characters are dropped, one more than 'CID:' itself.
            # NOTE(review): presumably the prefix is followed by a space
            # in the source table -- confirm.
            row[1] = row[1][5:]
        new_rows.append(row)
    write_unicode_csv(fname, new_rows, '\t')

    # Save ChEMBL mapping
    fname = os.path.join(path, 'chebi_to_chembl.tsv')
    logger.info('Saving into %s' % fname)
    _save_mapping('ChEMBL', fname, ['CHEBI', 'CHEMBL'])
def map_agents(mod_agents_file, sm, source, save_csv=True):
    """Tabulate valid, invalid, and mapped sites from a set of Agents.

    Parameters
    ----------
    mod_agents_file : str
        Path to a pickle file holding a list of agents, each with at least
        one modification in ``mods``.
    sm : SiteMapper
        Site mapper whose ``_check_agent_mod`` method is used to check the
        modification sites of each agent.
    source : str
        Value stored in the last field of every generated SiteInfo.
    save_csv : Optional[bool]
        If True (default), the tabulated sites are written to a CSV file
        next to `mod_agents_file`, with the file extension replaced by
        ``.csv``.

    Returns
    -------
    list of SiteInfo
        One entry per distinct site, with ``freq`` set to the number of
        agents that produced that site.
    """
    # Imported locally so the block does not depend on a file-level import
    import os

    # Load the agents
    # NOTE: pickle.load can execute arbitrary code; only use this function
    # on pickle files from a trusted source.
    with open(mod_agents_file, 'rb') as f:
        mod_agents = pickle.load(f)
    print("Mapping %s" % mod_agents_file)
    sites = []
    for ag in mod_agents:
        invalid_sites = sm._check_agent_mod(ag, ag.mods, True, True, True)
        # Valid
        if not invalid_sites:
            valid, mapped, mapped_res, mapped_pos, explanation = \
                                                  (1, 0, None, None, None)
        else:
            assert len(invalid_sites) == 1
            mapping = invalid_sites[0][1]
            valid = 0
            # Not mapped
            if mapping is None:
                mapped, mapped_res, mapped_pos, explanation = \
                                                    (0, None, None, None)
            # Mapped!
            else:
                mapped_res, mapped_pos, explanation = mapping
                # Only count as mapped if a mapped position is available
                mapped = 1 if mapped_pos else 0
        # The frequency is filled in below once all sites are collected
        si = SiteInfo(ag.name, ag.mods[0].residue, ag.mods[0].position,
                      valid, mapped, mapped_res, mapped_pos, explanation,
                      None, source)
        sites.append(si)
    # Now that we've collected a list of all the sites, tabulate frequencies
    site_counter = Counter(sites)
    sites_with_freq = []
    for site, site_freq in site_counter.items():
        site_args = site._asdict()
        site_args['freq'] = site_freq
        sites_with_freq.append(SiteInfo(**site_args))
    # Write to CSV file
    if save_csv:
        # The header is a single row (a list inside a list) and is taken
        # from the SiteInfo type so it is available even without sites.
        header = [[field.upper() for field in SiteInfo._fields]]
        # NOTE(review): the tabulated sites are written so the FREQ column
        # is populated; before this, the un-tabulated list (freq always
        # None) was passed -- confirm this is the intended output.
        rows = header + replace_nones(sites_with_freq)
        # splitext only strips the final extension, so dots elsewhere in
        # the path are left alone
        csv_fname = os.path.splitext(mod_agents_file)[0] + '.csv'
        write_unicode_csv(csv_fname, rows)
    return sites_with_freq
def update_grounding_map():
    """Rebuild the combined grounding map from FamPlex and extra maps.

    The file names of the extra (non-FamPlex) grounding maps are read from
    ``grounding/extra_gmap_files.json``. Their rows are padded with empty
    strings to the column count of the first FamPlex row, appended to the
    FamPlex rows, and the result is written to
    ``grounding/grounding_map.csv`` in the resource folder.
    """
    grounding_dir = os.path.join(path, 'grounding')
    famplex_gmap = os.path.join(path, 'famplex', 'grounding_map.csv')
    famplex_rows = list(read_unicode_csv(famplex_gmap))
    # The first FamPlex row defines the target number of columns
    row_len = len(famplex_rows[0])
    # read in json file containing filenames for non-famplex grounding maps
    with open(os.path.join(grounding_dir, 'extra_gmap_files.json')) as f:
        extra_gm_files = json.load(f)
    # Add non-famplex grounding map rows, adding blank values to synchronize
    # the number of columns with the number in the famplex grounding map
    all_rows = list(famplex_rows)
    for gm_filename in extra_gm_files:
        gmap = os.path.join(grounding_dir, gm_filename)
        for gm_row in list(read_unicode_csv(gmap)):
            all_rows.append(gm_row + [''] * (row_len - len(gm_row)))
    grounding_map = os.path.join(grounding_dir, 'grounding_map.csv')
    write_unicode_csv(grounding_map, all_rows)
def update_mesh_supplementary_names():
    """Update MeSH ID to name mappings for supplementary terms.

    Uses the 2018 MeSH supplement, downloading it into the resource folder
    only if it is not already there. Each output row holds the record ID,
    its name and the term name string produced by ``_get_term_name_str``.
    Rows are written to ``mesh_supp_id_label_mappings.tsv``.
    """
    supp_url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/supp2018.gz'
    supp_path = os.path.join(path, 'mesh_supp2018.gz')
    if not os.path.exists(supp_path):
        logging.info('Download MeSH supplement from %s', supp_url)
        urlretrieve(supp_url, supp_path)
        logging.info('Done downloading MeSH supplement')
    with gzip.open(supp_path) as supp_file:
        logging.info('Parsing MeSH supplement')
        supp_et = ET.parse(supp_file)

    def _record_to_row(record):
        uid = record.find('SupplementalRecordUI').text
        name = record.find('SupplementalRecordName/String').text
        return uid, name, _get_term_name_str(record, name)

    supp_rows = [_record_to_row(record)
                 for record in supp_et.iterfind('SupplementalRecord')]
    fname = os.path.join(path, 'mesh_supp_id_label_mappings.tsv')
    write_unicode_csv(fname, supp_rows, delimiter='\t')
def update_uniprot_subcell_loc():
    """Update the UniProt subcellular location to GO ID mapping table.

    Downloads UniProt's ``subcell.txt``, splits it into entries and, for
    every entry that has both an ``AC`` and a ``GO`` line, records the
    accession and the GO ID. The pairs are written to
    ``uniprot_subcell_loc.tsv`` in the resource folder.
    """
    logger.info('--Updating UniProt subcellular location--')
    url = 'https://www.uniprot.org/docs/subcell.txt'
    res = requests.get(url)
    res.raise_for_status()
    # A line of 75 underscores separates the file header from the entries;
    # the header itself is not used.
    _, entry_block = res.text.split('_' * 75)
    mappings = []
    for entry in entry_block.split('//'):
        slid = goid = None
        for line in entry.split('\n'):
            # The value starts at the sixth character of the line
            if line.startswith('AC'):
                slid = line[5:].strip()
            if line.startswith('GO'):
                # Keep only the part before the first semicolon
                goid = line[5:].split(';')[0]
        if slid and goid:
            mappings.append((slid, goid))
    fname = os.path.join(path, 'uniprot_subcell_loc.tsv')
    write_unicode_csv(fname, mappings, delimiter='\t')
def update_mesh_names():
    """Update the MeSH descriptor ID to name mapping table.

    Uses the 2018 MeSH descriptor file, downloading it into the resource
    folder only if it is not already there. Each output row holds the
    descriptor ID, its name and the term name string produced by
    ``_get_term_name_str``. Rows are written to
    ``mesh_id_label_mappings.tsv``.
    """
    url = 'ftp://nlmpubs.nlm.nih.gov/online/mesh/2018/xmlmesh/desc2018.gz'
    desc_path = os.path.join(path, 'mesh_desc2018.gz')
    needs_download = not os.path.exists(desc_path)
    if needs_download:
        logging.info('Download MeSH descriptors from %s', url)
        urlretrieve(url, desc_path)
        logging.info('Done downloading MeSH descriptors')
    # Process the XML and find descriptor records
    with gzip.open(desc_path) as desc_file:
        logging.info('Parsing MeSH descriptors')
        descriptor_tree = ET.parse(desc_file)
    rows = []
    for record in descriptor_tree.iterfind('DescriptorRecord'):
        # The ID and the name come first, then the derived term names
        uid = record.find('DescriptorUI').text
        name = record.find('DescriptorName/String').text
        rows.append((uid, name, _get_term_name_str(record, name)))
    fname = os.path.join(path, 'mesh_id_label_mappings.tsv')
    write_unicode_csv(fname, rows, delimiter='\t')
def update_uniprot_subcell_loc():
    """Update the UniProt subcellular location to GO ID mapping table.

    The entries of UniProt's ``subcell.txt`` are scanned for ``AC`` and
    ``GO`` lines; every entry that provides both contributes one
    (accession, GO ID) row to ``uniprot_subcell_loc.tsv`` in the resource
    folder.
    """
    def _parse_entry(entry):
        # Return (slid, goid) for one entry; either may be None if the
        # entry has no matching line. Later lines override earlier ones.
        slid = None
        goid = None
        for line in entry.split('\n'):
            if line.startswith('AC'):
                slid = line[5:].strip()
            if line.startswith('GO'):
                goid = line[5:].split(';')[0]
        return slid, goid

    logger.info('--Updating UniProt subcellular location--')
    url = 'https://www.uniprot.org/docs/subcell.txt'
    res = requests.get(url)
    res.raise_for_status()
    # The part before the separator line of underscores is ignored
    header, entry_block = res.text.split('_' * 75)
    mappings = []
    for entry in entry_block.split('//'):
        slid, goid = _parse_entry(entry)
        if slid and goid:
            mappings.append((slid, goid))
    fname = os.path.join(path, 'uniprot_subcell_loc.tsv')
    write_unicode_csv(fname, mappings, delimiter='\t')
def update_secondary_mappings(g):
    """Compile all secondary ID->primary ID mappings and save to a TSV file.

    The graph is queried for every class that has both an ID and an
    alternative ID. Results are ordered by primary ID and written as
    (secondary ID, primary ID) rows to the secondary mappings file.

    Parameters
    ----------
    g : rdflib.Graph
        RDF graph containing GO data.
    """
    query = _prefixes + """
        SELECT ?id ?secid
        WHERE {
            ?class oboInOwl:id ?id .
            ?class oboInOwl:hasAlternativeId ?secid
        }
    """
    logger.info("Querying for GO secondary ID mappings")
    results = g.query(query)
    # Each result is a (primary ID, secondary ID) pair of literals; the
    # output puts the secondary ID in the first column
    ordered_results = sorted(results, key=lambda result: result[0])
    mappings = [(sec_id_lit.value, id_lit.value)
                for id_lit, sec_id_lit in ordered_results]
    # Write to file
    write_unicode_csv(secondary_mappings_file, mappings, delimiter='\t')
def update_id_mappings(g):
    """Compile all ID->label mappings and save to a TSV file.

    The graph is queried for every class that has both an ID and a label.
    Results are ordered by ID and written as (ID, label) rows to the GO
    mappings file.

    Parameters
    ----------
    g : rdflib.Graph
        RDF graph containing GO data.
    """
    query = _prefixes + """
        SELECT ?id ?label
        WHERE {
            ?class oboInOwl:id ?id .
            ?class rdfs:label ?label
        }
    """
    logger.info("Querying for GO ID mappings")
    results = g.query(query)
    mappings = []
    # Each result is an (ID, label) pair of literals
    for result in sorted(results, key=lambda result: result[0]):
        id_lit, label_lit = result
        mappings.append((id_lit.value, label_lit.value))
    # Write to file
    write_unicode_csv(go_mappings_file, mappings, delimiter='\t')
def create_site_csv():
    """Merge the pickled site tables from all sources into one CSV file.

    Sites are read, in order, from the BioPAX, BEL and reader pickle files,
    converted with ``ms_to_si`` and written to ``ALL_SITES_CSV`` beneath a
    header row made of the upper-cased field names of the first site.

    Raises
    ------
    IndexError
        If no sites were loaded at all, since the header is taken from the
        first site.
    """
    def _unpickle(pkl_path):
        # NOTE(review): pickle is only appropriate for trusted local files
        with open(pkl_path, 'rb') as handle:
            return pickle.load(handle)

    collected = []

    # BioPAX: sites grouped by source database, as (site, frequency) pairs
    for source_db, site_freqs in _unpickle(BIOPAX_SITES_BY_DB).items():
        collected.extend(ms_to_si(source_db, count, site)
                         for site, count in site_freqs)

    # BEL: a flat sequence of (site, frequency) pairs
    collected.extend(ms_to_si('bel', count, site)
                     for site, count in _unpickle(BEL_SITES))

    # Reading systems: a site -> frequency mapping per reader
    for reader_name, freq_by_site in _unpickle(READER_SITES).items():
        collected.extend(ms_to_si(reader_name, count, site)
                         for site, count in freq_by_site.items())

    column_names = [name.upper() for name in collected[0]._asdict()]
    write_unicode_csv(ALL_SITES_CSV,
                      [column_names] + replace_nones(collected))
def save_sentences(twg, stmts, filename, agent_limit=300):
    """Dump evidence sentences for the most frequent ungrounded agents.

    Parameters
    ----------
    twg : list of tuple
        Tuples whose first element is an ungrounded agent text, ordered by
        descending mention count (the form produced by the function
        ungrounded texts).
    stmts : list of :py:class:`indra.statements.Statement`
        Statements to search for evidence sentences.
    filename : str
        Path to output file.
    agent_limit : Optional[int]
        Number of agents to include in output file. Takes the top agents
        by count.
    """
    agent_texts = [entry[0] for entry in twg]
    logger.info('Getting sentences for top %d unmapped agent texts.' %
                agent_limit)
    rows = []
    for n_done, agent_text in enumerate(agent_texts, start=1):
        found = get_sentences_for_agent(agent_text, stmts)
        # Each output row is the agent text followed by the sentence tuple
        rows.extend((agent_text,) + sentence for sentence in found)
        # The limit is checked after an agent has been processed
        if n_done >= agent_limit:
            break
    write_unicode_csv(filename, rows, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def save_base_map(filename, grouped_by_text):
    """Write agent texts with their groundings and counts to a CSV file.

    Every grounding of every agent text becomes one row of the form
    ``[text, db, db_id, count, name]``, where ``name`` is the UniProt
    mnemonic for 'UP' groundings and an empty string otherwise.

    Parameters
    ----------
    filename : str
        Filepath for output file.
    grouped_by_text : list of tuple
        List of tuples of the form output by agent_texts_with_grounding.
    """
    table = []
    for entry in grouped_by_text:
        agent_text = entry[0]
        for namespace, identifier, n_mentions in entry[1]:
            mnemonic = (uniprot_client.get_mnemonic(identifier)
                        if namespace == 'UP' else '')
            table.append([agent_text, namespace, identifier, n_mentions,
                          mnemonic])
    write_unicode_csv(filename, table, delimiter=',', quotechar='"',
                      quoting=csv.QUOTE_MINIMAL, lineterminator='\r\n')
def update_id_mappings(g):
    """Compile all ID->label mappings and save to a TSV file.

    Parameters
    ----------
    g : rdflib.Graph or None
        RDF graph containing GO data. If None, the graph is loaded from
        ``go_owl_path`` instead.
    """
    # Previously the argument was always overwritten by a fresh load from
    # disk, so the caller's graph was ignored. Only fall back to loading
    # when no graph was supplied.
    if g is None:
        g = load_go_graph(go_owl_path)
    query = _prefixes + """ SELECT ?id ?label WHERE { ?class oboInOwl:id ?id . ?class rdfs:label ?label } """
    logger.info("Querying for GO ID mappings")
    res = g.query(query)
    # Rows are written in order of GO ID
    mappings = [(id_lit.value, label_lit.value)
                for id_lit, label_lit in sorted(res, key=lambda x: x[0])]
    # Write to file
    write_unicode_csv(go_mappings_file, mappings, delimiter='\t')
def analyze(filename):
    """Compare Complex statements from reading against BioGRID.

    Statements loaded from `filename` are grounding-mapped and
    deduplicated. Complexes whose agents all carry an HGNC grounding are
    then pooled with BioGRID statements for the same genes and
    deduplicated again. The merged statements are sorted by evidence
    source, and complexes supported only by 'reach' evidence are written
    to ``unmatched_complexes.tsv``.

    Parameters
    ----------
    filename : str
        Passed to ``load_file``; the loaded object's values are iterated
        as collections of statements.

    Returns
    -------
    dict
        Lists of statements under the keys 'indra_only' (reach evidence
        but no biogrid evidence), 'bg_only' (biogrid but no reach) and
        'indra_and_bg' (both). Statements with neither source are omitted.
    """
    results = load_file(filename)
    all_stmts = [stmt for paper_stmts in results.values()
                 for stmt in paper_stmts]

    # Map grounding
    logger.info('Mapping grounding...')
    gmap = gm.GroundingMapper(gm.default_grounding_map)
    map_stmts = gmap.map_agents(all_stmts)
    map_stmts = gmap.rename_agents(map_stmts)

    # Combine duplicates
    logger.info('Removing duplicates...')
    pa = Preassembler(hierarchies, map_stmts)
    pa.combine_duplicates()

    # Keep only complexes in which every agent has an HGNC grounding
    complexes = [s for s in pa.unique_stmts if isinstance(s, Complex)]
    protein_complexes = [
        s for s in complexes
        if all('HGNC' in ag.db_refs for ag in s.agent_list())
    ]

    logger.info('Mapping gene IDs to gene symbols')
    gene_ids = list({ag.db_refs['HGNC'] for stmt in protein_complexes
                     for ag in stmt.members})
    genes = [hgnc_client.get_hgnc_name(gene_id) for gene_id in gene_ids]
    # Set copy for constant-time membership tests in the filter below
    gene_set = set(genes)

    # Query BioGRID in fixed-size batches of gene names; slicing clamps
    # the final batch to the end of the list
    num_genes_per_query = 50
    bg_complexes = []
    for start in range(0, len(genes), num_genes_per_query):
        gene_batch = genes[start:start + num_genes_per_query]
        logger.info("Querying biogrid for %s" % str(gene_batch))
        bg_complexes += bg.get_statements(gene_batch)

    # Filter out BioGRID statements not involving genes in the gene list
    # (this makes duplicate removal more efficient)
    bg_filt = [stmt for stmt in bg_complexes
               if stmt.members[0].name in gene_set
               and stmt.members[1].name in gene_set]
    # Might as well free up some memory
    del bg_complexes

    logger.info("Combining duplicates with biogrid...")
    pa = Preassembler(hierarchies, bg_filt + protein_complexes)
    pa.combine_duplicates()

    indra_only = []
    bg_only = []
    indra_and_bg = []
    for stmt in pa.unique_stmts:
        sources = {ev.source_api for ev in stmt.evidence}
        has_reach = 'reach' in sources
        has_biogrid = 'biogrid' in sources
        if has_reach and has_biogrid:
            indra_and_bg.append(stmt)
        elif has_reach:
            indra_only.append(stmt)
        elif has_biogrid:
            bg_only.append(stmt)

    # One row per reading-only complex: both member names and the number
    # of evidences supporting it
    rows = [[stmt.members[0].name, stmt.members[1].name,
             str(len(stmt.evidence))]
            for stmt in indra_only]
    write_unicode_csv('unmatched_complexes.tsv', rows, delimiter='\t')

    return {'indra_only': indra_only,
            'bg_only': bg_only,
            'indra_and_bg': indra_and_bg}
# Convert HGNC ids to names if 'HGNC' in db_refs and string_is_integer(db_refs['HGNC']): db_refs['HGNC'] = get_hgnc_name(db_refs['HGNC']) if len(db_refs.keys()) > 0: text_to_refs[text] = db_refs counter = counter + 1 progress = math.floor(100.0 * float(counter) / float(len(statement_list))) if progress > percent_done: percent_done = progress ellapsed_min = (time.time()-start_time) / 60.0 logger.info(('%d%% done with processing statements ' '(%f minutes elapsed)') % (percent_done, ellapsed_min)) logger.info('\tDone!') # Convert into a list of lists logger.info('Writing grounding map to file') refs_list = [] for text in text_to_refs.keys(): row = [text] for (db, ref) in text_to_refs[text].items(): row.append(db) row.append(ref) refs_list.append(row) write_unicode_csv(args.output_file, refs_list) logger.info('\tDone!')
stmts_by_rule = {} for paper, stmts in stmts_by_paper.items(): for stmt in stmts: found_by_rule = stmt.evidence[0].annotations['found_by'] stmt_list = stmts_by_rule.get(found_by_rule) if stmt_list is None: stmts_by_rule[found_by_rule] = [stmt] else: stmt_list.append(stmt) with open('reach_stmts_by_rule.pkl', 'wb') as f: pickle.dump(stmts_by_rule, f, protocol=2) frequencies = [(k, len(v)) for k, v in stmts_by_rule.items()] frequencies.sort(key=lambda x: x[1], reverse=True) write_unicode_csv('reach_rule_frequencies.tsv', frequencies, delimiter='\t') sample_rows = [] max_sample_size = 20 for rule, freq in frequencies: stmts = stmts_by_rule[rule] if max_sample_size < len(stmts): sample_stmts = np.random.choice(stmts, max_sample_size, replace=False) else: sample_stmts = stmts for stmt in sample_stmts: for ag in stmt.agent_list(): if ag is not None: ag.name = ag.db_refs.get('TEXT') is_hypothesis = stmt.evidence[0].epistemics.get('hypothesis', '')