Example #1
def load_grounding_map(grounding_map_path, ignore_path=None):
    g_map = {}
    map_rows = read_unicode_csv(grounding_map_path,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\r\n')
    if ignore_path and os.path.exists(ignore_path):
        ignore_rows = read_unicode_csv(ignore_path,
                                       delimiter=',',
                                       quotechar='"',
                                       quoting=csv.QUOTE_MINIMAL,
                                       lineterminator='\r\n')
    else:
        ignore_rows = []
    csv_rows = chain(map_rows, ignore_rows)
    for row in csv_rows:
        key = row[0]
        db_refs = {'TEXT': key}
        keys = [entry for entry in row[1::2] if entry != '']
        values = [entry for entry in row[2::2] if entry != '']
        if len(keys) != len(values):
            logger.info('ERROR: Mismatched keys and values in row %s' %
                        str(row))
            continue
        else:
            db_refs.update(dict(zip(keys, values)))
            if len(db_refs.keys()) > 1:
                g_map[key] = db_refs
            else:
                g_map[key] = None
    return g_map
Example #2
def load_grounding_map(grounding_map_path, ignore_path=None):
    g_map = {}
    map_rows = read_unicode_csv(grounding_map_path, delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\r\n')
    if ignore_path and os.path.exists(ignore_path):
        ignore_rows = read_unicode_csv(ignore_path, delimiter=',',
                                    quotechar='"',
                                    quoting=csv.QUOTE_MINIMAL,
                                    lineterminator='\r\n')
    else:
        ignore_rows = []
    csv_rows = chain(map_rows, ignore_rows)
    for row in csv_rows:
        key = row[0]
        db_refs = {'TEXT': key}
        keys = [entry for entry in row[1::2] if entry != '']
        values = [entry for entry in row[2::2] if entry != '']
        if len(keys) != len(values):
            logger.info('ERROR: Mismatched keys and values in row %s' %
                        str(row))
            continue
        else:
            db_refs.update(dict(zip(keys, values)))
            if len(db_refs.keys()) > 1:
                g_map[key] = db_refs
            else:
                g_map[key] = None
    return g_map
Example #3
def process_from_file(signor_data_file, signor_complexes_file=None):
    """Process Signor interaction data from CSV files.

    Parameters
    ----------
    signor_data_file : str
        Path to the Signor interaction data file in CSV format.
    signor_complexes_file : str
        Path to the Signor complexes data in CSV format. If unspecified,
        Signor complexes will not be expanded to their constituents.

    Returns
    -------
    indra.sources.signor.SignorProcessor
        SignorProcessor containing Statements extracted from the Signor data.
    """
    # Get generator over the CSV file
    data_iter = read_unicode_csv(signor_data_file, delimiter=';', skiprows=1)
    complexes_iter = None
    if signor_complexes_file:
        complexes_iter = read_unicode_csv(signor_complexes_file, delimiter=';',
                                          skiprows=1)
    else:
        logger.warning('Signor complex mapping file not provided, Statements '
                       'involving complexes will not be expanded to members.')
    return _processor_from_data(data_iter, complexes_iter)
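A minimal usage sketch for process_from_file above; the file names are hypothetical placeholders for semicolon-delimited Signor exports:

# Hypothetical input paths; the extracted INDRA Statements are then
# available on the returned processor (typically via its statements list).
sp = process_from_file('signor_all_data.csv',
                       signor_complexes_file='signor_complexes.csv')
signor_statements = sp.statements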
Example #4
def process_from_file(signor_data_file, signor_complexes_file=None):
    """Process Signor interaction data from CSV files.

    Parameters
    ----------
    signor_data_file : str
        Path to the Signor interaction data file in CSV format.
    signor_complexes_file : str
        Path to the Signor complexes data in CSV format. If unspecified,
        Signor complexes will not be expanded to their constituents.

    Returns
    -------
    indra.sources.signor.SignorProcessor
        SignorProcessor containing Statements extracted from the Signor data.
    """
    # Get generator over the CSV file
    data_iter = read_unicode_csv(signor_data_file, delimiter=';', skiprows=1)
    complexes_iter = None
    if signor_complexes_file:
        complexes_iter = read_unicode_csv(signor_complexes_file,
                                          delimiter=';',
                                          skiprows=1)
    else:
        logger.warning('Signor complex mapping file not provided, Statements '
                       'involving complexes will not be expanded to members.')
    return _processor_from_data(data_iter, complexes_iter)
Example #5
def update_grounding_map():
    famplex_gmap = os.path.join(path, 'famplex', 'grounding_map.csv')
    covid_gmap = os.path.join(path, 'grounding', 'covid_grounding.csv')
    famplex_rows = list(read_unicode_csv(famplex_gmap))
    row_len = len(famplex_rows[0])
    covid_rows = list(read_unicode_csv(covid_gmap))
    covid_rows = [r + [''] * (row_len - len(r)) for r in covid_rows]
    all_rows = famplex_rows + covid_rows
    grounding_map = os.path.join(path, 'grounding', 'grounding_map.csv')
    write_unicode_csv(grounding_map, all_rows)
Example #6
def _build_uniprot_entries():
    up_entries_file = os.path.dirname(os.path.abspath(__file__)) + \
        '/../resources/uniprot_entries.tsv'
    uniprot_gene_name = {}
    uniprot_mnemonic = {}
    uniprot_mnemonic_reverse = {}
    uniprot_mgi = {}
    uniprot_rgd = {}
    uniprot_mgi_reverse = {}
    uniprot_rgd_reverse = {}
    try:
        csv_rows = read_unicode_csv(up_entries_file, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        for row in csv_rows:
            up_id, gene_name, up_mnemonic, rgd, mgi = row
            uniprot_gene_name[up_id] = gene_name
            uniprot_mnemonic[up_id] = up_mnemonic
            uniprot_mnemonic_reverse[up_mnemonic] = up_id
            if mgi:
                mgi_ids = mgi.split(';')
                if mgi_ids:
                    uniprot_mgi[up_id] = mgi_ids[0]
                    uniprot_mgi_reverse[mgi_ids[0]] = up_id
            if rgd:
                rgd_ids = rgd.split(';')
                if rgd_ids:
                    uniprot_rgd[up_id] = rgd_ids[0]
                    uniprot_rgd_reverse[rgd_ids[0]] = up_id
    except IOError:
        pass
    return (uniprot_gene_name, uniprot_mnemonic, uniprot_mnemonic_reverse, \
            uniprot_mgi, uniprot_rgd, uniprot_mgi_reverse, uniprot_rgd_reverse)
Example #7
def build_prior(site_map, site_labels, prot_corr_dict, rna_corr_dict,
                prot_default, rna_default, peptide_specific=True,
                num_features=100):
    peptide_file = \
        'sources/retrospective_ova_phospho_sort_common_gene_10057.txt'
    counter = 0
    prior = {}
    for row in read_unicode_csv(peptide_file, delimiter='\t', skiprows=1):
        site_id = row[0]
        gene_sym, rem = site_id.split('.', maxsplit=1)
        rs_id, site_info = rem.split(':')
        if site_id in site_map and peptide_specific:
            brca_site_ix_list = site_map[site_id]
            if len(brca_site_ix_list) > 1:
                print("More than one site for %s" % site_id)
            brca_site_ix = brca_site_ix_list[0] # FIXME
            brca_site = site_labels[brca_site_ix]
            prot_prior = [t[0][0] for t in prot_corr_dict[brca_site]]
            rna_prior = [t[0][0] for t in rna_corr_dict[brca_site]]
            prior[site_id] = (prot_prior[0:num_features],
                              rna_prior[0:num_features])
            counter += 1
        else:
            prior[site_id] = (prot_default[0:num_features],
                              rna_default[0:num_features])
    print("%d peptide-specific priors used" % counter)
    return prior
Example #8
def get_site_map(site_labels):
    # For each site in the SC3 phospho data, see if we have a matching site in
    # the BRCA phospho data
    site_map = {}
    brca_site_keys = [t[0] for t in site_labels]
    brca_ix_map = {}
    for ix, brca_site in enumerate(brca_site_keys):
        brca_ix_map[brca_site] = ix
    for row in read_unicode_csv('mapped_peptides.txt', delimiter='\t',
                                skiprows=1):
        site_id = row[0]
        gene_name = row[2]
        orig_site = row[4]
        mapped_site = row[7]
        site_ixs = set()
        for site in ((gene_name, orig_site), (gene_name, mapped_site)):
            brca_ix = brca_ix_map.get(site)
            # Index 0 is a valid match, so compare against None explicitly
            if brca_ix is not None:
                site_ixs.add(brca_ix)
        # If there's no mapping, don't add to the map
        if not site_ixs:
            continue
        # Otherwise, add
        if site_id in site_map:
            site_map[site_id] |= site_ixs
        else:
            site_map[site_id] = site_ixs
    site_map_list = {}
    for k, v in site_map.items():
        site_map_list[k] = list(v)
    return site_map_list
Example #9
def _build_human_mouse_rat():
    hgnc_file = os.path.dirname(os.path.abspath(__file__)) +\
                '/../resources/hgnc_entries.tsv'
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t')
    # Skip the header row
    next(csv_rows)
    uniprot_mouse = {}
    uniprot_rat = {}
    for row in csv_rows:
        human_id, mgi_id, rgd_id = row[6:9]
        if human_id:
            if mgi_id:
                mgi_id = mgi_id.split(', ')[0]
                if mgi_id.startswith('MGI:'):
                    mgi_id = mgi_id[4:]
                mouse_id = uniprot_mgi_reverse.get(mgi_id)
                if mouse_id:
                    uniprot_mouse[human_id] = mouse_id
            if rgd_id:
                rgd_id = rgd_id.split(', ')[0]
                if rgd_id.startswith('RGD:'):
                    rgd_id = rgd_id[4:]
                rat_id = uniprot_rgd_reverse.get(rgd_id)
                if rat_id:
                    uniprot_rat[human_id] = rat_id
    return uniprot_mouse, uniprot_rat
Example #10
    def __init__(self, biogrid_file=None, physical_only=True):
        self.statements = []
        self.physical_only = physical_only

        # If a path to the file is included, process it, skipping the header
        if biogrid_file:
            rows = read_unicode_csv(biogrid_file, '\t', skiprows=1)
        # If no file is provided, download from web
        else:
            logger.info('No data file specified, downloading from BioGrid '
                        'at %s' % biogrid_file_url)
            rows = _download_biogrid_data(biogrid_file_url)

        # Process the rows into Statements
        for row in rows:
            filt_row = [None if item == '-' else item for item in row]
            bg_row = _BiogridRow(*filt_row)
            # Filter out non-physical interactions if desired
            if self.physical_only and bg_row.exp_system_type != 'physical':
                continue
            # Ground agents
            agent_a = self._make_agent(bg_row.entrez_a, bg_row.syst_name_a)
            agent_b = self._make_agent(bg_row.entrez_b, bg_row.syst_name_b)
            # Skip any agents with neither HGNC grounding nor a string name
            if agent_a is None or agent_b is None:
                continue
            # Get evidence
            ev = Evidence(source_api='biogrid',
                          source_id=bg_row.biogrid_int_id,
                          pmid=bg_row.pmid,
                          text=None,
                          annotations=dict(bg_row._asdict()))
            # Make statement
            s = Complex([agent_a, agent_b], evidence=ev)
            self.statements.append(s)
Example #11
def _load_label_id_mappings():
    go_label_to_id = {}
    go_id_to_label = {}
    for go_id, go_label in read_unicode_csv(go_mappings_file, delimiter='\t'):
        go_id_to_label[go_id] = go_label
        go_label_to_id[go_label] = go_id
    return go_id_to_label, go_label_to_id
Example #12
def _read_hgnc_maps():
    hgnc_file = os.path.dirname(os.path.abspath(__file__)) + \
                '/../resources/hgnc_entries.tsv'
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    for row in csv_rows:
        hgnc_id = row[0][5:]
        hgnc_status = row[3]
        if hgnc_status == 'Approved':
            hgnc_name = row[1]
            hgnc_names[hgnc_id] = hgnc_name
            hgnc_ids[hgnc_name] = hgnc_id
        elif hgnc_status == 'Symbol Withdrawn':
            descr = row[2]
            m = re.match(r'symbol withdrawn, see ([^ ]*)', descr)
            new_name = m.groups()[0]
            hgnc_withdrawn.append(hgnc_id)
            hgnc_names[hgnc_id] = new_name
        # Uniprot
        uniprot_id = row[6]
        uniprot_ids[hgnc_id] = uniprot_id
        # Entrez
        entrez_id = row[5]
        entrez_ids[hgnc_id] = entrez_id
    return (hgnc_names, hgnc_ids, hgnc_withdrawn, uniprot_ids, entrez_ids)
Example #13
def _read_phosphatases():
    p_table = read_unicode_csv(_indra_path + '/resources/phosphatases.tsv',
                               delimiter='\t')
    # First column is phosphatase names
    # Second column is HGNC ids
    p_names = [row[0] for row in p_table]
    return p_names
Example #14
    def __init__(self, biogrid_file=None, physical_only=True):
        self.statements = []
        self.physical_only = physical_only

        # If a path to the file is included, process it, skipping the header
        if biogrid_file:
            rows = read_unicode_csv(biogrid_file, '\t', skiprows=1)
        # If no file is provided, download from web
        else:
            logger.info('No data file specified, downloading from BioGrid '
                        'at %s' % biogrid_file_url)
            rows = _download_biogrid_data(biogrid_file_url)

        # Process the rows into Statements
        for row in rows:
            filt_row = [None if item == '-' else item for item in row]
            bg_row = _BiogridRow(*filt_row)
            # Filter out non-physical interactions if desired
            if self.physical_only and bg_row.exp_system_type != 'physical':
                continue
            # Ground agents
            agent_a = self._make_agent(bg_row.entrez_a, bg_row.syst_name_a)
            agent_b = self._make_agent(bg_row.entrez_b, bg_row.syst_name_b)
            # Skip any agents with neither HGNC grounding nor a string name
            if agent_a is None or agent_b is None:
                continue
            # Get evidence
            ev = Evidence(source_api='biogrid',
                          source_id=bg_row.biogrid_int_id,
                          pmid=bg_row.pmid,
                          text=None,
                          annotations=dict(bg_row._asdict()))
            # Make statement
            s = Complex([agent_a, agent_b], evidence=ev)
            self.statements.append(s)
Example #15
def filter_transcription_factor(stmts_in, **kwargs):
    """Filter out RegulateAmounts where subject is not a transcription factor.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'amount regulations by non-transcription-factors...')
    path = os.path.dirname(os.path.abspath(__file__))
    tf_table = \
        read_unicode_csv(path + '/../resources/transcription_factors.csv')
    gene_names = [lin[1] for lin in list(tf_table)[1:]]
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, RegulateAmount):
            if st.subj is not None:
                if st.subj.name in gene_names:
                    stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
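A brief sketch of calling the filter above; stmts is assumed to be an existing list of INDRA Statements, and the optional save keyword (read via kwargs) pickles the filtered result:

# Hypothetical input list and output file name.
filtered = filter_transcription_factor(stmts, save='tf_filtered_stmts.pkl')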
Example #16
def filter_mod_nokinase(stmts_in, **kwargs):
    """Filter non-phospho Modifications to ones with a non-kinase enzyme.

    Parameters
    ----------
    stmts_in : list[indra.statements.Statement]
        A list of statements to filter.
    save : Optional[str]
        The name of a pickle file to save the results (stmts_out) into.

    Returns
    -------
    stmts_out : list[indra.statements.Statement]
        A list of filtered statements.
    """
    logger.info('Filtering %d statements to remove ' % len(stmts_in) +
                'non-phospho modifications by kinases...')
    path = os.path.dirname(os.path.abspath(__file__))
    kinase_table = read_unicode_csv(path + '/../resources/kinases.tsv',
                                    delimiter='\t')
    gene_names = [lin[1] for lin in list(kinase_table)[1:]]
    stmts_out = []
    for st in stmts_in:
        if isinstance(st, Modification) and not \
           isinstance(st, Phosphorylation):
            if st.enz is not None:
                if st.enz.name not in gene_names:
                    stmts_out.append(st)
        else:
            stmts_out.append(st)
    logger.info('%d statements after filter...' % len(stmts_out))
    dump_pkl = kwargs.get('save')
    if dump_pkl:
        dump_statements(stmts_out, dump_pkl)
    return stmts_out
Example #17
def _load_data():
    """Load the data from the csv in data.

    The "gene_id" is the Entrez gene id, and the "approved_symbol" is the
    standard gene symbol. The "hms_id" is the LINCS ID for the drug.

    Returns
    -------
    data : list[dict]
        A list of dicts of row values keyed by the column headers extracted from
        the csv file, described above.
    """
    # Get the csv reader object.
    csv_path = path.join(HERE, path.pardir, path.pardir, 'resources',
                         DATAFILE_NAME)
    data_iter = list(read_unicode_csv(csv_path))

    # Get the headers.
    headers = data_iter[0]

    # For some reason this heading is oddly formatted and inconsistent with the
    # rest, or with the usual key-style for dicts.
    headers[headers.index('Approved.Symbol')] = 'approved_symbol'
    return [{header: val for header, val in zip(headers, line)}
            for line in data_iter[1:]]
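A short hedged sketch of consuming the rows returned above, assuming the column names quoted in the docstring ('hms_id', 'approved_symbol', 'gene_id') appear as headers in the CSV:

for lincs_row in _load_data():
    # Each row is a dict keyed by the CSV headers.
    print(lincs_row['hms_id'], lincs_row['approved_symbol'],
          lincs_row['gene_id'])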
Example #18
def _get_phospho_site_dataset():
    """Read phosphosite data into dicts keyed by Uniprot ID and by site group.

    Returns
    -------
    tuple
        The first element of the tuple contains the PhosphoSite data keyed
        by Uniprot ID, the second element contains data keyed by site group.
        Both dicts have instances of the PhosphoSite namedtuple as values.
        If the PhosphoSite data file cannot be loaded, returns (None, None).
    """
    global _data_by_up
    global _data_by_site_grp
    if _data_by_up is None or _data_by_site_grp is None:
        # Get the csv reader generator
        reader = read_unicode_csv(phosphosite_data_file,
                                  delimiter='\t',
                                  skiprows=4)
        # Build up a dict by protein
        data_by_up = defaultdict(lambda: defaultdict(list))
        data_by_site_grp = defaultdict(list)
        for row in reader:
            site = PhosphoSite(*row)
            res_pos = site.MOD_RSD.split('-')[0]
            base_acc_id = site.ACC_ID.split('-')[0]
            data_by_up[base_acc_id][res_pos].append(site)
            data_by_site_grp[site.SITE_GRP_ID].append(site)
        _data_by_up = data_by_up
        _data_by_site_grp = data_by_site_grp
    return (_data_by_up, _data_by_site_grp)
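A hedged sketch of using the dataset helper above; the UniProt accession is illustrative and the lookup assumes the PhosphoSite file loaded successfully:

data_by_up, data_by_site_grp = _get_phospho_site_dataset()
if data_by_up is not None:
    # Keys of the inner dict are residue-position strings such as 'T185'
    # (the modification suffix of MOD_RSD is stripped during loading).
    mapk1_sites = data_by_up.get('P28482', {})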
Example #19
def _read_hgnc_maps():
    hgnc_file = os.path.dirname(os.path.abspath(__file__)) + \
                '/../resources/hgnc_entries.tsv'
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    for row in csv_rows:
        hgnc_id = row[0][5:]
        hgnc_status = row[3]
        if hgnc_status == 'Approved':
            hgnc_name = row[1]
            hgnc_names[hgnc_id] = hgnc_name
            hgnc_ids[hgnc_name] = hgnc_id
        elif hgnc_status == 'Symbol Withdrawn':
            descr = row[2]
            m = re.match(r'symbol withdrawn, see ([^ ]*)', descr)
            new_name = m.groups()[0]
            hgnc_withdrawn.append(hgnc_id)
            hgnc_names[hgnc_id] = new_name
        # Uniprot
        uniprot_id = row[6]
        uniprot_ids[hgnc_id] = uniprot_id
        # Entrez
        entrez_id = row[5]
        entrez_ids[hgnc_id] = entrez_id
    return (hgnc_names, hgnc_ids, hgnc_withdrawn, uniprot_ids, entrez_ids)
Example #20
def _load_mappings():
    drugbank_to_db = {}
    db_to_drugbank = {}
    drugbank_names = {}
    to_db_ambigs = set()
    db_to_ambigs = set()
    for drugbank_id, db_ns, db_id, source in \
            read_unicode_csv(mappings_file, delimiter='\t', skiprows=1):
        # We skip DBSALTs for now, see https://github.com/pyobo/pyobo/issues/80
        if drugbank_id.startswith('DBSALT'):
            continue
        if db_ns == 'CHEBI':
            db_id = 'CHEBI:%s' % db_id
        if db_ns == 'NAME':
            drugbank_names[drugbank_id] = db_id
            continue
        key = (db_ns, db_id)
        if key in db_to_drugbank and db_to_drugbank[key] != drugbank_id:
            db_to_ambigs.add(key)
        db_to_drugbank[key] = drugbank_id
        key = (drugbank_id, db_ns)
        if key in drugbank_to_db and \
                drugbank_to_db[key] != db_id:
            to_db_ambigs.add(key)
        drugbank_to_db[key] = db_id
    db_to_drugbank = {
        k: v
        for k, v in db_to_drugbank.items() if k not in db_to_ambigs
    }
    drugbank_to_db = {
        k: v
        for k, v in drugbank_to_db.items() if k not in to_db_ambigs
    }
    return drugbank_to_db, db_to_drugbank, drugbank_names
Example #21
def _load_mappings():
    drugbank_to_db = {}
    db_to_drugbank = {}
    drugbank_names = {}
    to_db_ambigs = set()
    db_to_ambigs = set()
    for drugbank_id, db_ns, db_id, source in \
            read_unicode_csv(mappings_file, delimiter='\t', skiprows=1):
        if db_ns == 'CHEBI':
            db_id = 'CHEBI:%s' % db_id
        if db_ns == 'NAME':
            drugbank_names[drugbank_id] = db_id
            continue
        key = (db_ns, db_id)
        if key in db_to_drugbank and db_to_drugbank[key] != drugbank_id:
            db_to_ambigs.add(key)
        db_to_drugbank[key] = drugbank_id
        key = (drugbank_id, db_ns)
        if key in drugbank_to_db and \
                drugbank_to_db[key] != db_id:
            to_db_ambigs.add(key)
        drugbank_to_db[key] = db_id
    db_to_drugbank = {k: v for k, v in db_to_drugbank.items()
                      if k not in db_to_ambigs}
    drugbank_to_db = {k: v for k, v in drugbank_to_db.items()
                      if k not in to_db_ambigs}
    return drugbank_to_db, db_to_drugbank, drugbank_names
Example #22
def _read_phosphatases():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         'resources', 'phosphatases.tsv')
    p_table = read_unicode_csv(fname, delimiter='\t')
    # First column is phosphatase names
    # Second column is HGNC ids
    p_names = [row[0] for row in p_table]
    return p_names
Example #23
def save_default_prior(gene_list, filename):
    with open(filename, 'wt') as f:
        for row in read_unicode_csv('mapped_peptides.txt',
                                    delimiter='\t',
                                    skiprows=1):
            # Check for uniprot ID
            site_id = row[0]
            f.write('%s\t%s\n' % (site_id, ','.join(gene_list)))
Example #24
def get_mrna_list():
    mrna_set = set()
    for row in read_unicode_csv('sources/gene_types.csv', skiprows=1):
        hgnc_id = row[2]
        gene_type = row[4]
        if gene_type == 'protein_coding':
            mrna_set.add(hgnc_id)
    return list(mrna_set)
Example #25
def read_chebi_to_chembl():
    chebi_to_chembl_file = join(dirname(abspath(__file__)),
                                '../resources/chebi_to_chembl.tsv')
    csv_reader = read_unicode_csv(chebi_to_chembl_file, delimiter='\t')
    chebi_chembl = {}
    for row in csv_reader:
        chebi_chembl[row[0]] = row[1]
    return chebi_chembl
Example #26
def read_chebi_to_chembl():
    chebi_to_chembl_file = join(dirname(abspath(__file__)),
                                '../resources/chebi_to_chembl.tsv')
    csv_reader = read_unicode_csv(chebi_to_chembl_file, delimiter='\t')
    chebi_chembl = {}
    for row in csv_reader:
        chebi_chembl[row[0]] = row[1]
    return chebi_chembl
Example #27
def _read_famplex_map():
    fname = join(dirname(__file__), '../../resources/famplex_map.tsv')
    raw_map = read_unicode_csv(fname, '\t')

    m = {}
    for row in raw_map:
        m[(row[0], row[1])] = row[2]
    return m
Example #28
def _read_famplex_map():
    fname = join(dirname(__file__), '../../resources/famplex_map.tsv')
    raw_map = read_unicode_csv(fname, '\t')

    m = {}
    for row in raw_map:
        m[(row[0], row[1])] = row[2]
    return m
Example #29
def _read_famplex_map():
    fname = get_resource_path('famplex_map.tsv')
    raw_map = read_unicode_csv(fname, '\t')

    m = {}
    for row in raw_map:
        m[(row[0], row[1])] = row[2]
    return m
Example #30
def _read_phosphatases():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         'resources', 'phosphatases.tsv')
    p_table = read_unicode_csv(fname, delimiter='\t')
    # First column is phosphatase names
    # Second column is HGNC ids
    p_names = [row[0] for row in p_table]
    return p_names
Example #31
def _load_mesh_file(path):
    it = read_unicode_csv(path, delimiter='\t')
    for mesh_id, mesh_label, mesh_terms_str in it:
        mesh_id_to_name[mesh_id] = mesh_label
        mesh_name_to_id[mesh_label] = mesh_id
        mesh_terms = mesh_terms_str.split('|')
        for term in mesh_terms:
            mesh_name_to_id_name[term] = [mesh_id, mesh_label]
Example #32
    def add_famplex_nodes(self):
        nodes = []
        for row in read_unicode_csv(get_resource_path(
                os.path.join('famplex', 'entities.csv')),
                                    delimiter=','):
            entity = row[0]
            nodes.append((self.label('FPLX', entity), {'name': entity}))
        self.add_nodes_from(nodes)
Example #33
def _build_chebi_map():
    fname = get_resource_path('bel_chebi_map.tsv')
    chebi_name_id = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        chebi_name = row[0]
        chebi_id = row[1]
        chebi_name_id[chebi_name] = chebi_id
    return chebi_name_id
Example #34
def update_chebi_entries():
    logger.info('--Updating ChEBI entries----')
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
        'Flat_file_tab_delimited/reference.tsv.gz'
    fname = os.path.join(path, 'reference.tsv.gz')
    urlretrieve(url, fname)
    with gzip.open(fname, 'rb') as fh:
        logger.info('Loading %s' % fname)
        df = pandas.read_csv(fh,
                             sep='\t',
                             index_col=None,
                             parse_dates=True,
                             encoding='latin-1')
    # Save PubChem mapping
    fname = os.path.join(path, 'chebi_to_pubchem.tsv')
    logger.info('Saving into %s' % fname)
    df_pubchem = df[df['REFERENCE_DB_NAME'] == 'PubChem']
    df_pubchem.sort_values(['COMPOUND_ID', 'REFERENCE_ID'],
                           ascending=True,
                           inplace=True)
    df_pubchem.to_csv(fname,
                      sep='\t',
                      columns=['COMPOUND_ID', 'REFERENCE_ID'],
                      header=['CHEBI', 'PUBCHEM'],
                      index=False)

    # Process PubChem mapping to eliminate SID rows and strip CID: prefix
    # If the second column of the row starts with SID:, ignore the row
    # If the second column of the row starts with CID:, strip out the CID prefix
    # Otherwise, include the row unchanged
    original_rows = read_unicode_csv(fname, '\t')
    new_rows = []
    for original_row in original_rows:
        if original_row[1].startswith('CID:'):
            new_row = original_row
            new_row[1] = new_row[1][5:]  # Strip out CID:
            new_rows.append(new_row)
        elif original_row[1].startswith('SID:'):
            # Skip SID rows
            continue
        else:
            # Include other rows unchanged
            new_rows.append(original_row)
    write_unicode_csv(fname, new_rows, '\t')

    # Save ChEMBL mapping
    fname = os.path.join(path, 'chebi_to_chembl.tsv')
    logger.info('Saving into %s' % fname)
    df_chembl = df[df['REFERENCE_DB_NAME'] == 'ChEMBL']
    df_chembl.sort_values(['COMPOUND_ID', 'REFERENCE_ID'],
                          ascending=True,
                          inplace=True)
    df_chembl.to_csv(fname,
                     sep='\t',
                     columns=['COMPOUND_ID', 'REFERENCE_ID'],
                     header=['CHEBI', 'CHEMBL'],
                     index=False)
Example #35
def update_bioentities_map():
    logger.info('--Updating Bioentities map----')
    # Currently this is a trivial "copy" of the Bioentities equivalences.csv
    # file. Later, name spaces may need to be adapted and other format changes
    # may be needed.
    fname_in = os.path.join(path, '../../bioentities/equivalences.csv')
    fname_out = os.path.join(path, 'bioentities_map.tsv')
    rows = read_unicode_csv(fname_in)
    write_unicode_csv(fname_out, rows, delimiter='\t')
Example #36
def update_grounding_map():
    famplex_gmap = os.path.join(path, 'famplex', 'grounding_map.csv')
    famplex_rows = list(read_unicode_csv(famplex_gmap))
    row_len = len(famplex_rows[0])
    extra_rows = []
    # read in json file containing filenames for non-famplex grounding maps
    with open(os.path.join(path, 'grounding', 'extra_gmap_files.json')) as f:
        extra_gm_files = json.load(f)
    # Add non-famplex grounding map rows, adding blank values to synchronize
    # the number of columns with the number in the famplex grounding map
    for gm_filename in extra_gm_files:
        gmap = os.path.join(path, 'grounding', gm_filename)
        new_rows = list(read_unicode_csv(gmap))
        new_rows = [r + [''] * (row_len - len(r)) for r in new_rows]
        extra_rows.extend(new_rows)
    all_rows = famplex_rows + extra_rows
    grounding_map = os.path.join(path, 'grounding', 'grounding_map.csv')
    write_unicode_csv(grounding_map, all_rows)
Example #37
def update_famplex_map():
    logger.info('--Updating FamPlex map----')
    # Currently this is a trivial "copy" of the FamPlex equivalences.csv
    # file. Later, name spaces may need to be adapted and other format changes
    # may be needed.
    fname_in = os.path.join(path, 'famplex/equivalences.csv')
    fname_out = os.path.join(path, 'famplex_map.tsv')
    rows = read_unicode_csv(fname_in)
    write_unicode_csv(fname_out, rows, delimiter='\t')
Example #38
def load_grounding_map(grounding_map_path,
                       lineterminator='\r\n',
                       hgnc_symbols=True):
    """Return a grounding map dictionary loaded from a csv file.

    In the file pointed to by grounding_map_path, the number of name_space/ID
    pairs can vary per row, and commas are used to pad out entries containing
    fewer than the maximum number of name spaces appearing in the file. Lines
    should be terminated with \r\n, i.e., both a carriage return and a
    newline, by default.

    It is important to note that this function assumes that the mapping file
    entries for the HGNC key are symbols not IDs. These symbols are converted
    to IDs upon loading here.

    Parameters
    ----------
    grounding_map_path : str
        Path to csv file containing grounding map information. Rows of the file
        should be of the form <agent_text>,<name_space_1>,<ID_1>,...
        <name_space_n>,<ID_n>
    lineterminator : Optional[str]
        Line terminator used in input csv file. Default: \r\n
    hgnc_symbols : Optional[bool]
        Set to True if the grounding map file contains HGNC symbols rather than
        IDs. In this case, the entries are replaced by IDs. Default: True

    Returns
    -------
    g_map : dict
        The grounding map constructed from the given files.
    """
    gmap = {}
    map_rows = read_unicode_csv(grounding_map_path,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator=lineterminator)
    for row in map_rows:
        txt = row[0]
        keys = [entry for entry in row[1::2] if entry]
        values = [entry for entry in row[2::2] if entry]
        if not keys or not values:
            logger.warning('Missing grounding entries for %s, skipping.' % txt)
            continue
        if len(keys) != len(values):
            logger.warning('Mismatched keys and values in row %s, skipping.' %
                           str(row))
            continue
        gmap[txt] = dict(zip(keys, values))
    if hgnc_symbols:
        gmap = replace_hgnc_symbols(gmap)
    return gmap
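A hedged illustration of the input format and a call; the file name and the row shown are made up for this example, following the <agent_text>,<name_space_1>,<ID_1>,... pattern described in the docstring:

# A row such as
#   ERK,FPLX,ERK,,
# (trailing commas pad shorter rows to the file's maximum width) would map
# the agent text 'ERK' to the FamPlex entry ERK.
gmap = load_grounding_map('grounding_map.csv', hgnc_symbols=False)
# Under the assumptions above: gmap['ERK'] == {'FPLX': 'ERK'}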
Example #39
def _build_chebi_map():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../resources/bel_chebi_map.tsv')
    chebi_name_id = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        chebi_name = row[0]
        chebi_id = row[1]
        chebi_name_id[chebi_name] = chebi_id
    return chebi_name_id
Example #40
def read_chebi_to_pubchem():
    chebi_to_pubchem_file = join(dirname(abspath(__file__)),
                                 '../resources/chebi_to_pubchem.tsv')
    csv_reader = read_unicode_csv(chebi_to_pubchem_file, delimiter='\t')
    chebi_pubchem = {}
    pubchem_chebi = {}
    for row in csv_reader:
        chebi_pubchem[row[0]] = row[1]
        pubchem_chebi[row[1]] = row[0]
    return chebi_pubchem, pubchem_chebi
Example #41
def _read_famplex_map():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../../resources/famplex_map.tsv')
    famplex_map = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        source_ns = row[0]
        source_id = row[1]
        be_id = row[2]
        famplex_map[(source_ns, source_id)] = be_id
    return famplex_map
Example #42
def _build_uniprot_subcell_loc():
    fname = os.path.dirname(os.path.abspath(__file__)) +\
                '/../resources/uniprot_subcell_loc.tsv'
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    # Skip the header row
    up_to_go = {}
    for row in csv_rows:
        upid = row[0]
        goid = row[1]
        up_to_go[upid] = goid
    return up_to_go
Example #43
def _build_bioentities_map():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../resources/bioentities_map.tsv')
    bel_to_indra = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    for row in csv_rows:
        namespace = row[0]
        entry = row[1]
        indra_name = row[2]
        if namespace == 'BEL':
            bel_to_indra[entry] = indra_name
    return bel_to_indra
Example #44
def _read_ncit_map():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                         '../../resources/ncit_map.tsv')
    ncit_map = {}
    csv_rows = read_unicode_csv(fname, delimiter='\t')
    next(csv_rows)
    for row in csv_rows:
        ncit_id = row[0]
        target_ns = row[1]
        target_id = row[2]
        ncit_map[ncit_id] = (target_ns, target_id)
    return ncit_map
Example #45
def _build_uniprot_subcell_loc():
    fname = os.path.dirname(os.path.abspath(__file__)) +\
                '/../resources/uniprot_subcell_loc.tsv'
    try:
        csv_rows = read_unicode_csv(fname, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        subcell_loc = {}
        for row in csv_rows:
            loc_id = row[0]
            loc_alias = row[3]
            subcell_loc[loc_id] = loc_alias
    except IOError:
        subcell_loc = {}
    return subcell_loc
Example #46
def _build_uniprot_hgnc():
    hgnc_file = os.path.dirname(os.path.abspath(__file__)) +\
                '/../resources/hgnc_entries.txt'
    try:
        csv_rows = read_unicode_csv(hgnc_file, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        uniprot_hgnc = {}
        for row in csv_rows:
            hgnc_name = row[1]
            uniprot_id = row[6]
            if uniprot_id:
                uniprot_hgnc[uniprot_id] = hgnc_name
    except IOError:
        uniprot_hgnc = {}
    return uniprot_hgnc
Example #47
def update_chebi_entries():
    logger.info('--Updating ChEBI entries----')
    url = 'ftp://ftp.ebi.ac.uk/pub/databases/chebi/' + \
        'Flat_file_tab_delimited/reference.tsv.gz'
    fname = os.path.join(path, 'reference.tsv.gz')
    urlretrieve(url, fname)
    with gzip.open(fname, 'rb') as fh:
        logger.info('Loading %s' % fname)
        df = pandas.read_csv(fh, sep='\t', index_col=None,
                             parse_dates=True, encoding='latin-1')
    # Save PubChem mapping
    fname = os.path.join(path, 'chebi_to_pubchem.tsv')
    logger.info('Saving into %s' % fname)
    df_pubchem = df[df['REFERENCE_DB_NAME']=='PubChem']
    df_pubchem.sort_values(['COMPOUND_ID', 'REFERENCE_ID'], ascending=True,
                           inplace=True)
    df_pubchem.to_csv(fname, sep='\t', columns=['COMPOUND_ID', 'REFERENCE_ID'],
                      header=['CHEBI', 'PUBCHEM'], index=False)

    # Process PubChem mapping to eliminate SID rows and strip CID: prefix
    # If the second column of the row starts with SID:, ignore the row
    # If the second column of the row starts with CID:, strip out the CID prefix
    # Otherwise, include the row unchanged
    original_rows = read_unicode_csv(fname, '\t')
    new_rows = []
    for original_row in original_rows:
        if original_row[1].startswith('CID:'):
            new_row = original_row
            new_row[1] = new_row[1][5:] # Strip out CID:
            new_rows.append(new_row)
        elif original_row[1].startswith('SID:'):
            # Skip SID rows
            continue
        else:
            # Include other rows unchanged
            new_rows.append(original_row)
    write_unicode_csv(fname, new_rows, '\t')

    # Save ChEMBL mapping
    fname = os.path.join(path, 'chebi_to_chembl.tsv')
    logger.info('Saving into %s' % fname)
    df_chembl = df[df['REFERENCE_DB_NAME']=='ChEMBL']
    df_chembl.sort_values(['COMPOUND_ID', 'REFERENCE_ID'], ascending=True,
                          inplace=True)
    df_chembl.to_csv(fname, sep='\t', columns=['COMPOUND_ID', 'REFERENCE_ID'],
                      header=['CHEBI', 'CHEMBL'], index=False)
Example #48
def load_site_map(path):
    """Load the modification site map from a file.

    The site map file should be a comma-separated file with six columns::

        Gene: HGNC gene name
        OrigRes: Original (incorrect) residue
        OrigPos: Original (incorrect) residue position
        CorrectRes: The correct residue for the modification
        CorrectPos: The correct residue position
        Comment: Description of the reason for the error.

    Parameters
    ----------
    path : string
        Path to the comma-separated site map file.

    Returns
    -------
    dict
        A dict mapping tuples of the form `(gene, orig_res, orig_pos)` to a
        tuple of the form `(correct_res, correct_pos, comment)`, where `gene`
        is the string name of the gene (canonicalized to HGNC); `orig_res` and
        `orig_pos` are the residue and position to be mapped; `correct_res` and
        `correct_pos` are the corrected residue and position, and `comment` is
        a string describing the reason for the mapping (species error, isoform
        error, wrong residue name, etc.).
    """
    site_map = {}
    maprows = read_unicode_csv(path)
    # Skip the header line
    next(maprows)
    for row in maprows:
        # Don't allow empty entries in the key section
        if not (row[0] and row[1] and row[2]):
            raise Exception("Entries in the key (gene, residue, position) "
                            "may not be empty.")
        correct_res = row[3].strip() if row[3] else None
        correct_pos = row[4].strip() if row[4] else None
        comment = row[5].strip() if row[5] else None
        site_map[(row[0].strip(), row[1].strip(), row[2].strip())] = \
                                (correct_res, correct_pos, comment)
    return site_map
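A small hedged example of the comma-separated input described above and the resulting lookup; the row contents are illustrative only and would follow the header line that the loader skips:

# A row like
#   MAPK1,T,183,T,185,Mouse residue number used for the human protein
# is loaded so that the (gene, residue, position) key maps to the corrected
# residue, position and comment:
site_map = load_site_map('site_map.csv')
correct_res, correct_pos, comment = site_map[('MAPK1', 'T', '183')]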
Example #49
def _build_uniprot_entries():
    up_entries_file = os.path.dirname(os.path.abspath(__file__)) + \
        '/../resources/uniprot_entries.tsv'
    uniprot_gene_name = {}
    uniprot_mnemonic = {}
    uniprot_mnemonic_reverse = {}
    try:
        csv_rows = read_unicode_csv(up_entries_file, delimiter='\t')
        # Skip the header row
        next(csv_rows)
        for row in csv_rows:
            up_id = row[0]
            gene_name = row[1]
            up_mnemonic = row[3]
            uniprot_gene_name[up_id] = gene_name
            uniprot_mnemonic[up_id] = up_mnemonic
            uniprot_mnemonic_reverse[up_mnemonic] = up_id
    except IOError:
        pass
    return uniprot_gene_name, uniprot_mnemonic, uniprot_mnemonic_reverse
Example #50
def main(relations_file):
    g = Graph()
    isa = indra_rel_ns.term('isa')
    partof = indra_rel_ns.term('partof')

    family_names = set([])
    csv_rows = read_unicode_csv(relations_file, delimiter=',', quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator='\r\n')
    for line in csv_rows:
        ns1, id1, rel, ns2, id2 = line
        term1 = make_term(ns1, id1)
        term2 = make_term(ns2, id2)
        if rel in ('isa', 'partof'):
            rel_term = indra_rel_ns.term(rel)
        else:
            raise ValueError("Invalid relation %s" % rel)
        g.add((term1, rel_term, term2))

    save_hierarchy(g, hierarchy_path)
Example #51

logger = logging.getLogger(__name__)


go_mappings_file = join(dirname(abspath(__file__)), '..', 'resources',
                        'go_id_label_mappings.tsv')


# This file can be downloaded from: http://geneontology.org/ontology/go.owl
go_owl_path = join(dirname(abspath(__file__)), '..', '..', 'data', 'go.owl')


# Dictionary to store GO ID->Label mappings
go_mappings = {}
for go_id, go_label in read_unicode_csv(go_mappings_file, delimiter='\t'):
    go_mappings[go_id] = go_label


_prefixes = """
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX go: <http://purl.obolibrary.org/obo/go#>
    PREFIX obo: <http://purl.obolibrary.org/obo/>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX oboInOwl: <http://www.geneontology.org/formats/oboInOwl#>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    """



# Lazily initialize the GO RDF graph because parsing the RDF is expensive
Example #52
def _read_kinases():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         'resources', 'kinases.tsv')
    kinase_table = read_unicode_csv(fname, delimiter='\t')
    gene_names = [lin[1] for lin in list(kinase_table)[1:]]
    return gene_names
Example #53
def _read_tfs():
    fname = os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir,
                         'resources', 'transcription_factors.csv')
    tf_table = read_unicode_csv(fname)
    gene_names = [lin[1] for lin in list(tf_table)[1:]]
    return gene_names
Example #54
def _read_hgnc_maps():
    hgnc_file = os.path.dirname(os.path.abspath(__file__)) + \
                '/../resources/hgnc_entries.tsv'
    csv_rows = read_unicode_csv(hgnc_file, delimiter='\t', encoding='utf-8')
    hgnc_names = {}
    hgnc_ids = {}
    hgnc_withdrawn = []
    uniprot_ids = {}
    entrez_ids = {}
    entrez_ids_reverse = {}
    mouse_map = {}
    rat_map = {}
    prev_sym_map = {}
    for row in csv_rows:
        hgnc_id = row[0][5:]
        hgnc_status = row[3]
        if hgnc_status == 'Approved':
            hgnc_name = row[1]
            hgnc_names[hgnc_id] = hgnc_name
            hgnc_ids[hgnc_name] = hgnc_id
        elif hgnc_status == 'Symbol Withdrawn':
            descr = row[2]
            m = re.match(r'symbol withdrawn, see ([^ ]*)', descr)
            new_name = m.groups()[0]
            hgnc_withdrawn.append(hgnc_id)
            hgnc_names[hgnc_id] = new_name
        # Uniprot
        uniprot_id = row[6]
        uniprot_ids[hgnc_id] = uniprot_id
        # Entrez
        entrez_id = row[5]
        entrez_ids[hgnc_id] = entrez_id
        entrez_ids_reverse[entrez_id] = hgnc_id
        # Mouse
        mgi_id = row[7]
        if mgi_id:
            mgi_ids = mgi_id.split(', ')
            for mgi_id in mgi_ids:
                if mgi_id.startswith('MGI:'):
                    mgi_id = mgi_id[4:]
                mouse_map[mgi_id] = hgnc_id
        # Rat
        rgd_id = row[8]
        if rgd_id:
            rgd_ids = rgd_id.split(', ')
            for rgd_id in rgd_ids:
                if rgd_id.startswith('RGD:'):
                    rgd_id = rgd_id[4:]
                rat_map[rgd_id] = hgnc_id
        # Previous symbols
        prev_sym_entry = row[9]
        if prev_sym_entry:
            prev_syms = prev_sym_entry.split(', ')
            for prev_sym in prev_syms:
                # If we already mapped this previous symbol to another ID
                if prev_sym in prev_sym_map:
                    # If we already have a list here, we just extend it
                    if isinstance(prev_sym_map[prev_sym], list):
                        prev_sym_map[prev_sym].append(hgnc_id)
                    # Otherwise we create a list and start it with the two
                    # IDs we know the symbol is mapped to
                    else:
                        prev_sym_map[prev_sym] = [prev_sym_map[prev_sym],
                                                  hgnc_id]
                # Otherwise we just make a string entry here
                else:
                    prev_sym_map[prev_sym] = hgnc_id

    return (hgnc_names, hgnc_ids, hgnc_withdrawn,
            uniprot_ids, entrez_ids, entrez_ids_reverse, mouse_map, rat_map,
            prev_sym_map)
Example #55
               'ja': 'http://www.elsevier.com/xml/ja/dtd',
               'xocs': 'http://www.elsevier.com/xml/xocs/dtd',
               'common': 'http://www.elsevier.com/xml/common/dtd',
               'atom': 'http://www.w3.org/2005/Atom',
               'prism': 'http://prismstandard.org/namespaces/basic/2.0/'}

# THE API KEY IS NOT UNDER VERSION CONTROL FOR SECURITY
# For more information see http://dev.elsevier.com/
api_key_file = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                            'elsevier_api_keys')
api_key_env_name = 'ELSEVIER_API_KEY'
inst_key_env_name = 'ELSEVIER_INST_KEY'

# Try to read the API key from a file
try:
    elsevier_keys = dict(read_unicode_csv(api_key_file))
    # Check whether the institution key is present
    if not elsevier_keys.get('X-ELS-Insttoken'):
        logger.info('Optional institution key X-ELS-Insttoken not found in '
                    'elsevier key file.')
    # Check that the API key entry has the right name
    if not elsevier_keys.get('X-ELS-APIKey'):
        logger.error('API key X-ELS-APIKey not found in elsevier key file.')
        elsevier_keys = None
except IOError:
    logger.warning('Elsevier API keys file could not be read, trying '
                   'environment variables $%s and $%s.' %
                   (api_key_env_name, inst_key_env_name))
    logger.debug('Tried key file: %s' % api_key_file)
    # Try the environment variable for the api key. This one is optional,
    # so if it is not found then we just leave it out of the keys dict
Example #56
import os
import sys
import csv
import shutil
import pickle
from indra.sources import reach
from indra.util import read_unicode_csv
from indra.literature import pmc_client, get_full_text, id_lookup
from assembly_eval import have_file, run_assembly

if __name__ == '__main__':
    # This script assumes that the papers have been processed offline,
    # e.g., using the submit_reading_pipeline.py script on Amazon,
    # and the results placed in a dict (mapping PMID -> lists of statements)
    # and put in the folder reach/reach_stmts_batch_4_eval.pkl.
    folder = 'reach'

    # Load the PMID to PMCID map
    pmid_to_pmcid = {}
    csvreader = read_unicode_csv('pmc_batch_4_id_map.txt', delimiter='\t')
    for row in csvreader:
        pmid_to_pmcid[row[1]] = row[0]

    # Load the REACH reading output
    with open(os.path.join(folder, 'reach_stmts_batch_4_eval.pkl'), 'rb') as f:
        stmts = pickle.load(f)

    # Iterate over all of the PMIDs
    for pmid, stmts in stmts.items():
        pmcid = pmid_to_pmcid[pmid]
        run_assembly(stmts, folder, pmcid)
Example #57
import json
from functools import lru_cache
from urllib.parse import urlencode
from os.path import abspath, dirname, join, pardir
import requests
from indra.util import read_unicode_csv

MESH_URL = 'https://id.nlm.nih.gov/mesh/'
HERE = dirname(abspath(__file__))
RESOURCES = join(HERE, pardir, 'resources')
MESH_FILE = join(RESOURCES, 'mesh_id_label_mappings.tsv')
MESH_REV_LOOKUPS = join(RESOURCES, 'mesh_name_id_maps.json')


mesh_id_to_name = {}
mesh_name_to_id = {}
for mesh_id, mesh_label in read_unicode_csv(MESH_FILE, delimiter='\t'):
    mesh_id_to_name[mesh_id] = mesh_label
    mesh_name_to_id[mesh_label] = mesh_id

with open(MESH_REV_LOOKUPS, 'r') as f:
    mesh_name_to_id_name = json.load(f)


@lru_cache(maxsize=1000)
def get_mesh_name_from_web(mesh_id):
    """Get the MESH label for the given MESH ID using the NLM REST API.

    Parameters
    ----------
    mesh_id : str
        MESH Identifier, e.g. 'D003094'.
Example #58
def load_grounding_map(grounding_map_path, ignore_path=None,
                       lineterminator='\r\n'):
    """Return a grounding map dictionary loaded from a csv file.

    In the file pointed to by grounding_map_path, the number of name_space/ID
    pairs can vary per row, and commas are used to pad out entries containing
    fewer than the maximum number of name spaces appearing in the file. Lines
    should be terminated with \r\n, i.e., both a carriage return and a
    newline, by default.

    Optionally, one can specify another csv file (pointed to by ignore_path)
    containing agent texts that are degenerate and should be filtered out.

    Parameters
    ----------
    grounding_map_path : str
        Path to csv file containing grounding map information. Rows of the file
        should be of the form <agent_text>,<name_space_1>,<ID_1>,...
        <name_space_n>,<ID_n>
    ignore_path : Optional[str]
        Path to csv file containing terms that should be filtered out during
        the grounding mapping process. The file should be of the form
        <agent_text>,,..., where the number of commas that
        appear is the same as in the csv file at grounding_map_path.
        Default: None
    lineterminator : Optional[str]
        Line terminator used in input csv file. Default: \r\n

    Returns
    -------
    g_map : dict
        The grounding map constructed from the given files.
    """
    g_map = {}
    map_rows = read_unicode_csv(grounding_map_path, delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL,
                                lineterminator=lineterminator)
    if ignore_path and os.path.exists(ignore_path):
        ignore_rows = read_unicode_csv(ignore_path, delimiter=',',
                                       quotechar='"',
                                       quoting=csv.QUOTE_MINIMAL,
                                       lineterminator=lineterminator)
    else:
        ignore_rows = []
    csv_rows = chain(map_rows, ignore_rows)
    for row in csv_rows:
        key = row[0]
        db_refs = {'TEXT': key}
        keys = [entry for entry in row[1::2] if entry != '']
        values = [entry for entry in row[2::2] if entry != '']
        if len(keys) != len(values):
            logger.info('ERROR: Mismatched keys and values in row %s' %
                        str(row))
            continue
        else:
            db_refs.update(dict(zip(keys, values)))
            if len(db_refs.keys()) > 1:
                g_map[key] = db_refs
            else:
                g_map[key] = None
    return g_map
Example #59
def _read_relative_csv(rel_path):
    file_path = join(dirname(abspath(__file__)), rel_path)
    csv_reader = read_unicode_csv(file_path, delimiter='\t')
    return csv_reader