Ejemplo n.º 1
0
def get_gene_pmids(genes):
    """Collect the distinct PMIDs returned for a collection of genes.

    Parameters
    ----------
    genes : iterable
        Gene identifiers; each one is passed to
        ``pubmed_client.get_ids_for_gene``.

    Returns
    -------
    list
        The PMIDs found across all genes with duplicates removed. The
        order is arbitrary because the values pass through a set.
    """
    collected = []
    for symbol in genes:
        hits = pubmed_client.get_ids_for_gene(symbol)
        # Report how many PMIDs this gene contributed.
        print('%s: %d' % (symbol, len(hits)))
        collected.extend(hits)
    # Going through a set drops PMIDs that were returned for several genes.
    distinct = set(collected)
    return list(distinct)
Ejemplo n.º 2
0
def get_pmids(gene_names):
    """Gather the PMIDs for every gene name, printing a count per gene.

    Parameters
    ----------
    gene_names : iterable
        Gene names; each one is passed to ``get_ids_for_gene``.

    Returns
    -------
    list
        All PMIDs in query order. Duplicates are NOT removed, so a PMID
        returned for several genes appears several times.
    """
    collected = []
    for name in gene_names:
        found = get_ids_for_gene(name)
        collected.extend(found)
        print('%s: %d PMIDs' % (name, len(found)))
    return collected
Ejemplo n.º 3
0
def get_pmids(ambig_terms):
    """Return, for each ambiguous term, the PMIDs found for that term only.

    Parameters
    ----------
    ambig_terms : iterable
        Term objects exposing ``db`` and ``id`` attributes (and
        ``entry_name`` when ``db`` is ``'HGNC'``).

    Returns
    -------
    dict
        Maps ``(term.db, term.id)`` to the list of PMIDs retrieved for that
        term, keeping only PMIDs that were counted exactly once over all
        retrieved lists. Terms whose ``db`` is neither ``'HGNC'`` nor
        ``'MESH'`` are reported and left out of the result.
    """
    pmids_by_term = {}
    occurrences = Counter()
    for term in ambig_terms:
        term_key = (term.db, term.id)
        if term.db == 'HGNC':
            gene = term.entry_name
            try:
                found = pubmed_client.get_ids_for_gene(gene)
            except ValueError:
                print('Could not get PMIDs for gene: %s' % gene)
                found = []
        elif term.db == 'MESH':
            found = pubmed_client.get_ids_for_mesh(term.id,
                                                   major_topic=False)
            # Too many hits: repeat the query restricted to major topics.
            if len(found) > 1000:
                found = pubmed_client.get_ids_for_mesh(term.id,
                                                       major_topic=True)
            # Cap the list at 1000 entries either way.
            found = found[:1000]
        else:
            print('Unhandled ambiguous term: %s' % str(term_key))
            continue
        # Shared tail for both handled databases: store, count, then pause
        # half a second before the next query.
        pmids_by_term[term_key] = found
        occurrences.update(found)
        time.sleep(0.5)
    # Keep only the PMIDs that were counted exactly once overall.
    return {
        term_key: [pmid for pmid in found if occurrences[pmid] == 1]
        for term_key, found in pmids_by_term.items()
    }
Ejemplo n.º 4
0
def get_gene_pmids(gene_names):
    """Return the distinct PMIDs for all genes of interest.

    Parameters
    ----------
    gene_names : iterable
        Gene names; each one is passed to
        ``pubmed_client.get_ids_for_gene``.

    Returns
    -------
    list
        De-duplicated PMIDs in arbitrary order. The total is also printed.
    """
    collected = []
    for name in gene_names:
        collected.extend(pubmed_client.get_ids_for_gene(name))
    # Drop PMIDs that were returned for more than one gene.
    distinct = list(set(collected))
    print('Found %d PMIDs for genes' % len(distinct))
    return distinct
Ejemplo n.º 5
0
def get_gene_pmids(genes):
    """Return the sorted, de-duplicated PMIDs for a collection of genes.

    Parameters
    ----------
    genes : iterable
        Gene identifiers; each one is printed and then passed to
        ``pubmed_client.get_ids_for_gene``.

    Returns
    -------
    list
        The distinct PMIDs across all genes, in ascending order.
    """
    distinct = set()
    for symbol in genes:
        print(symbol)
        # Merge this gene's PMIDs into the running set.
        distinct.update(pubmed_client.get_ids_for_gene(symbol))
    return sorted(distinct)
Ejemplo n.º 6
0
def get_searchgenes_pmids(search_genes, num_days):
    """Get the PMIDs for each search gene, limited by ``num_days``.

    Parameters
    ----------
    search_genes : iterable
        Gene symbols; each one is passed to
        ``pubmed_client.get_ids_for_gene``.
    num_days : int
        Forwarded to the PubMed client as ``reldate``.

    Returns
    -------
    dict
        Maps each gene symbol to the value returned by the client. Symbols
        for which the client raised ``ValueError`` are logged and left out.
    """
    pmids = {}
    for gene in search_genes:
        try:
            pmids[gene] = pubmed_client.get_ids_for_gene(gene,
                                                         reldate=num_days)
        except ValueError:
            # Pass the symbol as a logging argument so the '%s' placeholder
            # is filled in; without it the message is logged literally.
            logger.error('Gene symbol %s is invalid', gene)
    return pmids
Ejemplo n.º 7
0
def get_searchgenes_pmids(search_genes, num_days):
    """Look up PMIDs per search gene, restricted by ``num_days``.

    Parameters
    ----------
    search_genes : iterable
        Gene symbols to query through ``pubmed_client.get_ids_for_gene``.
    num_days : int
        Passed to the PubMed client as its ``reldate`` argument.

    Returns
    -------
    dict
        Gene symbol -> client result. A symbol whose lookup raises
        ``ValueError`` is reported through ``logger`` and skipped.
    """
    pmids = {}
    for symbol in search_genes:
        try:
            found = pubmed_client.get_ids_for_gene(symbol, reldate=num_days)
        except ValueError:
            # The format string has a '%s' placeholder, so the symbol must
            # be supplied as an argument for it to appear in the log.
            logger.error('Gene symbol %s is invalid', symbol)
        else:
            pmids[symbol] = found
    return pmids
Ejemplo n.º 8
0
def get_gene_pmids(genes, out_file='pmids.txt'):
    """Collect the PMIDs for a set of genes and write them to a file.

    Parameters
    ----------
    genes : iterable
        Gene identifiers; each one is printed and then passed to
        ``pubmed_client.get_ids_for_gene``.
    out_file : str
        Path of the output file. It is overwritten with one PMID per line,
        encoded as UTF-8.

    Returns
    -------
    list
        The distinct PMIDs across all genes, in ascending order.
    """
    distinct = set()
    for symbol in genes:
        print(symbol)
        distinct.update(pubmed_client.get_ids_for_gene(symbol))
    ordered = sorted(distinct)
    # The file is opened in binary mode, so every line is encoded by hand.
    with open(out_file, 'wb') as fh:
        fh.writelines(('%s\n' % pmid).encode('utf-8') for pmid in ordered)
    return ordered
Ejemplo n.º 9
0
def get_gene_pmids(genes, out_file='pmids.txt'):
    """Fetch PMIDs for each gene, save them to ``out_file``, return them.

    Parameters
    ----------
    genes : iterable
        Gene identifiers. Each is echoed to stdout before being looked up
        with ``pubmed_client.get_ids_for_gene``.
    out_file : str
        Destination path; written in binary mode as UTF-8, one PMID per
        line, replacing any existing content.

    Returns
    -------
    list
        Sorted list of the distinct PMIDs found for all genes.
    """
    seen = set()
    for gene in genes:
        print(gene)
        seen |= set(pubmed_client.get_ids_for_gene(gene))
    result = sorted(seen)
    with open(out_file, 'wb') as handle:
        for entry in result:
            line = '%s\n' % entry
            handle.write(line.encode('utf-8'))
    return result
Ejemplo n.º 10
0
def get_text_content_for_gene(hgnc_name):
    """Get text content for the articles annotated in Entrez with a gene.

    The gene's PMIDs are looked up with ``pubmed_client.get_ids_for_gene``
    and handed to ``get_text_content_for_pmids``.

    Parameters
    ----------
    hgnc_name : str
        HGNC name for the gene.

    Returns
    -------
    text_content : list of str
        XMLs of the full text if available, otherwise abstracts, for all
        articles that have been annotated in Entrez to contain the given
        gene.
    """
    return get_text_content_for_pmids(
        pubmed_client.get_ids_for_gene(hgnc_name))
Ejemplo n.º 11
0
def get_text_content_for_gene(hgnc_name):
    """Return article text content for a gene, based on Entrez annotations.

    Parameters
    ----------
    hgnc_name : str
        HGNC name of the gene to look up.

    Returns
    -------
    text_content : list of str
        For every article annotated in Entrez as containing the gene: the
        full-text XML when available, otherwise the abstract.
    """
    # Resolve the gene to PMIDs first, then fetch the content for them.
    gene_pmids = pubmed_client.get_ids_for_gene(hgnc_name)
    text_content = get_text_content_for_pmids(gene_pmids)
    return text_content
Ejemplo n.º 12
0
def get_ids():
    """Search PubMed for references for the Ras 227 gene set.

    Results are cached as two pickle files under ``reading/``. When both
    files exist they are loaded and returned without querying PubMed;
    otherwise every gene listed in INDRA's ``ras_pathway_proteins.csv``
    resource is queried and the results are written to the cache.

    Returns
    -------
    tuple
        ``(pmids, pmids_from_gene)``. Each maps a gene name to a set of
        IDs: ``pmids`` from the string query (``pubmed_client.get_ids``)
        and ``pmids_from_gene`` from the gene query
        (``pubmed_client.get_ids_for_gene``).
    """
    pmids_path = 'reading/pmids.pkl'
    pmids_from_gene_path = 'reading/pmids_from_gene.pkl'

    # Check if we've got the files already
    if os.path.isfile(pmids_path) and os.path.isfile(pmids_from_gene_path):
        # The cache is written in binary mode below, so it must be read in
        # binary mode too; pickle.load fails on a text-mode file in
        # Python 3.
        # NOTE: unpickling can execute arbitrary code; these files are
        # assumed to be ones this function wrote itself.
        with open(pmids_path, 'rb') as pmids_file:
            pmids = pickle.load(pmids_file)
        with open(pmids_from_gene_path, 'rb') as pmids_from_gene_file:
            pmids_from_gene = pickle.load(pmids_from_gene_file)
        return (pmids, pmids_from_gene)

    # STEP 0: Get gene list from ras_pathway_proteins.csv (tab-delimited,
    # gene name in the first column)
    fname = os.path.join(indra.__path__[0], 'resources',
                         'ras_pathway_proteins.csv')
    with open(fname) as f:
        csvreader = csv.reader(f, delimiter='\t')
        gene_list = [row[0].strip() for row in csvreader]

    # Hack to deal with excessive number of names: these genes are searched
    # under an alternative name in the string query
    query_names = {'MET': 'CMET', 'JUN': 'CJUN'}

    pmids = OrderedDict()
    pmids_from_gene = OrderedDict()

    for gene in gene_list:
        print("Querying for %s" % gene)
        ids_gene = set(pubmed_client.get_ids_for_gene(gene))
        print("Found %d in gene query" % len(ids_gene))
        query_gene = query_names.get(gene, gene)
        ids_pubmed = set(pubmed_client.get_ids(query_gene, retmax=100000))
        print("Found %d in string query" % len(ids_pubmed))
        pmids[gene] = ids_pubmed
        pmids_from_gene[gene] = ids_gene

    with open(pmids_path, 'wb') as f:
        pickle.dump(pmids, f)
    with open(pmids_from_gene_path, 'wb') as f:
        pickle.dump(pmids_from_gene, f)
    return pmids, pmids_from_gene
Ejemplo n.º 13
0
def get_ids():
    """Search PubMed for references for the Ras 227 gene set.

    Results are cached as two pickle files under ``reading/``. When both
    files exist they are loaded and returned without querying PubMed;
    otherwise every gene listed in ``../../data/ras_pathway_proteins.csv``
    is queried and the results are written to the cache.

    Returns
    -------
    tuple
        ``(pmids, pmids_from_gene)``. Each maps a gene name to a set of
        IDs: ``pmids`` from the string query (``pubmed_client.get_ids``)
        and ``pmids_from_gene`` from the gene query
        (``pubmed_client.get_ids_for_gene``).
    """
    pmids_path = 'reading/pmids.pkl'
    pmids_from_gene_path = 'reading/pmids_from_gene.pkl'

    # Check if we've got the files already
    if os.path.isfile(pmids_path) and os.path.isfile(pmids_from_gene_path):
        # Pickle files are opened in binary mode for both reading and
        # writing: text mode breaks pickle.load on Python 3 and can corrupt
        # the data on platforms that translate line endings.
        # NOTE: unpickling can execute arbitrary code; these files are
        # assumed to be ones this function wrote itself.
        with open(pmids_path, 'rb') as pmids_file:
            pmids = pickle.load(pmids_file)
        with open(pmids_from_gene_path, 'rb') as pmids_from_gene_file:
            pmids_from_gene = pickle.load(pmids_from_gene_file)
        return (pmids, pmids_from_gene)

    # STEP 0: Get gene list from ras_pathway_proteins.csv (tab-delimited,
    # gene name in the first column)
    gene_list = []
    with open('../../data/ras_pathway_proteins.csv') as f:
        csvreader = csv.reader(f, delimiter='\t')
        for row in csvreader:
            gene_list.append(row[0].strip())

    pmids = OrderedDict()
    pmids_from_gene = OrderedDict()

    for gene in gene_list:
        # Single-argument print(...) with %-formatting produces the same
        # output whether print is a statement (Python 2) or a function.
        print("Querying for %s" % gene)
        ids_gene = set(pubmed_client.get_ids_for_gene(gene))
        print("Found %d in gene query" % len(ids_gene))
        # Hack to deal with excessive number of names
        if gene == 'MET':
            query_gene = 'CMET'
        elif gene == 'JUN':
            query_gene = 'CJUN'
        else:
            query_gene = gene
        ids_pubmed = set(pubmed_client.get_ids(query_gene, retmax=100000))
        print("Found %d in string query" % len(ids_pubmed))
        pmids[gene] = ids_pubmed
        pmids_from_gene[gene] = ids_gene

    with open(pmids_path, 'wb') as f:
        pickle.dump(pmids, f)
    with open(pmids_from_gene_path, 'wb') as f:
        pickle.dump(pmids_from_gene, f)
    return (pmids, pmids_from_gene)
Ejemplo n.º 14
0
def test_get_ids_for_gene():
    """The gene query for EXOC1 returns a non-empty result of unicode strs."""
    exoc1_ids = pubmed_client.get_ids_for_gene('EXOC1')
    # Non-empty result first, then the string-type check on its contents.
    assert exoc1_ids
    assert unicode_strs(exoc1_ids)
Ejemplo n.º 15
0
def get_pmids_entrez(kinase):
    """Return the PMIDs for a kinase, pausing one second after the query.

    Parameters
    ----------
    kinase : str
        Gene name of the kinase, passed to ``get_ids_for_gene``.

    Returns
    -------
    The value returned by ``get_ids_for_gene`` for ``kinase``.
    """
    found = get_ids_for_gene(kinase)
    # Fixed one-second pause after each query (presumably to rate-limit
    # successive calls -- confirm against the caller).
    time.sleep(1)
    return found
Ejemplo n.º 16
0
    import pickle
    from indra.literature import pubmed_client
    from indra.tools.reading import submit_reading_pipeline as sub_aws
    from indra.tools import assemble_corpus as ac
    from indra.util import write_unicode_csv

    # First command-line argument. It is only referenced by the
    # commented-out reading submission further down, where it is presumably
    # the job base name -- TODO confirm.
    basename = sys.argv[1]
    # Get gene list: one gene per line of genes.txt, surrounding whitespace
    # stripped
    with open('genes.txt', 'rt') as f:
        genes = [line.strip() for line in f.readlines()]

    # Assemble a list of PMIDs curated in Entrez gene
    # NOTE(review): gene_ix is not used in the visible part of this script.
    pmids_for_genes = {}
    for gene_ix, gene in enumerate(genes):
        try:
            pmids = pubmed_client.get_ids_for_gene(gene)
        except ValueError:
            # A ValueError from the client is treated as an invalid gene
            # name: report it and leave the gene out of the mapping.
            print("%s: Invalid gene name, skipping" % gene)
            continue
        print("%s: %d articles" % (gene, len(pmids)))
        pmids_for_genes[gene] = pmids
    # Flatten the per-gene lists into one set of distinct PMIDs. This
    # rebinds `pmids`, which held the last gene's list inside the loop.
    pmids = set(
        [pmid for pmid_list in pmids_for_genes.values() for pmid in pmid_list])

    # Save the PMIDs to a file, one per line (set order, so not sorted)
    print("Saving PMIDs")
    with open('lab_meeting_pmids.txt', 'wt') as f:
        for pmid in pmids:
            f.write('%s\n' % pmid)

    #job_ids = sub_aws.submit_run_reach(basename, 'lab_meeting_pmids.txt',
Ejemplo n.º 17
0
 # Load the previously saved gene -> PMIDs mapping if the pickle exists, so
 # genes handled in an earlier run are not queried again; otherwise start
 # with an empty mapping.
 dict_filename = 'pmids_for_gene.pkl'
 if os.path.exists(dict_filename):
     with open(dict_filename, 'rb') as f:
         pmids_for_gene = pickle.load(f)
 else:
     pmids_for_gene = {}
 # Get PMIDs for each HGNC name
 num_added = 0
 for hgnc_name in hgnc_names:
     # Skip genes whose PMIDs are already in the mapping
     if hgnc_name in pmids_for_gene:
         print('%s: already got PMIDs, skipping' % hgnc_name)
         continue
     try:
         pmids = pubmed_client.get_ids_for_gene(hgnc_name)
     except ValueError as ex:
         # Report the failure and move on; the gene is not added to the
         # mapping, so it will be retried on a later run.
         print("Exception in gettting PMIDs for %s: %s" % (hgnc_name, ex))
         print("Continuing...")
         continue
     print('%s: %d PMIDs' % (hgnc_name, len(pmids)))
     pmids_for_gene[hgnc_name] = pmids
     num_added += 1
     # Checkpoint: after every 50 newly added genes, write the whole
     # mapping back to disk
     if num_added % 50 == 0:
         print("Saving info for %d genes" % len(pmids_for_gene))
         with open(dict_filename, 'wb') as f:
             pickle.dump(pmids_for_gene, f)
         # Distinct PMIDs across all genes in the mapping so far
         unique_pmids = set([
             pmid for pmid_list in pmids_for_gene.values()
             for pmid in pmid_list
         ])
Ejemplo n.º 18
0
def test_get_ids_for_gene():
    """Check that the EXOC1 gene query yields non-empty unicode results."""
    found = pubmed_client.get_ids_for_gene('EXOC1')
    assert found
    # The result must also pass the unicode_strs check.
    assert unicode_strs(found)
Ejemplo n.º 19
0
def test_get_ids_for_gene():
    """After a short pause, the EXOC1 gene query returns unicode results."""
    # Wait 0.3 s before issuing the query.
    time.sleep(0.3)
    gene_ids = pubmed_client.get_ids_for_gene('EXOC1')
    assert gene_ids
    assert unicode_strs(gene_ids)
Ejemplo n.º 20
0
def test_get_ids_for_gene():
    """After a half-second pause, the EXOC1 gene query returns results."""
    # Wait 0.5 s before issuing the query.
    time.sleep(0.5)
    exoc1_ids = pubmed_client.get_ids_for_gene('EXOC1')
    assert exoc1_ids
Ejemplo n.º 21
0
def test_get_ids_for_gene():
    """Query EXOC1 after a 0.3 s pause and check the returned IDs."""
    time.sleep(0.3)
    returned_ids = pubmed_client.get_ids_for_gene('EXOC1')
    # Must be non-empty and pass the unicode_strs check.
    assert returned_ids
    assert unicode_strs(returned_ids)