Ejemplo n.º 1
0
def check_trialpubs_nctids(review_id, review_doi=None, sess_id=None):
    """
    Resolve the references of a review to PMIDs and NCT IDs.

    @param review_id: PubMed ID of the review
    @param review_doi: DOI of the review (looked up via PubMed when omitted)
    @param sess_id: session ID if transmitting progress via websocket
    @return: namedtuple with found PMIDs and NCT IDs; None when the review has
        no DOI or the DOI is unknown to Crossref; False when the Crossref
        record carries no usable reference list
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
    ec = Client(api_key=eutils_key)
    cr = Crossref(mailto=config.MAIL_USERNAME)
    print('bp1')
    if not review_doi:
        # Recover the review's DOI from its PubMed record, retrying on
        # transient NCBI/network errors.
        while True:
            try:
                paset = ec.efetch(db='pubmed', id=review_id)
                break
            except (eutils.EutilsNCBIError, eutils.EutilsRequestError,
                    requests.exceptions.SSLError,
                    requests.exceptions.ConnectionError) as e:
                print(e)
                time.sleep(5)
        try:
            pa = next(iter(paset))
        except StopIteration as e:
            # NCBI occasionally returns an empty result set; back off and
            # retry the whole lookup.
            # NOTE(review): this recursion is unbounded — it keeps retrying
            # until NCBI returns a non-empty result.
            print('##EMPTY ITERATOR', e)
            print('retrying...')
            time.sleep(60)
            return check_trialpubs_nctids(review_id, review_doi, sess_id)

        if hasattr(pa, 'doi'):
            review_doi = pa.doi
        if not review_doi:
            # Without a DOI there is no Crossref record to mine.
            if sess_id:
                socketio.emit('crossrefbot_update',
                              {'msg': 'No trials found. Crossrefbot complete'},
                              room=sess_id)
            return
    print('bp2')
    retry_attempts = 0
    while True:
        try:
            # Crossref rejects DOIs with a stray trailing period.
            if review_doi[-1] == '.':
                review_doi = review_doi[:-1]
            resp = cr.works(ids=[str(review_doi)])
            break
        except requests.HTTPError as e:
            if e.response.status_code == 404:
                # DOI unknown to Crossref: nothing to resolve.
                if sess_id:
                    socketio.emit(
                        'crossrefbot_update',
                        {'msg': 'No trials found. Crossrefbot complete'},
                        room=sess_id)
                print(e)
                return
            time.sleep(5)
            print('UNHANDLED HTTP ERROR', e)
            print('retrying...')
        except requests.exceptions.ConnectionError as e:
            print(e)
            time.sleep(10)
            print('connection error, retrying...')
            if retry_attempts >= 6:
                # BUGFIX: the original had an unreachable `break` after this
                # `raise`.
                raise Exception('failed too many times')
            retry_attempts += 1
    print('bp3')
    if resp['status'] == 'ok':
        parsed = resp['message']
        if "reference" in parsed:
            if sess_id:
                socketio.emit('crossrefbot_update', {
                    'msg':
                    '%s references in crossref. trying to resolve to PubMed articles'
                    % len(parsed['reference'])
                },
                              room=sess_id)
                eventlet.sleep(0)
            print('%s references found in crossref' % len(parsed['reference']))
            to_resolve = []
            references = parsed['reference']
            dois = [doi["DOI"] for doi in references if 'DOI' in doi]
            print('bp4')
            if dois:
                # Resolve DOIs to PMIDs via an [AID] esearch, 250 per request
                # (NCBI truncates replies at 250 results; see
                # https://github.com/biocommons/eutils/issues/124/).
                # BUGFIX: the original looped `for dois in chunk_dois`, which
                # rebound `dois` to the *last* chunk only, so the later
                # citation-match and batch_doi2pmid steps ignored every other
                # chunk's unresolved DOIs. Accumulate them all instead.
                unresolved = []
                for chunk in utils.chunks(dois, 250):
                    while True:
                        print(
                            'bp4.1',
                            ' OR '.join(['"' + doi + '"[AID]'
                                         for doi in chunk]))
                        try:
                            with eventlet.Timeout(300):
                                esr = ec.esearch(db='pubmed',
                                                 term=' OR '.join([
                                                     '"' + doi + '"[AID]'
                                                     for doi in chunk
                                                 ]))
                            break
                        except (eutils.EutilsNCBIError,
                                eutils.EutilsRequestError,
                                requests.exceptions.SSLError,
                                requests.exceptions.ConnectionError,
                                lxml.etree.XMLSyntaxError,
                                eventlet.timeout.Timeout) as e:
                            print('possible timeout?', e)
                            time.sleep(5)
                    if esr.ids:
                        # Fetch the matched PubMed records, retrying on
                        # transient failures.
                        while True:
                            print('bp4.2', esr.ids)
                            try:
                                paset = ec.efetch(db='pubmed', id=esr.ids)
                                break
                            except (eutils.EutilsNCBIError,
                                    eutils.EutilsRequestError,
                                    requests.exceptions.SSLError,
                                    requests.exceptions.ConnectionError,
                                    requests.exceptions.ReadTimeout,
                                    requests.exceptions.ChunkedEncodingError
                                    ) as e:
                                print(e)
                                time.sleep(5)
                        for pma in iter(paset):
                            # Record the PMID and mark the DOI as resolved.
                            if pma.doi is not None and pma.doi in chunk:
                                chunk.remove(pma.doi)
                                to_resolve.append(pma.pmid)
                    unresolved.extend(chunk)
                dois = unresolved
            print('bp5')
            # References without a DOI (or whose DOI did not resolve) that
            # still carry enough metadata for the PubMed citation matcher.
            remaining = [
                x for x in references
                if ('DOI' not in x or x['DOI'] in dois) and (
                    'first-page' in x or 'author' in x or 'article-title' in x
                    or 'volume' in x or 'journal-title' in x or 'year' in x)
            ]
            if remaining:
                citation_pmids = ecitmatch_tools.batch_pmids_for_citation(
                    remaining, debug=True)
                check_metadata = []
                if citation_pmids:
                    for citation in citation_pmids:
                        if utils.RepresentsInt(citation):
                            to_resolve.append(citation)
                            check_metadata.append(citation)
                        elif citation.startswith('AMBIGUOUS'):
                            # 'AMBIGUOUS <pmid,pmid,...>' → candidate PMIDs.
                            cand = citation[10:].split(',')
                            if utils.RepresentsInt(cand[0]):
                                to_resolve.extend(cand)
                                # NOTE(review): appends the candidate *list*
                                # (not its elements) — confirm efetch accepts
                                # nested id lists, or this should be extend().
                                check_metadata.append(cand)
                if check_metadata:
                    while True:
                        try:
                            with eventlet.Timeout(300):
                                paset = ec.efetch(db='pubmed',
                                                  id=check_metadata)
                            break
                        except (eutils.EutilsNCBIError,
                                eutils.EutilsRequestError,
                                requests.exceptions.SSLError,
                                requests.exceptions.ConnectionError,
                                eventlet.timeout.Timeout) as e:
                            print('possible timeout?')
                            print(e)
                            time.sleep(5)
                    for pma in iter(paset):
                        if pma.doi is not None and pma.doi in dois:
                            dois.remove(pma.doi)
                            to_resolve.append(pma.pmid)
            print('bp6')
            # Last resort: batch DOI→PMID conversion for whatever is left.
            try_doi = batch_doi2pmid(dois)
            if try_doi:
                for doi in try_doi:
                    if utils.RepresentsInt(str(doi)):
                        to_resolve.append(doi)
            # Scan free-text citations for clinical-trial registry numbers
            # (NCT followed by 8 digits, exactly 11 characters).
            nct_ids = []
            for citation in references:
                if 'unstructured' in citation:
                    for token in citation['unstructured'].split(' '):
                        if re.match(r"(NCT|nct)[0-9]{8}", token) and len(token) == 11:
                            nct_ids.append(token)
            print('bp11')
            to_resolve = [str(x) for x in to_resolve]
            to_resolve = list(set(to_resolve))
            content = collections.namedtuple('ids', ['pmids', 'nctids'])
            return content(to_resolve, nct_ids)
    return False
Ejemplo n.º 2
0
#########################################################################################################

##### API-key (NCBI)
# SECURITY NOTE(review): a real-looking NCBI API key is hard-coded here —
# move it to an environment variable or config file before sharing.
eclient = Client(api_key="8ecce891e7fa036ff84bccc7c74e5138dc09")
#gene_efetch = eclient.efetch(db='gene', id=91039)
# NOTE(review): placeholder address — NCBI requires a real contact email.
Entrez.email = "*****@*****.**"

#########################################################################################################

##### nucleotide search
### Setting up query
# RefSeq transcript (non-genomic) nucleotide records for `gene` in human.
# BUGFIX: the organism had been mangled to 'H**o sapiens' by a text filter,
# so the Entrez query could never match; restored to 'Homo sapiens'.
transcriptmRNA_esearch = eclient.esearch(
    db='nucleotide',
    term='(' + gene +
    '[gene] AND "Homo sapiens"[Primary Organism] AND refseq[filter]) NOT biomol_genomic[PROP]'
)
print("\nLoading currently available ids from Entrez nucleotide...")
print("=" * 70)
print("\nTranscript variant ids: ")
print(transcriptmRNA_esearch.ids)
# Materialize the returned ids (replaces a manual append loop).
mRNAtranscripts = list(transcriptmRNA_esearch.ids)
print("\nSearch results: {}\n".format(transcriptmRNA_esearch.count))
### Esummary for retrieving information
### For each id in mRNAtranscripts
### Save data to csv file
with open('results-nucleotide.csv', mode='w') as result_nucleotide:
    result_writer = csv.writer(result_nucleotide, delimiter=';')
    result_writer.writerow([
Ejemplo n.º 3
0
                    "--max",
                    help="number of words to test",
                    nargs='?',
                    const=1,
                    type=int,
                    default=50)
parser.add_argument("-c",
                    "--corpus",
                    help="the corpus (brown,webtext,gutenberg)",
                    default="brown")
args = parser.parse_args()

print(args.corpus)
# SECURITY FIX: eval(args.corpus) executed arbitrary command-line input.
# Restrict the choice to the known NLTK corpora via a lookup table.
_corpora = {"brown": brown, "webtext": webtext, "gutenberg": gutenberg}
try:
    corpus = _corpora[args.corpus]
except KeyError:
    parser.error("unknown corpus {!r}; choose one of {}".format(
        args.corpus, ", ".join(sorted(_corpora))))

ec = Client(api_key=api.apikey)  #replace with your NCBI apikey
frequency_list = FreqDist(i.lower() for i in corpus.words())

print("word\tcorpusFreq\tpubmedFreq")
# FIX: random.sample() no longer accepts a set (TypeError on Python 3.11+);
# materialize the deduplicated words as a list first.
for word in random.sample(list(set(corpus.words())), args.max):
    freq = frequency_list[word.lower()]
    # let's focus on somewhat common words
    if freq > 1:
        try:
            a = ec.esearch(db='pubmed', term=word)
            print("{}\t{}\t{}".format(word, freq, a.count))
        except TimeoutError:
            time.sleep(5)  # slow down buddy
            ec = Client(api_key=api.apikey)  # fresh client after a timeout
        time.sleep(0.1)  # ncbi will complain otherwise
Ejemplo n.º 4
0
                if not re.search('Ontology|Taxonomy', ont_name, flags=re.IGNORECASE):
                    if ont_name in cits:
                        del cits[ont_name]
                    ont_name = ont_name+' Ontology'
                t1 = time.time()
                #print("time: {}".format(t1-t0))
                if t1-t0 > 60:
                    ec = Client(api_key=api.apikey)
                if ont_name in cits and cits[ont_name] > 0:
                    pass
                else:
                    rp = re.compile("^The ")
                    ont_name = rp.sub('', ont_name)
                    ont_name = ont_name.replace('"', '')
                    term = "({})".format(ont_name.replace(" ", "+"))
                    a = ec.esearch(db='pubmed', term=term)
                    cits[ont_name] = a.count
                    if showUids:
                        print("{}\t{}\t{}".format(ont_name, a.count, a.ids))
                    else:
                        print("{}\t{}".format(ont_name, a.count))
                newcites.seek(0)
                json.dump(cits, newcites)
            except:
                print("I probably timed out again or whatever...lemme catch my breath")
                time.sleep(2)
                newcites.seek(0)
                json.dump(cits, newcites)
                ec = Client(api_key=api.apikey)

# Final summary of how many distinct ontologies were counted.
print("I identified {} ontologies".format(len(cits)))
Ejemplo n.º 5
0
			else:
				try:
					r[key] = str(r[key])
				except UnicodeEncodeError:
					r[key] = r[key].encode('utf-8')
					
		if callable(record):
			r = record(r)
		elif record is not None:
			raise ValueError('Unknown record transform function (args.record).')
		if r:
			writer.write(r)

# Eutils client authenticated with the configured NCBI API key.
client = Client(api_key=apikey)
if prog == 'esearch':
	sret  = client.esearch(db = db, term = term)
	try:
		error = list(sret._xml_root.find('ErrorList').iterchildren())
	except:
		error = None
	
	print sret.count if not error else 0
	
	if not sret.ids:
		rets = []
	else:
		rets = client.efetch(db = db, id = sret.ids)
		rets = list(iter(rets))
	writerResults(rets)

else:
Ejemplo n.º 6
0
def check_trialpubs_nctids(review_id, review_doi=None, sess_id=None):
    """
    Resolve the references of a review to PMIDs and NCT IDs.

    @param review_id: PubMed ID of the review
    @param review_doi: DOI of the review (looked up via PubMed when omitted)
    @param sess_id: session ID if transmitting progress via websocket
    @return: namedtuple with found PMIDs and NCT IDs; None when the review has
        no DOI or Crossref returns an HTTP error; False when the Crossref
        record carries no usable reference list
    """
    if sess_id:
        socketio = SocketIO(message_queue='amqp://localhost')
    ec = Client(api_key=eutils_key)
    cr = Crossref(mailto=config.MAIL_USERNAME)
    if not review_doi:
        # Recover the review's DOI from its PubMed record, retrying on
        # transient NCBI/network errors.
        while True:
            try:
                paset = ec.efetch(db='pubmed', id=review_id)
                break
            except (
            eutils.exceptions.EutilsNCBIError, eutils.exceptions.EutilsRequestError, requests.exceptions.SSLError,
            requests.exceptions.ConnectionError) as e:
                print(e)  # PY3 FIX: was the py2 statement `print e`
                time.sleep(5)
        pa = next(iter(paset))  # PY3 FIX: was iter(paset).next()
        if hasattr(pa, 'doi'):
            review_doi = pa.doi
        if not review_doi:
            # Without a DOI there is no Crossref record to mine.
            if sess_id:
                socketio.emit('crossrefbot_update', {'msg': 'No trials found. Crossrefbot complete'}, room=sess_id)
            return
    try:
        # Crossref rejects DOIs with a stray trailing period.
        if review_doi[-1] == '.':
            review_doi = review_doi[:-1]
        resp = cr.works(ids=[str(review_doi)])
    except requests.HTTPError as e:
        if sess_id:
            socketio.emit('crossrefbot_update', {'msg': 'No trials found. Crossrefbot complete'}, room=sess_id)
        print(e)
        return
    if resp['status'] == 'ok':
        parsed = resp['message']
        if "reference" in parsed:
            if sess_id:
                socketio.emit('crossrefbot_update', {'msg': str(len(parsed[
                                                                        'reference'])) + ' references found in crossref. trying to resolve these to PubMed articles...'},
                              room=sess_id)
                eventlet.sleep(0)
            print(str(len(parsed['reference'])) + ' references found in crossref')
            to_resolve = []
            references = parsed['reference']
            dois = [doi["DOI"] for doi in references if 'DOI' in doi]
            if dois:
                # Resolve DOIs to PMIDs via an [AID] esearch, 250 per request
                # (NCBI truncates replies at 250 results).
                # BUGFIX: the original looped `for dois in chunk_dois`, which
                # rebound `dois` to the *last* chunk only, so the later
                # citation-match and batch_doi2pmid steps ignored every other
                # chunk's unresolved DOIs. Accumulate them all instead.
                unresolved = []
                for chunk in utils.chunks(dois, 250):
                    while True:
                        try:
                            esr = ec.esearch(db='pubmed', term=' OR '.join(['"' + doi + '"[AID]' for doi in chunk]))
                            break
                        except (eutils.exceptions.EutilsNCBIError, eutils.exceptions.EutilsRequestError,
                                requests.exceptions.SSLError, requests.exceptions.ConnectionError,
                                lxml.etree.XMLSyntaxError) as e:
                            print(e)
                            time.sleep(5)
                    if esr.ids:
                        # Fetch the matched PubMed records, retrying on
                        # transient failures.
                        while True:
                            try:
                                paset = ec.efetch(db='pubmed', id=esr.ids)
                                break
                            except (eutils.exceptions.EutilsNCBIError, eutils.exceptions.EutilsRequestError,
                                    requests.exceptions.SSLError, requests.exceptions.ConnectionError) as e:
                                print(e)
                                time.sleep(5)
                        for pma in iter(paset):  # PY3 FIX: was pa_iter.next()
                            if pma.doi is not None and pma.doi in chunk:
                                chunk.remove(pma.doi)
                                to_resolve.append(pma.pmid)
                    unresolved.extend(chunk)
                dois = unresolved
            # References without a DOI (or whose DOI did not resolve) that
            # still carry enough metadata for the PubMed citation matcher.
            remaining = [x for x in references if ('DOI' not in x or x['DOI'] in dois) and (
                        'first-page' in x or 'author' in x or 'article-title' in x or 'volume' in x or 'journal-title' in x or 'year' in x)]
            if remaining:
                citation_pmids = ecitmatch_tools.batch_pmids_for_citation(remaining, debug=False)
                check_metadata = []
                if citation_pmids:
                    for citation in citation_pmids:
                        if utils.RepresentsInt(citation):
                            to_resolve.append(citation)
                            check_metadata.append(citation)
                        elif citation.startswith('AMBIGUOUS'):
                            # 'AMBIGUOUS <pmid,pmid,...>' → candidate PMIDs.
                            cand = citation[10:].split(',')
                            if utils.RepresentsInt(cand[0]):
                                to_resolve.extend(cand)
                                # NOTE(review): appends the candidate *list*
                                # (not its elements) — confirm efetch accepts
                                # nested id lists, or this should be extend().
                                check_metadata.append(cand)
                if check_metadata:
                    while True:
                        try:
                            paset = ec.efetch(db='pubmed', id=check_metadata)
                            break
                        except (eutils.exceptions.EutilsNCBIError, eutils.exceptions.EutilsRequestError,
                                requests.exceptions.SSLError, requests.exceptions.ConnectionError) as e:
                            print(e)
                            time.sleep(5)
                    for pma in iter(paset):
                        if pma.doi is not None and pma.doi in dois:
                            dois.remove(pma.doi)
                            to_resolve.append(pma.pmid)
            # Last resort: batch DOI→PMID conversion for whatever is left.
            try_doi = batch_doi2pmid(dois)
            if try_doi:
                for doi in try_doi:
                    if utils.RepresentsInt(str(doi)):
                        to_resolve.append(doi)
            # Scan free-text citations for clinical-trial registry numbers
            # (NCT followed by 8 digits, exactly 11 characters).
            nct_ids = []
            for citation in references:
                if 'unstructured' in citation:
                    for token in citation['unstructured'].split(' '):
                        if re.match(r"(NCT|nct)[0-9]{8}", token) and len(token) == 11:
                            nct_ids.append(token)
            to_resolve = [str(x) for x in to_resolve]
            to_resolve = list(set(to_resolve))
            content = collections.namedtuple('ids', ['pmids', 'nctids'])
            return content(to_resolve, nct_ids)
    return False