def test_review_publication(self):
    """Verify review_publication() stores a review -> trial-publication link.

    Fetches three trial publications and three reviews from PubMed, saves
    them to the db, links each review to the trial publication at the same
    list index, then reads the link table back and checks the stored id.
    """
    eutils_client = Client()
    trialpub_ids = [29871025, 29859785, 29866619]
    review_ids = [28775712, 28549125, 29929949]
    trialpub_articles = eutils_client.efetch(db='pubmed', id=trialpub_ids)
    review_articles = eutils_client.efetch(db='pubmed', id=review_ids)
    for trialpub_article in trialpub_articles:
        crud.pubmedarticle_to_db(trialpub_article, 'trial_publications')
    for idx, review_article in enumerate(review_articles):
        crud.pubmedarticle_to_db(review_article, 'systematic_reviews')
        crud.review_publication(review_article.pmid, trialpub_ids[idx], 1)
        connection = self.mock_conn(True)
        cursor = connection.cursor()
        cursor.execute(
            "SELECT trialpub_id from review_trialpubs where review_id = %s;",
            (review_article.pmid, ))
        stored_row = cursor.fetchone()
        self.assertEqual(stored_row[0], trialpub_ids[idx])
        connection.close()
def populate_reviews(period):
    """Download all new reviews made available on PubMed in the last <period>
    days and save them to the db if they have trials in CrossRef or Cochrane.

    @param period: look-back window in days (PubMed entry date, 'edat')
    """
    base_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    r = utils.requests_retry_session().get(
        base_url,
        params={
            'db': 'pubmed',
            'term': 'systematic review[ti] OR meta analysis[ti] OR cochrane database of systematic reviews[ta]',
            'format': 'json',
            'retmax': 300000,
            'email': crud.eutils_email,
            'tool': crud.eutils_tool,
            'api_key': eutils_key,
            'date_type': 'edat',
            'mindate': (datetime.now().date() - timedelta(days=period)).strftime('%Y/%m/%d'),
            'maxdate': '3000'
        })
    # renamed from `json` to avoid shadowing the stdlib module name
    payload = r.json()
    pmids = payload['esearchresult']['idlist']
    print(len(pmids))
    segments = utils.chunks(pmids, 100)
    ec = Client(api_key=eutils_key)
    for s in segments:
        # retry transient eutils / network failures indefinitely
        while True:
            try:
                articles = ec.efetch(db='pubmed', id=s)
                break
            except (eutils.exceptions.EutilsNCBIError,
                    eutils.exceptions.EutilsRequestError,
                    requests.exceptions.SSLError,
                    requests.exceptions.ConnectionError) as e:
                print(e)
                time.sleep(5)
        # py3-style iteration (next()/print()), consistent with check_citations
        a_iter = iter(articles)
        while True:
            try:
                article = next(a_iter)
            except StopIteration:
                break
            print('-----------------' + article.pmid + '-------------------------')
            if article.doi is not None:
                ids = bot.check_trialpubs_nctids(article.pmid, article.doi)
            else:
                ids = bot.check_trialpubs_nctids(article.pmid)
            if ids:
                if ids.pmids:
                    print(ids.pmids)
                    count = crud.articles_with_nctids(
                        tuple(x for x in ids.pmids))
                    print(count)
                    if count and len(count) > 0:
                        print('articles with links = ' + str(len(count)))
                        print('inserting ' + str(article.pmid))
                        crud.pubmedarticle_to_db(article, 'systematic_reviews')
                        for trialpub in count:
                            # user 9 == crossrefbot
                            crud.review_publication(article.pmid, trialpub, 9)
                            linked_ncts = crud.linked_nctids(trialpub)
                            for nct in linked_ncts:
                                crud.review_trial(article.pmid, nct, False,
                                                  'included', user_id=9,
                                                  nickname='crossrefbot')
                if ids.nctids:
                    crud.pubmedarticle_to_db(article, 'systematic_reviews')
                    print('nct ids in crossref = ' + str(len(ids.nctids)))
                    for nct_id in ids.nctids:
                        crud.review_trial(article.pmid, nct_id, False,
                                          'included', 'crossrefbot', 9)
                if not ids.nctids and not ids.pmids:
                    print('found nothing')
            else:
                print('nothing')
            if 'Cochrane' in article.jrnl:
                print('Cochrane')
                crud.pubmedarticle_to_db(article, 'systematic_reviews')
                bot.cochranebot(article.doi, article.pmid)
                bot.cochrane_ongoing_excluded(article.doi, article.pmid)
                conn = dblib.create_con(VERBOSE=True)
                cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
                cur.execute(
                    "select rt.review_id, json_agg(distinct v.user_id) as users from review_rtrial rt"
                    " inner join votes v on rt.id = v.link_id where rt.review_id = %s group by"
                    " rt.review_id;", (article.pmid, ))
                new_users = cur.fetchone()
                if not new_users:
                    new_users = {'users': []}
                # if neither cochranebot (17) nor crossrefbot (9) produced a
                # vote, the review has no bot-confirmed links: purge the
                # review and everything hanging off it
                if not {17, 9} & set(new_users['users']):
                    print('deleting ' + str(new_users['users']), article.pmid)
                    cur.execute(
                        "delete from votes where link_id in (select id from review_rtrial where review_id = %s);",
                        (article.pmid, ))
                    conn.commit()
                    cur.execute(
                        "delete from review_trialpubs where review_id = %s;",
                        (article.pmid, ))
                    conn.commit()
                    cur.execute(
                        "delete from review_rtrial where review_id = %s;",
                        (article.pmid, ))
                    conn.commit()
                    cur.execute(
                        "delete from systematic_reviews where review_id = %s;",
                        (article.pmid, ))
                    conn.commit()
                conn.close()
            else:
                print('not cochrane')
def cochranebot(doi, review_id, sess_id=None):
    """Extract & save included trial IDs for a review from Cochrane Library website text.

    @param doi: DOI of review
    @param review_id: PMID of review
    @param sess_id: session ID if transmitting progress via websocket
    """
    if sess_id:
        # progress messages are relayed to the browser via a message queue
        socketio = SocketIO(message_queue='amqp://localhost')
        socketio.emit('cochranebot_update',
                      {'msg': 'searching cochrane for included studies'},
                      room=sess_id)
        socketio.sleep(0)
    base_url = "https://www.cochranelibrary.com/cdsr/doi/{}/references".format(
        doi)
    # fetch the references page: retry on chunked-encoding errors, give up on
    # a redirect loop (treated as "review not available on the site")
    while True:
        try:
            r = requests.get(
                base_url,
                headers={
                    'User-Agent':
                    'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36'
                })
            break
        except requests.exceptions.TooManyRedirects:
            if sess_id:
                socketio.emit('cochranebot_update',
                              {'msg': 'nothing found by cochranebot'},
                              room=sess_id)
                socketio.sleep(0)
                socketio.emit('cochranebot_update',
                              {'msg': 'cochranebot complete'},
                              room=sess_id)
            return
        except requests.exceptions.ChunkedEncodingError as e:
            print(e)
            print('retrying... chunked encoding error, ')
            time.sleep(10)
    if r.status_code == 200:
        soup = bs4.BeautifulSoup(r.content, 'html.parser')
        # third dot-separated DOI segment should be the Cochrane review
        # number (e.g. "CD001234"); bail out for non-Cochrane-review DOIs
        spl_doi = doi.split('.')[2]
        if 'CD' not in spl_doi:
            if sess_id:
                socketio.emit('cochranebot_update',
                              {'msg': 'nothing found by cochranebot'},
                              room=sess_id)
                socketio.sleep(0)
                socketio.emit('cochranebot_update',
                              {'msg': 'cochranebot complete'},
                              room=sess_id)
            return
        included_studies = soup.find_all(
            "div", {"class": "references_includedStudies"})
        if included_studies:
            nct_ids = []
            pmids = []
            for b in included_studies:
                # NCT registry IDs cited directly in the reference text
                for x in re.finditer(r"(NCT|nct)[0-9]{8}", str(b)):
                    nct_ids.append(x.group().upper())
                    if sess_id:
                        socketio.emit('cochranebot_update',
                                      {'msg': 'found nct ID ' + nct_ids[-1]},
                                      room=sess_id)
                        socketio.sleep(0)
                # PMIDs embedded in pubmed URLs, e.g. ".../pubmed/12345678"
                for x in re.finditer(r"pubmed/[0-9]{8}", str(b)):
                    pmids.append(x.group().split('/')[1])
                    if sess_id:
                        socketio.emit('cochranebot_update',
                                      {'msg': 'found PMID ' + pmids[-1]},
                                      room=sess_id)
                        socketio.sleep(0)
                # PMIDs given as plain "PUBMED: 12345678" annotations
                for x in re.finditer(r"PUBMED: [0-9]{8}", str(b)):
                    pmids.append(x.group().split(' ')[1])
                    if sess_id:
                        socketio.emit('cochranebot_update',
                                      {'msg': 'found PMID ' + pmids[-1]},
                                      room=sess_id)
                        socketio.sleep(0)
            if sess_id:
                socketio.emit('cochranebot_update', {
                    'msg':
                    'trying to resolve automatic links from PubMed IDs'
                },
                              room=sess_id)
                socketio.sleep(0)
            if pmids:
                # keep only publications that already have registry links
                count = crud.articles_with_nctids(pmids)
                print('cochrane included articles with links = ' + str(count))
                if count and len(count) > 0:
                    for trialpub in count:
                        # user 17 == cochranebot
                        crud.review_publication(review_id, trialpub, 17)
                        linked_ncts = crud.linked_nctids(trialpub)
                        for nct in linked_ncts:
                            crud.review_trial(review_id, nct, False,
                                              'included', user_id=17,
                                              nickname='cochranebot',
                                              vote_type='up')
                        if sess_id:
                            socketio.emit('cochranebot_update', {
                                'msg':
                                'cochranebot found included trials with IDs '
                                + ', '.join(linked_ncts)
                            },
                                          room=sess_id)
            # de-duplicate before saving directly-cited registry IDs
            nct_ids = list(set(nct_ids))
            print('cochrane nct_ids ' + str(nct_ids))
            for id in nct_ids:
                crud.review_trial(review_id, id, False, 'included',
                                  'cochranebot', 17)
        if not included_studies:
            if sess_id:
                socketio.emit('cochranebot_update',
                              {'msg': 'nothing found by cochranebot'},
                              room=sess_id)
                socketio.sleep(0)
                socketio.emit('cochranebot_update',
                              {'msg': 'cochranebot complete'},
                              room=sess_id)
            return
    if sess_id:
        socketio.emit('cochranebot_update', {'msg': 'cochranebot complete'},
                      room=sess_id)
        socketio.sleep(0)
def cochrane_ongoing_excluded(doi, review_id, sess_id=None):
    """Extract & save ongoing and excluded trial IDs for a review from Cochrane Library website text.

    @param doi: DOI of review
    @param review_id: PMID of review
    @param sess_id: session ID if transmitting progress via websocket
    @return:
    """
    if sess_id:
        # progress messages are relayed to the browser via a message queue
        socketio = SocketIO(message_queue='amqp://localhost')
        socketio.emit(
            'cochranebot_update',
            {'msg': 'searching cochrane for ongoing or excluded studies'},
            room=sess_id)
        socketio.sleep(0)
    base_url = "https://www.cochranelibrary.com/cdsr/doi/{}/references".format(
        doi)
    # a redirect loop is treated as "review not available on the site"
    try:
        r = requests.get(
            base_url,
            headers={
                'User-Agent':
                'Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M; wv) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/60.0.3112.107 Mobile Safari/537.36'
            })
    except requests.exceptions.TooManyRedirects:
        if sess_id:
            socketio.emit('cochranebot_update',
                          {'msg': 'nothing found by cochranebot'},
                          room=sess_id)
            socketio.sleep(0)
            socketio.emit('cochranebot_update',
                          {'msg': 'cochranebot complete'},
                          room=sess_id)
        return
    if r.status_code == 200:
        soup = bs4.BeautifulSoup(r.content, 'html.parser')
        # third dot-separated DOI segment should be the Cochrane review
        # number (e.g. "CD001234"); bail out for non-Cochrane-review DOIs
        spl_doi = doi.split('.')[2]
        if 'CD' not in spl_doi:
            if sess_id:
                socketio.emit('cochranebot_update',
                              {'msg': 'nothing found by cochranebot'},
                              room=sess_id)
                socketio.sleep(0)
                socketio.emit('cochranebot_update',
                              {'msg': 'cochranebot complete'},
                              room=sess_id)
            return
        excluded_studies = soup.find_all(
            "div", {"class": "references_excludedStudies"})
        if excluded_studies:
            nct_ids = []
            pmids = []
            for b in excluded_studies:
                # NCT registry IDs cited directly in the reference text
                for x in re.finditer(r"(NCT|nct)[0-9]{8}", str(b)):
                    nct_ids.append(x.group().upper())
                    if sess_id:
                        socketio.emit('cochranebot_update',
                                      {'msg': 'found nct ID ' + nct_ids[-1]},
                                      room=sess_id)
                        socketio.sleep(0)
                # PMIDs embedded in pubmed URLs, e.g. ".../pubmed/12345678"
                for x in re.finditer(r"pubmed/[0-9]{8}", str(b)):
                    pmids.append(x.group().split('/')[1])
                    if sess_id:
                        socketio.emit('cochranebot_update',
                                      {'msg': 'found PMID ' + pmids[-1]},
                                      room=sess_id)
                        socketio.sleep(0)
                # PMIDs given as plain "PUBMED: 12345678" annotations
                for x in re.finditer(r"PUBMED: [0-9]{8}", str(b)):
                    pmids.append(x.group().split(' ')[1])
                    if sess_id:
                        socketio.emit('cochranebot_update',
                                      {'msg': 'found PMID ' + pmids[-1]},
                                      room=sess_id)
                        socketio.sleep(0)
            # if included by crossrefbot, move it
            if pmids:
                # keep only publications that already have registry links
                count = crud.articles_with_nctids(pmids)
                print('cochrane excluded articles with links = ' + str(count))
                if count and len(count) > 0:
                    for trialpub in count:
                        # user 17 == cochranebot; a down-vote on 'relevant'
                        # marks the trial as excluded by the review
                        crud.review_publication(review_id, trialpub, 17)
                        linked_ncts = crud.linked_nctids(trialpub)
                        for nct in linked_ncts:
                            crud.review_trial(review_id, nct, False,
                                              'relevant', user_id=17,
                                              nickname='cochranebot',
                                              vote_type='down')
                        if sess_id:
                            socketio.emit('cochranebot_update', {
                                'msg':
                                'cochranebot found excluded trials with IDs '
                                + ', '.join(linked_ncts)
                            },
                                          room=sess_id)
            # de-duplicate before saving directly-cited registry IDs
            nct_ids = list(set(nct_ids))
            print('excluded: ' + ', '.join(nct_ids))
            for id in nct_ids:
                # if included by crossrefbot, move it
                crud.review_trial(review_id, id, False, 'relevant',
                                  'cochranebot', 17, vote_type='down')
        ongoing_studies = soup.find_all("div",
                                        {"class": "references_ongoingStudies"})
        if ongoing_studies:
            relevant_nct = []
            for b in ongoing_studies:
                for x in re.finditer(r"(NCT|nct)[0-9]{8}", str(b)):
                    relevant_nct.append(x.group().upper())
                    if sess_id:
                        socketio.emit(
                            'cochranebot_update',
                            {'msg': 'found nct ID ' + relevant_nct[-1]},
                            room=sess_id)
            relevant_nct = list(set(relevant_nct))
            print(relevant_nct)
            for nct in relevant_nct:
                # TODO ensure that already included gets moved to relevant
                crud.review_trial(review_id, nct, False, 'relevant',
                                  'cochranebot', 17)
        awaiting_studies = soup.find_all(
            "div", {"class": "references_awaitingAssessmentStudies"})
        if awaiting_studies:
            relevant_nct = []
            for b in awaiting_studies:
                for x in re.finditer(r"(NCT|nct)[0-9]{8}", str(b)):
                    relevant_nct.append(x.group().upper())
                    if sess_id:
                        socketio.emit(
                            'cochranebot_update',
                            {'msg': 'found nct ID ' + relevant_nct[-1]},
                            room=sess_id)
            relevant_nct = list(set(relevant_nct))
            print(relevant_nct)
            for nct in relevant_nct:
                crud.review_trial(review_id, nct, False, 'relevant',
                                  'cochranebot', 17)
        if not excluded_studies and not awaiting_studies and not ongoing_studies:
            if sess_id:
                socketio.emit('cochranebot_update',
                              {'msg': 'nothing found by cochranebot'},
                              room=sess_id)
                socketio.sleep(0)
                socketio.emit('cochranebot_update',
                              {'msg': 'cochranebot complete'},
                              room=sess_id)
            return
    if sess_id:
        # refresh_both tells the client to reload both trial panels
        socketio.emit('cochranebot_update', {
            'msg': 'cochranebot complete',
            'refresh_both': True
        },
                      room=sess_id)
        socketio.sleep(0)
def check_citations(review_id, sess_id=None, review_doi=None):
    """Check IDs obtained from the references of a review for automatic links, and save these links.

    @param review_id: PubMed ID of review
    @param sess_id: session ID if transmitting progress via websocket
    @param review_doi: DOI of review
        NOTE(review): review_doi is accepted but never read in this body —
        the DOI actually used is the one fetched from PubMed; confirm intent.
    @return:
    """
    if sess_id:
        # progress messages are relayed to the browser via a message queue
        socketio = SocketIO(message_queue='amqp://localhost')
    ec = Client(api_key=eutils_key)
    # retry transient eutils / network failures indefinitely
    while True:
        try:
            articles = ec.efetch(db='pubmed', id=review_id)
            break
        except (eutils.EutilsNCBIError, eutils.EutilsRequestError,
                requests.exceptions.SSLError,
                requests.exceptions.ConnectionError) as e:
            print(e)
            time.sleep(5)
    a_iter = iter(articles)
    while True:
        try:
            article = next(a_iter)
        except StopIteration:
            break
        print('-----------------' + article.pmid +
              '-------------------------')
        # pass the DOI through when PubMed provides one; it widens the
        # CrossRef reference search
        if article.doi is not None:
            ids = check_trialpubs_nctids(article.pmid,
                                         article.doi,
                                         sess_id=sess_id)
        else:
            ids = check_trialpubs_nctids(article.pmid, sess_id=sess_id)
        if ids:
            if ids.pmids:
                if sess_id:
                    socketio.emit('crossrefbot_update', {
                        'msg':
                        'crossrefbot found references to ' +
                        str(len(ids.pmids)) +
                        ' PubMed articles. Checking articles for links to included trials...'
                    },
                                  room=sess_id)
                # keep only publications that already have registry links
                count = crud.articles_with_nctids(ids.pmids)
                if count and len(count) > 0:
                    if sess_id:
                        socketio.emit('crossrefbot_update', {
                            'msg':
                            str(len(count)) +
                            ' articles have links to included trials'
                        },
                                      room=sess_id)
                    for trialpub in count:
                        # user 9 == crossrefbot
                        crud.review_publication(article.pmid, trialpub, 9)
                        linked_ncts = crud.linked_nctids(trialpub)
                        for nct in linked_ncts:
                            crud.review_trial(review_id, nct, False,
                                              'included', user_id=9,
                                              nickname='crossrefbot')
            if ids.nctids:
                # registry IDs listed directly in the CrossRef metadata
                print('nct ids in crossref = ' + str(len(ids.nctids)))
                if sess_id:
                    socketio.emit('crossrefbot_update', {
                        'msg':
                        str(len(ids.nctids)) +
                        ' included trials were listed directly in crossref'
                    },
                                  room=sess_id)
                for nct_id in ids.nctids:
                    crud.review_trial(article.pmid, nct_id, False,
                                      'included', 'crossrefbot', 9)
            if not ids.nctids and not ids.pmids:
                if sess_id:
                    socketio.emit(
                        'crossrefbot_update',
                        {'msg': 'No trials found. Crossrefbot complete'},
                        room=sess_id)
            elif sess_id:
                socketio.emit('crossrefbot_update',
                              {'msg': 'crossrefbot complete'},
                              room=sess_id)
        elif sess_id:
            socketio.emit('crossrefbot_update',
                          {'msg': 'No trials found. Crossrefbot complete'},
                          room=sess_id)