def process_pdf_task(pub):
    """Fetch a publication's PDF and store a thumbnail preview of it.

    Resolves the publication identifier (PMID / PMC / arXiv / bioRxiv / DOI),
    locates the PDF (via FindIt when the URL is still the 'searching'
    sentinel), renders a thumbnail, uploads it to the 'pdf_thumbnails'
    GCS bucket, and records the result on the database row.

    :param pub: dict-like task payload carrying pub_* identifier keys and
                optionally 'pub_pdf_url'.
    :raises ValueError: if no recognized identifier key is present.
    """
    logger.info("STARTING TASK {}".format(pub))

    # Build the canonical publication ID, adding source prefixes where needed.
    if pub.get('pub_pmid'):
        pub_id = pub.get('pub_pmid')
    elif pub.get('pub_pmc'):
        pub_id = "PMC" + str(pub.get('pub_pmc'))
    elif pub.get('pub_arxiv'):
        pub_id = "ARXIV:" + pub.get('pub_arxiv')
    elif pub.get('pub_biorxiv'):
        pub_id = "BIORXIV:" + pub.get('pub_biorxiv')
    elif pub.get('pub_doi'):
        pub_id = pub.get('pub_doi')
    else:
        # BUGFIX: previously fell through with pub_id unbound, crashing with
        # an obscure NameError at get_publication(). Fail loudly instead.
        raise ValueError("No recognized identifier in pub payload: {}".format(pub))

    pub_item = get_publication(pub_id)
    pub_type, pub_id = id_type(pub_id)
    url = pub.get('pub_pdf_url')

    # 'searching' is a sentinel meaning "no URL yet; try to locate one".
    if url == 'searching':
        try:
            # BUGFIX: found_pdf was unbound when pub_type was neither 'pmid'
            # nor 'doi', raising a NameError the MetaPubError handler below
            # did not catch. Initialize and guard instead.
            found_pdf = None
            if pub_type == 'pmid':
                found_pdf = FindIt(pmid=pub['pub_pmid'])
            elif pub_type == 'doi':
                found_pdf = FindIt(doi=pub['pub_doi'])
            url = found_pdf.url if found_pdf is not None else None
            pub_item.pub_pdf_url = url
            # Update status to indicate PDF found!
            db.session.commit()
        except MetaPubError:
            url = None

    if url:
        fname = download_pdf(url)
        sha1_fname = sha1_file(fname)
        thumbnail_fname = pdf_to_thumb(fname, sha1_fname)
        gs_client = google_storage()
        bucket = gs_client.get_bucket('pdf_thumbnails')
        # Thumbnails are content-addressed by the PDF's SHA-1.
        thumbnail_url_fname = "{}.png".format(sha1_fname)
        thumbnail_obj = bucket.blob(thumbnail_url_fname)
        try:
            thumbnail_obj.upload_from_filename(thumbnail_fname)
            # Delete the local copy after a successful upload.
            os.remove(thumbnail_fname)
            # Update database - set thumbnail to sha1_fname
            logger.info("Stored thumbnail: " + thumbnail_url_fname)
            pub_item.pub_thumbnail = sha1_fname
        except FileNotFoundError:
            # Thumbnail rendering produced no file; clear the URL so the
            # record is not left pointing at an unusable PDF.
            pub_item.pub_pdf_url = None
    else:
        pub_item.pub_pdf_url = None
    db.session.commit()
def test_using_cache(self):
    """A cached FindIt lookup must yield the same URL as a fresh one."""
    pmid = SAMPLE_PMIDS['nonembargoed'][0]
    src = FindIt(pmid=pmid)
    assert src.url is not None
    assert src._cache is not None
    # Resolve the same PMID twice more; the cache-hit result must match
    # what an uncached lookup produces.
    cached_src = FindIt(pmid=pmid)
    fresh_src = FindIt(pmid=pmid)
    assert cached_src.url == fresh_src.url
def test_aaas_tango(self):
    """AAAS handling: an open article resolves; a form-gated one does not."""
    # Sci Signal article gated behind a form-negotiation step.
    form_gated_pmid = '18385036'
    # form_gated_url = 'http://stke.sciencemag.org/content/1/13/eg3.full.pdf'
    open_pmid = '25678633'  # Science
    open_url = 'http://sciencemag.org/content/347/6223/695.full.pdf'

    source = FindIt(pmid=open_pmid)
    assert source.url == open_url

    source = FindIt(pmid=form_gated_pmid)
    # TODO: update this when the_aaas_tango knows how to navigate forms.
    assert source.url is None
def test_pmc_twist(self):
    """PMC resolution for an embargoed vs. a non-embargoed article."""
    #TODO: get a new embargoed PMID
    embargoed_pmid = '25554792'  # Science / pmc-release = Jan 2, 2016 / PMC4380271
    embargoed_url = 'http://sciencemag.org/content/347/6217/1258522.full.pdf'
    open_pmid = '26106273'  # Saudi Pharm / pmc-release = None / PMC4475813

    source = FindIt(pmid=embargoed_pmid)
    assert source.pma.pmc == '4380271'
    #assert source.pma.history['pmc-release'] is not None
    #assert source.url == embargoed_url

    source = FindIt(pmid=open_pmid)
    assert source.pma.pmc == '4475813'
    assert source.pma.history.get('pmc-release', None) is None
    print(source.url)
def pdf_url(self):
    """Return the article's PDF URL, resolving lazily via FindIt.

    GeneReviews chapters live on the NCBI bookshelf rather than a journal
    site, so they are special-cased to a bookshelf URL.
    """
    #TODO: make book_url a @property in metapub PubMedArticle
    if self.pma.journal.lower().startswith('genereviews'):
        return GENEREVIEWS_URL.format(bookid=self.pma.book_accession_id)
    # Resolve once and memoize; verify=False skips the HEAD-check of the URL.
    if not self._pdf_src:
        self._pdf_src = FindIt(self.pmid, verify=False)
    return self._pdf_src.url
def main():
    """Run FindIt over sample PMIDs from every J-STAGE journal and log results."""
    for journal in jstage_journals:
        sample_pmids = get_sample_pmids_for_journal(journal)
        for pmid in sample_pmids:
            source = FindIt(pmid)
            print('[{source.pma.journal}]\t{source.pmid}: {source.url} ({source.reason})'.format(source=source))
            write_findit_result_to_csv(source)
def main(start_pmid=0):
    """Resolve every PMID listed in PMID_OUTPUT_FILENAME with FindIt.

    :param start_pmid: optional PMID to resume from; 0 (default) starts at
                       the top of the file.
    """
    # BUGFIX: use a context manager so the file handle is closed, and match
    # whole lines instead of str.find() — a substring search could land in
    # the middle of a longer PMID, and a miss returned -1, which made
    # pmids[-1:] process only the file's final character.
    with open(PMID_OUTPUT_FILENAME) as pmid_file:
        pmids = pmid_file.read().split('\n')
    idx = 0
    if start_pmid:
        try:
            idx = pmids.index(str(start_pmid))
        except ValueError:
            idx = 0  # requested PMID not in file; start from the beginning
    for pmid in pmids[idx:]:
        # verify=False: report the publisher URL without HEAD-checking it.
        source = FindIt(pmid, verify=False)
        print('[{source.pma.journal}]\t{source.pmid}: {source.url} ({source.reason})'.format(source=source))
        write_findit_result_to_csv(source)
def print_article_for_pmid(pmid):
    """Print title and PDF URL (or the failure reason) for one PMID."""
    try:
        # verify=False: report the publisher URL without HEAD-checking it.
        source = FindIt(pmid, verify=False)
    except Exception:
        print("Something's wrong with Gilligan's Island... %s" % pmid)
        return
    print('----- PMID: %s' % pmid)
    print(source.pma.title)
    print(source.url if source.url else source.reason)
def get_pdf_url(ref):
    """Collect unique PDF URLs for a reference's PubMed and DOI identifiers.

    :param ref: object with an .identifiers list of dicts carrying
                'identifier_type' and 'identifier' keys.
    :return: list of distinct PDF URLs (empty if none resolved); failure
             reasons are printed as a side effect, matching prior behavior.
    """
    # Accepted spellings for each identifier family.
    pmid_types = {"pmid", "pubmed", "pubmed id", "pubmed identifier"}
    doi_types = {"doi", "digital object id", "digital object identifier"}

    pdf_array = []

    def _collect(src):
        # Record a resolved URL once; report the reason when unresolved.
        if src.url is None:
            print(src.reason)
        elif src.url not in pdf_array:
            pdf_array.append(src.url)

    for ident in ref.identifiers:
        ident_type = ident["identifier_type"].lower()
        if ident_type in pmid_types:
            _collect(FindIt(ident["identifier"]))
        elif ident_type in doi_types:
            _collect(FindIt(doi=ident["identifier"]))
    return pdf_array
def test_jama_dance(self):
    """JAMA: article with DOI but not free should still resolve to a URL."""
    doi_but_unfree = '26575068'
    #TODO re-examine ^^
    result = FindIt(doi_but_unfree)
    assert result.url is not None
def findit(pmid):
    """Endpoint: return FindIt results plus full article metadata as HTTP 200."""
    source = FindIt(pmid=pmid)
    payload = source.to_dict()
    payload['article'] = source.pma.to_dict()
    return HTTP200(payload)
def test_scielo_chula(self):
    """SciELO (Brazil) article should resolve to its scielo.br PDF."""
    expected = 'http://www.scielo.br/pdf/ag/v52n4/0004-2803-ag-52-04-00278.pdf'
    source = FindIt(26840468)
    assert source.url == expected
def test_jci_polka(self):
    # NOTE(review): this test has no assertions — as written it only checks
    # that constructing FindIt for this JCI article does not raise.
    # TODO: assert an expected source.url once the correct value is known.
    pmid = 26030226
    source = FindIt(pmid=pmid)
def test_skipping_cache(self):
    """cachedir=None must disable the cache yet still resolve a URL."""
    # Known-working, non-PMC pubmed ID.
    source = FindIt(pmid=26111251, cachedir=None)
    assert source._cache is None
    assert source.url is not None
    assert not source.reason
    # NOTE(review): this chunk appears to begin inside an `except IndexError:`
    # handler guarding something like `tablename = sys.argv[1]` — confirm
    # against the full file before relying on this indentation.
    print(
        'Supply text2gene table name containing PMID column as argument to this script.'
    )
    sys.exit()

entries = PubtatorDB().fetchall(
    'select distinct(PMID) from text2gene.{}'.format(tablename))
print()
print('%i PMIDs found in text2gene.%s' % (len(entries), tablename))
print()


def dmesg(pmid, msg):
    # Timestamped progress line: [pmid] <epoch-seconds> message
    print('[%s] <%i> %s' % (pmid, time.time(), msg))


for entry in entries:
    pmid = entry['PMID']
    dmesg(pmid, 'collecting')
    try:
        # verify=False: report the publisher URL without HEAD-checking it.
        src = FindIt(pmid, verify=False)
    except MetaPubError as error:
        dmesg(pmid, '%r' % error)
        continue
    if src.url:
        dmesg(src.pmid, src.url)
    else:
        dmesg(src.pmid, src.reason)
    # NOTE(review): this chunk appears to begin inside an `except IndexError:`
    # handler guarding something like `filename = sys.argv[1]` — confirm
    # against the full file before relying on this indentation.
    print("supply filename of PMID list as argument to this script")
    sys.exit()

# Matches a line that is nothing but digits (a plausible PMID).
re_pmid = re.compile('^\d+$')


def validate_pmid(pmid):
    # True when the stripped line is all digits.
    pmid = pmid.strip()
    if re_pmid.findall(pmid):
        return True
    else:
        return False


# Dedupe the input lines. NOTE(review): the file handle is never closed.
pmids = list(set(open(filename, 'r').readlines()))

for pmid in [item.strip() for item in pmids if validate_pmid(item)]:
    print(pmid)
    try:
        src = FindIt(pmid=pmid, debug=True)
        print('{src.pmid}\t{src.doi}\tScore: {src.doi_score}\t{src.pma.title}'.
              format(src=src))
        if src.url:
            print(src.url)
        else:
            print(src.reason)
    except Exception as error:
        print(error)
    print()
#DEBUG = True #### logging.getLogger("requests").setLevel(logging.WARNING) logging.getLogger("eutils").setLevel(logging.INFO) #### if __name__ == '__main__': try: filename = sys.argv[1] except IndexError: print( 'Supply a filename containing a list of PMIDs as argument to this script.' ) sys.exit() pmids = open(filename, 'r').readlines() for pmid in [item.strip() for item in pmids if item.strip() != '']: try: src = FindIt(pmid, retry_errors=True) except Exception as error: print(error) continue print(pmid, src.doi, src.pma.title) if src.url: print(" url: ", src.url) else: print(" reason: ", src.reason)
def test_jstage_dive(self):
    """J-STAGE article must resolve to its jstage.jst.go.jp PDF."""
    expected = 'https://www.jstage.jst.go.jp/article/yakushi/131/2/131_2_247/_pdf'
    source = FindIt(21297370)
    assert source.url == expected
def test_backup_url(self):
    """Primary and backup URLs should come from different hosts."""
    source = FindIt(18048598)  # from journal "Tobacco Control"
    primary, backup = source.url, source.backup_url
    assert 'europepmc.org' in primary
    assert 'bmj.com' in backup
article.mesh.get('qualifier_name', '')) if article.publication_types: print('\nPublication Type Information') for pt in list(article.publication_types.keys()): print('\t', pt, article.publication_types[pt]) if article.chemicals: print('\nChemical List') for DUI in list(article.chemicals.keys()): print('\t', DUI, article.chemicals[DUI]['substance_name']) if article.grants: print('\nGrant Information') for gr in grants: print('\t', gr) if article.history: print('\nArticle History') for hist in article.history: print('\t', hist, article.history[hist]) print('') print('FindIt results:') source = FindIt(pmid=pmid) print('\tdoi:', source.doi) print('\turl:', source.url) print('\tbackup:', source.backup_url) print('\treason:', source.reason)
def test_jama_dance(self):
    """JAMA: DOI is known but the article is not free — expect no URL."""
    doi_but_unfree = '26575068'
    result = FindIt(doi_but_unfree)
    assert result.url is None