Ejemplo n.º 1
0
def process_pdf_task(pub):
    """
    Process a publication's PDF and store a thumbnail preview of it.

    The task resolves the publication record, optionally searches for a
    PDF URL with ``FindIt`` (when ``pub['pub_pdf_url'] == 'searching'``),
    downloads the PDF, renders a thumbnail, uploads it to the
    ``pdf_thumbnails`` bucket and records the thumbnail name on the
    publication. When no PDF can be found or the thumbnail file is
    missing, ``pub_pdf_url`` is cleared instead.

    Args:
        pub: Mapping with the keys ``pub_pmid``, ``pub_pmc``,
            ``pub_arxiv``, ``pub_biorxiv``, ``pub_doi`` (the first truthy
            one, in that order, identifies the publication) and
            ``pub_pdf_url``.

    Raises:
        ValueError: If ``pub`` carries none of the identifier keys.
    """
    logger.info("STARTING TASK {}".format(pub))

    # Process pub IDs with prefixes
    if pub.get('pub_pmid'):
        pub_id = pub.get('pub_pmid')
    elif pub.get('pub_pmc'):
        pub_id = "PMC" + str(pub.get('pub_pmc'))
    elif pub.get('pub_arxiv'):
        pub_id = "ARXIV:" + pub.get('pub_arxiv')
    elif pub.get('pub_biorxiv'):
        pub_id = "BIORXIV:" + pub.get('pub_biorxiv')
    elif pub.get('pub_doi'):
        pub_id = pub.get('pub_doi')
    else:
        # Without this branch pub_id would be unbound below and the task
        # would die with an opaque UnboundLocalError.
        raise ValueError(
            "publication has no usable identifier: {}".format(pub))

    pub_item = get_publication(pub_id)

    pub_type, pub_id = id_type(pub_id)

    url = pub.get('pub_pdf_url')

    # Attempt to find the publication
    if url == 'searching':
        try:
            if pub_type == 'pmid':
                found_pdf = FindIt(pmid=pub['pub_pmid'])
            elif pub_type == 'doi':
                found_pdf = FindIt(doi=pub['pub_doi'])
            else:
                # Only pmid and doi lookups are attempted; any other id
                # type has nothing to search with.
                found_pdf = None

            if found_pdf is None:
                # Fall through to the "no PDF" branch below, which clears
                # pub_pdf_url and commits.
                url = None
            else:
                url = found_pdf.url
                pub_item.pub_pdf_url = url
                # Update status to indicate PDF found!
                db.session.commit()
        except MetaPubError:
            url = None

    if url:
        fname = download_pdf(url)
        sha1_fname = sha1_file(fname)
        thumbnail_fname = pdf_to_thumb(fname, sha1_fname)

        gs_client = google_storage()
        bucket = gs_client.get_bucket('pdf_thumbnails')
        thumbnail_url_fname = "{}.png".format(sha1_fname)
        thumbnail_obj = bucket.blob(thumbnail_url_fname)
        try:
            thumbnail_obj.upload_from_filename(thumbnail_fname)
            # Delete after upload
            os.remove(thumbnail_fname)

            # Update database - set thumbnail to sha1_fname
            logger.info("Stored thumbnail: " + thumbnail_url_fname)
            pub_item.pub_thumbnail = sha1_fname
        except FileNotFoundError:
            # The thumbnail file is missing; clear the PDF URL on the record.
            pub_item.pub_pdf_url = None
    else:
        pub_item.pub_pdf_url = None
    db.session.commit()
Ejemplo n.º 2
0
    def test_using_cache(self):
        """A cached FindIt lookup must yield the same URL as an uncached one."""
        sample_pmid = SAMPLE_PMIDS['nonembargoed'][0]

        # First lookup populates the cache.
        src = FindIt(pmid=sample_pmid)
        assert src.url is not None
        assert src._cache is not None

        # source from the same pmid. check that result is same as if we used no cache.
        cached_src = FindIt(pmid=sample_pmid)
        # cachedir=None disables the cache, so this lookup really is fresh;
        # previously both lookups were built identically and the comparison
        # could not detect a cache problem.
        fresh_src = FindIt(pmid=sample_pmid, cachedir=None)

        assert cached_src.url == fresh_src.url
Ejemplo n.º 3
0
    def test_aaas_tango(self):
        """Check FindIt URLs for one plain Science PMID and one form-gated PMID."""
        # Science article: the PDF URL is expected to resolve directly.
        science_pmid = '25678633'
        science_pdf = 'http://sciencemag.org/content/347/6223/695.full.pdf'
        assert FindIt(pmid=science_pmid).url == science_pdf

        # Sci Signal article requiring form negotiation.
        # Expected URL once forms are handled:
        #   http://stke.sciencemag.org/content/1/13/eg3.full.pdf
        # TODO: update this when the_aaas_tango knows how to navigate forms.
        form_gated_pmid = '18385036'
        assert FindIt(pmid=form_gated_pmid).url is None
Ejemplo n.º 4
0
    def test_pmc_twist(self):
        """Check the PMC ids FindIt reports for an embargoed and a non-embargoed PMID."""
        # TODO: get a new embargoed PMID
        # Science / pmc-release = Jan 2, 2016 / PMC4380271
        # (publisher PDF: http://sciencemag.org/content/347/6217/1258522.full.pdf)
        embargoed_src = FindIt(pmid='25554792')
        assert embargoed_src.pma.pmc == '4380271'
        # Currently disabled checks for the embargoed article:
        #   pma.history['pmc-release'] is not None
        #   url == the publisher PDF noted above

        # Saudi Pharm / pmc-release = None / PMC4475813
        open_src = FindIt(pmid='26106273')
        assert open_src.pma.pmc == '4475813'
        assert open_src.pma.history.get('pmc-release', None) is None
        print(open_src.url)
Ejemplo n.º 5
0
    def pdf_url(self):
        """Return the PDF URL for this article.

        GeneReviews entries are formatted from ``GENEREVIEWS_URL`` with the
        book accession id; everything else goes through a ``FindIt`` lookup
        that is created on first use and kept on ``self._pdf_src``.
        """
        is_genereviews = self.pma.journal.lower().startswith('genereviews')
        if is_genereviews:
            # TODO: make book_url a @property in metapub PubMedArticle
            return GENEREVIEWS_URL.format(bookid=self.pma.book_accession_id)

        # Reuse the stored lookup when there is one.
        if self._pdf_src:
            return self._pdf_src.url

        self._pdf_src = FindIt(self.pmid, verify=False)
        return self._pdf_src.url
def main():
    """Run FindIt over the sample PMIDs of every journal in ``jstage_journals``.

    Each result is printed as one line and passed to
    ``write_findit_result_to_csv``.
    """
    line_template = (
        '[{source.pma.journal}]\t{source.pmid}: {source.url} ({source.reason})'
    )

    for journal in jstage_journals:
        for pmid in get_sample_pmids_for_journal(journal):
            result = FindIt(pmid)
            print(line_template.format(source=result))
            write_findit_result_to_csv(result)
Ejemplo n.º 7
0
def main(start_pmid=0):
    """Run FindIt over the PMIDs listed in ``PMID_OUTPUT_FILENAME``.

    Each result is printed as one line and passed to
    ``write_findit_result_to_csv``.

    Args:
        start_pmid: When truthy, processing starts at the line equal to
            this PMID; otherwise it starts at the first PMID in the file.

    Raises:
        ValueError: If ``start_pmid`` is given but is not in the file.
    """
    # Close the file promptly and drop blank lines (e.g. the one produced
    # by a trailing newline), which are not valid PMIDs.
    with open(PMID_OUTPUT_FILENAME) as pmid_file:
        pmids = [line.strip() for line in pmid_file]
    pmids = [pmid for pmid in pmids if pmid]

    start = 0
    if start_pmid:
        wanted = str(start_pmid)
        # Match whole lines: a substring search could start in the middle
        # of a longer PMID, and a miss (-1) would slice off everything but
        # the last character of the file.
        if wanted not in pmids:
            raise ValueError(
                'start_pmid %s not found in %s' % (wanted, PMID_OUTPUT_FILENAME))
        start = pmids.index(wanted)

    for pmid in pmids[start:]:
        source = FindIt(pmid, verify=False)
        print('[{source.pma.journal}]\t{source.pmid}: {source.url} ({source.reason})'.format(source=source))
        write_findit_result_to_csv(source)
Ejemplo n.º 8
0
def print_article_for_pmid(pmid):
    """Print the title and the FindIt URL (or failure reason) for *pmid*.

    If the lookup raises, a short notice is printed and nothing else.
    """
    try:
        result = FindIt(pmid, verify=False)
    except Exception:
        print("Something's wrong with Gilligan's Island... %s" % pmid)
        return

    print('----- PMID: %s' % pmid)
    print(result.pma.title)
    # Show the URL when there is one, otherwise the reason it is missing.
    print(result.url if result.url else result.reason)
Ejemplo n.º 9
0
def get_pdf_url(ref):
    """Collect the distinct PDF URLs FindIt reports for *ref*'s identifiers.

    PubMed-type identifiers are looked up by PMID and DOI-type identifiers
    by DOI; every other identifier type is ignored. When a lookup has no
    URL its ``reason`` is printed instead.

    Returns:
        List of URLs in first-seen order, without duplicates.
    """
    pmid_labels = ("pmid", "pubmed", "pubmed id", "pubmed identifier")
    doi_labels = ("doi", "digital object id", "digital object identifier")

    found_urls = []
    for entry in ref.identifiers:
        label = entry["identifier_type"].lower()
        if label in pmid_labels:
            lookup = FindIt(entry["identifier"])
        elif label in doi_labels:
            lookup = FindIt(doi=entry["identifier"])
        else:
            continue

        if lookup.url is None:
            print(lookup.reason)
        elif lookup.url not in found_urls:
            found_urls.append(lookup.url)

    return found_urls
Ejemplo n.º 10
0
 def test_jama_dance(self):
     """FindIt is expected to produce a URL for this identifier."""
     # TODO re-examine this expectation
     unfree_id = '26575068'
     assert FindIt(unfree_id).url is not None
Ejemplo n.º 11
0
def findit(pmid):
    """Return an HTTP200 response with the FindIt result for *pmid*.

    The payload is ``FindIt(...).to_dict()`` with the article's own
    ``to_dict()`` stored under the ``'article'`` key.
    """
    result = FindIt(pmid=pmid)
    payload = result.to_dict()
    payload['article'] = result.pma.to_dict()
    return HTTP200(payload)
Ejemplo n.º 12
0
 def test_scielo_chula(self):
     """FindIt must resolve this PMID to the expected scielo.br PDF URL."""
     expected_pdf = 'http://www.scielo.br/pdf/ag/v52n4/0004-2803-ag-52-04-00278.pdf'
     assert FindIt(26840468).url == expected_pdf
Ejemplo n.º 13
0
 def test_jci_polka(self):
     """Build a FindIt lookup for a fixed PMID; only construction is exercised."""
     # No assertion follows: the test passes as long as FindIt does not raise.
     FindIt(pmid=26030226)
Ejemplo n.º 14
0
 def test_skipping_cache(self):
     """With cachedir=None, FindIt has no cache and still finds a URL."""
     # use a known working, non-PMC pubmed ID
     uncached = FindIt(pmid=26111251, cachedir=None)
     assert uncached._cache is None
     assert uncached.url is not None
     assert not uncached.reason
Ejemplo n.º 15
0
    print(
        'Supply text2gene table name containing PMID column as argument to this script.'
    )
    sys.exit()

# Fetch the distinct PMIDs from the requested text2gene table.
query = 'select distinct(PMID) from text2gene.{}'.format(tablename)
entries = PubtatorDB().fetchall(query)

# Summary line, set off by blank lines above and below.
print()
print('%i PMIDs found in text2gene.%s' % (len(entries), tablename))
print()


def dmesg(pmid, msg):
    """Print *msg* prefixed with *pmid* and the current time as whole seconds."""
    stamp = time.time()
    line = '[%s] <%i> %s' % (pmid, stamp, msg)
    print(line)


# Look up every PMID and log either its URL or the reason none was found.
for entry in entries:
    pmid = entry['PMID']
    dmesg(pmid, 'collecting')
    try:
        src = FindIt(pmid, verify=False)
    except MetaPubError as error:
        # Log the failure and move on to the next PMID.
        dmesg(pmid, '%r' % error)
    else:
        dmesg(src.pmid, src.url if src.url else src.reason)
Ejemplo n.º 16
0
    print("supply filename of PMID list as argument to this script")
    sys.exit()

# Raw string: in a plain literal "\d" is an invalid string escape, which
# newer Python versions warn about.
re_pmid = re.compile(r'^\d+$')


def validate_pmid(pmid):
    """Return True if *pmid*, ignoring surrounding whitespace, is only digits.

    Args:
        pmid: Candidate PMID string, e.g. a raw line read from a file.

    Returns:
        bool: True when the stripped string is one or more digits.
    """
    return bool(re_pmid.findall(pmid.strip()))


# Read the PMID list with a context manager so the file is closed, and
# strip each line BEFORE de-duplicating: otherwise "123\n" and a final
# "123" without a newline count as different entries and are looked up twice.
with open(filename, 'r') as pmid_file:
    pmids = list({line.strip() for line in pmid_file})

for pmid in [item for item in pmids if validate_pmid(item)]:
    print(pmid)
    try:
        src = FindIt(pmid=pmid, debug=True)
        print('{src.pmid}\t{src.doi}\tScore: {src.doi_score}\t{src.pma.title}'.
              format(src=src))
        if src.url:
            print(src.url)
        else:
            print(src.reason)
    except Exception as error:
        # Best-effort script: report the failure and continue with the next PMID.
        print(error)

    print()
#DEBUG = True

####
# Set the log levels of the "requests" and "eutils" loggers.
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("eutils").setLevel(logging.INFO)
####

if __name__ == '__main__':
    try:
        filename = sys.argv[1]
    except IndexError:
        print(
            'Supply a filename containing a list of PMIDs as argument to this script.'
        )
        sys.exit()

    # Use a context manager so the file handle is closed once the lines
    # are read, instead of being left open for the whole run.
    with open(filename, 'r') as pmid_file:
        pmids = [line.strip() for line in pmid_file]

    # Skip blank lines.
    for pmid in [item for item in pmids if item != '']:
        try:
            src = FindIt(pmid, retry_errors=True)
        except Exception as error:
            # Best-effort script: report the failure and go on to the next PMID.
            print(error)
            continue

        print(pmid, src.doi, src.pma.title)
        if src.url:
            print("     url: ", src.url)
        else:
            print("     reason: ", src.reason)
Ejemplo n.º 18
0
 def test_jstage_dive(self):
     """FindIt must resolve this PMID to the expected jstage PDF URL."""
     expected_pdf = 'https://www.jstage.jst.go.jp/article/yakushi/131/2/131_2_247/_pdf'
     assert FindIt(21297370).url == expected_pdf
Ejemplo n.º 19
0
 def test_backup_url(self):
     """The primary URL points at europepmc.org and the backup at bmj.com."""
     # from journal "Tobacco Control"
     lookup = FindIt(18048598)
     primary, fallback = lookup.url, lookup.backup_url
     assert 'europepmc.org' in primary
     assert 'bmj.com' in fallback
          article.mesh.get('qualifier_name', ''))

# Print each optional section of the article only when it has content.
if article.publication_types:
    print('\nPublication Type Information')
    for pt, pt_value in article.publication_types.items():
        print('\t', pt, pt_value)

if article.chemicals:
    print('\nChemical List')
    for DUI, chemical in article.chemicals.items():
        print('\t', DUI, chemical['substance_name'])

if article.grants:
    print('\nGrant Information')
    # Iterate the article's own grants: the guard above tests
    # article.grants, and a bare ``grants`` name is not set anywhere in the
    # visible part of this script.
    for gr in article.grants:
        print('\t', gr)

if article.history:
    print('\nArticle History')
    for hist in article.history:
        print('\t', hist, article.history[hist])

print('')

# Show where FindIt locates the full text for the same PMID.
print('FindIt results:')
source = FindIt(pmid=pmid)
print('\tdoi:', source.doi)
print('\turl:', source.url)
print('\tbackup:', source.backup_url)
print('\treason:', source.reason)
Ejemplo n.º 21
0
 def test_jama_dance(self):
     """FindIt is expected to find no URL for this identifier."""
     unfree_id = '26575068'
     assert FindIt(unfree_id).url is None