import logging
import os

import requests

log = logging.getLogger("paperbot")


def upload_to_libgen(paperpath, doi):
    """
    Store the paper on libgen.
    """
    # need to provide some credentials to libgen
    authfragment = build_libgen_auth_fragment()

    log.debug("Uploading to libgen doi {} path {}".format(doi, paperpath))

    # upload the pdf bytes (not the path string) as a multipart file field
    with open(paperpath, "rb") as pdffile:
        files = {"uploadedfile": ("derp.pdf", pdffile)}
        data = {"doi": doi}
        kwargs = {"auth": authfragment, "files": files, "data": data}
        response = requests.post(
            "http://libgen.org/scimag/librarian/form.php", **kwargs)

    # parse returned html
    tree = parse_html(response.content)

    # build dict with all named fields from html
    formp = dict(map(lambda x: (x.get("name"), x.get("value")),
                     tree.xpath("//input[@name]")))

    log.debug("Submitting form back to libgen.")
    # send the scraped form fields back in the query string
    response = requests.get(
        "http://libgen.org/scimag/librarian/register.php",
        params=formp, auth=authfragment)

    urldoi = make_libgen_doi_url(doi)
    log.debug("Completed libgen upload: {}".format(urldoi))
    return urldoi
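# build_libgen_auth_fragment and make_libgen_doi_url are defined elsewhere
# in paperbot. The sketches below are minimal stand-ins so this section
# reads self-contained; the environment-variable credential source and the
# exact scimag url pattern are assumptions, not confirmed project behavior.

try:
    from urllib.parse import quote  # python 3
except ImportError:
    from urllib import quote  # python 2


def build_libgen_auth_fragment():
    """
    Return a (username, password) tuple for requests' auth= kwarg.

    Sketch only: assumes credentials arrive via environment variables.
    """
    return (os.environ.get("LIBGEN_USERNAME", ""),
            os.environ.get("LIBGEN_PASSWORD", ""))


def make_libgen_doi_url(doi):
    """
    Build a libgen scimag url for a doi.

    Sketch only: the query-string format is an assumed url pattern.
    """
    return "http://libgen.org/scimag/?doi={}".format(quote(doi, safe=""))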
def download(url, paper=None):
    """
    Main entry point for executing paperbot's primary function, paper
    fetching.

    The given url may be to a pdf file, which should be archived, or it may
    be to an academic publisher's website which points to a paper. The paper
    needs to be downloaded and the metadata should be stored.

    Returns a tuple of (paper, json_path, pdf_path, logpath).

    :param url: url to fetch and examine
    :type url: str
    """
    # store logs in tempfile
    (templogpath, loghandler) = loghijack()

    if paper is None:
        paper = Paper.create({})

    # clean up url if necessary
    url = run_url_fixers(url)

    # whether or not metadata has already been populated
    populated_metadata = False

    for (url2, response) in iterdownload(url, paper=paper):
        if is_response_pdf(response):
            log.debug("Got pdf.")
            pdfcontent = remove_watermarks(response.content)
            paper.pdf = pdfcontent
            store(paper)
            break

        paper.html = response.content

        # Was not pdf. Attempt to parse the HTML based on normal expected
        # HTML elements. The HTML elements may say that the actual pdf url
        # is something else. If this happens, then attempt to download that
        # pdf url instead and then break out of this loop.

        # no reason to get same metadata on every iteration of loop
        if not populated_metadata:
            tree = parse_html(response.content)

            # most publishers expose paper metadata in html the same way
            populate_metadata_from_tree(tree, paper)

            # TODO: better way to check if populate_metadata_from_tree did
            # anything useful?
            if paper.title in [None, ""]:
                log.debug("# TODO: parse metadata from html using plugins here")
            else:
                populated_metadata = True

        # can't try anything else if the url is still bad
        if paper.pdf_url in [None, ""]:
            continue

        # Normalize the two urls. The url from the metadata on the page
        # might be different from the url that was originally passed in,
        # even though both urls might still refer to the same resource.
        if is_same_url(url, paper.pdf_url):
            # pdf_url is same as original url, no pdf found yet. This
            # happens when the pdf url is correct, but the publisher is
            # returning html instead. And the html happens to reference the
            # url that was originally requested in the first place. Argh.
            continue

        log.debug("Switching activity to pdf_url {}".format(paper.pdf_url))

        # paper pdf is stored at a different url. Attempt to fetch that
        # url now. Only do this if pdf_url != url because otherwise
        # this will be an endless loop.
        for (url3, response2) in iterdownload(paper.pdf_url, paper=paper):
            if is_response_pdf(response2):
                log.debug("Got pdf on second-level page.")
                pdfcontent = remove_watermarks(response2.content)
                paper.pdf = pdfcontent
                store(paper)
                break
        else:
            # for-else: the inner loop never found a pdf
            log.debug("Couldn't download pdf from {}".format(paper.pdf_url))
        break

    # was pdf downloaded?
    if (hasattr(paper, "pdf") and paper.pdf not in [None, ""]) or \
            os.path.exists(paper.file_path_pdf):
        fetched = True
    else:
        fetched = False

    hasdoi = (paper.doi not in [None, ""])

    if hasdoi:
        # check if libgen has this paper already
        libgenhas = check_libgen_has_paper(paper.doi)

        if fetched and not libgenhas:
            # upload if libgen doesn't already have it
            upload_to_libgen(paper.file_path_pdf, paper.doi)
        elif not fetched and libgenhas:
            # get from libgen
            urldoi = make_libgen_doi_url(paper.doi)
            log.debug("Haven't yet fetched paper. Have doi. Also, libgenhas.")
            log.debug("HTTP GET {}".format(urldoi))
            response = requests.get(urldoi, headers=DEFAULT_HEADERS)

            if is_response_pdf(response):
                log.debug("Got pdf from libgen.")
                # skip pdfparanoia because it's from libgen
                pdfcontent = response.content
                paper.pdf = pdfcontent
                store(paper)
                fetched = True
            else:
                log.debug("libgen lied about haspdf :(")
    else:
        log.debug("Don't know doi, can't check if libgen has this paper.")
        libgenhas = None

    # store(paper) usually handles json but in case of failure there needs to
    # be an explicit save of paper metadata.
    if not fetched:
        store_json(paper)

    # move logs into position
    logpath = store_logs(paper, templogpath)

    # remove loghandler from logger
    mainlogger = logging.getLogger("paperbot")
    mainlogger.handlers.remove(loghandler)

    return (paper, paper.file_path_json, paper.file_path_pdf, logpath)