def remove_watermarks(pdfcontent):
    """
    Use pdfparanoia to remove watermarks from the pdf.
    """
    log.debug("Removing pdf watermarks.")
    pdfcontent = pdfparanoia.scrub(StringIO(pdfcontent))
    return pdfcontent
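# --- module-level names assumed by the functions below ---
# This is a minimal sketch, not the canonical definitions. The imports are the
# ones these functions actually use; the literal values are recovered from
# older inlined revisions of the same functions; nullLog is inferred from how
# download_url uses its default _log argument. Other helpers referenced below
# (parse_html, find_citation_pdf_url, find_citation_title, filter_fix,
# fix_ieee_login_urls, fix_jstor_pdf_urls, paperbot_download_request, log,
# logchannel) are defined elsewhere in the codebase and are not sketched here.

import os
import re
import sys
import json
import random
import urllib
import traceback
from StringIO import StringIO  # python 2

import requests
import pdfparanoia

import modules.scihub

ARCHIVE_DIR = "/home/bryan/public_html/papers2/paperbot/"
ARCHIVE_BASE = "http://diyhpl.us/~bryan/papers2/paperbot/"

LIBGEN_FORM = "http://libgen.org/scimag/librarian/form.php"

USER_AGENT = ("Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 "
              "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11")

URL_REGEX = 'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'

# per-publisher User-Agent disguises
HEADERS_DEFENSE = {"User-Agent": "pdf-defense-force"}
HEADERS_TEAPOT = {"User-Agent": "pdf-teapot"}
HEADERS_TM_1 = {"User-Agent": "time-machine/1.0"}
HEADERS_TM_11 = {"User-Agent": "time-machine/1.1"}
HEADERS_TM_2 = {"User-Agent": "time-machine/2.0"}


def nullLog(msg):
    # no-op default logger
    pass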
def download_url(url, _log=nullLog, **kwargs):
    paperbot_download_request_obj = paperbot_download_request()
    paperbot_download_request_obj._log = _log
    response_generator = paperbot_download_request_obj.get(
        url, use_generator=True, headers={"User-Agent": "origami-pdf"})

    cc = 0
    for response in response_generator:
        _log('using generator for %s time' % cc)
        cc += 1

        paperbot_download_request_obj2 = paperbot_download_request()
        paperbot_download_request_obj2._log = _log

        content = response.content
        # response = requests.get(url, headers={"User-Agent": "origami-pdf"}, **kwargs)
        # content = response.content

        # just make up a default filename
        title = "%0.2x" % random.getrandbits(128)

        # default extension
        extension = ".txt"

        if "pdf" in response.headers["content-type"]:
            extension = ".pdf"
        elif check_if_html(response):
            # parse the html string with lxml.etree
            tree = parse_html(content)

            # extract some metadata with xpaths
            citation_pdf_url = find_citation_pdf_url(tree, url)
            citation_title = find_citation_title(tree)

            # aip.org sucks, citation_pdf_url is wrong
            if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
                citation_pdf_url = None

            if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
                content = requests.get(citation_pdf_url).content
                tree = parse_html(content)
                # citation_title = ...

            # wow, this seriously needs to be cleaned up
            if citation_pdf_url and citation_title and \
                    "ieeexplore.ieee.org" not in citation_pdf_url:
                citation_title = citation_title.encode("ascii", "ignore")
                response = requests.get(citation_pdf_url, headers=HEADERS_DEFENSE)
                content = response.content
                if "pdf" in response.headers["content-type"]:
                    extension = ".pdf"
                    title = citation_title
            else:
                if "sciencedirect.com" in url and "ShoppingCart" not in url:
                    _log('download_url got a sciencedirect URL')
                    try:
                        try:
                            title_xpath = "//h1[@class='svTitle']"
                            title = tree.xpath(title_xpath)[0].text
                            pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                        except IndexError:
                            title = tree.xpath("//title")[0].text
                            pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                        if 'http' not in pdf_url:
                            main_url_split = response.url.split('//')
                            http_prefix = main_url_split[0]
                            if 'http' in http_prefix:
                                domain_url = main_url_split[1].split('/')[0]
                                slash = '/' if pdf_url[0] != '/' else ''
                                pdf_url = http_prefix + '//' + domain_url + slash + pdf_url
                        gen = paperbot_download_request_obj2.get(
                            pdf_url,
                            use_generator=False,
                            headers={"User-Agent": "sdf-macross"})
                        # this is stupidly ugly
                        for genresponse in gen:
                            new_response, extension = genresponse
                            new_content = new_response.content
                            _log('paperbot_download_request_obj2 content-type: %s'
                                 % new_response.headers["content-type"])
                            if "pdf" in new_response.headers["content-type"]:
                                extension = ".pdf"
                                break
                    except Exception:
                        _log(traceback.format_exc())
                    else:
                        content = new_content
                        response = new_response
                elif "jstor.org/" in url:
                    # clean up the url
                    if "?" in url:
                        url = url[0:url.find("?")]

                    # not all pages have the <input type="hidden" name="ppv-title"> element
                    try:
                        title = tree.xpath("//div[@class='hd title']")[0].text
                    except Exception:
                        try:
                            input_xpath = "//input[@name='ppv-title']/@value"
                            title = tree.xpath(input_xpath)[0]
                        except Exception:
                            pass

                    # get the document id
                    document_id = None
                    if url[-1] != "/":
                        # if "stable/" in url:
                        # elif "discover/" in url:
                        # elif "action/showShelf?candidate=" in url:
                        # elif "pss/" in url:
                        document_id = url.split("/")[-1]

                    # guard against document_id being None when the url ends in "/"
                    if document_id and document_id.isdigit():
                        try:
                            pdf_url = make_jstor_url(document_id)
                            new_response = requests.get(pdf_url, headers=HEADERS_TM_11)
                            new_content = new_response.content
                            if "pdf" in new_response.headers["content-type"]:
                                extension = ".pdf"
                        except Exception:
                            pass
                        else:
                            content = new_content
                            response = new_response
                elif ".aip.org/" in url:
                    try:
                        title = tree.xpath("//title/text()")[0].split(" | ")[0]
                        pdf_url = [link for link in tree.xpath("//a/@href")
                                   if "getpdf" in link][0]
                        new_response = requests.get(pdf_url, headers=HEADERS_TM_1)
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "ieeexplore.ieee.org" in url:
                    try:
                        # use a distinct loop variable; the original comprehension
                        # reused (and, in python 2, clobbered) `url`
                        pdf_url = [frame_url for frame_url in tree.xpath("//frame/@src")
                                   if "pdf" in frame_url][0]
                        new_response = requests.get(pdf_url, headers=HEADERS_TM_2)
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
                elif "h1 class=\"articleTitle" in content:
                    try:
                        title_xpath = "//h1[@class='articleTitle']"
                        title = tree.xpath(title_xpath)[0].text
                        title = title.encode("ascii", "ignore")
                        url_xpath = "//a[@title='View the Full Text PDF']/@href"
                        pdf_url = tree.xpath(url_xpath)[0]
                    except Exception:
                        pass
                    else:
                        if pdf_url.startswith("/"):
                            url_start = url[:url.find("/", 8)]
                            pdf_url = url_start + pdf_url
                        response = requests.get(pdf_url, headers=HEADERS_TEAPOT)
                        content = response.content
                        if "pdf" in response.headers["content-type"]:
                            extension = ".pdf"
                # raise Exception("problem with citation_pdf_url or citation_title")
                # well, at least save the contents from the original url
                pass

        # make the title again just in case
        if not title:
            title = "%0.2x" % random.getrandbits(128)

        # can't create directories
        title = title.replace("/", "_")

        path = os.path.join(ARCHIVE_DIR, title + extension)

        if extension in [".pdf", "pdf"]:
            try:
                content = pdfparanoia.scrub(StringIO(content))
            except:
                # this is to avoid a PDFNotImplementedError
                pass

        file_handler = open(path, "w")
        file_handler.write(content)
        file_handler.close()

        title = title.encode("ascii", "ignore")
        url = ARCHIVE_BASE + requests.utils.quote(title) + extension

        return url
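# download_url relies on make_jstor_url and check_if_html, which are not
# defined in this section. Minimal sketches follow: the JSTOR url template is
# recovered from an older inlined revision of the jstor branch, while
# check_if_html is an assumption based only on how it is called, not the
# canonical implementation.

def make_jstor_url(document_id):
    # pdfplus endpoint with the terms-and-conditions auto-accept flag
    return ("http://www.jstor.org/stable/pdfplus/"
            + document_id + ".pdf?acceptTC=true")


def check_if_html(response):
    # assumed: treat the response as html when the server's content-type says so
    return "html" in response.headers.get("content-type", "")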
def download(phenny, input, verbose=True):
    """
    Downloads a paper.
    """
    if logchannel:
        _log = lambda x: phenny.msg("#%s" % logchannel, x)
    else:
        _log = lambda x: None

    # only accept requests in a channel
    if not input.sender.startswith('#'):
        # unless the user is an admin, of course
        if not input.admin:
            phenny.say("i only take requests in the ##hplusroadmap channel.")
            return
        else:
            # just give a warning message to the admin.. not a big deal.
            phenny.say(
                "okay i'll try, but please send me requests in ##hplusroadmap in the future."
            )

    # get the input
    line = input.group()

    # was this an explicit command?
    explicit = False
    if line.startswith(phenny.nick):
        explicit = True
        line = line[len(phenny.nick):]

        if line.startswith(",") or line.startswith(":"):
            line = line[1:]

        if line.startswith(" "):
            line = line.strip()

    # don't bother if there's nothing there
    if len(line) < 5 or ("http://" not in line and "https://" not in line) or \
            not line.startswith("http"):
        return

    for line in re.findall(URL_REGEX, line):
        # fix an UnboundLocalError problem
        shurl = None

        line = filter_fix(line)

        # fix for login.jsp links to ieee xplore
        line = fix_ieee_login_urls(line)
        line = fix_jstor_pdf_urls(line)

        translation_url = "http://localhost:1969/web"

        headers = {
            "Content-Type": "application/json",
        }

        data = {"url": line, "sessionid": "what"}
        data = json.dumps(data)

        response = requests.post(translation_url, data=data, headers=headers)

        if response.status_code == 200 and response.content != "[]":
            # see if there are any attachments
            content = json.loads(response.content)
            item = content[0]
            title = item["title"]

            if "DOI" in item:
                _log("Translator DOI")
                lgre = requests.post(LIBGEN_FORM, data={"doi": item["DOI"]})
                tree = parse_html(lgre.content)
                if tree.xpath("//h1")[0].text != "No file selected":
                    phenny.say("http://libgen.info/scimag/get.php?doi=%s"
                               % urllib.quote_plus(item["DOI"]))
                    return

            if "attachments" in item:
                pdf_url = None
                for attachment in item["attachments"]:
                    if "mimeType" in attachment and \
                            "application/pdf" in attachment["mimeType"]:
                        pdf_url = attachment["url"]
                        break

                if pdf_url:
                    user_agent = USER_AGENT

                    paperbot_download_request_obj = paperbot_download_request()
                    paperbot_download_request_obj._log = _log
                    gen = paperbot_download_request_obj.get(
                        pdf_url, use_generator=False, headers=headers)
                    # this is stupidly ugly
                    for genresponse in gen:
                        response, extension = genresponse

                        # detect failure
                        if response.status_code != 200:
                            shurl, _ = modules.scihub.scihubber(pdf_url)
                            if shurl:
                                if "libgen" in shurl:
                                    phenny.say(
                                        "http://libgen.info/scimag/get.php?doi=%s"
                                        % urllib.quote_plus(item["DOI"]))
                                elif "pdfcache" not in shurl:
                                    phenny.say(shurl)
                                else:
                                    pdfstr = modules.scihub.scihub_dl(shurl)
                                    phenny.say(
                                        modules.scihub.libgen(pdfstr, item["DOI"]))
                            return

                        data = response.content

                        if "pdf" in response.headers["content-type"]:
                            try:
                                data = pdfparanoia.scrub(StringIO(data))
                                try:
                                    _log('after pdfparanoia.scrub')
                                    requests.get(
                                        'http://localhost:8500/remoteprint',
                                        headers={'msg': 'after pdfparanoia.scrub'})
                                except:
                                    pass
                                break
                            except:
                                # this is to avoid a PDFNotImplementedError
                                pass

                    if "DOI" in item:
                        phenny.say(modules.scihub.libgen(data, item["DOI"]))
                        return

                    # grr..
                    title = title.encode("ascii", "ignore")

                    path = os.path.join(ARCHIVE_DIR, title + ".pdf")
                    file_handler = open(path, "w")
                    file_handler.write(data)
                    file_handler.close()

                    filename = requests.utils.quote(title)

                    # Remove an ending period, which sometimes happens when the
                    # title of the paper has a period at the end.
                    if filename[-1] == ".":
                        filename = filename[:-1]

                    url = "http://diyhpl.us/~bryan/papers2/paperbot/" + filename + ".pdf"
                    phenny.say(url)
                    continue
                elif verbose and explicit:
                    _log("Translation server PDF fail")
                    shurl, doi = modules.scihub.scihubber(line)
                    continue
            elif verbose and explicit:
                _log("Translation server PDF fail")
                shurl, doi = modules.scihub.scihubber(line)
                phenny.say(download_url(line, _log))
                continue
        elif verbose and explicit:
            _log("Translation server fail")
            shurl, doi = modules.scihub.scihubber(line)
            _log("Scihubber -> (%s, %s)" % (shurl, doi))
            if shurl:
                if "pdfcache" in shurl:
                    if doi:
                        pdfstr = modules.scihub.scihub_dl(shurl)
                        phenny.say(modules.scihub.libgen(pdfstr, doi))
                    else:
                        phenny.say(download_url(shurl, _log,
                                                cookies=modules.scihub.shcookie))
                else:
                    phenny.say(shurl)
            elif verbose and explicit:
                _log("All approaches failed")
                phenny.say(download_url(line, _log))

    return
def download_url(url, proxy, last_resort=False):
    sys.stderr.write("attempting direct for %s through %s\n" % (url, proxy))
    session = requests.Session()
    session.proxies = {'http': proxy, 'https': proxy}
    try:
        response = session.get(url, headers={"User-Agent": "origami-pdf"})
    except requests.exceptions.ConnectionError:
        sys.stderr.write("network failure on download " + str(url) + "\n")
        return 1
    content = response.content

    # just make up a default filename
    title = "%0.2x" % random.getrandbits(128)

    # default extension
    extension = ".txt"

    if "pdf" in response.headers["content-type"]:
        extension = ".pdf"
    elif check_if_html(response):
        # parse the html string with lxml.etree
        tree = parse_html(content)

        # extract some metadata with xpaths
        citation_pdf_url = find_citation_pdf_url(tree, url)
        citation_title = find_citation_title(tree)

        # aip.org sucks, citation_pdf_url is wrong
        if citation_pdf_url and "link.aip.org/" in citation_pdf_url:
            citation_pdf_url = None

        if citation_pdf_url and "ieeexplore.ieee.org" in citation_pdf_url:
            content = session.get(citation_pdf_url).content
            tree = parse_html(content)
            # citation_title = ...

        # wow, this seriously needs to be cleaned up
        if citation_pdf_url and citation_title and \
                "ieeexplore.ieee.org" not in citation_pdf_url:
            citation_title = citation_title.encode("ascii", "ignore")
            response = session.get(citation_pdf_url,
                                   headers={"User-Agent": "pdf-defense-force"})
            content = response.content
            if "pdf" in response.headers["content-type"]:
                extension = ".pdf"
                title = citation_title
        else:
            if "sciencedirect.com" in url and "ShoppingCart" not in url:
                try:
                    title = tree.xpath("//h1[@class='svTitle']")[0].text
                    pdf_url = tree.xpath("//a[@id='pdfLink']/@href")[0]
                    new_response = session.get(pdf_url,
                                               headers={"User-Agent": "sdf-macross"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "jstor.org/" in url:
                # clean up the url
                if "?" in url:
                    url = url[0:url.find("?")]

                # not all pages have the <input type="hidden" name="ppv-title"> element
                try:
                    title = tree.xpath("//div[@class='hd title']")[0].text
                except Exception:
                    try:
                        title = tree.xpath("//input[@name='ppv-title']/@value")[0]
                    except Exception:
                        pass

                # get the document id
                document_id = None
                if url[-1] != "/":
                    # if "stable/" in url:
                    # elif "discover/" in url:
                    # elif "action/showShelf?candidate=" in url:
                    # elif "pss/" in url:
                    document_id = url.split("/")[-1]

                # guard against document_id being None when the url ends in "/"
                if document_id and document_id.isdigit():
                    try:
                        pdf_url = ("http://www.jstor.org/stable/pdfplus/"
                                   + document_id + ".pdf?acceptTC=true")
                        new_response = session.get(pdf_url,
                                                   headers={"User-Agent": "time-machine/1.1"})
                        new_content = new_response.content
                        if "pdf" in new_response.headers["content-type"]:
                            extension = ".pdf"
                    except Exception:
                        pass
                    else:
                        content = new_content
                        response = new_response
            elif ".aip.org/" in url:
                try:
                    title = tree.xpath("//title/text()")[0].split(" | ")[0]
                    pdf_url = [link for link in tree.xpath("//a/@href")
                               if "getpdf" in link][0]
                    new_response = session.get(pdf_url,
                                               headers={"User-Agent": "time-machine/1.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "ieeexplore.ieee.org" in url:
                try:
                    # use a distinct loop variable; the original comprehension
                    # reused (and, in python 2, clobbered) `url`
                    pdf_url = [frame_url for frame_url in tree.xpath("//frame/@src")
                               if "pdf" in frame_url][0]
                    new_response = session.get(pdf_url,
                                               headers={"User-Agent": "time-machine/2.0"})
                    new_content = new_response.content
                    if "pdf" in new_response.headers["content-type"]:
                        extension = ".pdf"
                except Exception:
                    pass
                else:
                    content = new_content
                    response = new_response
            elif "h1 class=\"articleTitle" in content:
                try:
                    title = tree.xpath("//h1[@class='articleTitle']")[0].text
                    title = title.encode("ascii", "ignore")
                    pdf_url = tree.xpath("//a[@title='View the Full Text PDF']/@href")[0]
                except Exception:
                    pass
                else:
                    if pdf_url.startswith("/"):
                        url_start = url[:url.find("/", 8)]
                        pdf_url = url_start + pdf_url
                    response = session.get(pdf_url, headers={"User-Agent": "pdf-teapot"})
                    content = response.content
                    if "pdf" in response.headers["content-type"]:
                        extension = ".pdf"
            # raise Exception("problem with citation_pdf_url or citation_title")
            # well, at least save the contents from the original url
            pass

    # make the title again just in case
    if not title:
        title = "%0.2x" % random.getrandbits(128)

    # can't create directories
    title = title.replace("/", "_")
    title = title.replace(" ", "_")
    title = title[:params.maxlen]

    path = os.path.join(params.folder, title + extension)

    if extension in [".pdf", "pdf"]:
        try:
            sys.stderr.write("got it! " + str(url) + "\n")
            content = pdfparanoia.scrub(StringIO(content))
        except:
            # this is to avoid a PDFNotImplementedError
            pass

    file_handler = open(path, "w")
    file_handler.write(content)
    file_handler.close()

    title = title.encode("ascii", "ignore")
    url = params.url + requests.utils.quote(title) + extension

    if extension in [".pdf", "pdf"]:
        print url
        return 0
    else:
        sys.stderr.write("couldn't find it, dump: %s\n" % url)
        if last_resort:
            print "couldn't find it, dump: %s" % url
        else:
            return 1
    return 0
def download_proxy(line, zotero, proxy, verbose=True):
    sys.stderr.write("trying to download %s through %s and %s\n"
                     % (line, zotero, proxy))
    headers = {
        "Content-Type": "application/json",
    }
    data = {"url": line, "sessionid": "what"}
    data = json.dumps(data)
    response = requests.post(zotero, data=data, headers=headers)
    if response.status_code != 200 or response.content == "[]":
        sys.stderr.write("no valid reply from zotero\n")
        sys.stderr.write("status %d\n" % response.status_code)
        sys.stderr.write("content %s\n" % response.content)
        return -1  # fatal
    sys.stderr.write("content %s\n" % response.content)

    # see if there are any attachments
    content = json.loads(response.content)
    item = content[0]
    title = item["title"]

    if "attachments" not in item:
        sys.stderr.write("no attachment with this proxy\n")
        return 1  # try another proxy

    pdf_url = None
    for attachment in item["attachments"]:
        if "mimeType" in attachment and \
                "application/pdf" in attachment["mimeType"]:
            pdf_url = attachment["url"]
            break

    if not pdf_url:
        sys.stderr.write("no PDF attachment with this proxy\n")
        return 1  # try another proxy

    user_agent = ("Mozilla/5.0 (X11; Linux i686 (x86_64)) AppleWebKit/536.11 "
                  "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11")
    headers = {
        "User-Agent": user_agent,
    }

    sys.stderr.write("try retrieving " + str(pdf_url)
                     + " through proxy " + proxy + "\n")
    response = None
    session = requests.Session()
    session.proxies = {'http': proxy, 'https': proxy}
    try:
        if pdf_url.startswith("https://"):
            response = session.get(pdf_url, headers=headers, verify=False)
        else:
            response = session.get(pdf_url, headers=headers)
    except requests.exceptions.ConnectionError:
        sys.stderr.write("network failure on download " + str(pdf_url) + "\n")
        return 1

    # detect failure
    if response.status_code == 401:
        sys.stderr.write("HTTP 401 unauthorized when trying to fetch "
                         + str(pdf_url) + "\n")
        return 1
    elif response.status_code != 200:
        sys.stderr.write("HTTP " + str(response.status_code)
                         + " when trying to fetch " + str(pdf_url) + "\n")
        return 1

    data = response.content

    if "pdf" in response.headers["content-type"]:
        try:
            data = pdfparanoia.scrub(StringIO(data))
        except:
            # this is to avoid a PDFNotImplementedError
            pass

    # grr..
    title = title.encode("ascii", "ignore")
    title = title.replace(" ", "_")
    title = title[:params.maxlen]

    path = os.path.join(params.folder, title + ".pdf")
    file_handler = open(path, "w")
    file_handler.write(data)
    file_handler.close()

    filename = requests.utils.quote(title)

    # Remove an ending period, which sometimes happens when the
    # title of the paper has a period at the end.
    if filename[-1] == ".":
        filename = filename[:-1]

    url = params.url + filename + ".pdf"
    print(url)
    return 0
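# The standalone variants above (download_url with a proxy argument and
# download_proxy) read their output settings from a module-level `params`
# object that is not defined in this section. A minimal sketch follows,
# assuming command-line configuration: the attribute names (folder, url,
# maxlen) come from the code above, but argparse and the default values are
# assumptions, not the real configuration mechanism.

import argparse

_parser = argparse.ArgumentParser(description="standalone paperbot fetcher")
_parser.add_argument("--folder", default="/tmp/papers",
                     help="directory where fetched PDFs are written")
_parser.add_argument("--url", default="http://example.com/papers/",
                     help="base url under which saved files are served")
_parser.add_argument("--maxlen", type=int, default=128,
                     help="maximum length of the sanitized title")
params = _parser.parse_args()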