def __init__(self, argv, ip_addr, baseline_type="OS", base_dir="/opt/apache-tomcat-8.5.35"):
    self.parse_argv(argv)
    session = HTMLSession()
    session.mount('file://', FileAdapter())
    # Windows uses backslashes as path separators, but get() expects forward
    # slashes, so replace them first.
    pwd = os.getcwd().replace("\\", "/")
    # Testing showed the file cannot be read via a relative path; an absolute
    # path is required.
    baseline_type = baseline_type.lower()
    self.ip_addr = ip_addr
    self.baseline_type = baseline_type
    self.base_dir = base_dir
    # ip_reg = "(\d{1,3}\.{1}){3}\d{1,3}"
    # full_reg = f"{ip_reg}_{baseline_type}\.html"
    # pwd_file_list = os.listdir()
    # for file in pwd_file_list:
    #     if re.search(full_reg, file):
    #         ip_addr = re.search(ip_reg, file).group()
    self.html_obj = session.get(
        f'file:///{pwd}/../4_report/{ip_addr}_{baseline_type}_report.html')
    self.shell_script_obj = open(
        f"../6_fix/{ip_addr}_{baseline_type}_fix.sh",
        "w+", encoding='utf-8', newline='\n')
    self.fix_item_list = {}
def _init_session(self):
    sess = HTMLSession()
    adapter = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
    sess.mount('http://', adapter)
    sess.mount('https://', adapter)
    return sess
def parse_home(self):
    session = HTMLSession()
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))
    print("parsing: " + self.homepage + "/read-the-story/")
    try:
        with session.get(self.homepage + "/read-the-story/", timeout=(5, 10)) as buf:
            chapters = buf.html.find('#chapters', first=True)
            if chapters is None:
                return
            chapter_list = chapters.find('.chapter__box')
            for chapter in chapter_list:
                url = chapter.links.pop()
                name = re.sub(r'Chapter [\d]*', '', chapter.full_text.strip())
                name = name.strip()
                index = re.search(r'Chapter [\d]*', chapter.full_text.strip())
                index = index.group()
                chapter = BookChapter(name, index, url)
                self.chapters.append(chapter)
    except Exception as e:
        print(e)
    print("finish: " + self.homepage + "/read-the-story/")
    session.close()
def parse_chapters(self):
    session = HTMLSession()
    session.mount('http://', HTTPAdapter(max_retries=3))
    session.mount('https://', HTTPAdapter(max_retries=3))
    for chapter in self.chapters:
        html = PAGES_DIR + os.sep + self.name + os.sep + chapter.index + ".html"
        if os.path.exists(html):
            continue
        self.parse_chapter(session, chapter)
        time.sleep(5)
    session.close()
class MensaBase(object):

    def __init__(self, endpoints, location):
        """Constructor."""
        self.location = location
        # dict of language-specific endpoints: { Language : url-string }
        self.endpoints = endpoints
        adapter = CacheControlAdapter(heuristic=ExpiresAfter(days=1))
        self.session = HTMLSession()
        self.session.mount('https://', adapter)

    def retrieve(self, datum=None, language=None, meals=None, emojize=None) -> Plan:
        # overwrite this
        # TODO how to make design more pythonic?
        # In Java terms: abstract class -> two implementation classes
        pass

    # Helper method to make a language-specific request
    def do_request(self, language=Language.DE):
        resp = self.session.get(self.endpoints[language.name])
        code = resp.status_code
        if code != 200:
            logger.warning(f'Non-200 status: {code}')
        logger.debug(f'Status Code: {code}')
        return resp.html

    @staticmethod
    def _normalize_key(k: str) -> str:
        return None if not k else k.strip().lower().replace(' ', '_')

    @staticmethod
    def _strip_additives(text: str) -> str:
        return re.sub(r'\((\s*(\d+)?[a-z]?[,.]?\s*)+\)', '', text)

    @staticmethod
    def _normalize_whitespace(text: str) -> str:
        return re.sub(r'\s{2,}', ' ', text)

    @staticmethod
    def _normalize_orthography(text: str) -> str:
        return re.sub(r'\s,', ',', text)

    @staticmethod
    def _clean_text(text: str) -> str:
        return MensaBase._normalize_orthography(
            MensaBase._normalize_whitespace(
                MensaBase._strip_additives(text.strip())))

    @staticmethod
    def _text_replace(text: str) -> str:
        return re.sub('Züricher', 'Zürcher', text)
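# A minimal sketch of the caching behaviour the constructor above relies on:
# with ExpiresAfter(days=1), a repeated GET within a day is answered from the
# local cache instead of hitting the server again. The URL below is an
# illustrative assumption, not one of the real Mensa endpoints.
from cachecontrol import CacheControlAdapter
from cachecontrol.heuristics import ExpiresAfter
from requests_html import HTMLSession

sess = HTMLSession()
sess.mount('https://', CacheControlAdapter(heuristic=ExpiresAfter(days=1)))
first = sess.get('https://example.org/mensa/plan')   # network request, response cached
second = sess.get('https://example.org/mensa/plan')  # served from the cache
print(getattr(second, 'from_cache', None))           # attribute set by CacheControl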
def get_signature(user_id):
    """Fetch the required signature information.

    @param: user_id
    @return: signature
    """
    session = HTMLSession()
    signature_url = 'file://' + os.getcwd() + os.sep + 'signature.html?user_id=' + str(user_id)
    session.mount("file://", LocalFileAdapter())
    r = session.get(signature_url, headers=MOBIE_HEADERS)
    r.html.render()
    sign = r.html.find('#signature', first=True)
    return sign.text
def session(self) -> HTMLSession:
    """
    Often when using a third-party API you want to verify that the returned
    response is indeed valid. Requests offers the shorthand helper
    raise_for_status(), which asserts that the response HTTP status code is
    not a 4xx or a 5xx.
    """
    session = HTMLSession()
    adapter = HTTPAdapter(max_retries=self.retry_strategy)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    assert_status_hook = (
        lambda response, *args, **kwargs: response.raise_for_status()
    )
    # The requests library offers a 'hooks' interface where you can attach
    # callbacks to certain parts of the request process.
    session.hooks["response"] = [assert_status_hook]
    return session
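# The property above assumes a `self.retry_strategy`; a plausible definition
# is a urllib3 Retry object like the one below. The parameter values are
# illustrative assumptions, not the original author's configuration.
from urllib3.util.retry import Retry

retry_strategy = Retry(
    total=3,                                     # give up after three retries
    backoff_factor=1,                            # exponential back-off between attempts
    status_forcelist=[429, 500, 502, 503, 504],  # retry only on these status codes
)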
def get_report_values() -> tuple:
    """Extract the values from the test report, used as parameters for the
    statistics in the email."""
    session = HTMLSession()
    session.mount('file://', FileAdapter())
    filepath = (os.path.join(setting.BASE_DIR, setting.REPORT, 'Report.html')).replace("\\", "/")
    html_obj = session.get(f'file:///{filepath}')
    test_pass_pattern = re.findall(
        r'"testPass": \d+,', html_obj.html.text)[0].split(':')[1].replace(',', '')
    test_all_pattern = re.findall(
        r'"testAll": \d+,', html_obj.html.text)[0].split(':')[1].replace(',', '')
    test_fail_pattern = re.findall(
        r'"testFail": \d+,', html_obj.html.text)[0].split(':')[1].replace(',', '')
    test_skip_pattern = re.findall(
        r'"testSkip": \d+,', html_obj.html.text)[0].split(':')[1].replace(',', '')
    return test_all_pattern, test_pass_pattern, test_fail_pattern, test_skip_pattern
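# Usage sketch for get_report_values(); the summary line is ours, not from
# the original project.
total, passed, failed, skipped = get_report_values()
print(f"total={total} passed={passed} failed={failed} skipped={skipped}")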
def main():
    login_session = login(username, password)
    code, msg = report(login_session)
    code_run, msg_run = runway(login_session)
    nameservers = ['8.8.8.8', '210.39.39.153']
    session = HTMLSession()
    session.mount('http://', CustomAdapter(nameservers))
    session.mount('https://', CustomAdapter(nameservers))
    if server_chan_enable == 1:
        session.get('https://sctapi.ftqq.com/' + sckey + '.send',
                    params={'text': str(msg) + str(msg_run)}, timeout=5)
    if telegram_bot_enable == 1:
        session.get('https://api.telegram.org/bot' + telegram_bot_token + '/sendMessage',
                    params={
                        'chat_id': telegram_chat_id,
                        'text': str(msg) + str(msg_run)
                    }, timeout=5)
    print(str(msg) + str(msg_run))
# Please refer to `https://api.fanyi.baidu.com/doc/21` for the complete API document
import json
import random
from hashlib import md5

# import requests
# Basic configuration
from loguru import logger
from requests.adapters import HTTPAdapter
from requests_html import HTMLSession

# ua = UserAgent(use_cache_server=False)
# ua = UserAgent(verify_ssl=False)
session = HTMLSession()
session.mount('http://', HTTPAdapter(max_retries=3))
session.mount('https://', HTTPAdapter(max_retries=3))
session.keep_alive = False

# Set your own appid/appkey.
appid = '20190508000295298'
appkey = 'SpZnEM6HliTHK1Mlp96I'

# For a list of language codes, please refer to `https://api.fanyi.baidu.com/doc/21`
from_lang = 'en'
to_lang = 'zh'

endpoint = 'http://api.fanyi.baidu.com'
path = '/api/trans/vip/translate'
url = endpoint + path
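# A sketch of the signed request these settings feed into. The helper name
# `translate` is ours; the signing scheme (md5 of appid + query + salt +
# appkey) and the form fields follow the Baidu API document linked above.
def translate(query: str) -> dict:
    salt = random.randint(32768, 65536)
    sign = md5((appid + query + str(salt) + appkey).encode('utf-8')).hexdigest()
    payload = {'appid': appid, 'q': query, 'from': from_lang,
               'to': to_lang, 'salt': salt, 'sign': sign}
    r = session.post(url, params=payload,
                     headers={'Content-Type': 'application/x-www-form-urlencoded'})
    return r.json()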
def __init__(self, **kwargs):
    '''
    Base class for common scraping tasks

    Args:

    '''
    logging.getLogger(__name__).addHandler(logging.NullHandler())
    self.urls = []

    # use requests-HTML to aid parsing
    # has all the same methods as requests.Session
    _s = HTMLSession()

    # delay/expire
    if kwargs.get('delay'):
        self.delay = kwargs['delay']
    else:
        self.delay = 2
    if kwargs.get('expire_hours'):
        self.expire_hours = kwargs['expire_hours']
    else:
        self.expire_hours = 168

    # add cookies
    if kwargs.get('cookies'):
        _s.cookies = kwargs['cookies']
    else:
        try:
            import cookielib
            _s.cookies = cookielib.MozillaCookieJar()
        except (NameError, ImportError):
            import http.cookiejar
            _s.cookies = http.cookiejar.MozillaCookieJar()

    # add headers
    if kwargs.get('headers'):
        _s.headers = kwargs['headers']
    else:
        ua = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36')
        _s.headers = {'User-Agent': ua}

    # add proxies
    if kwargs.get('proxies'):
        _s.proxies = kwargs['proxies']

    # add cache
    if '/' not in kwargs.get('cache_name', ''):
        self.cache_name = os.path.join('/tmp', kwargs['cache_name'])
    try:
        from cachecontrol import CacheControlAdapter
        from cachecontrol.heuristics import ExpiresAfter
        from cachecontrol.caches import FileCache
        _s.mount('http://',
                 CacheControlAdapter(cache=FileCache(self.cache_name),
                                     cache_etags=False,
                                     heuristic=ExpiresAfter(hours=self.expire_hours)))
    except ImportError:
        try:
            import requests_cache
            requests_cache.install_cache(self.cache_name)
        except Exception:
            logging.exception('could not install cache')
    self.s = _s
class Worker(multiprocessing.Process):

    def __init__(self, unvisited_urls_queue, fetched_urls_queue, result_queue, counter, config):
        multiprocessing.Process.__init__(self)
        self.unvisited_urls_queue = unvisited_urls_queue
        self.fetched_urls_queue = fetched_urls_queue
        self.result_queue = result_queue
        self.counter = counter
        self.config = config
        self.kwargs = config["kwargs"]
        self.session = HTMLSession()
        a = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
        self.session.mount("http://", a)
        self.session.mount("https://", a)

    def get_url_type(self, url, resp):
        for include_snippet in self.config["include"]:
            if include_snippet in url:
                content_type = resp.headers.get('Content-Type', None)
                if content_type and "text/html" in content_type:
                    url_type = 'recursive'
                else:
                    url_type = 'static'
                return url_type
        return "external"

    def check_url_info(self, url):
        for exclude_snippet in self.config["exclude"]:
            if exclude_snippet in url:
                status_code = None
                url_type = "exclude"
                return (status_code, url_type)
        try:
            resp = self.session.head(url, **self.kwargs)
            status_code = resp.status_code
            url_type = self.get_url_type(url, resp)
        except exceptions.ConnectTimeout as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectTimeout"
            url_type = None
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'WARNING')
            status_code = "ConnectionError"
            url_type = None
        return (status_code, url_type)

    def get_hyper_links(self, url):
        # session.browser
        status_code = None
        hyper_links = set()
        try:
            resp = self.session.get(url, **self.kwargs)
            status_code = resp.status_code
        except exceptions.ConnectionError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
            status_code = "ConnectionError"
            # no response to render; returning early avoids an UnboundLocalError
            return (status_code, hyper_links)
        try:
            resp.html.render(sleep=1, timeout=30)
            hyper_links = resp.html.absolute_links
        except lxml.etree.ParserError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except UnicodeDecodeError as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        except MaxRetries as ex:
            color_logging(f"{url}: {str(ex)}", 'ERROR')
        return (status_code, hyper_links)

    def run(self):
        while True:
            unvisited_url = self.unvisited_urls_queue.get()
            if unvisited_url is None:
                # Poison pill means shutdown
                color_logging(f'{self.name}: Exiting')
                self.unvisited_urls_queue.task_done()
                break

            start_time = time.time()
            status_code, url_type = self.check_url_info(unvisited_url)
            method = "HEAD"
            if url_type in ["exclude"]:
                color_logging(f"skip url: {unvisited_url}", color="blue")
                self.unvisited_urls_queue.task_done()
                continue
            if url_type in ['static', 'external']:
                hyper_links = set()
            elif url_type in ['recursive']:
                method = "GET & Render"
                status_code, hyper_links = self.get_hyper_links(unvisited_url)
            else:
                # url_type is None
                # TODO: raise exception
                hyper_links = set()

            duration_time = time.time() - start_time
            result = (unvisited_url, status_code, duration_time, hyper_links)
            self.result_queue.put(result)
            for link in hyper_links:
                self.fetched_urls_queue.put(link)
            self.unvisited_urls_queue.task_done()
            self.counter.value += 1
            color_logging(f"index: {self.counter.value}, {method} {unvisited_url}, "
                          f"status_code: {status_code}, duration_time: {duration_time}, "
                          f"worker: {self.name}", color="white")
def __init__(self, **kwargs):
    """ """
    logging.getLogger(__name__).addHandler(logging.NullHandler())
    self.urls = []

    # use requests-HTML to aid parsing
    # has all the same methods as requests.Session
    _s = HTMLSession()
    self.delay = kwargs.get("delay", 2)
    self.expire_hours = kwargs.get("expire_hours", 168)

    # add cookies
    if kwargs.get("cookies"):
        _s.cookies = kwargs["cookies"]
    else:
        import http.cookiejar
        _s.cookies = http.cookiejar.MozillaCookieJar()

    # add headers
    default_headers = {
        "User-Agent": random.choice(USER_AGENTS),
        "accept-encoding": "gzip, deflate, br",
        "accept-language": "en-US,en;q=0.9",
        "accept": "application/json, text/plain, */*",
    }
    _s.headers.update(default_headers)
    if kwargs.get("headers"):
        _s.headers.update(kwargs["headers"])

    # add proxies
    if kwargs.get("proxies"):
        _s.proxies = kwargs["proxies"]

    # add cache
    if not kwargs.get("cache_name"):
        self.cache_name = os.path.join("/tmp", random_string(32))
    elif "/" not in kwargs.get("cache_name", ""):
        self.cache_name = os.path.join("/tmp", kwargs["cache_name"])
    else:
        self.cache_name = kwargs.get("cache_name")
    try:
        from cachecontrol import CacheControlAdapter
        from cachecontrol.heuristics import ExpiresAfter
        from cachecontrol.caches import FileCache

        _s.mount(
            "http://",
            CacheControlAdapter(
                cache=FileCache(self.cache_name),
                cache_etags=False,
                heuristic=ExpiresAfter(hours=self.expire_hours),
            ),
        )
    except ImportError:
        try:
            import requests_cache
            requests_cache.install_cache(self.cache_name)
        except BaseException:
            logging.exception("could not install cache")
    self.session = _s
import os

from bs4 import BeautifulSoup
from requests_html import HTMLSession
from requests.adapters import HTTPAdapter

SITE_URL = 'https://itpanda.net'

session = HTMLSession()
session.mount(SITE_URL, HTTPAdapter(max_retries=5))

PROXIES_SERVER = os.environ.get('PROXIES_SERVER')
PROXIES = {'http': PROXIES_SERVER, 'https': PROXIES_SERVER}


class Spider:

    def get_markdown(self):
        r = session.get(SITE_URL, proxies=PROXIES)
        nav = r.html.find('ul.nav', first=True)
        soup = BeautifulSoup(nav.html, features="lxml")
        ul = soup.find('ul')
        items = ul.find_all('li', recursive=False)
        print('# IT eBOOK')
        for item in items:
            sub_items = item.ul.find_all('li')
            cate_title = item.a.string.split('(')[0].strip()
            cate_link = f'{SITE_URL}{item.a["href"]}'
            print(f'## {cate_title}')
            for sub in sub_items:
                sub_cate_title = sub.a.string.split('(')[0].strip()
# fb_post_id = row[8]
# csv_writer.writerow([fb_post_id])

# multithread for each post id, request the story url and extract
start_time = datetime.now()
log_format = '%(relativeCreated)8d %(threadName)4s %(message)s'
logging.basicConfig(level=logging.DEBUG, format=log_format)
file_handler = logging.FileHandler('logs/converter.txt', 'w')
file_handler.setLevel(logging.DEBUG)
file_handler.setFormatter(logging.Formatter(log_format))
logger = logging.getLogger()
logger.addHandler(file_handler)
threading.current_thread().name = 'M'

ses = HTMLSession()
ses.mount('https://', HTTPAdapter(pool_maxsize=2000))
base_url = 'https://m.facebook.com/story.php?story_fbid=%s&id=695707917166339'
user_agent = ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
              "Gecko/20100101 Firefox/86.0")
default_headers = {
    'User-Agent': user_agent,
    'Accept-Language': 'en-US,en;q=0.5'
}
ses.headers.update(default_headers)
# prev_data = ses.get('https://github.com/davidchoo12/nuswhispers-analysis/releases/latest/download/data-converted.csv').text
# with open('data-converted.csv', 'w') as fd:
#     fd.write(prev_data)
# last_no = 0
# with open('data-converted.csv', 'r') as fd:
#     csv_reader = csv.reader(fd)
import time
from requests_html import HTMLSession
from requests import adapters
from tqdm import tqdm
import filtering
import multiprocessing as mp
import os
import json

session = HTMLSession()
adapter = adapters.HTTPAdapter(pool_connections=100, pool_maxsize=100)
session.mount("https://", adapter)

url = "https://dbpedia.org/sparql"
# !!! must be modified when running on other servers
proxy = {"http": "http://127.0.0.1:10809", "https": "http://127.0.0.1:10809"}

with open("meaningless_predicates.txt", "r", encoding="utf-8") as fmp:
    bad_predicates = fmp.readline()


# position
# 0: query by the subject
# 1: query by the predicate
# 2: query by the object
# 3: query by the subject and the predicate
# 4: query by the predicate and the object
def crawl(query: tuple, position: int, lock: mp.Lock = None,
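# A minimal sketch of how the session above can query the SPARQL endpoint;
# the example query and the JSON result format are illustrative assumptions
# (Virtuoso-backed endpoints such as DBpedia accept a `format` parameter).
sparql = ('SELECT ?p ?o WHERE { '
          '<http://dbpedia.org/resource/Python_(programming_language)> ?p ?o '
          '} LIMIT 10')
resp = session.get(url,
                   params={"query": sparql,
                           "format": "application/sparql-results+json"},
                   proxies=proxy)
for binding in resp.json()["results"]["bindings"]:
    print(binding["p"]["value"], binding["o"]["value"])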
import os

from requests_html import HTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join((os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return session.get(url)


def test_file_get():
    r = get()
    assert r.status_code == 200


def test_css_selector():
    r = get()
    about = r.html.find('#about', first=True)
    for menu_item in (
        'About', 'Applications', 'Quotes', 'Getting Started', 'Help', 'Python Brochure'
    ):
        assert menu_item in about.text.split('\n')
import os
from functools import partial

import pytest
import psutil
from pyppeteer.browser import Browser
from pyppeteer.page import Page
from requests_html import HTMLSession, AsyncHTMLSession, HTML
from requests_file import FileAdapter

session = HTMLSession()
session.mount('file://', FileAdapter())


def get():
    path = os.path.sep.join(
        (os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
    return session.get(url)


@pytest.fixture
def async_get(event_loop):
    """ AsyncSession cannot be created global since it will create
        a different loop from pytest-asyncio. """
    async_session = AsyncHTMLSession()
    async_session.mount('file://', FileAdapter())
    path = os.path.sep.join(
        (os.path.dirname(os.path.abspath(__file__)), 'python.html'))
    url = 'file://{}'.format(path)
class KaniRequests(object):

    def __init__(self, headers={}, proxy={}, default_timeout=None, max_retries=3):
        # Build a TimeoutSauce subclass that falls back to default_timeout
        # whenever no explicit connect/read timeout is given.
        def __init__(self, *args, **kwargs):
            if kwargs["connect"] is None:
                kwargs["connect"] = default_timeout
            if kwargs["read"] is None:
                kwargs["read"] = default_timeout
            return TimeoutSauce.__init__(self, *args, **kwargs)

        DefaultTimeout = type("DefaultTimeout", (TimeoutSauce,), {"__init__": __init__})

        self.headers = headers
        self.proxy = proxy
        self.session = HTMLSession()
        self.session.headers.update(headers)
        if proxy != {}:
            self.session.proxies = proxy
        # self.session.verify = os.path.join(os.path.dirname(__file__), "FiddlerRoot.pem")
        self.session.verify = None
        self.adapters = requests.adapters.HTTPAdapter(max_retries=max_retries)
        self.adapters.TimeoutSauce = DefaultTimeout
        requests.adapters.TimeoutSauce = DefaultTimeout
        self.session.mount("http://", self.adapters)
        self.session.mount("https://", self.adapters)

        self.yag = None
        self.mail_to = None
        self.subject = None
        self.log = logging.getLogger(self.__class__.__name__)

    def set_error_mailer(self, yag, mail_to, subject):
        self.yag = yag
        self.mail_to = mail_to
        self.subject = subject

    def mount(self, prefix, adapters):
        self.session.mount(prefix, adapters)

    def get(self, url, *args, **kwargs):
        try:
            kwargs["cookies"] = self.session.cookies
            result = self.session.get(url, *args, **kwargs)
            if self.yag is not None:
                if result.status_code != 200:
                    status_code = result.status_code
                    body = f"status_code is not 200 on Get {url=} {args=} {kwargs=}\n"
                    body += f"{status_code=}"
                    self.yag.send(
                        to=self.mail_to,
                        subject=self.subject,
                        contents=body,
                    )
                    self.log.error(
                        "Sending error email because of status_code=%s.", status_code)
            return result
        except Exception as e:
            if self.yag is not None:
                body = f"Error on Get {url=} {args=} {kwargs=}"
                body += "\n[sys.exc_info]\n"
                body += str(sys.exc_info())
                body += "\n[traceback.format_exc]\n"
                body += traceback.format_exc()
                self.yag.send(
                    to=self.mail_to,
                    subject=self.subject,
                    contents=body,
                )
            self.log.error("Sending error email because of Exception=%s.", e)
            raise

    def post(self, url, *args, **kwargs):
        try:
            kwargs["cookies"] = self.session.cookies
            result = self.session.post(url, *args, **kwargs)
            if self.yag is not None:
                if result.status_code != 200:
                    status_code = result.status_code
                    body = f"status_code is not 200 on Post {url=} {args=} {kwargs=}\n"
                    body += f"{status_code=}"
                    self.yag.send(
                        to=self.mail_to,
                        subject=self.subject,
                        contents=body,
                    )
                    self.log.error(
                        "Sending error email because of status_code=%s.", status_code)
            return result
        except Exception as e:
            if self.yag is not None:
                body = f"Error on Post {url=} {args=} {kwargs=}\n"
                body += "\n[sys.exc_info]\n"
                body += str(sys.exc_info())
                body += "\n[traceback.format_exc]\n"
                body += traceback.format_exc()
                self.yag.send(
                    to=self.mail_to,
                    subject=self.subject,
                    contents=body,
                )
            self.log.error("Sending error email because of Exception=%s.", e)
            raise

    def put(self, url, *args, **kwargs):
        kwargs["cookies"] = self.session.cookies
        return self.session.put(url, *args, **kwargs)

    def delete(self, url, *args, **kwargs):
        kwargs["cookies"] = self.session.cookies
        return self.session.delete(url, *args, **kwargs)

    def close(self):
        self.session.close()

    def cookies_to_dict(self):
        return dict_from_cookiejar(self.session.cookies)

    def add_cookies(self, cookies):
        add_dict_to_cookiejar(self.session.cookies, cookies)
def _get_page_posts(path, pages=10, timeout=5, sleep=0, credentials=None,
                    extra_info=False, begin_url=None, max_retries=5):
    """Gets posts for a given account."""
    global _session, _timeout
    _session = HTMLSession()
    _session.headers.update(_headers)
    a = HTTPAdapter(max_retries=max_retries)
    b = HTTPAdapter(max_retries=max_retries)
    _session.mount('http://', a)
    _session.mount('https://', b)
    if credentials:
        _login_user(*credentials)
    _timeout = timeout
    html = None
    cursor_blob = None
    if begin_url:
        try:
            response = _session.get(begin_url, timeout=timeout)
            response.raise_for_status()
            data = json.loads(response.text.replace('for (;;);', '', 1))
        except (RequestException, ValueError):
            return
        for action in data['payload']['actions']:
            if action['cmd'] == 'replace':
                html = HTML(html=action['html'], url=_base_url)
            elif action['cmd'] == 'script':
                cursor_blob = action['code']
        if not html:
            html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
        if not cursor_blob:
            cursor_blob = html.html
    else:
        url = f'{_base_url}/{path}'
        response = _session.get(url, timeout=_timeout)
        html = HTML(html=response.html.html.replace('<!--', '').replace('-->', ''))
        cursor_blob = html.html

    cursor = None
    next_url = None
    try:
        while True:
            for article in html.find('article'):
                try:
                    post = _extract_post(article)
                    if extra_info:
                        post = fetch_share_and_reactions(post)
                    yield post
                except Exception:
                    print(traceback.format_exc())
                    print("But continuing...")

            pages -= 1
            if pages <= 0:
                return

            cursor = _find_cursor(cursor_blob)
            if not cursor:
                return
            next_url = f'{_base_url}{cursor}'

            if sleep:
                time.sleep(sleep)

            try:
                response = _session.get(next_url, timeout=timeout)
                response.raise_for_status()
                data = json.loads(response.text.replace('for (;;);', '', 1))
            except (RequestException, ValueError):
                raise

            for action in data['payload']['actions']:
                if action['cmd'] == 'replace':
                    html = HTML(html=action['html'], url=_base_url)
                elif action['cmd'] == 'script':
                    cursor_blob = action['code']
    except Exception:
        print(f"Current url: {next_url}")
        print("-----------------------------------")
        print(f"Current page (total-cur_page): {pages}")
        print("-----------------------------------")
        print("Traceback:")
        print(traceback.format_exc())
        raise
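# Usage sketch: _get_page_posts() is a generator, so posts stream in as
# pages are fetched. The page path here is an illustrative assumption.
for post in _get_page_posts('somepage', pages=2, sleep=1):
    print(post)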
def download_resource(self):
    text = ''  # default to empty string

    # Was this file already downloaded?
    if len(self.content_type) >= 1:
        print("ALREADY DOWNLOADED.")
        return self.request_dict

    # Is the file cached locally?
    # Does this already exist in the database? ***********************************
    file_dict = get_from_cache(self.url_protocol_removed())
    if len(file_dict['text']) > 0:
        self.request_dict = file_dict
        return file_dict['text']

    # --------- Download file from the internet -------------
    try:
        self.increment_num_downloads()
        error = ''
        url = self.url
        # Use a User-Agent header to simulate what a Firefox user would see
        # session = requests.Session()
        session = HTMLSession()
        retry = Retry(connect=5, backoff_factor=0.5)
        adapter = HTTPAdapter(max_retries=retry)
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        try:
            r = session.get(url, headers=HEADERS, verify=False)
        # Invalid URL
        except requests.exceptions.MissingSchema:
            return {
                'text': '',      # unicode
                'unicode': '',
                'content': '',   # raw
                'encoding': '',
                'error': "Connection refused",
                'language': '',
                'content_type': ''
            }
        except requests.exceptions.ConnectionError:
            # r.status_code = "Connection refused"
            return {
                'text': '',       # unicode
                'unicode': url,
                'content': url,   # raw
                'encoding': '',
                'error': "Connection refused",
                'language': '',
                'content_type': ''
            }

        print('Downloaded ' + url)
        self.request_stop = datetime.now()
        print("Encoding: %s" % r.encoding)
        print("num downloads: " + str(self.num_downloads))

        # Correct the character encoding
        if url in self.url_encoding_hardcoded():
            hardcoded_encoding = self.url_encoding_hardcoded()[url]
            r.encoding = hardcoded_encoding

        text = r.text
        self.unicode = r.text
        self.content = r.content
        self.encoding = r.encoding
        self.error = error
        # https://www.geeksforgeeks.org/detect-an-unknown-language-using-python/
        self.language = detect(r.text)
        if 'Content-Type' in r.headers.keys():
            self.content_type = r.headers['Content-Type']
        else:
            self.content_type = 'application/html'
            # TODO: Research why Content-Type is not always set
            print(r.headers)
        print('Content-Type: ' + self.content_type)
        print('Language: ' + self.language)
        print('Length: ' + str(len(self.content)))
        print("Attempting to save .. ")
        print(self.unicode)

        ####### Archive a copy of the original file ########
        doc_type = self.doc_type()
        print("DocType::: " + doc_type)
        if doc_type == 'pdf':
            text = r.content  # file contents
        if settings.SAVE_DOWNLOADS_TO_FILE:
            write_format = 'w'
            local_filename = self.filename_original()
            remote_path = ''.join(["archive/", self.canonical_url()])
            content_type = self.content_type
            if not content_type:
                content_type = 'text/html'

            # Create the directory if it doesn't exist
            dirname = os.path.dirname(local_filename)
            if not os.path.exists(dirname):
                os.makedirs(dirname)

            if content_type == 'application/pdf':
                text = self.content
                write_format = 'wb'

            # Archive file
            print("ARCHIVE: local filename----------------------------------")
            print(local_filename)
            my_file = Path(local_filename)
            if my_file.is_file():
                pass
            else:
                try:
                    with open(local_filename, write_format) as f:
                        f.write(text)
                except IsADirectoryError:
                    pass

            # Add a file extension if there is none
            filename, file_extension = os.path.splitext(remote_path)
            if not ((file_extension == '.html') or (file_extension == '.htm')):
                remote_path = os.path.splitext(remote_path)[0] + 'index.html'
                print("Remote path:")
                print(remote_path)
            else:
                remote_path = remote_path + '.' + doc_type

            save_file_to_cloud(local_filename, remote_path, content_type, 'gzip')
            print("Saved original: " + self.filename_original())

        print('Content-Type: ' + self.content_type)
        print('Language: ' + self.language)
        print('Length: ' + str(len(r.content)))
    except requests.HTTPError:
        self.request_stop = datetime.now()
        """ TODO: Add better error tracking """
        error = "document: HTTPError"

    self.request_dict = {
        'text': text,              # unicode
        'unicode': self.unicode,
        'content': self.content,   # raw
        'encoding': self.encoding,
        'error': self.error,
        'language': self.language,
        'content_type': self.content_type
    }
    # SAVE DB: TODO *********************************
    return self.request_dict
def retry_session(url):
    session = HTMLSession()
    session.mount(url, HTTPAdapter(max_retries=5))
    return session
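# Quick usage sketch for retry_session(); the URL is an illustrative
# assumption. Because the adapter is mounted on the full URL prefix, only
# requests under that prefix get the retrying behaviour.
session = retry_session('https://example.com')
resp = session.get('https://example.com/some/page')
print(resp.status_code)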