def fetch(url):
    """Fetch *url* in a throwaway HTMLSession and return the rendered HTML.

    Returns the response's ``r.html`` object when the server answers 200
    and JavaScript rendering succeeds; returns an empty string ("") on any
    failure (request error, non-200 status, or render error).  The session
    is always closed, even on unexpected exceptions.
    """
    data = ""
    # Fresh session per call with a randomized User-Agent to reduce blocking.
    asession = HTMLSession()
    asession.headers.update({'User-Agent': fake_useragent.UserAgent().random})
    asession.max_redirects = 60
    # Random politeness delay so repeated calls don't hammer the host.
    time.sleep(uniform(1, 6))
    try:
        try:
            # Redirects are not followed: only a direct 200 is rendered.
            r = asession.request('GET', url, allow_redirects=False)
        except Exception as e:
            print('Failed to get page %s. Reason: %s' % (url, e))
            return data
        try:
            if r.status_code == 200:
                # Execute the page's JavaScript; generous timeout for slow pages.
                r.html.render(sleep = 2, timeout = 200)
                data = r.html
        except Exception as e:
            print('Failed to render page %s. Reason: %s' % (url, e))
        return data
    finally:
        # Single close point replaces the five per-branch close() calls and
        # guarantees the session (and any spawned browser) is released.
        asession.close()
# FIX: `logging` was used below (basicConfig/getLogger) but never imported,
# which raises NameError at import time.  Imports regrouped stdlib / third-party.
import logging
import re
import time
from urllib.parse import urlparse, parse_qs

import requests
import requests.exceptions as exceptions
from bs4 import BeautifulSoup

logging.basicConfig(
    format="%(asctime)s - [%(threadName)s]- [%(levelname)s] - %(message)s",
    level=logging.INFO)
log = logging.getLogger(__name__)

# Initiate HTML session
session = HTMLSession()
# Touching .browser eagerly initializes the headless browser backend
# (requests_html creates it lazily on first access).
session.browser
session.max_redirects = 3

# Supress warnings
requests.packages.urllib3.disable_warnings()


class Consumer(object):
    # NOTE(review): this class appears to continue beyond the visible chunk;
    # only the portion shown here is documented.
    def __init__(self, url_queue, data_queue, result_queue=None, extract=True):
        # Static fallback User-Agent used for plain requests.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) \
AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 \
Safari/537.36"
        }
        # When not extracting, this consumer drains raw pages from data_queue.
        if not extract:
            self.data_queue = data_queue