Example #1
from random import uniform
import time

import fake_useragent
from requests_html import HTMLSession


def fetch(url):
    data = ""
    r = ""
    # p = current_process()
    # if(p.name != 'MainProcess' and p._identity[0] and os.getpid()):
    #     print('process counter:', p._identity[0], 'pid:', os.getpid())
    asession = HTMLSession()
    asession.headers.update({'User-Agent': fake_useragent.UserAgent().random})
    asession.max_redirects = 60
    # Optional: route the request through a random proxy read from proxies.txt
    # proxy = { 'http': 'http://' + choice(read_file("proxies.txt","\n")) +'/' }
    # asession.proxies.update(proxy)
    # Random delay between requests to reduce the chance of rate limiting
    time.sleep(uniform(1, 6))
    try:
        r = asession.request('GET', url, allow_redirects=False)
    except Exception as e:
        print('Failed to get page %s. Reason: %s' % (url, e))
        asession.close()
        return data
    try:
        if r.status_code == 200:
            # Render JavaScript with the headless browser bundled with requests_html
            r.html.render(sleep=2, timeout=200)
            data = r.html
    except Exception as e:
        print('Failed to render page %s. Reason: %s' % (url, e))
    finally:
        asession.close()
    return data
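
The commented-out current_process lines suggest fetch() is meant to run inside multiprocessing workers. A minimal driver sketch under that assumption — the crawl() wrapper and the placeholder URLs are hypothetical, not part of the example:

from multiprocessing import Pool

def crawl(url):
    # Hypothetical wrapper: return only a picklable summary across the process boundary
    html = fetch(url)
    return url, bool(html)

if __name__ == '__main__':
    urls = ['https://example.com/a', 'https://example.com/b']  # placeholder URLs
    with Pool(2) as pool:
        for url, ok in pool.map(crawl, urls):
            print(url, 'fetched' if ok else 'failed')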
Example #2
from bs4 import BeautifulSoup
import logging
import re
import time
from urllib.parse import urlparse, parse_qs

import requests
import requests.exceptions as exceptions
from requests_html import HTMLSession

logging.basicConfig(
    format="%(asctime)s - [%(threadName)s]- [%(levelname)s] - %(message)s",
    level=logging.INFO)
log = logging.getLogger(__name__)

# Initiate HTML session; accessing .browser starts the headless browser up front
session = HTMLSession()
session.browser
session.max_redirects = 3

# Suppress urllib3 warnings (e.g. for unverified HTTPS requests)
requests.packages.urllib3.disable_warnings()


class Consumer(object):
    def __init__(self, url_queue, data_queue, result_queue=None, extract=True):
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 "
                "Safari/537.36"
            )
        }
        if not extract:
            self.data_queue = data_queue
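
The snippet cuts off inside Consumer.__init__, but the queue arguments and the %(threadName)s logging format point to a threaded producer/consumer layout. A rough wiring sketch under that assumption — everything here beyond Consumer, session, and log is hypothetical:

import queue
import threading

url_queue = queue.Queue()
data_queue = queue.Queue()
consumer = Consumer(url_queue, data_queue)

def worker():
    # Drain URLs from the queue and push raw page text onto the data queue
    while True:
        url = url_queue.get()
        try:
            resp = session.get(url, headers=consumer.headers, verify=False)
            data_queue.put((url, resp.text))
            log.info('fetched %s (%d bytes)', url, len(resp.text))
        finally:
            url_queue.task_done()

threading.Thread(target=worker, name='Consumer-1', daemon=True).start()
url_queue.put('https://example.com/')  # placeholder URL
url_queue.join()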