Example #1
import random

import requests
from bs4 import BeautifulSoup

import setting
from material import Material


class Soup:
    def __init__(self):
        self.fail_count = 0
        self.m = Material()

    def getSoup(self, url, user_agents, proxies):
        """Fetch url through a random proxy and return (soup, proxies)."""
        headers = requests.utils.default_headers()
        # Spoof a random User-Agent for this request
        user_agent = random.choice(user_agents)
        headers.update({"User-Agent": user_agent})

        while True:
            # Rotate to a new random proxy on every attempt
            proxy = random.choice(proxies)

            try:
                session = requests.Session()
                # Initial GET to collect session cookies (sent without the proxy)
                cookies = dict(session.get(url).cookies)
                response = session.post(url,
                                        headers=headers,
                                        proxies={"https": proxy},
                                        timeout=(1.2, 5),
                                        cookies=cookies)

                # A non-200 response means the proxy was blocked or failed;
                # drop it from the pool and retry with another one
                if response.status_code != 200:
                    if proxy in proxies:
                        proxies.remove(proxy)
                    continue

                if setting.LXML_PARSER:
                    # Parse with lxml (faster, requires the lxml package)
                    soup = BeautifulSoup(response.content, "lxml")
                elif setting.HTML_PARSER:
                    # Parse with the built-in html.parser
                    soup = BeautifulSoup(response.content, "html.parser")
                else:
                    # Fall back to html.parser so soup is always bound
                    soup = BeautifulSoup(response.content, "html.parser")

                break

            except Exception:

                self.fail_count += 1

                # After setting.FAIL_COUNT consecutive failures, rebuild the proxy pool
                if self.fail_count >= setting.FAIL_COUNT:
                    proxies = self.m.getFreeProxies()
                    self.fail_count = 0  # reset so the pool is not re-fetched on every failure

        return soup, proxies
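
For reference, a minimal driver sketch for the class above; it assumes Material exposes getFreeProxies() and getUserAgents() returning non-empty lists, and the target URL is only a placeholder:

# Hypothetical usage of Soup.getSoup (the URL and Material helpers are assumptions)
from soup import Soup
from material import Material

m = Material()
proxies = m.getFreeProxies()      # assumed: list of "host:port" proxy strings
user_agents = m.getUserAgents()   # assumed: list of User-Agent header strings

s = Soup()
soup, proxies = s.getSoup("https://example.com", user_agents, proxies)
print(soup.title)  # parsed document; proxies may have been pruned or refreshed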
Example #2
#!/usr/bin/python3

from soup import Soup
from material import Material
from extractor import Extractor

import setting
import queue
import threading

# Fetch the proxies, user agents, and URLs
m = Material()
proxies = m.getFreeProxies()
user_agents = m.getUserAgents()
urls = m.getUrls()

# Put the URL list into a queue
urlQueue = queue.Queue()
for url in urls:
    urlQueue.put(url)


class myCrawlThread(threading.Thread):

    def run(self):

        while True:
            if urlQueue.empty():
                break
            else: