Example #1
0
    def getCaptchaAnswer(self, captchaType, url, siteKey, reCaptchaParams):
        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter("anticaptcha: Missing api_key parameter.")

        client = AnticaptchaClient(reCaptchaParams.get('api_key'))

        if reCaptchaParams.get('proxy'):
            client.session.proxies = reCaptchaParams.get('proxies')

        captchaMap = {
            'reCaptcha': NoCaptchaTaskProxylessTask,
            'hCaptcha': HCaptchaTaskProxyless
        }

        task = captchaMap[captchaType](url, siteKey)

        if not hasattr(client, 'createTaskSmee'):
            raise NotImplementedError(
                "Please upgrade 'python_anticaptcha' via pip or download it from "
                "https://github.com/ad-m/python-anticaptcha/tree/hcaptcha")

        job = client.createTaskSmee(task)

        try:
            job.join(maximum_time=180)
        except (AnticaptchaException) as e:
            raise reCaptchaTimeout('{}'.format(getattr(e, 'message', e)))

        if 'solution' in job._last_result:
            return job.get_solution_response()
        else:
            raise reCaptchaAPIError(
                'Job did not return `solution` key in payload.')
Example #2
0
def process(url):
    client = AnticaptchaClient(api_key)
    task = CustomCaptchaTask(
        imageUrl=url,
        assignment="Count the dots and indicate their color.",
        form=form)
    job = client.createTaskSmee(task)
    answer = job.get_answers()
    return answer["dot_count"], answer["dot_color"]
Example #3
0
    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        if not reCaptchaParams.get('api_key'):
            raise ValueError("reCaptcha provider 'anticaptcha' was not provided an 'api_key' parameter.")

        client = AnticaptchaClient(reCaptchaParams.get('api_key'))

        if reCaptchaParams.get('proxy'):
            client.session.proxies = reCaptchaParams.get('proxies')

        task = NoCaptchaTaskProxylessTask(site_url, site_key)

        if not hasattr(client, 'createTaskSmee'):
            sys.tracebacklimit = 0
            raise RuntimeError("Please upgrade 'python_anticaptcha' via pip or download it from https://github.com/ad-m/python-anticaptcha")

        job = client.createTaskSmee(task)
        return job.get_solution_response()
Example #4
0
    def getCaptchaAnswer(self, site_url, site_key, reCaptchaParams):
        if not reCaptchaParams.get('api_key'):
            raise reCaptchaParameter("anticaptcha: Missing api_key parameter.")

        client = AnticaptchaClient(reCaptchaParams.get('api_key'))

        if reCaptchaParams.get('proxy'):
            client.session.proxies = reCaptchaParams.get('proxies')

        task = NoCaptchaTaskProxylessTask(site_url, site_key)

        if not hasattr(client, 'createTaskSmee'):
            raise NotImplementedError(
                "Please upgrade 'python_anticaptcha' via pip or download it from "
                "https://github.com/ad-m/python-anticaptcha")

        job = client.createTaskSmee(task)
        return job.get_solution_response()
Example #5
0
    def getCaptchaAnswer(self, captchaType, url, siteKey, captchaParams):
        if not captchaParams.get('api_key'):
            raise CaptchaParameter("anticaptcha: Missing api_key parameter.")

        client = AnticaptchaClient(captchaParams.get('api_key'))

        if captchaParams.get('proxy') and not captchaParams.get('no_proxy'):
            captchaMap = {'reCaptcha': NoCaptchaTask, 'hCaptcha': HCaptchaTask}

            proxy = self.parseProxy(
                captchaParams.get('proxy', {}).get('https'),
                captchaParams.get('User-Agent', ''))

            task = captchaMap[captchaType](url, siteKey, **proxy)
        else:
            captchaMap = {
                'reCaptcha': NoCaptchaTaskProxylessTask,
                'hCaptcha': HCaptchaTaskProxyless
            }
            task = captchaMap[captchaType](url, siteKey)

        if not hasattr(client, 'createTaskSmee'):
            raise NotImplementedError(
                "Please upgrade 'python_anticaptcha' via pip or download it from "
                "https://github.com/ad-m/python-anticaptcha/tree/hcaptcha")

        job = client.createTaskSmee(task, timeout=180)

        try:
            job.join(maximum_time=180)
        except (AnticaptchaException) as e:
            raise CaptchaTimeout(f"{getattr(e, 'message', e)}")

        if 'solution' in job._last_result:
            return job.get_solution_response()
        else:
            raise CaptchaAPIError(
                'Job did not return `solution` key in payload.')
def process(path):
    captcha_fp = open(path, "rb")
    client = AnticaptchaClient(api_key)
    task = ImageToTextTask(captcha_fp)
    job = client.createTaskSmee(task)
    return job.get_captcha_text()
Example #7
0
class Scrapping:
    def __init__(self):
        self.domain = 'http://www.ceo.kerala.gov.in'
        self.base_url = self.domain + '/electoralrolls.html'
        # self.list_url = self.domain + '/electoralroll/partsListAjax.html?currentYear=2021&distNo=6&lacNo=60&sEcho=1&iColumns=5&sColumns=&iDisplayStart=0&iDisplayLength=500&iSortingCols=1&iSortCol_0=0&sSortDir_0=asc&bSortable_0=false&bSortable_1=false&bSortable_2=false&bSortable_3=false&bSortable_4=false&undefined=undefined'
        self.list_url = self.domain + '/electoralroll/partsListAjax.html?currentYear=2021&distNo={}&lacNo={}&sEcho=1&iColumns=5&sColumns=&iDisplayStart=0&iDisplayLength=500&iSortingCols=1&iSortCol_0=0&sSortDir_0=asc&bSortable_0=false&bSortable_1=false&bSortable_2=false&bSortable_3=false&bSortable_4=false&undefined=undefined'
        self.session = requests.Session()

        self.api_key = "e00923a245a141aabc38bf7031e7d35b"
        self.site_key_pattern = 'data-sitekey="(.+?)"'
        self.client = AnticaptchaClient(self.api_key)
        self.site_key = None
        self.task = None
        self.laclist = [
            5, 16, 19, 32, 48, 60, 73, 87, 92, 101, 110, 115, 126, 140
        ]

    def extract_url(self, s):
        match = re.search(r'href=[\'"]?([^\'" >]+)', s)
        if match:
            url = match.group(1)
            # print(url)
            return url
        return ""

    def start(self):
        lacNo = 25
        for i in range(0, 14):
            distNo = i + 1
            while lacNo <= self.laclist[i]:
                self.start_lac(distNo, lacNo)
                lacNo += 1
                if lacNo == 35:
                    return

    def start_lac(self, distNo, lacNo):
        self.save_path = "pdfs{}".format(lacNo)
        self.create_folder_if_not_exists(self.save_path)

        list_url = self.list_url.format(distNo, lacNo)
        r = self.session.get(list_url)

        print(self.session.cookies.get_dict())
        # self.cookie = "PHPSESSID={}; rufc=false".format(self.session.cookies.get_dict()['PHPSESSID'])
        self.cookie = ""
        print(self.cookie)
        booths = json.loads(r.content)
        # soup=BeautifulSoup(html,'lxml')
        # print(booths)
        nbooths = len(booths["aaData"])
        print(nbooths)
        # self.token = soup.find('input', {'id':'token'}).get('value')
        # print(self.token)
        cnt = 0
        for i in range(cnt, nbooths):
            booth = booths["aaData"][i]
            self.pdf_url = self.extract_url(booth[4])
            print(cnt, self.pdf_url)
            self.download_pdf()
            cnt += 1

    def create_folder_if_not_exists(self, foldername):
        if not os.path.exists(foldername):
            os.makedirs(foldername)

    def download_pdf(self):
        # self.pdf_url = "http://www.ceo.kerala.gov.in:80/pdf/voterslist/2021/FINAL/KLA2021/AC060/S11A60P1.pdf"

        # token = self.solve_captcha1()
        # if token == None:
        #     print("Solving Captcha Failed")
        #     return False
        # print("Token = {}".format(token))

        html = self.session.get(self.pdf_url).text
        soup = BeautifulSoup(html, 'lxml')
        self.pdf_download_url = soup.find('form', {
            'id': 'eRollDownloadFormId'
        }).get('action')
        print(self.pdf_download_url)

        # self.pdf_download_url = "{}?download={}".format(self.pdf_url, 'GFM0u%2Fb%2F41P59Jkt0qxLAw%3D%3D')
        self.download_file()

    def download_file(self):
        local_filename = self.save_path + "/" + self.pdf_url.split('/')[-1]
        # NOTE the stream=True parameter below
        with requests.get(self.pdf_download_url, stream=True) as r:
            r.raise_for_status()
            with open(local_filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    # If you have chunk encoded response uncomment if
                    # and set chunk_size parameter to None.
                    #if chunk:
                    f.write(chunk)
        return local_filename

    def solve_captcha1(self):
        print("Solving Captcha Started !")
        try:
            task = self.get_task()
            job = self.client.createTaskSmee(task, timeout=10 * 60)
            token = job.get_solution_response()
            print("Solving Captcha Completed !")
            return token
        except Exception as e:
            print("Get Captcha Token error !")
            print(e)
            return None

    def get_task(self):
        if self.task is None:
            site_key = self.get_site_key()
            self.task = NoCaptchaTaskProxylessTask(website_url=self.pdf_url,
                                                   website_key=site_key)
        return self.task

    def get_site_key(self):
        if self.site_key is None:
            html = self.session.get(self.pdf_url).text
            # self.cookie = "PHPSESSID={};".format(self.session.cookies.get_dict()['PHPSESSID'])

            # self.headers = {'Content-Type': 'application/x-www-form-urlencoded',
            #                 'Cookie': self.cookie,
            #                 'Upgrade-Insecure-Requests': '1'}

            self.site_key = re.search(self.site_key_pattern, html).group(1)
        return self.site_key