예제 #1
0
    'method': 'GET',
    'accept':
    'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.9'
}
# Fetch the search results page, sending the custom request headers.
# NOTE(review): no timeout is passed to requests.get, so this call can block
# indefinitely if the server never answers — consider adding one.
initial_response = requests.get(search_url, headers=page_headers)
response_html = initial_response.text

# Parse HTML with the built-in 'html.parser' backend.
# (`soup` is presumably an alias for bs4.BeautifulSoup — confirm against the
# file's imports.)
initial_soup = soup(response_html, 'html.parser')

# Check for CAPTCHA: the challenge page is recognised by its prompt text.
# When present, the parsed page is replaced by whatever solve_captcha()
# returns for the final (post-redirect) URL of the response.
if 'Are you a human?' in initial_soup.text:
    print('CAPTCHA REQUIRED! Launching browser.')
    initial_soup = solve_captcha(initial_soup, initial_response.url)

# Get number of pages for results. The pagination label is split on its
# first "/" and the part after it is parsed as the total page count
# (presumably the label reads like "1/12" — confirm). When the pagination
# element is missing, a single page is assumed.
pages = initial_soup.find(class_='list-tool-pagination-text')
page_amount = 1
if pages is not None:
    page_amount = int(pages.text.split("/", 1)[1])
    print(f'Page Count: {page_amount}')

# Ask how many pages to scrape for
max_page_amount = 1
if page_amount > 1:
    max_page_amount = input(
        'What page would you like to scrape until (inclusive)? ').strip()
    while not isinstance(max_page_amount, int):
        try:
예제 #2
0
def process_mirror_pages(allData):

    for time, notifier, mirror in allData:

        try:

            print notifier, time, mirror

            browser_get(mirror)

            for i in range(0, ALERT_CONFIRMS +
                           1):  #number of alert confirmations plus content

                try:

                    havepage = False

                    for k in range(1,
                                   5):  #Loop of uncorrectly solved captachas

                        try:
                            WebDriverWait(browser, 10).until(
                                EC.presence_of_element_located(
                                    (By.TAG_NAME, "iframe")
                                )  #TODO: wait for iframe OR propdeface. How to handle flows in that case?
                            )

                        except TimeoutException as e:

                            WebDriverWait(browser, 5).until(
                                EC.presence_of_element_located(
                                    (By.ID, "propdeface")))

                            #In case of timeout on iframe, we are checking if captcha is present.
                            #If captcha is not present generic exception handling code will take over control.
                            #Next mirror page will tried to be downloaded.
                            #This differs from v0.9 that continued to scan next page immedatelly.
                            #Software robustness(resiliency) is kept.
                            #Timeout message will be printed out as part of GEH code.
                            #print "Time elapsed for zone-h page processing\n"

                            #Here captcha is resolved

                            cookies = browser.get_cookies()
                            cookie = ' '.join([
                                i['name'] + '=' + i['value'] + ';'
                                for i in cookies
                            ])

                            url = browser.find_elements_by_xpath(
                                "//*[@id='cryptogram']")[0].get_attribute(
                                    'src')

                            req = urllib2.Request(url)
                            req.add_header('Cookie', cookie)
                            resp = urllib2_urlopen(req)
                            pic = resp.read()

                            f = open("captcha.png", "wb")
                            f.write(pic)
                            f.close()

                            try:
                                solution = captcha.solve_captcha('captcha.png')
                                print "Captcha solved in process_mirror_pages (%s).\n" % solution
                                elem = browser.find_element_by_name("captcha")
                                elem.send_keys(solution)
                                elem.submit()
                            except:
                                print "Something wrong in solving captcha.\n"
                                print traceback.format_exc()
                                print "\n"

                        else:
                            havepage = True
                            break  #break out of captcha solving loop

                    if havepage:

                        time_.sleep(
                            2)  #safety hold in case HTML is not fully loaded

                        mirrorsrc = browser.find_element_by_tag_name(
                            'iframe').get_attribute('src')
                        url = browser.find_elements_by_xpath(
                            "//*[@id='propdeface']/ul/li[2]/ul[1]/li[2]"
                        )[0].text.split(": ")[1].strip()

                        print url
                        print "\n"

                        processDefacement(time, notifier, url, mirrorsrc)
                        break  #break out of alert confirmation loop

                    else:

                        #captchas uncorrectly solved 5 times
                        #TODO: save and make claim?
                        break  #break out of alert confirmation loop

                except UnexpectedAlertPresentException as e:
                    print "Accepting alert in process_mirror_pages.\n"
                    #confirmation of alert here
                    Alert(browser).accept()
                    if i == ALERT_CONFIRMS:
                        print traceback.format_exc()
                        print "\n"

        except:
            print "Unsuccessful processing in process_mirror_pages.\n"
            print traceback.format_exc()
            print "\n"
예제 #3
0
# Keep a copy of the complete captcha image on disk.
im.save("captcha_full.png")
# Background template, converted to mode 'LA' (assuming `Image` is Pillow's
# Image module, that is 8-bit greyscale plus alpha — confirm).
bg = Image.open("data/resources/bg.png").convert('LA')

# Cut the captcha into six tiles, each 8 px wide and 18 px high, going left
# to right. Every tile is pasted at offset (10, 5) onto a fresh copy of the
# background and saved as data/captcha_slices/slice_<n>.png (n = 1..6).
for x in range(1, 7):
    # Left edge of the x-th tile.
    x_bound = (x - 1) * 8

    crop = im.crop((x_bound, 0, x_bound + 8, 18))
    name = "slice_" + str(x) + ".png"
    #print(name)

    # Copy first so the shared background template is never modified.
    back_im = bg.copy()
    back_im.paste(crop, (10, 5))
    back_im.save("data/captcha_slices/" + name)

# solve_captcha() takes no arguments here; presumably it reads the slice
# files written above — confirm against its definition.
captcha = solve_captcha()


def enter(field, text):
    """Send *text* as keystrokes to the element found by the XPath *field*.

    The lookup goes through the module-level ``driver``; whatever exception
    it raises for a missing element propagates to the caller.
    """
    target = driver.find_element_by_xpath(field)
    target.send_keys(text)


# Locate the selection dropdown: try the `choice_field2` XPath first and fall
# back to `choice_field` when no element matches it.
try:
    dropdown = Select(driver.find_element_by_xpath(choice_field2))
except NoSuchElementException:
    dropdown = Select(driver.find_element_by_xpath(choice_field))

# Fixed 3-second pause before the form fields are filled in (presumably to
# let the page settle — confirm; an explicit wait would be more reliable).
time.sleep(3)

try:
    enter(name_field, name_mail[0])
예제 #4
0
def process_zoneh_pages(f):

    ttime, tnotifier, tmirror = ctime, cnotifier, cmirror = f.read().split(
        '\n')[:3]
    tnotifier = cnotifier = cnotifier.decode('utf-8')
    allData = []

    print[ctime, cnotifier, cmirror]
    print "\n"

    i = -1

    for pagenum in range(1, 2):  #looking for defaces in first two pages

        try:
            print "Downloading zone-h.org page: %s\n" % pagenum

            #TODO: Connecting over TOR (captcha recognition and change of circuit)
            browser_get('http://zone-h.org/archive/page=%d' % (pagenum, ))

            havepage = False

            for k in range(1, 5):  #Loop of uncorrectly solved captachas

                try:
                    WebDriverWait(browser, 10).until(
                        EC.presence_of_element_located(
                            (By.ID, "ldeface")
                        )  #TODO: wait for ldeface OR propdeface. How to handle flows in that case?
                    )

                except TimeoutException as e:

                    WebDriverWait(browser, 5).until(
                        EC.presence_of_element_located((By.ID, "propdeface")))

                    #In case of timeout on ldeface, we are checking if captcha is present.
                    #If captcha is not present generic exception handling code will take over control.
                    #Next page will tried to be downloaded.
                    #This differs from v0.9 that continued to scan next page immedatelly.
                    #Software robustness(resiliency) is kept.
                    #Timeout message will be printed out as part of GEH code.
                    #print "Time elapsed for zone-h page processing\n"

                    #Here captcha is resolved

                    cookies = browser.get_cookies()
                    cookie = ' '.join(
                        [i['name'] + '=' + i['value'] + ';' for i in cookies])

                    url = browser.find_elements_by_xpath(
                        "//*[@id='cryptogram']")[0].get_attribute('src')

                    req = urllib2.Request(url)
                    req.add_header('Cookie', cookie)
                    resp = urllib2_urlopen(req)
                    pic = resp.read()

                    f = open("captcha.png", "wb")
                    f.write(pic)
                    f.close()

                    try:
                        solution = captcha.solve_captcha('captcha.png')
                        elem = browser.find_element_by_name("captcha")
                        print "Captcha solved in process_zoneh_pages (%s).\n" % solution
                        elem.send_keys(solution)
                        elem.submit()
                    except:
                        print "Something wrong in solving captcha.\n"
                        print traceback.format_exc()
                        print "\n"

                else:
                    havepage = True
                    break

            if havepage:

                mirrors = browser.find_elements_by_link_text('mirror')
                ntdata = map(lambda x: x.find_elements_by_xpath("../../*"),
                             mirrors)
                data = map(
                    lambda (x, y):
                    (x[0].text, x[1].text, y.get_attribute('href')),
                    zip(ntdata, mirrors))

                #TODO: [HIGH PRIORITY] If there is unsuccessful processing for some page which contains (ctime, cnotifier, cmirror), code will continue
                #download next pages that have mirrors already in database. Do I deduplicate in that case?
                #CHECK DONE: in insertInDatabase is seen that new deface is always inserted in database.
                #(time, notifier_id, url, mirrorsrc) tuple gives unique deface id that can be used for deduplication

                allData += data

                if (ctime, cnotifier, cmirror) in data:
                    i = data.index((ctime, cnotifier, cmirror))
                    allData = list(reversed(allData[:i]))
                    break

            else:

                #captchas uncorrectly solved 5 times
                #TODO: save and make claim?
                pass

        except:
            print "Unsuccessful processing of zone-h.org page\n"
            print traceback.format_exc()
            print "\n"

    if i == -1:
        allData = list(reversed(allData))

    return allData