'method': 'GET', 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'en-US,en;q=0.9' } initial_response = requests.get(search_url, headers=page_headers) response_html = initial_response.text # Parse HTML initial_soup = soup(response_html, 'html.parser') # Check for CAPTCHA if 'Are you a human?' in initial_soup.text: print('CAPTCHA REQUIRED! Launching browser.') initial_soup = solve_captcha(initial_soup, initial_response.url) # Get number of pages for results pages = initial_soup.find(class_='list-tool-pagination-text') page_amount = 1 if pages is not None: page_amount = int(pages.text.split("/", 1)[1]) print(f'Page Count: {page_amount}') # Ask how many pages to scrape for max_page_amount = 1 if page_amount > 1: max_page_amount = input( 'What page would you like to scrape until (inclusive)? ').strip() while not isinstance(max_page_amount, int): try:
def process_mirror_pages(allData):
    """Visit each zone-h mirror page and extract the defaced site's URL.

    For every (time, notifier, mirror) tuple, drives the global Selenium
    ``browser`` to the mirror page, solving zone-h's captcha if one is
    presented, and hands the extracted data to ``processDefacement``.

    Parameters:
        allData -- iterable of (time, notifier, mirror-href) tuples, as
                   produced by process_zoneh_pages.

    Side effects: navigates the shared ``browser``, overwrites the local
    file "captcha.png", prints progress to stdout.  Python 2 code
    (print statements, urllib2).
    """
    for time, notifier, mirror in allData:
        try:
            print notifier, time, mirror
            browser_get(mirror)
            # Outer retry loop: the mirror page may fire JavaScript alerts
            # that must be accepted before the content loads.
            for i in range(0, ALERT_CONFIRMS + 1):  #number of alert confirmations plus content
                try:
                    havepage = False
                    for k in range(1, 5):  #Loop of uncorrectly solved captachas
                        try:
                            # Success path: the mirrored defacement is served
                            # inside an iframe.
                            WebDriverWait(browser, 10).until(
                                EC.presence_of_element_located(
                                    (By.TAG_NAME, "iframe")
                                )  #TODO: wait for iframe OR propdeface. How to handle flows in that case?
                            )
                        except TimeoutException as e:
                            # No iframe: check whether we hit the captcha page
                            # instead (element id "propdeface").
                            WebDriverWait(browser, 5).until(
                                EC.presence_of_element_located(
                                    (By.ID, "propdeface")))
                            #In case of timeout on iframe, we are checking if captcha is present.
                            #If captcha is not present generic exception handling code will take over control.
                            #Next mirror page will tried to be downloaded.
                            #This differs from v0.9 that continued to scan next page immedatelly.
                            #Software robustness(resiliency) is kept.
                            #Timeout message will be printed out as part of GEH code.
                            #print "Time elapsed for zone-h page processing\n"
                            #Here captcha is resolved
                            # Re-use the browser's session cookies so the
                            # captcha image request belongs to the same session.
                            cookies = browser.get_cookies()
                            cookie = ' '.join([
                                i['name'] + '=' + i['value'] + ';'
                                for i in cookies
                            ])
                            url = browser.find_elements_by_xpath(
                                "//*[@id='cryptogram']")[0].get_attribute(
                                    'src')
                            req = urllib2.Request(url)
                            req.add_header('Cookie', cookie)
                            resp = urllib2_urlopen(req)
                            pic = resp.read()
                            # Save the captcha image for the OCR solver.
                            f = open("captcha.png", "wb")
                            f.write(pic)
                            f.close()
                            try:
                                solution = captcha.solve_captcha('captcha.png')
                                print "Captcha solved in process_mirror_pages (%s).\n" % solution
                                elem = browser.find_element_by_name("captcha")
                                elem.send_keys(solution)
                                elem.submit()
                            except:
                                print "Something wrong in solving captcha.\n"
                                print traceback.format_exc()
                                print "\n"
                        else:
                            # try succeeded without timeout: iframe present,
                            # the mirror page is loaded.
                            havepage = True
                            break  #break out of captcha solving loop
                    if havepage:
                        time_.sleep(
                            2)  #safety hold in case HTML is not fully loaded
                        mirrorsrc = browser.find_element_by_tag_name(
                            'iframe').get_attribute('src')
                        # NOTE(review): the split delimiter below was garbled
                        # in extraction; it appears to split "<label>: <url>"
                        # on ": " — confirm against the live page markup.
                        url = browser.find_elements_by_xpath(
                            "//*[@id='propdeface']/ul/li[2]/ul[1]/li[2]"
                        )[0].text.split(": ")[1].strip()
                        print url
                        print "\n"
                        processDefacement(time, notifier, url, mirrorsrc)
                        break  #break out of alert confirmation loop
                    else:  #captchas uncorrectly solved 5 times
                        #TODO: save and make claim?
                        break  #break out of alert confirmation loop
                except UnexpectedAlertPresentException as e:
                    # A JS alert blocked the load; accept it and retry the
                    # whole wait/captcha sequence on the next iteration of i.
                    print "Accepting alert in process_mirror_pages.\n"
                    #confirmation of alert here
                    Alert(browser).accept()
                    if i == ALERT_CONFIRMS:
                        # Alert budget exhausted — log the traceback.
                        print traceback.format_exc()
                        print "\n"
        except:
            # Generic exception handler: log and continue with the next
            # mirror so one bad page does not abort the whole batch.
            print "Unsuccessful processing in process_mirror_pages.\n"
            print traceback.format_exc()
            print "\n"
im.save("captcha_full.png") bg = Image.open("data/resources/bg.png").convert('LA') for x in range(1, 7): x_bound = (x - 1) * 8 crop = im.crop((x_bound, 0, x_bound + 8, 18)) name = "slice_" + str(x) + ".png" #print(name) back_im = bg.copy() back_im.paste(crop, (10, 5)) back_im.save("data/captcha_slices/" + name) captcha = solve_captcha() def enter(field, text): driver.find_element_by_xpath(field).send_keys(text) try: dropdown = Select(driver.find_element_by_xpath(choice_field2)) except NoSuchElementException: dropdown = Select(driver.find_element_by_xpath(choice_field)) time.sleep(3) try: enter(name_field, name_mail[0])
def process_zoneh_pages(f):
    """Scrape the zone-h.org archive listing for new defacement entries.

    Reads the last-seen (time, notifier, mirror) triple from the state
    file ``f``, downloads archive pages with the global Selenium
    ``browser`` (solving captchas when presented), and collects every
    mirror entry newer than the last-seen one.

    Parameters:
        f -- open file whose first three lines are the last processed
             time, notifier and mirror URL.
             NOTE(review): ``f`` is shadowed below by the captcha image
             file handle — consider renaming one of them.

    Returns:
        list of (time, notifier, mirror-href) tuples in oldest-first
        order (the scraped, newest-first data is reversed before return).

    Python 2 code (print statements, urllib2, tuple-parameter lambda).
    """
    # Last-seen entry from the state file; ttime/tnotifier/tmirror keep an
    # untouched copy, ctime/cnotifier/cmirror are used for comparison.
    ttime, tnotifier, tmirror = ctime, cnotifier, cmirror = f.read().split(
        '\n')[:3]
    tnotifier = cnotifier = cnotifier.decode('utf-8')
    allData = []
    print[ctime, cnotifier, cmirror]
    print "\n"
    # i stays -1 unless the last-seen entry is found on a scraped page.
    i = -1
    # NOTE(review): range(1, 2) visits only page 1, despite the comment
    # saying "first two pages" — confirm the intended page count.
    for pagenum in range(1, 2):  #looking for defaces in first two pages
        try:
            print "Downloading zone-h.org page: %s\n" % pagenum
            #TODO: Connecting over TOR (captcha recognition and change of circuit)
            browser_get('http://zone-h.org/archive/page=%d' % (pagenum, ))
            havepage = False
            for k in range(1, 5):  #Loop of uncorrectly solved captachas
                try:
                    # Success path: the archive table is served with
                    # element id "ldeface".
                    WebDriverWait(browser, 10).until(
                        EC.presence_of_element_located(
                            (By.ID, "ldeface")
                        )  #TODO: wait for ldeface OR propdeface. How to handle flows in that case?
                    )
                except TimeoutException as e:
                    # No archive table: check whether the captcha page
                    # (id "propdeface") was served instead.
                    WebDriverWait(browser, 5).until(
                        EC.presence_of_element_located((By.ID, "propdeface")))
                    #In case of timeout on ldeface, we are checking if captcha is present.
                    #If captcha is not present generic exception handling code will take over control.
                    #Next page will tried to be downloaded.
                    #This differs from v0.9 that continued to scan next page immedatelly.
                    #Software robustness(resiliency) is kept.
                    #Timeout message will be printed out as part of GEH code.
                    #print "Time elapsed for zone-h page processing\n"
                    #Here captcha is resolved
                    # Re-use the browser's session cookies so the captcha
                    # image request belongs to the same session.
                    cookies = browser.get_cookies()
                    cookie = ' '.join(
                        [i['name'] + '=' + i['value'] + ';' for i in cookies])
                    url = browser.find_elements_by_xpath(
                        "//*[@id='cryptogram']")[0].get_attribute('src')
                    req = urllib2.Request(url)
                    req.add_header('Cookie', cookie)
                    resp = urllib2_urlopen(req)
                    pic = resp.read()
                    # Save the captcha image for the OCR solver (shadows
                    # the function parameter ``f`` — see docstring note).
                    f = open("captcha.png", "wb")
                    f.write(pic)
                    f.close()
                    try:
                        solution = captcha.solve_captcha('captcha.png')
                        elem = browser.find_element_by_name("captcha")
                        print "Captcha solved in process_zoneh_pages (%s).\n" % solution
                        elem.send_keys(solution)
                        elem.submit()
                    except:
                        print "Something wrong in solving captcha.\n"
                        print traceback.format_exc()
                        print "\n"
                else:
                    # try succeeded without timeout: archive table present.
                    havepage = True
                    break
            if havepage:
                # Each 'mirror' link's grandparent row holds the time and
                # notifier cells; pair them back with the link itself.
                mirrors = browser.find_elements_by_link_text('mirror')
                ntdata = map(lambda x: x.find_elements_by_xpath("../../*"),
                             mirrors)
                data = map(
                    lambda (x, y):
                    (x[0].text, x[1].text, y.get_attribute('href')),
                    zip(ntdata, mirrors))
                #TODO: [HIGH PRIORITY] If there is unsuccessful processing for some page which contains (ctime, cnotifier, cmirror), code will continue
                #download next pages that have mirrors already in database. Do I deduplicate in that case?
                #CHECK DONE: in insertInDatabase is seen that new deface is always inserted in database.
                #(time, notifier_id, url, mirrorsrc) tuple gives unique deface id that can be used for deduplication
                allData += data
                if (ctime, cnotifier, cmirror) in data:
                    # Found the last-seen entry: keep only the newer rows
                    # before it and flip to oldest-first order.
                    # NOTE(review): ``i`` indexes ``data`` but slices
                    # ``allData`` — equivalent only while a single page is
                    # scraped; confirm if the page range is ever widened.
                    i = data.index((ctime, cnotifier, cmirror))
                    allData = list(reversed(allData[:i]))
                    break
            else:  #captchas uncorrectly solved 5 times
                #TODO: save and make claim?
                pass
        except:
            # Generic exception handler: log and continue so one bad page
            # does not abort the scan.
            print "Unsuccessful processing of zone-h.org page\n"
            print traceback.format_exc()
            print "\n"
    if i == -1:
        # Last-seen entry never found: return everything, oldest-first.
        allData = list(reversed(allData))
    return allData