class CaptchaForm(HTMLParser):
    """Solve the captcha form served at a URL and pick up the resulting link.

    All the work happens in the constructor: the page is fetched, the captcha
    image is read with Tesseract, the decoded text is posted back, and the
    response is parsed by this HTMLParser. The link found in the response is
    stored in ``self.link``.
    """

    def __init__(self, url):
        """Retry the captcha at ``url`` until a link has been parsed.

        url -- address the captcha page is fetched from and the answer is
               posted to.

        NOTE(review): the loop has no retry cap; it only ends once
        ``self.link`` is set.
        """
        HTMLParser.__init__(self)
        self.link = None
        # Set when a matching <div> has been seen; the next <a> whose first
        # attribute is href then provides the link.
        self.located = False
        while not self.link:
            page = CaptchaParser(URLOpen().open(url).read())
            if not page.captcha:
                continue
            handle = URLOpen().open(page.captcha)
            # Only GIF responses are handed to the OCR.
            if handle.info()["Content-Type"] != "image/gif":
                continue
            self.tess = Tesseract(handle.read())
            captcha = self.get_captcha()
            if not captcha:
                continue
            form = urllib.urlencode([
                (CAPTCHACODE, page.captchacode),
                (MEGAVAR, page.megavar),
                ("captcha", captcha),
            ])
            handle = URLOpen().open(url, form)
            # Parse the answer page; handle_starttag fills in self.link.
            self.reset()
            self.feed(handle.read())
            self.close()
            logger.info("Captcha %s: %s", page.captcha, captcha)

    def handle_starttag(self, tag, attrs):
        """Track the "downloadlink" div and capture the anchor that follows.

        tag   -- tag name reported by HTMLParser.
        attrs -- list of (name, value) pairs; may be empty.
        """
        if tag == "a":
            # A bare <a> has no attributes, so check before indexing.
            if self.located and attrs and attrs[0][0] == "href":
                self.located = False
                self.link = attrs[0][1]
        elif tag == "div":
            # The marker is matched on the value of the second attribute.
            if len(attrs) > 1 and attrs[1][1] == "downloadlink":
                self.located = True

    def get_captcha(self):
        """Return the OCR result if it is exactly 4 characters, else None."""
        result = self.tess.get_captcha()
        if len(result) == 4:
            return result
        return None
class CaptchaForm(HTMLParser):
    """Solve the captcha form served at a URL and pick up the resulting link.

    All the work happens in the constructor: the page is fetched, the captcha
    image is read with Tesseract, the decoded text is posted back, and the
    response is parsed by this HTMLParser. The link found in the response is
    stored in ``self.link``.
    """

    def __init__(self, url):
        """Retry the captcha at ``url`` until a link has been parsed.

        url -- address the captcha page is fetched from and the answer is
               posted to.

        NOTE(review): the loop has no retry cap; it only ends once
        ``self.link`` is set.
        """
        HTMLParser.__init__(self)
        self.link = None
        # Set when a matching <div> has been seen; the next <a> whose first
        # attribute is href then provides the link.
        self.located = False
        while not self.link:
            page = CaptchaParser(URLOpen().open(url).read())
            if not page.captcha:
                continue
            handle = URLOpen().open(page.captcha)
            # Only GIF responses are handed to the OCR.
            if handle.info()["Content-Type"] != "image/gif":
                continue
            self.tess = Tesseract(handle.read())
            captcha = self.get_captcha()
            if not captcha:
                continue
            form = urllib.urlencode([
                (CAPTCHACODE, page.captchacode),
                (MEGAVAR, page.megavar),
                ("captcha", captcha),
            ])
            handle = URLOpen().open(url, form)
            # Parse the answer page; handle_starttag fills in self.link.
            self.reset()
            self.feed(handle.read())
            self.close()
            logger.info("Captcha %s: %s", page.captcha, captcha)

    def handle_starttag(self, tag, attrs):
        """Track the "downloadlink" div and capture the anchor that follows.

        tag   -- tag name reported by HTMLParser.
        attrs -- list of (name, value) pairs; may be empty.
        """
        if tag == "a":
            # A bare <a> has no attributes, so check before indexing.
            if self.located and attrs and attrs[0][0] == "href":
                self.located = False
                self.link = attrs[0][1]
        elif tag == "div":
            # The marker is matched on the value of the second attribute.
            if len(attrs) > 1 and attrs[1][1] == "downloadlink":
                self.located = True

    def get_captcha(self):
        """Return the OCR result if it is exactly 4 characters, else None."""
        result = self.tess.get_captcha()
        if len(result) == 4:
            return result
        return None
def build_candidates(self, characters4_pixels_list, uncertain_pixels):
    """Build word candidates from characters and uncertain pixel groups.

    For each entry of ``characters4_pixels_list`` every subset of
    ``uncertain_pixels`` (as produced by ``combinations_no_repetition``, for
    sizes 0 up to all of them) is tried. Each group of the subset is merged
    into the pair of characters that ``get_pair_inclussion`` returns for it;
    groups without a pair are ignored. The resulting characters are rotated,
    joined horizontally, smoothed and read with Tesseract.

    This is a generator: it yields every recognised text for which
    ``filter_word`` returns a truthy value.
    """
    for plindex, characters4_pixels in enumerate(characters4_pixels_list):
        # Arguments are passed separately so formatting is deferred until
        # the record is actually emitted.
        logging.debug(
            "Generating words (%d) %d/%d",
            2 ** len(uncertain_pixels),
            plindex + 1,
            len(characters4_pixels_list),
        )
        for length in range(len(uncertain_pixels) + 1):
            for groups in combinations_no_repetition(uncertain_pixels, length):
                # Work on copies so every combination starts from the
                # unmodified characters.
                characters4_pixels_test = [x.copy() for x in characters4_pixels]
                for pixels in groups:
                    pair = get_pair_inclussion(
                        characters4_pixels_test,
                        center_of_mass(pixels)[0],
                        pred=lambda x: center_of_mass(x)[0],
                    )
                    if not pair:
                        continue
                    char1, char2 = pair
                    char1.update(pixels)
                    char2.update(pixels)
                # A distinct name avoids reusing the ``pixels`` loop variable
                # above (list comprehensions leak their variable in Python 2).
                images = [
                    self.rotate_character(char_pixels, cindex)
                    for cindex, char_pixels in enumerate(characters4_pixels_test)
                ]
                clean_image = smooth(join_images_horizontal(images), 0)
                # The callable handed to Tesseract ignores its argument and
                # returns the image assembled here.
                ocr = Tesseract(self.data, lambda x: clean_image)
                text = ocr.get_captcha().strip()
                filtered_text = filter_word(text)
                if filtered_text:
                    yield filtered_text
self.form_action = attrs[0][1] if __name__ == "__main__": urllib2.install_opener( urllib2.build_opener(urllib2.HTTPCookieProcessor( cookielib.CookieJar()))) urllib2.urlopen( urllib2.Request( "http://www.gigasize.com/get.php/3196987695/p3x03sp.avi")) tes = Tesseract( urllib2.urlopen( urllib2.Request("http://www.gigasize.com/randomImage.php")).read(), True) captcha = tes.get_captcha(3) data = urllib.urlencode({ "txtNumber": captcha, "btnLogin.x": "124", "btnLogin.y": "12", "btnLogin": "******" }) handle = urllib2.urlopen( urllib2.Request("http://www.gigasize.com/formdownload.php"), data) f = FormParser(handle.read()) handle.close() if f.form_action: timer = 60 while timer > 0: time.sleep(1)
self.feed(data) self.close() print self.form_action def handle_starttag(self, tag, attrs): """Set form_action to the first attribute's value when a form tag has exactly three attributes and the third one's value is 'formDownload'.""" if tag == "form": if ((len(attrs) == 3) and (attrs[2][1] == "formDownload")): self.form_action = attrs[0][1] if __name__ == "__main__": urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))) urllib2.urlopen(urllib2.Request("http://www.gigasize.com/get.php/3196987695/p3x03sp.avi")) tes = Tesseract(urllib2.urlopen(urllib2.Request("http://www.gigasize.com/randomImage.php")).read(), True) captcha = tes.get_captcha(3) data = urllib.urlencode({"txtNumber": captcha, "btnLogin.x": "124", "btnLogin.y": "12", "btnLogin": "******"}) handle = urllib2.urlopen(urllib2.Request("http://www.gigasize.com/formdownload.php"), data) f = FormParser(handle.read()) handle.close() if f.form_action: timer = 60 while timer > 0: time.sleep(1) timer -= 1 print timer data = urllib.urlencode({"dlb": "Download"}) handle = urllib2.urlopen(urllib2.Request("http://www.gigasize.com" + f.form_action), data) while len(data) > 0: data = handle.read(1024)