class CaptchaForm(HTMLParser):
    """HTML parser that keeps submitting captcha guesses until a link is found.

    After construction ``self.link`` holds the ``href`` of the first ``<a>``
    tag seen after a ``<div>`` whose second attribute value is
    ``"downloadlink"``.
    """

    def __init__(self, url):
        """Fetch *url*, solve its captcha and parse the response for the link.

        The loop repeats (new page, new captcha image, new OCR attempt) until
        ``self.link`` is set; there is no retry limit.
        """
        HTMLParser.__init__(self)
        self.link = None
        self.located = False
        while not self.link:
            p = CaptchaParser(URLOpen().open(url).read())
            if p.captcha:
                handle = URLOpen().open(p.captcha)
                # Only GIF responses are handed to the OCR engine.
                if handle.info()["Content-Type"] == "image/gif":
                    self.tess = Tesseract(handle.read())
                    captcha = self.get_captcha()
                    if captcha:
                        handle = URLOpen().open(
                            url,
                            urllib.urlencode([
                                (CAPTCHACODE, p.captchacode),
                                (MEGAVAR, p.megavar),
                                ("captcha", captcha),
                            ]))
                        # Reset parser state before feeding a new document.
                        self.reset()
                        self.feed(handle.read())
                        self.close()
                        logger.info("Captcha %s: %s" % (p.captcha, captcha))

    def handle_starttag(self, tag, attrs):
        """Track the download ``<div>`` and capture the next anchor's href.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.  An ``<a>`` without attributes is ignored instead of
        raising ``IndexError``.
        """
        if tag == "a":
            # The href is only accepted when it is the first attribute.
            if self.located and attrs and attrs[0][0] == "href":
                self.located = False
                self.link = attrs[0][1]
        elif tag == "div":
            if len(attrs) > 1 and attrs[1][1] == "downloadlink":
                self.located = True

    def get_captcha(self):
        """Return the OCR result when it is exactly 4 characters, else None."""
        result = self.tess.get_captcha()
        if len(result) == 4:
            return result
        return None
def __init__(self, proxy=None):
    """Read retry settings, configure the Grab client and the OCR engine.

    :param proxy: optional value passed to ``Grab.load_proxylist`` as a
        ``'text_file'`` source; when falsy no proxy list is loaded.
    """
    # NOTE(review): toInt() is assumed to return (value, ok) as PyQt's
    # QVariant does -- confirm.  A failed or non-positive 'pagetry' would
    # make every ``while c:`` retry loop either skip all attempts (0) or
    # never terminate (negative), so fall back to the class default.
    pagetry, ok = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
    self.PAGETRY = pagetry if ok and pagetry > 0 else Avito.PAGETRY
    sleep, ok = startup.CFG.value('sleep', Avito.SLEEP).toInt()
    self.SLEEP = sleep if ok and sleep >= 0 else Avito.SLEEP

    self.g = Grab()
    self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
    if __name__ == '__main__':
        # Extra logging to the 'dump' directory when run as a script.
        self.g.setup(log_dir='dump')
    if proxy:
        self.g.load_proxylist(proxy, 'text_file', 'http',
                              auto_init=True, auto_change=False)

    self.tess = Tesseract()
    self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)
def __init__(self, proxy=None):
    """Read retry settings, configure the Grab client and the OCR engine.

    :param proxy: optional value passed to ``Grab.load_proxylist`` as a
        ``"text_file"`` source; when falsy no proxy list is loaded.
    """
    # NOTE(review): toInt() is assumed to return (value, ok) as PyQt's
    # QVariant does -- confirm.  A failed or non-positive "pagetry" would
    # make every ``while c:`` retry loop either skip all attempts (0) or
    # never terminate (negative), so fall back to the class default.
    pagetry, ok = startup.CFG.value("pagetry", Avito.PAGETRY).toInt()
    self.PAGETRY = pagetry if ok and pagetry > 0 else Avito.PAGETRY
    sleep, ok = startup.CFG.value("sleep", Avito.SLEEP).toInt()
    self.SLEEP = sleep if ok and sleep >= 0 else Avito.SLEEP

    self.g = Grab()
    self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
    if __name__ == "__main__":
        # Extra logging to the "dump" directory when run as a script.
        self.g.setup(log_dir="dump")
    if proxy:
        self.g.load_proxylist(proxy, "text_file", "http",
                              auto_init=True, auto_change=False)

    self.tess = Tesseract()
    self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)
def __init__(self, url):
    """Keep solving the captcha served at *url* until a link is parsed.

    Each round downloads the page, fetches the captcha image, runs OCR on
    it and, when a guess is available, posts the form and feeds the
    response to this parser.  The loop ends once ``self.link`` is set.
    """
    HTMLParser.__init__(self)
    self.link = None
    self.located = False
    while not self.link:
        page = CaptchaParser(URLOpen().open(url).read())
        if not page.captcha:
            continue

        image = URLOpen().open(page.captcha)
        # Anything that is not a GIF is skipped and the round restarts.
        if handle_is_gif(image) if False else image.info()["Content-Type"] != "image/gif":
            continue

        self.tess = Tesseract(image.read())
        guess = self.get_captcha()
        if not guess:
            continue

        form_fields = [
            (CAPTCHACODE, page.captchacode),
            (MEGAVAR, page.megavar),
            ("captcha", guess),
        ]
        response = URLOpen().open(url, urllib.urlencode(form_fields))
        # Reset parser state before feeding a new document.
        self.reset()
        self.feed(response.read())
        self.close()
        logger.info("Captcha %s: %s" % (page.captcha, guess))
def ocr(self):
    """OCR every ``rectN``/``cropN`` column and store the text beside it.

    First makes sure each such column is followed by a ``<name>_text``
    column (inserting one when missing), then walks every child of the
    model root and stores the stripped OCR result under ``<name>_text``.
    The progress bar is shown for the duration and hidden afterwards, even
    when OCR raises.
    """

    def is_rect(column):
        """Return True when *column* is 'rect' or 'crop' plus an integer."""
        if column[:4] not in ('rect', 'crop'):
            return False
        try:
            int(column[4:])
        except (TypeError, ValueError):
            return False
        return True

    # Walk right-to-left so that inserting a column does not shift the
    # indices that are still to be visited.
    for c in reversed(range(self.model.columnCount())):
        column = self.model.headerData(c, QtCore.Qt.Horizontal)
        if not is_rect(column):
            continue
        column_text = column + '_text'
        if not self.model.headerData(c + 1, QtCore.Qt.Horizontal) == column_text:
            self.model.insertColumn(c + 1)
            self.model.setHeaderData(c + 1, QtCore.Qt.Horizontal, column_text)

    children = self.model.root().children()
    column_count = self.model.columnCount()
    loop_count = len(children) * column_count
    self.progress_bar.setMaximum(loop_count)
    self.progress_bar.show()
    try:
        for r, item in enumerate(children):
            for c in range(column_count):
                print(r, c, c + column_count * r, loop_count)
                self.progress_bar.setValue(c + column_count * r)
                # Keep the UI responsive while the loop runs.
                QtWidgets.QApplication.processEvents()
                column = self.model.headerData(c, QtCore.Qt.Horizontal)
                if not is_rect(column):
                    continue
                rect = item.data(column)
                if rect is None:
                    continue
                item.data(column + '_text', Tesseract().OCR(rect).strip())
    finally:
        self.progress_bar.hide()
def build_candidates(self, characters4_pixels_list, uncertain_pixels):
    """Yield candidate words built from character groups and uncertain groups.

    For every entry of *characters4_pixels_list* and every subset of
    *uncertain_pixels*, the uncertain groups are merged into copies of the
    characters, the result is rendered and OCR'd, and each non-empty
    filtered text is yielded.
    """

    def x_center(pixel_group):
        # First component of the group's centre of mass.
        return center_of_mass(pixel_group)[0]

    for list_pos, base_characters in enumerate(characters4_pixels_list):
        logging.debug(
            "Generating words (%d) %d/%d",
            2**len(uncertain_pixels),
            list_pos + 1,
            len(characters4_pixels_list),
        )
        for subset_size in range(len(uncertain_pixels) + 1):
            subsets = combinations_no_repetition(uncertain_pixels, subset_size)
            for chosen in subsets:
                # Work on copies so every subset starts from the same base.
                trial = [group.copy() for group in base_characters]
                for extra in chosen:
                    neighbours = get_pair_inclussion(
                        trial, center_of_mass(extra)[0], pred=x_center)
                    if neighbours:
                        left, right = neighbours
                        left.update(extra)
                        right.update(extra)
                rotated = [
                    self.rotate_character(group, position)
                    for position, group in enumerate(trial)
                ]
                clean_image = smooth(join_images_horizontal(rotated), 0)
                engine = Tesseract(self.data, lambda x: clean_image)
                word = filter_word(engine.get_captcha().strip())
                if word:
                    yield word
def __init__(self, url):
    """Keep solving the captcha served at *url* until a link is parsed.

    Each round downloads the page, fetches the captcha image, runs OCR on
    it and, when a guess is available, posts the form and feeds the
    response to this parser.  The loop ends once ``self.link`` is set.
    """
    HTMLParser.__init__(self)
    self.link = None
    self.located = False
    while not self.link:
        page = CaptchaParser(URLOpen().open(url).read())
        if not page.captcha:
            continue

        image = URLOpen().open(page.captcha)
        # Anything that is not a GIF is skipped and the round restarts.
        if image.info()["Content-Type"] != "image/gif":
            continue

        self.tess = Tesseract(image.read())
        guess = self.get_captcha()
        if not guess:
            continue

        form_fields = [
            (CAPTCHACODE, page.captchacode),
            (MEGAVAR, page.megavar),
            ("captcha", guess),
        ]
        response = URLOpen().open(url, urllib.urlencode(form_fields))
        # Reset parser state before feeding a new document.
        self.reset()
        self.feed(response.read())
        self.close()
        logger.info("Captcha %s: %s" % (page.captcha, guess))
def ocr(self):
    """OCR each item's ``RectN`` values and add matching ``TextN`` columns.

    For every child of the model root, ``Rect0``, ``Rect1``, ... are read
    until one is missing (at most 1000 per item) and the stripped OCR text
    is stored under the corresponding ``TextN`` key.  Afterwards every
    header containing ``Rect`` gets a ``Text`` column to its right unless
    one is already there.
    """
    for node in self.model.root().children():
        for index in range(1000):
            suffix = str(index)
            rect = node.data('Rect' + suffix)
            if rect is None:
                break
            node.data('Text' + suffix, Tesseract().OCR(rect).strip())

    # Right-to-left, so inserted columns do not shift pending indices.
    for col in reversed(range(self.model.columnCount())):
        header = self.model.headerData(col, QtCore.Qt.Horizontal)
        if 'Rect' not in header:
            continue
        text_header = header.replace('Rect', 'Text')
        right_neighbour = self.model.headerData(col + 1, QtCore.Qt.Horizontal)
        if right_neighbour == text_header:
            continue
        self.model.insertColumn(col + 1)
        self.model.setHeaderData(col + 1, QtCore.Qt.Horizontal, text_header)
class CaptchaForm(HTMLParser):
    """HTML parser that keeps submitting captcha guesses until a link is found.

    After construction ``self.link`` holds the ``href`` of the first ``<a>``
    tag seen after a ``<div>`` whose second attribute value is
    ``"downloadlink"``.
    """

    def __init__(self, url):
        """Fetch *url*, solve its captcha and parse the response for the link.

        The loop repeats (new page, new captcha image, new OCR attempt) until
        ``self.link`` is set; there is no retry limit.
        """
        HTMLParser.__init__(self)
        self.link = None
        self.located = False
        while not self.link:
            p = CaptchaParser(URLOpen().open(url).read())
            if p.captcha:
                handle = URLOpen().open(p.captcha)
                # Only GIF responses are handed to the OCR engine.
                if handle.info()["Content-Type"] == "image/gif":
                    self.tess = Tesseract(handle.read())
                    captcha = self.get_captcha()
                    if captcha:
                        handle = URLOpen().open(
                            url,
                            urllib.urlencode([
                                (CAPTCHACODE, p.captchacode),
                                (MEGAVAR, p.megavar),
                                ("captcha", captcha),
                            ]))
                        # Reset parser state before feeding a new document.
                        self.reset()
                        self.feed(handle.read())
                        self.close()
                        logger.info("Captcha %s: %s" % (p.captcha, captcha))

    def handle_starttag(self, tag, attrs):
        """Track the download ``<div>`` and capture the next anchor's href.

        ``attrs`` is the list of ``(name, value)`` pairs supplied by
        ``HTMLParser``.  An ``<a>`` without attributes is ignored instead of
        raising ``IndexError``.
        """
        if tag == "a":
            # The href is only accepted when it is the first attribute.
            if self.located and attrs and attrs[0][0] == "href":
                self.located = False
                self.link = attrs[0][1]
        elif tag == "div":
            if len(attrs) > 1 and attrs[1][1] == "downloadlink":
                self.located = True

    def get_captcha(self):
        """Return the OCR result when it is exactly 4 characters, else None."""
        result = self.tess.get_captcha()
        if len(result) == 4:
            return result
        return None
"""""" if tag == "form": if ((len(attrs) == 3) and (attrs[2][1] == "formDownload")): self.form_action = attrs[0][1] if __name__ == "__main__": urllib2.install_opener( urllib2.build_opener(urllib2.HTTPCookieProcessor( cookielib.CookieJar()))) urllib2.urlopen( urllib2.Request( "http://www.gigasize.com/get.php/3196987695/p3x03sp.avi")) tes = Tesseract( urllib2.urlopen( urllib2.Request("http://www.gigasize.com/randomImage.php")).read(), True) captcha = tes.get_captcha(3) data = urllib.urlencode({ "txtNumber": captcha, "btnLogin.x": "124", "btnLogin.y": "12", "btnLogin": "******" }) handle = urllib2.urlopen( urllib2.Request("http://www.gigasize.com/formdownload.php"), data) f = FormParser(handle.read()) handle.close() if f.form_action: timer = 60
from pathlib import Path
from subprocess import PIPE, CalledProcessError, Popen, check_output
from tempfile import TemporaryDirectory

from cv2 import cv2
from pdf2image import convert_from_path
from tesseract import Tesseract, PageSegMode

__all__ = [
    "pdf_to_text",
    "ocr_to_text",
    "get_page_count",
]

TESS = Tesseract()


def pdf_to_text(pdf_path: str, target_dir: str):
    """
    Convert pdf at `pdf_path` to a txt file in `target_dir` using
    XpdfReader's pdftotext.

    The output file is named after the stem of `pdf_path` with a ``.txt``
    suffix.  The call blocks until ``pdftotext`` has exited; its exit
    status is not checked.
    """
    file_name = Path(pdf_path).stem
    command = [
        "pdftotext",
        "-layout",
        pdf_path,
        str(Path(target_dir) / f"{file_name}.txt"),
    ]
    proc = Popen(command, stdout=PIPE, stderr=PIPE)
    # communicate() drains both pipes while waiting; a bare wait() can
    # deadlock once the child fills the stdout/stderr pipe buffer.
    proc.communicate()
class Avito():
    """Scraper that walks listing pages and extracts item details.

    Pages are fetched through a ``Grab`` client; every fetch is retried up
    to ``PAGETRY`` times with ``SLEEP`` seconds between attempts.
    """

    # Defaults, used when the configuration has no usable value.
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        """Read retry settings, configure the Grab client and the OCR engine.

        :param proxy: optional value passed to ``Grab.load_proxylist`` as a
            ``'text_file'`` source; when falsy no proxy list is loaded.
        """
        # NOTE(review): toInt() is assumed to return (value, ok) as PyQt's
        # QVariant does -- confirm.  A failed or non-positive 'pagetry'
        # would make the ``while c:`` retry loops either skip every attempt
        # (0) or never terminate (negative), so use the class default.
        pagetry, ok = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.PAGETRY = pagetry if ok and pagetry > 0 else Avito.PAGETRY
        sleep, ok = startup.CFG.value('sleep', Avito.SLEEP).toInt()
        self.SLEEP = sleep if ok and sleep >= 0 else Avito.SLEEP

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            # Extra logging to the 'dump' directory when run as a script.
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy, 'text_file', 'http',
                                  auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        """Open *url* with the shared client, retrying up to PAGETRY times.

        :param tag: label used in log messages and in the final error.
        :raises Exception: ``'<tag> error'`` when every attempt failed.
        """
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                # The page must contain this marker to count as loaded.
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                # ``except Exception`` keeps Ctrl-C / SystemExit working.
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        """Yield absolute item links starting at listing page *url*.

        Follows the "next" link from page to page and stops after 998
        links or when no next page is found.

        :raises Exception: ``'no links'`` when a page has no item links.
        """
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next_page = self.g.doc.select('//a[@class="next"]/@href')
            if not next_page:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next_page.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        """Download every URL in *photos* and return the response bodies.

        URLs without an ``http:``/``https:`` scheme get ``http:`` prepended.

        :raises Exception: ``'get photo error'`` when one photo could not
            be fetched within PAGETRY attempts.
        """
        g = self.g.clone()
        datas = []
        for url in photos:
            # Previously an https:// URL also got 'http:' prepended.
            if not url.startswith(('http:', 'https:')):
                url = 'http:' + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    # Only accept bodies that contain the JFIF marker.
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception('get_photo left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        """Fetch the item page at *url* and return its details as a dict.

        The returned dict has the keys ``item``, ``title``, ``photos``,
        ``price``, ``name``, ``town``, ``desc`` and ``phone``.  ``phone``
        is read by OCR from an image and stays ``''`` when that fails.

        :raises Exception: ``'get item error'`` when the page could not be
            fetched within PAGETRY attempts.
        """
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(
                    u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except Exception:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')

        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [
            s.text()
            for s in gallery.select('.//a[@class="gallery-link"]/@href')
        ]
        if not photos:
            # No gallery: fall back to the single zoomable image, if any.
            egg = doc.select(
                '//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())

        item = doc.select('//div[@itemprop="offers"]')
        #price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()
        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning('xpath town not found, try another way')
            town = item.select(
                './/div[@id="map"]/span[@itemprop="name"]').text()
        #desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select(
            "//div[contains(@class,\"description-text\")]").text()
        #<span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()

        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")
        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        # The page's own JavaScript is evaluated to build the phone-image
        # path from the item id and the mixed key.
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}} return r;}''')
            #http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            #egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', egg)
            ctx.leave()
        log.debug('jslock leave -->')

        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img,
                                             basewidth=300,
                                             whitelist='0123456789-')
                break
            except Exception:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            # Best effort: a missing phone is logged, not raised.
            log.debug('get phone error')

        return dict(item=item_id,
                    title=title,
                    photos=photos,
                    price=price,
                    name=name,
                    town=town,
                    desc=desc,
                    phone=phone)
self.form_action = None self.feed(data) self.close() print self.form_action def handle_starttag(self, tag, attrs): """""" if tag == "form": if ((len(attrs) == 3) and (attrs[2][1] == "formDownload")): self.form_action = attrs[0][1] if __name__ == "__main__": urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))) urllib2.urlopen(urllib2.Request("http://www.gigasize.com/get.php/3196987695/p3x03sp.avi")) tes = Tesseract(urllib2.urlopen(urllib2.Request("http://www.gigasize.com/randomImage.php")).read(), True) captcha = tes.get_captcha(3) data = urllib.urlencode({"txtNumber": captcha, "btnLogin.x": "124", "btnLogin.y": "12", "btnLogin": "******"}) handle = urllib2.urlopen(urllib2.Request("http://www.gigasize.com/formdownload.php"), data) f = FormParser(handle.read()) handle.close() if f.form_action: timer = 60 while timer > 0: time.sleep(1) timer -= 1 print timer data = urllib.urlencode({"dlb": "Download"}) handle = urllib2.urlopen(urllib2.Request("http://www.gigasize.com" + f.form_action), data) while len(data) > 0:
@app.route('/services/1/ocr', methods=['POST'])
def extract_text_from_image():
    """Run OCR on an uploaded image and return the text as JSON.

    Expects a multipart POST with an ``image`` file and an ``image_id``
    form field.  Missing fields produce the ``JSON_NOIMAGE`` /
    ``JSON_NOIMAGEID`` responses.  When ``MASK_ERRORS`` is true, failures
    to open the image or to run OCR produce ``JSON_NONEIMAGE`` /
    ``JSON_OCRFAILED``; otherwise the original exception propagates.
    """
    # Check for the presence of the image
    if 'image' not in request.files:
        return jsonify(JSON_NOIMAGE)
    if 'image_id' not in request.form:
        return jsonify(JSON_NOIMAGEID)

    # Extract the image
    upload = request.files['image']
    try:
        image = Image.open(upload)
    except Exception:
        if MASK_ERRORS:
            return jsonify(JSON_NONEIMAGE)
        # Bare ``raise`` keeps the original traceback.
        raise

    image_id = request.form['image_id']
    tesseract = Tesseract(image_id, TESSERACT_EXE, TESSERACT_SCRATCH,
                          TESSERACT_CLEANUP)
    try:
        text = tesseract.image_to_string(image).encode('utf-8')
    except Exception:
        if MASK_ERRORS:
            return jsonify(JSON_OCRFAILED)
        raise

    # Merge into one dict: newer Flask versions reject jsonify() calls
    # that mix positional and keyword arguments.
    # NOTE(review): assumes JSON_SUCCESS is a mapping -- confirm.
    return jsonify(dict(JSON_SUCCESS, text=text))
class Avito:
    """Scraper that walks listing pages and extracts item details.

    Pages are fetched through a ``Grab`` client; every fetch is retried up
    to ``PAGETRY`` times with ``SLEEP`` seconds between attempts.
    """

    # Defaults, used when the configuration has no usable value.
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        """Read retry settings, configure the Grab client and the OCR engine.

        :param proxy: optional value passed to ``Grab.load_proxylist`` as a
            ``"text_file"`` source; when falsy no proxy list is loaded.
        """
        # NOTE(review): toInt() is assumed to return (value, ok) as PyQt's
        # QVariant does -- confirm.  A failed or non-positive "pagetry"
        # would make the ``while c:`` retry loops either skip every attempt
        # (0) or never terminate (negative), so use the class default.
        pagetry, ok = startup.CFG.value("pagetry", Avito.PAGETRY).toInt()
        self.PAGETRY = pagetry if ok and pagetry > 0 else Avito.PAGETRY
        sleep, ok = startup.CFG.value("sleep", Avito.SLEEP).toInt()
        self.SLEEP = sleep if ok and sleep >= 0 else Avito.SLEEP

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == "__main__":
            # Extra logging to the "dump" directory when run as a script.
            self.g.setup(log_dir="dump")
        if proxy:
            self.g.load_proxylist(proxy, "text_file", "http",
                                  auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    def _go3(self, url, tag):
        """Open *url* with the shared client, retrying up to PAGETRY times.

        :param tag: label used in log messages and in the final error.
        :raises Exception: ``"<tag> error"`` when every attempt failed.
        """
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                # The page must contain this marker to count as loaded.
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                # ``except Exception`` keeps Ctrl-C / SystemExit working.
                log.exception("%s left %i", tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("%s error" % tag)

    def get_links(self, url):
        """Yield absolute item links starting at listing page *url*.

        Follows the "next" link from page to page and stops after 998
        links or when no next page is found.

        :raises Exception: ``"no links"`` when a page has no item links.
        """
        self._go3(url, "start page")
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception("no links")
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next_page = self.g.doc.select('//a[@class="next"]/@href')
            if not next_page:
                log.debug("last page?")
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next_page.text())
            log.debug("open next page %s", nurl)
            self._go3(nurl, "next page")

    def get_photos(self, photos):
        """Download every URL in *photos* and return the response bodies.

        URLs without an ``http:``/``https:`` scheme get ``http:`` prepended.

        :raises Exception: ``"get photo error"`` when one photo could not
            be fetched within PAGETRY attempts.
        """
        g = self.g.clone()
        datas = []
        for url in photos:
            # Previously an https:// URL also got "http:" prepended.
            if not url.startswith(("http:", "https:")):
                url = "http:" + url
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    # Only accept bodies that contain the JFIF marker.
                    g.assert_substring("JFIF", byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception("get_photo left %i", c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception("get photo error")
        return datas

    def get_item(self, url):
        """Fetch the item page at *url* and return its details as a dict.

        The returned dict has the keys ``item``, ``title``, ``photos``,
        ``price``, ``name``, ``town``, ``desc`` and ``phone``.  ``phone``
        is read by OCR from an image and stays ``""`` when that fails.

        :raises Exception: ``"get item error"`` when the page could not be
            fetched within PAGETRY attempts.
        """
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except Exception:
                log.exception("get_item left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("get item error")

        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text() for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            # No gallery: fall back to the single zoomable image, if any.
            egg = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if egg:
                photos.append(egg.text())

        item = doc.select('//div[@itemprop="offers"]')
        # price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()
        try:
            town = item.select('.//span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning("xpath town not found, try another way")
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        # desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()

        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")
        loc = {"item_phone": _phone, "item": {"id": item_id, "url": _url}}
        log.debug("jslock enter <--")
        # The page's own JavaScript is evaluated to build the phone-image
        # path from the item id and the mixed key.
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval(
                """function phoneDemixer(key){var pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}} return r;}"""
            )
            # http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            egg = ctx.eval("'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            # egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug("js rc %s", egg)
            ctx.leave()
        log.debug("jslock leave -->")

        phone = ""
        c = self.PAGETRY
        while c:
            log.debug("read phone image")
            try:
                rc = g.go(urljoin(url, egg))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300, whitelist="0123456789-")
                break
            except Exception:
                g.change_proxy()
                log.exception("get_phone left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            # Best effort: a missing phone is logged, not raised.
            log.debug("get phone error")

        return dict(item=item_id, title=title, photos=photos, price=price,
                    name=name, town=town, desc=desc, phone=phone)
def test_tesseract_exists(self):
    """Running ``Tesseract`` must finish with a return code of 0."""
    completed = Tesseract().run()
    exit_status = completed.returncode
    self.assertEqual(exit_status, 0)