def ocr(self):
    """Run OCR over every ``rect<N>`` / ``crop<N>`` column of the model.

    A column qualifies when its horizontal header is ``rect`` or ``crop``
    followed by text that parses as an integer.  For each qualifying
    column a ``<header>_text`` column is inserted directly to its right
    unless the column already there carries that header.  Afterwards,
    for every child of the model root, the value stored under each
    qualifying header is passed to ``Tesseract().OCR`` and the stripped
    result is handed to ``item.data`` under ``<header>_text``.

    The progress bar is shown for the duration of the second pass and is
    hidden again even when OCR raises.
    """
    horizontal = QtCore.Qt.Horizontal
    model = self.model

    def is_rect(header):
        """Return True when *header* is 'rect' or 'crop' plus an integer."""
        if header is None:
            # A column without a header cannot be a rect/crop column.
            return False
        if header[:4] not in ('rect', 'crop'):
            return False
        try:
            int(header[4:])
        except ValueError:
            return False
        return True

    # Walk right-to-left so that inserting a column never shifts the
    # indices that are still to be visited.
    for c in reversed(range(model.columnCount())):
        header = model.headerData(c, horizontal)
        if not is_rect(header):
            continue
        text_header = header + '_text'
        if model.headerData(c + 1, horizontal) != text_header:
            model.insertColumn(c + 1)
            model.setHeaderData(c + 1, horizontal, text_header)

    children = model.root().children()
    # Re-read the count: the pass above may have inserted columns.
    column_count = model.columnCount()
    self.progress_bar.setMaximum(len(children) * column_count)
    self.progress_bar.show()
    try:
        for r, item in enumerate(children):
            for c in range(column_count):
                self.progress_bar.setValue(c + column_count * r)
                # Let Qt handle pending events between cells.
                QtWidgets.QApplication.processEvents()
                header = model.headerData(c, horizontal)
                if not is_rect(header):
                    continue
                rect = item.data(header)
                if rect is None:
                    continue
                # Two-argument item.data(...) presumably stores the
                # value under the given key -- confirm against the item
                # class.
                item.data(header + '_text', Tesseract().OCR(rect).strip())
    finally:
        self.progress_bar.hide()
def __init__(self, proxy=None):
    """Configure retry settings, the ``Grab`` HTTP client and the OCR engine.

    ``PAGETRY`` and ``SLEEP`` are read from ``startup.CFG`` under the
    keys ``'pagetry'`` and ``'sleep'``, with the ``Avito`` class
    attributes of the same name as defaults.  When a stored value cannot
    be converted to an integer the class default is used.

    Args:
        proxy: Optional value handed to ``Grab.load_proxylist`` as a
            ``'text_file'`` source of ``'http'`` proxies.  Nothing is
            loaded when it is falsy.
    """
    def cfg_int(key, default):
        # toInt() returns (value, ok); on a failed conversion ok is false
        # and value is 0, so fall back to the default rather than 0.
        value, ok = startup.CFG.value(key, default).toInt()
        return value if ok else default

    self.PAGETRY = cfg_int('pagetry', Avito.PAGETRY)
    self.SLEEP = cfg_int('sleep', Avito.SLEEP)

    self.g = Grab()
    self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
    if __name__ == '__main__':
        # Only when this module itself is run as a script.
        self.g.setup(log_dir='dump')
    if proxy:
        self.g.load_proxylist(proxy, 'text_file', 'http',
                              auto_init=True, auto_change=False)

    self.tess = Tesseract()
    self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)
def ocr(self):
    """OCR each item's ``Rect<i>`` values and add matching ``Text`` columns.

    For every child of the model root, ``Rect0``, ``Rect1``, ... are read
    in order (at most 1000 of them) until one is ``None``; each value is
    passed to ``Tesseract().OCR`` and the stripped result is handed to
    ``item.data`` under ``Text<i>``.  Then, scanning columns from right
    to left, every column whose header contains ``Rect`` gets a column
    inserted to its right with ``Rect`` replaced by ``Text`` in the
    header, unless the column already there carries that header.
    """
    for node in self.model.root().children():
        for index in range(1000):
            region = node.data('Rect' + str(index))
            if region is None:
                break
            recognised = Tesseract().OCR(region).strip()
            node.data('Text' + str(index), recognised)

    # Right-to-left, so insertions do not shift columns still to be seen.
    for position in reversed(range(self.model.columnCount())):
        header = self.model.headerData(position, QtCore.Qt.Horizontal)
        if 'Rect' not in header:
            continue
        text_header = header.replace('Rect', 'Text')
        neighbour = self.model.headerData(position + 1, QtCore.Qt.Horizontal)
        if neighbour != text_header:
            self.model.insertColumn(position + 1)
            self.model.setHeaderData(
                position + 1, QtCore.Qt.Horizontal, text_header)
def __init__(self, url):
    """Request *url* and answer its captcha until ``self.link`` is set.

    Each pass parses the page at *url* with ``CaptchaParser``.  When the
    parser exposes a captcha URL, the image is downloaded; if it is
    served as ``image/gif`` it is wrapped in ``Tesseract`` and
    ``self.get_captcha()`` is asked for the text.  A non-empty answer is
    POSTed back to *url* together with the parser's ``captchacode`` and
    ``megavar`` values, and the response is fed to this parser.
    """
    HTMLParser.__init__(self)
    self.link = None
    self.located = False
    # NOTE(review): no retry limit or delay is visible here; the loop only
    # ends once self.link becomes truthy -- presumably set by a handler
    # during feed(); confirm.
    while not self.link:
        p = CaptchaParser(URLOpen().open(url).read())
        if p.captcha:
            handle = URLOpen().open(p.captcha)
            # Only GIF responses are handed to the OCR engine.
            if handle.info()["Content-Type"] == "image/gif":
                self.tess = Tesseract(handle.read())
                captcha = self.get_captcha()
                if captcha:
                    handle = URLOpen().open(
                        url,
                        urllib.urlencode([(CAPTCHACODE, p.captchacode),
                                          (MEGAVAR, p.megavar),
                                          ("captcha", captcha)]))
                    # Reset parser state before feeding the new response.
                    self.reset()
                    self.feed(handle.read())
                    self.close()
                    logger.info("Captcha %s: %s" % (p.captcha, captcha))
def build_candidates(self, characters4_pixels_list, uncertain_pixels):
    """Yield filtered OCR readings for each way of assigning uncertain groups.

    For every character set in *characters4_pixels_list*, every subset
    of *uncertain_pixels* is tried.  Each group in the subset is merged
    into the pair of characters returned by ``get_pair_inclussion`` for
    the group's centre-of-mass x coordinate, or skipped when no pair is
    returned.  The resulting characters are rotated, joined
    horizontally, smoothed and read with ``Tesseract``; the reading is
    yielded when ``filter_word`` returns something truthy.
    """
    for position, base_characters in enumerate(characters4_pixels_list):
        logging.debug("Generating words (%d) %d/%d" % (
            2**len(uncertain_pixels), position+1,
            len(characters4_pixels_list)))
        for size in range(len(uncertain_pixels)+1):
            for chosen in combinations_no_repetition(uncertain_pixels, size):
                # Work on copies so each combination starts from the
                # unmodified characters.
                trial = [character.copy() for character in base_characters]
                for group in chosen:
                    x_center = center_of_mass(group)[0]
                    neighbours = get_pair_inclussion(
                        trial, x_center,
                        pred=lambda x: center_of_mass(x)[0])
                    if neighbours:
                        left, right = neighbours
                        left.update(group)
                        right.update(group)
                rotated = [self.rotate_character(character, slot)
                           for slot, character in enumerate(trial)]
                merged = smooth(join_images_horizontal(rotated), 0)
                reader = Tesseract(self.data, lambda x: merged)
                word = filter_word(reader.get_captcha().strip())
                if word:
                    yield word
"""""" if tag == "form": if ((len(attrs) == 3) and (attrs[2][1] == "formDownload")): self.form_action = attrs[0][1] if __name__ == "__main__": urllib2.install_opener( urllib2.build_opener(urllib2.HTTPCookieProcessor( cookielib.CookieJar()))) urllib2.urlopen( urllib2.Request( "http://www.gigasize.com/get.php/3196987695/p3x03sp.avi")) tes = Tesseract( urllib2.urlopen( urllib2.Request("http://www.gigasize.com/randomImage.php")).read(), True) captcha = tes.get_captcha(3) data = urllib.urlencode({ "txtNumber": captcha, "btnLogin.x": "124", "btnLogin.y": "12", "btnLogin": "******" }) handle = urllib2.urlopen( urllib2.Request("http://www.gigasize.com/formdownload.php"), data) f = FormParser(handle.read()) handle.close() if f.form_action: timer = 60
from pathlib import Path
from subprocess import PIPE, CalledProcessError, Popen, check_output
from tempfile import TemporaryDirectory

from cv2 import cv2
from pdf2image import convert_from_path
from tesseract import Tesseract, PageSegMode

__all__ = [
    "pdf_to_text",
    "ocr_to_text",
    "get_page_count",
]

# Module-level Tesseract instance, created once at import time.
TESS = Tesseract()


def pdf_to_text(pdf_path: str, target_dir: str) -> None:
    """
    Convert pdf at `pdf_path` to a txt file in `target_dir`
    using XpdfReader's pdftotext.

    The output file is named after the stem of `pdf_path` with a
    ``.txt`` suffix.  ``pdftotext`` is run with ``-layout`` and this
    function blocks until it exits.  The exit status and captured
    output are not inspected.
    """
    file_name = Path(pdf_path).stem
    command = [
        "pdftotext",
        "-layout",
        pdf_path,
        str(Path(target_dir) / f"{file_name}.txt"),
    ]
    # communicate() drains stdout/stderr while waiting.  A bare wait()
    # with both streams piped can deadlock once the child fills a pipe
    # buffer.  The context manager closes the pipes afterwards.
    with Popen(command, stdout=PIPE, stderr=PIPE) as proc:
        proc.communicate()
def test_tesseract_exists(self):
    """Check that ``Tesseract().run()`` finishes with return code 0."""
    completed = Tesseract().run()
    self.assertEqual(completed.returncode, 0)