def ocr(self):
        """Run OCR on every ``rectN``/``cropN`` column and fill ``<name>_text``.

        First makes sure each rect/crop column is immediately followed by a
        companion column headed ``<name>_text`` (inserting it when missing),
        then walks every child of the model root and passes each non-``None``
        rect value to ``Tesseract().OCR``.  The stripped result is handed to
        ``item.data(<name>_text, text)`` (presumably the setter form of
        ``item.data`` -- confirm against the item class).

        The progress bar is shown for the duration of the scan and the Qt
        event loop is pumped once per cell so the UI stays responsive.
        """
        def is_rect(column):
            """Return True if *column* is 'rect' or 'crop' plus an integer."""
            if not column or column[:4] not in ('rect', 'crop'):
                return False
            try:
                int(column[4:])
            except (TypeError, ValueError):
                return False
            return True

        # Walk the columns right-to-left so that inserting a column at c + 1
        # never shifts the indices that are still to be visited.
        for c in reversed(range(self.model.columnCount())):
            column = self.model.headerData(c, QtCore.Qt.Horizontal)
            if not is_rect(column):
                continue

            column_text = column + '_text'
            next_header = self.model.headerData(c + 1, QtCore.Qt.Horizontal)
            if next_header != column_text:
                self.model.insertColumn(c + 1)
                self.model.setHeaderData(c + 1, QtCore.Qt.Horizontal,
                                         column_text)

        children = self.model.root().children()
        # Re-read the column count: the loop above may have added columns.
        column_count = self.model.columnCount()
        loop_count = len(children) * column_count

        self.progress_bar.setMaximum(loop_count)
        self.progress_bar.show()

        for r, item in enumerate(children):
            for c in range(column_count):
                self.progress_bar.setValue(c + column_count * r)
                QtWidgets.QApplication.processEvents()

                column = self.model.headerData(c, QtCore.Qt.Horizontal)
                if not is_rect(column):
                    continue

                rect = item.data(column)
                if rect is None:
                    continue

                item.data(column + '_text', Tesseract().OCR(rect).strip())

        self.progress_bar.hide()
Exemple #2
0
    def __init__(self, proxy=None):
        """Read retry/sleep settings and prepare the Grab and Tesseract helpers.

        ``PAGETRY`` and ``SLEEP`` are read from ``startup.CFG`` with the
        ``Avito`` class values as defaults; ``toInt()`` yields a pair and
        only its first element is kept.

        :param proxy: when truthy, passed to ``Grab.load_proxylist`` as a
            ``'text_file'`` source of ``'http'`` proxies.
        """
        pagetry, _ok = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.PAGETRY = pagetry
        sleep, _ok = startup.CFG.value('sleep', Avito.SLEEP).toInt()
        self.SLEEP = sleep

        self.g = grab = Grab()
        grab.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        # Only dump request logs when this module is run as a script.
        if __name__ == '__main__':
            grab.setup(log_dir='dump')
        if proxy:
            grab.load_proxylist(
                proxy, 'text_file', 'http',
                auto_init=True, auto_change=False,
            )

        self.tess = engine = Tesseract()
        engine.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)
Exemple #3
0
    def ocr(self):
        """OCR each item's ``RectN`` values into ``TextN`` and add Text columns.

        For every child of the model root, ``Rect0``, ``Rect1``, ... are read
        until one is ``None`` (at most 1000 per item); each is run through
        ``Tesseract().OCR`` and the stripped text is passed to
        ``item.data('TextN', text)``.  Afterwards, every column whose header
        contains ``'Rect'`` gets a following column headed with the
        ``'Rect'`` -> ``'Text'`` replacement, unless it is already there.
        """
        for node in self.model.root().children():
            for index in range(1000):
                suffix = str(index)
                region = node.data('Rect' + suffix)
                if region is None:
                    break
                node.data('Text' + suffix, Tesseract().OCR(region).strip())

        # Right-to-left, so inserted columns do not shift pending indices.
        for col in reversed(range(self.model.columnCount())):
            header = self.model.headerData(col, QtCore.Qt.Horizontal)
            if 'Rect' not in header:
                continue

            text_header = header.replace('Rect', 'Text')
            following = self.model.headerData(col + 1, QtCore.Qt.Horizontal)
            if following != text_header:
                self.model.insertColumn(col + 1)
                self.model.setHeaderData(col + 1, QtCore.Qt.Horizontal,
                                         text_header)
Exemple #4
0
 def __init__(self, url):
     """Request *url* and submit its solved captcha until ``self.link`` is set.

     Each round downloads the page, locates the captcha image through
     ``CaptchaParser``, decodes it with ``Tesseract`` (GIF images only),
     posts the answer back to *url* and feeds the response to this parser.
     The loop repeats, without delay, for as long as ``self.link`` is falsy
     (presumably it is assigned by this parser's tag handlers -- confirm).
     """
     HTMLParser.__init__(self)
     self.link = None
     self.located = False
     while not self.link:
         page = CaptchaParser(URLOpen().open(url).read())
         if not page.captcha:
             continue

         image = URLOpen().open(page.captcha)
         if image.info()["Content-Type"] != "image/gif":
             continue

         self.tess = Tesseract(image.read())
         captcha = self.get_captcha()
         if not captcha:
             continue

         opener = URLOpen()
         form = urllib.urlencode([(CAPTCHACODE, page.captchacode),
                                  (MEGAVAR, page.megavar),
                                  ("captcha", captcha)])
         response = opener.open(url, form)
         self.reset()
         self.feed(response.read())
         self.close()
         logger.info("Captcha %s: %s" % (page.captcha, captcha))
Exemple #5
0
	def build_candidates(self, characters4_pixels_list, uncertain_pixels):
		"""Yield OCR word candidates for every assignment of uncertain groups.

		For each character set in `characters4_pixels_list`, every subset of
		`uncertain_pixels` (all sizes, via `combinations_no_repetition`) is
		tried: each chosen group is merged into the pair of characters that
		`get_pair_inclussion` returns for it, the characters are rotated,
		joined and smoothed, and the resulting image is read by Tesseract.
		Only readings for which `filter_word` returns a truthy value are
		yielded.
		"""
		def x_of(pixel_group):
			# First component of the group's center of mass.
			return center_of_mass(pixel_group)[0]

		for position, base_characters in enumerate(characters4_pixels_list, 1):
			logging.debug("Generating words (%d) %d/%d" % (2**len(uncertain_pixels), position, len(characters4_pixels_list)))
			for subset_size in range(len(uncertain_pixels) + 1):
				for chosen in combinations_no_repetition(uncertain_pixels, subset_size):
					# Work on copies so each subset starts from the same characters.
					trial = [character.copy() for character in base_characters]
					for blob in chosen:
						found = get_pair_inclussion(trial, x_of(blob), pred=x_of)
						if found:
							first, second = found
							first.update(blob)
							second.update(blob)

					rotated = [self.rotate_character(group, idx) for idx, group in enumerate(trial)]
					clean_image = smooth(join_images_horizontal(rotated), 0)

					reader = Tesseract(self.data, lambda x: clean_image)
					word = filter_word(reader.get_captcha().strip())
					if word:
						yield word
Exemple #6
0
        """"""
        if tag == "form":
            if ((len(attrs) == 3) and (attrs[2][1] == "formDownload")):
                self.form_action = attrs[0][1]


if __name__ == "__main__":
    # Install a global opener that carries cookies across the requests below.
    cookie_handler = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
    urllib2.install_opener(urllib2.build_opener(cookie_handler))

    # Request the download page first; its response is not used.
    download_request = urllib2.Request(
        "http://www.gigasize.com/get.php/3196987695/p3x03sp.avi")
    urllib2.urlopen(download_request)

    # Download the captcha image and decode it.
    image_request = urllib2.Request("http://www.gigasize.com/randomImage.php")
    image_bytes = urllib2.urlopen(image_request).read()
    tes = Tesseract(image_bytes, True)
    captcha = tes.get_captcha(3)

    # Submit the login form with the decoded captcha.
    form_fields = {
        "txtNumber": captcha,
        "btnLogin.x": "124",
        "btnLogin.y": "12",
        "btnLogin": "******"
    }
    data = urllib.urlencode(form_fields)
    form_request = urllib2.Request("http://www.gigasize.com/formdownload.php")
    handle = urllib2.urlopen(form_request, data)
    f = FormParser(handle.read())
    handle.close()
    if f.form_action:
        timer = 60
from pathlib import Path
from subprocess import PIPE, CalledProcessError, Popen, check_output
from tempfile import TemporaryDirectory

from cv2 import cv2
from pdf2image import convert_from_path

from tesseract import Tesseract, PageSegMode

# Names exported by ``from <this module> import *``.
__all__ = [
    "pdf_to_text",
    "ocr_to_text",
    "get_page_count",
]

# Module-level Tesseract instance, constructed once at import time.
TESS = Tesseract()


def pdf_to_text(pdf_path: str, target_dir: str) -> None:
    """
    Convert pdf at `pdf_path` to a txt file in `target_dir` using XpdfReader's pdftotext.

    The output file is named after the stem of `pdf_path` with a ``.txt``
    suffix.  ``pdftotext`` is run with ``-layout``; its stdout/stderr are
    captured and discarded, and its exit status is not checked.

    Raises:
        OSError: if the ``pdftotext`` executable cannot be started.
    """
    output_path = Path(target_dir) / f"{Path(pdf_path).stem}.txt"
    command = [
        "pdftotext",
        "-layout",
        pdf_path,
        str(output_path),
    ]
    # communicate() drains both pipes while waiting.  A bare wait() with
    # stdout=PIPE/stderr=PIPE can deadlock once the child fills a pipe buffer.
    with Popen(command, stdout=PIPE, stderr=PIPE) as proc:
        proc.communicate()
Exemple #8
0
 def test_tesseract_exists(self):
     """Assert that ``Tesseract().run()`` reports a return code of 0."""
     completed = Tesseract().run()
     self.assertEqual(completed.returncode, 0)