Exemple #1
0
class CaptchaForm(HTMLParser):
	"""Submit a captcha form until the response page exposes a download link.

	The parser watches the response for a ``<div>`` whose second attribute
	value is ``"downloadlink"`` and then stores the ``href`` of the next
	``<a>`` tag that has one in ``self.link``.
	"""
	def __init__(self, url):
		"""Fetch *url*, OCR its captcha and post the answer until a link is found.

		NOTE(review): the loop has no attempt limit and no delay; it keeps
		requesting *url* until ``self.link`` becomes truthy.
		"""
		HTMLParser.__init__(self)
		self.link = None
		self.located = False
		while not self.link:
			p = CaptchaParser(URLOpen().open(url).read())
			if not p.captcha:
				continue
			handle = URLOpen().open(p.captcha)
			# Only GIF responses are handed to the OCR engine.
			if handle.info()["Content-Type"] != "image/gif":
				continue
			self.tess = Tesseract(handle.read())
			captcha = self.get_captcha()
			if not captcha:
				continue
			handle = URLOpen().open(url, urllib.urlencode([(CAPTCHACODE, p.captchacode), (MEGAVAR, p.megavar), ("captcha", captcha)]))
			# Re-use this parser instance for every response page.
			self.reset()
			self.feed(handle.read())
			self.close()
			logger.info("Captcha %s: %s" % (p.captcha, captcha))

	def handle_starttag(self, tag, attrs):
		"""Track the download ``<div>`` and capture the following anchor's href.

		*attrs* is the list of ``(name, value)`` pairs supplied by HTMLParser.
		Anchors without an ``href`` (including attribute-less ``<a>`` tags) are
		ignored instead of raising IndexError, and ``href`` is found wherever
		it appears in the attribute list.
		"""
		if tag == "a":
			if self.located:
				attributes = dict(attrs)
				if "href" in attributes:
					self.located = False
					self.link = attributes["href"]
		elif tag == "div":
			if ((len(attrs) > 1) and (attrs[1][1] == "downloadlink")):
				self.located = True

	def get_captcha(self):
		"""Return the OCR result when it is exactly four characters long, else None."""
		result = self.tess.get_captcha()
		if len(result) == 4:
			return result
		return None
Exemple #2
0
    def __init__(self, proxy=None):
        """Read retry settings from the config and prepare the Grab and OCR objects.

        proxy: optional value forwarded to ``Grab.load_proxylist`` as a
            'text_file' source of 'http' proxies; skipped when falsy.
        """
        config = startup.CFG
        # toInt() yields a pair; only the first element is kept
        # (presumably value and success flag - TODO confirm).
        self.PAGETRY, _ = config.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, _ = config.value('sleep', Avito.SLEEP).toInt()

        grabber = Grab()
        grabber.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            # Only when the defining module is run as a script.
            grabber.setup(log_dir='dump')
        if proxy:
            proxy_options = dict(auto_init=True, auto_change=False)
            grabber.load_proxylist(proxy, 'text_file', 'http', **proxy_options)
        self.g = grabber

        recognizer = Tesseract()
        recognizer.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)
        self.tess = recognizer
Exemple #3
0
    def __init__(self, proxy=None):
        """Read retry settings from the config and prepare the Grab and OCR objects.

        proxy: optional value forwarded to ``Grab.load_proxylist`` as a
            "text_file" source of "http" proxies; skipped when falsy.
        """
        config = startup.CFG
        # toInt() yields a pair; only the first element is kept
        # (presumably value and success flag - TODO confirm).
        self.PAGETRY, _ = config.value("pagetry", Avito.PAGETRY).toInt()
        self.SLEEP, _ = config.value("sleep", Avito.SLEEP).toInt()

        grabber = Grab()
        grabber.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == "__main__":
            # Only when the defining module is run as a script.
            grabber.setup(log_dir="dump")
        if proxy:
            proxy_options = dict(auto_init=True, auto_change=False)
            grabber.load_proxylist(proxy, "text_file", "http", **proxy_options)
        self.g = grabber

        recognizer = Tesseract()
        recognizer.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)
        self.tess = recognizer
Exemple #4
0
 def __init__(self, url):
     """Fetch *url*, OCR its captcha and post the answer until a link is found.

     Repeats without limit while ``self.link`` is falsy; each round re-reads
     the page, and only GIF captcha images are passed to Tesseract.
     """
     HTMLParser.__init__(self)
     self.link = None
     self.located = False
     while not self.link:
         page = CaptchaParser(URLOpen().open(url).read())
         if not page.captcha:
             continue
         image = URLOpen().open(page.captcha)
         if handle_is_gif(image) if False else image.info()["Content-Type"] != "image/gif":
             continue
         self.tess = Tesseract(image.read())
         answer = self.get_captcha()
         if not answer:
             continue
         response = URLOpen().open(
             url,
             urllib.urlencode([(CAPTCHACODE, page.captchacode),
                               (MEGAVAR, page.megavar),
                               ("captcha", answer)]))
         # Parse the response page with this same parser instance.
         self.reset()
         self.feed(response.read())
         self.close()
         logger.info("Captcha %s: %s" % (page.captcha, answer))
    def ocr(self):
        """Run OCR over every rect/crop column and store the text beside it.

        First ensures each header named ``rect<N>`` or ``crop<N>`` is directly
        followed by a ``<header>_text`` column, inserting one when missing.
        Then, for every child of the model root, passes each non-None
        rect/crop cell to ``Tesseract().OCR`` and writes the stripped result
        into the matching ``_text`` cell. The progress bar is shown while the
        rows are processed and hidden afterwards.
        """
        horizontal = QtCore.Qt.Horizontal

        def is_rect(column):
            """Return True when *column* is 'rect' or 'crop' followed by an integer."""
            # Uses the header text it is given rather than re-reading the
            # model through the enclosing loop variable.
            if column[:4] not in ('rect', 'crop'):
                return False
            try:
                int(column[4:])
            except (TypeError, ValueError):
                return False
            return True

        # Walk right-to-left so inserting a column does not shift the
        # indices that are still to be visited.
        for c in reversed(range(self.model.columnCount())):
            column = self.model.headerData(c, horizontal)
            if not is_rect(column):
                continue

            column_text = column + '_text'
            if self.model.headerData(c + 1, horizontal) != column_text:
                self.model.insertColumn(c + 1)
                self.model.setHeaderData(c + 1, horizontal, column_text)

        children = self.model.root().children()
        child_count = len(children)
        column_count = self.model.columnCount()
        loop_count = child_count * column_count

        self.progress_bar.setMaximum(loop_count)
        self.progress_bar.show()

        for r, item in enumerate(children):
            for c in range(column_count):
                step = c + column_count * r
                # Progress trace on stdout, kept from the original.
                print(r, c, step, loop_count)

                self.progress_bar.setValue(step)
                # Keep the GUI responsive during the long-running loop.
                QtWidgets.QApplication.processEvents()

                column = self.model.headerData(c, horizontal)
                if not is_rect(column):
                    continue

                rect = item.data(column)
                if rect is None:
                    continue

                # Two-argument data() presumably stores the value - TODO confirm.
                item.data(column + '_text', Tesseract().OCR(rect).strip())

        self.progress_bar.hide()
Exemple #6
0
	def build_candidates(self, characters4_pixels_list, uncertain_pixels):
		"""Yield OCR word candidates for every way of attaching uncertain groups.

		For each entry of *characters4_pixels_list* and each combination of
		*uncertain_pixels* (all sizes from 0 up to all of them), the chosen
		groups are merged into the pair of characters returned by
		``get_pair_inclussion``; the characters are then rotated, joined,
		smoothed and passed to Tesseract. Only texts that survive
		``filter_word`` are yielded.
		"""
		def center_x(pixel_group):
			# First component of the centre of mass.
			return center_of_mass(pixel_group)[0]

		for position, base_characters in enumerate(characters4_pixels_list):
			logging.debug("Generating words (%d) %d/%d" % (2**len(uncertain_pixels), position+1, len(characters4_pixels_list)))
			for size in range(len(uncertain_pixels)+1):
				for chosen in combinations_no_repetition(uncertain_pixels, size):
					# Work on copies so each combination starts from the same characters.
					candidate = [character.copy() for character in base_characters]
					for group in chosen:
						neighbours = get_pair_inclussion(candidate, center_x(group), pred=center_x)
						if neighbours:
							first, second = neighbours
							first.update(group)
							second.update(group)

					rotated = [self.rotate_character(character, number) for number, character in enumerate(candidate)]
					clean_image = smooth(join_images_horizontal(rotated), 0)

					# The callable hands Tesseract the already-prepared image.
					recognizer = Tesseract(self.data, lambda x: clean_image)
					word = filter_word(recognizer.get_captcha().strip())
					if word:
						yield word
Exemple #7
0
	def __init__(self, url):
		"""Fetch *url*, OCR its captcha and post the answer until a link is found.

		Repeats without limit while ``self.link`` is falsy; each round re-reads
		the page, and only GIF captcha images are passed to Tesseract.
		"""
		HTMLParser.__init__(self)
		self.link = None
		self.located = False
		while not self.link:
			page = CaptchaParser(URLOpen().open(url).read())
			if not page.captcha:
				continue
			image = URLOpen().open(page.captcha)
			if image.info()["Content-Type"] != "image/gif":
				continue
			self.tess = Tesseract(image.read())
			answer = self.get_captcha()
			if not answer:
				continue
			response = URLOpen().open(url, urllib.urlencode([(CAPTCHACODE, page.captchacode), (MEGAVAR, page.megavar), ("captcha", answer)]))
			# Parse the response page with this same parser instance.
			self.reset()
			self.feed(response.read())
			self.close()
			logger.info("Captcha %s: %s" % (page.captcha, answer))
Exemple #8
0
    def ocr(self):
        """OCR the 'Rect<i>' cells of every row and add 'Text<i>' columns.

        For each child of the model root, cells 'Rect0', 'Rect1', ... (at most
        1000) are passed to ``Tesseract().OCR`` until one is None; the stripped
        result goes to the matching 'Text<i>' cell. Afterwards every header
        containing 'Rect' gets a following 'Text' header unless it is already
        there.
        """
        for row in self.model.root().children():
            for index in range(1000):
                suffix = str(index)
                region = row.data('Rect' + suffix)
                if region is None:
                    break
                row.data('Text' + suffix, Tesseract().OCR(region).strip())

        horizontal = QtCore.Qt.Horizontal
        # Right-to-left so inserted columns do not shift pending indices.
        for position in reversed(range(self.model.columnCount())):
            header = self.model.headerData(position, horizontal)
            if 'Rect' not in header:
                continue

            wanted = header.replace('Rect', 'Text')
            following = position + 1
            if self.model.headerData(following, horizontal) != wanted:
                self.model.insertColumn(following)
                self.model.setHeaderData(following, horizontal, wanted)
Exemple #9
0
class CaptchaForm(HTMLParser):
    """Submit a captcha form until the response page exposes a download link.

    The parser watches the response for a ``<div>`` whose second attribute
    value is ``"downloadlink"`` and then stores the ``href`` of the next
    ``<a>`` tag that has one in ``self.link``.
    """
    def __init__(self, url):
        """Fetch *url*, OCR its captcha and post the answer until a link is found.

        NOTE(review): the loop has no attempt limit and no delay; it keeps
        requesting *url* until ``self.link`` becomes truthy.
        """
        HTMLParser.__init__(self)
        self.link = None
        self.located = False
        while not self.link:
            p = CaptchaParser(URLOpen().open(url).read())
            if not p.captcha:
                continue
            handle = URLOpen().open(p.captcha)
            # Only GIF responses are handed to the OCR engine.
            if handle.info()["Content-Type"] != "image/gif":
                continue
            self.tess = Tesseract(handle.read())
            captcha = self.get_captcha()
            if not captcha:
                continue
            handle = URLOpen().open(
                url,
                urllib.urlencode([(CAPTCHACODE, p.captchacode),
                                  (MEGAVAR, p.megavar),
                                  ("captcha", captcha)]))
            # Re-use this parser instance for every response page.
            self.reset()
            self.feed(handle.read())
            self.close()
            logger.info("Captcha %s: %s" % (p.captcha, captcha))

    def handle_starttag(self, tag, attrs):
        """Track the download ``<div>`` and capture the following anchor's href.

        *attrs* is the list of ``(name, value)`` pairs supplied by HTMLParser.
        Anchors without an ``href`` (including attribute-less ``<a>`` tags) are
        ignored instead of raising IndexError, and ``href`` is found wherever
        it appears in the attribute list.
        """
        if tag == "a":
            if self.located:
                attributes = dict(attrs)
                if "href" in attributes:
                    self.located = False
                    self.link = attributes["href"]
        elif tag == "div":
            if ((len(attrs) > 1) and (attrs[1][1] == "downloadlink")):
                self.located = True

    def get_captcha(self):
        """Return the OCR result when it is exactly four characters long, else None."""
        result = self.tess.get_captcha()
        if len(result) == 4:
            return result
        return None
Exemple #10
0
        """"""
        if tag == "form":
            if ((len(attrs) == 3) and (attrs[2][1] == "formDownload")):
                self.form_action = attrs[0][1]


# Manual script: walks through the gigasize.com captcha form by hand.
if __name__ == "__main__":
    # Install a global opener with a cookie jar so cookies set by one request
    # are sent with the following ones.
    urllib2.install_opener(
        urllib2.build_opener(urllib2.HTTPCookieProcessor(
            cookielib.CookieJar())))
    # Request the file page first; the response itself is discarded.
    urllib2.urlopen(
        urllib2.Request(
            "http://www.gigasize.com/get.php/3196987695/p3x03sp.avi"))

    # Download the captcha image and hand its bytes to Tesseract.
    # NOTE(review): the meaning of the second argument (True) and of the 3
    # passed to get_captcha is not visible here - confirm against Tesseract.
    tes = Tesseract(
        urllib2.urlopen(
            urllib2.Request("http://www.gigasize.com/randomImage.php")).read(),
        True)
    captcha = tes.get_captcha(3)

    # Form fields posted together with the recognised captcha text.
    data = urllib.urlencode({
        "txtNumber": captcha,
        "btnLogin.x": "124",
        "btnLogin.y": "12",
        "btnLogin": "******"
    })
    handle = urllib2.urlopen(
        urllib2.Request("http://www.gigasize.com/formdownload.php"), data)
    # FormParser exposes the parsed form target as form_action.
    f = FormParser(handle.read())
    handle.close()
    if f.form_action:
        timer = 60
from pathlib import Path
from subprocess import PIPE, CalledProcessError, Popen, check_output
from tempfile import TemporaryDirectory

from cv2 import cv2
from pdf2image import convert_from_path

from tesseract import Tesseract, PageSegMode

__all__ = [
    "pdf_to_text",
    "ocr_to_text",
    "get_page_count",
]

TESS = Tesseract()


def pdf_to_text(pdf_path: str, target_dir: str):
    """
    Convert pdf at `pdf_path` to a txt file in `target_dir` using XpdfReader's pdftotext.

    The output file is named after the stem of `pdf_path` with a `.txt`
    suffix, and `pdftotext` is run with the `-layout` option. The function
    blocks until the process exits and returns None; the exit status and the
    captured output are not inspected.
    """
    output_path = Path(target_dir) / f"{Path(pdf_path).stem}.txt"
    command = [
        "pdftotext",
        "-layout",
        pdf_path,
        str(output_path),
    ]
    with Popen(command, stdout=PIPE, stderr=PIPE) as proc:
        # communicate() drains both pipes while waiting; a bare wait() can
        # deadlock once the child fills one of the pipe buffers.
        proc.communicate()
Exemple #12
0
class Avito():
    """Scrape Avito listing pages with Grab and OCR the phone-number image.

    PAGETRY is the default number of attempts per request and SLEEP the
    default argument for ``QtCore.QThread.sleep`` between attempts; both are
    replaced per instance by the 'pagetry' / 'sleep' config values.
    """
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        """Read retry settings from the config and prepare Grab and Tesseract.

        proxy: optional value forwarded to ``Grab.load_proxylist`` as a
            'text_file' source of 'http' proxies; skipped when falsy.
        """
        # toInt() yields a pair; only the first element is kept.
        self.PAGETRY, _ = startup.CFG.value('pagetry', Avito.PAGETRY).toInt()
        self.SLEEP, _ = startup.CFG.value('sleep', Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == '__main__':
            self.g.setup(log_dir='dump')
        if proxy:
            self.g.load_proxylist(proxy,
                                  'text_file',
                                  'http',
                                  auto_init=True,
                                  auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    @staticmethod
    def _photo_url(url):
        """Return *url* with 'http:' prepended unless it already has an http(s) scheme."""
        if url.startswith(('http:', 'https:')):
            return url
        return 'http:' + url

    def _go3(self, url, tag):
        """Open *url* with the shared Grab object, retrying up to PAGETRY times.

        Each attempt switches proxy, loads the page and checks for the
        expected marker substring. *tag* labels the log lines and the error.
        Raises Exception('<tag> error') when every attempt fails.
        """
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                # KeyboardInterrupt/SystemExit are deliberately not caught so
                # the retry loop can be interrupted.
                log.exception('%s left %i', tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('%s error' % tag)

    def get_links(self, url):
        """Yield absolute item URLs from the listing at *url*, following 'next' links.

        Stops after 998 links (the counter starts at 999 and is decremented
        before each yield) or when a page has no 'next' link. Raises
        Exception('no links') when a page contains no item links.
        """
        self._go3(url, 'start page')
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception('no links')
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next_link = self.g.doc.select('//a[@class="next"]/@href')
            if not next_link:
                log.debug('last page?')
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next_link.text())
            log.debug('open next page %s', nurl)
            self._go3(nurl, 'next page')

    def get_photos(self, photos):
        """Download every URL in *photos* and return the response bodies as a list.

        URLs without an http(s) scheme get 'http:' prepended. Each download is
        attempted up to PAGETRY times and must contain the bytes 'JFIF'.
        Raises Exception('get photo error') when a photo cannot be fetched.
        """
        g = self.g.clone()
        datas = []
        for url in photos:
            url = self._photo_url(url)
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring('JFIF', byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception('get_photos left %i', c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception('get photo error')
        return datas

    def get_item(self, url):
        """Load the item page at *url* and return its fields as a dict.

        The result has the keys item, title, photos, price, name, town, desc
        and phone. The phone number is read by OCR from an image whose URL is
        computed by evaluating the page's phoneDemixer JavaScript; it stays ''
        when every attempt to read it fails. Raises Exception('get item error')
        when the page cannot be loaded in PAGETRY attempts.
        """
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(
                    u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except Exception:
                log.exception('get_item left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception('get item error')
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [
            s.text()
            for s in gallery.select('.//a[@class="gallery-link"]/@href')
        ]
        if not photos:
            # No gallery links: fall back to the zoom-gallery image, if any.
            zoom_img = doc.select(
                '//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if zoom_img:
                photos.append(zoom_img.text())
        item = doc.select('//div[@itemprop="offers"]')
        #price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select(
                './/span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning('xpath town not found, try another way')
            town = item.select(
                './/div[@id="map"]/span[@itemprop="name"]').text()
        #desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select(
            "//div[contains(@class,\"description-text\")]").text()
        #<span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        # Globals made visible to the JavaScript evaluated below.
        loc = {'item_phone': _phone, 'item': {'id': item_id, 'url': _url}}
        log.debug('jslock enter <--')
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval('''function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}''')

            #http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            phone_path = ctx.eval(
                "'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            #egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug('js rc %s', phone_path)
            ctx.leave()
        log.debug('jslock leave -->')
        phone = ''
        c = self.PAGETRY
        while c:
            log.debug('read phone image')
            try:
                rc = g.go(urljoin(url, phone_path))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img,
                                             basewidth=300,
                                             whitelist='0123456789-')
                break
            except Exception:
                g.change_proxy()
                log.exception('get_phone left %i', c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            # Best effort: a missing phone does not fail the whole item.
            log.debug('get phone error')

        return dict(item=item_id,
                    title=title,
                    photos=photos,
                    price=price,
                    name=name,
                    town=town,
                    desc=desc,
                    phone=phone)
Exemple #13
0
		self.form_action = None
		self.feed(data)
		self.close()
		print self.form_action

	def handle_starttag(self, tag, attrs):
		"""Store the first attribute's value of the download form in ``form_action``.

		Only ``<form>`` tags with exactly three attributes whose third value is
		"formDownload" are considered.
		"""
		if tag != "form" or len(attrs) != 3:
			return
		first, _, third = attrs
		if third[1] == "formDownload":
			self.form_action = first[1]

if __name__ == "__main__":
	urllib2.install_opener(urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar())))
	urllib2.urlopen(urllib2.Request("http://www.gigasize.com/get.php/3196987695/p3x03sp.avi"))

	tes = Tesseract(urllib2.urlopen(urllib2.Request("http://www.gigasize.com/randomImage.php")).read(), True)
	captcha = tes.get_captcha(3)

	data = urllib.urlencode({"txtNumber": captcha, "btnLogin.x": "124", "btnLogin.y": "12", "btnLogin": "******"})
	handle = urllib2.urlopen(urllib2.Request("http://www.gigasize.com/formdownload.php"), data)
	f = FormParser(handle.read())
	handle.close()
	if f.form_action:
		timer = 60
		while timer > 0:
			time.sleep(1)
			timer -= 1
			print timer
		data = urllib.urlencode({"dlb": "Download"})
		handle = urllib2.urlopen(urllib2.Request("http://www.gigasize.com" + f.form_action), data)
		while len(data) > 0:
Exemple #14
0
@app.route('/services/1/ocr', methods=['POST'])
def extract_text_from_image():
    """Run OCR on the uploaded 'image' file and return the text as JSON.

    Expects a multipart POST with an 'image' file and an 'image_id' form
    field. Returns the matching JSON_* payload when a field is missing, and,
    when MASK_ERRORS is set, when the upload cannot be opened as an image or
    OCR fails. With MASK_ERRORS unset the underlying exception is re-raised
    with its original traceback.
    """
    # Check for the presence of the image
    if 'image' not in request.files:
        return jsonify(JSON_NOIMAGE)

    if 'image_id' not in request.form:
        return jsonify(JSON_NOIMAGEID)

    # Extract the image
    upload = request.files['image']

    try:
        image = Image.open(upload)
    except Exception:
        if MASK_ERRORS:
            return jsonify(JSON_NONEIMAGE)
        raise

    image_id = request.form['image_id']

    tesseract = Tesseract(image_id, TESSERACT_EXE, TESSERACT_SCRATCH, TESSERACT_CLEANUP)

    try:
        text = tesseract.image_to_string(image).encode('utf-8')
    except Exception:
        if MASK_ERRORS:
            return jsonify(JSON_OCRFAILED)
        raise

    # Merge explicitly: newer Flask rejects jsonify() calls that mix
    # positional and keyword arguments. Assumes JSON_SUCCESS is a mapping,
    # as the other JSON_* payloads appear to be - TODO confirm.
    return jsonify(dict(JSON_SUCCESS, text=text))
Exemple #15
0
class Avito:
    """Scrape Avito listing pages with Grab and OCR the phone-number image.

    PAGETRY is the default number of attempts per request and SLEEP the
    default argument for ``QtCore.QThread.sleep`` between attempts; both are
    replaced per instance by the "pagetry" / "sleep" config values.
    """
    PAGETRY = 3
    SLEEP = 1

    def __init__(self, proxy=None):
        """Read retry settings from the config and prepare Grab and Tesseract.

        proxy: optional value forwarded to ``Grab.load_proxylist`` as a
            "text_file" source of "http" proxies; skipped when falsy.
        """
        # toInt() yields a pair; only the first element is kept.
        self.PAGETRY, _ = startup.CFG.value("pagetry", Avito.PAGETRY).toInt()
        self.SLEEP, _ = startup.CFG.value("sleep", Avito.SLEEP).toInt()

        self.g = Grab()
        self.g.setup(hammer_mode=True, hammer_timeouts=((2, 5), (5, 10)))
        if __name__ == "__main__":
            self.g.setup(log_dir="dump")
        if proxy:
            self.g.load_proxylist(proxy, "text_file", "http", auto_init=True, auto_change=False)
        self.tess = Tesseract()
        self.tess.set_pagemode(TessPageSegMode.PSM_SINGLE_LINE)

    @staticmethod
    def _photo_url(url):
        """Return *url* with "http:" prepended unless it already has an http(s) scheme."""
        if url.startswith(("http:", "https:")):
            return url
        return "http:" + url

    def _go3(self, url, tag):
        """Open *url* with the shared Grab object, retrying up to PAGETRY times.

        Each attempt switches proxy, loads the page and checks for the
        expected marker substring. *tag* labels the log lines and the error.
        Raises Exception("<tag> error") when every attempt fails.
        """
        c = self.PAGETRY
        while c:
            try:
                self.g.change_proxy()
                self.g.go(url)
                self.g.assert_substring(u'content="51a6d66a02fb23c7"')
                break
            except Exception:
                # KeyboardInterrupt/SystemExit are deliberately not caught so
                # the retry loop can be interrupted.
                log.exception("%s left %i", tag, c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("%s error" % tag)

    def get_links(self, url):
        """Yield absolute item URLs from the listing at *url*, following "next" links.

        Stops after 998 links (the counter starts at 999 and is decremented
        before each yield) or when a page has no "next" link. Raises
        Exception("no links") when a page contains no item links.
        """
        self._go3(url, "start page")
        c = 999
        while True:
            links = self.g.doc.select('//h3[@class="title"]/a/@href')
            if not links:
                raise Exception("no links")
            for link in links:
                c -= 1
                if not c:
                    return
                yield urljoin(url, link.text())
            next_link = self.g.doc.select('//a[@class="next"]/@href')
            if not next_link:
                log.debug("last page?")
                break
            QtCore.QThread.sleep(self.SLEEP)
            nurl = urljoin(url, next_link.text())
            log.debug("open next page %s", nurl)
            self._go3(nurl, "next page")

    def get_photos(self, photos):
        """Download every URL in *photos* and return the response bodies as a list.

        URLs without an http(s) scheme get "http:" prepended. Each download is
        attempted up to PAGETRY times and must contain the bytes "JFIF".
        Raises Exception("get photo error") when a photo cannot be fetched.
        """
        g = self.g.clone()
        datas = []
        for url in photos:
            url = self._photo_url(url)
            c = self.PAGETRY
            while c:
                try:
                    rc = g.go(url)
                    g.assert_substring("JFIF", byte=True)
                    datas.append(rc.body)
                    break
                except Exception:
                    log.exception("get_photos left %i", c)
                    c -= 1
                    g.change_proxy()
                    QtCore.QThread.sleep(self.SLEEP)
            else:
                raise Exception("get photo error")
        return datas

    def get_item(self, url):
        """Load the item page at *url* and return its fields as a dict.

        The result has the keys item, title, photos, price, name, town, desc
        and phone. The phone number is read by OCR from an image whose URL is
        computed by evaluating the page's phoneDemixer JavaScript; it stays ""
        when every attempt to read it fails. Raises Exception("get item error")
        when the page cannot be loaded in PAGETRY attempts.
        """
        g = self.g.clone()
        c = self.PAGETRY
        while c:
            try:
                g.change_proxy()
                g.go(url)
                g.assert_substring(u'content="499bdc75d3636c55"')  # avitos ya id
                break
            except Exception:
                log.exception("get_item left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            raise Exception("get item error")
        doc = g.doc
        title = doc.select('//h1[@itemprop="name"]').text()
        gallery = doc.select('//div[@class="gallery-item"]')
        photos = [s.text() for s in gallery.select('.//a[@class="gallery-link"]/@href')]
        if not photos:
            # No gallery links: fall back to the zoom-gallery image, if any.
            zoom_img = doc.select('//img[contains(@class,"j-zoom-gallery-init")]/@src')
            if zoom_img:
                photos.append(zoom_img.text())
        item = doc.select('//div[@itemprop="offers"]')
        # price = item.select('.//strong[@itemprop="price"]').text()
        price = item.select('.//span[@itemprop="price"]').text()
        name = item.select('.//strong[@itemprop="name"]').text()

        try:
            town = item.select('.//span[@id="toggle_map"]/span[@itemprop="name"]').text()
        except Exception:
            log.warning("xpath town not found, try another way")
            town = item.select('.//div[@id="map"]/span[@itemprop="name"]').text()
        # desc = item.select('.//div[@id="desc_text"]').text()
        desc = doc.select('//div[contains(@class,"description-text")]').text()
        # <span id="item_id">348967351</span>
        item_id = doc.select('//span[@id="item_id"]').text()
        _url = g.rex_text("avito.item.url = '([^>]+?)'")
        _phone = g.rex_text("avito.item.phone = '([^>]+?)'")

        # Globals made visible to the JavaScript evaluated below.
        loc = {"item_phone": _phone, "item": {"id": item_id, "url": _url}}
        log.debug("jslock enter <--")
        with PyV8.JSContext(loc) as ctx:
            ctx.enter()
            ctx.eval(
                """function phoneDemixer(key){var
    pre=key.match(/[0-9a-f]+/g),mixed=(item.id%2===0?pre.reverse():pre).join(''),s=mixed.length,r='',k;for(k=0;k<s;++k){if(k%3===0){r+=mixed.substring(k,k+1);}}
    return r;}"""
            )

            # http://www.avito.ru/items/phone/348967351?pkey=6fec07a9bb9902ad4ccbd87c240cfdc4
            phone_path = ctx.eval("'/items/phone/'+item.id+'?pkey='+phoneDemixer(item_phone)")
            # egg = ctx.eval("'/items/phone/'+item.url+'?pkey='+phoneDemixer(item_phone)")
            log.debug("js rc %s", phone_path)
            ctx.leave()
        log.debug("jslock leave -->")
        phone = ""
        c = self.PAGETRY
        while c:
            log.debug("read phone image")
            try:
                rc = g.go(urljoin(url, phone_path))
                img = Image.open(StringIO(rc.body))
                phone = self.tess.from_image(img, basewidth=300, whitelist="0123456789-")
                break
            except Exception:
                g.change_proxy()
                log.exception("get_phone left %i", c)
                c -= 1
                QtCore.QThread.sleep(self.SLEEP)
        else:
            # Best effort: a missing phone does not fail the whole item.
            log.debug("get phone error")

        return dict(item=item_id, title=title, photos=photos, price=price, name=name, town=town, desc=desc, phone=phone)
Exemple #16
0
 def test_tesseract_exists(self):
     """Check that running a default Tesseract instance reports return code 0."""
     completed = Tesseract().run()
     exit_status = completed.returncode
     self.assertEqual(exit_status, 0)