コード例 #1
0
ファイル: ocr.py プロジェクト: jbaehne/aleph
    def extract_text(self, data, languages=None):
        """Return the text recognized in ``data``, consulting the kv cache.

        The cache key is built from the SHA-1 hex digest of ``data``. On a
        cache hit the stored value is decoded as UTF-8 and returned.
        Otherwise ``data`` is passed through ``self.ensure_size`` and sent
        to the ``RecognizeTextStub`` service, with up to 1000 attempts when
        ``self.Error`` is raised. A successful result is cached as UTF-8
        bytes before being returned.

        Returns ``None`` when ``ensure_size`` yields ``None`` or when every
        attempt fails.
        """
        cache_key = make_key('ocr', sha1(data).hexdigest())
        cached = kv.get(cache_key)
        if cached is not None:
            return cached.decode('utf-8')

        data = self.ensure_size(data)
        if data is None:
            return None

        for attempt in range(1000):
            try:
                stub = RecognizeTextStub(self.channel)
                languages = ensure_list(languages)
                request = Image(data=data, languages=languages)
                recognized = stub.Recognize(request).text or ''
                log.info('OCR: %s chars', len(recognized))
                kv.set(cache_key, recognized.encode('utf-8'))
                return recognized
            except self.Error as exc:
                # RESOURCE_EXHAUSTED goes straight to the next attempt;
                # any other status is logged, backed off and followed by
                # a channel reset before retrying.
                if exc.code() != self.Status.RESOURCE_EXHAUSTED:
                    log.warning("gRPC [%s]: %s", exc.code(), exc.details())
                    backoff(failures=attempt)
                    self.reset_channel()
        return None
コード例 #2
0
    def extract_text(self, data, languages=None):
        """Return the text recognized in ``data``, consulting the kv cache.

        Data whose length is not strictly between ``MIN_SIZE`` and
        ``MAX_SIZE`` is rejected up front. The cache key is built from the
        SHA-1 hex digest of ``data``; a cached value is decoded as UTF-8
        and returned. Otherwise the ``RecognizeTextStub`` service is
        called once per attempt yielded by ``service_retries()``.

        Returns the recognized text, or ``None`` when the size is out of
        range, a non-temporary gRPC error occurs, or all retries fail.
        """
        if not MIN_SIZE < len(data) < MAX_SIZE:
            log.info('OCR: file size out of range (%d)', len(data))
            return None

        key = make_key('ocr', sha1(data).hexdigest())
        # One GET both tests for and fetches the entry. A separate
        # EXISTS check costs an extra round trip and would return None
        # without running OCR if the key vanished between the two calls.
        cached = kv.get(key)
        if cached is not None:
            text = cached.decode('utf-8')
            log.info('OCR: %s chars cached', len(text))
            return text

        # Loop-invariant: normalise the language argument once.
        languages = ensure_list(languages)
        for attempt in service_retries():
            try:
                service = RecognizeTextStub(self.channel)
                image = Image(data=data, languages=languages)
                response = service.Recognize(image)
                text = response.text
                if text is not None:
                    log.info('OCR: %s chars (from %s bytes)', len(text),
                             len(data))
                    # Only cache real results; never store a None value.
                    kv.set(key, text)
                return text
            except self.Error as e:
                # Log every failure, including the ones that end the
                # attempt loop, so permanent errors are not silent.
                log.warning("gRPC [%s]: %s", e.code(), e.details())
                if e.code() not in self.TEMPORARY_ERRORS:
                    return None
                self.reset_channel()
                backoff(failures=attempt)
        return None
コード例 #3
0
ファイル: names.py プロジェクト: we1l1n/aleph
def name_frequency(name):
    """Compute and return a frequency-based score for ``name``.

    Reads the value at ``TOTAL_KEY`` (falling back to 1), looks up one
    count per token of ``name`` in the ``TOKEN_KEY`` hash (missing
    entries count as 1), then looks those counts up in the ``DIST_KEY``
    hash (missing entries count as 0). Each looked-up value is divided by
    the total, and the score is ``1`` minus the sum of those fractions.

    Returns:
        The score as a float.
    """
    total = float(kv.get(TOTAL_KEY) or 1)
    tokens = name_tokens(name)
    counts = kv.hmget(TOKEN_KEY, tokens)
    counts = [int(c or 1) for c in counts]
    dists = kv.hmget(DIST_KEY, counts)
    dists = [int(d or 0) / total for d in dists]
    score = 1 - sum(dists)
    # TODO: maybe we can normalise this over the number of
    # characters in the string such that it biases towards
    # longer names with rare name parts.
    # NOTE(review): diagnostic output kept as-is; consider moving it to
    # the module logger.
    print(tokens, counts, dists, score)
    # The score used to be computed and then discarded; hand it back to
    # the caller.
    return score
コード例 #4
0
ファイル: ocr.py プロジェクト: jbaehne/aleph
    def extract_text(self, data, languages=None):
        """Return the text detected in ``data``, consulting the kv cache.

        The cache key is built from the SHA-1 hex digest of ``data``. On a
        cache miss ``data`` is passed through ``self.ensure_size`` and, if
        that yields a value, sent to
        ``self.client.document_text_detection``; the full-text annotation
        is cached and returned.

        ``languages`` is accepted but not used by this implementation.

        Returns the text, or ``None`` when ``ensure_size`` yields ``None``.
        """
        key = make_key('ocr', sha1(data).hexdigest())
        cached = kv.get(key)
        if cached is not None:
            # Decode stored bytes so a cache hit returns text, just like a
            # fresh recognition does, and the logged length counts
            # characters. Already-decoded values are passed through.
            if isinstance(cached, bytes):
                text = cached.decode('utf-8')
            else:
                text = cached
            log.info('Vision API: %s chars cached', len(text))
            return text

        data = self.ensure_size(data)
        if data is None:
            return None

        image = types.Image(content=data)
        res = self.client.document_text_detection(image)
        ann = res.full_text_annotation
        log.info('Vision API: %s chars recognized', len(ann.text))
        kv.set(key, ann.text)
        return ann.text
コード例 #5
0
def load_places():
    """Load place names from the geonames file into the kv store.

    Does nothing when ``PLACE_KEY`` already holds a truthy value or when
    ``settings.TESTING`` is set. Otherwise each row of the tab-separated
    file at ``settings.GEONAMES_DATA`` with a non-empty column 8 is
    processed: columns 1 and 2 plus the comma-separated entries of column
    3 are passed through ``tag_key`` and, for every non-``None`` result,
    the lower-cased, stripped column 8 value is pushed onto the list at
    ``place_key(name)``.

    All commands are queued on a non-transactional pipeline and sent with
    a single ``execute`` call, together with ``PLACE_KEY`` set to the
    number of pushes.
    """
    if kv.get(PLACE_KEY) or settings.TESTING:
        return

    pushed = 0
    queue = kv.pipeline(transaction=False)
    log.info("Loading geonames...")
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8') as source:
        for record in csv.reader(source, delimiter='\t'):
            country = record[8].lower().strip()
            if country:
                labels = set(record[3].split(','))
                labels.update((record[1], record[2]))
                for label in labels:
                    tag = tag_key(label)
                    if tag is None:
                        continue
                    pushed += 1
                    queue.lpush(place_key(tag), country)
    queue.set(PLACE_KEY, pushed)
    queue.execute()
    log.info("Loaded %s geonames.", pushed)
コード例 #6
0
ファイル: util.py プロジェクト: pudo/aleph
def load_places():
    """Fill the kv store with normalized place labels from geonames.

    Returns immediately when ``PLACE_KEY`` already holds a truthy value
    or ``settings.TESTING`` is set. Otherwise the tab-separated file at
    ``settings.GEONAMES_DATA`` is read row by row. Rows with an empty
    column 8 are skipped. For the remaining rows, columns 1 and 2 and
    the comma-separated entries of column 3 are run through
    ``normalize_label``; each non-``None`` label gets the lower-cased,
    stripped column 8 value pushed onto the list at ``place_key(label)``.

    Commands are buffered in a non-transactional pipeline that is
    executed once at the end, after ``PLACE_KEY`` is set to the number of
    pushes.
    """
    already_loaded = kv.get(PLACE_KEY) or settings.TESTING
    if already_loaded:
        return

    count = 0
    batch = kv.pipeline(transaction=False)
    log.debug("Loading geonames...")
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8') as handle:
        reader = csv.reader(handle, delimiter='\t')
        for fields in reader:
            country = fields[8].lower().strip()
            if len(country) == 0:
                continue
            candidates = set(fields[3].split(','))
            candidates.add(fields[1])
            candidates.add(fields[2])
            # Lazily normalize so each label is processed right before
            # its push is queued.
            normalized = (normalize_label(raw) for raw in candidates)
            for label in normalized:
                if label is not None:
                    count += 1
                    batch.lpush(place_key(label), country)
    batch.set(PLACE_KEY, count)
    batch.execute()
    log.debug("Loaded %s geonames.", count)
コード例 #7
0
    def extract_text(self, data, languages=None):
        """Return the text detected in ``data``, consulting the kv cache.

        Data whose length is not strictly between ``MIN_SIZE`` and
        ``MAX_SIZE`` is rejected up front. The cache key is built from the
        SHA-1 hex digest of ``data``; a cached value is decoded as UTF-8
        and returned. Otherwise the data is sent to
        ``self.client.document_text_detection`` and the full-text
        annotation is cached and returned.

        ``languages`` is accepted but not used by this implementation.

        Returns the text, or ``None`` when the size is out of range.
        """
        if not MIN_SIZE < len(data) < MAX_SIZE:
            log.info('OCR: file size out of range (%d)', len(data))
            return None

        key = make_key('ocr', sha1(data).hexdigest())
        # One GET both tests for and fetches the entry. A separate
        # EXISTS check costs an extra round trip and would return None
        # without running detection if the key vanished between the calls.
        cached = kv.get(key)
        if cached is not None:
            text = cached.decode('utf-8')
            log.info('Vision API: %s chars cached', len(text))
            return text

        image = types.Image(content=data)
        res = self.client.document_text_detection(image)
        ann = res.full_text_annotation
        log.info('Vision API: %s chars recognized', len(ann.text))
        kv.set(key, ann.text)
        return ann.text