def extract_text(self, data, languages=None):
    """Recognize text in ``data`` via the gRPC service, using the kv cache.

    The cache key is derived from the SHA-1 digest of the original ``data``.
    On a cache hit the stored value is decoded as UTF-8 and returned.
    Otherwise ``data`` is passed through ``self.ensure_size`` and sent to the
    recognition service, retrying up to 1000 times on ``self.Error``.

    :param data: raw image bytes.
    :param languages: language hint(s), normalised with ``ensure_list``.
    :return: the recognized text (``''`` when the service returns none), or
        ``None`` if ``ensure_size`` rejects the data or every attempt fails.
    """
    cache_key = make_key('ocr', sha1(data).hexdigest())
    cached = kv.get(cache_key)
    if cached is not None:
        return cached.decode('utf-8')

    data = self.ensure_size(data)
    if data is None:
        return None

    for attempt in range(1000):
        try:
            # The stub is rebuilt on every attempt because the channel may
            # have been reset by a previous failure.
            stub = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            request = Image(data=data, languages=languages)
            reply = stub.Recognize(request)
            recognized = reply.text or ''
            log.info('OCR: %s chars', len(recognized))
            kv.set(cache_key, recognized.encode('utf-8'))
            return recognized
        except self.Error as exc:
            exhausted = exc.code() == self.Status.RESOURCE_EXHAUSTED
            if not exhausted:
                # Any other failure: log it, wait, and start a new channel.
                log.warning("gRPC [%s]: %s", exc.code(), exc.details())
                backoff(failures=attempt)
                self.reset_channel()
            # RESOURCE_EXHAUSTED is retried right away, with no backoff and
            # no channel reset.
    return None
def extract_text(self, data, languages=None):
    """Recognize text in ``data`` via the gRPC service, using the kv cache.

    Data whose length is not strictly between ``MIN_SIZE`` and ``MAX_SIZE``
    is rejected. If the cache key exists, the stored value is returned
    (decoded as UTF-8) without calling the service. Otherwise the service is
    called once per item yielded by ``service_retries()``.

    :param data: raw image bytes.
    :param languages: language hint(s), normalised with ``ensure_list``.
    :return: the recognized or cached text, or ``None`` when the size is out
        of range, a non-temporary gRPC error occurs, or retries run out.
    """
    size = len(data)
    if size <= MIN_SIZE or size >= MAX_SIZE:
        log.info('OCR: file size out of range (%d)', size)
        return None

    cache_key = make_key('ocr', sha1(data).hexdigest())
    if kv.exists(cache_key):
        cached = kv.get(cache_key)
        if cached is None:
            # Key reported as existing but no value came back.
            return cached
        cached = cached.decode('utf-8')
        log.info('OCR: %s chars cached', len(cached))
        return cached

    for attempt in service_retries():
        try:
            stub = RecognizeTextStub(self.channel)
            languages = ensure_list(languages)
            request = Image(data=data, languages=languages)
            recognized = stub.Recognize(request).text
            if recognized is not None:
                log.info('OCR: %s chars (from %s bytes)',
                         len(recognized), len(data))
            kv.set(cache_key, recognized)
            return recognized
        except self.Error as exc:
            # Only errors listed in TEMPORARY_ERRORS are worth retrying.
            if exc.code() not in self.TEMPORARY_ERRORS:
                return None
            self.reset_channel()
            log.warning("gRPC [%s]: %s", exc.code(), exc.details())
            backoff(failures=attempt)
    return None
def name_frequency(name):
    """Compute a score for ``name`` from stored per-token statistics.

    Steps, as implemented:

    1. read ``TOTAL_KEY`` from the kv store (defaults to 1 when unset);
    2. split ``name`` with ``name_tokens`` and fetch one value per token
       from the ``TOKEN_KEY`` hash (missing entries count as 1);
    3. use those counts as fields of the ``DIST_KEY`` hash (missing entries
       count as 0) and divide each value by the total;
    4. return ``1 - sum(...)`` of those fractions.

    NOTE(review): presumably a higher score means rarer name parts; confirm
    against the code that populates ``TOKEN_KEY`` and ``DIST_KEY``.

    :param name: the name to score.
    :return: the computed score as a float. Earlier revisions only printed
        it and returned ``None``.
    """
    total = float(kv.get(TOTAL_KEY) or 1)
    tokens = name_tokens(name)
    counts = kv.hmget(TOKEN_KEY, tokens)
    counts = [int(c or 1) for c in counts]
    dists = kv.hmget(DIST_KEY, counts)
    dists = [int(d or 0) / total for d in dists]
    score = 1 - sum(dists)
    # TODO: maybe we can normalise this over the number of
    # characters in the string such that it biases towards
    # longer names with rare name parts.
    # Debug trace kept so the function's existing output does not change.
    print(tokens, counts, dists, score)
    # Hand the result back to the caller instead of discarding it.
    return score
def extract_text(self, data, languages=None):
    """Recognize text in ``data`` with the Vision API, using the kv cache.

    The cache key is derived from the SHA-1 digest of the original ``data``.
    On a miss, ``data`` is passed through ``self.ensure_size`` and sent to
    ``self.client.document_text_detection``; the full text annotation is
    cached and returned.

    :param data: raw image bytes.
    :param languages: accepted for interface compatibility; not used here.
    :return: the recognized or cached text as ``str``, or ``None`` when
        ``ensure_size`` returns ``None``.
    """
    key = make_key('ocr', sha1(data).hexdigest())
    text = kv.get(key)
    if text is not None:
        # A cache hit must have the same type as a fresh result (str).
        # The store may hand back raw bytes, so decode them; a value that is
        # already text is passed through untouched.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        log.info('Vision API: %s chars cached', len(text))
        return text

    data = self.ensure_size(data)
    if data is None:
        return None

    image = types.Image(content=data)
    res = self.client.document_text_detection(image)
    ann = res.full_text_annotation
    log.info('Vision API: %s chars recognized', len(ann.text))
    kv.set(key, ann.text)
    return ann.text
def load_places():
    """Load place names from ``settings.GEONAMES_DATA`` into the kv store.

    Does nothing when ``PLACE_KEY`` is already set or ``settings.TESTING``
    is true. For each row with a non-empty value in column 8, every distinct
    name from columns 1, 2 and the comma-separated column 3 is normalised
    with ``tag_key``; the lower-cased column 8 value is pushed onto the list
    at ``place_key(name)``. Finally ``PLACE_KEY`` is set to the number of
    pushes. All commands go through one non-transactional pipeline that is
    executed once at the end.

    NOTE(review): the column layout looks like the GeoNames dump format
    (name, asciiname, alternatenames, ..., country code); confirm.
    """
    if kv.get(PLACE_KEY) or settings.TESTING:
        return
    total = 0
    pipe = kv.pipeline(transaction=False)
    log.info("Loading geonames...")
    # newline='' lets the csv module do its own line splitting, as its
    # documentation recommends.
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8',
                 newline='') as fh:
        # The data is plain tab-delimited text without quoting. With the
        # default dialect a stray '"' inside a name would start a "quoted"
        # field and swallow following tabs and lines, so quoting is disabled.
        reader = csv.reader(fh, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip blank or truncated lines instead of raising IndexError.
            if len(row) < 9:
                continue
            country = row[8].lower().strip()
            if not country:
                continue
            names = set(row[3].split(','))
            names.add(row[1])
            names.add(row[2])
            for name in names:
                name = tag_key(name)
                if name is not None:
                    total += 1
                    pipe.lpush(place_key(name), country)
    # The marker is written last, in the same batch as the pushes.
    pipe.set(PLACE_KEY, total)
    pipe.execute()
    log.info("Loaded %s geonames.", total)
def load_places():
    """Load place names from ``settings.GEONAMES_DATA`` into the kv store.

    Does nothing when ``PLACE_KEY`` is already set or ``settings.TESTING``
    is true. For each row with a non-empty value in column 8, every distinct
    name from columns 1, 2 and the comma-separated column 3 is normalised
    with ``normalize_label``; the lower-cased column 8 value is pushed onto
    the list at ``place_key(name)``. Finally ``PLACE_KEY`` is set to the
    number of pushes. All commands go through one non-transactional pipeline
    that is executed once at the end.

    NOTE(review): the column layout looks like the GeoNames dump format
    (name, asciiname, alternatenames, ..., country code); confirm.
    """
    if kv.get(PLACE_KEY) or settings.TESTING:
        return
    total = 0
    pipe = kv.pipeline(transaction=False)
    log.debug("Loading geonames...")
    # newline='' lets the csv module do its own line splitting, as its
    # documentation recommends.
    with io.open(settings.GEONAMES_DATA, 'r', encoding='utf-8',
                 newline='') as fh:
        # The data is plain tab-delimited text without quoting. With the
        # default dialect a stray '"' inside a name would start a "quoted"
        # field and swallow following tabs and lines, so quoting is disabled.
        reader = csv.reader(fh, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in reader:
            # Skip blank or truncated lines instead of raising IndexError.
            if len(row) < 9:
                continue
            country = row[8].lower().strip()
            if not country:
                continue
            names = set(row[3].split(','))
            names.add(row[1])
            names.add(row[2])
            for name in names:
                name = normalize_label(name)
                if name is not None:
                    total += 1
                    pipe.lpush(place_key(name), country)
    # The marker is written last, in the same batch as the pushes.
    pipe.set(PLACE_KEY, total)
    pipe.execute()
    log.debug("Loaded %s geonames.", total)
def extract_text(self, data, languages=None):
    """Recognize text in ``data`` with the Vision API, using the kv cache.

    Data whose length is not strictly between ``MIN_SIZE`` and ``MAX_SIZE``
    is rejected. If the cache key exists, the stored value is returned
    (decoded as UTF-8) without calling the API. Otherwise the image is sent
    to ``self.client.document_text_detection`` and the full text annotation
    is cached and returned.

    :param data: raw image bytes.
    :param languages: accepted for interface compatibility; not used here.
    :return: the recognized or cached text, or ``None`` when the size is
        out of range.
    """
    size = len(data)
    if size <= MIN_SIZE or size >= MAX_SIZE:
        log.info('OCR: file size out of range (%d)', size)
        return None

    cache_key = make_key('ocr', sha1(data).hexdigest())
    if kv.exists(cache_key):
        cached = kv.get(cache_key)
        if cached is None:
            # Key reported as existing but no value came back.
            return cached
        cached = cached.decode('utf-8')
        log.info('Vision API: %s chars cached', len(cached))
        return cached

    request = types.Image(content=data)
    response = self.client.document_text_detection(request)
    annotation = response.full_text_annotation
    log.info('Vision API: %s chars recognized', len(annotation.text))
    kv.set(cache_key, annotation.text)
    return annotation.text