def get_languages(codes):
    """Convert language codes to a '+'-joined string of supported ISO3 codes.

    The given codes are expanded with ``list_to_alpha3``; only the results
    that appear in ``LANGUAGES`` are kept. The surviving codes are sorted
    before being joined, so the output is stable for a given input.
    """
    alpha3_codes = list_to_alpha3(codes)
    known = [alpha3 for alpha3 in alpha3_codes if alpha3 in LANGUAGES]
    known.sort()
    return '+'.join(known)
def get_languages(self, languages):
    """Build the '+'-joined language string for the requested languages.

    ``languages`` is expanded with ``list_to_alpha3`` and every resulting
    code found in ``self.supported_languages`` is selected. ``'eng'`` is
    always part of the result. The codes are sorted before joining.

    :param languages: language code(s) requested by the caller; ``None``
        or an empty value yields just ``'eng'``.
    :return: sorted codes joined with ``'+'``, e.g. ``'deu+eng'``.
    """
    if not hasattr(self, 'supported_languages'):
        # Imported lazily and cached on the instance so the lookup of
        # installed languages happens at most once per object.
        from tesserocr import get_languages
        _, self.supported_languages = get_languages()
    codes = {'eng'}
    # Expand the caller's languages, not the default set: iterating over
    # ``codes`` here would silently ignore the ``languages`` argument.
    for lang in list_to_alpha3(languages or []):
        if lang in self.supported_languages:
            codes.add(lang)
    return '+'.join(sorted(codes))
def get_models(entity):
    """Yield the loaded NER model for each model configured for the entity.

    The entity's language values are expanded with ``list_to_alpha3`` and
    looked up in ``settings.NER_MODELS``; languages without a configured
    model are skipped.
    """
    language_values = entity.get_type_values(registry.language)
    configured = (
        settings.NER_MODELS.get(code)
        for code in list_to_alpha3(language_values)
    )
    # Collect into a set so languages that map to the same model only
    # cause it to be loaded once.
    distinct = {name for name in configured if name is not None}
    for name in distinct:
        yield _load_model(name)
def extract_text(self, data, languages=None):
    """Extract text from a binary string of image data.

    :param data: raw bytes of an image, opened with ``Image.open``.
    :param languages: language code(s) to OCR with. They are expanded with
        ``list_to_alpha3`` and filtered against
        ``self.supported_languages``; ``'eng'`` is always included.
    :return: the text reported by the API, or ``None`` when the image
        fails ``image_size_ok`` or when OCR raises (the error is logged).
    """
    codes = {'eng'}
    # Expand the caller's languages, not the default set: iterating over
    # ``codes`` here would silently ignore the ``languages`` argument.
    for lang in list_to_alpha3(languages or []):
        if lang in self.supported_languages:
            codes.add(lang)
    languages = '+'.join(sorted(codes))
    api = self.get_api(languages)
    # Re-initialise only if the API is not already set up for exactly
    # this language combination.
    if languages != api.GetInitLanguagesAsString():
        api.Init(lang=languages)
    try:
        # TODO: play with contrast and sharpening the images.
        image = Image.open(BytesIO(data))
        if not self.image_size_ok(image):
            return
        api.SetImage(image)
        return api.GetUTF8Text()
    except Exception as ex:
        # Best effort: a failed OCR run is logged and yields None.
        log.warning("Failed to OCR: %s", ex)
    finally:
        # Always clear the API's image state, whatever the outcome.
        api.Clear()
def test_list(self):
    """Check synonym expansion and list input of ``list_to_alpha3``."""
    # By default 'bs' also yields 'srp'.
    expanded = list_to_alpha3('bs')
    assert 'srp' in expanded

    # With synonyms disabled, 'srp' must not be produced.
    strict = list_to_alpha3('bs', synonyms=False)
    assert 'srp' not in strict

    # A list of codes is accepted; 'de' maps to 'deu'.
    combined = list_to_alpha3(['bs', 'de'])
    assert 'deu' in combined
def test_list(self):
    """Check synonym expansion and list input of ``list_to_alpha3``."""
    # By default "bs" also yields "srp".
    expanded = list_to_alpha3("bs")
    assert "srp" in expanded

    # With synonyms disabled, "srp" must not be produced.
    strict = list_to_alpha3("bs", synonyms=False)
    assert "srp" not in strict

    # A list of codes is accepted; "de" maps to "deu".
    combined = list_to_alpha3(["bs", "de"])
    assert "deu" in combined