def format_address(self, country, components,
                   minimal_only=True, tag_components=True,
                   replace_aliases=True, template_replacements=False):
    """Render ``components`` with the address template configured for ``country``.

    Args:
        country: Country code; it is upper-cased before being looked up in
            ``self.config``.
        components: Mapping of component label -> string value.
        minimal_only: When true, return ``None`` if
            ``self.minimal_components(components)`` is falsy.
        tag_components: When true, the template separators are tagged via
            ``self.tag_template_separators`` and every component value is
            rewritten as space-joined ``token/label`` pairs before rendering.
        replace_aliases: When true, call ``self.replace_aliases(components)``
            first.  Its return value is ignored, so it presumably edits
            ``components`` in place -- confirm against that method.
        template_replacements: When true, call
            ``self.apply_replacements(template, components)`` before rendering.

    Returns:
        The text produced by ``self.render_template`` after
        ``self.post_replacements``, or ``None`` when no template is
        configured for the country or the minimal-components check fails.
    """
    template = self.config.get(country.upper())
    if not template:
        return None

    template_text = template['address_template']

    if replace_aliases:
        self.replace_aliases(components)

    if minimal_only and not self.minimal_components(components):
        return None

    if template_replacements:
        self.apply_replacements(template, components)

    if tag_components:
        template_text = self.tag_template_separators(template_text)

        # Build a new dict rather than editing the caller's mapping.
        # ``.items()`` (not the Python-2-only ``.iteritems()``) keeps this
        # working on both Python 2 and Python 3.
        tagged = {}
        for label, value in components.items():
            tag = label.replace(' ', '_')
            # tokenize() yields 2-tuples; only the first element (the token
            # text) is used here.
            tagged[label] = u' '.join(
                u'{}/{}'.format(token.replace(' ', ''), tag)
                for token, _ in tokenize(value)
            )
        components = tagged

    text = self.render_template(template_text, components,
                                tagged=tag_components)
    return self.post_replacements(template, text)
def format_address(self, country, components,
                   minimal_only=True, tag_components=True,
                   replace_aliases=True, template_replacements=False):
    """Render ``components`` with the address template configured for ``country``.

    Args:
        country: Country code; it is upper-cased before being looked up in
            ``self.config``.
        components: Mapping of component label -> string value.
        minimal_only: When true, return ``None`` if
            ``self.minimal_components(components)`` is falsy.
        tag_components: When true, the template separators are tagged via
            ``self.tag_template_separators`` and every component value is
            rewritten as space-joined ``token/label`` pairs before rendering.
        replace_aliases: When true, call ``self.replace_aliases(components)``
            first.  Its return value is ignored, so it presumably edits
            ``components`` in place -- confirm against that method.
        template_replacements: When true, call
            ``self.apply_replacements(template, components)`` before rendering.

    Returns:
        The text produced by ``self.render_template`` after
        ``self.post_replacements``, or ``None`` when no template is
        configured for the country or the minimal-components check fails.
    """
    template = self.config.get(country.upper())
    if not template:
        return None

    template_text = template['address_template']

    if replace_aliases:
        self.replace_aliases(components)

    if minimal_only and not self.minimal_components(components):
        return None

    if template_replacements:
        self.apply_replacements(template, components)

    if tag_components:
        template_text = self.tag_template_separators(template_text)

        # Build a new dict rather than editing the caller's mapping.
        # ``.items()`` (not the Python-2-only ``.iteritems()``) keeps this
        # working on both Python 2 and Python 3.
        tagged = {}
        for label, value in components.items():
            tag = label.replace(' ', '_')
            # tokenize() yields 2-tuples; only the first element (the token
            # text) is used here.
            tagged[label] = u' '.join(
                u'{}/{}'.format(token.replace(' ', ''), tag)
                for token, _ in tokenize(value)
            )
        components = tagged

    text = self.render_template(template_text, components,
                                tagged=tag_components)
    return self.post_replacements(template, text)
def disambiguate_language(text, languages):
    """Choose one language for ``text`` from the candidate ``languages``.

    Args:
        text: The string to classify.
        languages: Iterable of ``(language, is_default)`` pairs.  It is
            passed to ``OrderedDict`` and also iterated again while scanning
            scripts, so it should be re-iterable (e.g. a list).

    Returns:
        A language from ``languages`` when the street-type phrases found in
        the text point to exactly one, ``AMBIGUOUS_LANGUAGE`` when the
        evidence conflicts, or ``UNKNOWN_LANGUAGE`` when nothing matched.
    """
    valid_languages = OrderedDict(languages)

    # For each non-Latin script run in the text, record which candidate
    # languages are listed for that script in ``script_languages``.
    script_langs = {}
    read_len = 0
    while read_len < len(text):
        script, script_len, _ = get_string_script(text[read_len:])
        if script != LATIN_SCRIPT:
            script_langs[script] = {
                code for code, _ in languages
                if code in script_languages.get(script, [])
            }
        read_len += script_len

    num_defaults = sum(1 for is_default in valid_languages.values()
                       if is_default)

    # Hyphens become spaces and the text is lower-cased before tokenizing;
    # trailing dots are stripped from each token.  Tuples are reordered to
    # (class, text) for the gazetteer filter.
    normalized = safe_decode(text).replace(u'-', u' ').lower()
    tokens = [(token_class, token.rstrip('.'))
              for token, token_class in tokenize(normalized)]

    current_lang = None
    possible_lang = None
    seen_languages = set()

    for token_class, phrase, data in street_types_gazetteer.filter(tokens):
        if token_class is not token_types.PHRASE:
            continue

        valid = []
        # Each entry splits on '|' into (language, canonical, stopword);
        # the last two are parsed as ints below.
        entries = [entry.split('|') for entry in data]
        potentials = [code for code, _, _ in entries
                      if code in valid_languages]

        for lang, canonical, stopword in entries:
            canonical = int(canonical)
            stopword = int(stopword)
            # Skip non-candidate languages, and stopwords that could belong
            # to more than one candidate.
            if lang not in valid_languages or (stopword and
                                               len(potentials) > 1):
                continue

            is_default = valid_languages[lang]
            lang_valid = (is_default or not seen_languages or
                          lang in seen_languages)

            if lang_valid and ((canonical and not stopword) or
                               (is_default and len(potentials) == 1)):
                valid.append(lang)
            elif (is_default and num_defaults > 1 and
                  current_lang is not None and current_lang != lang):
                return AMBIGUOUS_LANGUAGE
            elif (stopword and canonical and not is_default and
                  lang in seen_languages):
                valid.append(lang)
            elif (not seen_languages and len(potentials) == 1 and
                  len(phrase[0][1]) > 1):
                # Weak evidence only: remember it as a fallback, and reset
                # to None when two weak hints disagree.
                # NOTE(review): phrase[0][1] looks like the text of the
                # phrase's first token -- confirm against the gazetteer.
                if possible_lang is None or possible_lang == lang:
                    possible_lang = lang
                else:
                    possible_lang = None

        # This phrase shares no language with earlier phrases, and either it
        # has no default language or an earlier phrase did.
        if (seen_languages and valid and
                not any(code in seen_languages for code in valid) and
                (not any(valid_languages.get(code) for code in valid) or
                 any(valid_languages.get(code) for code in seen_languages))):
            return AMBIGUOUS_LANGUAGE

        if len(valid) == 1:
            current_lang = valid[0]
        else:
            valid_default = [code for code in valid
                             if valid_languages.get(code)]
            if len(valid_default) == 1:
                if (current_lang is not None and
                        valid_default[0] != current_lang):
                    return AMBIGUOUS_LANGUAGE
                current_lang = valid_default[0]

        # The current choice must be allowed by every non-Latin script seen
        # in the text.  While current_lang is still None this also fires
        # whenever any non-Latin script was recorded.
        if any(current_lang not in langs for langs in script_langs.values()):
            return AMBIGUOUS_LANGUAGE

        seen_languages.update(valid)

    if current_lang is not None:
        return current_lang

    if possible_lang is not None:
        if any(possible_lang not in langs for langs in script_langs.values()):
            return AMBIGUOUS_LANGUAGE
        return possible_lang

    return UNKNOWN_LANGUAGE