def gen_phrases(self, s, canonical_only=False, languages=None):
    """Yield the phrases found in ``s`` as space-joined strings.

    ``s`` is tokenized, tokens whose class is in
    ``token_types.WORD_TOKEN_TYPES`` are lower-cased, and the result is
    passed to ``self.filter``. Only results classed ``token_types.PHRASE``
    are considered.

    :param s: text to search.
    :param canonical_only: when true, a phrase is only emitted for data
        entries whose ``is_canonical`` field parses to a non-zero integer.
    :param languages: one language code or an iterable of codes used to
        restrict matches. Entries whose language is ``'all'`` always pass.
        Any falsy value (``None``, ``''``, ``[]``) disables the restriction.
    :returns: a generator of phrase strings. When either restriction is
        active, the phrase is yielded once per matching data entry and may
        therefore repeat.
    """
    tokens = tokenize(s)
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c)
                   for t, c in tokens]

    # Normalise ``languages`` to None (no restriction) or a container that
    # supports ``in``. None must never reach set(): set(None) raises
    # TypeError. A lone string is wrapped explicitly because on Python 3
    # strings define __iter__/__contains__, which would turn the membership
    # test below into a substring test.
    if not languages:
        languages = None
    elif (isinstance(languages, six.string_types + (six.binary_type,)) or
            not hasattr(languages, '__iter__')):
        languages = [languages]
    elif not hasattr(languages, '__contains__'):
        # One-shot iterables (e.g. generators) are materialised once.
        languages = set(languages)

    for t, c, length, data in self.filter(norm_tokens):
        if c != token_types.PHRASE:
            continue

        if not canonical_only and languages is None:
            # No restriction at all: emit the phrase once per match.
            yield six.u(' ').join([t_i for t_i, c_i in t])
            continue

        phrase = None
        for d in data:
            # Each data entry is a '|'-delimited record with four fields.
            lang, dictionary, is_canonical, canonical = d.split(six.b('|'))

            canonical_ok = bool(int(is_canonical)) or not canonical_only
            language_ok = (languages is None or lang in languages or
                           lang == 'all')
            if canonical_ok and language_ok:
                # Build the joined phrase lazily and reuse it afterwards.
                if phrase is None:
                    phrase = six.u(' ').join([t_i for t_i, c_i in t])
                yield phrase
def validate_chinese_house_number(cls, house_number):
    """Check a house number that may contain Chinese number markers.

    Returns False for an empty/falsy ``house_number``. Returns True when
    every token is either of a class in ``token_types.NUMERIC_TOKEN_TYPES``
    or one of the markers 号, 栋, 附. Otherwise the decision is delegated to
    ``cls.validate_house_number``.
    """
    if not house_number:
        return False

    allowed_markers = (u'号', u'栋', u'附')
    for token, token_class in tokenize(house_number):
        is_numeric_class = token_class in token_types.NUMERIC_TOKEN_TYPES
        if not is_numeric_class and token not in allowed_markers:
            # First token that is neither numeric nor a known marker:
            # fall back to the generic validator.
            return cls.validate_house_number(house_number)

    return True
def add_ngrams(self, s, n=2):
    """Count the n-grams of ``s`` whose leading context is already known.

    ``s`` is tokenized and split into runs of consecutive tokens whose class
    is in ``self.WORD_TOKEN_TYPES``; any other token ends the current run.
    For every gram returned by ``self.ngrams(run, n=n)``, the lower-cased
    tokens before the last one form the context. If that context is a key of
    ``self.vocab``, ``self.frequencies[(context, last_token)]`` is
    incremented.
    """
    word_runs = []
    current_run = []
    for token, token_class in tokenize(s):
        if token_class in self.WORD_TOKEN_TYPES:
            current_run.append((token, token_class))
            continue
        # A non-word token closes the run collected so far (if any).
        if current_run:
            word_runs.append(current_run)
            current_run = []
    if current_run:
        word_runs.append(current_run)

    for run in word_runs:
        for gram in self.ngrams(run, n=n):
            context = tuple((tok.lower(), tok_class)
                            for tok, tok_class in gram[:-1])
            if context not in self.vocab:
                continue
            last_token, last_class = gram[-1]
            self.frequencies[(context, (last_token.lower(), last_class))] += 1
def gen_phrases(self, s, canonical_only=False, languages=None):
    """Yield the phrases found in ``s`` as space-joined strings.

    ``s`` is tokenized, tokens whose class is in
    ``token_types.WORD_TOKEN_TYPES`` are lower-cased, and the result is
    passed to ``self.filter``. Only results classed ``token_types.PHRASE``
    are considered.

    :param s: text to search.
    :param canonical_only: when true, a phrase is only emitted for data
        entries whose ``is_canonical`` field parses to a non-zero integer.
    :param languages: one language code or an iterable of codes used to
        restrict matches. Entries whose language is ``'all'`` always pass.
        Any falsy value (``None``, ``''``, ``[]``) disables the restriction.
    :returns: a generator of phrase strings. When either restriction is
        active, the phrase is yielded once per matching data entry and may
        therefore repeat.
    """
    tokens = tokenize(s)
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c)
                   for t, c in tokens]

    # Normalise ``languages`` to None (no restriction) or a container that
    # supports ``in``. None must never reach set(): set(None) raises
    # TypeError. A lone string is wrapped explicitly because on Python 3
    # strings define __iter__/__contains__, which would turn the membership
    # test below into a substring test.
    if not languages:
        languages = None
    elif (isinstance(languages, six.string_types + (six.binary_type,)) or
            not hasattr(languages, '__iter__')):
        languages = [languages]
    elif not hasattr(languages, '__contains__'):
        # One-shot iterables (e.g. generators) are materialised once.
        languages = set(languages)

    for t, c, length, data in self.filter(norm_tokens):
        if c != token_types.PHRASE:
            continue

        if not canonical_only and languages is None:
            # No restriction at all: emit the phrase once per match.
            yield six.u(' ').join([t_i for t_i, c_i in t])
            continue

        phrase = None
        for d in data:
            # Each data entry is a '|'-delimited record with four fields.
            lang, dictionary, is_canonical, canonical = d.split(six.b('|'))

            canonical_ok = bool(int(is_canonical)) or not canonical_only
            language_ok = (languages is None or lang in languages or
                           lang == 'all')
            if canonical_ok and language_ok:
                # Build the joined phrase lazily and reuse it afterwards.
                if phrase is None:
                    phrase = six.u(' ').join([t_i for t_i, c_i in t])
                yield phrase
def format_address(self, country, components, minimal_only=True, tag_components=True, replace_aliases=True, template_replacements=False):
    """Render ``components`` with the address template stored for ``country``.

    :param country: country code; upper-cased before the lookup in
        ``self.config``.
    :param components: mapping of component name to value. It is handed to
        ``self.replace_aliases`` / ``self.apply_replacements`` as-is (their
        return values are ignored, so presumably they modify it in place --
        verify against those helpers).
    :param minimal_only: when true, return None unless
        ``self.minimal_components(components)`` is truthy.
    :param tag_components: when true, the template text goes through
        ``self.tag_template_separators`` and every token of every value is
        rewritten as ``token/component_name`` (spaces removed from the token,
        spaces in the name replaced by underscores).
    :param replace_aliases: when true, call ``self.replace_aliases`` first.
    :param template_replacements: when true, call
        ``self.apply_replacements(template, components)`` before rendering.
    :returns: the rendered text after ``self.post_replacements``, or None
        when no template exists for the country or the minimal-component
        check fails.
    """
    template = self.config.get(country.upper())
    if not template:
        return None

    template_text = template['address_template']

    if replace_aliases:
        self.replace_aliases(components)

    if minimal_only and not self.minimal_components(components):
        return None

    if template_replacements:
        self.apply_replacements(template, components)

    if tag_components:
        template_text = self.tag_template_separators(template_text)
        # ``items()`` exists on both Python 2 and 3; the former
        # ``iteritems()`` raises AttributeError on Python 3 dicts.
        components = {
            k: u' '.join([
                u'{}/{}'.format(t.replace(' ', ''), k.replace(' ', '_'))
                for t, c in tokenize(v)
            ])
            for k, v in components.items()
        }

    text = self.render_template(template_text, components, tagged=tag_components)
    text = self.post_replacements(template, text)
    return text
def tagged_tokens(self, name, label):
    """Return the tokens of ``name`` as space-separated ``token/tag`` pairs.

    Spaces inside a token are removed. A token equal to ``','`` is tagged
    with ``self.separator_tag``; every other token is tagged with ``label``.
    """
    pairs = []
    for token, _ in tokenize(name):
        if token != ',':
            tag = label
        else:
            tag = self.separator_tag
        pairs.append(six.u('{}/{}').format(token.replace(' ', ''), tag))
    return six.u(' ').join(pairs)
def is_numeric_strict(s):
    """Return True when every token of ``s`` is classed ``token_types.NUMERIC``.

    The check compares the number of NUMERIC tokens with the total number of
    tokens, so an input that produces no tokens also returns True.
    """
    tokens = tokenize(s)
    numeric_classes = [token_class for _, token_class in tokens
                       if token_class == token_types.NUMERIC]
    return len(numeric_classes) == len(tokens)
def add_tokens(self, s):
    """Add the word tokens of ``s`` to the vocabulary counts.

    For each token whose class is in ``self.WORD_TOKEN_TYPES``, the 1-tuple
    key ``((lowercased_token, token_class),)`` is incremented in
    ``self.vocab`` and ``self.train_words`` grows by one.
    """
    for token, token_class in tokenize(s):
        if token_class not in self.WORD_TOKEN_TYPES:
            continue
        unigram = ((token.lower(), token_class),)
        self.vocab[unigram] += 1
        self.train_words += 1
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
    """Build a phrase such as "Floor 2", "2/F" or "2nd Floor" for ``num``.

    A phrase and its properties are drawn with ``weighted_choice`` from
    ``address_config.alternative_probabilities`` for ``key`` (``'<key>.alpha'``
    is tried first when ``is_alpha`` is set). The way the number is expressed
    is then drawn from the options configured on that phrase:

    1. ``standalone``: the phrase on its own, e.g. basement for floor "-1"
    2. ``null``: the number on its own
    3. ``numeric``: phrase plus number, e.g. "Floor 2"
    4. ``numeric_affix``: an affix attached to the number, e.g. 2/F
    5. ``ordinal``: an ordinal expression, e.g. "2nd Floor"

    :param key: configuration key of the component.
    :param num: the number/identifier; may be None, in which case only the
        phrase can be returned.
    :param language: language passed to the config and rewriting helpers.
    :param country: optional country passed to the config helpers.
    :param dictionaries: passed through to ``alternative_probabilities``.
    :param strict_numeric: when true and ``num`` is neither an int nor a
        float and contains an alphabetic character, ``num`` is returned
        decoded but otherwise untouched.
    :param is_alpha: look up ``'<key>.alpha'`` before ``key``.
    :returns: the generated text, or None when nothing can be generated (no
        configured phrase for a None ``num``, no usable number type for an
        alphanumeric ``num``, or an integer outside the configured
        ``number_min_abs_value`` / ``number_max_abs_value`` bounds).
    """
    has_alpha = False
    has_numeric = True
    is_integer = False
    is_none = False

    if num is not None:
        try:
            num_int = int(num)
            is_integer = True
        except ValueError:
            try:
                # Only used to detect "is a float"; the value is not needed.
                float(num)
            except ValueError:
                # Neither int nor float: inspect the tokens.
                tokens = tokenize(safe_decode(num))
                has_numeric = False
                for t, c in tokens:
                    if c == token_types.NUMERIC:
                        has_numeric = True
                    if any((ch.isalpha() for ch in t)):
                        has_alpha = True

                if strict_numeric and has_alpha:
                    return safe_decode(num)
    else:
        is_none = True

    values, probs = None, None

    if is_alpha:
        values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

    # Pick a phrase given the probability distribution from the config
    if values is None:
        values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

    if not values:
        return safe_decode(num) if not is_none else None

    phrase, phrase_props = weighted_choice(values, probs)

    values = []
    probs = []

    # Dictionaries are lowercased, so title case here
    if phrase_props.get('title_case', True):
        phrase = phrase.title()

    # Collect the candidate number types. A type with an explicit
    # "<type>_probability" is added with that weight; the first type that is
    # configured without a probability gets weight 1.0 and ends the scan.
    have_standalone = False
    have_null = False
    for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
        # Separate local name so the ``key`` parameter is not shadowed.
        prob_key = '{}_probability'.format(num_type)
        prob = phrase_props.get(prob_key)
        if prob is not None:
            if num_type == 'standalone':
                have_standalone = True
            elif num_type == 'null':
                have_null = True
            values.append(num_type)
            probs.append(prob)
        elif num_type in phrase_props:
            values.append(num_type)
            probs.append(1.0)
            break

    if not probs or is_none:
        return phrase

    # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
    if has_alpha:
        kept = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
        if not kept:
            # Nothing usable is left. Unpacking zip(*[]) would raise
            # ValueError, so bail out the same way as the zero-total case.
            return None
        values, probs = zip(*kept)
        total = float(sum(probs))
        if isclose(total, 0.0):
            return None

        probs = [p / total for p in probs]

    probs = cdf(probs)

    if len(values) < 2:
        if have_standalone:
            num_type = 'standalone'
        elif have_null:
            num_type = 'null'
        elif 'numeric' in phrase_props:
            num_type = 'numeric'
        else:
            # The only candidate is something other than 'numeric' and no
            # 'numeric' entry exists; use the candidate instead of failing
            # with KeyError on phrase_props['numeric'] below.
            num_type = values[0]
    else:
        num_type = weighted_choice(values, probs)

    if num_type == 'standalone':
        return phrase
    elif num_type == 'null':
        return safe_decode(num)

    props = phrase_props[num_type]

    if is_integer:
        num_int = int(num)
        if phrase_props.get('number_abs_value', False):
            num_int = abs(num_int)
            num = num_int

        if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
            return None

        if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
            return None

        if phrase_props.get('number_subtract_abs_value'):
            num_int -= phrase_props['number_subtract_abs_value']
            num = num_int

    num = safe_decode(num)
    digits_props = props.get('digits')
    if digits_props:
        # Inherit the gender and category e.g. for ordinals.
        # NOTE(review): this writes into the dict stored under
        # props['digits'] rather than a copy.
        for k in ('gender', 'category'):
            if k in props:
                digits_props[k] = props[k]
        num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

    # Do we add the numeric phrase e.g. Floor No 1
    add_number_phrase = props.get('add_number_phrase', False)
    if add_number_phrase and random.random() < props['add_number_phrase_probability']:
        num = Number.phrase(num, language, country=country)

    whitespace_default = True

    if num_type == 'numeric_affix':
        phrase = props['affix']
        if props.get('upper_case', True):
            phrase = phrase.upper()
        if 'zero_pad' in props and num.isdigit():
            num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
        whitespace_default = False
    elif num_type == 'ordinal' and safe_decode(num).isdigit():
        ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

        if ordinal_expression is not None:
            num = ordinal_expression

    # Sometimes return the (possibly rewritten) number without any phrase.
    if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
        if random.random() < props['null_phrase_probability']:
            return num

    direction = props['direction']
    whitespace = props.get('whitespace', whitespace_default)

    whitespace_probability = props.get('whitespace_probability')
    if whitespace_probability is not None:
        whitespace = random.random() < whitespace_probability

    # Occasionally switch up if direction_probability is specified
    if random.random() > props.get('direction_probability', 1.0):
        if direction == 'left':
            direction = 'right'
        elif direction == 'right':
            direction = 'left'

    whitespace_phrase = six.u(' ') if whitespace else six.u('')

    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
    # Need to specify a direction, otherwise return naked number
    else:
        return safe_decode(num)
def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
    """Build a phrase such as "Floor 2", "2/F" or "2nd Floor" for ``num``.

    A phrase and its properties are drawn with ``weighted_choice`` from
    ``address_config.alternative_probabilities`` for ``key`` (``'<key>.alpha'``
    is tried first when ``is_alpha`` is set). The way the number is expressed
    is then drawn from the options configured on that phrase:

    1. ``standalone``: the phrase on its own, e.g. basement for floor "-1"
    2. ``null``: the number on its own
    3. ``numeric``: phrase plus number, e.g. "Floor 2"
    4. ``numeric_affix``: an affix attached to the number, e.g. 2/F
    5. ``ordinal``: an ordinal expression, e.g. "2nd Floor"

    :param key: configuration key of the component.
    :param num: the number/identifier; may be None, in which case only the
        phrase can be returned.
    :param language: language passed to the config and rewriting helpers.
    :param country: optional country passed to the config helpers.
    :param dictionaries: passed through to ``alternative_probabilities``.
    :param strict_numeric: when true and ``num`` is neither an int nor a
        float and contains an alphabetic character, ``num`` is returned
        decoded but otherwise untouched.
    :param is_alpha: look up ``'<key>.alpha'`` before ``key``.
    :returns: the generated text, or None when nothing can be generated (no
        configured phrase for a None ``num``, no usable number type for an
        alphanumeric ``num``, or an integer outside the configured
        ``number_min_abs_value`` / ``number_max_abs_value`` bounds).
    """
    has_alpha = False
    has_numeric = True
    is_integer = False
    is_none = False

    if num is not None:
        try:
            num_int = int(num)
            is_integer = True
        except ValueError:
            try:
                # Only used to detect "is a float"; the value is not needed.
                float(num)
            except ValueError:
                # Neither int nor float: inspect the tokens.
                tokens = tokenize(safe_decode(num))
                has_numeric = False
                for t, c in tokens:
                    if c == token_types.NUMERIC:
                        has_numeric = True
                    if any((ch.isalpha() for ch in t)):
                        has_alpha = True

                if strict_numeric and has_alpha:
                    return safe_decode(num)
    else:
        is_none = True

    values, probs = None, None

    if is_alpha:
        values, probs = address_config.alternative_probabilities(
            '{}.alpha'.format(key), language,
            dictionaries=dictionaries, country=country)

    # Pick a phrase given the probability distribution from the config
    if values is None:
        values, probs = address_config.alternative_probabilities(
            key, language, dictionaries=dictionaries, country=country)

    if not values:
        return safe_decode(num) if not is_none else None

    phrase, phrase_props = weighted_choice(values, probs)

    values = []
    probs = []

    # Dictionaries are lowercased, so title case here
    if phrase_props.get('title_case', True):
        phrase = phrase.title()

    # Collect the candidate number types. A type with an explicit
    # "<type>_probability" is added with that weight; the first type that is
    # configured without a probability gets weight 1.0 and ends the scan.
    have_standalone = False
    have_null = False
    for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
        # Separate local name so the ``key`` parameter is not shadowed.
        prob_key = '{}_probability'.format(num_type)
        prob = phrase_props.get(prob_key)
        if prob is not None:
            if num_type == 'standalone':
                have_standalone = True
            elif num_type == 'null':
                have_null = True
            values.append(num_type)
            probs.append(prob)
        elif num_type in phrase_props:
            values.append(num_type)
            probs.append(1.0)
            break

    if not probs or is_none:
        return phrase

    # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
    if has_alpha:
        kept = [(v, p) for v, p in zip(values, probs)
                if v in ('numeric', 'null', 'standalone')]
        if not kept:
            # Nothing usable is left. Unpacking zip(*[]) would raise
            # ValueError, so bail out the same way as the zero-total case.
            return None
        values, probs = zip(*kept)
        total = float(sum(probs))
        if isclose(total, 0.0):
            return None

        probs = [p / total for p in probs]

    probs = cdf(probs)

    if len(values) < 2:
        if have_standalone:
            num_type = 'standalone'
        elif have_null:
            num_type = 'null'
        elif 'numeric' in phrase_props:
            num_type = 'numeric'
        else:
            # The only candidate is something other than 'numeric' and no
            # 'numeric' entry exists; use the candidate instead of failing
            # with KeyError on phrase_props['numeric'] below.
            num_type = values[0]
    else:
        num_type = weighted_choice(values, probs)

    if num_type == 'standalone':
        return phrase
    elif num_type == 'null':
        return safe_decode(num)

    props = phrase_props[num_type]

    if is_integer:
        num_int = int(num)
        if phrase_props.get('number_abs_value', False):
            num_int = abs(num_int)
            num = num_int

        if ('number_min_abs_value' in phrase_props and
                num_int < phrase_props['number_min_abs_value']):
            return None

        if ('number_max_abs_value' in phrase_props and
                num_int > phrase_props['number_max_abs_value']):
            return None

        if phrase_props.get('number_subtract_abs_value'):
            num_int -= phrase_props['number_subtract_abs_value']
            num = num_int

    num = safe_decode(num)
    digits_props = props.get('digits')
    if digits_props:
        # Inherit the gender and category e.g. for ordinals.
        # NOTE(review): this writes into the dict stored under
        # props['digits'] rather than a copy.
        for k in ('gender', 'category'):
            if k in props:
                digits_props[k] = props[k]
        num = Digits.rewrite(
            num, language, digits_props,
            num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

    # Do we add the numeric phrase e.g. Floor No 1
    add_number_phrase = props.get('add_number_phrase', False)
    if add_number_phrase and random.random() < props['add_number_phrase_probability']:
        num = Number.phrase(num, language, country=country)

    whitespace_default = True

    if num_type == 'numeric_affix':
        phrase = props['affix']
        if props.get('upper_case', True):
            phrase = phrase.upper()
        if 'zero_pad' in props and num.isdigit():
            num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
        whitespace_default = False
    elif num_type == 'ordinal' and safe_decode(num).isdigit():
        ordinal_expression = ordinal_expressions.suffixed_number(
            num, language, gender=props.get('gender', None))

        if ordinal_expression is not None:
            num = ordinal_expression

    # Sometimes return the (possibly rewritten) number without any phrase.
    if 'null_phrase_probability' in props and (
            num_type == 'ordinal' or
            (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
        if random.random() < props['null_phrase_probability']:
            return num

    direction = props['direction']
    whitespace = props.get('whitespace', whitespace_default)

    whitespace_probability = props.get('whitespace_probability')
    if whitespace_probability is not None:
        whitespace = random.random() < whitespace_probability

    # Occasionally switch up if direction_probability is specified
    if random.random() > props.get('direction_probability', 1.0):
        if direction == 'left':
            direction = 'right'
        elif direction == 'right':
            direction = 'left'

    whitespace_phrase = six.u(' ') if whitespace else six.u('')

    # Phrase goes to the left of the number
    if direction == 'left':
        return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
    # Phrase goes to the right of the number
    elif direction == 'right':
        return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
    # Need to specify a direction, otherwise return naked number
    else:
        return safe_decode(num)
def is_numeric_strict(s):
    """Tell whether ``s`` consists solely of ``token_types.NUMERIC`` tokens.

    Counts the NUMERIC tokens and compares the count with the total number
    of tokens; an input with no tokens therefore returns True.
    """
    tokens = tokenize(s)
    numeric_count = 0
    for _, token_class in tokens:
        if token_class == token_types.NUMERIC:
            numeric_count += 1
    return numeric_count == len(tokens)
def is_numeric(s):
    """Tell whether every token of ``s`` has a numeric token class.

    A token counts when its class is in ``token_types.NUMERIC_TOKEN_TYPES``.
    The count is compared with the total number of tokens, so an input with
    no tokens returns True.
    """
    tokens = tokenize(s)
    matching = [token_class for _, token_class in tokens
                if token_class in token_types.NUMERIC_TOKEN_TYPES]
    return len(matching) == len(tokens)
def tagged_tokens(self, name, label):
    """Render ``name`` as ``token/tag`` pairs joined by single spaces.

    Every token has its spaces stripped out. Comma tokens receive
    ``self.separator_tag`` as their tag, all others receive ``label``.
    """
    template = six.u('{}/{}')
    tagged = []
    for token, _ in tokenize(name):
        compact = token.replace(' ', '')
        if token != ',':
            tagged.append(template.format(compact, label))
        else:
            tagged.append(template.format(compact, self.separator_tag))
    return six.u(' ').join(tagged)