Python tokenizeの例、geodata.text.tokenize.tokenize Pythonの例

コード例 #1

0

ファイルを表示

    def gen_phrases(self, s, canonical_only=False, languages=None):
        """Yield the text of every dictionary phrase found in ``s``.

        ``s`` is tokenized, word tokens are lowercased, and the normalized
        tokens are passed to ``self.filter``. Only results whose class is
        ``token_types.PHRASE`` are considered.

        Args:
            s: Text to scan.
            canonical_only: When true, a phrase is yielded only for entries
                whose ``is_canonical`` field parses to a non-zero integer.
            languages: A single language code or a collection of codes.
                Falsy values (``None``, empty collection) disable language
                filtering.

        Yields:
            The space-joined phrase text. When no filtering is requested the
            phrase is yielded once per match; otherwise it is yielded once
            per matching ``data`` entry, so the same phrase may repeat.
        """
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t,
                        c) for t, c in tokens]

        if not languages:
            languages = None
        elif (isinstance(languages, (six.text_type, six.binary_type)) or
              not hasattr(languages, '__iter__')):
            # A bare string is one language code. On Python 3 strings expose
            # __iter__, so the hasattr() test alone would not wrap them.
            languages = [languages]

        # None means "no language filter" and must never reach set(), which
        # would raise TypeError on it.
        if languages is not None and not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c != token_types.PHRASE:
                continue

            if not canonical_only and languages is None:
                yield six.u(' ').join([t_i for t_i, c_i in t])
                continue

            # Built lazily on the first matching entry, then reused.
            phrase = None
            for d in data:
                # NOTE(review): the entry is split on a bytes separator, so
                # on Python 3 ``lang`` is presumably bytes -- confirm that
                # callers pass language codes of the matching type.
                lang, dictionary, is_canonical, canonical = d.split(
                    six.b('|'))

                canonical_ok = bool(int(is_canonical)) or not canonical_only
                if canonical_ok and (languages is None
                                     or lang in languages
                                     or lang == 'all'):
                    if phrase is None:
                        phrase = six.u(' ').join([t_i for t_i, c_i in t])
                    yield phrase

コード例 #2

0

ファイルを表示

ファイル: formatter.py プロジェクト: BERENZ/libpostal

        def validate_chinese_house_number(cls, house_number):
            """Check a house number, accepting the markers 号, 栋 and 附.

            Returns False for a falsy ``house_number``. Returns True when
            every token is either of a numeric token type or one of the
            three marker characters. Otherwise the decision is delegated to
            ``cls.validate_house_number``.
            """
            if not house_number:
                return False

            for token, token_class in tokenize(house_number):
                if token_class in token_types.NUMERIC_TOKEN_TYPES:
                    continue
                if token in (u'号', u'栋', u'附'):
                    continue
                # First token that is neither numeric nor a marker: fall back
                # to the general validator.
                return cls.validate_house_number(house_number)
            return True

コード例 #3

0

ファイルを表示

ファイル: formatter.py プロジェクト: rinigus/deb-libpostal

        def validate_chinese_house_number(cls, house_number):
            """Validate a house number that may contain 号, 栋 or 附.

            A falsy value is rejected. A value made up only of numeric-type
            tokens and the three marker characters is accepted. Anything
            else is passed on to ``cls.validate_house_number``.
            """
            if not house_number:
                return False

            tokens = tokenize(house_number)
            has_other_token = any(
                c not in token_types.NUMERIC_TOKEN_TYPES and
                t not in (u'号', u'栋', u'附')
                for t, c in tokens)

            if has_other_token:
                return cls.validate_house_number(house_number)
            return True

コード例 #4

0

ファイルを表示

ファイル: extraction.py プロジェクト: BERENZ/libpostal

    def add_ngrams(self, s, n=2):
        """Count n-grams over runs of consecutive word tokens in ``s``.

        The tokens of ``s`` are split into maximal runs whose token class is
        in ``self.WORD_TOKEN_TYPES``; any other token ends the current run.
        For each n-gram produced by ``self.ngrams`` over a run, the leading
        ``n - 1`` lowercased tokens form a prefix. If that prefix is a key of
        ``self.vocab``, the counter ``self.frequencies[(prefix, last_token)]``
        is incremented.

        Args:
            s: Text to tokenize.
            n: N-gram size, forwarded to ``self.ngrams``.
        """
        sequences = []
        seq = []
        for t, c in tokenize(s):
            if c in self.WORD_TOKEN_TYPES:
                seq.append((t, c))
            elif seq:
                # A non-word token closes the run collected so far.
                sequences.append(seq)
                seq = []
        if seq:
            sequences.append(seq)

        for seq in sequences:
            for gram in self.ngrams(seq, n=n):
                prev_tokens = tuple([(t.lower(), c) for t, c in gram[:-1]])
                if prev_tokens not in self.vocab:
                    continue

                t, c = gram[-1]
                current_token = (t.lower(), c)
                self.frequencies[(prev_tokens, current_token)] += 1

コード例 #5

0

ファイルを表示

    def add_ngrams(self, s, n=2):
        """Update ``self.frequencies`` with the word n-grams found in ``s``.

        Tokens whose class is in ``self.WORD_TOKEN_TYPES`` are grouped into
        contiguous runs. Each n-gram that ``self.ngrams`` yields for a run is
        split into a lowercased prefix (all but the last token) and a
        lowercased final token. The pair is counted only when the prefix is
        already present in ``self.vocab``.

        Args:
            s: Text to tokenize.
            n: N-gram size, forwarded to ``self.ngrams``.
        """
        sequences = []
        seq = []
        for t, c in tokenize(s):
            if c in self.WORD_TOKEN_TYPES:
                seq.append((t, c))
            elif seq:
                # Run interrupted by a non-word token: store it, start anew.
                sequences.append(seq)
                seq = []
        if seq:
            sequences.append(seq)

        for seq in sequences:
            for gram in self.ngrams(seq, n=n):
                prev_tokens = tuple([(t.lower(), c) for t, c in gram[:-1]])
                if prev_tokens in self.vocab:
                    t, c = gram[-1]
                    self.frequencies[(prev_tokens, (t.lower(), c))] += 1

コード例 #6

0

ファイルを表示

ファイル: gazetteers.py プロジェクト: BERENZ/libpostal

    def gen_phrases(self, s, canonical_only=False, languages=None):
        """Generate the dictionary phrases that occur in ``s``.

        The input is tokenized, word tokens are lowercased, and
        ``self.filter`` is run over the normalized tokens. Results typed
        ``token_types.PHRASE`` are turned into space-joined strings.

        Args:
            s: Text to scan.
            canonical_only: Restrict output to entries whose ``is_canonical``
                field parses to a non-zero integer.
            languages: One language code or a collection of codes. A falsy
                value means no language restriction.

        Yields:
            The phrase text: once per match when no restriction applies,
            otherwise once for every ``data`` entry that passes the
            canonical and language checks.
        """
        tokens = tokenize(s)
        norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

        if not languages:
            languages = None
        elif isinstance(languages, (six.text_type, six.binary_type)) or not hasattr(languages, '__iter__'):
            # Treat a lone string as a single code; Python 3 strings have
            # __iter__, so they would otherwise slip past the hasattr() test.
            languages = [languages]

        # Skip the set() conversion for None: set(None) raises TypeError,
        # which made the default call (languages=None) fail.
        if languages is not None and not hasattr(languages, '__contains__'):
            languages = set(languages)

        for t, c, length, data in self.filter(norm_tokens):
            if c == token_types.PHRASE:
                if not canonical_only and languages is None:
                    yield six.u(' ').join([t_i for t_i, c_i in t])
                else:
                    # Joined text is computed at most once per match.
                    phrase = None
                    for d in data:
                        # NOTE(review): fields are split on a bytes separator;
                        # on Python 3 ``lang`` is presumably bytes -- confirm
                        # the type of the codes passed in ``languages``.
                        lang, dictionary, is_canonical, canonical = d.split(six.b('|'))

                        if (bool(int(is_canonical)) or not canonical_only) and (languages is None or lang in languages or lang == 'all'):
                            if phrase is None:
                                phrase = six.u(' ').join([t_i for t_i, c_i in t])
                            yield phrase

コード例 #7

0

ファイルを表示

ファイル: formatter.py プロジェクト: xiamx/libpostal

    def format_address(self,
                       country,
                       components,
                       minimal_only=True,
                       tag_components=True,
                       replace_aliases=True,
                       template_replacements=False):
        """Render ``components`` with the address template for ``country``.

        Args:
            country: Country code; upper-cased before the lookup in
                ``self.config``.
            components: Mapping of component name to value. It is passed to
                ``self.replace_aliases`` and ``self.apply_replacements``
                directly (presumably modified in place -- confirm).
            minimal_only: When true, return ``None`` unless
                ``self.minimal_components(components)`` is truthy.
            tag_components: When true, every token of every value is
                rewritten as ``token/label`` (spaces removed from the token,
                spaces in the label replaced by underscores) and the template
                separators are tagged as well.
            replace_aliases: Call ``self.replace_aliases`` first.
            template_replacements: Call ``self.apply_replacements`` before
                rendering.

        Returns:
            The rendered text after ``self.post_replacements``, or ``None``
            when the country has no template or the minimal-components check
            fails.
        """
        template = self.config.get(country.upper())
        if not template:
            return None
        template_text = template['address_template']
        if replace_aliases:
            self.replace_aliases(components)

        if minimal_only and not self.minimal_components(components):
            return None

        if template_replacements:
            self.apply_replacements(template, components)

        if tag_components:
            template_text = self.tag_template_separators(template_text)
            # .items() is available on both Python 2 and 3, whereas
            # .iteritems() exists only on Python 2 dicts.
            components = {
                k: u' '.join([
                    u'{}/{}'.format(t.replace(' ', ''), k.replace(' ', '_'))
                    for t, c in tokenize(v)
                ])
                for k, v in components.items()
            }

        text = self.render_template(template_text,
                                    components,
                                    tagged=tag_components)

        text = self.post_replacements(template, text)
        return text

コード例 #8

0

ファイルを表示

 def tagged_tokens(self, name, label):
     """Return the tokens of ``name`` as a space-joined ``token/tag`` string.

     Spaces inside a token are removed. A token equal to ``','`` is tagged
     with ``self.separator_tag``; every other token is tagged with ``label``.
     """
     tagged = []
     for token, _ in tokenize(name):
         tag = self.separator_tag if token == ',' else label
         tagged.append(six.u('{}/{}').format(token.replace(' ', ''), tag))
     return six.u(' ').join(tagged)

コード例 #9

0

ファイルを表示

ファイル: utils.py プロジェクト: rinigus/deb-libpostal

def is_numeric_strict(s):
    """Return True when every token of ``s`` has class ``token_types.NUMERIC``.

    The result is also True when tokenizing ``s`` yields no tokens at all.
    """
    tokens = tokenize(s)
    numeric_count = 0
    for _, token_class in tokens:
        if token_class == token_types.NUMERIC:
            numeric_count += 1
    return numeric_count == len(tokens)

コード例 #10

0

ファイルを表示

ファイル: extraction.py プロジェクト: BERENZ/libpostal

 def add_tokens(self, s):
     """Count the word tokens of ``s`` as unigrams.

     For every token whose class is in ``self.WORD_TOKEN_TYPES``, the
     one-element key ``((lowercased_token, token_class),)`` is incremented
     in ``self.vocab`` and ``self.train_words`` grows by one.
     """
     for token, token_class in tokenize(s):
         if token_class not in self.WORD_TOKEN_TYPES:
             continue
         unigram = ((token.lower(), token_class), )
         self.vocab[unigram] += 1
         self.train_words += 1

コード例 #11

0

ファイルを表示

ファイル: numbering.py プロジェクト: BERENZ/libpostal

    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        """Combine ``num`` with a phrase sampled from the address config.

        A phrase is drawn via ``address_config.alternative_probabilities``
        and ``weighted_choice``. One way of expressing the number is then
        chosen (standalone phrase, bare number, plain number, affix, or
        ordinal) and the two parts are joined.

        Args:
            key: Config key for the phrase. When ``is_alpha`` is true the
                key ``'<key>.alpha'`` is tried first.
            num: The value to render. May be ``None``, something ``int()``
                or ``float()`` accepts, or an alphanumeric string.
            language: Passed to the config lookup and to the number helpers.
            country: Optional, forwarded to the config lookup.
            dictionaries: Forwarded to the config lookup.
            strict_numeric: When true and ``num`` contains alphabetic
                characters, return the decoded ``num`` unchanged.
            is_alpha: Prefer the ``.alpha`` variant of ``key``.

        Returns:
            The rendered text. Returns ``None`` when ``num`` is ``None`` and
            no phrase is configured, when no usable option remains for an
            alphanumeric value, or when an integer violates
            ``number_min_abs_value`` / ``number_max_abs_value``.
        """
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                # Only the success or failure of the conversion matters here.
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Neither int nor float: inspect the tokens instead.
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)

        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            key = '{}_probability'.format(num_type)
            prob = phrase_props.get(key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # Configured without a probability: it takes all remaining
                # weight and ends the scan.
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            allowed = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            # zip(*[]) cannot be unpacked into two names, so an empty result
            # is handled like the zero-total case below.
            if not allowed:
                return None
            values, probs = zip(*allowed)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)

コード例 #12

0

ファイルを表示

 def add_tokens(self, s):
     """Add each word token of ``s`` to the unigram vocabulary.

     Tokens whose class is not in ``self.WORD_TOKEN_TYPES`` are ignored.
     Each remaining token increments ``self.vocab`` under the key
     ``((lowercased_token, token_class),)`` and bumps ``self.train_words``.
     """
     for word, word_class in tokenize(s):
         if word_class in self.WORD_TOKEN_TYPES:
             key = ((word.lower(), word_class), )
             self.vocab[key] += 1
             self.train_words += 1

コード例 #13

0

ファイルを表示

    def numeric_phrase(cls,
                       key,
                       num,
                       language,
                       country=None,
                       dictionaries=(),
                       strict_numeric=False,
                       is_alpha=False):
        """Render ``num`` with a phrase drawn from the address config.

        The phrase comes from ``address_config.alternative_probabilities``
        via ``weighted_choice``. A number style is then selected from
        ``standalone``, ``null``, ``numeric``, ``numeric_affix`` and
        ``ordinal``, and phrase and number are joined according to the
        style's ``direction`` and whitespace settings.

        Args:
            key: Config key for the phrase. ``'<key>.alpha'`` is looked up
                first when ``is_alpha`` is true.
            num: Value to render: ``None``, something ``int()`` or
                ``float()`` accepts, or an alphanumeric string.
            language: Forwarded to the config lookup and number helpers.
            country: Optional, forwarded to the config lookup.
            dictionaries: Forwarded to the config lookup.
            strict_numeric: When true and ``num`` contains alphabetic
                characters, the decoded ``num`` is returned as is.
            is_alpha: Prefer the ``.alpha`` variant of ``key``.

        Returns:
            The rendered text. Returns ``None`` when ``num`` is ``None`` and
            no phrase is configured, when an alphanumeric value leaves no
            usable style, or when an integer falls outside
            ``number_min_abs_value`` / ``number_max_abs_value``.
        """
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                # The conversion result is not needed, only whether it works.
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Not a plain number: classify by token contents.
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)

        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities(
                '{}.alpha'.format(key),
                language,
                dictionaries=dictionaries,
                country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(
                key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix',
                         'ordinal'):
            key = '{}_probability'.format(num_type)
            prob = phrase_props.get(key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # Present without an explicit probability: weight 1.0, and
                # later styles are not considered.
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            kept = [(v, p) for v, p in zip(values, probs)
                    if v in ('numeric', 'null', 'standalone')]
            # With nothing kept, zip(*kept) is empty and the two-name
            # unpacking would raise ValueError; treat it like zero weight.
            if not kept:
                return None
            values, probs = zip(*kept)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props[
                    'number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props[
                    'number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num,
                                 language,
                                 digits_props,
                                 num_type=Digits.CARDINAL
                                 if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random(
        ) < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(
                num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        if 'null_phrase_probability' in props and (
                num_type == 'ordinal' or
            (has_alpha and
             (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)

コード例 #14

0

ファイルを表示

ファイル: utils.py プロジェクト: BERENZ/libpostal

def is_numeric_strict(s):
    """Tell whether ``s`` tokenizes purely into ``token_types.NUMERIC`` tokens.

    Input that produces no tokens also returns True.
    """
    tokens = tokenize(s)
    strict_numeric = [c for _, c in tokens if c == token_types.NUMERIC]
    return len(strict_numeric) == len(tokens)

コード例 #15

0

ファイルを表示

ファイル: utils.py プロジェクト: rinigus/deb-libpostal

def is_numeric(s):
    """Return True when every token of ``s`` has a numeric token type.

    A token counts when its class is in ``token_types.NUMERIC_TOKEN_TYPES``.
    Input that produces no tokens also returns True.
    """
    tokens = tokenize(s)
    matched = 0
    for _, token_class in tokens:
        if token_class in token_types.NUMERIC_TOKEN_TYPES:
            matched += 1
    return matched == len(tokens)

コード例 #16

0

ファイルを表示

ファイル: utils.py プロジェクト: BERENZ/libpostal

def is_numeric(s):
    """Tell whether all tokens of ``s`` are in ``token_types.NUMERIC_TOKEN_TYPES``.

    Returns True as well when tokenizing ``s`` yields nothing.
    """
    tokens = tokenize(s)
    numeric_classes = [c for _, c in tokens if c in token_types.NUMERIC_TOKEN_TYPES]
    return len(numeric_classes) == len(tokens)

コード例 #17

0

ファイルを表示

ファイル: formatter.py プロジェクト: BERENZ/libpostal

 def tagged_tokens(self, name, label):
     """Tag each token of ``name`` and join the results with single spaces.

     Every token is emitted as ``token/tag`` with the spaces removed from
     the token. The tag is ``self.separator_tag`` for a ``','`` token and
     ``label`` for anything else.
     """
     pieces = []
     for tok, _ in tokenize(name):
         if tok != ',':
             tag = label
         else:
             tag = self.separator_tag
         pieces.append(six.u('{}/{}').format(tok.replace(' ', ''), tag))
     return six.u(' ').join(pieces)