Esempio n. 1
0
def to_valid_latitude(latitude):
    '''
    Validate a latitude and pull exact-pole values slightly inward.

    A latitude that isclose() reports as equal to 90.0 is replaced by
    89.9999, and one equal to -90.0 by -89.9999. Any other latitude
    accepted by is_valid_latitude() is returned unchanged.

    Raises ValueError when is_valid_latitude() rejects the input.
    '''
    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude {}'.format(latitude))

    # North pole is tested first, then the south pole
    if isclose(latitude, 90.0):
        return 89.9999
    if isclose(latitude, -90.0):
        return -89.9999
    return latitude
Esempio n. 2
0
def to_valid_latitude(latitude):
    '''
    Return a latitude that is valid and not exactly on a pole.

    Inputs rejected by is_valid_latitude() raise ValueError. A value
    that isclose() considers equal to 90.0 becomes 89.9999, a value
    equal to -90.0 becomes -89.9999, and everything else is returned
    as given.
    '''
    if not is_valid_latitude(latitude):
        raise ValueError('Invalid latitude {}'.format(latitude))

    # (pole, replacement) pairs, checked in the same order as an
    # if/elif chain: north first, then south
    for pole, nudged in ((90.0, 89.9999), (-90.0, -89.9999)):
        if isclose(latitude, pole):
            return nudged

    return latitude
Esempio n. 3
0
    def insertion_distribution(self, insertions):
        '''
        Turn an insertions config mapping into (values, cumulative probs).

        Entries keyed 'conditional' and entries with a falsy value are
        skipped. Every other entry must contain one of the keys 'before',
        'after', 'last' or 'first' (checked in that order) and a
        'probability'. Each one contributes a (position, target) tuple,
        where target is None for 'last' and 'first'.

        If the probabilities do not sum to 1, the remainder is given to a
        trailing "do nothing" entry. Returns the list of values and
        cdf() of their probabilities.

        Raises ValueError for an entry with none of the position keys.
        '''
        actions = []
        weights = []

        for name, spec in six.iteritems(insertions):
            if name == 'conditional' or not spec:
                continue

            if 'before' in spec:
                action = (self.BEFORE, spec['before'])
            elif 'after' in spec:
                action = (self.AFTER, spec['after'])
            elif 'last' in spec:
                action = (self.LAST, None)
            elif 'first' in spec:
                action = (self.FIRST, None)
            else:
                raise ValueError(
                    'Insertions must contain one of {{first, before, after, last}}. Value was: {}'
                    .format(spec))

            weight = spec['probability']
            actions.append(action)
            weights.append(weight)

        # Leftover probability mass goes to a "do nothing" action
        total = sum(weights)
        if not isclose(total, 1.0):
            weights.append(1.0 - total)
            # NOTE(review): this placeholder has three elements while the
            # other values are pairs - confirm callers expect that.
            actions.append((None, None, False))

        return actions, cdf(weights)
Esempio n. 4
0
def check_probability_distribution(probs):
    '''
    Assert that probs is a valid discrete probability distribution.

    Each probability must lie in [0.0, 1.0] and, according to isclose(),
    the total must equal 1.0. Any violation fails an assert, so the
    checks raise AssertionError (and are skipped when asserts are
    disabled). Returns None.
    '''
    running_total = 0.0
    for prob in probs:
        assert prob >= 0.0, 'Probabilities cannot be negative'
        assert prob <= 1.0, 'Probabilities cannot be > 1.0'
        running_total += prob

    sum_message = 'Probabilities must sum to 1: probs={}, cumulative={}'
    assert isclose(running_total, 1.0), sum_message.format(probs, running_total)
Esempio n. 5
0
    def insertion_distribution(self, insertions):
        '''
        Build the candidate insertion actions and their cumulative
        probabilities from an insertions config mapping.

        The 'conditional' entry and entries with a falsy value are
        ignored. Each remaining entry yields a (position, target) tuple
        based on the first of 'before', 'after', 'last', 'first' it
        contains, weighted by its 'probability'. target is None for
        'last' and 'first'.

        When the weights do not sum to 1, a final "do nothing" entry
        receives the remaining probability. Returns (values, cdf(probs)).

        Raises ValueError if an entry has none of the position keys.
        '''
        # (config key, name of the position constant on self,
        #  whether the value stored under the key is kept as the target)
        placements = (
            ('before', 'BEFORE', True),
            ('after', 'AFTER', True),
            ('last', 'LAST', False),
            ('first', 'FIRST', False),
        )

        values = []
        probs = []

        for k, v in six.iteritems(insertions):
            if k == 'conditional' or not v:
                continue

            for keyword, const_name, keeps_target in placements:
                if keyword in v:
                    val = (getattr(self, const_name),
                           v[keyword] if keeps_target else None)
                    break
            else:
                raise ValueError('Insertions must contain one of {{first, before, after, last}}. Value was: {}'.format(v))

            prob = v['probability']
            values.append(val)
            probs.append(prob)

        # If the probabilities don't sum to 1, add a "do nothing" action
        mass = sum(probs)
        if not isclose(mass, 1.0):
            probs.append(1.0 - mass)
            # NOTE(review): 3-tuple here versus pairs above - confirm
            # against the code that consumes these values.
            values.append((None, None, False))

        return values, cdf(probs)
Esempio n. 6
0
def check_probability_distribution(probs):
    '''
    Check, via assertions, that probs forms a probability distribution.

    Fails with AssertionError if any entry is below 0.0 or above 1.0,
    or if the entries do not add up to 1.0 as judged by isclose().
    Nothing is returned on success.
    '''
    cumulative = 0.0
    for p in probs:
        assert p >= 0.0, 'Probabilities cannot be negative'
        assert p <= 1.0, 'Probabilities cannot be > 1.0'
        cumulative = cumulative + p

    sums_to_one = isclose(cumulative, 1.0)
    assert sums_to_one, (
        'Probabilities must sum to 1: probs={}, cumulative={}'.format(
            probs, cumulative))
Esempio n. 7
0
    def create_from_osm_file(cls, filename, output_dir, precision=None):
        '''
        Build a point index from the elements of an OSM file.

        Each element yielded by parse_osm(filename) that has usable
        lat/lon properties is added with add_point() to a new index
        created as cls(save_dir=output_dir, precision=precision). Only
        the 'id' and 'type' properties and those matching
        cls.include_property_patterns (the exact key, or "<prefix>:*"
        for keys containing a colon) are stored; 'type' is then set to
        'node' and 'id' to the numeric element id.

        Note: the input file is expected to have been created using
        osmfilter. Use fetch_osm_address_data.sh for planet or copy the
        admin borders commands if using other bounds.

        precision defaults to cls.GEOHASH_PRECISION. Returns the index.
        '''
        if precision is None:
            precision = cls.GEOHASH_PRECISION

        index = cls(save_dir=output_dir, precision=precision)

        def is_included(prop_key):
            # id/type always pass; otherwise match exactly or by "<prefix>:*"
            if prop_key in ('id', 'type') or prop_key in cls.include_property_patterns:
                return True
            if six.u(':') not in prop_key:
                return False
            wildcard = six.u('{}:*').format(prop_key.split(six.u(':'), 1)[0])
            return wildcard in cls.include_property_patterns

        num_points = 0
        for element_id, raw_props, _deps in parse_osm(filename):
            decoded = dict(
                (safe_decode(k), safe_decode(v))
                for k, v in six.iteritems(raw_props))

            node_id = long(element_id.split(':')[-1])

            lat, lon = decoded.get('lat'), decoded.get('lon')
            if lat is None or lon is None:
                continue
            lat, lon = latlon_to_decimal(lat, lon)
            if lat is None or lon is None:
                continue

            # A longitude of (approximately) 180 is replaced by 179.999
            if isclose(lon, 180.0):
                lon = 179.999

            kept = dict(
                (k, v) for k, v in six.iteritems(decoded) if is_included(k))
            kept['type'] = 'node'
            kept['id'] = node_id

            index.add_point(lat, lon, kept)

            # Progress counter only advances for points actually added
            if num_points > 0 and num_points % 1000 == 0:
                print('did {} points'.format(num_points))
            num_points += 1

        return index
Esempio n. 8
0
    def check_components(self, language, country):
        '''
        Assert that every component's probabilities sum to 1.

        Reads the 'components' property for the language/country from
        address_config. For each component except 'combinations', the
        values whose key ends with 'probability' are added up and the
        total is asserted to be close to 1.0; the failure message names
        the language, country and component.
        '''
        conf = address_config.get_property(
            'components', language, country=country)

        for component, value in six.iteritems(conf):
            if component == 'combinations':
                continue

            total_prob = sum(
                (v for k, v in six.iteritems(value)
                 if k.endswith('probability')),
                0.0)

            sums_to_one = isclose(total_prob, 1.0)
            failure_message = six.u(
                'language: {}, country: {}, component: {}'.format(
                    language, country, component))
            self.assertTrue(sums_to_one, failure_message)
Esempio n. 9
0
    def rewrite(cls, d, lang, props, num_type=CARDINAL):
        '''
        Randomly re-express the digits d in one of several styles.

        props may carry '<style>_probability' entries for the spellout,
        full-width and roman numeral styles; any probability mass left
        over goes to the plain ASCII style, which returns the decoded
        input as-is. With falsy props, d is returned untouched.

        For roman numerals, an ordinal suffix for lang (and
        props['gender'], if any) is appended with probability
        props['ordinal_suffix_probability'] (default 0.0).

        num_type is passed through to rewrite_spellout.
        '''
        if not props:
            return d

        d = safe_decode(d)

        candidates = [
            style
            for style in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH,
                          cls.ROMAN_NUMERAL)
            if '{}_probability'.format(style) in props
        ]
        weights = [props['{}_probability'.format(style)]
                   for style in candidates]

        # Whatever is not claimed by a configured style stays ASCII
        claimed = sum(weights)
        if not isclose(claimed, 1.0):
            candidates.append(cls.ASCII)
            weights.append(1.0 - claimed)

        chosen = weighted_choice(candidates, cdf(weights))

        if chosen == cls.ASCII:
            return d
        if chosen == cls.SPELLOUT:
            return cls.rewrite_spellout(d, lang, num_type, props)
        if chosen == cls.ROMAN_NUMERAL:
            numeral = cls.rewrite_roman_numeral(d)
            if random.random() < props.get('ordinal_suffix_probability', 0.0):
                suffix = ordinal_expressions.get_suffix(
                    d, lang, gender=props.get('gender', None))
                if suffix:
                    numeral = six.u('{}{}').format(numeral, suffix)
            return numeral
        if chosen == cls.UNICODE_FULL_WIDTH:
            return cls.rewrite_full_width(d)
        return d
Esempio n. 10
0
    def rewrite(cls, d, lang, props, num_type=CARDINAL):
        '''
        Pick a digit style at random and rewrite d in that style.

        The spellout, full-width and roman numeral styles are candidates
        when props has a matching '<style>_probability' key. If those
        probabilities do not sum to 1, the ASCII style (decoded input
        returned unchanged) takes the remainder. Falsy props returns d
        without decoding it.

        A roman numeral result may get an ordinal suffix, with
        probability props['ordinal_suffix_probability'] (default 0.0).
        num_type is forwarded to rewrite_spellout.
        '''
        if not props:
            return d

        d = safe_decode(d)

        # Collect (style, probability) for each configured style
        options = []
        for style in (cls.SPELLOUT, cls.UNICODE_FULL_WIDTH, cls.ROMAN_NUMERAL):
            prob_key = '{}_probability'.format(style)
            if prob_key in props:
                options.append((style, props[prob_key]))

        values = [style for style, _ in options]
        probs = [prob for _, prob in options]

        configured_mass = sum(probs)
        if not isclose(configured_mass, 1.0):
            values.append(cls.ASCII)
            probs.append(1.0 - configured_mass)

        digit_type = weighted_choice(values, cdf(probs))

        if digit_type == cls.ASCII:
            result = d
        elif digit_type == cls.SPELLOUT:
            result = cls.rewrite_spellout(d, lang, num_type, props)
        elif digit_type == cls.ROMAN_NUMERAL:
            result = cls.rewrite_roman_numeral(d)
            if random.random() < props.get('ordinal_suffix_probability', 0.0):
                ordinal_suffix = ordinal_expressions.get_suffix(d, lang, gender=props.get('gender', None))
                if ordinal_suffix:
                    result = six.u('{}{}').format(result, ordinal_suffix)
        elif digit_type == cls.UNICODE_FULL_WIDTH:
            result = cls.rewrite_full_width(d)
        else:
            result = d

        return result
Esempio n. 11
0
    def polygons(self, properties_only=False):
        '''
        Generator over the polygon-bearing ways and relations of the
        OSM file, yielding tuples like:

        (element_id, properties, admin_center, outer_polygons, inner_polygons)

        or, when properties_only is true, only:

        (element_id, properties, admin_center)

        element_id is the way id plus WAY_OFFSET, or the relation id
        plus RELATION_OFFSET. admin_center is always {} for ways; for
        relations it is the properties of the admin_centre node when
        exactly one such node was stored, otherwise {}.

        At this point a polygon is a list of coordinate tuples,
        suitable for passing to shapely's Polygon constructor
        but may be used for other purposes.

        outer_polygons is a list of the exterior polygons for this
        boundary. inner_polygons is a list of "holes" in the exterior
        polygons although donuts and donut-holes need to be matched
        by the caller using something like shapely's contains.
        '''
        # Counts elements that reach the end of the loop body; elements
        # skipped with `continue` are not counted
        i = 0

        for element_id, props, deps in parse_osm(self.filename,
                                                 dependencies=True):
            props = {
                safe_decode(k): safe_decode(v)
                for k, v in six.iteritems(props)
            }
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                lat = props.get('lat')
                lon = props.get('lon')
                if lat is None or lon is None:
                    continue
                lat, lon = latlon_to_decimal(lat, lon)
                if lat is None or lon is None:
                    continue

                # Values (approximately) equal to 90 / 180 are replaced
                # by 89.999 / 179.999
                if isclose(lat, 90.0):
                    lat = 89.999

                if isclose(lon, 180.0):
                    lon = 179.999

                # Named place nodes are kept so relations can look them up
                # through their admin_centre / label members below
                if 'name' in props and 'place' in props:
                    self.nodes[node_id] = props

                # Nodes are stored in a sorted array, coordinate indices are simply
                # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
                # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                way_id = long(element_id.split(':')[-1])

                # Get node indices by binary search
                # (presumably ValueError means a member node was not
                # found; such ways are skipped entirely - TODO confirm)
                try:
                    node_indices = [
                        self.binary_search(self.node_ids, node_id)
                        for node_id in deps
                    ]
                except ValueError:
                    continue

                # Way ids stored in a sorted array
                self.way_ids.append(way_id)

                # way_deps is the list of dependent node ids
                # way_coords is a copy of coords indexed by way ids
                for node_id, node_index in izip(deps, node_indices):
                    self.way_deps.append(node_id)
                    self.way_coords.append(self.coords[node_index * 2])
                    self.way_coords.append(self.coords[node_index * 2 + 1])

                # End offset of this way's nodes within way_deps
                self.way_indptr.append(len(self.way_deps))

                # A closed way (first node == last node) that passes
                # include_polygon is yielded as a polygon of its own.
                # NOTE(review): deps[0] would raise IndexError for a way
                # with no member nodes - confirm parse_osm never yields one.
                if deps[0] == deps[-1] and self.include_polygon(props):
                    way_id_offset = WAY_OFFSET + way_id
                    if not properties_only:
                        outer_polys = self.create_polygons([way_id])
                        inner_polys = []
                        yield way_id_offset, props, {}, outer_polys, inner_polys
                    else:
                        yield way_id_offset, props, {}

            elif element_id.startswith('relation'):
                # The node lookup arrays are dropped at the first relation;
                # way_coords already holds the coordinates copied per way.
                # NOTE(review): presumably to free memory, and assumes no
                # node or way follows a relation in the input - confirm.
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                relation_id = long(element_id.split(':')[-1])
                # Skip relations without members, those rejected by
                # include_polygon, and multilinestrings
                if len(deps
                       ) == 0 or not self.include_polygon(props) or props.get(
                           'type', '').lower() == 'multilinestring':
                    continue

                outer_ways = []
                inner_ways = []
                admin_centers = []

                for elem_id, elem_type, role in deps:
                    # Ways with an empty role are treated as outer ways
                    if role in ('outer', '') and elem_type == 'way':
                        outer_ways.append(elem_id)
                    elif role == 'inner' and elem_type == 'way':
                        inner_ways.append(elem_id)
                    elif role == 'admin_centre' and elem_type == 'node':
                        val = self.nodes.get(long(elem_id))
                        if val is not None:
                            # Note: this updates the dict stored in self.nodes
                            val['type'] = 'node'
                            val['id'] = long(elem_id)
                            admin_centers.append(val)
                    elif role == 'label' and elem_type == 'node':
                        # A label node whose name matches the relation's
                        # (case-insensitively) fills in any properties the
                        # relation itself lacks
                        val = self.nodes.get(long(elem_id))
                        if val is not None and val.get(
                                'name', six.u('')).lower() == props.get(
                                    'name', six.u('')).lower():
                            props.update({
                                k: v
                                for k, v in six.iteritems(val)
                                if k not in props
                            })

                # Only an unambiguous (single) admin centre is reported
                admin_center = {}
                if len(admin_centers) == 1:
                    admin_center = admin_centers[0]

                relation_id_offset = RELATION_OFFSET + relation_id
                if not properties_only:
                    outer_polys = self.create_polygons(outer_ways)
                    inner_polys = self.create_polygons(inner_ways)
                    yield relation_id_offset, props, admin_center, outer_polys, inner_polys
                else:
                    yield relation_id_offset, props, admin_center
            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(
                    element_id.split(':')[0], i))
            i += 1
Esempio n. 12
0
    def polygons(self, properties_only=False):
        '''
        Generator over the polygon-bearing ways and relations of the
        OSM file, yielding tuples like:

        (element_id, properties, admin_center, outer_polygons, inner_polygons)

        or, when properties_only is true, only:

        (element_id, properties, admin_center)

        element_id is the way id plus WAY_OFFSET, or the relation id
        plus RELATION_OFFSET. admin_center is always {} for ways; for
        relations it is the properties of the admin_centre node when
        exactly one such node was stored, otherwise {}.

        At this point a polygon is a list of coordinate tuples,
        suitable for passing to shapely's Polygon constructor
        but may be used for other purposes.

        outer_polygons is a list of the exterior polygons for this
        boundary. inner_polygons is a list of "holes" in the exterior
        polygons although donuts and donut-holes need to be matched
        by the caller using something like shapely's contains.
        '''
        # Counts elements that reach the end of the loop body; elements
        # skipped with `continue` are not counted
        i = 0

        for element_id, props, deps in parse_osm(self.filename, dependencies=True):
            props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                lat = props.get('lat')
                lon = props.get('lon')
                if lat is None or lon is None:
                    continue
                lat, lon = latlon_to_decimal(lat, lon)
                if lat is None or lon is None:
                    continue

                # Values (approximately) equal to 90 / 180 are replaced
                # by 89.999 / 179.999
                if isclose(lat, 90.0):
                    lat = 89.999

                if isclose(lon, 180.0):
                    lon = 179.999

                # Named place nodes are kept so relations can look them up
                # through their admin_centre / label members below
                if 'name' in props and 'place' in props:
                    self.nodes[node_id] = props

                # Nodes are stored in a sorted array, coordinate indices are simply
                # [lon, lat, lon, lat ...] so the index can be calculated as 2 * i
                # Note that the pairs are lon, lat instead of lat, lon for geometry purposes
                self.coords.append(lon)
                self.coords.append(lat)
                self.node_ids.append(node_id)
            elif element_id.startswith('way'):
                way_id = long(element_id.split(':')[-1])

                # Get node indices by binary search
                # (presumably ValueError means a member node was not
                # found; such ways are skipped entirely - TODO confirm)
                try:
                    node_indices = [self.binary_search(self.node_ids, node_id) for node_id in deps]
                except ValueError:
                    continue

                # Way ids stored in a sorted array
                self.way_ids.append(way_id)

                # way_deps is the list of dependent node ids
                # way_coords is a copy of coords indexed by way ids
                for node_id, node_index in izip(deps, node_indices):
                    self.way_deps.append(node_id)
                    self.way_coords.append(self.coords[node_index * 2])
                    self.way_coords.append(self.coords[node_index * 2 + 1])

                # End offset of this way's nodes within way_deps
                self.way_indptr.append(len(self.way_deps))

                # A closed way (first node == last node) that passes
                # include_polygon is yielded as a polygon of its own.
                # NOTE(review): deps[0] would raise IndexError for a way
                # with no member nodes - confirm parse_osm never yields one.
                if deps[0] == deps[-1] and self.include_polygon(props):
                    way_id_offset = WAY_OFFSET + way_id
                    if not properties_only:
                        outer_polys = self.create_polygons([way_id])
                        inner_polys = []
                        yield way_id_offset, props, {}, outer_polys, inner_polys
                    else:
                        yield way_id_offset, props, {}

            elif element_id.startswith('relation'):
                # The node lookup arrays are dropped at the first relation;
                # way_coords already holds the coordinates copied per way.
                # NOTE(review): presumably to free memory, and assumes no
                # node or way follows a relation in the input - confirm.
                if self.node_ids is not None:
                    self.node_ids = None
                if self.coords is not None:
                    self.coords = None

                relation_id = long(element_id.split(':')[-1])
                # Skip relations without members, those rejected by
                # include_polygon, and multilinestrings
                if len(deps) == 0 or not self.include_polygon(props) or props.get('type', '').lower() == 'multilinestring':
                    continue

                outer_ways = []
                inner_ways = []
                admin_centers = []

                for elem_id, elem_type, role in deps:
                    # Ways with an empty role are treated as outer ways
                    if role in ('outer', '') and elem_type == 'way':
                        outer_ways.append(elem_id)
                    elif role == 'inner' and elem_type == 'way':
                        inner_ways.append(elem_id)
                    elif role == 'admin_centre' and elem_type == 'node':
                        val = self.nodes.get(long(elem_id))
                        if val is not None:
                            # Note: this updates the dict stored in self.nodes
                            val['type'] = 'node'
                            val['id'] = long(elem_id)
                            admin_centers.append(val)
                    elif role == 'label' and elem_type == 'node':
                        # A label node whose name matches the relation's
                        # (case-insensitively) fills in any properties the
                        # relation itself lacks
                        val = self.nodes.get(long(elem_id))
                        if val is not None and val.get('name', six.u('')).lower() == props.get('name', six.u('')).lower():
                            props.update({k: v for k, v in six.iteritems(val)
                                          if k not in props})

                # Only an unambiguous (single) admin centre is reported
                admin_center = {}
                if len(admin_centers) == 1:
                    admin_center = admin_centers[0]

                relation_id_offset = RELATION_OFFSET + relation_id
                if not properties_only:
                    outer_polys = self.create_polygons(outer_ways)
                    inner_polys = self.create_polygons(inner_ways)
                    yield relation_id_offset, props, admin_center, outer_polys, inner_polys
                else:
                    yield relation_id_offset, props, admin_center
            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
            i += 1
Esempio n. 13
0
    def numeric_phrase(cls, key, num, language, country=None, dictionaries=(), strict_numeric=False, is_alpha=False):
        '''
        Combine a number with a randomly chosen phrase from the address
        config, e.g. "Floor 2", "2/F" or "2nd Floor".

        key: config key passed to address_config.alternative_probabilities.
            When is_alpha is true, '<key>.alpha' is tried first and key is
            used only if that lookup returns no values (None).
        num: the number (int, numeric string, or a string such as "2L").
            None means "no number": the bare phrase is returned, or None
            if the config offers no phrases.
        language, country, dictionaries: forwarded to the config lookup.
        strict_numeric: if true and num is neither int- nor float-parsable
            and contains alphabetic characters, the decoded num is
            returned unchanged.
        is_alpha: see key.

        Returns a unicode string, or None when the number cannot be
        expressed (outside number_min/max_abs_value, or no usable
        expression type for an alphanumeric value).
        '''
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                int(num)
                is_integer = True
            except ValueError:
                try:
                    float(num)
                except ValueError:
                    # Neither int nor float: inspect the tokens to see
                    # whether the value has numeric and/or alphabetic parts
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)

        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities('{}.alpha'.format(key), language, dictionaries=dictionaries, country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()

        # There are a few ways we can express the number itself
        #
        # 1. Alias it as some standalone word like basement (for floor "-1")
        # 2. Use the number itself, so "Floor 2"
        # 3. Append/prepend an affix e.g. 2/F for second floor
        # 4. As an ordinal expression e.g. "2nd Floor"
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix', 'ordinal'):
            # Separate local name so the `key` argument is not overwritten
            prob_key = '{}_probability'.format(num_type)
            prob = phrase_props.get(prob_key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                # A type configured without a probability takes all the
                # remaining choices; later types are not considered
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            allowed = [(v, p) for v, p in zip(values, probs) if v in ('numeric', 'null', 'standalone')]
            # Nothing left to choose from: unpacking zip(*[]) would raise
            # ValueError, so treat it like the zero-probability case below
            if not allowed:
                return None

            values, probs = zip(*allowed)
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props['number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props['number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num, language, digits_props, num_type=Digits.CARDINAL if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random() < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            # The affix replaces the phrase and is joined without
            # whitespace by default
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        # Sometimes return the number with no phrase at all
        if 'null_phrase_probability' in props and (num_type == 'ordinal' or (has_alpha and (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
Esempio n. 14
0
    def numeric_phrase(cls,
                       key,
                       num,
                       language,
                       country=None,
                       dictionaries=(),
                       strict_numeric=False,
                       is_alpha=False):
        has_alpha = False
        has_numeric = True
        is_integer = False
        is_none = False
        if num is not None:
            try:
                num_int = int(num)
                is_integer = True
            except ValueError:
                try:
                    num_float = float(num)
                except ValueError:
                    tokens = tokenize(safe_decode(num))
                    has_numeric = False
                    for t, c in tokens:
                        if c == token_types.NUMERIC:
                            has_numeric = True
                        if any((ch.isalpha() for ch in t)):
                            has_alpha = True

                    if strict_numeric and has_alpha:
                        return safe_decode(num)

        else:
            is_none = True

        values, probs = None, None

        if is_alpha:
            values, probs = address_config.alternative_probabilities(
                '{}.alpha'.format(key),
                language,
                dictionaries=dictionaries,
                country=country)

        # Pick a phrase given the probability distribution from the config
        if values is None:
            values, probs = address_config.alternative_probabilities(
                key, language, dictionaries=dictionaries, country=country)

        if not values:
            return safe_decode(num) if not is_none else None

        phrase, phrase_props = weighted_choice(values, probs)

        values = []
        probs = []

        # Dictionaries are lowercased, so title case here
        if phrase_props.get('title_case', True):
            phrase = phrase.title()
        '''
        There are a few ways we can express the number itself

        1. Alias it as some standalone word like basement (for floor "-1")
        2. Use the number itself, so "Floor 2"
        3. Append/prepend an affix e.g. 2/F for second floor
        4. As an ordinal expression e.g. "2nd Floor"
        '''
        have_standalone = False
        have_null = False
        for num_type in ('standalone', 'null', 'numeric', 'numeric_affix',
                         'ordinal'):
            key = '{}_probability'.format(num_type)
            prob = phrase_props.get(key)
            if prob is not None:
                if num_type == 'standalone':
                    have_standalone = True
                elif num_type == 'null':
                    have_null = True
                values.append(num_type)
                probs.append(prob)
            elif num_type in phrase_props:
                values.append(num_type)
                probs.append(1.0)
                break

        if not probs or is_none:
            return phrase

        # If we're using something like "Floor A" or "Unit 2L", remove ordinal/affix items
        if has_alpha:
            values, probs = zip(*[(v, p) for v, p in zip(values, probs)
                                  if v in ('numeric', 'null', 'standalone')])
            total = float(sum(probs))
            if isclose(total, 0.0):
                return None

            probs = [p / total for p in probs]

        probs = cdf(probs)

        if len(values) < 2:
            if have_standalone:
                num_type = 'standalone'
            elif have_null:
                num_type = 'null'
            else:
                num_type = 'numeric'
        else:
            num_type = weighted_choice(values, probs)

        if num_type == 'standalone':
            return phrase
        elif num_type == 'null':
            return safe_decode(num)

        props = phrase_props[num_type]

        if is_integer:
            num_int = int(num)
            if phrase_props.get('number_abs_value', False):
                num_int = abs(num_int)
                num = num_int

            if 'number_min_abs_value' in phrase_props and num_int < phrase_props[
                    'number_min_abs_value']:
                return None

            if 'number_max_abs_value' in phrase_props and num_int > phrase_props[
                    'number_max_abs_value']:
                return None

            if phrase_props.get('number_subtract_abs_value'):
                num_int -= phrase_props['number_subtract_abs_value']
                num = num_int

        num = safe_decode(num)
        digits_props = props.get('digits')
        if digits_props:
            # Inherit the gender and category e.g. for ordinals
            for k in ('gender', 'category'):
                if k in props:
                    digits_props[k] = props[k]
            num = Digits.rewrite(num,
                                 language,
                                 digits_props,
                                 num_type=Digits.CARDINAL
                                 if num_type != 'ordinal' else Digits.ORDINAL)

        # Do we add the numeric phrase e.g. Floor No 1
        add_number_phrase = props.get('add_number_phrase', False)
        if add_number_phrase and random.random(
        ) < props['add_number_phrase_probability']:
            num = Number.phrase(num, language, country=country)

        whitespace_default = True

        if num_type == 'numeric_affix':
            phrase = props['affix']
            if props.get('upper_case', True):
                phrase = phrase.upper()
            if 'zero_pad' in props and num.isdigit():
                num = num.rjust(props['zero_pad'], props.get('zero_char', '0'))
            whitespace_default = False
        elif num_type == 'ordinal' and safe_decode(num).isdigit():
            ordinal_expression = ordinal_expressions.suffixed_number(
                num, language, gender=props.get('gender', None))

            if ordinal_expression is not None:
                num = ordinal_expression

        if 'null_phrase_probability' in props and (
                num_type == 'ordinal' or
            (has_alpha and
             (has_numeric or 'null_phrase_alpha_only' in props))):
            if random.random() < props['null_phrase_probability']:
                return num

        direction = props['direction']
        whitespace = props.get('whitespace', whitespace_default)

        whitespace_probability = props.get('whitespace_probability')
        if whitespace_probability is not None:
            whitespace = random.random() < whitespace_probability

        # Occasionally switch up if direction_probability is specified
        if random.random() > props.get('direction_probability', 1.0):
            if direction == 'left':
                direction = 'right'
            elif direction == 'right':
                direction = 'left'

        whitespace_phrase = six.u(' ') if whitespace else six.u('')
        # Phrase goes to the left of the number
        if direction == 'left':
            return six.u('{}{}{}').format(phrase, whitespace_phrase, num)
        # Phrase goes to the right of the number
        elif direction == 'right':
            return six.u('{}{}{}').format(num, whitespace_phrase, phrase)
        # Need to specify a direction, otherwise return naked number
        else:
            return safe_decode(num)
Esempio n. 15
0
    def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
        '''Load the boundary-names config and precompute its sampling tables.

        :param config_file: path to the YAML configuration file.

        Attributes populated from the ``names`` section of the config:

        * ``name_keys`` / ``name_key_probs``: default name keys and the
          ``cdf(...)`` of their probabilities.
        * ``component_name_keys``: component -> (name keys, cdf) overrides.
        * ``country_regex_replacements``: country -> list of
          (compiled pattern, replace group, replace probability).
        * ``prefixes`` / ``suffixes``: (language, component) ->
          (affix values, cdf). A trailing ``None`` value is appended when the
          configured probabilities do not sum to 1.
        * ``prefix_regexes`` / ``suffix_regexes``: (language, component) ->
          compiled regex anchored at the start / end of the string.
        * ``exceptions``: (object type, encoded object id) -> (keys, cdf).

        :raises AssertionError: if a prefix/suffix entry lacks its
            ``'prefix'``/``'suffix'`` key.
        '''
        # NOTE(review): yaml.load can construct arbitrary Python objects. That
        # is only acceptable while config_file is a trusted, project-owned
        # file; consider yaml.safe_load if that assumption ever changes.
        # The context manager closes the handle the original code leaked.
        with open(config_file) as config_fp:
            config = yaml.load(config_fp)

        default_names = nested_get(config, ('names', 'keys'))
        name_keys, name_probs = alternative_probabilities(default_names)

        self.name_keys = name_keys
        self.name_key_probs = cdf(name_probs)

        self.component_name_keys = {}

        for component, component_config in six.iteritems(nested_get(config, ('names', 'components'), default={})):
            component_names = component_config.get('keys')
            component_name_keys, component_probs = alternative_probabilities(component_names)
            self.component_name_keys[component] = (component_name_keys, cdf(component_probs))

        self.country_regex_replacements = defaultdict(list)
        for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
            country = props.get('country')
            # Patterns are case-insensitive unless the config opts out. The
            # previous code tried to clear the flag via ``re.flags`` (an
            # attribute of the ``re`` module that does not exist) and raised
            # AttributeError for every case-sensitive pattern.
            re_flags = re.UNICODE
            if props.get('case_insensitive', True):
                re_flags |= re.I

            pattern = re.compile(props['pattern'], re_flags)
            replace_group = props['replace_with_group']
            replace_probability = props['replace_probability']
            self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))

        # Freeze into a plain dict so later lookups of unknown countries do
        # not silently create empty entries.
        self.country_regex_replacements = dict(self.country_regex_replacements)

        self.prefixes = {}
        self.prefix_regexes = {}
        self.suffixes = {}
        self.suffix_regexes = {}

        for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={})):
            for component, affixes in six.iteritems(components):
                affix_values, affix_probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'prefix' not in val:
                        raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))

                prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
                # Group the alternation: "|" binds looser than "^", so an
                # ungrouped '^a|b' would only anchor the first alternative.
                self.prefix_regexes[(language, component)] = re.compile(six.u('^(?:{})').format(prefix_regex), re.I | re.U)

                # Leftover probability mass means "add no prefix".
                if not isclose(sum(affix_probs), 1.0):
                    affix_values.append(None)
                    affix_probs.append(1.0 - sum(affix_probs))
                self.prefixes[(language, component)] = affix_values, cdf(affix_probs)

        for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={})):
            for component, affixes in six.iteritems(components):
                affix_values, affix_probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'suffix' not in val:
                        raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))

                suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
                # Group the alternation so "$" anchors every alternative, not
                # just the last one.
                self.suffix_regexes[(language, component)] = re.compile(six.u('(?:{})$').format(suffix_regex), re.I | re.U)

                # Leftover probability mass means "add no suffix".
                if not isclose(sum(affix_probs), 1.0):
                    affix_values.append(None)
                    affix_probs.append(1.0 - sum(affix_probs))
                self.suffixes[(language, component)] = affix_values, cdf(affix_probs)

        self.exceptions = {}

        # Per-object overrides: the default key comes first, followed by any
        # configured alternatives, each with its own probability.
        for props in nested_get(config, ('names', 'exceptions'), default=[]):
            object_type = props['type']
            object_id = safe_encode(props['id'])
            keys = [props['default']]
            key_probs = [props['probability']]
            for alt in props.get('alternatives', []):
                keys.append(alt['alternative'])
                key_probs.append(alt['probability'])

            self.exceptions[(object_type, object_id)] = (keys, cdf(key_probs))
Esempio n. 16
0
    def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
        '''Load the boundary-names config and precompute its sampling tables.

        :param config_file: path to the YAML configuration file.

        Attributes populated from the ``names`` section of the config:

        * ``name_keys`` / ``name_key_probs``: default name keys and the
          ``cdf(...)`` of their probabilities.
        * ``component_name_keys``: component -> (name keys, cdf) overrides.
        * ``country_regex_replacements``: country -> list of
          (compiled pattern, replace group, replace probability).
        * ``prefixes`` / ``suffixes``: (language, component) ->
          (affix values, cdf). A trailing ``None`` value is appended when the
          configured probabilities do not sum to 1.
        * ``prefix_regexes`` / ``suffix_regexes``: (language, component) ->
          compiled regex anchored at the start / end of the string.
        * ``exceptions``: (object type, encoded object id) -> (keys, cdf).

        :raises AssertionError: if a prefix/suffix entry lacks its
            ``'prefix'``/``'suffix'`` key.
        '''
        # NOTE(review): yaml.load can construct arbitrary Python objects. That
        # is only acceptable while config_file is a trusted, project-owned
        # file; consider yaml.safe_load if that assumption ever changes.
        # The context manager closes the handle the original code leaked.
        with open(config_file) as config_fp:
            config = yaml.load(config_fp)

        default_names = nested_get(config, ('names', 'keys'))
        name_keys, name_probs = alternative_probabilities(default_names)

        self.name_keys = name_keys
        self.name_key_probs = cdf(name_probs)

        self.component_name_keys = {}

        component_configs = nested_get(config, ('names', 'components'),
                                       default={})
        for component, component_config in six.iteritems(component_configs):
            component_names = component_config.get('keys')
            component_name_keys, component_probs = alternative_probabilities(
                component_names)
            self.component_name_keys[component] = (component_name_keys,
                                                   cdf(component_probs))

        self.country_regex_replacements = defaultdict(list)
        regex_replacements = nested_get(config,
                                        ('names', 'regex_replacements'),
                                        default=[])
        for props in regex_replacements:
            country = props.get('country')
            # Patterns are case-insensitive unless the config opts out. The
            # previous code tried to clear the flag via ``re.flags`` (an
            # attribute of the ``re`` module that does not exist) and raised
            # AttributeError for every case-sensitive pattern.
            re_flags = re.UNICODE
            if props.get('case_insensitive', True):
                re_flags |= re.I

            pattern = re.compile(props['pattern'], re_flags)
            replace_group = props['replace_with_group']
            replace_probability = props['replace_probability']
            self.country_regex_replacements[country].append(
                (pattern, replace_group, replace_probability))

        # Freeze into a plain dict so later lookups of unknown countries do
        # not silently create empty entries.
        self.country_regex_replacements = dict(self.country_regex_replacements)

        self.prefixes = {}
        self.prefix_regexes = {}
        self.suffixes = {}
        self.suffix_regexes = {}

        prefix_config = nested_get(config, ('names', 'prefixes', 'language'),
                                   default={})
        for language, components in six.iteritems(prefix_config):
            for component, affixes in six.iteritems(components):
                affix_values, affix_probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'prefix' not in val:
                        raise AssertionError(
                            six.
                            u('Invalid prefix value for (language={}, component={}): {} '
                              ).format(language, component, val))

                prefix_regex = six.u('|').join([
                    six.u('(?:{} )').format(self._string_as_regex(v['prefix']))
                    if v.get('whitespace') else self._string_as_regex(
                        v['prefix']) for v in affix_values
                ])
                # Group the alternation: "|" binds looser than "^", so an
                # ungrouped '^a|b' would only anchor the first alternative.
                self.prefix_regexes[(language, component)] = re.compile(
                    six.u('^(?:{})').format(prefix_regex), re.I | re.U)

                # Leftover probability mass means "add no prefix".
                if not isclose(sum(affix_probs), 1.0):
                    affix_values.append(None)
                    affix_probs.append(1.0 - sum(affix_probs))
                self.prefixes[(language,
                               component)] = affix_values, cdf(affix_probs)

        suffix_config = nested_get(config, ('names', 'suffixes', 'language'),
                                   default={})
        for language, components in six.iteritems(suffix_config):
            for component, affixes in six.iteritems(components):
                affix_values, affix_probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'suffix' not in val:
                        raise AssertionError(
                            six.
                            u('Invalid suffix value for (language={}, component={}): {} '
                              ).format(language, component, val))

                suffix_regex = six.u('|').join([
                    six.u('(?: {})').format(self._string_as_regex(v['suffix']))
                    if v.get('whitespace') else self._string_as_regex(
                        v['suffix']) for v in affix_values
                ])
                # Group the alternation so "$" anchors every alternative, not
                # just the last one.
                self.suffix_regexes[(language, component)] = re.compile(
                    six.u('(?:{})$').format(suffix_regex), re.I | re.U)

                # Leftover probability mass means "add no suffix".
                if not isclose(sum(affix_probs), 1.0):
                    affix_values.append(None)
                    affix_probs.append(1.0 - sum(affix_probs))
                self.suffixes[(language,
                               component)] = affix_values, cdf(affix_probs)

        self.exceptions = {}

        # Per-object overrides: the default key comes first, followed by any
        # configured alternatives, each with its own probability.
        for props in nested_get(config, ('names', 'exceptions'), default=[]):
            object_type = props['type']
            object_id = safe_encode(props['id'])
            keys = [props['default']]
            key_probs = [props['probability']]
            for alt in props.get('alternatives', []):
                keys.append(alt['alternative'])
                key_probs.append(alt['probability'])

            self.exceptions[(object_type, object_id)] = (keys, cdf(key_probs))