Example #1
def main(url=NOMINATIM_SPECIAL_PHRASES_URL, output_dir=DEFAULT_CATEGORIES_DIR):
    languages = scrape_all_nominatim_category_pages(url=url)
    for lang, phrases in six.iteritems(languages):
        filename = os.path.join(output_dir, '{}.tsv'.format(lang.lower()))
        with open(filename, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerow(('key', 'value', 'is_plural', 'phrase'))

            for phrase, key, value, is_plural in phrases:
                writer.writerow((safe_encode(key), safe_encode(value),
                                str(int(is_plural)), safe_encode(phrase)))

    print('Done')
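
Every example on this page relies on the safe_encode/safe_decode helpers from the surrounding project. They are not shown in the snippets; a minimal sketch of what such helpers typically do is below (the real implementations may differ):

import six

def safe_decode(value, encoding='utf-8', errors='strict'):
    # Coerce any value to a unicode string (assumed behavior).
    if isinstance(value, six.text_type):
        return value
    if isinstance(value, six.binary_type):
        return value.decode(encoding, errors)
    return six.text_type(value)

def safe_encode(value, encoding='utf-8', errors='strict'):
    # Coerce any value to a UTF-8 byte string (assumed behavior).
    if isinstance(value, six.binary_type):
        return value
    return six.text_type(value).encode(encoding, errors)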
Example #3
    def write_to_tsv(self, out_filename, min_threshold=5):
        with open(out_filename, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for k, v in self.names_lower.most_common():
                if v < min_threshold:
                    break
                canonical = self.chain_canonical.get(k)
                if canonical:
                    canonical = self.names_cap[canonical].most_common(1)[0][0]
                else:
                    canonical = ''
                most_common_cap = self.names_cap[k].most_common(1)[0][0]
                writer.writerow((safe_encode(k), safe_encode(most_common_cap),
                                 safe_encode(canonical),
                                 safe_encode(1) if k in self.all_chains else '',
                                 safe_encode(v)))
Example #5
    def __init__(self, *dictionaries):
        self.dictionaries = dictionaries
        self.canonicals = {}

        kvs = defaultdict(OrderedDict)

        for language in address_phrase_dictionaries.languages:
            for dictionary_name in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in dictionary_name
                is_prefix_dictionary = 'prefixes' in dictionary_name

                for phrases in address_phrase_dictionaries.phrases.get((language, dictionary_name), []):
                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, language, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(language, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)])) for k, vals in kvs.iteritems() for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
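
The SUFFIX_KEY/PREFIX_KEY trick above stores suffix phrases reversed, so that a trie prefix search over a reversed token can match them from the end of the word. A small illustration of the idea (the sentinel values are assumptions, not the project's actual constants):

SUFFIX_KEY = u'\x02'  # assumed sentinel marking suffix entries
PREFIX_KEY = u'\x03'  # assumed sentinel marking prefix entries

suffix = u'strasse'
stored_key = SUFFIX_KEY + suffix[::-1]  # reversed, i.e. u'\x02essarts'

token = u'hauptstrasse'
# The reversed token shares a prefix with the stored key, so a
# prefix search in the trie finds the suffix entry:
assert (SUFFIX_KEY + token[::-1]).startswith(stored_key)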
Example #6
    def create_intersections(self, outfile):
        with open(outfile, 'w') as out:
            for node_id, node_props, ways in self.intersections():
                d = {'id': safe_encode(node_id),
                     'node': node_props,
                     'ways': ways}
                out.write(json.dumps(d) + six.u('\n'))
Example #7
    def to_tsv(self, filename, mode='w', max_rows=None):
        f = open(filename, mode)
        writer = csv.writer(f, delimiter='\t')
        for i, (k, v) in enumerate(Counter(self.vocab).most_common()):
            if max_rows is not None and i == max_rows:
                break

            gram = []
            for t, c in k:
                gram.append(t)
                if c != token_types.IDEOGRAPHIC_CHAR:
                    gram.append(six.text_type(' '))

            phrase = six.text_type('').join(gram)

            writer.writerow((safe_encode(phrase), safe_encode(len(k)), safe_encode(v)))
Example #8
def openaddresses_download_configured_files(out_dir):
    for path in openaddresses_config.sources:

        source = six.b('/').join([safe_encode(p) for p in path])
        filename = safe_encode(path[-1]) + six.b('.zip')
        zip_url_path = six.b('/').join([safe_encode(p) for p in path[:-1]] + [quote_plus(filename)])

        url = urljoin(OPENADDRESSES_LATEST_DIR, zip_url_path)

        download_pre_release_downloads(out_dir)

        print(six.u('doing {}').format(safe_decode(source)))
        success = download_and_unzip_file(url, out_dir)
        if not success:
            print(six.u('ERR: could not download {}').format(source))
Example #11
    def name_key_dist(self, props, component):
        object_type = props.get('type')
        object_id = safe_encode(props.get('id', ''))

        if (object_type, object_id) in self.exceptions:
            values, probs = self.exceptions[(object_type, object_id)]
            return values, probs

        name_keys, probs = self.component_name_keys.get(component, (self.name_keys, self.name_key_probs))
        return name_keys, probs
Example #12
    def include_component_simple(self, component, containing_ids, country=None):
        containing = self.get_property(('components', component, 'containing'), country=country, default=None)

        if containing is not None:
            for c in containing:
                if (c['type'], safe_encode(c['id'])) in containing_ids:
                    return random.random() < c['probability']

        probability = self.get_property(('components', component, 'probability'), country=country, default=0.0)

        return random.random() < probability
Example #14
def get_script_codes(all_scripts):

    if not os.path.exists(LOCAL_ISO_15924_FILE):
        # This comes as a .zip
        script_codes_response = requests.get(ISO_15924_URL)
        zf = ZipFile(StringIO(script_codes_response.content))
        iso15924_filename = [name for name in zf.namelist() if name.startswith('iso15924')][0]

        # Strip out the comments, etc.
        temp_iso15924_file = u'\n'.join([
            line.rstrip()
            for line in safe_decode(zf.read(iso15924_filename)).split('\n')
            if line.strip() and not line.strip().startswith('#')
        ])

        f = open(LOCAL_ISO_15924_FILE, 'w')
        f.write(safe_encode(temp_iso15924_file))
        f.close()

    script_codes_file = open(LOCAL_ISO_15924_FILE)

    script_codes = {}
    seen_scripts = set()

    # Scripts in the CLDR repos use 4-letter ISO-15924 codes, so map those
    for code, _, name, _, _, _ in csv.reader(script_codes_file, delimiter=';'):
        if name in all_scripts:
            script_codes[code] = name
            seen_scripts.add(name)
        else:
            normalized_name = name.split('(')[0].strip()
            if normalized_name in all_scripts and normalized_name not in seen_scripts:
                script_codes[code] = normalized_name
                seen_scripts.add(normalized_name)

    value_aliases = get_property_value_aliases()
    script_aliases = value_aliases['sc']

    for code, script in script_aliases.iteritems():
        if code not in script_codes and script in all_scripts:
            script_codes[code] = script

    script_codes.update(SCRIPT_ALIASES_SUPPLEMENTAL)

    return script_codes
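
For reference, rows in the iso15924 file are semicolon-delimited with six fields -- code, numeric code, English name, French name, PVA alias, and date -- which is what the six-way unpack above expects. A representative row (values shown for illustration):

import csv

sample = 'Latn;215;Latin;latin;Latin;2004-05-01'
code, _, name, _, _, _ = next(csv.reader([sample], delimiter=';'))
assert (code, name) == ('Latn', 'Latin')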
Example #15
def parse_osm(filename, allowed_types=ALL_OSM_TAGS, dependencies=False):
    '''
    Parse a file in .osm format iteratively, generating tuples like:
    ('node:1', OrderedDict([('lat', '12.34'), ('lon', '23.45')])),
    ('node:2', OrderedDict([('lat', '12.34'), ('lon', '23.45')])),
    ('node:3', OrderedDict([('lat', '12.34'), ('lon', '23.45')])),
    ('node:4', OrderedDict([('lat', '12.34'), ('lon', '23.45')])),
    ('way:4444', OrderedDict([('name', 'Main Street')]), [1,2,3,4])
    '''
    f = open(filename)
    parser = etree.iterparse(f)

    single_type = len(allowed_types) == 1

    for (_, elem) in parser:
        elem_id = long(elem.attrib.pop('id', 0))
        item_type = elem.tag
        if elem_id >= WAY_OFFSET and elem_id < RELATION_OFFSET:
            elem_id -= WAY_OFFSET
            item_type = 'way'
        elif elem_id >= RELATION_OFFSET:
            elem_id -= RELATION_OFFSET
            item_type = 'relation'

        if item_type in allowed_types:
            attrs = OrderedDict(elem.attrib)
            attrs['type'] = item_type
            attrs['id'] = safe_encode(elem_id)

            top_level_attrs = set(attrs)
            deps = [] if dependencies else None

            for e in elem.getchildren():
                if e.tag == 'tag':
                    # Prevent user-defined lat/lon keys from overriding the lat/lon on the node
                    key = e.attrib['k']
                    if key not in top_level_attrs:
                        attrs[key] = e.attrib['v']
                elif dependencies and item_type == 'way' and e.tag == 'nd':
                    deps.append(long(e.attrib['ref']))
                elif dependencies and item_type == 'relation' and e.tag == 'member' and 'role' in e.attrib:
                    deps.append((long(e.attrib['ref']), e.attrib.get('type'),
                                 e.attrib['role']))

            key = elem_id if single_type else '{}:{}'.format(
                item_type, elem_id)
            yield key, attrs, deps

        if elem.tag in ALL_OSM_TAGS:
            elem.clear()
            while elem.getprevious() is not None:
                del elem.getparent()[0]
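
A hypothetical call, assuming an .osm XML extract on disk (the file name is made up for illustration):

# Iterate ways only, collecting the node dependencies for each one:
for key, attrs, deps in parse_osm('extract.osm', allowed_types=('way',), dependencies=True):
    print('{} {} {}'.format(key, attrs.get('name'), len(deps or [])))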
Example #16
    def configure(self, base_dir=DICTIONARIES_DIR):
        kvs = defaultdict(OrderedDict)
        for lang in os.listdir(base_dir):
            for filename in self.dictionaries:
                is_suffix_dictionary = 'suffixes' in filename
                is_prefix_dictionary = 'prefixes' in filename

                dictionary_name = filename.split('.', 1)[0]

                path = os.path.join(base_dir, lang, filename)
                if not os.path.exists(path):
                    continue

                for line in open(path):
                    line = line.strip()
                    if not line:
                        continue

                    phrases = safe_decode(line).split(u'|')
                    if not phrases:
                        continue

                    canonical = phrases[0]
                    canonical_normalized = normalize_string(canonical)

                    self.canonicals[(canonical, lang, dictionary_name)] = phrases[1:]

                    for i, phrase in enumerate(phrases):

                        if phrase in POSSIBLE_ROMAN_NUMERALS:
                            continue

                        is_canonical = normalize_string(phrase) == canonical_normalized

                        if is_suffix_dictionary:
                            phrase = SUFFIX_KEY + phrase[::-1]
                        elif is_prefix_dictionary:
                            phrase = PREFIX_KEY + phrase

                        kvs[phrase][(lang, dictionary_name, canonical)] = is_canonical

        kvs = [(k, '|'.join([l, d, str(int(i)), safe_encode(c)]))
               for k, vals in kvs.iteritems()
               for (l, d, c), i in vals.iteritems()]

        self.trie = BytesTrie(kvs)
        self.configured = True
Example #18
    def __init__(self, boundaries_dir=OSM_BOUNDARIES_DIR):
        self.config = {}

        self.use_admin_center = {}

        for filename in os.listdir(boundaries_dir):
            if not filename.endswith('.yaml'):
                continue

            country_code = filename.rsplit('.yaml', 1)[0]
            data = yaml.load(open(os.path.join(boundaries_dir, filename)))

            for prop, values in six.iteritems(data):
                if not hasattr(values, 'items'):
                    # non-dict key
                    continue

                for k, v in six.iteritems(values):
                    if isinstance(v, six.string_types) and v not in AddressFormatter.address_formatter_fields:
                        raise ValueError(u'Invalid value in {} for prop={}, key={}: {}'.format(
                            filename, prop, k, v))

                if prop == 'overrides':
                    self.use_admin_center.update({
                        (r['type'], safe_encode(r['id'])):
                        r.get('probability', 1.0)
                        for r in values.get('use_admin_center', [])
                    })

                    containing_overrides = values.get('contained_by', {})

                    if not containing_overrides:
                        continue

                    for id_type, vals in six.iteritems(containing_overrides):
                        for element_id in vals:

                            override_config = vals[element_id]

                            config = deepcopy(data)
                            config.pop('overrides')

                            recursive_merge(config, override_config)

                            vals[element_id] = config

            self.config[country_code] = data
Example #19
    def download_dependencies(self, path):
        data = json.load(open(path))
        props = data['properties']

        _, filename = os.path.split(path)
        current_wof_id = filename.rsplit('.geojson', 1)[0]

        for hierarchy in props.get('wof:hierarchy', []):
            for key, wof_id in six.iteritems(hierarchy):
                wof_id = safe_encode(wof_id)

                if wof_id != current_wof_id and wof_id != '-1' and not self.client.exists_locally(wof_id):
                    if not self.client.download_file(wof_id):
                        print('error downloading {}'.format(wof_id))
                        continue
        return props.get('name')
Example #21
def get_wikipedia_titles(db):
    d = defaultdict(dict)

    cursor = db.execute(wikipedia_query)

    while True:
        batch = cursor.fetchmany(BATCH_SIZE)
        if not batch:
            break

        for (url, geonames_id, is_preferred) in batch:
            title = normalize_wikipedia_url(safe_encode(url))
            if title is not None and title.strip():
                title = utf8_normalize(normalize_name(title))
                d[title.lower()][geonames_id] = int(is_preferred or 0)

    return d
Example #26
    def data_and_dependencies(self, path):
        data = json.load(open(path))
        props = data['properties']

        _, filename = os.path.split(path)
        current_wof_id = filename.rsplit('.geojson', 1)[0]

        dependencies = {}

        for hierarchy in props.get('wof:hierarchy', []):
            for key, wof_id in six.iteritems(hierarchy):
                wof_id = safe_encode(wof_id)
                if wof_id in dependencies or wof_id == current_wof_id:
                    continue

                if not self.client.exists_locally(wof_id):
                    continue

                value = self.client.load(wof_id)

                # Only include properties, not all the polygon data
                dependencies[wof_id] = value.get('properties', {})

        return data, dependencies
Example #28
    def include_polygon(self, props):
        return ('ISO3166-1:alpha2' in props or 'ISO3166-2' in props or
                (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids)
Example #29
def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
    all_keys = []
    all_rules = []

    all_ordinal_indicators = []
    all_stopwords = []

    all_languages = []

    out = open(outfile, 'w')

    for filename in os.listdir(dirname):
        path = os.path.join(dirname, filename)
        if not os.path.isfile(path) or not filename.endswith('.yaml'):
            continue

        language = filename.split('.yaml', 1)[0]

        data = yaml.load(open(path))

        whole_words_only = data.get('whole_words_only', False)

        rules = data.get('rules', [])
        rule_index = len(all_rules)

        for rule in rules:
            invalid_keys = set(rule.keys()) - valid_numex_keys
            if invalid_keys:
                raise InvalidNumexRuleException(u'Invalid keys: ({}) for language {}, rule: {}'.format(u','.join(invalid_keys), language, rule))
            gender = gender_map[rule.get('gender')]
            rule_type = rule_type_map[rule['type']]
            key = rule['name']
            value = rule['value']
            radix = rule.get('radix', 10)
            rule_category = rule.get('category')
            category = category_map.get(rule_category)
            if category is None:
                continue
            left_context_type = left_context_map[rule.get('left')]
            right_context_type = right_context_map[rule.get('right')]
            all_keys.append(unicode(numex_key_template.format(key=key)))
            all_rules.append(unicode(numex_rule_template.format(
                language=language,
                rule_type=rule_type,
                gender=gender,
                category=category,
                left_context_type=left_context_type,
                right_context_type=right_context_type,
                value=value,
                radix=radix
            )))

        ordinal_indicator_index = len(all_ordinal_indicators)
        ordinal_indicators = data.get('ordinal_indicators', [])
        num_ordinal_indicators = 0

        for rule in ordinal_indicators:
            gender = gender_map[rule.get('gender')]
            category = category_map[rule.get('category')]
            invalid_ordinal_keys = set(rule.keys()) - valid_ordinal_keys
            if invalid_ordinal_keys:
                raise InvalidNumexRuleException(u'Invalid keys ({}) in ordinal rule for language {}, rule: {}'.format(u','.join(invalid_ordinal_keys), language, rule))

            for key, suffixes in rule['suffixes'].iteritems():
                for suffix in suffixes:
                    all_ordinal_indicators.append(unicode(ordinal_indicator_template.format(
                        key=key,
                        value=suffix,
                        gender=gender,
                        category=category
                    )))
                num_ordinal_indicators += len(suffixes)

        stopwords = data.get('stopwords', [])
        stopword_index = len(all_stopwords)
        num_stopwords = len(stopwords)

        for stopword in stopwords:
            all_keys.append(numex_key_template.format(key=unicode(stopword)))
            all_rules.append(stopword_rule)

        num_rules = len(rules) + len(stopwords)

        all_languages.append(unicode(language_template.format(
            language=language,
            whole_words_only=int(whole_words_only),
            rule_index=rule_index,
            num_rules=num_rules,
            ordinal_indicator_index=ordinal_indicator_index,
            num_ordinal_indicators=num_ordinal_indicators
        )))

    out.write(safe_encode(numex_rules_data_template.format(
        numex_keys=u''',
    '''.join(all_keys),
        numex_rules=u''',
    '''.join(all_rules),
        ordinal_indicator_rules=u''',
    '''.join(all_ordinal_indicators),
        stopwords=u''',
    '''.join(all_stopwords),
        languages=u''',
    '''.join(all_languages),
    )))

    out.close()
Example #30
    def intersections(self):
        '''
        Generator which yields tuples like:

        (node_id, lat, lon, {way_id: way_props})
        '''
        i = 0

        node_ids = array.array('l')
        node_counts = array.array('i')

        for element_id, props, deps in parse_osm(self.filename,
                                                 dependencies=True):
            props = {
                safe_decode(k): safe_decode(v)
                for k, v in six.iteritems(props)
            }
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                node_ids.append(node_id)
                node_counts.append(0)
                self.node_props.Put(safe_encode(node_id), json.dumps(props))
            elif element_id.startswith('way'):
                # Don't care about the ordering of the nodes, and want uniques e.g. for circular roads
                deps = set(deps)

                # Get node indices by binary search
                for node_id in deps:
                    try:
                        node_index = self.binary_search(node_ids, node_id)
                    except ValueError:
                        continue
                    if node_index is None:
                        continue
                    node_counts[node_index] += 1

            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(
                    element_id.split(':')[0], i))
            i += 1

        for i, count in enumerate(node_counts):
            if count > 1:
                self.node_ids.append(node_ids[i])

        del node_ids
        del node_counts

        i = 0

        for element_id, props, deps in parse_osm(self.filename,
                                                 dependencies=True):
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                node_index = self.binary_search(self.node_ids, node_id)
            elif element_id.startswith('way'):
                props = {
                    safe_decode(k): safe_decode(v)
                    for k, v in six.iteritems(props)
                }
                way_id = long(element_id.split(':')[-1])
                props['id'] = way_id
                for node_id in deps:
                    node_index = self.binary_search(self.node_ids, node_id)
                    if node_index is not None:
                        self.intersection_edges_nodes.append(node_id)
                        self.intersection_edges_ways.append(way_id)
                        self.way_props.Put(safe_encode(way_id),
                                           json.dumps(props))

            if i % 1000 == 0 and i > 0:
                self.logger.info('second pass, doing {}s, at {}'.format(
                    element_id.split(':')[0], i))
            i += 1

        i = 0

        indices = numpy.argsort(self.intersection_edges_nodes)
        self.intersection_edges_nodes = numpy.fromiter(
            (self.intersection_edges_nodes[i] for i in indices),
            dtype=numpy.uint64)
        self.intersection_edges_ways = numpy.fromiter(
            (self.intersection_edges_ways[i] for i in indices),
            dtype=numpy.uint64)
        del indices

        idx = 0

        # Group consecutive edge entries by node; both arrays were sorted together above
        for node_id, g in groupby(self.intersection_edges_nodes):
            group_len = sum((1 for j in g))

            node_props = json.loads(self.node_props.Get(safe_encode(node_id)))

            way_indices = self.intersection_edges_ways[idx:idx + group_len]
            all_ways = [
                json.loads(self.way_props.Get(safe_encode(w)))
                for w in way_indices
            ]
            way_names = set()
            ways = []
            for way in all_ways:
                if way['name'] in way_names:
                    continue
                ways.append(way)
                way_names.add(way['name'])

            idx += group_len

            if i % 1000 == 0 and i > 0:
                self.logger.info('checking intersections, did {}'.format(i))
            i += 1

            if len(ways) > 1:
                node_index = self.binary_search(self.node_ids, node_id)
                yield self.node_ids[node_index], node_props, ways
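
The self.binary_search helper used throughout this example isn't shown on this page; a plausible implementation over the sorted id arrays, using bisect (an assumption, not necessarily the project's actual code):

import bisect

def binary_search(a, x):
    # Return the index of x in the sorted sequence a, or None if absent.
    i = bisect.bisect_left(a, x)
    if i != len(a) and a[i] == x:
        return i
    return None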
Example #31
    def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
        config = yaml.load(open(config_file))

        default_names = nested_get(config, ('names', 'keys'))
        name_keys, probs = alternative_probabilities(default_names)

        self.name_keys = name_keys
        self.name_key_probs = cdf(probs)

        self.component_name_keys = {}

        for component, component_config in six.iteritems(nested_get(config, ('names', 'components'), default={})):
            component_names = component_config.get('keys')
            component_name_keys, component_probs = alternative_probabilities(component_names)
            self.component_name_keys[component] = (component_name_keys, cdf(component_probs))

        self.country_regex_replacements = defaultdict(list)
        for props in nested_get(config, ('names', 'regex_replacements',), default=[]):
            country = props.get('country')
            re_flags = re.I | re.UNICODE
            if not props.get('case_insensitive', True):
                re_flags ^= re.I

            pattern = re.compile(props['pattern'], re_flags)
            replace_group = props['replace_with_group']
            replace_probability = props['replace_probability']
            self.country_regex_replacements[country].append((pattern, replace_group, replace_probability))

        self.country_regex_replacements = dict(self.country_regex_replacements)

        self.prefixes = {}
        self.prefix_regexes = {}
        self.suffixes = {}
        self.suffix_regexes = {}

        for language, components in six.iteritems(nested_get(config, ('names', 'prefixes', 'language'), default={})):
            for component, affixes in six.iteritems(components):
                affix_values, probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'prefix' not in val:
                        raise AssertionError(six.u('Invalid prefix value for (language={}, component={}): {} ').format(language, component, val))

                prefix_regex = six.u('|').join([six.u('(?:{} )').format(self._string_as_regex(v['prefix'])) if v.get('whitespace') else self._string_as_regex(v['prefix']) for v in affix_values])
                self.prefix_regexes[(language, component)] = re.compile(six.u('^{}').format(prefix_regex), re.I | re.U)

                if not isclose(sum(probs), 1.0):
                    affix_values.append(None)
                    probs.append(1.0 - sum(probs))
                affix_probs_cdf = cdf(probs)
                self.prefixes[(language, component)] = affix_values, affix_probs_cdf

        for language, components in six.iteritems(nested_get(config, ('names', 'suffixes', 'language'), default={})):
            for component, affixes in six.iteritems(components):
                affix_values, probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'suffix' not in val:
                        raise AssertionError(six.u('Invalid suffix value for (language={}, component={}): {} ').format(language, component, val))

                suffix_regex = six.u('|').join([six.u('(?: {})').format(self._string_as_regex(v['suffix'])) if v.get('whitespace') else self._string_as_regex(v['suffix']) for v in affix_values])
                self.suffix_regexes[(language, component)] = re.compile(six.u('{}$').format(suffix_regex), re.I | re.U)

                if not isclose(sum(probs), 1.0):
                    affix_values.append(None)
                    probs.append(1.0 - sum(probs))
                affix_probs_cdf = cdf(probs)
                self.suffixes[(language, component)] = affix_values, affix_probs_cdf

        self.exceptions = {}

        for props in nested_get(config, ('names', 'exceptions'), default=[]):
            object_type = props['type']
            object_id = safe_encode(props['id'])
            keys = [props['default']]
            probs = [props['probability']]
            for alt in props.get('alternatives', []):
                keys.append(alt['alternative'])
                probs.append(alt['probability'])

            probs = cdf(probs)
            self.exceptions[(object_type, object_id)] = (keys, probs)
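
cdf and alternative_probabilities are helpers from the surrounding project; sampling from the resulting (keys, probs) pairs presumably works along these lines (the names and exact behavior here are assumptions):

import bisect
import random

def cdf(probs):
    # Running cumulative sum, e.g. [0.6, 0.3, 0.1] -> [0.6, 0.9, 1.0]
    total = 0.0
    out = []
    for p in probs:
        total += p
        out.append(total)
    return out

def weighted_choice(values, probs_cdf):
    # Draw one value according to the cumulative distribution.
    return values[bisect.bisect_left(probs_cdf, random.random())]

keys, probs = ['name', 'short_name'], cdf([0.8, 0.2])
print(weighted_choice(keys, probs))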
Example #32
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR,
                                        output_file=ADDRESS_DATA_FILE,
                                        header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in address_phrase_dictionaries.languages:
        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
            dictionary_type = gazetteer_types[dictionary_name]

            for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for i, p in enumerate(phrases):
                    language_canonical_dictionaries[(p, canonical_index if i > 0 else -1)].append(dictionary_type)

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
            max_dictionary_types = max(max_dictionary_types,
                                       len(dictionary_types))
            rule_template = address_expansion_rule_template.format(
                phrase=quote_string(phrase),
                num_dictionaries=str(len(dictionary_types)),
                dictionaries=', '.join(dictionary_types),
                canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(
            address_language_index_template.format(
                language=quote_string(language),
                index=language_index,
                length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types))
    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()
Example #33
def tokenize(s):
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u)]
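
The slicing happens over safe_encode(s) because the C tokenizer (_tokenize) reports byte offsets into the UTF-8 encoding, not character offsets. A toy stand-in makes that visible (the stand-in tokenizer and its type id are invented for illustration):

import re
import six

def _toy_tokenize(u):
    # Stand-in for libpostal's C tokenizer: yield (byte_offset, byte_length,
    # type_id) tuples over the UTF-8 encoding of u. Type id 0 is made up.
    s = u.encode('utf-8')
    for m in re.finditer(six.b(r'\S+'), s):
        yield m.start(), m.end() - m.start(), 0

u = u'\u5317\u4eac Main St'  # u'北京 Main St'
s = u.encode('utf-8')
# Byte offsets must be applied to the encoded string, then decoded back:
print([s[start:start + length].decode('utf-8')
       for start, length, _ in _toy_tokenize(u)])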
Example #34
def parse_numex_rules(dirname=NUMEX_DATA_DIR, outfile=NUMEX_RULES_FILE):
    all_keys = []
    all_rules = []

    all_ordinal_indicators = []
    all_stopwords = []

    all_languages = []

    out = open(outfile, 'w')

    for filename in os.listdir(dirname):
        path = os.path.join(dirname, filename)
        if not os.path.isfile(path) or not filename.endswith('.json'):
            continue

        language = filename.split('.json', 1)[0]

        data = json.load(open(path))

        whole_words_only = data.get('whole_words_only', False)

        rules = data.get('rules', [])
        rule_index = len(all_rules)

        for rule in rules:
            invalid_keys = set(rule.keys()) - valid_numex_keys
            if invalid_keys:
                raise InvalidNumexRuleException(
                    u'Invalid keys: ({}) for language {}, rule: {}'.format(
                        u','.join(invalid_keys), language, rule))
            gender = gender_map[rule.get('gender')]
            rule_type = rule_type_map[rule['type']]
            key = rule['name']
            value = rule['value']
            radix = rule.get('radix', 10)
            category = category_map[rule.get('category')]
            left_context_type = left_context_map[rule.get('left')]
            right_context_type = right_context_map[rule.get('right')]
            all_keys.append(unicode(numex_key_template.format(key=key)))
            all_rules.append(
                unicode(
                    numex_rule_template.format(
                        language=language,
                        rule_type=rule_type,
                        gender=gender,
                        category=category,
                        left_context_type=left_context_type,
                        right_context_type=right_context_type,
                        value=value,
                        radix=radix)))

        ordinal_indicator_index = len(all_ordinal_indicators)
        ordinal_indicators = data.get('ordinal_indicators', [])
        num_ordinal_indicators = 0

        for rule in ordinal_indicators:
            gender = gender_map[rule.get('gender')]
            category = category_map[rule.get('category')]
            invalid_ordinal_keys = set(rule.keys()) - valid_ordinal_keys
            if invalid_ordinal_keys:
                raise InvalidNumexRuleException(
                    u'Invalid keys ({}) in ordinal rule for language {}, rule: {}'
                    .format(u','.join(invalid_ordinal_keys), language, rule))

            for key, suffixes in rule['suffixes'].iteritems():
                for suffix in suffixes:
                    all_ordinal_indicators.append(
                        unicode(
                            ordinal_indicator_template.format(
                                key=key,
                                value=suffix,
                                gender=gender,
                                category=category)))
                num_ordinal_indicators += len(suffixes)

        stopwords = data.get('stopwords', [])
        stopword_index = len(all_stopwords)
        num_stopwords = len(stopwords)

        for stopword in stopwords:
            all_keys.append(numex_key_template.format(key=unicode(stopword)))
            all_rules.append(stopword_rule)

        num_rules = len(rules) + len(stopwords)

        all_languages.append(
            unicode(
                language_template.format(
                    language=language,
                    whole_words_only=int(whole_words_only),
                    rule_index=rule_index,
                    num_rules=num_rules,
                    ordinal_indicator_index=ordinal_indicator_index,
                    num_ordinal_indicators=num_ordinal_indicators)))

    out.write(
        safe_encode(
            numex_rules_data_template.format(
                numex_keys=u''',
    '''.join(all_keys),
                numex_rules=u''',
    '''.join(all_rules),
                ordinal_indicator_rules=u''',
    '''.join(all_ordinal_indicators),
                stopwords=u''',
    '''.join(all_stopwords),
                languages=u''',
    '''.join(all_languages),
            )))

    out.close()
Example #35
def encode_field(value):
    return multispace_regex.sub(' ', safe_encode(value if value is not None else ''))
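
multispace_regex isn't shown on this page; given how it's used, it's presumably something like the following (an assumption):

import re

multispace_regex = re.compile(r'\s{2,}')  # collapse runs of whitespace

assert multispace_regex.sub(' ', 'MAIN   ST  NW') == 'MAIN ST NW'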
Example #36
def abbreviate(gazetteer, s, language, abbreviate_prob=0.3, separate_prob=0.2, add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c)) for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in tokens]

    n = len(tokens)

    abbreviated = []

    i = 0

    def abbreviated_tokens(i, tokens, t, c, length, data, space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        n = len(t)

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob or not any((int(is_canonical) and lang in (language, 'all') for lang, dictionary, is_canonical, canonical in data)):
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])

                if j < n - 1:
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith('_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary))
                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
                # would require an audit of the dictionaries though so abbreviations are listed from
                # left-to-right by frequency of usage
                token = random.choice(abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token, tokens[i:i + len(t)], space_token=space_token)
                abbreviated.append(token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))

                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get((canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [safe_decode(d).split(six.u('|')) for d in (phrase or [])]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN, token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                break
        else:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)
        return abbreviated

    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)

            if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(raw_tokens[i + len(t) - 1][:2]):
                abbreviated.append(six.u(' '))

            i += len(t)

        else:
            token = tokens[i][0]
            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                sub_tokens = tokenize(non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c) for t, c in sub_tokens]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(sub_i, sub_tokens, t, c, length, data, space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            if abbrev_tokens and random.random() < add_period_hyphen_prob and not abbrev_tokens[-1].endswith(six.u('.')) and not abbrev_tokens[-1].lower().endswith(sub_tokens_norm[sub_i - 1][0]):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1

    return six.u('').join(abbreviated).strip()
Example #37
class OpenAddressesFormatter(object):
    field_regex_replacements = {
        # All fields
        None: [
            (re.compile(r'<\s*null\s*>', re.I), u''),
            (re.compile(r'[\s]{2,}'), six.u(' ')),
            (re.compile(r'\`'), u"'"),
            (re.compile(r'\-?\*'), u""),
        ],
        AddressFormatter.HOUSE_NUMBER: [
            # Most of the house numbers in Montreal start with "#"
            (re.compile(r'^#', re.UNICODE), u''),
            # Some house numbers have multiple hyphens
            (re.compile(r'[\-]{2,}'), u'-'),
            # Some house number ranges are split up like "12 -14"
            (re.compile(r'[\s]*\-[\s]*'), u'-'),
        ]
    }

    unit_type_regexes = {}

    for (lang, dictionary_type), values in six.iteritems(
            address_phrase_dictionaries.phrases):
        if dictionary_type == 'unit_types_numbered':
            unit_phrases = [
                safe_encode(p) for p in itertools.chain(*values) if len(p) > 2
            ]
            pattern = re.compile(
                r'\b(?:{})\s+(?:#?\s*)(?:[\d]+|[a-z]|[a-z]\-?[\d]+|[\d]+\-?[a-z])\s*$'
                .format(safe_encode('|').join(unit_phrases)),
                re.I | re.UNICODE)
            unit_type_regexes[lang] = pattern

    def __init__(self, components, country_rtree, debug=False):
        self.components = components
        self.country_rtree = country_rtree

        self.debug = debug

        self.formatter = AddressFormatter()

    class validators:
        @classmethod
        def validate_postcode(cls, postcode):
            '''
            Postcodes that are all zeros are improperly-formatted NULL values
            '''
            return not all((c in ('0', '-', '.', ' ', ',') for c in postcode))

        @classmethod
        def validate_street(cls, street):
            '''
            Streets should not be simple numbers. If they are, it's probably a
            copy/paste error and the value belongs in the house number field.
            '''
            return not is_numeric(street)

        @classmethod
        def validate_house_number(cls, house_number):
            '''
            House numbers are not necessarily numeric, but in some OpenAddresses
            data sets the house number field contains the capitalized street
            name, so this check at least guards against nonsensical values, at
            the cost of possibly missing a few houses numbered "A", etc.

            OpenAddresses also comes primarily from county GIS servers, which
            use a variety of database schemas and don't always handle NULLs
            well. While a single zero is a valid house number, in OpenAddresses
            it's more likely an error, and more than one zero almost certainly
            is.
            '''

            try:
                house_number = int(house_number.strip())
                return house_number > 0
            except (ValueError, TypeError):
                house_number = house_number.strip()
                return house_number and (
                    is_numeric(house_number)
                    or fraction_regex.match(house_number)
                    or number_space_letter_regex.match(house_number)
                    or number_slash_number_regex.match(house_number)
                    or number_fraction_regex.match(house_number)) and not all(
                        (c == '0' for c in house_number if c.isdigit()))

        @classmethod
        def validate_house_number_sin_numero(cls, house_number):
            if sin_numero_regex.match(house_number):
                return True
            return cls.validate_house_number(house_number)

        @classmethod
        def validate_russian_house_number(cls, house_number):
            if dom_korpus_stroyeniye_regex.match(house_number):
                return True
            elif uchastok_regex.match(house_number):
                return True
            elif bea_nomera_regex.match(house_number):
                return True
            return cls.validate_house_number(house_number)

        @classmethod
        def validate_colombian_house_number(cls, house_number):
            return True

        @classmethod
        def validate_chinese_house_number(cls, house_number):
            if not house_number:
                return False
            tokens = tokenize(house_number)

            if all((c in token_types.NUMERIC_TOKEN_TYPES or t in (u'号', u'栋',
                                                                  u'附'))
                   for t, c in tokens):
                return True
            return cls.validate_house_number(house_number)

    component_validators = {
        AddressFormatter.HOUSE_NUMBER: validators.validate_house_number,
        AddressFormatter.ROAD: validators.validate_street,
        AddressFormatter.POSTCODE: validators.validate_postcode,
    }

    language_validators = {
        SPANISH: {
            AddressFormatter.HOUSE_NUMBER:
            validators.validate_house_number_sin_numero,
        },
        PORTUGUESE: {
            AddressFormatter.HOUSE_NUMBER:
            validators.validate_house_number_sin_numero,
        },
        RUSSIAN: {
            AddressFormatter.HOUSE_NUMBER:
            validators.validate_russian_house_number,
        },
        CHINESE: {
            AddressFormatter.HOUSE_NUMBER:
            validators.validate_chinese_house_number,
        }
    }

    country_validators = {
        Countries.COLOMBIA: {
            AddressFormatter.HOUSE_NUMBER:
            validators.validate_colombian_house_number
        }
    }

    chinese_annex_regex = re.compile(u'([\d]+)(?![\d号栋])', re.U)

    @classmethod
    def format_chinese_house_number(cls, house_number):
        if not house_number:
            return house_number
        return cls.chinese_annex_regex.sub(u'\\1号', house_number)
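    # e.g. u'12' -> u'12号' under the regex above, while u'12号' is left
    # unchanged: the negative lookahead rejects digit runs already followed
    # by a digit, 号, or 栋.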

    @classmethod
    def format_colombian_house_number(cls, house_number):
        house_number = house_number.strip()
        match = colombian_standard_house_number_regex.match(house_number)
        if match:
            separator = random.choice((u'-', u' - ', u' '))

            cross_street, building_number = match.groups()

            numbers = []
            if cross_street and u' ' in cross_street and random.choice(
                (True, False)):
                cross_street = cross_street.replace(u' ', u'')

            if cross_street:
                numbers.append(cross_street)

            if building_number and u' ' in building_number and random.choice(
                (True, False)):
                building_number = building_number.replace(u' ', u'')

            if building_number:
                numbers.append(building_number)

            if numbers:
                house_number = separator.join(numbers)
                house_number_prefixes = (u'#', u'no.', u'no', u'nº')
                if random.choice((True, False)) and not any(
                    (house_number.lower().startswith(p)
                     for p in house_number_prefixes)):
                    house_number = u' '.join(
                        [random.choice(house_number_prefixes), house_number])

        return house_number

    def get_property(self, key, *configs):
        for config in configs:
            value = config.get(key, None)
            if value is not None:
                return value
        return None

    def cldr_country_name(self, country_code, language, configs):
        cldr_country_prob = float(
            self.get_property('cldr_country_probability', *configs))

        country_name = None

        if random.random() < cldr_country_prob:
            localized, iso_3166, alpha2, alpha3 = values = range(4)
            localized_prob = float(
                self.get_property('localized_name_probability', *configs))
            iso_3166_prob = float(
                self.get_property('iso_3166_name_probability', *configs))
            alpha2_prob = float(
                self.get_property('iso_alpha_2_code_probability', *configs))
            alpha3_prob = float(
                self.get_property('iso_alpha_3_code_probability', *configs))

            probs = cdf(
                [localized_prob, iso_3166_prob, alpha2_prob, alpha3_prob])

            country_type = weighted_choice(values, probs)

            country_name = country_code.upper()
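            # No explicit branch for country_type == alpha2 is needed:
            # country_name already defaults to the uppercased alpha-2 code.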
            if country_type == localized:
                country_name = country_names.localized_name(
                    country_code, language) or country_names.localized_name(
                        country_code) or country_name
            elif country_type == iso_3166:
                country_name = country_names.iso3166_name(country_code)
            elif country_type == alpha3:
                country_name = country_names.alpha3_code(
                    country_code) or country_name

        return country_name

    @classmethod
    def cleanup_number(cls, num, strip_commas=False):
        num = num.strip()
        if strip_commas:
            num = num.replace(six.u(','), six.u(''))
        try:
            num_int = int(num)
        except (ValueError, TypeError):
            try:
                num_float = float(num)
                leading_zeros = 0
                for c in num:
                    if c == six.u('0'):
                        leading_zeros += 1
                    else:
                        break
                num = safe_decode(int(num_float))
                if leading_zeros:
                    num = six.u('{}{}').format(six.u('0') * leading_zeros, num)
            except (ValueError, TypeError):
                pass
        return num

    @classmethod
    def fix_component_encodings(cls, components):
        return {
            k: ftfy.fix_encoding(safe_decode(v))
            for k, v in six.iteritems(components)
        }

    def formatted_addresses(self,
                            country_dir,
                            path,
                            configs,
                            tag_components=True):
        abbreviate_street_prob = float(
            self.get_property('abbreviate_street_probability', *configs))
        separate_street_prob = float(
            self.get_property('separate_street_probability', *configs) or 0.0)
        abbreviate_unit_prob = float(
            self.get_property('abbreviate_unit_probability', *configs))
        separate_unit_prob = float(
            self.get_property('separate_unit_probability', *configs) or 0.0)
        abbreviate_toponym_prob = float(
            self.get_property('abbreviate_toponym_probability', *configs))

        add_osm_boundaries = bool(
            self.get_property('add_osm_boundaries', *configs) or False)
        add_osm_neighborhoods = bool(
            self.get_property('add_osm_neighborhoods', *configs) or False)
        osm_neighborhood_overrides_city = self.get_property(
            'osm_neighborhood_overrides_city', *configs)
        non_numeric_units = bool(
            self.get_property('non_numeric_units', *configs) or False)
        house_number_strip_commas = bool(
            self.get_property('house_number_strip_commas', *configs) or False)
        numeric_postcodes_only = bool(
            self.get_property('numeric_postcodes_only', *configs) or False)
        postcode_strip_non_digit_chars = bool(
            self.get_property('postcode_strip_non_digit_chars', *configs)
            or False)

        address_only_probability = float(
            self.get_property('address_only_probability', *configs))
        place_only_probability = float(
            self.get_property('place_only_probability', *configs))
        place_and_postcode_probability = float(
            self.get_property('place_and_postcode_probability', *configs))

        city_replacements = self.get_property('city_replacements', *configs)

        override_country_dir = self.get_property('override_country_dir',
                                                 *configs)

        postcode_length = int(
            self.get_property('postcode_length', *configs) or 0)

        drop_address_probability = place_only_probability + place_and_postcode_probability

        ignore_rows_missing_fields = set(
            self.get_property('ignore_rows_missing_fields', *configs) or [])

        ignore_fields_containing = {
            field: re.compile(
                six.u('|').join(
                    [six.u('(?:{})').format(safe_decode(v)) for v in value]),
                re.I | re.UNICODE)
            for field, value in six.iteritems(
                dict(
                    self.get_property('ignore_fields_containing', *configs)
                    or {}))
        }

        alias_fields_containing = {
            field:
            [(re.compile(v['pattern'], re.I | re.UNICODE), v) for v in value]
            for field, value in six.iteritems(
                dict(
                    self.get_property('alias_fields_containing', *configs)
                    or {}))
        }

        config_language = self.get_property('language', *configs)

        add_components = self.get_property('add', *configs)

        fields = self.get_property('fields', *configs)
        if not fields:
            return

        field_map = {
            field_name: f['component']
            for field_name, f in six.iteritems(fields)
        }
        mapped_values = {
            f['component']: f['value_map']
            for f in six.itervalues(fields)
            if hasattr(f.get('value_map'), 'get')
        }

        f = open(path)
        reader = unicode_csv_reader(f)
        headers = next(reader)

        header_indices = {
            i: field_map[k]
            for i, k in enumerate(headers) if k in field_map
        }
        latitude_index = headers.index('LAT')
        longitude_index = headers.index('LON')

        # Clear cached polygons
        self.components.osm_admin_rtree.clear_cache()
        self.components.neighborhoods_rtree.clear_cache()

        for row in reader:
            try:
                latitude = float(row[latitude_index])
                longitude = float(row[longitude_index])
            except (ValueError, TypeError):
                continue

            language = config_language

            components = {}

            skip_record = False

            for i, key in six.iteritems(header_indices):
                value = row[i].strip()
                if not value and key in ignore_rows_missing_fields:
                    skip_record = True
                    break
                elif not value:
                    continue

                if key in mapped_values:
                    value = mapped_values[key].get(value, value)

                if key == AddressFormatter.ROAD and language == SPANISH:
                    value = self.components.spanish_street_name(value)

                if key == AddressFormatter.POSTCODE:
                    value = self.cleanup_number(value)

                    if postcode_strip_non_digit_chars:
                        value = six.u('').join(
                            (c for c in value if c.isdigit()))

                    if value and not is_numeric(
                            value) and numeric_postcodes_only:
                        continue
                    else:
                        if postcode_length:
                            value = value.zfill(
                                postcode_length)[:postcode_length]

                if key in AddressFormatter.BOUNDARY_COMPONENTS and key != AddressFormatter.POSTCODE:
                    if add_osm_boundaries:
                        continue
                    value = self.components.cleaned_name(
                        value, first_comma_delimited_phrase=True)
                    if value and ((len(value) < 2
                                   and not get_string_script(value)[0].lower()
                                   in ideographic_scripts)
                                  or is_numeric(value)):
                        continue

                if not_applicable_regex.match(value) or null_regex.match(
                        value) or unknown_regex.match(value):
                    continue

                for exp, sub_val in self.field_regex_replacements.get(key, []):
                    value = exp.sub(sub_val, value)

                for exp, sub_val in self.field_regex_replacements.get(
                        None, []):
                    value = exp.sub(sub_val, value)

                value = value.strip(', -')

                validator = self.country_validators.get(country_dir, {}).get(
                    key,
                    self.language_validators.get(language, {}).get(
                        key, self.component_validators.get(key, None)))

                if validator is not None and not validator(value):
                    continue

                if key in ignore_fields_containing and ignore_fields_containing[
                        key].search(value):
                    continue

                for (pattern, alias) in alias_fields_containing.get(key, []):
                    if pattern.search(value):
                        if 'component' in alias:
                            key = alias['component']

                if value:
                    components[key] = value

            if skip_record:
                continue

            if components:
                country, candidate_languages = self.country_rtree.country_and_languages(
                    latitude, longitude)
                if not (country and candidate_languages) or (
                        country != country_dir and not override_country_dir):
                    country = country_dir
                    candidate_languages = get_country_languages(country)
                    if not candidate_languages:
                        continue
                    candidate_languages = candidate_languages.items()

                components = self.fix_component_encodings(components)

                if language is None:
                    language = AddressComponents.address_language(
                        components, candidate_languages)

                street = components.get(AddressFormatter.ROAD, None)
                if street is not None:
                    street = street.strip()
                    street = AddressComponents.cleaned_name(street)

                    if language == UNKNOWN_LANGUAGE:
                        strip_unit_language = candidate_languages[0][
                            0] if candidate_languages else None
                    else:
                        strip_unit_language = language

                    street = self.components.strip_unit_phrases_for_language(
                        street, strip_unit_language)

                    street = abbreviate(street_types_gazetteer,
                                        street,
                                        language,
                                        abbreviate_prob=abbreviate_street_prob,
                                        separate_prob=separate_street_prob)
                    components[AddressFormatter.ROAD] = street

                house_number = components.get(AddressFormatter.HOUSE_NUMBER,
                                              None)
                if house_number:
                    house_number = self.cleanup_number(
                        house_number, strip_commas=house_number_strip_commas)

                    if language == CHINESE:
                        house_number = self.format_chinese_house_number(
                            house_number)

                    if country_dir == Countries.COLOMBIA:
                        house_number = self.format_colombian_house_number(
                            house_number)

                    if house_number is not None:
                        components[
                            AddressFormatter.HOUSE_NUMBER] = house_number

                unit = components.get(AddressFormatter.UNIT, None)

                street_required = country not in (
                    Countries.JAPAN, Countries.CZECH_REPUBLIC
                ) and country not in Countries.FORMER_SOVIET_UNION_COUNTRIES

                postcode = components.get(AddressFormatter.POSTCODE, None)

                if postcode:
                    components[AddressFormatter.
                               POSTCODE] = PostalCodes.add_country_code(
                                   postcode, country)

                # If there's a postcode, we can still use just the city/state/postcode, otherwise discard
                if (not street and street_required) or (
                        street and house_number and
                    (street.lower() == house_number.lower())) or (
                        unit and street and street.lower() == unit.lower()):
                    if not postcode:
                        continue
                    components = self.components.drop_address(components)

                # Now that checks, etc. are completed, fetch unit and add phrases, abbreviate, etc.
                unit = components.get(AddressFormatter.UNIT, None)

                if unit is not None:
                    if is_numeric_strict(unit):
                        unit = Unit.phrase(unit, language, country=country)
                    elif non_numeric_units:
                        unit = abbreviate(unit_types_gazetteer,
                                          unit,
                                          language,
                                          abbreviate_prob=abbreviate_unit_prob,
                                          separate_prob=separate_unit_prob)
                    else:
                        unit = None

                    if unit is not None:
                        components[AddressFormatter.UNIT] = unit
                    else:
                        components.pop(AddressFormatter.UNIT)
                        unit = None

                # CLDR country name
                country_name = self.cldr_country_name(country, language,
                                                      configs)
                if country_name:
                    components[AddressFormatter.COUNTRY] = country_name

                for component_key in AddressFormatter.BOUNDARY_COMPONENTS:
                    component = components.get(component_key, None)
                    if component is not None:
                        component = abbreviate(
                            toponym_abbreviations_gazetteer,
                            component,
                            language,
                            abbreviate_prob=abbreviate_toponym_prob)
                        component = self.components.name_hyphens(component)
                        components[component_key] = component

                # Any components specified to be added by the config (usually state)
                if add_components:
                    for k, v in six.iteritems(add_components):
                        if k not in components:
                            components[k] = v

                # Get named states occasionally, added component is usually a state code
                address_state = self.components.state_name(
                    components, country, language)
                if address_state:
                    components[AddressFormatter.STATE] = address_state

                state = components.get(AddressFormatter.STATE)
                if state:
                    state = self.components.abbreviated_state(
                        state, country, language)
                    if state:
                        components[AddressFormatter.STATE] = state

                # This is expensive, so only turn on for files that don't supply their own city names
                # or for which those names are flawed
                osm_components = []

                # Using population=0 instead of None means if there's no known population or
                # we don't need to add OSM components, we assume the population of the town is
                # very small and the place name shouldn't be used unqualified (i.e. needs information
                # like state name to disambiguate it)
                population = 0
                unambiguous_city = False
                if add_osm_boundaries or AddressFormatter.CITY not in components:
                    osm_components = self.components.osm_reverse_geocoded_components(
                        latitude, longitude)
                    self.components.add_admin_boundaries(
                        components, osm_components, country, language,
                        latitude, longitude)
                    categorized = self.components.categorized_osm_components(
                        country, osm_components)
                    for component, label in categorized:
                        if label == AddressFormatter.CITY:
                            unambiguous_city = self.components.unambiguous_wikipedia(
                                component, language)
                            if 'population' in component:
                                population = component['population']
                            break

                if AddressFormatter.CITY not in components and city_replacements:
                    components.update({
                        k: v
                        for k, v in six.iteritems(city_replacements)
                        if k not in components
                    })

                # The neighborhood index is cheaper so can turn on for whole countries
                neighborhood_components = []
                if add_osm_neighborhoods:
                    neighborhood_components = self.components.neighborhood_components(
                        latitude, longitude)
                    self.components.add_neighborhoods(
                        components,
                        neighborhood_components,
                        country,
                        language,
                        replace_city=osm_neighborhood_overrides_city)

                self.components.cleanup_boundary_names(components)
                self.components.country_specific_cleanup(components, country)

                self.components.replace_name_affixes(components,
                                                     language,
                                                     country=country)

                self.components.replace_names(components)

                self.components.prune_duplicate_names(components)

                self.components.remove_numeric_boundary_names(components)
                self.components.add_house_number_phrase(components,
                                                        language,
                                                        country=country)
                self.components.add_postcode_phrase(components,
                                                    language,
                                                    country=country)

                # Component dropout
                all_osm_components = osm_components + neighborhood_components
                components = place_config.dropout_components(
                    components,
                    all_osm_components,
                    country=country,
                    population=population,
                    unambiguous_city=unambiguous_city)

                self.components.add_genitives(components, language)

                formatted = self.formatter.format_address(
                    components,
                    country,
                    language=language,
                    minimal_only=False,
                    tag_components=tag_components)
                yield (language, country, formatted)

                if random.random() < address_only_probability and street:
                    address_only_components = self.components.drop_places(
                        components)
                    address_only_components = self.components.drop_postcode(
                        address_only_components)
                    formatted = self.formatter.format_address(
                        address_only_components,
                        country,
                        language=language,
                        minimal_only=False,
                        tag_components=tag_components)
                    yield (language, country, formatted)

                rand_val = random.random()

                if street and house_number and rand_val < drop_address_probability:
                    components = self.components.drop_address(components)

                    if rand_val < place_and_postcode_probability:
                        components = self.components.drop_postcode(components)

                    if components and (len(components) > 1
                                       or add_osm_boundaries):
                        formatted = self.formatter.format_address(
                            components,
                            country,
                            language=language,
                            minimal_only=False,
                            tag_components=tag_components)
                        yield (language, country, formatted)

    def build_training_data(self,
                            base_dir,
                            out_dir,
                            tag_components=True,
                            sources_only=None):
        all_sources_valid = sources_only is None
        valid_sources = set()
        if not all_sources_valid:
            for source in sources_only:
                if source.startswith(base_dir):
                    source = os.path.relpath(source, base_dir)

                parts = source.strip('/ ').split('/')
                if len(parts) > 3:
                    raise AssertionError(
                        'Sources may only have at maximum 3 parts')
                valid_sources.add(tuple(parts))

        if tag_components:
            out_filename = OPENADDRESSES_FORMAT_DATA_TAGGED_FILENAME
        else:
            out_filename = OPENADDRESSES_FORMAT_DATA_FILENAME
        formatted_tagged_file = open(os.path.join(out_dir, out_filename), 'w')
        writer = csv.writer(formatted_tagged_file, 'tsv_no_quote')

        i = 0

        for country_dir in sorted(openaddresses_config.country_configs.keys()):
            country_config = openaddresses_config.country_configs[country_dir]
            # Clear country cache for each new country
            self.country_rtree.clear_cache()

            for file_config in country_config.get('files', []):
                filename = file_config['filename']

                if not all_sources_valid and not (
                    (country_dir, filename) in valid_sources or
                    (country_dir, ) in valid_sources):
                    continue

                print(six.u('doing {}/{}').format(country_dir, filename))

                path = os.path.join(base_dir, country_dir, filename)
                configs = (file_config, country_config,
                           openaddresses_config.config)
                for language, country, formatted_address in self.formatted_addresses(
                        country_dir, path, configs,
                        tag_components=tag_components):
                    if not formatted_address or not formatted_address.strip():
                        continue

                    formatted_address = tsv_string(formatted_address)
                    if not formatted_address or not formatted_address.strip():
                        continue

                    if tag_components:
                        row = (language, country, formatted_address)
                    else:
                        row = (formatted_address, )

                    writer.writerow(row)
                    i += 1
                    if i % 1000 == 0 and i > 0:
                        print('did {} formatted addresses'.format(i))
                        if self.debug:
                            break

            for subdir in sorted(country_config.get('subdirs', {}).keys()):
                subdir_config = country_config['subdirs'][subdir]
                subdir = safe_decode(subdir)
                for file_config in subdir_config.get('files', []):
                    filename = file_config['filename']

                    if not all_sources_valid and not (
                        (country_dir, subdir, filename) in valid_sources or
                        (country_dir, subdir) in valid_sources or
                        (country_dir, ) in valid_sources):
                        continue

                    print(
                        six.u('doing {}/{}/{}').format(country_dir, subdir,
                                                       filename))

                    path = os.path.join(base_dir, country_dir, subdir,
                                        filename)

                    configs = (file_config, subdir_config, country_config,
                               openaddresses_config.config)
                    for language, country, formatted_address in self.formatted_addresses(
                            country_dir,
                            path,
                            configs,
                            tag_components=tag_components):
                        if not formatted_address or not formatted_address.strip(
                        ):
                            continue

                        formatted_address = tsv_string(formatted_address)
                        if not formatted_address or not formatted_address.strip(
                        ):
                            continue

                        if tag_components:
                            row = (language, country, formatted_address)
                        else:
                            row = (formatted_address, )

                        writer.writerow(row)

                        i += 1
                        if i % 1000 == 0 and i > 0:
                            print('did {} formatted addresses'.format(i))
                            if self.debug:
                                break
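
To make the field_regex_replacements table at the top of this class concrete, here is a small standalone sketch (not the class itself) of what the house-number rules do to raw OpenAddresses values:

import re

house_number_rules = [
    (re.compile(r'^#', re.UNICODE), u''),   # strip the leading '#'
    (re.compile(r'[\-]{2,}'), u'-'),        # collapse repeated hyphens
    (re.compile(r'[\s]*\-[\s]*'), u'-'),    # normalize ranges like '12 -14'
]

def clean_house_number(value):
    for pattern, replacement in house_number_rules:
        value = pattern.sub(replacement, value)
    return value

assert clean_house_number(u'#12 -14') == u'12-14'
assert clean_house_number(u'3--5') == u'3-5'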
Example #38
    def dropout_components(self,
                           components,
                           boundaries=(),
                           country=None,
                           population=None,
                           unambiguous_city=False):
        containing_ids = set()

        for boundary in boundaries:
            object_type = boundary.get('type')
            object_id = safe_encode(boundary.get('id', ''))
            if not (object_type and object_id):
                continue
            containing_ids.add((object_type, object_id))

        original_bitset = ComponentDependencies.component_bitset(components)

        names = defaultdict(list)
        admin_components = [
            c for c in components if c in self.ADMIN_COMPONENTS
        ]
        for c in admin_components:
            names[components[c]].append(c)

        same_name = set()
        for c, v in six.iteritems(names):
            if len(v) > 1:
                same_name |= set(v)

        new_components = components.copy()

        city_replacements = set()
        if AddressFormatter.CITY not in components:
            city_replacements = self.city_replacements(country)

        for component in admin_components:
            include = self.include_component(component,
                                             containing_ids,
                                             country=country,
                                             population=population,
                                             unambiguous_city=unambiguous_city)

            if not include and component not in city_replacements:
                # Note: this check is for cities that have the same name as their admin
                # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
                # city, we don't want to include country on its own. This should help the parser
                # default to the city in ambiguous cases where only one component is specified.
                if not (component == AddressFormatter.CITY
                        and component in same_name):
                    new_components.pop(component, None)
                else:
                    value = components[component]
                    for c in names[value]:
                        new_components.pop(c, None)

        for component in self.ADMIN_COMPONENTS:
            value = self.get_property(('components', component, 'value'),
                                      country=country,
                                      default=None)

            if not value:
                values, probs = self.cdf_cache.get((country, component),
                                                   (None, None))
                if values is None:
                    values = self.get_property(
                        ('components', component, 'values'),
                        country=country,
                        default=None)
                    if values is not None:
                        values, probs = zip(*[(v['value'],
                                               float(v['probability']))
                                              for v in values])
                        probs = cdf(probs)
                        self.cdf_cache[(country, component)] = (values, probs)

                if values is not None:
                    value = weighted_choice(values, probs)

            if value is not None and component not in components and self.include_component(
                    component,
                    containing_ids,
                    country=country,
                    population=population,
                    unambiguous_city=unambiguous_city):
                new_components[component] = value

        self.drop_invalid_components(new_components,
                                     country,
                                     original_bitset=original_bitset)

        return new_components
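
dropout_components draws default admin values through cdf and weighted_choice, whose definitions are not shown here. A minimal sketch of the presumed behavior (cumulative sums plus bisection over a uniform draw):

import bisect
import random

def cdf(probs):
    # Running cumulative sum, e.g. [0.7, 0.3] -> [0.7, 1.0]
    total = 0.0
    out = []
    for p in probs:
        total += p
        out.append(total)
    return out

def weighted_choice(values, cumulative_probs):
    # Bisect a uniform draw into the cumulative distribution
    x = random.random() * cumulative_probs[-1]
    return values[bisect.bisect(cumulative_probs, x)]

values = (u'NY', u'New York')
probs = cdf([0.7, 0.3])
print(weighted_choice(values, probs))  # 'NY' roughly 70% of the time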
Example #39
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in address_phrase_dictionaries.languages:
        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for dictionary_name in address_phrase_dictionaries.language_dictionaries[language]:
            dictionary_type = gazetteer_types[dictionary_name]

            for phrases in address_phrase_dictionaries.phrases[(language, dictionary_name)]:
                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for i, p in enumerate(phrases):
                    language_canonical_dictionaries[p, canonical_index if i > 0 else -1].append(dictionary_type)

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.items():
            max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
            rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
                                                                   num_dictionaries=str(len(dictionary_types)),
                                                                   dictionaries=', '.join(dictionary_types),
                                                                   canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(address_language_index_template.format(language=quote_string(language),
                                                                        index=language_index,
                                                                        length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types)
    )
    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()
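
The heart of create_address_expansion_rules_file is the grouping step: each (phrase, canonical index) pair accumulates every dictionary type it appears under, with index -1 marking the canonical form itself. A self-contained illustration, using a hypothetical dictionary type name:

from collections import defaultdict

groups = defaultdict(list)
canonical_strings = [u'street']

phrases = [u'street', u'st']   # phrases[0] is the canonical form
canonical_index = 0            # position of u'street' in canonical_strings
for i, p in enumerate(phrases):
    # The canonical phrase itself gets index -1; abbreviations point back
    groups[(p, canonical_index if i > 0 else -1)].append('DICTIONARY_STREET_TYPE')

assert dict(groups) == {(u'street', -1): ['DICTIONARY_STREET_TYPE'],
                        (u'st', 0): ['DICTIONARY_STREET_TYPE']}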
Example #40
 def create_intersections(self, outfile):
     out = open(outfile, 'w')
     for node_id, node_props, ways in self.intersections():
         d = {'id': safe_encode(node_id), 'node': node_props, 'ways': ways}
         out.write(json.dumps(d) + six.u('\n'))
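
create_intersections writes one JSON object per line, so the file can be streamed back without loading it whole. A minimal reader sketch:

import json

def read_intersections(path):
    # Yields (node_id, node_props, ways) from the line-delimited JSON above
    with open(path) as f:
        for line in f:
            record = json.loads(line)
            yield record['id'], record['node'], record['ways']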
Example #41
def abbreviate(gazetteer,
               s,
               language,
               abbreviate_prob=0.3,
               separate_prob=0.2,
               add_period_hyphen_prob=0.3):
    '''
    Abbreviations
    -------------

    OSM discourages abbreviations, but to make our training data map better
    to real-world input, we can safely replace the canonical phrase with an
    abbreviated version and retain the meaning of the words
    '''
    raw_tokens = tokenize_raw(s)
    s_utf8 = safe_encode(s)
    tokens = [(safe_decode(s_utf8[o:o + l]), token_types.from_id(c))
              for o, l, c in raw_tokens]
    norm_tokens = [(t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c)
                   for t, c in tokens]

    n = len(tokens)

    abbreviated = []

    i = 0

    def abbreviated_tokens(i,
                           tokens,
                           t,
                           c,
                           length,
                           data,
                           space_token=six.u(' ')):
        data = [d.split(six.b('|')) for d in data]

        # local copy
        abbreviated = []

        n = len(t)

        # Append the original tokens with whitespace if there is any
        if random.random() > abbreviate_prob or not any(
            (int(is_canonical) and lang in (language, 'all')
             for lang, dictionary, is_canonical, canonical in data)):
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])

                if j < n - 1:
                    abbreviated.append(space_token)
            return abbreviated

        for lang, dictionary, is_canonical, canonical in data:
            if lang not in (language, 'all'):
                continue

            is_canonical = int(is_canonical)
            is_stopword = dictionary == 'stopword'
            is_prefix = dictionary.startswith('concatenated_prefixes')
            is_suffix = dictionary.startswith('concatenated_suffixes')
            is_separable = is_prefix or is_suffix and dictionary.endswith(
                '_separable') and len(t[0][0]) > length

            suffix = None
            prefix = None

            if not is_canonical:
                continue

            if not is_prefix and not is_suffix:
                abbreviations = gazetteer.canonicals.get(
                    (canonical, lang, dictionary))
                # TODO: maybe make this a Zipfian choice e.g. so "St" gets chosen most often for "Street"
                # would require an audit of the dictionaries though so abbreviations are listed from
                # left-to-right by frequency of usage
                token = random.choice(
                    abbreviations) if abbreviations else canonical
                token = recase_abbreviation(token,
                                            tokens[i:i + len(t)],
                                            space_token=space_token)
                abbreviated.append(token)
                break
            elif is_prefix:
                token = tokens[i][0]
                prefix, token = token[:length], token[length:]

                abbreviated.append(prefix)
                if random.random() < separate_prob:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[0][1] in (token_types.HYPHEN,
                                                           token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[1:]))

                    abbreviated.append(space_token)
                if token.islower():
                    abbreviated.append(token.title())
                else:
                    abbreviated.append(token)
                abbreviated.append(space_token)
                break
            elif is_suffix:
                token = tokens[i][0]

                token, suffix = token[:-length], token[-length:]

                concatenated_abbreviations = gazetteer.canonicals.get(
                    (canonical, lang, dictionary), [])

                separated_abbreviations = []
                phrase = gazetteer.trie.get(suffix.rstrip('.'))
                suffix_data = [
                    safe_decode(d).split(six.u('|')) for d in (phrase or [])
                ]
                for l, d, _, c in suffix_data:
                    if l == lang and c == canonical:
                        separated_abbreviations.extend(
                            gazetteer.canonicals.get((canonical, lang, d)))

                separate = random.random() < separate_prob

                if concatenated_abbreviations and not separate:
                    abbreviation = random.choice(concatenated_abbreviations)
                elif separated_abbreviations:
                    abbreviation = random.choice(separated_abbreviations)
                else:
                    abbreviation = canonical

                if separate:
                    sub_tokens = tokenize(token)
                    if sub_tokens and sub_tokens[-1][1] in (token_types.HYPHEN,
                                                            token_types.DASH):
                        token = six.u('').join((t for t, c in sub_tokens[:-1]))

                abbreviated.append(token)
                if separate:
                    abbreviated.append(space_token)
                if suffix.isupper():
                    abbreviated.append(abbreviation.upper())
                elif separate:
                    abbreviated.append(abbreviation.title())
                else:
                    abbreviated.append(abbreviation)
                break
        else:
            for j, (t_i, c_i) in enumerate(t):
                abbreviated.append(tokens[i + j][0])
                if j < n - 1:
                    abbreviated.append(space_token)
        return abbreviated

    for t, c, length, data in gazetteer.filter(norm_tokens):
        if c == token_types.PHRASE:
            abbrev_tokens = abbreviated_tokens(i, tokens, t, c, length, data)
            abbreviated.extend(abbrev_tokens)

            if i + len(t) < n and raw_tokens[i + len(t)][0] > sum(
                    raw_tokens[i + len(t) - 1][:2]):
                abbreviated.append(six.u(' '))

            i += len(t)

        else:
            token = tokens[i][0]
            if not non_breaking_dash_regex.search(token):
                abbreviated.append(token)
            else:
                sub_tokens = tokenize(
                    non_breaking_dash_regex.sub(six.u(' '), token))
                sub_tokens_norm = [
                    (t.lower() if c in token_types.WORD_TOKEN_TYPES else t, c)
                    for t, c in sub_tokens
                ]

                sub_token_abbreviated = []
                sub_i = 0
                sub_n = len(sub_tokens)
                for t, c, length, data in gazetteer.filter(sub_tokens_norm):
                    if c == token_types.PHRASE:
                        abbrev_tokens = abbreviated_tokens(
                            sub_i,
                            sub_tokens,
                            t,
                            c,
                            length,
                            data,
                            space_token=six.u('-'))
                        sub_token_abbreviated.extend(abbrev_tokens)
                        sub_i += len(t)
                        if sub_i < sub_n:
                            last_token = abbrev_tokens[-1] if abbrev_tokens else None
                            if (last_token is not None
                                    and random.random() < add_period_hyphen_prob
                                    and not last_token.endswith(six.u('.'))
                                    and not last_token.lower().endswith(
                                        sub_tokens_norm[sub_i - 1][0])):
                                sub_token_abbreviated.append(six.u('.'))
                            sub_token_abbreviated.append(six.u('-'))
                    else:
                        sub_token_abbreviated.append(sub_tokens[sub_i][0])
                        sub_i += 1
                        if sub_i < sub_n:
                            sub_token_abbreviated.append(six.u('-'))

                abbreviated.append(six.u('').join(sub_token_abbreviated))

            if i < n - 1 and raw_tokens[i + 1][0] > sum(raw_tokens[i][:2]):
                abbreviated.append(six.u(' '))
            i += 1

    return six.u('').join(abbreviated).strip()
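
Both copies of abbreviated_tokens rely on Python's for/else: the else block runs only when the loop completes without hitting break, i.e. when no dictionary entry produced an abbreviation and the original tokens should be emitted unchanged. A minimal illustration of the pattern:

def first_even(numbers):
    for n in numbers:
        if n % 2 == 0:
            result = n
            break
    else:
        # Runs only if the loop was never broken out of
        result = None
    return result

assert first_even([1, 3, 4]) == 4
assert first_even([1, 3, 5]) is None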
Example #42
def create_geonames_tsv(db, out_dir=DEFAULT_DATA_DIR):
    '''
    Writes geonames.tsv using the specified db to the specified data directory
    '''
    filename = os.path.join(out_dir, 'geonames.tsv')
    temp_filename = filename + '.tmp'

    f = open(temp_filename, 'w')

    writer = csv.writer(f, 'tsv_no_quote')

    init_languages()

    init_country_names()

    wiki_titles = get_wikipedia_titles(db)
    logging.info('Fetched Wikipedia titles')

    # Iterate over GeoNames boundary types from largest (country) to smallest (neighborhood)
    for boundary_type, codes in geonames_admin_dictionaries.items():
        if boundary_type != boundary_types.COUNTRY:
            predicate = 'where gn.feature_code in ({codes})'.format(
                codes=','.join(['"{}"'.format(c) for c in codes])
            )
        else:
            # The query for countries in GeoNames is somewhat non-trivial
            predicate = 'where gn.geonames_id in (select geonames_id from countries)'

        query = base_geonames_query.format(
            predicate=predicate
        )

        cursor = db.execute(query)
        i = 1
        while True:
            # Fetch rows in batches to save memory
            batch = cursor.fetchmany(BATCH_SIZE)
            if not batch:
                break
            rows = []
            for row in batch:
                row = list(row)
                row[DUMMY_BOUNDARY_TYPE_INDEX] = boundary_type

                language = row[LANGUAGE_INDEX]

                country_code = row[COUNTRY_CODE_INDEX]

                is_preferred = int(row[PREFERRED_INDEX] or 0)
                is_historical = int(row[HISTORICAL_INDEX] or 0)

                lang_spoken = get_country_languages(country_code.lower(), official=False).get(language, None)
                lang_official = get_country_languages(country_code.lower()).get(language, None) == 1
                null_language = not language.strip()

                is_canonical = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX]

                alpha2_code = None
                is_orig_name = False

                if boundary_type == boundary_types.COUNTRY:
                    alpha2_code = row[COUNTRY_CODE_INDEX]

                    is_orig_name = row[NAME_INDEX] == row[CANONICAL_NAME_INDEX] and row[LANGUAGE_INDEX] == ''
                    # Set the canonical for countries to the local name, see country_official_name in country_names.py
                    country_canonical = country_localized_display_name(alpha2_code.lower())
                    if not country_canonical or not country_canonical.strip():
                        raise ValueError('Could not get local canonical name for country code={}'.format(alpha2_code))
                    row[CANONICAL_NAME_INDEX] = country_canonical

                geonames_id = row[GEONAMES_ID_INDEX]

                name = utf8_normalize(safe_decode(row[NAME_INDEX]))

                # Skip purely numeric names (likely stray postal codes or data errors)
                if name.isdigit():
                    continue

                wikipedia_entries = wiki_titles.get(name.lower(), wiki_titles.get(normalize_name(name.lower()), {}))

                row[NAME_INDEX] = name

                if boundary_type == boundary_types.COUNTRY:
                    norm_name = normalize_name(name.lower())
                    for s, repl in saint_replacements:
                        if not wikipedia_entries:
                            wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})

                wiki_row = []

                have_wikipedia = geonames_id in wikipedia_entries
                wiki_preferred = wikipedia_entries.get(geonames_id, 0)

                '''
                The following set of heuristics assigns a numerical value to a given name
                alternative, such that in the case of ambiguous names, this value can be
                used as part of the ranking function (as indeed it will be during sort).
                The higher the value, the more likely the given entity resolution.
                '''
                if is_historical:
                    # Historical names, unlikely to be used
                    language_priority = 0
                elif not null_language and language != 'abbr' and lang_spoken is None:
                    # Name of a place in language not widely spoken e.g. Japanese name for a US toponym
                    language_priority = 1
                elif null_language and not is_preferred and not is_canonical:
                    # Null-language alternate names not marked as preferred, dubious
                    language_priority = 2
                elif language == 'abbr' and not is_preferred:
                    # Abbreviation, not preferred
                    language_priority = 3
                elif language == 'abbr' and is_preferred:
                    # Abbreviation, preferred e.g. NYC, UAE
                    language_priority = 4
                elif lang_spoken and not lang_official and not is_preferred:
                    # Non-preferred name but in a spoken (non-official) language
                    language_priority = 5
                elif lang_official == 1 and not is_preferred:
                    # Name in an official language, not preferred
                    language_priority = 6
                elif null_language and not is_preferred and is_canonical:
                    # Canonical name, may be overly official e.g. Islamic Republic of Pakistan
                    language_priority = 7
                elif is_preferred and not lang_official:
                    # Preferred names, not an official language
                    language_priority = 8
                elif is_preferred and lang_official:
                    # Official language preferred
                    language_priority = 9

                row[DUMMY_LANGUAGE_PRIORITY_INDEX] = language_priority

                if have_wikipedia:
                    wiki_row = row[:]
                    wiki_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1
                    rows.append(map(encode_field, wiki_row))

                canonical = utf8_normalize(safe_decode(row[CANONICAL_NAME_INDEX]))
                row[POPULATION_INDEX] = int(row[POPULATION_INDEX] or 0)

                have_normalized = False

                if is_orig_name:
                    canonical_row = wiki_row[:] if have_wikipedia else row[:]

                    canonical_row_name = normalize_display_name(name)
                    if canonical_row_name != name:
                        canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
                        have_normalized = True
                        rows.append(map(encode_field, canonical_row))

                if not have_wikipedia:
                    rows.append(map(encode_field, row))

                # Country names have more specialized logic
                if boundary_type == boundary_types.COUNTRY:
                    wikipedia_entries = wiki_titles.get(canonical.lower(), {})

                    canonical_row_name = normalize_display_name(canonical)

                    canonical_row = row[:]

                    if is_orig_name:
                        canonical = safe_decode(canonical)
                        canonical_row[NAME_INDEX] = safe_encode(canonical)

                        norm_name = normalize_name(canonical.lower())
                        for s, repl in saint_replacements:
                            if not wikipedia_entries:
                                wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})

                        if not wikipedia_entries:
                            norm_name = normalize_name(canonical_row_name.lower())
                            for s, repl in saint_replacements:
                                if not wikipedia_entries:
                                    wikipedia_entries = wiki_titles.get(norm_name.replace(s, repl), {})

                        have_wikipedia = geonames_id in wikipedia_entries
                        wiki_preferred = wikipedia_entries.get(geonames_id, 0)

                        if have_wikipedia:
                            canonical_row[DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX] = wiki_preferred + 1

                        if name != canonical:
                            rows.append(map(encode_field, canonical_row))

                    if canonical_row_name != canonical and canonical_row_name != name:
                        canonical_row[NAME_INDEX] = safe_encode(canonical_row_name)
                        rows.append(map(encode_field, canonical_row))

                    if alpha2_code and is_orig_name:
                        alpha2_row = row[:]
                        alpha2_row[NAME_INDEX] = alpha2_code
                        alpha2_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                        rows.append(map(encode_field, alpha2_row))

                    if alpha2_code.lower() in country_code_alpha3_map and is_orig_name:
                        alpha3_row = row[:]
                        alpha3_row[NAME_INDEX] = country_code_alpha3_map[alpha2_code.lower()]
                        alpha3_row[DUMMY_LANGUAGE_PRIORITY_INDEX] = 10
                        rows.append(map(encode_field, alpha3_row))

            writer.writerows(rows)
            i += 1
            logging.info('Did {} batches'.format(i))

        cursor.close()
        f.flush()

    f.close()

    logging.info('Sorting...')

    env = os.environ.copy()
    env['LC_ALL'] = 'C'

    command = ['sort', '-t\t', '-u', '--ignore-case',
               '-k{0},{0}'.format(NAME_INDEX + 1),
               # If there's a Wikipedia link to this name for the given id, sort first
               '-k{0},{0}nr'.format(DUMMY_HAS_WIKIPEDIA_ENTRY_INDEX + 1),
               # Language priority rules as above
               '-k{0},{0}nr'.format(DUMMY_LANGUAGE_PRIORITY_INDEX + 1),
               # Sort descending by population (basic proxy for relevance)
               '-k{0},{0}nr'.format(POPULATION_INDEX + 1),
               # group rows for the same geonames ID together
               '-k{0},{0}'.format(GEONAMES_ID_INDEX + 1),
               # preferred names come first within that grouping
               '-k{0},{0}nr'.format(PREFERRED_INDEX + 1),
               # since uniquing is done on the sort key, add language
               '-k{0},{0}'.format(LANGUAGE_INDEX + 1),
               '-o', filename, temp_filename]

    p = subprocess.Popen(command, env=env)

    return_code = p.wait()
    if return_code != 0:
        raise subprocess.CalledProcessError(return_code, command)

    os.unlink(temp_filename)
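Since sort -u uniques on the concatenation of the -k key fields (not the whole line), appending LANGUAGE_INDEX keeps distinct languages from being collapsed. A rough in-memory sketch of the same ordering-plus-dedupe, purely illustrative (the real pipeline defers to GNU sort so the file never has to fit in memory; it assumes the flag/priority/population fields cast cleanly to int):

from itertools import groupby

def sort_unique_rows(rows, name_i, wiki_i, priority_i, pop_i, id_i, pref_i, lang_i):
    # name asc (case-folded), wikipedia flag desc, language priority desc,
    # population desc, geonames id asc, preferred desc, language asc
    key = lambda r: (r[name_i].lower(), -int(r[wiki_i]), -int(r[priority_i]),
                     -int(r[pop_i]), r[id_i], -int(r[pref_i]), r[lang_i])
    # like sort -u: keep one row per distinct key tuple
    return [next(g) for _, g in groupby(sorted(rows, key=key), key=key)]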
Example #43
def tokenize(s, whitespace=False):
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]), token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u, whitespace)]
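Illustrative call only; the concrete token type constants come from the _tokenize C extension, so the names shown here are assumptions about its output shape:

tokens = tokenize(u'123 Main Street')
# e.g. [(u'123', token_types.NUMERIC), (u'Main', token_types.WORD),
#       (u'Street', token_types.WORD)]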
Example #44
def include_polygon(self, props):
    return ('ISO3166-1:alpha2' in props or 'ISO3166-2' in props or
            (props.get('type', 'relation'), safe_encode(props.get('id', ''))) in osm_admin1_ids)
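A hedged illustration of what the predicate admits (the ids are made up; importer stands in for an instance of the class this method was scraped from):

importer.include_polygon({'ISO3166-1:alpha2': 'DE'})           # True: country code present
importer.include_polygon({'ISO3166-2': 'US-NY'})               # True: region code present
importer.include_polygon({'type': 'relation', 'id': '12345'})  # True only if ('relation', '12345') is in osm_admin1_ids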
Example #45
@classmethod
def path_and_filename(cls, wof_id):
    id_str = safe_encode(wof_id)
    n = 3
    parts = [id_str[i:i + n] for i in six.moves.xrange(0, len(id_str), n)]
    filename = six.u('{}.geojson').format(wof_id)
    return six.u('/').join(parts), filename
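A worked example of the chunking (deterministic from the code above; the receiving class name is hypothetical):

>>> WOFStore.path_and_filename(85633793)
(u'856/337/93', u'85633793.geojson')

i.e. the id is split into 3-character path segments, with any remainder as the final segment.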
Example #46
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR,
                                        output_file=ADDRESS_DATA_FILE,
                                        header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in os.listdir(base_dir):
        language_dir = os.path.join(base_dir, language)

        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for filename in os.listdir(language_dir):
            # use os.path.splitext rather than rstrip('.txt'): rstrip strips a
            # character set, not a suffix (e.g. 'street.txt' -> 'stree')
            base, ext = os.path.splitext(filename)
            dictionary_name = base.lower()
            if ext != '.txt' or '.' in dictionary_name:
                raise InvalidAddressFileException(
                    u'Invalid extension for file {}/{}, must be .txt'.format(
                        language, filename))

            if dictionary_name not in gazetteer_types:
                raise InvalidAddressFileException(
                    u'Invalid filename for file {}/{}. Must be one of {{{}}}'.
                    format(language, filename, ', '.join(gazetteer_types)))

            dictionary_type = gazetteer_types[dictionary_name]

            f = open(os.path.join(language_dir, filename))
            for i, line in enumerate(f):
                line = safe_decode(line.rstrip())
                if not line.strip():
                    continue

                if u'}' in line:
                    raise InvalidAddressFileException(
                        u'Found }} in file: {}/{}, line {}'.format(
                            language, filename, i + 1))
                phrases = line.split(u'|')
                if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
                    raise InvalidAddressFileException(
                        u'Found blank synonym in: {}/{}, line {}'.format(
                            language, filename, i + 1))

                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for j, p in enumerate(phrases):
                    # only synonyms (j > 0) reference the canonical string's index
                    language_canonical_dictionaries[
                        (p, canonical_index if j > 0 else -1)].append(
                            dictionary_type)

            f.close()

        for (
                phrase, canonical_index
        ), dictionary_types in language_canonical_dictionaries.iteritems():
            max_dictionary_types = max(max_dictionary_types,
                                       len(dictionary_types))
            rule_template = address_expansion_rule_template.format(
                phrase=quote_string(phrase),
                num_dictionaries=str(len(dictionary_types)),
                dictionaries=', '.join(dictionary_types),
                canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(
            address_language_index_template.format(
                language=quote_string(language),
                index=language_index,
                length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types))
    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()
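For context, each input dictionary is a plain-text file of pipe-delimited synonym groups with the canonical form first, which is exactly the shape the parsing above enforces (no blank synonyms, no '}' characters). An illustrative fragment in the style of en/street_types.txt:

street|st
avenue|ave|av
boulevard|blvd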
Example #47
    def intersections(self):
        '''
        Generator which yields tuples like:

        (node_id, node_props, ways)

        where node_props are the decoded tags of the intersection node and
        ways is a list of property dicts, one per distinctly-named way
        meeting at that node.
        '''
        i = 0

        node_ids = array.array('l')
        node_counts = array.array('i')

        for element_id, props, deps in parse_osm(self.filename, dependencies=True):
            props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                node_ids.append(node_id)
                node_counts.append(0)
                self.node_props.Put(safe_encode(node_id), json.dumps(props))
            elif element_id.startswith('way'):
                # Don't care about the ordering of the nodes, and want uniques e.g. for circular roads
                deps = set(deps)

                # Get node indices by binary search
                for node_id in deps:
                    try:
                        node_index = self.binary_search(node_ids, node_id)
                    except ValueError:
                        continue
                    if node_index is None:
                        continue
                    node_counts[node_index] += 1

            if i % 1000 == 0 and i > 0:
                self.logger.info('doing {}s, at {}'.format(element_id.split(':')[0], i))
            i += 1

        for i, count in enumerate(node_counts):
            if count > 1:
                self.node_ids.append(node_ids[i])

        del node_ids
        del node_counts

        i = 0

        for element_id, props, deps in parse_osm(self.filename, dependencies=True):
            if element_id.startswith('node'):
                node_id = long(element_id.split(':')[-1])
                node_index = self.binary_search(self.node_ids, node_id)
            elif element_id.startswith('way'):
                props = {safe_decode(k): safe_decode(v) for k, v in six.iteritems(props)}
                way_id = long(element_id.split(':')[-1])
                props['id'] = way_id
                for node_id in deps:
                    node_index = self.binary_search(self.node_ids, node_id)
                    if node_index is not None:
                        self.intersection_edges_nodes.append(node_id)
                        self.intersection_edges_ways.append(way_id)
                        self.way_props.Put(safe_encode(way_id), json.dumps(props))

            if i % 1000 == 0 and i > 0:
                self.logger.info('second pass, doing {}s, at {}'.format(element_id.split(':')[0], i))
            i += 1

        i = 0

        indices = numpy.argsort(self.intersection_edges_nodes)
        self.intersection_edges_nodes = numpy.fromiter((self.intersection_edges_nodes[i] for i in indices), dtype=numpy.uint64)
        self.intersection_edges_ways = numpy.fromiter((self.intersection_edges_ways[i] for i in indices), dtype=numpy.uint64)
        del indices

        idx = 0

        # intersection_edges_nodes is sorted above, so consecutive runs share a node id
        for node_id, g in groupby(self.intersection_edges_nodes):
            group_len = sum((1 for j in g))

            node_props = json.loads(self.node_props.Get(safe_encode(node_id)))

            way_indices = self.intersection_edges_ways[idx:idx + group_len]
            all_ways = [json.loads(self.way_props.Get(safe_encode(w))) for w in way_indices]
            way_names = set()
            ways = []
            for way in all_ways:
                if way['name'] in way_names:
                    continue
                ways.append(way)
                way_names.add(way['name'])

            idx += group_len

            if i % 1000 == 0 and i > 0:
                self.logger.info('checking intersections, did {}'.format(i))
            i += 1

            if len(ways) > 1:
                node_index = self.binary_search(self.node_ids, node_id)
                yield self.node_ids[node_index], node_props, ways
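A hedged usage sketch; reader stands in for an instance of whatever class defines intersections(), constructed elsewhere:

for node_id, node_props, ways in reader.intersections():
    # every yielded node is shared by at least two distinctly-named ways
    print(u'{}: {}'.format(node_id, u' / '.join(w['name'] for w in ways)))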
Example #48
def create_address_expansion_rules_file(base_dir=ADDRESS_EXPANSIONS_DIR, output_file=ADDRESS_DATA_FILE, header_file=ADDRESS_HEADER_FILE):
    address_languages = []
    expansion_rules = []
    canonical_strings = []

    max_dictionary_types = 0

    for language in os.listdir(base_dir):
        language_dir = os.path.join(base_dir, language)

        num_language_rules = 0
        language_index = len(expansion_rules)

        language_canonical_dictionaries = defaultdict(list)
        canonical_indices = {}

        for filename in os.listdir(language_dir):
            # use os.path.splitext rather than rstrip('.txt'): rstrip strips a character set, not a suffix
            base, ext = os.path.splitext(filename)
            dictionary_name = base.lower()
            if ext != '.txt' or '.' in dictionary_name:
                raise InvalidAddressFileException(u'Invalid extension for file {}/{}, must be .txt'.format(language, filename))

            if dictionary_name not in gazetteer_types:
                raise InvalidAddressFileException(u'Invalid filename for file {}/{}. Must be one of {{{}}}'.format(language, filename, ', '.join(gazetteer_types)))

            dictionary_type = gazetteer_types[dictionary_name]

            f = open(os.path.join(language_dir, filename))
            for i, line in enumerate(f):
                line = safe_decode(line.rstrip())
                if not line.strip():
                    continue

                if u'}' in line:
                    raise InvalidAddressFileException(u'Found }} in file: {}/{}, line {}'.format(language, filename, i+1))
                phrases = line.split(u'|')
                if sum((1 for p in phrases if len(p.strip()) == 0)) > 0:
                    raise InvalidAddressFileException(u'Found blank synonym in: {}/{}, line {}'.format(language, filename, i+1))

                canonical = phrases[0]
                if len(phrases) > 1:
                    canonical_index = canonical_indices.get(canonical, None)
                    if canonical_index is None:
                        canonical_index = len(canonical_strings)
                        canonical_strings.append(quote_string(canonical))
                        canonical_indices[canonical] = canonical_index
                else:
                    canonical_index = -1

                for p in phrases:
                    language_canonical_dictionaries[(p, canonical_index)].append(dictionary_type)
            f.close()

        for (phrase, canonical_index), dictionary_types in language_canonical_dictionaries.iteritems():
            max_dictionary_types = max(max_dictionary_types, len(dictionary_types))
            rule_template = address_expansion_rule_template.format(phrase=quote_string(phrase),
                                                                   num_dictionaries=str(len(dictionary_types)),
                                                                   dictionaries=', '.join(dictionary_types),
                                                                   canonical_index=canonical_index)
            expansion_rules.append(rule_template)
            num_language_rules += 1

        address_languages.append(address_language_index_template.format(language=quote_string(language),
                                                                        index=language_index,
                                                                        length=num_language_rules))

    header = address_expansion_rule_header_template.format(
        max_dictionary_types=str(max_dictionary_types)
    )
    out = open(header_file, 'w')
    out.write(safe_encode(header))
    out.close()

    data_file = address_expansion_data_file_template.format(
        canonical_strings=u''',
    '''.join(canonical_strings),
        expansion_rules=u''',
    '''.join(expansion_rules),
        address_languages=u''',
    '''.join(address_languages),
    )

    out = open(output_file, 'w')
    out.write(safe_encode(data_file))
    out.close()
Example #49
    def __init__(self, config_file=BOUNDARY_NAMES_CONFIG):
        config = yaml.safe_load(open(config_file))

        default_names = nested_get(config, ('names', 'keys'))
        name_keys, probs = alternative_probabilities(default_names)

        self.name_keys = name_keys
        self.name_key_probs = cdf(probs)

        self.component_name_keys = {}

        for component, component_config in six.iteritems(
                nested_get(config, ('names', 'components'), default={})):
            component_names = component_config.get('keys')
            component_name_keys, component_probs = alternative_probabilities(
                component_names)
            self.component_name_keys[component] = (component_name_keys,
                                                   cdf(component_probs))

        self.country_regex_replacements = defaultdict(list)
        for props in nested_get(config, ('names', 'regex_replacements'), default=[]):
            country = props.get('country')
            re_flags = re.I | re.UNICODE
            if not props.get('case_insensitive', True):
                re_flags ^= re.I

            pattern = re.compile(props['pattern'], re_flags)
            replace_group = props['replace_with_group']
            replace_probability = props['replace_probability']
            self.country_regex_replacements[country].append(
                (pattern, replace_group, replace_probability))

        self.country_regex_replacements = dict(self.country_regex_replacements)

        self.prefixes = {}
        self.prefix_regexes = {}
        self.suffixes = {}
        self.suffix_regexes = {}

        for language, components in six.iteritems(
                nested_get(config, ('names', 'prefixes', 'language'),
                           default={})):
            for component, affixes in six.iteritems(components):
                affix_values, probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'prefix' not in val:
                        raise AssertionError(
                            six.u('Invalid prefix value for (language={}, component={}): {}').format(
                                language, component, val))

                prefix_regex = six.u('|').join([
                    six.u('(?:{} )').format(self._string_as_regex(v['prefix']))
                    if v.get('whitespace') else self._string_as_regex(
                        v['prefix']) for v in affix_values
                ])
                self.prefix_regexes[(language, component)] = re.compile(
                    six.u('^{}').format(prefix_regex), re.I | re.U)

                if not isclose(sum(probs), 1.0):
                    affix_values.append(None)
                    probs.append(1.0 - sum(probs))
                affix_probs_cdf = cdf(probs)
                self.prefixes[(language, component)] = (affix_values, affix_probs_cdf)

        for language, components in six.iteritems(
                nested_get(config, ('names', 'suffixes', 'language'),
                           default={})):
            for component, affixes in six.iteritems(components):
                affix_values, probs = alternative_probabilities(affixes)

                for val in affix_values:
                    if 'suffix' not in val:
                        raise AssertionError(
                            six.u('Invalid suffix value for (language={}, component={}): {}').format(
                                language, component, val))

                suffix_regex = six.u('|').join([
                    six.u('(?: {})').format(self._string_as_regex(v['suffix']))
                    if v.get('whitespace') else self._string_as_regex(
                        v['suffix']) for v in affix_values
                ])
                self.suffix_regexes[(language, component)] = re.compile(
                    six.u('{}$').format(suffix_regex), re.I | re.U)

                if not isclose(sum(probs), 1.0):
                    affix_values.append(None)
                    probs.append(1.0 - sum(probs))
                affix_probs_cdf = cdf(probs)
                self.suffixes[(language, component)] = (affix_values, affix_probs_cdf)

        self.exceptions = {}

        for props in nested_get(config, ('names', 'exceptions'), default=[]):
            object_type = props['type']
            object_id = safe_encode(props['id'])
            keys = [props['default']]
            probs = [props['probability']]
            for alt in props.get('alternatives', []):
                keys.append(alt['alternative'])
                probs.append(alt['probability'])

            probs = cdf(probs)
            self.exceptions[(object_type, object_id)] = (keys, probs)
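For reference, a sketch of the configuration shape this constructor walks. The key paths come straight from the nested_get calls above, and the regex_replacements and exceptions fields are likewise read from the code, but the entry shapes under 'keys' and the probability fields on affixes are assumptions, and every concrete value is illustrative:

illustrative_config = {
    'names': {
        'keys': [{'key': 'name', 'probability': 0.9},
                 {'key': 'int_name', 'probability': 0.1}],  # entry shape assumed
        'components': {
            'city': {'keys': [{'key': 'name', 'probability': 1.0}]},
        },
        'regex_replacements': [
            {'country': 'gb',
             'pattern': u'^City of (.*)$',
             'replace_with_group': 1,
             'replace_probability': 0.5,
             'case_insensitive': True},
        ],
        'prefixes': {'language': {
            'en': {'city': [{'prefix': u'City of', 'whitespace': True,
                             'probability': 0.05}]},  # probability field assumed
        }},
        'suffixes': {'language': {}},
        'exceptions': [
            {'type': 'relation', 'id': 12345, 'default': 'name',
             'probability': 0.8,
             'alternatives': [{'alternative': 'name:en', 'probability': 0.2}]},
        ],
    },
}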
Example #51
def tokenize(s):
    u = safe_decode(s)
    s = safe_encode(s)
    return [(safe_decode(s[start:start + length]),
             token_types.from_id(token_type))
            for start, length, token_type in _tokenize.tokenize(u)]
Example #52
    def dropout_components(self, components, boundaries=(), country=None, population=None, unambiguous_city=False):
        containing_ids = set()

        for boundary in boundaries:
            object_type = boundary.get('type')
            object_id = safe_encode(boundary.get('id', ''))
            if not (object_type and object_id):
                continue
            containing_ids.add((object_type, object_id))

        original_bitset = ComponentDependencies.component_bitset(components)

        names = defaultdict(list)
        admin_components = [c for c in components if c in self.ADMIN_COMPONENTS]
        for c in admin_components:
            names[components[c]].append(c)

        same_name = set()
        for c, v in six.iteritems(names):
            if len(v) > 1:
                same_name |= set(v)

        new_components = components.copy()

        city_replacements = set()
        if AddressFormatter.CITY not in components:
            city_replacements = self.city_replacements(country)

        for component in admin_components:
            include = self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city)

            if not include and component not in city_replacements:
                # Note: this check is for cities that have the same name as their admin
                # areas e.g. Luxembourg, Luxembourg. In cases like this, if we were to drop
                # city, we don't want to include country on its own. This should help the parser
                # default to the city in ambiguous cases where only one component is specified.
                if not (component == AddressFormatter.CITY and component in same_name):
                    new_components.pop(component, None)
                else:
                    value = components[component]
                    for c in names[value]:
                        new_components.pop(c, None)

        for component in self.ADMIN_COMPONENTS:
            value = self.get_property(('components', component, 'value'), country=country, default=None)

            if not value:
                values, probs = self.cdf_cache.get((country, component), (None, None))
                if values is None:
                    values = self.get_property(('components', component, 'values'), country=country, default=None)
                    if values is not None:
                        values, probs = zip(*[(v['value'], float(v['probability'])) for v in values])
                        probs = cdf(probs)
                        self.cdf_cache[(country, component)] = (values, probs)

                if values is not None:
                    value = weighted_choice(values, probs)

            if value is not None and component not in components and self.include_component(component, containing_ids, country=country, population=population, unambiguous_city=unambiguous_city):
                new_components[component] = value

        self.drop_invalid_components(new_components, country, original_bitset=original_bitset)

        return new_components
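A hedged usage sketch; cfg stands in for an instance of the sampling class this method belongs to, and the boundary id is made up:

components = {
    AddressFormatter.CITY: u'Luxembourg',
    AddressFormatter.COUNTRY: u'Luxembourg',
    AddressFormatter.ROAD: u'Boulevard Royal',
}
new_components = cfg.dropout_components(
    components,
    boundaries=[{'type': 'relation', 'id': 12345}],
    country='lu', population=120000)
# admin components may be dropped or substituted probabilistically; a city that
# shares its name with other admin components is dropped together with them,
# per the comment in the method above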