Esempio n. 1
0
def parse_unicode_data():
    """
    Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS.

    If LOCAL_UNIDATA_FILE does not exist yet, download_file(UNIDATA) is
    called first (presumably it saves to LOCAL_UNIDATA_FILE -- confirm
    against download_file).

    This is a generator: it yields one UnicodeDataRow per
    semicolon-delimited record, built by passing the record's fields
    positionally.
    """
    if not os.path.exists(LOCAL_UNIDATA_FILE):
        download_file(UNIDATA)

    # Hold the file in a context manager so the handle is released once the
    # generator is exhausted or closed, instead of leaking until GC.
    with open(LOCAL_UNIDATA_FILE) as unidata_file:
        for line in csv.reader(unidata_file, delimiter=";"):
            yield UnicodeDataRow(*line)
Esempio n. 2
0
def parse_unicode_data():
    '''
    Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS.

    When LOCAL_UNIDATA_FILE is missing, download_file(UNIDATA) is invoked
    before reading (presumably it writes LOCAL_UNIDATA_FILE -- confirm
    against download_file).

    Generator: yields a UnicodeDataRow for every semicolon-delimited
    record, with the record's fields passed as positional arguments.
    '''
    if not os.path.exists(LOCAL_UNIDATA_FILE):
        download_file(UNIDATA)

    # The with-block guarantees the handle is closed when iteration
    # finishes or the generator is closed early.
    with open(LOCAL_UNIDATA_FILE) as unidata_file:
        for line in csv.reader(unidata_file, delimiter=';'):
            yield UnicodeDataRow(*line)
def openaddresses_download_all_files(out_dir):
    """
    Download and unzip every processed source listed in the state file.

    The state file is fetched from OPENADDRESSES_STATE_URL into the system
    temp directory and read as tab-delimited text whose header row must
    contain 'source' and 'processed' columns. Rows with a blank
    'processed' value are skipped; every other row's URL is handed to
    download_and_unzip_file. Progress and failures are printed.

    out_dir: directory passed to download_pre_release_downloads and
        download_and_unzip_file.

    Exits via sys.exit if the state file cannot be downloaded. The
    temporary state file is removed afterwards, even on error.
    """
    temp_dir = tempfile.gettempdir()

    local_state_file_path = os.path.join(temp_dir,
                                         OPENADDRESSES_STATE_FILE_NAME)
    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
        sys.exit('Could not download state.txt file')

    try:
        # Close the handle before the file is removed in ``finally``.
        with open(local_state_file_path) as state_file:
            reader = unicode_csv_reader(state_file, delimiter='\t')
            # Builtin next() works on both Python 2 and Python 3 iterators.
            headers = next(reader)

            source_index = headers.index('source')
            url_index = headers.index('processed')

            download_pre_release_downloads(out_dir)

            for row in reader:
                # Strip only the final extension so dotted source names
                # are not truncated in the log output.
                source = row[source_index].rsplit('.', 1)[0]
                processed = row[url_index]
                if not processed or not processed.strip():
                    continue

                print(six.u('doing {}').format(source))
                success = download_and_unzip_file(processed, out_dir)
                if not success:
                    print(six.u('ERR: could not download {}').format(source))
    finally:
        remove_file(local_state_file_path)
Esempio n. 4
0
def openaddresses_download_all_files(out_dir):
    """
    Fetch the state file and download every processed archive it lists.

    The state file from OPENADDRESSES_STATE_URL is saved in the system temp
    directory and parsed as tab-delimited text; its header row must include
    the 'source' and 'processed' columns. Each row with a non-blank
    'processed' URL is passed to download_and_unzip_file, and a line is
    printed per source (plus an error line on failure).

    out_dir: directory forwarded to download_pre_release_downloads and
        download_and_unzip_file.

    Calls sys.exit when the state file cannot be downloaded. The temporary
    state file is always removed once processing ends.
    """
    temp_dir = tempfile.gettempdir()

    local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
        sys.exit('Could not download state.txt file')

    try:
        # The handle must be closed before the file is deleted below.
        with open(local_state_file_path) as state_file:
            reader = unicode_csv_reader(state_file, delimiter='\t')
            # next(reader) instead of reader.next(): portable across Python 2/3.
            headers = next(reader)

            source_index = headers.index('source')
            url_index = headers.index('processed')

            download_pre_release_downloads(out_dir)

            for row in reader:
                # maxsplit=1 drops just the trailing extension; without it
                # the name would be cut at its first dot.
                source = row[source_index].rsplit('.', 1)[0]
                processed = row[url_index]
                if not processed or not processed.strip():
                    continue

                print(six.u('doing {}').format(source))
                success = download_and_unzip_file(processed, out_dir)
                if not success:
                    print(six.u('ERR: could not download {}').format(source))
    finally:
        remove_file(local_state_file_path)
Esempio n. 5
0
def download_and_unzip_file(url, out_dir):
    """
    Download the archive at ``url`` into ``out_dir`` and unzip it there.

    The local file name is the last path segment of the URL with
    surrounding whitespace stripped. The downloaded archive is removed
    afterwards if it exists on disk.

    Returns the falsy result of download_file when the download fails,
    otherwise whatever unzip_file returns.
    """
    archive_name = url.rsplit('/', 1)[-1].strip()
    archive_path = os.path.join(out_dir, archive_name)

    # Only attempt extraction when the download reported success.
    outcome = download_file(url, archive_path)
    if outcome:
        outcome = unzip_file(archive_path, out_dir)

    if os.path.exists(archive_path):
        remove_file(archive_path)

    return outcome
def download_and_unzip_file(url, out_dir):
    """
    Fetch a zip archive from ``url`` and extract it into ``out_dir``.

    The archive is stored in ``out_dir`` under the URL's final path
    segment (whitespace-stripped) and deleted again once processing is
    done, provided the file exists.

    Returns download_file's falsy result if the download did not succeed;
    otherwise returns the result of unzip_file.
    """
    local_zip = os.path.join(out_dir, url.rsplit('/', 1)[-1].strip())

    # Extraction is skipped entirely when the download fails.
    status = download_file(url, local_zip)
    if status:
        status = unzip_file(local_zip, out_dir)

    if os.path.exists(local_zip):
        remove_file(local_zip)

    return status
def main(out_dir):
    """
    Generate the scripts C header and C data file (see templates).

    Downloads the Unicode property files, fetches the CLDR data when
    CLDR_SUPPLEMENTAL_DATA does not exist, builds the script tables and
    renders them through scripts_header_template and
    scripts_c_data_template.

    out_dir: directory in which SCRIPTS_HEADER and SCRIPTS_DATA_FILENAME
        are written.

    Each output file is opened only at the moment it is written, so a
    failed download or build step no longer leaves truncated outputs or
    unclosed handles behind.
    """
    download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
    download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
    download_file(PROPS_URL, LOCAL_PROPS_FILE)
    download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
    download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
    download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
    download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)

    if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
        download_cldr()

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)

    script_languages = get_script_languages()

    # Largest language count over all scripts; ``or [0]`` keeps the result
    # at 0 when there are no entries at all.
    max_langs = max(
        [len(langs) for langs in script_languages.values()] or [0])

    # Generate C header and constants

    # dict.items() (rather than iteritems) works on Python 2 and 3 alike.
    script_enum = u'\n    '.join([
        'SCRIPT_{} = {},'.format(s.upper(), i)
        for s, i in sorted(all_scripts.items(), key=itemgetter(1))
    ])

    with open(os.path.join(out_dir, SCRIPTS_HEADER), 'w') as out_header:
        out_header.write(
            scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
                                           max_langs=max_langs,
                                           script_enum=script_enum))

    # Generate C data file

    # Script ids for every entry of chars, 25 per output line.
    char_scripts_data = u',\n    '.join([
        ', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch])
        for batch in batch_iter(chars, 25)
    ])

    script_codes_data = u',\n    '.join([
        script_code_template.format(name=name.upper(), code=code)
        for code, name in script_codes.items()
    ])

    # Language lists ordered by script id so they line up with the enum.
    sorted_lang_scripts = [
        script_languages[s]
        for s, i in sorted(all_scripts.items(), key=itemgetter(1))
    ]

    script_language_data = u',\n    '.join([
        script_language_template.format(
            num_langs=len(langs),
            # Braced, quoted list of languages, or {NULL} when empty.
            languages='{{{}}}'.format(
                ', '.join(['"{}"'.format(l)
                           for l in langs]) if langs else 'NULL'))
        for langs in sorted_lang_scripts
    ])

    data_path = os.path.join(out_dir, SCRIPTS_DATA_FILENAME)
    with open(data_path, 'w') as out_file:
        out_file.write(
            scripts_c_data_template.format(
                header_name=SCRIPTS_HEADER,
                char_scripts=char_scripts_data,
                script_codes=script_codes_data,
                script_languages=script_language_data))
Esempio n. 8
0
def main(out_dir=SRC_DIR):
    """
    Generate the scripts C header and C data file (see templates).

    Downloads the Unicode property files, fetches the CLDR data when
    CLDR_SUPPLEMENTAL_DATA does not exist, builds the script tables and
    renders them through scripts_header_template and
    scripts_c_data_template.

    out_dir: directory in which SCRIPTS_HEADER and SCRIPTS_DATA_FILENAME
        are written; defaults to SRC_DIR.

    The output files are opened in ``with`` blocks right where they are
    written, so an earlier failure neither truncates them nor leaves an
    open handle behind.
    """
    download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
    download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
    download_file(PROPS_URL, LOCAL_PROPS_FILE)
    download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
    download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
    download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
    download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)

    if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
        download_cldr()

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)

    script_languages = get_script_languages()

    # Highest language count of any script; falls back to 0 when empty.
    max_langs = max([len(langs) for langs in script_languages.values()] or [0])

    # Generate C header and constants

    # items() instead of iteritems() keeps this runnable on Python 2 and 3.
    script_enum = u'\n    '.join(['SCRIPT_{} = {},'.format(s.upper(), i)
                                  for s, i in sorted(all_scripts.items(), key=itemgetter(1))])

    with open(os.path.join(out_dir, SCRIPTS_HEADER), 'w') as out_header:
        out_header.write(scripts_header_template.format(num_codepoints=NUM_CODEPOINTS,
                         max_langs=max_langs,
                         script_enum=script_enum))

    # Generate C data file

    # Script ids for every entry of chars, emitted 25 per line.
    char_scripts_data = u',\n    '.join([', '.join([str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch])
                                         for batch in batch_iter(chars, 25)])

    script_codes_data = u',\n    '.join([script_code_template.format(name=name.upper(), code=code)
                                         for code, name in script_codes.items()])

    # Language lists in script-id order, matching the enum ordering.
    sorted_lang_scripts = [script_languages[s] for s, i in sorted(all_scripts.items(), key=itemgetter(1))]

    # Each entry renders as a braced, quoted language list, or {NULL} when empty.
    script_language_data = u',\n    '.join([script_language_template.format(num_langs=len(langs),
              languages='{{{}}}'.format(', '.join(['"{}"'.format(l) for l in langs]) if langs else 'NULL'))
              for langs in sorted_lang_scripts])

    with open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w') as out_file:
        out_file.write(scripts_c_data_template.format(header_name=SCRIPTS_HEADER,
                       char_scripts=char_scripts_data,
                       script_codes=script_codes_data,
                       script_languages=script_language_data))