def parse_unicode_data():
    """Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS.

    Downloads the data file first if it is not present locally.

    Yields:
        UnicodeDataRow: one namedtuple per semicolon-delimited record.
    """
    if not os.path.exists(LOCAL_UNIDATA_FILE):
        download_file(UNIDATA)
    # "with" closes the handle when the generator is exhausted or
    # garbage-collected; the original leaked the open file.
    with open(LOCAL_UNIDATA_FILE) as unidata_file:
        for line in csv.reader(unidata_file, delimiter=";"):
            yield UnicodeDataRow(*line)
def parse_unicode_data():
    '''Parse UnicodeData.txt into namedtuples using UNIDATA_FIELDS.

    The file is downloaded on demand when missing locally.

    Yields:
        UnicodeDataRow: one namedtuple per semicolon-delimited record.
    '''
    if not os.path.exists(LOCAL_UNIDATA_FILE):
        download_file(UNIDATA)
    # Context manager ensures the file handle is released once the
    # generator finishes (the original never closed it).
    with open(LOCAL_UNIDATA_FILE) as unidata_file:
        for line in csv.reader(unidata_file, delimiter=';'):
            yield UnicodeDataRow(*line)
def openaddresses_download_all_files(out_dir):
    """Download and unzip every processed OpenAddresses source into out_dir.

    Fetches state.txt (the master index of sources) into the system temp
    directory, then downloads and extracts each source's "processed" zip.
    Rows without a processed URL are skipped. Exits the process if
    state.txt itself cannot be downloaded.
    """
    temp_dir = tempfile.gettempdir()
    local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
        sys.exit('Could not download state.txt file')

    state_file = open(local_state_file_path)
    try:
        reader = unicode_csv_reader(state_file, delimiter='\t')
        # next(reader) works on both Python 2 and 3; reader.next() is Py2-only.
        headers = next(reader)
        source_index = headers.index('source')
        url_index = headers.index('processed')

        download_pre_release_downloads(out_dir)

        for row in reader:
            # e.g. "us/ca/san_francisco.json" -> "us/ca/san_francisco"
            source = row[source_index].rsplit('.')[0]
            processed = row[url_index]
            if not processed or not processed.strip():
                continue
            print(six.u('doing {}').format(source))
            success = download_and_unzip_file(processed, out_dir)
            if not success:
                print(six.u('ERR: could not download {}').format(source))
    finally:
        # Close the handle and always clean up the temp state file, even
        # when a download raises (the original skipped cleanup on error).
        state_file.close()
        remove_file(local_state_file_path)
def openaddresses_download_all_files(out_dir):
    """Download and unzip all processed OpenAddresses sources into out_dir.

    Downloads the state.txt index to the temp directory, iterates its
    tab-separated rows, and fetches each source's processed zip archive.
    Sources lacking a processed URL are skipped; the process exits if the
    index itself cannot be retrieved.
    """
    temp_dir = tempfile.gettempdir()
    local_state_file_path = os.path.join(temp_dir, OPENADDRESSES_STATE_FILE_NAME)
    if not download_file(OPENADDRESSES_STATE_URL, local_state_file_path):
        sys.exit('Could not download state.txt file')

    state_file = open(local_state_file_path)
    try:
        reader = unicode_csv_reader(state_file, delimiter='\t')
        # Portable across Python 2/3 (reader.next() exists only on Py2).
        headers = next(reader)
        source_index = headers.index('source')
        url_index = headers.index('processed')

        download_pre_release_downloads(out_dir)

        for row in reader:
            source = row[source_index].rsplit('.')[0]
            processed = row[url_index]
            if not processed or not processed.strip():
                continue
            print(six.u('doing {}').format(source))
            success = download_and_unzip_file(processed, out_dir)
            if not success:
                print(six.u('ERR: could not download {}').format(source))
    finally:
        # Release the handle and remove the temp index even on failure
        # (the original leaked the handle and skipped cleanup on error).
        state_file.close()
        remove_file(local_state_file_path)
def download_and_unzip_file(url, out_dir):
    """Fetch the zip at *url* into *out_dir*, extract it, then delete the zip.

    Returns True only when both the download and the extraction succeed.
    The downloaded archive is removed afterwards in either case.
    """
    archive_name = url.rsplit('/', 1)[-1].strip()
    archive_path = os.path.join(out_dir, archive_name)

    ok = download_file(url, archive_path)
    if ok:
        ok = unzip_file(archive_path, out_dir)

    if os.path.exists(archive_path):
        remove_file(archive_path)
    return ok
def download_and_unzip_file(url, out_dir):
    """Download the zip archive at *url* into *out_dir* and extract it.

    The archive file itself is always deleted afterwards; True is
    returned iff both the download and the extraction succeeded.
    """
    local_zip = os.path.join(out_dir, url.rsplit('/', 1)[-1].strip())
    succeeded = download_file(url, local_zip) and unzip_file(local_zip, out_dir)
    if os.path.exists(local_zip):
        remove_file(local_zip)
    return succeeded
def main(out_dir):
    """Generate the C script-data file and header from Unicode/CLDR data.

    Downloads the required Unicode data files (and CLDR if missing),
    builds the per-codepoint script table, the script-code constants,
    and the script->languages mapping, then renders them through the C
    templates into SCRIPTS_DATA_FILENAME and SCRIPTS_HEADER in out_dir.
    """
    # Fetch all upstream Unicode data files needed to build the tables.
    download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
    download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
    download_file(PROPS_URL, LOCAL_PROPS_FILE)
    download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
    download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
    download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
    download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)

    if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
        download_cldr()

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)
    script_languages = get_script_languages()

    # Longest language list across all scripts (sizes the C array).
    # .items()/.values() work on both Python 2 and 3; .iteritems() is Py2-only.
    max_langs = 0
    for langs in script_languages.values():
        max_langs = max(max_langs, len(langs))

    # Scripts ordered by their numeric id, computed once and reused below.
    scripts_by_id = sorted(all_scripts.items(), key=itemgetter(1))

    # Generate C header and constants
    script_enum = u',\n    '.join(
        u'SCRIPT_{} = {}'.format(s.upper(), i) for s, i in scripts_by_id) + u','

    # "with" guarantees the header file is closed even if a template raises.
    with open(os.path.join(out_dir, SCRIPTS_HEADER), 'w') as out_header:
        out_header.write(scripts_header_template.format(
            num_codepoints=NUM_CODEPOINTS,
            max_langs=max_langs,
            script_enum=script_enum))

    # Generate C data file
    char_scripts_data = u',\n    '.join(
        u', '.join(str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch)
        for batch in batch_iter(chars, 25))

    script_codes_data = u',\n    '.join(
        script_code_template.format(name=name.upper(), code=code)
        for code, name in script_codes.items())

    sorted_lang_scripts = [script_languages[s] for s, i in scripts_by_id]

    script_language_data = u',\n    '.join(
        script_language_template.format(
            num_langs=len(langs),
            languages='{{{}}}'.format(
                ', '.join('"{}"'.format(l) for l in langs) if langs else 'NULL'))
        for langs in sorted_lang_scripts)

    with open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w') as out_file:
        out_file.write(scripts_c_data_template.format(
            header_name=SCRIPTS_HEADER,
            char_scripts=char_scripts_data,
            script_codes=script_codes_data,
            script_languages=script_language_data))
def main(out_dir=SRC_DIR):
    """Build the C scripts data file and header from Unicode/CLDR sources.

    Downloads every Unicode data file the generator needs (plus CLDR when
    absent), derives the codepoint->script table, the script-code enum,
    and the per-script language lists, and writes the rendered C templates
    to SCRIPTS_DATA_FILENAME and SCRIPTS_HEADER under out_dir
    (defaults to SRC_DIR).
    """
    # Fetch all upstream Unicode data files.
    download_file(SCRIPTS_URL, LOCAL_SCRIPTS_FILE)
    download_file(BLOCKS_URL, LOCAL_BLOCKS_FILE)
    download_file(PROPS_URL, LOCAL_PROPS_FILE)
    download_file(PROP_ALIASES_URL, LOCAL_PROP_ALIASES_FILE)
    download_file(PROP_VALUE_ALIASES_URL, LOCAL_PROP_VALUE_ALIASES_FILE)
    download_file(DERIVED_CORE_PROPS_URL, LOCAL_DERIVED_CORE_PROPS_FILE)
    download_file(WORD_BREAKS_URL, LOCAL_WORD_BREAKS_FILE)

    if not os.path.exists(CLDR_SUPPLEMENTAL_DATA):
        download_cldr()

    chars = get_chars_by_script()
    all_scripts = build_master_scripts_list(chars)
    script_codes = get_script_codes(all_scripts)
    script_languages = get_script_languages()

    # Size of the widest language list (used to dimension the C array).
    # .values()/.items() are portable; .iteritems() exists only on Python 2.
    max_langs = 0
    for langs in script_languages.values():
        max_langs = max(max_langs, len(langs))

    # Sort scripts by numeric id once; reused for the enum and language table.
    scripts_by_id = sorted(all_scripts.items(), key=itemgetter(1))

    # Generate C header and constants
    script_enum = u',\n    '.join(
        u'SCRIPT_{} = {}'.format(s.upper(), i) for s, i in scripts_by_id) + u','

    # Context managers close the outputs even when a template raises.
    with open(os.path.join(out_dir, SCRIPTS_HEADER), 'w') as out_header:
        out_header.write(scripts_header_template.format(
            num_codepoints=NUM_CODEPOINTS,
            max_langs=max_langs,
            script_enum=script_enum))

    # Generate C data file
    char_scripts_data = u',\n    '.join(
        u', '.join(str(all_scripts[sc or UNKNOWN_SCRIPT]) for sc in batch)
        for batch in batch_iter(chars, 25))

    script_codes_data = u',\n    '.join(
        script_code_template.format(name=name.upper(), code=code)
        for code, name in script_codes.items())

    sorted_lang_scripts = [script_languages[s] for s, i in scripts_by_id]

    script_language_data = u',\n    '.join(
        script_language_template.format(
            num_langs=len(langs),
            languages='{{{}}}'.format(
                ', '.join('"{}"'.format(l) for l in langs) if langs else 'NULL'))
        for langs in sorted_lang_scripts)

    with open(os.path.join(out_dir, SCRIPTS_DATA_FILENAME), 'w') as out_file:
        out_file.write(scripts_c_data_template.format(
            header_name=SCRIPTS_HEADER,
            char_scripts=char_scripts_data,
            script_codes=script_codes_data,
            script_languages=script_language_data))