import os

# Loads a WMT news parallel corpus. `data_dir` and the `utils` helpers
# (download / unzip_and_delete / read_lines) come from the surrounding module.
def get(url, file_name, lan_1_name, lan_2_name):
    wmt_news_path = os.path.join(data_dir, file_name)
    wmt_news_dir = os.path.splitext(wmt_news_path)[0]
    lan_1_file_path = os.path.join(wmt_news_dir, lan_1_name)
    lan_2_file_path = os.path.join(wmt_news_dir, lan_2_name)

    # download and unzip data
    utils.download(url, wmt_news_path)
    utils.unzip_and_delete(wmt_news_path)

    # read data
    lan_1_data = utils.read_lines(lan_1_file_path)
    lan_2_data = utils.read_lines(lan_2_file_path)
    return lan_1_data, lan_2_data
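# The `utils` helpers used above are project code that is not shown in this
# section. A minimal sketch of what they might look like, assuming
# download(url, path) fetches a file, unzip_and_delete(path) unpacks an archive
# next to itself and removes it, and read_lines(path) returns a list of lines:

import os
import shutil
import urllib.request

def download(url, save_path):
    # Fetch url into save_path unless the file is already there.
    if os.path.exists(save_path):
        return
    os.makedirs(os.path.dirname(save_path) or '.', exist_ok=True)
    urllib.request.urlretrieve(url, save_path)

def unzip_and_delete(archive_path):
    # Unpack foo.tgz into foo/ beside it, then delete the archive.
    shutil.unpack_archive(archive_path, os.path.splitext(archive_path)[0])
    os.remove(archive_path)

def read_lines(file_path):
    # Return the file contents as a list of lines without trailing newlines.
    with open(file_path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]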
import os
import shutil

# Same interface for the Europarl corpus: download the archive, unpack it with
# shutil, delete the archive, then read both language files.
def get(url, file_name, lan_1_name, lan_2_name):
    europarl_news_path = os.path.join(data_dir, file_name)
    europarl_news_dir = os.path.splitext(europarl_news_path)[0]
    lan_1_file_path = os.path.join(europarl_news_dir, lan_1_name)
    lan_2_file_path = os.path.join(europarl_news_dir, lan_2_name)

    # download and unzip data
    utils.download(url, europarl_news_path)
    if os.path.isfile(europarl_news_path):
        shutil.unpack_archive(europarl_news_path, europarl_news_dir)
        os.remove(europarl_news_path)

    # read data
    lan_1_data = utils.read_lines(lan_1_file_path)
    lan_2_data = utils.read_lines(lan_2_file_path)
    return lan_1_data, lan_2_data
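# Illustrative call; the URL and member file names below follow the public
# Europarl v7 naming and are assumptions, not values taken from this repo:
#
#   fr_data, en_data = get(
#       'https://www.statmt.org/europarl/v7/fr-en.tgz',
#       'fr-en.tgz',
#       'europarl-v7.fr-en.fr',
#       'europarl-v7.fr-en.en')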
import glob
import os
import shutil

# Loads the Kyoto (KFTT) Japanese-English corpus. Downloads and unpacks the
# archive only if the extracted directory is not already present.
def get(url, file_name):
    kyoto_news_path = os.path.join(data_dir, file_name)
    kyoto_news_dir = os.path.splitext(kyoto_news_path)[0]

    if not os.path.exists(kyoto_news_dir):
        utils.download(url, kyoto_news_path)
        shutil.unpack_archive(kyoto_news_path, kyoto_news_dir)
        os.remove(kyoto_news_path)
        print("Downloaded successfully, start extracting the training data.")

    # Both globs pick up the training split, one file per language.
    ja_train_files = glob.glob(kyoto_news_dir + "/**/*-train.ja", recursive=True)
    en_train_files = glob.glob(kyoto_news_dir + "/**/*-train.en", recursive=True)

    lan_1_data = utils.read_lines(ja_train_files[0])
    lan_2_data = utils.read_lines(en_train_files[0])

    print("Done! Going to the tokenizer next.")
    return lan_1_data, lan_2_data
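# Illustrative call; the URL and archive name follow the public KFTT release
# and are assumptions, not values taken from this repo:
#
#   ja_data, en_data = get(
#       'http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz',
#       'kftt-data-1.0.tar.gz')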
import gzip
import os

# Loads a gzipped, tab-separated corpus and caches the parsed result as a
# pickle so later calls can skip the download and parsing.
def get(url, file_name):
    data_path = os.path.join(data_dir, file_name)
    cache_name = os.path.splitext(data_path)[0] + '.pkl'
    if os.path.exists(cache_name):
        return read_cache(cache_name)

    # download and unzip data
    utils.download(url, data_path)
    with gzip.open(data_path, 'rb') as f:
        _data = f.read().decode('utf-8')

    # Normalize full-width characters, split the text into blocks separated by
    # tab-only lines, then turn each block into a list of [col2, col1] pairs
    # (the two tab-separated columns are swapped).
    _data = utils.full_2_half(utils.unicode_to_ascii(_data))
    _data = _data.replace('\r', '').strip().split('\n\t\n')
    _data = [
        [[line.split('\t')[1].strip(), line.split('\t')[0].strip()]
         for line in block.split('\n')]
        for block in _data
    ]

    cache(cache_name, _data)
    return _data
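# `cache` and `read_cache` are project helpers not shown in this section; a
# minimal sketch, assuming they simply pickle the parsed data to disk and load
# it back:

import pickle

def cache(cache_path, data):
    # Persist the parsed corpus so the next call can skip download and parsing.
    with open(cache_path, 'wb') as f:
        pickle.dump(data, f)

def read_cache(cache_path):
    with open(cache_path, 'rb') as f:
        return pickle.load(f)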
import csv
import glob
import gzip
import os
import shutil

# Assumed signature, inferred from the call unzip_gz(pack_file, saving_directory) below.
def unzip_gz(file_path, work_dir):
    filename = os.path.basename(file_path)[:-3]  # assumed: drop the trailing '.gz'
    with gzip.open(file_path, 'rb') as f_in:  # <<========== extraction happens here
        with open(os.path.join(work_dir, filename), 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)


def __add_dict(_dict, k, v):
    # Collect every translation seen for the same key under 'translation'.
    if k not in _dict:
        _dict[k] = {'translation': []}
    _dict[k]['translation'].append(v)


# file_names, urls, names, saving_directory and saving_news_dir are defined in
# the surrounding module.
for i in range(0, len(file_names)):
    pack_file = os.path.join(saving_directory, file_names[i])
    if not os.path.exists(pack_file):
        utils.download(urls[i], pack_file)
        unzip_gz(pack_file, saving_directory)
        # shutil.unpack_archive(packfile, saving_directory)
    if os.path.exists(pack_file):
        os.remove(pack_file)
    print("Downloaded successfully, start extracting the training data.")

    target_file = os.path.join(saving_news_dir, names[i] + ".*")
    file_name = glob.glob(target_file, recursive=True)
    with open(file_name[0], encoding="utf-8") as f:
        print("reading file for", file_name[0])
        read = csv.reader(f, delimiter='\t')
        zh_en_dict = {}
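# A hedged sketch of how the rows from `read` could be folded into zh_en_dict
# with __add_dict; the column order (key first, translation second) is an
# assumption, not taken from the source:
#
#   for row in read:
#       if len(row) >= 2:
#           __add_dict(zh_en_dict, row[0], row[1])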