Example #1
def get(url, file_name, lan_1_name, lan_2_name):
    wmt_news_path = os.path.join(data_dir, file_name)
    wmt_news_dir = os.path.splitext(wmt_news_path)[0]
    lan_1_file_path = os.path.join(wmt_news_dir, lan_1_name)
    lan_2_file_path = os.path.join(wmt_news_dir, lan_2_name)

    # download and unzip data
    utils.download(url, wmt_news_path)
    utils.unzip_and_delete(wmt_news_path)

    # read data
    lan_1_data = utils.read_lines(lan_1_file_path)
    lan_2_data = utils.read_lines(lan_2_file_path)
    return lan_1_data, lan_2_data
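The snippets in this listing lean on a small `utils` module that is not shown. Below is a minimal sketch of what `download`, `unzip_and_delete`, and `read_lines` might look like, assuming stdlib-only behavior; the names and signatures are inferred from the call sites, not taken from the original source.

# Hypothetical reconstruction of the helpers used above; the real utils
# module is not part of this listing.
import os
import shutil
import urllib.request

def download(url, save_path):
    # skip the download if the file is already on disk
    if os.path.exists(save_path):
        return
    directory = os.path.dirname(save_path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    urllib.request.urlretrieve(url, save_path)

def unzip_and_delete(archive_path):
    # unpack next to the archive (dropping the extension), then remove it
    shutil.unpack_archive(archive_path, os.path.splitext(archive_path)[0])
    os.remove(archive_path)

def read_lines(file_path):
    # return the file's contents as a list of lines without trailing newlines
    with open(file_path, encoding='utf-8') as f:
        return [line.rstrip('\n') for line in f]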
Example #2
def get(url, file_name, lan_1_name, lan_2_name):
    europarl_news_path = os.path.join(data_dir, file_name)
    europarl_news_dir = os.path.splitext(europarl_news_path)[0]
    lan_1_file_path = os.path.join(europarl_news_dir, lan_1_name)
    lan_2_file_path = os.path.join(europarl_news_dir, lan_2_name)

    utils.download(url, europarl_news_path)
    if os.path.isfile(europarl_news_path):
        shutil.unpack_archive(europarl_news_path, europarl_news_dir)
        os.remove(europarl_news_path)

    # read data
    lan_1_data = utils.read_lines(lan_1_file_path)
    lan_2_data = utils.read_lines(lan_2_file_path)
    return lan_1_data, lan_2_data
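Example #2 inlines the unpack step instead of calling a helper. Note that `shutil.unpack_archive` infers the archive format from the file extension (`.zip`, `.tar.gz`, `.tgz`, and so on); when the extension is nonstandard, the format has to be named explicitly. A quick illustration with made-up file names:

import shutil

shutil.unpack_archive('corpus.tar.gz', 'corpus')           # format inferred from extension
shutil.unpack_archive('corpus.download', 'corpus', 'zip')  # format given explicitly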
Example #3
def get(url, file_name):
    kyoto_news_path = os.path.join(data_dir, file_name)
    kyoto_news_dir = os.path.splitext(kyoto_news_path)[0]

    if not os.path.exists(kyoto_news_dir):
        utils.download(url, kyoto_news_path)
        shutil.unpack_archive(kyoto_news_path, kyoto_news_dir)
        os.remove(kyoto_news_path)
        print("Download successfully, start extracting the training data.")

    # Iterate over all the entries
    train_files = glob.glob(kyoto_news_dir + "/**/*-train.ja", recursive=True)
    test_files = glob.glob(kyoto_news_dir + "/**/*-train.en", recursive=True)

    lan_1_data = utils.read_lines(train_files[0])
    lan_2_data = utils.read_lines(test_files[0])

    print("Done! going to the tokenizer next.")

    return lan_1_data, lan_2_data
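Example #3 depends on glob's recursive matching: with `recursive=True`, the `**` component matches any number of nested directories, so the train files are found however deeply the archive unpacks. A small illustration with a made-up directory layout:

import glob

# '**' spans subdirectories only when recursive=True is passed
matches = glob.glob('kyoto/**/*-train.ja', recursive=True)
# returns every matching path under kyoto/, regardless of nesting depth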
Example #4
def get(url, file_name):
    data_path = os.path.join(data_dir, file_name)
    cache_name = os.path.splitext(data_path)[0] + '.pkl'
    if os.path.exists(cache_name):
        return read_cache(cache_name)

    # download and unzip data
    utils.download(url, data_path)
    with gzip.open(data_path, 'rb') as f:
        _data = f.read().decode('utf-8')

    _data = utils.full_2_half(utils.unicode_to_ascii(_data))
    _data = _data.replace('\r', '').strip().split('\n\t\n')
    # split each block into lines; each line holds two tab-separated fields,
    # stored in swapped order (second field first)
    _data = [[[line.split('\t')[1].strip(),
               line.split('\t')[0].strip()] for line in block.split('\n')]
             for block in _data]

    cache(cache_name, _data)
    return _data
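Example #4 also relies on `cache` and `read_cache`, which are not shown; given the `.pkl` extension they are presumably thin pickle wrappers. A hedged sketch, with names inferred from the call sites:

import pickle

def cache(path, data):
    # assumed behavior: serialize the parsed data for the early-return above
    with open(path, 'wb') as f:
        pickle.dump(data, f)

def read_cache(path):
    # assumed behavior: load whatever was previously pickled by cache()
    with open(path, 'rb') as f:
        return pickle.load(f)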
Example #5
def unzip_gz(file_path, work_dir):  # signature reconstructed from the call site below; the snippet starts mid-function
    # assumed: drop the trailing '.gz' to name the extracted file
    filename = os.path.basename(file_path)[:-3]
    with gzip.open(file_path, 'rb') as f_in:  # <<========== extraction happens here
        with open(os.path.join(work_dir, filename), 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)


def __add_dict(_dict, k, v):
    if k not in _dict:
        _dict[k] = {'translation': []}
    _dict[k]['translation'].append(v)
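# Usage note (added): __add_dict accumulates every translation seen for a key.
# After __add_dict(d, 'source', 'target a') and __add_dict(d, 'source', 'target b'),
# d == {'source': {'translation': ['target a', 'target b']}}.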


for i in range(0, len(file_names)):
    pack_file = os.path.join(saving_directory, file_names[i])

    if not os.path.exists(pack_file):
        utils.download(urls[i], pack_file)
        unzip_gz(pack_file, saving_directory)
        # shutil.unpack_archive(pack_file, saving_directory)
        print("Downloaded and extracted the training data.")

    if os.path.exists(pack_file):
        os.remove(pack_file)

    target_file = os.path.join(saving_news_dir, names[i] + ".*")
    file_name = glob.glob(target_file, recursive=True)

    with open(file_name[0], encoding="utf-8") as f:
        print("reading file for ", file_name[0])
        read = csv.reader(f, delimiter='\t')

        zh_en_dict = {}