def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    input_file = "{}{}".format(
        input_prefix, ("." + lang) if lang is not None else ""
    )
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )

    def consumer(tensor):
        ds.add_item(tensor)

    stat = BinarizerDataset.export_binarized_dataset(
        input_file,
        vocab,
        consumer,
        add_if_not_exist=False,
        num_workers=num_workers,
    )
    ntok = stat['ntok']
    nseq = stat['nseq']
    nunk = stat['nunk']
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    print(
        "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            nseq,
            ntok,
            100 * nunk / ntok,
            vocab.unk_word,
        )
    )
def binarize(args, filename, vocab, output_prefix, lang, offset, end,
             append_eos=True, copy_from=None):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    words_list = []  # TODO: currently cannot be passed back to the parent process

    def consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def make_binary_dataset(data_dir):
    dict = dictionary.Dictionary.load(data_dir + '/dict.en.txt')
    print('Converting utf8 files to fairseq binary')
    files = glob.glob(data_dir + '/utf8/test*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/test*.de', recursive=True)
    files += glob.glob(data_dir + '/utf8/dev*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/dev*.de', recursive=True)
    files += glob.glob(data_dir + '/utf8/train*.en', recursive=True)
    files += glob.glob(data_dir + '/utf8/train*.de', recursive=True)

    for file in files:
        print('Converting file:', file)
        ds = indexed_dataset.IndexedDatasetBuilder(file + '.bin')

        def consumer(tensor):
            ds.add_item(tensor)

        res = MockTokenizer.binarize(file, dict, consumer)
        ds.finalize(file + '.idx')
def merge_files(files, outpath):
    ds = indexed_dataset.IndexedDatasetBuilder("{}.bin".format(outpath))
    for file in files:
        ds.merge_file_(file)
        os.remove(indexed_dataset.data_file_path(file))
        os.remove(indexed_dataset.index_file_path(file))
    ds.finalize("{}.idx".format(outpath))
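# Hypothetical usage of merge_files() above; the shard prefixes are illustrative and
# assume parallel binarize() workers already wrote train1.en.bin/.idx, train2.en.bin/.idx, ...
# The shards are concatenated in order into a single train.en.bin / train.en.idx pair
# and the temporary shard files are deleted.
worker_shards = ["data-bin/train1.en", "data-bin/train2.en", "data-bin/train3.en"]
merge_files(worker_shards, "data-bin/train.en")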
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def make_binary_dataset(input_prefix, output_prefix, lng_pair, lang, num_workers):
    if not args.joined_dictionary and lang != 'en':
        dict = dictionary.Dictionary.load(tgt_dict_path)
    else:
        dict = dictionary.Dictionary.load(dict_path)
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    input_file = f'{input_prefix}.{lng_pair}.{lang}.tok.bpe'
    if not os.path.exists(input_file):
        input_file = f'{input_prefix}.{lng_pair}.{lang}'
        if not os.path.exists(input_file):
            print("| {} not found".format(input_file))
            return
    if args.expert:
        input_file = input_file + '.e'

    offsets = Tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            fn_without_ext = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
            pool.apply_async(binarize,
                             (input_file, dict, fn_without_ext,
                              offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        f"{output_prefix}.{lng_pair}.{lang}.bin")
    merge_result(
        Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            temp_file_path = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(f"{output_prefix}.{lng_pair}.{lang}.idx")
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
def binarize(filename, dict, fn_without_ext, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(f"{fn_without_ext}.bin")

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(f"{fn_without_ext}.idx")
    return res
def binarize(args, filename, vocab, output_prefix, lang, offset, end, append_eos=True):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Binarizer.binarize(filename, vocab, consumer, append_eos=append_eos,
                             offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def make_binary_proto_dataset(input_prefix, output_prefix, output_token_suffix,
                              output_weight_suffix, soft_proto_dict, proto_k,
                              lang, lang_name):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    print('| [prototype] Dictionary: {} types, {} candidates'.format(
        len(soft_proto_dict), args.proto_k))
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang_name + '.' + output_token_suffix, 'bin'))
    ds_w = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang_name + '.' + output_weight_suffix, 'bin'),
        dtype=np.float32)

    def consumer(tensor):
        ds.add_item(tensor)

    def consumer_weights(tensor):
        ds_w.add_item(tensor)

    input_prefix = os.path.join(args.srcdir, input_prefix)
    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    res = TokenizerProb.binarize(input_file, dict, consumer, consumer_weights,
                                 soft_proto_dict=soft_proto_dict, proto_k=proto_k)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word))
    ds.finalize(
        dataset_dest_path(output_prefix, lang_name + '.' + output_token_suffix, 'idx'))
    ds_w.finalize(
        dataset_dest_path(output_prefix, lang_name + '.' + output_weight_suffix, 'idx'))
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(binarize,
                             (args, input_file, vocab, prefix, lang,
                              offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    merge_result(
        Binarizer.binarize(input_file, vocab, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
    ))
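# Hypothetical driver for the parallel make_binary_dataset() above; the paths and the
# preloaded dictionary are illustrative only (the real script builds these from args).
# The input file is split at line-aligned byte offsets, worker i binarizes chunk
# [offsets[i], offsets[i+1]) into its own shard, and the main process binarizes the
# first chunk itself before merging the shards in order.
vocab = dictionary.Dictionary.load("data-bin/dict.en.txt")
make_binary_dataset(vocab, "corpus/train", "train", lang="en", num_workers=8)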
def make_binary_dataset(input_file, output_prefix, dic, num_workers):
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    offsets = args.tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (
                    args,
                    input_file,
                    prefix,
                    dic,
                    offsets[worker_id],
                    offsets[worker_id + 1],
                ),
                callback=merge_result,
            )
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        output_file(args, output_prefix, "bin"))
    merge_result(
        args.tokenizer.binarize(input_file, dic, lambda t: ds.add_item(t),
                                offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = output_file(args, prefix, '')
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(output_file(args, output_prefix, 'idx'))
    print("| {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
        input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], dic.unk_word))
def binarize(args, input_file, output_prefix, dict, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        output_file(args, output_prefix, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = args.tokenizer.binarize(input_file, dict, consumer, offset=offset, end=end)
    ds.finalize(output_file(args, output_prefix, 'idx'))
    return res
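# Minimal sketch of an object satisfying the args.tokenizer interface the two
# functions above rely on (find_offsets / binarize). Whitespace tokenization and the
# class name WhitespaceTokenizer are assumptions for illustration, not project code;
# dic is assumed to be a fairseq-style Dictionary with index()/eos()/unk().
import os
from collections import Counter

import torch


class WhitespaceTokenizer:

    @staticmethod
    def find_offsets(filename, num_chunks):
        # split the file into num_chunks byte ranges aligned to line boundaries
        with open(filename, "r", encoding="utf-8") as f:
            size = os.fstat(f.fileno()).st_size
            chunk_size = size // num_chunks
            offsets = [0] * (num_chunks + 1)
            offsets[num_chunks] = size
            for i in range(1, num_chunks):
                f.seek(chunk_size * i)
                f.readline()  # advance to the start of the next full line
                offsets[i] = f.tell()
            return offsets

    @staticmethod
    def binarize(filename, dic, consumer, offset=0, end=-1):
        # feed one IntTensor per line to consumer and return the usual statistics dict
        nseq, ntok, replaced = 0, 0, Counter()
        with open(filename, "r", encoding="utf-8") as f:
            f.seek(offset)
            line = f.readline()
            while line:
                if 0 < end < f.tell():
                    break  # this line belongs to the next worker's chunk
                words = line.split()
                ids = [dic.index(w) for w in words] + [dic.eos()]
                replaced.update(w for w, i in zip(words, ids) if i == dic.unk())
                consumer(torch.IntTensor(ids))
                nseq += 1
                ntok += len(ids)
                line = f.readline()
        return {"nseq": nseq, "ntok": ntok, "nunk": sum(replaced.values()),
                "replaced": replaced}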
def make_binary_dataset(input_prefix, output_prefix):
    # dict = dictionary.Dictionary.load(dict_path(lang))
    # print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, 'bin'))

    def consumer(ids):
        ds.add_item(torch.IntTensor(ids))

    # input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    input_file = "{}/{}.bert".format(args.data, input_prefix)
    res = tokenizer.binarize(input_file, consumer)
    ds.finalize(dataset_dest_path(output_prefix, 'idx'))
def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    offsets = Tokenizer.find_offsets(input_file, num_workers)
    print("offsets", offsets)
    pool = None
    if num_workers > 1:
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(binarize,
                             (args, input_file, dict, prefix, lang,
                              offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    merge_result(
        Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
def make_binary_dataset(input_prefix, output_prefix, lang, src_ids=None):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict)))
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    res, ids = Tokenizer.binarize(input_file, dict, consumer, src_ids=src_ids)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% replaced by copy'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word,
        100 * res['ncopied'] / res['ntok']))
    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
    return ids
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    # res looks like: {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced}
    to_print = ['nseq', 'nunk', 'ntok']
    debug_data = {}
    for k, v in res.items():
        if k in to_print:
            debug_data[k] = v
    debug_data['offset'] = offset
    debug_data['end'] = end
    print(debug_data)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def make_binary_audio_dataset(input_prefix, output_prefix, lang):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'), dtype=np.float32)

    def consumer(tensor):
        ds.add_item(tensor)

    def binarize(input_file, audio_reader, consumer):
        nseq, nsamp = 0, 0
        for tensor in audio_reader(input_file):
            consumer(tensor)
            nseq += 1
            nsamp += tensor.size(0)
        return {'nseq': nseq, 'nsamp': nsamp}

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    audio_reader = get_reader(args.format)
    res = binarize(input_file, audio_reader, consumer)
    print('| [{}] {}: {} audio_seq, {} audio_samples'.format(
        lang, input_file, res['nseq'], res['nsamp']))
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
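# Hypothetical audio_reader compatible with make_binary_audio_dataset() above: any
# callable that yields one 1-D float tensor per utterance works with its inner
# binarize(). The soundfile-based reader and the "<utt_id> <wav_path>" manifest format
# below are assumptions for illustration, not the project's get_reader(args.format).
import soundfile as sf
import torch


def wav_manifest_reader(manifest_path):
    with open(manifest_path, encoding="utf8") as f:
        for line in f:
            _, wav_path = line.strip().split(maxsplit=1)
            samples, _sample_rate = sf.read(wav_path, dtype="float32")
            yield torch.from_numpy(samples)  # shape: (num_samples,)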
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    print('input_prefix', input_prefix)
    print(dict_path(lang))
    dict = roberta_dictionary.RobertaDictionary.load(dict_path(lang))
    input_file = "{}{}".format(
        input_prefix, ("." + lang) if lang is not None else ""
    )
    from pytorch_transformers import RobertaTokenizer
    import torch
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    def penn_token2orig_token(sent):
        # map PTB-style bracket tokens (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-, ...)
        # back to the original symbols
        penn2orig = {"``": '"', "''": '"',
                     "-LRB-": '(', "-RRB-": ')',
                     "-LSB-": '[', "-RSB-": ']',
                     "-LCB-": '{', "-RCB-": '}',
                     "-lrb-": '(', "-rrb-": ')',
                     "-lsb-": '[', "-rsb-": ']',
                     "-lcb-": '{', "-rcb-": '}'}
        words = sent.strip().split()
        words = [wd if wd not in penn2orig else penn2orig[wd] for wd in words]
        return ' '.join(words)

    num_token, num_unk_token = 0, 0
    num_seq = 0
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )
    truncated_number = 512 if lang == 'article' else 256
    CLS_TOKEN = '<s>'
    SEP_TOKEN = '</s>'

    if lang == 'article':
        for line in open(input_file, encoding='utf8'):
            article_wids = []
            min_src_sentence = 3
            max_src_sentence = 100
            max_src_ntokens_per_sent = 200
            min_src_ntokens_per_sent = 5
            sents = line.strip().split('<S_SEP>')
            sents = [sent.strip().split() for sent in sents]
            idxs = [i for i, sent in enumerate(sents) if len(sent) > min_src_ntokens_per_sent]
            src = [sents[i][:max_src_ntokens_per_sent] for i in idxs]
            src = src[:max_src_sentence]
            src_txt = [' '.join(sent) for sent in src]
            src_tokens = [tokenizer.tokenize(sent) for sent in src_txt]
            for i, sent in enumerate(src_tokens):
                MAX_SENT_NTOKENS = 500
                if len(sent) > MAX_SENT_NTOKENS:
                    sent = sent[:MAX_SENT_NTOKENS]
                if i == 0:
                    input_text = [CLS_TOKEN] + sent + [SEP_TOKEN]
                else:
                    input_text = [SEP_TOKEN] + sent + [SEP_TOKEN]
                wids = tokenizer.convert_tokens_to_ids(input_text)
                article_wids.extend(wids)
                for wid in wids:
                    if wid == dict.unk_index:
                        num_unk_token += 1
                    num_token += 1
            num_seq += 1
            article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids
            if article_wids[-1] != dict.sep_index:
                article_wids[-1] = dict.sep_index
            tensor = torch.IntTensor(article_wids)
            # print(dict.string_complete(tensor))
            ds.add_item(tensor)
        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    elif lang == 'summary':
        for line in open(input_file, encoding='utf8'):
            article_wids = []
            max_tgt_ntokens = 500
            min_tgt_ntokens = 5
            sents = line.strip().split('<S_SEP>')
            sents = [tokenizer.tokenize(sent) for sent in sents]
            for i, sent in enumerate(sents):
                # sometimes a single sentence has too many tokens; specifically, 8 sentences
                # in the training articles are longer than 512 tokens, so they could be
                # truncated to 500 here:
                # MAX_SENT_LEN = 500
                # if len(sent) > MAX_SENT_LEN:
                #     sent = sent[:MAX_SENT_LEN]
                if i != 0:
                    input_text = [SEP_TOKEN] + sent
                else:
                    input_text = sent
                wids = tokenizer.convert_tokens_to_ids(input_text)
                # wtoks = tokenizer.convert_ids_to_tokens(wids)
                # wstring = tokenizer.convert_tokens_to_string(wtoks)
                # wids_vocab = [dict.index(word) for word in input_text]
                # assert wids == wids_vocab, 'word indices should be the same!'
                article_wids.extend(wids)
                for wid in wids:
                    if wid == dict.unk_index:
                        num_unk_token += 1
                    num_token += 1
            num_seq += 1
            article_wids = article_wids[:truncated_number] if len(article_wids) > truncated_number else article_wids
            if article_wids[-1] == dict.sep_index:
                article_wids = article_wids[:len(article_wids) - 1]
            # print(article_wids)
            if len(article_wids) > truncated_number:
                print('lang: %s, token len: %d, truncated len: %d' % (
                    lang, len(article_wids), truncated_number))
            tensor = torch.IntTensor(article_wids)
            # print(dict.string_complete(tensor))
            ds.add_item(tensor)
        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, num_seq, num_token, 100 * num_unk_token / num_token,
        dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
def make_select_databin(data_ids, prefix_dir="eval-select-bin", prefix_name="eval"):
    # prefix = "valid"
    def consumer(ds, ids):
        ds.add_item(torch.IntTensor(ids))

    def consumer_float(ds, ids):
        ds.add_item(torch.FloatTensor(ids))

    query_bin = "/nas/qsj/data-bin-v2/{}/{}-query.bin".format(prefix_dir, prefix_name)
    query_idx = "/nas/qsj/data-bin-v2/{}/{}-query.idx".format(prefix_dir, prefix_name)
    passage_bin = "/nas/qsj/data-bin-v2/{}/{}-passage.bin".format(prefix_dir, prefix_name)
    passage_idx = "/nas/qsj/data-bin-v2/{}/{}-passage.idx".format(prefix_dir, prefix_name)
    target_bin = "/nas/qsj/data-bin-v2/{}/{}-target.bin".format(prefix_dir, prefix_name)
    target_idx = "/nas/qsj/data-bin-v2/{}/{}-target.idx".format(prefix_dir, prefix_name)

    query_ds = indexed_dataset.IndexedDatasetBuilder(query_bin)
    passage_ds = indexed_dataset.IndexedDatasetBuilder(passage_bin)
    target_ds = indexed_dataset.IndexedDatasetBuilder(target_bin, dtype=np.float64)

    id_to_index_file = open(
        "/nas/qsj/data-bin-v2/{}/{}-id-to-index.pk".format(prefix_dir, prefix_name), "wb")
    id_to_index = {}
    data_dict = pickle.load(open(data_ids, "rb"))
    id_ = 0
    above_10 = 0
    below_10 = 0
    for key, data in data_dict.items():
        id_to_index[str(id_)] = key
        id_ += 1
        ids_query = data["ids_query"]
        ids_passages = data["ids_passages"]
        ids_selected = data["selected"]
        assert len(ids_passages) == len(ids_selected)
        len_ = len(ids_passages)
        if len_ > 10:
            ids_passages = ids_passages[:10]
            ids_selected = ids_selected[:10]
            above_10 += 1
        if len_ < 10:
            # pad to exactly 10 passages by repeating the last one
            for _ in range(len_, 10):
                ids_passages.append(ids_passages[-1])
                ids_selected.append(ids_selected[-1])
            below_10 += 1
        assert len(ids_passages) == 10
        assert len(ids_passages) == len(ids_selected)
        for ids_p in ids_passages:
            consumer(query_ds, ids_query)
            consumer(passage_ds, ids_p)
        if prefix_name == "train":
            total_sum = sum(ids_selected)
            if total_sum > 1:
                for i in range(10):
                    ids_selected[i] /= total_sum
        consumer_float(target_ds, ids_selected)

    query_ds.finalize(query_idx)
    passage_ds.finalize(passage_idx)
    target_ds.finalize(target_idx)
    pickle.dump(id_to_index, id_to_index_file)
    print("| above_10 {}, below_10 {}".format(above_10, below_10))
def augument_shffle_top5(passages, queries, answers, prefix="train", dir_file="top-5-qa+nlg+shuffle"):
    def consumer(ds, ids):
        ds.add_item(torch.IntTensor(ids))

    query_bin = "/nas/qsj/data-bin-v2/{}/{}-query.bin".format(dir_file, prefix)
    query_idx = "/nas/qsj/data-bin-v2/{}/{}-query.idx".format(dir_file, prefix)
    passage_1_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-1.bin".format(dir_file, prefix)
    passage_1_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-1.idx".format(dir_file, prefix)
    passage_2_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-2.bin".format(dir_file, prefix)
    passage_2_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-2.idx".format(dir_file, prefix)
    passage_3_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-3.bin".format(dir_file, prefix)
    passage_3_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-3.idx".format(dir_file, prefix)
    passage_4_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-4.bin".format(dir_file, prefix)
    passage_4_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-4.idx".format(dir_file, prefix)
    passage_5_bin = "/nas/qsj/data-bin-v2/{}/{}-passage-5.bin".format(dir_file, prefix)
    passage_5_idx = "/nas/qsj/data-bin-v2/{}/{}-passage-5.idx".format(dir_file, prefix)
    target_bin = "/nas/qsj/data-bin-v2/{}/{}-target.bin".format(dir_file, prefix)
    target_idx = "/nas/qsj/data-bin-v2/{}/{}-target.idx".format(dir_file, prefix)

    query_ds = indexed_dataset.IndexedDatasetBuilder(query_bin)
    passage_1_ds = indexed_dataset.IndexedDatasetBuilder(passage_1_bin)
    passage_2_ds = indexed_dataset.IndexedDatasetBuilder(passage_2_bin)
    passage_3_ds = indexed_dataset.IndexedDatasetBuilder(passage_3_bin)
    passage_4_ds = indexed_dataset.IndexedDatasetBuilder(passage_4_bin)
    passage_5_ds = indexed_dataset.IndexedDatasetBuilder(passage_5_bin)
    target_ds = indexed_dataset.IndexedDatasetBuilder(target_bin)

    for passage, query, answer in zip(passages, queries, answers):
        passage_shuffle = passage[2:]
        consumer(query_ds, query)
        consumer(passage_1_ds, passage[0])
        consumer(passage_2_ds, passage[1])
        consumer(passage_3_ds, passage_shuffle[0])
        consumer(passage_4_ds, passage_shuffle[1])
        consumer(passage_5_ds, passage_shuffle[2])
        consumer(target_ds, answer)
        # random.shuffle(passage_shuffle)
        # consumer(query_ds, query)
        # consumer(passage_1_ds, passage[0])
        # consumer(passage_2_ds, passage[1])
        # consumer(passage_3_ds, passage_shuffle[0])
        # consumer(passage_4_ds, passage_shuffle[1])
        # consumer(passage_5_ds, passage_shuffle[2])
        # consumer(target_ds, answer)

    query_ds.finalize(query_idx)
    passage_1_ds.finalize(passage_1_idx)
    passage_2_ds.finalize(passage_2_idx)
    passage_3_ds.finalize(passage_3_idx)
    passage_4_ds.finalize(passage_4_idx)
    passage_5_ds.finalize(passage_5_idx)
    target_ds.finalize(target_idx)
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers, copy_src_words=None):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()
    copyied = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result["replaced"])
        copyied.update(worker_result["copied"])
        n_seq_tok[0] += worker_result["nseq"]
        n_seq_tok[1] += worker_result["ntok"]

    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    offsets = Binarizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:  # TODO: the multi-worker path does not support the copy mechanism yet
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(binarize,
                             (args, input_file, vocab, prefix, lang,
                              offsets[worker_id], offsets[worker_id + 1]),
                             callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    words_list = []

    def binarize_consumer(ids, words):
        ds.add_item(ids)
        words_list.append(words)

    merge_result(
        Binarizer.binarize(input_file, vocab, binarize_consumer,
                           reverse_order=args.reverse_order,
                           offset=0, end=offsets[1],
                           copy_ext_dict=args.copy_ext_dict,
                           copy_src_words=copy_src_words))
    if num_workers > 1:
        pool.join()
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    print(
        "| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% <unk> copied from src"
        .format(lang, input_file, n_seq_tok[0], n_seq_tok[1],
                100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word,
                100 * sum(copyied.values()) / n_seq_tok[1]))
    return words_list
def make_binary_dataset(input_prefix, output_prefix, lang, append_eos=False):
    if lang == args.target_lang:
        dict = flexible_dictionary.FlexibleDictionary.load(dict_path(lang))
    else:
        # dict = bert_dictionary.BertDictionary.load(dict_path(lang))
        dict = gpt2_dictionary.GPT2Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types | {} types (for real)'.format(lang, len(dict) - 1, len(dict)))
    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    if lang == args.target_lang:
        res = Tokenizer.binarize(input_file, dict, consumer, append_eos=append_eos)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'],
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
    else:
        # read the article side with a pretrained subword tokenizer
        # from pytorch_pretrained_bert.tokenization import BertTokenizer
        # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        from pytorch_transformers import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def penn_token2orig_token(sent):
            # map PTB-style bracket tokens (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-, ...)
            # back to the original symbols
            # penn2orig = {"``": '"', "''": '"',
            #              "-LRB-": '(', "-RRB-": ')',
            #              "-LSB-": '[', "-RSB-": ']',
            #              "-LCB-": '{', "-RCB-": '}'}
            penn2orig = {"-LRB-": '(', "-RRB-": ')',
                         "-LSB-": '[', "-RSB-": ']',
                         "-LCB-": '{', "-RCB-": '}',
                         "-lrb-": '(', "-rrb-": ')',
                         "-lsb-": '[', "-rsb-": ']',
                         "-lcb-": '{', "-rcb-": '}'}
            words = sent.strip().split()
            words = [wd if wd not in penn2orig else penn2orig[wd] for wd in words]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        skip_line = 0
        for line in open(input_file, encoding='utf8'):
            sents = line.strip().split('<S_SEP>')
            sents = sents[0:args.max_num_sentences]
            sents = [' '.join(sent.strip().split()[0:args.max_num_words]) for sent in sents]
            # print(sents)
            sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
            article_wids = []
            for i, sent in enumerate(sents):
                # skip sentences that are still too long after word-level truncation
                MAXLEN = 500
                if len(sent) > MAXLEN:
                    # sent = sent[0:MAXLEN]
                    print(' '.join(sent))
                    skip_line += 1
                    print(skip_line)
                    continue
                if i != 0:
                    article_wids.append(dict.sep_index)
                wids = tokenizer.convert_tokens_to_ids(sent)
                # wids_vocab = [dict.index(word) for word in sent]
                # assert wids == wids_vocab, 'word indices should be the same!'
                article_wids.extend(wids)
                for wid in wids:
                    if wid == dict.unk_index:
                        num_unk_token += 1
                    num_token += 1
            num_seq += 1
            tensor = torch.IntTensor(article_wids)
            # print(dict.string_complete(tensor))
            ds.add_item(tensor)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token, 100 * num_unk_token / num_token,
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))

    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    print('input_prefix', input_prefix)
    print(dict_path(lang))
    dict = roberta_dictionary.RobertaDictionary.load(dict_path(lang))
    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    from pytorch_transformers import RobertaTokenizer
    import torch
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

    def penn_token2orig_token(sent):
        # map PTB-style bracket tokens (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-)
        # back to the original symbols
        penn2orig = {"``": '"', "''": '"',
                     "-LRB-": '(', "-RRB-": ')',
                     "-LSB-": '[', "-RSB-": ']',
                     "-LCB-": '{', "-RCB-": '}'}
        words = sent.strip().split()
        words = [wd if wd not in penn2orig else penn2orig[wd] for wd in words]
        return ' '.join(words)

    num_token, num_unk_token = 0, 0
    num_seq = 0
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    output_ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, 'article_next', "bin"))
    truncated_number = 512
    output_length = 256
    CLS_TOKEN = '<s>'
    SEP_TOKEN = '</s>'

    for line in open(input_file, encoding='utf8'):
        sents = line.strip().split('<S_SEP>')
        sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
        article_toks = []
        for i, sent in enumerate(sents):
            if i != 0:
                article_toks.append(SEP_TOKEN)
            article_toks.extend(sent)
        article_segments = []
        output_segments = []
        tmp_seg = []
        for i, tok in enumerate(article_toks):
            if len(tmp_seg) == 0:
                tmp_seg.append(CLS_TOKEN)
            tmp_seg.append(tok)
            if tok == SEP_TOKEN:
                tmp_seg.append(tok)
            if len(tmp_seg) >= truncated_number:
                tmp_seg = tmp_seg[:truncated_number]
                if tmp_seg[-1] != SEP_TOKEN:
                    tmp_seg[-1] = SEP_TOKEN
                tmp_output = article_toks[i + 1:min(i + 1 + output_length, len(article_toks))]
                if len(tmp_output) < 0.3 * output_length:
                    break
                article_segments.append(tokenizer.convert_tokens_to_ids(tmp_seg))
                output_segments.append(tokenizer.convert_tokens_to_ids(tmp_output))
                tmp_seg = []
        assert len(article_segments) == len(output_segments)
        for i in range(len(article_segments)):
            assert len(article_segments[i]) <= truncated_number
            assert len(output_segments[i]) <= output_length and len(output_segments[i]) >= 0.3 * output_length
            tensor = torch.IntTensor(article_segments[i])
            ds.add_item(tensor)
            output_tensor = torch.IntTensor(output_segments[i])
            output_ds.add_item(output_tensor)

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    output_ds.finalize(dataset_dest_file(args, output_prefix, 'article_next', "idx"))
    print('done!')
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    print('input_prefix', input_prefix)
    print(dict_path(lang))
    dict = bert_dictionary.BertDictionary.load(dict_path(lang))
    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    from pytorch_transformers import BertTokenizer
    import torch
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

    def penn_token2orig_token(sent):
        # map PTB-style bracket tokens (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-)
        # back to the original symbols
        penn2orig = {"``": '"', "''": '"',
                     "-LRB-": '(', "-RRB-": ')',
                     "-LSB-": '[', "-RSB-": ']',
                     "-LCB-": '{', "-RCB-": '}'}
        words = sent.strip().split()
        words = [wd if wd not in penn2orig else penn2orig[wd] for wd in words]
        return ' '.join(words)

    num_token, num_unk_token = 0, 0
    num_seq = 0
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))
    output_ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, 'article_next', "bin"))
    article_input = 511
    article_next = 256
    BERT_CLS_ID = tokenizer.convert_tokens_to_ids([BERT_CLS])[0]
    BERT_SEP_ID = tokenizer.convert_tokens_to_ids([BERT_SEP])[0]

    for line in open(input_file, encoding='utf8'):
        sents = line.strip().split('<S_SEP>')
        sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
        article_wids = []
        for i, sent in enumerate(sents):
            if i != 0:
                article_wids.append(dict.sep_index)
            if len(sent) > article_input:
                wids = []
                temp_sent = [sent[x:x + article_input] for x in range(0, len(sent), article_input)]
                for se in temp_sent:
                    se_ids = tokenizer.convert_tokens_to_ids(se)
                    wids.extend(se_ids)
            else:
                wids = tokenizer.convert_tokens_to_ids(sent)
            # wids_vocab = [dict.index(word) for word in sent]
            # assert wids == wids_vocab, 'word indices should be the same!'
            article_wids.extend(wids)
            for wid in wids:
                if wid == dict.unk_index:
                    num_unk_token += 1
                num_token += 1

        article_segments = [article_wids[x:x + article_input] for x in range(0, len(article_wids), article_input)]
        cur_position = 0
        for i in range(len(article_segments)):
            article_seq = article_segments[i]
            cur_position += len(article_seq)
            output_seg = article_wids[cur_position:min(len(article_wids), cur_position + article_next)]
            if len(output_seg) < 0.3 * article_next:
                continue
            num_seq += 1
            if len(article_seq) > article_input:
                print('lang: %s, token len: %d, truncated len: %d' % (
                    lang, len(article_seq), article_input))
            if lang == 'article':
                if article_seq[-1] != BERT_SEP_ID:
                    if article_seq[-2] != BERT_SEP_ID:
                        article_seq[-1] = BERT_SEP_ID
                article_seq = [BERT_CLS_ID] + article_seq
            if len(output_seg) > article_next:
                print('lang: article_next, token len: %d, truncated len: %d' % (
                    len(output_seg), article_next))
            tensor = torch.IntTensor(article_seq)
            ds.add_item(tensor)
            output_tensor = torch.IntTensor(output_seg)
            output_ds.add_item(output_tensor)

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    output_ds.finalize(dataset_dest_file(args, output_prefix, 'article_next', "idx"))
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, num_seq, num_token, 100 * num_unk_token / num_token,
        dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
def make_binary_dataset(vocab, input_prefix, output_prefix, lang, num_workers):
    print("| [{}] Dictionary: {} types".format(lang, len(vocab) - 1))
    print('input_prefix', input_prefix)
    print(dict_path(lang))
    dict = xlnet_dictionary.XLNetDictionary.load(dict_path(lang))
    input_file = "{}{}".format(input_prefix, ("." + lang) if lang is not None else "")
    from pytorch_transformers import XLNetConfig, XLNetTokenizer
    import torch
    tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')

    def penn_token2orig_token(sent):
        # map PTB-style bracket tokens (-LRB-, -RRB-, -LSB-, -RSB-, -LCB-, -RCB-)
        # back to the original symbols
        penn2orig = {"``": '"', "''": '"',
                     "-LRB-": '(', "-RRB-": ')',
                     "-LSB-": '[', "-RSB-": ']',
                     "-LCB-": '{', "-RCB-": '}'}
        words = sent.strip().split()
        words = [wd if wd not in penn2orig else penn2orig[wd] for wd in words]
        return ' '.join(words)

    num_token, num_unk_token = 0, 0
    num_seq = 0
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin"))

    for line in open(input_file, encoding='utf8'):
        sents = line.strip().split('<S_SEP>')
        sents = [tokenizer.tokenize(penn_token2orig_token(sent)) for sent in sents]
        article_wids = []
        for i, sent in enumerate(sents):
            if i != 0:
                article_wids.append(dict.sep_index)
            wids = tokenizer.convert_tokens_to_ids(sent)
            # wids_vocab = [dict.index(word) for word in sent]
            # assert wids == wids_vocab, 'word indices should be the same!'
            article_wids.extend(wids)
            for wid in wids:
                if wid == dict.unk_index:
                    num_unk_token += 1
                num_token += 1
        num_seq += 1
        tensor = torch.IntTensor(article_wids)
        # print(dict.string_complete(tensor))
        ds.add_item(tensor)

    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, num_seq, num_token, 100 * num_unk_token / num_token,
        dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
def make_binary_dataset(input_prefix, output_prefix, lang):
    # debugging: process only targets
    # if 'src' in lang:
    # if 'tgt' in lang:
    #     print("skip src files...")
    #     # print("skip tgt files...")
    #     return
    dict = load_dictionary(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

    annotator = None
    aconsumer = None
    if args.addAnnotations:
        brigramFile = None
        if os.path.isfile(args.addAnnotations + "_bigrams"):
            brigramFile = args.addAnnotations + "_bigrams"
        if args.addAnnotations.endswith(".mallet"):
            annotator = MalletLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile)
        else:
            annotator = GensimLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile)
        # generate embeddings for topic keywords
        annotator.generateKeywordEmbeddings(dataset_dest_path(output_prefix + "_keyEmbeddings", None, 'txt'))
        annotator.generateKeywordDict(dataset_dest_path(output_prefix + "_keyVocab", None, 'txt'))
        ads = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix + "_keys", lang, 'bin'), dtype=np.double)

        def aconsumer(tensor):
            ads.add_item(tensor)

        awds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix + "_keywords", lang, 'bin'))

        def awconsumer(tensor):
            awds.add_item(tensor)

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    if args.singleSeq:
        if '.src' in input_file:
            res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, L=args.L,
                                                      aconsumer=aconsumer, annotator=annotator)
            TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
        elif '.tgt' in input_file:
            res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer,
                                                      aconsumer=aconsumer, annotator=annotator)
            TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
        else:
            res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, append_eos=False,
                                                      aconsumer=aconsumer, annotator=annotator)
            TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
    else:
        if '.src' in input_file:
            res = TokenizerWCSParagraph.binarize(input_file, dict, consumer,
                                                 max_chunk_length=args.src_chunk_length, L=args.L,
                                                 aconsumer=aconsumer, awconsumer=awconsumer,
                                                 annotator=annotator)
            TokenizerWCSParagraph.printStats(res, lang, input_file, dict)
        else:
            res = TokenizerWCSSentence.binarize(input_file, dict, consumer,
                                                max_chunk_length=args.tgt_chunk_length,
                                                aconsumer=aconsumer, annotator=annotator)
            TokenizerWCSSentence.printStats(res, lang, input_file, dict)

    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
    if args.addAnnotations:
        ads.finalize(dataset_dest_path(output_prefix + "_keys", lang, 'idx'))
        awds.finalize(dataset_dest_path(output_prefix + "_keywords", lang, 'idx'))