def make_binary_dataset(input_prefix, output_prefix, lang, guess):
    dict = dictionary.Dictionary.load(
        os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    # The "guess" variant reads <input_prefix>.<lang>.guess and writes a
    # *.guess.bin/.idx pair; otherwise the plain files are used.
    if not guess:
        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix,
                                        args.source_lang, args.target_lang, lang))
        input_file = '{}.{}'.format(input_prefix, lang)
    else:
        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.guess.bin'.format(args.destdir, output_prefix,
                                              args.source_lang, args.target_lang, lang))
        input_file = '{}.{}.guess'.format(input_prefix, lang)

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(input_file, dict, consumer)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word))
    ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix,
                                            args.source_lang, args.target_lang, lang))
def binarize(filename, dict, fn_without_ext, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(f"{fn_without_ext}.bin")

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(f"{fn_without_ext}.idx")
    return res
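# Hedged usage sketch (not part of the source): binarize() above writes one
# .bin/.idx shard for the byte range [offset, end) of the input file. The
# shard bounds come from Tokenizer.find_offsets, which splits at line
# boundaries (as in the multi-worker variants below). The file names,
# dictionary path, and worker index here are illustrative.
dict = dictionary.Dictionary.load('dict.en.txt')
offsets = Tokenizer.find_offsets('train.en', 4)   # 4 workers -> 5 boundary offsets
stats = binarize('train.en', dict, 'train.en.worker1',
                 offset=offsets[1], end=offsets[2])
print(stats['nseq'], stats['ntok'])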
def make_binary_dataset(input_prefix, output_prefix, lng_pair, lang, num_workers):
    if not args.joined_dictionary and lang != 'en':
        dict = dictionary.Dictionary.load(tgt_dict_path)
    else:
        dict = dictionary.Dictionary.load(dict_path)
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    # Prefer the tokenized+BPE file; fall back to the raw pair file.
    input_file = f'{input_prefix}.{lng_pair}.{lang}.tok.bpe'
    if not os.path.exists(input_file):
        input_file = f'{input_prefix}.{lng_pair}.{lang}'
        if not os.path.exists(input_file):
            print("| {} not found".format(input_file))
            return
    if args.expert:
        input_file = input_file + '.e'

    offsets = Tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # Workers 1..n-1 binarize their byte ranges into per-worker shards;
        # the parent process handles the first range itself below.
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            fn_without_ext = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
            pool.apply_async(
                binarize,
                (input_file, dict, fn_without_ext,
                 offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        f"{output_prefix}.{lng_pair}.{lang}.bin")
    merge_result(
        Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        # Append the worker shards in order, then delete the temporaries.
        for worker_id in range(1, num_workers):
            temp_file_path = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(f"{output_prefix}.{lng_pair}.{lang}.idx")
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def make_binary_dataset(input_prefix, output_prefix, lang):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix,
                               ('.' + lang) if lang is not None else '')
    res = Tokenizer.binarize(input_file, dict, consumer)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word))
    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    input_file = '{}{}'.format(input_prefix,
                               ('.' + lang) if lang is not None else '')
    offsets = Tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # Workers 1..n-1 binarize their byte ranges into per-worker shards;
        # the parent process handles the first range itself below.
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, dict, prefix, lang,
                 offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    merge_result(
        Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        # Append the worker shards in order, then delete the temporaries.
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))
def make_binary_dataset(input_prefix, output_prefix, lang, src_ids=None):
    dict = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict)))
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix,
                               ('.' + lang) if lang is not None else '')
    res, ids = Tokenizer.binarize(input_file, dict, consumer, src_ids=src_ids)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, '
          '{:.3}% replaced by copy'.format(
              lang, input_file, res['nseq'], res['ntok'],
              100 * res['nunk'] / res['ntok'], dict.unk_word,
              100 * res['ncopied'] / res['ntok']))
    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
    return ids
def binarize(args, filename, dict, output_prefix, lang, offset, end, append_eos=False):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(
        filename, dict, consumer, offset=offset, end=end, append_eos=append_eos
    )
    ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
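# Hedged sketch (assumption, not from the source): append_eos is forwarded to
# Tokenizer.binarize, so a caller can give the target side an explicit
# end-of-sentence symbol while leaving the source side bare. args, dict, and
# offsets are assumed to exist as in the surrounding functions; the paths and
# shard indices are illustrative.
src_res = binarize(args, 'train.de', dict, 'train1', 'de',
                   offsets[1], offsets[2])
tgt_res = binarize(args, 'train.en', dict, 'train1', 'en',
                   offsets[1], offsets[2], append_eos=True)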
def make_binary_dataset(input_prefix, output_prefix, lang):
    dict = dictionary.Dictionary.load(
        os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
    print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
    ds = indexed_dataset.IndexedDatasetBuilder(
        '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix,
                                    args.source_lang, args.target_lang, lang)
    )

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}.{}'.format(input_prefix, lang)
    res = Tokenizer.binarize(input_file, dict, consumer)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], dict.unk_word))
    ds.finalize('{}/{}.{}-{}.{}.idx'.format(
        args.destdir, output_prefix, args.source_lang, args.target_lang, lang))
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    # res looks like:
    # {'nseq': nseq, 'nunk': sum(replaced.values()), 'ntok': ntok, 'replaced': replaced}
    to_print = ['nseq', 'nunk', 'ntok']
    debug_data = {}
    for k, v in res.items():  # was res.items (missing call parentheses)
        if k in to_print:
            debug_data[k] = v
    debug_data['offset'] = offset
    debug_data['end'] = end
    print(debug_data)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def make_binary_dataset(input_prefix, output_prefix, lang, append_eos=False):
    if lang == args.target_lang:
        dict = flexible_dictionary.FlexibleDictionary.load(dict_path(lang))
    else:
        # dict = bert_dictionary.BertDictionary.load(dict_path(lang))
        dict = gpt2_dictionary.GPT2Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types | {} types (for real)'.format(
        lang, len(dict) - 1, len(dict)))
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    input_file = '{}{}'.format(input_prefix,
                               ('.' + lang) if lang is not None else '')
    if lang == args.target_lang:
        res = Tokenizer.binarize(input_file, dict, consumer, append_eos=append_eos)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'],
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
    else:
        # Binarize the article side with a pretrained subword tokenizer.
        # from pytorch_pretrained_bert.tokenization import BertTokenizer
        # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        from pytorch_transformers import RobertaTokenizer
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

        def penn_token2orig_token(sent):
            # Map PTB bracket tokens (-LRB-, -RRB-, ...) back to the
            # original characters before subword tokenization.
            penn2orig = {"-LRB-": '(', "-RRB-": ')',
                         "-LSB-": '[', "-RSB-": ']',
                         "-LCB-": '{', "-RCB-": '}',
                         "-lrb-": '(', "-rrb-": ')',
                         "-lsb-": '[', "-rsb-": ']',
                         "-lcb-": '{', "-rcb-": '}'}
            words = sent.strip().split()
            words = [penn2orig.get(wd, wd) for wd in words]
            return ' '.join(words)

        num_token, num_unk_token = 0, 0
        num_seq = 0
        skip_line = 0
        with open(input_file, encoding='utf8') as f:
            for line in f:
                sents = line.strip().split('<S_SEP>')
                sents = sents[0:args.max_num_sentences]
                sents = [' '.join(sent.strip().split()[0:args.max_num_words])
                         for sent in sents]
                sents = [tokenizer.tokenize(penn_token2orig_token(sent))
                         for sent in sents]
                article_wids = []
                for i, sent in enumerate(sents):
                    # Skip sentences that exceed the encoder limit instead of
                    # truncating them.
                    MAXLEN = 500
                    if len(sent) > MAXLEN:
                        print(' '.join(sent))
                        skip_line += 1
                        print(skip_line)
                        continue
                    if i != 0:
                        # Separate sentences with the dictionary's SEP symbol.
                        article_wids.append(dict.sep_index)
                    wids = tokenizer.convert_tokens_to_ids(sent)
                    # wids_vocab = [dict.index(word) for word in sent]
                    # assert wids == wids_vocab, 'word indices should be the same!'
                    article_wids.extend(wids)
                    for wid in wids:
                        if wid == dict.unk_index:
                            num_unk_token += 1
                        num_token += 1
                num_seq += 1
                tensor = torch.IntTensor(article_wids)
                ds.add_item(tensor)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, num_seq, num_token,
            100 * num_unk_token / num_token,
            dict.unk_word if hasattr(dict, 'unk_word') else '<no_unk_word>'))
    ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
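# Hedged driver sketch (assumed, modeled on fairseq-style preprocess scripts;
# not part of the source): the RoBERTa branch above binarizes the article
# (source) side, while the target side goes through Tokenizer.binarize;
# append_eos=True gives the decoder an explicit end-of-sentence symbol.
# args.trainpref is assumed to exist alongside args.source_lang/args.target_lang.
make_binary_dataset(args.trainpref, 'train', args.source_lang)
make_binary_dataset(args.trainpref, 'train', args.target_lang, append_eos=True)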