def build_dictionary(cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8):
    """Build and finalize a Dictionary from a list of text files.

    Args:
        filenames (list): paths of the text files to count words from
        workers (int): number of concurrent counting workers
        threshold (int): minimum count for a word to be kept
        nwords (int): total size of the final dictionary, special
            symbols included
        padding_factor (int): pad the dictionary size to a multiple of
            this value (useful on hardware such as Nvidia Tensor Cores)
    """
    word_dict = Dictionary()
    for fname in filenames:
        Tokenizer.add_file_to_dictionary(fname, word_dict, tokenizer.tokenize_line, workers)
    word_dict.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return word_dict
def build_dictionary(
    filenames,
    workers,
    dict_cls=dictionary.Dictionary,
):
    """Count words from *filenames* into a new dictionary of type *dict_cls*."""
    word_dict = dict_cls()
    for fname in filenames:
        Tokenizer.add_file_to_dictionary(fname, word_dict, tokenize_line, workers)
    return word_dict
def make_binary_dataset(input_prefix, output_prefix, lng_pair, lang, num_workers):
    """Binarize one side of a language pair into an indexed dataset,
    optionally fanning out byte ranges to worker processes."""
    # Non-English sides use the target dictionary unless a joined one is used.
    if not args.joined_dictionary and lang != 'en':
        vocab = dictionary.Dictionary.load(tgt_dict_path)
    else:
        vocab = dictionary.Dictionary.load(dict_path)
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))

    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        # Fold one worker's statistics into the shared totals.
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    # Prefer the tokenized+BPE file; fall back to the plain pair file.
    input_file = f'{input_prefix}.{lng_pair}.{lang}.tok.bpe'
    if not os.path.exists(input_file):
        input_file = f'{input_prefix}.{lng_pair}.{lang}'
        if not os.path.exists(input_file):
            print("| {} not found".format(input_file))
            return
    if args.expert:
        input_file = input_file + '.e'

    offsets = Tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # Workers 1..N-1 binarize their slice into temp shards; the first
        # chunk is processed inline below.
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            fn_without_ext = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
            pool.apply_async(
                binarize,
                (input_file, vocab, fn_without_ext,
                 offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        f"{output_prefix}.{lng_pair}.{lang}.bin")
    merge_result(
        Tokenizer.binarize(input_file, vocab, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))

    if num_workers > 1:
        pool.join()
        # Stitch worker shards onto the main dataset and drop the temp files.
        for worker_id in range(1, num_workers):
            temp_file_path = f"{output_prefix}{worker_id}.{lng_pair}.{lang}"
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(f"{output_prefix}.{lng_pair}.{lang}.idx")
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word))
def test_finalize(self):
    """Finalizing re-sorts dictionary ids by frequency and the result
    survives a save/load round trip."""
    txt = [
        'A B C D',
        'B C D',
        'C D',
        'D',
    ]
    # ids assigned in insertion order (before finalize)...
    ref_ids1 = list(map(torch.IntTensor, [
        [4, 5, 6, 7, 2],
        [5, 6, 7, 2],
        [6, 7, 2],
        [7, 2],
    ]))
    # ...and after finalize (most frequent word gets the lowest id)
    ref_ids2 = list(map(torch.IntTensor, [
        [7, 6, 5, 4, 2],
        [6, 5, 4, 2],
        [5, 4, 2],
        [4, 2],
    ]))

    # build dictionary
    d = Dictionary()
    for line in txt:
        Tokenizer.tokenize(line, d, add_if_not_exist=True)

    def get_ids(dictionary):
        return [
            Tokenizer.tokenize(line, dictionary, add_if_not_exist=False)
            for line in txt
        ]

    def assertMatch(ids, ref_ids):
        for toks, ref_toks in zip(ids, ref_ids):
            self.assertEqual(toks.size(), ref_toks.size())
            self.assertEqual(0, (toks != ref_toks).sum().item())

    assertMatch(get_ids(d), ref_ids1)

    # check finalized dictionary
    d.finalize()
    finalized_ids = get_ids(d)
    assertMatch(finalized_ids, ref_ids2)

    # write to disk and reload
    with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
        d.save(tmp_dict.name)
        d = Dictionary.load(tmp_dict.name)
        reload_ids = get_ids(d)
        assertMatch(reload_ids, ref_ids2)
        assertMatch(finalized_ids, reload_ids)
def load_dataset2(self, split, **kwargs):
    """Load source/target sentences plus per-sentence HTER scores for *split*."""
    data_path = self.args.data[0]
    prefix = os.path.join(
        data_path,
        split + "." + self.args.source_lang + "-" + self.args.target_lang)

    def read_side(path, vocab):
        # Tokenize one side of the corpus; returns sentences and lengths.
        sents, lens = [], []
        with open(path, encoding='utf-8') as fh:
            for raw in fh:
                toks = Tokenizer.tokenize(raw.strip(), vocab, add_if_not_exist=False)
                sents.append(toks)
                lens.append(toks.numel())
        return sents, lens

    src_sentences, src_lengths = read_side(
        prefix + "." + self.args.source_lang, self.src_dict)
    tgt_sentences, tgt_lengths = read_side(
        prefix + "." + self.args.target_lang, self.tgt_dict)

    # One HTER score per sentence pair, stored as a 1-element float tensor.
    hters = []
    with open(prefix + "." + self.args.hter_lang, encoding='utf-8') as fh:
        for raw in fh:
            hters.append(torch.FloatTensor([float(raw.strip())]))

    self.datasets[split] = LanguagePairHTERDataset(
        src_sentences, src_lengths, self.src_dict,
        tgt_sentences, tgt_lengths, self.tgt_dict,
        left_pad_source=self.args.left_pad_source,
        left_pad_target=self.args.left_pad_target,
        max_source_positions=self.args.max_source_positions,
        max_target_positions=self.args.max_target_positions,
        hter=hters,
        hter_sizes=torch.ones(len(hters)))
def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
    """Binarize ``input_prefix(.lang)`` into an indexed .bin/.idx dataset,
    fanning byte ranges out to worker processes when ``num_workers > 1``.

    Fixes: removed a leftover debug print of the chunk offsets; renamed the
    local ``dict`` so it no longer shadows the builtin.
    """
    vocab = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))
    n_seq_tok = [0, 0]
    replaced = Counter()

    def merge_result(worker_result):
        # Accumulate per-worker statistics.
        replaced.update(worker_result['replaced'])
        n_seq_tok[0] += worker_result['nseq']
        n_seq_tok[1] += worker_result['ntok']

    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    offsets = Tokenizer.find_offsets(input_file, num_workers)
    pool = None
    if num_workers > 1:
        # Workers 1..N-1 each binarize their own byte range into a temp
        # shard; the first chunk is processed inline below.
        pool = Pool(processes=num_workers - 1)
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            pool.apply_async(
                binarize,
                (args, input_file, vocab, prefix, lang,
                 offsets[worker_id], offsets[worker_id + 1]),
                callback=merge_result)
        pool.close()

    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    merge_result(
        Tokenizer.binarize(input_file, vocab, lambda t: ds.add_item(t),
                           offset=0, end=offsets[1]))
    if num_workers > 1:
        pool.join()
        # Append worker shards in order, then delete the temp files.
        for worker_id in range(1, num_workers):
            prefix = "{}{}".format(output_prefix, worker_id)
            temp_file_path = dataset_dest_prefix(args, prefix, lang)
            ds.merge_file_(temp_file_path)
            os.remove(indexed_dataset.data_file_path(temp_file_path))
            os.remove(indexed_dataset.index_file_path(temp_file_path))

    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, n_seq_tok[0], n_seq_tok[1],
        100 * sum(replaced.values()) / n_seq_tok[1], vocab.unk_word))
def make_binary_dataset(input_prefix, output_prefix, lang, guess):
    """Binarize one side of a split; with ``guess`` set, read the ``.guess``
    input and write the ``.guess.bin`` output.

    Fixes: removed a leftover ``print('aaa')`` debug statement; collapsed the
    duplicated guess/non-guess path construction; local ``dict`` renamed to
    avoid shadowing the builtin.
    """
    vocab = dictionary.Dictionary.load(
        os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))

    suffix = '.guess' if guess else ''
    ds = indexed_dataset.IndexedDatasetBuilder(
        '{}/{}.{}-{}.{}{}.bin'.format(args.destdir, output_prefix,
                                      args.source_lang, args.target_lang,
                                      lang, suffix))
    input_file = '{}.{}{}'.format(input_prefix, lang, suffix)

    res = Tokenizer.binarize(input_file, vocab, ds.add_item)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], vocab.unk_word))
    # NOTE(review): the .idx path has no '.guess' variant, so a guess dataset
    # finalizes to the same index file as the regular one — confirm intended.
    ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix,
                                            args.source_lang,
                                            args.target_lang, lang))
def read_data(self, path, dictionary, is_dst=False, src_oov_words_list=None):
    """Read *path* line by line, tokenizing with copy-aware OOV handling.

    For the target side (``is_dst``), each line reuses the OOV words
    collected from the aligned source line at the same index.
    """
    line_no = 0
    with open(path, 'r') as f:
        for line in f:
            self.lines.append(line.strip('\n'))
            src_oov_words = src_oov_words_list[line_no] if is_dst else None
            tokens, tokens_extended, oov_words = Tokenizer.tokenize(
                line, dictionary, add_if_not_exist=False,
                is_dst=is_dst, src_oov_words=src_oov_words)
            # +1 for Lua compatibility
            tokens = tokens + 1
            tokens_extended = tokens_extended + 1
            self.tokens_list.append(tokens)
            self.tokens_list_extended_vocab.append(tokens_extended)
            self.oov_words_list.append(oov_words)
            self.sizes.append(len(tokens))
            self.oov_sizes.append(len(oov_words))
            line_no += 1
            # Progress heartbeat every 10k lines.
            if line_no % 10000 == 0:
                print(line_no)
    self.sizes = np.array(self.sizes)
    self.oov_sizes = np.array(self.oov_sizes)
def read_data(self, path, dictionary):
    """Load every line of *path* as a token id tensor, recording sizes."""
    with open(path, 'r') as f:
        for raw in f:
            self.lines.append(raw.strip('\n'))
            # +1 for Lua compatibility
            ids = Tokenizer.tokenize(raw, dictionary, add_if_not_exist=False) + 1
            self.tokens_list.append(ids)
            self.sizes.append(len(ids))
    self.sizes = np.array(self.sizes)
def __getitem__(self, i):
    """Return the segmented item at index *i*.

    Fixes: the original duplicated the identical ``Tokenizer.segment`` call
    in both branches, differing only in the trailing ``[0]`` indexing; the
    call is now made once.
    """
    self.check_index(i)
    seg = Tokenizer.segment(self.lines[i], self.dictionary, self.ex_dict,
                            self.is_tgt, self.maxspan,
                            append_eos=self.append_eos)
    # Target side keeps the full segmentation result; the source side only
    # uses its first element.
    return seg if self.is_tgt else seg[0]
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    """Binarize bytes [offset, end) of *filename* into its own .bin/.idx shard."""
    builder = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))
    res = Tokenizer.binarize(filename, dict, builder.add_item,
                             offset=offset, end=end)
    builder.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def binarize(filename, dict, fn_without_ext, offset, end):
    """Binarize bytes [offset, end) of *filename* into ``fn_without_ext``.bin/.idx."""
    builder = indexed_dataset.IndexedDatasetBuilder(f"{fn_without_ext}.bin")
    res = Tokenizer.binarize(filename, dict, builder.add_item,
                             offset=offset, end=end)
    builder.finalize(f"{fn_without_ext}.idx")
    return res
def get_id(self, string):
    """Convert *string* to a numpy array of vocabulary indices (no EOS)."""
    token_tensor = Tokenizer.tokenize(string, self.task.dictionary,
                                      add_if_not_exist=False,
                                      append_eos=False).long()
    indexed_string = token_tensor.numpy()
    if self.map_indices is not None:
        # map indices to subset of the vocabulary
        indexed_string = self.convert_ids(indexed_string)
    return indexed_string
def read_data(self, path, dictionary):
    """Read *path* into long token tensors, honoring eos/reverse settings."""
    with open(path, 'r') as f:
        for raw in f:
            self.lines.append(raw.strip('\n'))
            ids = Tokenizer.tokenize(
                raw,
                dictionary,
                add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
            ).long()
            self.tokens_list.append(ids)
            self.sizes.append(len(ids))
    self.sizes = np.array(self.sizes)
def load_dataset(self, split, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""
    prefix = os.path.join(self.args.data, '{}.input-label'.format(split))

    # Read and tokenize the input sentences.
    sentences, lengths = [], []
    with open(prefix + '.input', encoding='utf-8') as file:
        for raw in file:
            toks = Tokenizer.tokenize(
                raw.strip(),
                self.input_vocab,
                add_if_not_exist=False,
            )
            sentences.append(toks)
            lengths.append(toks.numel())

    # Read labels, mapping each to a numeric id in the label vocabulary.
    labels = []
    with open(prefix + '.label', encoding='utf-8') as file:
        for raw in file:
            labels.append(
                torch.LongTensor([self.label_vocab.add_symbol(raw.strip())])
            )

    assert len(sentences) == len(labels)
    print('| {} {} {} examples'.format(self.args.data, split, len(sentences)))

    # Classification is modeled as seq2seq with a length-1 target, so
    # LanguagePairDataset can be reused directly.
    self.datasets[split] = LanguagePairDataset(
        src=sentences,
        src_sizes=lengths,
        src_dict=self.input_vocab,
        tgt=labels,
        tgt_sizes=torch.ones(len(labels)),  # targets have length 1
        tgt_dict=self.label_vocab,
        left_pad_source=False,
        max_source_positions=self.args.max_positions,
        max_target_positions=1,
        # A single class label needs no input feeding; enabling it would
        # pass *prev_output_tokens* (a shifted target) to the model.
        input_feeding=False,
    )
def make_binary_dataset(input_prefix, output_prefix, lang):
    """Binarize one side of one split into .bin/.idx files (single process)."""
    vocab = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))
    builder = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang, 'bin'))
    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    res = Tokenizer.binarize(input_file, vocab, builder.add_item)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], vocab.unk_word))
    builder.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
def __create_sample(self, line):
    """Build a one-sentence sample dict with an EOS prepended to the tokens."""
    tokens = Tokenizer.tokenize(line, self.task.dictionary,
                                tokenize=default_tokenizer,
                                add_if_not_exist=False).long()
    # Prepend EOS: the source is the sentence rotated one token left.
    arr = tokens.numpy()
    arr = np.insert(arr, 0, self.task.dictionary.eos())
    tokens = torch.from_numpy(arr)
    sample = {}
    sample["src_tokens"] = tokens.unsqueeze(0)  # add a batch dimension
    sample["src_lengths"] = tokens.size()[0]
    # A None target disables the efficient softmax approximation.
    sample['target'] = None
    return sample
def make_binary_dataset(input_prefix, output_prefix, lang, src_ids=None):
    """Binarize one side of a split with optional copy-from-source ids;
    returns the ids produced by the tokenizer."""
    vocab = dictionary.Dictionary.load(dict_path(lang))
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab)))
    builder = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_path(output_prefix, lang, 'bin'))
    input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
    res, ids = Tokenizer.binarize(input_file, vocab, builder.add_item,
                                  src_ids=src_ids)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}, {:.3}% replaced by copy'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], vocab.unk_word,
        100 * res['ncopied'] / res['ntok']))
    builder.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
    return ids
def binarize(args, filename, dict, output_prefix, lang, offset, end, append_eos=False):
    """Binarize bytes [offset, end) of *filename* into a .bin/.idx shard,
    optionally appending EOS to each sentence."""
    builder = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, "bin")
    )
    res = Tokenizer.binarize(
        filename, dict, builder.add_item,
        offset=offset, end=end, append_eos=append_eos
    )
    builder.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
    return res
def read_data(self, path, dictionary, debug=False):
    """Read *path* into long token tensors; in debug mode, stop after
    DEBUG_MODE_LINE_COUNT lines."""
    with open(path, 'r') as f:
        for line_no, raw in enumerate(f):
            if debug and line_no >= DEBUG_MODE_LINE_COUNT:
                break
            self.lines.append(raw.strip('\n'))
            ids = Tokenizer.tokenize(
                raw,
                dictionary,
                add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
            ).long()
            self.tokens_list.append(ids)
            self.sizes.append(len(ids))
    print('{} has lines {}, min length {} max length {}'.format(
        path, len(self.sizes), min(self.sizes), max(self.sizes)))
    self.sizes = np.array(self.sizes)
def read_data(self, path, dictionary):
    """Read *path* into token tensors plus word lists, optionally aligned
    with per-line source tokens."""
    with open(path, 'r', encoding='utf-8') as f:
        for raw in f:
            # Index with len(self.lines) BEFORE appending, so the source
            # tokens line up with the line being read.
            src_tokens = (self.src_tokens_list[len(self.lines)]
                          if self.src_tokens_list is not None else None)
            self.lines.append(raw.strip('\n'))
            tokens, words = Tokenizer.tokenize(
                raw,
                dictionary,
                add_if_not_exist=False,
                append_eos=self.append_eos,
                reverse_order=self.reverse_order,
                src_tokens=src_tokens)
            tokens = tokens.long()
            self.tokens_list.append(tokens)
            self.words_list.append(words)
            self.sizes.append(len(tokens))
    self.sizes = np.array(self.sizes)
def make_binary_dataset(input_prefix, output_prefix, lang):
    """Binarize ``input_prefix.lang`` into destdir as .bin/.idx files."""
    vocab = dictionary.Dictionary.load(
        os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
    print('| [{}] Dictionary: {} types'.format(lang, len(vocab) - 1))
    builder = indexed_dataset.IndexedDatasetBuilder(
        '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix,
                                    args.source_lang, args.target_lang, lang)
    )
    input_file = '{}.{}'.format(input_prefix, lang)
    res = Tokenizer.binarize(input_file, vocab, builder.add_item)
    print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
        lang, input_file, res['nseq'], res['ntok'],
        100 * res['nunk'] / res['ntok'], vocab.unk_word))
    builder.finalize('{}/{}.{}-{}.{}.idx'.format(
        args.destdir, output_prefix, args.source_lang, args.target_lang, lang))
def __create_sample_batch(self, line_list):
    """Build a padded batch sample from raw lines, each prefixed with EOS."""
    eos = self.task.dictionary.eos()
    tokens_list = []
    for line in line_list:
        toks = Tokenizer.tokenize(line, self.task.dictionary,
                                  tokenize=default_tokenizer,
                                  add_if_not_exist=False).tolist()
        # EOS marks the start of each sentence.
        tokens_list.append([eos] + toks)
    src_lengths_list = [len(t) for t in tokens_list]
    # NOTE(review): sequences are padded with the EOS id, not a pad id —
    # confirm this is what the consuming model expects.
    token_tensor = torch.nn.utils.rnn.pad_sequence(
        [torch.LongTensor(t) for t in tokens_list],
        batch_first=True,
        padding_value=eos)
    sample = {}
    sample["src_tokens"] = token_tensor
    sample["src_lengths"] = torch.LongTensor(src_lengths_list)
    # A None target disables the efficient softmax approximation.
    sample['target'] = None
    return sample
def binarize(args, filename, dict, output_prefix, lang, offset, end):
    """Binarize bytes [offset, end) of *filename* into a .bin/.idx shard and
    print a small debug summary of the binarization stats.

    Fixes: the original iterated ``res.items`` without calling it
    (``for k, v in res.items:``), which raises a TypeError at runtime; it is
    now ``res.items()``.
    """
    ds = indexed_dataset.IndexedDatasetBuilder(
        dataset_dest_file(args, output_prefix, lang, 'bin'))

    def consumer(tensor):
        ds.add_item(tensor)

    res = Tokenizer.binarize(filename, dict, consumer, offset=offset, end=end)
    # res looks like {'nseq': ..., 'nunk': ..., 'ntok': ..., 'replaced': Counter}
    to_print = ['nseq', 'nunk', 'ntok']
    debug_data = {k: v for k, v in res.items() if k in to_print}
    debug_data['offset'] = offset
    debug_data['end'] = end
    print(debug_data)
    ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))
    return res
def build_dictionary(filenames):
    """Accumulate word counts from *filenames* into a fresh Dictionary."""
    word_dict = dictionary.Dictionary()
    for fname in filenames:
        Tokenizer.add_file_to_dictionary(fname, word_dict, tokenize_line)
    return word_dict
def main(args):
    """Preprocess parallel text: build/save dictionaries, binarize (or copy)
    each split, and optionally derive a word-alignment dictionary from
    --alignfile."""
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    # When --only-source is set, skip all target-side processing.
    target = not args.only_source

    def build_dictionary(filenames):
        # Accumulate token counts from every file into one Dictionary.
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        # e.g. "<trainpref>.en"; no suffix when lang is falsy.
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    # Dictionary construction: a single joined dictionary over both sides,
    # or separate (possibly preloaded) source/target dictionaries.
    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(set([
            train_path(lang)
            for lang in [args.source_lang, args.target_lang]
        ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])
    # NOTE(review): finalize() runs even when a prebuilt --srcdict was
    # loaded — confirm that is intended.
    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        # A joined dictionary is the same object as src_dict and was
        # finalized above.
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        # Binarize one side of one split into .bin/.idx files.
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        # Process train plus every comma-separated valid/test prefix.
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        # Build a source->target translation table by keeping, for each
        # source word, its most frequently aligned target word.
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        # Alignment entries are "srcpos-tgtpos" pairs.
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()
                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
        align_dict = {}
        for srcidx in freq_map.keys():
            # Most frequently aligned target word wins.
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def get_ids(dictionary):
    """Tokenize every line of the enclosing ``txt`` with *dictionary*."""
    return [
        Tokenizer.tokenize(line, dictionary, add_if_not_exist=False)
        for line in txt
    ]
def main(args):
    """Preprocess data using a task-provided dictionary implementation:
    build/load dictionaries, then binarize each split (optionally in
    parallel via --workers)."""
    import_user_module(args)
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    # When --only-source is set, skip all target-side processing.
    target = not args.only_source
    # The task object supplies dictionary construction/loading behavior.
    task = tasks.get_task(args.task)

    def train_path(lang):
        # One path per comma-separated --trainpref entry.
        return [
            "{}{}".format(trainpref, ("." + lang) if lang else "")
            for _, trainpref in enumerate(args.trainpref.split(","))
        ]

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    def build_dictionary(filenames, src=False, tgt=False):
        # Exactly one of src/tgt selects which threshold/nwords limits apply.
        assert src ^ tgt
        return task.build_dictionary(
            filenames,
            workers=args.workers,
            threshold=args.thresholdsrc if src else args.thresholdtgt,
            nwords=args.nwordssrc if src else args.nwordstgt,
            padding_factor=args.padding_factor,
        )

    if args.joined_dictionary:
        assert (
            not args.srcdict or not args.tgtdict
        ), "cannot use both --srcdict and --tgtdict with --joined-dictionary"
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        elif args.tgtdict:
            src_dict = task.load_dictionary(args.tgtdict)
        else:
            assert (args.trainpref
                    ), "--trainpref must be set if --srcdict is not specified"
            # NOTE(review): train_path() returns a *list*, which is
            # unhashable — this set comprehension looks like it would raise
            # TypeError; confirm against the intended inputs.
            src_dict = build_dictionary(
                {
                    train_path(lang)
                    for lang in [args.source_lang, args.target_lang]
                },
                src=True)
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        else:
            assert (args.trainpref
                    ), "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(train_path(args.source_lang), src=True)
        if target:
            if args.tgtdict:
                tgt_dict = task.load_dictionary(args.tgtdict)
            else:
                assert (
                    args.trainpref
                ), "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary(train_path(args.target_lang), tgt=True)
        else:
            tgt_dict = None

    src_dict.save(dict_path(args.source_lang))
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        # Binarize one side of one split; chunks 1..N-1 run in a Pool while
        # chunk 0 is processed inline.
        dict = task.load_dictionary(dict_path(lang))
        print("| [{}] Dictionary: {} types".format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            # Fold per-worker statistics into the shared totals.
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        dict,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        merge_result(
            Binarizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            # Append worker shards in order, then delete the temp files.
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))
        print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            dict.unk_word,
        ))

    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
        if args.output_format == "binary":
            make_binary_dataset(input_prefix, output_prefix, lang, num_workers)
        elif args.output_format == "raw":
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                ".{}-{}".format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        # Only the train splits use the parallel workers.
        if args.trainpref:
            for k, trainpref in enumerate(args.trainpref.split(",")):
                outprefix = "train{}".format(k) if k > 0 else "train"
                make_dataset(trainpref, outprefix, lang, num_workers=args.workers)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)
    print("| Wrote preprocessed data to {}".format(args.destdir))

    if args.alignfile:
        raise NotImplementedError('args.alignfile is not implemented')
        # NOTE: everything below is unreachable because of the raise above;
        # it is kept verbatim from the original.
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        freq_map = {}
        with open(args.alignfile, "r", encoding='utf-8') as align_file:
            with open(src_file_name, "r", encoding='utf-8') as src_file:
                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split("-")), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk(
                            ) and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()
                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)
        with open(os.path.join(
                args.destdir,
                "alignment.{}-{}.txt".format(args.source_lang,
                                             args.target_lang),
        ), "w", encoding='utf-8') as f:
            for k, v in align_dict.items():
                print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
def main():
    """CLI entry point: parse arguments, build/save dictionaries, binarize
    (or copy raw) the train/valid/test splits, and optionally emit an
    alignment dictionary."""
    parser = argparse.ArgumentParser(
        description='Data pre-processing: Create dictionary and store data in binary format')
    parser.add_argument('-s', '--source-lang', default=None, metavar='SRC',
                        help='source language')
    parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET',
                        help='target language')
    # NOTE(review): the help text for --trainpref looks copy-pasted from the
    # language options; the value is actually a train-file prefix.
    parser.add_argument('--trainpref', metavar='FP', default='train',
                        help='target language')
    parser.add_argument('--validpref', metavar='FP', default='valid',
                        help='comma separated, valid language prefixes')
    parser.add_argument('--testpref', metavar='FP', default='test',
                        help='comma separated, test language prefixes')
    parser.add_argument('--destdir', metavar='DIR', default='data-bin',
                        help='destination dir')
    parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int,
                        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int,
                        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--tgtdict', metavar='FP',
                        help='reuse given target dictionary')
    parser.add_argument('--srcdict', metavar='FP',
                        help='reuse given source dictionary')
    parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int,
                        help='number of target words to retain')
    parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int,
                        help='number of source words to retain')
    parser.add_argument('--alignfile', metavar='ALIGN', default=None,
                        help='an alignment file (optional)')
    parser.add_argument('--output-format', metavar='FORMAT', default='binary',
                        choices=['binary', 'raw'],
                        help='output format (optional)')
    args = parser.parse_args()
    print(args)
    os.makedirs(args.destdir, exist_ok=True)

    # Source dictionary: reuse if given, otherwise build from training data.
    if args.srcdict:
        src_dict = dictionary.Dictionary.load(args.srcdict)
    else:
        src_dict = Tokenizer.build_dictionary(
            filename='{}.{}'.format(args.trainpref, args.source_lang))
    src_dict.save(
        os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
        threshold=args.thresholdsrc, nwords=args.nwordssrc)

    # Target dictionary, symmetric to the source side.
    if args.tgtdict:
        tgt_dict = dictionary.Dictionary.load(args.tgtdict)
    else:
        tgt_dict = Tokenizer.build_dictionary(
            filename='{}.{}'.format(args.trainpref, args.target_lang))
    tgt_dict.save(
        os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
        threshold=args.thresholdtgt, nwords=args.nwordstgt)

    def make_binary_dataset(input_prefix, output_prefix, lang):
        # Binarize one side of one split into .bin/.idx under destdir.
        dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix,
                                        args.source_lang, args.target_lang,
                                        lang)
        )

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(
            args.destdir, output_prefix, args.source_lang, args.target_lang,
            lang))

    def make_dataset(input_prefix, output_prefix, lang, output_format='binary'):
        if output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = os.path.join(
                args.destdir, '{}.{}'.format(output_prefix, lang))
            shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file)

    # Process both sides of train plus every comma-separated valid/test prefix.
    make_dataset(args.trainpref, 'train', args.source_lang, args.output_format)
    make_dataset(args.trainpref, 'train', args.target_lang, args.output_format)
    for k, validpref in enumerate(args.validpref.split(',')):
        outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
        make_dataset(validpref, outprefix, args.source_lang, args.output_format)
        make_dataset(validpref, outprefix, args.target_lang, args.output_format)
    for k, testpref in enumerate(args.testpref.split(',')):
        outprefix = 'test{}'.format(k) if k > 0 else 'test'
        make_dataset(testpref, outprefix, args.source_lang, args.output_format)
        make_dataset(testpref, outprefix, args.target_lang, args.output_format)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        # Build a source->target translation table by keeping, for each
        # source word, its most frequently aligned target word.
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        # Alignment entries are "srcpos-tgtpos" pairs.
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()
                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def build_dictionary(filenames):
    """Build a single Dictionary by accumulating token counts from each
    file in *filenames*, using the module-level tokenizer and the worker
    count from the global ``args``."""
    merged = Dictionary()
    for path in filenames:
        Tokenizer.add_file_to_dictionary(path, merged, tokenize_line, args.workers)
    return merged
def main(args):
    """Preprocess parallel text: build or load source/target dictionaries,
    binarize the train/valid/test splits (the train split optionally with
    multiple worker processes), and optionally learn a word-alignment
    mapping from --alignfile.

    Args:
        args: parsed command-line namespace (destdir, source_lang,
            target_lang, trainpref/validpref/testpref, workers, ...).
    """
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    # --only-source suppresses all target-side processing below.
    target = not args.only_source

    def build_dictionary(filenames):
        # Accumulate token counts from every listed file into one Dictionary.
        d = Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.workers)
        return d

    def train_path(lang):
        # e.g. trainpref='data/train', lang='en' -> 'data/train.en'
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        # Dictionaries are stored as <destdir>/dict.<lang>.txt
        return dest_path('dict', lang) + '.txt'

    if args.joined_dictionary:
        # One dictionary shared by both languages; cannot also supply
        # pre-built per-language dictionaries.
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        # set() de-duplicates in case both languages map to the same file.
        src_dict = build_dictionary(
            set([
                train_path(lang)
                for lang in [args.source_lang, args.target_lang]
            ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    # Apply frequency threshold / vocabulary-size cap and pad the final
    # vocabulary size to a multiple of padding_factor.
    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            # A joined dictionary was already finalized above (tgt is src).
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        # NOTE(review): local name `dict` shadows the builtin.
        dict = Dictionary.load(dict_path(lang))
        # len - 1 presumably excludes one special symbol — TODO confirm
        # against Dictionary internals.
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        # Running totals [num sequences, num tokens]; a mutable list so the
        # nested callback below can update it in place.
        n_seq_tok = [0, 0]
        replaced = Counter()  # counts of tokens replaced by <unk>

        def merge_result(worker_result):
            # Fold one worker's binarization stats into the shared totals.
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        # Byte offsets that split the input into num_workers chunks.
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            # Workers 1..num_workers-1 each binarize one chunk into a
            # temporary per-worker file; chunk 0 is done in-process below.
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize, (args, input_file, dict, prefix, lang,
                                            offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        # Binarize the first chunk in this process.
        merge_result(
            Tokenizer.binarize(input_file, dict, lambda t: ds.add_item(t),
                               offset=0, end=offsets[1]))
        if num_workers > 1:
            pool.join()
            # Concatenate worker outputs in worker order, then delete the
            # temporary data/index files.
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))

    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang, num_workers)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        # Only the train split is binarized with multiple workers.
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang, num_workers=args.workers)
        if args.validpref:
            # Comma-separated prefixes become valid, valid1, valid2, ...
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        # Learn a source-token -> most-frequently-aligned-target-token map
        # from "i-j" alignment pairs over the training data.
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = Dictionary.load(dict_path(args.source_lang))
        tgt_dict = Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        # Each alignment entry has the form "srcpos-tgtpos".
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            # Skip pairs where either side is <unk>.
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()
                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
        # Keep, per source token, the target token it aligns to most often.
        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)),
                'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def main(args):
    """Preprocess parallel text (single-process variant): build or load
    source/target dictionaries, binarize the train/valid/test splits, and
    optionally learn a word-alignment mapping from --alignfile.

    Args:
        args: parsed command-line namespace (destdir, source_lang,
            target_lang, trainpref/validpref/testpref, ...).
    """
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    # --only-source suppresses all target-side processing below.
    target = not args.only_source

    if args.joined_dictionary:
        # One dictionary shared by both languages; cannot also supply
        # pre-built per-language dictionaries.
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = dictionary.Dictionary()
        for lang in [args.source_lang, args.target_lang]:
            Tokenizer.add_file_to_dictionary(
                filename='{}.{}'.format(args.trainpref, lang),
                dict=src_dict,
                tokenize=tokenize_line,
            )
        src_dict.finalize()
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))

    # Dictionaries are written as <destdir>/dict.<lang>.txt; in this variant
    # the threshold / size cap are passed to save() rather than finalize().
    src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
                  threshold=args.thresholdsrc, nwords=args.nwordssrc)
    if target:
        tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
                      threshold=args.thresholdtgt, nwords=args.nwordstgt)

    def make_binary_dataset(input_prefix, output_prefix, lang):
        # NOTE(review): local name `dict` shadows the builtin.
        dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        # len - 1 presumably excludes one special symbol — TODO confirm
        # against Dictionary internals.
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang,
                                        args.target_lang, lang)
        )

        def consumer(tensor):
            # Append each binarized sentence to the dataset being built.
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(
            args.destdir, output_prefix, args.source_lang, args.target_lang, lang))

    def make_dataset(input_prefix, output_prefix, lang, output_format='binary'):
        if output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = os.path.join(args.destdir, '{}.{}'.format(output_prefix, lang))
            shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file)

    def make_all(args, make_dataset, lang):
        # Processes train plus each comma-separated valid/test prefix
        # (valid, valid1, valid2, ... / test, test1, ...).
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang, args.output_format)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang, args.output_format)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang, args.output_format)

    make_all(args, make_dataset, args.source_lang)
    if target:
        make_all(args, make_dataset, args.target_lang)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        # Learn a source-token -> most-frequently-aligned-target-token map
        # from "i-j" alignment pairs over the training data.
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        # Each alignment entry has the form "srcpos-tgtpos".
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            # Skip pairs where either side is <unk>.
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()
                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
        # Keep, per source token, the target token it aligns to most often.
        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
def get_offsets(input_file, num_workers):
    """Return byte offsets that split *input_file* into *num_workers* chunks
    (thin wrapper around Tokenizer.find_offsets)."""
    return Tokenizer.find_offsets(input_file, num_workers)
def main(args):
    """Preprocess parallel text (single-process variant with path helpers):
    build or load source/target dictionaries, binarize the train/valid/test
    splits, and optionally learn a word-alignment mapping from --alignfile.

    Args:
        args: parsed command-line namespace (destdir, source_lang,
            target_lang, trainpref/validpref/testpref, ...).
    """
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    # --only-source suppresses all target-side processing below.
    target = not args.only_source

    def build_dictionary(filenames):
        # Accumulate token counts from every listed file into one Dictionary.
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        # e.g. trainpref='data/train', lang='en' -> 'data/train.en'
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        # Dictionaries are stored as <destdir>/dict.<lang>.txt
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        # e.g. <destdir>/train.en-de.en.bin; the language part is omitted
        # entirely when lang is None.
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        # One dictionary shared by both languages; cannot also supply
        # pre-built per-language dictionaries.
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        # set() de-duplicates in case both languages map to the same file.
        src_dict = build_dictionary(
            set([
                train_path(lang)
                for lang in [args.source_lang, args.target_lang]
            ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    # Apply frequency threshold / vocabulary-size cap and pad the final
    # vocabulary size to a multiple of padding_factor.
    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            # A joined dictionary was already finalized above (tgt is src).
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        # NOTE(review): local name `dict` shadows the builtin.
        dict = dictionary.Dictionary.load(dict_path(lang))
        # len - 1 presumably excludes one special symbol — TODO confirm
        # against Dictionary internals.
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            # Append each binarized sentence to the dataset being built.
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        # Processes train plus each comma-separated valid/test prefix
        # (valid, valid1, valid2, ... / test, test1, ...).
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        # Learn a source-token -> most-frequently-aligned-target-token map
        # from "i-j" alignment pairs over the training data.
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        # Each alignment entry has the form "srcpos-tgtpos".
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            # Skip pairs where either side is <unk>.
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()
                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1
        # Keep, per source token, the target token it aligns to most often.
        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)
        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)),
                'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)