Example #1
    def test_finalize(self):
        txt = [
            'A B C D',
            'B C D',
            'C D',
            'D',
        ]
        ref_ids1 = list(
            map(torch.IntTensor, [
                [4, 5, 6, 7, 2],
                [5, 6, 7, 2],
                [6, 7, 2],
                [7, 2],
            ]))
        ref_ids2 = list(
            map(torch.IntTensor, [
                [7, 6, 5, 4, 2],
                [6, 5, 4, 2],
                [5, 4, 2],
                [4, 2],
            ]))

        # build dictionary
        d = Dictionary()
        for line in txt:
            Tokenizer.tokenize(line, d, add_if_not_exist=True)

        def get_ids(dictionary):
            ids = []
            for line in txt:
                ids.append(
                    Tokenizer.tokenize(line,
                                       dictionary,
                                       add_if_not_exist=False))
            return ids

        def assertMatch(ids, ref_ids):
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)
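
The expected ids flip between ref_ids1 and ref_ids2 because Dictionary.finalize() re-sorts the vocabulary by frequency. A minimal sketch of that effect (import paths are an assumption for a legacy fairseq checkout; the test above gets Dictionary and Tokenizer from its own imports):

from fairseq.data import Dictionary
from fairseq.tokenizer import Tokenizer

d = Dictionary()                       # indices 0-3 hold the special symbols, so real words start at 4
for line in ['A B C D', 'B C D', 'C D', 'D']:
    Tokenizer.tokenize(line, d, add_if_not_exist=True)
print(d.index('A'), d.index('D'))      # 4 7 -> ids follow insertion order
d.finalize()                           # re-sorts symbols by descending count
print(d.index('A'), d.index('D'))      # 7 4 -> 'D' occurs most often, so it now gets the lowest id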
Example #2
    def test_finalize(self):
        txt = [
            'A B C D',
            'B C D',
            'C D',
            'D',
        ]
        ref_ids1 = list(map(torch.IntTensor, [
            [4, 5, 6, 7, 2],
            [5, 6, 7, 2],
            [6, 7, 2],
            [7, 2],
        ]))
        ref_ids2 = list(map(torch.IntTensor, [
            [7, 6, 5, 4, 2],
            [6, 5, 4, 2],
            [5, 4, 2],
            [4, 2],
        ]))

        # build dictionary
        d = Dictionary()
        for line in txt:
            Tokenizer.tokenize(line, d, add_if_not_exist=True)

        def get_ids(dictionary):
            ids = []
            for line in txt:
                ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
            return ids

        def assertMatch(ids, ref_ids):
            for toks, ref_toks in zip(ids, ref_ids):
                self.assertEqual(toks.size(), ref_toks.size())
                self.assertEqual(0, (toks != ref_toks).sum().item())

        ids = get_ids(d)
        assertMatch(ids, ref_ids1)

        # check finalized dictionary
        d.finalize()
        finalized_ids = get_ids(d)
        assertMatch(finalized_ids, ref_ids2)

        # write to disk and reload
        with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
            d.save(tmp_dict.name)
            d = Dictionary.load(tmp_dict.name)
            reload_ids = get_ids(d)
            assertMatch(reload_ids, ref_ids2)
            assertMatch(finalized_ids, reload_ids)
Example #3
    def load_dataset2(self, split, **kwargs):
        data_path = self.args.data[0]

        prefix = os.path.join(
            data_path,
            split + "." + self.args.source_lang + "-" + self.args.target_lang)
        src_sentences, src_lengths = [], []
        with open(prefix + "." + self.args.source_lang,
                  encoding='utf-8') as file:
            for line in file:
                sentence = line.strip()
                tokens = Tokenizer.tokenize(sentence,
                                            self.src_dict,
                                            add_if_not_exist=False)

                src_sentences.append(tokens)
                src_lengths.append(tokens.numel())
        tgt_sentences, tgt_lengths = [], []

        with open(prefix + "." + self.args.target_lang,
                  encoding='utf-8') as file:
            for line in file:
                sentence = line.strip()
                tokens = Tokenizer.tokenize(sentence,
                                            self.tgt_dict,
                                            add_if_not_exist=False)
                tgt_sentences.append(tokens)
                tgt_lengths.append(tokens.numel())

        hters = []
        with open(prefix + "." + self.args.hter_lang,
                  encoding='utf-8') as file:
            for line in file:
                hter = line.strip()
                hters.append(torch.FloatTensor([float(hter)]))

        self.datasets[split] = LanguagePairHTERDataset(
            src_sentences,
            src_lengths,
            self.src_dict,
            tgt_sentences,
            tgt_lengths,
            self.tgt_dict,
            left_pad_source=self.args.left_pad_source,
            left_pad_target=self.args.left_pad_target,
            max_source_positions=self.args.max_source_positions,
            max_target_positions=self.args.max_target_positions,
            hter=hters,
            hter_sizes=torch.ones(len(hters)))
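
load_dataset2 expects three aligned plain-text files per split: source sentences, target sentences, and one HTER score per line. A hedged sketch of that layout (file names follow the prefix logic above; the directory name and language suffixes are assumptions):

import os

data_dir = 'toy-hter-data'
os.makedirs(data_dir, exist_ok=True)
prefix = os.path.join(data_dir, 'train.src-tgt')    # split + '.' + source_lang + '-' + target_lang
with open(prefix + '.src', 'w', encoding='utf-8') as f:
    f.write('a b c\nd e\n')
with open(prefix + '.tgt', 'w', encoding='utf-8') as f:
    f.write('x y z\nw v\n')
with open(prefix + '.hter', 'w', encoding='utf-8') as f:
    f.write('0.25\n0.0\n')                          # one HTER score per line, parsed as float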
Example #4
    def read_data(self, path, dictionary, is_dst=False, src_oov_words_list=None):
        count = 0
        with open(path, 'r') as f:
            for line in f:
                self.lines.append(line.strip('\n'))

                src_oov_words = None
                if is_dst:
                    src_oov_words = src_oov_words_list[count]

                tokens, tokens_extended, oov_words = Tokenizer.tokenize(
                    line, dictionary, add_if_not_exist=False,
                    is_dst=is_dst, src_oov_words=src_oov_words)

                # +1 for Lua compatibility
                tokens = tokens + 1
                tokens_extended = tokens_extended + 1

                self.tokens_list.append(tokens)
                self.tokens_list_extended_vocab.append(tokens_extended)
                self.oov_words_list.append(oov_words)

                self.sizes.append(len(tokens))
                self.oov_sizes.append(len(oov_words))

                count += 1
                if count % 10000 == 0:
                    print(count)

        self.sizes = np.array(self.sizes)
        self.oov_sizes = np.array(self.oov_sizes)
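
The "+1 for Lua compatibility" offset simply shifts every index to 1-based numbering; a one-line check with plain tensors:

import torch

tokens = torch.IntTensor([4, 5, 6, 2])   # 0-based ids as produced by Tokenizer.tokenize
print((tokens + 1).tolist())             # [5, 6, 7, 3] -> 1-based ids for Lua compatibility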
Example #5
 def get_ids(dictionary):
     ids = []
     for line in txt:
         ids.append(
             Tokenizer.tokenize(line,
                                dictionary,
                                add_if_not_exist=False))
     return ids
Example #6
 def read_data(self, path, dictionary):
     with open(path, 'r') as f:
         for line in f:
             self.lines.append(line.strip('\n'))
             # +1 for Lua compatibility
             tokens = Tokenizer.tokenize(line, dictionary, add_if_not_exist=False) + 1
             self.tokens_list.append(tokens)
             self.sizes.append(len(tokens))
     self.sizes = np.array(self.sizes)
Example #8
 def get_id(self, string):
     tokens = Tokenizer.tokenize(string,
                                 self.task.dictionary,
                                 add_if_not_exist=False,
                                 append_eos=False).long()
     indexed_string = tokens.numpy()
     if self.map_indices is not None:
         # map indices to subset of the vocabulary
         indexed_string = self.convert_ids(indexed_string)
     return indexed_string
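
The append_eos=False flag is what lets get_id return the raw word ids without a trailing sentence marker. A hedged sketch, separate from the class above (imports assume a legacy fairseq checkout; the printed ids are illustrative):

from fairseq.data import Dictionary
from fairseq.tokenizer import Tokenizer

d = Dictionary()
Tokenizer.tokenize('hello world', d, add_if_not_exist=True)
with_eos = Tokenizer.tokenize('hello world', d, add_if_not_exist=False)
without_eos = Tokenizer.tokenize('hello world', d, add_if_not_exist=False, append_eos=False)
print(with_eos.tolist())      # e.g. [4, 5, 2] - ends with the eos index
print(without_eos.tolist())   # e.g. [4, 5]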
Example #9
 def read_data(self, path, dictionary):
     with open(path, 'r') as f:
         for line in f:
             self.lines.append(line.strip('\n'))
             tokens = Tokenizer.tokenize(
                 line, dictionary, add_if_not_exist=False,
                 append_eos=self.append_eos, reverse_order=self.reverse_order,
             ).long()
             self.tokens_list.append(tokens)
             self.sizes.append(len(tokens))
     self.sizes = np.array(self.sizes)
Example #11
    def load_dataset(self, split, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""

        prefix = os.path.join(self.args.data, '{}.input-label'.format(split))

        # Read input sentences.
        sentences, lengths = [], []
        with open(prefix + '.input', encoding='utf-8') as file:
            for line in file:
                sentence = line.strip()

                # Tokenize the sentence, splitting on spaces
                tokens = Tokenizer.tokenize(
                    sentence, self.input_vocab, add_if_not_exist=False,
                )

                sentences.append(tokens)
                lengths.append(tokens.numel())

        # Read labels.
        labels = []
        with open(prefix + '.label', encoding='utf-8') as file:
            for line in file:
                label = line.strip()
                labels.append(
                    # Convert label to a numeric ID.
                    torch.LongTensor([self.label_vocab.add_symbol(label)])
                )

        assert len(sentences) == len(labels)
        print('| {} {} {} examples'.format(self.args.data, split, len(sentences)))

        # We reuse LanguagePairDataset since classification can be modeled as a
        # sequence-to-sequence task where the target sequence has length 1.
        self.datasets[split] = LanguagePairDataset(
            src=sentences,
            src_sizes=lengths,
            src_dict=self.input_vocab,
            tgt=labels,
            tgt_sizes=torch.ones(len(labels)),  # targets have length 1
            tgt_dict=self.label_vocab,
            left_pad_source=False,
            max_source_positions=self.args.max_positions,
            max_target_positions=1,
            # Since our target is a single class label, there's no need for
            # input feeding. If we set this to ``True`` then our Model's
            # ``forward()`` method would receive an additional argument called
            # *prev_output_tokens* that would contain a shifted version of the
            # target sequence.
            input_feeding=False,
        )
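
The prefix logic above expects two aligned plain-text files per split, one sentence and one label per line. A hedged sketch that writes a toy split in that layout (the directory name stands in for args.data and is an assumption):

import os

data_dir = 'toy-classification-data'
os.makedirs(data_dir, exist_ok=True)
with open(os.path.join(data_dir, 'train.input-label.input'), 'w', encoding='utf-8') as f:
    f.write('the movie was great\nterrible plot\n')
with open(os.path.join(data_dir, 'train.input-label.label'), 'w', encoding='utf-8') as f:
    f.write('positive\nnegative\n')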
Example #12
 def __create_sample(self, line):
     tokens = Tokenizer.tokenize(line,
                                 self.task.dictionary,
                                 tokenize=default_tokenizer,
                                 add_if_not_exist=False).long()
     sample = {}
     # target is the sentence, for source, rotate item one token to the left (would start with eos)
     tokens_list = tokens.numpy()
     tokens_list = np.insert(tokens_list, 0, self.task.dictionary.eos())
     tokens = torch.from_numpy(tokens_list)
     sample["src_tokens"] = tokens.unsqueeze(
         0)  # add a dimension for the batch
     sample["src_lengths"] = tokens.size()[0]
     sample[
         'target'] = None  # this will disable the efficient softmax approximation
     return sample
Example #13
 def read_data(self, path, dictionary):
     with open(path, 'r', encoding='utf-8') as f:
         for line in f:
             src_tokens = None
             if self.src_tokens_list is not None:
                 src_tokens = self.src_tokens_list[len(self.lines)]
             self.lines.append(line.strip('\n'))
             tokens, words = Tokenizer.tokenize(
                 line,
                 dictionary,
                 add_if_not_exist=False,
                 append_eos=self.append_eos,
                 reverse_order=self.reverse_order,
                 src_tokens=src_tokens)
             tokens = tokens.long()
             self.tokens_list.append(tokens)
             self.words_list.append(words)
             self.sizes.append(len(tokens))
     self.sizes = np.array(self.sizes)
Example #14
    def read_data(self, path, dictionary, debug=False):
        with open(path, 'r') as f:
            for i, line in enumerate(f):
                if debug and i >= DEBUG_MODE_LINE_COUNT:
                    break
                self.lines.append(line.strip('\n'))
                tokens = Tokenizer.tokenize(
                    line,
                    dictionary,
                    add_if_not_exist=False,
                    append_eos=self.append_eos,
                    reverse_order=self.reverse_order,
                ).long()
                self.tokens_list.append(tokens)
                self.sizes.append(len(tokens))

        print('{}: {} lines, min length {}, max length {}'.format(
            path, len(self.sizes), min(self.sizes), max(self.sizes)))
        self.sizes = np.array(self.sizes)
Example #15
    def __create_sample_batch(self, line_list):
        tokens_list = []
        for line in line_list:
            tokens = Tokenizer.tokenize(line,
                                        self.task.dictionary,
                                        tokenize=default_tokenizer,
                                        add_if_not_exist=False).tolist()
            tokens.insert(0, self.task.dictionary.eos())  # insert EOS at the beginning of the sentence
            tokens_list.append(tokens)

        src_lengths_list = [len(tokens) for tokens in tokens_list]
        token_tensor = torch.nn.utils.rnn.pad_sequence(
            [torch.LongTensor(t) for t in tokens_list],
            batch_first=True,
            padding_value=self.task.dictionary.eos())
        sample = {}
        sample["src_tokens"] = token_tensor
        sample["src_lengths"] = torch.LongTensor(src_lengths_list)
        sample['target'] = None  # this will disable the efficient softmax approximation
        return sample
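
A small illustration of the padding step above: pad_sequence with batch_first=True right-pads shorter sequences with the given value, here the eos id as in the code (the concrete ids below are made up):

import torch

eos = 2
seqs = [torch.LongTensor([2, 4, 5, 6]), torch.LongTensor([2, 7])]
batch = torch.nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=eos)
print(batch.tolist())   # [[2, 4, 5, 6], [2, 7, 2, 2]]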
Example #16
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        if args.singleSeq:
            d = dictionary.Dictionary()
        else:
            d = dictionaryWCS.DictionaryWCS()
        for filename in filenames:
            tokenizer.Tokenizer.add_file_to_dictionary(filename, d, tokenize_line, args.L)

        return d

    def load_dictionary(filename):
        if args.singleSeq:
            d = dictionary.Dictionary.load(filename)
        else:
            d = dictionaryWCS.DictionaryWCS.load(filename)

        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(set([
            train_path(lang)
            for lang in [args.source_lang, args.target_lang]
        ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = load_dictionary(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = load_dictionary(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    if args.gcn:
        assert args.trainpref, "--trainpref must be set"
        labels_dict = build_dictionary([train_path(args.labels)])
        node1_dict = build_dictionary([train_path(args.node1)])
        node2_dict = build_dictionary([train_path(args.node2)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))

    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    if args.gcn:
        labels_dict.finalize(
            threshold=args.thresholdsrc,
            padding_factor=args.padding_factor,
        )
        labels_dict.save(dict_path(args.labels))

        node1_dict.finalize(
            threshold=args.thresholdsrc,
            padding_factor=args.padding_factor,
        )
        node1_dict.save(dict_path(args.node1))

        node2_dict.finalize(
            threshold=args.thresholdsrc,
            padding_factor=args.padding_factor,
        )
        node2_dict.save(dict_path(args.node2))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        #debugging, do only targets
        #if 'src' in lang:
        #if 'tgt' in lang:
        #    print("skip src files...")
        #    #print("skip tgt files...")
        #    return

        dict = load_dictionary(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        annotator = None
        aconsumer = None
        awconsumer = None  # defined up front so the paragraph branch below does not hit a NameError when annotations are disabled
        if args.addAnnotations:
            brigramFile = None
            if os.path.isfile(args.addAnnotations + "_bigrams"):
                brigramFile = args.addAnnotations + "_bigrams"
            if args.addAnnotations.endswith(".mallet"):
                annotator = MalletLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile)
            else:
                annotator = GensimLDATopicDistAnnotator(args.addAnnotations, args.numTopics, brigramFile)
            #generate embeddings for topic keywords
            annotator.generateKeywordEmbeddings(dataset_dest_path(output_prefix+"_keyEmbeddings", None, 'txt'))
            annotator.generateKeywordDict(dataset_dest_path(output_prefix+"_keyVocab", None, 'txt'))
            ads = indexed_dataset.IndexedDatasetBuilder(
                dataset_dest_path(output_prefix+"_keys", lang, 'bin'),
                dtype=np.double)
            def aconsumer(tensor):
                ads.add_item(tensor)
            awds = indexed_dataset.IndexedDatasetBuilder(
                dataset_dest_path(output_prefix+"_keywords", lang, 'bin'))
            def awconsumer(tensor):
                awds.add_item(tensor)


        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')

        if args.singleSeq:
            if '.src' in input_file:
                res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, L=args.L,
                                                          aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
            elif '.tgt' in input_file:
                res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer,
                                                          aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
            else:
                res = TokenizerWCSSingleSequence.binarize(input_file, dict, consumer, append_eos=False,
                                                          aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSingleSequence.printStats(res, lang, input_file, dict)
        else:
            if '.src' in input_file:
                res = TokenizerWCSParagraph.binarize(input_file, dict, consumer,
                                                     max_chunk_length=args.src_chunk_length, L=args.L,
                                                     aconsumer=aconsumer, awconsumer=awconsumer, annotator=annotator)
                TokenizerWCSParagraph.printStats(res, lang, input_file, dict)
            else:
                res = TokenizerWCSSentence.binarize(input_file, dict, consumer, max_chunk_length=args.tgt_chunk_length,
                                                    aconsumer=aconsumer, annotator=annotator)
                TokenizerWCSSentence.printStats(res, lang, input_file, dict)

        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))
        if args.addAnnotations:
            ads.finalize(dataset_dest_path(output_prefix+"_keys", lang, 'idx'))
            awds.finalize(dataset_dest_path(output_prefix+"_keywords", lang, 'idx'))


    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)
    if args.gcn:
        make_all(args.labels)
        make_all(args.node1)
        make_all(args.node2)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = load_dictionary(dict_path(args.source_lang))
        tgt_dict = load_dictionary(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
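
The freq_map/align_dict block at the end keeps, for every source id, the target id it is aligned to most often. A tiny self-contained illustration of that reduction (toy ids, not real vocabulary entries):

freq_map = {
    10: {20: 3, 21: 1},   # source id 10 aligned to target id 20 three times, to 21 once
    11: {22: 2},
}
align_dict = {src: max(counts, key=counts.get) for src, counts in freq_map.items()}
print(align_dict)         # {10: 20, 11: 22}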
Example #17
 def get_ids(dictionary):
     ids = []
     for line in txt:
         ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
     return ids
Example #18
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(set([
            train_path(lang)
            for lang in [args.source_lang, args.target_lang]
        ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix, ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix + '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
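
The summary line in make_binary_dataset reports what share of tokens was replaced by the unknown symbol. A toy version of that arithmetic ('<unk>' stands in for dict.unk_word, and the counts are made up):

res = {'nseq': 2, 'ntok': 50, 'nunk': 3}   # toy counts in the shape returned by Tokenizer.binarize
print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
    'src', 'train.src', res['nseq'], res['ntok'],
    100 * res['nunk'] / res['ntok'], '<unk>'))
# | [src] train.src: 2 sents, 50 tokens, 6.0% replaced by <unk>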
Example #19
def main(args):
    import_user_module(args)

    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    task = tasks.get_task(args.task)

    def train_path(lang):
        return [
            "{}{}".format(trainpref, ("." + lang) if lang else "")
            for _, trainpref in enumerate(args.trainpref.split(","))
        ]

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    def build_dictionary(filenames, src=False, tgt=False):
        assert src ^ tgt
        return task.build_dictionary(
            filenames,
            workers=args.workers,
            threshold=args.thresholdsrc if src else args.thresholdtgt,
            nwords=args.nwordssrc if src else args.nwordstgt,
            padding_factor=args.padding_factor,
        )

    if args.joined_dictionary:
        assert (
            not args.srcdict or not args.tgtdict
        ), "cannot use both --srcdict and --tgtdict with --joined-dictionary"

        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        elif args.tgtdict:
            src_dict = task.load_dictionary(args.tgtdict)
        else:
            assert (args.trainpref
                    ), "--trainpref must be set if --srcdict is not specified"
            # train_path() returns a list of paths per language here, so flatten it into the set
            src_dict = build_dictionary(
                {
                    path
                    for lang in [args.source_lang, args.target_lang]
                    for path in train_path(lang)
                },
                src=True)
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = task.load_dictionary(args.srcdict)
        else:
            assert (args.trainpref
                    ), "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary(train_path(args.source_lang), src=True)

        if target:
            if args.tgtdict:
                tgt_dict = task.load_dictionary(args.tgtdict)
            else:
                assert (
                    args.trainpref
                ), "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary(train_path(args.target_lang),
                                            tgt=True)
        else:
            tgt_dict = None

    src_dict.save(dict_path(args.source_lang))
    if target and tgt_dict is not None:
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = task.load_dictionary(dict_path(lang))
        print("| [{}] Dictionary: {} types".format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Binarizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        dict,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        merge_result(
            Binarizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            dict.unk_word,
        ))

    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
        if args.output_format == "binary":
            make_binary_dataset(input_prefix, output_prefix, lang, num_workers)
        elif args.output_format == "raw":
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                ".{}-{}".format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            for k, trainpref in enumerate(args.trainpref.split(",")):
                outprefix = "train{}".format(k) if k > 0 else "train"
                make_dataset(trainpref,
                             outprefix,
                             lang,
                             num_workers=args.workers)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print("| Wrote preprocessed data to {}".format(args.destdir))

    if args.alignfile:
        raise NotImplementedError('args.alignfile is not implemented')
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        freq_map = {}
        with open(args.alignfile, "r", encoding='utf-8') as align_file:
            with open(src_file_name, "r", encoding='utf-8') as src_file:
                with open(tgt_file_name, "r", encoding='utf-8') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split("-")),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk(
                            ) and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(os.path.join(
                args.destdir,
                "alignment.{}-{}.txt".format(args.source_lang,
                                             args.target_lang),
        ),
                  "w",
                  encoding='utf-8') as f:
            for k, v in align_dict.items():
                print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)
Example #20
def main(parsed_args):
    assert parsed_args.path is not None, '--path required for evaluation!'
    assert parsed_args.input_ordered_file != "", '--input_ordered_file required for evaluation!'
    assert parsed_args.input_shuffled_file != "", '--input_shuffled_file required for evaluation!'

    print(parsed_args)

    use_cuda = torch.cuda.is_available() and not parsed_args.cpu

    task = tasks.setup_task(parsed_args)

    # Load ensemble
    print('| loading model(s) from {}'.format(parsed_args.path))
    models, args = utils.load_ensemble_for_inference(
        parsed_args.path.split(':'), task,
        model_arg_overrides=eval(parsed_args.model_overrides))

    for arg in vars(parsed_args).keys():
        if arg not in {'self_target', 'future_target', 'past_target', 'tokens_per_sample', 'output_size_dictionary'}:
            setattr(args, arg, getattr(parsed_args, arg))
    task = tasks.setup_task(args)

    print(f"Reference file: {parsed_args.input_ordered_file}")
    print(f"Shuffled file: {parsed_args.input_shuffled_file}")

    # Optimize ensemble for generation and set the source and dest dicts on the model (required by scorer)
    for model in models:
        model.make_generation_fast_()
        if args.fp16:
            model.half()

    assert len(models) == 1

    print('num. model params: {}'.format(sum(p.numel() for p in models[0].parameters())))

    lm_input_dictionary = task.dictionary
    vocab = task.target_dictionary

    #print(f"len(lm_input_dictionary): {len(lm_input_dictionary)}")
    #print(f"len(lm_dictionary): {len(vocab)}")
    print(f"Size of vocabulary dictionary: {len(vocab)}")
    assert len(lm_input_dictionary) == len(vocab)
    # print(lm_input_dictionary.eos())
    # print(vocab.eos())

    model = models[0]
    if use_cuda:
        model.cuda()

    #######################################
    eos_id = lm_input_dictionary.eos()

    ordered_input_file_object = codecs.open(parsed_args.input_ordered_file, encoding="utf-8")
    sentence_num = 0

    total_shuffled_neg_log_prob = 0.0
    total_original_neg_log_prob = 0.0
    total_num_tokens = 0

    with codecs.open(parsed_args.input_shuffled_file, encoding="utf-8") as f:
        for line in f:
            if sentence_num >= parsed_args.ordering_start_sentence and sentence_num <= parsed_args.ordering_end_sentence:

                token_ids_tensor = Tokenizer.tokenize(
                    line, lm_input_dictionary, add_if_not_exist=False,
                    append_eos=False, reverse_order=False,
                ).long()

                token_ids = [t for t in token_ids_tensor]
                tokens = tokenize_line(line)
                assert len(token_ids) == len(tokens)

                # print(f"------------------Sentence: {sentence_num}")
                # print(f"Length: {len(tokens)}")

                shuffled_log_prob = advance(model, [eos_id] + token_ids, token_ids + [eos_id], use_cuda, 1)
                total_shuffled_neg_log_prob += -shuffled_log_prob

                original_line = ordered_input_file_object.readline()
                original_token_ids_tensor = Tokenizer.tokenize(
                    original_line, lm_input_dictionary, add_if_not_exist=False,
                    append_eos=False, reverse_order=False,
                ).long()

                original_token_ids = [t for t in original_token_ids_tensor]
                original_tokens = tokenize_line(original_line)
                assert len(original_token_ids) == len(original_tokens) and len(original_tokens) == len(tokens)

                original_log_prob = advance(model, [eos_id] + original_token_ids, original_token_ids + [eos_id], use_cuda, 1)
                total_original_neg_log_prob += -original_log_prob
                total_num_tokens += len(original_token_ids) + 1  # +1 for the eos_id

            else:
                ordered_input_file_object.readline()  # advance file to maintain alignment with the shuffled file

            sentence_num += 1

    shuffled_perplexity = np.exp(total_shuffled_neg_log_prob / total_num_tokens)
    original_perplexity = np.exp(total_original_neg_log_prob / total_num_tokens)

    print(f"Shuffled perplexity: {shuffled_perplexity}")
    print(f"Original perplexity: {original_perplexity}")
    print(f"Total tokens (including eos): {total_num_tokens}")
Example #21
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        d = Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line,
                                             args.workers)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(
            set([
                train_path(lang)
                for lang in [args.source_lang, args.target_lang]
            ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result['replaced'])
            n_seq_tok[0] += worker_result['nseq']
            n_seq_tok[1] += worker_result['ntok']

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(binarize,
                                 (args, input_file, dict, prefix, lang,
                                  offsets[worker_id], offsets[worker_id + 1]),
                                 callback=merge_result)
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, 'bin'))
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, n_seq_tok[0], n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1], dict.unk_word))

    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang, num_workers)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref,
                         'train',
                         lang,
                         num_workers=args.workers)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = Dictionary.load(dict_path(args.source_lang))
        tgt_dict = Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk(
                            ) and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
Example #22
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = dictionary.Dictionary()
        for lang in [args.source_lang, args.target_lang]:
            Tokenizer.add_file_to_dictionary(
                filename='{}.{}'.format(args.trainpref, lang),
                dict=src_dict,
                tokenize=tokenize_line,
            )
        src_dict.finalize()
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))

    src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
                  threshold=args.thresholdsrc, nwords=args.nwordssrc)
    if target:
        tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
                      threshold=args.thresholdtgt, nwords=args.nwordstgt)

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang,
                                        args.target_lang, lang)
        )

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(
            args.destdir, output_prefix,
            args.source_lang, args.target_lang, lang))

    def make_dataset(input_prefix, output_prefix, lang, output_format='binary'):
        if output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = os.path.join(args.destdir, '{}.{}'.format(output_prefix, lang))
            shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file)

    def make_all(args, make_dataset, lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang, args.output_format)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang, args.output_format)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang, args.output_format)

    make_all(args, make_dataset, args.source_lang)
    if target:
        make_all(args, make_dataset, args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
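
A quick aside on the '{:.3}%' pattern in the binarization log above: because no presentation type follows the precision, Python falls back to general-style formatting, so three significant digits are printed rather than three decimal places. A tiny self-contained check:

# '.3' with no type letter behaves like 'g': three significant digits
print('{:.3}% replaced'.format(100 * 37 / 12345))   # 0.3% replaced
# '.3f' would instead print three decimal places
print('{:.3f}% replaced'.format(100 * 37 / 12345))  # 0.300% replaced
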
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def build_dictionary(filenames):
        d = dictionary.Dictionary()
        for filename in filenames:
            Tokenizer.add_file_to_dictionary(filename, d, tokenize_line)
        return d

    def train_path(lang):
        return '{}{}'.format(args.trainpref, ('.' + lang) if lang else '')

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += f'.{lang}'
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path('dict', lang) + '.txt'

    def dataset_dest_path(output_prefix, lang, extension):
        base = f'{args.destdir}/{output_prefix}'
        lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
        return f'{base}{lang_part}.{extension}'

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(
            set([
                train_path(lang)
                for lang in [args.source_lang, args.target_lang]
            ]))
        tgt_dict = src_dict
    else:
        if args.srcdict:
            src_dict = dictionary.Dictionary.load(args.srcdict)
        else:
            assert args.trainpref, "--trainpref must be set if --srcdict is not specified"
            src_dict = build_dictionary([train_path(args.source_lang)])
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert args.trainpref, "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)])

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_path(output_prefix, lang, 'bin'))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize(dataset_dest_path(output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang):
        if args.output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                '.{}-{}'.format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref, 'train', lang)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(',')):
                outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(',')):
                outprefix = 'test{}'.format(k) if k > 0 else 'test'
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
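
For orientation, here is a minimal sketch of the file names that the dataset_dest_path helper above produces; argparse.Namespace stands in for the parsed args, and the de-en language pair is only an assumption for illustration.

import argparse

args = argparse.Namespace(destdir='data-bin', source_lang='de', target_lang='en')

def dataset_dest_path(output_prefix, lang, extension):
    base = f'{args.destdir}/{output_prefix}'
    lang_part = f'.{args.source_lang}-{args.target_lang}.{lang}' if lang is not None else ''
    return f'{base}{lang_part}.{extension}'

print(dataset_dest_path('train', 'de', 'bin'))  # data-bin/train.de-en.de.bin
print(dataset_dest_path('valid', 'en', 'idx'))  # data-bin/valid.de-en.en.idx
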
Beispiel #24
0
def main():
    parser = argparse.ArgumentParser(
        description=
        'Data pre-processing: Create dictionary and store data in binary format'
    )
    parser.add_argument('-s',
                        '--source-lang',
                        default=None,
                        metavar='SRC',
                        help='source language')
    parser.add_argument('-t',
                        '--target-lang',
                        default=None,
                        metavar='TARGET',
                        help='target language')
    parser.add_argument('--trainpref',
                        metavar='FP',
                        default='train',
                        help='train file prefix')
    parser.add_argument('--validpref',
                        metavar='FP',
                        default='valid',
                        help='comma separated, valid language prefixes')
    parser.add_argument('--testpref',
                        metavar='FP',
                        default='test',
                        help='comma separated, test language prefixes')
    parser.add_argument('--destdir',
                        metavar='DIR',
                        default='data-bin',
                        help='destination dir')
    parser.add_argument(
        '--thresholdtgt',
        metavar='N',
        default=0,
        type=int,
        help='map words appearing less than threshold times to unknown')
    parser.add_argument(
        '--thresholdsrc',
        metavar='N',
        default=0,
        type=int,
        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--nwordstgt',
                        metavar='N',
                        default=-1,
                        type=int,
                        help='number of target words to retain')
    parser.add_argument('--nwordssrc',
                        metavar='N',
                        default=-1,
                        type=int,
                        help='number of source words to retain')
    parser.add_argument('--alignfile',
                        metavar='ALIGN',
                        default=None,
                        help='an alignment file (optional)')

    args = parser.parse_args()
    print(args)

    os.makedirs(args.destdir, exist_ok=True)

    src_dict = Tokenizer.build_dictionary(
        filename='{}.{}'.format(args.trainpref, args.source_lang))
    src_dict.save(os.path.join(args.destdir,
                               'dict.{}.txt'.format(args.source_lang)),
                  threshold=args.thresholdsrc,
                  nwords=args.nwordssrc)
    tgt_dict = Tokenizer.build_dictionary(
        filename='{}.{}'.format(args.trainpref, args.target_lang))
    tgt_dict.save(os.path.join(args.destdir,
                               'dict.{}.txt'.format(args.target_lang)),
                  threshold=args.thresholdtgt,
                  nwords=args.nwordstgt)

    def make_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder('{}/{}.{}-{}.{}.bin'.format(
            args.destdir, output_prefix, args.source_lang, args.target_lang,
            lang))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix,
                                                args.source_lang,
                                                args.target_lang, lang))

    make_dataset(args.trainpref, 'train', args.source_lang)
    make_dataset(args.trainpref, 'train', args.target_lang)
    for k, validpref in enumerate(args.validpref.split(',')):
        outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
        make_dataset(validpref, outprefix, args.source_lang)
        make_dataset(validpref, outprefix, args.target_lang)
    for k, testpref in enumerate(args.testpref.split(',')):
        outprefix = 'test{}'.format(k) if k > 0 else 'test'
        make_dataset(testpref, outprefix, args.source_lang)
        make_dataset(testpref, outprefix, args.target_lang)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
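
A note on the zip_longest used in the alignment loop above: if the alignment file has a different number of lines than the training files, the shorter iterables are padded with None, and the following tokenize/split calls would then fail. A self-contained illustration of that padding behaviour (the sentences are made up):

from itertools import zip_longest

align_lines = ['0-0 1-1', '0-1']
src_lines = ['ein Haus', 'das Haus', 'ein Hund']  # one line more than the alignments
for a, s in zip_longest(align_lines, src_lines):
    print(a, '|', s)
# 0-0 1-1 | ein Haus
# 0-1 | das Haus
# None | ein Hund
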
Beispiel #25
0
def main():
    parser = argparse.ArgumentParser(
        description='Data pre-processing: Create dictionary and store data in binary format')
    parser.add_argument('-s', '--source-lang', default=None, metavar='SRC', help='source language')
    parser.add_argument('-t', '--target-lang', default=None, metavar='TARGET', help='target language')
    parser.add_argument('--trainpref', metavar='FP', default='train', help='train file prefix')
    parser.add_argument('--validpref', metavar='FP', default='valid', help='comma separated, valid language prefixes')
    parser.add_argument('--testpref', metavar='FP', default='test', help='comma separated, test language prefixes')
    parser.add_argument('--destdir', metavar='DIR', default='data-bin', help='destination dir')
    parser.add_argument('--thresholdtgt', metavar='N', default=0, type=int,
                        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--thresholdsrc', metavar='N', default=0, type=int,
                        help='map words appearing less than threshold times to unknown')
    parser.add_argument('--tgtdict', metavar='FP', help='reuse given target dictionary')
    parser.add_argument('--srcdict', metavar='FP', help='reuse given source dictionary')
    parser.add_argument('--nwordstgt', metavar='N', default=-1, type=int, help='number of target words to retain')
    parser.add_argument('--nwordssrc', metavar='N', default=-1, type=int, help='number of source words to retain')
    parser.add_argument('--alignfile', metavar='ALIGN', default=None, help='an alignment file (optional)')
    parser.add_argument('--output-format', metavar='FORMAT', default='binary', choices=['binary', 'raw'],
                        help='output format (optional)')

    args = parser.parse_args()
    print(args)
    os.makedirs(args.destdir, exist_ok=True)

    if args.srcdict:
        src_dict = dictionary.Dictionary.load(args.srcdict)
    else:
        src_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.source_lang))
    src_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)),
                  threshold=args.thresholdsrc, nwords=args.nwordssrc)

    if args.tgtdict:
        tgt_dict = dictionary.Dictionary.load(args.tgtdict)
    else:
        tgt_dict = Tokenizer.build_dictionary(filename='{}.{}'.format(args.trainpref, args.target_lang))
    tgt_dict.save(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)),
                  threshold=args.thresholdtgt, nwords=args.nwordstgt)

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder(
            '{}/{}.{}-{}.{}.bin'.format(args.destdir, output_prefix, args.source_lang,
                                        args.target_lang, lang)
        )

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(
            args.destdir, output_prefix,
            args.source_lang, args.target_lang, lang))

    def make_dataset(input_prefix, output_prefix, lang, output_format='binary'):
        if output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = os.path.join(args.destdir, '{}.{}'.format(output_prefix, lang))
            shutil.copyfile('{}.{}'.format(input_prefix, lang), output_text_file)

    make_dataset(args.trainpref, 'train', args.source_lang, args.output_format)
    make_dataset(args.trainpref, 'train', args.target_lang, args.output_format)
    for k, validpref in enumerate(args.validpref.split(',')):
        outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
        make_dataset(validpref, outprefix, args.source_lang, args.output_format)
        make_dataset(validpref, outprefix, args.target_lang, args.output_format)
    for k, testpref in enumerate(args.testpref.split(',')):
        outprefix = 'test{}'.format(k) if k > 0 else 'test'
        make_dataset(testpref, outprefix, args.source_lang, args.output_format)
        make_dataset(testpref, outprefix, args.target_lang, args.output_format)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
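
The freq_map/align_dict block above reduces, for every source id, the observed target ids to the single most frequent one. The same reduction in isolation, using hypothetical token ids:

from collections import defaultdict

freq_map = defaultdict(lambda: defaultdict(int))
# hypothetical (source id, target id) alignment pairs
for srcidx, tgtidx in [(4, 9), (4, 9), (4, 11), (5, 12)]:
    freq_map[srcidx][tgtidx] += 1

align_dict = {src: max(tgts, key=tgts.get) for src, tgts in freq_map.items()}
print(align_dict)  # {4: 9, 5: 12} -- each source id keeps its most frequent target id
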
Beispiel #26
0
def main(args):
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    if args.joined_dictionary:
        assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
        assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
        src_dict = build_dictionary(set(chain.from_iterable([
            train_paths(lang)
            for lang in [args.source_lang, args.target_lang]
        ])))
        tgt_dict = src_dict
    else:
        assert args.trainpref, "--trainpref must be set"
        if not args.srcdict:
            src_dict = build_dictionary(chain.from_iterable([train_paths(args.source_lang)]))

        if target and not args.tgtdict:
            assert args.trainpref, "--trainpref must be set"
            tgt_dict = build_dictionary(chain.from_iterable([train_paths(args.target_lang)]))

        if args.srcdict:
            src_dict = load_dictionary(args.srcdict)
        if target and args.tgtdict:
            tgt_dict = load_dictionary(args.tgtdict)

    src_dict.finalize(
        threshold=args.thresholdsrc,
        nwords=args.nwordssrc,
        padding_factor=args.padding_factor,
    )
    src_dict.save(dict_path(args.source_lang))
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    make_all(args.source_lang, args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_paths(args.source_lang)[0]
        tgt_file_name = train_paths(args.target_lang)[0]
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s, src_dict, add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t, tgt_dict, add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')), a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx], key=freq_map[srcidx].get)

        with open(os.path.join(args.destdir, 'alignment.{}-{}.txt'.format(
                args.source_lang, args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
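
A rough, self-contained illustration (a plain Counter, not fairseq's Dictionary class) of what --joined-dictionary means in the variant above: a single vocabulary is counted over both the source and the target training files and then shared, just like the tgt_dict = src_dict assignment.

import os
import tempfile
from collections import Counter

def add_file_to_counter(path, counter):
    with open(path) as f:
        for line in f:
            counter.update(line.split())

tmpdir = tempfile.mkdtemp()
for name, text in [('train.de', 'ein Haus\n'), ('train.en', 'a house\n')]:
    with open(os.path.join(tmpdir, name), 'w') as f:
        f.write(text)

joined = Counter()
for name in ['train.de', 'train.en']:
    add_file_to_counter(os.path.join(tmpdir, name), joined)

src_vocab = tgt_vocab = joined  # one shared vocabulary for both languages
print(sorted(joined))  # ['Haus', 'a', 'ein', 'house']
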
Beispiel #27
0
def main():
    parser = argparse.ArgumentParser(
        description=
        'Data pre-processing: Create dictionary and store data in binary format'
    )
    parser.add_argument('-s',
                        '--source-lang',
                        default=None,
                        metavar='SRC',
                        help='source language')
    parser.add_argument('-t',
                        '--target-lang',
                        default=None,
                        metavar='TARGET',
                        help='target language')
    parser.add_argument('--trainpref',
                        metavar='FP',
                        default='train',
                        help='train file prefix')
    parser.add_argument('--validpref',
                        metavar='FP',
                        default='valid',
                        help='comma separated, valid language prefixes')
    parser.add_argument('--testpref',
                        metavar='FP',
                        default='test',
                        help='comma separated, test language prefixes')
    parser.add_argument('--destdir',
                        metavar='DIR',
                        default='data-bin',
                        help='destination dir')
    # If the training set is large, it will produce many low-frequency words; these carry little meaning and are often just spelling mistakes.
    # The two parameters below filter such low-frequency words: with a value of 10, words occurring fewer than 10 times are dropped and will not appear in the dict.txt file.
    parser.add_argument(
        '--thresholdtgt',
        metavar='N',
        default=0,
        type=int,
        help='map words appearing less than threshold times to unknown')
    parser.add_argument(
        '--thresholdsrc',
        metavar='N',
        default=0,
        type=int,
        help='map words appearing less than threshold times to unknown')
    # If you already have dictionaries prepared, they can be passed in via the two parameters below.
    parser.add_argument('--tgtdict',
                        metavar='FP',
                        help='reuse given target dictionary')
    parser.add_argument('--srcdict',
                        metavar='FP',
                        help='reuse given source dictionary')
    # Limit the number of words used for training.
    parser.add_argument('--nwordstgt',
                        metavar='N',
                        default=-1,
                        type=int,
                        help='number of target words to retain')
    parser.add_argument('--nwordssrc',
                        metavar='N',
                        default=-1,
                        type=int,
                        help='number of source words to retain')
    # Alignment file format (train files): "source word id-target word id" pairs separated by spaces, one line per sentence (e.g. 0-0 1-2 1-3 2-1 3-4 5-6 7-7)
    # The alignment file can be generated by another alignment tool or aligned by hand
    # https://github.com/facebookresearch/fairseq-py/blob/master/preprocess.py#L99
    # Note: the alignment file must have the same number of lines as train.en, not the total number of lines in the dataset
    parser.add_argument('--alignfile',
                        metavar='ALIGN',
                        default=None,
                        help='an alignment file (optional)')
    parser.add_argument('--output-format',
                        metavar='FORMAT',
                        default='binary',
                        choices=['binary', 'raw'],
                        help='output format (optional)')

    args = parser.parse_args()
    print(args)
    os.makedirs(args.destdir, exist_ok=True)

    if args.srcdict:
        src_dict = dictionary.Dictionary.load(args.srcdict)
    else:
        src_dict = Tokenizer.build_dictionary(
            filename='{}.{}'.format(args.trainpref, args.source_lang))
    src_dict.save(os.path.join(args.destdir,
                               'dict.{}.txt'.format(args.source_lang)),
                  threshold=args.thresholdsrc,
                  nwords=args.nwordssrc)

    if args.tgtdict:
        tgt_dict = dictionary.Dictionary.load(args.tgtdict)
    else:
        tgt_dict = Tokenizer.build_dictionary(
            filename='{}.{}'.format(args.trainpref, args.target_lang))
    tgt_dict.save(os.path.join(args.destdir,
                               'dict.{}.txt'.format(args.target_lang)),
                  threshold=args.thresholdtgt,
                  nwords=args.nwordstgt)

    def make_binary_dataset(input_prefix, output_prefix, lang):
        dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(lang)))
        print('| [{}] Dictionary: {} types'.format(lang, len(dict) - 1))

        ds = indexed_dataset.IndexedDatasetBuilder('{}/{}.{}-{}.{}.bin'.format(
            args.destdir, output_prefix, args.source_lang, args.target_lang,
            lang))

        def consumer(tensor):
            ds.add_item(tensor)

        input_file = '{}.{}'.format(input_prefix, lang)
        res = Tokenizer.binarize(input_file, dict, consumer)
        print('| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}'.format(
            lang, input_file, res['nseq'], res['ntok'],
            100 * res['nunk'] / res['ntok'], dict.unk_word))
        ds.finalize('{}/{}.{}-{}.{}.idx'.format(args.destdir, output_prefix,
                                                args.source_lang,
                                                args.target_lang, lang))

    def make_dataset(input_prefix,
                     output_prefix,
                     lang,
                     output_format='binary'):
        if output_format == 'binary':
            make_binary_dataset(input_prefix, output_prefix, lang)
        elif output_format == 'raw':
            # Copy original text file to destination folder
            output_text_file = os.path.join(
                args.destdir, '{}.{}'.format(output_prefix, lang))
            shutil.copyfile('{}.{}'.format(input_prefix, lang),
                            output_text_file)

    make_dataset(args.trainpref, 'train', args.source_lang, args.output_format)
    make_dataset(args.trainpref, 'train', args.target_lang, args.output_format)
    for k, validpref in enumerate(args.validpref.split(',')):
        outprefix = 'valid{}'.format(k) if k > 0 else 'valid'
        make_dataset(validpref, outprefix, args.source_lang,
                     args.output_format)
        make_dataset(validpref, outprefix, args.target_lang,
                     args.output_format)
    for k, testpref in enumerate(args.testpref.split(',')):
        outprefix = 'test{}'.format(k) if k > 0 else 'test'
        make_dataset(testpref, outprefix, args.source_lang, args.output_format)
        make_dataset(testpref, outprefix, args.target_lang, args.output_format)
    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        src_file_name = '{}.{}'.format(args.trainpref, args.source_lang)
        tgt_file_name = '{}.{}'.format(args.trainpref, args.target_lang)
        src_dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(args.source_lang)))
        tgt_dict = dictionary.Dictionary.load(
            os.path.join(args.destdir, 'dict.{}.txt'.format(args.target_lang)))
        freq_map = {}
        with open(args.alignfile, 'r') as align_file:
            with open(src_file_name, 'r') as src_file:
                with open(tgt_file_name, 'r') as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split('-')),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    'alignment.{}-{}.txt'.format(args.source_lang,
                                                 args.target_lang)), 'w') as f:
            for k, v in align_dict.items():
                print('{} {}'.format(src_dict[k], tgt_dict[v]), file=f)
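
To make the --thresholdsrc/--thresholdtgt comments above concrete, here is a toy version of the frequency filtering (not the actual Dictionary implementation): words seen fewer than threshold times are dropped from the vocabulary and are later mapped to the unknown symbol.

from collections import Counter

counts = Counter('a a a b b c'.split())
threshold = 2
kept = sorted(w for w, c in counts.items() if c >= threshold)
print(kept)  # ['a', 'b'] -- 'c' occurs only once and falls below the threshold
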
Beispiel #28
0
def main(args):
    print(args)
    os.makedirs(args.destdir, exist_ok=True)
    target = not args.only_source

    def train_path(lang):
        return "{}{}".format(args.trainpref, ("." + lang) if lang else "")

    def file_name(prefix, lang):
        fname = prefix
        if lang is not None:
            fname += ".{lang}".format(lang=lang)
        return fname

    def dest_path(prefix, lang):
        return os.path.join(args.destdir, file_name(prefix, lang))

    def dict_path(lang):
        return dest_path("dict", lang) + ".txt"

    if args.inputtype == "text":
        if args.joined_dictionary:
            assert not args.srcdict, 'cannot combine --srcdict and --joined-dictionary'
            assert not args.tgtdict, 'cannot combine --tgtdict and --joined-dictionary'
            src_dict = build_dictionary(
                {
                    train_path(lang)
                    for lang in [args.source_lang, args.target_lang]
                },
                args.workers,
            )
            tgt_dict = src_dict
        else:
            if args.srcdict:
                src_dict = dictionary.Dictionary.load(args.srcdict)
            else:
                assert (
                    args.trainpref
                ), "--trainpref must be set if --srcdict is not specified"
                src_dict = build_dictionary([train_path(args.source_lang)],
                                            args.workers)
            if target:
                if args.tgtdict:
                    tgt_dict = dictionary.Dictionary.load(args.tgtdict)
                else:
                    assert (
                        args.trainpref
                    ), "--trainpref must be set if --tgtdict is not specified"
                    tgt_dict = build_dictionary([train_path(args.target_lang)],
                                                args.workers)

        src_dict.finalize(
            threshold=args.thresholdsrc,
            nwords=args.nwordssrc,
            padding_factor=args.padding_factor,
        )
        src_dict.save(dict_path(args.source_lang))

    elif args.inputtype == "audio":
        if target:
            if args.tgtdict:
                tgt_dict = dictionary.Dictionary.load(args.tgtdict)
            else:
                assert (
                    args.trainpref
                ), "--trainpref must be set if --tgtdict is not specified"
                tgt_dict = build_dictionary([train_path(args.target_lang)],
                                            args.workers)
    if target:
        if not args.joined_dictionary:
            tgt_dict.finalize(
                threshold=args.thresholdtgt,
                nwords=args.nwordstgt,
                padding_factor=args.padding_factor,
            )
        tgt_dict.save(dict_path(args.target_lang))

    def make_binary_dataset(input_prefix, output_prefix, lang, num_workers):
        dict = dictionary.Dictionary.load(dict_path(lang))
        print("| [{}] Dictionary: {} types".format(lang, len(dict) - 1))
        n_seq_tok = [0, 0]
        replaced = Counter()

        def merge_result(worker_result):
            replaced.update(worker_result["replaced"])
            n_seq_tok[0] += worker_result["nseq"]
            n_seq_tok[1] += worker_result["ntok"]

        input_file = "{}{}".format(input_prefix,
                                   ("." + lang) if lang is not None else "")
        offsets = Tokenizer.find_offsets(input_file, num_workers)
        pool = None
        if num_workers > 1:
            pool = Pool(processes=num_workers - 1)
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                pool.apply_async(
                    binarize,
                    (
                        args,
                        input_file,
                        dict,
                        prefix,
                        lang,
                        offsets[worker_id],
                        offsets[worker_id + 1],
                    ),
                    callback=merge_result,
                )
            pool.close()

        ds = indexed_dataset.IndexedDatasetBuilder(
            dataset_dest_file(args, output_prefix, lang, "bin"))
        merge_result(
            Tokenizer.binarize(input_file,
                               dict,
                               lambda t: ds.add_item(t),
                               offset=0,
                               end=offsets[1]))
        if num_workers > 1:
            pool.join()
            for worker_id in range(1, num_workers):
                prefix = "{}{}".format(output_prefix, worker_id)
                temp_file_path = dataset_dest_prefix(args, prefix, lang)
                ds.merge_file_(temp_file_path)
                os.remove(indexed_dataset.data_file_path(temp_file_path))
                os.remove(indexed_dataset.index_file_path(temp_file_path))

        ds.finalize(dataset_dest_file(args, output_prefix, lang, "idx"))

        print("| [{}] {}: {} sents, {} tokens, {:.3}% replaced by {}".format(
            lang,
            input_file,
            n_seq_tok[0],
            n_seq_tok[1],
            100 * sum(replaced.values()) / n_seq_tok[1],
            dict.unk_word,
        ))

    def make_binary_audio_dataset(input_prefix, output_prefix, lang):
        ds = indexed_dataset.IndexedDatasetBuilder(dataset_dest_file(
            args, output_prefix, lang, 'bin'),
                                                   dtype=np.float32)

        def consumer(tensor):
            ds.add_item(tensor)

        def binarize(input_file, audio_reader, consumer):
            nseq, nsamp = 0, 0
            for tensor in audio_reader(input_file):
                consumer(tensor)
                nseq += 1
                nsamp += tensor.size(0)
            return {'nseq': nseq, 'nsamp': nsamp}

        input_file = '{}{}'.format(input_prefix,
                                   ('.' + lang) if lang is not None else '')
        audio_reader = get_reader(args.format)
        res = binarize(input_file, audio_reader, consumer)
        print('| [{}] {}: {} audio_seq, {} audio_samples'.format(
            lang, input_file, res['nseq'], res['nsamp']))
        ds.finalize(dataset_dest_file(args, output_prefix, lang, 'idx'))

    def make_dataset(input_prefix, output_prefix, lang, num_workers=1):
        if args.output_format == 'binary':
            if args.inputtype == "text" or lang == args.target_lang:
                # The target is assumed to always be textual
                make_binary_dataset(input_prefix, output_prefix, lang,
                                    num_workers)
            else:
                make_binary_audio_dataset(input_prefix, output_prefix, lang)
        elif args.output_format == "raw":
            # Copy original text file to destination folder
            output_text_file = dest_path(
                output_prefix +
                ".{}-{}".format(args.source_lang, args.target_lang),
                lang,
            )
            shutil.copyfile(file_name(input_prefix, lang), output_text_file)

    def make_all(lang):
        if args.trainpref:
            make_dataset(args.trainpref,
                         "train",
                         lang,
                         num_workers=args.workers)
        if args.validpref:
            for k, validpref in enumerate(args.validpref.split(",")):
                outprefix = "valid{}".format(k) if k > 0 else "valid"
                make_dataset(validpref, outprefix, lang)
        if args.testpref:
            for k, testpref in enumerate(args.testpref.split(",")):
                outprefix = "test{}".format(k) if k > 0 else "test"
                make_dataset(testpref, outprefix, lang)

    make_all(args.source_lang)
    if target:
        make_all(args.target_lang)

    print('| Wrote preprocessed data to {}'.format(args.destdir))

    if args.alignfile:
        if args.inputtype == "audio":
            raise NotImplementedError("Audio type incompatible with alignment")
        assert args.trainpref, "--trainpref must be set if --alignfile is specified"
        src_file_name = train_path(args.source_lang)
        tgt_file_name = train_path(args.target_lang)
        src_dict = dictionary.Dictionary.load(dict_path(args.source_lang))
        tgt_dict = dictionary.Dictionary.load(dict_path(args.target_lang))
        freq_map = {}
        with open(args.alignfile, "r") as align_file:
            with open(src_file_name, "r") as src_file:
                with open(tgt_file_name, "r") as tgt_file:
                    for a, s, t in zip_longest(align_file, src_file, tgt_file):
                        si = Tokenizer.tokenize(s,
                                                src_dict,
                                                add_if_not_exist=False)
                        ti = Tokenizer.tokenize(t,
                                                tgt_dict,
                                                add_if_not_exist=False)
                        ai = list(map(lambda x: tuple(x.split("-")),
                                      a.split()))
                        for sai, tai in ai:
                            srcidx = si[int(sai)]
                            tgtidx = ti[int(tai)]
                            if srcidx != src_dict.unk() and tgtidx != tgt_dict.unk():
                                assert srcidx != src_dict.pad()
                                assert srcidx != src_dict.eos()
                                assert tgtidx != tgt_dict.pad()
                                assert tgtidx != tgt_dict.eos()

                                if srcidx not in freq_map:
                                    freq_map[srcidx] = {}
                                if tgtidx not in freq_map[srcidx]:
                                    freq_map[srcidx][tgtidx] = 1
                                else:
                                    freq_map[srcidx][tgtidx] += 1

        align_dict = {}
        for srcidx in freq_map.keys():
            align_dict[srcidx] = max(freq_map[srcidx],
                                     key=freq_map[srcidx].get)

        with open(
                os.path.join(
                    args.destdir,
                    "alignment.{}-{}.txt".format(args.source_lang,
                                                 args.target_lang),
                ),
                "w",
        ) as f:
            for k, v in align_dict.items():
                print("{} {}".format(src_dict[k], tgt_dict[v]), file=f)