Ejemplo n.º 1
0
    def process(self, corpora, valid_corpora, output_path, bpe_symbols, max_vocab_size, working_dir='.',
                dump_dicts=False):
        """Build the BPE model and vocabularies, then encode and store the corpora.

        The BPE encoder is trained on ``corpora`` and saved to ``self._bpe_model``.
        Source and target vocabularies are filled from the encoder's terms, both
        corpus sets are prepared with ``self._prepare_corpora`` and the result is
        written with ``torch.save`` to ``<output_path>/train_processed.train.pt``.

        Args:
            corpora: training corpora; each one must expose ``reader(langs)``.
            valid_corpora: validation corpora.
            output_path: directory receiving ``train_processed.train.pt``.
            bpe_symbols: forwarded to ``SubwordTextProcessor.Builder`` as ``symbols``.
            max_vocab_size: forwarded to the builder as ``max_vocabulary_size``.
            working_dir: directory receiving the optional dictionary dumps.
            dump_dicts: when true, also write both vocabularies with ``writeFile``.
        """

        class _ReaderWrapper:
            """Context manager that opens a corpus reader lazily, on ``__enter__``."""

            def __init__(self, _corpus, _langs):
                self.corpus = _corpus
                self.langs = _langs
                self._reader = None

            def __enter__(self):
                self._reader = self.corpus.reader(self.langs).__enter__()
                return self._reader

            def __exit__(self, exc_type, exc_val, exc_tb):
                # Forward the wrapped reader's own verdict. Returning the reader
                # object itself (normally truthy) would tell Python to suppress
                # any exception raised inside the ``with`` body.
                return self._reader.__exit__(exc_type, exc_val, exc_tb)

        self._logger.info('Creating VBE vocabulary')
        vb_builder = SubwordTextProcessor.Builder(symbols=bpe_symbols, max_vocabulary_size=max_vocab_size)
        langs = [self._source_lang, self._target_lang]
        bpe_encoder = vb_builder.build([_ReaderWrapper(c, langs) for c in corpora])
        bpe_encoder.save_to_file(self._bpe_model)

        self._logger.info('Creating vocabularies')
        src_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                               onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)
        trg_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                               onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)

        for word in bpe_encoder.get_source_terms():
            src_vocab.add(word)
        for word in bpe_encoder.get_target_terms():
            trg_vocab.add(word)

        self._logger.info('Preparing training corpora')
        src_train, trg_train = self._prepare_corpora(corpora, bpe_encoder, src_vocab, trg_vocab)

        self._logger.info('Preparing validation corpora')
        src_valid, trg_valid = self._prepare_corpora(valid_corpora, bpe_encoder, src_vocab, trg_vocab)

        output_file = os.path.join(output_path, 'train_processed.train.pt')
        self._logger.info('Storing OpenNMT preprocessed validation data to "%s"' % output_file)
        torch.save({
            'dicts': {'src': src_vocab, 'tgt': trg_vocab},
            'train': {'src': src_train, 'tgt': trg_train},
            'valid': {'src': src_valid, 'tgt': trg_valid},
        }, output_file)

        if dump_dicts:
            src_dict_file = os.path.join(working_dir, 'train_processed.src.dict')
            trg_dict_file = os.path.join(working_dir, 'train_processed.trg.dict')

            self._logger.info('Storing OpenNMT preprocessed source dictionary "%s"' % src_dict_file)
            src_vocab.writeFile(src_dict_file)

            self._logger.info('Storing OpenNMT preprocessed target dictionary "%s"' % trg_dict_file)
            trg_vocab.writeFile(trg_dict_file)
Ejemplo n.º 2
0
def initVocabularyWithEmb(name,
                          dataFile,
                          vocabFile,
                          embFile,
                          vocabSize,
                          embFile2=None,
                          phraseFile=None):
    """Load or build the ``source`` / ``target`` vocabulary together with embeddings.

    Args:
        name: either ``"source"`` or ``"target"``.
        dataFile: text data handed to the vocabulary builder.
        vocabFile: existing dictionary file; when given it is loaded instead of
            building a new vocabulary.
        embFile: embedding file (mandatory).
        vocabSize: size handed to the vocabulary builder.
        embFile2: second embedding file, forwarded for the target side only.
        phraseFile: file with one phrase per line, used for the target side only.

    Returns:
        The loaded or freshly built vocabulary.

    Raises:
        ValueError: if ``name`` is unknown or ``embFile`` is missing.
    """
    if name not in ("source", "target"):
        # Previously this fell through to ``return vocab`` with ``vocab`` unbound.
        raise ValueError("Unknown vocabulary name: " + repr(name))
    if embFile is None:
        # The message now names the side that is actually missing its embeddings.
        raise ValueError("Please provide an embedding file for " + name)

    phrase_list = None
    if name == "target" and phraseFile is not None:
        with open(phraseFile, 'r') as file:
            phrase_list = set(l.strip() for l in file)

    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile +
              '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')
        return vocab

    # No dictionary file was supplied, so generate the vocabulary.
    print('Building ' + name + ' vocabulary...')
    if name == "source":
        vocab = makeVocabulary_src(dataFile, vocabSize, embFile is not None,
                                   embFile)
    else:
        vocab = makeVocabulary(dataFile, vocabSize, embFile is not None,
                               embFile, embFile2, phrase_list)
        print("sanity check: " + str(vocab is None))
    return vocab
Ejemplo n.º 3
0
    def process(self, corpora, valid_corpora, output_path, checkpoint=None):
        """Create (or restore) the BPE model and vocabularies, then encode the corpora.

        With a ``checkpoint`` prefix, the BPE model is copied from
        ``<checkpoint>.bpe`` and the vocabularies are loaded from
        ``<checkpoint>.vcb``. Otherwise both are built from ``corpora``.
        The vocabularies are saved to ``<output_path>/vocab.pt`` and the BPE
        model to ``<output_path>/vocab.bpe``.

        Args:
            corpora: training corpora; each one must expose ``reader(langs)``.
            valid_corpora: validation corpora.
            output_path: directory receiving the model, vocabularies and datasets.
            checkpoint: optional path prefix of an existing checkpoint.
        """
        bpe_output_path = os.path.join(output_path, 'vocab.bpe')
        voc_output_path = os.path.join(output_path, 'vocab.pt')

        if checkpoint is not None:
            existing_bpe_path = checkpoint + '.bpe'
            existing_vcb_path = checkpoint + '.vcb'

            with _log_timed_action(self._logger, 'Loading BPE model from %s' % existing_bpe_path):
                shutil.copy(existing_bpe_path, bpe_output_path)
                bpe_encoder = SubwordTextProcessor.load_from_file(bpe_output_path)

            # Log the file that is really read: the message used to name the
            # '.dat' path although the vocabularies come from the '.vcb' file.
            with _log_timed_action(self._logger, 'Loading vocabularies from %s' % existing_vcb_path):
                checkpoint_vcb = torch.load(existing_vcb_path, map_location=lambda storage, loc: storage)
                src_vocab = checkpoint_vcb['src']
                trg_vocab = checkpoint_vcb['tgt']

        else:
            with _log_timed_action(self._logger, 'Creating BPE model'):
                vb_builder = SubwordTextProcessor.Builder(symbols=self._bpe_symbols,
                                                          max_vocabulary_size=self._max_vocab_size,
                                                          vocab_pruning_threshold=self._vocab_pruning_threshold)
                bpe_encoder = vb_builder.build([c.reader([self._source_lang, self._target_lang]) for c in corpora])
                bpe_encoder.save_to_file(bpe_output_path)

            with _log_timed_action(self._logger, 'Creating vocabularies'):
                src_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)
                trg_vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD], lower=False)

                for word in bpe_encoder.get_source_terms():
                    src_vocab.add(word)
                for word in bpe_encoder.get_target_terms():
                    trg_vocab.add(word)

        torch.save({
            'src': src_vocab,
            'tgt': trg_vocab
        }, voc_output_path)

        with _log_timed_action(self._logger, 'Preparing training corpora'):
            train_output_path = os.path.join(output_path, 'train_dataset')
            self._prepare_corpora(corpora, bpe_encoder, src_vocab, trg_vocab, train_output_path)

        with _log_timed_action(self._logger, 'Preparing validation corpora'):
            valid_output_path = os.path.join(output_path, 'valid_dataset')
            self._prepare_corpora(valid_corpora, bpe_encoder, src_vocab, trg_vocab, valid_output_path)
def main():
    """Build one shared vocabulary, encode the data and save a file per context size."""
    # The placeholder Dict is replaced right away by the real vocabulary.
    dicts = {'src': onmt.Dict()}
    dicts['src'] = initVocabulary('source', opt.train_txt, opt.src_vocab,
                                  opt.src_vocab_size)
    # Source and target point at the very same dictionary object.
    dicts['tgt'] = dicts['src']

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.dict')

    print('Preparing training ...')
    train_data = makeData(opt.train_txt, dicts)
    print('Preparing validation ...')
    valid_data = makeData(opt.valid_txt, dicts)

    for cs in train_data.keys():
        # One output file per key of the training data.
        out_path = opt.save_data + '_cs' + str(cs) + '.train.pt'
        print('Saving data to \'' + out_path + '\'...')
        payload = {
            'dicts': dicts,
            'train': train_data[cs],
            'valid': valid_data[cs],
        }
        torch.save(payload, out_path)
    '''
Ejemplo n.º 5
0
def init_vocab(name,
               dataFiles,
               vocabFile,
               vocabSize,
               tokenizer,
               num_workers=1,
               join=False):
    """Return the ``name`` vocabulary, loaded from ``vocabFile`` or built from data.

    ``join`` is accepted but not used by this function.
    """
    if vocabFile is None:
        # No dictionary file supplied: build the vocabulary from the data files.
        print('Building ' + name + ' vocabulary...')
        vocab = make_vocab(dataFiles,
                           vocabSize,
                           tokenizer,
                           num_workers=num_workers)
    else:
        # Load the existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    print()
    return vocab
Ejemplo n.º 6
0
def main():
    """Build source/target vocabularies, encode train and validation data, save them."""
    dicts = {'src': onmt.Dict()}
    if opt.src_type == "text":
        # Only textual sources get a real source vocabulary.
        dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab,
                                      opt.src_vocab_size)
    dicts['tgt'] = initVocabulary('target', opt.train_tgt, opt.tgt_vocab,
                                  opt.tgt_vocab_size)

    print('Preparing training ...')
    train_src, train_tgt = makeData(opt.train_src, opt.train_tgt,
                                    dicts['src'], dicts['tgt'])
    train = {'src': train_src, 'tgt': train_tgt}

    print('Preparing validation ...')
    valid_src, valid_tgt = makeData(opt.valid_src, opt.valid_tgt,
                                    dicts['src'], dicts['tgt'])
    valid = {'src': valid_src, 'tgt': valid_tgt}

    # Dictionaries are written out only when they were not supplied by the user.
    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
    if opt.tgt_vocab is None:
        saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')

    out_path = opt.save_data + '.train.pt'
    print('Saving data to \'' + out_path + '\'...')
    torch.save({
        'dicts': dicts,
        'type': opt.src_type,
        'train': train,
        'valid': valid,
    }, out_path)
Ejemplo n.º 7
0
def collect_attributes(atb_files):
    """Collect attribute vocabularies from whitespace-separated attribute files.

    Each line may hold several attributes separated by whitespace; the
    attribute at position ``i`` of every line is added to the ``onmt.Dict``
    stored under key ``i``.

    Args:
        atb_files: iterable of file paths; falsy entries are skipped.

    Returns:
        dict mapping attribute position to its ``onmt.Dict``.
    """
    print("* Reading attributes ...")
    atb_dicts = dict()

    for file_ in atb_files:

        if not file_:
            continue

        # ``with`` guarantees the handle is closed; it used to be left open.
        with open(file_) as reader:
            for line in reader:
                # attributes are split by space
                for i, atb in enumerate(line.strip().split()):
                    if i not in atb_dicts:
                        atb_dicts[i] = onmt.Dict()

                    atb_dicts[i].add(atb)

    return atb_dicts
Ejemplo n.º 8
0
def makeVocabulary(filenames, size):
    """Build one vocabulary from the whitespace tokens of several files, then prune it."""
    special_words = [
        onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
        onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
    ]
    vocab = onmt.Dict(special_words, lower=opt.lower)

    for path in filenames:
        print("Reading file " + path)
        with open(path) as handle:
            for line in handle:
                for token in line.split():
                    vocab.add(token)

    size_before_pruning = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 9
0
def make_join_vocab(filenames, size, input_type="word"):
    """Build a single vocabulary over all ``filenames`` and prune it to ``size``.

    ``input_type`` selects the unit: ``"word"`` splits on whitespace,
    ``"char"`` delegates to ``split_line_by_char``. Any other value raises
    ``NotImplementedError`` as soon as a line is processed.
    """
    special_words = [onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                     onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD]
    vocab = onmt.Dict(special_words, lower=opt.lower)

    for path in filenames:
        print("Reading file %s ... " % path)
        with open(path) as handle:
            for line in handle:
                if input_type == "word":
                    units = line.split()
                elif input_type == "char":
                    units = split_line_by_char(line)
                else:
                    raise NotImplementedError("Input type not implemented")

                for unit in units:
                    vocab.add(unit)

    size_before_pruning = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 10
0
def make_vocab(filename, size, input_type='word'):
    """Build a vocabulary from ``filename`` and prune it to ``size``.

    Args:
        filename: text file, one sentence per line.
        size: size passed to ``vocab.prune``.
        input_type: ``"word"`` (whitespace tokens) or ``"char"``
            (units produced by ``split_line_by_char``).

    Returns:
        The pruned vocabulary.

    Raises:
        NotImplementedError: for any other ``input_type`` once a line is read.
    """
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    # The former "unknown word" counter was removed: it compared the index with
    # the string literal 'onmt.Constants.UNK', its total was never used, and it
    # read ``idx`` before assignment whenever the first line had no tokens.
    with open(filename) as f:
        for sent in f:
            if input_type == "word":
                units = sent.split()
            elif input_type == "char":
                units = split_line_by_char(sent)
            else:
                raise NotImplementedError("Input type not implemented")

            for unit in units:
                vocab.add(unit)

    original_size = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), original_size))

    return vocab
Ejemplo n.º 11
0
def init_vocab(name, dataFile, vocabFile, vocabSize, join=False, input_type='word'):
    """Return the ``name`` vocabulary: loaded from ``vocabFile`` when given, built otherwise.

    With ``join`` set, the vocabulary is built by ``make_join_vocab``;
    otherwise ``make_vocab`` is used.
    """
    if vocabFile is not None:
        # Load the existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')
    elif join:
        print('Building ' + 'shared' + ' vocabulary...')
        vocab = make_join_vocab(dataFile, vocabSize, input_type=input_type)
    else:
        print('Building ' + name + ' vocabulary...')
        vocab = make_vocab(dataFile, vocabSize, input_type=input_type)

    print()
    return vocab
Ejemplo n.º 12
0
def makeVocabulary(filename, size, input_type='word'):
    """Build a word- or character-level vocabulary from ``filename`` and prune it.

    ``"word"`` adds whitespace-separated tokens; ``"char"`` adds every
    character of the stripped line. Other values raise ``NotImplementedError``
    when the first line is processed.
    """
    special_words = [
        onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
        onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
    ]
    vocab = onmt.Dict(special_words, lower=opt.lower)

    with open(filename) as handle:
        for line in handle:
            if input_type == "word":
                units = line.split()
            elif input_type == "char":
                # A string iterates character by character.
                units = line.strip()
            else:
                raise NotImplementedError("Input type not implemented")

            for unit in units:
                vocab.add(unit)

    size_before_pruning = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 13
0
def initVocabulary(name, dataFile, vocabFile, vocabSize):
    """Load the ``name`` vocabulary from ``vocabFile`` or build it from ``dataFile``.

    When ``opt.prune_by_freq`` is set, the minimum frequency for the side
    (``opt.src_min_freq`` / ``opt.tgt_min_freq``) is forwarded to
    ``makeVocabulary``.

    Args:
        name: ``'source'`` or ``'target'``.
        dataFile: text data used when a vocabulary has to be built.
        vocabFile: existing dictionary file, or ``None``.
        vocabSize: size handed to ``makeVocabulary``.

    Returns:
        The loaded or freshly built vocabulary.

    Raises:
        ValueError: when pruning by frequency is requested for an unknown name.
    """

    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        # str() is needed: concatenating the size directly raises TypeError
        # when size() returns an int.
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        if opt.prune_by_freq:
            if name == 'source':
                min_freq = opt.src_min_freq
            elif name == 'target':
                min_freq = opt.tgt_min_freq
            else:
                # Used to surface as an unbound-local error further down.
                raise ValueError("Unknown vocabulary name: " + repr(name))
            vocab = makeVocabulary(dataFile, vocabSize, min_freq)
        else:
            vocab = makeVocabulary(dataFile, vocabSize)

    print()
    return vocab
Ejemplo n.º 14
0
def initVocabulary(name, srcFile, vocabFile, vocabSize, dacts_vocab=0):
    """Load the ``name`` vocabulary from ``vocabFile`` or build and prune it.

    A truthy ``dacts_vocab`` makes the builder receive ``0`` as its third
    argument; otherwise the builder is called with two arguments only.
    """
    if vocabFile is not None:
        # Load the existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')
    else:
        # No dictionary file supplied: generate the vocabulary and prune it.
        extra_args = (0,) if dacts_vocab else ()
        print('Building ' + name + ' vocabulary...')
        vocab = makeVocabulary(srcFile, vocabSize, *extra_args)

        size_before_pruning = vocab.size()
        vocab = vocab.prune(vocabSize)
        print('Created dictionary of size %d (pruned from %d)' %
              (vocab.size(), size_before_pruning))

    print()
    return vocab
Ejemplo n.º 15
0
def main():
    """Build a shared vocabulary, encode train/validation data and save them.

    The result is written with ``torch.save`` to ``<opt.save_data>.train.pt``.
    """

    dicts = {}
    dicts['src'] = onmt.Dict()
    dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab,
                                  opt.src_vocab_size)
    # Source and target share the same dictionary object.
    dicts['tgt'] = dicts['src']

    print('Preparing training ...')
    train = {}
    train['src'], train['tgt'] = makeData(opt.train_src, dicts['src'])

    print('Preparing validation ...')
    valid = {}
    valid['src'], valid['tgt'] = makeData(opt.valid_src, dicts['src'])

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')

    # Report the path that is really written. The message used to announce a
    # '_cs<context_size>' file name that did not match the saved file.
    save_path = opt.save_data + '.train.pt'
    print('Saving data to \'' + save_path + '\'...')
    save_data = {'dicts': dicts,
                 'train': train,
                 'valid': valid}
    torch.save(save_data, save_path)
Ejemplo n.º 16
0
def init_vocab(name,
               data_files,
               vocab_file,
               vocab_size,
               tokenizer,
               num_workers=1):
    """Return the ``name`` vocabulary, loaded from ``vocab_file`` or built from data.

    When ``opt.load_bpe_voc`` is set, the dictionary is pre-seeded with the
    side-specific pad/unk/bos/eos tokens from ``opt`` before the file is
    loaded; an unknown ``name`` prints a warning and exits in that case.
    """
    if vocab_file is None:
        print('Building ' + name + ' vocabulary...')
        vocab = make_vocab(
            name,
            data_files,
            vocab_size,
            tokenizer,
            num_workers=num_workers,
        )
        print()
        return vocab

    # Load the existing word dictionary.
    print('Reading ' + name + ' vocabulary from \'' + vocab_file + '\'...')
    if not opt.load_bpe_voc:
        vocab = onmt.Dict()
    elif name == "target":
        vocab = onmt.Dict([
            opt.tgt_pad_token, opt.tgt_unk_token, opt.tgt_bos_token,
            opt.tgt_eos_token
        ],
                          lower=opt.lower)
    elif name == "source":
        vocab = onmt.Dict([
            opt.src_pad_token, opt.src_unk_token, opt.src_bos_token,
            opt.src_eos_token
        ],
                          lower=opt.lower)
    else:
        print("Warning: name should be source or target")
        exit(-1)

    vocab.loadFile(vocab_file)
    print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    print()
    return vocab
Ejemplo n.º 17
0
def makeVocabulary(filename, size, vocab=None):
    """Add every token of ``filename`` to a vocabulary and return it.

    Lines whose first token is ``'==='`` are skipped, and so are blank lines.

    Args:
        filename: text file, one sentence per line.
        size: unused here; kept for interface compatibility.
        vocab: ``None`` creates a dictionary with the four special words,
            ``0`` creates one holding only the unknown word, and any other
            object is filled in place through its ``add`` method.

    Returns:
        The vocabulary that received the tokens.
    """
    if vocab is None:
        vocab = onmt.Dict([
            onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
            onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
        ],
                          lower=opt.lower)
    elif vocab == 0:
        vocab = onmt.Dict([onmt.Constants.UNK_WORD], lower=opt.lower)

    with open(filename) as f:
        for sent in f:
            words = sent.split()
            # A blank line has no first token; indexing it raised IndexError.
            if not words or words[0] == '===':
                continue
            for word in words:
                vocab.add(word)

    return vocab
Ejemplo n.º 18
0
def makeFeature(data):
    """Build one vocabulary per feature.

    Args:
        data: mapping from feature name to an iterable of feature values.

    Returns:
        dict mapping each feature name to an ``onmt.Dict`` seeded with the
        unknown and padding words (ordered by their index) and filled with
        that feature's values.
    """
    vocabs = {}
    for feature_name, feature_values in data.items():
        specials = {}
        specials[onmt.Constants.UNK] = onmt.Constants.UNK_WORD
        specials[onmt.Constants.PAD] = onmt.Constants.PAD_WORD
        # Distinct names throughout: the original reused ``key``/``value`` in
        # the comprehension and ``data`` as loop variable, which clobbers the
        # outer loop under Python 2 comprehension scoping.
        special_words = [word for _, word in sorted(specials.items())]
        vocab = onmt.Dict(special_words, lower=opt.lower)
        for item in feature_values:
            vocab.add(item)
        vocabs[feature_name] = vocab
    return vocabs
Ejemplo n.º 19
0
def makeVocabulary(filename, size):
    """Construct the word vocabulary and one vocabulary per word feature.

    Returns a ``(vocab, featuresVocabs)`` pair. The word vocabulary is pruned
    to ``size`` unless ``size`` is 0.
    """
    special_words = [
        onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
        onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
    ]
    vocab = onmt.Dict(special_words, lower=opt.lower)
    featuresVocabs = []

    with codecs.open(filename, "r", "utf-8") as handle:
        for cnt, line in enumerate(handle.readlines(), 1):
            words, features, numFeatures = onmt.IO.extractFeatures(
                line.split())

            if not featuresVocabs and numFeatures > 0:
                # First sentence carrying features: create their vocabularies.
                featuresVocabs.extend([
                    onmt.Dict([
                        onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                        onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
                    ]) for _ in range(numFeatures)
                ])
            else:
                print('cnt : ', cnt)
                assert len(featuresVocabs) == numFeatures, \
                    "all sentences must have the same number of features"

            for position, word in enumerate(words):
                vocab.add(word)
                for feature_idx in range(numFeatures):
                    featuresVocabs[feature_idx].add(
                        features[feature_idx][position])

    size_before_pruning = vocab.size()
    if size == 0:
        print('Created dictionary of size %d' % (vocab.size()))
    else:
        vocab = vocab.prune(size)
        print('Created dictionary of size %d (pruned from %d)' %
              (vocab.size(), size_before_pruning))

    return vocab, featuresVocabs
Ejemplo n.º 20
0
def initVocabulary(dataFile, src_vocab_file, src_vocabSize, tgt_vocab_file, tgt_vocabSize):
    """Load or build the source and target vocabularies.

    Each vocabulary is loaded from its file when one is given. Any missing
    vocabulary is built from ``dataFile``, a UTF-8 file with one
    ``source<TAB>target`` pair per line.

    Args:
        dataFile: tab-separated parallel data; only read when a vocabulary
            has to be built.
        src_vocab_file: existing source dictionary file, or ``None``.
        src_vocabSize: size handed to ``makeVocabulary`` for the source side.
        tgt_vocab_file: existing target dictionary file, or ``None``.
        tgt_vocabSize: size handed to ``makeVocabulary`` for the target side.

    Returns:
        ``(src_vocab, tgt_vocab)``.
    """
    import io

    src_vocab = None
    tgt_vocab = None
    if src_vocab_file is not None:
        # If given, load existing word dictionary.
        print('Reading vocabulary from \'' + src_vocab_file + '\'...')
        src_vocab = onmt.Dict()
        src_vocab.loadFile(src_vocab_file)
        print('Loaded ' + str(src_vocab.size()) + ' source words')
    if tgt_vocab_file is not None:
        # If given, load existing word dictionary.
        print('Reading vocabulary from \'' + tgt_vocab_file + '\'...')
        tgt_vocab = onmt.Dict()
        tgt_vocab.loadFile(tgt_vocab_file)
        print('Loaded ' + str(tgt_vocab.size()) + ' target words')
    # Identity test instead of truthiness, so the early return never depends
    # on how the dictionary class evaluates as a boolean.
    if src_vocab is not None and tgt_vocab is not None:
        return src_vocab, tgt_vocab

    # io.open decodes while reading, which works on Python 2 and 3 alike;
    # calling ``.decode`` on the stripped line fails on Python 3 strings.
    with io.open(dataFile, encoding='utf8') as dataFileHandle:
        lines_in_file = [l.strip().split('\t') for l in dataFileHandle]
    for line_num, l in enumerate(lines_in_file, 1):
        if len(l) != 2:
            print("Error on line %d" % (line_num))
    if src_vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building source vocabulary...')
        src_lines = [l[0] for l in lines_in_file]
        src_vocab = makeVocabulary(src_lines, src_vocabSize)

    if tgt_vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building target vocabulary...')
        tgt_lines = [l[1] for l in lines_in_file]
        tgt_vocab = makeVocabulary(tgt_lines, tgt_vocabSize)

    return src_vocab, tgt_vocab
Ejemplo n.º 21
0
def main():
    """Prepare vocabularies and Redis-backed train/validation data.

    Opens a Redis connection from ``opt.host`` / ``opt.port`` / ``opt.db``.
    When ``opt.dbExist`` is false the vocabularies are built and finally saved
    to ``<opt.save_data>.dicts.pt``; otherwise they are loaded from that file.
    ``makeData`` receives the connection for both data sets.
    NOTE(review): presumably ``makeData`` stores the encoded data in Redis,
    since ``train`` / ``valid`` are not written anywhere here -- confirm.
    """

    conn = redis.Redis(host=opt.host, port=opt.port, db=opt.db)
    if not opt.dbExist:
        dicts = {}
        dicts['src'] = onmt.Dict()
        #if opt.src_type == "text":
        # The source-type check is disabled: the source vocabulary is always built.
        if True: 
            dicts['src'], dicts['src_features'] = \
                    initVocabulary('source', opt.train_src, opt.src_vocab,
                                   opt.src_vocab_size)
        dicts['tgt'], dicts['tgt_features'] = \
            initVocabulary('target',
                           opt.train_tgt,
                           opt.tgt_vocab,
                           opt.tgt_vocab_size)
        # This message is only printed when the vocabularies were just built.
        print('Preparing training ...')
    else:
        # Reuse the dictionaries saved by a previous run.
        save_data = torch.load(opt.save_data + ".dicts.pt")
        dicts = save_data['dicts']
    train = {}
    # Publish the bucket size under the 'bucket_size' key in Redis.
    conn.set('bucket_size', opt.bucket) 
    train['src'], train['tgt'], \
        train['src_features'], train['tgt_features'], \
        train['alignments'] \
        = makeData(conn, "train", opt.train_src, opt.train_tgt,
                   dicts['src'], dicts['tgt'],
                   dicts['src_features'], dicts['tgt_features'], bucket_size=opt.bucket, dbExist=opt.dbExist)
    print('Preparing validation ...')
    valid = {}
    valid['src'], valid['tgt'], \
        valid['src_features'], valid['tgt_features'], \
        valid['alignments'] \
        = makeData(conn, "valid", opt.valid_src, opt.valid_tgt,
                   dicts['src'], dicts['tgt'],
                   dicts['src_features'], dicts['tgt_features'], bucket_size=opt.bucket, dbExist=opt.dbExist)

    if not opt.dbExist:
        # Vocabulary files are written only for dictionaries built in this run.
        if opt.src_vocab is None:
            saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
        if opt.tgt_vocab is None:
            saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')
        if opt.features_vocabs_prefix:
            saveFeaturesVocabularies('source', dicts['src_features'],
                                     opt.save_data)
            saveFeaturesVocabularies('target', dicts['tgt_features'],
                                     opt.save_data)

        print('Saving data to \'' + opt.save_data + '.dicts.pt\'...')
        # Only the dictionaries and the data type are stored in this file.
        save_data = {'dicts': dicts,
                     'type':  "text"}
        torch.save(save_data, opt.save_data + '.dicts.pt')
Ejemplo n.º 22
0
def makeVocabulary(filename, size, embGiven=False, embFile=None):
    """Build a vocabulary from ``filename``, optionally attaching embeddings.

    With ``embGiven`` set, the four special words get fixed vectors of length
    ``opt.emb_dim`` (three all-zero, one all-one) and the rows of ``embFile``
    are registered through ``vocab.add_embedding``. Rows with fewer than 301
    fields or with non-numeric values are skipped.
    """
    if embGiven:
        special_embeddings = [np.zeros(opt.emb_dim, ) for _ in range(3)]
        special_embeddings.append(np.ones(opt.emb_dim, ))
    else:
        special_embeddings = None

    special_words = [
        onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
        onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
    ]
    vocab = onmt.Dict(special_words,
                      lower=opt.lower,
                      special_embeddings=special_embeddings)

    with codecs.open(filename, "r", "utf-8") as text_handle:
        for line in text_handle.readlines():
            for token in line.split():
                vocab.add(token)

    if embGiven:
        # Number of embedding rows that were accepted.
        n = 0
        with codecs.open(embFile, "r", "utf-8") as emb_handle:
            for row in emb_handle:
                fields = row.strip().split()
                if len(fields) < 301:
                    continue
                try:
                    vector = np.array(fields[1:], dtype=np.float32)
                except Exception:
                    # Report the offending row and move on.
                    print(fields)
                    continue
                vocab.add_embedding(fields[0], vector,
                                    onmt.Constants.UNK_WORD, opt.normalize)
                n += 1

    size_before_pruning = vocab.size()
    vocab, c = vocab.prune(size, embGiven)
    if embGiven:
        vocab.average_unk(onmt.Constants.UNK_WORD, n - c, opt.normalize)
        vocab.convert_embeddings_to_torch(dim=opt.emb_dim)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 23
0
def makeVocabulary(filename, size):
    """Build a vocabulary from the whitespace tokens of ``filename`` and prune it."""
    special_words = [onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                     onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD]
    vocab = onmt.Dict(special_words)

    with open(filename) as handle:
        for line in handle:
            for token in line.split():
                vocab.add(token)

    size_before_pruning = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 24
0
def makeVocabulary(data, size, freq):
    """Build a vocabulary from a ``data`` mapping and prune it.

    Every ``(key, value)`` pair of ``data`` is passed to ``vocab.add``; the
    result is pruned with ``vocab.prune(size, freq)``.
    """
    # Special words ordered by their index constant.
    specials = {
        onmt.Constants.UNK: onmt.Constants.UNK_WORD,
        onmt.Constants.PAD: onmt.Constants.PAD_WORD,
    }
    special_words = [word for _, word in sorted(specials.items())]
    vocab = onmt.Dict(special_words, lower=opt.lower)

    for entry, amount in data.items():
        vocab.add(entry, amount)

    size_before_pruning = vocab.size()
    vocab = vocab.prune(size, freq)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 25
0
def initVocabulary(name, dataFile, vocabFile, vocabSize):
    """Load the ``name`` vocabulary from ``vocabFile`` or build it from ``dataFile``.

    Args:
        name: label used in the progress messages.
        dataFile: text data used when a vocabulary has to be built.
        vocabFile: existing dictionary file, or ``None``.
        vocabSize: size handed to ``makeVocabulary``.

    Returns:
        The loaded or freshly built vocabulary.
    """
    vocab = None
    if vocabFile is not None:
        # If given, load existing word dictionary.
        print('Reading ' + name + ' vocabulary from \'' + vocabFile + '\'...')
        vocab = onmt.Dict()
        vocab.loadFile(vocabFile)
        # str() is needed: concatenating the size directly raises TypeError
        # when size() returns an int.
        print('Loaded ' + str(vocab.size()) + ' ' + name + ' words')

    if vocab is None:
        # If a dictionary is still missing, generate it.
        print('Building ' + name + ' vocabulary...')
        vocab = makeVocabulary(dataFile, vocabSize)

    print()
    return vocab
Ejemplo n.º 26
0
def _addTsvWords(vocab, path):
    """Add the whitespace tokens of columns 1 and 2 of a UTF-8 TSV file to ``vocab``."""
    with codecs.open(path, "r", "utf-8") as f:
        for line in csv.reader(f, delimiter='\t'):
            for column in (1, 2):
                for word in line[column].split():
                    vocab.add(word)


def makeVocabulary(filename, size):
    """Build one vocabulary over the training, validation and test TSV files.

    The test file name is derived from ``opt.valid_src``: the part before
    ``'.tsv'`` loses its last three characters and gets ``'test.tsv'`` appended.

    Args:
        filename: training TSV file.
        size: size passed to ``vocab.prune``.

    Returns:
        The pruned vocabulary.
    """
    vocab = onmt.Dict(
        [onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
         onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
        lower=opt.lower, seq_len=opt.seq_length)

    test_file = opt.valid_src.split('.tsv')[0][:-3] + 'test.tsv'
    # Same order as before: training, validation, then test data.
    for path in (filename, opt.valid_src, test_file):
        _addTsvWords(vocab, path)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab
Ejemplo n.º 27
0
def makeVocabulary(filename, size, min_freq):
    """Build a vocabulary from ``filename``, prune by frequency, then by size."""
    special_words = [
        onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
        onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD
    ]
    vocab = onmt.Dict(special_words, lower=opt.lower)

    with codecs.open(filename, 'r', encoding='utf-8') as handle:
        for line in handle.readlines():
            for token in line.split():
                vocab.add(token)

    size_before_pruning = vocab.size()
    # Frequency pruning runs first, the size cap second.
    vocab = vocab.prune_by_freq(min_freq).prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), size_before_pruning))

    return vocab
Ejemplo n.º 28
0
def main():
    """Build vocabularies, encode train/test/validation data and save everything.

    When either validation path is an empty string, the test data doubles as
    validation data.
    """
    dicts = {'src': onmt.Dict()}
    dicts['src'] = initVocabulary('source', opt.train_src, opt.src_vocab,
                                  opt.src_vocab_size, opt.src_min_freq)
    dicts['tgt'] = initVocabulary('target', opt.train_tgt, opt.tgt_vocab,
                                  opt.tgt_vocab_size, opt.tgt_min_freq)

    print('Preparing training ...')
    train_src, train_tgt = makeData(opt.train_src, opt.train_tgt,
                                    dicts['src'], dicts['tgt'])
    train = {'src': train_src, 'tgt': train_tgt}

    print('Preparing test ...')
    test_src, test_tgt = makeData(opt.test_src, opt.test_tgt,
                                  dicts['src'], dicts['tgt'], True)
    test = {'src': test_src, 'tgt': test_tgt}

    print('Preparing validation ...')
    if '' in (opt.valid_src, opt.valid_tgt):
        print('Empty validation')
        valid = test
    else:
        valid_src, valid_tgt = makeData(opt.valid_src, opt.valid_tgt,
                                        dicts['src'], dicts['tgt'])
        valid = {'src': valid_src, 'tgt': valid_tgt}

    # Dictionaries are written out only when they were not supplied by the user.
    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.src.dict')
    if opt.tgt_vocab is None:
        saveVocabulary('target', dicts['tgt'], opt.save_data + '.tgt.dict')

    print('Saving data to \'' + opt.save_data + '\'...')
    torch.save({
        'dicts': dicts,
        'type': opt.src_type,
        'train': train,
        'valid': valid,
        'test': test,
    }, opt.save_data)
Ejemplo n.º 29
0
def main():
    """Build the vocabularies, encode the dialogue data and save one file per context size.

    ``opt.acts`` / ``opt.keys`` select which context files (dialog acts or
    keys) accompany the text when the data is encoded.

    Raises:
        ValueError: if neither ``opt.acts`` nor ``opt.keys`` is set.
    """

    assert opt.keys != opt.acts
    dicts = {}
    dicts['src'] = onmt.Dict()
    dicts['src'] = initVocabulary('source', opt.train_txt, opt.train_key,
                                  opt.src_vocab, opt.src_vocab_size)
    # Source and target share the same dictionary object.
    dicts['tgt'] = dicts['src']

    if opt.src_vocab is None:
        saveVocabulary('source', dicts['src'], opt.save_data + '.dict')

    dicts['das'] = make_act_dict('dialog act', opt.train_act, opt.da_dict)

    if opt.da_dict is None:
        saveVocabulary('dialog act', dicts['das'], opt.save_data + '.da_dict')

    if opt.acts:
        train_context_file = opt.train_act
        valid_context_file = opt.valid_act
    elif opt.keys:
        train_context_file = opt.train_key
        valid_context_file = opt.valid_key
    else:
        raise ValueError("one of opt.acts / opt.keys must be set")

    # NOTE(review): the selected context files used to be computed and then
    # ignored -- the act files were always passed. They are used now; confirm
    # that makeData expects the key files when opt.keys is set.
    print('Preparing training ...')
    train_data = makeData(opt.train_txt, train_context_file, dicts)
    print('Preparing validation ...')
    valid_data = makeData(opt.valid_txt, valid_context_file, dicts)

    for cs in train_data.keys():

        print('Saving data to \'' + opt.save_data + '_cs' + str(cs) +
              '.train.pt\'...')
        save_data = {
            'dicts': dicts,
            'train': train_data[cs],
            'valid': valid_data[cs]
        }

        save_str = '_cs' + str(cs)

        torch.save(save_data, opt.save_data + save_str + '.train.pt')
    '''
Ejemplo n.º 30
0
def makeVocabulary(lines, size):
    """Build a vocabulary from in-memory sentences and prune it to ``size``.

    Args:
        lines: iterable of sentences. A sentence that is not already a list
            is converted with ``list()``, so a plain string contributes its
            individual characters.
        size: size passed to ``vocab.prune``.

    Returns:
        The pruned vocabulary.
    """
    vocab = onmt.Dict([onmt.Constants.PAD_WORD, onmt.Constants.UNK_WORD,
                       onmt.Constants.BOS_WORD, onmt.Constants.EOS_WORD],
                      lower=opt.lower)

    for sent in lines:
        # ``sent is not list`` compared the value with the type object itself
        # and was therefore always true; isinstance is the intended check.
        if not isinstance(sent, list):
            sent = list(sent)
        for word in sent:
            vocab.add(word)

    originalSize = vocab.size()
    vocab = vocab.prune(size)
    print('Created dictionary of size %d (pruned from %d)' %
          (vocab.size(), originalSize))

    return vocab