Example #1
 def __init__(
     self,
     input_path=None,
     rating_scale=5,
     verbose=False,
 ):
     self.uid_map = load_dict(os.path.join(input_path, "uid_map"), sep=",")
     self.iid_map = load_dict(os.path.join(input_path, "iid_map"), sep=",")
     self.aspect_id_map = load_dict(os.path.join(input_path,
                                                 "aspect_id_map"),
                                    sep=",")
     self.opinion_id_map = load_dict(os.path.join(input_path,
                                                  "opinion_id_map"),
                                     sep=",")
     self.U = np.load(os.path.join(input_path, "U.npy"))
     self.I = np.load(os.path.join(input_path, "I.npy"))
     self.A = np.load(os.path.join(input_path, "A.npy"))
     self.O = np.load(os.path.join(input_path, "O.npy"))
     self.G1 = np.load(os.path.join(input_path, "G1.npy"))
     self.G2 = np.load(os.path.join(input_path, "G2.npy"))
     self.G3 = np.load(os.path.join(input_path, "G3.npy"))
     self.rating_scale = rating_scale
     self.id2aspect = {v: k for k, v in self.aspect_id_map.items()}
     self.verbose = verbose
     if self.verbose:
         print("Load MTER from %s" % input_path)
Example #2
 def __init__(
     self,
     input_path=None,
     alpha=0.85,
     num_most_cared_aspects=15,
     rating_scale=5,
     verbose=False,
 ):
     self.uid_map = load_dict(os.path.join(input_path, "uid_map"), sep=",")
     self.iid_map = load_dict(os.path.join(input_path, "iid_map"), sep=",")
     self.aspect_id_map = load_dict(os.path.join(input_path,
                                                 "aspect_id_map"),
                                    sep=",")
     self.U1 = np.load(os.path.join(input_path, "U1.npy"))
     self.U2 = np.load(os.path.join(input_path, "U2.npy"))
     self.V = np.load(os.path.join(input_path, "V.npy"))
     self.H1 = np.load(os.path.join(input_path, "H1.npy"))
     self.H2 = np.load(os.path.join(input_path, "H2.npy"))
     self.alpha = alpha
     self.n_cared_aspects = num_most_cared_aspects
     self.rating_scale = rating_scale
     self.id2aspect = {v: k for k, v in self.aspect_id_map.items()}
     self.verbose = verbose
     if self.verbose:
         print("Load EFM from %s" % input_path)
Example #3
    def __init__(self, source, target,
                 source_dict, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True):
        if shuffle_each_epoch:
            shuffle.main([source, target])
            self.source = fopen(source+'.shuf', 'r')
            self.target = fopen(target+'.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * 20

        self.end_of_data = False
Example #4
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True):
        if shuffle_each_epoch:
            shuffle.main([source, target])
            self.source = fopen(source + '.shuf', 'r')
            self.target = fopen(target + '.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * 20

        self.end_of_data = False
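
Examples #3 and #4 (and most of the later snippets) are variants of the same parallel-corpus iterator constructor found in Nematus-style code. A hedged usage sketch, assuming the enclosing class is called TextIterator and using hypothetical file names:

    train_iter = TextIterator(
        'corpus.src', 'corpus.trg',            # hypothetical parallel training files
        'vocab.src.json', 'vocab.trg.json',    # hypothetical source/target dictionaries
        batch_size=80,
        maxlen=50,
        shuffle_each_epoch=True,
        sort_by_length=True)
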
Example #5
def main(models,
         source_file,
         nbest_file,
         saveto,
         b=80,
         normalize=False,
         verbose=False,
         alignweights=False):

    # load model options
    options = []
    for model in models:
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except:
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        #hacks for using old models with missing options
        if not 'dropout_embedding' in options[-1]:
            options[-1]['dropout_embedding'] = 0
        if not 'dropout_hidden' in options[-1]:
            options[-1]['dropout_hidden'] = 0
        if not 'dropout_source' in options[-1]:
            options[-1]['dropout_source'] = 0
        if not 'dropout_target' in options[-1]:
            options[-1]['dropout_target'] = 0

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b,
                  normalize, verbose, alignweights)
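
The NMT snippets call load_dict(path) with no separator argument; in Nematus-derived code this helper typically reads a JSON vocabulary and falls back to a pickled dict for older models. A minimal sketch under that assumption (not the verbatim library code):

    import json
    import pickle as pkl

    def load_dict(filename):
        # Assumed behaviour: JSON vocabulary first, pickled dict as a fallback.
        try:
            with open(filename, 'r') as f:
                return json.load(f)
        except (ValueError, UnicodeDecodeError):
            with open(filename, 'rb') as f:
                return pkl.load(f)
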
Example #6
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 indomain_source='', indomain_target='',
                 interpolation_rate=0.1,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
            self.indomain_source_orig = indomain_source
            self.indomain_target_orig = indomain_target
            self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
            self.indomain_source = fopen(indomain_source, 'r')
            self.indomain_target = fopen(indomain_target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False

        self.interpolation_rate = interpolation_rate
        self.cur_interpolation_rate = self.interpolation_rate
        self.indomain_k = int(math.ceil(self.cur_interpolation_rate * self.k))
        self.outdomain_k = self.k - self.indomain_k
Example #7
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 keep_data_in_memory=False):
        if keep_data_in_memory:
            self.source, self.target = FileWrapper(source), FileWrapper(target)
            if shuffle_each_epoch:
                r = numpy.random.permutation(len(self.source))
                self.source.shuffle_lines(r)
                self.target.shuffle_lines(r)
        elif shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.keep_data_in_memory = keep_data_in_memory
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        

        self.end_of_data = False
Example #8
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 indomain_source='', indomain_target='',
                 interpolation_rate=0.1,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            shuffle.main([source, target])
            shuffle.main([indomain_source, indomain_target])
            self.source = fopen(source+'.shuf', 'r')
            self.target = fopen(target+'.shuf', 'r')
            self.indomain_source = fopen(indomain_source+'.shuf', 'r')
            self.indomain_target = fopen(indomain_target+'.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
            self.indomain_source = fopen(indomain_source, 'r')
            self.indomain_target = fopen(indomain_target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False

        self.interpolation_rate = interpolation_rate
        self.indomain_k = int(math.ceil(self.interpolation_rate * self.k))
        self.outdomain_k = self.k - self.indomain_k
Example #9
    def __init__(self,
                 source,
                 target,
                 source_dicts,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 token_batch_size=0):
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        self.token_batch_size = token_batch_size

        self.end_of_data = False
Example #10
    def __init__(
        self,
        source,
        target,
        source_dicts,
        target_dict,
        batch_size=128,
        maxlen=100,
        n_words_source=-1,
        n_words_target=-1,
        shuffle_each_epoch=False,
        sort_by_length=True,
        maxibatch_size=20,
        embeddings=None,  # assumed parameter: the body below stores self.embeddings but none was declared
    ):
        global epoch_num
        if shuffle_each_epoch:
            shuffle.main([source, target], epoch_num)
            self.source = fopen(source + '.shuf', 'r')
            self.target = fopen(target + '.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False

        self.embeddings = embeddings
Example #11
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')

        #for line in self.source.readlines():
        #print line
        #aline = self.target.readline()
        #print aline

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:  # if source number is specified
            for key, idx in self.source_dict.items():
                if idx >= self.n_words_source:
                    del self.source_dict[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length
        self.source_buffer = []  # source instances held in memory
        self.target_buffer = []  # target instances held in memory
        self.k = batch_size * maxibatch_size  # total number of instances held in memory

        self.end_of_data = False
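
Note that several of these constructors delete entries from a dictionary while iterating over dict.items(); that is only safe on Python 2, where items() returns a list, and raises a RuntimeError on Python 3. A Python-3-safe sketch of the same truncation step:

    def truncate_vocab(word_dict, n_words):
        # Drop entries whose index is at or above the vocabulary cut-off.
        if n_words > 0:
            for key in [k for k, idx in word_dict.items() if idx >= n_words]:
                del word_dict[key]
        return word_dict
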
Example #12
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):

    # load model options
    options = []
    for model in models:
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except:
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))
        #hacks for using old models with missing options
        if not 'dropout_embedding' in options[-1]:
            options[-1]['dropout_embedding'] = 0
        if not 'dropout_hidden' in options[-1]:
            options[-1]['dropout_hidden'] = 0
        if not 'dropout_source' in options[-1]:
            options[-1]['dropout_source'] = 0
        if not 'dropout_target' in options[-1]:
            options[-1]['dropout_target'] = 0

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights)
Example #13
    def read_model(self, params):
        user_file = os.path.join(params['config_path'],
                                 params['user_filename'])
        item_file = os.path.join(params['config_path'],
                                 params['item_filename'])
        vocab_file = os.path.join(params['config_path'],
                                  params['vocab_filename'])
        aspect_file = os.path.join(params['config_path'],
                                   params['aspect_filename'])
        opinion_file = os.path.join(params['config_path'],
                                    params['opinion_filename'])
        aspect_opinions_file = os.path.join(params['config_path'],
                                            params['aspect_opinions_filename'])
        model_file = os.path.join(params['config_path'],
                                  params['model_filename'])

        context_word_units = int(params['unit'])
        lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * context_word_units
        target_word_units = IN_TO_OUT_UNITS_RATIO * context_word_units

        user2index = load_dict(user_file)
        item2index = load_dict(item_file)
        word2index = load_dict(vocab_file)
        aspect2index = load_dict(aspect_file)
        opinion2index = load_dict(opinion_file)
        aspect_opinions = load_json(aspect_opinions_file)

        n_user = max(user2index.values()) + 1
        n_item = max(item2index.values()) + 1
        n_vocab = max(word2index.values()) + 1
        n_aspect = max(aspect2index.values()) + 1

        n_encode = n_aspect

        # dummy word counts - not used for eval
        cs = [1 for _ in range(n_vocab)]
        # dummy loss func - not used for eval
        loss_func = L.NegativeSampling(target_word_units, cs,
                                       NEGATIVE_SAMPLING_NUM)

        if params['model_type'] == 'c2v':
            model = Context2Vec(self.gpu, n_vocab, context_word_units,
                                lstm_hidden_units, target_word_units,
                                loss_func, self.resume)
        elif params['model_type'] in ['asc2v', 'asc2v-mter']:
            model = AspectSentiContext2Vec(self.gpu, n_vocab, n_encode,
                                           context_word_units,
                                           lstm_hidden_units,
                                           target_word_units, loss_func,
                                           self.resume)
        S.load_npz(model_file, model)
        w = model.loss_func.W.data
        return user2index, item2index, w, word2index, aspect2index, opinion2index, aspect_opinions, model
Example #14
    def testLoad(self, cfg):
        cfg = self.cfg
        entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
        word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
        relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

        train_documents = load_documents(cfg['data_folder'] + cfg['train_documents'])
        train_document_entity_indices, train_document_texts = index_document_entities(train_documents, word2id,
                                                                                      entity2id,
                                                                                      cfg['max_document_word'])
        train_data = DataLoader(cfg['data_folder'] + cfg['train_data'], train_documents, train_document_entity_indices,
                                train_document_texts, word2id, relation2id, entity2id, cfg['max_query_word'],
                                cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'], cfg['use_inverse_relation'])
Example #15
def predict():
    word_to_ix = load_dict(word_dict_file)
    tag_to_ix = load_dict(tag_dict_file)
    ix_to_tag = {v: k for k, v in tag_to_ix.items()}
    model_file = model_path + 'params.pkl'
    if os.path.exists(model_file):
        model.load_state_dict(torch.load(model_file))

    for wordss, tagss, lengths in pred_helper.gen_batch():
        sentence_in = prepare_sequence(wordss, word_to_ix)
        predict_scores, predict_ix_seqs = model(sentence_in, lengths)
        for word, ix in zip(wordss[0], predict_ix_seqs[0]):
            print(word, ix_to_tag[ix])
        print()
Example #16
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=None,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main(
                [self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = data_utils.fopen(source, 'r')
            self.target = data_utils.fopen(target, 'r')

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for key, idx in self.source_dict.items():
                if idx >= self.n_words_source:
                    del self.source_dict[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Example #17
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            shuffle.main([source, target])
            self.source = fopen(source+'.shuf', 'r')
            self.target = fopen(target+'.shuf', 'r')
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        
        print "K=", self.k

        self.end_of_data = False
Example #18
 def __init__(self, options, **kwargs):
     self.src_dict = load_dict(options['dictionaries'][0])
     self.trg_dict = load_dict(options['dictionaries'][1])
     self.tmp_dir = kwargs['tmp_dir']
     self.translate_script = kwargs['translate_script']
     self.bleu_script = kwargs['bleu_script']
     self.valid_src = kwargs['bleuvalid_src']
     self.valid_trg = kwargs['bleuvalid_trg']
     self.n_words_src = options['n_words_src']
     self.batch_size = 16
     self.batches = self.prepare_data()
     os.system('mkdir -p %s' % self.tmp_dir)
     self.check_script()  # check bleu script
     self.trg_idict = dict()
     for k, v in self.trg_dict.iteritems():
         self.trg_idict[v] = k
Example #19
def test():
    entity2id = load_dict(entity2id_file)

    test_data = TypedataLoader(test_file, entity2id)
    my_model = get_model(entity2id)
    test_acc = inference(my_model, test_data, entity2id, log_info=True)
    return test_acc
Example #20
    def __init__(self, sources, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=[-1],
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            shuffle.main(sources + [target])
            self.sources = [fopen(source+'.shuf', 'r') for source in sources]
            self.target = fopen(target+'.shuf', 'r')
        else:
            self.sources = [fopen(source, 'r') for source in sources]
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for factor_dicts in source_dicts:
            self.source_dicts.append([load_dict(source_dict) for source_dict in factor_dicts])
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        for i, n_words in enumerate(self.n_words_source):
            if n_words > 0:
                for d in self.source_dicts[i]:
                    for key, idx in d.items():
                        if idx >= n_words:
                            del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffers = [list() for _ in range(len(self.sources))]
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        
        self.end_of_data = False
Example #21
    def __init__(self, config, documents, mode='train'):
        self.mode = mode
        self.use_doc = config['use_doc']
        self.use_inverse_relation = config['use_inverse_relation']
        self.max_query_word = config['max_query_word']
        self.max_document_word = config['max_document_word']
        self.max_char = config['max_char']
        self.documents = documents
        self.data_file = config['data_folder'] + config['{}_data'.format(mode)]
        self.batch_size = config['batch_size'] if mode == 'train' else config[
            'batch_size']
        self.max_rel_words = config['max_rel_words']
        self.type_rels = config['type_rels']
        self.fact_drop = config['fact_drop']

        # read all data
        self.data = []
        with open(self.data_file) as f:
            for line in tqdm(list(f)):
                self.data.append(json.loads(line))

        # word and kb vocab
        self.word2id = load_dict(config['data_folder'] + config['word2id'])
        self.relation2id = load_dict(config['data_folder'] +
                                     config['relation2id'])
        self.entity2id = load_dict(config['data_folder'] + config['entity2id'])
        self.id2entity = {i: entity for entity, i in self.entity2id.items()}

        self.rel_word_idx = np.load(config['data_folder'] + 'rel_word_idx.npy')

        # for batching
        self.max_local_entity = 0  # max num of candidates
        self.max_relevant_docs = 0  # max num of retrieved documents
        self.max_kb_neighbors = config[
            'max_num_neighbors']  # max num of neighbors for entity
        self.max_kb_neighbors_ = config[
            'max_num_neighbors']  # kb relations are directed
        self.max_linked_entities = 0  # max num of linked entities for each doc
        self.max_linked_documents = 50  # max num of linked documents for each entity

        self.num_kb_relation = 2 * len(
            self.relation2id) if self.use_inverse_relation else len(
                self.relation2id)

        # get the batching parameters
        self.get_stats()
Example #22
def train():
    print("training ...")

    #prepare data
    entity2id = load_dict(entity2id_file)

    train_data = TypedataLoader(train_file, entity2id)
    dev_data = TypedataLoader(dev_file, entity2id)
    test_data = TypedataLoader(test_file, entity2id)

    my_model = get_model(entity2id)
    trainable_parameters = [
        p for p in my_model.parameters() if p.requires_grad
    ]
    optimizer = torch.optim.Adam(trainable_parameters, lr=learning_rate)

    best_dev_acc = 0.0
    for i in range(epoch):
        try:
            print('epoch', i)
            my_model.train()
            train_loss, train_acc = [], []
            for iteration in tqdm(range(train_data.num_data // batch_size)):
                batch = train_data.get_batch(iteration, batch_size)
                loss, pred, _ = my_model(batch)
                pred = pred.data.cpu().numpy()
                acc = cal_type_acc(pred, batch[-1])
                train_loss.append(loss.data[0])
                train_acc.append(acc)
                # back propagate
                my_model.zero_grad()
                optimizer.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm(my_model.parameters(),
                                              gradient_clip)
                optimizer.step()
            print('avg_training_loss', sum(train_loss) / len(train_loss))
            print('avg_training_acc', sum(train_acc) / len(train_acc))

            print("validating ...")
            eval_acc = inference(my_model, dev_data, entity2id)
            if eval_acc > best_dev_acc and save_model:
                print("saving model to", save_model)
                torch.save(my_model.state_dict(), save_model)
                best_dev_acc = eval_acc

        except KeyboardInterrupt:
            break

    # Test set evaluation
    print("evaluating on test")
    print('loading model from', save_model)
    my_model.load_state_dict(torch.load(save_model))
    test_acc = inference(my_model, test_data, entity2id, log_info=True)
    print("test_acc:", test_acc)
    return test_acc
Example #23
def test(cfg):
    entity2id = load_dict(cfg['data_folder'] + cfg['entity2id'])
    word2id = load_dict(cfg['data_folder'] + cfg['word2id'])
    relation2id = load_dict(cfg['data_folder'] + cfg['relation2id'])

    test_documents = load_documents(cfg['data_folder'] + cfg['test_documents'])
    test_document_entity_indices, test_document_texts = index_document_entities(
        test_documents, word2id, entity2id, cfg['max_document_word'])
    test_data = DataLoader(cfg['data_folder'] + cfg['test_data'],
                           test_documents, test_document_entity_indices,
                           test_document_texts, word2id, relation2id,
                           entity2id, cfg['max_query_word'],
                           cfg['max_document_word'], cfg['use_kb'],
                           cfg['use_doc'], cfg['use_inverse_relation'])

    my_model = get_model(cfg, test_data.num_kb_relation, len(entity2id),
                         len(word2id)).to(device)
    test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True)
    return test_acc
Example #24
    def _build_dictionaries(self, source_dic, target_dic):
        """
        Builds and inverts source and target dictionaries, taken
        from the first model since all of them must have the same
        vocabulary.
        """
        if source_dic == None or target_dic == None:
            dictionaries = self._options[0]['dictionaries']
            dictionaries_source = dictionaries[:-1]
            dictionary_target = dictionaries[-1]
        else:
            dictionaries_source = [source_dic]
            dictionary_target = target_dic

        # load and invert source dictionaries
        word_dicts = []
        word_idicts = []
        for dictionary in dictionaries_source:
            word_dict = load_dict(dictionary)
            if self._options[0]['n_words_src']:
                for key, idx in word_dict.items():
                    if idx >= self._options[0]['n_words_src']:
                        del word_dict[key]
            word_idict = dict()
            for kk, vv in word_dict.iteritems():
                word_idict[vv] = kk
            word_idict[0] = '<eos>'
            word_idict[1] = 'UNK'
            word_dicts.append(word_dict)
            word_idicts.append(word_idict)

        self._word_dicts = word_dicts
        self._word_idicts = word_idicts

        # load and invert target dictionary
        word_dict_trg = load_dict(dictionary_target)
        word_idict_trg = dict()
        for kk, vv in word_dict_trg.iteritems():
            word_idict_trg[vv] = kk
        word_idict_trg[0] = '<eos>'
        word_idict_trg[1] = 'UNK'

        self._word_idict_trg = word_idict_trg
Example #25
    def __init__(self,
                 source,
                 target,
                 source_dict,
                 target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        # shuffle the file order at the start of each epoch
        if shuffle_each_epoch:
            shuffle.main([source, target])
            self.source = fopen(source + '.shuf')
            self.target = fopen(target + '.shuf')
        else:
            self.source = fopen(source)
            self.target = fopen(target)

        self.source_dict = load_dict(source_dict)
        self.target_dict = load_dict(target_dict)
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for key, idx in self.source_dict.items():
                if idx >= self.n_words_source:
                    del self.source_dict[key]
        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length
        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        self.end_of_data = False
Example #26
    def _build_dictionaries(self):
        """
        Builds and inverts source and target dictionaries, taken
        from the first model since all of them must have the same
        vocabulary.
        """
        dictionaries = self._options[0]['dictionaries']
        dictionaries_source = dictionaries[:-1]
        dictionary_target = dictionaries[-1]

        # load and invert source dictionaries
        word_dicts = []
        word_idicts = []
        for dictionary in dictionaries_source:
            word_dict = load_dict(dictionary)
            # n_words is a list containing the max len of each dictionary
            if self._options[0]['n_words'][0]:
                for key, idx in word_dict.items():
                    if idx >= self._options[0]['n_words'][0]:
                        del word_dict[key]
            word_idict = dict()
            for kk, vv in word_dict.iteritems():
                word_idict[vv] = kk
            word_idict[0] = '<eos>'
            word_idict[1] = 'UNK'
            word_dicts.append(word_dict)
            word_idicts.append(word_idict)

        self._word_dicts = word_dicts
        self._word_idicts = word_idicts

        # load and invert target dictionary
        word_dict_trg = load_dict(dictionary_target)
        word_idict_trg = dict()
        for kk, vv in word_dict_trg.iteritems():
            word_idict_trg[vv] = kk
        word_idict_trg[0] = '<eos>'
        word_idict_trg[1] = 'UNK'

        self._word_idict_trg = word_idict_trg
Example #27
def _determine_vocab_size_from_file(path, plus_one):
    """ plus_one give place for unk
    """
    try:
        d = load_dict(path)
    except IOError as x:
        logging.error('failed to determine vocabulary size from file: '
                      '{}: {}'.format(path, str(x)))
        sys.exit(1)
    except:
        logging.error('failed to determine vocabulary size from file: '
                      '{}'.format(path))
        sys.exit(1)

    return max(d.values()) + 1 if plus_one else max(d.values())
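
A hedged usage sketch for the helper above (the vocabulary path is hypothetical):

    n_words_src = _determine_vocab_size_from_file('vocab.src.json', plus_one=True)
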
Example #28
    def __init__(self, dict_file=DICT_FILE, schema_file=SCHEMA_FILE):
        """
        init
        """

        #self.logger.info("hook")
        word_dict = util.load_dict(dict_file)
        schema_pos, schema_output = util.get_parse_shitu_conf(schema_file)

        self.word_dict = word_dict
        self.schema_pos = schema_pos
        self.schema_output = schema_output

        dict_size = len(word_dict)
        schema_pos_size = len(schema_pos)
        schema_output_size = len(schema_output)
Example #29
    def __init__(self,
                 datasets,
                 dicts,
                 n_words_dicts=None,
                 batch_size=128,
                 maxlen=100,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 factors=1,
                 outputs=1,
                 maxibatch_size=20):

        if shuffle_each_epoch:
            self.datasets_orig = datasets
            self.datasets = shuffle.main(datasets, temporary=True)
        else:
            self.datasets = [fopen(fp, 'r') for fp in datasets]

        self.dicts = []
        for dict_ in dicts:
            self.dicts.append(load_dict(dict_))

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.factors = factors
        self.outputs = outputs

        assert len(
            datasets) == 1 + outputs, 'Datasets and dictionaries mismatch'

        self.n_words_dicts = n_words_dicts

        if self.n_words_dicts:
            for d, max_ in zip(self.dicts, self.n_words_dicts):
                for key, idx in d.items():
                    if idx >= max_:
                        del d[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.buffers = [[] for _ in range(len(datasets))]
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Example #30
    def __init__(self,
                 source,
                 source_dicts,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            self.source_orig = source
            self.source = shuffle.main([self.source_orig], temporary=True)
            self.source = self.source[0]  # shuffle.main returns a list of files; keep the single source file
            print('this had better be a file:', type(self.source))
        else:
            self.source = fopen(source, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty

        self.n_words_source = n_words_source

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.k = batch_size * maxibatch_size

        self.end_of_data = False
Example #31
def load_dict_from_model_config(models):
    import re
    models = re.sub(' +', ' ', models)  # collapse repeated spaces before splitting
    model = models.split(" ")[0]
    options = []
    try:
        with open('%s.json' % model, 'rb') as f:
            options.append(json.load(f))
    except:
        with open('%s.pkl' % model, 'rb') as f:
            options.append(pkl.load(f))
    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    #dictionary_target = dictionaries[-1] 
    word_dict = load_dict(dictionaries_source[0])
    if options[0]['n_words_src']:
        for key, idx in word_dict.items():
            if idx >= options[0]['n_words_src']:
                del word_dict[key]
    del word_dict['<EOS>']
    del word_dict['<UNK>']
    return word_dict
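
A hedged usage sketch (the model base name is hypothetical; a matching model.json or model.pkl options file is expected next to it):

    word_dict = load_dict_from_model_config('model')
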
Example #32
def init_prefs(prefs_path):
    '''
    Initializes the default preferences for the stoichiometry program, and
    loads any preferences that have been saved to the specified file.

    Returns a dictionary of preferences.
    '''
    # Define default preferences.
    dprefs = {
        'wdir': os.path.abspath('.'),
        'autosave_prefs': True,
        'delimiter': '\t'
    }
    # Check for preferences file; load it if it exists.
    lprefs = {}
    if os.path.isfile(prefs_path):
        lprefs = util.load_dict(prefs_path)
    # Supply default preferences that are missing.
    for key in dprefs.keys():
        if key not in lprefs:
            lprefs[key] = dprefs[key]
    # Return preferences.
    return lprefs
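
A hedged usage sketch (the preferences path is hypothetical; os is imported as in the snippet's module):

    prefs = init_prefs(os.path.join(os.path.expanduser('~'), '.stoich_prefs'))
    print(prefs['wdir'], prefs['delimiter'])
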
Example #33
def parse_args():
    parser = argparse.ArgumentParser()

    data = parser.add_argument_group('data sets; model loading and saving')

    data.add_argument('--source_dataset',
                      type=str,
                      metavar='PATH',
                      help="parallel training corpus (source)")
    data.add_argument('--target_dataset',
                      type=str,
                      metavar='PATH',
                      help="parallel training corpus (target)")
    # parallel training corpus (source and target). Hidden option for backward compatibility
    data.add_argument('--datasets',
                      type=str,
                      metavar='PATH',
                      nargs=2,
                      help=argparse.SUPPRESS)
    data.add_argument(
        '--dictionaries',
        type=str,
        required=True,
        metavar='PATH',
        nargs="+",
        help=
        "network vocabularies (one per source factor, plus target vocabulary)")
    data.add_argument('--saveFreq',
                      type=int,
                      default=30000,
                      metavar='INT',
                      help="save frequency (default: %(default)s)")
    data.add_argument('--model',
                      '--saveto',
                      type=str,
                      default='model',
                      metavar='PATH',
                      dest='saveto',
                      help="model file name (default: %(default)s)")
    data.add_argument(
        '--reload',
        type=str,
        default=None,
        metavar='PATH',
        help=
        "load existing model from this path. Set to \"latest_checkpoint\" to reload the latest checkpoint in the same directory of --saveto"
    )
    data.add_argument(
        '--no_reload_training_progress',
        action='store_false',
        dest='reload_training_progress',
        help="don't reload training progress (only used if --reload is enabled)"
    )
    data.add_argument(
        '--summary_dir',
        type=str,
        required=False,
        metavar='PATH',
        help=
        "directory for saving summaries (default: same directory as the --saveto file)"
    )
    data.add_argument(
        '--summaryFreq',
        type=int,
        default=0,
        metavar='INT',
        help=
        "Save summaries after INT updates, if 0 do not save summaries (default: %(default)s)"
    )

    network = parser.add_argument_group('network parameters')
    network.add_argument('--embedding_size',
                         '--dim_word',
                         type=int,
                         default=512,
                         metavar='INT',
                         help="embedding layer size (default: %(default)s)")
    network.add_argument('--state_size',
                         '--dim',
                         type=int,
                         default=1000,
                         metavar='INT',
                         help="hidden state size (default: %(default)s)")

    network.add_argument(
        '--source_vocab_sizes',
        '--n_words_src',
        type=int,
        default=None,
        nargs='+',
        metavar='INT',
        help=
        "source vocabulary sizes (one per input factor) (default: %(default)s)"
    )

    network.add_argument('--target_vocab_size',
                         '--n_words',
                         type=int,
                         default=-1,
                         metavar='INT',
                         help="target vocabulary size (default: %(default)s)")
    network.add_argument('--factors',
                         type=int,
                         default=1,
                         metavar='INT',
                         help="number of input factors (default: %(default)s)")

    network.add_argument(
        '--dim_per_factor',
        type=int,
        default=None,
        nargs='+',
        metavar='INT',
        help=
        "list of word vector dimensionalities (one per factor): '--dim_per_factor 250 200 50' for total dimensionality of 500 (default: %(default)s)"
    )
    network.add_argument(
        '--enc_depth',
        type=int,
        default=1,
        metavar='INT',
        help="number of encoder layers (default: %(default)s)")
    network.add_argument(
        '--enc_recurrence_transition_depth',
        type=int,
        default=1,
        metavar='INT',
        help=
        "number of GRU transition operations applied in the encoder. Minimum is 1. (Only applies to gru). (default: %(default)s)"
    )
    network.add_argument(
        '--dec_depth',
        type=int,
        default=1,
        metavar='INT',
        help="number of decoder layers (default: %(default)s)")
    network.add_argument(
        '--dec_base_recurrence_transition_depth',
        type=int,
        default=2,
        metavar='INT',
        help=
        "number of GRU transition operations applied in the first layer of the decoder. Minimum is 2.  (Only applies to gru_cond). (default: %(default)s)"
    )
    network.add_argument(
        '--dec_high_recurrence_transition_depth',
        type=int,
        default=1,
        metavar='INT',
        help=
        "number of GRU transition operations applied in the higher layers of the decoder. Minimum is 1. (Only applies to gru). (default: %(default)s)"
    )
    network.add_argument(
        '--dec_deep_context',
        action='store_true',
        help="pass context vector (from first layer) to deep decoder layers")
    network.add_argument('--use_dropout',
                         action="store_true",
                         help="use dropout layer (default: %(default)s)")
    network.add_argument(
        '--dropout_embedding',
        type=float,
        default=0.2,
        metavar="FLOAT",
        help=
        "dropout for input embeddings (0: no dropout) (default: %(default)s)")
    network.add_argument(
        '--dropout_hidden',
        type=float,
        default=0.2,
        metavar="FLOAT",
        help="dropout for hidden layer (0: no dropout) (default: %(default)s)")
    network.add_argument(
        '--dropout_source',
        type=float,
        default=0.0,
        metavar="FLOAT",
        help="dropout source words (0: no dropout) (default: %(default)s)")
    network.add_argument(
        '--dropout_target',
        type=float,
        default=0.0,
        metavar="FLOAT",
        help="dropout target words (0: no dropout) (default: %(default)s)")
    network.add_argument(
        '--use_layer_norm',
        '--layer_normalisation',
        action="store_true",
        dest="use_layer_norm",
        help="Set to use layer normalization in encoder and decoder")
    network.add_argument(
        '--tie_encoder_decoder_embeddings',
        action="store_true",
        dest="tie_encoder_decoder_embeddings",
        help=
        "tie the input embeddings of the encoder and the decoder (first factor only). Source and target vocabulary size must be the same"
    )
    network.add_argument(
        '--tie_decoder_embeddings',
        action="store_true",
        dest="tie_decoder_embeddings",
        help=
        "tie the input embeddings of the decoder with the softmax output embeddings"
    )
    network.add_argument(
        '--output_hidden_activation',
        type=str,
        default='tanh',
        choices=['tanh', 'relu', 'prelu', 'linear'],
        help=
        'activation function in hidden layer of the output network (default: %(default)s)'
    )
    network.add_argument(
        '--softmax_mixture_size',
        type=int,
        default=1,
        metavar="INT",
        help="number of softmax components to use (default: %(default)s)")

    training = parser.add_argument_group('training parameters')
    training.add_argument(
        '--maxlen',
        type=int,
        default=100,
        metavar='INT',
        help=
        "maximum sequence length for training and validation (default: %(default)s)"
    )
    training.add_argument('--batch_size',
                          type=int,
                          default=80,
                          metavar='INT',
                          help="minibatch size (default: %(default)s)")
    training.add_argument(
        '--token_batch_size',
        type=int,
        default=0,
        metavar='INT',
        help=
        "minibatch size (expressed in number of source or target tokens). Sentence-level minibatch size will be dynamic. If this is enabled, batch_size only affects sorting by length. (default: %(default)s)"
    )
    training.add_argument(
        '--max_epochs',
        type=int,
        default=5000,
        metavar='INT',
        help="maximum number of epochs (default: %(default)s)")
    training.add_argument(
        '--finish_after',
        type=int,
        default=10000000,
        metavar='INT',
        help="maximum number of updates (minibatches) (default: %(default)s)")
    training.add_argument(
        '--decay_c',
        type=float,
        default=0.0,
        metavar='FLOAT',
        help="L2 regularization penalty (default: %(default)s)")
    training.add_argument(
        '--map_decay_c',
        type=float,
        default=0.0,
        metavar='FLOAT',
        help=
        "MAP-L2 regularization penalty towards original weights (default: %(default)s)"
    )
    training.add_argument(
        '--prior_model',
        type=str,
        metavar='PATH',
        help=
        "Prior model for MAP-L2 regularization. Unless using \"--reload\", this will also be used for initialization."
    )
    training.add_argument(
        '--clip_c',
        type=float,
        default=1.0,
        metavar='FLOAT',
        help="gradient clipping threshold (default: %(default)s)")
    training.add_argument('--learning_rate',
                          '--lrate',
                          type=float,
                          default=0.0001,
                          metavar='FLOAT',
                          help="learning rate (default: %(default)s)")
    training.add_argument('--label_smoothing',
                          type=float,
                          default=0.0,
                          metavar='FLOAT',
                          help="label smoothing (default: %(default)s)")
    training.add_argument(
        '--no_shuffle',
        action="store_false",
        dest="shuffle_each_epoch",
        help="disable shuffling of training data (for each epoch)")
    training.add_argument(
        '--keep_train_set_in_memory',
        action="store_true",
        help="Keep training dataset lines stores in RAM during training")
    training.add_argument('--no_sort_by_length',
                          action="store_false",
                          dest="sort_by_length",
                          help='do not sort sentences in maxibatch by length')
    training.add_argument(
        '--maxibatch_size',
        type=int,
        default=20,
        metavar='INT',
        help=
        'size of maxibatch (number of minibatches that are sorted by length) (default: %(default)s)'
    )
    training.add_argument('--optimizer',
                          type=str,
                          default="adam",
                          choices=['adam'],
                          help="optimizer (default: %(default)s)")

    validation = parser.add_argument_group('validation parameters')
    validation.add_argument(
        '--valid_source_dataset',
        type=str,
        default=None,
        metavar='PATH',
        help="source validation corpus (default: %(default)s)")
    validation.add_argument(
        '--valid_target_dataset',
        type=str,
        default=None,
        metavar='PATH',
        help="target validation corpus (default: %(default)s)")
    # parallel validation corpus (source and target). Hidden option for backward compatibility
    validation.add_argument('--valid_datasets',
                            type=str,
                            default=None,
                            metavar='PATH',
                            nargs=2,
                            help=argparse.SUPPRESS)
    validation.add_argument(
        '--valid_batch_size',
        type=int,
        default=80,
        metavar='INT',
        help="validation minibatch size (default: %(default)s)")
    validation.add_argument(
        '--valid_token_batch_size',
        type=int,
        default=0,
        metavar='INT',
        help=
        "validation minibatch size (expressed in number of source or target tokens). Sentence-level minibatch size will be dynamic. If this is enabled, valid_batch_size only affects sorting by length. (default: %(default)s)"
    )
    validation.add_argument('--validFreq',
                            type=int,
                            default=10000,
                            metavar='INT',
                            help="validation frequency (default: %(default)s)")
    validation.add_argument(
        '--valid_script',
        type=str,
        default=None,
        metavar='PATH',
        help=
        "path to script for external validation (default: %(default)s). The script will be passed an argument specifying the path of a file that contains translations of the source validation corpus. It must write a single score to standard output."
    )
    validation.add_argument(
        '--patience',
        type=int,
        default=10,
        metavar='INT',
        help="early stopping patience (default: %(default)s)")
    validation.add_argument(
        '--run_validation',
        action='store_true',
        help="Compute validation score on validation dataset")

    display = parser.add_argument_group('display parameters')
    display.add_argument(
        '--dispFreq',
        type=int,
        default=1000,
        metavar='INT',
        help="display loss after INT updates (default: %(default)s)")
    display.add_argument(
        '--sampleFreq',
        type=int,
        default=10000,
        metavar='INT',
        help="display some samples after INT updates (default: %(default)s)")
    display.add_argument(
        '--beamFreq',
        type=int,
        default=10000,
        metavar='INT',
        help=
        "display some beam_search samples after INT updates (default: %(default)s)"
    )
    display.add_argument('--beam_size',
                         type=int,
                         default=12,
                         metavar='INT',
                         help="size of the beam (default: %(default)s)")

    translate = parser.add_argument_group('translate parameters')
    translate.add_argument('--translate_valid',
                           action='store_true',
                           dest='translate_valid',
                           help='Translate source dataset instead of training')
    translate.add_argument(
        '--no_normalize',
        action='store_false',
        dest='normalize',
        help="Cost of sentences will not be normalized by length")
    translate.add_argument('--n_best',
                           action='store_true',
                           dest='n_best',
                           help="Print full beam")
    translate.add_argument(
        '--n_threads',
        type=int,
        default=5,
        metavar='INT',
        help="Number of threads to use for beam search (default: %(default)s)")
    translate.add_argument(
        '--translation_maxlen',
        type=int,
        default=200,
        metavar='INT',
        help=
        "Maximum length of translation output sentence (default: %(default)s)")
    config = parser.parse_args()

    # allow "--datasets" for backward compatibility
    if config.datasets:
        if config.source_dataset or config.target_dataset:
            logging.error(
                'argument clash: --datasets is mutually exclusive with --source_dataset and --target_dataset'
            )
            sys.exit(1)
        else:
            config.source_dataset = config.datasets[0]
            config.target_dataset = config.datasets[1]
    elif not config.source_dataset:
        logging.error('--source_dataset is required')
        sys.exit(1)
    elif not config.target_dataset:
        logging.error('--target_dataset is required')
        sys.exit(1)

    # allow "--valid_datasets" for backward compatibility
    if config.valid_datasets:
        if config.valid_source_dataset or config.valid_target_dataset:
            logging.error(
                'argument clash: --valid_datasets is mutually exclusive with --valid_source_dataset and --valid_target_dataset'
            )
            sys.exit(1)
        else:
            config.valid_source_dataset = config.valid_datasets[0]
            config.valid_target_dataset = config.valid_datasets[1]

    # check factor-related options are consistent

    if config.dim_per_factor is None:
        if config.factors == 1:
            config.dim_per_factor = [config.embedding_size]
        else:
            logging.error(
                'if using factored input, you must specify \'dim_per_factor\'\n'
            )
            sys.exit(1)

    if len(config.dim_per_factor) != config.factors:
        logging.error(
            'mismatch between \'--factors\' ({0}) and \'--dim_per_factor\' ({1} entries)\n'
            .format(config.factors, len(config.dim_per_factor)))
        sys.exit(1)

    if sum(config.dim_per_factor) != config.embedding_size:
        logging.error(
            'mismatch between \'--embedding_size\' ({0}) and \'--dim_per_factor\' (sums to {1})\n'
            .format(config.embedding_size, sum(config.dim_per_factor)))
        sys.exit(1)

    if len(config.dictionaries) != config.factors + 1:
        logging.error(
            '\'--dictionaries\' must specify one dictionary per source factor and one target dictionary\n'
        )
        sys.exit(1)

    # determine target_embedding_size
    if config.tie_encoder_decoder_embeddings:
        config.target_embedding_size = config.dim_per_factor[0]
    else:
        config.target_embedding_size = config.embedding_size

    # set vocabulary sizes
    vocab_sizes = []
    if config.source_vocab_sizes is None:
        vocab_sizes = [-1] * config.factors
    elif len(config.source_vocab_sizes) == config.factors:
        vocab_sizes = config.source_vocab_sizes
    elif len(config.source_vocab_sizes) < config.factors:
        num_missing = config.factors - len(config.source_vocab_sizes)
        vocab_sizes += config.source_vocab_sizes + [-1] * num_missing
    else:
        logging.error(
            'too many values supplied to \'--source_vocab_sizes\' option (expected one per factor = {0})'
            .format(config.factors))
        sys.exit(1)
    if config.target_vocab_size == -1:
        vocab_sizes.append(-1)
    else:
        vocab_sizes.append(config.target_vocab_size)

    # for unspecified vocabulary sizes, determine sizes from vocabulary dictionaries
    for i, vocab_size in enumerate(vocab_sizes):
        if vocab_size >= 0:
            continue
        try:
            d = util.load_dict(config.dictionaries[i])
        except:
            logging.error(
                'failed to determine vocabulary size from file: {0}'.format(
                    config.dictionaries[i]))
            sys.exit(1)
        vocab_sizes[i] = max(d.values()) + 1

    config.source_dicts = config.dictionaries[:-1]
    config.source_vocab_sizes = vocab_sizes[:-1]
    config.target_dict = config.dictionaries[-1]
    config.target_vocab_size = vocab_sizes[-1]

    # set the model version
    config.model_version = 0.2
    config.theano_compat = False

    return config
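The function above mostly validates option combinations before returning the config. The factor-related rules it enforces reduce to a small invariant, sketched here in isolation (the helper name and the example values are illustrative only, not part of the original script):

def check_factor_config(factors, dim_per_factor, embedding_size):
    """Return True iff the factored-embedding options would pass the checks above."""
    if dim_per_factor is None:
        # only legal with a single factor; it then defaults to [embedding_size]
        return factors == 1
    if len(dim_per_factor) != factors:
        return False
    return sum(dim_per_factor) == embedding_size

assert check_factor_config(1, None, 512)
assert check_factor_config(3, [256, 128, 128], 512)
assert not check_factor_config(2, [256, 128], 512)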
Example #34
0
def main(models, source_files, saveto, save_alignment, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False,
         nbest=False, suppress_unk=False, a_json=False,
         print_word_probabilities=False):
    # load model options
    options = []
    for model in models:
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except:
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))

        #hacks for using old models with missing options
        if not 'dropout_embedding' in options[-1]:
            options[-1]['dropout_embedding'] = 0
        if 'dropout_hidden' not in options[-1]:
            options[-1]['dropout_hidden'] = 0
        if 'dropout_source' not in options[-1]:
            options[-1]['dropout_source'] = 0
        if 'dropout_target' not in options[-1]:
            options[-1]['dropout_target'] = 0
        if 'factors' not in options[-1]:
            options[-1]['factors'] = 1
        if 'dim_per_factor' not in options[-1]:
            options[-1]['dim_per_factor'] = [options[-1]['dim_word']]

    dictionaries = options[0]['dictionaries']

    dictionaries_sources = dictionaries[:-1]
    print >> sys.stderr, "SRC DICT:", dictionaries_sources
    dictionary_target = dictionaries[-1]
    print >> sys.stderr, "TRG DICT:", dictionary_target

    encoders_word_dicts = []
    encoders_word_idicts = []
    for dictionaries_source in dictionaries_sources:
        # load source dictionary and invert
        word_dicts = []
        word_idicts = []
        for dictionary in dictionaries_source:
            word_dict = load_dict(dictionary)
            if options[0]['n_words_src']:
                for key, idx in word_dict.items():
                    if idx >= options[0]['n_words_src']:
                        del word_dict[key]
            word_idict = dict()
            for kk, vv in word_dict.iteritems():
                word_idict[vv] = kk
            word_idict[0] = '<eos>'
            word_idict[1] = 'UNK'
            word_dicts.append(word_dict)
            word_idicts.append(word_idict)
        encoders_word_dicts.append(word_dicts)
        encoders_word_idicts.append(word_idicts)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize, verbose,
                  nbest, save_alignment is not None, suppress_unk))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(files):
        source_sentences = []
        for idx, lines in enumerate(zip(*files)):
            #  print lines
            xs = []
            enc_words = []
            for src_idx, line in enumerate(lines):
                if chr_level:
                    words = list(line.decode('utf-8').strip())
                else:
                    words = line.strip().split()

                x = []
                for w in words:
                    # look each factor up in the dictionaries of encoder src_idx;
                    # unseen entries map to index 1 (UNK)
                    w = [encoders_word_dicts[src_idx][i][f]
                         if f in encoders_word_dicts[src_idx][i] else 1
                         for (i, f) in enumerate(w.split('|'))]
                    if len(w) != options[0]['factors'][src_idx]:
                        sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'][src_idx], len(w)))
                        for midx in xrange(n_process):
                            processes[midx].terminate()
                        sys.exit(1)
                    x.append(w)

                x += [ [0] * options[0]['factors'][src_idx] ]
                #  print "X:", x
                xs.append(x)
                enc_words.append(words)
            #  print "XS:", xs
            #  new_xs = [ [xs[j][i]   for j in range(len(xs)) ]  for i in range(len(xs[0])) ]
            #  print "NEW XS:", new_xs
            queue.put((idx, xs))
            source_sentences.append(enc_words)
        return idx+1, source_sentences

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(':'.join([f.name for f in source_files])))
    n_samples, source_sentences = _send_jobs(source_files)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment = trans
            order = numpy.argsort(scores)
            for j in order:
                saveto.write('{0} ||| {1} ||| {2}\n'.format(i, _seqs2words(samples[j]), scores[j]))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                  if a_json:
                    print_matrix_json(alignment[j], source_sentences[i], _seqs2words(samples[j]).split(), i, i+j,save_alignment)
                  else:
                    save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                        i, _seqs2words(samples[j]), scores[j], ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(samples[j])))
                    print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment = trans

            saveto.write(_seqs2words(samples) + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
              if a_json:
                print_matrix_json(alignment, source_sentences[i], _seqs2words(samples).split(), i, i, save_alignment)
              else:
                save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                      i, _seqs2words(trans[0]), 0, ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(trans[0])))
                print_matrix(trans[3], save_alignment)

    sys.stderr.write('Done\n')
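The factored lookup inside _send_jobs maps each '|'-separated factor of a token to an id, with 1 standing in for unknown entries; the same logic on toy dictionaries (everything below is invented for the example):

# One dictionary per factor; index 1 plays the role of UNK.
factor_dicts = [
    {'the': 5, 'cat': 7},    # surface-form dictionary
    {'DET': 3, 'NOUN': 4},   # POS-factor dictionary
]

def encode_word(token, dicts, unk=1):
    factors = token.split('|')
    assert len(factors) == len(dicts), 'expected one factor per dictionary'
    return [d.get(f, unk) for d, f in zip(dicts, factors)]

print(encode_word('cat|NOUN', factor_dicts))   # [7, 4]
print(encode_word('dog|NOUN', factor_dicts))   # [1, 4]  (unknown surface form)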
Example #35
0
def main(models, source_file, saveto, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False):

    # load model options
    options = []
    for model in models:
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except:
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))

        #hacks for using old models with missing options
        if not 'dropout_embedding' in options[-1]:
            options[-1]['dropout_embedding'] = 0
        if not 'dropout_hidden' in options[-1]:
            options[-1]['dropout_hidden'] = 0
        if not 'dropout_source' in options[-1]:
            options[-1]['dropout_source'] = 0
        if not 'dropout_target' in options[-1]:
            options[-1]['dropout_target'] = 0

    dictionary, dictionary_target = options[0]['dictionaries']

    # load source dictionary and invert
    word_dict = load_dict(dictionary)
    word_idict = dict()
    for kk, vv in word_dict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize, verbose, nbest, suppress_unk))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()
            x = map(lambda w: word_dict[w] if w in word_dict else 1, words)
            x = map(lambda ii: ii if ii < options[0]['n_words_src'] else 1, x)
            x += [0]
            queue.put((idx, x))
        return idx+1

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples = _send_jobs(source_file)
    _finish_processes()
    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores = trans
            order = numpy.argsort(scores)
            for j in order:
                saveto.write('{0} ||| {1} ||| {2}\n'.format(i, _seqs2words(samples[j]), scores[j]))
        else:
            saveto.write(_seqs2words(trans) + '\n')

    sys.stderr.write('Done\n')
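Worker processes put finished translations on rqueue in completion order, and _retrieve_jobs re-serialises them into input order by buffering; the same trick shown without any processes (a sketch, not the original helper):

def retrieve_in_order(responses, n_samples):
    """Yield results in input order, given (index, result) pairs in arbitrary order."""
    trans = [None] * n_samples
    out_idx = 0
    for idx, result in responses:
        trans[idx] = result
        while out_idx < n_samples and trans[out_idx] is not None:
            yield trans[out_idx]
            out_idx += 1

# Workers finished in the order 2, 0, 1; the output is still 'a', 'b', 'c'.
print(list(retrieve_in_order([(2, 'c'), (0, 'a'), (1, 'b')], 3)))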
Example #36
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 model_type,
                 batch_size=128,
                 maxlen=100,
                 source_vocab_sizes=None,
                 target_vocab_size=None,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20,
                 token_batch_size=0,
                 keep_data_in_memory=False):
        if keep_data_in_memory:
            self.source, self.target = FileWrapper(source), FileWrapper(target)
            if shuffle_each_epoch:
                r = numpy.random.permutation(len(self.source))
                self.source.shuffle_lines(r)
                self.target.shuffle_lines(r)
        elif shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict, model_type))
        self.target_dict = load_dict(target_dict, model_type)

        # Determine the UNK value for each dictionary (the value depends on
        # which version of build_dictionary.py was used).

        def determine_unk_val(d):
            if '<UNK>' in d and d['<UNK>'] == 2:
                return 2
            return 1

        self.source_unk_vals = [determine_unk_val(d)
                                for d in self.source_dicts]
        self.target_unk_val = determine_unk_val(self.target_dict)


        self.keep_data_in_memory = keep_data_in_memory
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.source_vocab_sizes = source_vocab_sizes
        self.target_vocab_size = target_vocab_size

        self.token_batch_size = token_batch_size

        if self.source_vocab_sizes != None:
            assert len(self.source_vocab_sizes) == len(self.source_dicts)
            for d, vocab_size in zip(self.source_dicts, self.source_vocab_sizes):
                if vocab_size != None and vocab_size > 0:
                    for key, idx in list(d.items()):
                        if idx >= vocab_size:
                            del d[key]

        if self.target_vocab_size != None and self.target_vocab_size > 0:
            for key, idx in list(self.target_dict.items()):
                if idx >= self.target_vocab_size:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        

        self.end_of_data = False
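Two details of this constructor are easy to miss: dictionary entries whose index is at or above the requested vocabulary size are deleted in place, and the UNK index is 2 only when the dictionary explicitly maps '<UNK>' to 2 (otherwise 1). A standalone sketch with a toy dictionary standing in for the output of load_dict:

vocab = {'<eos>': 0, '<UNK>': 2, 'the': 3, 'cat': 4, 'zebra': 50001}

def truncate_vocab(d, vocab_size):
    if vocab_size is not None and vocab_size > 0:
        for key, idx in list(d.items()):
            if idx >= vocab_size:
                del d[key]
    return d

def determine_unk_val(d):
    return 2 if d.get('<UNK>') == 2 else 1

truncate_vocab(vocab, 50000)
print(sorted(vocab))             # 'zebra' has been dropped
print(determine_unk_val(vocab))  # 2, because '<UNK>' maps to 2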
Example #37
0
def main(models, source_file, saveto, save_alignment=None, k=5,
         normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, a_json=False, print_word_probabilities=False, return_hyp_graph=False):
    # load model options
    options = []
    for model in models:
        options.append(load_config(model))

        fill_options(options[-1])

    dictionaries = options[0]['dictionaries']

    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionary and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    # create input and output queues for processes
    queue = Queue()
    rqueue = Queue()
    processes = [None] * n_process
    for midx in xrange(n_process):
        processes[midx] = Process(
            target=translate_model,
            args=(queue, rqueue, midx, models, options, k, normalize, verbose, nbest, save_alignment is not None, suppress_unk, return_hyp_graph))
        processes[midx].start()

    # utility function
    def _seqs2words(cc):
        ww = []
        for w in cc:
            if w == 0:
                break
            ww.append(word_idict_trg[w])
        return ' '.join(ww)

    def _send_jobs(f):
        source_sentences = []
        for idx, line in enumerate(f):
            if chr_level:
                words = list(line.decode('utf-8').strip())
            else:
                words = line.strip().split()

            x = []
            for w in words:
                w = [word_dicts[i][f] if f in word_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))]
                if len(w) != options[0]['factors']:
                    sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'], len(w)))
                    for midx in xrange(n_process):
                        processes[midx].terminate()
                    sys.exit(1)
                x.append(w)

            x += [[0]*options[0]['factors']]
            queue.put((idx, x))
            source_sentences.append(words)
        return idx+1, source_sentences

    def _finish_processes():
        for midx in xrange(n_process):
            queue.put(None)

    def _retrieve_jobs(n_samples):
        trans = [None] * n_samples
        out_idx = 0
        for idx in xrange(n_samples):
            resp = rqueue.get()
            trans[resp[0]] = resp[1]
            if verbose and numpy.mod(idx, 10) == 0:
                sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples))
            while out_idx < n_samples and trans[out_idx] != None:
                yield trans[out_idx]
                out_idx += 1

    sys.stderr.write('Translating {0} ...\n'.format(source_file.name))
    n_samples, source_sentences = _send_jobs(source_file)
    _finish_processes()

    for i, trans in enumerate(_retrieve_jobs(n_samples)):
        if nbest:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
                renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            order = numpy.argsort(scores)
            for j in order:
                if print_word_probabilities:
                    probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j])
                else:
                    probs = ""
                saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(i, _seqs2words(samples[j]), scores[j], probs))
                # print alignment matrix for each hypothesis
                # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos
                if save_alignment is not None:
                  if a_json:
                    print_matrix_json(alignment[j], source_sentences[i], _seqs2words(samples[j]).split(), i, i+j,save_alignment)
                  else:
                    save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                        i, _seqs2words(samples[j]), scores[j], ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(samples[j])))
                    print_matrix(alignment[j], save_alignment)
        else:
            samples, scores, word_probs, alignment, hyp_graph = trans
            if return_hyp_graph:
                renderer = HypGraphRenderer(hyp_graph)
                renderer.wordify(word_idict_trg)
                renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True)
            saveto.write(_seqs2words(samples) + "\n")
            if print_word_probabilities:
                for prob in word_probs:
                    saveto.write("{} ".format(prob))
                saveto.write('\n')
            if save_alignment is not None:
              if a_json:
                print_matrix_json(alignment, source_sentences[i], _seqs2words(trans[0]).split(), i, i,save_alignment)
              else:
                save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format(
                                      i, _seqs2words(trans[0]), 0, ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(trans[0])))
                print_matrix(alignment, save_alignment)

    sys.stderr.write('Done\n')
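The alignment header written to save_alignment follows a fixed ' ||| '-separated layout: sentence id, translation, score, source sentence, then the two token counts (each including the end-of-sentence marker). A small parser sketch for that format, with illustrative field names that are not part of the original code:

def parse_alignment_header(line):
    sent_id, translation, score, source, lengths = line.strip().split(' ||| ')
    src_len, trg_len = [int(n) for n in lengths.split()]
    return {
        'id': int(sent_id),
        'translation': translation,
        'score': float(score),
        'source': source,
        'source_len_with_eos': src_len,
        'target_len_with_eos': trg_len,
    }

header = '0 ||| das Haus ||| -1.5 ||| the house ||| 3 3'
print(parse_alignment_header(header)['score'])   # -1.5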
Example #39
0
    def __init__(self, source, target,
                 source_dicts, target_dict,
                 batch_size=128,
                 maxlen=100,
                 n_words_source=-1,
                 n_words_target=-1,
                 skip_empty=False,
                 shuffle_each_epoch=False,
                 sort_by_length=True,
                 use_factor=False,
                 maxibatch_size=20):
        if shuffle_each_epoch:
            self.source_orig = source
            self.target_orig = target
            self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
        else:
            self.source = fopen(source, 'r')
            self.target = fopen(target, 'r')
        print 'scan the dataset.'
        for si, _ in enumerate(self.source):
            pass
        for ti, _ in enumerate(self.target):
            pass

        self.source.seek(0)
        self.target.seek(0)
        assert si == ti, 'the source and target documents must have the same number of lines'
        print 'scanned {} lines'.format(si + 1)

        self.source_dicts = []
        for source_dict in source_dicts:
            self.source_dicts.append(load_dict(source_dict))
        self.target_dict = load_dict(target_dict)

        self.batch_size = batch_size
        self.maxlen = maxlen
        self.skip_empty = skip_empty
        self.use_factor = use_factor

        self.n_words_source = n_words_source
        self.n_words_target = n_words_target

        if self.n_words_source > 0:
            for d in self.source_dicts:
                for key, idx in d.items():
                    if idx >= self.n_words_source:
                        del d[key]

        if self.n_words_target > 0:
            for key, idx in self.target_dict.items():
                if idx >= self.n_words_target:
                    del self.target_dict[key]

        self.shuffle = shuffle_each_epoch
        self.sort_by_length = sort_by_length

        self.source_buffer = []
        self.target_buffer = []
        self.k = batch_size * maxibatch_size
        

        self.end_of_data = False
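The buffer size self.k = batch_size * maxibatch_size implements length bucketing: a maxibatch of k sentence pairs is read into the buffers, sorted by length, and then cut into minibatches so that padding within each minibatch stays small. A file-free sketch of the idea on synthetic sentences:

def make_minibatches(sentences, batch_size, maxibatch_size, sort_by_length=True):
    k = batch_size * maxibatch_size
    for start in range(0, len(sentences), k):
        maxibatch = sentences[start:start + k]
        if sort_by_length:
            maxibatch = sorted(maxibatch, key=len)
        for i in range(0, len(maxibatch), batch_size):
            yield maxibatch[i:i + batch_size]

sents = [['w'] * n for n in (5, 2, 9, 1, 7, 3)]
for batch in make_minibatches(sents, batch_size=2, maxibatch_size=2):
    print([len(s) for s in batch])   # [1, 2], [5, 9], [3, 7]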
Example #40
0
    def load_alias(self):
        ''' load an alias dict '''

        self.alias_st = util.load_dict(self.filename_alias_st)
        self.alias_bigram = util.load_dict(self.filename_alias_bigram)
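The example is cut off here. Assuming util.load_dict returns an ordinary string-to-string mapping for these alias files (an assumption; the file format is not shown), alias resolution reduces to a dictionary lookup with a fallback to the original token:

# Assumption: alias tables map a surface form to its canonical form.
alias_st = {'NYC': 'New York City', 'SF': 'San Francisco'}

def resolve_alias(token, alias_table):
    # fall back to the token itself when no alias is known
    return alias_table.get(token, token)

print(resolve_alias('NYC', alias_st))      # New York City
print(resolve_alias('Boston', alias_st))   # Boston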