def __init__( self, input_path=None, rating_scale=5, verbose=False, ): self.uid_map = load_dict(os.path.join(input_path, "uid_map"), sep=",") self.iid_map = load_dict(os.path.join(input_path, "iid_map"), sep=",") self.aspect_id_map = load_dict(os.path.join(input_path, "aspect_id_map"), sep=",") self.opinion_id_map = load_dict(os.path.join(input_path, "opinion_id_map"), sep=",") self.U = np.load(os.path.join(input_path, "U.npy")) self.I = np.load(os.path.join(input_path, "I.npy")) self.A = np.load(os.path.join(input_path, "A.npy")) self.O = np.load(os.path.join(input_path, "O.npy")) self.G1 = np.load(os.path.join(input_path, "G1.npy")) self.G2 = np.load(os.path.join(input_path, "G2.npy")) self.G3 = np.load(os.path.join(input_path, "G3.npy")) self.rating_scale = rating_scale self.id2aspect = {v: k for k, v in self.aspect_id_map.items()} self.verbose = verbose if self.verbose: print("Load MTER from %s" % input_path)
def __init__( self, input_path=None, alpha=0.85, num_most_cared_aspects=15, rating_scale=5, verbose=False, ): self.uid_map = load_dict(os.path.join(input_path, "uid_map"), sep=",") self.iid_map = load_dict(os.path.join(input_path, "iid_map"), sep=",") self.aspect_id_map = load_dict(os.path.join(input_path, "aspect_id_map"), sep=",") self.U1 = np.load(os.path.join(input_path, "U1.npy")) self.U2 = np.load(os.path.join(input_path, "U2.npy")) self.V = np.load(os.path.join(input_path, "V.npy")) self.H1 = np.load(os.path.join(input_path, "H1.npy")) self.H2 = np.load(os.path.join(input_path, "H2.npy")) self.alpha = alpha self.n_cared_aspects = num_most_cared_aspects self.rating_scale = rating_scale self.id2aspect = {v: k for k, v in self.aspect_id_map.items()} self.verbose = verbose if self.verbose: print("Load EFM from %s" % input_path)
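# The two loaders above assume that load_dict(path, sep=",") returns a plain
# dict mapping raw user/item/aspect tokens to integer ids (they invert it with
# {v: k for k, v in ...}). The helper itself is not shown in this section; the
# sketch below is a hypothetical stand-in consistent with those calls, assuming
# one `key,index` pair per line -- the name load_dict_sketch and the file
# format are assumptions, not the projects' actual implementation.
def load_dict_sketch(path, sep=","):
    """Hypothetical reader: one `key<sep>integer_id` pair per line."""
    mapping = {}
    with open(path, "r") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            # split on the last separator so keys may themselves contain it
            key, idx = line.rsplit(sep, 1)
            mapping[key] = int(idx)
    return mapping
# Inverting the result for id-to-token lookups is then the one-liner used
# throughout this section, e.g. id2aspect = {v: k for k, v in aspect_id_map.items()}.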
def __init__(self, source, target, source_dict, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True): if shuffle_each_epoch: shuffle.main([source, target]) self.source = fopen(source+'.shuf', 'r') self.target = fopen(target+'.shuf', 'r') else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dict = load_dict(source_dict) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * 20 self.end_of_data = False
def __init__(self, source, target, source_dict, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True): if shuffle_each_epoch: shuffle.main([source, target]) self.source = fopen(source + '.shuf', 'r') self.target = fopen(target + '.shuf', 'r') else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dict = load_dict(source_dict) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * 20 self.end_of_data = False
def main(models, source_file, nbest_file, saveto, b=80,
         normalize=False, verbose=False, alignweights=False):
    # load model options
    options = []
    for model in models:
        try:
            with open('%s.json' % model, 'rb') as f:
                options.append(json.load(f))
        except:
            with open('%s.pkl' % model, 'rb') as f:
                options.append(pkl.load(f))

        # hacks for using old models with missing options
        if 'dropout_embedding' not in options[-1]:
            options[-1]['dropout_embedding'] = 0
        if 'dropout_hidden' not in options[-1]:
            options[-1]['dropout_hidden'] = 0
        if 'dropout_source' not in options[-1]:
            options[-1]['dropout_source'] = 0
        if 'dropout_target' not in options[-1]:
            options[-1]['dropout_target'] = 0

    dictionaries = options[0]['dictionaries']
    dictionaries_source = dictionaries[:-1]
    dictionary_target = dictionaries[-1]

    # load source dictionaries and invert
    word_dicts = []
    word_idicts = []
    for dictionary in dictionaries_source:
        word_dict = load_dict(dictionary)
        if options[0]['n_words_src']:
            for key, idx in word_dict.items():
                if idx >= options[0]['n_words_src']:
                    del word_dict[key]
        word_idict = dict()
        for kk, vv in word_dict.iteritems():
            word_idict[vv] = kk
        word_idict[0] = '<eos>'
        word_idict[1] = 'UNK'
        word_dicts.append(word_dict)
        word_idicts.append(word_idict)

    # load target dictionary and invert
    word_dict_trg = load_dict(dictionary_target)
    word_idict_trg = dict()
    for kk, vv in word_dict_trg.iteritems():
        word_idict_trg[vv] = kk
    word_idict_trg[0] = '<eos>'
    word_idict_trg[1] = 'UNK'

    rescore_model(source_file, nbest_file, saveto, models, options, b, normalize, verbose, alignweights)
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, indomain_source='', indomain_target='', interpolation_rate=0.1, maxibatch_size=20): if shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) self.indomain_source_orig = indomain_source self.indomain_target_orig = indomain_target self.indomain_source, self.indomain_target = shuffle.main([self.indomain_source_orig, self.indomain_target_orig], temporary=True) else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.indomain_source = fopen(indomain_source, 'r') self.indomain_target = fopen(indomain_target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False self.interpolation_rate = interpolation_rate self.cur_interpolation_rate = self.interpolation_rate self.indomain_k = int(math.ceil(self.cur_interpolation_rate * self.k)) self.outdomain_k = self.k - self.indomain_k
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, use_factor=False, maxibatch_size=20, keep_data_in_memory=False): if keep_data_in_memory: self.source, self.target = FileWrapper(source), FileWrapper(target) if shuffle_each_epoch: r = numpy.random.permutation(len(self.source)) self.source.shuffle_lines(r) self.target.shuffle_lines(r) elif shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.keep_data_in_memory = keep_data_in_memory self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.use_factor = use_factor self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True, indomain_source='', indomain_target='', interpolation_rate=0.1, maxibatch_size=20): if shuffle_each_epoch: shuffle.main([source, target]) shuffle.main([indomain_source, indomain_target]) self.source = fopen(source+'.shuf', 'r') self.target = fopen(target+'.shuf', 'r') self.indomain_source = fopen(indomain_source+'.shuf', 'r') self.indomain_target = fopen(indomain_target+'.shuf', 'r') else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.indomain_source = fopen(indomain_source, 'r') self.indomain_target = fopen(indomain_target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False self.interpolation_rate = interpolation_rate self.indomain_k = int(math.ceil(self.interpolation_rate * self.k)) self.outdomain_k = self.k - self.indomain_k
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, use_factor=False, maxibatch_size=20, token_batch_size=0): if shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main( [self.source_orig, self.target_orig], temporary=True) else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.use_factor = use_factor self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.token_batch_size = token_batch_size self.end_of_data = False
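# Most of the iterator constructors in this section prune their vocabularies in
# place with `for key, idx in d.items(): ... del d[key]`. That idiom is safe
# under Python 2 (where items() returns a list, and the print statements and
# iteritems() calls place these snippets there), but under Python 3 it raises
# "RuntimeError: dictionary changed size during iteration". One of the later
# iterators already guards against this by iterating over list(d.items()); the
# helper below is a small sketch of that safer form, not code taken from any of
# the projects shown here.
def truncate_vocab(word_dict, max_size):
    """Drop all entries whose index is >= max_size, in place.

    Iterating over a snapshot (list(...)) keeps in-place deletion safe on
    both Python 2 and Python 3.
    """
    if max_size is None or max_size <= 0:
        return word_dict
    for key, idx in list(word_dict.items()):
        if idx >= max_size:
            del word_dict[key]
    return word_dict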
def __init__( self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20, ): global epoch_num if shuffle_each_epoch: shuffle.main([source, target], epoch_num) self.source = fopen(source + '.shuf', 'r') self.target = fopen(target + '.shuf', 'r') else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False self.embeddings = embeddings
def __init__(self, source, target, source_dict, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20): if shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main( [self.source_orig, self.target_orig], temporary=True) else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') #for line in self.source.readlines(): #print line #aline = self.target.readline() #print aline self.source_dict = load_dict(source_dict) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: # if source number is specified for key, idx in self.source_dict.items(): if idx >= self.n_words_source: del self.source_dict[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] # source instance in memory self.target_buffer = [] # target instance in memory self.k = batch_size * maxibatch_size # number of instance in memory in total self.end_of_data = False
def read_model(self, params): user_file = os.path.join(params['config_path'], params['user_filename']) item_file = os.path.join(params['config_path'], params['item_filename']) vocab_file = os.path.join(params['config_path'], params['vocab_filename']) aspect_file = os.path.join(params['config_path'], params['aspect_filename']) opinion_file = os.path.join(params['config_path'], params['opinion_filename']) aspect_opinions_file = os.path.join(params['config_path'], params['aspect_opinions_filename']) model_file = os.path.join(params['config_path'], params['model_filename']) context_word_units = int(params['unit']) lstm_hidden_units = IN_TO_OUT_UNITS_RATIO * context_word_units target_word_units = IN_TO_OUT_UNITS_RATIO * context_word_units user2index = load_dict(user_file) item2index = load_dict(item_file) word2index = load_dict(vocab_file) aspect2index = load_dict(aspect_file) opinion2index = load_dict(opinion_file) aspect_opinions = load_json(aspect_opinions_file) n_user = max(user2index.values()) + 1 n_item = max(item2index.values()) + 1 n_vocab = max(word2index.values()) + 1 n_aspect = max(aspect2index.values()) + 1 n_encode = n_aspect # dummy word counts - not used for eval cs = [1 for _ in range(n_vocab)] # dummy loss func - not used for eval loss_func = L.NegativeSampling(target_word_units, cs, NEGATIVE_SAMPLING_NUM) if params['model_type'] == 'c2v': model = Context2Vec(self.gpu, n_vocab, context_word_units, lstm_hidden_units, target_word_units, loss_func, self.resume) elif params['model_type'] in ['asc2v', 'asc2v-mter']: model = AspectSentiContext2Vec(self.gpu, n_vocab, n_encode, context_word_units, lstm_hidden_units, target_word_units, loss_func, self.resume) S.load_npz(model_file, model) w = model.loss_func.W.data return user2index, item2index, w, word2index, aspect2index, opinion2index, aspect_opinions, model
def testLoad(self,cfg): cfg = self.cfg entity2id = load_dict(cfg['data_folder'] + cfg['entity2id']) word2id = load_dict(cfg['data_folder'] + cfg['word2id']) relation2id = load_dict(cfg['data_folder'] + cfg['relation2id']) train_documents = load_documents(cfg['data_folder'] + cfg['train_documents']) train_document_entity_indices, train_document_texts = index_document_entities(train_documents, word2id, entity2id, cfg['max_document_word']) train_data = DataLoader(cfg['data_folder'] + cfg['train_data'], train_documents, train_document_entity_indices, train_document_texts, word2id, relation2id, entity2id, cfg['max_query_word'], cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'], cfg['use_inverse_relation'])
def predict(): word_to_ix = load_dict(word_dict_file) tag_to_ix = load_dict(tag_dict_file) ix_to_tag = {v: k for k, v in tag_to_ix.items()} model_file = model_path + 'params.pkl' if os.path.exists(model_file): model.load_state_dict(torch.load(model_file)) for wordss, tagss, lengths in pred_helper.gen_batch(): sentence_in = prepare_sequence(wordss, word_to_ix) predict_scores, predict_ix_seqs = model(sentence_in, lengths) for word, ix in zip(wordss[0], predict_ix_seqs[0]): print(word, ix_to_tag[ix]) print()
def __init__(self, source, target, source_dict, target_dict, batch_size=128, maxlen=None, n_words_source=-1, n_words_target=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20): if shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main( [self.source_orig, self.target_orig], temporary=True) else: self.source = data_utils.fopen(source, 'r') self.target = data_utils.fopen(target, 'r') self.source_dict = load_dict(source_dict) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for key, idx in self.source_dict.items(): if idx >= self.n_words_source: del self.source_dict[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False
def __init__(self, source, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=-1, n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20): if shuffle_each_epoch: shuffle.main([source, target]) self.source = fopen(source+'.shuf', 'r') self.target = fopen(target+'.shuf', 'r') else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size print "K=", self.k self.end_of_data = False
def __init__(self, options, **kwargs): self.src_dict = load_dict(options['dictionaries'][0]) self.trg_dict = load_dict(options['dictionaries'][1]) self.tmp_dir = kwargs['tmp_dir'] self.translate_script = kwargs['translate_script'] self.bleu_script = kwargs['bleu_script'] self.valid_src = kwargs['bleuvalid_src'] self.valid_trg = kwargs['bleuvalid_trg'] self.n_words_src = options['n_words_src'] self.batch_size = 16 self.batches = self.prepare_data() os.system('mkdir -p %s' % self.tmp_dir) self.check_script() # check bleu script self.trg_idict = dict() for k, v in self.trg_dict.iteritems(): self.trg_idict[v] = k
def test():
    entity2id = load_dict(entity2id_file)
    test_data = TypedataLoader(test_file, entity2id)
    my_model = get_model(entity2id)
    test_acc = inference(my_model, test_data, entity2id, log_info=True)
    return test_acc
def __init__(self, sources, target, source_dicts, target_dict, batch_size=128, maxlen=100, n_words_source=[-1], n_words_target=-1, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20): if shuffle_each_epoch: shuffle.main(sources + [target]) self.sources = [fopen(source+'.shuf', 'r') for source in sources] self.target = fopen(target+'.shuf', 'r') else: self.sources = [fopen(source, 'r') for source in sources] self.target = fopen(target, 'r') self.source_dicts = [] for factor_dicts in source_dicts: self.source_dicts.append([load_dict(source_dict) for source_dict in factor_dicts]) self.target_dict = load_dict(target_dict) self.batch_size = batch_size self.maxlen = maxlen self.n_words_source = n_words_source self.n_words_target = n_words_target for i, n_words in enumerate(self.n_words_source): if n_words > 0: for d in self.source_dicts[i]: for key, idx in d.items(): if idx >= n_words: del d[key] if self.n_words_target > 0: for key, idx in self.target_dict.items(): if idx >= self.n_words_target: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffers = [list() for _ in range(len(self.sources))] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False
def __init__(self, config, documents, mode='train'): self.mode = mode self.use_doc = config['use_doc'] self.use_inverse_relation = config['use_inverse_relation'] self.max_query_word = config['max_query_word'] self.max_document_word = config['max_document_word'] self.max_char = config['max_char'] self.documents = documents self.data_file = config['data_folder'] + config['{}_data'.format(mode)] self.batch_size = config['batch_size'] if mode == 'train' else config[ 'batch_size'] self.max_rel_words = config['max_rel_words'] self.type_rels = config['type_rels'] self.fact_drop = config['fact_drop'] # read all data self.data = [] with open(self.data_file) as f: for line in tqdm(list(f)): self.data.append(json.loads(line)) # word and kb vocab self.word2id = load_dict(config['data_folder'] + config['word2id']) self.relation2id = load_dict(config['data_folder'] + config['relation2id']) self.entity2id = load_dict(config['data_folder'] + config['entity2id']) self.id2entity = {i: entity for entity, i in self.entity2id.items()} self.rel_word_idx = np.load(config['data_folder'] + 'rel_word_idx.npy') # for batching self.max_local_entity = 0 # max num of candidates self.max_relevant_docs = 0 # max num of retired documents self.max_kb_neighbors = config[ 'max_num_neighbors'] # max num of neighbors for entity self.max_kb_neighbors_ = config[ 'max_num_neighbors'] # kb relations are directed self.max_linked_entities = 0 # max num of linked entities for each doc self.max_linked_documents = 50 # max num of linked documents for each entity self.num_kb_relation = 2 * len( self.relation2id) if self.use_inverse_relation else len( self.relation2id) # get the batching parameters self.get_stats()
def train(): print("training ...") #prepare data entity2id = load_dict(entity2id_file) train_data = TypedataLoader(train_file, entity2id) dev_data = TypedataLoader(dev_file, entity2id) test_data = TypedataLoader(test_file, entity2id) my_model = get_model(entity2id) trainable_parameters = [ p for p in my_model.parameters() if p.requires_grad ] optimizer = torch.optim.Adam(trainable_parameters, lr=learning_rate) best_dev_acc = 0.0 for i in range(epoch): try: print('epoch', i) my_model.train() train_loss, train_acc = [], [] for iteration in tqdm(range(train_data.num_data // batch_size)): batch = train_data.get_batch(iteration, batch_size) loss, pred, _ = my_model(batch) pred = pred.data.cpu().numpy() acc = cal_type_acc(pred, batch[-1]) train_loss.append(loss.data[0]) train_acc.append(acc) # back propogate my_model.zero_grad() optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm(my_model.parameters(), gradient_clip) optimizer.step() print('avg_training_loss', sum(train_loss) / len(train_loss)) print('avg_training_acc', sum(train_acc) / len(train_acc)) print("validating ...") eval_acc = inference(my_model, dev_data, entity2id) if eval_acc > best_dev_acc and save_model: print("saving model to", save_model) torch.save(my_model.state_dict(), save_model) best_dev_acc = eval_acc except KeyboardInterrupt: break # Test set evaluation print("evaluating on test") print('loading model from ...', test_file) my_model.load_state_dict(torch.load(save_model)) test_acc = inference(my_model, test_data, entity2id, log_info=True) print("test_acc:", test_acc) return test_acc
def test(cfg): entity2id = load_dict(cfg['data_folder'] + cfg['entity2id']) word2id = load_dict(cfg['data_folder'] + cfg['word2id']) relation2id = load_dict(cfg['data_folder'] + cfg['relation2id']) test_documents = load_documents(cfg['data_folder'] + cfg['test_documents']) test_document_entity_indices, test_document_texts = index_document_entities( test_documents, word2id, entity2id, cfg['max_document_word']) test_data = DataLoader(cfg['data_folder'] + cfg['test_data'], test_documents, test_document_entity_indices, test_document_texts, word2id, relation2id, entity2id, cfg['max_query_word'], cfg['max_document_word'], cfg['use_kb'], cfg['use_doc'], cfg['use_inverse_relation']) my_model = get_model(cfg, test_data.num_kb_relation, len(entity2id), len(word2id)).to(device) test_acc = inference(my_model, test_data, entity2id, cfg, log_info=True) return test_acc
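# The KB/QA snippets above call load_dict without a separator and only use the
# result for membership tests, len(...), and id lookups, which is consistent
# with a vocabulary file holding one entry per line, indexed by line number.
# The sketch below assumes that format; the real entity2id / word2id /
# relation2id files may instead store explicit ids, so treat this as a
# hypothetical stand-in rather than the projects' actual loader.
def load_dict_by_line(filename):
    """Hypothetical separator-less loader: each non-empty line holds one
    token, and the token's id is its order of first appearance."""
    token2id = {}
    with open(filename) as f_in:
        for line in f_in:
            token = line.strip()
            if token and token not in token2id:
                token2id[token] = len(token2id)
    return token2id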
def _build_dictionaries(self, source_dic, target_dic): """ Builds and inverts source and target dictionaries, taken from the first model since all of them must have the same vocabulary. """ if source_dic == None or target_dic == None: dictionaries = self._options[0]['dictionaries'] dictionaries_source = dictionaries[:-1] dictionary_target = dictionaries[-1] else: dictionaries_source = [source_dic] dictionary_target = target_dic # load and invert source dictionaries word_dicts = [] word_idicts = [] for dictionary in dictionaries_source: word_dict = load_dict(dictionary) if self._options[0]['n_words_src']: for key, idx in word_dict.items(): if idx >= self._options[0]['n_words_src']: del word_dict[key] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' word_dicts.append(word_dict) word_idicts.append(word_idict) self._word_dicts = word_dicts self._word_idicts = word_idicts # load and invert target dictionary word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' self._word_idict_trg = word_idict_trg
def __init__(self, source, target,
             source_dict, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             shuffle_each_epoch=False,
             sort_by_length=True,
             maxibatch_size=20):
    # shuffle the file order at the start of every epoch
    if shuffle_each_epoch:
        shuffle.main([source, target])
        self.source = fopen(source + '.shuf')
        self.target = fopen(target + '.shuf')
    else:
        self.source = fopen(source)
        self.target = fopen(target)
    self.source_dict = load_dict(source_dict)
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    if self.n_words_source > 0:
        for key, idx in self.source_dict.items():
            if idx >= self.n_words_source:
                del self.source_dict[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def _build_dictionaries(self): """ Builds and inverts source and target dictionaries, taken from the first model since all of them must have the same vocabulary. """ dictionaries = self._options[0]['dictionaries'] dictionaries_source = dictionaries[:-1] dictionary_target = dictionaries[-1] # load and invert source dictionaries word_dicts = [] word_idicts = [] for dictionary in dictionaries_source: word_dict = load_dict(dictionary) # n_words is a list containing the max len of each dictionary if self._options[0]['n_words'][0]: for key, idx in word_dict.items(): if idx >= self._options[0]['n_words'][0]: del word_dict[key] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' word_dicts.append(word_dict) word_idicts.append(word_idict) self._word_dicts = word_dicts self._word_idicts = word_idicts # load and invert target dictionary word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' self._word_idict_trg = word_idict_trg
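# Every translation snippet in this section inverts its dictionaries the same
# way: copy the (token, index) pairs into a new dict, then pin ids 0 and 1 to
# '<eos>' and 'UNK'. The helper below is a small sketch of that shared idiom;
# it uses items(), which works on both Python 2 and 3, where the originals use
# the Python-2-only iteritems().
def invert_dict(word_dict):
    """Invert a token -> index dict to index -> token, reserving the two
    special ids used throughout these snippets."""
    word_idict = {idx: token for token, idx in word_dict.items()}
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'
    return word_idict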
def _determine_vocab_size_from_file(path, plus_one):
    """plus_one leaves room for the UNK token."""
    try:
        d = load_dict(path)
    except IOError as x:
        logging.error('failed to determine vocabulary size from file: '
                      '{}: {}'.format(path, str(x)))
        sys.exit(1)
    except:
        logging.error('failed to determine vocabulary size from file: '
                      '{}'.format(path))
        sys.exit(1)

    return max(d.values()) + 1 if plus_one else max(d.values())
def __init__(self, dict_file=DICT_FILE, schema_file=SCHEMA_FILE): """ init """ #self.logger.info("hook") word_dict = util.load_dict(dict_file) schema_pos, schema_output = util.get_parse_shitu_conf(schema_file) self.word_dict = word_dict self.schema_pos = schema_pos self.schema_output = schema_output dict_size = len(word_dict) schema_pos_size = len(schema_pos) schema_output_size = len(schema_output)
def __init__(self, datasets, dicts, n_words_dicts=None, batch_size=128, maxlen=100, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, factors=1, outputs=1, maxibatch_size=20): if shuffle_each_epoch: self.datasets_orig = datasets self.datasets = shuffle.main(datasets, temporary=True) else: self.datasets = [fopen(fp, 'r') for fp in datasets] self.dicts = [] for dict_ in dicts: self.dicts.append(load_dict(dict_)) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.factors = factors self.outputs = outputs assert len( datasets) == 1 + outputs, 'Datasets and dictionaries mismatch' self.n_words_dicts = n_words_dicts if self.n_words_dicts: for d, max_ in zip(self.dicts, self.n_words_dicts): for key, idx in d.items(): if idx >= max_: del d[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.buffers = [[] for _ in range(len(datasets))] self.k = batch_size * maxibatch_size self.end_of_data = False
def __init__(self, source, source_dicts, batch_size=128, maxlen=100, n_words_source=-1, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, maxibatch_size=20): if shuffle_each_epoch: self.source_orig = source self.source = shuffle.main([self.source_orig], temporary=True) self.source = self.source[0] # ??? print('this had better be a file:', type(self.source)) else: self.source = fopen(source, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict)) self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.n_words_source = n_words_source if self.n_words_source > 0: for d in self.source_dicts: for key, idx in d.items(): if idx >= self.n_words_source: del d[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False
def load_dict_from_model_config(models): import re re.sub(' +', ' ', models) model = models.split(" ")[0] options = [] try: with open('%s.json' % model, 'rb') as f: options.append(json.load(f)) except: with open('%s.pkl' % model, 'rb') as f: options.append(pkl.load(f)) dictionaries = options[0]['dictionaries'] dictionaries_source = dictionaries[:-1] #dictionary_target = dictionaries[-1] word_dict = load_dict(dictionaries_source[0]) if options[0]['n_words_src']: for key, idx in word_dict.items(): if idx >= options[0]['n_words_src']: del word_dict[key] del word_dict['<EOS>'] del word_dict['<UNK>'] return word_dict
def init_prefs(prefs_path): ''' Initializes the default preferences for the stoichiometry program, and loads any preferences that have been saved to the specified file. Returns a dictionary of preferences. ''' # Define default preferences. dprefs = { 'wdir': os.path.abspath('.'), 'autosave_prefs': True, 'delimiter': '\t' } # Check for preferences file; load it if it exists. lprefs = {} if os.path.isfile(prefs_path): lprefs = util.load_dict(prefs_path) # Supply default preferences that are missing. for key in dprefs.keys(): if key not in lprefs: lprefs[key] = dprefs[key] # Return preferences. return lprefs
def parse_args(): parser = argparse.ArgumentParser() data = parser.add_argument_group('data sets; model loading and saving') data.add_argument('--source_dataset', type=str, metavar='PATH', help="parallel training corpus (source)") data.add_argument('--target_dataset', type=str, metavar='PATH', help="parallel training corpus (target)") # parallel training corpus (source and target). Hidden option for backward compatibility data.add_argument('--datasets', type=str, metavar='PATH', nargs=2, help=argparse.SUPPRESS) data.add_argument( '--dictionaries', type=str, required=True, metavar='PATH', nargs="+", help= "network vocabularies (one per source factor, plus target vocabulary)") data.add_argument('--saveFreq', type=int, default=30000, metavar='INT', help="save frequency (default: %(default)s)") data.add_argument('--model', '--saveto', type=str, default='model', metavar='PATH', dest='saveto', help="model file name (default: %(default)s)") data.add_argument( '--reload', type=str, default=None, metavar='PATH', help= "load existing model from this path. Set to \"latest_checkpoint\" to reload the latest checkpoint in the same directory of --saveto" ) data.add_argument( '--no_reload_training_progress', action='store_false', dest='reload_training_progress', help="don't reload training progress (only used if --reload is enabled)" ) data.add_argument( '--summary_dir', type=str, required=False, metavar='PATH', help= "directory for saving summaries (default: same directory as the --saveto file)" ) data.add_argument( '--summaryFreq', type=int, default=0, metavar='INT', help= "Save summaries after INT updates, if 0 do not save summaries (default: %(default)s)" ) network = parser.add_argument_group('network parameters') network.add_argument('--embedding_size', '--dim_word', type=int, default=512, metavar='INT', help="embedding layer size (default: %(default)s)") network.add_argument('--state_size', '--dim', type=int, default=1000, metavar='INT', help="hidden state size (default: %(default)s)") network.add_argument( '--source_vocab_sizes', '--n_words_src', type=int, default=None, nargs='+', metavar='INT', help= "source vocabulary sizes (one per input factor) (default: %(default)s)" ) network.add_argument('--target_vocab_size', '--n_words', type=int, default=-1, metavar='INT', help="target vocabulary size (default: %(default)s)") network.add_argument('--factors', type=int, default=1, metavar='INT', help="number of input factors (default: %(default)s)") network.add_argument( '--dim_per_factor', type=int, default=None, nargs='+', metavar='INT', help= "list of word vector dimensionalities (one per factor): '--dim_per_factor 250 200 50' for total dimensionality of 500 (default: %(default)s)" ) network.add_argument( '--enc_depth', type=int, default=1, metavar='INT', help="number of encoder layers (default: %(default)s)") network.add_argument( '--enc_recurrence_transition_depth', type=int, default=1, metavar='INT', help= "number of GRU transition operations applied in the encoder. Minimum is 1. (Only applies to gru). (default: %(default)s)" ) network.add_argument( '--dec_depth', type=int, default=1, metavar='INT', help="number of decoder layers (default: %(default)s)") network.add_argument( '--dec_base_recurrence_transition_depth', type=int, default=2, metavar='INT', help= "number of GRU transition operations applied in the first layer of the decoder. Minimum is 2. (Only applies to gru_cond). 
(default: %(default)s)" ) network.add_argument( '--dec_high_recurrence_transition_depth', type=int, default=1, metavar='INT', help= "number of GRU transition operations applied in the higher layers of the decoder. Minimum is 1. (Only applies to gru). (default: %(default)s)" ) network.add_argument( '--dec_deep_context', action='store_true', help="pass context vector (from first layer) to deep decoder layers") network.add_argument('--use_dropout', action="store_true", help="use dropout layer (default: %(default)s)") network.add_argument( '--dropout_embedding', type=float, default=0.2, metavar="FLOAT", help= "dropout for input embeddings (0: no dropout) (default: %(default)s)") network.add_argument( '--dropout_hidden', type=float, default=0.2, metavar="FLOAT", help="dropout for hidden layer (0: no dropout) (default: %(default)s)") network.add_argument( '--dropout_source', type=float, default=0.0, metavar="FLOAT", help="dropout source words (0: no dropout) (default: %(default)s)") network.add_argument( '--dropout_target', type=float, default=0.0, metavar="FLOAT", help="dropout target words (0: no dropout) (default: %(default)s)") network.add_argument( '--use_layer_norm', '--layer_normalisation', action="store_true", dest="use_layer_norm", help="Set to use layer normalization in encoder and decoder") network.add_argument( '--tie_encoder_decoder_embeddings', action="store_true", dest="tie_encoder_decoder_embeddings", help= "tie the input embeddings of the encoder and the decoder (first factor only). Source and target vocabulary size must be the same" ) network.add_argument( '--tie_decoder_embeddings', action="store_true", dest="tie_decoder_embeddings", help= "tie the input embeddings of the decoder with the softmax output embeddings" ) network.add_argument( '--output_hidden_activation', type=str, default='tanh', choices=['tanh', 'relu', 'prelu', 'linear'], help= 'activation function in hidden layer of the output network (default: %(default)s)' ) network.add_argument( '--softmax_mixture_size', type=int, default=1, metavar="INT", help="number of softmax components to use (default: %(default)s)") training = parser.add_argument_group('training parameters') training.add_argument( '--maxlen', type=int, default=100, metavar='INT', help= "maximum sequence length for training and validation (default: %(default)s)" ) training.add_argument('--batch_size', type=int, default=80, metavar='INT', help="minibatch size (default: %(default)s)") training.add_argument( '--token_batch_size', type=int, default=0, metavar='INT', help= "minibatch size (expressed in number of source or target tokens). Sentence-level minibatch size will be dynamic. If this is enabled, batch_size only affects sorting by length. (default: %(default)s)" ) training.add_argument( '--max_epochs', type=int, default=5000, metavar='INT', help="maximum number of epochs (default: %(default)s)") training.add_argument( '--finish_after', type=int, default=10000000, metavar='INT', help="maximum number of updates (minibatches) (default: %(default)s)") training.add_argument( '--decay_c', type=float, default=0.0, metavar='FLOAT', help="L2 regularization penalty (default: %(default)s)") training.add_argument( '--map_decay_c', type=float, default=0.0, metavar='FLOAT', help= "MAP-L2 regularization penalty towards original weights (default: %(default)s)" ) training.add_argument( '--prior_model', type=str, metavar='PATH', help= "Prior model for MAP-L2 regularization. Unless using \"--reload\", this will also be used for initialization." 
) training.add_argument( '--clip_c', type=float, default=1.0, metavar='FLOAT', help="gradient clipping threshold (default: %(default)s)") training.add_argument('--learning_rate', '--lrate', type=float, default=0.0001, metavar='FLOAT', help="learning rate (default: %(default)s)") training.add_argument('--label_smoothing', type=float, default=0.0, metavar='FLOAT', help="label smoothing (default: %(default)s)") training.add_argument( '--no_shuffle', action="store_false", dest="shuffle_each_epoch", help="disable shuffling of training data (for each epoch)") training.add_argument( '--keep_train_set_in_memory', action="store_true", help="Keep training dataset lines stores in RAM during training") training.add_argument('--no_sort_by_length', action="store_false", dest="sort_by_length", help='do not sort sentences in maxibatch by length') training.add_argument( '--maxibatch_size', type=int, default=20, metavar='INT', help= 'size of maxibatch (number of minibatches that are sorted by length) (default: %(default)s)' ) training.add_argument('--optimizer', type=str, default="adam", choices=['adam'], help="optimizer (default: %(default)s)") validation = parser.add_argument_group('validation parameters') validation.add_argument( '--valid_source_dataset', type=str, default=None, metavar='PATH', help="source validation corpus (default: %(default)s)") validation.add_argument( '--valid_target_dataset', type=str, default=None, metavar='PATH', help="target validation corpus (default: %(default)s)") # parallel validation corpus (source and target). Hidden option for backward compatibility validation.add_argument('--valid_datasets', type=str, default=None, metavar='PATH', nargs=2, help=argparse.SUPPRESS) validation.add_argument( '--valid_batch_size', type=int, default=80, metavar='INT', help="validation minibatch size (default: %(default)s)") training.add_argument( '--valid_token_batch_size', type=int, default=0, metavar='INT', help= "validation minibatch size (expressed in number of source or target tokens). Sentence-level minibatch size will be dynamic. If this is enabled, valid_batch_size only affects sorting by length. (default: %(default)s)" ) validation.add_argument('--validFreq', type=int, default=10000, metavar='INT', help="validation frequency (default: %(default)s)") validation.add_argument( '--valid_script', type=str, default=None, metavar='PATH', help= "path to script for external validation (default: %(default)s). The script will be passed an argument specifying the path of a file that contains translations of the source validation corpus. It must write a single score to standard output." 
) validation.add_argument( '--patience', type=int, default=10, metavar='INT', help="early stopping patience (default: %(default)s)") validation.add_argument( '--run_validation', action='store_true', help="Compute validation score on validation dataset") display = parser.add_argument_group('display parameters') display.add_argument( '--dispFreq', type=int, default=1000, metavar='INT', help="display loss after INT updates (default: %(default)s)") display.add_argument( '--sampleFreq', type=int, default=10000, metavar='INT', help="display some samples after INT updates (default: %(default)s)") display.add_argument( '--beamFreq', type=int, default=10000, metavar='INT', help= "display some beam_search samples after INT updates (default: %(default)s)" ) display.add_argument('--beam_size', type=int, default=12, metavar='INT', help="size of the beam (default: %(default)s)") translate = parser.add_argument_group('translate parameters') translate.add_argument('--translate_valid', action='store_true', dest='translate_valid', help='Translate source dataset instead of training') translate.add_argument( '--no_normalize', action='store_false', dest='normalize', help="Cost of sentences will not be normalized by length") translate.add_argument('--n_best', action='store_true', dest='n_best', help="Print full beam") translate.add_argument( '--n_threads', type=int, default=5, metavar='INT', help="Number of threads to use for beam search (default: %(default)s)") translate.add_argument( '--translation_maxlen', type=int, default=200, metavar='INT', help= "Maximum length of translation output sentence (default: %(default)s)") config = parser.parse_args() # allow "--datasets" for backward compatibility if config.datasets: if config.source_dataset or config.target_dataset: logging.error( 'argument clash: --datasets is mutually exclusive with --source_dataset and --target_dataset' ) sys.exit(1) else: config.source_dataset = config.datasets[0] config.target_dataset = config.datasets[1] elif not config.source_dataset: logging.error('--source_dataset is required') sys.exit(1) elif not config.target_dataset: logging.error('--target_dataset is required') sys.exit(1) # allow "--valid_datasets" for backward compatibility if config.valid_datasets: if config.valid_source_dataset or config.valid_target_dataset: logging.error( 'argument clash: --valid_datasets is mutually exclusive with --valid_source_dataset and --valid_target_dataset' ) sys.exit(1) else: config.valid_source_dataset = config.valid_datasets[0] config.valid_target_dataset = config.valid_datasets[1] # check factor-related options are consistent if config.dim_per_factor == None: if config.factors == 1: config.dim_per_factor = [config.embedding_size] else: logging.error( 'if using factored input, you must specify \'dim_per_factor\'\n' ) sys.exit(1) if len(config.dim_per_factor) != config.factors: logging.error( 'mismatch between \'--factors\' ({0}) and \'--dim_per_factor\' ({1} entries)\n' .format(config.factors, len(config.dim_per_factor))) sys.exit(1) if sum(config.dim_per_factor) != config.embedding_size: logging.error( 'mismatch between \'--embedding_size\' ({0}) and \'--dim_per_factor\' (sums to {1})\n' .format(config.embedding_size, sum(config.dim_per_factor))) sys.exit(1) if len(config.dictionaries) != config.factors + 1: logging.error( '\'--dictionaries\' must specify one dictionary per source factor and one target dictionary\n' ) sys.exit(1) # determine target_embedding_size if config.tie_encoder_decoder_embeddings: config.target_embedding_size = 
config.dim_per_factor[0] else: config.target_embedding_size = config.embedding_size # set vocabulary sizes vocab_sizes = [] if config.source_vocab_sizes == None: vocab_sizes = [-1] * config.factors elif len(config.source_vocab_sizes) == config.factors: vocab_sizes = config.source_vocab_sizes elif len(config.source_vocab_sizes) < config.factors: num_missing = config.factors - len(config.source_vocab_sizes) vocab_sizes += config.source_vocab_sizes + [-1] * num_missing else: logging.error( 'too many values supplied to \'--source_vocab_sizes\' option (expected one per factor = {0})' .format(config.factors)) sys.exit(1) if config.target_vocab_size == -1: vocab_sizes.append(-1) else: vocab_sizes.append(config.target_vocab_size) # for unspecified vocabulary sizes, determine sizes from vocabulary dictionaries for i, vocab_size in enumerate(vocab_sizes): if vocab_size >= 0: continue try: d = util.load_dict(config.dictionaries[i]) except: logging.error( 'failed to determine vocabulary size from file: {0}'.format( config.dictionaries[i])) vocab_sizes[i] = max(d.values()) + 1 config.source_dicts = config.dictionaries[:-1] config.source_vocab_sizes = vocab_sizes[:-1] config.target_dict = config.dictionaries[-1] config.target_vocab_size = vocab_sizes[-1] # set the model version config.model_version = 0.2 config.theano_compat = False return config
def main(models, source_files, saveto, save_alignment, k=5, normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, a_json=False, print_word_probabilities=False): # load model model_options options = [] for model in args.models: try: with open('%s.json' % model, 'rb') as f: options.append(json.load(f)) except: with open('%s.pkl' % model, 'rb') as f: options.append(pkl.load(f)) #hacks for using old models with missing options if not 'dropout_embedding' in options[-1]: options[-1]['dropout_embedding'] = 0 if 'dropout_hidden' not in options[-1]: options[-1]['dropout_hidden'] = 0 if 'dropout_source' not in options[-1]: options[-1]['dropout_source'] = 0 if 'dropout_target' not in options[-1]: options[-1]['dropout_target'] = 0 if 'factors' not in options[-1]: options[-1]['factors'] = 1 if 'dim_per_factor' not in options[-1]: options[-1]['dim_per_factor'] = [options[-1]['dim_word']] dictionaries = options[0]['dictionaries'] dictionaries_sources = dictionaries[:-1] print >> sys.stderr, "SRC DICT:", dictionaries_sources dictionary_target = dictionaries[-1] print >> sys.stderr, "TRG DICT:", dictionary_target encoders_word_dicts = [] encoders_word_idicts = [] for dictionaries_source in dictionaries_sources: # load source dictionary and invert word_dicts = [] word_idicts = [] for dictionary in dictionaries_source: word_dict = load_dict(dictionary) if options[0]['n_words_src']: for key, idx in word_dict.items(): if idx >= options[0]['n_words_src']: del word_dict[key] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' word_dicts.append(word_dict) word_idicts.append(word_idict) encoders_word_dicts.append(word_dicts) encoders_word_idicts.append(word_idicts) # load target dictionary and invert word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # create input and output queues for processes queue = Queue() rqueue = Queue() processes = [None] * n_process for midx in xrange(n_process): processes[midx] = Process( target=translate_model, args=(queue, rqueue, midx, models, options, k, normalize, verbose, nbest, save_alignment is not None, suppress_unk)) processes[midx].start() # utility function def _seqs2words(cc): ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) return ' '.join(ww) def _send_jobs(files): source_sentences = [] for idx, lines in enumerate(zip(*files)): # print lines enc_idx = 0; xs = [] enc_words = [] for src_idx, line in enumerate(lines): if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = [] for w in words: w = [encoders_word_dicts[src_idx][i][f] if f in word_dicts[i] else 1 for (i, f) in enumerate(w.split('|'))] if len(w) != options[0]['factors'][enc_idx]: sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'][enc_idx], len(w))) for midx in xrange(n_process): processes[midx].terminate() sys.exit(1) x.append(w) x += [ [0] * options[0]['factors'][enc_idx] ] # print "X:", x xs.append(x) enc_words.append(words) # print "XS:", xs # new_xs = [ [xs[j][i] for j in range(len(xs)) ] for i in range(len(xs[0])) ] # print "NEW XS:", new_xs queue.put((idx, xs)) source_sentences.append(enc_words) enc_idx += 1 return idx+1, source_sentences def _finish_processes(): for midx in xrange(n_process): queue.put(None) def _retrieve_jobs(n_samples): trans = [None] * 
n_samples out_idx = 0 for idx in xrange(n_samples): resp = rqueue.get() trans[resp[0]] = resp[1] if verbose and numpy.mod(idx, 10) == 0: sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples)) while out_idx < n_samples and trans[out_idx] != None: yield trans[out_idx] out_idx += 1 sys.stderr.write('Translating {0} ...\n'.format(':'.join([f.name for f in source_files]))) n_samples, source_sentences = _send_jobs(source_files) _finish_processes() for i, trans in enumerate(_retrieve_jobs(n_samples)): if nbest: samples, scores, word_probs, alignment = trans order = numpy.argsort(scores) for j in order: saveto.write('{0} ||| {1} ||| {2}\n'.format(i, _seqs2words(samples[j]), scores[j])) # print alignment matrix for each hypothesis # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos if save_alignment is not None: if a_json: print_matrix_json(alignment[j], source_sentences[i], _seqs2words(samples[j]).split(), i, i+j,save_alignment) else: save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format( i, _seqs2words(samples[j]), scores[j], ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(samples[j]))) print_matrix(alignment[j], save_alignment) else: samples, scores, word_probs, alignment = trans saveto.write(_seqs2words(samples) + "\n") if print_word_probabilities: for prob in word_probs: saveto.write("{} ".format(prob)) saveto.write('\n') if save_alignment is not None: if a_json: print_matrix_json(trans[1], source_sentences[i], _seqs2words(trans[0]).split(), i, i,save_alignment) else: save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format( i, _seqs2words(trans[0]), 0, ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(trans[0]))) print_matrix(trans[3], save_alignment) sys.stderr.write('Done\n')
def main(models, source_file, saveto, k=5, normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False): # load model model_options options = [] for model in args.models: try: with open('%s.json' % model, 'rb') as f: options.append(json.load(f)) except: with open('%s.pkl' % model, 'rb') as f: options.append(pkl.load(f)) #hacks for using old models with missing options if not 'dropout_embedding' in options[-1]: options[-1]['dropout_embedding'] = 0 if not 'dropout_hidden' in options[-1]: options[-1]['dropout_hidden'] = 0 if not 'dropout_source' in options[-1]: options[-1]['dropout_source'] = 0 if not 'dropout_target' in options[-1]: options[-1]['dropout_target'] = 0 dictionary, dictionary_target = options[0]['dictionaries'] # load source dictionary and invert word_dict = load_dict(dictionary) word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # load target dictionary and invert word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # create input and output queues for processes queue = Queue() rqueue = Queue() processes = [None] * n_process for midx in xrange(n_process): processes[midx] = Process( target=translate_model, args=(queue, rqueue, midx, models, options, k, normalize, verbose, nbest, suppress_unk)) processes[midx].start() # utility function def _seqs2words(cc): ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) return ' '.join(ww) def _send_jobs(f): for idx, line in enumerate(f): if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = map(lambda w: word_dict[w] if w in word_dict else 1, words) x = map(lambda ii: ii if ii < options[0]['n_words_src'] else 1, x) x += [0] queue.put((idx, x)) return idx+1 def _finish_processes(): for midx in xrange(n_process): queue.put(None) def _retrieve_jobs(n_samples): trans = [None] * n_samples out_idx = 0 for idx in xrange(n_samples): resp = rqueue.get() trans[resp[0]] = resp[1] if verbose and numpy.mod(idx, 10) == 0: sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples)) while out_idx < n_samples and trans[out_idx] != None: yield trans[out_idx] out_idx += 1 sys.stderr.write('Translating {0} ...\n'.format(source_file.name)) n_samples = _send_jobs(source_file) _finish_processes() for i, trans in enumerate(_retrieve_jobs(n_samples)): if nbest: samples, scores = trans order = numpy.argsort(scores) for j in order: saveto.write('{0} ||| {1} ||| {2}\n'.format(i, _seqs2words(samples[j]), scores[j])) else: saveto.write(_seqs2words(trans) + '\n') sys.stderr.write('Done\n')
def __init__(self, source, target, source_dicts, target_dict, model_type, batch_size=128, maxlen=100, source_vocab_sizes=None, target_vocab_size=None, skip_empty=False, shuffle_each_epoch=False, sort_by_length=True, use_factor=False, maxibatch_size=20, token_batch_size=0, keep_data_in_memory=False): if keep_data_in_memory: self.source, self.target = FileWrapper(source), FileWrapper(target) if shuffle_each_epoch: r = numpy.random.permutation(len(self.source)) self.source.shuffle_lines(r) self.target.shuffle_lines(r) elif shuffle_each_epoch: self.source_orig = source self.target_orig = target self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True) else: self.source = fopen(source, 'r') self.target = fopen(target, 'r') self.source_dicts = [] for source_dict in source_dicts: self.source_dicts.append(load_dict(source_dict, model_type)) self.target_dict = load_dict(target_dict, model_type) # Determine the UNK value for each dictionary (the value depends on # which version of build_dictionary.py was used). def determine_unk_val(d): if '<UNK>' in d and d['<UNK>'] == 2: return 2 return 1 self.source_unk_vals = [determine_unk_val(d) for d in self.source_dicts] self.target_unk_val = determine_unk_val(self.target_dict) self.keep_data_in_memory = keep_data_in_memory self.batch_size = batch_size self.maxlen = maxlen self.skip_empty = skip_empty self.use_factor = use_factor self.source_vocab_sizes = source_vocab_sizes self.target_vocab_size = target_vocab_size self.token_batch_size = token_batch_size if self.source_vocab_sizes != None: assert len(self.source_vocab_sizes) == len(self.source_dicts) for d, vocab_size in zip(self.source_dicts, self.source_vocab_sizes): if vocab_size != None and vocab_size > 0: for key, idx in list(d.items()): if idx >= vocab_size: del d[key] if self.target_vocab_size != None and self.target_vocab_size > 0: for key, idx in list(self.target_dict.items()): if idx >= self.target_vocab_size: del self.target_dict[key] self.shuffle = shuffle_each_epoch self.sort_by_length = sort_by_length self.source_buffer = [] self.target_buffer = [] self.k = batch_size * maxibatch_size self.end_of_data = False
def main(models, source_file, saveto, save_alignment=None, k=5, normalize=False, n_process=5, chr_level=False, verbose=False, nbest=False, suppress_unk=False, a_json=False, print_word_probabilities=False, return_hyp_graph=False): # load model model_options options = [] for model in models: options.append(load_config(model)) fill_options(options[-1]) dictionaries = options[0]['dictionaries'] dictionaries_source = dictionaries[:-1] dictionary_target = dictionaries[-1] # load source dictionary and invert word_dicts = [] word_idicts = [] for dictionary in dictionaries_source: word_dict = load_dict(dictionary) if options[0]['n_words_src']: for key, idx in word_dict.items(): if idx >= options[0]['n_words_src']: del word_dict[key] word_idict = dict() for kk, vv in word_dict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' word_dicts.append(word_dict) word_idicts.append(word_idict) # load target dictionary and invert word_dict_trg = load_dict(dictionary_target) word_idict_trg = dict() for kk, vv in word_dict_trg.iteritems(): word_idict_trg[vv] = kk word_idict_trg[0] = '<eos>' word_idict_trg[1] = 'UNK' # create input and output queues for processes queue = Queue() rqueue = Queue() processes = [None] * n_process for midx in xrange(n_process): processes[midx] = Process( target=translate_model, args=(queue, rqueue, midx, models, options, k, normalize, verbose, nbest, save_alignment is not None, suppress_unk, return_hyp_graph)) processes[midx].start() # utility function def _seqs2words(cc): ww = [] for w in cc: if w == 0: break ww.append(word_idict_trg[w]) return ' '.join(ww) def _send_jobs(f): source_sentences = [] for idx, line in enumerate(f): if chr_level: words = list(line.decode('utf-8').strip()) else: words = line.strip().split() x = [] for w in words: w = [word_dicts[i][f] if f in word_dicts[i] else 1 for (i,f) in enumerate(w.split('|'))] if len(w) != options[0]['factors']: sys.stderr.write('Error: expected {0} factors, but input word has {1}\n'.format(options[0]['factors'], len(w))) for midx in xrange(n_process): processes[midx].terminate() sys.exit(1) x.append(w) x += [[0]*options[0]['factors']] queue.put((idx, x)) source_sentences.append(words) return idx+1, source_sentences def _finish_processes(): for midx in xrange(n_process): queue.put(None) def _retrieve_jobs(n_samples): trans = [None] * n_samples out_idx = 0 for idx in xrange(n_samples): resp = rqueue.get() trans[resp[0]] = resp[1] if verbose and numpy.mod(idx, 10) == 0: sys.stderr.write('Sample {0} / {1} Done\n'.format((idx+1), n_samples)) while out_idx < n_samples and trans[out_idx] != None: yield trans[out_idx] out_idx += 1 sys.stderr.write('Translating {0} ...\n'.format(source_file.name)) n_samples, source_sentences = _send_jobs(source_file) _finish_processes() for i, trans in enumerate(_retrieve_jobs(n_samples)): if nbest: samples, scores, word_probs, alignment, hyp_graph = trans if return_hyp_graph: renderer = HypGraphRenderer(hyp_graph) renderer.wordify(word_idict_trg) renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True) order = numpy.argsort(scores) for j in order: if print_word_probabilities: probs = " ||| " + " ".join("{0}".format(prob) for prob in word_probs[j]) else: probs = "" saveto.write('{0} ||| {1} ||| {2}{3}\n'.format(i, _seqs2words(samples[j]), scores[j], probs)) # print alignment matrix for each hypothesis # header: sentence id ||| translation ||| score ||| source ||| source_token_count+eos translation_token_count+eos if save_alignment is not None: if a_json: 
print_matrix_json(alignment[j], source_sentences[i], _seqs2words(samples[j]).split(), i, i+j,save_alignment) else: save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format( i, _seqs2words(samples[j]), scores[j], ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(samples[j]))) print_matrix(alignment[j], save_alignment) else: samples, scores, word_probs, alignment, hyp_graph = trans if return_hyp_graph: renderer = HypGraphRenderer(hyp_graph) renderer.wordify(word_idict_trg) renderer.save_png(return_hyp_graph, detailed=True, highlight_best=True) saveto.write(_seqs2words(samples) + "\n") if print_word_probabilities: for prob in word_probs: saveto.write("{} ".format(prob)) saveto.write('\n') if save_alignment is not None: if a_json: print_matrix_json(alignment, source_sentences[i], _seqs2words(trans[0]).split(), i, i,save_alignment) else: save_alignment.write('{0} ||| {1} ||| {2} ||| {3} ||| {4} {5}\n'.format( i, _seqs2words(trans[0]), 0, ' '.join(source_sentences[i]) , len(source_sentences[i])+1, len(trans[0]))) print_matrix(alignment, save_alignment) sys.stderr.write('Done\n')
def __init__(self, source, target,
             source_dicts, target_dict,
             batch_size=128,
             maxlen=100,
             n_words_source=-1,
             n_words_target=-1,
             skip_empty=False,
             shuffle_each_epoch=False,
             sort_by_length=True,
             use_factor=False,
             maxibatch_size=20):
    if shuffle_each_epoch:
        self.source_orig = source
        self.target_orig = target
        self.source, self.target = shuffle.main([self.source_orig, self.target_orig], temporary=True)
    else:
        self.source = fopen(source, 'r')
        self.target = fopen(target, 'r')

    print 'scanning the dataset.'
    for si, _ in enumerate(self.source):
        pass
    for ti, _ in enumerate(self.target):
        pass
    self.source.seek(0)
    self.target.seek(0)
    assert si == ti, 'the source and target documents must have the same number of lines'
    print 'scanned {} lines'.format(si)

    self.source_dicts = []
    for source_dict in source_dicts:
        self.source_dicts.append(load_dict(source_dict))
    self.target_dict = load_dict(target_dict)

    self.batch_size = batch_size
    self.maxlen = maxlen
    self.skip_empty = skip_empty
    self.use_factor = use_factor
    self.n_words_source = n_words_source
    self.n_words_target = n_words_target

    if self.n_words_source > 0:
        for d in self.source_dicts:
            for key, idx in d.items():
                if idx >= self.n_words_source:
                    del d[key]
    if self.n_words_target > 0:
        for key, idx in self.target_dict.items():
            if idx >= self.n_words_target:
                del self.target_dict[key]

    self.shuffle = shuffle_each_epoch
    self.sort_by_length = sort_by_length

    self.source_buffer = []
    self.target_buffer = []
    self.k = batch_size * maxibatch_size

    self.end_of_data = False
def load_alias(self):
    '''load an alias dict'''
    self.alias_st = util.load_dict(self.filename_alias_st)
    self.alias_bigram = util.load_dict(self.filename_alias_bigram)