def train_and_test_core(self, testing_ids, testing_indices):
    if self.use_hs == 0:
        hs_list = [0]
    elif self.use_hs == 1:
        hs_list = [1]
    elif self.use_hs == 2:
        hs_list = [0, 1]
    train_times = []
    test_times = []
    acc = []
    for use_hs in hs_list:
        testing_ids = np.sort(testing_ids)
        training_ids = np.delete(self.totalN, testing_ids)

        # Train a model based on training data
        training_start_time = time.time()
        train_sentences_dup = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'certain-observation', self.num_train,
            self.beam_size, training_ids, distr_sz=self.distr_sz)
        # Report the plan error rate (PER) statistics of the training data.
        print "PER: ", train_sentences_dup.get_sequence_PER([2])
        model_dup = models.UncertainWord2Vec(
            uncertSentences=train_sentences_dup, uncertainTrain=False,
            min_count=1, sg=1, workers=8, hs=use_hs, negative=0,
            window=self.winSz, iter=self.iter, sample=0, model=3)
        model_dup.save(self.domain + '/naive_model' + '.txt')
        train_time = time.time() - training_start_time
        # print("--- training model %s needs %.2f seconds ---\n" % (self.__class__.__name__, train_time))

        plans = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'model2', self.num_train,
            self.beam_size, testing_ids, distr_sz=self.distr_sz)  # Testing UDUP
        GP = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'ground-truth', self.num_train,
            self.beam_size, testing_ids, distr_sz=self.distr_sz)
        GdPlans = [q for q in GP]
        NP = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'certain-observation', self.num_train,
            self.beam_size, testing_ids, distr_sz=self.distr_sz)
        NoisyPlans = [p for p in NP]

        actions_dup = model_dup.wv.vocab.keys()
        vocab_size_dup = len(actions_dup)
        correct_dup = 0
        total = 0
        print "GM Testing : Running on data %s with H-Softmax = %d" % (
            self.folderName, use_hs)
        testing_start_time = time.time()
        for itr, plan in enumerate(plans):
            plan_grd = zip(*GdPlans[itr])[0]
            plan_dup = list(zip(*NoisyPlans[itr])[0])
            blank_count, indices, incomplete_plan_dup = obtain_incomplete_plan(
                self.mode, self.missing, plan,
                testing_indices[testing_ids[itr]])
            total += blank_count

            # Testing DUP
            weights_dup = np.zeros(vocab_size_dup * blank_count).reshape(
                vocab_size_dup, blank_count)
            random_indices_dup = random.sample(range(vocab_size_dup),
                                               blank_count)
            # Randomly fill each blank with a word from the vocabulary.
            for order in range(blank_count):
                blank_index = indices[order]
                random_word_dup = actions_dup[random_indices_dup[order]]
                incomplete_plan_dup[blank_index] = random_word_dup
            best_plan_args_dup, weights_dup = EMTesting(
                weights_dup, self.winSz, indices, incomplete_plan_dup,
                blank_count, vocab_size_dup, actions_dup, self.topk,
                model_dup, 'uw2v2')
            for blank_order in range(blank_count):
                blank_index = indices[blank_order]
                for sample_index in best_plan_args_dup[:, blank_order]:
                    if actions_dup[sample_index] == plan_grd[blank_index]:
                        correct_dup += 1
                        break
        test_time = time.time() - testing_start_time
        train_times.append(np.round(train_time, 2))
        test_times.append(np.round(test_time, 2))
        acc.append(np.round(float(correct_dup) / total, 4) * 100)
        total = 0
        correct_dup = 0
        # print("--- testing model %s needs %.2f seconds ---\n" % (self.__class__.__name__, test_time))
    return acc, train_times, test_times
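# obtain_incomplete_plan is used as a black box above. For readability,
# here is a minimal sketch of the contract this code assumes: blank out
# `missing` action slots in a plan and return (blank_count, blank
# indices, the mutated plan). The helper name and body below are
# illustrative stand-ins, NOT the project's implementation; the real
# helper also takes `mode` and the per-plan testing indices into account.
def _sketch_obtain_incomplete_plan(missing, plan):
    indices = sorted(random.sample(range(len(plan)), int(missing)))
    incomplete_plan = list(plan)
    for blank_index in indices:
        incomplete_plan[blank_index] = None  # unknown-action placeholder
    return len(indices), indices, incomplete_plan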
def train_and_test_core_bp(self, testing_ids):
    testing_ids = np.sort(testing_ids)
    training_ids = np.delete(self.totalN, testing_ids)

    # Train a model based on training data
    if not (self.cvSplit > 1) and self.shouldTrain == True:
        if train_uw2v == True:
            sentences_uw2v = models.uncertainWord2vec.DataLoader(
                self.files, 'all', self.num_train, self.beam_size)
            model_uw2v = models.UncertainWord2Vec(
                uncertSentences=sentences_uw2v, min_count=1, sg=1,
                workers=8, hs=1, window=self.winSz, iter=self.iter)
            model_uw2v.save(self.domain + '/model_uw2v' + '.txt')
        sentences_dup = models.uncertainWord2vec.DataLoader(
            self.files, 'all', self.num_train, self.beam_size)
        model_dup = models.UncertainWord2Vec(
            uncertSentences=sentences_dup, uncertainTrain=False,
            min_count=1, sg=1, workers=8, hs=1, window=self.winSz,
            iter=self.iter, sample=0)
        # sg=0: CBOW, sg=1: skip-gram; default workers=4
        model_dup.save(self.domain + '/model_dup' + '.txt')
    elif not (self.cvSplit > 1):
        # OR load a saved model
        model_uw2v = models.uncertainWord2vec.UncertainWord2Vec.load(
            self.domain + '/model_uw2v' + '.txt')
        model_dup = models.Word2Vec.load(self.domain + '/model_dup' + '.txt')
    else:
        train_sentences_uw2v = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'model1', self.num_train,
            self.beam_size, training_ids, resample=True)
        if train_uw2v == True:
            model_uw2v = models.UncertainWord2Vec(
                uncertSentences=train_sentences_uw2v, uncertainTrain=False,
                min_count=1, sg=1, workers=8, hs=1, window=self.winSz,
                iter=20, sample=0, model=1)
            model_uw2v.save(self.domain + '/model_BM4' + '.txt')
    print "Training : COMPLETE!"

    GdPlans = models.uncertainWord2vec.DataLoader(
        self.domain, self.files, 'ground-truth', self.num_train,
        self.beam_size, testing_ids)
    NP = models.uncertainWord2vec.DataLoader(
        self.domain, self.files, 'certain-observation', self.num_train,
        self.beam_size, testing_ids)
    NoisyPlans = [p for p in NP]
    if train_uw2v == True:
        actions_uw2v = model_uw2v.wv.vocab.keys()
        vocab_size_uw2v = len(actions_uw2v)
        correct_uw2v = 0
    total = 0
    print "Testing : RUNNING . . ."
    for itr, plan_grd in enumerate(GdPlans):
        plan_grd = zip(*plan_grd)[0]
        plan_dup = list(zip(*NoisyPlans[itr])[0])
        blank_count, indices, incomplete_plan_dup = obtain_incomplete_plan(
            self.mode, self.missing, plan_dup)
        total += blank_count
        if train_uw2v == True:
            plans = models.uncertainWord2vec.DataLoader(
                self.files, 'model1', self.num_train, self.beam_size,
                testing_ids[itr])
            # Testing UDUP
            best_plan_args_uw2v_array = np.empty(int(self.missing),
                                                 dtype=np.int)
            for id, p in enumerate(plans):
                plan, confidence = zip(*p[1])
                plan = list(plan)
                blank_count, indices, incomplete_plan_udup = obtain_incomplete_plan(
                    self.mode, self.missing, plan, indices, blank_count)
                # Compute fake reliabilities for plans with missing actions
                weights_uw2v = np.zeros(
                    vocab_size_uw2v * blank_count).reshape(
                        vocab_size_uw2v, blank_count)
                random_indices_uw2v = random.sample(range(vocab_size_uw2v),
                                                    blank_count)
                # Randomly fill each blank with a word from the vocabulary.
                for order in range(blank_count):
                    blank_index = indices[order]
                    if train_uw2v == True:
                        random_word_uw2v = actions_uw2v[
                            random_indices_uw2v[order]]
                        incomplete_plan_udup[blank_index] = random_word_uw2v
                best_plan_args_uw2v, weights_uw2v = EMTesting(
                    weights_uw2v, self.winSz, indices, incomplete_plan_udup,
                    blank_count, vocab_size_uw2v, actions_uw2v, self.topk,
                    model_uw2v, 'dup')
                for k in range(self.topk):
                    best_plan_args_uw2v_array = np.column_stack(
                        (best_plan_args_uw2v_array,
                         best_plan_args_uw2v[k, :]))
        if train_uw2v == True:
            # Drop the uninitialized first column of the vote matrix.
            best_plan_args_uw2v_array = best_plan_args_uw2v_array[:, 1:]
            ans_uw2v = []
            for s in best_plan_args_uw2v_array:
                topk_items = Counter(s)
                ans_uw2v.append(zip(*topk_items.most_common(self.topk))[0])
            for blank_order in range(blank_count):
                blank_index = indices[blank_order]
                for sample_index in ans_uw2v[blank_order]:
                    if actions_uw2v[sample_index] == plan_grd[blank_index]:
                        correct_uw2v += 1
                        break
    return correct_uw2v, total
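# The UDUP aggregation above stacks the per-sample top-k predictions
# column by column, then majority-votes per blank with
# collections.Counter. A standalone sketch of that voting step, on
# hypothetical toy data (each row holds one blank's votes across the
# sampled plans):
def _sketch_topk_vote(stacked_args, topk):
    ans = []
    for votes in stacked_args:        # one row of vocab indices per blank
        counts = Counter(votes)       # tally the votes over all samples
        ans.append(zip(*counts.most_common(topk))[0])
    return ans

# Example: _sketch_topk_vote(np.array([[3, 3, 7], [1, 2, 2]]), 1)
# returns [(3,), (2,)] -- index 3 wins blank 0, index 2 wins blank 1.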
def train_and_test(gen_args):
    '''
    Train a model on the training data, then test the model's accuracy
    on the testing data. Since training is time consuming, we save the
    model and load it later for further testing.
    '''
    domain, shouldTrain, cvSplit, iter, topk, mode, missing, biasWin, \
        num_train, winSz, beam_size = gen_args
    dir = os.path.dirname(__file__)
    folder = os.path.join(dir, domain, 'all35noisy') + '/'
    files = glob.glob(folder + '*.h5')
    with h5py.File(files[0], 'r') as h5file:
        data_size = len(h5file['UncertainData'])
    if num_train is None:
        totalN = np.arange(data_size)
    else:
        totalN = np.arange(num_train)
    shuffle_totalN = np.random.permutation(totalN)
    # if cvSplit > 1:
    RandomTotal = np.split(shuffle_totalN, cvSplit)
    # else:
    #     RandomTotal = [totalN[:np.around(data_size * 0.2)]]
    corrCV_dup = np.array([], dtype=np.float32)
    corrCV_uw2v = np.array([], dtype=np.float32)
    for i, testing_ids in enumerate(RandomTotal):
        testing_ids = np.sort(testing_ids)
        training_ids = np.delete(totalN, testing_ids)

        # Train a model based on training data
        if not (cvSplit > 1) and shouldTrain == True:
            if train_uw2v == True:
                sentences_uw2v = models.uncertainWord2vec.DataLoader(
                    files, 'all', num_train, beam_size)
                model_uw2v = models.UncertainWord2Vec(
                    uncertSentences=sentences_uw2v, min_count=1, sg=1,
                    workers=8, hs=1, window=winSz, iter=iter)
                model_uw2v.save(domain + '/model_uw2v' + '.txt')
            sentences_dup = models.uncertainWord2vec.DataLoader(
                files, 'all', num_train, beam_size)
            model_dup = models.UncertainWord2Vec(
                uncertSentences=sentences_dup, uncertainTrain=False,
                min_count=1, sg=1, workers=8, hs=1, window=winSz,
                iter=iter, sample=0)
            # sg=0: CBOW, sg=1: skip-gram; default workers=4
            model_dup.save(domain + '/model_dup' + '.txt')
        elif not (cvSplit > 1):
            # OR load a saved model
            model_uw2v = models.uncertainWord2vec.UncertainWord2Vec.load(
                domain + '/model_uw2v' + '.txt')
            model_dup = models.Word2Vec.load(domain + '/model_dup' + '.txt')
        else:
            train_sentences_uw2v = models.uncertainWord2vec.DataLoader(
                files, 'model1', num_train, beam_size, training_ids,
                resample=True)
            if train_uw2v == True:
                model_uw2v = models.UncertainWord2Vec(
                    uncertSentences=train_sentences_uw2v,
                    uncertainTrain=False, min_count=1, sg=1, workers=8,
                    hs=1, window=winSz, iter=20, sample=0, model=1)
                model_uw2v.save(domain + '/model_uw2v' + '.txt')
            train_sentences_dup = models.uncertainWord2vec.DataLoader(
                files, 'certain-observation', num_train, beam_size,
                training_ids)
            model_dup = models.UncertainWord2Vec(
                uncertSentences=train_sentences_dup, uncertainTrain=False,
                min_count=1, sg=1, workers=8, hs=1, window=winSz,
                iter=20, sample=0, model=3)
            model_dup.save(domain + '/model_dup' + '.txt')
        print "Training : COMPLETE!"

        GdPlans = models.uncertainWord2vec.DataLoader(
            files, 'ground-truth', num_train, beam_size, testing_ids)
        NP = models.uncertainWord2vec.DataLoader(
            files, 'certain-observation', num_train, beam_size, testing_ids)
        NoisyPlans = [p for p in NP]
        if train_uw2v == True:
            actions_uw2v = model_uw2v.wv.vocab.keys()
            vocab_size_uw2v = len(actions_uw2v)
            correct_uw2v = 0
        actions_dup = model_dup.wv.vocab.keys()
        vocab_size_dup = len(actions_dup)
        correct_dup = 0
        total = 0
        print "Testing : RUNNING . . ."
        for itr, plan_grd in enumerate(GdPlans):
            plan_grd = zip(*plan_grd)[0]
            plan_dup = list(zip(*NoisyPlans[itr])[0])
            blank_count, indices, incomplete_plan_dup = obtain_incomplete_plan(
                mode, missing, plan_dup)
            total += blank_count

            # Testing DUP
            weights_dup = np.zeros(vocab_size_dup * blank_count).reshape(
                vocab_size_dup, blank_count)
            random_indices_dup = random.sample(range(vocab_size_dup),
                                               blank_count)
            # Randomly fill each blank with a word from the vocabulary.
            for order in range(blank_count):
                blank_index = indices[order]
                random_word_dup = actions_dup[random_indices_dup[order]]
                incomplete_plan_dup[blank_index] = random_word_dup
            best_plan_args_dup, weights_dup = EMTesting(
                weights_dup, winSz, indices, incomplete_plan_dup,
                blank_count, vocab_size_dup, actions_dup, topk, model_dup,
                'dup')

            if train_uw2v == True:
                plans = models.uncertainWord2vec.DataLoader(
                    files, 'model1', num_train, beam_size, testing_ids[itr])
                # Testing UDUP
                best_plan_args_uw2v_array = np.empty(int(missing),
                                                     dtype=np.int)
                for id, p in enumerate(plans):
                    lambda_i = p[0]
                    plan, confidence = zip(*p[1])
                    confidence = np.array(confidence).astype(np.float)
                    plan = list(plan)
                    blank_count, indices, incomplete_plan_udup = obtain_incomplete_plan(
                        mode, missing, plan, indices, blank_count)
                    # Compute fake reliabilities for plans with missing actions
                    lambda_i, new_confidence = compute_fake_confidence_reliability(
                        lambda_i, indices, confidence)
                    weights_uw2v = np.zeros(
                        vocab_size_uw2v * blank_count).reshape(
                            vocab_size_uw2v, blank_count)
                    random_indices_uw2v = random.sample(
                        range(vocab_size_uw2v), blank_count)
                    # Randomly fill each blank with a word from the vocabulary.
                    for order in range(blank_count):
                        blank_index = indices[order]
                        if train_uw2v == True:
                            random_word_uw2v = actions_uw2v[
                                random_indices_uw2v[order]]
                            incomplete_plan_udup[blank_index] = random_word_uw2v
                    best_plan_args_uw2v, weights_uw2v = EMTesting(
                        weights_uw2v, winSz, indices, incomplete_plan_udup,
                        blank_count, vocab_size_uw2v, actions_uw2v, topk,
                        model_uw2v, 'dup', lambda_i=lambda_i,
                        confidence=new_confidence)
                    for k in range(topk):
                        best_plan_args_uw2v_array = np.column_stack(
                            (best_plan_args_uw2v_array,
                             best_plan_args_uw2v[k, :]))
            if train_uw2v == True:
                # Drop the uninitialized first column of the vote matrix.
                best_plan_args_uw2v_array = best_plan_args_uw2v_array[:, 1:]
                ans_uw2v = []
                for s in best_plan_args_uw2v_array:
                    topk_items = Counter(s)
                    ans_uw2v.append(zip(*topk_items.most_common(topk))[0])
                for blank_order in range(blank_count):
                    blank_index = indices[blank_order]
                    for sample_index in ans_uw2v[blank_order]:
                        if actions_uw2v[sample_index] == plan_grd[blank_index]:
                            correct_uw2v += 1
                            break
            for blank_order in range(blank_count):
                blank_index = indices[blank_order]
                for sample_index in best_plan_args_dup[:, blank_order]:
                    if actions_dup[sample_index] == plan_grd[blank_index]:
                        correct_dup += 1
                        break
            # Print progress at fixed intervals.
            # if (itr * 100) / len(list_of_actions) % 10 == 0:
            #     sys.stdout.write("\rProgress: %s %%" % str((itr * 100) / len(list_of_actions)))
            #     sys.stdout.flush()
        if train_uw2v == True:
            print "UW2V Correct Predictions: %d, accuracy: %0.2f%% for DATA_CV %i\n" % (
                correct_uw2v, correct_uw2v * 100.0 / total, i)
            corrCV_uw2v = np.append(corrCV_uw2v, correct_uw2v)
        print "DUP Correct Predictions: %d, accuracy: %0.2f%% for DATA_CV %i\n" % (
            correct_dup, correct_dup * 100.0 / total, i)
        corrCV_dup = np.append(corrCV_dup, correct_dup)
        if not (cvSplit > 1):
            break
    sys.stdout.write("\r\rTesting : COMPLETE!\n")
    sys.stdout.flush()
    # print "\nUnknown actions: %s; Correct predictions: %s" % (str(total), str(correct))
    # print "Set Accuracy: %s\n" % str(float(correct * 100) / total)
    # return total, correct_uw2v, correct_dup
    return total, np.mean(corrCV_uw2v), np.mean(corrCV_dup)
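# A hypothetical driver showing the layout train_and_test expects for
# gen_args; the unpack order must match the first line of the function.
# All values below are illustrative placeholders, not the settings used
# in the experiments.
def _example_train_and_test(domain='blocks'):
    gen_args = (domain,    # domain folder containing all35noisy/*.h5
                True,      # shouldTrain: train rather than load a model
                5,         # cvSplit: number of cross-validation folds
                20,        # iter: training epochs
                3,         # topk: candidates kept per blank
                'random',  # mode: blanking mode (assumed value)
                2,         # missing: blanks per plan
                0,         # biasWin
                None,      # num_train: None uses the whole data set
                2,         # winSz: skip-gram context window
                10)        # beam_size
    return train_and_test(gen_args)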
def train_and_test_core(self, testing_ids, testing_indices):
    if self.use_hs == 0:
        hs_list = [0]
    elif self.use_hs == 1:
        hs_list = [1]
    elif self.use_hs == 2:
        hs_list = [0, 1]
    train_times = []
    test_times = []
    acc = []
    for use_hs in hs_list:
        testing_ids = np.sort(testing_ids)
        training_ids = np.delete(self.totalN, testing_ids)

        # Train a model based on training data
        training_start_time = time.time()
        train_sentences_uw2v = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'model1', self.num_train,
            self.beam_size, training_ids, resample=True,
            distr_sz=self.distr_sz)
        model_uw2v = models.UncertainWord2Vec(
            uncertSentences=train_sentences_uw2v, uncertainTrain=False,
            min_count=1, sg=1, workers=8, hs=use_hs, negative=0,
            window=self.winSz, iter=self.iter, sample=0, model=1)
        model_uw2v.save(self.domain + '/resampling_based_model' + '.txt')
        train_time = time.time() - training_start_time
        # print("--- training model %s needs %.2f seconds ---\n" % (self.__class__.__name__, train_time))

        plans = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'model2', self.num_train,
            self.beam_size, testing_ids, distr_sz=self.distr_sz)  # Testing UDUP
        GP = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'ground-truth', self.num_train,
            self.beam_size, testing_ids, distr_sz=self.distr_sz)
        GdPlans = [q for q in GP]
        NP = models.uncertainWord2vec.DataLoader(
            self.domain, self.files, 'certain-observation', self.num_train,
            self.beam_size, testing_ids, distr_sz=self.distr_sz)
        if train_uw2v == True:
            actions_uw2v = model_uw2v.wv.vocab.keys()
            vocab_size_uw2v = len(actions_uw2v)
            correct_uw2v = 0
        total = 0
        print "RBM Testing : Running on data %s with H-Softmax = %d" % (
            self.folderName, use_hs)
        testing_start_time = time.time()
        for itr, plan in enumerate(plans):
            plan_grd = zip(*GdPlans[itr])[0]
            plan = list(plan)
            blank_count, indices, incomplete_plan_udup = obtain_incomplete_plan(
                self.mode, self.missing, plan,
                testing_indices[testing_ids[itr]])  # Yantian 051318, was testing_indices[itr]
            total += blank_count
            weights_uw2v = np.zeros(vocab_size_uw2v * blank_count).reshape(
                vocab_size_uw2v, blank_count)
            random_indices_uw2v = random.sample(range(vocab_size_uw2v),
                                                blank_count)
            # Randomly fill each blank with a word from the vocabulary.
            for order in range(blank_count):
                blank_index = indices[order]
                if train_uw2v == True:
                    random_word_uw2v = actions_uw2v[
                        random_indices_uw2v[order]]
                    incomplete_plan_udup[blank_index] = random_word_uw2v
            best_plan_args_uw2v, weights_uw2v = EMTesting(
                weights_uw2v, self.winSz, indices, incomplete_plan_udup,
                blank_count, vocab_size_uw2v, actions_uw2v, self.topk,
                model_uw2v, 'uw2v2')
            for blank_order in range(blank_count):
                blank_index = indices[blank_order]
                for sample_index in best_plan_args_uw2v[:, blank_order]:
                    if actions_uw2v[sample_index] == plan_grd[blank_index]:
                        correct_uw2v += 1
                        break
        test_time = time.time() - testing_start_time
        train_times.append(np.round(train_time, 2))
        test_times.append(np.round(test_time, 2))
        acc.append(np.round(float(correct_uw2v) / total, 4) * 100)
        total = 0
        correct_uw2v = 0
        # print("--- testing model %s needs %.2f seconds ---\n" % (self.__class__.__name__, test_time))
    return acc, train_times, test_times
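# EMTesting is also used as a black box. A minimal sketch of the hard-EM
# loop it is assumed to run, with a hypothetical scoring rule based on
# gensim's model.wv.similarity. This is a guess at the contract -- note
# the (topk, blank_count) shape of the returned index matrix, which
# matches the best_plan_args[:, blank_order] reads above -- not the
# project's actual implementation.
def _sketch_em_fill(plan, indices, actions, model, topk, n_iter=5):
    best_args = np.zeros((topk, len(indices)), dtype=int)
    for _ in range(n_iter):
        for blank_order, blank_index in enumerate(indices):
            # E-step: score every candidate action against the current,
            # partially filled context around this blank.
            context = [w for j, w in enumerate(plan) if j != blank_index]
            scores = np.array([sum(model.wv.similarity(a, c)
                                   for c in context) for a in actions])
            ranked = np.argsort(scores)[::-1]
            best_args[:, blank_order] = ranked[:topk]
            # M-step: commit the current best guess before moving on.
            plan[blank_index] = actions[ranked[0]]
    return best_args, plan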