def __init__(self, train=False, fromCheckpoint=None):
    self.IMG_SIZE = 40
    self.NUM_LABEL = 10
    self.setupModel()
    self.dataset = dl.Dataset()
    if train:
        self.setupTraining()
        self.dataset.loadData(train='train/', test='test/',
                              categories=['sad', 'dead', 'at', 'hash', 'conf',
                                          'empty', 'dot', 'dollar', 'plus', 'dash'])
    self.sess = tf.Session()
    self.saver = tf.train.Saver()
    if fromCheckpoint:
        self.saver.restore(self.sess, fromCheckpoint)
    else:
        init = tf.global_variables_initializer()
        self.sess.run(init)
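# Hedged, self-contained sketch (not from the original code) of the tf.train.Saver
# save/restore pattern the __init__ above relies on, using the TensorFlow 1.x API;
# the variable name 'w' and the path './example.ckpt' are illustrative only.
import tensorflow as tf

w = tf.get_variable('w', shape=[2, 2])
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # mirrors the "no checkpoint" branch
    path = saver.save(sess, './example.ckpt')    # write a checkpoint to disk
    saver.restore(sess, path)                    # mirrors the fromCheckpoint branch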
def multinomial_bayes():
    masterDataset = dl.Dataset(x_train, y_train)
    val_len = int(len(masterDataset) * 0.2)
    train_len = len(masterDataset) - val_len
    trainDataset, valDataset = random_split(masterDataset, (train_len, val_len))
    # x_train_ref, y_train_ref = (trainDataset.dataset.data, trainDataset.dataset.labels)
    # x_test_ref, y_test_ref = (valDataset.dataset.data, valDataset.dataset.labels)
    num_feature = []
    acc = []
    for i in range(50000, 5000, -5000):
        mnb = MultinomialNB()
        x_train_ref, y_train_ref = (trainDataset.dataset.data, trainDataset.dataset.labels)
        x_test_ref, y_test_ref = (valDataset.dataset.data, valDataset.dataset.labels)
        x_train_ref = cl.phrase_tf_idf_encode(x_train_ref, i)
        x_test_ref = cl.phrase_tf_idf_encode(x_test_ref, i)
        # x_train_ref = cl.phrase_one_hot_encode(x_train_ref, i)
        # x_test_ref = cl.phrase_one_hot_encode(x_test_ref, i)
        mnb.fit(x_train_ref, y_train_ref)
        predmnb = mnb.predict(x_test_ref)
        score = round(accuracy_score(y_test_ref, predmnb) * 100, 2)
        print(i, score)
        num_feature.append(i)
        acc.append(score)
    plt.plot(num_feature, acc)
    plt.xlabel('number of words')
    plt.ylabel('classification accuracy')
    plt.show()
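# Hedged sketch: cl.phrase_tf_idf_encode is not shown here. Assuming it produces a TF-IDF
# matrix capped at the top `max_features` terms, an equivalent built on scikit-learn's
# TfidfVectorizer would look like this (function name and fit-on-train choice are assumptions).
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_encode(train_texts, test_texts, max_features):
    vec = TfidfVectorizer(max_features=max_features)
    x_train = vec.fit_transform(train_texts)   # learn vocabulary/IDF on the training split
    x_test = vec.transform(test_texts)         # reuse it for the validation split
    return x_train, x_test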
def main():
    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # data object
    data = dl.Dataset(DATA_OPTIONS, device)
    # Pytorch model
    mlp = model.MLP()
    print('hi')
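# Hedged sketch (not part of the original snippet) of how main() could continue: a minimal
# training pass over the MLP, assuming `data` yields (inputs, labels) batches and the MLP
# returns class logits; the optimizer, learning rate, and loss choice are illustrative assumptions.
def train_one_epoch(mlp, data, device, lr=1e-3):
    mlp.to(device)
    mlp.train()
    optimizer = torch.optim.Adam(mlp.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss()
    for inputs, labels in data:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        loss = criterion(mlp(inputs), labels)
        loss.backward()
        optimizer.step()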
def train_dataloader(self):
    dataset = data_loader.Dataset(self.hparams, mode='train',
                                  sampling=self.hparams.sampling)
    if self.logger is not None:
        self.logger.experiment.info(
            f"Total training videos: {len(dataset)}")
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=self.hparams.batch_size,
        shuffle=True,
        num_workers=self.hparams.num_workers)
    return dataloader
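# Hedged sketch of a matching validation hook, assuming data_loader.Dataset accepts a
# mode='val' split with the same signature used by train_dataloader above (not confirmed here).
def val_dataloader(self):
    dataset = data_loader.Dataset(self.hparams, mode='val',
                                  sampling=self.hparams.sampling)
    return torch.utils.data.DataLoader(dataset,
                                       batch_size=self.hparams.batch_size,
                                       shuffle=False,
                                       num_workers=self.hparams.num_workers)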
def test_abs(args, device, pt, step, model_path):
    # pdb.set_trace()
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)
    checkpoint = torch.load(test_from, map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])
    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args.temp_dir, device, checkpoint, model_path=model_path)
    model.eval()

    data = data_loader.Dataset("test_data")
    # `params` (the DataLoader keyword arguments) is expected to be defined at module scope.
    testing_generator = torch.utils.data.DataLoader(data, **params, drop_last=True,
                                                    collate_fn=data.collate_fn)
    # TODO: verify is_test label
    # test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
    #                                    args.test_batch_size, device,
    #                                    shuffle=False, is_test=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                              cache_dir=args.temp_dir)
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }
    translator = Translator(args, device, model, tokenizer, symbols, logger=logger)
    translator.translate(testing_generator, step)
def test_dataloader(self):
    dataset = data_loader.Dataset(self.hparams, mode='test', sampling='all')
    if self.logger is not None:
        self.logger.experiment.info(
            f"Total testing videos: {len(dataset)}")
    dataloader = torch.utils.data.DataLoader(
        dataset,
        batch_size=1,
        shuffle=False,
        num_workers=self.hparams.num_workers)
    self.class_dict = dataset.class_dict
    return dataloader
def generate_summ(summ_list, tgt_seqs, epoch_idx, logger=None):
    id2token = data_loader.Dataset(args.pkl_path + "train.pkl").id2token
    summ_pred = []
    summ_raw = []
    for idxlist in summ_list:
        summ = [id2token[x] for x in idxlist if x != 0]
        strs = " ".join(summ)
        summ_pred.append(strs)
    for rawlists in tgt_seqs:
        tgt_summ = ""
        for rawlist in rawlists:
            for lists in rawlist:
                strs = " ".join(lists)
                tgt_summ += strs
        summ_raw.append(tgt_summ)
    eval_rouge(summ_pred, summ_raw, epoch_idx, logger)
    for i in range(2):
        print("-------------pred summ-------------")
        print(summ_pred[i])
        print("-------------raw summ-------------")
        print(summ_raw[i])
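# Hedged sketch of an eval_rouge helper as called above, built on the `rouge` package this
# project imports (`from rouge import Rouge`); the metrics reported and the logging behaviour
# of the real helper are assumptions.
def eval_rouge(summ_pred, summ_raw, epoch_idx, logger=None):
    scores = Rouge().get_scores(summ_pred, summ_raw, avg=True)
    msg = "epoch {}: ROUGE-1 {:.4f}  ROUGE-2 {:.4f}  ROUGE-L {:.4f}".format(
        epoch_idx, scores['rouge-1']['f'], scores['rouge-2']['f'], scores['rouge-l']['f'])
    if logger is not None:
        logger.info(msg)
    else:
        print(msg)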
import pickle
from tqdm import tqdm
import shutil

import torch             # required for torch.cuda / torch.save below
import torch.nn as nn    # nn.NLLLoss is used in main()

import data_loader
import Model
import optim_custorm
import loss_custorm
from argsuse import *
import preprocess
from rouge import Rouge
# from logger import Logger

USE_CUDA = torch.cuda.is_available()
n_voc = len(data_loader.Dataset(args.pkl_path + "train.pkl").token2id)
train_loader = data_loader.get_loader(args.pkl_path + "train.pkl", True, args.batch_size)
val_loader = data_loader.get_loader(args.pkl_path + "val.pkl", False, 1)
test_loader = data_loader.get_loader(args.pkl_path + "test.pkl", False, 1)
weight = preprocess.read_pkl(args.pkl_path + "embeddings.pkl")


def save_checkpoint(state, is_best,
                    filename=args.model_path + args.gpu + "/" + 'checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')


def main():
    critorion = loss_custorm.loss_fuc(nn.NLLLoss, ignore_index=0)
optim_bert_args = optimizer.OptimizerArgs(lr=0.002, warmup_steps=20000)
optim_decoder_args = optimizer.OptimizerArgs(lr=0.2, warmup_steps=10000)
optim_bert = optimizer.optim_bert(optim_bert_args, model)
optim_dec = optimizer.optim_decoder(optim_decoder_args, model)
optims = [optim_bert, optim_dec]

# Get Tokenizer. BERT has its own tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True,
                                          cache_dir=cache_dir)
symbols = {'BOS': tokenizer.vocab['[unused0]'],
           'EOS': tokenizer.vocab['[unused1]'],
           'PAD': tokenizer.vocab['[PAD]'],
           'EOQ': tokenizer.vocab['[unused2]']}
print("BERT setup done")

# data = data_loader.Dataset("individual")
data = data_loader.Dataset("full_data")

# TODO: Look into creating checkpoints
print("Starting training...")
batch_size = 8
params = {'batch_size': batch_size,
          'shuffle': True,
          'num_workers': 8,
          'pin_memory': True}
training_generator = torch.utils.data.DataLoader(data, **params, drop_last=True,
                                                 collate_fn=data.collate_fn)
padding_index = 0
loss_fn = loss.LabelSmoothingLoss(label_smoothing=0.1,