Example 1
    def __init__(self, train=False, fromCheckpoint=None):
        self.IMG_SIZE = 40
        self.NUM_LABEL = 10

        self.setupModel()

        self.dataset = dl.Dataset()

        if train:
            self.setupTraining()
            self.dataset.loadData(train='train/',
                                  test='test/',
                                  categories=[
                                      'sad', 'dead', 'at', 'hash', 'conf',
                                      'empty', 'dot', 'dollar', 'plus', 'dash'
                                  ])

        self.sess = tf.Session()
        self.saver = tf.train.Saver()

        if fromCheckpoint:
            self.saver.restore(self.sess, fromCheckpoint)
        else:
            init = tf.global_variables_initializer()
            self.sess.run(init)
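
A minimal usage sketch for the constructor above, assuming the enclosing class is named Classifier; the class name, checkpoint path, and the omitted training calls are illustrative, not part of the original project:

model = Classifier(train=True)  # fresh variables initialized, training data loaded
# ... run training here ...
model.saver.save(model.sess, 'checkpoints/model.ckpt')  # persist the trained weights

restored = Classifier(fromCheckpoint='checkpoints/model.ckpt')  # rebuild the graph and restore the weights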
Example 2
File: main.py Project: codian/yelp
def multinomial_bayes():
    masterDataset = dl.Dataset(x_train, y_train)
    val_len = int(len(masterDataset) * 0.2)
    train_len = len(masterDataset) - val_len
    trainDataset, valDataset = random_split(masterDataset, (train_len, val_len))
    num_feature = []
    acc = []
    for i in range(50000, 5000, -5000):
        mnb = MultinomialNB()
        # random_split returns Subset wrappers whose .dataset attribute is still the
        # full master dataset, so select each split's rows through its .indices
        x_train_ref = [masterDataset.data[j] for j in trainDataset.indices]
        y_train_ref = [masterDataset.labels[j] for j in trainDataset.indices]
        x_test_ref = [masterDataset.data[j] for j in valDataset.indices]
        y_test_ref = [masterDataset.labels[j] for j in valDataset.indices]
        x_train_ref = cl.phrase_tf_idf_encode(x_train_ref, i)
        x_test_ref = cl.phrase_tf_idf_encode(x_test_ref, i)
        #x_train_ref = cl.phrase_one_hot_encode(x_train_ref, i)
        #x_test_ref = cl.phrase_one_hot_encode(x_test_ref, i)
        mnb.fit(x_train_ref, y_train_ref)
        predmnb = mnb.predict(x_test_ref)
        score = round(accuracy_score(y_test_ref, predmnb) * 100, 2)
        print(i, score)
        num_feature.append(i)
        acc.append(score)
    plt.plot(num_feature, acc)
    plt.xlabel('number of words')
    plt.ylabel('classification accuracy (%)')
    plt.show()
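
The helper cl.phrase_tf_idf_encode is project-specific; as a point of reference, capping the vocabulary at i terms can be sketched with scikit-learn's TfidfVectorizer. The function below is an illustrative assumption, not the project's implementation, and it fits the vocabulary on the training split only:

from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_encode(train_texts, test_texts, num_features):
    # Learn the num_features top-ranked terms from the training texts,
    # then reuse that vocabulary to transform the held-out texts
    vectorizer = TfidfVectorizer(max_features=num_features)
    x_train = vectorizer.fit_transform(train_texts)
    x_test = vectorizer.transform(test_texts)
    return x_train, x_test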
Example 3
def main():
    # if gpu is to be used
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # data object
    data = dl.Dataset(DATA_OPTIONS, device)

    # Pytorch model
    mlp = model.MLP()

    print('hi')
Example 4
 def train_dataloader(self):
     dataset = data_loader.Dataset(self.hparams,
                                   mode='train',
                                   sampling=self.hparams.sampling)
     if self.logger is not None:
         self.logger.experiment.info(
             f"Total training videos: {len(dataset)}")
     dataloader = torch.utils.data.DataLoader(
         dataset,
         batch_size=self.hparams.batch_size,
         shuffle=True,
         num_workers=self.hparams.num_workers)
     return dataloader
Example 5
def test_abs(args, device, pt, step, model_path):
    # pdb.set_trace()
    if pt != '':
        test_from = pt
    else:
        test_from = args.test_from
    logger.info('Loading checkpoint from %s' % test_from)

    checkpoint = torch.load(test_from,
                            map_location=lambda storage, loc: storage)
    opt = vars(checkpoint['opt'])

    for k in opt.keys():
        if k in model_flags:
            setattr(args, k, opt[k])
    print(args)

    model = AbsSummarizer(args.temp_dir,
                          device,
                          checkpoint,
                          model_path=model_path)
    model.eval()

    data = data_loader.Dataset("test_data")
    testing_generator = torch.utils.data.DataLoader(data,
                                                    **params,
                                                    drop_last=True,
                                                    collate_fn=data.collate_fn)

    # TODO: verify is_test label
    # test_iter = data_loader.Dataloader(args, load_dataset(args, 'test', shuffle=False),
    #                                    args.test_batch_size, device,
    #                                    shuffle=False, is_test=True)

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True,
                                              cache_dir=args.temp_dir)
    # Map decoding control symbols (BOS/EOS/EOQ) onto unused BERT vocabulary entries
    symbols = {
        'BOS': tokenizer.vocab['[unused0]'],
        'EOS': tokenizer.vocab['[unused1]'],
        'PAD': tokenizer.vocab['[PAD]'],
        'EOQ': tokenizer.vocab['[unused2]']
    }

    translator = Translator(args,
                            device,
                            model,
                            tokenizer,
                            symbols,
                            logger=logger)
    translator.translate(testing_generator, step)
Example 6
 def test_dataloader(self):
     dataset = data_loader.Dataset(self.hparams,
                                   mode='test',
                                   sampling='all')
     if self.logger is not None:
         self.logger.experiment.info(
             f"Total testing videos: {len(dataset)}")
     dataloader = torch.utils.data.DataLoader(
         dataset,
         batch_size=1,
         shuffle=False,
         num_workers=self.hparams.num_workers)
     self.class_dict = dataset.class_dict
     return dataloader
Example 7
def generate_summ(summ_list, tgt_seqs, epoch_idx, logger=None):
    id2token = data_loader.Dataset(args.pkl_path+"train.pkl").id2token
    summ_pred = []
    summ_raw = []
    for idxlist in summ_list:
        summ = [id2token[x] for x in idxlist if x != 0]
        strs = " ".join(summ)
        summ_pred.append(strs)
    for rawlists in tgt_seqs:
        tgt_summ = ""
        for rawlist in rawlists:
            for lists in rawlist:
                strs = " ".join(lists)
                tgt_summ += strs
        summ_raw.append(tgt_summ)
    eval_rouge(summ_pred, summ_raw, epoch_idx, logger)

    for i in range(2):
        print("-------------pred summ-------------")
        print(summ_pred[i])
        print("-------------raw  summ-------------")
        print(summ_raw[i])
Example 8
import pickle
from tqdm import tqdm
import shutil

import torch
import torch.nn as nn

import data_loader
import Model
import optim_custorm
import loss_custorm
from argsuse import *
import preprocess
from rouge import Rouge
#from logger import Logger

USE_CUDA = torch.cuda.is_available()

n_voc = len(data_loader.Dataset(args.pkl_path+"train.pkl").token2id)
train_loader = data_loader.get_loader(args.pkl_path+"train.pkl", True, args.batch_size)
val_loader = data_loader.get_loader(args.pkl_path+"val.pkl", False, 1)
test_loader = data_loader.get_loader(args.pkl_path+"test.pkl", False, 1)
weight = preprocess.read_pkl(args.pkl_path+"embeddings.pkl")

def save_checkpoint(state, is_best, filename=args.model_path+args.gpu+"/"+'checkpoint.pth.tar'):
    torch.save(state, filename)
    if is_best:
        shutil.copyfile(filename, 'model_best.pth.tar')
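
# Illustrative call with hypothetical names: the training loop would typically
# compare the current validation score against the best seen so far, e.g.
#   is_best = val_rouge > best_rouge
#   save_checkpoint({'epoch': epoch,
#                    'state_dict': model.state_dict(),
#                    'optimizer': optimizer.state_dict()}, is_best)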




def main():
    critorion = loss_custorm.loss_fuc(nn.NLLLoss, ignore_index=0)
Example 9
# Two optimizer configurations: the pretrained BERT encoder gets a lower peak
# learning rate and a longer warmup than the randomly initialized decoder
optim_bert_args = optimizer.OptimizerArgs(lr=0.002, warmup_steps=20000)
optim_decoder_args = optimizer.OptimizerArgs(lr=0.2, warmup_steps=10000)

optim_bert = optimizer.optim_bert(optim_bert_args, model)
optim_dec = optimizer.optim_decoder(optim_decoder_args, model)
optims = [optim_bert, optim_dec]

# Get Tokenizer. BERT has its own tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, cache_dir=cache_dir)
symbols = {'BOS': tokenizer.vocab['[unused0]'], 'EOS': tokenizer.vocab['[unused1]'],
           'PAD': tokenizer.vocab['[PAD]'], 'EOQ': tokenizer.vocab['[unused2]']}

print("BERT setup done")

# data = data_loader.Dataset("individual")
data = data_loader.Dataset("full_data")

# TODO: Look into creating checkpoints

print("Starting training...")

batch_size = 8
params = {'batch_size': batch_size,
          'shuffle': True,
          'num_workers': 8,
          'pin_memory': True}
training_generator = torch.utils.data.DataLoader(data, **params, drop_last=True, collate_fn=data.collate_fn)

padding_index = 0

loss_fn = loss.LabelSmoothingLoss(label_smoothing=0.1,