Beispiel #1
0
    def __init__(self, opt):
        """Set up the BERT-based ABSA dataset, bucket iterators, and model."""
        self.opt = opt

        # Read the dataset with BERT tokenization, capping sequences at 70.
        reader = ABSADatesetReader(dataset=opt.dataset,
                                   embed_dim=opt.embed_dim,
                                   use_bert=True,
                                   max_len=70)

        # Bucketed iterators pad within each batch; shuffle the training
        # batches only, keep the test order fixed.
        self.train_data_loader = BucketIterator(data=reader.train_data,
                                                batch_size=opt.batch_size,
                                                shuffle=True,
                                                max_len=90)
        self.test_data_loader = BucketIterator(data=reader.test_data,
                                               batch_size=opt.batch_size,
                                               shuffle=False,
                                               max_len=90)

        self.model = opt.model_class(reader.embedding_matrix, opt).to(opt.device)
        self._print_args()
        self.global_f1 = 0.

        if torch.cuda.is_available():
            print('cuda memory allocated:',
                  torch.cuda.memory_allocated(device=opt.device.index))
Beispiel #2
0
    def __init__(self, opt):
        """Prepare bucketed data iterators and the model for ABSA training.

        Args:
            opt: parsed options; must provide dataset, embed_dim, batch_size,
                model_class, and device.
        """
        self.opt = opt

        absa_dataset = ABSADatesetReader(dataset=opt.dataset,
                                         embed_dim=opt.embed_dim)

        # Bucketed iterators group similar-length samples; only the training
        # iterator shuffles.
        self.train_data_loader = BucketIterator(data=absa_dataset.train_data,
                                                batch_size=opt.batch_size,
                                                shuffle=True)
        self.test_data_loader = BucketIterator(data=absa_dataset.test_data,
                                               batch_size=opt.batch_size,
                                               shuffle=False)

        # NOTE(review): removed a block of dead, commented-out multi-GPU /
        # DistributedDataParallel setup; restore from version control if
        # distributed training is reintroduced.
        self.model = opt.model_class(absa_dataset.embedding_matrix,
                                     opt).to(opt.device)

        self._print_args()
        self.global_f1 = 0.

        if torch.cuda.is_available():
            print('cuda memory allocated:',
                  torch.cuda.memory_allocated(device=opt.device.index))
Beispiel #3
0
    def __init__(self, opt, post_vocab):
        """Build iterators and the graph-augmented ABSA model.

        Args:
            opt: parsed options; must provide dataset, embed_dim, batch_size,
                model_class, and device.
            post_vocab: position vocabulary, forwarded to the dataset reader
                and the model.
        """
        self.opt = opt

        # BUG FIX: the reader previously received args['post_vocab'], but no
        # `args` name exists in this scope (NameError); use the `post_vocab`
        # parameter, which is also what the model receives below.
        absa_dataset = ABSADatesetReader(dataset=opt.dataset,
                                         embed_dim=opt.embed_dim,
                                         post_vocab=post_vocab)
        self.train_data_loader = BucketIterator(data=absa_dataset.train_data,
                                                batch_size=opt.batch_size,
                                                shuffle=True)
        self.test_data_loader = BucketIterator(data=absa_dataset.test_data,
                                               batch_size=opt.batch_size,
                                               shuffle=False)

        # Move shared adjacency and frequency embedding to GPU when available.
        common_adj = absa_dataset.common_adj
        fre_embedding = absa_dataset.fre_embedding
        if torch.cuda.is_available():
            common_adj = common_adj.cuda()
            fre_embedding = fre_embedding.cuda()

        self.model = opt.model_class(absa_dataset.embedding_matrix, common_adj,
                                     fre_embedding, post_vocab,
                                     opt).to(opt.device)
        self._print_args()
        self.global_f1 = 0.

        if torch.cuda.is_available():
            print('cuda memory allocated:',
                  torch.cuda.memory_allocated(device=opt.device.index))
Beispiel #4
0
 def __init__(self, opt):
     """Load the ABSA dataset and wire up train/test/final loaders and model."""
     self.opt = opt
     reader = ABSADatesetReader(dataset=opt.dataset,
                                embed_dim=opt.embed_dim,
                                max_seq_len=opt.max_seq_len)
     # Training batches are shuffled; evaluation sets load as one full batch.
     self.train_data_loader = DataLoader(dataset=reader.train_data,
                                         batch_size=opt.batch_size,
                                         shuffle=True)
     self.test_data_loader = DataLoader(dataset=reader.test_data,
                                        batch_size=len(reader.test_data),
                                        shuffle=False)
     self.final_data_loader = DataLoader(dataset=reader.final_data,
                                         batch_size=len(reader.final_data),
                                         shuffle=False)
     self.model = opt.model_class(reader.embedding_matrix, opt).to(opt.device)
     self._init_and_print_parameters()
Beispiel #5
0
    def __init__(self, opt):
        """Create bucketed train/test iterators and instantiate the model."""
        self.opt = opt

        reader = ABSADatesetReader(dataset=opt.dataset, embed_dim=opt.embed_dim)
        # Shuffle training batches; keep test order deterministic.
        self.train_data_loader = BucketIterator(
            data=reader.train_data, batch_size=opt.batch_size, shuffle=True)
        self.test_data_loader = BucketIterator(
            data=reader.test_data, batch_size=opt.batch_size, shuffle=False)

        self.model = opt.model_class(reader.embedding_matrix, opt).to(opt.device)
        self._print_args()
        self.global_f1 = 0.
Beispiel #6
0
    def __init__(self, opt):
        """Split train data into train/val, load Chinese BERT, build the model.

        Args:
            opt: parsed options; must provide dataset, embed_dim, batch_size,
                model_class, and device.
        """
        self.opt = opt

        absa_dataset = ABSADatesetReader(dataset=opt.dataset, embed_dim=opt.embed_dim)
        # Hold out the last 10% of the training data for validation. An
        # explicit split index fixes the former `[:-valset_len]` slice, which
        # silently produced an EMPTY training set whenever the 10% rounded
        # down to zero (valset_len == 0 made the slice `[:0]`).
        valset_len = int(len(absa_dataset.train_data) * 0.1)
        split = len(absa_dataset.train_data) - valset_len
        self.train_data_loader = BucketIterator(data=absa_dataset.train_data[:split], batch_size=opt.batch_size, shuffle=True)
        self.val_data_loader = BucketIterator(data=absa_dataset.train_data[split:], batch_size=opt.batch_size, shuffle=False)
        # Chinese pre-trained BERT. To use ALBERT instead, wrap it, import it
        # here, and adjust the model hyper-parameters accordingly.
        bert = BertModel.from_pretrained('bert-base-chinese')
        self.model = nn.DataParallel(opt.model_class(bert, opt).to(opt.device))

        self._print_args()
Beispiel #7
0
    def __init__(self, opt):
        """Restore a trained sentence classifier for evaluation-only use.

        Loads a cached tokenizer for the chosen dataset (building and caching
        one from the raw train/test text on first run), builds the embedding
        matrix, instantiates the model, restores its saved weights, and
        switches it to eval mode with autograd disabled globally.

        Args:
            opt: parsed options; must provide dataset, embed_dim, model_class,
                model_name, state_dict_path, and device.
        """
        self.opt = opt
        # train/test CSV paths for each supported sentence-level dataset
        fname = {
            'cr': {
                'train': './datasets/cr/train.csv',
                'test': './datasets/cr/dev.csv'
            },
            'mr': {
                'train': './datasets/mr/train.csv',
                'test': './datasets/mr/dev.csv'
            },
            'mpqa': {
                'train': './datasets/mpqa/train.csv',
                'test': './datasets/mpqa/dev.csv'
            },
            'subj': {
                'train': './datasets/subj/train.csv',
                'test': './datasets/subj/dev.csv'
            },
            'sst2': {
                'train': './datasets/sst2/train.csv',
                'test': './datasets/sst2/test.csv'
            },
            'TREC': {
                'train': './datasets/TREC/train.csv',
                'test': './datasets/TREC/test.csv'
            },
        }
        if os.path.exists(opt.dataset + '_word2idx.pkl'):
            # Cached vocabulary exists: restore the tokenizer from it.
            # NOTE(review): pickle is unsafe on untrusted files; this assumes
            # the cache was written by this program.
            print("loading {0} tokenizer...".format(opt.dataset))
            with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
                word2idx = pickle.load(f)
                self.tokenizer = Tokenizer(word2idx=word2idx)
        else:
            # First run: fit a tokenizer on the raw text and cache its vocab.
            print("reading {0} dataset...".format(opt.dataset))

            text = ABSADatesetReader.__read_text__(
                [fname[opt.dataset]['train'], fname[opt.dataset]['test']])
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_text(text)
            with open(opt.dataset + '_word2idx.pkl', 'wb') as f:
                pickle.dump(self.tokenizer.word2idx, f)
        embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                                  opt.embed_dim, opt.dataset)
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        # (removed the redundant no-op `self.model = self.model`)
        # switch model to evaluation mode and disable autograd globally
        self.model.eval()
        torch.autograd.set_grad_enabled(False)
Beispiel #8
0
    def __init__(self, opt):
        """Restore a trained ABSA model for evaluation-only use.

        Loads a cached tokenizer for the chosen dataset (building and caching
        one from the raw train/test text on first run), builds the embedding
        matrix, instantiates the model, restores its saved weights, and
        switches it to eval mode with autograd disabled globally.

        Args:
            opt: parsed options; must provide dataset, embed_dim, model_class,
                model_name, state_dict_path, and device.
        """
        self.opt = opt
        # train/test raw-file paths for each supported ABSA dataset
        fname = {
            'twitter': {
                'train': './datasets/acl-14-short-data/train.raw',
                'test': './datasets/acl-14-short-data/test.raw'
            },
            'rest14': {
                'train': './datasets/semeval14/restaurant_train.raw',
                'test': './datasets/semeval14/restaurant_test.raw'
            },
            'lap14': {
                'train': './datasets/semeval14/laptop_train.raw',
                'test': './datasets/semeval14/laptop_test.raw'
            },
            'rest15': {
                'train': './datasets/semeval15/restaurant_train.raw',
                'test': './datasets/semeval15/restaurant_test.raw'
            },
            'rest16': {
                'train': './datasets/semeval16/restaurant_train.raw',
                'test': './datasets/semeval16/restaurant_test.raw'
            },
        }
        if os.path.exists(opt.dataset + '_word2idx.pkl'):
            # Cached vocabulary exists: restore the tokenizer from it.
            # NOTE(review): pickle is unsafe on untrusted files; this assumes
            # the cache was written by this program.
            print("loading {0} tokenizer...".format(opt.dataset))
            with open(opt.dataset + '_word2idx.pkl', 'rb') as f:
                word2idx = pickle.load(f)
                self.tokenizer = Tokenizer(word2idx=word2idx)
        else:
            # First run: fit a tokenizer on the raw text and cache its vocab.
            print("reading {0} dataset...".format(opt.dataset))

            text = ABSADatesetReader.__read_text__(
                [fname[opt.dataset]['train'], fname[opt.dataset]['test']])
            self.tokenizer = Tokenizer()
            self.tokenizer.fit_on_text(text)
            with open(opt.dataset + '_word2idx.pkl', 'wb') as f:
                pickle.dump(self.tokenizer.word2idx, f)
        embedding_matrix = build_embedding_matrix(self.tokenizer.word2idx,
                                                  opt.embed_dim, opt.dataset)
        self.model = opt.model_class(embedding_matrix, opt).to(opt.device)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        # (removed the redundant no-op `self.model = self.model`)
        # switch model to evaluation mode and disable autograd globally
        self.model.eval()
        torch.autograd.set_grad_enabled(False)
Beispiel #9
0
    def __init__(self, opt):
        """Build dataset readers, data loaders, logger, and model.

        Args:
            opt: parsed options; must provide dataset, embed_dim, max_seq_len,
                batch_size, logdir, model_class, and device (plus
                cnn_model_name for the multimodal ZOL dataset).

        Raises:
            ValueError: if ``opt.dataset`` names an unsupported dataset.
        """
        self.opt = opt
        print('> training arguments:')
        for arg in vars(opt):
            print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))

        if opt.dataset in ['restaurant', 'laptop']:
            self.my_dataset = ABSADatesetReader(dataset=opt.dataset,
                                                embed_dim=opt.embed_dim,
                                                max_seq_len=opt.max_seq_len)
            self.train_data_loader = DataLoader(
                dataset=self.my_dataset.train_data,
                batch_size=opt.batch_size,
                shuffle=True)
            # NOTE(review): these datasets have no separate dev split, so the
            # test set doubles as dev — confirm this is intended.
            self.dev_data_loader = DataLoader(
                dataset=self.my_dataset.test_data,
                batch_size=len(self.my_dataset.test_data),
                shuffle=False)
            self.test_data_loader = DataLoader(
                dataset=self.my_dataset.test_data,
                batch_size=len(self.my_dataset.test_data),
                shuffle=False)

        elif opt.dataset in ['zol_cellphone']:
            self.my_dataset = ZOLDatesetReader(
                dataset=opt.dataset,
                embed_dim=opt.embed_dim,
                max_seq_len=opt.max_seq_len,
                cnn_model_name=opt.cnn_model_name)
            self.train_data_loader = DataLoader(
                dataset=self.my_dataset.train_data,
                batch_size=opt.batch_size,
                shuffle=True)
            self.dev_data_loader = DataLoader(dataset=self.my_dataset.dev_data,
                                              batch_size=len(
                                                  self.my_dataset.dev_data),
                                              shuffle=False)
            self.test_data_loader = DataLoader(
                dataset=self.my_dataset.test_data,
                batch_size=len(self.my_dataset.test_data),
                shuffle=False)
        else:
            # Fail fast with a clear message instead of the confusing
            # AttributeError that self.my_dataset would raise below.
            raise ValueError('unknown dataset: {0}'.format(opt.dataset))

        self.idx2word = self.my_dataset.idx2word
        self.writer = SummaryWriter(log_dir=opt.logdir)
        self.model = opt.model_class(self.my_dataset.embedding_matrix,
                                     opt).to(opt.device)
        self.reset_parameters()
Beispiel #10
0
    def __init__(self, opt):
        """Create train/test loaders and the model for ABSA training.

        Args:
            opt: parsed options; must provide dataset, embed_dim, max_seq_len,
                batch_size, model_class, and device.
        """
        self.opt = opt

        absa_dataset = ABSADatesetReader(dataset=opt.dataset,
                                         embed_dim=opt.embed_dim,
                                         max_seq_len=opt.max_seq_len)
        self.train_data_loader = DataLoader(dataset=absa_dataset.train_data,
                                            batch_size=opt.batch_size,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=absa_dataset.test_data,
                                           batch_size=opt.batch_size,
                                           shuffle=False)

        self.model = opt.model_class(absa_dataset.embedding_matrix,
                                     opt).to(opt.device)
        # Guard the CUDA stat: querying torch.cuda on a CPU-only host would
        # fail/initialize CUDA needlessly (matches the sibling setups).
        if torch.cuda.is_available():
            print("cuda memory allocated:",
                  torch.cuda.memory_allocated(device=0))
        self._print_args()
Beispiel #11
0
    def __init__(self, args):
        """Seed RNGs, load the ABSA dataset, split off a dev set, build model.

        Args:
            args: parsed options; must provide seed, device, gpu, dataset,
                embed_dim, max_seq_len, dev (dev-split fraction in [0, 1]),
                batch_size, and model_class.
        """
        self.args = args
        # Seed every RNG source up front so the dev split below (which uses
        # random.shuffle) is reproducible across runs.
        torch.manual_seed(self.args.seed)
        if self.args.device == "cuda":
            torch.cuda.set_device(self.args.gpu)
            torch.cuda.manual_seed(self.args.seed)
        np.random.seed(self.args.seed)
        random.seed(self.args.seed)
        print('> training arguments:')
        for arg in vars(args):
            print('>>> {0}: {1}'.format(arg, getattr(args, arg)))

        absa_dataset = ABSADatesetReader(dataset=args.dataset,
                                         embed_dim=args.embed_dim,
                                         max_seq_len=args.max_seq_len)
        # When a dev fraction is requested, shuffle the training samples in
        # place and carve the dev set off the front.
        if self.args.dev > 0.0:
            random.shuffle(absa_dataset.train_data.data)
            dev_num = int(len(absa_dataset.train_data.data) * self.args.dev)
            absa_dataset.dev_data.data = absa_dataset.train_data.data[:dev_num]
            absa_dataset.train_data.data = absa_dataset.train_data.data[
                dev_num:]

        # print(len(absa_dataset.train_data.data), len(absa_dataset.dev_data.data))

        self.train_data_loader = DataLoader(dataset=absa_dataset.train_data,
                                            batch_size=args.batch_size,
                                            shuffle=True)
        # Dev/test are evaluated as a single full-size batch each.
        if self.args.dev > 0.0:
            self.dev_data_loader = DataLoader(dataset=absa_dataset.dev_data,
                                              batch_size=len(
                                                  absa_dataset.dev_data),
                                              shuffle=False)
        self.test_data_loader = DataLoader(dataset=absa_dataset.test_data,
                                           batch_size=len(
                                               absa_dataset.test_data),
                                           shuffle=False)
        self.mdl = args.model_class(
            self.args,
            embedding_matrix=absa_dataset.embedding_matrix,
            aspect_embedding_matrix=absa_dataset.aspect_embedding_matrix)
        self.reset_parameters()
        # Keep both embedding tables trainable after parameter reset.
        self.mdl.encoder.weight.requires_grad = True
        self.mdl.encoder_aspect.weight.requires_grad = True
        # NOTE(review): `device` here is a module-level global, not
        # self.args.device — confirm the two agree.
        self.mdl.to(device)
        self.criterion = nn.CrossEntropyLoss()
Beispiel #12
0
    def __init__(self, opt):
        """Set up multimodal ABSA training: data, image encoder, and model."""
        self.opt = opt
        print('> training arguments:')
        for arg in vars(opt):
            print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))

        if not os.path.exists(opt.checkpoint):
            os.mkdir(opt.checkpoint)

        # torch.manual_seed(opt.rand_seed)
        # torch.cuda.manual_seed_all(opt.rand_seed)

        # ImageNet-style augmentation/normalization pipeline; opt.crop_size
        # defaults to 224 upstream.
        image_transform = transforms.Compose([
            transforms.RandomCrop(opt.crop_size),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize((0.485, 0.456, 0.406),
                                 (0.229, 0.224, 0.225)),
        ])

        reader = ABSADatesetReader(image_transform,
                                   dataset=opt.dataset,
                                   embed_dim=opt.embed_dim,
                                   max_seq_len=opt.max_seq_len,
                                   path_image=opt.path_image)
        self.train_data_loader = DataLoader(dataset=reader.train_data,
                                            batch_size=opt.batch_size,
                                            shuffle=True)
        self.dev_data_loader = DataLoader(dataset=reader.dev_data,
                                          batch_size=opt.batch_size,
                                          shuffle=False)
        self.test_data_loader = DataLoader(dataset=reader.test_data,
                                           batch_size=opt.batch_size,
                                           shuffle=False)
        # self.writer = SummaryWriter(log_dir=opt.logdir)

        print('building model')
        # ResNet-152 image backbone with pre-trained ImageNet weights.
        backbone = getattr(resnet, 'resnet152')()
        backbone.load_state_dict(
            torch.load(os.path.join(opt.resnet_root, 'resnet152.pth')))
        self.encoder = myResnet(backbone, opt.fine_tune_cnn,
                                self.opt.device).to(device)
        self.model = opt.model_class(reader.embedding_matrix, opt).to(device)
        self.reset_parameters()
Beispiel #13
0
    def __init__(self,
                 module_class,
                 model_name,
                 dataset='twitter',
                 embed_dim=100,
                 max_seq_len=40,
                 batch_size=128):
        """Prepare loaders, a TensorBoard writer, and the model instance."""
        reader = ABSADatesetReader(dataset=dataset,
                                   embed_dim=embed_dim,
                                   max_seq_len=max_seq_len)
        # Shuffle only the training batches; the test set loads as one
        # full-size batch.
        self.train_data_loader = DataLoader(dataset=reader.train_data,
                                            batch_size=batch_size,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=reader.test_data,
                                           batch_size=len(reader.test_data),
                                           shuffle=False)
        self.writer = SummaryWriter(log_dir='{0}_logs'.format(model_name))

        self.model = module_class(reader.embedding_matrix).to(device)
Beispiel #14
0
    def __init__(self, opt):
        """Load data, create loaders and a TensorBoard writer, build the model."""
        self.opt = opt
        print('> training arguments:')
        for arg in vars(opt):
            print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))

        reader = ABSADatesetReader(dataset=opt.dataset,
                                   embed_dim=opt.embed_dim,
                                   max_seq_len=opt.max_seq_len)
        # Shuffled mini-batches for training; the whole test set as one batch.
        self.train_data_loader = DataLoader(dataset=reader.train_data,
                                            batch_size=opt.batch_size,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=reader.test_data,
                                           batch_size=len(reader.test_data),
                                           shuffle=False)
        self.writer = SummaryWriter(log_dir=opt.logdir)

        self.model = opt.model_class(reader.embedding_matrix, opt).to(device)
        self.reset_parameters()
Beispiel #15
0
    def __init__(self, opt):
        """Set up data loaders and either restore or build the model.

        Args:
            opt: parsed options; must provide dataset, embed_dim, max_seq_len,
                fold_num, batch_size, checkpoint_path, and model_class.
        """
        self.opt = opt
        print('> training arguments:')
        for arg in vars(opt):
            print('>>> {0}: {1}'.format(arg, getattr(opt, arg)))

        absa_dataset = ABSADatesetReader(dataset=opt.dataset,
                                         embed_dim=opt.embed_dim,
                                         max_seq_len=opt.max_seq_len,
                                         fold_num=opt.fold_num)
        self.train_data_loader = DataLoader(dataset=absa_dataset.train_data,
                                            batch_size=opt.batch_size,
                                            shuffle=True)
        self.test_data_loader = DataLoader(dataset=absa_dataset.test_data,
                                           batch_size=opt.batch_size,
                                           shuffle=False)
        if opt.checkpoint_path != "":
            # map_location keeps a GPU-saved checkpoint loadable on a
            # CPU-only host and lands it on the active device in one step
            # (previously the checkpoint branch never moved the model).
            self.model = torch.load(opt.checkpoint_path, map_location=device)
        else:
            self.model = opt.model_class(absa_dataset.embedding_matrix,
                                         opt).to(device)
        self.reset_parameters()