def build_model(args):
    if args.clf_model.lower() == "cnn":
        # the DistilBERT tokenizer is a convenient default for text tokenization
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        model = CNN_Text(args)
    elif args.clf_model.lower() == "robert":
        print("name is {}".format(args.model_name_or_path))
        tokenizer = RobertaTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = RobertaConfig.from_pretrained(args.model_name_or_path,
                                               num_labels=args.num_labels,
                                               finetuning_task=args.task_name)
        model = RobertaForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the transformer weights
        if args.freeze:
            for n, p in model.named_parameters():
                if "bert" in n:
                    p.requires_grad = False
    elif args.clf_model.lower() == "bert":
        tokenizer = BertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = BertConfig.from_pretrained(args.model_name_or_path,
                                            num_labels=args.num_labels,
                                            finetuning_task=args.task_name)
        model = BertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        # freeze the transformer weights
        # if args.freeze:
        #     for n, p in model.named_parameters():
        #         if "bert" in n:
        #             p.requires_grad = False
    else:
        tokenizer = DistilBertTokenizer.from_pretrained(
            args.model_name_or_path, do_lower_case=args.do_lower_case)
        config = DistilBertConfig.from_pretrained(
            args.model_name_or_path,
            num_labels=args.num_labels,
            finetuning_task=args.task_name)
        model = DistilBertForSequenceClassification.from_pretrained(
            args.model_name_or_path, config=config)
        model.expand_class_head(args.multi_head)

    model = model.to(args.device)
    return tokenizer, model
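# Illustrative usage sketch (not part of the original file): build_model() only needs
# an object exposing the attributes it reads above. The concrete values below, and the
# use of argparse.Namespace instead of the project's real argument parser, are
# assumptions for demonstration.
import argparse
import torch

example_args = argparse.Namespace(
    clf_model="distilbert",          # anything other than "cnn"/"robert"/"bert" takes the DistilBERT branch
    model_name_or_path="distilbert-base-uncased",
    do_lower_case=True,
    num_labels=2,
    task_name="sst-2",
    freeze=False,
    multi_head=1,                    # forwarded to model.expand_class_head()
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)
# tokenizer, model = build_model(example_args)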
def main(args):
    # Device configuration
    device = torch.device(
        'cuda:{}'.format(args.gpu) if torch.cuda.is_available() else 'cpu')

    num_epochs = 80
    num_classes = 8
    learning_rate = 0.08
    num_views = 3
    num_layers = 4
    data_path = args.dir

    file_list = [
        './data/train_web_content.npy', './data/train_web_links.npy',
        './data/train_web_title.npy', './data/test_web_content.npy',
        './data/test_web_links.npy', './data/test_web_title.npy',
        './data/train_label.npy', './data/test_label.npy'
    ]
    if not all(map(os.path.exists, file_list)):
        print('Raw data has not been pre-processed! Start pre-processing the raw data.')
        data_loader.preprocess(data_path)
    else:
        print('Loading the existing data set...')

    train_dataset = data_loader.Load_datasets('train', num_classes)
    train_loader = DataLoader(train_dataset,
                              batch_size=32,
                              shuffle=True,
                              num_workers=4)

    input_dims = np.array(train_dataset.data[0]).shape
    model = CNN_Text(input_dims, [64, 32, 32, 32], [1, 2, 3, 4], num_classes,
                     0.5, num_layers, num_views).to(device)
    model = model.double()
    model.device = device
    model.learning_rate = learning_rate
    model.epoch = 0

    if args.model is not None:
        model.load_state_dict(torch.load(args.model))
        print('Successfully loaded the pre-trained model!')

    # train the model until it is fully trained
    train_model(model, train_loader, num_epochs)
    print('Finished the training process!')
    evaluation(model)
def __init__(self, config, n_gpu, vocab, train_loader=None, val_loader=None):
    self.config = config
    self.vocab = vocab
    self.n_gpu = n_gpu
    self.train_loader = train_loader
    self.val_loader = val_loader

    # Build model
    vocab_size = self.vocab.vocab_size()
    self.model = CNN_Text(self.config, vocab_size, self.config.n_label)
    self.model.to(device)

    if self.n_gpu > 1:
        self.model = nn.DataParallel(self.model)

    # Build optimizer
    self.optimizer = optim.Adam(self.model.parameters(),
                                lr=self.config.lr,
                                weight_decay=0.0005)

    # Build criterion
    self.criterion = nn.CrossEntropyLoss()
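# Minimal companion sketch (assumption: the real trainer defines something similar):
# one training epoch using the model, optimizer, and criterion built in __init__ above.
# The batch layout (text tensor, label tensor) and the reuse of the module-level
# `device` are assumptions, not taken from the source.
def train_epoch(self):
    self.model.train()
    total_loss = 0.0
    for text, label in self.train_loader:
        text, label = text.to(device), label.to(device)
        self.optimizer.zero_grad()
        logits = self.model(text)
        loss = self.criterion(logits, label)
        loss.backward()
        self.optimizer.step()
        total_loss += loss.item()
    return total_loss / max(len(self.train_loader), 1)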
def main_train():
    def clean_str(string):
        string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
        string = re.sub(r"\'s", " \'s", string)
        string = re.sub(r"\'ve", " \'ve", string)
        string = re.sub(r"n\'t", " n\'t", string)
        string = re.sub(r"\'re", " \'re", string)
        string = re.sub(r"\'d", " \'d", string)
        string = re.sub(r"\'ll", " \'ll", string)
        string = re.sub(r",", " , ", string)
        string = re.sub(r"!", " ! ", string)
        string = re.sub(r"\(", r" \( ", string)
        string = re.sub(r"\)", r" \) ", string)
        string = re.sub(r"\?", r" \? ", string)
        string = re.sub(r"\s{2,}", " ", string)
        return string

    TEXT = data.Field(sequential=True, lower=True, batch_first=True)
    TEXT.preprocessing = data.Pipeline(clean_str)
    LABEL = data.Field(sequential=False, use_vocab=False, batch_first=True)

    trainset, valset = MR.splits(data_path,
                                 fields=[("text", TEXT), ("label", LABEL)])
    TEXT.build_vocab(trainset)

    with open("text.field", 'wb') as f:
        dill.dump(TEXT, f)

    trainiter = data.BucketIterator(trainset,
                                    batch_size=batch_size,
                                    sort_key=lambda x: len(x.text),
                                    shuffle=True,
                                    device=device)
    valiter = data.BucketIterator(valset,
                                  batch_size=batch_size,
                                  sort_key=lambda x: len(x.text),
                                  shuffle=True,
                                  device=device)

    model = CNN_Text(channel_dim,
                     len(TEXT.vocab),
                     embed_dim,
                     output_dim,
                     kernel_sizes,
                     is_static=False,
                     dropout_rate=dropout_rate)
    model = model.to(device)

    criterion = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr,
                                 weight_decay=weight_decay)

    train_model(epochs, model, trainiter, valiter, optimizer, criterion)
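# Companion sketch (assumption: inference happens elsewhere in the project): the TEXT
# field serialized above can be reloaded with dill to preprocess new sentences with the
# same vocabulary. encode() is a hypothetical helper, not taken from the original code.
import dill

def encode(sentence, field_path="text.field"):
    with open(field_path, "rb") as f:
        TEXT = dill.load(f)
    tokens = TEXT.preprocess(sentence)   # tokenize, lower-case, apply clean_str
    return TEXT.process([tokens])        # LongTensor of shape (1, seq_len)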
tokens.append('oov')
tokens.append('bos')

# map each token to an integer id
word2id = {}
for idx, word in enumerate(tokens):
    word2id[word] = idx

args.embed_num = len(tokens)
args.class_num = 2
args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

# print("\nParameters:")
# for attr, value in sorted(args.__dict__.items()):
#     print("\t{}={}".format(attr.upper(), value))

model = CNN_Text(args)
if torch.cuda.is_available():
    model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

report_interval = 5000
for epoch in range(1, args.epochs + 1):
    train_batch_i = 0
    batch_counter = 0
    accumulated_loss = 0
    train_sents_scaned = 0
    train_num_correct = 0
    model.train()
    print('--' * 20)
    start_time = time.time()
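# Hypothetical helper (not in the original snippet): how the word2id table built above
# could map a tokenized sentence onto ids, prepending 'bos' and falling back to 'oov'
# for words outside the vocabulary.
def sentence_to_ids(sentence_tokens, word2id):
    ids = [word2id['bos']]
    ids += [word2id.get(w, word2id['oov']) for w in sentence_tokens]
    return torch.tensor(ids, dtype=torch.long).unsqueeze(0)  # shape: (1, seq_len)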
# use CUDA to speed up training
use_cuda = torch.cuda.is_available()

# load the data
train_loader = Data.DataLoader(dataset=CustomDataset(path="train.json", balance=False),
                               batch_size=BATCH_SIZE,
                               shuffle=True)
test_loader = Data.DataLoader(dataset=CustomDataset(path="test.json", balance=False),
                              batch_size=BATCH_SIZE,
                              shuffle=True)

# initialize the model
cnn = CNN_Text()
if use_cuda:
    cnn = cnn.cuda()

optimizer = torch.optim.Adam(cnn.parameters(), lr=LR, weight_decay=0.0005)

# train
for epoch in range(EPOCH):
    print("epoch: {}".format(epoch))
    if epoch % 5 == 0:
        test(cnn, test_loader, use_cuda)
    for step, data in enumerate(train_loader):
        vec, lens, label = data
        # print(vec.shape)
        if use_cuda:
            vec = vec.cuda()
            label = label.cuda()
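        # Hedged continuation (the original snippet breaks off here): a typical step
        # would be forward pass, cross-entropy loss, backward, and parameter update.
        # Whether CNN_Text's forward also consumes `lens` is an assumption.
        output = cnn(vec)
        loss = torch.nn.functional.cross_entropy(output, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if step % 100 == 0:
            print("epoch {} step {} loss {:.4f}".format(epoch, step, loss.item()))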
def train(self, m_2, m_3, m_4):
    word_dict, label_dict = self.divide_two_dict(m_2)

    if self.hyperparameter_1.word_embedding:
        path = "word2vec/glove.6B.100d.txt"
        print("loading word2vec...")
        word_vecs = self.load_my_vector(path, word_dict.m_list)
        print("words already in word2vec: " + str(len(word_vecs)))
        print("loading unknown word vectors and converting to list...")
        word_vecs = self.add_unknow_words_by_average(
            word_vecs, word_dict.m_list, k=self.hyperparameter_1.embed_dim)
        print("unknown word vectors loaded and converted to list")
        # if self.hyperparameter_1.word_embedding:
        self.hyperparameter_1.pretrained_weight = word_vecs
        # pretrained_weight = np.array(self.hyperparameter_1.pretrained_weight)
        # self.embed.weight.data.copy_(torch.from_numpy(pretrained_weight))
        # self.nn = network(2, 2, 2, hidden_layer_weights=None,
        #                   hidden_layer_bias=None, output_layer_weights=None,
        #                   output_layer_bias=None)

    train_example = self.out_example_index(m_2, m_2)
    dev_example = self.out_example_index(m_2, m_3)
    test_example = self.out_example_index(m_2, m_4)
    random.shuffle(train_example)
    random.shuffle(dev_example)
    random.shuffle(test_example)

    self.model = CNN_Text(self.hyperparameter_1)
    optimizer = torch.optim.Adam(self.model.parameters(),
                                 lr=self.hyperparameter_1.lr)

    train_example_idx = self.set_index(train_example)
    random.shuffle(train_example_idx)

    steps = 0
    self.model.train()
    for epoch in range(1, self.hyperparameter_1.epochs + 1):
        batchBlock = self.set_batchBlock(train_example)
        for every_batchBlock in range(batchBlock):
            # gather the examples that belong to this batch
            exams = []
            start_pos = every_batchBlock * self.hyperparameter_1.batch_size
            end_pos = (every_batchBlock + 1) * self.hyperparameter_1.batch_size
            if end_pos > len(train_example):
                end_pos = len(train_example)
            for idx in range(start_pos, end_pos):
                exams.append(train_example[train_example_idx[idx]])
            max_len = self.get_max_sentence_len(exams)

            optimizer.zero_grad()
            feat, label = self.batch(exams, self.hyperparameter_1.batch_size,
                                     max_len)
            label = label.view(len(exams))
            logit = self.model(feat)
            loss = F.cross_entropy(logit, label)
            loss.backward()
            optimizer.step()

            steps += 1
            if steps % self.hyperparameter_1.log_interval == 0:
                train_size = len(train_example)
                corrects = (torch.max(logit, 1)[1].view(
                    label.size()).data == label.data).sum().item()
                accuracy = corrects / self.hyperparameter_1.batch_size * 100.0
                sys.stdout.write(
                    '\rBatch[{}/{}] - loss: {:.6f} acc: {:.4f}%({}/{})'.format(
                        steps, train_size, loss.item(), accuracy, corrects,
                        self.hyperparameter_1.batch_size))
            if steps % self.hyperparameter_1.test_interval == 0:
                self.eval(dev_example, self.model)
            if steps % self.hyperparameter_1.save_interval == 0:
                if not os.path.isdir(self.hyperparameter_1.save_dir):
                    os.makedirs(self.hyperparameter_1.save_dir)
                save_prefix = os.path.join(self.hyperparameter_1.save_dir,
                                           'snapshot')
                save_path = '{}_steps{}.pt'.format(save_prefix, steps)
                torch.save(self.model, save_path)