def create_and_train_network(args):
    name = args['name']
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")
    LOG_DIR = os.path.join('log', f'{name}-{now}')

    train_data = ReviewDataset('train.npy', test_ratio=0.1,
                               label_smoothing=args['label_smoothing'],
                               output_type=args['output_type'],
                               data_type=args['network_type'])
    if args['num_words']:
        train_data.num_words = args['num_words']
    else:
        train_data.find_out_num_words()

    # Define network
    net = Network(args, num_words=train_data.num_words, logdir=LOG_DIR)

    # Load pretrained weights if requested
    if args['load']:
        load_model = args['load']
        net.model.load_weights(f'models/{load_model}')
        net.train_epoch(train_data, args, 1)
        metrics = net.evaluate(train_data, args, 30)
        print(f'Loaded network: {metrics}')

    for epoch in range(args['epochs']):
        net.train_epoch(train_data, args, 100)
        metrics = net.evaluate(train_data, args, 30)
        print(f'Epoch {epoch}: {metrics}')
        # Save checkpoint every 10 epochs
        if epoch % 10 == 9:
            net.model.save_weights(f'models/{name}-{epoch}')
    return net
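# A minimal usage sketch (not from the original source): create_and_train_network
# only requires the dict keys it looks up above; the values below are illustrative
# assumptions, not the project's real configuration.
example_args = {
    'name': 'review-net',          # run name used for the log dir and checkpoint files
    'label_smoothing': 0.1,        # assumed smoothing factor
    'output_type': 'binary',       # assumed output head
    'network_type': 'rnn',         # assumed architecture switch
    'num_words': 20000,            # falsy value triggers find_out_num_words() instead
    'load': None,                  # checkpoint name under models/ to resume from, or None
    'epochs': 50,
}
net = create_and_train_network(example_args)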
def get_dataset(embeddings, paths, val=False):
    """
    Gathers all the review file pathnames and returns a ReviewDataset object.
    If val == True, splits the data into 90% training data and 10% validation data.
    """
    files = []
    labels = []
    for sentiment in paths:
        files.append([])
        labels.append([])
        for dirpath, dirnames, filenames in os.walk(sentiment):
            for filename in filenames:
                files[-1].append(dirpath + '/' + filename)
                if sentiment == path_neg or sentiment == path_test_neg:
                    labels[-1].append(0)
                elif sentiment == path_pos or sentiment == path_test_pos:
                    labels[-1].append(1)

    if val:
        split = int(len(files[0]) * 0.9)
        train_files = files[0][:split]
        train_files.extend(files[1][:split])
        train_labels = labels[0][:split]
        train_labels.extend(labels[1][:split])
        train_data = ReviewDataset(embeddings, train_files, train_labels)

        val_files = files[0][split:]
        val_files.extend(files[1][split:])
        val_labels = labels[0][split:]
        val_labels.extend(labels[1][split:])
        val_data = ReviewDataset(embeddings, val_files, val_labels)
        return train_data, val_data

    # Flatten the per-sentiment lists into a single dataset
    files = [file for files_ in files for file in files_]
    labels = [label for labels_ in labels for label in labels_]
    data = ReviewDataset(embeddings, files, labels)
    return data
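# A hedged usage sketch: path_neg / path_pos / path_test_neg / path_test_pos are the
# module-level directory constants the function already compares against, and
# `embeddings` is assumed to be the pre-loaded word-embedding object ReviewDataset expects.
train_data, val_data = get_dataset(embeddings, [path_neg, path_pos], val=True)
test_data = get_dataset(embeddings, [path_test_neg, path_test_pos])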
        cur_idx += 1
    return result


if __name__ == '__main__':
    THRESH = 0.10
    SAVING_DIR = '../models/'
    MODELS = [
        'best_bert_model_774',
        'best_bert_model_77',
        'best_bert_model_cv0',
        'best_bert_model_cv1',
        'best_bert_model_cv2',
        'best_bert_model_cv3',
        'best_bert_model_cv4'
    ]
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch',
        do_lower_case=True)
    test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv', None, tokenizer)
    test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify,
                             shuffle=False, num_workers=5)

    ret = None
    for name in MODELS:
        model_path = osp.join(SAVING_DIR, name)
        model = OpinioNet.from_pretrained(
            '/home/zydq/.torch/models/bert/chinese-bert_chinese_wwm_pytorch')
        model.load_state_dict(torch.load(model_path))
        model.cuda()
        # Accumulate predictions from each checkpoint for ensembling
        ret = accum_result(ret, eval_epoch(model, test_loader))
        del model
MODELS = list(zip(WEIGHT_NAMES, MODEL_NAMES, THRESHS))

# tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODELS['roberta']['path'], do_lower_case=True)
# test_dataset = ReviewDataset(args.rv, args.lb, tokenizer, 'laptop')
# test_loader = DataLoader(test_dataset, args.bs, collate_fn=test_dataset.batchify, shuffle=False, num_workers=5)

ret = None
raw = None
lb = None
num_model = 0
for weight_name, model_name, thresh in MODELS:
    if not osp.isfile('../models/' + weight_name):
        continue
    num_model += 1
    model_config = PRETRAINED_MODELS[model_name]
    tokenizer = BertTokenizer.from_pretrained(model_config['path'], do_lower_case=True)
    test_dataset = ReviewDataset(args.rv, args.lb, tokenizer, 'laptop')
    test_loader = DataLoader(test_dataset, args.bs, collate_fn=test_dataset.batchify,
                             shuffle=False, num_workers=5)
    if not raw:
        raw = [s[0][0] for s in test_dataset.samples]
    if not lb and args.lb:
        lb = [s[0][1] for s in test_dataset.samples]
    model = OpinioNet.from_pretrained(model_config['path'],
                                      version=model_config['version'],
                                      focal=model_config['focal'])
    print(weight_name)
            if not isbad:
                nmsopns.append(opn)
            results[i] = nmsopns
        # print(results)
        return results


if __name__ == '__main__':
    from pytorch_pretrained_bert import BertTokenizer
    from dataset import ReviewDataset

    tokenizer = BertTokenizer.from_pretrained('/home/zydq/.torch/models/bert/ERNIE',
                                              do_lower_case=True)
    model = OpinioNet.from_pretrained('/home/zydq/.torch/models/bert/ERNIE')
    model.cuda()
    model.train()

    d = ReviewDataset('../data/TRAIN/Train_reviews.csv',
                      '../data/TRAIN/Train_labels.csv', tokenizer)
    b_raw, b_in, b_tgt = d.batchify(d[:10])
    for i in range(len(b_in)):
        b_in[i] = b_in[i].cuda()
    for i in range(len(b_tgt)):
        b_tgt[i] = b_tgt[i].cuda()
    print(b_in)

    # Smoke test: forward pass, loss, and NMS post-processing on a small batch
    probs, logits = model.forward(b_in)
    loss = model.loss(logits, b_tgt)
    result = model.nms(probs)
    print(loss)
    print(result)
    seed=1337,
    # Runtime options
    catch_keyboard_interrupt=True,
    cuda=True,
    expand_filepaths_to_save_dir=True,
    reload_from_files=False,
    train=False,
)

# handle dirs
handle_dirs(args.save_dir)

vectorizer_pth = os.path.join(args.save_dir, args.vectorizer_file)
if args.reload_from_files:
    # training from a checkpoint
    print("Loading dataset and vectorizer")
    dataset = ReviewDataset.load_dataset_and_load_vectorizer(
        args.review_csv, vectorizer_pth)
else:
    print("Loading dataset and creating vectorizer")
    # create dataset and vectorizer
    dataset = ReviewDataset.load_dataset_and_make_vectorizer(args.review_csv)
    dataset.save_vectorizer(vectorizer_pth)
vectorizer = dataset.get_vectorizer()

classifier = ReviewPerceptronClassifier(num_features=len(vectorizer.review_vocab),
                                        num_classes=1)
# classifier = ReviewMLPClassifier(num_features=len(vectorizer.review_vocab), num_classes=1, hidden_layer_dim=[100])

args.classifier = classifier
args.vectorizer = vectorizer
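# A hedged sketch (not part of the original script) of how this classifier could be
# trained once args.train is enabled. It assumes the dataset yields
# {'x_data': ..., 'y_target': ...} dicts, that the classifier returns raw logits,
# and that the batch size, learning rate, and epoch count below are illustrative only.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

device = torch.device('cuda' if args.cuda and torch.cuda.is_available() else 'cpu')
classifier = classifier.to(device)
loss_func = nn.BCEWithLogitsLoss()   # single-logit binary classification
optimizer = optim.Adam(classifier.parameters(), lr=1e-3)

loader = DataLoader(dataset, batch_size=128, shuffle=True)
for epoch in range(5):
    for batch in loader:
        optimizer.zero_grad()
        y_pred = classifier(batch['x_data'].float().to(device)).squeeze(-1)
        loss = loss_func(y_pred, batch['y_target'].float().to(device))
        loss.backward()
        optimizer.step()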
LABEL_DIR = '../data/TRAIN/Train_laptop_corpus_labels.csv'
SUBMIT_DIR = None

with open(THRESH_DIR, 'r', encoding='utf-8') as f:
    thresh_dict = json.load(f)

WEIGHT_NAMES, MODEL_NAMES, THRESHS = [], [], []
for k, v in thresh_dict.items():
    WEIGHT_NAMES.append(k)
    MODEL_NAMES.append(v['name'])
    THRESHS.append(v['thresh'])
MODELS = list(zip(WEIGHT_NAMES, MODEL_NAMES, THRESHS))

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODELS['roberta']['path'], do_lower_case=True)
test_dataset = ReviewDataset(DATA_DIR, None, tokenizer, 'laptop')
test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify,
                         shuffle=False, num_workers=5)

ret = None
num_model = 0
for weight_name, model_name, thresh in MODELS:
    if not osp.isfile('../models/' + weight_name):
        continue
    num_model += 1
    model_config = PRETRAINED_MODELS[model_name]
    tokenizer = BertTokenizer.from_pretrained(model_config['path'], do_lower_case=True)
    test_dataset = ReviewDataset(DATA_DIR, None, tokenizer, 'laptop')
    test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify,
                             shuffle=False, num_workers=5)
    print(model_config)
    model = OpinioNet.from_pretrained(model_config['path'],
                                      version=model_config['version'],
                                      focal=model_config['focal'])
    model.load_state_dict(torch.load('../models/' + weight_name))
    model.cuda()
            }, ignore_index=True)
            cur_idx += 1
        step += 1
    return result


if __name__ == '__main__':
    EP = 100
    SAVING_DIR = '../models/'
    tokenizer = BertTokenizer.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch', do_lower_case=True)
    test_dataset = ReviewDataset('../data/TEST/Test_reviews.csv', None, tokenizer, type='laptop')
    test_loader = DataLoader(test_dataset, 12, collate_fn=test_dataset.batchify,
                             shuffle=False, num_workers=5)

    model = OpinioNet.from_pretrained(
        '/home/zydq/.torch/models/bert/chinese_wwm_ext_pytorch')
    model.load_state_dict(torch.load('../models/saved_best_model_wwm_ext'))
    model.cuda()

    result = eval_epoch(model, test_loader)

    import time
    result.to_csv('../submit/result-' + str(round(time.time())) + '.csv', header=False,