def setup_train_configuration(self, config: Dict[str, Any]) -> None:
    """Set up the training configuration."""
    self.config = config
    self.total_epochs = self.config["EPOCHS"]

    # get datasets
    trainset, testset = utils.get_dataset(
        config["DATASET"],
        config["AUG_TRAIN"],
        config["AUG_TEST"],
        config["AUG_TRAIN_PARAMS"],
        config["AUG_TEST_PARAMS"],
    )
    self.input_size = trainset[0][0].size()
    logger.info("Datasets prepared")

    # transform the training dataset for CutMix augmentation
    if "CUTMIX" in config:
        trainset = CutMix(
            trainset,
            config["MODEL_PARAMS"]["num_classes"],
            **config["CUTMIX"],
        )

    # get dataloaders
    self.trainloader, self.testloader = utils.get_dataloader(
        trainset,
        testset,
        config["BATCH_SIZE"],
        config["N_WORKERS"],
    )
    logger.info("Dataloader prepared")

    # define criterion and optimizer
    self.criterion = get_criterion(
        criterion_name=config["CRITERION"],
        criterion_params=config["CRITERION_PARAMS"],
        device=self.device,
    )

    self.regularizer = None
    if "REGULARIZER" in config:
        self.regularizer = get_regularizer(
            config["REGULARIZER"], config["REGULARIZER_PARAMS"]
        )

    self.optimizer = optim.SGD(
        self.model.parameters(),
        lr=config["LR"],
        momentum=config["MOMENTUM"],
        weight_decay=config["WEIGHT_DECAY"],
        nesterov=config["NESTEROV"],
    )

    # learning rate scheduler
    self.lr_scheduler = get_lr_scheduler(
        config["LR_SCHEDULER"],
        config["LR_SCHEDULER_PARAMS"],
    )
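# A minimal sketch of the config dict consumed by setup_train_configuration above.
# The keys mirror the lookups performed by the method; the concrete values
# (dataset name, augmentation names, scheduler name, numbers) are illustrative
# assumptions only, not values taken from the original project.
example_config = {
    "EPOCHS": 100,
    "DATASET": "CIFAR10",               # assumed dataset identifier
    "AUG_TRAIN": "randaugment_train",   # assumed augmentation policy names
    "AUG_TEST": "simple_augment_test",
    "AUG_TRAIN_PARAMS": {},
    "AUG_TEST_PARAMS": {},
    "BATCH_SIZE": 128,
    "N_WORKERS": 4,
    "CRITERION": "CrossEntropy",        # assumed criterion name
    "CRITERION_PARAMS": {},
    "MODEL_PARAMS": {"num_classes": 10},
    "LR": 0.1,
    "MOMENTUM": 0.9,
    "WEIGHT_DECAY": 5e-4,
    "NESTEROV": True,
    "LR_SCHEDULER": "cosine",           # assumed scheduler name
    "LR_SCHEDULER_PARAMS": {},
    # optional keys:
    # "CUTMIX": {"beta": 1.0, "prob": 0.5},
    # "REGULARIZER": ..., "REGULARIZER_PARAMS": ...,
}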
def test_get_dataset():
    data_dir = utils.get_dataset()
    assert os.path.isdir(data_dir), "Image directory was not created"
    assert isinstance(data_dir, pathlib.Path), "Returned directory is not a pathlib.Path"
    files = os.listdir(data_dir)
    assert len(files) > 0, "No files in image directory."
    for file in files:
        assert file.endswith("jpg")  # only images, no .mat files
        assert os.path.getsize(data_dir / file) > 0, f"{file} is empty."
def model_1(args):
    # create output directories for this run
    if not os.path.isdir(os.getcwd() + '/results/images/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/images/' + args.run_name)
    if not os.path.isdir(os.getcwd() + '/results/history/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/history/' + args.run_name)
    if not os.path.isdir(os.getcwd() + '/results/files/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/files/' + args.run_name)

    datapath = args.datadir
    args.img_size = 224
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_classes(dataset)
    word_dim = 300

    label_criterion = nn.CrossEntropyLoss()
    reconstr_criterion = nn.L1Loss()

    model = BimodalDAEImage(300, 2048, n_classes=len(classes))
    cnn = resnet101(pretrained=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)
    print('\nNum classes: %r, num images: %r' % (len(classes), len(dataset)))

    word_vecs = utils.get_word_vectors(os.getcwd() + '/data/files/wvecs.json',
                                       classes, word_dim)

    loss_hist, metric_hist = {}, {}
    softmax = nn.Softmax(dim=1)

    for epoch in range(args.epochs):
        print('Epoch %r' % epoch)
        log.info('Epoch %r' % epoch)
        loss_hist[epoch], metric_hist[epoch] = {}, {}

        for batch_idx, (img, target_tensor) in enumerate(data_loader):
            batch_acc, batch_loss = [], {'reconstr': [], 'classification': []}
            target_idxs = target_tensor.data.numpy().tolist()
            target_names = [idx_to_class[idx] for idx in target_idxs]
            target_labels = torch.tensor(
                [[1 if i == idx else 0 for i in range(len(classes))]
                 for idx in target_idxs],
                dtype=torch.long)

            # word vectors for the target classes (previously target dist reps)
            target_textual = torch.tensor(
                [word_vecs[name] for name in target_names], dtype=torch.float32)

            # CNN features for each image in the batch
            target_visual = torch.tensor(
                [cnn.forward(img[idx].reshape(1, 3, args.img_size, args.img_size)).data.numpy()
                 for idx in range(len(target_idxs))],
                dtype=torch.float32)

            n_samples = len(target_idxs)
            optimizer.zero_grad()

            img_reconstr, text_reconstr, hidden = model.forward(target_visual,
                                                                target_textual)
            textual_loss = reconstr_criterion(text_reconstr, target_textual)
            textual_loss.backward(retain_graph=True)
            visual_loss = reconstr_criterion(img_reconstr, target_visual)
            visual_loss.backward(retain_graph=True)

            preds = softmax(hidden)
            pred_loss = label_criterion(preds, target_tensor)
            pred_loss.backward()
            optimizer.step()

        if epoch % 10 == 0:
            state = {'epoch': epoch + 1,
                     'state_dict': model.state_dict(),
                     'optimizer': optimizer.state_dict()}
            torch.save(state, os.getcwd() + "/model_states/" + args.run_name)
    return
def eval(args):
    args = handle_args(args)
    # create output directories for this run
    if not os.path.isdir(os.getcwd() + '/results/images/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/images/' + args.run_name)
    if not os.path.isdir(os.getcwd() + '/results/history/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/history/' + args.run_name)
    if not os.path.isdir(os.getcwd() + '/results/files/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/files/' + args.run_name)

    datapath = args.datadir
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_classes(dataset)
    word_dim = 300
    label_dim = len(classes)  # TODO: restrict to classes shared with QuickDraw

    label_criterion = nn.CrossEntropyLoss()
    reconstr_criterion = nn.MSELoss()

    model = DistributedWordLabeller(width=args.img_size, height=args.img_size,
                                    word_dim=word_dim, label_dim=label_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)
    print('\nNum classes: %r, num images: %r' % (len(classes), len(dataset)))

    word_vecs = utils.get_word_vectors('/data/nlp/glove/glove_300d.json',
                                       classes, word_dim)
    matrix = np.zeros((len(classes), len(classes)))

    if args.eval == 'True':
        datapath = args.test_datadir
        dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                                 args.batch_size)
        checkpoint = torch.load(os.getcwd() + '/model_states/' + args.run_name)
        model.load_state_dict(checkpoint['state_dict'])

        with torch.no_grad():
            for batch_idx, (img, target_tensor) in enumerate(data_loader):
                target_idxs = target_tensor.data.numpy().tolist()
                target_names = [idx_to_class[idx] for idx in target_idxs]
                print(target_names)
                target_labels = torch.tensor(
                    [[1 if i == idx else 0 for i in range(len(classes))]
                     for idx in target_idxs],
                    dtype=torch.float32)

                reconstr, word_dist, label_dist = model.forward(img)
                labels = model.pred_labels(label_dist)
                matrix = update_label_matrix(matrix, labels, target_idxs)

        avg_acc, metric_dict = matrix_to_metrics(matrix, idx_to_class)
        print(avg_acc)
        print(metric_dict)

        # write the confusion matrix and metrics before returning
        f = open(os.getcwd() + '/results/files/' + args.run_name +
                 '/matrix.json', 'w+')
        temp = {'matrix': matrix.tolist(), 'metrics': metric_dict,
                'avg_acc': avg_acc}
        f.write(json.dumps(temp))
        return
def save_encodings(args):
    datapath = args.datadir
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_temp_classes(dataset)
    word_dim = 300
    label_dim = len(classes)

    model = DistributedWordOnly(width=args.img_size, height=args.img_size,
                                word_dim=word_dim, label_dim=label_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)

    class_dict = {class_name: [] for class_name in classes}
    print(class_dict)

    args.run_name = 'photo-word-only'
    checkpoint = torch.load(os.getcwd() + '/model_states/' + args.run_name)
    model.load_state_dict(checkpoint['state_dict'])
    print('Model loaded!')

    encoding_dict = {}
    with torch.no_grad():
        for batch_idx, (img, target_tensor) in enumerate(data_loader):
            target_idxs = target_tensor.data.numpy().tolist()
            target_names = [idx_to_class[idx] for idx in target_idxs]
            print(target_names)
            target_labels = torch.tensor(
                [[1 if i == idx else 0 for i in range(len(classes))]
                 for idx in target_idxs],
                dtype=torch.float32)

            word_dist, label_dist = model.forward(img)
            print(word_dist)
            word_reps = word_dist.data.numpy()
            print(word_reps)
            print(len(word_reps))

            # keep the most recent word-space encoding seen for each target class
            for idx in range(len(word_reps)):
                target = target_idxs[idx]
                if target not in encoding_dict.keys():
                    encoding_dict[target] = []
                encoding_dict[target] = list(word_reps[idx].tolist())
            print(encoding_dict)

            labels = model.pred_labels(label_dist)
            print(target_idxs)
            print(labels)
            train_matrix = update_label_matrix(
                np.zeros((len(classes), len(classes))), labels, target_idxs)
            avg_acc, metric_dict = matrix_to_metrics(train_matrix, idx_to_class)

    print('Done!')
    for item in encoding_dict:
        print(item)
        print(encoding_dict[item])
        print(len(encoding_dict[item]))

    f = open(os.getcwd() + '/results/files/' + args.run_name +
             '/encoding_dict.json', 'w+')
    f.write(json.dumps(encoding_dict))
def extract_weights(args):
    args = handle_args(args)
    datapath = args.datadir
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_temp_classes(dataset)
    word_dim = 300
    label_dim = len(classes)
    print('Model: ', args.run_name)

    model = DistributedWordLabeller(width=args.img_size, height=args.img_size,
                                    word_dim=word_dim, label_dim=label_dim)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)

    # load the saved confusion matrix and report per-class metrics
    f = open(os.getcwd() + '/results/files/' + args.run_name + '/matrix.json', 'r')
    for line in f:
        temp = json.loads(line)
        matrix = temp['matrix']

    print(idx_to_class)
    avg_acc, metric_dict = matrix_to_metrics(matrix, idx_to_class)
    print(idx_to_class)
    for idx in metric_dict:
        print(idx_to_class[idx])
        print(metric_dict[idx])
    return

    # NOTE: everything below the return above is unreachable and kept for reference.
    # Get the weights from the linear layer; they form per-class representations
    # whose pairwise similarities are compared with word-vector similarities.
    matrix = np.zeros((len(classes), len(classes)))
    identity = torch.eye(300)
    m = model.label_classifier(identity)
    m = torch.transpose(m, 0, 1)

    f = open(os.getcwd() + '/results/files/sketch_sketchy/' + 'idx_to_class.json', 'r')
    for line in f:
        idx_to_class = json.loads(line)

    rep_dict = {}
    for i in range(len(m)):
        class_name = idx_to_class[str(i)]
        rep_dict[class_name] = m[i].data.numpy()

    f = open(os.getcwd() + '/data/files/wvecs.json', 'r')
    for line in f:
        wvecs = json.loads(line)
    for name in wvecs:
        wvecs[name] = np.array(wvecs[name])

    classes = list(rep_dict.keys())
    sims = []

    def get_sim(class_1, class_2, rep_dict):
        # cosine similarity between the two class representations
        dot_product = rep_dict[class_1].reshape(-1, 1) * rep_dict[class_2].reshape(-1, 1)
        norm_1 = np.linalg.norm(rep_dict[class_1].reshape(-1, 1))
        norm_2 = np.linalg.norm(rep_dict[class_2].reshape(-1, 1))
        cos_sim = np.sum(dot_product) / (norm_1 * norm_2)
        return cos_sim

    for (class_1, class_2) in itertools.product(classes, classes):
        pair = class_1 + '#' + class_2
        cos_sim = get_sim(class_1, class_2, rep_dict)
        word_sim = get_sim(class_1, class_2, wvecs)
        sims.append((pair, round(cos_sim, 3), round(word_sim, 3)))

    sorted_sims = sorted(sims, key=operator.itemgetter(1), reverse=True)
    for item in sorted_sims:
        if item[1] > 0.9:
            continue
        print(item)
    print(len(sorted_sims))

    def get_corrs(pair_tuples):
        # correlate model-derived similarities with human semantic/visual ratings
        held_out = 'sailboat,piano,sheep,pistol,snail,harp,cat,rocket,cannon,rabbit'
        held_out = held_out.split(',')
        f = open(os.getcwd() + '/data/files/sem-vis-sketchy.tsv', 'r')
        lines = [line.strip().split('\t') for line in f.readlines()]
        pairs = {item[0]: item[1] for item in pair_tuples}
        vals = []
        for line in lines:
            class_1, class_2, sem, vis = line[0], line[1], float(line[2]), float(line[3])
            if class_1 in held_out or class_2 in held_out:
                continue
            pair = class_1 + '#' + class_2
            if pair in pairs:
                vals.append((pair, sem, vis, pairs[pair]))
        for val in vals:
            print(val)

        cos_list = [item[-1] for item in vals]
        sem_list = [item[1] for item in vals]
        vis_list = [item[2] for item in vals]
        spearman_sem = stats.spearmanr(cos_list, sem_list)
        spearman_vis = stats.spearmanr(cos_list, vis_list)
        pearson_sem = stats.pearsonr(cos_list, sem_list)
        pearson_vis = stats.pearsonr(cos_list, vis_list)
        print('Semantic: Pearson: %f, Spearman: %f' % (round(pearson_sem[0], 3),
                                                       round(spearman_sem[0], 3)))
        print('Visual: Pearson: %f, Spearman: %f' % (round(pearson_vis[0], 3),
                                                     round(spearman_vis[0], 3)))

    print('\n\nImage!')
    get_corrs([(item[0], item[1]) for item in sims])
    #print('Word!')
    #get_corrs([(item[0], item[2]) for item in sims])
    return

    # NOTE: unreachable; near-duplicate of the evaluation loop in eval() above
    if args.eval == 'True':
        datapath = args.test_datadir
        dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                                 args.batch_size)
        checkpoint = torch.load(os.getcwd() + '/model_states/' + args.run_name)
        model.load_state_dict(checkpoint['state_dict'])
        with torch.no_grad():
            for batch_idx, (img, target_tensor) in enumerate(data_loader):
                target_idxs = target_tensor.data.numpy().tolist()
                target_names = [idx_to_class[idx] for idx in target_idxs]
                print(target_names)
                target_labels = torch.tensor(
                    [[1 if i == idx else 0 for i in range(len(classes))]
                     for idx in target_idxs],
                    dtype=torch.float32)
                reconstr, word_dist, label_dist = model.forward(img)
                labels = model.pred_labels(label_dist)
                matrix = update_label_matrix(matrix, labels, target_idxs)
        avg_acc, metric_dict = matrix_to_metrics(matrix, idx_to_class)
        print(avg_acc)
        print(metric_dict)
        return
def model_1(args):
    # create output directories for this run
    if not os.path.isdir(os.getcwd() + '/results/images/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/images/' + args.run_name)
    if not os.path.isdir(os.getcwd() + '/results/history/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/history/' + args.run_name)
    if not os.path.isdir(os.getcwd() + '/results/files/' + args.run_name):
        os.mkdir(os.getcwd() + '/results/files/' + args.run_name)

    datapath = args.datadir
    args.img_size = 224
    dataset, data_loader = utils.get_dataset(datapath, args.img_size,
                                             args.batch_size)
    classes, class_to_idx, idx_to_class = utils.get_classes(dataset)
    word_dim = 300
    label_dim = len(classes)

    model = BimodalDAEImage(300, 2048, n_classes=len(classes))
    cnn = resnet101(pretrained=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=1e-5)
    print('\nNum classes: %r, num images: %r' % (len(classes), len(dataset)))

    word_vecs = utils.get_wvecs_json(os.getcwd() + '/data/files/wvecs.json',
                                     classes, word_dim)

    encoding_dict = {}
    with torch.no_grad():
        for batch_idx, (img, target_tensor) in enumerate(data_loader):
            target_idxs = target_tensor.data.numpy().tolist()
            target_names = [idx_to_class[idx] for idx in target_idxs]
            target_labels = torch.tensor(
                [[1 if i == idx else 0 for i in range(len(classes))]
                 for idx in target_idxs],
                dtype=torch.long)

            # word vectors for the target classes (previously target dist reps)
            target_textual = torch.tensor(
                [word_vecs[name] for name in target_names], dtype=torch.float32)

            # CNN features for each image in the batch
            target_visual = torch.tensor(
                [cnn.forward(img[idx].reshape(1, 3, args.img_size, args.img_size)).data.numpy()
                 for idx in range(len(target_idxs))],
                dtype=torch.float32)

            n_samples = len(target_idxs)
            img_reconstr, text_reconstr, hidden = model.forward(target_visual,
                                                                target_textual)
            print('Hidden', hidden.size())

            # collect the hidden representation for each target class
            reps = hidden.data.numpy()
            for idx in range(len(reps)):
                target = target_names[idx]
                print(target)
                if target not in encoding_dict.keys():
                    encoding_dict[target] = []
                encoding_dict[target].append(list(reps[idx].tolist()))

    f = open(os.getcwd() + '/results/files/' + args.run_name +
             '/encoding_dict.json', 'w+')
    f.write(json.dumps(encoding_dict))
    print('Eval done!')
def main(args):
    """
    Call the configuration function of the model, build the model, load the data,
    and then start training.

    model_config: a JSON file with the hyperparameters, such as dropout rate,
        learning rate, num_tasks and so on.
    num_tasks: the number of tasks contained in each dataset; it depends on the dataset.
    DownstreamModel: a supervised GNN model, built on the PretrainGNNModel for the
        different pretraining strategies, that predicts the downstream tasks.
    """
    compound_encoder_config = load_json_config(args.compound_encoder_config)
    model_config = load_json_config(args.model_config)
    if args.dropout_rate is not None:
        compound_encoder_config['dropout_rate'] = args.dropout_rate
        model_config['dropout_rate'] = args.dropout_rate

    task_names = get_downstream_task_names(args.dataset_name, args.data_path)
    model_config['num_tasks'] = len(task_names)

    ### build model
    compound_encoder = PretrainGNNModel(compound_encoder_config)
    model = DownstreamModel(model_config, compound_encoder)
    criterion = nn.BCELoss(reduction='none')

    encoder_params = compound_encoder.parameters()
    head_params = exempt_parameters(model.parameters(), encoder_params)
    encoder_opt = paddle.optimizer.Adam(args.encoder_lr, parameters=encoder_params)
    head_opt = paddle.optimizer.Adam(args.head_lr, parameters=head_params)
    print('Total param num: %s' % (len(model.parameters())))
    print('Encoder param num: %s' % (len(encoder_params)))
    print('Head param num: %s' % (len(head_params)))

    if args.init_model is not None and args.init_model != "":
        compound_encoder.set_state_dict(paddle.load(args.init_model))
        print('Load state_dict from %s' % args.init_model)

    ### load data
    # featurizer:
    #     Generates features from the raw data, returns the graph data, and
    #     collates the graph data into the feed dictionary.
    # splitter:
    #     Split type of the dataset (random, scaffold, or random with scaffold),
    #     selected via `args.split_type`. `ScaffoldSplitter` first orders the
    #     compounds by Bemis-Murcko scaffold, then takes the first `frac_train`
    #     proportion as the train set, the next `frac_valid` proportion as the
    #     valid set, and the rest as the test set. `ScaffoldSplitter` better
    #     evaluates the generalization ability of the model on out-of-distribution
    #     samples. Other splitters such as `RandomSplitter`, `RandomScaffoldSplitter`
    #     and `IndexSplitter` are also available.
    dataset = get_dataset(args.dataset_name, args.data_path, task_names)
    dataset.transform(DownstreamTransformFn(), num_workers=args.num_workers)
    splitter = create_splitter(args.split_type)
    train_dataset, valid_dataset, test_dataset = splitter.split(
        dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    print("Train/Valid/Test num: %s/%s/%s" % (
        len(train_dataset), len(valid_dataset), len(test_dataset)))

    ### start train
    # For each epoch, run the train function and compute the train loss, then
    # compute the valid and test AUC, print them, and save the model checkpoints.
    list_val_auc, list_test_auc = [], []
    collate_fn = DownstreamCollateFn(
        atom_names=compound_encoder_config['atom_names'],
        bond_names=compound_encoder_config['bond_names'])
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, model, train_dataset, collate_fn, criterion,
                           encoder_opt, head_opt)
        val_auc = evaluate(args, model, valid_dataset, collate_fn)
        test_auc = evaluate(args, model, test_dataset, collate_fn)

        list_val_auc.append(val_auc)
        list_test_auc.append(test_auc)
        test_auc_by_eval = list_test_auc[np.argmax(list_val_auc)]
        print("epoch:%s train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%s val/auc:%s" % (epoch_id, val_auc))
        print("epoch:%s test/auc:%s" % (epoch_id, test_auc))
        print("epoch:%s test/auc_by_eval:%s" % (epoch_id, test_auc_by_eval))
        paddle.save(compound_encoder.state_dict(),
                    '%s/epoch%d/compound_encoder.pdparams' % (args.model_dir, epoch_id))
        paddle.save(model.state_dict(),
                    '%s/epoch%d/model.pdparams' % (args.model_dir, epoch_id))

    outs = {
        'model_config': basename(args.model_config).replace('.json', ''),
        'metric': '',
        'dataset': args.dataset_name,
        'split_type': args.split_type,
        'batch_size': args.batch_size,
        'dropout_rate': args.dropout_rate,
        'encoder_lr': args.encoder_lr,
        'head_lr': args.head_lr,
        'exp_id': args.exp_id,
    }
    best_epoch_id = np.argmax(list_val_auc)
    for metric, value in [
            ('test_auc', list_test_auc[best_epoch_id]),
            ('max_valid_auc', np.max(list_val_auc)),
            ('max_test_auc', np.max(list_test_auc))]:
        outs['metric'] = metric
        print('\t'.join(['FINAL'] + ["%s:%s" % (k, outs[k]) for k in outs] + [str(value)]))
def main():
    # Argparse custom actions
    class SetModes(argparse.Action):
        """Set the modes of operation."""
        def __call__(self, parser, args, values, option_string=None):
            for value in values:
                setattr(args, value, True)

    # yapf: disable
    parser = argparse.ArgumentParser(description='Fake News Classifier')
    # Initialization
    parser.add_argument('--init', action='store_true', default=False,
                        help='perform initialization')
    # Modes
    parser.add_argument('-m', '--mode', action=SetModes, nargs='+',
                        choices=['train', 'test', 'demo', 'plot'],
                        help='specify the mode of operation: train, test, demo, plot')
    parser.add_argument('--train', action='store_true', default=False,
                        help='train the model')
    parser.add_argument('--test', action='store_true', default=False,
                        help='test the model (must either train or load a model)')
    parser.add_argument('--demo', action='store_true', default=False,
                        help='demo the model on linewise samples from a file (must either train or load a model)')
    parser.add_argument('--plot', action='store_true', default=False,
                        help='plot training data (must either train or have existing training data)')
    # Options
    parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('-c', '--config', type=str,
                        help='path to configuration json file (overrides args)')
    parser.add_argument('--data-loader', type=str, default='BatchLoader',
                        help='data loader to use (default: "BatchLoader")')
    parser.add_argument('--dataset', type=str, default='FakeRealNews',
                        help='dataset to use (default: "FakeRealNews")')
    parser.add_argument('-e', '--epochs', type=int, default=10,
                        help='number of epochs to train (default: 10)')
    parser.add_argument('-f', '--file', type=str,
                        help='specify a file for another argument')
    parser.add_argument('--lr', '--learning-rate', dest='learning_rate', type=float, default=1e-4,
                        help='learning rate (default: 1e-4)')
    parser.add_argument('-l', '--load', type=int, metavar='EPOCH',
                        help='load a model and its training data')
    parser.add_argument('--loss', type=str, default='BCEWithLogitsLoss',
                        help='loss function (default: "BCEWithLogitsLoss")')
    parser.add_argument('--model', type=str, default='FakeNewsNet',
                        help='model architecture to use (default: "FakeNewsNet")')
    parser.add_argument('-s', '--sample-size', type=int, metavar='N',
                        help='limit sample size for training')
    parser.add_argument('--seed', type=int, default=0,
                        help='random seed (default: 0)')
    parser.add_argument('--save', action='store_true', default=True,
                        help='save model checkpoints and training data (default: True)')
    parser.add_argument('--no-save', dest='save', action='store_false')
    args = parser.parse_args()
    # yapf: enable

    # Print help if no args
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit()

    # Configure logger
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    # Load configuration file if specified
    if args.config is not None:
        utils.load_config(args)

    # Exit if no mode is specified
    if not args.init and not args.train and not args.test and not args.demo and not args.plot:
        logging.error('No mode specified. Please specify with: --mode {init,train,test,demo,plot}')
        exit(1)
    # Exit on `--load` if run directory not found
    if (args.load is not None or (args.plot and not args.train)) and not os.path.isdir(utils.get_path(args)):
        logging.error('Could not find directory for current configuration {}'.format(utils.get_path(args)))
        exit(1)
    # Exit on `test` or `demo` without `train` or `--load EPOCH`
    if (args.test or args.demo) and not (args.train or args.load is not None):
        logging.error('Cannot run `test` or `demo` without a model. Try again with either `train` or `--load EPOCH`.')
        exit(1)
    # Exit on `demo` without a sample file
    if args.demo and not args.file:
        logging.error('Cannot run `demo` without a file. Try again with `--file FILE`.')
        exit(1)

    # Setup run directory
    if args.save and not args.init and not (args.train or args.test or args.demo or args.plot):
        utils.save_config(args)
        path = utils.get_path(args) + '/output.log'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        logging.getLogger().addHandler(logging.FileHandler(path))

    # Set random seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Variable declarations
    training_data = None

    # Load GloVe vocabulary
    if args.init or args.train or args.test or args.demo:
        glove = torchtext.vocab.GloVe(name='6B', dim=50)

    # Perform initialization
    if args.init or args.train or args.test:
        # Determine which dataset to use
        dataset = utils.get_dataset(args)
        # Preload the dataset
        dataset.load()
        # Get preprocessed samples
        samples = preprocessing.get_samples(dataset, glove, args.init)
        random.shuffle(samples)

    # DataLoader setup for `train`, `test`
    if args.train or args.test:
        # Select data loader to use
        DataLoader = utils.get_data_loader(args)
        # Split samples
        split_ratio = [.6, .2, .2]
        trainset, validset, testset = list(DataLoader.splits(samples, split_ratio))
        if args.sample_size is not None:  # limit samples used in training
            trainset = trainset[:args.sample_size]
            validset = validset[:int(args.sample_size * split_ratio[1] / split_ratio[0])]
        # Get data loaders
        train_loader, valid_loader, test_loader = [
            DataLoader(split, batch_size=args.batch_size)
            for split in [trainset, validset, testset]
        ]

    # Load samples for demo
    if args.demo:
        if os.path.isfile(args.file):
            # Read samples from the input file
            with open(args.file, 'r') as f:
                samples = [line for line in f if line.strip()]
            data = pd.DataFrame({'text': samples, 'label': [0.5] * len(samples)})
            # Preprocess samples
            preprocessing.clean(data)
            samples = preprocessing.encode(data, glove)
            samples = [(torch.tensor(text).long(), label) for text, label in samples]
            # Select data loader to use
            DataLoader = utils.get_data_loader(args)
            # Get data loader
            data_loader = DataLoader(samples, batch_size=1, shuffle=False)
        else:
            logging.error('Could not find file for demo at {}'.format(args.file))
            exit(1)

    # Model setup for `train`, `test`, `demo`
    if args.train or args.test or args.demo:
        # Create the model
        model = utils.get_model(glove, args)
        # Load a model
        if args.load is not None:
            utils.load_model(args.load, model, args)

    # Run `train`
    if args.train:
        training_data = training.train(model, train_loader, valid_loader, args)

    # Run `test`
    if args.test:
        if args.train or args.load is not None:
            criterion = utils.get_criterion(args.loss)
            acc, loss = training.evaluate(model, test_loader, criterion)
            logging.info('Testing accuracy: {:.4%}, loss: {:.6f}'.format(acc, loss))
        else:
            logging.error('No model loaded for testing')
            exit(1)

    # Run `demo`
    if args.demo:
        if args.train or args.load is not None:
            model.eval()  # set model to evaluate mode
            logging.info('-- Results --')
            for i, (text, _) in enumerate(data_loader):
                preview = data['text'][i][:32] + '...'
                out = model(text).flatten()
                prob = torch.sigmoid(out)  # apply sigmoid to get a probability
                pred = (prob > 0.5).long()  # predict `true` if greater than 0.5
                label = ['fake', 'true'][pred.item()]
                label = '{}{}{}'.format('\033[92m' if pred.item() else '\033[93m', label, '\033[0m')
                confidence = (prob if pred.item() else 1 - prob).item()
                logging.info('Report {}: {} with {:.2%} confidence - "{}"'.format(
                    i, label, confidence, preview))
        else:
            logging.error('No model loaded for demo')
            exit(1)

    # Run `plot`
    if args.plot:
        if training_data is None:
            training_data = utils.load_training_data(args, allow_missing=False)
        if args.load is not None and not args.train:
            for k, v in training_data.items():
                training_data[k] = v[:args.load + 1]
        logging.info('Plotting training data')
        training.plot(training_data)
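# Illustrative invocations of the CLI defined above. The entry-point filename
# `main.py` is an assumption; the flags are the ones declared by the parser.
#
#   python main.py --init                        # one-off dataset preprocessing
#   python main.py -m train test -e 20 -b 32     # train for 20 epochs, then test
#   python main.py --load 19 -m demo -f samples.txt
#   python main.py --load 19 -m plot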
It may be necessary to gather the resources manually depending on your needs.
"""
# standard library
import os
import zipfile
from pathlib import Path

# local
from src.config import PATHS, GEONAMES, MAPPING
from src.utils import get_dataset, download_from_url

# change the working directory
os.chdir('../')

# download the place-name data files
get_dataset(GEONAMES, PATHS.resources / 'geonames')

# download shapefiles
for field, url in zip(MAPPING._fields, MAPPING):
    suffix = Path(url).suffix
    path_out = PATHS.resources / 'shapefiles' / field
    filename = f"{field}{suffix}"
    if not (path_out / filename).exists():
        download_from_url(url, filename=filename, path_out=path_out)
        if suffix == '.zip':
            with zipfile.ZipFile(path_out / filename, 'r') as zip_ref:
                zip_ref.extractall(path_out)
from src.utils import get_dataset
from src.configuration import Configuration

dataset = get_dataset(Configuration.from_file('config.cfg'))

# count how many songs in the dataset have lyrics
i = 0
for key in dataset.keys():
    song = dataset[key]
    if song.lyrics is not None:
        i += 1
print(i)
def main(args):
    """
    Call the configuration function of the model, build the model, load the data,
    and then start training.

    model_config: a JSON file with the hyperparameters, such as dropout rate,
        learning rate, num_tasks and so on.
    num_tasks: the number of tasks contained in each dataset; it depends on the dataset.
    """
    ### config for the body
    compound_encoder_config = load_json_config(args.compound_encoder_config)
    if args.dropout_rate is not None:
        compound_encoder_config['dropout_rate'] = args.dropout_rate

    ### config for the downstream task
    task_type = 'regr'
    metric = get_metric(args.dataset_name)
    task_names = get_downstream_task_names(args.dataset_name, args.data_path)
    dataset_stat = get_dataset_stat(args.dataset_name, args.data_path, task_names)
    label_mean = np.reshape(dataset_stat['mean'], [1, -1])
    label_std = np.reshape(dataset_stat['std'], [1, -1])

    model_config = load_json_config(args.model_config)
    if args.dropout_rate is not None:
        model_config['dropout_rate'] = args.dropout_rate
    model_config['task_type'] = task_type
    model_config['num_tasks'] = len(task_names)
    print('model_config:')
    print(model_config)

    ### build model
    compound_encoder = GeoGNNModel(compound_encoder_config)
    model = DownstreamModel(model_config, compound_encoder)
    if metric == 'square':
        criterion = nn.MSELoss()
    else:
        criterion = nn.L1Loss()

    encoder_params = compound_encoder.parameters()
    head_params = exempt_parameters(model.parameters(), encoder_params)
    encoder_opt = paddle.optimizer.Adam(args.encoder_lr, parameters=encoder_params)
    head_opt = paddle.optimizer.Adam(args.head_lr, parameters=head_params)
    print('Total param num: %s' % (len(model.parameters())))
    print('Encoder param num: %s' % (len(encoder_params)))
    print('Head param num: %s' % (len(head_params)))
    for i, param in enumerate(model.named_parameters()):
        print(i, param[0], param[1].name)

    if args.init_model is not None and args.init_model != "":
        compound_encoder.set_state_dict(paddle.load(args.init_model))
        print('Load state_dict from %s' % args.init_model)

    ### load data
    if args.task == 'data':
        print('Preprocessing data...')
        dataset = get_dataset(args.dataset_name, args.data_path, task_names)
        transform_fn = DownstreamTransformFn()
        dataset.transform(transform_fn, num_workers=args.num_workers)
        dataset.save_data(args.cached_data_path)
        return
    else:
        if args.cached_data_path is None or args.cached_data_path == "":
            print('Processing data...')
            dataset = get_dataset(args.dataset_name, args.data_path, task_names)
            transform_fn = DownstreamTransformFn()
            dataset.transform(transform_fn, num_workers=args.num_workers)
        else:
            print('Read preprocessing data...')
            dataset = InMemoryDataset(npz_data_path=args.cached_data_path)

    splitter = create_splitter(args.split_type)
    train_dataset, valid_dataset, test_dataset = splitter.split(
        dataset, frac_train=0.8, frac_valid=0.1, frac_test=0.1)
    print("Train/Valid/Test num: %s/%s/%s" % (
        len(train_dataset), len(valid_dataset), len(test_dataset)))
    print('Train min/max/mean %s/%s/%s' % get_label_stat(train_dataset))
    print('Valid min/max/mean %s/%s/%s' % get_label_stat(valid_dataset))
    print('Test min/max/mean %s/%s/%s' % get_label_stat(test_dataset))

    ### start train
    list_val_metric, list_test_metric = [], []
    collate_fn = DownstreamCollateFn(
        atom_names=compound_encoder_config['atom_names'],
        bond_names=compound_encoder_config['bond_names'],
        bond_float_names=compound_encoder_config['bond_float_names'],
        bond_angle_float_names=compound_encoder_config['bond_angle_float_names'],
        task_type=task_type)
    for epoch_id in range(args.max_epoch):
        train_loss = train(args, model, label_mean, label_std, train_dataset,
                           collate_fn, criterion, encoder_opt, head_opt)
        val_metric = evaluate(args, model, label_mean, label_std, valid_dataset,
                              collate_fn, metric)
        test_metric = evaluate(args, model, label_mean, label_std, test_dataset,
                               collate_fn, metric)

        list_val_metric.append(val_metric)
        list_test_metric.append(test_metric)
        # lower is better for regression metrics, so select by the minimum valid metric
        test_metric_by_eval = list_test_metric[np.argmin(list_val_metric)]
        print("epoch:%s train/loss:%s" % (epoch_id, train_loss))
        print("epoch:%s val/%s:%s" % (epoch_id, metric, val_metric))
        print("epoch:%s test/%s:%s" % (epoch_id, metric, test_metric))
        print("epoch:%s test/%s_by_eval:%s" % (epoch_id, metric, test_metric_by_eval))
        paddle.save(compound_encoder.state_dict(),
                    '%s/epoch%d/compound_encoder.pdparams' % (args.model_dir, epoch_id))
        paddle.save(model.state_dict(),
                    '%s/epoch%d/model.pdparams' % (args.model_dir, epoch_id))

    outs = {
        'model_config': basename(args.model_config).replace('.json', ''),
        'metric': '',
        'dataset': args.dataset_name,
        'split_type': args.split_type,
        'batch_size': args.batch_size,
        'dropout_rate': args.dropout_rate,
        'encoder_lr': args.encoder_lr,
        'head_lr': args.head_lr,
    }
    best_epoch_id = np.argmin(list_val_metric)
    for metric, value in [
            ('test_%s' % metric, list_test_metric[best_epoch_id]),
            ('max_valid_%s' % metric, np.min(list_val_metric)),
            ('max_test_%s' % metric, np.min(list_test_metric))]:
        outs['metric'] = metric
        print('\t'.join(['FINAL'] + ["%s:%s" % (k, outs[k]) for k in outs] + [str(value)]))
# paths
PRETRAINED_PATH = 'huggingface/'
SQUAD_TRAIN_DATA_PATH = 'data/squad/train-v2.0.json'
SQUAD_VAL_DATA_PATH = 'data/squad/dev-v2.0.json'
MODEL_SAVE_PATH = 'models/with_answers_only/'
LOG_DIR = 'logs/gradient_tape/'

if not os.path.exists(MODEL_SAVE_PATH):
    os.makedirs(MODEL_SAVE_PATH)

if TENSORBOARD:
    %load_ext tensorboard
    %tensorboard --logdir logs/gradient_tape

if LOAD_DATA:
    train_dataset, input_tokens_train = get_dataset(SQUAD_TRAIN_DATA_PATH, PRETRAINED_PATH)
    train_dataset = train_dataset.batch(BATCH_SIZE)
    val_dataset, input_tokens_val = get_dataset(SQUAD_VAL_DATA_PATH, PRETRAINED_PATH)
    val_dataset = val_dataset.batch(BATCH_SIZE)

if SHUFFLE:
    # doesn't work with reshuffle_each_iteration --> different data at every epoch?
    train_dataset = train_dataset.shuffle(buffer_size=SHUFFLE_BUF, reshuffle_each_iteration=True)
    val_dataset = val_dataset.shuffle(buffer_size=SHUFFLE_BUF, reshuffle_each_iteration=True)

tokenizer = AlbertTokenizer.from_pretrained(PRETRAINED_PATH)
with open(PRETRAINED_PATH + 'config.json') as f:
    config = json.load(f)

# training configurations loop
for desired_batch_size, lr in itertools.product(DESIRED_BATCH_SIZES, LR):
    model = QaAlbertModel(config, PRETRAINED_PATH)