def create_dataset(lookoutvision_client, s3_resource, bucket, project_name, dataset_images, dataset_type):
    """
    Build a manifest file from the images in the supplied bucket, then create
    a Lookout for Vision dataset from that manifest.

    :param lookoutvision_client: A Boto3 Lookout for Vision client.
    :param s3_resource: A Boto3 Amazon S3 client.
    :param bucket: The bucket that stores the manifest file.
    :param project_name: The project in which to create the dataset.
    :param dataset_images: The location of the images referenced by the dataset.
    :param dataset_type: The type of dataset to create (train or test).
    """
    print(f"Creating {dataset_type} dataset...")
    manifest_uri = f"s3://{bucket}/{project_name}/manifests/{dataset_type}.manifest"
    logger.info("Creating %s manifest file in %s.", dataset_type, manifest_uri)
    Datasets.create_manifest_file_s3(s3_resource, dataset_images, manifest_uri)
    logger.info("Create %s dataset for project %s", dataset_type, project_name)
    Datasets.create_dataset(lookoutvision_client, project_name, manifest_uri, dataset_type)
def test_create_dataset(make_stubber, error_code):
    """Exercise Datasets.create_dataset: happy path and ClientError propagation."""
    client = boto3.client('lookoutvision')
    stubber = make_stubber(client)
    project = 'test-project_name'
    bucket, object_key = 'test-bucket', 'test-object'
    manifest = f'{bucket}/{object_key}'
    split = 'train'
    status, message = 'CREATE_COMPLETE', 'Test message'

    stubber.stub_create_dataset(project, split, bucket, object_key, status,
                                message, error_code=error_code)
    if error_code is None:
        # Creation polls the dataset status afterwards.
        stubber.stub_describe_dataset(project, split, status, message)
        Datasets.create_dataset(client, project, manifest, split)
        return
    with pytest.raises(ClientError) as exc_info:
        Datasets.create_dataset(client, project, manifest, split)
    assert exc_info.value.response['Error']['Code'] == error_code
def test_update_dataset_entries(make_stubber, error_code):
    """Exercise Datasets.update_dataset_entries with a stubbed client."""
    client = boto3.client('lookoutvision')
    stubber = make_stubber(client)
    project = 'test-project_name'
    updates_file = 'test/test_manifests/updates.manifest'
    split = 'train'
    message = 'Test message'

    # The stub must see the exact manifest text the function will send.
    with open(updates_file) as manifest:
        changes = manifest.read()

    stubber.stub_update_dataset_entries(project, split, changes,
                                        'UPDATE_IN_PROGRESS',
                                        error_code=error_code)
    if error_code is None:
        # The update polls until the dataset reports completion.
        stubber.stub_describe_dataset(project, split, 'UPDATE_COMPLETE', message)
        Datasets.update_dataset_entries(client, project, split, updates_file)
        return
    with pytest.raises(ClientError) as exc_info:
        Datasets.update_dataset_entries(client, project, split, updates_file)
    assert exc_info.value.response['Error']['Code'] == error_code
def get_transform(nparray):
    """Preprocess *nparray* using the transform fitted on the Framingham data."""
    continuous_cols = [
        "age", "cigsPerDay", "totChol", "sysBP",
        "diaBP", "BMI", "heartRate", "glucose",
    ]
    ordinal_cols = ["education"]
    binary_cols = [
        "male", "currentSmoker", "BPMeds",
        "prevalentStroke", "prevalentHyp", "diabetes",
    ]
    label_cols = ["TenYearCHD"]
    # Create Data object fitted on the training split so the same
    # preprocessing is applied to the new array.
    data = Datasets(
        data_file="framingham.csv",
        cat_cols=binary_cols,
        num_cols=continuous_cols,
        level_cols=ordinal_cols,
        label_col=label_cols,
        train=True,
    )
    return data.preprocess_newdata(nparray)
def test_create_manifest_file_s3(make_stubber, monkeypatch, error_code):
    """Exercise Datasets.create_manifest_file_s3 against stubbed S3 listings.

    Lists an 'anomaly' and a 'normal' prefix; the second listing optionally
    fails with *error_code* to exercise error propagation.
    """
    s3_resource = boto3.resource('s3')
    s3_stubber = make_stubber(s3_resource.meta.client)
    image_bucket = 'image-bucket'
    image_prefix = 'image-prefix/'
    image_path = f'{image_bucket}/{image_prefix}'
    mani_bucket = 'mani-bucket'
    mani_prefix = 'mani-prefix/'
    manifest_path = f'{mani_bucket}/{mani_prefix}'
    # Replace upload_file with a no-op so no real upload is attempted; the
    # lambda mirrors boto3's upload_file keyword signature.
    monkeypatch.setattr(
        s3_resource.meta.client, 'upload_file',
        lambda Filename, Bucket, Key, ExtraArgs, Callback, Config: None)
    # One stubbed listing per class subfolder (anomaly, then normal).
    s3_stubber.stub_list_objects(image_bucket,
                                 [f'{image_prefix}anomaly/anomaly-test-key'],
                                 f"{image_prefix}anomaly/", '/')
    s3_stubber.stub_list_objects(image_bucket,
                                 [f'{image_prefix}normal/normal-test-key'],
                                 f"{image_prefix}normal/", '/',
                                 error_code=error_code)
    # Seed the local temp manifest the function writes to / uploads.
    with open("temp.manifest", 'w') as mani:
        mani.write("Test manifest.")
    if error_code is None:
        Datasets.create_manifest_file_s3(s3_resource, image_path, manifest_path)
    else:
        with pytest.raises(ClientError) as exc_info:
            Datasets.create_manifest_file_s3(s3_resource, image_path,
                                             manifest_path)
        assert exc_info.value.response['Error']['Code'] == error_code
def do(config):
    """Read the corpus, preprocess it, train a Word2Vec model, and save it."""
    # Read data & preprocess
    print("Read data")
    ds = Datasets(config.data_path)
    data = ds.read_data()

    print("Data preprocessing..")
    preprocessing = Preprocessing(config)
    X = preprocessing.do(data)

    print('Train model')
    # gensim encodes the architecture as sg=0 (CBOW) / sg=1 (skip-gram).
    sg_flag = 0 if config.sg == 'CBOW' else 1
    model = Word2Vec(
        sentences=X,
        size=config.size,
        window=config.window,
        min_count=config.min_count,
        workers=config.workers,
        sg=sg_flag
    )
    print(model.wv.vectors.shape)
    model.save(os.path.join(config.save_directory, config.ckpt_name))
def train(nn_name='12-net', k=12):
    """
    Function for training 12-net with testing on part of data using cross
    validation.
    """
    suff = str(k)
    # Calibration nets load the calibration training arrays; detection nets
    # load the plain ones.
    if nn_name.find('calib') > 0:
        X_data_name = 'train_data_icalib_' + suff + '.npy'
        y_data_name = 'labels_icalib_' + suff + '.npy'
    else:
        X_data_name = 'train_data_' + suff + '.npy'
        y_data_name = 'labels_' + suff + '.npy'
    # Per-net learning-rate schedules: a fixed number of epochs per step size.
    rates12 = sp.hstack((0.05 * sp.ones(25, dtype=sp.float32),
                         0.005 * sp.ones(15, dtype=sp.float32),
                         0.0005 * sp.ones(10, dtype=sp.float32)))
    rates24 = sp.hstack((0.01 * sp.ones(25, dtype=sp.float32),
                         0.0001 * sp.ones(15, dtype=sp.float32)))
    rates48 = sp.hstack([
        0.05 * sp.ones(15, dtype=sp.float32),
        0.005 * sp.ones(10, dtype=sp.float32)
    ])
    # Larger nets are built as a cascade on top of pickled smaller nets.
    if nn_name == '24-net':
        nn = Cnnl(
            nn_name=nn_name,
            l_rates=rates24,
            subnet=Cnnl(nn_name='12-net',
                        l_rates=rates12).load_model('12-net_lasagne_.pickle'))
    elif nn_name == '48-net':
        nn = Cnnl(nn_name=nn_name,
                  l_rates=rates48,
                  subnet=Cnnl(
                      nn_name='24-net',
                      l_rates=rates24,
                      subnet=Cnnl(
                          nn_name='12-net',
                          l_rates=rates12).load_model('12-net_lasagne_.pickle')
                  ).load_model('24-net_lasagne_.pickle'))
    else:
        nn = Cnnl(nn_name=nn_name, l_rates=rates12)
    # Generate the on-disk training data only when no trained model exists.
    if not os.path.exists(nn_name + '_lasagne_.pickle'):
        if nn_name.find('calib') > 0:
            ds.get_train_wider_calib_data(k=k)
        else:
            ds.get_train_data(k=k)
    X, y = sp.load(X_data_name), sp.load(y_data_name)
    X_train, y_train = X, y
    if not os.path.exists(nn_name + '_lasagne_.pickle'):
        # Cascaded nets also consume the lower-resolution input arrays.
        if nn_name == '24-net':
            X_sub_train12 = sp.load('train_data_12.npy')
            nn.fit(X=X_train, y=y_train, X12=X_sub_train12)
        elif nn_name == '48-net':
            X_sub_train12 = sp.load('train_data_12.npy')
            X_sub_train24 = sp.load('train_data_24.npy')
            nn.fit(X=X_train, y=y_train, X12=X_sub_train12, X24=X_sub_train24)
        else:
            nn.fit(X=X_train, y=y_train)
        nn.save_model(nn_name + '_lasagne_.pickle')
def __init__(self):
    """Load the Qt UI, wire up button handlers, and initialise state."""
    super(Ui, self).__init__()
    uic.loadUi('mainwindow.ui', self)
    self.show()
    # File/directory pickers populate their associated line edits.
    self.load_dir_but.clicked.connect(
        lambda: load_dir_dialog(self, self.load_dir_lineEdit))
    self.load_drift_file_but.clicked.connect(
        lambda: load_file_dialog(self, self.load_drift_file_lineEdit))
    self.load_calibration_file_but.clicked.connect(
        lambda: load_file_dialog(self, self.load_calibration_file_lineEdit
                                 ))

    # Local helper: clears the cached model before picking a new model dir.
    # NOTE(review): defined inside __init__ and called with an explicit
    # `self` from the lambda below — looks intentional but unusual; confirm
    # it was not meant to be a bound method of the class.
    def select_model_path(self, line_edit):
        self.model = None
        return load_dir_dialog(self, line_edit)

    self.select_model_path_but.clicked.connect(
        lambda: select_model_path(self, self.model_path_lineEdit))
    self.load_data.clicked.connect(self.load_data_func)
    self.label_data_but.clicked.connect(self.label_data_func)
    self.load_prefix_tables()
    # Mutable UI state.
    self.datasets = Datasets()
    self.model = None
    self.labeled_dataset = None
    self.original_dataset = None
def finetune_q_model():
    """Fine-tune a pretrained Q model on steering targets and save the weights.

    Loads stacked-frame data, restores pretrained weights, swaps the output
    activation to softmax, and trains with early stopping on validation loss.
    """
    X_train, X_valid, Y_train, Y_valid = Datasets.make_stacked_frame_data(
        data_folder_path)
    # Only need steering targets
    Y_train = Y_train[0]
    Y_valid = Y_valid[0]
    model = q_categorical(input_dimension=(120, 160, 4))
    # Load pretrained Q model for finetuning
    model.load_weights('saved_models/robin_track_v2_highres.h5')
    # NOTE(review): mutating layer.activation after the model is built may
    # only take effect via the compile() below — confirm with the Keras
    # version in use.
    model.layers[-1].activation = softmax
    adam = Adam(lr=1e-4)  # Use a smaller learning rate for fine-tuning?
    model.compile(loss='categorical_crossentropy', optimizer=adam)
    print("weights Load Successfully!")
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    # Fix: the callbacks list was built but never passed to fit(), so early
    # stopping silently never ran.
    model.fit(X_train, Y_train,
              epochs=20,
              batch_size=64,
              validation_data=(X_valid, Y_valid),
              callbacks=callbacks)
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model.save_weights('saved_models/finetune_q_' + timestamp + '.h5',
                       overwrite=True)
def main():
    """Demonstrate scikit-learn feature extraction: dict vectorizing, feature
    hashing, bag-of-words counts, TF-IDF, and hashing vectorization.

    Fix: the original used Python-2 ``print`` statements, which are syntax
    errors under Python 3 and inconsistent with the f-strings used elsewhere
    in this codebase.
    """
    datasets = [
        {"city": "beijing", "age": 500, "temperature": 26},
        {"city": "shanghai", "age": 550, "temperature": 27},
        {"city": "shenzheng", "age": 300, "temperature": 30},
    ]
    # One-hot encode string features, pass numeric features through.
    dict_vectorizer = DictVectorizer()
    dv_datasets = dict_vectorizer.fit_transform(datasets)
    print(dv_datasets.toarray())
    print(dict_vectorizer.vocabulary_)
    print(dict_vectorizer.feature_names_)
    print("-" * 80)
    # FeatureHasher also accepts dict input, e.g.:
    # fh_vectorizer = FeatureHasher(n_features=10, input_type="dict")
    # fh_datasets = fh_vectorizer.fit_transform([{"text": 10, "words": 7}, {"name": 1, "words": 5}, {"gender": 1}])
    fh_vectorizer = FeatureHasher(n_features=10, input_type="string")
    fh_datasets = fh_vectorizer.fit_transform(
        ["Liming love football", "Zhansan likes baseball"])
    print(fh_datasets.toarray())
    # Bag-of-words + TF-IDF over the first 10 raw documents.
    raw_datasets, _ = Datasets.load_datasets()
    datasets = [v for v in raw_datasets.data[:10]]
    count_vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = count_vectorizer.fit_transform(datasets)
    print(count_vectorizer.vocabulary_)
    tfidf_transformer = TfidfTransformer(smooth_idf=True)
    tfidft_datasets = tfidf_transformer.fit_transform(cv_datasets)
    print(tfidft_datasets.toarray())
    print(tfidf_transformer.idf_)
    # Stateless alternative: hashing vectorizer needs no fitted vocabulary.
    hash_vectorizer = HashingVectorizer(n_features=100, decode_error="ignore")
    hv_datasets = hash_vectorizer.fit_transform(datasets)
    print(hv_datasets.toarray().shape)
def fit(self, **kwargs):
    """Train a Doc2Vec model on the tagged corpus and persist it to disk."""
    logging.basicConfig(format='%(levelname)s : %(message)s',
                        level=logging.INFO)
    logging.root.level = logging.INFO
    corpus = Datasets()
    # Hyperparameters come from the module-level DOC2VEC_PARAMS mapping.
    self.model = Doc2Vec(corpus.tagged_docs, **DOC2VEC_PARAMS)
    self.model.save(self.__get_model_fpath())
def __init__(self, dataset_name, model_name, optimizer_name, trial_num): """ :param dataset_name: name of the dataset :type dataset_name: str :param model_name: name of the model :type model_name: str :param optimizer_name: name of the optimizer :type optimizer_name: str :param trial_num: current number of repeated trials :type trial_num: int """ # get optimized hyperparameters with open( f'../params/{dataset_name}_{model_name}_{optimizer_name}/result.json' ) as f: params = json.load(f) # get instances self.dataset = Datasets.get(dataset_name) self.model = Models.get(model_name, dataset=self.dataset) self.optimizer = Optimizers.get(optimizer_name, params=params) # get config with open('./config.json') as f: config = json.load(f) # get constants c = config['constants'][dataset_name][model_name] self.loss = c['loss'] self.batch_size = c['batch_size'] self.epochs = c['epochs'] # configure and initialize directory d = self.main_dir = f'../data/{dataset_name}_{model_name}_{optimizer_name}/trial{trial_num}' if os.path.exists(d): shutil.rmtree(d) os.makedirs(d) # configure hyperdash experiment self.hd_exp = HyperdashExperiment( f'{dataset_name}', api_key_getter=lambda: config['hyperdash']['api_key']) self.hd_exp.param('dataset_name', dataset_name) self.hd_exp.param('model_name', model_name) self.hd_exp.param('optimizer_name', optimizer_name) self.hd_exp.param('trial_num', trial_num) for k, v in params.items(): self.hd_exp.param(k, v) # set callbacks self.callbacks = [ Hyperdash(['accuracy', 'loss', 'val_accuracy', 'val_loss'], self.hd_exp), TensorBoard(log_dir=f'{self.main_dir}/tensorboard'), TimeLogger(filename=f'{self.main_dir}/time.csv'), CSVLogger(filename=f'{self.main_dir}/result.csv', append=True) ]
def test_delete_dataset(make_stubber, error_code):
    """Exercise Datasets.delete_dataset against a stubbed client."""
    client = boto3.client('lookoutvision')
    stubber = make_stubber(client)
    project, split = 'test-project_name', 'train'

    stubber.stub_delete_dataset(project, split, error_code=error_code)

    if error_code is None:
        Datasets.delete_dataset(client, project, split)
        return
    with pytest.raises(ClientError) as exc_info:
        Datasets.delete_dataset(client, project, split)
    assert exc_info.value.response['Error']['Code'] == error_code
def train(nn_name = '12-net',k = 12):
    """
    Function for training 12-net with testing on part of data using cross
    validation.
    """
    suff = str(k)
    # Calibration variants read the calibration arrays, detection variants
    # read the plain ones.
    if nn_name.find('calib') > 0:
        X_data_name = 'train_data_icalib_'+ suff + '.npy'
        y_data_name = 'labels_icalib_'+ suff + '.npy'
    else:
        X_data_name = 'train_data_'+ suff + '.npy'
        y_data_name = 'labels_'+ suff + '.npy'
    # Stepwise learning-rate schedules, one entry per epoch.
    rates12 = sp.hstack((0.05 * sp.ones(25,dtype=sp.float32),
                         0.005*sp.ones(15,dtype=sp.float32),
                         0.0005*sp.ones(10,dtype=sp.float32)))
    rates24 = sp.hstack((0.01 * sp.ones(25,dtype=sp.float32),
                         0.0001*sp.ones(15,dtype=sp.float32)))
    rates48 = sp.hstack ([0.05 * sp.ones(15,dtype=sp.float32),
                          0.005*sp.ones(10,dtype=sp.float32) ])
    # Bigger nets wrap the previously trained smaller net as a subnet.
    if nn_name == '24-net':
        nn = Cnnl(nn_name = nn_name,l_rates=rates24,
                  subnet=Cnnl(nn_name = '12-net',l_rates=rates12).load_model(
                      '12-net_lasagne_.pickle'))
    elif nn_name == '48-net':
        nn = Cnnl(nn_name = nn_name,l_rates=rates48,
                  subnet=Cnnl(nn_name = '24-net',l_rates=rates24,
                              subnet=Cnnl(nn_name = '12-net',
                                          l_rates=rates12).load_model(
                                  '12-net_lasagne_.pickle')).load_model(
                      '24-net_lasagne_.pickle'))
    else:
        nn = Cnnl(nn_name = nn_name,l_rates=rates12)
    # Only regenerate the on-disk training data when no trained model exists.
    if not os.path.exists(nn_name + '_lasagne_.pickle'):
        if nn_name.find('calib') > 0:
            ds.get_train_wider_calib_data(k=k)
        else:
            ds.get_train_data(k=k)
    X,y = sp.load(X_data_name),sp.load(y_data_name)
    X_train,y_train = X,y
    if not os.path.exists(nn_name + '_lasagne_.pickle'):
        # Cascade nets additionally consume the lower-resolution inputs.
        if nn_name == '24-net':
            X_sub_train12 = sp.load('train_data_12.npy')
            nn.fit(X = X_train,y = y_train,X12 = X_sub_train12)
        elif nn_name == '48-net':
            X_sub_train12 = sp.load('train_data_12.npy')
            X_sub_train24 = sp.load('train_data_24.npy')
            nn.fit(X = X_train,y = y_train,X12 = X_sub_train12,X24 = X_sub_train24)
        else:
            nn.fit(X = X_train,y = y_train)
        nn.save_model(nn_name + '_lasagne_.pickle')
def objective(self, params): """ objective function to optimize :param params: hyperparamters for optimizer :return: maximum validation accuracy :rtype: float """ # get instances dataset = Datasets.get(self.dataset_name) model = Models.get(self.model_name, dataset=dataset) optimizer = Optimizers.get(self.optimizer_name, params=params) # configure hyperdash experiment hd_exp = HyperdashExperiment( f'{self.dataset_name}', api_key_getter=lambda: self.config['hyperdash']['api_key']) hd_exp.param('dataset_name', self.dataset_name) hd_exp.param('model_name', self.model_name) hd_exp.param('optimizer_name', self.optimizer_name) for k, v in params.items(): hd_exp.param(k, v) # set callbacks callbacks = [ Hyperdash(['accuracy', 'loss', 'val_accuracy', 'val_loss'], hd_exp), EarlyStopping('val_accuracy', patience=10, min_delta=0.01, verbose=1), TerminateOnNaN() ] # get data (x_train, y_train), *_ = dataset.get_batch() # start learning model.compile(loss=self.loss, optimizer=optimizer, metrics=['accuracy']) history = model.fit(x_train, y_train, batch_size=self.batch_size, epochs=self.epochs, callbacks=callbacks, validation_split=0.2, verbose=2) # stop hyperdash experiment hd_exp.end() # return maximum validation accuracy val_accuracy = np.array(history.history['val_accuracy']) return max(val_accuracy) * (-1)
def __init__(self, analyze, conf, data_conf, output, algorithms, datasets, metrics):
    """Load run/data configuration and record benchmark parameters."""
    # Configuration files are parsed first; derived settings below read them.
    self.conf = self._load_conf(conf)
    self.data_conf = self._load_conf(data_conf)
    self.data_class = Datasets()
    # Run parameters passed straight through from the caller.
    self.shall_analyze = analyze
    self.output = output
    self.algorithms = algorithms
    self.datasets = datasets
    self.metrics = metrics
    # Settings derived from the main configuration.
    self.output_dir = os.path.abspath(self.conf.get("output_dir", "./output"))
    self.shall_plot = self.conf.get("plot_data")
def do(config):
    """Read the corpus, preprocess it, train the model, and save a checkpoint."""
    # Read data & preprocess
    print("Read data")
    reader = Datasets(config.data_path)
    raw_data = reader.read_data()

    print("Data preprocessing..")
    prep = Preprocessing(config)
    x_train, y_train = prep.do(raw_data)

    print("Model build..")
    model, callback = build(config, prep.vocab_size)
    # Hold out 20% of the training data for validation.
    model.fit(x_train, y_train,
              epochs=config.epoch,
              callbacks=callback,
              batch_size=config.batch_size,
              validation_split=0.2)
    model.save(os.path.join(config.save_directory, config.ckpt_name))
def train_model(self):
    """Compile self.model and train it on the train/validation generators."""
    t0 = time.time()
    print(self.model.summary())
    dataset = Datasets(self.options)
    train_gen = dataset.make_split_generator('train')
    validation_gen = dataset.make_split_generator('validation')
    # decay=0: rely on any LR scheduling in the callbacks instead.
    optimizer = Adam(lr=self.options.init_lr, decay=0)
    self.model.compile(
        optimizer, loss=self.loss_function, metrics=self.metrics)
    callbacks = self.make_callbacks()
    # NOTE(review): fit_generator is deprecated in recent Keras/TF (Model.fit
    # accepts generators directly) — fine while pinned to an older Keras.
    self.model.fit_generator(
        train_gen,
        initial_epoch=self.options.init_epoch,
        steps_per_epoch=self.options.steps_per_epoch,
        epochs=self.options.epochs,
        validation_data=validation_gen,
        validation_steps=self.options.validation_steps,
        callbacks=callbacks)
    print('Training time cost: %0.2f(min).'%((time.time()-t0)/60))
def get_prediction(sentence):
    """Classify *sentence* with the module-level BERT classifier.

    Relies on module globals: Datasets, tokenizer, vocab, model, softmax,
    label_list.

    Returns (top_label, probs_by_label, probs_sorted_desc) where the dicts
    map label -> probability rounded to 3 decimal places.
    """
    sentence = Datasets.normalize_string(sentence)
    sentence = tokenizer.tokenize(sentence)
    sentence = tokenizer.convert_tokens_to_ids(sentence)
    # Wrap the token ids with BERT's special [CLS]/[SEP] markers.
    sentence = [vocab['[CLS]']] + sentence + [vocab['[SEP]']]
    # unsqueeze(0): model expects a batch dimension (batch of one).
    output = model(torch.tensor(sentence).unsqueeze(0))
    output_softmax = softmax(output)[0]
    max_out = label_list[output_softmax.argmax()]
    argidx = output_softmax.argsort(descending=True)
    result = {label_list[i]: round(output_softmax[i].item(), 3)
              for i in range(len(label_list))}
    sorted_result = {label_list[i]: round(output_softmax[i].item(), 3)
                     for i in argidx}
    return max_out, result, sorted_result
def test_describe_dataset(make_stubber, error_code):
    """Exercise Datasets.describe_dataset against a stubbed client."""
    client = boto3.client('lookoutvision')
    stubber = make_stubber(client)
    project, split = 'test-project_name', 'train'
    status, message = 'CREATE_COMPLETE', 'Test message'
    stats = {'Total': 5, 'Labeled': 2, 'Normal': 2, 'Anomaly': 1}

    stubber.stub_describe_dataset(project, split, status, message, stats,
                                  error_code=error_code)

    if error_code is None:
        Datasets.describe_dataset(client, project, split)
        return
    with pytest.raises(ClientError) as exc_info:
        Datasets.describe_dataset(client, project, split)
    assert exc_info.value.response['Error']['Code'] == error_code
def example():
    """End-to-end demo: download the data, train the regressor, and predict."""
    from datasets import Datasets

    data = Datasets()
    data.download()
    train_split = data.load()
    holdout = data.load(test=True)

    regressor = MyXGBRegressor(data)
    regressor.train(train_split)
    # save=True persists the predictions to disk.
    regressor.predict(holdout, save=True)
def test(model, args): model.eval() # Load Datasets dataset = Datasets(file_path=args.test_data_path, label_list=label_list, pretrained_type=args.pretrained_type) # Use custom batch function collate_fn = ClassificationBatchFunction(args.max_len, dataset.pad_idx, dataset.cls_idx, dataset.sep_idx) loader = DataLoader(dataset=dataset, batch_size=args.train_batch_size, num_workers=8, pin_memory=True, collate_fn=collate_fn) loss, acc, f1, (total_y_hat, cm) = evaluate(args, loader, model, device) return loss, acc, f1, total_y_hat, cm
def train_lstm_model():
    """Train the categorical LSTM model on sequence data and save its weights.

    Uses early stopping on validation loss (patience 3).
    """
    X_train, X_valid, Y_train, Y_valid = Datasets.make_lstm_data(
        data_folder_path)
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    model = lstm_categorical(input_dimension=(7, 120, 160, 3))
    # Fix: the callbacks list was built but never passed to fit(), so early
    # stopping silently never ran.
    model.fit(X_train, Y_train,
              epochs=20,
              batch_size=64,
              validation_data=(X_valid, Y_valid),
              callbacks=callbacks)
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model.save_weights('saved_models/lstm_categorical_' + timestamp + '.h5',
                       overwrite=True)
def run_aft(possible_widths, pool_size, max_layers, cand_epochs, final_epochs, iteration=''):
    """Train an AutoForwardThinking model on MNIST and write its training
    stats (plus the run's hyperparameters) to a uniquely named CSV file.
    """
    data = Datasets.mnist()
    comp = 0.0  # stopping compensation threshold passed to model.train
    model = AutoForwardThinking(possible_widths, pool_size, max_layers, data)
    stats = model.train(final_epochs=final_epochs,
                        cand_epochs=cand_epochs,
                        batch_size=128,
                        stopping_comp=comp)
    for key in stats.keys():
        print(key, len(stats[key]))
    # save training stats
    df = pd.DataFrame(stats)
    comments = list(df['comments'])[0]
    df['min_width'] = min(possible_widths)
    df['max_width'] = max(possible_widths)
    df['pool_size'] = pool_size
    df['max_layers'] = max_layers
    df['cand_epochs'] = cand_epochs
    df['final_epochs'] = final_epochs
    df['compensation'] = comp
    # Parse the 'comments' string: assumes the format
    # "[w1, w2, ...] hid_act, out_act, ..., <name>prop" — TODO confirm.
    layers = comments.split(']')[0].replace('[', '')
    df['n_layers'] = layers.count(',') + 1
    df['layers'] = layers
    comments = comments.split(']')[1].strip(',').strip()
    df['hid_act'] = comments.split(',')[0].strip()
    df['out_act'] = comments.split(',')[1].strip()
    df['optimizer'] = comments.split(',')[-1].replace('prop', '').strip()
    df = df.drop(columns='comments')
    fname = 'aft_min{}_max{}_p{}_maxl{}_canep{}_finep{}_comp{}_{}.csv'.format(
        min(possible_widths), max(possible_widths), pool_size, max_layers,
        cand_epochs, final_epochs, str(comp).replace('.', '-'), iteration
    )
    # Never clobber an existing results file; append underscores instead.
    while os.path.exists(fname):
        fname = fname.replace('.csv', '_.csv')
    df.to_csv(fname)
    K.clear_session()
def main():
    """Demonstrate feature selection: tree-based importances, model-based
    selection, binarization, and variance thresholding.

    Fix: the original used Python-2 ``print`` statements (syntax errors under
    Python 3, inconsistent with the f-strings elsewhere in this codebase) and
    misspelled a local as ``modle``.
    """
    raw_datasets, _ = Datasets.load_datasets()
    X, Y = gen_datasets(raw_datasets)
    vectorizer = CountVectorizer(decode_error="ignore")
    cv_datasets = vectorizer.fit_transform(X).toarray()
    # Fit a forest to rank features by importance.
    clf = ExtraTreesClassifier()
    clf = clf.fit(cv_datasets, Y)
    print(cv_datasets.shape)
    print(clf.feature_importances_)
    # Keep only features above the (default mean) importance threshold.
    model = SelectFromModel(clf, prefit=True)
    X_new = model.transform(cv_datasets)
    print(X_new.shape)
    # Binarize counts, then drop near-constant (low-variance) features.
    binarizer = Binarizer(threshold=1.0)
    b_datasets = binarizer.fit_transform(cv_datasets)
    variance_threshold = VarianceThreshold(.8 * (1 - .8))
    v_datasets = variance_threshold.fit_transform(b_datasets)
    print(v_datasets.shape)
def train_single_frame_model():
    """ Same as default categorical from donkey.

    Trains on single-frame data with early stopping on validation loss and
    saves timestamped weights.
    """
    X_train, X_valid, Y_train, Y_valid = Datasets.make_single_frame_data(
        data_folder_path)
    model = default_categorical()
    callbacks = [EarlyStopping(monitor='val_loss', patience=3)]
    # Fix: the callbacks list was built but never passed to fit(), so early
    # stopping silently never ran.
    model.fit(X_train, Y_train,
              epochs=3,
              batch_size=64,
              validation_data=(X_valid, Y_valid),
              callbacks=callbacks)
    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    model.save_weights('saved_models/single_frame_categorical_' + timestamp + '.h5',
                       overwrite=True)
def main(args):
    """Execute a task based on the given command-line arguments.

    This function is the main entry-point of the program. It allows the user
    to extract features, train a model, compute predictions, and evaluate
    predictions using the command-line interface.
    """
    from datasets import Datasets

    repository = Datasets(args.dataset_path)
    command = args.command
    if command == 'extract':
        extract(repository.get(args.dataset), args)
    elif command == 'train':
        train(repository.get('training'), args)
    elif command == 'predict':
        predict(repository.get(args.dataset), args)
    elif command == 'evaluate':
        test_set = repository.get('test')
        # A list of training ids means: evaluate every run at once.
        if isinstance(args.training_id, list):
            evaluate_all(test_set, args)
        else:
            evaluate(test_set, args)
def train(args):
    """Pretrain a BERT masked-language model.

    Loads pretrained weights, builds train (and optional dev) loaders, then
    runs MLM training with gradient accumulation, optional fp16 (apex), LR
    warmup + cosine restarts, TensorBoard logging, and checkpointing.

    Returns (global_step, last_logged_train_loss, last_logged_train_acc,
    best_val_loss, best_val_acc, total_result).
    """
    set_seed(args)
    # Set device
    if args.device == 'cuda':
        device = torch.device(
            'cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')
    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained bert model(etri/skt)
        pretrained_model_path = os.path.join(pretrained_path,
                                             'pytorch_model.bin')
    else:
        # Use further-pretrained bert model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrain Model : {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)
    if args.pretrained_type == 'skt' and 'bert.' not in list(
            pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Change parameter name for consistency
        new_keys_ = ['bert.' + k for k in pretrained.keys()]
        old_values_ = pretrained.values()
        pretrained = {k: v for k, v in zip(new_keys_, old_values_)}
    bert_config = BertConfig(
        os.path.join(pretrained_path + '/bert_config.json'))
    model = BertForMLM(bert_config).to(device)
    # strict=False: tolerate head/naming mismatches between checkpoints.
    model.load_state_dict(pretrained, strict=False)
    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function
    collate_fn = MLMBatchFunction(args.max_len, tr_set.vocab)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           drop_last=True,
                           collate_fn=collate_fn)
    if args.do_eval:
        dev_set = Datasets(file_path=args.dev_data_path,
                           pretrained_type=args.pretrained_type,
                           max_len=args.max_len)
        dev_loader = DataLoader(dataset=dev_set,
                                batch_size=args.eval_batch_size,
                                num_workers=8,
                                pin_memory=True,
                                drop_last=False,
                                collate_fn=collate_fn)
    # optimizer
    optimizer = layerwise_decay_optimizer(
        model=model, lr=args.learning_rate,
        layerwise_decay=args.layerwise_decay)
    # lr scheduler: optimizer steps happen once per accumulation window.
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(
        t_total, warmup_steps))
    # Use gradual warmup and cosine decay
    scheduler = optimization.WarmupCosineWithHardRestartsSchedule(
        optimizer, warmup_steps=warmup_steps, t_total=t_total)
    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level,
                                          verbosity=0)
    # tensorboard setting
    save_path = "./model_saved_pretrain/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate,
        args.train_batch_size * args.gradient_accumulation_steps, t_total,
        args.warmup_percent, args.max_len, args.pretrained_type)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)
    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_pretrain/results.csv")
    model.zero_grad()
    best_val_loss = 1e+9
    best_val_acc = 0
    global_step = 0
    train_loss, train_acc = 0, 0
    val_loss, val_acc = 0, 0
    logging_loss, logging_acc = 0, 0
    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader),
                                desc='steps',
                                total=len(tr_loader)):
            model.train()
            x_train, y_train, mask_train = map(lambda x: x.to(device), batch)
            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'masked_lm_labels': y_train,
            }
            output, loss = model(**inputs)
            y_max = output.max(dim=2)[1]
            # Get accuracy for maked tokens (-1 labels are unmasked positions)
            total_length = torch.ones_like(y_train).masked_fill(
                y_train == -1, 0).sum().item()
            total_sum = torch.zeros_like(y_max).masked_fill(
                y_max == y_train, 1).sum().item()
            batch_acc = total_sum / total_length
            # accumulate measures
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss += loss.item()
            train_acc += batch_acc
            # Optimizer/scheduler step once per accumulation window.
            if (step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.grad_clip_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                if global_step % args.logging_step == 0:
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('lr', {'lr': scheduler.get_lr()[0]},
                                       global_step)
                    logger.info(
                        '[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}'.format(
                            global_step, t_total, loss_, acc_))
                    logging_acc, logging_loss = train_acc, train_loss
                    # NOTE(review): evaluation nested under the logging check —
                    # validation runs every logging_step steps; confirm intent.
                    if args.do_eval:
                        # Validation
                        val_loss, val_acc = evaluate(args, dev_loader, model,
                                                     device)
                        val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}'.format(
                            global_step, t_total, val_loss, val_acc)
                        logger.info(val_result)
                        total_result.append(val_result)
                        if val_loss <= best_val_loss:
                            # Save model checkpoints
                            torch.save(
                                model.state_dict(),
                                os.path.join(save_path, 'best_model.bin'))
                            torch.save(
                                args,
                                os.path.join(save_path, 'training_args.bin'))
                            logger.info('Saving model checkpoint to %s',
                                        save_path)
                            best_val_loss = val_loss
                            best_val_acc = val_acc
        # Periodic per-epoch checkpoint, independent of validation loss.
        if (epoch + 1) % args.saving_step == 0:
            torch.save(
                model.state_dict(),
                os.path.join(save_path,
                             'epoch{}_model.bin'.format(epoch + 1)))
    # Save results in 'model_saved_pretrain/results.csv'
    # NOTE(review): loss_/acc_ are only bound once logging_step is reached —
    # very short runs would raise NameError here; confirm acceptable.
    results = {
        'train_loss': loss_,
        'train_acc': acc_,
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'save_dir': save_path,
        'global_step': global_step,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
def main(args):
    """Train and evaluate a DGP model over several dataset splits.

    For each split: fit the model on the training data with a monitored
    Adam loop, then append the split's test negative log likelihood and
    wall-clock training time to '../tmp/<dataset>_<layers>_<inducing>.nll'
    and '.time', finishing each file with the average over splits.
    """
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')
    outname2 = '../tmp/' + args.dataset + '_' + str(args.num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile2 = open(outname2, 'w')

    running_loss = 0
    running_time = 0
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]
        # Initialise inducing points with k-means centroids.
        Z = kmeans2(X, args.num_inducing, minit='points')[0]

        # set up batches
        batch_size = args.M if args.M < X.shape[0] else X.shape[0]
        train_dataset = tf.data.Dataset.from_tensor_slices((X, Y)).repeat()\
            .prefetch(X.shape[0]//2)\
            .shuffle(buffer_size=(X.shape[0]//2))\
            .batch(batch_size)

        print('Setting up DGP model...')
        kernels = []
        for l in range(args.num_layers):
            kernels.append(SquaredExponential() + White(variance=1e-5))
        dgp_model = DGP(X.shape[1],
                        kernels,
                        Gaussian(variance=0.05),
                        Z,
                        num_outputs=Y.shape[1],
                        num_samples=args.num_samples,
                        num_data=X.shape[0])
        # initialise inner layers almost deterministically
        for layer in dgp_model.layers[:-1]:
            layer.q_sqrt = Parameter(layer.q_sqrt.value() * 1e-5,
                                     transform=triangular())

        optimiser = tf.optimizers.Adam(args.learning_rate)

        def optimisation_step(model, X, Y):
            # One Adam step on the negative ELBO.
            with tf.GradientTape() as tape:
                tape.watch(model.trainable_variables)
                obj = -model.elbo(X, Y, full_cov=False)
            grad = tape.gradient(obj, model.trainable_variables)
            optimiser.apply_gradients(zip(grad, model.trainable_variables))

        def monitored_training_loop(model, train_dataset, logdir, iterations,
                                    logging_iter_freq):
            # TODO: use tensorboard to log trainables and performance
            tf_optimisation_step = tf.function(optimisation_step)
            batches = iter(train_dataset)
            for i in range(iterations):
                X, Y = next(batches)
                tf_optimisation_step(model, X, Y)
                iter_id = i + 1
                if iter_id % logging_iter_freq == 0:
                    tf.print(
                        f'Epoch {iter_id}: ELBO (batch) {model.elbo(X, Y)}')

        print('Training DGP model...')
        t0 = time.time()
        monitored_training_loop(dgp_model,
                                train_dataset,
                                logdir=args.log_dir,
                                iterations=args.iterations,
                                logging_iter_freq=args.logging_iter_freq)
        t1 = time.time()
        print('Time taken to train: {}'.format(t1 - t0))
        outfile2.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_time += t1 - t0

        m, v = dgp_model.predict_y(Xs, num_samples=args.test_samples)
        test_nll = np.mean(
            logsumexp(norm.logpdf(Ys * Y_std, m * Y_std, v**0.5 * Y_std),
                      0,
                      b=1 / float(args.test_samples)))
        print('Average test log likelihood: {}'.format(test_nll))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        # Fix: previously this accumulated the training time (t1 - t0), so
        # the 'Average' line in the .nll file reported mean training time,
        # not mean NLL. Accumulate the split's test NLL (matches the sibling
        # AEP benchmark script).
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Average: {}\n'.format(running_time / args.splits))
    outfile1.close()
    outfile2.close()
def main(args):
    """Benchmark an AEP sparse DGP (``aep.SDGPR``) on one dataset.

    For each of ``args.splits`` train/test splits, trains the model with
    Adam and records RMSE, test log likelihood and training time.  Per-split
    values plus the average and standard deviation are appended to
    ``../tmp/aep_<dataset>_<layers>_<inducing>.{rmse,nll,time}``.

    :param args: parsed CLI namespace (data_path, dataset, splits,
        hidden_dims, num_inducing, batch_size, test_batch_size,
        learning_rate, iterations).
    """
    num_layers = len(args.hidden_dims)
    datasets = Datasets(data_path=args.data_path)

    # Prepare output files
    outname1 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
        + str(args.num_inducing) + '.rmse'
    if not os.path.exists(os.path.dirname(outname1)):
        os.makedirs(os.path.dirname(outname1))
    outfile1 = open(outname1, 'w')

    outname2 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
        + str(args.num_inducing) + '.nll'
    outfile2 = open(outname2, 'w')

    outname3 = '../tmp/aep_' + args.dataset + '_' + str(num_layers) + '_'\
        + str(args.num_inducing) + '.time'
    outfile3 = open(outname3, 'w')

    running_err = 0
    running_loss = 0
    running_time = 0
    test_errs = np.zeros(args.splits)
    test_nlls = np.zeros(args.splits)
    test_times = np.zeros(args.splits)
    for i in range(args.splits):
        print('Split: {}'.format(i))
        print('Getting dataset...')
        data = datasets.all_datasets[args.dataset].get_data(i)
        X, Y, Xs, Ys, Y_std = [
            data[_] for _ in ['X', 'Y', 'Xs', 'Ys', 'Y_std']
        ]

        dgp_model = aep.SDGPR(X, Y, args.num_inducing, args.hidden_dims)

        print('Training DGP model...')
        t0 = time.time()
        dgp_model.optimise(method='Adam', mb_size=args.batch_size,
                           adam_lr=args.learning_rate,
                           maxiter=args.iterations)
        t1 = time.time()
        test_times[i] = t1 - t0
        print('Time taken to train: {}'.format(t1 - t0))
        outfile3.write('Split {}: {}\n'.format(i + 1, t1 - t0))
        outfile3.flush()
        os.fsync(outfile3.fileno())
        running_time += t1 - t0

        # Minibatch test predictions to bound peak memory.
        # NOTE: renamed the accumulator from `vars` — it shadowed the
        # builtin vars().
        means, variances = [], []
        test_batch_size = args.test_batch_size
        if len(Xs) > test_batch_size:
            # ceil-divide via negation to cover the final partial batch
            for mb in range(-(-len(Xs) // test_batch_size)):
                m, v = dgp_model.predict_y(
                    Xs[mb * test_batch_size:(mb + 1) * test_batch_size, :])
                means.append(m)
                variances.append(v)
        else:
            m, v = dgp_model.predict_y(Xs)
            means.append(m)
            variances.append(v)

        mean_ND = np.concatenate(means, 0)
        var_ND = np.concatenate(variances, 0)

        # RMSE in the original data units (targets were standardised)
        test_err = np.mean(Y_std * np.mean((Ys - mean_ND)**2.0)**0.5)
        test_errs[i] = test_err
        print('Average RMSE: {}'.format(test_err))
        outfile1.write('Split {}: {}\n'.format(i + 1, test_err))
        outfile1.flush()
        os.fsync(outfile1.fileno())
        running_err += test_err

        test_nll = np.mean(
            norm.logpdf(Ys * Y_std, mean_ND * Y_std, var_ND**0.5 * Y_std))
        test_nlls[i] = test_nll
        print('Average test log likelihood: {}'.format(test_nll))
        outfile2.write('Split {}: {}\n'.format(i + 1, test_nll))
        outfile2.flush()
        os.fsync(outfile2.fileno())
        running_loss += test_nll

    outfile1.write('Average: {}\n'.format(running_err / args.splits))
    outfile1.write('Standard deviation: {}\n'.format(np.std(test_errs)))
    outfile2.write('Average: {}\n'.format(running_loss / args.splits))
    outfile2.write('Standard deviation: {}\n'.format(np.std(test_nlls)))
    outfile3.write('Average: {}\n'.format(running_time / args.splits))
    outfile3.write('Standard deviation: {}\n'.format(np.std(test_times)))
    outfile1.close()
    outfile2.close()
    outfile3.close()
def run_algorithms(algorithms, datasets, metrics, output, conf):
    """Train/evaluate each algorithm on each dataset and emit results.

    Optionally plots original data, decision boundaries and per-metric
    plots into a temporary plot directory that atomically replaces
    ``conf["plot_dir"]`` on success.

    :param algorithms: iterable of algorithm names (keys of conf["algorithms"]).
    :param datasets: iterable of dataset names (keys of conf["datasets"]).
    :param metrics: metrics to evaluate; falls back to each algorithm's
        "allowed_metrics" when empty.
    :param output: "pdf" or "dump_text" result sink.
    :param conf: parsed configuration mapping.
    """
    dts = Datasets()
    shall_plot = conf.get("plot_data")
    if shall_plot:
        plot_dir = conf.get("plot_dir", "../plots")
        # Build plots in a scratch dir first; swap it in at the end.
        tmp_plot_dir = "../plots_1"
        if os.path.exists(tmp_plot_dir):
            shutil.rmtree(tmp_plot_dir)
        os.mkdir(tmp_plot_dir)
        orig_data_dir = os.path.join(tmp_plot_dir, "original")
        os.mkdir(orig_data_dir)
        for dataset in datasets:
            plot_data(os.path.join(orig_data_dir, "%s-orig.png" % dataset),
                      "%s-orig" % dataset, dataset)

    if output == 'dump_text' and not os.path.exists("../dumps"):
        os.mkdir("../dumps")

    for algorithm in algorithms:
        if shall_plot:
            algo_dir = os.path.join(tmp_plot_dir, algorithm)
            os.mkdir(algo_dir)
        algo_conf = conf["algorithms"].get(algorithm, None)
        if not algo_conf:
            # BUGFIX: was sys.exit(0) — exit non-zero so callers/scripts see
            # the failure.  Also use logging's lazy %-args.
            logging.error("Algorithm %s not found in conf file", algorithm)
            sys.exit(1)
        algo_conf['name'] = algorithm
        learn_class = _get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)
        learn._set_cross_validation(conf.get("cv_method", None),
                                    conf.get("cv_metric", None),
                                    conf.get("cv_params", None))
        results = []
        for dataset in datasets:
            if dataset not in conf["datasets"]:
                logging.error("Dataset %s not found", dataset)
                sys.exit(1)  # BUGFIX: was sys.exit(0)
            cv_dir = None
            if shall_plot:
                dataset_dir = os.path.join(algo_dir, dataset)
                os.mkdir(dataset_dir)
                if algo_conf.get("cross_validate", True):
                    cv_dir = os.path.join(dataset_dir, "cv")
                    os.mkdir(cv_dir)
            training_sizes = conf.get("training_size", [0.40])
            scores = []
            for training_size in training_sizes:
                data = dts.load_dataset(dataset, training_size)
                learn.set_dataset(dataset, training_size * 100, cv_dir)
                # Skip dataset types this learner cannot handle.
                if learn.check_type(data["type"]):
                    eval_metrics = []
                    if metrics:
                        eval_metrics.extend(metrics)
                    else:
                        eval_metrics.extend(algo_conf["allowed_metrics"])
                    learn.train(data["x_train"], data["y_train"])
                    result_tups = learn.evaluate(data["x_test"],
                                                 data["y_test"],
                                                 eval_metrics)
                    print_results(training_size, algorithm, dataset,
                                  result_tups)
                    results.append((algorithm, dataset, training_size,
                                    result_tups))
                    if shall_plot:
                        decision_plot_path = os.path.join(
                            dataset_dir, "decision-%s_%s_size_%d.png"
                            % (dataset, algorithm, training_size * 100))
                        learn.plot_results(decision_plot_path, dataset,
                                           training_size,
                                           data['x_train'], data['x_test'],
                                           data['y_train'], data['y_test'])
                        for metric, y_test, score in result_tups:
                            metric_plot_path = os.path.join(
                                dataset_dir, "metric-%s-%s_%s_size_%d.png"
                                % (metric, dataset, algorithm,
                                   training_size * 100))
                            plot_metric(metric_plot_path, data['type'],
                                        y_test, data['y_test'], dataset,
                                        algorithm, training_size * 100)
                    # first metric's score drives the size-vs-score plot
                    scores.append(result_tups[0][2])
            if shall_plot:
                train_plot_path = os.path.join(
                    dataset_dir, "train_vs_acc-%s_%s.png"
                    % (algorithm, dataset))
                plot_training_results(
                    train_plot_path,
                    [train_size * 100 for train_size in training_sizes],
                    scores)
        if output == "pdf":
            generate_pdf(results)
        elif output == "dump_text":
            dump_results(algorithm, results)

    if conf.get("plot_data", False):
        # BUGFIX: guard the rmtree — on the first run plot_dir does not
        # exist yet and shutil.rmtree would raise FileNotFoundError.
        if os.path.exists(plot_dir):
            shutil.rmtree(plot_dir)
        shutil.move(tmp_plot_dir, plot_dir)
class ClassifierLib:
    """Driver that loads datasets, runs configured algorithms and reports
    results (print/plot or CSV dump) under ``output_dir``."""

    def __init__(self, analyze, conf, data_conf, output, algorithms,
                 datasets, metrics):
        """
        :param analyze: when truthy, run dataset analysis/plots before training.
        :param conf: path to the main YAML configuration file.
        :param data_conf: path to the per-dataset YAML configuration file.
        :param output: result sink selector ("print" or anything else → CSV).
        :param algorithms: algorithm names to run.
        :param datasets: dataset names to run on.
        :param metrics: metrics to evaluate; falls back to each algorithm's
            "allowed_metrics" when empty.
        """
        self.conf = self._load_conf(conf)
        self.shall_analyze = analyze
        self.data_conf = self._load_conf(data_conf)
        self.data_class = Datasets()
        self.output = output
        self.output_dir = os.path.abspath(self.conf.get("output_dir",
                                                        "./output"))
        self.shall_plot = self.conf.get("plot_data")
        self.algorithms = algorithms
        self.datasets = datasets
        self.metrics = metrics

    def _get_algorithm_class(self, algorithm_name):
        """Import module ``algorithm_name`` and return its CamelCase class,
        e.g. "random_forest" -> RandomForest."""
        module = importlib.import_module("%s" % algorithm_name)
        if not module:
            logging.error("Module %s not found", algorithm_name)
        class_name = algorithm_name.replace("_", " ").title().replace(" ", "")
        logging.info("Algorithm %s loaded from module %s",
                     class_name, algorithm_name)
        return getattr(module, class_name)

    def _load_conf(self, conf_path):
        """Parse a YAML config file and return the resulting mapping."""
        # BUGFIX: use safe_load (yaml.load without an explicit Loader is
        # deprecated and can construct arbitrary Python objects) and close
        # the file handle deterministically.
        with open(os.path.abspath(conf_path)) as conf_file:
            return yaml.safe_load(conf_file)

    def run_algorithm(self, algorithm, data, data_conf, training_size):
        """Train one algorithm on one prepared dataset split.

        Returns the raw predictions, or metric tuples when the global
        'evaluate' flag is set; returns None if the learner does not
        support the dataset type.
        """
        algo_conf = self.conf['algorithms'][algorithm]
        learn_class = self._get_algorithm_class(algorithm)
        learn = learn_class(**algo_conf)
        if not learn.check_type(getattr(constants, data_conf["type"])):
            return
        dataset = data_conf['name']
        learn.set_dataset(dataset, training_size)
        if algo_conf.get("cross_validate", False):
            learn._set_cross_validation(self.conf.get("cv_method", None),
                                        self.conf.get("cv_metric", None),
                                        self.conf.get("cv_params", None))
            learn.cross_validation(
                data['x_train'], data['y_train'],
                self.conf.get('print_cv_score',
                              self.conf.get('print_cv_score', False)))
        learn.train(data["x_train"], data["y_train"])
        result = learn.predict(data['x_test'])
        if self.conf.get('evaluate', False):
            eval_metrics = []
            if self.metrics:
                eval_metrics.extend(self.metrics)
            else:
                eval_metrics.extend(algo_conf["allowed_metrics"])
            result = learn.evaluate(result, data["y_test"], eval_metrics)
        return result

    def run(self):
        """Run every configured algorithm on every dataset and report."""
        # Keep one backup of the previous output tree as "<output_dir>_1".
        if os.path.exists(self.output_dir):
            if os.path.exists("%s%s" % (self.output_dir, "_1")):
                shutil.rmtree("%s%s" % (self.output_dir, "_1"))
            shutil.move(self.output_dir, "%s%s" % (self.output_dir, "_1"))
        os.mkdir(self.output_dir)
        for dataset in self.datasets:
            if dataset not in self.data_conf:
                # BUGFIX: was sys.exit(0) — exit non-zero on failure.
                logging.error("Dataset %s not found", dataset)
                sys.exit(1)
            dataset_dir = os.path.join(self.output_dir, dataset)
            os.mkdir(dataset_dir)
            if self.shall_analyze:
                self.analyze(dataset, dataset_dir)
            for algorithm in self.algorithms:
                algo_dir = os.path.join(dataset_dir, algorithm)
                os.mkdir(algo_dir)
                results = []
                for training_size in self.conf.get('training_sizes', [.4]):
                    data_conf = self.data_conf[dataset]
                    data = self.data_class.load_dataset(dataset,
                                                        training_size)
                    result = self.run_algorithm(algorithm, data, data_conf,
                                                training_size)
                    # NOTE(review): result is None when the learner rejects
                    # the dataset type; the branches below assume it is set.
                    if self.conf.get('evaluate', True):
                        if self.output == "print":
                            self.print_results(training_size, algorithm,
                                               dataset, result)
                            if self.shall_plot:
                                for metric, y_test, score in result:
                                    metric_plot_path = os.path.join(
                                        algo_dir,
                                        "metric-%s-%s_%s_size_%d.png"
                                        % (metric, dataset, algorithm,
                                           training_size * 100))
                                    plot_metric(data['type'], y_test,
                                                data['y_test'], dataset,
                                                algorithm,
                                                training_size * 100,
                                                metric_plot_path)
                        else:
                            # Non-print output: append a CSV row.
                            # NOTE(review): `results` is never populated, so
                            # this writes an empty line — confirm intent.
                            result_file = open(
                                os.path.join(algo_dir, "result.csv"), 'a+')
                            result_file.write(",".join(results))
                            result_file.close()

    def analyze(self, dataset, dataset_dir):
        """Print a class breakdown and (optionally) save scatter/histogram/
        PCA-variance plots for the full dataset."""
        data = self.data_class.load_dataset(dataset, train_size=100)
        (X, Y) = (data['x_train'], data['y_train'])
        print_score.print_breakdown(X, Y)
        if self.shall_plot:
            plot_scatter(X, Y, "%s-orig" % dataset,
                         filename=os.path.join(dataset_dir,
                                               "%s-orig.png" % dataset))
            plot_histogram(X, Y, "%s-hist" % dataset,
                           filename=os.path.join(dataset_dir,
                                                 "%s-hist.png" % dataset))
            pca = PCA()
            pca.fit(X)
            plot_PCA_variance(
                pca.explained_variance_ratio_ * 100,
                "%s-pca-#feature-vs-variance" % dataset,
                filename=os.path.join(dataset_dir,
                                      "%s-pca-variance-ratio" % dataset))

    def print_results(self, training_size, algorithm, dataset,
                      metric_tuples):
        """Dispatch each (metric, y_test, score) tuple to the matching
        print_score.print_<metric> helper."""
        for met_tup in metric_tuples:
            func = getattr(print_score, "print_%s" % met_tup[0])
            func(training_size, algorithm, dataset, met_tup[2])
def train(args):
    """Fine-tune a Korean BERT emotion classifier.

    Loads a pretrained BERT checkpoint (ETRI or SKT flavour), fine-tunes it
    on the training set with optional fp16 (apex) and gradient accumulation,
    validates once per epoch, checkpoints the best model by validation loss,
    logs to TensorBoard, and appends a summary row to
    ``model_saved_finetuning/results.csv``.

    Returns (global_step, last_logged_loss, last_logged_acc, best_val_loss,
    best_val_acc, total_result).
    NOTE(review): ``loss_``/``acc_`` are only bound inside the logging
    branch — if training ends before the first ``logging_step`` boundary,
    the return raises UnboundLocalError; confirm against callers.
    """
    set_seed(args)

    # Set device
    if args.device == 'cuda':
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        logger.info('use cuda')
    else:
        device = torch.device('cpu')
        logger.info('use cpu')

    # Set label list for classification
    # (7 emotion classes: fear/surprise/anger/sadness/neutral/happiness/disgust,
    # or binary positive/negative — labels are Korean strings)
    # NOTE(review): label_list is unbound if num_label is neither value.
    if args.num_label == 'multi':
        label_list = ['공포', '놀람', '분노', '슬픔', '중립', '행복', '혐오']
    elif args.num_label == 'binary':
        label_list = ['긍정', '부정']
    logger.info('use {} labels for training'.format(len(label_list)))

    # Load pretrained model and model configuration
    pretrained_path = os.path.join('./pretrained_model/', args.pretrained_type)
    if args.pretrained_model_path is None:
        # Use pretrained bert model(etri/skt)
        pretrained_model_path = os.path.join(pretrained_path, 'pytorch_model.bin')
    else:
        # Use further-pretrained bert model
        pretrained_model_path = args.pretrained_model_path
    logger.info('Pretrain Model : {}'.format(pretrained_model_path))
    pretrained = torch.load(pretrained_model_path)

    # SKT checkpoints lack the "bert." prefix expected by the wrapper model
    if args.pretrained_type == 'skt' and 'bert.' not in list(pretrained.keys())[0]:
        logger.info('modify parameter names')
        # Change parameter name for consistency
        new_keys_ = ['bert.' + k for k in pretrained.keys()]
        old_values_ = pretrained.values()
        pretrained = {k: v for k, v in zip(new_keys_, old_values_)}

    bert_config = BertConfig(os.path.join(pretrained_path + '/bert_config.json'))
    bert_config.num_labels = len(label_list)
    model = BertForEmotionClassification(bert_config).to(device)
    # strict=False: classification head weights are absent from the checkpoint
    model.load_state_dict(pretrained, strict=False)

    # Load Datasets
    tr_set = Datasets(file_path=args.train_data_path,
                      label_list=label_list,
                      pretrained_type=args.pretrained_type,
                      max_len=args.max_len)
    # Use custom batch function (pads/truncates and adds CLS/SEP)
    collate_fn = ClassificationBatchFunction(args.max_len, tr_set.pad_idx,
                                             tr_set.cls_idx, tr_set.sep_idx)
    tr_loader = DataLoader(dataset=tr_set,
                           batch_size=args.train_batch_size,
                           shuffle=True,
                           num_workers=8,
                           pin_memory=True,
                           collate_fn=collate_fn)

    dev_set = Datasets(file_path=args.dev_data_path,
                       label_list=label_list,
                       pretrained_type=args.pretrained_type,
                       max_len=args.max_len)
    dev_loader = DataLoader(dataset=dev_set,
                            batch_size=args.eval_batch_size,
                            num_workers=8,
                            pin_memory=True,
                            drop_last=False,
                            collate_fn=collate_fn)

    # optimizer with per-layer learning-rate decay
    optimizer = layerwise_decay_optimizer(model=model, lr=args.learning_rate,
                                          layerwise_decay=args.layerwise_decay)

    # lr scheduler: optimizer steps, not batches, define the horizon
    t_total = len(tr_loader) // args.gradient_accumulation_steps * args.epochs
    warmup_steps = int(t_total * args.warmup_percent)
    logger.info('total training steps : {}, lr warmup steps : {}'.format(t_total, warmup_steps))
    # Use gradual warmup and linear decay
    scheduler = optimization.WarmupLinearSchedule(optimizer,
                                                  warmup_steps=warmup_steps,
                                                  t_total=t_total)

    # for low-precision training
    if args.fp16:
        try:
            from apex import amp
            logger.info('Use fp16')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level,
                                          verbosity=0)

    # tensorboard setting — run dir encodes the hyperparameters
    save_path = "./model_saved_finetuning/lr{},batch{},total{},warmup{},len{},{}".format(
        args.learning_rate, args.train_batch_size * args.gradient_accumulation_steps,
        t_total, args.warmup_percent, args.max_len, args.pretrained_type)
    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    writer = SummaryWriter(save_path)

    # Save best model results with resultwriter
    result_writer = utils.ResultWriter("./model_saved_finetuning/results.csv")
    model.zero_grad()

    best_val_loss = 1e+9
    global_step = 0
    # running totals vs. totals at the last logging point — the difference
    # gives the mean over the last logging window
    train_loss, train_acc, train_f1 = 0, 0, 0
    logging_loss, logging_acc, logging_f1 = 0, 0, 0

    logger.info('***** Training starts *****')
    total_result = []
    for epoch in tqdm(range(args.epochs), desc='epochs'):
        for step, batch in tqdm(enumerate(tr_loader), desc='steps', total=len(tr_loader)):
            model.train()
            x_train, mask_train, y_train = map(lambda x: x.to(device), batch)
            inputs = {
                'input_ids': x_train,
                'attention_mask': mask_train,
                'classification_label': y_train,
            }
            output, loss = model(**inputs)
            # predicted class per example (argmax over logits)
            y_max = output.max(dim=1)[1]
            cr = classification_report(y_train.tolist(),
                                       y_max.tolist(),
                                       labels=list(range(len(label_list))),
                                       target_names=label_list,
                                       output_dict=True)
            # Get accuracy(micro f1)
            if 'micro avg' not in cr.keys():
                # sklearn reports 'accuracy' instead of 'micro avg' when all
                # labels are present; it sits right after the per-label rows
                batch_acc = list(cr.items())[len(label_list)][1]
            else:
                # If at least one of labels does not exists in mini-batch, use micro average instead
                batch_acc = cr['micro avg']['f1-score']
            # macro f1
            batch_macro_f1 = cr['macro avg']['f1-score']

            # accumulate measures; scale down so sums stay comparable
            # across accumulation windows
            grad_accu = args.gradient_accumulation_steps
            if grad_accu > 1:
                loss /= grad_accu
                batch_acc /= grad_accu
                batch_macro_f1 /= grad_accu

            if args.fp16:
                # amp scales the loss to avoid fp16 gradient underflow
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()
            train_loss += loss.item()
            train_acc += batch_acc
            train_f1 += batch_macro_f1

            # step optimizer/scheduler only at accumulation boundaries
            if (global_step + 1) % grad_accu == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.grad_clip_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip_norm)
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1

                if global_step % args.logging_step == 0:
                    # window means since the last logging point
                    acc_ = (train_acc - logging_acc) / args.logging_step
                    f1_ = (train_f1 - logging_f1) / args.logging_step
                    loss_ = (train_loss - logging_loss) / args.logging_step
                    writer.add_scalars('loss', {'train': loss_}, global_step)
                    writer.add_scalars('acc', {'train': acc_}, global_step)
                    writer.add_scalars('macro_f1', {'train': f1_}, global_step)
                    logger.info('[{}/{}], trn loss : {:.3f}, trn acc : {:.3f}, macro f1 : {:.3f}'.format(
                        global_step, t_total, loss_, acc_, f1_))
                    logging_acc, logging_f1, logging_loss = train_acc, train_f1, train_loss

                    # Get f1 score for each label (per-label rows come first
                    # in the classification_report dict)
                    f1_results = [(l, r['f1-score']) for i, (l, r) in enumerate(cr.items())
                                  if i < len(label_list)]
                    f1_log = "\n".join(["{} : {}".format(l, f) for l, f in f1_results])
                    logger.info("\n\n***f1-score***\n" + f1_log + "\n\n***confusion matrix***\n{}".format(
                        confusion_matrix(y_train.tolist(), y_max.tolist())))

        # Validation
        # NOTE(review): placement reconstructed as once per epoch (after the
        # inner step loop) — confirm against the original file's indentation.
        val_loss, val_acc, val_macro_f1, _ = evaluate(args, dev_loader, model, device)
        val_result = '[{}/{}] val loss : {:.3f}, val acc : {:.3f}. val macro f1 : {:.3f}'.format(
            global_step, t_total, val_loss, val_acc, val_macro_f1)
        writer.add_scalars('loss', {'val': val_loss}, global_step)
        writer.add_scalars('acc', {'val': val_acc}, global_step)
        writer.add_scalars('macro_f1', {'val': val_macro_f1}, global_step)
        logger.info(val_result)
        total_result.append(val_result)

        # keep only the checkpoint with the lowest validation loss
        if val_loss < best_val_loss:
            # Save model checkpoints
            torch.save(model.state_dict(), os.path.join(save_path, 'best_model.bin'))
            torch.save(args, os.path.join(save_path, 'training_args.bin'))
            logger.info('Saving model checkpoint to %s', save_path)
            best_val_loss = val_loss
            best_val_acc = val_acc
            best_val_macro_f1 = val_macro_f1

    # Save results in 'model_saved_finetuning/results.csv'
    results = {
        'val_loss': best_val_loss,
        'val_acc': best_val_acc,
        'val_macro_f1': best_val_macro_f1,
        'save_dir': save_path,
        'pretrained_path': pretrained_path,
    }
    result_writer.update(args, **results)
    return global_step, loss_, acc_, best_val_loss, best_val_acc, total_result
if 'cosine' in data['tools']: initial_head = initial_head + ", th_cosine, size_result_cosine, f1_score_cosine, recall_cosine, runtime_cosine" if 'giga' in data['tools']: initial_head = initial_head + ", th_giga, size_result_giga, f1_score_giga, recall_giga, runtime_giga" if 'knode' in data['tools']: initial_head = initial_head + ", th_knode, size_result_knode, f1_score_knode, recall_knode, runtime_knode" if 'aminsga2' in data['tools']: initial_head = initial_head + ", th_aminsga2, size_result_aminsga2, f1_score_aminsga2, recall_aminsga2, runtime_aminsga2" initial_head = initial_head + ", th_baseline, f1_score_baseline, recall_baseline, runtime_baseline \n" outfile.write(initial_head) for ctr in range(arg.number_of_runs): if arg.graph_generation == "guyondata": G = Datasets.get_guyon_graph(ctr + 1) else: G = Datasets.get_scale_free_graph_edge(arg.network_size, initial_module, nb_modules, arg.module_size, arg.prob_p, arg.prob_q, arg.removed_edges, rng) rate_conection = len(G.edges) / len(G.nodes) average_shortest_paths = [] for _, cluster in Datasets.get_groups(G).items(): nodes = list(cluster) average_shortest_paths.append( Scores.average_shortest_path(G, nodes)) result = str(ctr) + "," + str(arg.network_size) + "," + str(