def test_dataroot(self):
    """The DATAROOT environment variable must point to an existing directory."""
    from src.utils import load_config
    load_config()
    fail = ''
    # Sanity check: the fallback sentinel must never itself be a directory,
    # otherwise the assertion below could pass vacuously.
    self.assertFalse(os.path.isdir(fail))
    path = os.environ.get('DATAROOT', fail)
    # FIX: the old message said "is not set" even when DATAROOT *was* set but
    # pointed at a non-directory; report both failure causes accurately.
    self.assertTrue(
        os.path.isdir(path),
        f"Environment variable $DATAROOT is unset or not a directory: {path!r}")
def main():
    """Build the run config from file + CLI args, then dispatch on mode."""
    config = utils.load_config(args.config)
    # Record whether the loaded config file carried a "weighted" entry
    # (must be evaluated before the key is overwritten).
    config["weighted"] = "weighted" in config
    # Mirror the selected CLI arguments into the config dict.
    for key in ("mode", "fold", "apex", "output", "snapshot"):
        config[key] = getattr(args, key)
    # config["resume_from"] = args.resume_from if args.resume_from
    utils.set_seed(SEED)
    device = torch.device(DEVICE)
    log(f"Fold {args.fold}")
    model = factory.get_model(config).to(device)
    log(f"Model type: {model.__class__.__name__}")
    mode = config["mode"]
    if mode == 'train':
        train(config, model)
        valid(config, model)
    elif mode == 'valid':
        valid(config, model)
    elif mode == 'test':
        valid(config, model, all_exam=True)
def main():
    """Run inference with a saved model; print predictions and, when labels
    are configured, a classification report plus Cohen's kappa."""
    cfg = load_config()
    print('Loading Model & Data.')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = torch.load(cfg['INFERENCE_MODEL']).to(device)
    model.eval()
    data = tensor(load_pickle(cfg['INFERENCE_DATA'])).float().to(device)
    label = None
    # FIX: the guard previously tested len(cfg['INFERENCE_DATA']) — that path
    # was already used above (so it is always non-empty) and the guard was
    # vacuous. It must test the *label* path instead.
    if len(cfg['INFERENCE_LABEL']) > 0:
        label = load_pickle(cfg['INFERENCE_LABEL'])
    print('Starting Inference.')
    with torch.no_grad():
        y_hat = model(data)
        y_hat = torch.argmax(y_hat, dim=1).tolist()
    print(y_hat)
    if label is not None:
        print('Generating Report')
        print(
            classification_report(label, y_hat,
                                  target_names=['SR', 'SB', 'AFIB', 'ST']))
        print('Kappa Score: {:.2f}'.format(cohen_kappa_score(label, y_hat)))
def __init__(self, url, config):
    """Initialize scraper state: parsed config, source URL, and the
    container-tracking flags/counters used while walking the HTML."""
    super(BasicWebScraper, self).__init__()
    self.config = load_config(config)
    self.source_url = url
    # Parser state: whether we are currently inside the (sub-)container
    # element, plus nesting counters used to detect when we leave it.
    self.inside_container = self.inside_sub_container = False
    self.cont_tag_counter = self.sub_cont_tag_counter = 0
def main():
    """Train or test the UNet (L1-loss) colorization model, per --action."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", type=str, default='train',
                        help="train or test")
    args = parser.parse_args()
    config = load_config()
    # TensorBoard writer (makedirs with exist_ok avoids a race / rerun crash)
    os.makedirs(config.RUN_PATH, exist_ok=True)
    writer = SummaryWriter(log_dir=config.RUN_PATH)
    # Seed every RNG source for reproducibility.
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)
    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")
    net = UNet(2).to(config.DEVICE)
    # FIX: removed a leftover debug print that instantiated (and potentially
    # downloaded) torchvision resnet18 just to print its 8th child module.
    optimizer = optim.Adam(net.parameters(), betas=(0.5, 0.999), lr=config.LR)
    loss = nn.L1Loss()
    # Load the dataset for the requested phase.
    if args.action == 'train':
        train_dataset = LABDataset(config, config.TRAIN_PATH)
        len_train = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
        iter_per_epoch = len(train_loader)
        train_(config, train_loader, net, optimizer, loss, len_train,
               iter_per_epoch, writer)
    if args.action == "test":
        test_dataset = LABDataset(config, config.TEST_PATH)
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                                  shuffle=False)
        test(config, test_loader, net, loss)
def main():
    """Train the GRU tweet classifier end-to-end.

    Loads config, builds the dataset/loader, trains for cfg['EPOCHS'] epochs,
    and logs mean loss and accuracy per epoch through the summary agent.
    """
    print('Loading config')
    cfg = load_config()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Utilizing device {}'.format(device))
    print('Initializing Dataloader & Dataset.')
    training_data = TweetDataset(cfg['PROCESSED_DATA_DIR'])
    # Variable-length tweets are handled by the custom collate_fn.
    data_loader = DataLoader(dataset=training_data,
                             batch_size=cfg['BATCH_SIZE'],
                             shuffle=True,
                             num_workers=4,
                             collate_fn=collate_var_sequences,
                             pin_memory=True)
    print('Initializing Model & Components.')
    model = GRUPredictor(num_classes=training_data.num_classes,
                         hidden_size=cfg['HIDDEN_SIZE'],
                         batch_size=cfg['BATCH_SIZE'],
                         num_layers=cfg['NUM_LAYERS'],
                         drop_out=cfg['DROPOUT']).to(device)
    optimizer = Adam(model.parameters(), lr=cfg['LR'])
    loss = CrossEntropyLoss()
    print('Initializing Helper Agents.')
    tracker = TrackingAgent(cfg['BATCH_SIZE'], len(training_data))
    summary = SummaryAgent(cfg['SUMMARY_PATH'], model.name, cfg)
    print('Start training with {} epochs'.format(cfg['EPOCHS']))
    for e in range(1, cfg['EPOCHS'] + 1):
        for i_batch, sample_batched in enumerate(tqdm(data_loader, leave=False)):
            x_sequence = sample_batched['tweet'].to(device)
            y = sample_batched['label'].to(device)
            y_hat = model(x_sequence)
            batch_loss = loss(y_hat, y)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            tracker.add_loss(batch_loss)
            tracker.add_correct_class(y_hat, y)
        # End of epoch: aggregate metrics, log, checkpoint, reset tracker.
        mean_loss = tracker.get_loss()
        accuracy = tracker.get_accuracy()
        tracker.reset()
        summary.add_scalar('Loss', mean_loss)
        summary.add_scalar('Accuracy', accuracy)
        summary.save_model(model)
        summary.adv_episode()
        summary.flush()
        print('Ep. {0}; Mean Epoch Loss {1:.2f}; Train Acc. {2:.2f}'.format(
            e, mean_loss, accuracy))
    summary.close()
def run_web_scraping_and_save_data(cls):
    """Runs the web scraping workflow, collects the product information
    data for each main and sub-category, and then saves the whole list
    into a CSV file.

    # Improvement: save the currency and price separately, to have
    # integer price fields, for example.
    # Many improvements could be done ...
    """
    def _category_name(url_part):
        # "123-some-category" -> "some-category"
        return url_part[url_part.find('-') + 1:]

    # FIX: DataFrame.append in a loop is quadratic and was removed in
    # pandas 2.0 — collect the per-product frames and concatenate once.
    frames = []
    if load_config(MAIN_URL_COLLECTION_NEEDED):
        url = load_config(MAIN_SOURCE_URL)
        main_url_collector = URLCollector(url, MAIN_URLS_TO_SCRAP_CONFIG)
        main_url_list = main_url_collector.get_urls_to_scrap()
        print(main_url_list)
        if load_config(PRODUCT_SUB_URL_COLL_NEEDED):
            for main_url in main_url_list:
                url = load_config(SUB_SOURCE_URL_BEGINNING) + main_url
                sub_url_collector = URLCollector(
                    url, PRODUCT_SUB_URLS_TO_SCRAP_CONFIG)
                sub_url_list = sub_url_collector.get_urls_to_scrap()
                print(sub_url_list)
                for sub_url in sub_url_list:
                    url = load_config(SUB_SOURCE_URL_BEGINNING) + sub_url
                    product_scrapper = ProductScraper(
                        url, PRODUCT_SCRAPING_CONFIG)
                    product_data = product_scrapper.get_product_scraping_data()
                    cur_product_df = pd.DataFrame(product_data)
                    # Category names are derived from URL path segments,
                    # dropping the leading numeric id before the first '-'.
                    cur_product_df.insert(
                        0, MAIN_CATEG_COL, _category_name(main_url.split('/')[2]))
                    cur_product_df.insert(
                        1, SUB_CATEG_COL, _category_name(sub_url.split('/')[3]))
                    frames.append(cur_product_df)
    product_df = pd.concat(frames) if frames else pd.DataFrame()
    print(len(product_df))
    print(product_df.head())
    product_df.to_csv(RESULT_FILE)
    return product_df
def __init__(self, *args, **kwargs):
    """Root Tk window: sets up the frame container and shows the start page."""
    tk.Tk.__init__(self, *args, **kwargs)
    self.title('Minter')
    self.geometry('300x300')
    # Single container frame that hosts all step pages.
    holder = tk.Frame(self)
    holder.pack(side='top', fill='both', expand=True)
    holder.grid_columnconfigure(0, weight=1)
    # Application state shared across pages.
    self.config = load_config()
    self.context = {}
    self.steps = {}
    self._show_frame(holder, StartPage)
def test_gender_analyze(self):
    """Smoke-test gender_analyze on a 250-row IMDB slice; pickle the log."""
    import os
    import pickle
    from src.gender_estimation import gender_analyze
    from src.datasets import ImdbDataset, unpickle_imdb
    from src.utils import load_config
    from src.convnets.utils import IMAGENET_MEAN, IMAGENET_STD
    from torchvision import transforms

    load_config()
    # Standard ImageNet preprocessing at 64x64.
    preprocess = transforms.Compose([
        transforms.Resize(64),
        transforms.CenterCrop(64),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    imdb_root = os.environ['IMDB_ROOT']
    df = unpickle_imdb(f"{imdb_root}/imdb.pickle")
    dataset = ImdbDataset(root=imdb_root,
                          df=df[:250],
                          transform=preprocess,
                          include_path=True)
    log = gender_analyze(self.weights, dataset)
    pickle.dump(log, open('tmp/gender_analyze_log.p', 'wb'))
def main(mode): config = load_config() # 使用tensorboard time_now = datetime.now().isoformat() if not os.path.exists(config.RUN_PATH): os.mkdir(config.RUN_PATH) writer = SummaryWriter(log_dir=config.RUN_PATH) # 随机数种子 torch.manual_seed(config.SEED) torch.cuda.manual_seed(config.SEED) np.random.seed(config.SEED) random.seed(config.SEED) # INIT GPU os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU) if torch.cuda.is_available(): config.DEVICE = torch.device("cuda") print('\nGPU IS AVAILABLE') torch.backends.cudnn.benchmark = True else: config.DEVICE = torch.device("cpu") net = UNet(1).to(config.DEVICE) optimizer = optim.Adam(net.parameters(), betas=(0.5, 0.999), lr=config.LR) #criterion = nn.CrossEntropyLoss() # 定义loss函数 criterion = nn.BCELoss() # 加载数据集 if mode == 1: train_dataset = MyDataset(config, config.TRAIN_PATH) len_train = len(train_dataset) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=config.BATCH_SIZE, shuffle=True) iter_per_epoch = len(train_loader) train_(config, train_loader, net, optimizer, criterion, len_train, iter_per_epoch, writer) if mode == 2: test_dataset = MyDataset(config, config.TEST_PATH) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False) test(config, test_loader, net, criterion)
def step_impl(context):
    """Verify login succeeded: output message + stored session in the config."""
    context.execute_steps(u'''
        Then the output should contain:
            """
            Successfully logged in!
            """
        ''')
    expected_session = has_entries({
        'email': CONFIG['DEFORM']['EMAIL'],
        'session_id': is_not(None)
    })
    assert_that(load_config(), has_entry('session', expected_session))
def main():
    """Train the depth-estimation model on NYUv2 via PyTorch Lightning."""
    args = parse_args()

    # Configuration: CLI dot-list entries override the file values.
    config: DictConfig = load_config(args.config, update_dotlist=args.opts)
    print_config(config)

    # Model and data modules.
    model = DepthPLModel(config)
    datamodule = Nyu2DataModule(config)

    # Fit.
    trainer = get_trainer(args, config)
    trainer.fit(model, datamodule)
def main(argv):
    """Create a timestamped experiment directory, snapshot the config
    file into it, and run training."""
    config = load_config(FLAGS.config_path)

    # Experiment directory: <EXPERIMENT_PATH>/<name>_<timestamp>
    timestamp = datetime.now().strftime("_%Y%m%d-%H%M%S")
    experiment_dir = os.path.join(EXPERIMENT_PATH,
                                  FLAGS.experiment_name + timestamp)
    os.makedirs(experiment_dir)

    # Keep a copy of the config used for this run next to its outputs.
    shutil.copy2(FLAGS.config_path,
                 os.path.join(experiment_dir, EXPERIMENT_CONFIG_NAME))

    # Initialize trainer and train.
    Trainer(config, experiment_dir).train()
def main():
    """Interactive character-level tweet generation loop.

    Loads the char<->int maps and the trained LSTM, then repeatedly reads a
    seed string from stdin and samples one character at a time until the
    length/stop criteria are met. Type "exit" to quit.
    """
    cfg = load_config()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Utilizing device {}'.format(device))
    map_char_to_int = load_pickle(cfg['PROCESSED_DATA_DIR'] + '/map_char_to_int.pkl')
    map_int_to_char = load_pickle(cfg['PROCESSED_DATA_DIR'] + '/map_int_to_char.pkl')
    num_chars = len(map_int_to_char)
    # Identity matrix used as a one-hot lookup table.
    ohe_mapping = torch.eye(num_chars).to(device)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # NOTE(review): duplicates the assignment above
    print('Utilizing device {}'.format(device))
    print('Loading Model')
    model = LSTMPredictor(num_chars, cfg['HIDDEN_SIZE'],
                          cfg['BATCH_SIZE']).to(device)
    model.load_state_dict(torch.load(cfg['MODEL_PATH']))
    model.eval()
    while True:
        input_str = input('Tweet: ')
        if input_str == "exit":
            break
        input_str = input_str.lower()
        while True:
            # Re-encode the full string each step: the model always sees
            # the whole prefix generated so far.
            input_enc = [map_char_to_int[char] for char in list(input_str)]
            input_ohe = ohe_mapping[input_enc].unsqueeze(dim=0)
            with torch.no_grad():
                output = model(input_ohe)
            output_prob = torch.softmax(output, dim=1).squeeze().cpu().numpy()
            # Sample the next character from the predicted distribution.
            result = np.random.choice(num_chars, p=output_prob)
            input_str += map_int_to_char[result]
            # Stop at 140 chars, or once >=100 chars end with a period.
            if len(input_str) >= 140 or (len(input_str) >= 100
                                         and input_str[-1] == "."):
                print(input_str)
                break
def main():
    """Tweet preprocessing pipeline: clean text, build char<->int maps,
    and persist the resulting training arrays and mapping tables."""
    with Timer('Loading config'):
        cfg = load_config()
    with Timer('Loading tweets'):
        tweets = load_raw_data(cfg['RAW_DATA_PATH'])
    with Timer('Cleaning sentences'):
        tweet_text = cleanse_sentences(list(tweets['text']))
    with Timer('Mapping characters to integers'):
        tweet_enc, map_char_to_int, map_int_to_char = map_tweets_to_int(tweet_text)
    with Timer('Producing dataset'):
        tweet_train, tweet_label = produce_dataset(tweet_enc)
    with Timer('Save dataset and mapping tables'):
        # Persist every artifact under the processed-data directory.
        artifacts = (
            (tweet_train, '/train.pkl'),
            (tweet_label, '/label.pkl'),
            (map_char_to_int, '/map_char_to_int.pkl'),
            (map_int_to_char, '/map_int_to_char.pkl'),
        )
        for obj, suffix in artifacts:
            save_pickle(obj, cfg['PROCESSED_DATA_DIR'] + suffix)
def main():
    """ Load raw ECG data from disc and transform it to cleansed training data. """
    cfg = load_config()
    with Timer('Getting label list'):
        labels, file_list = get_labels(cfg['RAW_DATA_PATH'] + '/Diagnostics.xlsx')
    with Timer('Loading & Downsampling files'):
        ecg_data = get_ecg_data(cfg['RAW_DATA_PATH'] + '/ECGDataDenoised',
                                file_list, cfg['DOWNSAMPLE_THRESHOLD'],
                                cfg['DATA_SLICE'], cfg['NUM_WORKERS'])
    with Timer('Imputing missing values'):
        ecg_data = impute_nans(ecg_data)
    with Timer('Splitting into Train & Test Set'):
        # Stratified 80/20 split so the label distribution is preserved.
        x_train, x_test, y_train, y_test = train_test_split(ecg_data,
                                                            labels,
                                                            test_size=0.2,
                                                            shuffle=True,
                                                            stratify=labels,
                                                            random_state=42)
        print('Final Training set has {} samples'.format(len(x_train)))
        print('Final Test set has {} samples'.format(len(x_test)))
        print('Distribution of labels in Training: {}'.format(Counter(y_train)))
        print('Distribution of labels in Testing: {}'.format(Counter(y_test)))
    with Timer('Normalizing data'):
        x_train, x_test = normalize_data(x_train, x_test)
    with Timer('Saving generated arrays'):
        # Persist all four arrays under the processed-data directory.
        for obj, suffix in ((x_train, '/train_data.pkl'),
                            (y_train, '/train_label.pkl'),
                            (x_test, '/test_data.pkl'),
                            (y_test, '/test_label.pkl')):
            save_pickle(obj, cfg['PROCESSED_DATA_DIR'] + suffix)
def main():
    """Train the LSTM/GRU toxicity model with stratified K-fold CV and write
    a (valid_)submission.csv of averaged fold predictions.

    With --valid, the last 200k rows of a shuffled train set serve as a
    pseudo test set; otherwise the real test CSV is used.
    """
    parser = ArgumentParser()
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()
    config = load_config('./config/lstm_f.json')
    # Fill in defaults for any keys missing from the JSON config.
    config.setdefault('max_len', 220)
    config.setdefault('max_features', 100000)
    config.setdefault('batch_size', 512)
    config.setdefault('train_epochs', 10)
    config.setdefault('tolerance', 10)
    config.setdefault('num_folds', 5)
    config.setdefault('lr', 1e-3)
    config.setdefault('loss_alpha', 0.1)
    config.setdefault('loss_beta', 1.0)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1029)
    device = torch.device(config.device)
    OUT_DIR = Path(f'../output/lstm_f/')
    submission_file_name = 'valid_submission.csv' if args.valid else 'submission.csv'
    SUBMISSION_PATH = OUT_DIR / submission_file_name
    OUT_DIR.mkdir(exist_ok=True)
    warnings.filterwarnings('ignore')
    seed_torch(config.seed)
    with timer('preprocess'):
        train = pd.read_csv(TRAIN_DATA)
        if args.valid:
            # Hold out the tail of a shuffled train set as a pseudo test set.
            train = train.sample(frac=1, random_state=1029).reset_index(drop=True)
            test = train.tail(200000)
            train = train.head(len(train) - 200000)
        else:
            test = pd.read_csv(TEST_DATA)
        train['comment_text'] = train['comment_text'].apply(preprocess)
        test['comment_text'] = test['comment_text'].apply(preprocess)
        # replace blank with nan
        train['comment_text'].replace('', np.nan, inplace=True)
        test['comment_text'].replace('', np.nan, inplace=True)
        # Mean target of rows whose text became NaN after preprocessing.
        # NOTE(review): nan_pred appears unused below — confirm.
        nan_pred = train['target'][train['comment_text'].isna()].mean()
        # fill up the missing values
        X_train = train['comment_text'].fillna('_##_').values
        X_test = test['comment_text'].fillna('_##_').values
        # get the target values
        weights = training_weights(train, TOXICITY_COLUMN, IDENTITY_COLUMNS)
        loss_weight = 1.0 / weights.mean()
        y_train_identity = train[IDENTITY_COLUMNS].values
        y_train_annotator_counts = train['toxicity_annotator_count'].values
        # y_train columns: [target, per-row weight, aux toxicity targets...]
        y_train = np.hstack(
            (train[TOXICITY_COLUMN].values.reshape(-1, 1),
             weights.reshape(-1, 1), train[AUX_TOXICITY_COLUMNS].values))
        # NOTE(review): the nan masks appear unused below — confirm.
        train_nan_mask = X_train == '_##_'
        test_nan_mask = X_test == '_##_'
        vocab = build_vocab(chain(X_train), config.max_features)
        fasttext_embedding_matrix = load_embedding(EMBEDDING_FASTTEXT,
                                                   vocab['token2id'])
        glove_embedding_matrix = load_embedding(EMBEDDING_GLOVE,
                                                vocab['token2id'])
        joblib.dump(vocab, OUT_DIR / 'vocab.pkl')
        np.save(OUT_DIR / 'fasttext_embedding_matrix', fasttext_embedding_matrix)
        np.save(OUT_DIR / 'glove_embedding_matrix', glove_embedding_matrix)
        X_train = np.array(tokenize(X_train, vocab, config.max_len))
        X_test = np.array(tokenize(X_test, vocab, config.max_len))
        all_related_columns = [TOXICITY_COLUMN
                               ] + AUX_TOXICITY_COLUMNS + IDENTITY_COLUMNS
        # Indices of rows whose toxicity/aux/identity columns are all 0.0.
        negative_indices = np.arange(
            0, len(train))[(train[all_related_columns] == 0.0).sum(
                axis=1) == len(all_related_columns)]
    with timer('train'):
        skf = StratifiedKFold(n_splits=config.num_folds,
                              shuffle=True,
                              random_state=1)
        # y_train holds [target, weight, aux...]; aux count excludes the first two.
        num_aux_targets = y_train.shape[-1] - 2
        custom_loss = CustomLoss(
            loss_weight,
            alpha=config.loss_alpha,
            beta=config.loss_beta,
            use_annotator_counts=True,
            weight_from_annotator_counts=lambda x: torch.log(x + 2))
        test_dataset = TextDataset(token_lists=X_test)
        test_prediction = np.zeros(len(test_dataset))
        test_prediction_count = 0
        models = {}  # NOTE(review): never populated — confirm.
        for i, (train_idx, valid_idx) in enumerate(
                skf.split(X_train, y_train[:, 0] >= 0.5)):
            seed_torch(i)
            # Randomly drop half of the all-zero ("negative") rows each fold.
            np.random.shuffle(negative_indices)
            drop_indices = set(negative_indices[:len(negative_indices) // 2])
            train_idx = [i for i in train_idx if i not in drop_indices]
            train_token_lists = [X_train[i] for i in train_idx]
            valid_token_lists = [X_train[i] for i in valid_idx]
            train_dataset = TextDataset(
                token_lists=train_token_lists,
                targets=y_train[train_idx],
                identities=y_train_identity[train_idx],
                annotator_counts=y_train_annotator_counts[train_idx])
            valid_dataset = TextDataset(
                token_lists=valid_token_lists,
                targets=y_train[valid_idx],
                identities=y_train_identity[valid_idx],
                annotator_counts=y_train_annotator_counts[valid_idx])
            model = LstmGruNet(embedding_matrices=[
                glove_embedding_matrix, fasttext_embedding_matrix
            ],
                               num_aux_targets=num_aux_targets).to(device)
            model, records = train(model,
                                   custom_loss,
                                   train_dataset,
                                   valid_dataset,
                                   device=device,
                                   batch_size=config.batch_size,
                                   num_epochs=config.train_epochs,
                                   tolerance=config.tolerance,
                                   lr=config.lr)
            # Accumulate per-fold test predictions for averaging below.
            test_prediction += predict(model, test_dataset, device)
            test_prediction_count += 1
            torch.save(model.state_dict(), OUT_DIR / f'model.{i}.json')
            with open(OUT_DIR / f'records.{i}.json', 'w') as f:
                import json
                json.dump(records, f, indent=4)
    submission = pd.DataFrame({
        'id': test['id'],
        'prediction': test_prediction / test_prediction_count
    })
    submission.to_csv(SUBMISSION_PATH, index=False)
    display_tables(OUT_DIR)
def main():
    """Blend model predictions: tune blend weights with Optuna over repeated
    half/half splits of the validation pool, then report per-fold and
    "robust" (train/valid gap below threshold) statistics.
    """
    config = load_config('./config/blend.json')
    config.setdefault('n_folds', 10)
    config.setdefault('n_trials', 300)
    config.setdefault('threshold', 0.03)
    # Stack the per-model validation submissions into one frame.
    df_valid = pd.concat([
        pd.read_csv(path / 'valid_submission.csv', index_col='id')
        for path in VALID_DIR
    ],
                         axis=0)
    train_scores = []
    valid_scores = []
    params = {model: [] for model in models}
    for i in range(config.n_folds):
        # Fresh shuffled half/half split of the pool for every fold.
        df_valid = df_valid.sample(frac=1,
                                   random_state=i).reset_index(drop=True)
        train_fold = df_valid[:len(df_valid) // 2]
        valid_fold = df_valid[len(df_valid) // 2:]
        train_evaluator = JigsawEvaluator(train_fold[TOXICITY_COLUMN].values,
                                          train_fold[IDENTITY_COLUMNS].values)
        valid_evaluator = JigsawEvaluator(valid_fold[TOXICITY_COLUMN].values,
                                          valid_fold[IDENTITY_COLUMNS].values)
        study = optuna.create_study()
        study.optimize(
            lambda trial: objective(trial, train_fold.values, train_evaluator),
            n_trials=config.n_trials)
        trial = study.best_trial
        # objective minimizes (1 - metric), so recover the metric here.
        train_scores.append(1 - trial.value)
        # Record normalized weights (sum to 1) per model.
        values = np.array(list(trial.params.values()))
        values /= values.sum()
        for key, value in zip(trial.params.keys(), values):
            params[key].append(value)
        # NOTE(review): validation blending uses the *unnormalized* trial
        # weights while `params` stores normalized ones — confirm intended.
        valid_preds = np.zeros((len(valid_fold)))
        for key, value in trial.params.items():
            valid_preds += valid_fold[key].values * value
        score, _ = valid_evaluator.get_final_metric(valid_preds)
        valid_scores.append(score)
    for i, (train_score, valid_score) in enumerate(zip(train_scores, valid_scores)):
        print(
            f'fold {str(i + 1):2s} - train: {train_score:.5f}, valid: {valid_score:.5f}'
        )
    print('-' * 20)
    print(
        f'train mean: {np.mean(train_scores):.5f}, var: {np.var(train_scores):.7f}'
    )
    print(
        f'valid mean: {np.mean(valid_scores):.5f}, var: {np.var(valid_scores):.7f}'
    )
    print('-' * 20)
    for key, values in params.items():
        print(f'{key:25s} {np.mean(values):.6f} {np.var(values):.6f}')
    print('-' * 20)
    print(f'robust folds: threshold {config.threshold}')
    # Keep only folds whose train/valid scores agree within the threshold.
    robust_folds = []
    robust_train_scores = []
    robust_valid_scores = []
    for i, (train_score, valid_score) in enumerate(zip(train_scores, valid_scores)):
        if np.abs(train_score - valid_score) < config.threshold:
            robust_folds.append(i)
            robust_train_scores.append(train_score)
            robust_valid_scores.append(valid_score)
    print(' '.join(map(str, robust_folds)))
    print('-' * 20)
    print(
        f'train mean: {np.mean(robust_train_scores):.5f}, var: {np.var(robust_train_scores):.7f}'
    )
    print(
        f'valid mean: {np.mean(robust_valid_scores):.5f}, var: {np.var(robust_valid_scores):.7f}'
    )
    print('-' * 20)
    for key, values in params.items():
        robust_values = np.array(values)[robust_folds]
        print(
            f'{key:25s} {np.mean(robust_values):.6f} {np.var(robust_values):.6f}'
        )
metric=eval_metrics[main_metric], prev_metric=best_metric) save_model(ema_model, logdir, "ema.pth") elapsed_sec = time.time() - t0 elapsed_min = int(elapsed_sec // 60) elapsed_sec = elapsed_sec % 60 logger.info( f"Elapsed time: {elapsed_min}min {elapsed_sec:.4f}seconds.") if __name__ == "__main__": warnings.filterwarnings("ignore") args = utils.get_parser().parse_args() config = utils.load_config(args.config) global_params = config["globals"] output_dir = Path(global_params["output_dir"]) output_dir.mkdir(exist_ok=True, parents=True) logger = utils.get_logger(output_dir / "output.log") utils.set_seed(global_params["seed"]) device = C.get_device(global_params["device"]) df, datadir = C.get_metadata(config) splitter = C.get_split(config) calltype_labels = C.get_calltype_labels(df)
def main():
    """Build the colorization GAN (UNet generator + DNet discriminator)
    and either train or test it depending on --action."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", type=str, default='test',
                        help="train or test")
    args = parser.parse_args()
    config = load_config()

    # TensorBoard: one run sub-directory per invocation, keyed by timestamp.
    time_now = datetime.now().isoformat()
    if not os.path.exists(config.RUN_PATH):
        os.mkdir(config.RUN_PATH)
    writer = SummaryWriter(log_dir=os.path.join(config.RUN_PATH, time_now))

    # Seed every RNG source for reproducibility.
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed,
                    np.random.seed, random.seed):
        seed_fn(config.SEED)

    # GPU initialisation.
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    # Generator (UNet) and discriminator (DNet).
    generator = UNet(True).to(config.DEVICE)
    discriminator = DNet().to(config.DEVICE)

    # Adam optimizers plus BCE (adversarial) and L1 (reconstruction) losses.
    d_optimizer = optim.Adam(discriminator.parameters(), betas=(0.5, 0.999),
                             lr=config.LR)
    g_optimizer = optim.Adam(generator.parameters(), betas=(0.5, 0.999),
                             lr=config.LR)
    d_loss = nn.BCELoss()
    g_loss_bce = nn.BCELoss()
    g_loss_l1 = nn.L1Loss()

    # Load the dataset for the requested phase.
    if args.action == 'train':
        train_dataset = LABDataset(config, config.TRAIN_PATH)
        len_train = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
        iter_per_epoch = len(train_loader)
        train_(config, train_loader, generator, discriminator, d_optimizer,
               g_optimizer, d_loss, g_loss_bce, g_loss_l1, len_train,
               iter_per_epoch, writer)
    if args.action == "test":
        test_dataset = LABDataset(config, config.TEST_PATH)
        test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1,
                                                  shuffle=False)
        test(config, test_loader, generator, g_loss_l1)
def step_impl(context):
    """The stored config must point at the project selected in CONFIG."""
    expected_project = CONFIG['DEFORM']['PROJECT']
    assert_that(load_config(), has_entry('current_project', expected_project))
import os
import argparse

from src.utils import load_config, train, train_am, load_model, inference, make_submission

if __name__ == '__main__':
    # Parse the config path and kick off angular-margin training.
    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        type=str,
                        default='config/ResNet50.yml',
                        help='path to config file')
    cli_args = parser.parse_args()

    config = load_config(cli_args.config)
    train_am(config)

    # Example usage kept for reference:
    # model = load_model('saved_model/resnet50_acc_0.785.pkl', config)
    # make_submission(model, config, 'Train-Test-Data/public-test.csv', 'submission.csv')
    # orig_folder = 'Train-Test-Data/dataset/439-F-38'
    # files = os.listdir(orig_folder)
    # for fn in files:
    #     output = inference(model, os.path.join(orig_folder, fn), config)
    #     print(output.item())
def main():
    """Train an ECG classifier (architecture chosen by ModelFactory).

    Per epoch: one optimizing pass over the train set, one no-grad pass over
    the test set, then metric/telemetry logging and a model checkpoint.
    """
    print('Loading config')
    cfg = load_config()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Utilizing device {}'.format(device))
    print('Initializing Dataloader & Dataset.')
    train_data = ECGDataset(cfg['PROCESSED_DATA_DIR'])
    test_data = ECGDataset(cfg['PROCESSED_DATA_DIR'], test=True)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=cfg['BATCH_SIZE'],
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)
    test_loader = DataLoader(dataset=test_data,
                             batch_size=cfg['BATCH_SIZE'],
                             num_workers=4,
                             pin_memory=True)
    print('Initializing Model & Components.')
    # Assumes train_data.data is (samples, length, channels) — TODO confirm.
    model_factory = ModelFactory(cfg,
                                 num_classes=train_data.num_classes,
                                 input_size=train_data.data.shape[2],
                                 input_length=train_data.data.shape[1])
    model = model_factory.get().to(device)
    optimizer = Adam(model.parameters(), lr=cfg['LR'])
    loss = CrossEntropyLoss()
    print('Initializing Helper Agents.')
    tracker = TrackingAgent()
    summary = SummaryAgent(cfg['SUMMARY_PATH'], model.name, cfg)
    print('Start training with {} epochs'.format(cfg['EPOCHS']))
    for e in range(1, cfg['EPOCHS'] + 1):
        tracker.start_time()
        model.train()
        for i_batch, sample_batched in enumerate(tqdm(train_loader, leave=False)):
            x_sequence = sample_batched['data'].to(device)
            y = sample_batched['label'].to(device)
            y_hat = model(x_sequence)
            batch_loss = loss(y_hat, y)
            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()
            tracker.add_train_loss(batch_loss)
            tracker.add_train_prediction(y_hat, y)
        tracker.stop_time()
        tracker.add_cpu_usage()
        tracker.add_gpu_usage()
        # Evaluation pass (no gradients).
        model.eval()
        for i_batch, sample_batched in enumerate(test_loader):
            with torch.no_grad():
                x_test = sample_batched['data'].to(device)
                y_test = sample_batched['label'].to(device)
                y_hat = model(x_test)
                test_loss = loss(y_hat, y_test)
                tracker.add_test_loss(test_loss)
                tracker.add_test_prediction(y_hat, y_test)
        # Log epoch metrics; metrics tuples are (acc, precision, recall, f1).
        train_metrics = tracker.get_train_metrics()
        train_loss = tracker.get_train_loss()
        summary.add_scalar('Train Loss', train_loss)
        summary.add_scalar('Train Accuracy', train_metrics[0])
        summary.add_scalar('Train Precision', train_metrics[1])
        summary.add_scalar('Train Recall', train_metrics[2])
        summary.add_scalar('Train F1-Score', train_metrics[3])
        test_metrics = tracker.get_test_metrics()
        test_loss = tracker.get_test_loss()
        summary.add_scalar('Test Loss', test_loss)
        summary.add_scalar('Test Accuracy', test_metrics[0])
        summary.add_scalar('Test Precision', test_metrics[1])
        summary.add_scalar('Test Recall', test_metrics[2])
        summary.add_scalar('Test F1-Score', test_metrics[3])
        cpu, gpu = tracker.get_performance_metrics()
        summary.add_scalar('CPU Utilization', cpu)
        summary.add_scalar('GPU Utilization', gpu)
        summary.add_scalar('Epoch Time', tracker.epoch_time)
        tracker.reset()
        summary.save_model(model)
        summary.adv_episode()
        summary.flush()
        print('Ep. {0}; Epoch Loss {1:.2f}; Train Acc. {2:.2f}; Val. Loss {3:.2f}; Val. Acc. {4:.2f}'
              .format(e, train_loss, train_metrics[0], test_loss,
                      test_metrics[0]))
    summary.close()
    tracker.get_plots(show=False)
import os
import sys

# Make the project root importable when run as a script.
root = os.path.abspath('..')
sys.path.append(root)

import pandas as pd
from src.utils import load_config
from sklearn.model_selection import train_test_split
import logging as log


def create_holdout_set(df, test_split_percent, cfg=None):
    """Split `df` and write `train.csv` / `hold_out.csv` to the project data dir.

    FIX: the function previously read the module-global `config` defined only
    inside the __main__ block; it now takes an explicit `cfg` parameter.
    The `cfg=None` default falls back to that global, keeping old callers
    working unchanged.
    """
    if cfg is None:
        cfg = config  # backward-compat: module global set in __main__
    data_dir = cfg['PATHS']['Project_path'] + 'data/'
    X_train, X_test = train_test_split(df,
                                       test_size=test_split_percent,
                                       random_state=2020)
    X_test.to_csv(os.path.join(data_dir, 'hold_out.csv'), index=False)
    X_train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)


if __name__ == '__main__':
    config = load_config('config.yaml')
    df = pd.read_csv(
        os.path.join(config['PATHS']['Project_path'] + 'data/',
                     'dataset_00_with_header.csv'))
    # NOTE(review): test_size=0.9 holds out 90% of the data — confirm intent.
    create_holdout_set(df, 0.9)
    # NOTE(review): log.info is invisible at the default WARNING level.
    log.info('Holdout set created !!!')
def main():
    """CLI driver for the fake-news classifier.

    Parses arguments (optionally overridden by a JSON config), validates the
    mode combination, then runs any of: init, train, test, demo, plot.
    """
    # Argparse custom actions
    class SetModes(argparse.Action):
        """Set the modes of operations."""
        def __call__(self, parser, args, values, option_string=None):
            # e.g. `-m train test` sets args.train = args.test = True
            for value in values:
                setattr(args, value, True)

    # yapf: disable
    parser = argparse.ArgumentParser(description='Fake News Classifier')
    # Initialization
    parser.add_argument('--init', action='store_true', default=False, help='perform initialization')
    # Modes
    parser.add_argument('-m', '--mode', action=SetModes, nargs='+', choices=['train', 'test', 'demo', 'plot'], help='specify the mode of operation: train, test, demo, plot')
    parser.add_argument('--train', action='store_true', default=False, help='train the model')
    parser.add_argument('--test', action='store_true', default=False, help='test the model (must either train or load a model)')
    parser.add_argument('--demo', action='store_true', default=False, help='demo the model on linewise samples from a file (must either train or load a model)')
    parser.add_argument('--plot', action='store_true', default=False, help='plot training data (must either train or have existing training data)')
    # Options
    parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)')
    parser.add_argument('-c', '--config', type=str, help='path to configuration json file (overrides args)')
    parser.add_argument('--data-loader', type=str, default='BatchLoader', help='data loader to use (default: "BatchLoader")')
    parser.add_argument('--dataset', type=str, default='FakeRealNews', help='dataset to use (default: "FakeRealNews")')
    parser.add_argument('-e', '--epochs', type=int, default=10, help='number of epochs to train (default: 10)')
    parser.add_argument('-f', '--file', type=str, help='specify a file for another argument')
    parser.add_argument('--lr', '--learning-rate', dest='learning_rate', type=float, default=1e-4, help='learning rate (default: 1e-4)')
    parser.add_argument('-l', '--load', type=int, metavar='EPOCH', help='load a model and its training data')
    parser.add_argument('--loss', type=str, default='BCEWithLogitsLoss', help='loss function (default: "BCEWithLogitsLoss")')
    parser.add_argument('--model', type=str, default='FakeNewsNet', help='model architecture to use (default: "FakeNewsNet")')
    parser.add_argument('-s', '--sample-size', type=int, metavar='N', help='limit sample size for training')
    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
    parser.add_argument('--save', action='store_true', default=True, help='save model checkpoints and training data (default: True)')
    parser.add_argument('--no-save', dest='save', action='store_false')
    args = parser.parse_args()
    # yapf: enable

    # Print help if no args
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit()

    # Configure logger
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    # Load configuration file if specified
    if args.config is not None:
        utils.load_config(args)

    # Exit if no mode is specified
    if not args.init and not args.train and not args.test and not args.demo and not args.plot:
        logging.error(
            'No mode specified. Please specify with: --mode {init,train,test,demo,plot}'
        )
        exit(1)
    # Exit on `--load` if run directory not found
    if (args.load is not None or
        (args.plot and not args.train)) and not os.path.isdir(utils.get_path(args)):
        logging.error(
            'Could not find directory for current configuration {}'.format(
                utils.get_path(args)))
        exit(1)
    # Exit on `test` or `demo` without `train` or `--load EPOCH`
    if (args.test or args.demo) and not (args.train or args.load is not None):
        logging.error(
            'Cannot run `test` or `demo` without a model. Try again with either `train` or `--load EPOCH`.'
        )
        exit(1)
    # Exit on `demo` without a string file
    if args.demo and not args.file:
        logging.error(
            'Cannot run `demo` without a file. Try again with `--file FILE`.')
        exit(1)

    # Setup run directory
    # NOTE(review): this guard saves config/log only when NO mode flag is set,
    # yet the no-mode case exits above — condition looks inverted; confirm.
    if args.save and not args.init and not (args.train or args.test or args.demo or args.plot):
        utils.save_config(args)
        path = utils.get_path(args) + '/output.log'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        logging.getLogger().addHandler(logging.FileHandler(path))

    # Set random seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Variable declarations
    training_data = None

    # Load GloVe vocabulary
    if args.init or args.train or args.test or args.demo:
        glove = torchtext.vocab.GloVe(name='6B', dim=50)

    # Perform initialization
    if args.init or args.train or args.test:
        # Determine which dataset to use
        dataset = utils.get_dataset(args)
        # Preload the dataset
        dataset.load()
        # Get preprocessed samples
        samples = preprocessing.get_samples(dataset, glove, args.init)
        random.shuffle(samples)

    # DataLoader setup for `train`, `test`
    if args.train or args.test:
        # Select data loader to use
        DataLoader = utils.get_data_loader(args)
        # Split samples 60/20/20
        split_ratio = [.6, .2, .2]
        trainset, validset, testset = list(
            DataLoader.splits(samples, split_ratio))
        if args.sample_size is not None:
            # limit samples used in training (validation scaled to match)
            trainset = trainset[:args.sample_size]
            validset = validset[:int(args.sample_size * split_ratio[1] /
                                     split_ratio[0])]
        # Get data loaders
        train_loader, valid_loader, test_loader = [
            DataLoader(split, batch_size=args.batch_size)
            for split in [trainset, validset, testset]
        ]

    # Load samples for demo
    if args.demo:
        if os.path.isfile(args.file):
            # Read samples from the input file (skip blank lines)
            with open(args.file, 'r') as f:
                samples = [line for line in f if line.strip()]
            # Dummy 0.5 labels: demo inputs are unlabeled
            data = pd.DataFrame({
                'text': samples,
                'label': [0.5] * len(samples)
            })
            # Preprocess samples
            preprocessing.clean(data)
            samples = preprocessing.encode(data, glove)
            samples = [(torch.tensor(text).long(), label)
                       for text, label in samples]
            # Select data loader to use
            DataLoader = utils.get_data_loader(args)
            # Get data loader
            data_loader = DataLoader(samples, batch_size=1, shuffle=False)
        else:
            logging.error('Could not find file for demo at {}'.format(
                args.file))
            exit(1)

    # Model setup for `train`, `test`, `demo`
    if args.train or args.test or args.demo:
        # Create the model
        model = utils.get_model(glove, args)
        # Load a model
        if args.load is not None:
            utils.load_model(args.load, model, args)

    # Run `train`
    if args.train:
        training_data = training.train(model, train_loader, valid_loader, args)

    # Run `test`
    if args.test:
        if args.train or args.load is not None:
            criterion = utils.get_criterion(args.loss)
            acc, loss = training.evaluate(model, test_loader, criterion)
            logging.info('Testing accuracy: {:.4%}, loss: {:.6f}'.format(
                acc, loss))
        else:
            logging.error('No model loaded for testing')
            exit(1)

    # Run `demo`
    if args.demo:
        if args.train or args.load is not None:
            model.eval()  # set model to evaluate mode
            logging.info('-- Results --')
            for i, (text, _) in enumerate(data_loader):
                preview = data['text'][i][:32] + '...'
                out = model(text).flatten()
                prob = torch.sigmoid(out)  # apply sigmoid to get probability
                pred = (prob > 0.5).long()  # predict `true` if greater than 0.5
                label = ['fake', 'true'][pred.item()]
                # ANSI colour: green for true, yellow for fake
                label = '{}{}{}'.format(
                    '\033[92m' if pred.item() else '\033[93m', label,
                    '\033[0m')
                confidence = (prob if pred.item() else 1 - prob).item()
                logging.info(
                    'Report {}: {} with {:.2%} confidence - "{}"'.format(
                        i, label, confidence, preview))
        else:
            logging.error('No model loaded for demo')
            exit(1)

    # Run `plot`
    if args.plot:
        if training_data is None:
            training_data = utils.load_training_data(args, allow_missing=False)
        # When plotting a loaded run, truncate curves at the loaded epoch.
        if args.load is not None and not args.train:
            for k, v in training_data.items():
                training_data[k] = v[:args.load + 1]
        logging.info('Plotting training data')
        training.plot(training_data)
def main(config: DictConfig, ckpt: dict, show_dir: Optional[str] = None):
    """Evaluate a checkpointed model on the test split described by `config`.

    Args:
        config: experiment configuration (SEED, DEVICE, TEST.*, loader settings).
        ckpt: checkpoint dict holding at least 'model_state_dict'.
        show_dir: optional directory for visual outputs produced by `test`.
    """
    # Reproducibility first, before any loader/model construction.
    if config.SEED is not None:
        make_deterministic(seed=config.SEED)

    # Restore the model from the checkpoint and build the evaluation loader.
    state = ckpt['model_state_dict']
    model = build_model(config, model_state_dict=state)
    test_loader = get_test_loader(config)

    # Run the evaluation pass.
    test(
        model=model,
        data_loader=test_loader,
        device=config.DEVICE,
        threshold_edge=config.TEST.THRESHOLD_EDGE,
        show_dir=show_dir,
    )


if __name__ == "__main__":
    args = parse_args()
    # Load config (with optional dot-list overrides) and the checkpoint file.
    config = load_config(args.config, update_dotlist=args.opts)
    ckpt: dict = torch.load(args.ckpt)
    main(config, ckpt, args.show_dir)
def main():
    """Fine-tune a BERT/GPT-2 toxicity classifier for the Jigsaw task.

    Reads hyper-parameters from ``--config_file``, fine-tunes the language
    model with apex mixed precision (O1) and gradient accumulation, then
    writes ``submission.csv`` (plus ``valid_submission.csv`` and a printed
    validation score when ``--valid`` is given) under
    ``../output/<config stem>/``.
    """
    parser = ArgumentParser()
    parser.add_argument('--config_file', type=str, required=True)
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()
    config_file = Path(args.config_file)
    config = load_config(config_file)

    # Fill in defaults so older config files keep working unchanged.
    config.setdefault('max_len', 220)
    config.setdefault('max_head_len', 128)
    config.setdefault('epochs', 2)
    config.setdefault('down_sample_frac', 0.5)
    config.setdefault('lr', 1.5e-5)
    config.setdefault('batch_size', 16)
    config.setdefault('accumulation_steps', 4)
    config.setdefault('lr_weight_decay_coef', 1.0)
    config.setdefault('warmup', 0.05)
    config.setdefault('old_data', False)
    config.setdefault('old_fine_tuned', False)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1234)
    assert 'lm_model_name' in config
    assert not (config.old_fine_tuned and config.old_data)
    assert config.max_len >= config.max_head_len
    # The negative-sample drop schedule below only defines two epochs' worth
    # of drop sets.
    assert config.epochs <= 2

    lm_model_name = config_file.stem
    if config.old_fine_tuned:
        PRETRAINED_PATH = Path(f'../output/{lm_model_name}_old_fine_tune/')
        assert PRETRAINED_PATH.exists()
    else:
        # BUG FIX: the parser defines no --lm_model option, so the original
        # `args.lm_model` raised AttributeError; the pretrained model name
        # comes from the config (its presence is asserted above).
        PRETRAINED_PATH = config.lm_model_name
    MODE = config.lm_model_name[:4]  # 'bert' selects BERT, anything else GPT-2
    LOWER_CASE = 'uncased' in config.lm_model_name
    LARGE_MODEL = 'large' in config.lm_model_name
    DEVICE = torch.device(config.device)
    if config.old_data:
        lm_model_name += '_old_fine_tune'
    if args.valid:
        # Fixed-size hold-out with a fixed shuffle seed so validation runs
        # are comparable across configs.
        valid_size = 200000
        shuffle_seed = 1029
        lm_model_name += '_valid'
    else:
        valid_size = 0
        shuffle_seed = config.seed
    OUT_DIR = Path(f'../output/{lm_model_name}/')
    TEST_SUBMISSION = OUT_DIR / 'submission.csv'
    VALID_SUBMISSION = OUT_DIR / 'valid_submission.csv'
    OUT_DIR.mkdir(exist_ok=True)
    warnings.filterwarnings('ignore')
    seed_torch(config.seed)

    # BUG FIX: the original tested the undefined `args.old` (AttributeError)
    # and later read TRAIN_DATA/TEST_DATA unconditionally, ignoring the paths
    # selected here. It also assigned TOXICITY_COLUMN et al. only in the
    # `else` branch, which made them function locals and raised
    # UnboundLocalError on the new-data path; both branches now bind fresh
    # lowercase locals. (The unused `sample_submission` binding was dropped.)
    if not config.old_data:
        train_data = TRAIN_DATA
        test_data = TEST_DATA
        train_size = 1804874 - valid_size
        toxicity_column = TOXICITY_COLUMN
        identity_columns = IDENTITY_COLUMNS
        aux_toxicity_columns = AUX_TOXICITY_COLUMNS
    else:
        train_data = TRAIN_OLD
        test_data = TEST_OLD
        train_size = 159571 - valid_size
        toxicity_column = OLD_TOXICITY_COLUMN
        identity_columns = OLD_IDENTITY_COLUMNS
        aux_toxicity_columns = OLD_AUX_TOXICITY_COLUMNS

    if MODE == 'bert':
        from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam
        lm_tokenizer = BertTokenizer.from_pretrained(config.lm_model_name,
                                                     cache_dir=None,
                                                     do_lower_case=LOWER_CASE)
        model = BertForSequenceClassification.from_pretrained(
            PRETRAINED_PATH,
            cache_dir=None,
            num_labels=1 + len(aux_toxicity_columns))
        optimizer_class = BertAdam
    else:
        from pytorch_pretrained_bert import GPT2Tokenizer, OpenAIAdam
        lm_tokenizer = GPT2Tokenizer.from_pretrained(config.lm_model_name,
                                                     cache_dir=None)
        model = GPT2ClassificationHeadModel.from_pretrained(
            PRETRAINED_PATH,
            clf_dropout=config.get('dropout_rate', 0.1),
            n_class=1 + len(aux_toxicity_columns))
        optimizer_class = OpenAIAdam
        # Per-layer LR decay is only implemented for the BERT branch.
        assert config.lr_weight_decay_coef == 1.0

    with timer('preprocess'):
        tokenizer = MyTokenizer(lm_tokenizer, config.max_len,
                                config.max_head_len, MODE)
        # BUG FIX: read the dataset selected above, not TRAIN_DATA/TEST_DATA.
        df_train = pd.read_csv(train_data).sample(
            frac=1, random_state=shuffle_seed).reset_index(drop=True)
        df_train['comment_text'] = df_train['comment_text'].astype(str)
        df_train = df_train.fillna(0)
        X_train = tokenizer.tokenize(
            df_train['comment_text'].fillna('DUMMY_VALUE'),
            num_threads=16,
            chunksize=5000)
        df_test = pd.read_csv(test_data)
        df_test['comment_text'] = df_test['comment_text'].astype(str)
        df_test = df_test.fillna(0)
        X_test = tokenizer.tokenize(
            df_test['comment_text'].fillna('DUMMY_VALUE'),
            num_threads=16,
            chunksize=5000)
        df_train.drop(['comment_text'], axis=1, inplace=True)
        df_test.drop(['comment_text'], axis=1, inplace=True)

        # The tail of the shuffled train set becomes the validation split.
        X_valid = X_train[train_size:]
        X_train = X_train[:train_size]
        y_identity_train = df_train[identity_columns].values
        y_annotator_counts_train = df_train['toxicity_annotator_count'].values
        weights = training_weights(df_train, toxicity_column, identity_columns)
        # Target layout per row: [toxicity, sample_weight, aux targets...].
        y_train = np.hstack(
            (df_train[toxicity_column].values.reshape(-1, 1),
             weights.reshape(-1, 1),
             df_train[aux_toxicity_columns].values))
        y_valid = y_train[train_size:]
        y_train = y_train[:train_size]
        y_identity_valid = y_identity_train[train_size:]
        y_identity_train = y_identity_train[:train_size]
        y_annotator_counts_valid = y_annotator_counts_train[train_size:]
        y_annotator_counts_train = y_annotator_counts_train[:train_size]
        loss_weight = 1.0 / weights.mean() if not config.old_data else None

        # Plan which fully-negative samples (zero toxicity, zero identity)
        # are dropped each epoch: the first `frac` of them in epoch 0, the
        # last `frac` in epoch 1, so every sample is seen at least once.
        frac = config.down_sample_frac
        target_negative = (y_train > 0.0).sum(axis=1) == 1
        identity_negative = (y_identity_train > 0.0).sum(axis=1) == 0
        negative_mask = identity_negative & target_negative
        negative_indices = np.arange(len(y_train))[negative_mask]
        drop_indices_0 = set(
            negative_indices[:int(len(negative_indices) * frac)])
        drop_indices_1 = set(
            negative_indices[int(len(negative_indices) * (1 - frac)):])
        drop_indices_list = [drop_indices_0, drop_indices_1]
        len_train = len(y_train) - len(drop_indices_0)

    with timer('train'):
        model.zero_grad()
        model = model.to(DEVICE)
        num_layers = 24 if LARGE_MODEL else 12
        optimizer_grouped_parameters = get_optimizer_params(
            model, config.lr, config.lr_weight_decay_coef, num_layers)
        num_train_optimization_steps = int(config.epochs * len_train /
                                           config.batch_size /
                                           config.accumulation_steps)
        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    lr=config.lr,
                                    warmup=config.warmup,
                                    t_total=num_train_optimization_steps)
        # Mixed precision (O1) for memory and speed.
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level='O1', verbosity=0)
        model = model.train()
        batch_count = len_train // config.batch_size
        loss_fn = CustomLoss(loss_weight)
        for epoch, drop_indices in zip(range(config.epochs),
                                       drop_indices_list):
            sample_indices = np.array(
                [i for i in range(len(y_train)) if i not in drop_indices])
            X_sampled_train = [X_train[i] for i in sample_indices]
            y_sampled_train = y_train[sample_indices]
            y_sampled_identity_train = y_identity_train[sample_indices]
            y_sampled_annotator_counts_train = y_annotator_counts_train[
                sample_indices]
            train_dataset = TextDataset(X_sampled_train, y_sampled_train,
                                        y_sampled_identity_train,
                                        y_sampled_annotator_counts_train)
            train_loader = LengthBucketingDataLoader(
                train_dataset,
                shuffle=True,
                drop_last=True,
                batch_size=config.batch_size)
            tk0 = tqdm(enumerate(train_loader), total=batch_count)
            optimizer.zero_grad()
            for i, (x_batch, _, a_batch, y_batch, y_identity_batch) in tk0:
                y_pred = model(x_batch.to(DEVICE),
                               attention_mask=(x_batch > 0).to(DEVICE),
                               labels=None)
                loss = loss_fn(y_pred, y_batch.to(DEVICE))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                # Optimizer steps only every `accumulation_steps` batches.
                if (i + 1) % config.accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
        model.save_pretrained(OUT_DIR)

    with timer('evaluate'):
        if args.valid:
            valid_dataset = TextDataset(X_valid, y_valid, y_identity_valid,
                                        y_annotator_counts_valid)
            valid_preds = predict(model, valid_dataset, device=DEVICE)
            # .copy() so the column assignment below does not write into a
            # view of df_train (SettingWithCopyWarning in the original).
            df_valid = df_train.tail(valid_size).copy()
            df_valid['model1'] = valid_preds
            evaluator = JigsawEvaluator(df_valid[toxicity_column].values,
                                        df_valid[identity_columns].values)
            final_score, _ = evaluator.get_final_metric(
                df_valid['model1'].values)
            valid_prediction = predict(model, TextDataset(X_valid),
                                       device=DEVICE)
            valid_submission = pd.DataFrame({
                'id': df_valid['id'],
                'prediction': valid_prediction
            })
            valid_submission.to_csv(VALID_SUBMISSION, index=False)
            print(f'validation score: {final_score:.5f}')
        test_prediction = predict(model, TextDataset(X_test), device=DEVICE)
        submission = pd.DataFrame({
            'id': df_test['id'],
            'prediction': test_prediction
        })
        submission.to_csv(TEST_SUBMISSION, index=False)
import src.utils as utils

# Notebook-style cell script (Jupyter/VS Code "#%%" cells): IPython magics
# enable autoreload of edited modules and inline matplotlib figures.
get_ipython().run_line_magic("autoreload", "2")
get_ipython().run_line_magic("matplotlib", "inline")
# Echo the working directory (interactive sanity check only; value unused).
os.getcwd()

#%% [markdown]
# ### Choose experiment, print out configurations
#%%
# Locate one experiment run and load its saved config and results dataframe.
base_path = "./maggot_models/models/runs/"
experiment = "fit_dcsbm"
run = 2
config = utils.load_config(base_path, experiment, run)
dcsbm_df = utils.load_pickle(base_path, experiment, run, "dcsbm_out_df")
# dcsbm_df = dcsbm_df.apply(pd.to_numeric)
# Preview the loaded results (displayed by the notebook cell).
dcsbm_df.head()

#%%
def get_best(df):
    """For each value of `param_n_blocks`, find the row with the lowest MSE.

    NOTE(review): the visible body ends after computing `ind`; the append to
    `best_rows` and the return are not in this view — confirm the function is
    complete in the original notebook.
    """
    # out_df = df[(df["param_n_components"] == 1) & (df["param_regularizer"] == 0)]
    # kept_params = ["param_n_blocks"]
    # Unique sweep values of the block-count hyperparameter.
    param_range = np.unique(df["param_n_blocks"].values)
    best_rows = []
    for p in param_range:
        # Restrict to rows for this parameter value, then take the MSE argmin.
        temp_df = df[df["param_n_blocks"] == p]
        ind = temp_df["mse"].idxmin()
def gender_estimation(weights=None):
    """Fine-tune a ResNet-18 to classify gender (2 classes) on the IMDB face set.

    Trains for 20 epochs with a 90/10 random train/val split, printing
    per-epoch loss/accuracy/confusion counts, then saves the final weights.

    :param weights: path to a pretrained state dict — currently unused
        (only referenced inside the commented-out SimCLR block below).
    """
    load_config()
    device = torch.device('cuda')
    # Dataset location and log/output directory come from the environment.
    imdb_root = os.environ['IMDB_ROOT']
    df = unpickle_imdb(f"{imdb_root}/imdb.pickle")
    savedir = f"{os.environ['LOG_DIR']}"
    # 64x64 center-cropped, flipped, ImageNet-normalized training transform.
    trans = transforms.Compose([
        # transforms.Resize(72),
        #transforms.RandomCrop(64),
        transforms.Resize(64),
        transforms.CenterCrop(64),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    ds = ImdbDataset(root=imdb_root, df=df, transform=trans)
    print(f"Loaded ds with {len(ds)} items.")
    # 90% train / 10% validation random split.
    tr, val = random_split(ds, [len(ds) - len(ds) // 10, len(ds) // 10])
    loss_fn = CrossEntropyLoss()
    #model = resnet50(pretrained=True)
    # ImageNet-pretrained backbone with a fresh 2-way classification head.
    model = resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 2)
    """
    from src.simclr import ResNetSimCLR
    model = ResNetSimCLR('resnet50', 64)
    #if weights:
    #    model.load_state_dict(torch.load(weights))
    model.projector = nn.Sequential(
        nn.Linear(model.n_features, model.n_features, bias=False),
        nn.ReLU(),
        nn.Linear(model.n_features, 2, bias=False)
    )
    for param in model.encoder.parameters():
        param.requires_grad = False
    """
    model.to(device)
    optim = Adam(model.parameters(), lr=3e-4, weight_decay=1e-6)
    tr_dl = DataLoader(tr, batch_size=16, shuffle=True, num_workers=8,
                       pin_memory=True)
    val_dl = DataLoader(val, batch_size=16, shuffle=False, num_workers=4,
                        pin_memory=True)
    # Per-epoch batch statistics, keyed by epoch index.
    tr_log, val_log = {}, {}

    def untrans_display(im):
        # Debug helper (not called in this function): undo the ImageNet
        # normalization in-place and show the image via PIL.
        std, mean = torch.as_tensor(IMAGENET_STD), torch.as_tensor(
            IMAGENET_MEAN)
        if mean.ndim == 1:
            mean = mean[:, None, None]
        if std.ndim == 1:
            std = std[:, None, None]
        im.mul_(std).add_(mean)
        trans = transforms.ToPILImage()
        im = trans(im)
        im.show()

    def log_epoch(preds, labels, loss, log):
        # Record one batch's loss and confusion-matrix counts under the
        # current epoch. NOTE: `epoch` is read from the enclosing scope.
        _, pred_class = torch.max(preds.data, 1)
        log[epoch].append({
            'loss': loss.item(),
            'count': labels.size(0),
            'correct': (pred_class == labels).sum().item(),
            'tp': ((pred_class == 1) & (labels == 1)).sum().item(),
            'tn': ((pred_class == 0) & (labels == 0)).sum().item(),
            'fp': ((pred_class == 1) & (labels == 0)).sum().item(),
            'fn': ((pred_class == 0) & (labels == 1)).sum().item(),
            'cnt_p': (labels == 1).sum().item(),
            'cnt_n': (labels == 0).sum().item(),
        })

    def print_log_epoch(_e, log, pretext=''):
        # Summarize epoch `_e` from `log`: loss totals, accuracy, and
        # aggregated confusion counts.
        epoch_loss = [x['loss'] for x in log[_e]]
        sum_loss = sum(epoch_loss)
        cnt_loss = len(epoch_loss)
        avg_loss = sum_loss / cnt_loss
        print(
            f"{pretext}Epoch {_e}: Total Loss={sum_loss}\tAvg Loss={avg_loss}\tNum Batches={cnt_loss}"
        )
        e_cnt = [x['count'] for x in log[_e]]
        e_correct = [x['correct'] for x in log[_e]]
        e_acc = sum(e_correct) / sum(e_cnt)
        print(
            f"{pretext}Epoch {_e}: Total Cnt={sum(e_cnt)}\tTotal Cor={sum(e_correct)}\tAcc={e_acc}"
        )
        tp_cnt = sum([x['tp'] for x in log[_e]])
        tn_cnt = sum([x['tn'] for x in log[_e]])
        fp_cnt = sum([x['fp'] for x in log[_e]])
        fn_cnt = sum([x['fn'] for x in log[_e]])
        p_cnt = sum([x['cnt_p'] for x in log[_e]])
        n_cnt = sum([x['cnt_n'] for x in log[_e]])
        print(
            f"{pretext}Epoch {_e}: TP={tp_cnt}\tTN={tn_cnt}\tFP={fp_cnt}\tFN={fn_cnt}\tP cnt={p_cnt}\tN cnt={n_cnt}"
        )

    def _epoch(train):
        # One full pass over the train or validation loader; backprops and
        # steps the optimizer only when `train` is True.
        if train:
            dl = tr_dl
            model.train()
        else:
            dl = val_dl
            model.eval()
        for ix, (img, label) in enumerate(dl):
            img = img.to(device=device)
            labels = label.to(device=device, dtype=torch.int64)
            preds = model(img)
            # _, preds = model(img)
            loss = loss_fn(preds, labels)
            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()
                log_epoch(preds, labels, loss, tr_log)
            else:
                log_epoch(preds, labels, loss, val_log)

    def _save_weights(prefix='', suffix=''):
        # Save the model state dict under a timestamped filename in savedir.
        time = datetime.datetime.now()
        s = f"{prefix}_{time.month}_{time.day}_{time.hour}_{time.minute}_{time.second}_{suffix}.pth"
        fname = f"{savedir}/{s}"
        print(f"Saving to: {fname}")
        torch.save(model.state_dict(), fname)

    # Train for 20 epochs, validating and printing stats after each one.
    epoch = 0
    for i in range(20):
        tr_log[epoch] = []
        _epoch(True)
        print_log_epoch(epoch, tr_log)
        val_log[epoch] = []
        _epoch(False)
        print_log_epoch(epoch, val_log, pretext='VAL::')
        epoch += 1
    # NOTE(review): original formatting was lost; this save is placed after
    # the loop (single final save) — confirm it was not per-epoch.
    _save_weights(prefix=f'long')
import unittest import os from src.utils import load_config load_config() class IMDBTestCase(unittest.TestCase): def setUp(self) -> None: self.imdb_root = os.environ['IMDB_ROOT'] def test_build_imdb(self): from src.datasets import build_imdb df = build_imdb(f"{self.imdb_root}/imdb.mat", n=None, save=f"{self.imdb_root}/imdb.pickle") print(df) print(df[:5]) def test_load_pickle(self): from src.datasets import unpickle_imdb df = unpickle_imdb(f"{self.imdb_root}/imdb.pickle") print(df) def test_imdb_dataset(self): from src.datasets import ImdbDataset from torchvision.transforms import ToTensor from src.datasets import unpickle_imdb df = unpickle_imdb(f"{self.imdb_root}/imdb.pickle") ds = ImdbDataset(root=self.imdb_root, df=df, transform=ToTensor())