Example #1
0
 def test_dataroot(self):
     """Verify that the DATAROOT environment variable names a real directory."""
     from src.utils import load_config

     load_config()
     missing = ''
     # Sanity check: the empty-string fallback can never be a directory.
     self.assertFalse(os.path.isdir(missing))
     path = os.environ.get('DATAROOT', missing)
     self.assertTrue(os.path.isdir(path),
                     f"Environment variable $DATAROOT {path} is not set.")
Example #2
0
def main():
    """Build the model from config and dispatch on the requested run mode."""
    config = utils.load_config(args.config)
    config["weighted"] = "weighted" in config

    # Mirror the relevant CLI arguments into the config dict.
    for key in ("mode", "fold", "apex", "output", "snapshot"):
        config[key] = getattr(args, key)

    utils.set_seed(SEED)
    device = torch.device(DEVICE)

    log(f"Fold {args.fold}")

    model = factory.get_model(config).to(device)

    log(f"Model type: {model.__class__.__name__}")
    mode = config["mode"]
    if mode == 'train':
        # Training runs a validation pass afterwards.
        train(config, model)
        valid(config, model)
    elif mode == 'valid':
        valid(config, model)
    elif mode == 'test':
        valid(config, model, all_exam=True)
def main():
    """Run batch inference with a saved model and optionally print metrics.

    Loads the pickled inference data, computes argmax class predictions, and,
    when a label file is configured, prints a classification report and the
    Cohen kappa score.
    """
    cfg = load_config()

    print('Loading Model & Data.')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = torch.load(cfg['INFERENCE_MODEL']).to(device)
    model.eval()
    data = tensor(load_pickle(cfg['INFERENCE_DATA'])).float().to(device)
    label = None
    # BUG FIX: label loading was previously gated on the length of the
    # INFERENCE_DATA path (always non-empty once data loaded) instead of the
    # label path itself.
    if len(cfg['INFERENCE_LABEL']) > 0:
        label = load_pickle(cfg['INFERENCE_LABEL'])

    print('Starting Inference.')
    with torch.no_grad():
        y_hat = model(data)
        y_hat = torch.argmax(y_hat, dim=1).tolist()

    print(y_hat)

    if label is not None:
        print('Generating Report')
        print(
            classification_report(label,
                                  y_hat,
                                  target_names=['SR', 'SB', 'AFIB', 'ST']))
        print('Kappa Score: {:.2f}'.format(cohen_kappa_score(label, y_hat)))
Example #4
0
 def __init__(self, url, config):
     """Initialise scraper state and load its scraping configuration."""
     super(BasicWebScraper, self).__init__()
     self.source_url = url
     self.config = load_config(config)
     # Parser state: whether we are currently inside the (sub-)container
     # element, plus tag-depth counters used to detect when we leave it.
     self.inside_container = False
     self.inside_sub_container = False
     self.cont_tag_counter = 0
     self.sub_cont_tag_counter = 0
Example #5
0
def main():
    """Train or test the two-channel UNet, as selected by --action."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--action",
                        type=str,
                        default='train',
                        help="train or test")
    args = parser.parse_args()

    config = load_config()

    # TensorBoard setup.
    # NOTE(review): time_now is computed but unused here; a sibling script
    # joins it into log_dir — confirm whether that was intended here too.
    time_now = datetime.now().isoformat()

    if not os.path.exists(config.RUN_PATH):
        os.mkdir(config.RUN_PATH)
    writer = SummaryWriter(log_dir=config.RUN_PATH)

    # Seed all RNGs for reproducibility.
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    # GPU initialisation.
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    net = UNet(2).to(config.DEVICE)
    # BUG FIX: removed a leftover debug print that instantiated an unused
    # torchvision ResNet-18 just to inspect one of its layers.

    optimizer = optim.Adam(net.parameters(), betas=(0.5, 0.999), lr=config.LR)
    loss = nn.L1Loss()

    # Load the dataset for the requested action.
    if args.action == 'train':

        train_dataset = LABDataset(config, config.TRAIN_PATH)
        len_train = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
        iter_per_epoch = len(train_loader)
        train_(config, train_loader, net, optimizer, loss, len_train,
               iter_per_epoch, writer)

    if args.action == "test":

        test_dataset = LABDataset(config, config.TEST_PATH)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=1,
                                                  shuffle=False)
        test(config, test_loader, net, loss)
def main():
    """Train the GRU tweet classifier end to end.

    Loads config and preprocessed tweet data, builds the model, optimizer and
    loss, then runs the epoch loop, logging mean loss and accuracy to the
    summary agent and saving a model snapshot after every epoch.
    """
    print('Loading config')
    cfg = load_config()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Utilizing device {}'.format(device))

    print('Initializing Dataloader & Dataset.')
    training_data = TweetDataset(cfg['PROCESSED_DATA_DIR'])
    # Variable-length tweets are padded/packed by the custom collate function.
    data_loader = DataLoader(dataset=training_data,
                             batch_size=cfg['BATCH_SIZE'],
                             shuffle=True,
                             num_workers=4,
                             collate_fn=collate_var_sequences,
                             pin_memory=True)

    print('Initializing Model & Components.')
    model = GRUPredictor(num_classes=training_data.num_classes,
                         hidden_size=cfg['HIDDEN_SIZE'],
                         batch_size=cfg['BATCH_SIZE'],
                         num_layers=cfg['NUM_LAYERS'],
                         drop_out=cfg['DROPOUT']).to(device)
    optimizer = Adam(model.parameters(), lr=cfg['LR'])
    loss = CrossEntropyLoss()

    print('Initializing Helper Agents.')
    # tracker accumulates per-batch loss/accuracy; summary persists scalars
    # and model checkpoints.
    tracker = TrackingAgent(cfg['BATCH_SIZE'], len(training_data))
    summary = SummaryAgent(cfg['SUMMARY_PATH'], model.name, cfg)

    print('Start training with {} epochs'.format(cfg['EPOCHS']))
    for e in range(1, cfg['EPOCHS'] + 1):
        for i_batch, sample_batched in enumerate(tqdm(data_loader, leave=False)):
            x_sequence = sample_batched['tweet'].to(device)
            y = sample_batched['label'].to(device)

            # Standard forward / backward / update step.
            y_hat = model(x_sequence)
            batch_loss = loss(y_hat, y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            tracker.add_loss(batch_loss)
            tracker.add_correct_class(y_hat, y)

        # Epoch-level metrics, then reset the tracker for the next epoch.
        mean_loss = tracker.get_loss()
        accuracy = tracker.get_accuracy()
        tracker.reset()
        summary.add_scalar('Loss', mean_loss)
        summary.add_scalar('Accuracy', accuracy)
        summary.save_model(model)
        summary.adv_episode()
        summary.flush()

        print('Ep. {0}; Mean Epoch Loss {1:.2f}; Train Acc. {2:.2f}'.format(e, mean_loss, accuracy))

    summary.close()
Example #7
0
    def run_web_scraping_and_save_data(cls):
        """Run the web scraping workflow, collect the product information for
        each main and sub-category, and save the whole list into a CSV file.

        Improvement ideas: save the currency and price separately to get
        integer price fields; many other improvements could be made.
        """
        product_frames = []
        if load_config(MAIN_URL_COLLECTION_NEEDED):
            url = load_config(MAIN_SOURCE_URL)
            main_url_collector = URLCollector(url, MAIN_URLS_TO_SCRAP_CONFIG)
            main_url_list = main_url_collector.get_urls_to_scrap()
            print(main_url_list)

            if load_config(PRODUCT_SUB_URL_COLL_NEEDED):
                for main_url in main_url_list:
                    url = load_config(SUB_SOURCE_URL_BEGINNING) + main_url
                    sub_url_collector = URLCollector(
                        url, PRODUCT_SUB_URLS_TO_SCRAP_CONFIG)
                    sub_url_list = sub_url_collector.get_urls_to_scrap()
                    print(sub_url_list)

                    for sub_url in sub_url_list:
                        url = load_config(SUB_SOURCE_URL_BEGINNING) + sub_url
                        product_scrapper = ProductScraper(
                            url, PRODUCT_SCRAPING_CONFIG)
                        product_data = product_scrapper.get_product_scraping_data(
                        )
                        cur_product_df = pd.DataFrame(product_data)
                        # Category names are the path segment after the
                        # leading "<id>-" prefix, e.g. "12-books" -> "books".
                        main_seg = main_url.split('/')[2]
                        cur_product_df.insert(
                            0, MAIN_CATEG_COL,
                            main_seg[main_seg.find('-') + 1:])
                        sub_seg = sub_url.split('/')[3]
                        cur_product_df.insert(
                            1, SUB_CATEG_COL,
                            sub_seg[sub_seg.find('-') + 1:])
                        product_frames.append(cur_product_df)
        # BUG FIX: DataFrame.append was deprecated and removed in pandas 2.x;
        # collect per-product frames and concatenate once instead (also O(n)
        # rather than quadratic re-copying).
        product_df = pd.concat(product_frames) if product_frames else pd.DataFrame()
        print(len(product_df))
        print(product_df.head())
        product_df.to_csv(RESULT_FILE)
        return product_df
Example #8
0
    def __init__(self, *args, **kwargs):
        """Build the root window, mount the frame container and show the start page."""
        tk.Tk.__init__(self, *args, **kwargs)
        self.title('Minter')
        self.geometry('300x300')

        root_frame = tk.Frame(self)
        root_frame.pack(side='top', fill='both', expand=True)
        root_frame.grid_columnconfigure(0, weight=1)

        # Application-wide state shared between wizard steps.
        self.config = load_config()
        self.context = {}
        self.steps = {}
        self._show_frame(root_frame, StartPage)
 def test_gender_analyze(self):
     """Smoke-test gender_analyze on a 250-row slice of the IMDB dataset.

     Requires $IMDB_ROOT to point at the unpacked IMDB data; dumps the
     resulting analysis log to tmp/gender_analyze_log.p for inspection.
     """
     import os
     import pickle
     from src.gender_estimation import gender_analyze
     from src.datasets import ImdbDataset, unpickle_imdb
     from src.utils import load_config
     from src.convnets.utils import IMAGENET_MEAN, IMAGENET_STD
     from torchvision import transforms
     load_config()
     # ImageNet-style preprocessing at 64x64 resolution.
     trans = transforms.Compose([
         transforms.Resize(64),
         transforms.CenterCrop(64),
         transforms.ToTensor(),
         transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
     ])
     imdb_root = os.environ['IMDB_ROOT']
     df = unpickle_imdb(f"{imdb_root}/imdb.pickle")
     # Only the first 250 rows to keep the test fast.
     ds = ImdbDataset(root=imdb_root,
                      df=df[:250],
                      transform=trans,
                      include_path=True)
     log = gender_analyze(self.weights, ds)
     # NOTE(review): assumes a tmp/ directory exists in the CWD — confirm.
     pickle.dump(log, open('tmp/gender_analyze_log.p', 'wb'))
Example #10
0
def main(mode):
    """Train (mode == 1) or test (mode == 2) a single-channel UNet.

    Args:
        mode: 1 runs training, 2 runs testing; any other value is a no-op.
    """

    config = load_config()

    # TensorBoard writer.
    # NOTE(review): time_now is computed but not used for the log dir here —
    # a sibling script joins it into log_dir; confirm intent.
    time_now = datetime.now().isoformat()

    if not os.path.exists(config.RUN_PATH):
        os.mkdir(config.RUN_PATH)
    writer = SummaryWriter(log_dir=config.RUN_PATH)

    # Seed all RNGs for reproducibility.
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    net = UNet(1).to(config.DEVICE)

    optimizer = optim.Adam(net.parameters(), betas=(0.5, 0.999), lr=config.LR)
    #criterion = nn.CrossEntropyLoss()  # define the loss function
    criterion = nn.BCELoss()
    # Load the dataset for the requested mode.
    if mode == 1:

        train_dataset = MyDataset(config, config.TRAIN_PATH)
        len_train = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
        iter_per_epoch = len(train_loader)
        train_(config, train_loader, net, optimizer, criterion, len_train,
               iter_per_epoch, writer)

    if mode == 2:

        test_dataset = MyDataset(config, config.TEST_PATH)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=1,
                                                  shuffle=False)
        test(config, test_loader, net, criterion)
Example #11
0
def step_impl(context):
    """Assert that login succeeded and a session was persisted to the config."""
    context.execute_steps(u'''
            Then the output should contain:
                """
                Successfully logged in!
                """
        ''')
    # A valid session carries the configured email and a non-null session id.
    expected_session = has_entries({
        'email': CONFIG['DEFORM']['EMAIL'],
        'session_id': is_not(None)
    })
    assert_that(load_config(), has_entry('session', expected_session))
Example #12
0
def main():
    """Parse CLI args, assemble the Lightning modules and start training."""
    cli_args = parse_args()

    # Configuration, with dot-list overrides taken from the command line.
    config: DictConfig = load_config(cli_args.config, update_dotlist=cli_args.opts)
    print_config(config)

    # Model and data module.
    model = DepthPLModel(config)
    data_module = Nyu2DataModule(config)

    # Train.
    trainer = get_trainer(cli_args, config)
    trainer.fit(model, data_module)
Example #13
0
def main(argv):
    """Create a timestamped experiment directory, snapshot the config, train."""
    config = load_config(FLAGS.config_path)

    # Experiment directory named "<experiment_name>_<timestamp>".
    stamp = datetime.now().strftime("_%Y%m%d-%H%M%S")
    experiment_dir = os.path.join(EXPERIMENT_PATH,
                                  FLAGS.experiment_name + stamp)
    os.makedirs(experiment_dir)

    # Keep a copy of the exact config used for this run next to its outputs.
    shutil.copy2(FLAGS.config_path,
                 os.path.join(experiment_dir, EXPERIMENT_CONFIG_NAME))

    # Initialise the trainer and run training.
    trainer = Trainer(config, experiment_dir)
    trainer.train()
def main():
    """Interactive LSTM tweet generator: extend a user prompt char by char."""
    cfg = load_config()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Utilizing device {}'.format(device))

    # Character <-> integer lookup tables produced by preprocessing.
    map_char_to_int = load_pickle(cfg['PROCESSED_DATA_DIR'] +
                                  '/map_char_to_int.pkl')
    map_int_to_char = load_pickle(cfg['PROCESSED_DATA_DIR'] +
                                  '/map_int_to_char.pkl')
    num_chars = len(map_int_to_char)
    # One-hot rows: ohe_mapping[i] is the one-hot vector for character i.
    ohe_mapping = torch.eye(num_chars).to(device)

    # BUG FIX: removed a duplicated device-selection block that was pasted
    # twice and printed the device a second time.

    print('Loading Model')
    model = LSTMPredictor(num_chars, cfg['HIDDEN_SIZE'],
                          cfg['BATCH_SIZE']).to(device)
    model.load_state_dict(torch.load(cfg['MODEL_PATH']))
    model.eval()

    while True:
        input_str = input('Tweet: ')
        if input_str == "exit":
            break

        input_str = input_str.lower()
        while True:
            input_enc = [map_char_to_int[char] for char in list(input_str)]
            input_ohe = ohe_mapping[input_enc].unsqueeze(dim=0)
            with torch.no_grad():
                output = model(input_ohe)
                output_prob = torch.softmax(output,
                                            dim=1).squeeze().cpu().numpy()

            # Sample the next character from the predicted distribution.
            result = np.random.choice(num_chars, p=output_prob)
            input_str += map_int_to_char[result]

            # Stop at 140 chars, or once past 100 chars at a sentence end.
            if len(input_str) >= 140 or (len(input_str) >= 100
                                         and input_str[-1] == "."):
                print(input_str)
                break
def main():
    """Preprocess raw tweets into integer-encoded training data on disk."""
    with Timer('Loading config'):
        cfg = load_config()

    with Timer('Loading tweets'):
        tweets = load_raw_data(cfg['RAW_DATA_PATH'])

    with Timer('Cleaning sentences'):
        tweet_text = cleanse_sentences(list(tweets['text']))

    with Timer('Mapping characters to integers'):
        tweet_enc, map_char_to_int, map_int_to_char = map_tweets_to_int(tweet_text)

    with Timer('Producing dataset'):
        tweet_train, tweet_label = produce_dataset(tweet_enc)

    with Timer('Save dataset and mapping tables'):
        # Everything lands in the processed-data directory as pickles.
        out_dir = cfg['PROCESSED_DATA_DIR']
        save_pickle(tweet_train, out_dir + '/train.pkl')
        save_pickle(tweet_label, out_dir + '/label.pkl')
        save_pickle(map_char_to_int, out_dir + '/map_char_to_int.pkl')
        save_pickle(map_int_to_char, out_dir + '/map_int_to_char.pkl')
Example #16
0
def main():
    """
    Load raw ECG data from disc and transform it to cleansed training data.
    """
    cfg = load_config()

    with Timer('Getting label list'):
        labels, file_list = get_labels(cfg['RAW_DATA_PATH'] +
                                       '/Diagnostics.xlsx')

    with Timer('Loading & Downsampling files'):
        ecg_data = get_ecg_data(cfg['RAW_DATA_PATH'] + '/ECGDataDenoised',
                                file_list, cfg['DOWNSAMPLE_THRESHOLD'],
                                cfg['DATA_SLICE'], cfg['NUM_WORKERS'])

    with Timer('Imputing missing values'):
        ecg_data = impute_nans(ecg_data)

    with Timer('Splitting into Train & Test Set'):
        # Stratified 80/20 split with a fixed seed for reproducibility.
        x_train, x_test, y_train, y_test = train_test_split(ecg_data,
                                                            labels,
                                                            test_size=0.2,
                                                            shuffle=True,
                                                            stratify=labels,
                                                            random_state=42)

        print('Final Training set has {} samples'.format(len(x_train)))
        print('Final Test set has {} samples'.format(len(x_test)))
        print('Distribution of labels in Training: {}'.format(
            Counter(y_train)))
        print('Distribution of labels in Testing: {}'.format(Counter(y_test)))

    with Timer('Normalizing data'):
        # presumably normalizes test data with training-set statistics —
        # confirm in normalize_data.
        x_train, x_test = normalize_data(x_train, x_test)

    with Timer('Saving generated arrays'):
        save_pickle(x_train, cfg['PROCESSED_DATA_DIR'] + '/train_data.pkl')
        save_pickle(y_train, cfg['PROCESSED_DATA_DIR'] + '/train_label.pkl')
        save_pickle(x_test, cfg['PROCESSED_DATA_DIR'] + '/test_data.pkl')
        save_pickle(y_test, cfg['PROCESSED_DATA_DIR'] + '/test_label.pkl')
def main():
    """Train the LSTM-F toxicity model with K-fold CV and write a submission.

    With --valid, the last 200k shuffled training rows are held out as a
    pseudo test set; otherwise the real test file is used. Each fold trains
    LstmGruNet on GloVe + fastText embeddings, accumulates test predictions,
    and rewrites the running-average submission after every fold.
    """
    parser = ArgumentParser()
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    # Config defaults; any value already present in the JSON wins.
    config = load_config('./config/lstm_f.json')
    config.setdefault('max_len', 220)
    config.setdefault('max_features', 100000)
    config.setdefault('batch_size', 512)
    config.setdefault('train_epochs', 10)
    config.setdefault('tolerance', 10)
    config.setdefault('num_folds', 5)
    config.setdefault('lr', 1e-3)
    config.setdefault('loss_alpha', 0.1)
    config.setdefault('loss_beta', 1.0)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1029)

    device = torch.device(config.device)

    OUT_DIR = Path(f'../output/lstm_f/')
    submission_file_name = 'valid_submission.csv' if args.valid else 'submission.csv'
    SUBMISSION_PATH = OUT_DIR / submission_file_name
    OUT_DIR.mkdir(exist_ok=True)

    warnings.filterwarnings('ignore')
    seed_torch(config.seed)

    with timer('preprocess'):
        # BUG FIX: the DataFrame was previously named `train`, shadowing the
        # train() function called in the training loop below (which would
        # raise TypeError). Renamed to train_df.
        train_df = pd.read_csv(TRAIN_DATA)
        if args.valid:
            # Shuffle deterministically, then carve the tail off as a
            # local pseudo test set.
            train_df = train_df.sample(frac=1,
                                       random_state=1029).reset_index(drop=True)
            test = train_df.tail(200000)
            train_df = train_df.head(len(train_df) - 200000)
        else:
            test = pd.read_csv(TEST_DATA)

        train_df['comment_text'] = train_df['comment_text'].apply(preprocess)
        test['comment_text'] = test['comment_text'].apply(preprocess)

        # replace blank with nan
        train_df['comment_text'].replace('', np.nan, inplace=True)
        test['comment_text'].replace('', np.nan, inplace=True)

        # nan prediction
        nan_pred = train_df['target'][train_df['comment_text'].isna()].mean()

        # fill up the missing values
        X_train = train_df['comment_text'].fillna('_##_').values
        X_test = test['comment_text'].fillna('_##_').values

        # get the target values
        weights = training_weights(train_df, TOXICITY_COLUMN, IDENTITY_COLUMNS)
        loss_weight = 1.0 / weights.mean()
        y_train_identity = train_df[IDENTITY_COLUMNS].values
        y_train_annotator_counts = train_df['toxicity_annotator_count'].values
        # Target layout: [toxicity, sample weight, aux toxicity columns...].
        y_train = np.hstack(
            (train_df[TOXICITY_COLUMN].values.reshape(-1, 1),
             weights.reshape(-1, 1), train_df[AUX_TOXICITY_COLUMNS].values))

        train_nan_mask = X_train == '_##_'
        test_nan_mask = X_test == '_##_'

        vocab = build_vocab(chain(X_train), config.max_features)
        fasttext_embedding_matrix = load_embedding(EMBEDDING_FASTTEXT,
                                                   vocab['token2id'])
        glove_embedding_matrix = load_embedding(EMBEDDING_GLOVE,
                                                vocab['token2id'])

        joblib.dump(vocab, OUT_DIR / 'vocab.pkl')
        np.save(OUT_DIR / 'fasttext_embedding_matrix',
                fasttext_embedding_matrix)
        np.save(OUT_DIR / 'glove_embedding_matrix', glove_embedding_matrix)

        X_train = np.array(tokenize(X_train, vocab, config.max_len))
        X_test = np.array(tokenize(X_test, vocab, config.max_len))

        # Rows where every toxicity/identity-related column is exactly 0.0;
        # half of these "all-negative" rows are dropped per fold below.
        all_related_columns = [TOXICITY_COLUMN
                               ] + AUX_TOXICITY_COLUMNS + IDENTITY_COLUMNS
        negative_indices = np.arange(
            0, len(train_df))[(train_df[all_related_columns] == 0.0).sum(
                axis=1) == len(all_related_columns)]

    with timer('train'):
        # Folds are stratified on the binarized toxicity target (>= 0.5).
        skf = StratifiedKFold(n_splits=config.num_folds,
                              shuffle=True,
                              random_state=1)
        num_aux_targets = y_train.shape[-1] - 2
        custom_loss = CustomLoss(
            loss_weight,
            alpha=config.loss_alpha,
            beta=config.loss_beta,
            use_annotator_counts=True,
            weight_from_annotator_counts=lambda x: torch.log(x + 2))
        test_dataset = TextDataset(token_lists=X_test)
        test_prediction = np.zeros(len(test_dataset))
        test_prediction_count = 0
        models = {}
        for i, (train_idx, valid_idx) in enumerate(
                skf.split(X_train, y_train[:, 0] >= 0.5)):
            seed_torch(i)
            np.random.shuffle(negative_indices)
            drop_indices = set(negative_indices[:len(negative_indices) // 2])
            train_idx = [i for i in train_idx if i not in drop_indices]
            train_token_lists = [X_train[i] for i in train_idx]
            valid_token_lists = [X_train[i] for i in valid_idx]
            train_dataset = TextDataset(
                token_lists=train_token_lists,
                targets=y_train[train_idx],
                identities=y_train_identity[train_idx],
                annotator_counts=y_train_annotator_counts[train_idx])
            valid_dataset = TextDataset(
                token_lists=valid_token_lists,
                targets=y_train[valid_idx],
                identities=y_train_identity[valid_idx],
                annotator_counts=y_train_annotator_counts[valid_idx])
            model = LstmGruNet(embedding_matrices=[
                glove_embedding_matrix, fasttext_embedding_matrix
            ],
                               num_aux_targets=num_aux_targets).to(device)
            model, records = train(model,
                                   custom_loss,
                                   train_dataset,
                                   valid_dataset,
                                   device=device,
                                   batch_size=config.batch_size,
                                   num_epochs=config.train_epochs,
                                   tolerance=config.tolerance,
                                   lr=config.lr)
            test_prediction += predict(model, test_dataset, device)
            test_prediction_count += 1
            torch.save(model.state_dict(), OUT_DIR / f'model.{i}.json')

            with open(OUT_DIR / f'records.{i}.json', 'w') as f:
                import json
                json.dump(records, f, indent=4)

            # Submission is rewritten after every fold with the running mean
            # of the per-fold test predictions.
            submission = pd.DataFrame({
                'id':
                test['id'],
                'prediction':
                test_prediction / test_prediction_count
            })
            submission.to_csv(SUBMISSION_PATH, index=False)
            display_tables(OUT_DIR)
def main():
    """Blend model predictions with Optuna-searched weights over CV folds.

    Repeatedly splits the pooled validation predictions in half, searches
    blend weights on one half, scores them on the other, then reports
    per-fold scores, weight statistics, and the subset of "robust" folds
    whose train/valid gap stays below the configured threshold.
    """
    config = load_config('./config/blend.json')
    config.setdefault('n_folds', 10)
    config.setdefault('n_trials', 300)
    config.setdefault('threshold', 0.03)

    # Pool the per-model validation submissions into a single frame.
    df_valid = pd.concat([
        pd.read_csv(path / 'valid_submission.csv', index_col='id')
        for path in VALID_DIR
    ],
                         axis=0)
    train_scores = []
    valid_scores = []
    # NOTE(review): `models` must be defined at module level — not visible here.
    params = {model: [] for model in models}

    for i in range(config.n_folds):
        # Fresh shuffle per fold, then a 50/50 train/valid split.
        df_valid = df_valid.sample(frac=1,
                                   random_state=i).reset_index(drop=True)
        train_fold = df_valid[:len(df_valid) // 2]
        valid_fold = df_valid[len(df_valid) // 2:]
        train_evaluator = JigsawEvaluator(train_fold[TOXICITY_COLUMN].values,
                                          train_fold[IDENTITY_COLUMNS].values)
        valid_evaluator = JigsawEvaluator(valid_fold[TOXICITY_COLUMN].values,
                                          valid_fold[IDENTITY_COLUMNS].values)

        # Search blend weights on the train half.
        study = optuna.create_study()
        study.optimize(
            lambda trial: objective(trial, train_fold.values, train_evaluator),
            n_trials=config.n_trials)
        trial = study.best_trial
        train_scores.append(1 - trial.value)
        # Record the best weights normalized to sum to 1.
        values = np.array(list(trial.params.values()))
        values /= values.sum()
        for key, value in zip(trial.params.keys(), values):
            params[key].append(value)

        # Score the (unnormalized) best weights on the held-out half.
        valid_preds = np.zeros((len(valid_fold)))
        for key, value in trial.params.items():
            valid_preds += valid_fold[key].values * value
        score, _ = valid_evaluator.get_final_metric(valid_preds)
        valid_scores.append(score)

    for i, (train_score,
            valid_score) in enumerate(zip(train_scores, valid_scores)):
        print(
            f'fold {str(i + 1):2s} - train: {train_score:.5f}, valid: {valid_score:.5f}'
        )

    print('-' * 20)
    print(
        f'train mean: {np.mean(train_scores):.5f}, var: {np.var(train_scores):.7f}'
    )
    print(
        f'valid mean: {np.mean(valid_scores):.5f}, var: {np.var(valid_scores):.7f}'
    )

    print('-' * 20)
    for key, values in params.items():
        print(f'{key:25s} {np.mean(values):.6f} {np.var(values):.6f}')

    print('-' * 20)
    print(f'robust folds: threshold {config.threshold}')
    robust_folds = []
    robust_train_scores = []
    robust_valid_scores = []
    for i, (train_score,
            valid_score) in enumerate(zip(train_scores, valid_scores)):
        if np.abs(train_score - valid_score) < config.threshold:
            robust_folds.append(i)
            robust_train_scores.append(train_score)
            robust_valid_scores.append(valid_score)
        # NOTE(review): this print sits inside the loop and re-prints the
        # cumulative fold list every iteration — possibly meant to be dedented.
        print(' '.join(map(str, robust_folds)))

    print('-' * 20)
    print(
        f'train mean: {np.mean(robust_train_scores):.5f}, var: {np.var(robust_train_scores):.7f}'
    )
    print(
        f'valid mean: {np.mean(robust_valid_scores):.5f}, var: {np.var(robust_valid_scores):.7f}'
    )

    print('-' * 20)
    for key, values in params.items():
        robust_values = np.array(values)[robust_folds]
        print(
            f'{key:25s} {np.mean(robust_values):.6f} {np.var(robust_values):.6f}'
        )
Example #19
0
                                      metric=eval_metrics[main_metric],
                                      prev_metric=best_metric)

        save_model(ema_model, logdir, "ema.pth")
        elapsed_sec = time.time() - t0
        elapsed_min = int(elapsed_sec // 60)
        elapsed_sec = elapsed_sec % 60
        logger.info(
            f"Elapsed time: {elapsed_min}min {elapsed_sec:.4f}seconds.")


if __name__ == "__main__":
    warnings.filterwarnings("ignore")

    # CLI args and experiment configuration.
    args = utils.get_parser().parse_args()
    config = utils.load_config(args.config)

    global_params = config["globals"]

    # Output directory and file logger.
    output_dir = Path(global_params["output_dir"])
    output_dir.mkdir(exist_ok=True, parents=True)
    logger = utils.get_logger(output_dir / "output.log")

    utils.set_seed(global_params["seed"])
    device = C.get_device(global_params["device"])

    # Metadata dataframe, data directory and CV splitter built from config.
    df, datadir = C.get_metadata(config)
    splitter = C.get_split(config)

    calltype_labels = C.get_calltype_labels(df)
Example #20
0
def main():
    """Train or test the colorization GAN (UNet generator + DNet discriminator)."""

    parser = argparse.ArgumentParser()
    parser.add_argument("--action",
                        type=str,
                        default='test',
                        help="train or test")
    args = parser.parse_args()

    config = load_config()

    # TensorBoard writer under a per-run timestamped directory.
    time_now = datetime.now().isoformat()

    if not os.path.exists(config.RUN_PATH):
        os.mkdir(config.RUN_PATH)
    writer = SummaryWriter(log_dir=os.path.join(config.RUN_PATH, time_now))

    # Seed all RNGs for reproducibility.
    torch.manual_seed(config.SEED)
    torch.cuda.manual_seed(config.SEED)
    np.random.seed(config.SEED)
    random.seed(config.SEED)

    # INIT GPU
    os.environ['CUDA_VISIBLE_DEVICES'] = ','.join(str(e) for e in config.GPU)
    if torch.cuda.is_available():
        config.DEVICE = torch.device("cuda")
        print('\nGPU IS AVAILABLE')
        torch.backends.cudnn.benchmark = True
    else:
        config.DEVICE = torch.device("cpu")

    ##############################################################
    # Initialise the generator and discriminator with the UNet and
    # DNet architectures respectively.
    generator = UNet(True).to(config.DEVICE)
    discriminator = DNet().to(config.DEVICE)

    ###################################################################
    # Create ADAM optimizer for the generator as well the discriminator.
    # Create loss criterion for calculating the L1 and adversarial loss.
    d_optimizer = optim.Adam(discriminator.parameters(),
                             betas=(0.5, 0.999),
                             lr=config.LR)
    g_optimizer = optim.Adam(generator.parameters(),
                             betas=(0.5, 0.999),
                             lr=config.LR)

    d_loss = nn.BCELoss()
    g_loss_bce = nn.BCELoss()
    g_loss_l1 = nn.L1Loss()

    # Load the dataset for the requested action.
    if args.action == 'train':

        train_dataset = LABDataset(config, config.TRAIN_PATH)
        len_train = len(train_dataset)
        train_loader = torch.utils.data.DataLoader(
            train_dataset, batch_size=config.BATCH_SIZE, shuffle=True)
        iter_per_epoch = len(train_loader)
        train_(config, train_loader, generator, discriminator, d_optimizer,
               g_optimizer, d_loss, g_loss_bce, g_loss_l1, len_train,
               iter_per_epoch, writer)

    if args.action == "test":

        test_dataset = LABDataset(config, config.TEST_PATH)
        test_loader = torch.utils.data.DataLoader(test_dataset,
                                                  batch_size=1,
                                                  shuffle=False)
        test(config, test_loader, generator, g_loss_l1)
Example #21
0
def step_impl(context):
    """Assert the stored config points at the expected current project."""
    current_config = load_config()
    expected = has_entry('current_project', CONFIG['DEFORM']['PROJECT'])
    assert_that(current_config, expected)
import os
import argparse
from src.utils import load_config, train, train_am, load_model, inference, make_submission

if __name__ == '__main__':
    # CLI: a single --config flag pointing at the model/training YAML.
    parser = argparse.ArgumentParser()
    parser.add_argument('--config',
                        type=str,
                        default='config/ResNet50.yml',
                        help='path to config file')
    args = parser.parse_args()

    config = load_config(args.config)
    # Train with the additive-margin objective. The commented block below is
    # a previously used manual inference/submission workflow, kept for reference.
    train_am(config)
    # model = load_model('saved_model/resnet50_acc_0.785.pkl', config)
    # make_submission(model, config, 'Train-Test-Data/public-test.csv', 'submission.csv')
    # orig_folder = 'Train-Test-Data/dataset/439-F-38'
    # files = os.listdir(orig_folder)
    # for fn in files:
    #     output = inference(model, os.path.join(orig_folder, fn), config)
    #     print(output.item())
Example #23
0
def main():
    """End-to-end ECG classifier training.

    Loads the processed dataset, builds a model via ModelFactory, then for
    each epoch runs a training pass and a test pass, logging losses, metrics
    and resource usage through the tracking/summary agents.
    """
    print('Loading config')
    cfg = load_config()

    # Prefer GPU when available.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print('Utilizing device {}'.format(device))

    print('Initializing Dataloader & Dataset.')
    train_data = ECGDataset(cfg['PROCESSED_DATA_DIR'])
    test_data = ECGDataset(cfg['PROCESSED_DATA_DIR'], test=True)
    train_loader = DataLoader(dataset=train_data,
                              batch_size=cfg['BATCH_SIZE'],
                              shuffle=True,
                              num_workers=4,
                              pin_memory=True)

    test_loader = DataLoader(dataset=test_data,
                             batch_size=cfg['BATCH_SIZE'],
                             num_workers=4,
                             pin_memory=True)

    print('Initializing Model & Components.')
    # Model dimensions are derived from the loaded data tensor; assumes
    # train_data.data is (num_samples, length, channels) -- TODO confirm.
    model_factory = ModelFactory(cfg,
                                 num_classes=train_data.num_classes,
                                 input_size=train_data.data.shape[2],
                                 input_length=train_data.data.shape[1])

    model = model_factory.get().to(device)
    optimizer = Adam(model.parameters(), lr=cfg['LR'])
    loss = CrossEntropyLoss()

    print('Initializing Helper Agents.')
    tracker = TrackingAgent()
    summary = SummaryAgent(cfg['SUMMARY_PATH'], model.name, cfg)

    print('Start training with {} epochs'.format(cfg['EPOCHS']))
    for e in range(1, cfg['EPOCHS'] + 1):

        # --- training pass ---
        tracker.start_time()
        model.train()
        for i_batch, sample_batched in enumerate(tqdm(train_loader, leave=False)):
            x_sequence = sample_batched['data'].to(device)
            y = sample_batched['label'].to(device)

            y_hat = model(x_sequence)
            batch_loss = loss(y_hat, y)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            tracker.add_train_loss(batch_loss)
            tracker.add_train_prediction(y_hat, y)

        tracker.stop_time()
        tracker.add_cpu_usage()
        tracker.add_gpu_usage()

        # --- evaluation pass (gradient-free) ---
        model.eval()
        for i_batch, sample_batched in enumerate(test_loader):
            with torch.no_grad():
                x_test = sample_batched['data'].to(device)
                y_test = sample_batched['label'].to(device)
                y_hat = model(x_test)
                test_loss = loss(y_hat, y_test)

            tracker.add_test_loss(test_loss)
            tracker.add_test_prediction(y_hat, y_test)

        # --- per-epoch logging ---
        # Metric tuples are indexed (accuracy, precision, recall, f1).
        train_metrics = tracker.get_train_metrics()
        train_loss = tracker.get_train_loss()
        summary.add_scalar('Train Loss', train_loss)
        summary.add_scalar('Train Accuracy', train_metrics[0])
        summary.add_scalar('Train Precision', train_metrics[1])
        summary.add_scalar('Train Recall', train_metrics[2])
        summary.add_scalar('Train F1-Score', train_metrics[3])

        test_metrics = tracker.get_test_metrics()
        test_loss = tracker.get_test_loss()
        summary.add_scalar('Test Loss', test_loss)
        summary.add_scalar('Test Accuracy', test_metrics[0])
        summary.add_scalar('Test Precision', test_metrics[1])
        summary.add_scalar('Test Recall', test_metrics[2])
        summary.add_scalar('Test F1-Score', test_metrics[3])

        cpu, gpu = tracker.get_performance_metrics()
        summary.add_scalar('CPU Utilization', cpu)
        summary.add_scalar('GPU Utilization', gpu)
        summary.add_scalar('Epoch Time', tracker.epoch_time)

        # Reset per-epoch accumulators and checkpoint the model every epoch.
        tracker.reset()
        summary.save_model(model)
        summary.adv_episode()
        summary.flush()

        print('Ep. {0}; Epoch Loss {1:.2f}; Train Acc. {2:.2f}; Val. Loss {3:.2f}; Val. Acc. {4:.2f}'
              .format(e, train_loss, train_metrics[0], test_loss, test_metrics[0]))

    summary.close()
    tracker.get_plots(show=False)
Exemple #24
0
import os
import sys
# Make the project root (one directory up) importable for the src.* imports.
root = os.path.abspath('..')
sys.path.append(root)
import pandas as pd
from src.utils import load_config
from sklearn.model_selection import train_test_split
import logging as log


def create_holdout_set(df, test_split_percent):
    """Split *df* into train/hold-out partitions and write them as CSV.

    The split is deterministic (fixed random_state) and both files are
    written under ``<Project_path>/data/``, read from the module-level
    ``config`` (loaded in the ``__main__`` block).

    Args:
        df: DataFrame to split.
        test_split_percent: fraction of rows routed to ``hold_out.csv``.
    """
    X_train, X_test = train_test_split(df,
                                       test_size=test_split_percent,
                                       random_state=2020)
    # Build the paths with os.path.join instead of string concatenation so
    # the result is correct even when Project_path lacks a trailing slash
    # (the old `... + 'data/'` form silently produced e.g. 'rootdata/').
    data_dir = os.path.join(config['PATHS']['Project_path'], 'data')
    X_test.to_csv(os.path.join(data_dir, 'hold_out.csv'), index=False)
    X_train.to_csv(os.path.join(data_dir, 'train.csv'), index=False)


if __name__ == '__main__':
    # Configure logging: without this the root logger defaults to WARNING
    # and the log.info() call below is silently dropped.
    log.basicConfig(level=log.INFO)
    config = load_config('config.yaml')
    # os.path.join keeps the path correct even when Project_path has no
    # trailing separator (string concatenation did not).
    df = pd.read_csv(
        os.path.join(config['PATHS']['Project_path'], 'data',
                     'dataset_00_with_header.csv'))
    create_holdout_set(df, 0.9)
    log.info('Holdout set created !!!')
Exemple #25
0
def main():
    """CLI entry point for the fake-news classifier.

    Parses arguments, validates the requested mode combination, prepares
    data and model, then dispatches to init/train/test/demo/plot.
    """
    # Argparse custom actions
    class SetModes(argparse.Action):
        """Set the modes of operations."""
        def __call__(self, parser, args, values, option_string=None):
            # Each value passed to --mode becomes a boolean flag on args,
            # e.g. `--mode train test` sets args.train and args.test.
            for value in values:
                setattr(args, value, True)

    # yapf: disable
    parser = argparse.ArgumentParser(description='Fake News Classifier')
    # Initialization
    parser.add_argument('--init', action='store_true', default=False,
                        help='perform initialization')
    # Modes
    parser.add_argument('-m', '--mode', action=SetModes, nargs='+', choices=['train', 'test', 'demo', 'plot'],
                        help='specify the mode of operation: train, test, demo, plot')
    parser.add_argument('--train', action='store_true', default=False,
                        help='train the model')
    parser.add_argument('--test', action='store_true', default=False,
                        help='test the model (must either train or load a model)')
    parser.add_argument('--demo', action='store_true', default=False,
                        help='demo the model on linewise samples from a file (must either train or load a model)')
    parser.add_argument('--plot', action='store_true', default=False,
                        help='plot training data (must either train or have existing training data)')
    # Options
    parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('-c', '--config', type=str,
                        help='path to configuration json file (overrides args)')
    parser.add_argument('--data-loader', type=str, default='BatchLoader',
                        help='data loader to use (default: "BatchLoader")')
    parser.add_argument('--dataset', type=str, default='FakeRealNews',
                        help='dataset to use (default: "FakeRealNews")')
    parser.add_argument('-e', '--epochs', type=int, default=10,
                        help='number of epochs to train (default: 10)')
    parser.add_argument('-f', '--file', type=str,
                        help='specify a file for another argument')
    parser.add_argument('--lr', '--learning-rate', dest='learning_rate', type=float, default=1e-4,
                        help='learning rate (default: 1e-4)')
    parser.add_argument('-l', '--load', type=int, metavar='EPOCH',
                        help='load a model and its training data')
    parser.add_argument('--loss', type=str, default='BCEWithLogitsLoss',
                        help='loss function (default: "BCEWithLogitsLoss")')
    parser.add_argument('--model', type=str, default='FakeNewsNet',
                        help='model architecture to use (default: "FakeNewsNet")')
    parser.add_argument('-s', '--sample-size', type=int, metavar='N',
                        help='limit sample size for training')
    parser.add_argument('--seed', type=int, default=0,
                        help='random seed (default: 0)')
    parser.add_argument('--save', action='store_true', default=True,
                        help='save model checkpoints and training data (default: True)')
    parser.add_argument('--no-save', dest='save', action='store_false')
    args = parser.parse_args()
    # yapf: enable

    # Print help if no args
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit()

    # Configure logger
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    # Load configuration file if specified
    if args.config is not None:
        utils.load_config(args)

    # Exit if no mode is specified
    if not args.init and not args.train and not args.test and not args.demo and not args.plot:
        logging.error(
            'No mode specified. Please specify with: --mode {init,train,test,demo,plot}'
        )
        exit(1)
    # Exit on `--load` if run directory not found
    if (args.load is not None or
        (args.plot
         and not args.train)) and not os.path.isdir(utils.get_path(args)):
        logging.error(
            'Could not find directory for current configuration {}'.format(
                utils.get_path(args)))
        exit(1)
    # Exit on `test` or `demo` without `train` or `--load EPOCH`
    if (args.test or args.demo) and not (args.train or args.load is not None):
        logging.error(
            'Cannot run `test` or `demo` without a model. Try again with either `train` or `--load EPOCH`.'
        )
        exit(1)
    # Exit on `demo` without a string file
    if args.demo and not args.file:
        logging.error(
            'Cannot run `demo` without a file. Try again with `--file FILE`.')
        exit(1)

    # Setup run directory
    # NOTE(review): this condition is true only when none of train/test/demo/
    # plot is set, a case the "no mode" check above already exits on (unless
    # only --init is given). The run directory therefore never gets set up for
    # actual train/test runs; `not (...)` was possibly meant to be `(...)` --
    # confirm intent before changing.
    if args.save and not args.init and not (args.train or args.test
                                            or args.demo or args.plot):
        utils.save_config(args)
        path = utils.get_path(args) + '/output.log'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        logging.getLogger().addHandler(logging.FileHandler(path))

    # Set random seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Variable declarations
    training_data = None

    # Load GloVe vocabulary
    if args.init or args.train or args.test or args.demo:
        glove = torchtext.vocab.GloVe(name='6B', dim=50)

    # Perform initialization
    if args.init or args.train or args.test:
        # Determine which dataset to use
        dataset = utils.get_dataset(args)
        # Preload the dataset
        dataset.load()
        # Get preprocessed samples
        samples = preprocessing.get_samples(dataset, glove, args.init)
        random.shuffle(samples)

    # DataLoader setup for `train`, `test`
    if args.train or args.test:
        # Select data loader to use
        DataLoader = utils.get_data_loader(args)

        # Split samples
        split_ratio = [.6, .2, .2]
        trainset, validset, testset = list(
            DataLoader.splits(samples, split_ratio))
        if args.sample_size is not None:  # limit samples used in training
            # Validation set is scaled to keep the same train:valid ratio.
            trainset = trainset[:args.sample_size]
            validset = validset[:int(args.sample_size * split_ratio[1] /
                                     split_ratio[0])]

        # Get data loaders
        train_loader, valid_loader, test_loader = [
            DataLoader(split, batch_size=args.batch_size)
            for split in [trainset, validset, testset]
        ]

    # Load samples for demo
    if args.demo:
        if os.path.isfile(args.file):
            # Read samples from the input file
            with open(args.file, 'r') as f:
                samples = [line for line in f if line.strip()]
            # Dummy 0.5 labels: demo samples have no ground truth.
            data = pd.DataFrame({
                'text': samples,
                'label': [0.5] * len(samples)
            })
            # Preprocess samples
            preprocessing.clean(data)
            samples = preprocessing.encode(data, glove)
            samples = [(torch.tensor(text).long(), label)
                       for text, label in samples]

            # Select data loader to use
            DataLoader = utils.get_data_loader(args)

            # Get data loader
            data_loader = DataLoader(samples, batch_size=1, shuffle=False)
        else:
            logging.error('Could not find file for demo at {}'.format(
                args.file))
            exit(1)

    # Model setup for `train`, `test`, `demo`
    if args.train or args.test or args.demo:
        # Create the model
        model = utils.get_model(glove, args)

        # Load a model
        if args.load is not None:
            utils.load_model(args.load, model, args)

    # Run `train`
    if args.train:
        training_data = training.train(model, train_loader, valid_loader, args)

    # Run `test`
    if args.test:
        if args.train or args.load is not None:
            criterion = utils.get_criterion(args.loss)
            acc, loss = training.evaluate(model, test_loader, criterion)
            logging.info('Testing accuracy: {:.4%}, loss: {:.6f}'.format(
                acc, loss))
        else:
            logging.error('No model loaded for testing')
            exit(1)

    # Run `demo`
    if args.demo:
        if args.train or args.load is not None:
            model.eval()  # set model to evaluate mode
            logging.info('-- Results --')
            for i, (text, _) in enumerate(data_loader):
                preview = data['text'][i][:32] + '...'
                out = model(text).flatten()
                prob = torch.sigmoid(out)  # apply sigmoid to get probability
                pred = (prob >
                        0.5).long()  # predict `true` if greater than 0.5
                # ANSI colour codes: green for `true`, yellow for `fake`.
                label = ['fake', 'true'][pred.item()]
                label = '{}{}{}'.format(
                    '\033[92m' if pred.item() else '\033[93m', label,
                    '\033[0m')
                confidence = (prob if pred.item() else 1 - prob).item()
                logging.info(
                    'Report {}: {} with {:.2%} confidence - "{}"'.format(
                        i, label, confidence, preview))
        else:
            logging.error('No model loaded for demo')
            exit(1)

    # Run `plot`
    if args.plot:
        if training_data is None:
            training_data = utils.load_training_data(args, allow_missing=False)
        if args.load is not None and not args.train:
            # Trim stored curves to the loaded epoch so plots match the model.
            for k, v in training_data.items():
                training_data[k] = v[:args.load + 1]

        logging.info('Plotting training data')
        training.plot(training_data)
Exemple #26
0

def main(config: DictConfig, ckpt: dict, show_dir: Optional[str] = None):
    """Evaluate a trained model checkpoint on the test split.

    Args:
        config: experiment configuration (seed, device, test thresholds).
        ckpt: checkpoint dict containing ``model_state_dict``.
        show_dir: optional directory where visualisations are written.
    """
    # Fix RNG seeds for reproducibility when a seed is configured.
    if config.SEED is not None:
        make_deterministic(seed=config.SEED)

    # Test-set data pipeline.
    loader = get_test_loader(config)

    # Rebuild the network and restore its trained weights.
    net = build_model(config, model_state_dict=ckpt['model_state_dict'])

    # Run the evaluation loop.
    test(model=net,
         data_loader=loader,
         device=config.DEVICE,
         threshold_edge=config.TEST.THRESHOLD_EDGE,
         show_dir=show_dir)


if __name__ == "__main__":
    cli = parse_args()

    # Build the run config (applying any dotlist overrides from the CLI),
    # load the checkpoint, and hand both to the evaluation entry point.
    run_config = load_config(cli.config, update_dotlist=cli.opts)
    checkpoint: dict = torch.load(cli.ckpt)
    main(run_config, checkpoint, cli.show_dir)
Exemple #27
0
def main():
    """Fine-tune a BERT/GPT-2 toxicity classifier and write submission CSVs.

    NOTE(review): several attributes read from ``args`` below (``lm_model``,
    ``old``) are never registered on this parser and would raise
    AttributeError as written -- confirm whether argument definitions were
    lost from this snippet.
    """
    parser = ArgumentParser()
    parser.add_argument('--config_file', type=str, required=True)
    parser.add_argument('--valid', action='store_true')
    args = parser.parse_args()

    config_file = Path(args.config_file)
    config = load_config(config_file)

    # Fill in defaults for any hyperparameters missing from the config file.
    config.setdefault('max_len', 220)
    config.setdefault('max_head_len', 128)
    config.setdefault('epochs', 2)
    config.setdefault('down_sample_frac', 0.5)
    config.setdefault('lr', 1.5e-5)
    config.setdefault('batch_size', 16)
    config.setdefault('accumulation_steps', 4)
    config.setdefault('lr_weight_decay_coef', 1.0)
    config.setdefault('warmup', 0.05)
    config.setdefault('old_data', False)
    config.setdefault('old_fine_tuned', False)
    config.setdefault('device', 'cuda')
    config.setdefault('seed', 1234)

    assert 'lm_model_name' in config
    assert not (config.old_fine_tuned and config.old_data)
    assert config.max_len >= config.max_head_len
    assert config.epochs <= 2

    lm_model_name = config_file.stem
    if config.old_fine_tuned:
        PRETRAINED_PATH = Path(f'../output/{lm_model_name}_old_fine_tune/')
        assert PRETRAINED_PATH.exists()
    else:
        # NOTE(review): args.lm_model is not defined by the parser above.
        PRETRAINED_PATH = args.lm_model
    # First 4 chars of the model name select the architecture ('bert'/'gpt2').
    MODE = args.lm_model[:4]
    LOWER_CASE = 'uncased' in args.lm_model
    LARGE_MODEL = 'large' in args.lm_model
    DEVICE = torch.device(config.device)

    if config.old_data:
        lm_model_name += '_old_fine_tune'

    # Validation mode reserves a fixed tail of the shuffled data.
    if args.valid:
        valid_size = 200000
        shuffle_seed = 1029
        lm_model_name += '_valid'
    else:
        valid_size = 0
        shuffle_seed = config.seed

    OUT_DIR = Path(f'../output/{lm_model_name}/')
    TEST_SUBMISSION = OUT_DIR / 'submission.csv'
    VALID_SUBMISSION = OUT_DIR / 'valid_submission.csv'
    OUT_DIR.mkdir(exist_ok=True)

    warnings.filterwarnings('ignore')
    seed_torch(config.seed)

    # NOTE(review): args.old is not defined by the parser above.
    if not args.old:
        train_data = TRAIN_DATA
        test_data = TEST_DATA
        sample_submission = SAMPLE_SUBMISSION
        train_size = 1804874 - valid_size
    else:
        train_data = TRAIN_OLD
        test_data = TEST_OLD
        sample_submission = SAMPLE_OLD
        train_size = 159571 - valid_size

        # NOTE(review): assigning these names anywhere in main() makes them
        # function-local throughout; the non-old path would then hit
        # UnboundLocalError where AUX_TOXICITY_COLUMNS etc. are used below --
        # confirm whether `global` declarations were intended.
        TOXICITY_COLUMN = OLD_TOXICITY_COLUMN
        IDENTITY_COLUMNS = OLD_IDENTITY_COLUMNS
        AUX_TOXICITY_COLUMNS = OLD_AUX_TOXICITY_COLUMNS

    # Build tokenizer/model/optimizer for the selected architecture.
    if MODE == 'bert':
        from pytorch_pretrained_bert import BertTokenizer, BertForSequenceClassification, BertAdam

        lm_tokenizer = BertTokenizer.from_pretrained(args.lm_model,
                                                     cache_dir=None,
                                                     do_lower_case=LOWER_CASE)
        model = BertForSequenceClassification.from_pretrained(
            PRETRAINED_PATH,
            cache_dir=None,
            num_labels=1 + len(AUX_TOXICITY_COLUMNS))
        optimizer_class = BertAdam
    else:
        from pytorch_pretrained_bert import GPT2Tokenizer, OpenAIAdam, GPT2Model

        lm_tokenizer = GPT2Tokenizer.from_pretrained(args.lm_model,
                                                     cache_dir=None)
        model = GPT2ClassificationHeadModel.from_pretrained(
            PRETRAINED_PATH,
            clf_dropout=config.get('dropout_rate', 0.1),
            n_class=1 + len(AUX_TOXICITY_COLUMNS))
        optimizer_class = OpenAIAdam
        # Per-layer LR decay is only implemented for the BERT branch.
        assert config.lr_weight_decay_coef == 1.0

    with timer('preprocess'):
        tokenizer = MyTokenizer(lm_tokenizer, config.max_len,
                                config.max_head_len, MODE)
        # NOTE(review): reads TRAIN_DATA directly, ignoring the
        # train_data/old-data selection made above -- confirm intended.
        df_train = pd.read_csv(TRAIN_DATA).sample(
            frac=1, random_state=shuffle_seed).reset_index(drop=True)
        df_train['comment_text'] = df_train['comment_text'].astype(str)
        df_train = df_train.fillna(0)
        X_train = tokenizer.tokenize(
            df_train['comment_text'].fillna('DUMMY_VALUE'),
            num_threads=16,
            chunksize=5000)

        # NOTE(review): likewise TEST_DATA vs the test_data selected above.
        df_test = pd.read_csv(TEST_DATA)
        df_test['comment_text'] = df_test['comment_text'].astype(str)
        df_test = df_test.fillna(0)
        X_test = tokenizer.tokenize(
            df_test['comment_text'].fillna('DUMMY_VALUE'),
            num_threads=16,
            chunksize=5000)

        df_train.drop(['comment_text'], axis=1, inplace=True)
        df_test.drop(['comment_text'], axis=1, inplace=True)

        # The shuffled tail of the training set serves as validation data.
        X_valid = X_train[train_size:]
        X_train = X_train[:train_size]

        y_identity_train = df_train[IDENTITY_COLUMNS].values
        y_annotator_counts_train = df_train['toxicity_annotator_count'].values

        # Targets: [toxicity, per-sample weight, aux toxicity columns].
        weights = training_weights(df_train, TOXICITY_COLUMN, IDENTITY_COLUMNS)
        y_train = np.hstack(
            (df_train[TOXICITY_COLUMN].values.reshape(-1, 1),
             weights.reshape(-1, 1), df_train[AUX_TOXICITY_COLUMNS].values))

        y_valid = y_train[train_size:]
        y_train = y_train[:train_size]
        y_identity_valid = y_identity_train[train_size:]
        y_identity_train = y_identity_train[:train_size]
        y_annotator_counts_valid = y_annotator_counts_train[train_size:]
        y_annotator_counts_train = y_annotator_counts_train[:train_size]
        loss_weight = 1.0 / weights.mean() if not args.old else None

        # drop negative samples here
        # Two disjoint subsets of all-negative rows are dropped, one per
        # epoch, so each epoch sees a different down-sampled training set.
        frac = config.down_sample_frac
        target_negative = (y_train > 0.0).sum(axis=1) == 1
        identity_negative = (y_identity_train > 0.0).sum(axis=1) == 0
        negative_mask = identity_negative & target_negative
        negative_indices = np.arange(len(y_train))[negative_mask]
        drop_indices_0 = set(
            negative_indices[:int(len(negative_indices) * frac)])
        drop_indices_1 = set(
            negative_indices[int(len(negative_indices) * (1 - frac)):])
        drop_indices_list = [drop_indices_0, drop_indices_1]

        len_train = len(y_train) - len(drop_indices_0)

    with timer('train'):
        model.zero_grad()
        model = model.to(DEVICE)
        num_layers = 24 if LARGE_MODEL else 12
        optimizer_grouped_parameters = get_optimizer_params(
            model, config.lr, config.lr_weight_decay_coef, num_layers)
        num_train_optimization_steps = int(config.epochs * len_train /
                                           config.batch_size /
                                           config.accumulation_steps)

        optimizer = optimizer_class(optimizer_grouped_parameters,
                                    lr=config.lr,
                                    warmup=config.warmup,
                                    t_total=num_train_optimization_steps)

        # Mixed-precision training via NVIDIA apex (O1 = conservative AMP).
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level='O1',
                                          verbosity=0)
        model = model.train()

        batch_count = len_train // config.batch_size
        loss_fn = CustomLoss(loss_weight)
        for epoch, drop_indices in zip(range(config.epochs),
                                       drop_indices_list):
            sample_indices = np.array(
                [i for i in range(len(y_train)) if i not in drop_indices])
            X_sampled_train = [X_train[i] for i in sample_indices]
            y_sampled_train = y_train[sample_indices]
            y_sampled_identity_train = y_identity_train[sample_indices]
            y_sampled_annotator_counts_train = y_annotator_counts_train[
                sample_indices]
            train_dataset = TextDataset(X_sampled_train, y_sampled_train,
                                        y_sampled_identity_train,
                                        y_sampled_annotator_counts_train)
            train_loader = LengthBucketingDataLoader(
                train_dataset,
                shuffle=True,
                drop_last=True,
                batch_size=config.batch_size)
            tk0 = tqdm(enumerate(train_loader), total=batch_count)
            optimizer.zero_grad()
            # Gradient accumulation: optimizer steps every
            # `accumulation_steps` batches.
            for i, (x_batch, _, a_batch, y_batch, y_identity_batch) in tk0:
                y_pred = model(x_batch.to(DEVICE),
                               attention_mask=(x_batch > 0).to(DEVICE),
                               labels=None)
                loss = loss_fn(y_pred, y_batch.to(DEVICE))
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                if (i + 1) % config.accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()

        model.save_pretrained(OUT_DIR)

    with timer('evaluate'):
        if args.valid:
            valid_dataset = TextDataset(X_valid, y_valid, y_identity_valid,
                                        y_annotator_counts_valid)
            valid_preds = predict(model, valid_dataset, device=DEVICE)

            df_valid = df_train.tail(valid_size)
            df_valid['model1'] = valid_preds
            evaluator = JigsawEvaluator(df_valid[TOXICITY_COLUMN].values,
                                        df_valid[IDENTITY_COLUMNS].values)
            final_score, _ = evaluator.get_final_metric(
                df_valid['model1'].values)

            valid_prediction = predict(model,
                                       TextDataset(X_valid),
                                       device=DEVICE)
            valid_submission = pd.DataFrame({
                'id': df_valid['id'],
                'prediction': valid_prediction
            })
            valid_submission.to_csv(VALID_SUBMISSION, index=False)
            print(f'validation score: {final_score:.5f}')

        test_prediction = predict(model, TextDataset(X_test), device=DEVICE)
        submission = pd.DataFrame({
            'id': df_test['id'],
            'prediction': test_prediction
        })
        submission.to_csv(TEST_SUBMISSION, index=False)
import src.utils as utils

# Notebook conveniences: auto-reload edited modules and render plots inline.
get_ipython().run_line_magic("autoreload", "2")

get_ipython().run_line_magic("matplotlib", "inline")
os.getcwd()

#%% [markdown]
# ### Choose experiment, print out configurations

#%%
# Point at one experiment run and load its config plus fitted DCSBM results.
base_path = "./maggot_models/models/runs/"
experiment = "fit_dcsbm"
run = 2
config = utils.load_config(base_path, experiment, run)
dcsbm_df = utils.load_pickle(base_path, experiment, run, "dcsbm_out_df")

# dcsbm_df = dcsbm_df.apply(pd.to_numeric)
dcsbm_df.head()


#%%
def get_best(df):
    """Find, per ``param_n_blocks`` value, the row with the minimal MSE.

    NOTE(review): this function appears truncated in this paste --
    ``best_rows`` is never populated, nothing is returned, and ``ind`` is
    computed but unused. Recover the original body before relying on it.
    """
    # out_df = df[(df["param_n_components"] == 1) & (df["param_regularizer"] == 0)]
    # kept_params = ["param_n_blocks"]
    param_range = np.unique(df["param_n_blocks"].values)
    best_rows = []
    for p in param_range:
        temp_df = df[df["param_n_blocks"] == p]
        ind = temp_df["mse"].idxmin()
def gender_estimation(weights=None):
    """Train a binary (gender) classifier on the IMDB face dataset.

    Fine-tunes an ImageNet-pretrained resnet18 for 20 epochs with a 90/10
    train/validation split, printing per-epoch metrics and saving model
    weights after every epoch.

    Args:
        weights: path to pretrained weights; currently unused -- the loading
            code is commented out below. TODO confirm whether to re-enable.
    """
    load_config()
    device = torch.device('cuda')
    imdb_root = os.environ['IMDB_ROOT']
    df = unpickle_imdb(f"{imdb_root}/imdb.pickle")
    savedir = f"{os.environ['LOG_DIR']}"
    trans = transforms.Compose([
        # transforms.Resize(72),
        #transforms.RandomCrop(64),
        transforms.Resize(64),
        transforms.CenterCrop(64),
        transforms.RandomHorizontalFlip(0.5),
        transforms.ToTensor(),
        transforms.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ])
    ds = ImdbDataset(root=imdb_root, df=df, transform=trans)
    print(f"Loaded ds with {len(ds)} items.")
    # 90/10 train/validation split.
    tr, val = random_split(ds, [len(ds) - len(ds) // 10, len(ds) // 10])
    loss_fn = CrossEntropyLoss()

    #model = resnet50(pretrained=True)
    # Replace the ImageNet head with a 2-class output layer.
    model = resnet18(pretrained=True)
    model.fc = nn.Linear(model.fc.in_features, 2)
    # Alternative SimCLR-based backbone, kept for reference:
    """
    from src.simclr import ResNetSimCLR
    model = ResNetSimCLR('resnet50', 64)
    #if weights:
    #    model.load_state_dict(torch.load(weights))

    model.projector = nn.Sequential(
        nn.Linear(model.n_features, model.n_features, bias=False),
        nn.ReLU(),
        nn.Linear(model.n_features, 2, bias=False)
    )
    for param in model.encoder.parameters():
        param.requires_grad = False
    """
    model.to(device)
    optim = Adam(model.parameters(), lr=3e-4, weight_decay=1e-6)
    tr_dl = DataLoader(tr,
                       batch_size=16,
                       shuffle=True,
                       num_workers=8,
                       pin_memory=True)
    val_dl = DataLoader(val,
                        batch_size=16,
                        shuffle=False,
                        num_workers=4,
                        pin_memory=True)
    # Per-epoch lists of batch statistics, keyed by epoch number.
    tr_log, val_log = {}, {}

    def untrans_display(im):
        # Undo the normalization transform and show the image (debug helper).
        std, mean = torch.as_tensor(IMAGENET_STD), torch.as_tensor(
            IMAGENET_MEAN)
        if mean.ndim == 1: mean = mean[:, None, None]
        if std.ndim == 1: std = std[:, None, None]
        im.mul_(std).add_(mean)
        trans = transforms.ToPILImage()
        im = trans(im)
        im.show()

    def log_epoch(preds, labels, loss, log):
        # Record batch loss and confusion-matrix counts under the current
        # value of the enclosing `epoch` counter.
        _, pred_class = torch.max(preds.data, 1)
        log[epoch].append({
            'loss':
            loss.item(),
            'count':
            labels.size(0),
            'correct': (pred_class == labels).sum().item(),
            'tp': ((pred_class == 1) & (labels == 1)).sum().item(),
            'tn': ((pred_class == 0) & (labels == 0)).sum().item(),
            'fp': ((pred_class == 1) & (labels == 0)).sum().item(),
            'fn': ((pred_class == 0) & (labels == 1)).sum().item(),
            'cnt_p': (labels == 1).sum().item(),
            'cnt_n': (labels == 0).sum().item(),
        })

    def print_log_epoch(_e, log, pretext=''):
        # Aggregate the batch records for epoch `_e` and print loss,
        # accuracy and confusion-matrix totals.
        epoch_loss = [x['loss'] for x in log[_e]]
        sum_loss = sum(epoch_loss)
        cnt_loss = len(epoch_loss)
        avg_loss = sum_loss / cnt_loss
        print(
            f"{pretext}Epoch {_e}: Total Loss={sum_loss}\tAvg Loss={avg_loss}\tNum Batches={cnt_loss}"
        )

        e_cnt = [x['count'] for x in log[_e]]
        e_correct = [x['correct'] for x in log[_e]]
        e_acc = sum(e_correct) / sum(e_cnt)
        print(
            f"{pretext}Epoch {_e}: Total Cnt={sum(e_cnt)}\tTotal Cor={sum(e_correct)}\tAcc={e_acc}"
        )

        tp_cnt = sum([x['tp'] for x in log[_e]])
        tn_cnt = sum([x['tn'] for x in log[_e]])
        fp_cnt = sum([x['fp'] for x in log[_e]])
        fn_cnt = sum([x['fn'] for x in log[_e]])
        p_cnt = sum([x['cnt_p'] for x in log[_e]])
        n_cnt = sum([x['cnt_n'] for x in log[_e]])
        print(
            f"{pretext}Epoch {_e}: TP={tp_cnt}\tTN={tn_cnt}\tFP={fp_cnt}\tFN={fn_cnt}\tP cnt={p_cnt}\tN cnt={n_cnt}"
        )

    def _epoch(train):
        # One pass over the train (with backprop) or validation loader.
        if train:
            dl = tr_dl
            model.train()
        else:
            dl = val_dl
            model.eval()

        for ix, (img, label) in enumerate(dl):
            img = img.to(device=device)
            labels = label.to(device=device, dtype=torch.int64)
            preds = model(img)
            # _, preds = model(img)
            loss = loss_fn(preds, labels)
            if train:
                optim.zero_grad()
                loss.backward()
                optim.step()
                log_epoch(preds, labels, loss, tr_log)
            else:
                log_epoch(preds, labels, loss, val_log)

    def _save_weights(prefix='', suffix=''):
        # Timestamped checkpoint filename under $LOG_DIR.
        time = datetime.datetime.now()
        s = f"{prefix}_{time.month}_{time.day}_{time.hour}_{time.minute}_{time.second}_{suffix}.pth"
        fname = f"{savedir}/{s}"
        print(f"Saving to: {fname}")
        torch.save(model.state_dict(), fname)

    # Main loop: 20 epochs of train + validate, checkpointing every epoch.
    # (The `epoch` counter shadows the loop variable `i`; both advance
    # together.)
    epoch = 0
    for i in range(20):
        tr_log[epoch] = []
        _epoch(True)
        print_log_epoch(epoch, tr_log)

        val_log[epoch] = []
        _epoch(False)
        print_log_epoch(epoch, val_log, pretext='VAL::')
        epoch += 1
        _save_weights(prefix=f'long')
Exemple #30
0
import unittest
import os
from src.utils import load_config

# Populate environment variables (e.g. IMDB_ROOT) before the tests below run.
load_config()


class IMDBTestCase(unittest.TestCase):
    def setUp(self) -> None:
        self.imdb_root = os.environ['IMDB_ROOT']

    def test_build_imdb(self):
        from src.datasets import build_imdb
        df = build_imdb(f"{self.imdb_root}/imdb.mat",
                        n=None,
                        save=f"{self.imdb_root}/imdb.pickle")
        print(df)
        print(df[:5])

    def test_load_pickle(self):
        from src.datasets import unpickle_imdb
        df = unpickle_imdb(f"{self.imdb_root}/imdb.pickle")
        print(df)

    def test_imdb_dataset(self):
        from src.datasets import ImdbDataset
        from torchvision.transforms import ToTensor
        from src.datasets import unpickle_imdb
        df = unpickle_imdb(f"{self.imdb_root}/imdb.pickle")
        ds = ImdbDataset(root=self.imdb_root, df=df, transform=ToTensor())