def main(config, results):
    """Run one training experiment per train/validation fold.

    A new model is loaded for every fold yielded by
    ``Dataset.get_train_valid_dataloaders(include_valid_df=True)``.

    Args:
        config: Run configuration; handed to the ``tu``/``util`` helpers and
            read here for ``device``.
        results: Results object given to the first fold's ``Experiment``;
            every later fold receives the previous fold's
            ``experiment.results``.

    Returns:
        ``experiment.results`` of the last fold, or ``results`` unchanged
        when no fold is yielded.
    """
    tokenizer = tu.load_tokenizer(config, tu.load_model_config(config))

    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names
    # The model config is loaded a second time, after the tokenizer exists.
    # NOTE(review): presumably so the models start from an untouched config
    # -- confirm against tu.load_tokenizer.
    model_config = tu.load_model_config(config)

    folds = dataset.get_train_valid_dataloaders(include_valid_df=True)
    for i, (train_loader, (valid_loader, _valid_df)) in enumerate(folds):
        print(f"------------------ BEGIN ITER {i} -----------------------")
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=f"fold{i}",
            results=results,
        )
        _step, _loss = experiment.train(train_loader,
                                        valid_dataloader=valid_loader)

        # Thread the accumulated results into the next fold.
        results = experiment.results
        print(f"================== DONE ITER {i} =======================\n\n")

    return results
def main():
    """Train on the complete binary ``TDDataset`` and save the model.

    The configuration comes from ``get_config()`` and is overridden inside a
    ``with config:`` block; the trained model is written to
    ``util.models_path('satd_complete_binary')``.
    """
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        config.model_type = 'roberta'
        config.model_path = util.models_path('StackOBERTflow-comments-small-v1')

    dataset = TDDataset(config, binary=True)

    tokenizer = tu.load_tokenizer(config)
    # The result is not used below; the call is kept as in the original.
    _model_cls = tu.get_model_cls(config)

    complete_train_loader = dataset.get_complete_train_dataloader(tokenizer)
    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)
    _step, _loss = experiment.train(complete_train_loader)

    output_path = util.models_path('satd_complete_binary')
    experiment.save_model(output_path)
Beispiel #3
0
def main(config, results):
    """Train once on the train/valid/test split named by ``config.dataset``.

    Args:
        config: Run configuration; ``config.dataset`` selects the dataloader
            factory method on ``Dataset`` and ``config.device`` the device.
        results: Results object forwarded to ``Experiment``.

    Returns:
        ``experiment.results`` after training.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    # The factory method is resolved by name from config.dataset.
    loader_factory = getattr(
        dataset, f"get_{config.dataset}_train_valid_test_dataloaders")
    train_loader, valid_loader, test_loader = loader_factory()

    model = tu.load_model(config, model_config)
    # Match the embedding matrix to the tokenizer's current length.
    model.resize_token_embeddings(len(tokenizer))
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=label_names,
        results=results,
    )
    _step, _loss = experiment.train(
        train_loader,
        valid_dataloader=valid_loader,
        test_dataloader=test_loader,
    )

    return experiment.results
def main(config, results):
    """Train on a single train/validation split and save the model.

    Args:
        config: Run configuration; read here for ``device``.
        results: Results object forwarded to ``Experiment``.

    Returns:
        ``experiment.results`` as captured right after training (before the
        model is saved to ``util.models_path('comment_code_shuffle')``).
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    train_loader, valid_loader = dataset.get_train_valid_dataloaders()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=label_names,
        results=results,
    )
    _step, _loss = experiment.train(train_loader,
                                    valid_dataloader=valid_loader)
    trained_results = experiment.results

    experiment.save_model(util.models_path('comment_code_shuffle'))

    return trained_results
Beispiel #5
0
def main(config, results):
    """Run one experiment per fold of the dataset named by ``config.dataset``.

    Args:
        config: Run configuration; ``config.dataset`` selects the fold
            factory method on ``Dataset`` and ``config.device`` the device.
        results: Results object given to the first fold's ``Experiment``;
            each later fold receives the previous ``experiment.results``.

    Returns:
        ``experiment.results`` of the last fold, or ``results`` unchanged
        when no fold is yielded.
    """
    tokenizer = tu.load_tokenizer(config, tu.load_model_config(config))

    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    fold_factory = getattr(
        dataset, f"get_{config.dataset}_train_valid_dataloaders")
    folds = fold_factory(include_valid_df=True)

    for i, (train_loader, (valid_loader, _valid_df)) in enumerate(folds):
        print(f"------------------ BEGIN ITER {i} -----------------------")
        # Reload the original model config for every fold to avoid a
        # vocabulary size mismatch caused by custom tokens.
        fold_model_config = tu.load_model_config(config)
        model = tu.load_model(config, fold_model_config)
        model.resize_token_embeddings(len(tokenizer))
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=f"fold{i}",
            results=results,
        )
        _step, _loss = experiment.train(train_loader,
                                        valid_dataloader=valid_loader)

        # Thread the accumulated results into the next fold.
        results = experiment.results
        print(f"================== DONE ITER {i} =======================\n\n")

    return results
def main(config, results):
    """Train a model, save it, reload it, and evaluate both copies.

    The model is trained with the dataset's "fake" validation dataloader,
    evaluated on the real test and validation dataloaders, saved to
    ``'test_model_complexity'``, reloaded from that path and evaluated again
    under the ``*_reloaded`` names.

    Args:
        config: Run configuration; read here for ``device`` and mutated so
            that ``model_path`` points at the saved model.
        results: Results object forwarded to both ``Experiment`` instances.

    Returns:
        ``experiment.results`` of the second (reloaded-model) experiment.
    """
    # Single source for the save/reload location used below.
    saved_model_path = 'test_model_complexity'

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    ds = Dataset(config, tokenizer)
    label_names = ds.label_names

    train_dataloader = ds.get_train_dataloader()
    fake_valid_dataloader = ds.get_fake_valid_dataloader()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config,
                            model,
                            tokenizer,
                            label_names=label_names,
                            results=results)
    experiment.train(train_dataloader, valid_dataloader=fake_valid_dataloader)

    # The real validation/test loaders are only built after training.
    valid_dataloader = ds.get_valid_dataloader()
    test_dataloader = ds.get_test_dataloader()
    experiment.evaluate('test_final', test_dataloader)
    experiment.evaluate('valid_final', valid_dataloader)
    experiment.save_model(saved_model_path)

    # Point the config at the saved model and load it back.
    with config:
        config.model_path = saved_model_path
    model = tu.load_model(config, model_config)
    model.to(config.device)
    # Logger.warn is a deprecated alias; warning() is the supported spelling.
    logger.warning(
        '#################################### =========================')
    experiment = Experiment(config,
                            model,
                            tokenizer,
                            label_names=label_names,
                            results=results)
    experiment.evaluate('test_final_reloaded', test_dataloader)
    experiment.evaluate('valid_final_reloaded', valid_dataloader)

    return experiment.results
def main(config, results):
    """Train on ``SentiDataset`` with several optional test dataloaders.

    Args:
        config: Run configuration; read here for ``device`` and for the
            ``jira``, ``app_reviews`` and ``sentidata_so`` switches that add
            extra test dataloaders.
        results: Results object forwarded to ``Experiment``.

    Returns:
        ``experiment.results`` after training.
    """
    pd.set_option('display.max_rows', None)

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    dataset = SentiDataset(config, tokenizer)
    # NOTE(review): this loader (and the one rebuilt after seeding) is only
    # referenced by the disabled interpretation step at the end; the calls
    # are kept so the sequence of dataset calls is unchanged.
    test_dataloader = dataset.get_test_dataloader()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    train_loader, valid_loader = dataset.get_train_valid_dataloaders()
    test_dataloader = dataset.get_test_dataloader()

    # Test loaders keyed by name; an entry is either a dataloader or a
    # (dataloader, options dict) tuple.
    named_test_loaders = {'test': dataset.get_test_dataloader()}

    if config.jira:
        jira_options = dict(pred_label_ids_func=dataset.neutral_to_negative)
        named_test_loaders['JIRA'] = (dataset.get_jira_dataloader(),
                                      jira_options)

    if config.app_reviews:
        named_test_loaders['AppReviews'] = dataset.get_app_reviews_dataloader()

    if config.sentidata_so:
        so_loader = dataset.get_stack_overflow_dataloader()
        named_test_loaders['StackOverflow (SentiData)'] = so_loader

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=dataset.label_names,
        results=results,
    )
    _step, _loss = experiment.train(
        train_loader,
        valid_dataloader=valid_loader,
        test_dataloader=named_test_loaders,
    )

    # Disabled interpretation step:
    # interp_df = experiment.interpret(test_dataloader, dataset.test_df,
    #                                  label_names=dataset.label_names)
    # if config.interp_out_file:
    #     interp_df.to_csv(config.interp_out_file, index=False)

    return experiment.results
def main(config, results):
    """Run one experiment per fold, optionally dumping interpretations.

    Args:
        config: Run configuration; ``config.clap`` selects ``ClapDataset``
            over ``Dataset``, ``config.interp_out_file`` (if truthy) enables
            the per-fold interpretation CSV, ``config.device`` is the device.
        results: Results object given to the first fold's ``Experiment``;
            each later fold receives the previous ``experiment.results``.

    Returns:
        ``experiment.results`` of the last fold, or ``results`` unchanged
        when no fold is yielded.
    """
    tokenizer = tu.load_tokenizer(config, tu.load_model_config(config))

    dataset_cls = ClapDataset if config.clap else Dataset
    dataset = dataset_cls(config, tokenizer)

    label_names = dataset.label_names
    # The model config is loaded again after the tokenizer and dataset exist.
    model_config = tu.load_model_config(config)

    if config.interp_out_file:
        interp_out_file = Path(config.interp_out_file)
    else:
        interp_out_file = None

    folds = dataset.get_train_valid_dataloaders(include_valid_df=True)
    for i, (train_loader, (valid_loader, valid_df)) in enumerate(folds):
        print(f"------------------ BEGIN ITER {i} -----------------------")
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=f"fold{i}",
            results=results,
        )
        _step, _loss = experiment.train(train_loader,
                                        valid_dataloader=valid_loader)

        if interp_out_file:
            interp_df = experiment.interpret(valid_loader,
                                             valid_df,
                                             label_names=label_names)
            # Print the frame without row/column truncation.
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                print(interp_df)
            # One CSV per fold, next to the configured output file.
            fold_csv = interp_out_file.with_name(
                f"{interp_out_file.name}_iter{i}")
            interp_df.to_csv(fold_csv, index=False)

        # Thread the accumulated results into the next fold.
        results = experiment.results
        print(f"================== DONE ITER {i} =======================\n\n")

    return results
def main(config, results):
    """Train once with train, validation and test dataloaders.

    Args:
        config: Run configuration; read here for ``device``.
        results: Results object forwarded to ``Experiment``.

    Returns:
        ``experiment.results`` after training.
    """
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    dataset = Dataset(config, tokenizer)
    label_names = dataset.label_names

    train_loader, valid_loader = dataset.get_train_valid_dataloaders()
    test_loader = dataset.get_test_dataloader()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(
        config,
        model,
        tokenizer,
        label_names=label_names,
        results=results,
    )
    _step, _loss = experiment.train(
        train_loader,
        valid_dataloader=valid_loader,
        test_dataloader=test_loader,
    )

    return experiment.results
Beispiel #10
0
def main():
    """Run one experiment per ``CadoDataset`` train/validation fold.

    The configuration comes from ``get_config(parse_args)`` with
    ``logging_steps`` and ``train_epochs`` overridden. Every fold loads a new
    model and is trained with the same test dataloader; each fold's
    ``Experiment`` receives the previous fold's results (``None`` for the
    first fold). Nothing is returned.
    """
    config = get_config(parse_args)
    util.set_seed(config)

    with config:
        config.logging_steps = 50
        config.train_epochs = 5

    print("model is now", config.model_path)
    dataset = CadoDataset(config)
    label_names = dataset.label_names

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    results = None

    test_loader = dataset.get_test_dataloader(tokenizer)

    folds = dataset.get_all_train_valid_dataloaders(tokenizer,
                                                    include_valid_df=True)
    for i, (train_loader, (valid_loader, _valid_df)) in enumerate(folds):
        print(f"------------------ BEGIN ITER {i} -----------------------")
        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        # Run name is "<single_class or 'multi'>_<fold index>".
        run_name = f"{config.single_class if config.single_class else 'multi'}_{i}"

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=run_name,
            results=results,
        )
        _step, _loss = experiment.train(
            train_loader,
            valid_dataloader=valid_loader,
            test_dataloader=test_loader,
        )

        # Thread the accumulated results into the next fold.
        results = experiment.results
        print(f"================== DONE ITER {i} =======================\n\n")
def active_learn(config,
                 model_config,
                 tokenizer,
                 results,
                 label_names,
                 test_df,
                 full_pool_df,
                 backtrans_pool_dfs,
                 get_dataloader_func,
                 run_configs,
                 active_learning_iters=10,
                 dropout_iters=20,
                 balance=False):
    """Run an active-learning loop for every entry of ``run_configs``.

    For each run configuration a new model is loaded, ``full_pool_df`` is
    split into an initial training set and a pool, and the loop repeats until
    the pool is empty: train on the current training set, evaluate on pool
    and test set, score the pool rows with ``method``, move the top-scoring
    rows (optionally de-duplicated through k-means clustering of their
    predictions) from the pool into the training set, and add the matching
    back-translated rows.

    Args:
        config: Run configuration. Read here for ``eval_bs``, ``train_bs``,
            ``device``, ``seed``, ``active_learn_seed_size`` and
            ``active_learn_step_size``.
        model_config: Forwarded to ``tu.load_model``.
        tokenizer: Forwarded to ``Experiment``.
        results: Results object given to the first run's ``Experiment``;
            each later run receives the previous ``experiment.results``.
        label_names: Forwarded to ``Experiment``.
        test_df: Data used to build the test dataloader.
        full_pool_df: DataFrame split into seed training set and pool.
        backtrans_pool_dfs: Mapping from a back-translation key to a
            DataFrame with an ``id`` column; rows whose ``id`` matches a
            selected pool row are appended to the training set.
        get_dataloader_func: Callable ``(df, bs=..., ...)`` returning a
            dataloader; also called with ``balance=`` and ``shuffle=``.
        run_configs: Iterable of ``(method, dropout, backtrans_langs,
            cluster_size)`` tuples. ``method`` maps a tensor to per-row
            scores and must have a ``__name__``.
        active_learning_iters: Not read by this function; kept so existing
            callers keep working.
        dropout_iters: Number of MC-dropout pool evaluations that are
            averaged when a run configuration enables ``dropout``.
        balance: Forwarded to ``get_dataloader_func`` for the training
            dataloader.

    Returns:
        ``experiment.results`` of the last run configuration, or ``results``
        unchanged when ``run_configs`` is empty.

    Raises:
        ValueError: If a run configuration uses dropout with a scoring
            method other than ``af.random_conf`` and ``dropout_iters`` is
            smaller than 1.
    """
    # Local import: DataFrame.append was removed in pandas 2.0, so rows are
    # added to the training set with pd.concat instead.
    import pandas as pd

    test_dataloader = get_dataloader_func(test_df, bs=config.eval_bs)

    for run_config in run_configs:
        method, dropout, backtrans_langs, cluster_size = run_config
        run_name = method.__name__
        if dropout:
            run_name += '_dropout'
        run_name = '_'.join([run_name, *backtrans_langs, f"c{cluster_size}"])

        util.set_seed(config)

        model = tu.load_model(config, model_config)
        model.to(config.device)

        # remove initial seed from pool
        train_df, pool_df = train_test_split(
            full_pool_df,
            train_size=config.active_learn_seed_size,
            random_state=config.seed)

        logger.info("RUN CONFIG: %s (pool size: %d)", run_name,
                    pool_df.shape[0])

        experiment = Experiment(config,
                                model,
                                tokenizer,
                                label_names=label_names,
                                run_name=run_name,
                                results=results)

        cur_iter = 0

        # Baseline evaluation before any training in this run.
        extra_log = {'iter': cur_iter, 'pool': pool_df.shape[0]}
        experiment.evaluate('test', test_dataloader, extra_log=extra_log)

        while pool_df.shape[0] > 0:
            train_dataloader = get_dataloader_func(train_df,
                                                   bs=config.train_bs,
                                                   balance=balance)

            # DON'T SHUFFLE THE POOL! Prediction rows must line up with
            # pool_df rows so that the selected indices address the right
            # samples.
            dataloader_pool = get_dataloader_func(pool_df,
                                                  bs=config.eval_bs,
                                                  shuffle=False)

            logger.info(
                "=================== Remaining %d (%s) ================",
                pool_df.shape[0], run_config)
            logger.info(
                "Evaluating: training set size: %d | pool set size: %d",
                train_df.shape[0], pool_df.shape[0])

            experiment.train(train_dataloader)

            extra_log = {'iter': cur_iter, 'pool': pool_df.shape[0]}

            _, _, preds = experiment.evaluate('pool',
                                              dataloader_pool,
                                              extra_log=extra_log)
            experiment.evaluate('test', test_dataloader, extra_log=extra_log)

            # Convert once, for every branch: the clustering step below
            # indexes ``preds`` with a tensor and calls ``.numpy()`` on the
            # result, which fails if ``preds`` is still a NumPy array (as it
            # previously was in the dropout branch).
            preds = torch.from_numpy(preds)

            if method != af.random_conf:
                if dropout:
                    if dropout_iters < 1:
                        raise ValueError(
                            "dropout_iters must be >= 1 when dropout is "
                            "enabled")
                    # Average the softmax outputs of several MC-dropout
                    # passes, each with its own fixed torch seed.
                    prob_sum = None
                    for i in range(dropout_iters):
                        torch.manual_seed(i)

                        _, _, preds_i = experiment.evaluate('pool_dropout',
                                                            dataloader_pool,
                                                            mc_dropout=True,
                                                            skip_cb=True)
                        probs_i = F.softmax(torch.from_numpy(preds_i), dim=1)

                        if prob_sum is None:
                            prob_sum = probs_i
                        else:
                            prob_sum.add_(probs_i)
                    probs = prob_sum.div_(dropout_iters)
                else:
                    probs = F.softmax(preds, dim=1)
            else:
                # only need the shape
                probs = preds

            scores = method(probs)
            n_candidates = min(cluster_size * config.active_learn_step_size,
                               scores.shape[0])
            _, topk_indices = torch.topk(scores, n_candidates)

            if cluster_size > 1:
                # Cluster the candidates' predictions and keep the first
                # candidate of each cluster.
                topk_preds = preds[topk_indices]
                n_clusters = min(config.active_learn_step_size,
                                 scores.shape[0])
                kmeans = KMeans(n_clusters=n_clusters).fit(topk_preds.numpy())
                _, unique_indices = np.unique(kmeans.labels_,
                                              return_index=True)
                topk_indices = topk_indices[unique_indices]
                logger.debug("top_k: %s", topk_indices.shape)

            logger.debug("%s %s", scores.shape, pool_df.shape)

            assert (scores.shape[0] == pool_df.shape[0])

            # pandas positional indexing is documented for integer
            # lists/arrays, so hand it a NumPy array rather than a tensor.
            selected = topk_indices.cpu().numpy()

            uncertain_rows = pool_df.iloc[selected]
            train_df = pd.concat([train_df, uncertain_rows],
                                 ignore_index=True)

            for backtrans_lang in backtrans_langs:
                backtrans_pool_df = backtrans_pool_dfs[backtrans_lang]
                backtrans_uncertain_rows = backtrans_pool_df[
                    backtrans_pool_df.id.isin(uncertain_rows.id)]
                train_df = pd.concat([train_df, backtrans_uncertain_rows],
                                     ignore_index=True)

            pool_df = pool_df.drop(pool_df.index[selected])
            cur_iter += 1

        logger.debug(
            "Pool exhausted, stopping active learning loop (%d remaining)",
            pool_df.shape[0])

        # Thread the accumulated results into the next run configuration.
        results = experiment.results
    return results
def main():
    """Classify the unclassified SATD comments with the saved binary model.

    Reads ``unclassified.csv`` from the ``satd`` data directory, predicts a
    label for every preprocessed comment, and writes two CSV files: all rows
    with uncertainty, class probabilities and label name
    (``unclassified_evaled.csv``) and only the rows labelled
    ``'TECHNICAL_DEBT'`` (``unclassified_pos.csv``).
    """
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        config.model_type = 'roberta'
        config.model_path = util.models_path('satd_complete_binary')

    tokenizer = tu.load_tokenizer(config)
    # The result is not used below; the call is kept as in the original.
    _model_cls = tu.get_model_cls(config)

    df = pd.read_csv(util.data_path('satd', 'unclassified.csv'))
    df.dropna(inplace=True)

    print(df.dtypes)
    print(df.head())

    df['preprocessed'] = df.commenttext.map(TDDataset.preprocess)
    # Second dropna: removes rows for which preprocessing produced a
    # missing value.
    df.dropna(inplace=True)

    texts = df.preprocessed.values
    # The dataloader is given all-zero placeholder labels.
    placeholder_labels = np.zeros(texts.shape[0])
    dataloader = tu.get_dataloader(
        config,
        tokenizer,
        texts,
        placeholder_labels,
        bs=128,
        shuffle=False,
    )

    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)

    preds = torch.from_numpy(experiment.predict(dataloader))
    probs = F.softmax(preds, dim=1)
    uncertainty = least_conf(probs).numpy()
    predicted_ids = np.argmax(preds, axis=1)

    # Column order matters for the CSV files written below.
    df['uncertainty'] = uncertainty
    df['probs0'] = probs[:, 0].numpy()
    df['probs1'] = probs[:, 1].numpy()
    df['classification'] = predicted_ids
    df.drop('preprocessed', axis='columns', inplace=True)

    # Map label index -> label name.
    label_name_map = dict(enumerate(TDDataset.BINARY_LABEL_NAMES))
    print(label_name_map)

    df['classification'] = df['classification'].map(label_name_map)
    df.to_csv(util.data_path('satd', 'unclassified_evaled.csv'), index=False)

    is_tech_debt = df['classification'] == 'TECHNICAL_DEBT'
    tech_debt_df = df[is_tech_debt]
    print(tech_debt_df.shape)
    positives_path = util.data_path('satd', 'unclassified_pos.csv')
    tech_debt_df.to_csv(positives_path, index=False)
Beispiel #13
0
def main(config, results):
    """Train one binary ``TDDataset`` model per project.

    With ``config.single_project`` set, only that project's train/validation
    split is used; otherwise every entry of ``ds.get_fold_dataloaders`` is
    processed.

    Args:
        config: Run configuration; read here for ``self_train_thresh``,
            ``keyword_masking_frac``, ``single_project``, ``interp_out_file``
            and ``device``.
        results: Results object given to the first project's ``Experiment``;
            each later project receives the previous ``experiment.results``.

    Returns:
        ``experiment.results`` of the last project, or ``results`` unchanged
        when there is nothing to iterate over.
    """
    logger.warning('Unclassified threshold: %s', config.self_train_thresh)

    ds = TDDataset(
        config,
        binary=True,
        self_train_thresh=config.self_train_thresh,
        keyword_masking_frac=config.keyword_masking_frac,
    )

    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)
    label_names = ds.label_names

    single_project = config.single_project

    # Each entry is (project name, train loader, (valid loader, valid df)).
    if single_project:
        split = ds.get_train_valid_dataloaders(tokenizer,
                                               single_project,
                                               include_valid_df=True)
        project_runs = [(single_project, *split)]
    else:
        project_runs = ds.get_fold_dataloaders(tokenizer,
                                               include_valid_df=True)

    if config.interp_out_file:
        interp_out_file = Path(config.interp_out_file)
    else:
        interp_out_file = None

    for project_name, train_loader, (valid_loader, valid_df) in project_runs:
        print(
            f"------------------ BEGIN PROJECT {project_name} -----------------------"
        )

        model = tu.load_model(config, model_config)
        model.to(config.device)
        util.set_seed(config)

        experiment = Experiment(
            config,
            model,
            tokenizer,
            label_names=label_names,
            run_name=project_name,
            results=results,
        )
        _step, _loss = experiment.train(train_loader,
                                        valid_dataloader=valid_loader)

        if interp_out_file:
            interp_df = experiment.interpret(valid_loader, valid_df)
            # Print the frame without row/column truncation.
            with pd.option_context('display.max_rows', None,
                                   'display.max_columns', None):
                print(interp_df)
            # One CSV per project, next to the configured output file.
            project_csv = interp_out_file.with_name(
                f"{project_name}_{interp_out_file.name}")
            interp_df.to_csv(project_csv, index=False)

        # Thread the accumulated results into the next project.
        results = experiment.results

        print(
            f"================== DONE PROJECT {project_name} =======================\n\n"
        )
    return results