import argparse
import time

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F

# Assumed project-local helpers (not shown in these snippets): util,
# tu (transformer utilities), get_config, TDDataset, Dataset, Experiment.


def main():
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        # config.lr = 1e-4
        config.model_type = 'roberta'
        config.model_path = util.models_path('StackOBERTflow-comments-small-v1')
        # config.train_head_only = True

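    # Binary self-admitted technical debt (SATD) dataset: debt vs. no debt.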
    ds = TDDataset(config, binary=True)

    tokenizer = tu.load_tokenizer(config)
    model_cls = tu.get_model_cls(config)  # looked up but unused in this snippet

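    # Train on the complete training set (no validation split); the saved
    # model is reloaded by the prediction script further below.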
    train_dataloader = ds.get_complete_train_dataloader(tokenizer)
    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)
    global_step, tr_loss = experiment.train(train_dataloader)

    experiment.save_model(util.models_path('satd_complete_binary'))


def main(config, results):
    model_config = tu.load_model_config(config)
    tokenizer = tu.load_tokenizer(config, model_config)

    ds = Dataset(config, tokenizer)
    label_names = ds.label_names

    train_dataloader, valid_dataloader = ds.get_train_valid_dataloaders()

    model = tu.load_model(config, model_config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config,
                            model,
                            tokenizer,
                            label_names=label_names,
                            results=results)
    global_step, tr_loss = experiment.train(train_dataloader,
                                            valid_dataloader=valid_dataloader)
    results = experiment.results

    experiment.save_model(util.models_path('comment_code_shuffle'))

    return results


def set_defaults(config):
    config.train_epochs = 8
    config.lr = 4e-5
    config.max_seq_len = 255
    config.train_bs = 32
    config.eval_bs = 128 * 2
    config.num_labels = 2
    config.hidden_dropout_prob = 0.1
    config.logging_steps = 50
    config.model_path = util.models_path('StackOBERTflow-comments-small-v1')


def set_defaults(config):
    config.train_epochs = 3
    config.lr = 1e-5
    # config.model_path = util.models_path('android.stackexchange.com')
    # config.model_path = util.models_path('stackoverflow_1M')
    # config.model_path = 'bert-base-uncased'
    # config.model_path = util.models_path('StackOBERTflow-comments-small-v1/')
    # config.model_path = 'bert-large-uncased'
    config.max_seq_len = 255
    config.model_type = None
    config.model_path = util.models_path('StackOBERTflow-comments-small-v1')
    config.train_bs = 32
    config.eval_bs = 256
    # config.hidden_dropout_prob = 0.07
    config.num_labels = 12
    config.hidden_dropout_prob = 0.1
    config.multi_label = True
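

# Hedged sketch: `least_conf` is used by the prediction script below but is
# not defined in these snippets. Assuming the usual least-confidence score
# (1 - max class probability, so higher means less confident):
def least_conf(probs):
    # probs: (N, C) tensor of class probabilities per example
    max_probs, _ = torch.max(probs, dim=1)
    return 1.0 - max_probs
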
def main():
    config = get_config()
    with config:
        config.logging_steps = 400
        config.train_epochs = 2
        config.lr = 4e-5
        # config.lr = 1e-4
        config.model_type = 'roberta'
        config.model_path = util.models_path('satd_complete_binary')
        # config.train_head_only = True

    tokenizer = tu.load_tokenizer(config)
    model_cls = tu.get_model_cls(config)  # looked up but unused in this snippet

    df = pd.read_csv(util.data_path('satd', 'unclassified.csv'))
    # df = pd.read_csv(util.data_path('satd', 'dataset.csv'))
    df.dropna(inplace=True)
    # df.rename(columns={'classification': 'orig_classification'}, inplace=True)

    print(df.dtypes)

    print(df.head())

    df['preprocessed'] = df.commenttext.map(TDDataset.preprocess)
    df.dropna(inplace=True)
    # df = df.head(100)
    preprocessed = df.preprocessed.values
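    # The comments are unlabeled, so zero labels are passed as placeholders
    # to satisfy the dataloader helper's signature.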
    dummy_labels = np.zeros(preprocessed.shape[0])
    dataloader = tu.get_dataloader(config,
                                   tokenizer,
                                   preprocessed,
                                   dummy_labels,
                                   bs=128,
                                   shuffle=False)

    model = tu.load_model(config)
    model.to(config.device)
    util.set_seed(config)

    experiment = Experiment(config, model, tokenizer)

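    # Softmax gives class probabilities; least confidence (1 - max
    # probability) scores uncertainty; argmax picks the predicted label.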
    preds = experiment.predict(dataloader)
    preds = torch.from_numpy(preds)
    probs = F.softmax(preds, dim=1)
    uncertainty = least_conf(probs).numpy()
    labels = torch.argmax(preds, dim=1).numpy()

    df['uncertainty'] = uncertainty
    df['probs0'] = probs[:, 0].numpy()
    df['probs1'] = probs[:, 1].numpy()
    df['classification'] = labels
    df.drop('preprocessed', axis='columns', inplace=True)

    label_name_map = {i: l for i, l in enumerate(TDDataset.BINARY_LABEL_NAMES)}
    print(label_name_map)

    # convert_label = {'DEFECT': 1, 'DESIGN': 1,
    #                  'IMPLEMENTATION': 1, 'TEST': 1,
    #                  'WITHOUT_CLASSIFICATION': 0, 'DOCUMENTATION': 1}
    # df['correct'] = (df.orig_classification.map(convert_label) == df.classification)
    # print(df.correct.value_counts(normalize=True))

    df.classification = df.classification.map(label_name_map)
    df.to_csv(util.data_path('satd', 'unclassified_evaled.csv'), index=False)

    tech_debt_df = df[df.classification == 'TECHNICAL_DEBT']
    print(tech_debt_df.shape)
    tech_debt_df.to_csv(util.data_path('satd', 'unclassified_pos.csv'),
                        index=False)
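
# CLI bootstrap shared by the scripts above. Most flags default to a falsy
# value so that settings already on `config` (e.g. from set_defaults) survive
# unless overridden; flags with truthy defaults (seed, n_gpu,
# output_model_path) always take effect.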
def init_config(config, parse_args_func=None, experiment_parse_args_func=None, after_parse_func=None):
    parser = argparse.ArgumentParser()

    parser.add_argument("--lr", default=None, type=float, required=False)
    parser.add_argument("--eval_bs", default=None, type=int, required=False)
    parser.add_argument("--train_bs", default=None, type=int, required=False)
    parser.add_argument("--max_steps", default=None, type=int, required=False)
    parser.add_argument("--train_epochs", default=None, type=int, required=False)
    parser.add_argument("--logging_steps", default=None, type=int, required=False)
    parser.add_argument('--loss_label_weights', nargs='+', type=float, required=False)
    parser.add_argument('--loss_func', default='cross_entropy', type=str, required=False)
    parser.add_argument('--seed', type=int, default=42, required=False)
    parser.add_argument('--seeds', nargs='+', type=int, required=False)
    parser.add_argument('--reinit_layers', nargs='+', type=int, default=[], required=False)
    parser.add_argument("--reinit_pooler", action="store_true")
    parser.add_argument("--no_cuda", action="store_true")
    parser.add_argument("--fp16", action="store_true")
    parser.add_argument("--n_gpu", default=1, type=int)
    parser.add_argument("--out_file", default=None, type=str, required=False)

    parser.add_argument("--no_pretrain", action="store_true")

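    # Default output path: last non-empty component of model_path plus a
    # run timestamp.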
    default_output_model_path = util.models_path("{}-{}".format(
        list(filter(None, config.model_path.split("/"))).pop(),
        time.strftime("%Y_%m_%d-%H_%M_%S")
    ))

    parser.add_argument("--output_model_path",
                        default=default_output_model_path,
                        type=str,
                        required=False)

    parser.add_argument(
        "--model_path",
        default=None,
        type=str,
        required=False,
        help="Path to pre-trained model or shortcut name"
    )

    parser.add_argument(
        "--tokenizer_path",
        default=None,
        type=str,
        required=False
    )

    if parse_args_func:
        parse_args_func(parser)

    if experiment_parse_args_func:
        experiment_parse_args_func(parser)

    args = parser.parse_args()
    # Copy parsed values onto the config. Falsy values (None, False, 0, [])
    # only apply when the key is not already present, so existing config
    # entries survive unless explicitly overridden on the command line.
    for var, value in vars(args).items():
        if value or var not in config:
            config[var] = value

    config['device'] = "cuda" if not config.no_cuda else "cpu"

    if after_parse_func:
        after_parse_func(config)
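
# Usage sketch (assumed wiring; the snippets above come from separate
# entry-point scripts that follow roughly this pattern):
#
#     config = get_config()
#     set_defaults(config)
#     init_config(config)
#     main()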