# In[24]:


output_model_file = "bert_pytorch.bin"

lr=2e-5
batch_size = 32
accumulation_steps=1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained("../working",cache_dir=None,num_labels=len(y_columns))
model.zero_grad()
model = model.to(device)
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
train = train_dataset

num_train_optimization_steps = int(EPOCHS*len(train)/batch_size/accumulation_steps)

optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=lr,
                     warmup=0.05,
                     t_total=num_train_optimization_steps)

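
# A minimal sketch (an assumption, not the notebook's exact loop) of how the objects set up
# above are typically consumed; `train_loader` is a hypothetical DataLoader built from
# `train_dataset`. Losses are divided by accumulation_steps and the optimizer only steps once
# per accumulation window, which is why t_total was sized with num_train_optimization_steps.
import torch.nn.functional as F

model.train()
for epoch in range(EPOCHS):
    for step, (x_batch, y_batch) in enumerate(train_loader):
        logits = model(x_batch.to(device), labels=None)       # raw logits from BERT
        loss = F.binary_cross_entropy_with_logits(
            logits, y_batch.to(device)) / accumulation_steps  # scale for accumulation
        loss.backward()
        if (step + 1) % accumulation_steps == 0:
            optimizer.step()                                  # update once per accumulation window
            optimizer.zero_grad()
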
def predict_civility(ds, **kwargs):
    global db
    print(os.getcwd())
    ls = []
    civility = True
    while civility:
        data = []
        for doc in col.distinct('message', {'civility_class': None}):
            if len(data) < 2000:
                data.append({'message': doc})
            else:
                break

        if len(data) > 0:
            print(len(data))
            df = pd.DataFrame(data)
            df['label'] = 0
            dev_df_bert = pd.DataFrame({
                'id': range(len(df)),
                'label': df['label'],
                'alpha': ['a'] * df.shape[0],
                'text': df['message'].replace(r'\n', ' ', regex=True)
            })

            dev_df_bert.to_csv('./home/jay/airflow/dags/data/dev.tsv',
                               sep='\t',
                               index=False,
                               header=False)

            device = torch.device(
                "cuda" if torch.cuda.is_available() else "cpu")
            # This is where BERT will look for pre-trained models to load parameters from.
            CACHE_DIR = './home/jay/airflow/dags/cache/'

            # The maximum total input sequence length after WordPiece tokenization.
            # Sequences longer than this will be truncated, and sequences shorter than this will be padded.
            MAX_SEQ_LENGTH = 128

            TRAIN_BATCH_SIZE = 24
            EVAL_BATCH_SIZE = 8
            LEARNING_RATE = 2e-5
            RANDOM_SEED = 42
            GRADIENT_ACCUMULATION_STEPS = 1
            WARMUP_PROPORTION = 0.1
            OUTPUT_MODE = 'classification'
            NUM_TRAIN_EPOCHS = 1
            CONFIG_NAME = "config.json"
            WEIGHTS_NAME = "pytorch_model.bin"
            Data = 'FB20'
            DATA_DIR = "./home/jay/airflow/dags/data/"
            categories = ["Uncivil"]
            #         categories = ["Attack", "Advocacy", "Ceremonial", "CTA", "CI", "Image", "Issue"]
            for Category in categories:
                print(Category)
                TASK_NAME = Data + Category
                BERT_MODEL = TASK_NAME + '.tar.gz'

                # The output directory where the fine-tuned model and checkpoints will be written.
                OUTPUT_DIR = './home/jay/airflow/dags/outputs/' + TASK_NAME + '/'
                tokenizer = BertTokenizer.from_pretrained(OUTPUT_DIR +
                                                          'vocab.txt',
                                                          do_lower_case=False)
                processor = BinaryClassificationProcessor()
                eval_examples = processor.get_dev_examples(DATA_DIR)
                label_list = processor.get_labels()  # [0, 1] for binary classification
                num_labels = len(label_list)
                eval_examples_len = len(eval_examples)

                label_map = {label: i for i, label in enumerate(label_list)}
                eval_examples_for_processing = [
                    (example, label_map, MAX_SEQ_LENGTH, tokenizer,
                     OUTPUT_MODE) for example in eval_examples
                ]

                process_count = cpu_count() - 1
                # if __name__ == '__main__':
                #     print(f'Preparing to convert {eval_examples_len} examples..')
                #     print(f'Spawning {process_count} processes..')
                with Pool(process_count) as p:
                    eval_features = list(
                        p.imap(convert_example_to_feature,
                               eval_examples_for_processing))

                all_input_ids = torch.tensor(
                    [f.input_ids for f in eval_features], dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in eval_features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in eval_features], dtype=torch.long)
                all_label_ids = torch.tensor(
                    [f.label_id for f in eval_features], dtype=torch.long)

                eval_data = TensorDataset(all_input_ids, all_input_mask,
                                          all_segment_ids, all_label_ids)

                # Run prediction for full data
                eval_sampler = SequentialSampler(eval_data)
                eval_dataloader = DataLoader(eval_data,
                                             sampler=eval_sampler,
                                             batch_size=EVAL_BATCH_SIZE)

                # Load pre-trained model (weights)
                model = BertForSequenceClassification.from_pretrained(
                    CACHE_DIR + BERT_MODEL,
                    cache_dir=CACHE_DIR,
                    num_labels=len(label_list))
                print(label_list)

                model.to(device)

                model.eval()
                eval_loss = 0
                nb_eval_steps = 0
                preds = []

                for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
                    input_ids = input_ids.to(device)
                    input_mask = input_mask.to(device)
                    segment_ids = segment_ids.to(device)
                    label_ids = label_ids.to(device)

                    with torch.no_grad():
                        logits = model(input_ids,
                                       segment_ids,
                                       input_mask,
                                       labels=None)

                    # create eval loss and other metric required by the task

                    loss_fct = CrossEntropyLoss()
                    tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                             label_ids.view(-1))

                    eval_loss += tmp_eval_loss.mean().item()
                    nb_eval_steps += 1
                    if len(preds) == 0:
                        preds.append(logits.detach().cpu().numpy())
                    else:
                        preds[0] = np.append(preds[0],
                                             logits.detach().cpu().numpy(),
                                             axis=0)

                eval_loss = eval_loss / nb_eval_steps
                preds = preds[0]
                preds = np.argmax(preds, axis=1)
                df[Category] = preds

            del df['label']

            dc = df.to_dict('records')

            for doc in dc:
                doc['civility_class'] = []
                for c in categories:
                    if doc[c] == 1:
                        doc['civility_class'].append('uncivil')
                    else:
                        doc['civility_class'].append('civil')
                    del doc[c]
            print(len(dc))
            print(dc[0])
            print("Pushing into DB")
            ct = 0
            for doc in dc:
                for x in col.find({"message": doc['message'], 'marked': 0}):
                    # x['marked']=1
                    x['civility_class'] = doc['civility_class']
                    col.update_one({'_id': x['_id']}, {"$set": x}, True)
    return "Done"
Example #3
def main():
    train_df = pd.read_csv(TRAIN_PATH)
    train_df['male'] = np.load(
        "../input/identity-column-data/male_labeled.npy")
    train_df['female'] = np.load(
        "../input/identity-column-data/female_labeled.npy")
    train_df['homosexual_gay_or_lesbian'] = np.load(
        "../input/identity-column-data/homosexual_gay_or_lesbian_labeled.npy")
    train_df['christian'] = np.load(
        "../input/identity-column-data/christian_labeled.npy")
    train_df['jewish'] = np.load(
        "../input/identity-column-data/jewish_labeled.npy")
    train_df['muslim'] = np.load(
        "../input/identity-column-data/muslim_labeled.npy")
    train_df['black'] = np.load(
        "../input/identity-column-data/black_labeled.npy")
    train_df['white'] = np.load(
        "../input/identity-column-data/white_labeled.npy")
    train_df['psychiatric_or_mental_illness'] = np.load(
        "../input/identity-column-data/psychiatric_or_mental_illness_labeled.npy"
    )
    fold_df = pd.read_csv(FOLD_PATH)

    # y = np.where(train_df['target'] >= 0.5, 1, 0)
    y = train_df['target'].values
    y_aux = train_df[AUX_COLUMNS].values

    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True,
                                             False)
        if column != "target":
            identity_columns_new.append(column + "_bin")

    # Overall
    weights = np.ones((len(train_df), )) / 4
    # Subgroup
    weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum(
        axis=1).astype(bool).astype(np.int) / 4
    # Background Positive, Subgroup Negative
    weights += (
        ((train_df["target"].values >= 0.5).astype(bool).astype(np.int) +
         (1 - (train_df[identity_columns].fillna(0).values >= 0.5).sum(
             axis=1).astype(bool).astype(np.int))) > 1).astype(bool).astype(
                 np.int) / 4
    # Background Negative, Subgroup Positive
    weights += (
        ((train_df["target"].values < 0.5).astype(bool).astype(np.int) +
         (train_df[identity_columns].fillna(0).values >= 0.5).sum(
             axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype(
                 np.int) / 4
    loss_weight = 0.5

    with timer('preprocessing text'):
        # df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH,
                                                  cache_dir=None,
                                                  do_lower_case=False)
        X_text = convert_lines_head_tail(
            train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, head_len,
            tokenizer)
        X_text = np.array(X_text).astype("int32")
        del tokenizer
        gc.collect()

    with timer('train'):
        train_index = fold_df.fold_id != fold_id
        valid_index = fold_df.fold_id == fold_id
        X_train, y_train, y_aux_train, w_train = X_text[train_index], y[
            train_index].astype("float32"), y_aux[train_index].astype(
                "float32"), weights[train_index].astype("float32")
        X_val, y_val, y_aux_val, w_val = X_text[valid_index], y[valid_index].astype("float32"),\
                                         y_aux[valid_index].astype("float32"), weights[valid_index].astype("float32")
        test_df = train_df[valid_index]
        train_size = len(X_train)
        del X_text, y, y_aux, weights, train_index, valid_index, train_df, fold_df
        gc.collect()

        model = BertForSequenceClassification.from_pretrained(
            WORK_DIR, cache_dir=None, num_labels=n_labels)
        model.zero_grad()
        model = model.to(device)

        y_train = np.concatenate(
            (y_train.reshape(-1, 1), w_train.reshape(-1, 1), y_aux_train),
            axis=1).astype("float32")
        y_val = np.concatenate(
            (y_val.reshape(-1, 1), w_val.reshape(-1, 1), y_aux_val),
            axis=1).astype("float32")
        del w_train, w_val, y_aux_train, y_aux_val
        gc.collect()

        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.long),
            torch.tensor(y_train, dtype=torch.float32))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long),
            torch.tensor(y_val, dtype=torch.float32))
        ran_sampler = torch.utils.data.RandomSampler(train_dataset)
        len_sampler = LenMatchBatchSampler(ran_sampler,
                                           batch_size=batch_size,
                                           drop_last=False)
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_sampler=len_sampler)
        valid_loader = torch.utils.data.DataLoader(valid,
                                                   batch_size=batch_size * 2,
                                                   shuffle=False)
        del X_train, y_train, X_val, y_val
        gc.collect()
        LOGGER.info(f"done data loader setup")

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]

        num_train_optimization_steps = int(epochs * train_size / batch_size /
                                           accumulation_steps)
        total_step = int(epochs * train_size / batch_size)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=base_lr,
                             warmup=0.005,
                             t_total=num_train_optimization_steps)
        LOGGER.info(f"done optimizer loader setup")

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        # criterion = torch.nn.BCEWithLogitsLoss().to(device)
        criterion = CustomLoss(loss_weight).to(device)
        LOGGER.info(f"done amp setup")

        for epoch in range(epochs):
            LOGGER.info(f"Starting {epoch} epoch...")
            LOGGER.info(f"length {train_size} train...")
            if epoch == 1:
                for param_group in optimizer.param_groups:
                    param_group['lr'] = base_lr * gammas[1]
            tr_loss, train_losses = train_one_epoch(model,
                                                    train_loader,
                                                    criterion,
                                                    optimizer,
                                                    device,
                                                    accumulation_steps,
                                                    total_step,
                                                    n_labels,
                                                    base_lr,
                                                    gamma=gammas[2 * epoch])
            LOGGER.info(f'Mean train loss: {round(tr_loss,5)}')

            torch.save(model.state_dict(),
                       '{}_epoch{}_fold{}.pth'.format(exp, epoch, fold_id))

            valid_loss, oof_pred = validate(model, valid_loader, criterion,
                                            device, n_labels)
            LOGGER.info(f'Mean valid loss: {round(valid_loss,5)}')

        del model
        gc.collect()
        torch.cuda.empty_cache()

    test_df["pred"] = oof_pred[:, 0]
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)

    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')

    test_df.to_csv("oof.csv", index=False)

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
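
# convert_lines_head_tail() above is not shown in this example; the sketch below (an
# assumption, not the original function) illustrates the usual head+tail truncation trick:
# overly long comments keep their first head_len tokens and their last
# (max_len - head_len - 2) tokens instead of being cut only from the end.
import numpy as np


def convert_lines_head_tail_sketch(texts, max_len, head_len, tokenizer):
    tail_len = max_len - head_len - 2          # leave room for [CLS] and [SEP]
    all_ids = []
    for text in texts:
        tokens = tokenizer.tokenize(text)
        if len(tokens) > max_len - 2:
            tokens = tokens[:head_len] + tokens[-tail_len:]
        ids = tokenizer.convert_tokens_to_ids(['[CLS]'] + tokens + ['[SEP]'])
        ids += [0] * (max_len - len(ids))      # zero-pad to a fixed length
        all_ids.append(ids)
    return np.array(all_ids)
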
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForSequenceClassification

BertForSequenceClassification.from_pretrained('bert-large-uncased', cache_dir='./', num_labels=5)
BertTokenizer.from_pretrained('bert-large-uncased', cache_dir='./')

BertForSequenceClassification.from_pretrained('bert-large-cased', cache_dir='./', num_labels=5)
BertTokenizer.from_pretrained('bert-large-cased', cache_dir='./')

BertModel.from_pretrained('bert-large-cased', cache_dir='./')
BertTokenizer.from_pretrained('bert-large-cased', cache_dir='./')

BertTokenizer.from_pretrained('bert-base-uncased', cache_dir='./')
BertModel.from_pretrained('bert-base-uncased', cache_dir='./')

def load_pretrained_model(args, processor):
    label_list = processor.get_labels()
    model = BertForSequenceClassification.from_pretrained(args.bert_checkpoint_dir, num_labels=len(label_list))
    module_utils.set_requires_grad(model, False)
    return model
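
# module_utils.set_requires_grad() is defined elsewhere; a minimal sketch of what it
# presumably does is shown below: it freezes (or unfreezes) every parameter so the loaded
# BERT classifier can be used as a fixed feature extractor.
def set_requires_grad(module, requires_grad):
    for param in module.parameters():
        param.requires_grad = requires_grad
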

if sys.argv[1] == 'B':  # assumed branch; the original guard preceding this block was cut off
    test_examples = [
        InputExample('test', row.tweet, label='UNT')
        for row in test.itertuples()
    ]
    label_list = ['UNT', 'TIN']
if sys.argv[1] == 'C':
    test_examples = [
        InputExample('test', row.tweet, label='IND')
        for row in test.itertuples()
    ]
    label_list = ['IND', 'GRP', 'OTH']

tokenizer = BertTokenizer.from_pretrained(VOCAB)
if sys.argv[1] == 'C':
    model = BertForSequenceClassification.from_pretrained(MODEL,
                                                          cache_dir=cache_dir,
                                                          num_labels=3)
else:
    model = BertForSequenceClassification.from_pretrained(MODEL,
                                                          cache_dir=cache_dir,
                                                          num_labels=2)

model.load_state_dict(
    torch.load('./BERT/bert_task' + str(sys.argv[1]) + str(sys.argv[2]) +
               '.pkl'))

model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

# Prepare optimizer
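
# The example stops at the comment above; a typical continuation (an assumption, mirroring
# the grouped-parameter BertAdam setup used in the other examples on this page; the
# LEARNING_RATE, WARMUP_PROPORTION and num_train_optimization_steps names are hypothetical):
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]
optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)
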
Example #7
    def new_model(self):
        self.model = BertForSequenceClassification.from_pretrained(
            self.bert_model, num_labels=self.num_classes)
        self.__init_model()
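
# A hypothetical skeleton (an assumption; the surrounding class is not shown in the original
# example) of the attributes new_model() relies on.
class BertClassifierWrapper(object):
    def __init__(self, bert_model, num_classes, device='cpu'):
        self.bert_model = bert_model      # e.g. a model name or a fine-tuned model directory
        self.num_classes = num_classes
        self.device = device
        self.new_model()

    def __init_model(self):
        # Move the freshly loaded model to the configured device.
        self.model = self.model.to(self.device)

    def new_model(self):
        self.model = BertForSequenceClassification.from_pretrained(
            self.bert_model, num_labels=self.num_classes)
        self.__init_model()
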
Example #8
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        choices=[
                            "bert-base-uncased",
                            "bert-large-uncased",
                            "bert-base-cased",
                            "bert-large-cased",
                            "bert-base-multilingual-uncased",
                            "bert-base-multilingual-cased",
                            "bert-base-chinese",
                        ],
                        help="Bert pre-trained model selected in the list")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--labels",
                        nargs='+',
                        default=['0', '1'],
                        help="labels")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_test",
                        action='store_true',
                        help="Whether to run eval on the test set.")
    parser.add_argument("--do_distill",
                        action='store_true',
                        help="Whether to run distillation.")
    parser.add_argument("--blendcnn_channels",
                        nargs='+',
                        default=(100,) * 8,
                        help="BlendCNN channels.")
    parser.add_argument("--blendcnn_act",
                        default='relu',
                        choices=list(ACT2FN.keys()),
                        help="BlendCNN activation function.")
    parser.add_argument('--blendcnn_dropout',
                        action='store_true',
                        help="Whether to use dropout in BlendCNN")
    parser.add_argument('--blendcnn_pair',
                        action='store_true',
                        help="Whether to use BlendCNNForSequencePairClassification")
    parser.add_argument("--export_onnx",
                        action='store_true',
                        help="Whether to export model to onnx format.")
    parser.add_argument("--onnx_framework",
                        choices=[
                            "caffe2",
                        ],
                        help="Select the ONNX framework to run eval")
    parser.add_argument("--eval_interval",
                        default=1000,
                        type=int,
                        help="Specify eval interval during training.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")

    distiller.knowledge_distillation.add_distillation_args(parser)
    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "custom": lambda: CustomProcessor(args.labels),
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size / args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not any((args.do_train, args.do_eval, args.do_test, args.do_distill, args.export_onnx)):
        raise ValueError("At least one of `do_train`, `do_eval`, `do_test`, `do_distill`, `export_onnx` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    global_step = 0
    loss = 0
    output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
    onnx_model_file = os.path.join(args.output_dir, "model.onnx")
    eval_data = None

    if args.do_train:
        model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                              cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
                                                                  args.local_rank),
                                                              num_labels=num_labels)
        model = convert_model(args, model, device, n_gpu)

        tensorboard_log_dir = os.path.join(args.output_dir, './log')
        os.makedirs(tensorboard_log_dir, exist_ok=True)
        tensorboard_logger = SummaryWriter(tensorboard_log_dir)

        if args.do_eval and do_eval_or_test(args) and eval_data is None:
            eval_data = prepare(args, processor, label_list, tokenizer, 'dev')

        global_step, loss = train(args,
                                  model,
                                  output_model_file,
                                  processor,
                                  label_list,
                                  tokenizer,
                                  device,
                                  n_gpu,
                                  tensorboard_logger,
                                  eval_data)

    model_config = None
    model_embeddings = None
    if args.onnx_framework is None:
        # Load a trained model that you have fine-tuned
        if os.path.exists(output_model_file):
            model_state_dict = torch.load(output_model_file, map_location=lambda storage, loc: storage)
        else:
            model_state_dict = None
        model = BertForSequenceClassification.from_pretrained(args.bert_model,
                                                              state_dict=model_state_dict,
                                                              num_labels=num_labels)
        model_config = copy.deepcopy(model.config)
        model_embeddings = model.bert.embeddings
        model = convert_model(args, model, device, n_gpu)
    else:
        import onnx
        model = onnx.load(onnx_model_file)
        onnx.checker.check_model(model)

    if args.do_distill:
        assert model_config is not None
        assert model_embeddings is not None
        output_distilled_model_file = os.path.join(args.output_dir, DISTILLER_WEIGHTS_NAME)
        teacher = model
        model_config.hidden_act = args.blendcnn_act
        if args.blendcnn_pair:
            student = BlendCNNForSequencePairClassification(model_config,
                                                            num_labels=num_labels,
                                                            channels=(model_config.hidden_size,) +
                                                                     args.blendcnn_channels,
                                                            n_hidden_dense=(model_config.hidden_size,) * 2,
                                                            use_dropout=args.blendcnn_dropout)
        else:
            student = BlendCNN(model_config,
                               num_labels=num_labels,
                               channels=(model_config.hidden_size,) + args.blendcnn_channels,
                               n_hidden_dense=(model_config.hidden_size,) * 2,
                               use_dropout=args.blendcnn_dropout)
        student.embeddings.load_state_dict(model_embeddings.state_dict())

        student = convert_model(args, student, device, 1)
        if os.path.exists(output_distilled_model_file):
            logger.info(
                'Loading existing distilled model {}, skipping distillation'.format(output_distilled_model_file))
            student.load_state_dict(torch.load(output_distilled_model_file))
        else:
            dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
            args.kd_policy = distiller.KnowledgeDistillationPolicy(student, teacher, args.kd_temp, dlw)

            tensorboard_log_dir = os.path.join(args.output_dir, './log')
            os.makedirs(tensorboard_log_dir, exist_ok=True)
            tensorboard_logger = SummaryWriter(tensorboard_log_dir)

            if args.do_eval and do_eval_or_test(args) and eval_data is None:
                eval_data = prepare(args, processor, label_list, tokenizer, 'dev')

            global_step, loss = distill(args,
                                        output_distilled_model_file,
                                        processor,
                                        label_list,
                                        tokenizer,
                                        device,
                                        n_gpu,
                                        tensorboard_logger,
                                        eval_data)
        model = student

    if do_eval_or_test(args):
        result = {
            'global_step': global_step,
            'loss': loss
        }
        model.float()
        name = '_distiller' if args.do_distill else ''

        if args.do_eval:
            if eval_data is None:
                eval_data = prepare(args, processor, label_list, tokenizer, 'dev')
            eval_loss, eval_accuracy, eval_probs = eval(args, model, eval_data, device, verbose=True)
            np.savetxt(os.path.join(args.output_dir, 'dev{}_probs.npy'.format(name)), eval_probs)
            result.update({
                'dev{}_loss'.format(name): eval_loss,
                'dev{}_accuracy'.format(name): eval_accuracy,
            })

        if args.do_test:
            eval_data = prepare(args, processor, label_list, tokenizer, 'test')
            eval_loss, eval_accuracy, eval_probs = eval(args, model, eval_data, device, verbose=True)
            np.savetxt(os.path.join(args.output_dir, 'test{}_probs.npy'.format(name)), eval_probs)
            result.update({
                'test{}_loss'.format(name): eval_loss,
                'test{}_accuracy'.format(name): eval_accuracy,
            })

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

    if args.export_onnx:
        if not env_enabled(ENV_OPENAIGPT_GELU) or not env_enabled(ENV_DISABLE_APEX):
            raise ValueError('Both {} and {} must be 1 to properly export ONNX.'.format(ENV_OPENAIGPT_GELU,
                                                                                        ENV_DISABLE_APEX))

        if not isinstance(model, torch.nn.Module):
            raise ValueError('model is not an instance of torch.nn.Module.')

        import onnx
        import onnx.utils
        import onnx.optimizer
        dummy_input = get_dummy_input(args, processor, label_list, tokenizer, device)
        torch.onnx.export(model,
                          dummy_input,
                          onnx_model_file,
                          input_names=['input_ids', 'input_mask', 'segment_ids'],
                          output_names=['output_logit'],
                          verbose=True)
        optimized_model = onnx.optimizer.optimize(onnx.load(onnx_model_file),
                                                  [pass_ for pass_ in onnx.optimizer.get_available_passes()
                                                   if 'split' not in pass_])
        optimized_model = onnx.utils.polish_model(optimized_model)
        onnx.save(optimized_model, os.path.join(args.output_dir, 'optimized_model.onnx'))
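
# Minimal sketches (assumptions) of two small helpers the script above calls but does not
# define in this excerpt: do_eval_or_test() and env_enabled().
import os


def do_eval_or_test(args):
    # Results are only gathered when at least one evaluation flag is set.
    return args.do_eval or args.do_test


def env_enabled(name):
    # Environment toggles such as ENV_OPENAIGPT_GELU count as enabled when set to "1".
    return os.environ.get(name, '0') == '1'
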
Example #9
if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.float)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                          all_label_ids)

# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data,
                             sampler=eval_sampler,
                             batch_size=EVAL_BATCH_SIZE)

model = BertForSequenceClassification.from_pretrained(
    OUTPUT_DIR, cache_dir=CACHE_DIR, num_labels=len(label_list))

model.to(device)

model.eval()
eval_loss = 0
nb_eval_steps = 0
preds = []

for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader,
                                                          desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)
Example #10
def main():
    train_df = pd.read_csv(TRAIN_PATH).sample(train_size + valid_size,
                                              random_state=seed)

    y = np.where(train_df['target'] >= 0.5, 1, 0)
    y_aux = train_df[AUX_COLUMNS].values

    identity_columns_new = []
    for column in identity_columns + ['target']:
        train_df[column + "_bin"] = np.where(train_df[column] >= 0.5, True,
                                             False)
        if column != "target":
            identity_columns_new.append(column + "_bin")

    weights = np.ones((len(train_df), )) / 4
    weights += (train_df[identity_columns].fillna(0).values >= 0.5).sum(
        axis=1).astype(bool).astype(np.int) / 4
    weights += (
        ((train_df["target"].values >= 0.5).astype(bool).astype(np.int) +
         (train_df[identity_columns].fillna(0).values < 0.5).sum(
             axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype(
                 np.int) / 4
    weights += (
        ((train_df["target"].values < 0.5).astype(bool).astype(np.int) +
         (train_df[identity_columns].fillna(0).values >= 0.5).sum(
             axis=1).astype(bool).astype(np.int)) > 1).astype(bool).astype(
                 np.int) / 4
    loss_weight = 1.0 / weights.mean()

    with timer('preprocessing text'):
        #df["comment_text"] = [analyzer_embed(text) for text in df["comment_text"]]
        train_df['comment_text'] = train_df['comment_text'].astype(str)
        train_df = train_df.fillna(0)

    with timer('load embedding'):
        tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH,
                                                  cache_dir=None,
                                                  do_lower_case=True)
        train_lines = zip(
            train_df['comment_text'].fillna("DUMMY_VALUE").values.tolist())
        result = Parallel(n_jobs=4, backend='multiprocessing')(
            delayed(convert_line_fast)(i, max_len, tokenizer)
            for i in train_lines)
        X_text = [r[0] for r in result]
        train_lengths = [r[1] for r in result]
        #X_text, train_lengths = convert_lines(train_df["comment_text"].fillna("DUMMY_VALUE"), max_len, tokenizer)

    test_df = train_df[train_size:]

    with timer('train'):
        X_train, y_train, y_aux_train, w_train = (X_text[:train_size],
                                                  y[:train_size],
                                                  y_aux[:train_size],
                                                  weights[:train_size])
        X_val, y_val, y_aux_val, w_val = (X_text[train_size:],
                                          y[train_size:],
                                          y_aux[train_size:],
                                          weights[train_size:])
        model = BertForSequenceClassification.from_pretrained(
            WORK_DIR, cache_dir=None, num_labels=n_labels)
        model.zero_grad()
        model = model.to(device)

        y_train = np.concatenate(
            (y_train.reshape(-1, 1), w_train.reshape(-1, 1), y_aux_train),
            axis=1)
        y_val = np.concatenate(
            (y_val.reshape(-1, 1), w_val.reshape(-1, 1), y_aux_val), axis=1)

        train_dataset = torch.utils.data.TensorDataset(
            torch.tensor(X_train, dtype=torch.long),
            torch.tensor(y_train, dtype=torch.float))
        valid = torch.utils.data.TensorDataset(
            torch.tensor(X_val, dtype=torch.long),
            torch.tensor(y_val, dtype=torch.float))
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid,
                                                   batch_size=batch_size * 2,
                                                   shuffle=False)

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0},
        ]

        num_train_optimization_steps = int(epochs * train_size / batch_size /
                                           accumulation_steps)
        total_step = int(epochs * train_size / batch_size)

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=2e-5,
                             warmup=0.05,
                             t_total=num_train_optimization_steps)

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level="O1",
                                          verbosity=0)
        #criterion = torch.nn.BCEWithLogitsLoss().to(device)
        criterion = CustomLoss(loss_weight).to(device)

        LOGGER.info(f"Starting 1 epoch...")
        tr_loss, train_losses = train_one_epoch(model, train_loader, criterion,
                                                optimizer, device,
                                                accumulation_steps, total_step,
                                                n_labels)
        LOGGER.info(f'Mean train loss: {round(tr_loss,5)}')

        torch.save(model.state_dict(), '{}_dic'.format(exp))

        valid_loss, oof_pred = validate(model, valid_loader, criterion, device,
                                        n_labels)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    test_df["pred"] = oof_pred[:, 0]
    test_df = convert_dataframe_to_bool(test_df)
    bias_metrics_df = compute_bias_metrics_for_model(test_df, identity_columns)
    LOGGER.info(bias_metrics_df)

    score = get_final_metric(bias_metrics_df, calculate_overall_auc(test_df))
    LOGGER.info(f'final score is {score}')

    test_df.to_csv("oof.csv", index=False)

    xs = list(range(1, len(train_losses) + 1))
    plt.plot(xs, train_losses, label='Train loss')
    plt.legend()
    plt.xticks(xs)
    plt.xlabel('Iter')
    plt.savefig("loss.png")
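
# CustomLoss is defined outside these excerpts; the sketch below (an assumption, matching how
# y_train is packed as [target, sample_weight, aux_targets...] in both Jigsaw examples) shows
# the usual weighted-BCE formulation it stands for.
from torch import nn


class CustomLossSketch(nn.Module):
    def __init__(self, loss_weight):
        super(CustomLossSketch, self).__init__()
        self.loss_weight = loss_weight

    def forward(self, preds, targets):
        # Per-sample weighted BCE on the main toxicity target ...
        bce_main = nn.BCEWithLogitsLoss(weight=targets[:, 1:2])(preds[:, :1], targets[:, :1])
        # ... plus unweighted BCE on the auxiliary targets.
        bce_aux = nn.BCEWithLogitsLoss()(preds[:, 1:], targets[:, 2:])
        return bce_main * self.loss_weight + bce_aux
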
    device = torch.device("cuda", LOCAL_RANK)
    n_gpu = 1
    # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')

logger.info(
    "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
    format(device, n_gpu, bool(LOCAL_RANK != -1), FP16))

# Load a trained model that you have fine-tuned
output_model_file = os.path.join(OUTPUT_DIR, "pytorch_model.bin")
model_state_dict = torch.load(output_model_file,
                              map_location='cpu')  # Modify if running on GPU
model = BertForSequenceClassification.from_pretrained(
    BERT_MODEL,
    state_dict=model_state_dict,
    num_labels=len(label_list),
    multi_label=True)
model.to(device)


def eval_and_predict(examples: List[InputExample],
                     multi_label,
                     batch_size=8,
                     eval=True):
    features = convert_examples_to_features(examples,
                                            label_list,
                                            MAX_SEQ_LENGTH,
                                            tokenizer,
                                            multi_label=multi_label)
    logger.info("***** Running evaluation *****")
Example #12
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--preprocess_data",
        action='store_true',
        help="to activate the preprocessing of the data if not done yet")
    parser.add_argument("--sup_input",
                        default='data',
                        type=str,
                        required=False,
                        help="The input labelled pickle file")
    parser.add_argument(
        "--pickle_input_sup",
        default="supervised.p",
        required=False,
        help="The preprocessed supervised data to unpickle from")
    parser.add_argument("--sequence_length",
                        default=256,
                        type=int,
                        help="Length of the sequence used in the model")
    parser.add_argument(
        "--load_model",
        default=None,
        required=False,
        type=str,
        help="Name of a save model file to load and start from")
    parser.add_argument(
        "--unsup_input",
        default='data',
        type=str,
        required=False,
        help=
        "The input unlabelled pickle file. If preprocess_data is activate please enter the prefix of the files."
    )
    parser.add_argument("--uda",
                        default=True,
                        type=bool,
                        help="Whether or not to use uda.")
    parser.add_argument("--multi_gpu",
                        action='store_true',
                        help='to activate multi gpus')
    parser.add_argument("--batch_size",
                        default=4,
                        type=int,
                        help='Batch size of the labelled data')
    parser.add_argument(
        '--unsup_ratio',
        default=3,
        type=int,
        help=
        'To define the batch_size of unlabelled data, unsup_ratio * batch_size.'
    )
    parser.add_argument(
        "--gradient_accumulation",
        default=3,
        type=int,
        help="how many gradients to accumulate before stepping down.")
    parser.add_argument(
        "--lr_classifier",
        default=10e-4,
        type=float,
        help=" Learning rate applied to the last layer - classifier layer - .")
    parser.add_argument(
        "--lr_model",
        default=10e-6,
        type=float,
        help=
        "Learning rate applied to the whole model bar the classifier layer.")
    parser.add_argument('--verbose',
                        action='store_true',
                        help="to activate the printing of intermediate values")
    parser.add_argument('--tensorboard',
                        action='store_true',
                        help="to activate tensorboard on port")
    parser.add_argument("--epoch",
                        default=3,
                        type=int,
                        help="how many epochs to perform")
    parser.add_argument("--labelled_examples",
                        default=20000,
                        type=int,
                        help="how many labelled examples to learn from")
    parser.add_argument(
        "--temperature",
        default=0.85,
        type=float,
        help=
        "Set the temperature on the pre_softmax layer for unsupervisded entropy"
    )
    parser.add_argument(
        "--uda_threshold",
        default=-1,
        type=float,
        help="Set the minimal acceptable max probability for unsupervised data"
    )
    parser.add_argument(
        "--sup_threshold",
        default=0.5,
        type=float,
        help=
        "Unused ... Set the maximal acceptable correct probability for supervised data"
    )
    parser.add_argument(
        "--tsa",
        default='linear',
        type=str,
        help="Set the method to perform threshold annealing on supervised data"
    )
    parser.add_argument(
        "--test_frequency",
        default=20,
        type=int,
        help="Perform test scoring every -test_frequency- gradient steps")
    parser.add_argument("--regularisation",
                        action='store_true',
                        help="Regularize the last layer.")

    args = parser.parse_args()

    if True:  #args.tensorboard :
        train_log_dir = 'logs/'
        train_summary_writer = summary.create_file_writer(train_log_dir)

    if args.preprocess_data:
        with open(args.unsup_input + '/original.txt') as original:
            src = original.readlines()
        with open(args.unsup_input + '/paraphrase.txt') as paraphrase:
            tgt = paraphrase.readlines()
        unsupervised_data = prepare_unsupervised_data(
            src, tgt, max_seq_length=args.sequence_length)
        df_train = p.load(open(args.sup_input + '/train_label.p', 'rb'))
        df_test = p.load(open(args.sup_input + '/test_label.p', 'rb'))
        supervised_data = prepare_supervised_data(
            df_train, max_seq_length=args.sequence_length)
        test_data = prepare_supervised_data(
            df_test, max_seq_length=args.sequence_length)
        p.dump(unsupervised_data, open('unsupervised.p', 'wb'))
        p.dump(supervised_data, open(args.pickle_input_sup, 'wb'))
        p.dump(test_data, open('test.p', 'wb'))

    unsupervised_data = p.load(open('unsupervised.p', 'rb'))
    unsupervised_data = list(np.array(unsupervised_data).reshape(-1))
    supervised_data = p.load(open(args.pickle_input_sup, 'rb'))
    test_data = p.load(open('test.p', 'rb'))

    ### Load the unsupervised data as tensors
    original_input_ids = torch.tensor(
        [f.input_ids[0] for f in unsupervised_data], dtype=torch.long)
    original_input_mask = torch.tensor(
        [f.input_mask[0] for f in unsupervised_data], dtype=torch.long)
    original_segment_ids = torch.tensor(
        [f.segment_ids[0] for f in unsupervised_data], dtype=torch.long)

    augmented_input_ids = torch.tensor(
        [f.input_ids[1] for f in unsupervised_data], dtype=torch.long)
    augmented_input_mask = torch.tensor(
        [f.input_mask[1] for f in unsupervised_data], dtype=torch.long)
    augmented_segment_ids = torch.tensor(
        [f.segment_ids[1] for f in unsupervised_data], dtype=torch.long)

    ### Load the supervised data as tensors
    supervised_input_ids = torch.tensor([f.input_ids for f in supervised_data],
                                        dtype=torch.long)
    supervised_input_mask = torch.tensor(
        [f.input_mask for f in supervised_data], dtype=torch.long)
    supervised_segment_ids = torch.tensor(
        [f.segment_ids for f in supervised_data], dtype=torch.long)
    supervised_label_ids = torch.tensor([f.label_id for f in supervised_data],
                                        dtype=torch.long)

    test_input_ids = torch.tensor([f.input_ids for f in test_data],
                                  dtype=torch.long)
    test_input_mask = torch.tensor([f.input_mask for f in test_data],
                                   dtype=torch.long)
    test_segment_ids = torch.tensor([f.segment_ids for f in test_data],
                                    dtype=torch.long)
    test_label_ids = torch.tensor([f.label_id for f in test_data],
                                  dtype=torch.long)

    ### Create the datasets
    unsupervised_dataset = TensorDataset(original_input_ids, original_input_mask, original_segment_ids,\
                                    augmented_input_ids,augmented_input_mask,augmented_segment_ids)

    supervised_dataset = TensorDataset(supervised_input_ids,\
                                    supervised_input_mask,supervised_segment_ids,\
                                    supervised_label_ids)

    test_dataset = TensorDataset(test_input_ids,\
                                    test_input_mask,test_segment_ids,\
                                    test_label_ids)

    ### Training

    ### Variables
    unsup_train_batch_size = args.batch_size * args.unsup_ratio
    sup_train_batch_size = args.batch_size
    labelled_examples = args.labelled_examples
    unsup_train_sampler = RandomSampler(unsupervised_dataset)
    unsup_train_dataloader = DataLoader(unsupervised_dataset,
                                        sampler=unsup_train_sampler,
                                        batch_size=unsup_train_batch_size)

    # sup_train_sampler = RandomSampler(supervised_dataset)
    sup_subset_sampler = torch.utils.data.SubsetRandomSampler(\
                                            np.random.randint(supervised_input_ids.size(0), size=labelled_examples))
    sup_train_dataloader = DataLoader(supervised_dataset,
                                      sampler=sup_subset_sampler,
                                      batch_size=sup_train_batch_size)

    test_sampler = torch.utils.data.SubsetRandomSampler(\
                                            np.random.randint(test_input_ids.size(0), size=10000))
    test_dataloader = DataLoader(test_dataset,
                                 sampler=test_sampler,
                                 batch_size=16)

    num_labels = 2
    if args.load_model is not None:
        model = torch.load(args.load_model)
    else:
        model = BertForSequenceClassification.from_pretrained(
            'bert-large-uncased', num_labels=num_labels).to(device)

    if args.multi_gpu:
        model = nn.DataParallel(model)

    ### Parameters
    param_optimizer = list(model.module.classifier.named_parameters())
    lr = args.lr_classifier
    lr_bert = args.lr_model

    epochs = args.epoch
    accumulation_steps = args.gradient_accumulation
    uda_threshold = args.uda_threshold
    temperature = args.temperature
    tsa = True
    verbose = False

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer
                    if not any(nd in n for nd in no_decay)],
         'weight_decay': 0., 'lr': lr, 'max_grad_norm': -1},
        {'params': [p for n, p in param_optimizer
                    if any(nd in n for nd in no_decay)],
         'weight_decay': 0.01, 'lr': lr},
        {'params': model.module.bert.parameters(),
         'weight_decay': 0.01, 'lr': lr_bert, 'max_grad_norm': -1},
    ]
    #optimizer = BertAdam(optimizer_grouped_parameters)
    optimizer = torch.optim.Adam(optimizer_grouped_parameters)
    # Locally used variables
    global_step = 0
    accuracy = 0
    counter = 1
    test_counter = 0
    loss_function = CrossEntropyLoss(reduction='none')
    optimizer.zero_grad()
    best = 0
    ### TRAINING
    for epoch in range(epochs):
        for step, batch in tqdm(enumerate(unsup_train_dataloader)):
            model.train()
            ### Unsupervised Loss
            batch = tuple(t.to(device) for t in batch)
            original_input, _, _, augmented_input, _, _ = batch
            if args.regularisation:
                with torch.no_grad():
                    originals = model(original_input) / temperature
                    logits_original = F.log_softmax(
                        model.bert(original_input)[1], dim=-1)
                    entropy = -torch.exp(logits_original) * logits_original
                    with train_summary_writer.as_default():
                        tf.summary.scalar('entropy',
                                          entropy.sum(-1).mean(0).item(),
                                          step=global_step)
                    max_logits = torch.max(logits_original, dim=-1)[0]
                    if uda_threshold > 0:
                        loss_unsup_mask = torch.where(
                            max_logits.cpu() < np.log(uda_threshold),
                            torch.tensor([1], dtype=torch.uint8),
                            torch.tensor([0], dtype=torch.uint8))
                        loss_unsup_mask.to(device)
                        loss_unsup_mask = loss_unsup_mask.view(-1)
                logits_augmented = F.log_softmax(
                    model.bert(augmented_input)[1], dim=-1)
                loss_unsup = kl_for_log_probs(logits_augmented,
                                              logits_original)
                if uda_threshold > 0:
                    loss_unsup[loss_unsup_mask] = 0
                    loss_unsup = loss_unsup[loss_unsup > 0.]
                if loss_unsup.size(0) > 0:
                    loss_unsup_mean = loss_unsup.mean(-1)
                    with train_summary_writer.as_default():
                        tf.summary.scalar('Number of elements unsup',
                                          loss_unsup.size(0), global_step)
                        tf.summary.scalar('Loss_Unsup',
                                          loss_unsup_mean.item(),
                                          step=global_step)
                    loss_unsup_mean.backward()
            else:
                with torch.no_grad():
                    # Temperature-sharpened predictions on the un-augmented input
                    logits_original = F.log_softmax(
                        model(original_input) / temperature, dim=-1)
                    entropy = -torch.exp(logits_original) * logits_original
                    with train_summary_writer.as_default():
                        tf.summary.scalar('entropy',
                                          entropy.sum(-1).mean(0).item(),
                                          step=global_step)
                    max_logits = torch.max(logits_original, dim=-1)[0]
                    if uda_threshold > 0:
                        loss_unsup_mask = torch.where(
                            max_logits.cpu() < np.log(uda_threshold),
                            torch.tensor([1], dtype=torch.uint8),
                            torch.tensor([0], dtype=torch.uint8))
                        loss_unsup_mask = loss_unsup_mask.view(-1).to(device)
                logits_augmented = F.log_softmax(model(augmented_input),
                                                 dim=-1)
                loss_unsup = kl_for_log_probs(logits_augmented,
                                              logits_original)
                if uda_threshold > 0:
                    loss_unsup[loss_unsup_mask] = 0
                    loss_unsup = loss_unsup[loss_unsup > 0.]
                if loss_unsup.size(0) > 0:
                    loss_unsup_mean = loss_unsup.mean(-1)
                    with train_summary_writer.as_default():
                        tf.summary.scalar('Number of elements unsup',
                                          loss_unsup.size(0), global_step)
                        tf.summary.scalar('Loss_Unsup',
                                          loss_unsup_mean.item(),
                                          step=global_step)
                    loss_unsup_mean.backward()
            ### Cleaning
            if loss_unsup.size(0) > 0:
                del loss_unsup_mean
            del loss_unsup
            del logits_original
            del logits_augmented
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

            ### Supervised Loss
            for i, batch_sup in enumerate(sup_train_dataloader):
                if counter % (i + 1) == 0:
                    batch_sup = tuple(t.to(device) for t in batch_sup)
                    input_ids, input_mask, segment_ids, label_ids = batch_sup
                    # tf.summary.scalar('learning rate', np.max(optimizer.get_lr(), step=global_step)
                    logits = model(input_ids)
                    loss_sup = loss_function(logits.view(-1, 2),
                                             label_ids.view(-1))

                    with torch.no_grad():
                        outputs = F.softmax(logits, dim=-1)
                        sentiment_corrects = torch.sum(
                            torch.max(outputs, -1)[1] == label_ids)
                        sentiment_acc = sentiment_corrects.double(
                        ) / sup_train_batch_size
                        accuracy += sentiment_acc
                        #accuracy_temp = accuracy/step
                    with train_summary_writer.as_default():
                        tf.summary.scalar('Batch_score',
                                          sentiment_acc.item(),
                                          step=global_step)
                        #tf.summary.scalar('Global_score', accuracy_temp.item(), step=global_step)
                    number_of_elements = outputs.size(0)

                    ### Threshold Annealing
                    if tsa:
                        tsa_start = 1. / num_labels
                        tsa_threshold = get_tsa_threshold(
                            global_step=global_step, num_train_step=3000,
                            start=tsa_start, end=1., schedule='linear', scale=5)
                        probas = torch.gather(
                            outputs, dim=-1,
                            index=label_ids.unsqueeze(1)).cpu()
                        # Mask out examples the model already predicts above the TSA threshold
                        loss_mask = torch.where(
                            probas > tsa_threshold,
                            torch.tensor([1], dtype=torch.uint8),
                            torch.tensor([0], dtype=torch.uint8))
                        loss_mask = loss_mask.view(-1).to(device)
                        with train_summary_writer.as_default():
                            tf.summary.scalar('tsa_threshold', tsa_threshold,
                                              global_step)
                            tf.summary.scalar('loss_sup',
                                              loss_sup.mean(-1).item(),
                                              step=global_step)
                        loss_sup[loss_mask] = 0.
                        number_of_elements = loss_mask.size(0) - loss_mask.sum(
                            0)
                        if verbose:
                            print('outputs', outputs)
                            print('tsa_threshold', tsa_threshold)
                            print('label_ids', label_ids)
                            print('probas', probas)
                            print('mask', loss_mask)
                            print('post_loss', loss_sup)
                            print('number_of_elements : ',
                                  loss_mask.size(0) - loss_mask.sum(0))

                    if number_of_elements > 0:
                        loss_sup = loss_sup[loss_sup > 0.]
                        nb_elements = loss_sup.size(0)
                        loss_sup = loss_sup.mean(-1)
                        loss_sup.backward()
                    else:
                        nb_elements = 0
                        loss_sup = torch.tensor([0.])
                    with train_summary_writer.as_default():
                        tf.summary.scalar('nb_elements_sup', nb_elements,
                                          global_step)
                        tf.summary.scalar('Post_loss',
                                          loss_sup.item(),
                                          step=global_step)

                        #tf.summary.scalar('Learning Rate',optimizer.get_lr()[0], step=global_step)
                    # loss_sup.backward()
                    #else:
                    #   loss_sup = torch.tensor([0.])
            ### Cleaning

                    del loss_sup
                    del logits
                    gc.collect()
                    torch.cuda.empty_cache()
                    torch.cuda.ipc_collect()
                    counter += 1
                    if counter > labelled_examples + 1:
                        counter = 1
                    break
                else:
                    gc.collect()
                    torch.cuda.empty_cache()
                    torch.cuda.ipc_collect()
                    continue

            ### Accumulation Steps and Gradient steps
            if (step + 1) % accumulation_steps == 0:
                torch.nn.utils.clip_grad_value_(model.parameters(), 1)
                optimizer.step()
                optimizer.zero_grad()

            ### Test set and Evaluation  every x gradient steps
            if (step + 1) % 100 == 0:
                loss = []
                sentiment_test_acc = 0
                for test_step, test_batch in enumerate(test_dataloader):
                    test_batch = tuple(t.to(device) for t in test_batch)
                    input_ids, input_mask, segment_ids, label_ids = test_batch
                    with torch.no_grad():
                        logits = model(input_ids)
                        loss_test = loss_function(logits.view(-1, 2),
                                                  label_ids.view(-1)).mean(-1)
                        with train_summary_writer.as_default():
                            tf.summary.scalar(
                                'Test_loss_continuous',
                                loss_test.item(),
                                step=test_step +
                                test_counter * len(test_dataloader))
                        loss.append(loss_test.item())
                        outputs = F.softmax(logits, dim=-1)
                        sentiment_corrects = torch.sum(
                            torch.max(outputs, -1)[1] == label_ids)
                        sentiment_test_acc += sentiment_corrects.double()
                sentiment_test_acc = sentiment_test_acc / len(test_dataloader)
                with train_summary_writer.as_default():
                    tf.summary.scalar('Test_score',
                                      sentiment_test_acc.item() / 16,
                                      step=global_step)
                    tf.summary.scalar('test_loss',
                                      np.array(loss).mean(),
                                      step=global_step)
                    tf.summary.scalar('test_loss_std',
                                      np.array(loss).std(),
                                      step=global_step)
                test_counter += 1
                print('best_score', best)
                if sentiment_test_acc.item() / 16 > best:
                    model_to_save = model.module if hasattr(
                        model, 'module') else model
                    torch.save(model_to_save, "best_model_score.pt")
                    best = sentiment_test_acc.item() / 16

            ### Increase the global step tracker
            global_step += 1
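
# The loop above calls two helpers that are not defined in this snippet:
# kl_for_log_probs() and get_tsa_threshold(). The sketches below are assumptions
# about their behaviour, modelled on the UDA (Unsupervised Data Augmentation)
# reference implementation; the project's own helpers may differ in detail.
import numpy as np
import torch


def kl_for_log_probs(log_p, log_q):
    # Per-example KL(p || q) for two [batch, num_classes] log-probability tensors.
    p = torch.exp(log_p)
    return (p * (log_p - log_q)).sum(dim=-1)


def get_tsa_threshold(global_step, num_train_step, start, end,
                      schedule='linear', scale=5):
    # Training Signal Annealing: ramp the confidence threshold from `start` to
    # `end` over `num_train_step` optimisation steps.
    progress = min(float(global_step) / max(float(num_train_step), 1.0), 1.0)
    if schedule == 'linear':
        coeff = progress
    elif schedule == 'exp':
        coeff = np.exp((progress - 1.0) * scale)
    else:  # 'log' schedule
        coeff = 1.0 - np.exp(-progress * scale)
    return float(coeff * (end - start) + start)
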
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run prediction on a given dataset.")
    parser.add_argument("--input_file_for_pred",
                        default=None,
                        type=str,
                        help="File to run prediction on.")
    parser.add_argument("--output_file_for_pred",
                        default=None,
                        type=str,
                        help="File to output predictions into.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumualte before performing a backward/update pass."
    )
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )

    args = parser.parse_args()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "anli": AnliProcessor,
        "anli3": AnliProcessor3Option,
        'anli_csk': AnliWithCSKProcessor,
        'bin_anli': BinaryAnli,
        'wsc': WSCProcessor
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info("device %s n_gpu %d distributed training %r", device, n_gpu,
                bool(args.local_rank != -1))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    if task_name == 'bin_anli':
        model = BertForSequenceClassification.from_pretrained(
            args.bert_model, len(label_list))
    else:
        model = BertForMultipleChoice.from_pretrained(args.bert_model,
                                                      len(label_list),
                                                      len(label_list))
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                           for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                           for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in param_optimizer if n not in no_decay],
        'weight_decay_rate':
        0.01
    }, {
        'params': [p for n, p in param_optimizer if n in no_decay],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0

    model_save_path = os.path.join(args.output_dir, "bert-finetuned.model")
    tr_loss = None
    if args.do_train:
        if task_name.lower().startswith(
                "anli") or task_name.lower().startswith("wsc"):
            train_features = convert_examples_to_features_mc(
                train_examples, label_list, args.max_seq_length, tokenizer)
        else:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            status_tqdm = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(status_tqdm):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss, _ = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1
                status_tqdm.set_description_str(
                    "Iteration / Training Loss: {}".format(
                        (tr_loss / nb_tr_examples)))

        torch.save(model, model_save_path)

    if args.do_eval:
        if args.do_predict and args.input_file_for_pred is not None:
            eval_examples = processor.get_examples_from_file(
                args.input_file_for_pred)
        else:
            eval_examples = processor.get_dev_examples(args.data_dir)
        if task_name.lower().startswith(
                "anli") or task_name.lower().startswith("wsc"):
            eval_features = convert_examples_to_features_mc(
                eval_examples, label_list, args.max_seq_length, tokenizer)
        else:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info(
            "***** Loading model from: {} *****".format(model_save_path))
        model = torch.load(model_save_path)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        eval_predictions = []
        eval_pred_probs = []

        logger.info("***** Predicting ... *****".format(model_save_path))

        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss, logits = model(input_ids, segment_ids,
                                              input_mask, label_ids)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_predictions.extend(np.argmax(logits, axis=1).tolist())

            eval_pred_probs.extend([_compute_softmax(list(l)) for l in logits])

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples

        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'global_step': global_step,
            'loss': tr_loss / nb_tr_steps if tr_loss is not None else 0.0
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        if task_name == "wsc":
            pred_examples = list(TsvIO.read(args.input_file_for_pred))

        else:
            pred_examples = read_jsonl_lines(args.input_file_for_pred)

        logger.info("***** Eval predictions *****")
        for record, pred, probs in zip(pred_examples, eval_predictions,
                                       eval_pred_probs):
            record['bert_prediction'] = pred
            record['bert_correct'] = pred == (
                int(record[processor.label_field()]) - 1)
            record['bert_pred_probs'] = probs

        write_items([json.dumps(r) for r in pred_examples],
                    args.output_file_for_pred)
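
# The evaluation loop above uses accuracy() and _compute_softmax(), which are
# defined elsewhere in the project. Rough sketches of what they are expected to
# do (assumed, patterned on the classic pytorch-pretrained-bert example scripts):
import math

import numpy as np


def accuracy(out, labels):
    # Count of correct predictions for a batch of logits.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def _compute_softmax(scores):
    # Softmax over a plain Python list of scores, returned as a list of floats.
    if not scores:
        return []
    max_score = max(scores)
    exp_scores = [math.exp(s - max_score) for s in scores]
    total = sum(exp_scores)
    return [x / total for x in exp_scores]
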

# Example 14

label_list = processor.get_labels() 
num_labels = len(label_list)

num_train_optimization_steps = int(train_examples_len / TRAIN_BATCH_SIZE / GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
label_map = {label: i for i, label in enumerate(label_list)}
train_examples_for_processing = [(example, label_map, MAX_SEQ_LENGTH, tokenizer, OUTPUT_MODE) for example in train_examples]



process_count = cpu_count() - 1
with Pool(process_count) as p:
    train_features = list(
        tqdm_notebook(p.imap(convert_examples_to_features.convert_example_to_feature,
                             train_examples_for_processing),
                      total=train_examples_len))

model = BertForSequenceClassification.from_pretrained(BERT_MODEL, cache_dir=CACHE_DIR, num_labels=num_labels)
model.to(device)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]


optimizer = BertAdam(optimizer_grouped_parameters,
                     lr=LEARNING_RATE,
                     warmup=WARMUP_PROPORTION,
                     t_total=num_train_optimization_steps)
    test_inputs = torch.tensor(input_ids)
    test_masks = torch.tensor(attention_masks)
    fake_ids = torch.Tensor(fake_ids)
    print("set batch size")
    batch_size = 24


    test_data = TensorDataset(test_inputs, test_masks, fake_ids)

    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    print("finished setting batch size")

    model = BertForSequenceClassification.from_pretrained(output_dir, num_labels=2)
    model.cuda()

    # Put model in evaluation mode to evaluate loss on the validation set
    model.eval()

    # Predict data by minibatch
    if True:
        output_predictions = []
        batch_cnt = 0
        for batch in test_dataloader:
            # Add batch to GPU
            batch = tuple(t.to(device) for t in batch)
            # Unpack the inputs from our dataloader
            b_input_ids, b_input_mask, b_idstrs = batch
            # Telling the model not to compute or store gradients, saving memory and speeding up validation
Y_Val = train_df[target_column].values[num_to_load:]

train_dataset = torch.utils.data.TensorDataset(
    torch.tensor(X, dtype=torch.long), torch.tensor(Y, dtype=torch.float))

output_model_file = 'bert_pytorch.bin'

lr = 2e-5
batch_size = 32
accumulation_step = 2
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

model = BertForSequenceClassification.from_pretrained(
    "./", cache_dir=None, num_labels=len(target_column))
model.zero_grad()
model = model.to(device)
params = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params': [p for n, p in params if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params': [p for n, p in params if any(nd in n for nd in no_decay)],
    'weight_decay':
    0.0
}]
train = train_dataset
    train_batch_size = 32
    eval_batch_size = 128
    train_batch_size = train_batch_size // gradient_accumulation_steps
    output_dir = OutputDir
    num_train_epochs = NUMofEPOCH
    num_train_optimization_steps = int(
        len(TrainExamples) / train_batch_size /
        gradient_accumulation_steps) * num_train_epochs
    cache_dir = CacheDir
    learning_rate = LearningRate
    warmup_proportion = 0.1
    max_seq_length = MAXSEQLEN

    # Load model
    tokenizer = BertTokenizer.from_pretrained(BERTModel)
    Model = BertForSequenceClassification.from_pretrained(
        BERTModel, cache_dir=cache_dir, num_labels=len(LabelList))
    Model.to(device)
    if n_gpu > 1:
        Model = torch.nn.DataParallel(Model)

    # Load a trained model and config that you have fine-tuned
    # tokenizer = BertTokenizer.from_pretrained(BERTModel)
    # config = BertConfig(load_config_file)
    # Model = BertForSequenceClassification(config, num_labels = len(LabelList))
    # Model.load_state_dict(torch.load(load_model_file))
    # Model.to(device)  # important to specific device
    # if n_gpu > 1:
    # 	Model = torch.nn.DataParallel(Model)

    # Prepare optimizer
    param_optimizer = list(Model.named_parameters())

# Example 18

train_examples_len = len(train_features)
print("Train features count: ", train_examples_len)

num_train_optimization_steps = ceil(
    train_examples_len / TRAIN_BATCH_SIZE /
    GRADIENT_ACCUMULATION_STEPS) * NUM_TRAIN_EPOCHS

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

print('n_gpu:', n_gpu)

torch.cuda.manual_seed_all(RANDOM_SEED)

model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                      cache_dir='cache',
                                                      num_labels=num_labels)

model.to(device)
if n_gpu > 1:
    model = torch.nn.DataParallel(model)

param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
    'weight_decay':
    0.01
}, {
    'params':

# Example 19

def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "arg": ArgProcessor,
    }

    num_labels_task = {
        "cola": 2,
        "mnli": 3,
        "mrpc": 2,
        # "arg": 2,
        "arg": 3,
    }

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    num_labels = num_labels_task[task_name]
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    model_state_dict = torch.load('models/pytorch_model.bin', map_location=torch.device('cpu'))
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(PYTORCH_PRETRAINED_BERT_CACHE,
                                                                   'distributed_{}'.format(args.local_rank))
    model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict,
                                                          num_labels=num_labels)
    # model.load_state_dict(torch.load('./models/pytorch_model1.bin', map_location=torch.device('cpu')))

    # cache_dir=cache_dir,

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    if args.fp16:
        try:
            from apex.optimizers import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps,
                                                                          args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

    if args.do_train:
        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
        Path(str(Path.cwd() / "data" / "output")).mkdir(parents=True, exist_ok=True)
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Load a trained model and config that you have fine-tuned
        config = BertConfig(output_config_file)
        model = BertForSequenceClassification(config, num_labels=num_labels)
        model.load_state_dict(torch.load(output_model_file, map_location=torch.device('cpu')))
    else:
        # model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
        # When not training, load the previously fine-tuned weights instead of the
        # stock pre-trained checkpoint.
        model_state_dict = torch.load('./models/pytorch_model.bin', map_location=torch.device('cpu'))
        model = BertForSequenceClassification.from_pretrained(args.bert_model, state_dict=model_state_dict,
                                                              num_labels=num_labels)
    model.to(device)
    pred, prob = [], []
    gold = []
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)
            for a, b in zip(logits, label_ids):
                pred.append(np.argmax(a))
                gold.append(b)
            # prob.append(a)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}
        print(classification_report(gold, pred))
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            f = open('predictions.txt', 'w')
            for line1 in pred:
                f.write(str(line1) + '\n')
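
# The fp16 branch of the training loop above rescales the learning rate with
# warmup_linear(), which this snippet does not define. In the pytorch_pretrained_bert
# releases of that era the helper looked roughly like this (later versions replace it
# with schedule classes such as WarmupLinearSchedule):
def warmup_linear(x, warmup=0.002):
    # Linear warmup to the peak learning rate, then linear decay to zero.
    if x < warmup:
        return x / warmup
    return 1.0 - x
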
api = KhaiiiApi()

# Declare the tokenizer
tokenizer = BertTokenizer.from_pretrained("./vocab.korean_morp.list",
                                          do_lower_case=False)

# Maximum number of tokens per input
MAX_LEN = 256

# Select the device
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Create the BERT model for classification
model = BertForSequenceClassification.from_pretrained(
    "/home/jupyter/pytorch-korbert", num_labels=2)

# Load the fine-tuned model checkpoint
checkpoint = torch.load('./kor_bert_senti1', map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])


# update module
def update(changing_doc):
    for i in range(len(changing_doc['messages'])):
        if ((changing_doc['messages'][i]['sender'] != 'admin')
                and (changing_doc['messages'][i]['complain'] == 100)):
            changing_doc['messages'][i]['complain'] = int(
                analysis(changing_doc['messages'][i]['message']) * 100)
        else:
            continue
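
# update() above calls analysis(), which is not shown in this snippet. Below is a
# minimal sketch of what it plausibly does with the objects created earlier (api,
# tokenizer, model, device, MAX_LEN): morpheme-analyse one message with Khaiii,
# encode it for the morpheme-level KorBERT vocabulary, and return the probability
# of the "complain" class. The exact "lexeme/POS-tag" formatting is an assumption.
import torch
import torch.nn.functional as F


def analysis(message):
    model.to(device)
    model.eval()
    # Convert the sentence into space-separated "morpheme/tag" units
    morphs = []
    for word in api.analyze(message):
        for morph in word.morphs:
            morphs.append('{}/{}'.format(morph.lex, morph.tag))
    tokens = ['[CLS]'] + tokenizer.tokenize(' '.join(morphs))[:MAX_LEN - 2] + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    attention_mask = [1] * len(input_ids) + [0] * (MAX_LEN - len(input_ids))
    input_ids = input_ids + [0] * (MAX_LEN - len(input_ids))
    input_tensor = torch.tensor([input_ids], dtype=torch.long).to(device)
    mask_tensor = torch.tensor([attention_mask], dtype=torch.long).to(device)
    with torch.no_grad():
        logits = model(input_tensor, attention_mask=mask_tensor)
    return F.softmax(logits, dim=-1)[0, 1].item()
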

# Example 21

if OUTPUT_MODE == "classification":
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
elif OUTPUT_MODE == "regression":
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.float)

eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                          all_label_ids)

# Run prediction for full data
eval_sampler = SequentialSampler(eval_data)
eval_dataloader = DataLoader(eval_data,
                             sampler=eval_sampler,
                             batch_size=EVAL_BATCH_SIZE)

# Load pre-trained model (weights)
model = BertForSequenceClassification.from_pretrained(
    CACHE_DIR + BERT_MODEL, cache_dir=CACHE_DIR, num_labels=len(label_list))
model.to(device)

model.eval()
eval_loss = 0
nb_eval_steps = 0
preds = []
# Run prediction over the evaluation set
for input_ids, input_mask, segment_ids, label_ids in tqdm_notebook(
        eval_dataloader, desc="Evaluating"):
    input_ids = input_ids.to(device)
    input_mask = input_mask.to(device)
    segment_ids = segment_ids.to(device)
    label_ids = label_ids.to(device)

    with torch.no_grad():

# Example 22

import numpy as np

torch.backends.cudnn.deterministic = True
torch.cuda.manual_seed(seed)
torch.manual_seed(seed)
np.random.seed(seed)

tokenizer = BertTokenizer.from_pretrained(model_type)

# ## Load Pre-Trained BERT Model

from pytorch_pretrained_bert import BertForSequenceClassification, BertAdam

print('loading model')
model = BertForSequenceClassification.from_pretrained(model_type,
                                                      cache_dir=None,
                                                      num_labels=1)

# ## Fine-Tune BERT

from torch.nn import functional as F
from tqdm import tqdm, trange

train_optimization_steps = int(epochs * len(dataset) / batch_size /
                               accumulation_steps)

param_optimizer = list(model.named_parameters())

optimizer_grouped_parameters = [{
    'params':
    [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],

# Example 23

    all_tokens = []
    longer = 0
    for text in tqdm_notebook(example):
        tokens_a = tokenizer.tokenize(text)
        if len(tokens_a) > max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
            longer += 1
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"] + tokens_a + ["[SEP]"]) + [0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)
    print(f"There are {longer} lines longer than {max_seq_length}")
    return np.array(all_tokens)

device = torch.device('cpu')
BERT_MODEL_PATH = Path('/content/drive/My Drive/cb1bert/uncased_L-12_H-768_A-12/')
model = BertForSequenceClassification.from_pretrained(WORK_DIR,
                                                          cache_dir=None,
                                                          num_labels=3)
model.load_state_dict(torch.load('/content/drive/My Drive/cb1bert/bert_pytorch.bin', map_location='cpu'))

def test():
  tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_PATH, cache_dir=None,
                                                do_lower_case=True)
  print("Enter title")
  title = input()
  print("Enter text")
  text = input()

  # initialise input data as a dict of lists
  input_data = {'title':[title], 'text':[text]} 
    
  # Create DataFrame 
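
# The ensemble loop below calls seed_everything(), a common helper that is not
# defined in this snippet. A sketch of the usual implementation (assumed):
import os
import random

import numpy as np
import torch


def seed_everything(seed=1234):
    # Make Python, NumPy and PyTorch (CPU + GPU) runs reproducible.
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
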
for index in range(NUM_MODEL):

    seed_everything(seed + index)

    x_train_fold = torch.tensor(x_train, dtype=torch.long)
    y_train_fold = torch.tensor(y_train, dtype=torch.float)

    train_data = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)

    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

    print("model: {}".format(index))

    net = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                        num_labels=6)
    ## load pre-trained model weights
    #   net.load_state_dict(torch.load("../input/bert-model3/bert_pytorch_v3.pt"))
    net.load_state_dict(
        torch.load("../input/pytorch-943-bert/bert_pytorch.pt"))
    net.cuda()

    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='mean')

    param_optimizer = list(net.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01

# Example 25

def train(args, train_dataloader, valid_dataloader, num_train_examples):
    device = torch.device(args.device)
    model = BertForSequenceClassification.from_pretrained(
        args.bert_model,
        #cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(-1),
        num_labels=args.class_size).to(device)
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    num_train_steps = int(num_train_examples / args.batch_size / 1 *
                          args.max_epoch)
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)
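    # BertAdam handles the learning-rate schedule internally: it warms up
    # linearly over the first warmup_proportion * t_total steps and then decays
    # linearly, so no separate scheduler is attached here.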

    if args.fl_loss:
        others_idx = 0
        alpha = [(1. - args.fl_alpha) / 3.] * args.class_size
        alpha[others_idx] = args.fl_alpha
        criterion = FocalLoss(gamma=args.fl_gamma,
                              alpha=alpha,
                              size_average=True)
    else:
        criterion = nn.CrossEntropyLoss()

    writer = SummaryWriter(log_dir='runs/' + args.model_time)

    model.train()

    acc, loss, size, last_epoch = 0, 0, 0, -1
    max_dev_acc = 0
    max_dev_f1 = 0
    best_model = None

    print("tarining start")
    for epoch in range(args.max_epoch):
        print('epoch: ', epoch + 1)
        for i, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            input_ids, input_mask, segment_ids, label_ids = batch
            pred = model(input_ids, segment_ids, input_mask)

            optimizer.zero_grad()

            batch_loss = criterion(pred, label_ids)
            loss += batch_loss.item()

            batch_loss.backward()
            optimizer.step()

            _, pred = pred.max(dim=1)
            acc += (pred == label_ids).sum().float().cpu().item()
            size += len(pred)

            if (i + 1) % args.print_every == 0:
                acc = acc / size
                c = (i + 1) // args.print_every
                writer.add_scalar('loss/train', loss, c)
                writer.add_scalar('acc/train', acc, c)
                print(
                    f'{i+1} steps - train loss: {loss:.3f} / train acc: {acc:.3f}'
                )
                acc, loss, size = 0, 0, 0

            if (i + 1) % args.validate_every == 0:
                c = (i + 1) // args.validate_every
                dev_loss, dev_acc, dev_f1 = test(model, valid_dataloader,
                                                 criterion, args, device)
                if dev_acc > max_dev_acc:
                    max_dev_acc = dev_acc
                if dev_f1 > max_dev_f1:
                    max_dev_f1 = dev_f1
                    best_model = copy.deepcopy(model.state_dict())
                writer.add_scalar('loss/dev', dev_loss, c)
                writer.add_scalar('acc/dev', dev_acc, c)
                writer.add_scalar('f1/dev', dev_f1, c)
                print(
                    f'dev loss: {dev_loss:.4f} / dev acc: {dev_acc:.4f} / dev f1: {dev_f1:.4f} '
                    f'(max dev acc: {max_dev_acc:.4f} / max dev f1: {max_dev_f1:.4f})'
                )
                model.train()

    writer.close()
    return best_model, max_dev_f1
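
# A hedged usage sketch for train() above: the attribute names mirror what the
# function body accesses, but the concrete values and the Namespace construction
# are illustrative assumptions, not part of the original source.
from argparse import Namespace

example_args = Namespace(
    device='cuda' if torch.cuda.is_available() else 'cpu',
    bert_model='bert-base-uncased',
    class_size=4,
    batch_size=32,
    max_epoch=3,
    learning_rate=2e-5,
    warmup_proportion=0.1,
    fl_loss=False,       # fall back to plain CrossEntropyLoss instead of FocalLoss
    fl_alpha=0.25,
    fl_gamma=2.0,
    print_every=100,
    validate_every=500,
    model_time='example_run',
)
# best_state, best_f1 = train(example_args, train_dataloader, valid_dataloader, num_train_examples)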

# ## Example #26

labels = [i for i in range(4)]
target_names = list(set(labels))
label2idx = {label: idx for idx, label in enumerate(target_names)}

TARGET_NAME_PATH = os.path.join(os.path.expanduser("~"), "target_names.json")

with open(TARGET_NAME_PATH, "w") as o:
    json.dump(target_names, o)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BERT_MODEL = "bert-base-multilingual-uncased"
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL, do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(BERT_MODEL, num_labels = 4)
model.to(device)
logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)
MAX_SEQ_LENGTH=100

class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
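

# A minimal sketch (the helper name is illustrative, not from the original
# source) of how a raw sentence could be packed into an InputFeatures instance
# with the tokenizer and MAX_SEQ_LENGTH defined above; real pipelines usually
# wrap this in a convert_examples_to_features() loop.
def example_to_feature(text, label_id, tokenizer, max_seq_length=MAX_SEQ_LENGTH):
    tokens = ["[CLS]"] + tokenizer.tokenize(text)[:max_seq_length - 2] + ["[SEP]"]
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)      # 1 for real tokens, 0 for padding
    padding = [0] * (max_seq_length - len(input_ids))
    input_ids += padding
    input_mask += padding
    segment_ids = [0] * max_seq_length     # single-sentence input -> all zeros
    return InputFeatures(input_ids=input_ids,
                         input_mask=input_mask,
                         segment_ids=segment_ids,
                         label_id=label_id)

# e.g. feat = example_to_feature("an example sentence", label_id=0, tokenizer=tokenizer)
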
# device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
n_gpu = torch.cuda.device_count()
if n_gpu > 1:
    logger.info(f"let's use {n_gpu} gpu")

# random seed
random.seed(44)
np.random.seed(44)
torch.manual_seed(44)
if n_gpu > 1:
    torch.cuda.manual_seed_all(44)


model = BertForSequenceClassification.from_pretrained(pretrained_model_name_or_path=bert_model_path,
                                                          cache_dir=bert_data_path,
                                                          num_labels=num_class)
model.to(device)

if n_gpu > 1:
    model = torch.nn.DataParallel(model)


# optim
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]

# ## Example #28

def train_Pytorch_BERT(texts, targets):
    epochs = 1  # No Time for more..
    MAX_LEN = 35
    batch_size = 32
    nb_labels = 2  # OR len(input_data['target'].unique())

    train_x, test_x, train_y, test_y = train_test_split(texts,
                                                        targets,
                                                        test_size=0.1,
                                                        random_state=42)

    device_name = tf.test.gpu_device_name()
    if device_name != '/device:GPU:0':
        raise SystemError('GPU device not found')
    print('Found GPU at: {}'.format(device_name))

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    torch.cuda.get_device_name(0)

    sentences = ["[CLS] " + str(tweet) + " [SEP]" for tweet in train_x]
    print(sentences[0])

    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                              do_lower_case=True)
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
    print("Tokenize the first sentence:")
    print(tokenized_texts[0])

    #input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
    #                          maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")
    # Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids,
                              maxlen=MAX_LEN,
                              dtype="long",
                              truncating="post",
                              padding="post")

    attention_masks = []
    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(
        input_ids, train_y, random_state=2018, test_size=0.1)
    train_masks, validation_masks, _, _ = train_test_split(attention_masks,
                                                           input_ids,
                                                           random_state=2018,
                                                           test_size=0.1)

    train_inputs = torch.tensor(train_inputs)
    validation_inputs = torch.tensor(validation_inputs)
    train_labels = torch.tensor(train_labels)
    validation_labels = torch.tensor(validation_labels)
    train_masks = torch.tensor(train_masks)
    validation_masks = torch.tensor(validation_masks)

    train_data = TensorDataset(train_inputs, train_masks, train_labels)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=batch_size)
    validation_data = TensorDataset(validation_inputs, validation_masks,
                                    validation_labels)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data,
                                       sampler=validation_sampler,
                                       batch_size=batch_size)

    model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                          num_labels=nb_labels)
    model.cuda()

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
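    # Note: 'gamma' and 'beta' only match the LayerNorm parameter names of very
    # early pytorch-pretrained-bert releases; newer versions expose them as
    # 'LayerNorm.weight' / 'LayerNorm.bias' (as in the other snippets here), in
    # which case this filter quietly applies weight decay to LayerNorm as well.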
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]

    optimizer = BertAdam(optimizer_grouped_parameters, lr=2e-5, warmup=.1)

    # Function to calculate the accuracy of our predictions vs labels
    def flat_accuracy(preds, labels):
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    train_loss_set = []
    for _ in trange(epochs, desc="Epoch"):

        model.train()
        tr_loss = 0
        nb_tr_examples, nb_tr_steps = 0, 0

        for step, batch in enumerate(train_dataloader):
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            optimizer.zero_grad()
            loss = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask,
                         labels=b_labels)
            train_loss_set.append(loss.item())
            loss.backward()
            optimizer.step()
            tr_loss += loss.item()
            nb_tr_examples += b_input_ids.size(0)
            nb_tr_steps += 1
        print("Train loss: {}".format(tr_loss / nb_tr_steps))

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in validation_dataloader:
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels = batch
            with torch.no_grad():
                logits = model(b_input_ids,
                               token_type_ids=None,
                               attention_mask=b_input_mask)
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            tmp_eval_accuracy = flat_accuracy(logits, label_ids)
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1
        print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))

    plt.figure(figsize=(15, 5))
    plt.title("Training loss")
    plt.xlabel("Batch")
    plt.ylabel("Loss")
    plt.plot(train_loss_set)
    plt.show()

    #######################
    ###Evaluation part#####
    #######################

    sentences = ["[CLS] " + str(query) + " [SEP]" for query in test_x]
    labels = test_y
    tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]

    # input_ids = pad_sequences([tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts],
    #                           maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

    input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]
    input_ids = pad_sequences(input_ids,
                              maxlen=MAX_LEN,
                              dtype="long",
                              truncating="post",
                              padding="post")
    attention_masks = []

    for seq in input_ids:
        seq_mask = [float(i > 0) for i in seq]
        attention_masks.append(seq_mask)

    prediction_inputs = torch.tensor(input_ids)
    prediction_masks = torch.tensor(attention_masks)
    prediction_labels = torch.tensor(labels)
    batch_size = 32
    prediction_data = TensorDataset(prediction_inputs, prediction_masks,
                                    prediction_labels)
    prediction_sampler = SequentialSampler(prediction_data)
    prediction_dataloader = DataLoader(prediction_data,
                                       sampler=prediction_sampler,
                                       batch_size=batch_size)

    model.eval()
    predictions, true_labels = [], []
    for batch in prediction_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        with torch.no_grad():
            logits = model(b_input_ids,
                           token_type_ids=None,
                           attention_mask=b_input_mask)
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        predictions.append(logits)
        true_labels.append(label_ids)

    flat_predictions = [item for sublist in predictions for item in sublist]
    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
    flat_true_labels = [item for sublist in true_labels for item in sublist]

    print('Classification accuracy using BERT Fine Tuning: {0:0.2%}'.format(
        accuracy_score(flat_true_labels, flat_predictions)))

    return model, tokenizer
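
# A hedged usage sketch for train_Pytorch_BERT(); the DataFrame, file path and
# column names below are illustrative assumptions, not from the original source:
#
#     df = pd.read_csv("train.csv")
#     model, tokenizer = train_Pytorch_BERT(df["text"].values, df["target"].values)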

# ## Example #29

# Create an iterator of our data with torch DataLoader
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data,
                              sampler=train_sampler,
                              batch_size=batch_size)
validation_data = TensorDataset(validation_inputs, validation_masks,
                                validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data,
                                   sampler=validation_sampler,
                                   batch_size=batch_size)

# Load BertForSequenceClassification, the pretrained BERT model with a single linear classification layer on top.

model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2)
# model.cuda()

# # BERT model summary
# BertForSequenceClassification(
#   (bert): BertModel(
#     (embeddings): BertEmbeddings(
#       (word_embeddings): Embedding(30522, 768, padding_idx=0)
#       (position_embeddings): Embedding(512, 768)
#       (token_type_embeddings): Embedding(2, 768)
#       (LayerNorm): BertLayerNorm()
#       (dropout): Dropout(p=0.1)
#     )
#     (encoder): BertEncoder(
#       (layer): ModuleList(
#         (0): BertLayer(

# ## Example #30

def train_bert(config: PipeLineConfig):
    logging.basicConfig(level=logging.INFO)

    logging.info("Reading data...")
    input_folder = "../input/jigsaw-unintended-bias-in-toxicity-classification/"
    train = pd.read_csv(os.path.join(input_folder, "train.csv"))

    logging.info("Tokenizing...")

    with multiprocessing.Pool(processes=32) as pool:
        text_list = train.comment_text.tolist()
        sequences = pool.map(convert_line_cased, text_list)

    logging.info("Building ttensors for training...")
    sequences = np.array(sequences)
    lengths = np.argmax(sequences == 0, axis=1)
    lengths[lengths == 0] = sequences.shape[1]

    logging.info("Bulding target tesnor...")
    iden = train[IDENTITY_COLUMNS].fillna(0).values
    subgroup_target = np.hstack([
        (iden >= 0.5).any(axis=1, keepdims=True).astype(np.int),
        iden,
        iden.max(axis=1, keepdims=True),
    ])
    sub_target_weigths = (~train[IDENTITY_COLUMNS].isna().values.any(
        axis=1, keepdims=True)).astype(np.int)

    weights = np.ones(len(train))
    weights += (iden >= 0.5).any(1)
    weights += (train["target"].values >= 0.5) & (iden < 0.5).any(1)
    weights += (train["target"].values < 0.5) & (iden >= 0.5).any(1)
    weights /= weights.mean()
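
    # The weighting above follows the common Jigsaw "unintended bias" recipe:
    # every row starts at weight 1, gains +1 if any identity column is flagged
    # (>= 0.5), +1 if the comment is toxic while some identity column is below
    # 0.5, and +1 if it is non-toxic while an identity column is flagged; the
    # weights are then rescaled to mean 1. For example, a non-toxic comment that
    # mentions an identity gets 1 + 1 + 0 + 1 = 3 before rescaling.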

    y_aux_train = train[AUX_TARGETS]
    y_train_torch = torch.tensor(
        np.hstack([
            train.target.values[:, None],
            weights[:, None],
            y_aux_train,
            subgroup_target,
            sub_target_weigths,
        ])).float()

    perfect_output = torch.tensor(
        np.hstack([train.target.values[:, None], y_aux_train,
                   subgroup_target])).float()

    logging.info("Seeding with seed %d ...", config.seed)
    seed_everything(config.seed)

    logging.info("Creating dataset...")

    dataset = data.TensorDataset(
        torch.from_numpy(sequences).long(), y_train_torch,
        torch.from_numpy(lengths))
    train_loader = data.DataLoader(dataset,
                                   batch_size=BATCH_SIZE,
                                   collate_fn=clip_to_max_len,
                                   shuffle=True)

    logging.info("Creating a model...")
    model = BertForSequenceClassification.from_pretrained("bert-base-cased",
                                                          num_labels=18)
    model.zero_grad()
    model = model.cuda()
    model.classifier.bias = nn.Parameter(
        perfect_bias(perfect_output.mean(0)).cuda())

    logs_file = f"./tb_logs/final_{config.expname}"
    optimizer_grouped_parameters = [
        {
            "params":
            [p for n, p in model.named_parameters() if should_decay(n)],
            "weight_decay": config.decay,
        },
        {
            "params":
            [p for n, p in model.named_parameters() if not should_decay(n)],
            "weight_decay":
            0.00,
        },
    ]

    optimizer = BertAdam(
        optimizer_grouped_parameters,
        lr=config.lr,
        warmup=config.warmup,
        t_total=config.epochs * len(train_loader) // ACCUM_STEPS,
    )

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      opt_level="O1",
                                      verbosity=0)
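    # apex amp at opt_level "O1" casts whitelisted ops to float16 while keeping
    # the model weights in float32; the amp.scale_loss() context used below
    # applies loss scaling so small gradients do not underflow in half precision.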
    model = model.train()

    writer = SummaryWriter(logs_file)
    agg = TensorboardAggregator(writer)
    custom_loss = prepare_loss(config)

    for _ in range(config.epochs):
        for j, (X, y) in enumerate(train_loader):

            X = X.cuda()
            y = y.cuda()

            y_pred = model(X, attention_mask=(X > 0))
            loss = custom_loss(y_pred, y)

            accuracy = ((y_pred[:, 0] > 0) == (y[:, 0] > 0.5)).float().mean()
            agg.log({
                "train_loss": loss.item(),
                "train_accuracy": accuracy.item()
            })

            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()

            if (j + 1) % ACCUM_STEPS == 0:
                optimizer.step()
                optimizer.zero_grad()

    torch.save(model.state_dict(),
               f"./models/final-pipe3-{config.expname}.bin")