N, D = train_x.shape
_, num_classes = train_y.shape

train_x = torch.from_numpy(train_x).type(torch.float)
train_y = torch.from_numpy(train_y)
test_x = torch.from_numpy(test_x).type(torch.float)
test_y = torch.from_numpy(test_y)

max_iters = 51
# pick a batch size, learning rate
batch_size = 16
learning_rate = 1e-3
hidden_size = 64

batches = DataLoader(TensorDataset(train_x, train_y),
                     shuffle=True,
                     batch_size=batch_size)

model = torch.nn.Sequential(
    torch.nn.Linear(D, hidden_size),
    torch.nn.Sigmoid(),
    torch.nn.Linear(hidden_size, num_classes),
    # torch.nn.Softmax()
)
# print(model)

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
# optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)

train_loss = []
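# A minimal training-loop sketch for the setup above (an assumption, not the
# original author's loop): nn.CrossEntropyLoss expects raw logits, which is why
# the Softmax layer stays commented out, and class indices are recovered from
# the one-hot rows of train_y with argmax.
criterion = torch.nn.CrossEntropyLoss()

for itr in range(max_iters):
    total_loss = 0.0
    for xb, yb in batches:
        optimizer.zero_grad()
        logits = model(xb)
        loss = criterion(logits, yb.argmax(dim=1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    train_loss.append(total_loss)
    if itr % 10 == 0:
        print("itr: {:02d} \t loss: {:.2f}".format(itr, total_loss))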
Example #2
    def load_and_cache_examples(self,
                                examples,
                                evaluate=False,
                                no_cache=False):
        """
        Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures.

        Utility function for train() and eval() methods. Not intended to be used directly.
        """

        process_count = self.args['process_count']

        tokenizer = self.tokenizer
        output_mode = 'classification'
        args = self.args

        if not os.path.isdir(self.args['cache_dir']):
            os.mkdir(self.args['cache_dir'])

        mode = 'dev' if evaluate else 'train'
        cached_features_file = os.path.join(
            args['cache_dir'],
            f"cached_{mode}_{args['model_type']}_{args['max_seq_length']}_binary"
        )

        if os.path.exists(cached_features_file) and not args[
                'reprocess_input_data'] and not no_cache:
            features = torch.load(cached_features_file)

        else:
            features = convert_examples_to_features(
                examples,
                args['max_seq_length'],
                tokenizer,
                output_mode,
                # xlnet has a cls token at the end
                cls_token_at_end=bool(args['model_type'] in ['xlnet']),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2
                if args['model_type'] in ['xlnet'] else 0,
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(args['model_type'] in ['roberta']),
                # pad on the left for xlnet
                pad_on_left=bool(args['model_type'] in ['xlnet']),
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4
                if args['model_type'] in ['xlnet'] else 0,
                process_count=process_count)

            if not no_cache:
                torch.save(features, cached_features_file)

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
        return dataset
    def test_eval(self):
        data = DATAMultiWOZ(debug=False, data_dir=self.data_dir)
        test_examples = data.read_examples(
            os.path.join(self.data_dir, 'test.json'))
        print('Number of eval_examples:', len(test_examples))

        dialogueID = [x.guid for x in test_examples]
        utterance_text = [x.text_history for x in test_examples]

        test_features = data.convert_examples_to_features(
            test_examples, self.tokenizer, self.max_seq_length)
        test_input_ids = torch.tensor(data.select_field(
            test_features, 'input_ids'),
                                      dtype=torch.long)
        test_input_mask = torch.tensor(data.select_field(
            test_features, 'input_mask'),
                                       dtype=torch.long)
        test_segment_ids = torch.tensor(data.select_field(
            test_features, 'segment_ids'),
                                        dtype=torch.long)
        test_utterance_mask = torch.tensor(data.select_field(
            test_features, 'utterance_mask'),
                                           dtype=torch.long)
        test_domainslot_mask = torch.tensor(data.select_field(
            test_features, 'domainslot_mask'),
                                            dtype=torch.long)
        test_label_tokens_start = torch.tensor(
            [f.label_tokens_start for f in test_features], dtype=torch.long)
        test_label_tokens_end = torch.tensor(
            [f.label_tokens_end for f in test_features], dtype=torch.long)
        test_label_sentence_domainslot = torch.tensor(
            [f.label_sentence_domainslot for f in test_features],
            dtype=torch.long)

        text_histtokens = [f.hist_token for f in test_features]

        test_data = TensorDataset(test_input_ids, test_input_mask,
                                  test_segment_ids, test_utterance_mask,
                                  test_domainslot_mask,
                                  test_label_tokens_start,
                                  test_label_tokens_end,
                                  test_label_sentence_domainslot)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=self.eval_batch_size)

        config = BertConfig.from_pretrained(self.model_name_or_path)
        model = BertForTokenClassification.from_pretrained(os.path.join(
            self.output_dir, "pytorch_model.bin"),
                                                           self.args,
                                                           config=config)
        model.to(self.device)
        model.eval()

        gold_labels_tokens_start = []
        gold_labels_tokens_end = []
        gold_label_sentence_domainslot = []
        scores_tokens_start = []
        scores_tokens_end = []
        scores_sentence_domainslot = []
        # ID = [x.guid for x in eval_examples]
        dialogueID = [x.guid for x in test_examples]
        # utterance_text = [x.text_eachturn for x in test_examples]

        for input_ids, input_mask, segment_ids, \
            utterance_mask, domainslot_mask, \
            label_tokens_start, label_tokens_end, \
            label_sentence_domainslot in test_dataloader:
            input_ids = input_ids.to(self.device)
            input_mask = input_mask.to(self.device)
            segment_ids = segment_ids.to(self.device)
            utterance_mask = utterance_mask.to(self.device)
            domainslot_mask = domainslot_mask.to(self.device)
            label_tokens_start = label_tokens_start.to(self.device)
            label_tokens_end = label_tokens_end.to(self.device)
            label_sentence_domainslot = label_sentence_domainslot.to(
                self.device)

            logits_tokens_start, logits_tokens_end, logits_sentence_domainslot, _ = model(
                input_ids=input_ids,
                token_type_ids=segment_ids,
                attention_mask=input_mask,
                utterance_mask=utterance_mask,
                domainslot_mask=domainslot_mask)
            logits_tokens_start = logits_tokens_start.detach().view(
                -1, 2).cpu().numpy()
            logits_tokens_end = logits_tokens_end.detach().view(
                -1, 2).cpu().numpy()
            logits_sentence_domainslot = logits_sentence_domainslot.view(
                -1, 2).detach().cpu().numpy()

            label_tokens_start = label_tokens_start.view(-1).to('cpu').numpy()
            label_tokens_end = label_tokens_end.view(-1).to('cpu').numpy()
            label_sentence_domainslot = label_sentence_domainslot.to(
                'cpu').numpy()

            scores_tokens_start.append(logits_tokens_start)
            scores_tokens_end.append(logits_tokens_end)
            scores_sentence_domainslot.append(logits_sentence_domainslot)

            gold_labels_tokens_start.append(label_tokens_start)
            gold_labels_tokens_end.append(label_tokens_end)
            gold_label_sentence_domainslot.append(label_sentence_domainslot)

        gold_labels_tokens_start = np.concatenate(gold_labels_tokens_start, 0)
        gold_labels_tokens_end = np.concatenate(gold_labels_tokens_end, 0)
        gold_label_sentence_domainslot = np.concatenate(
            gold_label_sentence_domainslot, 0)
        scores_tokens_start = np.concatenate(scores_tokens_start, 0)
        scores_tokens_end = np.concatenate(scores_tokens_end, 0)
        scores_sentence_domainslot = np.concatenate(scores_sentence_domainslot,
                                                    0)

        # compute evaluation metrics
        # print(scores_tokens_start.shape)
        # print(scores_tokens_end.shape)
        # print(scores_sentence_domainslot.shape)
        # print(gold_labels_tokens_start.shape)
        assert scores_tokens_start.shape[0] == scores_tokens_end.shape[
            0] == gold_labels_tokens_start.shape[
                0] == gold_labels_tokens_end.shape[0]
        # eval_accuracy_domain = accuracyF1(scores_domain, gold_labels_domain,mode='domain',report=True)
        # eval_accuracy_dependcy = accuracyF1(scores_dependcy, gold_labels_dependcy,mode='dependcy',report=True)
        test_F1_tokenstart, test_F1_tokenend, F1_sentence_domainslot, F1_token_domainslot = compute_jointGoal_domainslot(
            dialogueID,
            text_histtokens,
            scores_tokens_start,
            scores_tokens_end,
            scores_sentence_domainslot,
            gold_labels_tokens_start,
            gold_labels_tokens_end,
            gold_label_sentence_domainslot,
        )

        print('F1_token_domainslot', F1_token_domainslot,
              'F1_sentence_domainslot', F1_sentence_domainslot,
              'eval_F1_tokenstart', test_F1_tokenstart, 'eval_F1_tokenend',
              test_F1_tokenend)
def run():
    args = parser.parse_args()
    nlayer = args.nlayer
    bidirection = args.bidirection
    file_path = args.file_path  #'/content/drive/My Drive/Master_Final_Project/Genetic_attack/Code/nlp_adversarial_example_master_pytorch/glove.840B.300d.txt'#'/lustre/scratch/scratch/ucabdc3/lstm_attack'
    save_path = os.path.join(file_path, 'results')
    MAX_VOCAB_SIZE = 50000
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #    with open(os.path.join(file_path, 'dataset_%d.pkl' %MAX_VOCAB_SIZE), 'rb') as f:
    #        dataset = pickle.load(f)

    with open('aux_files/dataset_%d.pkl' % MAX_VOCAB_SIZE, 'rb') as f:
        dataset = pickle.load(f)


#    skip_list = np.load('aux_files/missed_embeddings_counter_%d.npy' %MAX_VOCAB_SIZE)
    embedding_matrix = np.load('aux_files/embeddings_glove_%d.npy' %
                               (MAX_VOCAB_SIZE))
    embedding_matrix = torch.tensor(embedding_matrix.T).to(device)
    dist = np.load(('aux_files/dist_counter_%d.npy' % (MAX_VOCAB_SIZE)))

    #    goog_lm = LM()

    # pytorch
    max_len = args.max_len
    #    padded_train_raw = pad_sequences(dataset.train_seqs2, maxlen = max_len, padding = 'post')
    padded_test_raw = pad_sequences(dataset.test_seqs2,
                                    maxlen=max_len,
                                    padding='post')
    #    # TrainSet
    #    data_set = Data_infor(padded_train_raw, dataset.train_y)
    #    num_train = len(data_set)
    #    indx = list(range(num_train))
    #    train_set = Subset(data_set, indx)

    # TestSet
    batch_size = 1
    SAMPLE_SIZE = args.sample_size
    data_set = Data_infor(padded_test_raw, dataset.test_y)
    num_test = len(data_set)
    indx = list(range(num_test))

    all_test_set = Subset(data_set, indx)
    #indx = random.sample(indx, SAMPLE_SIZE)
    with open('attack_results_final_300.pkl', 'rb') as f:
        results = pickle.load(f)
    seqs = []
    lens = []
    tgts = []
    for i in range(len(results[1])):
        if np.array(results[1][i]).shape == ():
            continue
        seqs.append(results[1][i])
        lens.append(results[2][i])
        tgts.append(results[3][i])
    seqs = torch.tensor(seqs)
    lens = torch.tensor(lens)
    tgts = torch.tensor(tgts)
    test_set = TensorDataset(seqs, lens, tgts)
    all_test_loader = DataLoader(test_set, batch_size=128, shuffle=True)

    lstm_size = 128
    rnn_state_save = os.path.join(file_path, 'best_lstm_0.7_0.001_300')

    model = SentimentAnalysis(batch_size=lstm_size,
                              embedding_matrix=embedding_matrix,
                              hidden_size=lstm_size,
                              kept_prob=0.7,
                              num_layers=nlayer,
                              bidirection=bidirection)

    model.load_state_dict(torch.load(rnn_state_save))
    model = model.to(device)

    model.eval()
    test_pred = torch.tensor([])
    test_targets = torch.tensor([])

    with torch.no_grad():
        for batch_index, (seqs, length, target) in enumerate(all_test_loader):
            # cast token ids to long before moving so they stay on the target device
            seqs = seqs.long().to(device)
            target, length = target.to(device), length.to(device)
            len_order = torch.argsort(length, descending=True)
            length = length[len_order]
            seqs = seqs[len_order]
            target = target[len_order]

            output, pred_out = model.pred(seqs, length, False)
            test_pred = torch.cat((test_pred, pred_out.cpu()), dim=0)
            test_targets = torch.cat(
                (test_targets, target.type(torch.float).cpu()))

        accuracy = model.evaluate_accuracy(test_pred.numpy(),
                                           test_targets.numpy())
    print('Test Accuracy:{:.4f}.'.format(accuracy))
Example #5
def main():
    class DictAttr(dict):
        def __getattr__(self, key):
            if key not in self:
                raise AttributeError(key)
            return self[key]

        def __setattr__(self, key, value):
            self[key] = value

        def __delattr__(self, key):
            del self[key]

    args = DictAttr()
    args.model_name = 'openai-gpt'
    args.train_dataset = "data_in/ROCStories/cloze_test_val__spring2016 - cloze_test_ALL_val.csv"
    args.eval_dataset = "data_in/ROCStories/cloze_test_test__spring2016 - cloze_test_ALL_test.csv"
    args.train_batch_size = 8

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps",
                        default=-1,
                        type=int,
                        help="If > 0: set total number of training \
                        steps to perform. Override num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before\
                        performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)

    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(
        args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(
        tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(
        args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    #("Rick grew up in a troubled household. He never found good support in family, and turned to gangs. It wasn't long before Rick got shot in a robbery. The incident caused him to turn a new leaf.", 'He is happy now.', 'He joined a gang.', 0)

    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
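    # each encoded example is assumed to be laid out as
    # [_start_] story [_delimiter_] continuation [_classify_], so the "+ 3" below
    # reserves room for those three special tokens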
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3  \
                           for dataset in encoded_datasets for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions
                       )  # Max size of input for the pre-trained model

    # Prepare inputs tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length,
                                           max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[
        0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps //\
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader)\
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
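        # parameters whose names contain any of these substrings (biases and
        # LayerNorm weights) are excluded from weight decay in the grouping below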
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer,
                                         warmup_steps=args.warmup_steps,
                                         t_total=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item(
                ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item(
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss,
                    scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model it-self

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids,
                                                 lm_labels, mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {
            'eval_loss': eval_loss,
            'eval_accuracy': eval_accuracy,
            'train_loss': train_loss
        }

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
Example #6
    def setUp(self):
        super(TestCustomPinFn, self).setUp()
        inps = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
        tgts = torch.arange(10 * 5, dtype=torch.float32).view(10, 5)
        self.dataset = TensorDataset(inps, tgts)
Example #7
    def setUp(self):
        super(TestDataLoader, self).setUp()
        self.data = torch.randn(100, 2, 3, 5)
        self.labels = torch.randperm(50).repeat(2)
        self.dataset = TensorDataset(self.data, self.labels)
# for validation set
val_seq = torch.tensor(tokens_val['input_ids'])
val_mask = torch.tensor(tokens_val['attention_mask'])
val_y = torch.tensor(val_labels)

# for test set
test_seq = torch.tensor(tokens_test['input_ids'])
test_mask = torch.tensor(tokens_test['attention_mask'])
test_y = torch.tensor(test_labels)

print(f'shape of train val test set: {train_y.shape}, {val_y.shape}, {test_y.shape}')

# # Create DataLoaders
# wrap tensors
train_data = TensorDataset(train_seq, train_mask, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_seq, val_mask, val_y)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_seq, test_mask, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

"""# # Freeze BERT Parameters
for param in bert.parameters():
    param.requires_grad = False"""
Example #9
def get(logger=None, args=None):
    data = {}
    taskcla = []
    t = 0

    for dataset in datasets:
        data[t] = {}
        if 'Bing' in dataset:
            data[t]['name'] = dataset
            data[t]['ncla'] = 2
        elif 'XuSemEval' in dataset:
            data[t]['name'] = dataset
            data[t]['ncla'] = 3

        processor = data_utils.AscProcessor()
        label_list = processor.get_labels()
        tokenizer = Tokenizer()
        train_examples = processor.get_train_examples(dataset)

        train_features = data_utils.convert_examples_to_features_w2v(
            train_examples, label_list, args.max_term_length,
            args.max_sentence_length, tokenizer, word_index_pretrained,
            vocab_size)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)

        all_tokens_term_ids = torch.tensor(
            [f.tokens_term_ids for f in train_features], dtype=torch.long)
        all_tokens_sentence_ids = torch.tensor(
            [f.tokens_sentence_ids for f in train_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        # print('all_tokens_term_ids: ',all_tokens_term_ids)

        train_data = TensorDataset(all_tokens_term_ids,
                                   all_tokens_sentence_ids, all_label_ids)

        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        data[t]['train'] = train_dataloader

        valid_examples = processor.get_dev_examples(dataset)
        valid_features = data_utils.convert_examples_to_features_w2v(
            valid_examples, label_list, args.max_term_length,
            args.max_sentence_length, tokenizer, word_index_pretrained,
            vocab_size)
        valid_all_tokens_term_ids = torch.tensor(
            [f.tokens_term_ids for f in valid_features], dtype=torch.long)
        valid_all_tokens_sentence_ids = torch.tensor(
            [f.tokens_sentence_ids for f in valid_features], dtype=torch.long)
        valid_all_label_ids = torch.tensor(
            [f.label_id for f in valid_features], dtype=torch.long)

        valid_data = TensorDataset(valid_all_tokens_term_ids,
                                   valid_all_tokens_sentence_ids,
                                   valid_all_label_ids)

        logger.info("***** Running validations *****")
        logger.info("  Num orig examples = %d", len(valid_examples))
        logger.info("  Num split examples = %d", len(valid_features))
        logger.info("  Batch size = %d", args.train_batch_size)

        valid_sampler = SequentialSampler(valid_data)
        valid_dataloader = DataLoader(valid_data,
                                      sampler=valid_sampler,
                                      batch_size=args.train_batch_size)

        data[t]['valid'] = valid_dataloader

        processor = data_utils.AscProcessor()
        label_list = processor.get_labels()
        tokenizer = BertTokenizer.from_pretrained(args.bert_model)
        eval_examples = processor.get_test_examples(dataset)
        eval_features = data_utils.convert_examples_to_features_w2v(
            eval_examples, label_list, args.max_term_length,
            args.max_sentence_length, tokenizer, word_index_pretrained,
            vocab_size)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_tokens_term_ids = torch.tensor(
            [f.tokens_term_ids for f in eval_features], dtype=torch.long)
        all_tokens_sentence_ids = torch.tensor(
            [f.tokens_sentence_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)

        eval_data = TensorDataset(all_tokens_term_ids, all_tokens_sentence_ids,
                                  all_label_ids)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        data[t]['test'] = eval_dataloader

        t += 1

    # Others
    f_name = 'asc_random'
    data_asc = {}

    with open(f_name, 'r') as f_random_seq:
        random_sep = f_random_seq.readlines()[args.idrandom].split()

    print('random_sep: ', random_sep)
    print('domains: ', domains)

    print('random_sep: ', len(random_sep))
    print('domains: ', len(domains))

    for task_id in range(args.ntasks):
        # print('task_id: ',task_id)
        asc_id = domains.index(random_sep[task_id])
        data_asc[task_id] = data[asc_id]
        taskcla.append((task_id, int(data[asc_id]['ncla'])))

    # Others
    n = 0
    for t in data.keys():
        n += data[t]['ncla']
    data['ncla'] = n

    return data_asc, taskcla, vocab_size, embeddings
Example #10
                      activation,
                      w_init=w_init)
            torch.save(vae, 'pretrained_vae_n.pkl')

        else:
            if layer_wised == 0:
                vae = VAE(encoder_sizes, image_train[:, :, 0], activation)
                torch.save(vae, 'layerwisetrained_vae_n.pkl')
            else:
                vae = torch.load('layerwisetrained_vae_n.pkl')

            vae = vae.cuda()
            x_mean, _ = vae.get_latent(
                torch.from_numpy(image_train[:, :, 0]).cuda())
            print("| Latent range: {}/{}".format(x_mean.min(), x_mean.max()))
            dataloader = DataLoader(TensorDataset(
                torch.from_numpy(image_train[:, :, 0])),
                                    batch_size=BATCH_SIZE,
                                    shuffle=True)
            # optimizer = optim.Adam(vae.get_para(),lr=0.0001,weight_decay=0.0001)
            # optimizer = optim.Adam(vae.get_para(),lr=0.002)
            optimizer = optim.SGD(vae.get_para(), lr=0.0001, momentum=0.9)
            lr_scheduler = StepLR(optimizer, step_size=1000, gamma=0.5)
            print("2.1 pretrain the VAE model")
            vae = pretrain(vae,
                           optimizer,
                           lr_scheduler,
                           dataloader,
                           epoch_num=100)
            torch.save(vae, 'pretrained_vae_n.pkl')
    if resume:
        print("|Load pretrained model: {}".format(resume))
Example #11
input_ids_dev, attention_masks_dev = encode_data(tokenizer, questions_dev,
                                                 passages_dev, max_seq_length)

train_features = (input_ids_train, attention_masks_train, answers_train)
dev_features = (input_ids_dev, attention_masks_dev, answers_dev)

batch_size = 32

train_features_tensors = [
    torch.tensor(feature, dtype=torch.long) for feature in train_features
]
dev_features_tensors = [
    torch.tensor(feature, dtype=torch.long) for feature in dev_features
]

train_dataset = TensorDataset(*train_features_tensors)
dev_dataset = TensorDataset(*dev_features_tensors)

train_sampler = RandomSampler(train_dataset)
dev_sampler = SequentialSampler(dev_dataset)

train_dataloader = DataLoader(train_dataset,
                              sampler=train_sampler,
                              batch_size=batch_size)
dev_dataloader = DataLoader(dev_dataset,
                            sampler=dev_sampler,
                            batch_size=batch_size)

########################################################
########################this should be 5################
########################################################
def create_batch_iter(mode):
    """构造迭代器"""
    processor, tokenizer = init_params()
    if mode == "train":
        examples = processor.get_train_examples(args.data_dir)

        num_train_steps = int(
            len(examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

        batch_size = args.train_batch_size

        logger.info("  Num steps = %d", num_train_steps)

    elif mode == "dev":
        examples = processor.get_dev_examples(args.data_dir)
        batch_size = args.eval_batch_size
    else:
        raise ValueError("Invalid mode %s" % mode)

    label_list = processor.get_labels()

    # features
    features = convert_examples_to_features(examples, label_list,
                                            args.max_seq_length, tokenizer)

    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", batch_size)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)
    all_output_mask = torch.tensor([f.output_mask for f in features],
                                   dtype=torch.long)

    # dataset
    data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_label_ids, all_output_mask)

    if mode == "train":
        sampler = RandomSampler(data)
    elif mode == "dev":
        sampler = SequentialSampler(data)
    else:
        raise ValueError("Invalid mode %s" % mode)

    # iterator
    iterator = DataLoader(data, sampler=sampler, batch_size=batch_size)

    if mode == "train":
        torch.save((iterator, num_train_steps), args.TRAIN_CACHE)
        return iterator, num_train_steps
    elif mode == "dev":
        torch.save(iterator, args.VALID_CACHE)
        return iterator
    else:
        raise ValueError("Invalid mode %s" % mode)
Example #13
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese."
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument("--train_file",
                        default=None,
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json"
    )
    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.")
    parser.add_argument("--do_train",
                        default=False,
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=False,
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for predictions.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10% "
        "of training.")
    parser.add_argument("--save_checkpoints_steps",
                        default=1000,
                        type=int,
                        help="How often to save the model checkpoint.")
    parser.add_argument("--iterations_per_loop",
                        default=1000,
                        type=int,
                        help="How many steps to make in each estimator call.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json "
        "output file.")
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        action='store_true',
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--optimize_on_cpu',
        default=False,
        action='store_true',
        help=
        "Whether to perform optimization and keep the optimizer averages on CPU"
    )
    parser.add_argument(
        '--fp16',
        default=False,
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=128,
        help=
        'Loss scaling, positive power of 2 values can improve fp16 convergence.'
    )

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        if args.fp16:
            logger.info(
                "16-bits training currently not supported in distributed training"
            )
            args.fp16 = False  # (see https://github.com/pytorch/pytorch/pull/13496)
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}"
        .format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = int(args.train_batch_size /
                                args.gradient_accumulation_steps)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    train_examples = None
    num_train_steps = None
    if args.do_train:
        train_examples = read_squad_examples(input_file=args.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps * args.num_train_epochs)

    # Prepare model
    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.fp16:
        param_optimizer = [(n, param.clone().detach().to('cpu').float().requires_grad_()) \
                            for n, param in model.named_parameters()]
    elif args.optimize_on_cpu:
        param_optimizer = [(n, param.clone().detach().to('cpu').requires_grad_()) \
                            for n, param in model.named_parameters()]
    else:
        param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'gamma', 'beta']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in param_optimizer
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay_rate':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay_rate':
        0.0
    }]
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_steps)

    global_step = 0
    if args.do_train:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering it-self
                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.fp16 and args.loss_scale != 1.0:
                    # rescale loss for fp16 training
                    # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                    loss = loss * args.loss_scale
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16 or args.optimize_on_cpu:
                        if args.fp16 and args.loss_scale != 1.0:
                            # scale down gradients for fp16 training
                            for param in model.parameters():
                                param.grad.data = param.grad.data / args.loss_scale
                        is_nan = set_optimizer_params_grad(
                            param_optimizer,
                            model.named_parameters(),
                            test_nan=True)
                        if is_nan:
                            logger.info(
                                "FP16 TRAINING: Nan in gradients, reducing loss scaling"
                            )
                            args.loss_scale = args.loss_scale / 2
                            model.zero_grad()
                            continue
                        optimizer.step()
                        copy_optimizer_params_to_model(
                            model.named_parameters(), param_optimizer)
                    else:
                        optimizer.step()
                    model.zero_grad()
                    global_step += 1

    if args.do_predict:
        eval_examples = read_squad_examples(input_file=args.predict_file,
                                            is_training=False)
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader, desc="Evaluating"):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, args.verbose_logging)
def trainingBert(data, bert_model):    
    from transformers import BertTokenizer, BertForNextSentencePrediction
    import torch

    model = BertForNextSentencePrediction.from_pretrained(bert_model, return_dict=True)
    tokenizer = BertTokenizer.from_pretrained(bert_model)

    sentence1 = data[0]
    sentence2 = data[1]


    max_len = 1500



    input_ids = []
    attention_masks = []
    labels = []


    for x in range(len(data[0])):
        s1 = data[0][x]
        s2 = data[1][x]


        encoded_dict = tokenizer.encode_plus(
                        s1, text_pair=s2,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        truncation=True,
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

        encoded_dict_reverse = tokenizer.encode_plus(
                        s2, text_pair=s1,
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length =max_len,           # Pad & truncate all sentences.
                        truncation=True,
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )
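        # NOTE: encoded_dict_reverse is built but never used below; only the
        # forward-order (s1, s2) pair is added to the training set.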
    

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        labels.append(0)


    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)






    from torch.utils.data import TensorDataset, random_split

    dataset = TensorDataset(input_ids, attention_masks, labels)
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))
    from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

    batch_size = 16

    train_dataloader = DataLoader(
                train_dataset,  # The training samples.
                batch_size = batch_size # Trains with this batch size.
            )
    validation_dataloader = DataLoader(
                val_dataset, # The validation samples.
                batch_size = batch_size # Evaluate with this batch size.
            )
    from transformers.optimization import AdamW
    optimizer = AdamW(model.parameters(),
                      lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                      eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                    )

    from transformers import get_linear_schedule_with_warmup

    epochs = 4


    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)
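
    # Minimal helpers (assumed; the surrounding notebook presumably defines these
    # elsewhere): format_time renders elapsed seconds as h:mm:ss, and calc_acc is
    # flat accuracy over a batch of logits vs. integer labels.
    import time
    import datetime
    import numpy as np

    def format_time(elapsed):
        return str(datetime.timedelta(seconds=int(round(elapsed))))

    def calc_acc(logits, label_ids):
        preds = np.argmax(logits, axis=1).flatten()
        labels_flat = label_ids.flatten()
        return np.sum(preds == labels_flat) / len(labels_flat)

    total_t0 = time.time()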



    training_stats = []

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        model.train()
        total_train_loss = 0

        for step, batch in enumerate(train_dataloader):

            b_input_ids = batch[0]
            b_input_mask = batch[1]
            b_labels = batch[2]

            optimizer.zero_grad()

            res1 = model(b_input_ids,
                         token_type_ids=None,
                         attention_mask=b_input_mask,
                         next_sentence_label=b_labels)
            loss = res1[0]
            logits = res1[1]

            total_train_loss += loss.item()

            # Backpropagate and update the weights and the learning-rate schedule.
            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)

        import os
        model_dir = str(epoch_i) + '/'
        output_dir = PROJECT_ROOT + 'model_save/' + model_dir  # PROJECT_ROOT is assumed to be defined elsewhere
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        print("Saving model to %s" % output_dir)

        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))

        print("")
        print("Running Validation...")
        # Tracking variables 
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Put the model in evaluation mode and evaluate data for one epoch
        model.eval()
        for batch in validation_dataloader:
            b_input_ids = batch[0]
            b_input_mask = batch[1]
            b_labels = batch[2]

            with torch.no_grad():
                res2 = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             next_sentence_label=b_labels)
            loss = res2[0]
            logits = res2[1]

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()

            accs = calc_acc(logits, label_ids)
            total_eval_accuracy += accs

        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        avg_val_loss = total_eval_loss / len(validation_dataloader)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))


        print("")
        print("Training complete!")

        print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))
    
    
    return model
    def predict(epoch=None):
        test_examples = processor.get_test_examples()
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_doc_ids = torch.tensor([f.guid for f in test_features],
                                   dtype=torch.long)
        
        test_data = TensorDataset(all_input_ids,
                                  all_input_mask,
                                  all_segment_ids,
                                  all_doc_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)
        
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        ids = []
        # FIXME: make it flexible to accept path
        all_ids_test = read_ids(os.path.join(args.data_dir, "ids_testing.txt"))
        
        for input_ids, input_mask, segment_ids, doc_ids in \
                tqdm(test_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            doc_ids = doc_ids.to(device)
            
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)
            
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
            if len(ids) == 0:
                ids.append(doc_ids.detach().cpu().numpy())
            else:
                ids[0] = np.append(
                    ids[0], doc_ids.detach().cpu().numpy(), axis=0)
        
        ids = ids[0]
        preds = sigmoid(preds[0])
        preds = (preds > 0.5).astype(int)
        id2preds = {val:preds[i] for i, val in enumerate(ids)}
        
        for i, val in enumerate(all_ids_test):
            if val not in id2preds:
                id2preds[val] = []
        
        with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"),
                  "rb") as rf:
            mlb = pkl.load(rf)

        preds = [mlb.classes_[preds[i, :].astype(bool)].tolist()
                 for i in range(preds.shape[0])]
        id2preds = {val:preds[i] for i, val in enumerate(ids)}
        preds = [id2preds[val] if val in id2preds else []
                 for i, val in enumerate(all_ids_test)]
        
        with open(os.path.join(args.output_dir, f"preds_test{epoch}.txt"),
                  "w") as\
                wf:
            for idx, doc_id in enumerate(all_ids_test):
                line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n"
                wf.write(line)
train_features = convert_examples_to_features(train_examples, label_list,
                                              max_seq_length, tokenizer)

dev_examples = processor.get_dev_examples(
    "/home/wangwei/pt_workdir/bert_ner_task/data")
dev_features = convert_examples_to_features(dev_examples, label_list,
                                            max_seq_length, tokenizer)

dev_input_ids = torch.tensor([f.input_ids for f in dev_features],
                             dtype=torch.long)
dev_input_mask = torch.tensor([f.input_mask for f in dev_features],
                              dtype=torch.long)
dev_segment_ids = torch.tensor([f.segment_ids for f in dev_features],
                               dtype=torch.long)
dev_label_ids = torch.tensor([f.label_id for f in dev_features],
                             dtype=torch.long)
dev_data = TensorDataset(dev_input_ids, dev_input_mask, dev_segment_ids,
                         dev_label_ids)
dev_loader = DataLoader(dev_data,
                        sampler=RandomSampler(dev_data),
                        batch_size=train_batch_size)

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_examples))
logger.info("  Batch size = %d", train_batch_size)
all_input_ids = torch.tensor([f.input_ids for f in train_features],
                             dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in train_features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                               dtype=torch.long)
all_label_ids = torch.tensor([f.label_id for f in train_features],
                             dtype=torch.long)
Beispiel #17
0
    def data_reader(self,
                    data_filepath,
                    label_filepath,
                    jitter_filepath,
                    train,
                    type,
                    should_batch=True,
                    shuffle=True,
                    infer=False):
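        """Read features, labels, and jitter from .npy files and return a DataLoader.

        On the training split this also records the min/max/mean/std statistics used
        for normalisation, optionally augments the positive class with time/frequency
        masking, and computes `pos_weight` for the loss from the class balance.
        """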
        if infer:
            pass
        else:
            input_data, labels, jitter = read_npy(data_filepath), read_npy(
                label_filepath), read_npy(jitter_filepath)

            if train:
                self.logger.info(f'Original data size - before Augmentation')
                self.logger.info(f'Total data {str(len(input_data))}')
                self.logger.info(
                    f'Event rate {str(sum(labels) / len(labels))}')
                self.logger.info(
                    f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}'
                )

                for x in input_data:
                    self._min = min(np.min(x), self._min)
                    self._max = max(np.max(x), self._max)
                self._mean, self._std = np.mean(input_data), np.std(input_data)
                self._jmean, self._jstd = np.mean(jitter), np.std(jitter)
                self._jmin, self._jmax = np.min(jitter), np.max(jitter)

                if self.data_augment:
                    self.logger.info(f'Data Augmentation starts . . .')
                    label_to_augment = 1
                    amount_to_augment = 1.3
                    ones_ids = [
                        idx for idx, x in enumerate(labels)
                        if x == label_to_augment
                    ]
                    random_idxs = random.choices(
                        ones_ids, k=int(len(ones_ids) * amount_to_augment))
                    data_to_augment = input_data[random_idxs]
                    augmented_data, jitter_augmented_data = [], []
                    augmented_labels = []
                    for x in data_to_augment:
                        x = librosaSpectro_to_torchTensor(x)
                        x = random.choice([time_mask, freq_mask])(x)[0].numpy()
                        augmented_data.append(x)
                        augmented_labels.append(label_to_augment)

                    # Jitter and shimmer
                    # jitter_augmented_data, jitter_labels = BorderlineSMOTE().fit_resample(X=jitter, y=labels)
                    #
                    # assert np.mean(jitter_labels[len(jitter):][
                    #                :len(augmented_data)]) == 1, 'Issue with Jitter Shimmer Augmentation'
                    #
                    # jitter = np.concatenate((jitter, jitter_augmented_data[len(jitter):][:len(augmented_data)]))
                    input_data = np.concatenate((input_data, augmented_data))
                    labels = np.concatenate((labels, augmented_labels))

                    # Temp fix
                    # input_data = input_data[:len(jitter)]
                    # labels = labels[:len(jitter)]

                    # assert len(jitter) == len(
                    #         input_data), "Input data and Jitter Shimmer augmentations don't match in length"

                    self.logger.info(f'Data Augmentation done . . .')

                # data = [(x, y, z) for x, y, z in zip(input_data, labels, jitter)]
                # random.shuffle(data)
                # input_data, labels, jitter = np.array([x[0] for x in data]), [x[1] for x in data], np.array(
                #         [x[2] for x in data])

                data = [(x, y) for x, y in zip(input_data, labels)]
                random.shuffle(data)
                input_data, labels = np.array([x[0] for x in data
                                               ]), [x[1] for x in data]

                # Initialize pos_weight based on training data
                self.pos_weight = len([x for x in labels if x == 0]) / len(
                    [x for x in labels if x == 1])
                self.logger.info(
                    f'Pos weight for the train data - {self.pos_weight}')

            self.logger.info(f'Total data {str(len(input_data))}')
            self.logger.info(f'Event rate {str(sum(labels) / len(labels))}')
            self.logger.info(
                f'Input data shape:{np.array(input_data).shape} | Output data shape:{np.array(labels).shape}'
            )

            self.logger.info(
                f'Min max values used for normalisation {self._min, self._max}'
            )

            # Normalizing `input data` on train dataset's min and max values
            if self.normalise:
                input_data = (input_data - self._min) / (self._max - self._min)
                input_data = (input_data - self._mean) / self._std

                # jitter = (jitter - self._jmin) / (self._jmax - self._jmin)
                # jitter = (jitter - self._jmean) / self._jstd

            self.dataset_sizes[type] = len(input_data)
            return DataLoader(
                TensorDataset(
                    torch.Tensor(input_data).unsqueeze(1).repeat(1, 3, 1, 1),
                    torch.Tensor(labels)),
                batch_size=self.batch_size
                #        ,sampler=torch.utils.data.SubsetRandomSampler(list([x for x in range(10)]))
            )
Beispiel #18
0
def load_examples(args, tokenizer, mode):
    processor = processors[args.task](args)

    # Load data features from dataset file
    # NOTE: Get image features
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(args.data_dir,
                                        'cached_{}_{}'.format(args.task, mode))
    cached_img_features_file = os.path.join(
        args.data_dir, 'cached_img_{}_{}'.format(args.task, mode))

    if os.path.exists(cached_features_file) and os.path.exists(
            cached_img_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
        logger.info("Loading img features from cached file %s",
                    cached_img_features_file)
        all_img_features = torch.load(cached_img_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        img_feature_file = h5py.File(
            os.path.join(args.data_dir, args.h5_filename), 'r')
        if mode == "train":
            examples = processor.get_examples("train")
            img_ids = get_image_nums(args, args.train_file)
            all_img_features = load_vgg_features(img_feature_file, img_ids)
        elif mode == "dev":
            examples = processor.get_examples("dev")
            img_ids = get_image_nums(args, args.dev_file)
            all_img_features = load_vgg_features(img_feature_file, img_ids)
        elif mode == "test":
            examples = processor.get_examples("test")
            img_ids = get_image_nums(args, args.test_file)
            all_img_features = load_vgg_features(img_feature_file, img_ids)
        else:
            raise Exception("For mode, Only train, dev, test is available")

        label_len = len(get_label(args))

        features = convert_examples_to_features(examples, args.max_seq_len,
                                                tokenizer, label_len)
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)
        torch.save(all_img_features, cached_img_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in features],
                                 dtype=torch.long)

    print(all_input_ids.size())
    print(all_attention_mask.size())
    print(all_token_type_ids.size())
    print(all_label_ids.size())
    print(all_img_features.size())

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_label_ids,
                            all_img_features)
    return dataset
Beispiel #19
0
 def test_single_tensor(self):
     t = torch.randn(5, 10)
     source = TensorDataset(t)
     self.assertEqual(len(source), 5)
     for i in range(5):
         self.assertEqual(t[i], source[i][0])
Beispiel #20
0
def init_model():
    bert_tokenizer = "/Users/quantum/Downloads/bert-base-chinese/bert_chinese_vocab.txt"
    train_model = "/Users/quantum/Downloads/2019217/pytorch_model_epoch_0.bin"
    bert_model = "/Users/quantum/Downloads/bert-base-chinese/bert-base-chinese.tar.gz"
    data_dir = "/Users/quantum/Downloads/bert-base-chinese/"
    max_seq_length = 256
    do_lower_case = False

    processors = {"wnli": WNLIProcessor}

    task_name = "wnli"
    processor = processors[task_name]()
    label_list = processor.get_labels()

    tokenizer = BertTokenizer.from_pretrained(bert_tokenizer,
                                              do_lower_case=do_lower_case)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(train_model, map_location='cpu')
    # print(model_state_dict)
    model = BertForSequenceClassification.from_pretrained(
        bert_model, state_dict=model_state_dict)
    model.to(device)

    eval_examples = processor.get_test_examples(data_dir)
    eval_features = convert_examples_to_features(eval_examples, label_list,
                                                 max_seq_length, tokenizer)
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(eval_examples))
    all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                 dtype=torch.long)
    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                              all_label_ids)
    # Run prediction for full data
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=10)

    model.eval()
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    for input_ids, input_mask, segment_ids, label_ids in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)
        label_ids = label_ids.to(device)

        with torch.no_grad():
            tmp_eval_loss = model(input_ids, segment_ids, input_mask,
                                  label_ids)
            logits = model(input_ids, segment_ids, input_mask)

        logits = logits.detach().cpu().numpy()
        label_ids = label_ids.to('cpu').numpy()
        tmp_eval_accuracy = accuracy(logits, label_ids)

        eval_loss += tmp_eval_loss.mean().item()
        eval_accuracy += tmp_eval_accuracy

        nb_eval_examples += input_ids.size(0)
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_examples

    result = {'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy}

    logger.info("***** Test results *****")
    for key in sorted(result.keys()):
        logger.info("  %s = %s", key, str(result[key]))
    return tokenizer, model
Beispiel #21
0
 def test_len(self):
     source = TensorDataset(torch.randn(15, 10, 2, 3, 4, 5),
                            torch.randperm(15))
     self.assertEqual(len(source), 15)
Beispiel #22
0
def load_and_cache_examples(args, task, tokenizer, evaluate=False):
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    processor = processors[task](language=args.language,
                                 train_language=args.train_language)
    output_mode = output_modes[task]
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}_{}_{}".format(
            "test" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
            str(task),
            str(args.train_language if (
                not evaluate and args.train_language is not None
            ) else args.language),
        ),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        label_list = processor.get_labels()
        examples = (processor.get_test_examples(args.data_dir) if evaluate else
                    processor.get_train_examples(args.data_dir))
        features = convert_examples_to_features(
            examples,
            tokenizer,
            label_list=label_list,
            max_length=args.max_seq_length,
            output_mode=output_mode,
            pad_on_left=False,
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_attention_mask = torch.tensor([f.attention_mask for f in features],
                                      dtype=torch.long)
    all_token_type_ids = torch.tensor([f.token_type_ids for f in features],
                                      dtype=torch.long)
    if output_mode == "classification":
        all_labels = torch.tensor([f.label for f in features],
                                  dtype=torch.long)
    else:
        raise ValueError("No other `output_mode` for XNLI.")

    dataset = TensorDataset(all_input_ids, all_attention_mask,
                            all_token_type_ids, all_labels)
    return dataset
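
# Usage sketch (assumed; not part of the original example): wrap the returned
# TensorDataset in a DataLoader for training, e.g.
#   train_dataset = load_and_cache_examples(args, "xnli", tokenizer, evaluate=False)
#   train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
#   train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
#                                 batch_size=args.train_batch_size)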
Beispiel #23
0
    def __init__(self, input_size, output_size):
        super(NN, self).__init__()
        self.layer1 = torch.nn.Linear(input_size, output_size,
                                      bias=False)  # define the fully connected layer
        torch.nn.init.normal_(self.layer1.weight, 0.0, 1.0)  # initialize the weights with normal random numbers

    def forward(self, input):
        activation = torch.nn.Softmax(dim=-1)
        output = activation(self.layer1(input))

        return output


model = NN(300, 4)
# Create the (X, y) pairs
data_train = TensorDataset(X_train, y_train)
# Create the DataLoader
dataloader = DataLoader(data_train, batch_size=1, shuffle=True)
criterion = torch.nn.CrossEntropyLoss()
# Define the optimization algorithm
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
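
# Note (added): torch.nn.CrossEntropyLoss applies log-softmax internally, so the
# Softmax in NN.forward means the loss sees doubly-normalised outputs; passing raw
# logits to the loss is the more common setup.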


def accuracy(probs, y):
    cnt = 0
    for i, prob in enumerate(probs):
        # Convert the tensor to an ndarray and take the index of the largest element
        y_pred = np.argmax(prob.detach().numpy())
        if y_pred == y.detach().numpy()[i]:
            cnt += 1
    # Fraction of correct predictions (return added to complete the truncated function)
    return cnt / len(probs)
Beispiel #24
0
te_masks = [[float(i > 0) for i in ii] for ii in te_inputs]

# convert to tensor!
tr_inputs = torch.tensor(tr_inputs)
val_inputs = torch.tensor(val_inputs)
te_inputs = torch.tensor(te_inputs)

tr_tags = torch.tensor(tr_tags)
val_tags = torch.tensor(val_tags)
te_tags = torch.tensor(te_tags)

tr_masks = torch.tensor(tr_masks)
val_masks = torch.tensor(val_masks)
te_masks = torch.tensor(te_masks)

train_data = TensorDataset(tr_inputs, tr_masks, tr_tags)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler = train_sampler, batch_size = bs)

valid_data = TensorDataset(val_inputs, val_masks, val_tags)
valid_sampler = SequentialSampler(valid_data)
valid_dataloader = DataLoader(valid_data, sampler = valid_sampler, batch_size = bs)

test_data = TensorDataset(te_inputs, te_masks, te_tags)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler = test_sampler, batch_size = bs)

#config = DistilBertConfig.from_pretrained("distillbert_ner_c_model_save")
#model = DistillBertTagger(config = config)

model = DistilBertForTokenClassification.from_pretrained(
Beispiel #25
0
X_valid = joblib.load('ch08/X_valid.joblib')
y_valid = joblib.load('ch08/y_valid.joblib')
X_valid = torch.from_numpy(X_valid.astype(np.float32)).clone()
y_valid = torch.from_numpy(y_valid.astype(np.int64)).clone()

X_test = joblib.load('ch08/X_test.joblib')
y_test = joblib.load('ch08/y_test.joblib')
X_test = torch.from_numpy(X_test.astype(np.float32)).clone()
y_test = torch.from_numpy(y_test.astype(np.int64)).clone()

X = X_train
y = y_train
X = X.to('cuda:0')
y = y.to('cuda:0')
ds = TensorDataset(X, y)

net = nn.Linear(X.size()[1], 4)
net = net.to('cuda:0')
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

batchSize = [1, 2, 4, 8]

for bs in batchSize:
    loader = DataLoader(ds, batch_size=bs, shuffle=True)

    train_losses = []
    valid_losses = []
    train_accs = []
    valid_accs = []
def main():
    parser = argparse.ArgumentParser()
    
    ## Required parameters
    parser.add_argument("--corpus_type",
                        default="mixed",
                        type=str,
                        required=True,
                        help="Corpus type, mixed or categories")
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir. Should contain.pkl files, "
                             "named: train_data.pkl, dev_data.pkl, "
                             "test_data.pkl and mlb.pkl (e.g. as in "
                             "`exps-data/data`).")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: "
                             "bert-base-german-cased, bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, "
                             "bert-large-cased, bert-base-multilingual-uncased, "
                             "bert-base-multilingual-cased, "
                             "bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model "
                             "predictions and checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--use_data",
                        default="orig",
                        type=str,
                        help="Original DE, tokenized DE or tokenized EN.")
    parser.add_argument("--cache_dir",
                        default="",
                        type=str,
                        help="Where do you want to store the pre-trained "
                             "models downloaded from s3")
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after "
                             "WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, "
                             "and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Set this flag if using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--loss_fct",
                        default="bbce",
                        type=str,
                        help="Loss function to use BCEWithLogitsLoss (`bbce`)")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion",
                        default=0.1,
                        type=float,
                        help="Proportion of training to perform linear "
                             "learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumulate before "
                             "performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead "
                             "of 32-bit")
    parser.add_argument('--loss_scale',
                        type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. "
                             "Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.")
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/
        # debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()
    
    processors = {
        "nts": NTSTaskProcessor
    }
    
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and
                                        not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of
        # synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - '
                                 '%(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0]
                        else logging.WARN)
    
    logger.info("device: {} n_gpu: {}, distributed training: {}, "
                "16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))
    
    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                            args.gradient_accumulation_steps))
    
    args.train_batch_size = args.train_batch_size // \
                            args.gradient_accumulation_steps
    
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
    
    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must "
                         "be True.")
    
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) \
            and args.do_train:
        raise ValueError("Output directory ({}) already exists and is not "
                         "empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    
    task_name = args.task_name.lower()
    
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))
    
    processor = processors[task_name](args.data_dir,
                                      args.corpus_type, use_data=args.use_data)
    pos_weight = torch.tensor(processor.pos_weight, requires_grad=False,
                              dtype=torch.float, device=device)
    label_list = processor.get_labels()
    num_labels = len(label_list)
    
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)
    
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = processor.get_train_examples()
        num_train_optimization_steps = int(
            len(train_examples) /
            args.train_batch_size /
            args.gradient_accumulation_steps
        ) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // \
                                           torch.distributed.get_world_size()
    
    # Prepare model
    cache_dir = args.cache_dir if args.cache_dir else os.path.join(
        str(PYTORCH_PRETRAINED_BERT_CACHE),
        'distributed_{}'.format(args.local_rank))
    model = BertForMultiLabelSequenceClassification.from_pretrained(
        args.bert_model,
        cache_dir=cache_dir,
        num_labels=num_labels,
        loss_fct=args.loss_fct
    )
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https:/"
                              "/www.github.com/nvidia/apex to use distributed "
                              "and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)
    
    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    if args.do_train:
        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from "
                                  "https://www.github.com/nvidia/apex to use "
                                  "distributed and fp16 training.")
            
            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps,
                                 schedule='warmup_cosine')
    
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0

    def eval(epoch=None):
        eval_examples = processor.get_dev_examples()
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_doc_ids = torch.tensor([f.guid for f in eval_features],
                                   dtype=torch.long)
        
        # output_mode == "classification":
        all_label_ids = torch.tensor([f.label_ids for f in eval_features],
                                     dtype=torch.float)
        all_label_ids = all_label_ids.view(-1, num_labels)
        
        eval_data = TensorDataset(all_input_ids,
                                  all_input_mask,
                                  all_segment_ids,
                                  all_label_ids,
                                  all_doc_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        ids = []
        # FIXME: make it flexible to accept path
        all_ids_dev = read_ids(os.path.join(args.data_dir,
                                            "ids_development.txt"))
        
        for input_ids, input_mask, segment_ids, label_ids, doc_ids in \
                tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            doc_ids = doc_ids.to(device)
            
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)
            
            # create eval loss and other metric required by the task
            # output_mode == "classification":
            loss_fct = BCEWithLogitsLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                     label_ids.view(-1, num_labels))
            
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
            if len(ids) == 0:
                ids.append(doc_ids.detach().cpu().numpy())
            else:
                ids[0] = np.append(
                    ids[0], doc_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        ids = ids[0]
        preds = sigmoid(preds[0])
        preds = (preds > 0.5).astype(int)
        
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss/nb_tr_steps if args.do_train else None

        result['train_loss'] = loss
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write('\n')
        
        with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"),
                  "rb") as rf:
            mlb = pkl.load(rf)
        preds = [mlb.classes_[preds[i, :].astype(bool)].tolist()
                 for i in range(preds.shape[0])]
        id2preds = {val:preds[i] for i, val in enumerate(ids)}
        preds = [id2preds[val] if val in id2preds else []
                 for i, val in enumerate(all_ids_dev)]
        
        with open(os.path.join(args.output_dir, f"preds_development"
                                                f"{epoch}.txt"),
                  "w") as wf:
            for idx, doc_id in enumerate(all_ids_dev):
                line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n"
                wf.write(line)

    def predict(epoch=None):
        test_examples = processor.get_test_examples()
        test_features = convert_examples_to_features(
            test_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(test_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in test_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in test_features],
                                       dtype=torch.long)
        all_doc_ids = torch.tensor([f.guid for f in test_features],
                                   dtype=torch.long)
        
        test_data = TensorDataset(all_input_ids,
                                  all_input_mask,
                                  all_segment_ids,
                                  all_doc_ids)
        # Run prediction for full data
        test_sampler = SequentialSampler(test_data)
        test_dataloader = DataLoader(test_data,
                                     sampler=test_sampler,
                                     batch_size=args.eval_batch_size)
        
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        ids = []
        # FIXME: make it flexible to accept path
        all_ids_test = read_ids(os.path.join(args.data_dir, "ids_testing.txt"))
        
        for input_ids, input_mask, segment_ids, doc_ids in \
                tqdm(test_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            doc_ids = doc_ids.to(device)
            
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)
            
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
            if len(ids) == 0:
                ids.append(doc_ids.detach().cpu().numpy())
            else:
                ids[0] = np.append(
                    ids[0], doc_ids.detach().cpu().numpy(), axis=0)
        
        ids = ids[0]
        preds = sigmoid(preds[0])
        preds = (preds > 0.5).astype(int)
        id2preds = {val:preds[i] for i, val in enumerate(ids)}
        
        for i, val in enumerate(all_ids_test):
            if val not in id2preds:
                id2preds[val] = []
        
        with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"),
                  "rb") as rf:
            mlb = pkl.load(rf)

        preds = [mlb.classes_[preds[i, :].astype(bool)].tolist()
                 for i in range(preds.shape[0])]
        id2preds = {val:preds[i] for i, val in enumerate(ids)}
        preds = [id2preds[val] if val in id2preds else []
                 for i, val in enumerate(all_ids_test)]
        
        with open(os.path.join(args.output_dir, f"preds_test{epoch}.txt"),
                  "w") as\
                wf:
            for idx, doc_id in enumerate(all_ids_test):
                line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n"
                wf.write(line)
    
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        
        # output_mode == "classification":
        all_label_ids = torch.tensor([f.label_ids for f in train_features],
                                     dtype=torch.float)
        all_label_ids = all_label_ids.view(-1, num_labels)
        
        train_data = TensorDataset(all_input_ids,
                                   all_input_mask,
                                   all_segment_ids,
                                   all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)
        
        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader,
                                              desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch
                
                # define a new function to compute loss values for both
                # output_modes
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                
                # if output_mode == "classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits.view(-1, num_labels),
                                label_ids.view(-1, num_labels))

                if n_gpu > 1:
                    loss = loss.mean() # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used that handles
                        # this automatically
                        lr_this_step = args.learning_rate * \
                                       warmup_linear.get_lr(
                            global_step/num_train_optimization_steps,
                            args.warmup_proportion
                        )
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            eval(epoch=epoch)
            predict(epoch=epoch)

            # save checkpoints
            # Save a trained model, configuration and tokenizer
            # model_to_save = model.module if hasattr(model,
            #                                         'module') else model
            # # If we save using the predefined names, we can load using
            # # `from_pretrained`
            # os.makedirs(f"{args.output_dir}/{epoch}")
            # output_model_file = os.path.join(f"{args.output_dir}/{epoch}", "
            #                                  f"WEIGHTS_NAME)
            # output_config_file = os.path.join(f"{args.output_dir}/{epoch}",
            # CONFIG_NAME)
            #
            # torch.save(model_to_save.state_dict(), output_model_file)
            # model_to_save.config.to_json_file(output_config_file)
            # tokenizer.save_vocabulary(f"{args.output_dir}/{epoch}")
            # end save checkpoints
    
    if args.do_train and (args.local_rank == -1 or
                          torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model
        
        # If we save using the predefined names, we can load using
        # `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
        
        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)
        
        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForMultiLabelSequenceClassification.from_pretrained(
            args.output_dir,
            num_labels=num_labels)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir,
            do_lower_case=args.do_lower_case)
    else:
        model = BertForMultiLabelSequenceClassification.from_pretrained(
            args.bert_model,
            num_labels=num_labels)
    model.to(device)
    
    if args.do_eval and (args.local_rank == -1 or
                         torch.distributed.get_rank() == 0):
        eval()
        predict()
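
# Script entry point (assumed; typically how this module would be invoked):
# if __name__ == "__main__":
#     main()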
Beispiel #27
0
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

"""# Preprocessing of data to feed to BERT"""

# Run `preprocessing_for_bert` on the prediction set
print('Tokenizing data...')
pre_inputs, pre_masks = preprocessing_for_bert(df['reviews.text'])

# Create the DataLoader for our prediction set
pre_dataset = TensorDataset(pre_inputs, pre_masks)
pre_sampler = SequentialSampler(pre_dataset)
pre_dataloader = DataLoader(pre_dataset, sampler=pre_sampler, batch_size=10)

"""# **Load the saved BERT trained Model**"""

model=torch.load("/content/drive/MyDrive/FYP Datasets/trained_model1.pth")

"""# **Testing**

## Get predictions
"""

#Compute predicted probabilities on the test set
probs = bert_predict(model, pre_dataloader)
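
# Next step (assumed): convert the predicted probabilities into class labels, e.g.
#   preds = np.argmax(probs, axis=1)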
    def eval(epoch=None):
        eval_examples = processor.get_dev_examples()
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_doc_ids = torch.tensor([f.guid for f in eval_features],
                                   dtype=torch.long)
        
        # output_mode == "classification":
        all_label_ids = torch.tensor([f.label_ids for f in eval_features],
                                     dtype=torch.float)
        all_label_ids = all_label_ids.view(-1, num_labels)
        
        eval_data = TensorDataset(all_input_ids,
                                  all_input_mask,
                                  all_segment_ids,
                                  all_label_ids,
                                  all_doc_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        
        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        ids = []
        # FIXME: make it flexible to accept path
        all_ids_dev = read_ids(os.path.join(args.data_dir,
                                            "ids_development.txt"))
        
        for input_ids, input_mask, segment_ids, label_ids, doc_ids in \
                tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)
            doc_ids = doc_ids.to(device)
            
            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask, labels=None)
            
            # Compute the eval loss; for this multi-label task it is binary
            # cross-entropy with logits over all labels.
            loss_fct = BCEWithLogitsLoss()
            tmp_eval_loss = loss_fct(logits.view(-1, num_labels),
                                     label_ids.view(-1, num_labels))
            
            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
            else:
                preds[0] = np.append(
                    preds[0], logits.detach().cpu().numpy(), axis=0)
            if len(ids) == 0:
                ids.append(doc_ids.detach().cpu().numpy())
            else:
                ids[0] = np.append(
                    ids[0], doc_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        ids = ids[0]
        preds = sigmoid(preds[0])
        preds = (preds > 0.5).astype(int)
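        # Thresholding the sigmoid outputs at 0.5 turns per-label probabilities
        # into a binary indicator matrix of shape (num_examples, num_labels).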
        
        result = compute_metrics(task_name, preds, all_label_ids.numpy())
        loss = tr_loss/nb_tr_steps if args.do_train else None

        result['train_loss'] = loss
        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
            writer.write('\n')
        
        with open(os.path.join(args.data_dir, f"mlb_{args.corpus_type}.pkl"),
                  "rb") as rf:
            mlb = pkl.load(rf)
        preds = [mlb.classes_[preds[i, :].astype(bool)].tolist()
                 for i in range(preds.shape[0])]
        id2preds = {val: preds[i] for i, val in enumerate(ids)}
        preds = [id2preds[val] if val in id2preds else []
                 for val in all_ids_dev]
        
        with open(os.path.join(args.output_dir,
                               f"preds_development{epoch}.txt"), "w") as wf:
            for idx, doc_id in enumerate(all_ids_dev):
                line = str(doc_id) + "\t" + "|".join(preds[idx]) + "\n"
                wf.write(line)
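    # The `sigmoid` and `read_ids` helpers used in eval() above are not shown
    # in this snippet; minimal sketches of what they could look like
    # (assumptions, not the original code):
    #
    #   def sigmoid(x):
    #       return 1.0 / (1.0 + np.exp(-x))    # element-wise over a numpy array
    #
    #   def read_ids(path):
    #       # assumes one integer document id per line
    #       with open(path) as f:
    #           return [int(line.strip()) for line in f if line.strip()]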
    def create_dataloader(self):
        data = DATAMultiWOZ(
            debug=False,
            data_dir=self.data_dir,
        )
        train_examples = data.read_examples(
            os.path.join(self.data_dir, 'train.json'))
        train_features = data.convert_examples_to_features(
            train_examples, self.tokenizer, self.max_seq_length)
        all_input_ids = torch.tensor(data.select_field(train_features,
                                                       'input_ids'),
                                     dtype=torch.long)
        all_input_mask = torch.tensor(data.select_field(
            train_features, 'input_mask'),
                                      dtype=torch.long)
        all_segment_ids = torch.tensor(data.select_field(
            train_features, 'segment_ids'),
                                       dtype=torch.long)
        all_utterance_mask = torch.tensor(data.select_field(
            train_features, 'utterance_mask'),
                                          dtype=torch.long)
        all_domainslot_mask = torch.tensor(data.select_field(
            train_features, 'domainslot_mask'),
                                           dtype=torch.long)
        all_label_tokens_start = torch.tensor(
            [f.label_tokens_start for f in train_features], dtype=torch.long)
        all_label_tokens_end = torch.tensor(
            [f.label_tokens_end for f in train_features], dtype=torch.long)
        all_label_sentence_domainslot = torch.tensor(
            [f.label_sentence_domainslot for f in train_features],
            dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_utterance_mask,
                                   all_domainslot_mask, all_label_tokens_start,
                                   all_label_tokens_end,
                                   all_label_sentence_domainslot)

        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=self.train_batch_size)

        eval_examples = data.read_examples(
            os.path.join(self.data_dir, 'test.json'))
        eval_features = data.convert_examples_to_features(
            eval_examples, self.tokenizer, self.max_seq_length)
        eval_input_ids = torch.tensor(data.select_field(
            eval_features, 'input_ids'),
                                      dtype=torch.long)
        eval_input_mask = torch.tensor(data.select_field(
            eval_features, 'input_mask'),
                                       dtype=torch.long)
        eval_segment_ids = torch.tensor(data.select_field(
            eval_features, 'segment_ids'),
                                        dtype=torch.long)
        eval_utterance_mask = torch.tensor(data.select_field(
            eval_features, 'utterance_mask'),
                                           dtype=torch.long)
        eval_domainslot_mask = torch.tensor(data.select_field(
            eval_features, 'domainslot_mask'),
                                            dtype=torch.long)
        eval_label_tokens_start = torch.tensor(
            [f.label_tokens_start for f in eval_features], dtype=torch.long)
        eval_label_tokens_end = torch.tensor(
            [f.label_tokens_end for f in eval_features], dtype=torch.long)
        eval_label_sentence_domainslot = torch.tensor(
            [f.label_sentence_domainslot for f in eval_features],
            dtype=torch.long)

        eval_data = TensorDataset(eval_input_ids, eval_input_mask,
                                  eval_segment_ids, eval_utterance_mask,
                                  eval_domainslot_mask,
                                  eval_label_tokens_start,
                                  eval_label_tokens_end,
                                  eval_label_sentence_domainslot)
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=self.eval_batch_size)

        return train_dataloader, eval_dataloader, train_examples, eval_examples
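    # A minimal usage sketch for the loaders returned above (hypothetical
    # training-loop skeleton, not part of the original class):
    #
    #   train_loader, eval_loader, _, _ = self.create_dataloader()
    #   for batch in train_loader:
    #       (input_ids, input_mask, segment_ids, utterance_mask,
    #        domainslot_mask, label_tokens_start, label_tokens_end,
    #        label_sentence_domainslot) = batch
    #       ...  # forward pass and loss computation go here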
def main():
    # Training settings
    def strpair(arg):
        p = tuple(arg.split(':'))
        if len(p) == 1:
            p = p + p
        return p
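    # e.g. strpair('layer4:block4') -> ('layer4', 'block4'),
    #      strpair('layer4')        -> ('layer4', 'layer4')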

    parser = argparse.ArgumentParser(
        description='Ablation eval',
        epilog=textwrap.dedent(help_epilog),
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument('--model',
                        type=str,
                        default=None,
                        help='constructor for the model to test')
    parser.add_argument('--pthfile',
                        type=str,
                        default=None,
                        help='filename of .pth file for the model')
    parser.add_argument('--outdir',
                        type=str,
                        default='dissect',
                        required=True,
                        help='directory for dissection output')
    parser.add_argument('--layers',
                        type=strpair,
                        nargs='+',
                        help='space-separated list of layer names to edit' +
                        ', in the form layername[:reportedname]')
    parser.add_argument('--classes',
                        type=str,
                        nargs='+',
                        help='space-separated list of class names to ablate')
    parser.add_argument('--metric',
                        type=str,
                        default='iou',
                        help='ordering metric for selecting units')
    parser.add_argument('--unitcount',
                        type=int,
                        default=30,
                        help='number of units to ablate')
    parser.add_argument('--segmenter',
                        type=str,
                        help='directory containing segmentation dataset')
    parser.add_argument('--netname',
                        type=str,
                        default=None,
                        help='name for network in generated reports')
    parser.add_argument('--batch_size',
                        type=int,
                        default=5,
                        help='batch size for forward pass')
    parser.add_argument('--size',
                        type=int,
                        default=200,
                        help='number of images to test')
    parser.add_argument('--no-cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA usage')
    parser.add_argument('--quiet',
                        action='store_true',
                        default=False,
                        help='silences console output')
    if len(sys.argv) == 1:
        parser.print_usage(sys.stderr)
        sys.exit(1)
    args = parser.parse_args()

    # Set up console output
    verbose_progress(not args.quiet)

    # Set up CUDA and enable cudnn autotuning to speed up PyTorch
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # Take defaults for model constructor etc from dissect.json settings.
    with open(os.path.join(args.outdir, 'dissect.json')) as f:
        dissection = EasyDict(json.load(f))
    if args.model is None:
        args.model = dissection.settings.model
    if args.pthfile is None:
        args.pthfile = dissection.settings.pthfile
    if args.segmenter is None:
        args.segmenter = dissection.settings.segmenter

    # Instantiate generator
    model = create_instrumented_model(args, gen=True, edit=True)
    if model is None:
        print('No model specified')
        sys.exit(1)

    # Query the model's device and expected input shape
    device = next(model.parameters()).device
    input_shape = model.input_shape

    # 4d input if convolutional, 2d input if first layer is linear.
    raw_sample = standard_z_sample(args.size, input_shape[1],
                                   seed=2).view((args.size, ) +
                                                input_shape[1:])
    dataset = TensorDataset(raw_sample)
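    # For illustration (shapes are assumptions, not measured here): with a
    # convolutional generator whose input_shape is (1, 512, 1, 1) and
    # args.size == 200, raw_sample has shape (200, 512, 1, 1); with a linear
    # first layer and input_shape (1, 512) it would be (200, 512).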

    # Create the segmenter
    segmenter = autoimport_eval(args.segmenter)

    # Now do the actual work.
    labelnames, catnames = segmenter.get_label_and_category_names(dataset)
    label_category = [
        catnames.index(c) if c in catnames else 0 for l, c in labelnames
    ]
    labelnum_from_name = {n[0]: i for i, n in enumerate(labelnames)}

    segloader = torch.utils.data.DataLoader(dataset,
                                            batch_size=args.batch_size,
                                            num_workers=10,
                                            pin_memory=(device.type == 'cuda'))

    # Index the dissection layers by layer name.
    dissect_layer = {lrec.layer: lrec for lrec in dissection.layers}

    # Clear any existing ablations so measurements start from a clean baseline
    for l in model.ablation:
        model.ablation[l] = None

    # For each sort-order, do an ablation
    progress = default_progress()
    for classname in progress(args.classes):
        post_progress(c=classname)
        for layername in progress(model.ablation):
            post_progress(l=layername)
            rankname = '%s-%s' % (classname, args.metric)
            classnum = labelnum_from_name[classname]
            try:
                ranking = next(r for r in dissect_layer[layername].rankings
                               if r.name == rankname)
            except (KeyError, StopIteration):
                print('%s not found' % rankname)
                sys.exit(1)
            ordering = numpy.argsort(ranking.score)
            # Check if already done
            ablationdir = os.path.join(args.outdir, layername, 'pixablation')
            if os.path.isfile(os.path.join(ablationdir, '%s.json' % rankname)):
                with open(os.path.join(ablationdir,
                                       '%s.json' % rankname)) as f:
                    data = EasyDict(json.load(f))
                # If the unit ordering is not the same, something is wrong
                if not all(a == o
                           for a, o in zip(data.ablation_units, ordering)):
                    continue
                if len(data.ablation_effects) >= args.unitcount:
                    continue  # file already done.
                measurements = data.ablation_effects
            measurements = measure_ablation(segmenter, segloader, model,
                                            classnum, layername,
                                            ordering[:args.unitcount])
            measurements = measurements.cpu().numpy().tolist()
            os.makedirs(ablationdir, exist_ok=True)
            with open(os.path.join(ablationdir, '%s.json' % rankname),
                      'w') as f:
                json.dump(
                    dict(classname=classname,
                         classnum=classnum,
                         baseline=measurements[0],
                         layer=layername,
                         metric=args.metric,
                         ablation_units=ordering.tolist(),
                         ablation_effects=measurements[1:]), f)
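# Example invocation (hypothetical script name and paths; a dissect.json from
# a previous dissection run is expected inside --outdir):
#
#   python ablation_eval.py --outdir dissect/churchoutdoor \
#       --classes tree door --metric iou --unitcount 30 --batch_size 5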