Example #1
def prepare_data(self):
    """Called to initialize data. Use the call to construct features."""
    args = self.hparams
    for mode in ["train", "dev", "test"]:
        cached_features_file = self._feature_file(mode)
        if not os.path.exists(cached_features_file):
            logger.info("Creating features from dataset file at %s",
                        args.data_dir)
            examples = read_examples_from_file(args.data_dir, mode)
            features = convert_examples_to_features(
                examples,
                self.labels,
                args.max_seq_length,
                self.tokenizer,
                cls_token_at_end=bool(args.model_type in ["xlnet"]),
                cls_token=self.tokenizer.cls_token,
                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
                sep_token=self.tokenizer.sep_token,
                sep_token_extra=bool(args.model_type in ["roberta"]),
                pad_on_left=bool(args.model_type in ["xlnet"]),
                pad_token=self.tokenizer.convert_tokens_to_ids(
                    [self.tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
                pad_token_label_id=self.pad_token_label_id,
            )
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)
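
The `_feature_file` helper referenced above is not shown in this snippet. A minimal sketch of what it plausibly looks like, assuming it follows the same "cached_{mode}_{model}_{max_seq_length}" naming convention the standalone examples below use (the hparams field names are guesses):

def _feature_file(self, mode):
    # Hypothetical helper: derive the cache path from the run configuration,
    # mirroring the cache-file naming used by the other examples on this page.
    return os.path.join(
        self.hparams.data_dir,
        "cached_{}_{}_{}".format(mode, self.hparams.model_type,
                                 str(self.hparams.max_seq_length)),
    )
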
Example #2
def load_and_cache_examples(self, cit_strings, tokenizer, labels, pad_token_label_id, mode):
    examples = self.read_examples(cit_strings, mode)
    features = convert_examples_to_features(examples, labels, self.max_seq_length, tokenizer,
                                            cls_token_at_end=bool(self.model_type in ["xlnet"]),
                                            # xlnet has a cls token at the end
                                            cls_token=tokenizer.cls_token,
                                            cls_token_segment_id=2 if self.model_type in ["xlnet"] else 0,
                                            sep_token=tokenizer.sep_token,
                                            sep_token_extra=bool(self.model_type in ["roberta"]),
                                            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                            pad_on_left=bool(self.model_type in ["xlnet"]),
                                            # pad on the left for xlnet
                                            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                            pad_token_segment_id=4 if self.model_type in ["xlnet"] else 0,
                                            pad_token_label_id=pad_token_label_id
                                            )
    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    print('#' * 10, 'Finished preparing dataset', '#' * 10)
    print('The shape of the input dataset:', all_input_ids.shape)
    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    return dataset
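
Every PyTorch example on this page ends the same way: a TensorDataset meant to be wrapped in a DataLoader. A minimal consumption sketch (the instance name `ner`, the sampler choice, and the batch size are assumptions, not part of the example above):

from torch.utils.data import DataLoader, RandomSampler

train_dataset = ner.load_and_cache_examples(cit_strings, tokenizer, labels,
                                            pad_token_label_id, mode="train")
train_loader = DataLoader(train_dataset,
                          sampler=RandomSampler(train_dataset),
                          batch_size=32)
for input_ids, input_mask, segment_ids, label_ids in train_loader:
    ...  # tensors unpack in the order they were packed into the TensorDataset
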
Example #3
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    if args.local_rank not in [-1, 0] and mode == "train":
        # Make sure only the first process in distributed training builds the
        # dataset; the other ranks will wait here and then use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir,
        "cached_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)),
    )
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=pad_token_label_id,
        )
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == "train":
        # Rank 0 has built the cache; release the waiting ranks.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features],
                                 dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
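
The paired barrier calls in this example implement a "rank 0 builds, the rest wait" idiom: every rank calls torch.distributed.barrier() exactly once, so the two call sites together form a single synchronization point. A minimal sketch of the pattern in isolation, assuming the default process group is already initialized (the function name and arguments are illustrative):

import torch

def build_once_per_node(local_rank, build_fn):
    if local_rank not in [-1, 0]:
        torch.distributed.barrier()  # non-zero ranks block here until rank 0 is done
    # Rank 0 (or single-process mode) reaches this line first and builds the
    # cache; the other ranks only get here after being released below, so
    # their build_fn() call finds the cache and simply loads it.
    result = build_fn()
    if local_rank == 0:
        torch.distributed.barrier()  # rank 0's call releases the waiting ranks
    return result
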
Example #4
def load_and_cache_examples(tokenizer, labels):

    examples = read_examples_from_file('./', mode='test')

    features = convert_examples_to_features(
        examples,
        labels,
        128,
        tokenizer,
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=0,
        sep_token=tokenizer.sep_token,
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0])

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features],
                                 dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #5
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
    if args.local_rank not in [-1, 0] and mode == "train":
        # Make sure only the first process in distributed training builds the
        # dataset; the other ranks will wait here and then use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args.data_dir, "cached_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", args.data_dir)
        examples = read_examples_from_file(args.data_dir, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=False,
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            pad_on_left=False,
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0,
            pad_token_label_id=pad_token_label_id)
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(features, cached_features_file)

    if args.local_rank == 0 and mode == "train":
        # Rank 0 has built the cache; release the waiting ranks.
        torch.distributed.barrier()

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features],
                                 dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                            all_label_ids)
    return dataset
Example #6
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id,
                            batch_size, mode):
    drop_remainder = bool(args["tpu"] or mode == "train")

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        args["data_dir"],
        "cached_{}_{}_{}.tf_record".format(
            mode,
            list(filter(None, args["model_name_or_path"].split("/"))).pop(),
            str(args["max_seq_length"])),
    )
    if os.path.exists(cached_features_file) and not args["overwrite_cache"]:
        logging.info("Loading features from cached file %s",
                     cached_features_file)
        dataset, size = load_cache(cached_features_file,
                                   args["max_seq_length"])
    else:
        logging.info("Creating features from dataset file at %s",
                     args["data_dir"])
        examples = read_examples_from_file(args["data_dir"], mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args["max_seq_length"],
            tokenizer,
            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args["model_type"] in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args["model_type"] in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
        logging.info("Saving features into cached file %s",
                     cached_features_file)
        save_cache(features, cached_features_file)
        dataset, size = load_cache(cached_features_file,
                                   args["max_seq_length"])

    if mode == "train":
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=8192, seed=args["seed"])

    dataset = dataset.batch(batch_size, drop_remainder)
    dataset = dataset.prefetch(buffer_size=batch_size)

    return dataset, size
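
Unlike the PyTorch variants, this example returns a tf.data pipeline together with its size, so the caller has to derive the step count itself (dataset.repeat() makes the training stream infinite). A hedged usage sketch, assuming load_cache yields (features, labels) pairs as in Example #13's _decode_record and that a batch size of 32 is acceptable:

batch_size = 32
train_dataset, train_size = load_and_cache_examples(
    args, tokenizer, labels, pad_token_label_id, batch_size, mode="train")
steps_per_epoch = train_size // batch_size
for step, (batch_features, batch_label_ids) in enumerate(train_dataset):
    if step >= steps_per_epoch:
        break
    ...  # one training step on this batch
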
Example #7
def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, lang, mode):
    data_path = os.path.join(args.data_dir, lang)
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(data_path, "cached_{}_{}_{}".format(mode,
                            list(filter(None, args.model_name_or_path.split("/"))).pop(), str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s.", cached_features_file)
        features = torch.load(cached_features_file)
    else:
        logger.info("Creating features from dataset file at %s", data_path)
        examples = read_examples_from_file(data_path, mode)
        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
                                                cls_token_at_end=False,
                                                # xlnet has a cls token at the end
                                                cls_token=tokenizer.cls_token,
                                                cls_token_segment_id=0,
                                                sep_token=tokenizer.sep_token,
                                                sep_token_extra=False,
                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                                                pad_on_left=False,
                                                # pad on the left for xlnet
                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
                                                pad_token_segment_id=0,
                                                pad_token_label_id=pad_token_label_id
                                                )

        logger.info("Saving features into cached file %s", cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

    return dataset
Example #8
def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
    #eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    
    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)
    # Eval!
    logger.info("***** Running evaluation %s *****", prefix)
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    preds = None
    out_label_ids = None
    model.eval()

    if args.local_rank not in [-1, 0] and mode == "train":
        # Make sure only the first process in distributed training builds the
        # features; the other ranks will wait here and then use the cache.
        torch.distributed.barrier()

    # Load data features from cache or dataset file

    logger.info("Creating features from dataset file at %s", args.data_dir)
    examples = read_examples_from_file(args.data_dir, mode)
    print(len(examples))
    print(examples[0])  # list of words in one document (sentence)

    out_label_listX=[]
    preds_listX=[]

    for example in tqdm(examples, desc="Evaluating"):
        max_length = 500
        min_context = 128
        num_words = len(example.words)  # TODO: number of segments for each word? word_tokens = tokenizer.tokenize(word)
        word_tokens_lengths = [len(tokenizer.tokenize(word)) for word in example.words]

        ws = windows(word_tokens_lengths, max_length, min_context)
        print(ws)
        
        text_examples = []
        for start_all, start_content, end_content, end_all in ws:
            ex = InputExample(guid=example.guid, words=example.words[start_all:end_all], labels=example.labels[start_all:end_all])
            text_examples.append(ex)

        # the example has to be split into windows before featurization
        features = convert_examples_to_features(
            text_examples,
            labels,
            512, #args.max_seq_length,
            tokenizer,
            cls_token_at_end=bool(args.model_type in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args.model_type in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args.model_type in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
            pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
            pad_token_label_id=pad_token_label_id,
        )
    
        if args.local_rank == 0 and mode == "train":
            # Rank 0 has built the features; release the waiting ranks.
            torch.distributed.barrier()
    
        # Convert to Tensors and build dataset
        all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    
        eval_dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    
        # Note that DistributedSampler samples randomly
        eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
        eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)

        # Eval!
        logger.info("  Num examples = %d", len(eval_dataset))
        
        # batch = next(iter(eval_dataloader))
        a = []
        b = []
        for batch, (start_all, start_content, end_content, end_all) in tqdm(zip(eval_dataloader, ws), desc="Evaluating"):
            preds = None
            out_label_ids = None
            batch = tuple(t.to(args.device) for t in batch)
    
            with torch.no_grad():
                inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
                if args.model_type != "distilbert":
                    inputs["token_type_ids"] = (
                        batch[2] if args.model_type in ["bert", "xlnet"] else None
                    )  # XLM and RoBERTa don't use segment_ids
                outputs = model(**inputs)
                tmp_eval_loss, logits = outputs[:2]
    
                if args.n_gpu > 1:
                    tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
    
                eval_loss += tmp_eval_loss.item()
            nb_eval_steps += 1
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
        
            
            preds = np.argmax(preds, axis=2)
        
            label_map = {i: label for i, label in enumerate(labels)}
        
            out_label_list = [[] for _ in range(out_label_ids.shape[0])]
            preds_list = [[] for _ in range(out_label_ids.shape[0])]
        
            for i in range(out_label_ids.shape[0]):
                for j in range(out_label_ids.shape[1]):
                    if out_label_ids[i, j] != pad_token_label_id:
                        out_label_list[i].append(label_map[out_label_ids[i][j]])
                        preds_list[i].append(label_map[preds[i][j]])
    
            # join: keep only the content region of each window
            for i in range(len(out_label_list)):
                a.extend(out_label_list[i][start_content-start_all:end_content-start_all])
                b.extend(preds_list[i][start_content-start_all:end_content-start_all])
    
        out_label_listX.append(a)
        preds_listX.append(b)
        # results = {
        #     "loss": eval_loss,
        #     "precision": precision_score(out_label_list, preds_list),
        #     "recall": recall_score(out_label_list, preds_list),
        #     "f1": f1_score(out_label_list, preds_list),
        # }

    eval_loss = eval_loss / nb_eval_steps

    try:
        results = {
            "loss": eval_loss,
            "precision": precision_score(out_label_listX, preds_listX),
            "recall": recall_score(out_label_listX, preds_listX),
            "f1": f1_score(out_label_listX, preds_listX),
        }
        
        logger.info("***** Eval results %s *****", prefix)
        for key in sorted(results.keys()):
            logger.info("  %s = %s", key, str(results[key]))
        
        return results, preds_listX
    except IndexError:  # no output labels in file
        return {}, preds_listX
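
The `windows` helper that drives the sliding-window evaluation above is not shown. A sketch of one plausible implementation, under the assumption that it returns (start_all, start_content, end_content, end_all) word indices where [start_all, end_all) is the span that gets featurized and [start_content, end_content) is the span whose predictions are kept, which is exactly how the loop slices its outputs; the overlap policy itself is a guess:

def windows(word_tokens_lengths, max_length, min_context):
    """Yield (start_all, start_content, end_content, end_all) word indices."""
    n = len(word_tokens_lengths)
    spans = []
    start_content = 0
    while start_content < n:
        # extend left so the content begins with min_context sub-tokens of context
        start_all, ctx = start_content, 0
        while start_all > 0 and ctx < min_context:
            start_all -= 1
            ctx += word_tokens_lengths[start_all]
        # extend right until the max_length sub-token budget is spent
        total = sum(word_tokens_lengths[start_all:start_content])
        end_all = start_content
        while end_all < n and total + word_tokens_lengths[end_all] <= max_length:
            total += word_tokens_lengths[end_all]
            end_all += 1
        if end_all == start_content:  # a single word longer than the budget
            end_all += 1
        # trim the right edge of the content so it also has context,
        # unless the window already reaches the end of the document
        end_content = end_all
        if end_all < n:
            ctx = 0
            while end_content > start_content + 1 and ctx < min_context:
                end_content -= 1
                ctx += word_tokens_lengths[end_content]
        spans.append((start_all, start_content, end_content, end_all))
        start_content = end_content
    return spans

Note that the loop featurizes with max_seq_length 512 while budgeting only 500 sub-tokens per window, which leaves room for the special tokens the tokenizer adds.
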
Example #9
model.load_state_dict(
    torch.load(os.path.join(args.model_name_or_path, 'pytorch_model.bin'),
               map_location='cpu'))
model.to(device)

pad_token_label_id = CrossEntropyLoss().ignore_index
examples = read_examples_from_file(args.data_dir, mode)

features, all_tokens = convert_examples_to_features(
    examples,
    labels,
    args.max_seq_length,
    tokenizer,
    cls_token_at_end=bool(args.model_type in ["xlnet"]),
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
    sep_token=tokenizer.sep_token,
    sep_token_extra=bool(args.model_type in ["roberta"]),
    pad_on_left=bool(args.model_type in ["xlnet"]),
    pad_token=tokenizer.pad_token_id,
    pad_token_segment_id=tokenizer.pad_token_type_id,
    pad_token_label_id=pad_token_label_id,
)

all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features],
                               dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
Example #10
# setting the folder for possible whole directory tagging
files_dir = 'data/'

# getting labels
labels = get_labels('labels.txt')
pad_token_label_id = CrossEntropyLoss().ignore_index

# reading examples
examples = read_examples_from_file('.', 'test')
features = convert_examples_to_features(
    examples,
    labels,
    256,
    tokenizer,
    cls_token_at_end=False,
    cls_token=tokenizer.cls_token,
    cls_token_segment_id=0,
    sep_token=tokenizer.sep_token,
    sep_token_extra=False,
    pad_on_left=False,
    pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
    pad_token_segment_id=0,
    pad_token_label_id=pad_token_label_id)
all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
all_input_mask = torch.tensor([f.input_mask for f in features],
                              dtype=torch.long)
all_segment_ids = torch.tensor([f.segment_ids for f in features],
                               dtype=torch.long)
all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                        all_label_ids)
Example #11
def predict(args,
            data_list,
            model,
            tokenizer,
            labels,
            pad_token_label_id,
            mode,
            prefix="",
            label=None):
    """Receive a list of data and predict the label, suitable for receiving upstream data from cobot"""
    examples = read_examples_from_list(data_list, mode, label)
    features = convert_examples_to_features(
        examples,
        labels,
        args.max_seq_length,
        tokenizer,
        cls_token_at_end=bool(args.model_type in ["xlnet"]),
        # xlnet has a cls token at the end
        cls_token=tokenizer.cls_token,
        cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
        sep_token=tokenizer.sep_token,
        sep_token_extra=bool(args.model_type in ["roberta"]),
        # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
        pad_on_left=bool(args.model_type in ["xlnet"]),
        # pad on the left for xlnet
        pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
        pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
        pad_token_label_id=pad_token_label_id)
    preds = None
    out_label_ids = None
    model.eval()
    for feature in features:
        with torch.no_grad():
            input_length = len(feature.input_ids)
            inputs = {
                "input_ids":
                torch.tensor(feature.input_ids,
                             device=args.device).view(1, input_length),
                "attention_mask":
                torch.tensor(feature.input_mask,
                             device=args.device).view(1, input_length),
                "token_type_ids":
                torch.tensor(feature.segment_ids,
                             device=args.device).view(1, input_length),
                "labels":
                torch.tensor(feature.label_ids,
                             device=args.device).view(1, input_length)
            }
            outputs = model(**inputs)
            if args.model_type == "bert":
                _, logits = outputs
            else:
                _, logits, batch_preds = outputs

        if args.model_type == "bert":
            if preds is None:
                preds = logits.detach().cpu().numpy()
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs["labels"].detach().cpu().numpy(),
                    axis=0)
        elif args.model_type == "bert_crf":
            if preds is None:
                preds = np.array(batch_preds)
                out_label_ids = inputs["labels"].detach().cpu().numpy()
            else:
                preds = np.append(preds, np.array(batch_preds), axis=0)
                out_label_ids = np.append(
                    out_label_ids,
                    inputs["labels"].detach().cpu().numpy(),
                    axis=0)

    if args.model_type == "bert":
        preds = np.argmax(preds, axis=2)

    label_map = {i: label for i, label in enumerate(labels)}

    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
    preds_list = [[] for _ in range(out_label_ids.shape[0])]

    for i in range(out_label_ids.shape[0]):
        for j in range(out_label_ids.shape[1]):
            if out_label_ids[i, j] != pad_token_label_id:
                out_label_list[i].append(label_map[out_label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    return preds_list
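
A hedged usage sketch for this predict helper; the exact shape of data_list depends on read_examples_from_list, which is not shown, so the raw-string input below is an assumption, as are the setup names:

preds_list = predict(args, ["Ada Lovelace was born in London ."], model,
                     tokenizer, labels, pad_token_label_id, mode="test")
print(preds_list)  # e.g. [['B-PER', 'I-PER', 'O', 'O', 'O', 'B-LOC', 'O']]
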
Example #12
def load_and_cache_examples(args,
                            tokenizer,
                            labels,
                            pad_token_label_id,
                            lang,
                            mode,
                            plain_text=False):
    data_path = os.path.join(args.data_dir, lang)
    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        data_path, "cached_{}_{}_{}".format(
            mode,
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length)))
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s. Plain text: %s",
                    cached_features_file, plain_text)
        features = torch.load(cached_features_file)
    else:
        logger.info(
            "Creating features from dataset file at %s. Plain text: %s",
            data_path, plain_text)
        examples = read_examples_from_file(data_path, mode)
        features = convert_examples_to_features(
            examples,
            labels,
            args.max_seq_length,
            tokenizer,
            cls_token_at_end=False,
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=False,
            pad_on_left=False,
            # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0,
            pad_token_label_id=pad_token_label_id)

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(features, cached_features_file)

    # Convert to Tensors and build dataset
    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                   dtype=torch.long)

    if not plain_text:
        all_label_ids = torch.tensor([f.label_ids for f in features],
                                     dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
        return dataset
    else:
        # assert lang in args.src_langs and lang != args.tgt_lang
        language_id = args.src_langs.index(
            lang) if lang in args.src_langs else len(args.src_langs)
        all_language_id = torch.tensor([language_id] * len(features),
                                       dtype=torch.long)
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_language_id)
        return dataset
Example #13
def main(_):
    logging.set_verbosity(logging.INFO)
    args = flags.FLAGS.flag_values_dict()

    if args["fp16"]:
        tf.config.optimizer.set_experimental_options(
            {"auto_mixed_precision": True})

    if args["tpu"]:
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            tpu=args["tpu"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args["n_device"] = args["num_tpu_cores"]
    elif len(args["gpus"].split(",")) > 1:
        args["n_device"] = len(
            [f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
        strategy = tf.distribute.MirroredStrategy(
            devices=[f"/gpu:{gpu}" for gpu in args["gpus"].split(",")])
    elif args["no_cuda"]:
        args["n_device"] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")
    else:
        args["n_device"] = len(args["gpus"].split(","))
        strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" +
                                                   args["gpus"].split(",")[0])

    logging.warning(
        "n_device: %s, distributed training: %s, 16-bits training: %s",
        args["n_device"],
        bool(args["n_device"] > 1),
        args["fp16"],
    )

    labels = get_labels(args["labels"])
    pad_token_label_id = -1

    logging.info("predict parameters %s", args)
    tokenizer = AutoTokenizer.from_pretrained(
        args["output_dir"], do_lower_case=args["do_lower_case"])
    model = TFAutoModelForTokenClassification.from_pretrained(
        args["output_dir"])

    while True:
        print('Input a Chinese sentence:')
        line = str(input())
        if line == 'quit':
            break
        if len(line) < 1:
            print(
                'Please input a Chinese sentence or "quit" to break this loop:'
            )
            continue

        examples = read_examples_from_line(line)
        features = convert_examples_to_features(
            examples,
            labels,
            args["max_seq_length"],
            tokenizer,
            cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=bool(args["model_type"] in ["roberta"]),
            # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=bool(args["model_type"] in ["xlnet"]),
            # pad on the left for xlnet
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=tokenizer.pad_token_type_id,
            pad_token_label_id=pad_token_label_id,
        )

        feature = features[0]
        X = collections.OrderedDict()

        X["input_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.input_ids)))
        X["input_mask"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.input_mask)))
        X["segment_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.segment_ids)))
        X["label_ids"] = tf.train.Feature(int64_list=tf.train.Int64List(
            value=list(feature.label_ids)))
        tf_example = tf.train.Example(features=tf.train.Features(feature=X))
        tf_example = tf_example.SerializeToString()

        max_seq_length = args["max_seq_length"]
        name_to_features = {
            "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
            "label_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
        }

        def _decode_record(record):
            example = tf.io.parse_single_example(record, name_to_features)
            features = {}
            features["input_ids"] = example["input_ids"]
            features["input_mask"] = example["input_mask"]
            features["segment_ids"] = example["segment_ids"]
            return features, example["label_ids"]

        dataset = []
        dataset.append(tf_example)

        dataset = tf.data.Dataset.from_tensor_slices(dataset)
        dataset = dataset.map(_decode_record)

        batch_size = 1
        dataset = dataset.batch(batch_size)

        eval_features, eval_labels = iter(dataset).next()

        inputs = {
            "attention_mask": eval_features["input_mask"],
            "training": False
        }

        if args["model_type"] != "distilbert":
            inputs["token_type_ids"] = (eval_features["segment_ids"]
                                        if args["model_type"]
                                        in ["bert", "xlnet"] else None)

        with strategy.scope():
            logits = model(eval_features["input_ids"], **inputs)[0]
            active_loss = tf.reshape(eval_labels, (-1, )) != pad_token_label_id

        preds = logits.numpy()
        label_ids = eval_labels.numpy()

        preds = np.argmax(preds, axis=2)
        y_pred = [[] for _ in range(label_ids.shape[0])]

        for i in range(label_ids.shape[0]):
            for j in range(label_ids.shape[1]):
                if label_ids[i, j] != pad_token_label_id:
                    y_pred[i].append(labels[preds[i, j]])

        tokens = tokenizer.tokenize(line)
        print('## tokens = %s' % tokens)
        print('## y_pred = %s' % y_pred)
        print('## %s = %s' % (len(tokens), len(y_pred[0])))
        word_group = []
        subword = {}

        def _add_word(subword):
            word_group.append(subword['token'] + '/' + subword['flag'])
            subword.clear()

        for i, token in enumerate(tokens):
            flag = y_pred[0][i]
            print('## %s = %s' % (token, flag))
            if flag.startswith('B'):
                if len(subword) > 0:
                    _add_word(subword)
                subword['token'] = token
                subword['flag'] = flag
            elif flag.startswith('I'):
                if (len(subword) > 0 and (y_pred[0][i - 1].startswith('I')
                                          or y_pred[0][i - 1].startswith('B'))
                        and (y_pred[0][i - 1][1:] == flag[1:])):
                    subword['token'] = subword['token'] + token
                    continue
                elif len(subword) > 0:
                    _add_word(subword)
                subword['token'] = token
                subword['flag'] = flag
            else:
                if len(subword) > 0:
                    _add_word(subword)
                subword['token'] = token
                subword['flag'] = flag
                _add_word(subword)

        if len(subword) > 0:
            _add_word(subword)
        print('## word_group = %s' % word_group)
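
A small worked example of the B/I grouping loop above (the sentence and tags are made up): consecutive I- tokens whose type matches the running entity are concatenated into one word, and everything else is flushed as its own token/tag pair.

tokens = ['北', '京', '欢', '迎', '你']
y_pred = [['B-LOC', 'I-LOC', 'O', 'O', 'O']]
# the loop merges the B-LOC/I-LOC pair into one word and emits:
# word_group == ['北京/B-LOC', '欢/O', '迎/O', '你/O']
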
Example #14
def load_and_cache_examples(args, task, tokenizer, labels, pad_token_label_id,
                            mode):
    args.data_dir = args.base_data_dir + task.lower()
    try:
        # print("old args.data_dir: ", args.data_dir, flush=True)
        args.data_dir = args.data_dir.replace('/home/rizwan/.flair/datasets/',
                                              '/local/rizwan/UDTree/')
        # print("new args.data_dir: ", args.data_dir, flush=True)

        if args.local_rank not in [-1, 0] and mode == "train":
            # Make sure only the first process in distributed training builds
            # the dataset; the other ranks will wait here and use the cache.
            torch.distributed.barrier()

        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}".format(
                mode,
                list(filter(None, args.model_name_or_path.split("/"))).pop(),
                str(args.max_seq_length)),
        )
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info('-' * 120)
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            features = torch.load(cached_features_file)
            logger.info('-' * 120)
        else:
            logger.info("Creating features from dataset file at %s",
                        args.data_dir)
            examples = read_examples_from_file(args.data_dir, mode)
            features = convert_examples_to_features(
                examples,
                labels,
                args.max_seq_length,
                tokenizer,
                cls_token_at_end=bool(args.model_type in ["xlnet"]),
                # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                sep_token_extra=bool(args.model_type in ["roberta"]),
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=bool(args.model_type in ["xlnet"]),
                # pad on the left for xlnet
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
                pad_token_label_id=pad_token_label_id,
            )
            if args.local_rank in [-1, 0]:
                logger.info("Saving features into cached file %s",
                            cached_features_file)
                torch.save(features, cached_features_file)

        if args.local_rank == 0 and mode == "train":
            # Rank 0 has built the cache; release the waiting ranks.
            torch.distributed.barrier()

        # Data Shapley debug
        if args.data_size and mode == "train":
            if args.data_size < len(features):
                logger.info('-' * 50)
                logger.info(
                    f'original data size: {len(features)} truncated to {args.data_size}'
                )
                features = np.random.choice(features,
                                            args.data_size,
                                            replace=False)
                logger.info('-' * 50)
        # Convert to Tensors and build dataset
        logger.info("- Creating Tensors")
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_ids for f in features],
                                     dtype=torch.long)

        logger.info("- Creating Tensor dataset")
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)

        random_init_result = None

        return dataset, random_init_result, all_label_ids.shape[0]
    except Exception:
        print("could not remap args.data_dir; using the original path", flush=True)

        if args.local_rank not in [-1, 0] and mode == "train":
            # Make sure only the first process in distributed training builds
            # the dataset; the other ranks will wait here and use the cache.
            torch.distributed.barrier()

        # Load data features from cache or dataset file
        cached_features_file = os.path.join(
            args.data_dir,
            "cached_{}_{}_{}".format(
                mode,
                list(filter(None, args.model_name_or_path.split("/"))).pop(),
                str(args.max_seq_length)),
        )
        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info('-' * 120)
            logger.info("Loading features from cached file %s",
                        cached_features_file)
            features = torch.load(cached_features_file)
            logger.info('-' * 120)
        else:
            logger.info("Creating features from dataset file at %s",
                        args.data_dir)
            examples = read_examples_from_file(args.data_dir, mode)
            features = convert_examples_to_features(
                examples,
                labels,
                args.max_seq_length,
                tokenizer,
                cls_token_at_end=bool(args.model_type in ["xlnet"]),
                # xlnet has a cls token at the end
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                sep_token_extra=bool(args.model_type in ["roberta"]),
                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                pad_on_left=bool(args.model_type in ["xlnet"]),
                # pad on the left for xlnet
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
                pad_token_label_id=pad_token_label_id,
            )
            if args.local_rank in [-1, 0]:
                logger.info("Saving features into cached file %s",
                            cached_features_file)
                torch.save(features, cached_features_file)

        if args.local_rank == 0 and mode == "train":
            # Rank 0 has built the cache; release the waiting ranks.
            torch.distributed.barrier()

        # Data Shapley debug
        if args.data_size and mode == "train":
            if args.data_size < len(features):
                logger.info('-' * 50)
                logger.info(
                    f'original data size: {len(features)} truncated to {args.data_size}'
                )
                features = np.random.choice(features,
                                            args.data_size,
                                            replace=False)
                logger.info('-' * 50)
        # Convert to Tensors and build dataset
        logger.info("- Creating Tensors")
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_ids for f in features],
                                     dtype=torch.long)

        logger.info("- Creating Tensor dataset")
        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)

        random_init_result = None

        return dataset, random_init_result, all_label_ids.shape[0]
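
One detail worth noting in the Data Shapley subsampling above: np.random.choice over a Python list of feature objects returns a NumPy object array, which the list comprehensions that follow iterate over just like a list. A tiny sketch with hypothetical stand-in items:

import numpy as np

features = ['f0', 'f1', 'f2', 'f3', 'f4']  # stand-ins for InputFeatures objects
subset = np.random.choice(features, 3, replace=False)
print(list(subset))  # three distinct items in random order, e.g. ['f4', 'f0', 'f2']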