Example 1
def load_model():
    # Pick the GPU when available; fall back to CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MT5ForConditionalGeneration.from_pretrained(model_name)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    print(f"Successfully loaded state dict from: {model_path}")
    return model
Example 2
def TorchMT5Trainer(
    model_params,
    device,
    output_dir=OUTPUT_DIR,
):
    set_seed(model_params)

    tokenizer = T5Tokenizer.from_pretrained(model_params["MODEL"])
    model = MT5ForConditionalGeneration.from_pretrained(model_params["MODEL"])
    model = model.to(device)

    print("Reading data...")
    train_loader, eval_loader = get_dataloaders(tokenizer, model_params)
    optimizer = torch.optim.SGD(
        params=model.parameters(),
        lr=model_params["LEARNING_RATE"],
    )

    print("Training...")
    for epoch in range(1, model_params["TRAIN_EPOCHS"] + 1):
        train(epoch, tokenizer, model, device, train_loader, optimizer)

    print("Evaluating...")
    predictions, actuals = eval(0, tokenizer, model, device, eval_loader)
    output_df = pd.DataFrame({"Predictions": predictions, "Actuals": actuals})
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)
    # to_csv expects a file path, not a directory; the file name here is an
    # assumption.
    output_df.to_csv(os.path.join(output_dir, "predictions.csv"))
Example 3
def init_ff_mt5():
    """
    Initializes the FlexFlow representation of the HuggingFace mT5 model.

    Returns:
        (ffmodel, input_dls, label_dl)

        ffmodel (FFModel): Compiled and initialized FlexFlow model representing
            HuggingFace mT5.
        input_dls (List[SingleDataLoader]): List consisting of the encoder
            input IDs, encoder attention mask, and decoder input IDs
            dataloaders.
        label_dl (SingleDataLoader): Label dataloader.
    """
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    mt5_torch = MT5ForConditionalGeneration.from_pretrained(
        PRETRAINED_MODEL_NAME,
    )
    input_ids, attention_mask, decoder_input_ids, labels = load_batch_ff()
    input_tensors = [
        ffmodel.create_tensor(input_ids.shape, DataType.DT_INT64),
        ffmodel.create_tensor(attention_mask.shape, DataType.DT_INT64),
        ffmodel.create_tensor(decoder_input_ids.shape, DataType.DT_INT64),
    ]
    mt5_model = PyTorchModel(
        mt5_torch,
        is_hf_model=True,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        batch_size=ffconfig.batch_size,
        seq_length=(input_ids.shape[1], decoder_input_ids.shape[1]),
    )
    output_tensors = mt5_model.torch_to_ff(ffmodel, input_tensors)
    ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)
    ffmodel.compile(
        optimizer=ffoptimizer,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[
            MetricsType.METRICS_ACCURACY,
            MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
        ],
    )
    input_ids_dl = ffmodel.create_data_loader(input_tensors[0], input_ids)
    attention_mask_dl = ffmodel.create_data_loader(
        input_tensors[1], attention_mask,
    )
    decoder_input_ids_dl = ffmodel.create_data_loader(
        input_tensors[2], decoder_input_ids,
    )
    # NOTE: We cast down the label tensor data to 32-bit to accommodate the
    # label tensor's bitwidth requirement
    label_dl = ffmodel.create_data_loader(
        ffmodel.label_tensor, labels.astype("int32"),
    )
    input_dls = [input_ids_dl, attention_mask_dl, decoder_input_ids_dl]
    ffmodel.init_layers()
    return (ffmodel, input_dls, label_dl)
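A minimal usage sketch (an assumption, mirroring the ffmodel.fit call shown later in Example 24): the returned model and dataloaders feed straight into FlexFlow training.

# Hedged sketch: train the FlexFlow mT5 built by init_ff_mt5(); the fit()
# signature mirrors the one used in Example 24.
ffmodel, input_dls, label_dl = init_ff_mt5()
ffmodel.fit(x=input_dls, y=label_dl, epochs=2)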
Example 4
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.hparams = hparams

        self.model = MT5ForConditionalGeneration.from_pretrained(
            hparams.model_name_or_path)
        self.tokenizer = MT5TokenizerFast.from_pretrained(
            hparams.tokenizer_name_or_path)
        self.model.get_output_embeddings().weight.requires_grad = False
        self.model.get_input_embeddings().weight.requires_grad = False
Example 5
    def load(self):
        set_seed(42)
        _model = MT5ForConditionalGeneration.from_pretrained(
            self.pretrained_path)
        self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_path)
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")
        self.model = _model.to(self.device)
        self.logger.info(
            "Pretrained file and tokenizer for model {} were loaded.".format(
                self.model_name))
Example 6
    def load_model(
        model_name_or_path: str,
        cache_dir: str,
        device: torch.device,
        merge_encoder_and_decoder_init: bool = True,
        model_type: str = "t5",
    ) -> Dict[str, torch.nn.Module]:
        """Load model given a pretrained name or path, then build models for ONNX conversion.

        Args:
            model_name_or_path (str): pretrained model name or path
            cache_dir (str): cache directory
            device (torch.device): device to run the model
            merge_encoder_and_decoder_init (bool, optional): Whether to merge the encoder and decoder initialization into one ONNX model. Defaults to True.
            model_type (str, optional): model type, "t5" or "mt5". Defaults to "t5".
        Returns:
            Dict[str, torch.nn.Module]: mapping from name to modules for ONNX conversion.
        """
        if model_type == "t5":
            model = T5ForConditionalGeneration.from_pretrained(
                model_name_or_path, cache_dir=cache_dir)
        elif model_type == "mt5":
            model = MT5ForConditionalGeneration.from_pretrained(
                model_name_or_path, cache_dir=cache_dir)
        else:
            raise ValueError("only support mode_type=t5 or mt5")

        decoder = T5Decoder(model.decoder, model.lm_head, model.config)
        decoder.eval().to(device)

        if merge_encoder_and_decoder_init:
            encoder_decoder_init = T5EncoderDecoderInit(
                model.encoder,
                model.decoder,
                model.lm_head,
                model.config,
                decoder_start_token_id=None,
            )
            return {
                "encoder_decoder_init": encoder_decoder_init,
                "decoder": decoder
            }
        else:
            encoder = T5Encoder(model.encoder, model.config)
            encoder.eval().to(device)
            decoder_init = T5DecoderInit(model.decoder, model.lm_head,
                                         model.config)
            decoder_init.eval().to(device)
            return {
                "encoder": encoder,
                "decoder": decoder,
                "decoder_init": decoder_init,
            }
Example 7
def t5(comment: str, model_checkpoint: str, cuda: bool = True):
    device = "cuda" if torch.cuda.is_available() and cuda else "cpu"
    tok = AutoTokenizer.from_pretrained(model_checkpoint)
    model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint).to(device)
    model.eval()

    # Inputs must live on the same device as the model.
    inputs = tok("speech review: " + comment, return_tensors="pt").to(device)
    inputs["decoder_input_ids"] = torch.tensor(
        [[tok.pad_token_id]] * len(inputs["input_ids"]), device=device)
    outputs = model(**inputs)
    selected_logits = outputs.logits.squeeze(1)[:, [59006, 112560]]
    score = nn.functional.softmax(selected_logits, dim=-1)
    print(score)
    return score
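The hard-coded ids 59006 and 112560 are the vocabulary ids of the checkpoint's two label tokens (the same pair recurs in Examples 16, 17, 19, and 21). A hedged call sketch; the checkpoint name is hypothetical:

# Hypothetical fine-tuned checkpoint name, for illustration only.
score = t5("Das Produkt ist großartig!", "my-org/mt5-speech-review")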
Example 8
def main(
        model_path: str, corpus: Corpus = "kaggle", split_name: str = "valid",
        max_len: int = 128, batch_size: int = 32):
    if "mt5" in Path(model_path).stem:
        tokenizer = MT5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = MT5ForConditionalGeneration(
            MT5Config.from_pretrained(model_path)
        ).eval()
    else:
        tokenizer = T5Tokenizer.from_pretrained(model_path)
        # print(tokenizer.encode("</s>"))
        model = T5ForConditionalGeneration(
            T5Config.from_pretrained(model_path)
        ).eval()
    shrink_vocab(model_path, model)
    model.lm_head = torch.nn.Linear(model.lm_head.in_features, 3, bias=False)
    model.load_state_dict(torch.load(Path(model_path) / "pytorch_model.bin"))
    model = model.cuda()
    # model.load_state_dict(torch.load(model_path))
    context_tokens_1 = tokenizer.encode("mnli hypothesis:")[:-1]
    context_tokens_2 = tokenizer.encode("premise:")[:-1]
    collate_fn = partial(
        collate_batch, pad=model.config.decoder_start_token_id,
        decode_start_token=model.config.pad_token_id,
        max_len=max_len, is_classifier=True
    )
    dataset = XNLIDataset(
        corpus, split_name + ".jbl",
        context_tokens_1, context_tokens_2)
    data_loader = DataLoader(
        dataset, num_workers=1, shuffle=False, drop_last=False,
        batch_size=batch_size, collate_fn=collate_fn)
    preds, labels = [], []
    for input_batch, label_batch in tqdm(data_loader, ncols=100):
        for key, val in input_batch.items():
            input_batch[key] = val.cuda()
        outputs = model(**input_batch)
        preds_local = torch.argmax(outputs["logits"][:, 0, :].cpu(), dim=-1)
        preds.append(preds_local.numpy())
        labels.append(np.asarray([x[0] for x in label_batch["ids"].cpu().numpy()]))
    full_labels = np.concatenate(labels)
    full_preds = np.concatenate(preds)
    # print("Label mapping:")
    # for key in np.unique(full_labels):
    #     print(f"{key}: {tokenizer.decode([key])}")
    print("Labels:")
    print(pd.Series(full_labels).value_counts())
    print("Predictions:")
    print(pd.Series(full_preds).value_counts())
    print("Acc: %.2f%%" % (np.mean(full_labels == full_preds) * 100))
Example 9
    def __init__(self):
        self.mt5_tokenizer = MT5Tokenizer.from_pretrained(
            "Pollawat/mt5-small-thai-qa-qg")
        self.mt5_model = MT5ForConditionalGeneration.from_pretrained(
            "Pollawat/mt5-small-thai-qa-qg")

        self.wangchanberta_tokenizer = AutoTokenizer.from_pretrained(
            "airesearch/wangchanberta-base-att-spm-uncased")
        self.wangchanberta_model = AutoModelForMaskedLM.from_pretrained(
            "airesearch/wangchanberta-base-att-spm-uncased")
        self.wangchanberta_pipeline = pipeline(
            task='fill-mask',
            tokenizer=self.wangchanberta_tokenizer,
            model=self.wangchanberta_model)
        self.stopwords = thai_stopwords()
Example 10
    def __init__(self,
                 model_size: str = "small",
                 num_beams: int = 4,
                 no_repeat_ngram_size: int = 2,
                 min_length: int = 30,
                 max_length: int = 100,
                 skip_special_tokens: bool = True):
        if model_size not in ["small", "base", "large", "xl", "xxl"]:
            raise ValueError(f"""model_size "{model_size}" not found.
                It might be a typo; if not, please consult our documentation.""")
        self.model = MT5ForConditionalGeneration.from_pretrained(
            f'google/mt5-{model_size}')
        self.tokenizer = T5Tokenizer.from_pretrained(
            f'google/mt5-{model_size}')
        self.num_beams = num_beams
        self.no_repeat_ngram_size = no_repeat_ngram_size
        self.min_length = min_length
        self.max_length = max_length
        self.skip_special_tokens = skip_special_tokens
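The stored generation parameters suggest a summarization wrapper; a minimal sketch of how they would be consumed (the summarize method is an assumption, not part of the original class):

    def summarize(self, text: str) -> str:
        # Hypothetical method: shows how the stored generation parameters
        # would be passed to model.generate().
        input_ids = self.tokenizer(text, return_tensors="pt").input_ids
        output_ids = self.model.generate(
            input_ids,
            num_beams=self.num_beams,
            no_repeat_ngram_size=self.no_repeat_ngram_size,
            min_length=self.min_length,
            max_length=self.max_length,
        )
        return self.tokenizer.decode(
            output_ids[0], skip_special_tokens=self.skip_special_tokens)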
Example 11
def create_t5_encoder_decoder(pretrained_version="t5-base"):
    """Generates an encoder and a decoder model with a language model head from a pretrained huggingface model

    Args:
        pretrained_version (str): Name of a pretrained model, or path to a pretrained / finetuned version of T5

    Returns:
        simplified_encoder: pytorch t5 encoder with a wrapper to output only the hidden states
        decoder_with_lm_head: pytorch t5 decoder with a language modeling head
    """

    if 'mt5' in pretrained_version:
        model = MT5ForConditionalGeneration.from_pretrained(
            pretrained_version, use_auth_token=get_auth_token())
    else:
        model = T5ForConditionalGeneration.from_pretrained(
            pretrained_version, use_auth_token=get_auth_token())

    return turn_model_into_encoder_decoder(model)
Example 12
def extract_mt5_subgraph(
    initial_op_name: Optional[str] = None,
    final_op_name: Optional[str] = None,
):
    """
    Extracts the mT5 subgraph starting from ``initial_op_name`` and ending
    with ``final_op_name`` (inclusive) in topological order. If
    ``initial_op_name`` is ``None``, the subgraph starts at the first
    operator; if ``final_op_name`` is ``None``, it ends at the last.

    NOTE: HuggingFace's symbolic trace only supports tracing a selection of
    classes. As a result, we must extract subgraphs from the full mT5 graph
    in the Python FlexFlow space.

    Returns:
        subgraph (List[Node]): List of the nodes comprising the subgraph.
    """
    mt5_torch = MT5ForConditionalGeneration.from_pretrained(
        PRETRAINED_MODEL_NAME,
    )
    input_ids, _, decoder_input_ids, _ = load_batch_ff()
    BATCH_SIZE = 8
    mt5_model = PyTorchModel(
        mt5_torch,
        is_hf_model=True,
        input_names=["input_ids", "attention_mask", "decoder_input_ids"],
        batch_size=BATCH_SIZE,
        seq_length=(input_ids.shape[1], decoder_input_ids.shape[1]),
    )
    graph = mt5_model._trace_model()
    subgraph = []
    in_subgraph: bool = initial_op_name is None
    for node in graph:
        if initial_op_name is not None and node.name == initial_op_name:
            in_subgraph = True
        if in_subgraph:
            subgraph.append(node)
        if final_op_name is not None and node.name == final_op_name:
            break
    return subgraph
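A hedged usage sketch; the operator names below are hypothetical and must match names in the traced graph:

# Calling with no arguments returns the node list of the full traced graph.
all_nodes = extract_mt5_subgraph()
# Hypothetical operator name, for illustration only; inspect node.name over
# all_nodes to find real ones.
encoder_subgraph = extract_mt5_subgraph(final_op_name="encoder_last_op")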
Example 13
    def __init__(self,
                 vocab: Vocabulary,
                 pretrained_model_path,
                 beam_size=5,
                 max_decoding_steps=140,
                 indexer=None):
        super().__init__(vocab)
        self.plm = MT5ForConditionalGeneration.from_pretrained(pretrained_model_path)
        self._indexer = indexer or PretrainedTransformerIndexer(pretrained_model_path, namespace="tokens")
        self._start_id = self.plm.config.decoder_start_token_id
        self._decoder_start_id = self.plm.config.decoder_start_token_id
        self._end_id = self.plm.config.eos_token_id
        self._pad_id = self.plm.config.pad_token_id

        self._beam_search = BeamSearch(
            self._end_id, max_steps=max_decoding_steps, beam_size=beam_size or 1
        )
        self._rouge = ROUGE(exclude_indices={self._start_id, self._pad_id, self._end_id})
        self._bleu = BLEU(exclude_indices={self._start_id, self._pad_id, self._end_id})
Example 14
def main(t5_model: str, kaggle: bool = True, mnli: bool = True):
    model_name = t5_model.split("/")[-1]
    Path("cache/").mkdir(exist_ok=True)
    target_path = f"cache/{model_name}/"
    if Path(target_path).exists():
        # Remove existing model
        shutil.rmtree(target_path)
    tokenizer = MT5Tokenizer.from_pretrained(t5_model)
    tokenizer.save_pretrained(target_path)
    tmp = MT5ForConditionalGeneration.from_pretrained(t5_model)
    tmp.save_pretrained(target_path)
    del tmp

    seen_tokens = collect_tokens(tokenizer, kaggle, mnli)

    m = model.ModelProto()
    with open(f"{target_path}spiece.model", 'rb') as f:
        m.ParseFromString(f.read())

    kept_pieces, i = [], len(m.pieces) - 1
    while len(m.pieces):
        piece = m.pieces.pop()
        if i < 259 or i in seen_tokens:
            kept_pieces.append(piece)
        i -= 1
    kept_pieces = list(reversed(kept_pieces))
    print("# of kept pieces:", len(kept_pieces))
    m.pieces.extend(kept_pieces)

    # backup
    Path(f"{target_path}spiece.model").rename(f"{target_path}spiece.model.old")
    # write new
    with open(f"{target_path}spiece.model", 'wb') as f:
        f.write(m.SerializeToString())

    kept_ids = sorted(list(seen_tokens.union(set(range(259)))))
    with open(f"{target_path}kept_ids.json", 'w') as fout:
        json.dump(kept_ids, fout)
Example 15
def main(model_path: str,
         data_folder: str = "/kaggle/input/contradictory-my-dear-watson/"):
    df_test = pd.read_csv(Path(data_folder) / "test.csv")

    tokenizer = MT5Tokenizer.from_pretrained(model_path)

    model = MT5ForConditionalGeneration(MT5Config.from_pretrained(model_path))
    model.lm_head = torch.nn.Linear(model.lm_head.in_features, 3, bias=False)
    shrink_vocab(model_path, model)
    model.load_state_dict(
        torch.load(str(Path(model_path) / "pytorch_model.bin")))
    model = model.cuda().eval()

    # label_tokens_dict = {
    #     tokens[0]: idx for idx, tokens in enumerate(tokenizer.batch_encode_plus(
    #         ["entailment", "neutral", "contradiction"]
    #     )["input_ids"])
    # }

    class InferenceDataset(XNLIDataset):
        def __init__(self, premise_ids, hypothesis_ids):
            self.labels = None
            self.premise_ids = premise_ids
            self.hypothesis_ids = hypothesis_ids
            self.context_tokens_1 = tokenizer.encode("mnli hypothesis:")[:-1]
            self.context_tokens_2 = tokenizer.encode("premise:")[:-1]
            self.tokenizer = None
            self.max_len = 64

    collate_fn = partial(collate_batch,
                         pad=model.config.decoder_start_token_id,
                         decode_start_token=model.config.pad_token_id,
                         max_len=128,
                         is_classifier=False)

    premise_ids, hypothesis_ids, _ = process_file(df_test,
                                                  tokenizer,
                                                  batch_size=32)
    dataset = InferenceDataset(premise_ids, hypothesis_ids)
    data_loader = DataLoader(dataset,
                             batch_size=16,
                             shuffle=False,
                             num_workers=1,
                             collate_fn=collate_fn)

    preds = []
    for input_batch, _ in data_loader:
        for key, val in input_batch.items():
            input_batch[key] = val.cuda()
        outputs = model(**input_batch)
        preds_local = [
            x for x in torch.argmax(outputs["logits"][:, 0, :],
                                    dim=-1).cpu().numpy()
        ]
        preds.append(preds_local)

    preds = np.concatenate(preds)
    # inverse_label_dict = {key: tokenizer.decode([key]) for key in np.unique(preds)}

    df_sub = pd.DataFrame({
        "id": df_test.id.values,
        "prediction": preds  # [inverse_label_dict[x] for x in preds]
    })
    df_sub.to_csv("submission.csv", index=False)
Example 16
def create_submission(
    test_csv: str = "data/test.csv",
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    batch_size: int = 16,
    max_length: int = 256,
    output_file: str = "submission.csv",
    binary: bool = True,
):
    logger.info(f"Start singleclass prediction.")
    logger.info(f"Load the model: {model_checkpoint}.")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2).to(device)
    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_checkpoint).to(device)
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    if model_type == "auto":

        def get_predictions(outputs):
            if binary:
                return np.argmax(outputs.logits.tolist(), axis=1).tolist()
            return outputs.logits.tolist()

    elif model_type == "t5":

        def get_predictions(outputs):
            logits = outputs.logits.squeeze(1)
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            if binary:
                return np.argmax(probs.tolist(), axis=1).tolist()
            return probs.tolist()

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"test_csv: {test_csv}")
    dataset = load(test_csv,
                   model_checkpoint,
                   model_type,
                   preprocess=True,
                   labels=[],
                   max_length=max_length)
    if model_type == "auto":
        columns = ["input_ids", "token_type_ids", "attention_mask"]
    elif model_type == "t5":
        columns = ["input_ids", "attention_mask", "decoder_input_ids"]
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    final_columns = []
    for column in columns:
        if column in dataset.column_names:
            final_columns.append(column)
    columns = final_columns

    dataset.set_format(type="torch", columns=columns)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_predictions = []
    for batch in tqdm(dataloader, desc="In progress..."):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        predictions = get_predictions(outputs)
        all_predictions += predictions

    try:
        ids = dataset["id"]
    except KeyError:
        try:
            ids = dataset["comment_id"]
        except KeyError:
            ids = dataset["comment_text"]
    if binary:
        df = pd.DataFrame(columns=["id", "prediction"],
                          data=zip(*[ids, all_predictions]))
    else:
        predictions0 = list(list(zip(*all_predictions))[0])
        predictions1 = list(list(zip(*all_predictions))[1])
        df = pd.DataFrame(columns=["id", "prediction0", "prediction1"],
                          data=zip(*[ids, predictions0, predictions1]))
    df.to_csv(output_file)
Example 17
def predict_official(
    test_csv: str = "data/test.csv",
    truth_csv: str = "data/truth.csv",
    labels: List[str] = ["Sub1_Toxic"],
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    batch_size: int = 16,
    max_length: int = 256,
    balanced: bool = False,
):
    logger.info(f"Start singleclass prediction.")
    logger.info(f"Load the model: {model_checkpoint}.")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2).to(device)
    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_checkpoint).to(device)
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    if model_type == "auto":

        def get_predictions(outputs):
            return np.argmax(outputs.logits.tolist(), axis=1).tolist()

        def get_labels(labels):
            labels = labels.cpu()
            labels = np.where(labels == -1.0, 0, labels)
            labels = np.where(labels == 1.0, 1, labels)
            return labels.tolist()

    elif model_type == "t5":

        def get_predictions(outputs):
            logits = outputs.logits.squeeze(1)
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            return np.argmax(probs.tolist(), axis=1).tolist()

        def get_labels(labels):
            labels = labels.cpu()
            labels = np.where(labels == 59006, 0, labels)
            labels = np.where(labels == 112560, 1, labels)
            return labels.tolist()

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"test_csv: {test_csv}")
    dataset = load(test_csv,
                   model_checkpoint,
                   model_type,
                   preprocess=True,
                   labels=[],
                   max_length=max_length)
    if model_type == "auto":
        columns = [
            "input_ids", "token_type_ids", "attention_mask", "comment_id"
        ]
    elif model_type == "t5":
        columns = [
            "input_ids", "attention_mask", "decoder_input_ids", "comment_id"
        ]
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    final_columns = []
    for column in columns:
        if column in dataset.column_names:
            final_columns.append(column)
    columns = final_columns

    dataset.set_format(type="torch", columns=columns)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_ids = []
    all_predictions = []
    for batch in tqdm(dataloader, desc="In progress..."):
        batch = {k: v.to(device) for k, v in batch.items()}
        ids = get_labels(batch.pop("comment_id"))
        outputs = model(**batch)
        predictions = get_predictions(outputs)
        assert len(predictions) == len(ids)
        all_ids += ids
        all_predictions += predictions

    labels_df = pd.read_csv(truth_csv)
    labels_df = labels_df.set_index("comment_id")
    all_labels = [labels_df.loc[i]["Sub1_Toxic"] for i in all_ids]

    if balanced:
        all_labels, all_predictions = balance_evaluation(
            all_labels, all_predictions)

    report = classification_report(all_labels,
                                   all_predictions,
                                   output_dict=True)
    precision_score_1 = report["macro avg"]["precision"]
    recall_score_1 = report["macro avg"]["recall"]
    f1_score_1 = 0
    if precision_score_1 + recall_score_1 > 0:
        f1_score_1 = 2 * precision_score_1 * recall_score_1 / (
            precision_score_1 + recall_score_1)
    stats = {
        "f1": f1_score_1,
        "recall": recall_score_1,
        "precision": precision_score_1,
    }

    print(stats)
    return stats
Example 18
    def model_init():
        return MT5ForConditionalGeneration.from_pretrained(model_checkpoint)
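model_init is the factory hook that HuggingFace's Trainer calls whenever it needs a fresh model, e.g. during hyperparameter search. A minimal sketch, assuming args, train_dataset, and eval_dataset are defined elsewhere:

# Hedged sketch: pass the factory instead of a model instance so every
# run/trial restarts from the pretrained weights.
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)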
Example 19
def singleclass(
    train_csv: List[str] = ["data/train.train.csv"],
    test_csv: str = "data/train.test.csv",
    train_labels: List[str] = ["Sub1_Toxic"],
    test_labels: List[str] = ["Sub1_Toxic"],
    class_weights: bool = False,
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    output_dir: str = "models/singleclass/",
    strategy: str = "epoch",
    batch_size: int = 16,
    gradient_accumulation_steps: int = 1,
    eval_accumulation_steps: int = 100,
    learning_rate: float = 5e-5,
    nb_epoch: int = 3,
    max_length: int = 256,
    eval_steps: int = 250,
    save_steps: int = 500,
):
    logger.info(f"Start singleclass training.")
    output_dir += (
        model_checkpoint.replace("/", "_")
        + "_class_weights="
        + str(class_weights)
        + "_labels="
        + "_".join(train_labels)
        + "_languages="
        + "+".join(train_csv).replace("data/", "").replace("/", "_")
        + "_bs="
        + str(batch_size)
        + "_lr="
        + str(learning_rate)
        + "_epoch="
        + str(nb_epoch)
    )
    output_dir = output_dir[:256]
    logger.info(f"Load the model: {model_checkpoint}.")

    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    args = TrainingArguments(
        output_dir=output_dir,
        save_strategy=strategy,
        save_steps=save_steps,
        evaluation_strategy=strategy,
        eval_steps=eval_steps,
        eval_accumulation_steps=eval_accumulation_steps,
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        num_train_epochs=nb_epoch,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_dir="./logs",
        logging_steps=10,
    )

    metric = load_metric("metrics/singleclass.py")

    if model_type == "auto":

        def compute_metrics(eval_pred):
            predictions, labels = eval_pred
            predictions = np.argmax(predictions, axis=1)
            return metric.compute(predictions=predictions, references=labels)

    elif model_type == "t5":

        def compute_metrics(eval_pred):
            logits, labels = eval_pred
            # print("LOGITS")
            # print(type(logits))
            # print(len(logits))
            # print(type(logits[0]))
            # print(logits[0].shape)
            # print(np.argmax(logits[0], axis=2))
            labels = np.where(labels == 59006, 0, labels)
            labels = np.where(labels == 112560, 1, labels)
            logits = torch.tensor(logits[0]).squeeze(1)
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            predictions = np.argmax(probs.tolist(), axis=1)
            return metric.compute(predictions=predictions, references=labels)

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"train_csv: {train_csv}")
    logger.debug(f"test_csv: {test_csv}")
    train_dataset = load(
        train_csv, model_checkpoint, model_type, preprocess=True, labels=train_labels, max_length=max_length
    )
    test_dataset = load(
        test_csv, model_checkpoint, model_type, preprocess=True, labels=test_labels, max_length=max_length
    )
    logger.info(f"Dataset sample: {train_dataset[0]}")
    if model_type == "auto":
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
    elif model_type == "t5":
        tokenizer = T5Tokenizer.from_pretrained(model_checkpoint, use_fast=True)
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    if model_type == "auto":
        if class_weights:
            if len(train_labels) == 1 and train_labels[0] == "Sub1_Toxic":
                logger.info("Using TrainerWithClassWeightsToxic")
                trainer = TrainerWithClassWeightsToxic(
                    model,
                    args,
                    train_dataset=train_dataset,
                    eval_dataset=test_dataset,
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics,
                )
            else:
                raise NotImplementedError()
        else:
            logger.info("Using Trainer")
            trainer = Trainer(
                model,
                args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
            )
    elif model_type == "t5":
        if class_weights:
            if len(train_labels) == 1 and train_labels[0] == "Sub1_Toxic":
                logger.info("Using MT5TrainerWithClassWeightsToxic")
                trainer = MT5TrainerWithClassWeightsToxic(
                    model,
                    args,
                    train_dataset=train_dataset,
                    eval_dataset=test_dataset,
                    tokenizer=tokenizer,
                    compute_metrics=compute_metrics,
                )
            else:
                raise NotImplementedError()
        else:
            logger.info("Using MT5Trainer")
            trainer = Trainer(
                model,
                args,
                train_dataset=train_dataset,
                eval_dataset=test_dataset,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
            )
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    logger.info("Start the training.")
    trainer.train()

    logger.info("Start the evaluation.")
    metrics = trainer.evaluate()
    logger.info(metrics)
    trainer.save_model()
Example 20
from transformers import MT5Config, MT5ForConditionalGeneration, load_tf_weights_in_t5
import torch

config = MT5Config.from_pretrained('config.json')
model = MT5ForConditionalGeneration(config)

ckpt = 'D:\\BaiduNetdiskDownload\\chinese_t5_pegasus_base\\chinese_t5_pegasus_base\\model.ckpt'

model = load_tf_weights_in_t5(model, config, ckpt)

torch.save(model.state_dict(), 'pytorch_model.bin')
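A short sanity check (assumed, not from the original source) that the exported weights load back cleanly:

# Hedged sketch: reload the converted checkpoint to verify the export.
model = MT5ForConditionalGeneration(config)
model.load_state_dict(torch.load('pytorch_model.bin'))
model.eval()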
Example 21
def predict(
    test_csv: str = "data/train.test.csv",
    labels: List[str] = ["Sub1_Toxic"],
    model_checkpoint: str = "deepset/gbert-base",
    model_type: str = "auto",
    batch_size: int = 16,
    max_length: int = 256,
    balanced: bool = False,
):
    logger.info(f"Start singleclass prediction.")
    logger.info(f"Load the model: {model_checkpoint}.")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    if model_type == "auto":
        model = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=2).to(device)
    elif model_type == "t5":
        model = MT5ForConditionalGeneration.from_pretrained(
            model_checkpoint).to(device)
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    metric = load_metric("metrics/singleclass.py")

    if model_type == "auto":

        def get_predictions(outputs):
            return np.argmax(outputs.logits.tolist(), axis=1).tolist()

        def get_labels(labels):
            labels = labels.cpu()
            labels = np.where(labels == -1.0, 0, labels)
            labels = np.where(labels == 1.0, 1, labels)
            return labels.tolist()

    elif model_type == "t5":

        def get_predictions(outputs):
            logits = outputs.logits.squeeze(1)
            selected_logits = logits[:, [59006, 112560]]
            probs = F.softmax(selected_logits, dim=1)
            return np.argmax(probs.tolist(), axis=1).tolist()

        def get_labels(labels):
            labels = labels.cpu()
            labels = np.where(labels == 59006, 0, labels)
            labels = np.where(labels == 112560, 1, labels)
            return labels.tolist()

    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")

    logger.info("Load and preprocess the dataset.")
    logger.debug(f"test_csv: {test_csv}")
    dataset = load(test_csv,
                   model_checkpoint,
                   model_type,
                   preprocess=True,
                   labels=labels,
                   max_length=max_length)
    if model_type == "auto":
        columns = ["input_ids", "token_type_ids", "attention_mask", "labels"]
    elif model_type == "t5":
        columns = [
            "input_ids", "attention_mask", "decoder_input_ids", "labels"
        ]
    else:
        raise NotImplementedError("Model type available: 'auto' or 't5'")
    final_columns = []
    for column in columns:
        if column in dataset.column_names:
            final_columns.append(column)
    columns = final_columns

    dataset.set_format(type="torch", columns=columns)
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    all_labels = []
    all_predictions = []
    for batch in tqdm(dataloader, desc="In progress..."):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = get_labels(batch.pop("labels"))
        outputs = model(**batch)
        predictions = get_predictions(outputs)
        assert len(predictions) == len(labels)
        all_labels += labels
        all_predictions += predictions

    if balanced:
        all_labels, all_predictions = balance_evaluation(
            all_labels, all_predictions)
    stats = metric.compute(predictions=all_predictions, references=all_labels)
    print(stats)
    return stats
Example 22
def main():

    TRAIN_BATCH_SIZE = 2
    VALID_BATCH_SIZE = 2
    TRAIN_EPOCHS = 1
    VAL_EPOCHS = 1
    LEARNING_RATE = 1e-4
    SEED = 42
    MAX_LEN = 512
    SUMMARY_LEN = 150

    torch.manual_seed(SEED)
    np.random.seed(SEED)
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained("google/mt5-base")

    df = pd.read_csv(r"data.csv")
    df = df[['summary', 'text']]
    df = df.dropna().reset_index(drop=True)
    df['text'] = df.apply(lambda x: clean_text(x['text']), axis=1)
    df = df.dropna().reset_index(drop=True)
    print(df.shape)
    df.text = 'summarize: ' + df.text
    print(df.head())

    train_size = 0.90
    train_dataset = df.sample(frac=train_size, random_state=SEED)
    val_dataset = df.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))
    print("TEST Dataset: {}".format(val_dataset.shape))

    training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN,
                                 SUMMARY_LEN)
    val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

    train_params = {
        'batch_size': TRAIN_BATCH_SIZE,
        'shuffle': True,
        'num_workers': 0
    }

    val_params = {
        'batch_size': VALID_BATCH_SIZE,
        'shuffle': False,
        'num_workers': 0
    }

    training_loader = DataLoader(training_set, **train_params)
    val_loader = DataLoader(val_set, **val_params)

    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-base")
    model = model.to(device)

    optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

    t1 = datetime.datetime.now()
    print(t1)
    for epoch in range(TRAIN_EPOCHS):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

    t2 = datetime.datetime.now()
    print(t2)
    print(str(t2 - t1))
    for epoch in range(VAL_EPOCHS):
        predictions, actuals = validate(tokenizer, model, device, val_loader)
        final_df = pd.DataFrame({
            'Generated Text': predictions,
            'Actual Text': actuals
        })
        final_df.to_csv('predictions.csv')

    saved_model_dir = "./saved_model_summary/"

    if not os.path.exists(saved_model_dir):
        os.makedirs(saved_model_dir)

    model.save_pretrained(saved_model_dir)
    tokenizer.save_pretrained(saved_model_dir)
Example 23
            for pair in l.split():
                word, _ = pair.split('_')
                texts.append(word)
            data['tokens'].append(''.join(texts))

    return data


def get_max_length(data):
    lengths = [len(i) for i in data]
    return int(np.percentile(lengths, 80)) + 1


data = read_data('udp/train.txt')
tokenizer = MT5TokenizerFast.from_pretrained('mt5tokenizer')
model = MT5ForConditionalGeneration.from_pretrained('mt5small')

X = data["tokens"]
y = data["tags"]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
# from IPython import embed; embed()  # interactive debugging leftover
X_train_tokenized = tokenizer.encode_plus(X_train,
                                          padding=True,
                                          truncation=True,
                                          max_length=get_max_length(X_train))
#TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]
X_val_tokenized = tokenizer.encode_plus(X_val,
                                        padding=True,
                                        truncation=True,
                                        max_length=get_max_length(X_val))
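The trailing comment above records a tokenizers error: encode_plus expects a single sequence (or one pair), not a list. A hedged fix sketch using the tokenizer's batch interface, which accepts List[str] directly:

# Hedged fix sketch: batch-encode the lists via the tokenizer's __call__.
X_train_tokenized = tokenizer(X_train,
                              padding=True,
                              truncation=True,
                              max_length=get_max_length(X_train))
X_val_tokenized = tokenizer(X_val,
                            padding=True,
                            truncation=True,
                            max_length=get_max_length(X_val))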
Example 24
def top_level_task():
    ffconfig = FFConfig()
    ffmodel = FFModel(ffconfig)
    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

    # Load train data as numpy arrays
    print("Loading data...")
    ids = np.load(os.path.join(NUMPY_DIR, "train_source_ids.npy"))
    mask = np.load(os.path.join(NUMPY_DIR, "train_source_mask.npy"))
    y_ids = np.load(os.path.join(NUMPY_DIR, "train_y_ids.npy"))
    lm_labels = np.load(os.path.join(NUMPY_DIR, "train_lm_labels.npy"))

    batch_size = ffconfig.batch_size
    input_ids_shape = (batch_size, ids.shape[1])
    attention_mask_shape = (batch_size, mask.shape[1])
    decoder_input_ids_shape = (batch_size, y_ids.shape[1])
    input_tensors = [
        ffmodel.create_tensor(input_ids_shape, DataType.DT_INT64),  # input_ids
        ffmodel.create_tensor(attention_mask_shape,
                              DataType.DT_INT64),  # attention_mask
        ffmodel.create_tensor(decoder_input_ids_shape,
                              DataType.DT_INT64),  # decoder_input_ids
    ]
    encoder_seq_length = ids.shape[1]
    decoder_seq_length = y_ids.shape[1]
    seq_length = (encoder_seq_length, decoder_seq_length)
    input_names = ["input_ids", "attention_mask", "decoder_input_ids"]

    print("Tracing the model...")
    hf_model = PyTorchModel(
        model,
        is_hf_model=True,
        input_names=input_names,
        batch_size=batch_size,
        seq_length=seq_length,
    )
    output_tensors = hf_model.torch_to_ff(ffmodel, input_tensors, verbose=True)
    ffoptimizer = SGDOptimizer(ffmodel, lr=0.01)

    print("Compiling the model...")
    ffmodel.compile(
        optimizer=ffoptimizer,
        loss_type=LossType.LOSS_SPARSE_CATEGORICAL_CROSSENTROPY,
        metrics=[
            MetricsType.METRICS_ACCURACY,
            MetricsType.METRICS_SPARSE_CATEGORICAL_CROSSENTROPY,
        ],
    )

    print("Creating data loaders...")
    input_ids_dl = ffmodel.create_data_loader(input_tensors[0], ids)
    attention_mask_dl = ffmodel.create_data_loader(input_tensors[1], mask)
    decoder_input_ids_dl = ffmodel.create_data_loader(input_tensors[2], y_ids)
    # NOTE: We cast down the label tensor data to 32-bit to accommodate the
    # label tensor's required dtype
    labels_dl = ffmodel.create_data_loader(ffmodel.label_tensor,
                                           lm_labels.astype("int32"))

    print("Initializing model layers...")
    ffmodel.init_layers()

    print("Training...")
    epochs = ffconfig.epochs
    ffmodel.fit(
        x=[input_ids_dl, attention_mask_dl, decoder_input_ids_dl],
        y=labels_dl,
        batch_size=batch_size,
        epochs=epochs,
    )
Example 25
    for title, content in data:
        text_ids = tokenizer.encode(content, max_length=max_len, truncation='only_first')

        summary_ids = tokenizer.encode(title, max_length=max_len, truncation='only_first')
        features = {'input_ids': text_ids, 'decoder_input_ids': summary_ids, 'attention_mask': [1] * len(text_ids),
                    'decoder_attention_mask': [1] * len(summary_ids)}
        ret.append(features)
    return ret


train_data, _ = create_data(train_data)

train_data = KeyDataset(train_data)
train_data = DataLoader(train_data, batch_size=batch_size, collate_fn=default_collate)

model = MT5ForConditionalGeneration.from_pretrained(model_path)

device = 'cuda:1'
model.to(device)
adam = torch.optim.Adam(model.parameters(), lr=lr)


def generate(text, max_length=30):
    max_content_length = max_len - max_length
    feature = tokenizer.encode(text, return_token_type_ids=True, return_tensors='pt',
                               max_length=512)
    feature = {'input_ids': feature}
    feature = {k: v.to(device) for k, v in list(feature.items())}

    gen = model.generate(max_length=max_length, eos_token_id=tokenizer.sep_token_id,
                         decoder_start_token_id=tokenizer.cls_token_id,
Example 26
from transformers import MT5Config, MT5Tokenizer, MT5ForConditionalGeneration

config = None
with open('config.yaml') as fp:
    config = yaml.load(fp, Loader=yaml.FullLoader)

model_dir = config['MODEL_DIR']
special_tokens = config['SPECIAL_TOKENS']
vocab_size = config['VOCAB_SIZE']
num_layers = config['NUM_LAYERS']
num_heads = config['NUM_HEADS']

try:
    tokenizer = MT5Tokenizer.from_pretrained(model_dir)
except OSError:
    tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")
    tokenizer.add_special_tokens({'additional_special_tokens': special_tokens})
    tokenizer.save_pretrained(model_dir)

config = MT5Config(vocab_size=vocab_size,
                   num_layers=num_layers,
                   num_heads=num_heads)

try:
    model = MT5ForConditionalGeneration.from_pretrained(model_dir,
                                                        config=config)
except OSError:
    model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small",
                                                        config=config)
    model.save_pretrained(model_dir)
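For reference, a hypothetical config.yaml with the keys this script reads; all values are illustrative assumptions, not from the original source:

# Hypothetical config.yaml (placeholder values):
#
#   MODEL_DIR: ./mt5_custom/
#   SPECIAL_TOKENS: ["<hl>", "<sep>"]
#   VOCAB_SIZE: 250112
#   NUM_LAYERS: 8
#   NUM_HEADS: 6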
Example 27
    precision = truep / pred
    recall = truep / ref
    if precision == 0 and recall == 0:
        f1 = 0
    else:
        f1 = (2 * precision * recall) / (precision + recall)
    return {"precision": precision, "recall": recall, "f1": f1}


'''    predictions = [{'id': str(i), 'prediction': pred.strip().lower()} \
                 for i, pred in enumerate(predictions)]
    references = [{'id': str(i), 'reference': ref.strip().lower()} \
                for i, ref in enumerate(references)]'''

model = MT5ForConditionalGeneration.from_pretrained('mt5small')
'''device = torch.device("cpu")
model.to(device)
print(next(model.parameters()).device)'''

training_args = Seq2SeqTrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=EVAL_BATCH_SIZE,
    warmup_steps=WARMUP_STEPS,
    gradient_accumulation_steps=8,
#    weight_decay=WEIGHT_DECAY,
    logging_dir='./logs/',
    evaluation_strategy="epoch",
    logging_steps=LOGGING_STEPS,