Example #1
def main(cli_args):
    # Read from config file and make args
    with open(
            os.path.join(cli_args.config_dir, cli_args.task,
                         cli_args.config_file)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    init_logger()
    set_seed(args)

    processor = processors[args.task](args)
    labels = processor.get_labels()
    if output_modes[args.task] == "regression":
        config = CONFIG_CLASSES[args.model_type].from_pretrained(
            args.model_name_or_path, num_labels=tasks_num_labels[args.task])
    else:
        config = CONFIG_CLASSES[args.model_type].from_pretrained(
            args.model_name_or_path,
            num_labels=tasks_num_labels[args.task],
            id2label={str(i): label
                      for i, label in enumerate(labels)},
            label2id={label: i
                      for i, label in enumerate(labels)},
        )
    tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path, do_lower_case=args.do_lower_case)
    model = MODEL_FOR_SEQUENCE_CLASSIFICATION[args.model_type].from_pretrained(
        args.model_name_or_path, config=config)

    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available(
    ) and not args.no_cuda else "cpu"
    model.to(args.device)

    # Load dataset
    train_dataset = load_and_cache_examples(
        args, tokenizer, mode="train") if args.train_file else None
    dev_dataset = load_and_cache_examples(
        args, tokenizer, mode="dev") if args.dev_file else None
    test_dataset = load_and_cache_examples(
        args, tokenizer, mode="test") if args.test_file else None

    if dev_dataset is None:
        args.evaluate_test_during_training = True  # No dev set: evaluate on the test set during training

    if args.do_train:
        global_step, tr_loss = train(args, model, train_dataset, dev_dataset,
                                     test_dataset)
        logger.info(" global_step = {}, average loss = {}".format(
            global_step, tr_loss))

    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin",
                          recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1]
            model = MODEL_FOR_SEQUENCE_CLASSIFICATION[
                args.model_type].from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args,
                              model,
                              test_dataset,
                              mode="test",
                              global_step=global_step)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
Example #2
def main(cli_args):
    # Read from config file and make args
    with open(os.path.join(cli_args.config_dir, cli_args.task, cli_args.config_file)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))

    args.output_dir = os.path.join(args.ckpt_dir, args.output_dir)

    if args.doc_stride >= args.max_seq_length - args.max_query_length:
        logger.warning(
            "WARNING - You've set a doc stride which may be superior to the document length in some "
            "examples. This could result in errors when building features from the examples. Please reduce the doc "
            "stride or increase the maximum length to ensure the features are correctly built."
        )

    init_logger()
    set_seed(args)

    logging.getLogger("transformers.data.metrics.squad_metrics").setLevel(logging.WARN)  # Reduce model loading logs

    # Load pretrained model and tokenizer
    config = CONFIG_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path,
    )
    tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path,
        do_lower_case=args.do_lower_case,
    )
    model = MODEL_FOR_QUESTION_ANSWERING[args.model_type].from_pretrained(
        args.model_name_or_path,
        config=config,
    )
    # GPU or CPU
    args.device = "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu"
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c)
            for c in sorted(glob.glob(args.output_dir + "/**/" + "pytorch_model.bin", recursive=True))
        )
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(logging.WARN)  # Reduce model loading logs
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

        logger.info("Evaluate the following checkpoints: %s", checkpoints)

        for checkpoint in checkpoints:
            # Reload the model
            global_step = checkpoint.split("-")[-1]
            model = MODEL_FOR_QUESTION_ANSWERING[args.model_type].from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, global_step=global_step)
            result = dict((k + ("_{}".format(global_step) if global_step else ""), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
Example #3
import os
import sys

from dotenv import load_dotenv

from src import init_logger

logger = init_logger("Config")

load_dotenv(override=True)


def getKey(key: str, default_value=None):
    value = os.getenv(key)
    if value is None:
        if default_value is not None:
            return default_value
        logger.critical("{} is not defined.".format(key))
        sys.exit(1)
    return value


GOOGLE_TOKEN = getKey("GOOGLE_TOKEN")
DISCORD_TOKEN = getKey("DISCORD_TOKEN")
DISCORD_CHANNEL_ID = getKey("DISCORD_CHANNEL_ID")
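
Usage sketch: a key looked up without a default is treated as required and aborts the process when missing, while passing default_value makes it optional (API_TOKEN and LOG_LEVEL are hypothetical keys).

LOG_LEVEL = getKey("LOG_LEVEL", default_value="INFO")  # optional: falls back to "INFO"
API_TOKEN = getKey("API_TOKEN")                        # required: exits if unset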
Example #4
def main(cli_args):
    # Read from config file and make args
    with open(os.path.join(cli_args.config_dir, cli_args.config_file)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))
    logger.info("cliargs parameters {}".format(cli_args))

    args.output_dir = os.path.join(args.ckpt_dir, cli_args.result_dir)
    args.model_mode = cli_args.model_mode
    args.margin = cli_args.margin

    init_logger()
    set_seed(args)

    model_link = None
    if cli_args.transformer_mode.upper() == "T5":
        model_link = "t5-base"
    elif cli_args.transformer_mode.upper() == "ELECTRA":
        model_link = "google/electra-base-discriminator"
    elif cli_args.transformer_mode.upper() == "ALBERT":
        model_link = "albert-base-v2"
    elif cli_args.transformer_mode.upper() == "ROBERTA":
        model_link = "roberta-base"
    elif cli_args.transformer_mode.upper() == "BERT":
        model_link = "bert-base-uncased"

    logger.info(model_link)
    tokenizer = AutoTokenizer.from_pretrained(model_link)

    args.test_file = os.path.join(cli_args.dataset, args.test_file)
    args.dev_file = os.path.join(cli_args.dataset, args.dev_file)
    args.train_file = os.path.join(cli_args.dataset, args.train_file)
    # Load dataset
    train_dataset = BaseDataset(args, tokenizer,
                                mode="train") if args.train_file else None
    dev_dataset = BaseDataset(args, tokenizer,
                              mode="dev") if args.dev_file else None
    test_dataset = BaseDataset(args, tokenizer,
                               mode="test") if args.test_file else None

    if dev_dataset is None:
        args.evaluate_test_during_training = True  # No dev set: evaluate on the test set during training

    args.logging_steps = int(len(train_dataset) / args.train_batch_size) + 1
    args.save_steps = args.logging_steps
    labelNumber = train_dataset.getLabelNumber()

    labels = [str(i) for i in range(labelNumber)]
    config = AutoConfig.from_pretrained(model_link)

    # GPU or CPU
    args.device = "cuda:{}".format(
        cli_args.gpu
    ) if torch.cuda.is_available() and not args.no_cuda else "cpu"
    config.device = args.device
    args.model_mode = cli_args.model_mode

    model = MODEL_LIST[cli_args.model_mode](model_link, args.model_type,
                                            args.model_name_or_path, config,
                                            labelNumber, args.margin)
    model.to(args.device)

    if args.do_train:
        global_step, tr_loss = train(args, model, train_dataset, dev_dataset,
                                     test_dataset)
        logger.info(" global_step = {}, average loss = {}".format(
            global_step, tr_loss))

    results = {}
    if args.do_eval:
        checkpoints = list(
            os.path.dirname(c) for c in sorted(
                glob.glob(args.output_dir + "/**/" + "pytorch_model.bin",
                          recursive=True)))
        if not args.eval_all_checkpoints:
            checkpoints = checkpoints[-1:]
        else:
            logging.getLogger("transformers.configuration_utils").setLevel(
                logging.WARN)  # Reduce logging
            logging.getLogger("transformers.modeling_utils").setLevel(
                logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1]
            model = MODEL_LIST[args.model_mode].from_pretrained(checkpoint)
            result = evaluate(args,
                              model,
                              test_dataset,
                              mode="test",
                              global_step=global_step)
            result = dict(
                (k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as f_w:
            for key in sorted(results.keys()):
                f_w.write("{} = {}\n".format(key, str(results[key])))
Example #5
def main(cli_args):
    # Read from config file and make args
    max_checkpoint = "checkpoint-best"

    with open(os.path.join(cli_args.config_dir, cli_args.config_file)) as f:
        args = AttrDict(json.load(f))
    logger.info("Training/evaluation parameters {}".format(args))
    logger.info("cliargs parameters {}".format(cli_args))

    args.output_dir = os.path.join(args.ckpt_dir, cli_args.result_dir)
    args.model_mode = cli_args.model_mode
    args.device = "cuda:{}".format(
        cli_args.gpu
    ) if torch.cuda.is_available() and not args.no_cuda else "cpu"

    init_logger()
    set_seed(args)

    model_link = None
    if cli_args.transformer_mode.upper() == "T5":
        model_link = "t5-base"
    elif cli_args.transformer_mode.upper() == "ELECTRA":
        model_link = "google/electra-base-discriminator"
    elif cli_args.transformer_mode.upper() == "ALBERT":
        model_link = "albert-base-v2"
    elif cli_args.transformer_mode.upper() == "ROBERTA":
        model_link = "roberta-base"
    elif cli_args.transformer_mode.upper() == "BERT":
        model_link = "bert-base-uncased"

    tokenizer = AutoTokenizer.from_pretrained(model_link)

    args.test_file = os.path.join(cli_args.dataset, args.test_file)
    args.dev_file = os.path.join(cli_args.dataset, args.dev_file)
    args.train_file = os.path.join(cli_args.dataset, args.train_file)
    # Load dataset
    train_dataset = BaseDataset(args, tokenizer,
                                mode="train") if args.train_file else None
    dev_dataset = BaseDataset(args, tokenizer,
                              mode="dev") if args.dev_file else None
    test_dataset = BaseDataset(args, tokenizer,
                               mode="test") if args.test_file else None

    if dev_dataset is None:
        args.evaluate_test_during_training = True  # No dev set: evaluate on the test set during training

    args.logging_steps = int(len(train_dataset) / args.train_batch_size) + 1
    args.save_steps = args.logging_steps
    labelNumber = train_dataset.getLabelNumber()

    labels = [str(i) for i in range(labelNumber)]
    config = AutoConfig.from_pretrained(model_link)

    args.device = "cuda:{}".format(
        cli_args.gpu
    ) if torch.cuda.is_available() and not args.no_cuda else "cpu"
    config.device = args.device
    args.model_mode = cli_args.model_mode

    logger.info("Testing model checkpoint to {}".format(max_checkpoint))
    global_step = max_checkpoint.split("-")[-1]

    model = MODEL_LIST[cli_args.model_mode](model_link, args.model_type,
                                            args.model_name_or_path, config,
                                            labelNumber, -0.75)
    model.load_state_dict(
        torch.load(
            os.path.join("ckpt", cli_args.result_dir, max_checkpoint,
                         "training_model.bin")))

    model.to(args.device)

    preds, labels, result, txt_all = evaluate(args,
                                              model,
                                              test_dataset,
                                              mode="test",
                                              global_step=global_step)
    pred_and_labels = pd.DataFrame([])
    pred_and_labels["data"] = txt_all
    pred_and_labels["pred"] = preds
    pred_and_labels["label"] = labels
    pred_and_labels["result"] = preds == labels
    decode_result = list(pred_and_labels["data"].apply(
        lambda x: tokenizer.convert_ids_to_tokens(tokenizer(x)["input_ids"])))
    pred_and_labels["tokenizer"] = decode_result

    pred_and_labels.to_csv(
        os.path.join("ckpt", cli_args.result_dir,
                     "test_result_" + max_checkpoint + ".csv"),
        encoding="utf-8")
Example #6
def main(cli_args):
    # Read from config file and make args
    max_checkpoint = "checkpoint-best"

    args = torch.load(os.path.join("ckpt", cli_args.result_dir, max_checkpoint, "training_args.bin"))
    args.test_file = cli_args.test_file
    with open(os.path.join(cli_args.config_dir, cli_args.config_file)) as f:
        config = json.load(f)
        args.data_dir = config["data_dir"]
        if args.test_file is None:
            args.test_file = config["test_file"]
    logger.info("Testing parameters {}".format(args))

    args.model_mode = cli_args.model_mode
    args.device = "cuda:"+str(cli_args.gpu)

    init_logger()

    labels = ["0", "1"]
    config = CONFIG_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path,
        num_labels=2,
        id2label={str(i): label for i, label in enumerate(labels)},
        label2id={label: i for i, label in enumerate(labels)},
    )

    tokenizer = TOKENIZER_CLASSES[args.model_type].from_pretrained(
        args.model_name_or_path,
        do_lower_case=args.do_lower_case
    )
    args.device = "cuda:{}".format(cli_args.gpu) if torch.cuda.is_available() and not args.no_cuda else "cpu"
    config.device = args.device
    logger.info(args.test_file)
    # Load dataset
    test_dataset = BaseDataset(args, tokenizer, mode="test") if args.test_file else None

    logger.info("Testing model checkpoint to {}".format(max_checkpoint))
    global_step = max_checkpoint.split("-")[-1]
    model = MODEL_LIST[cli_args.model_mode](args.model_type, args.model_name_or_path, config)
    model.load_state_dict(torch.load(os.path.join("ckpt", cli_args.result_dir, max_checkpoint, "training_model.bin")))

    model.to(args.device)

    if "KOSAC" in args.model_mode:
        preds, labels, result, txt_all, polarity_ids, intensity_ids = evaluate(args, model, test_dataset, mode="test",
                                                                             global_step=global_step)
    else:
        preds, labels, result, txt_all= evaluate(args, model, test_dataset, mode="test",
                                                                               global_step=global_step)
    pred_and_labels = pd.DataFrame([])
    pred_and_labels["data"] = txt_all
    pred_and_labels["pred"] = preds
    pred_and_labels["label"] = labels
    pred_and_labels["result"] = preds == labels
    decode_result = list(
        pred_and_labels["data"].apply(lambda x: tokenizer.convert_ids_to_tokens(tokenizer(x)["input_ids"])))
    pred_and_labels["tokenizer"] = decode_result

    if "KOSAC" in args.model_mode:
        tok_an = [list(zip(x, test_dataset.convert_ids_to_polarity(y)[:len(x) + 1], test_dataset.convert_ids_to_intensity(z)[:len(x) + 1])) for x, y, z in
                  zip(decode_result, polarity_ids, intensity_ids)]
        pred_and_labels["tokenizer_analysis(token,polarity,intensitiy)"] = tok_an

    # to_excel in recent pandas no longer accepts an encoding argument;
    # the Excel writer handles character encoding itself.
    pred_and_labels.to_excel(
        os.path.join("ckpt", cli_args.result_dir,
                     "test_result_" + max_checkpoint + ".xlsx"))