Example #1
    print(f"Labels mapping: {LABELS_TO_ID}")
    print()

    metrics = {"validation": [AccuracyMeter]}

    model_config = config.SenseModelParameters(model_name=args.config_name,
                                               hidden_size=args.hidden_size,
                                               num_classes=len(LABELS_TO_ID),
                                               freeze_weights=False,
                                               context_layers=(-1, ))

    configuration = config.Configuration(
        model_parameters=model_config,
        model=args.model,
        save_path=args.save_path,
        sequence_max_len=args.seq_len,
        batch_size=args.batch_size,
        epochs=args.epochs,
        device=torch.device(args.device),
        tokenizer=transformers.AutoTokenizer.from_pretrained(args.model),
    )
    """
    model = TransformerWrapper.load_pretrained(
            args.pretrained_model, 
            params=configuration,
            pooler = BertPoolingStrategy(configuration),
            loss = SoftmaxLoss(configuration))
    """
    model = DistilBertForSequenceClassification.from_pretrained(
        args.pretrained_model, num_labels=len(LABELS_TO_ID))

    train_split, valid_split = dataset.split_dataset(test_perc=0.1)
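The snippet ends right after the split, before any training happens. Below is a minimal sketch of the loop that would typically follow, assuming train_split yields (text, label) pairs and that Configuration exposes its constructor arguments as attributes; encode_batch is an illustrative helper, not part of the project:

import torch
from torch.utils.data import DataLoader

def encode_batch(batch, tokenizer, max_len, device):
    # Tokenize a list of (text, label) pairs into model-ready tensors.
    texts, labels = zip(*batch)
    encoded = tokenizer(list(texts), padding=True, truncation=True,
                        max_length=max_len, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in encoded.items()}
    return inputs, torch.tensor(labels, device=device)

train_loader = DataLoader(train_split, batch_size=configuration.batch_size,
                          shuffle=True, collate_fn=list)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
model.to(configuration.device).train()

for epoch in range(configuration.epochs):
    for batch in train_loader:
        inputs, labels = encode_batch(batch, configuration.tokenizer,
                                      configuration.sequence_max_len,
                                      configuration.device)
        # The HF classifier computes cross-entropy when labels are given.
        loss = model(**inputs, labels=labels).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()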
Example #2
if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    parser.add_argument('--name', type=str, dest="config_name")
    parser.add_argument(
        '--model',
        type=str,
        dest="model",
        default="sentence-transformers/quora-distilbert-multilingual")
    parser.add_argument('--save_path',
                        dest="save_path",
                        type=str,
                        default="./output")

    args = parser.parse_args()

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    model_config = config.ModelParameters(model_name=args.config_name)

    configuration = config.Configuration(model_parameters=model_config,
                                         model=args.model,
                                         save_path=args.save_path,
                                         tokenizer=tokenizer)

    sentence_model = OnnxSentenceTransformerWrapper.load_pretrained(
        path=args.model, params=configuration)

    convert_to_onnx(sentence_model, configuration, quantize=True)
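Once convert_to_onnx has run, the exported model can be served with ONNX Runtime. A minimal sketch; the output filename and the tensor layout (input_ids and attention_mask in, token embeddings out) are assumptions about the exporter's conventions, not guaranteed by the project:

import numpy as np
import onnxruntime as ort

# Filename and tensor names are assumptions about convert_to_onnx's output.
session = ort.InferenceSession("./output/model_quantized.onnx")

encoded = tokenizer("how do I learn python?", return_tensors="np")
feeds = {"input_ids": encoded["input_ids"].astype(np.int64),
         "attention_mask": encoded["attention_mask"].astype(np.int64)}
token_embeddings = session.run(None, feeds)[0]

# Mean-pool the token embeddings into one sentence vector, ignoring padding.
mask = feeds["attention_mask"][..., None]
sentence_embedding = (token_embeddings * mask).sum(1) / mask.sum(1)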
Example #3

    model_config = config.SenseModelParameters(
        model_name=args.config_name,
        hidden_size=args.hidden_size,
        num_classes=2,
        use_pretrained_embeddings=args.use_pretrained_embeddings,
        freeze_weights=args.freeze_weights,
        context_layers=(-1, -2, -3, -4))

    configuration = config.Configuration(
        model_parameters=model_config,
        model=args.model,
        save_path=args.save_path,
        sequence_max_len=args.seq_len,
        dropout_prob=args.dropout,
        lr=args.lr,
        batch_size=args.batch_size,
        epochs=args.epochs,
        device=torch.device(args.device),
        embedding_map=config.CONFIG.embedding_map,
        bnids_map=config.CONFIG.bnids_map,
        tokenizer=transformers.AutoTokenizer.from_pretrained(args.model),
        pretrained_embeddings_dim=config.DIMENSIONS_MAP[
            args.sense_embeddings_type],
        senses_as_features=args.senses_as_features)

    model = WordEncoderModel(
        params=configuration,
        loss=SoftmaxLoss,
        pooling_strategy=POOLING_STRATEGIES[args.pooling_strategy],
    )
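Here context_layers=(-1, -2, -3, -4) selects the last four hidden layers of the backbone as the word's context representation. How WordEncoderModel combines them is project-specific; the sketch below shows the equivalent selection with the plain transformers API, averaging the four layers per token:

import torch
import transformers

# Stand-alone illustration with a stock checkpoint; not the project's model.
tok = transformers.AutoTokenizer.from_pretrained("distilbert-base-uncased")
backbone = transformers.AutoModel.from_pretrained(
    "distilbert-base-uncased", output_hidden_states=True)

encoded = tok("the bank of the river", return_tensors="pt")
with torch.no_grad():
    # hidden_states: embeddings output plus one tensor per transformer layer.
    hidden_states = backbone(**encoded).hidden_states

context_layers = (-1, -2, -3, -4)
stacked = torch.stack([hidden_states[i] for i in context_layers])
token_reprs = stacked.mean(dim=0)  # (batch, seq_len, hidden_size)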

Example #4

    num_train_steps = len(train_data_loader) * args.epochs
    tokenizer = DistilBertTokenizer.from_pretrained(args.pretrained_path)

    model_config = config.ModelParameters(
        model_name=args.config_name,
        hidden_size=768,
        num_classes=10,
        use_pretrained_embeddings=False,
        freeze_weights=False,
        context_layers=(-1,)
    )

    configuration = config.Configuration(
        model_parameters=model_config,
        model="",
        save_path=args.save_path,
        sequence_max_len=128,
        dropout_prob=0.1,
        lr=2e-5,
        batch_size=16,
        epochs=1,
        device=torch.device("cpu"),
        tokenizer=tokenizer,
    )

    
    model = transformers.DistilBertForSequenceClassification.from_pretrained(
        args.pretrained_path)
    model.eval()

    # Japanese example sentence: "Fukuoka City will introduce a new hiring
    # scheme for public elementary and junior high school teachers from 2022,
    # dropping the written exam and the interview." Renamed from `input`,
    # which shadows the Python builtin.
    sample_text = "福岡市は公立小中学校などの教員採用で、筆記試験と面接を省く新たな採用方式を2022年から導入する"

    convert_to_torchscript(model, tokenizer, sample_text,
                           os.path.join(args.save_path, args.config_name))
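The traced module can then be reloaded without any transformers code at inference time. A minimal sketch; the saved filename and the traced call signature (input_ids, attention_mask) are assumptions about what convert_to_torchscript produces:

import os
import torch

# Filename and call signature are assumptions about convert_to_torchscript.
scripted = torch.jit.load(os.path.join(args.save_path, args.config_name))
scripted.eval()

encoded = tokenizer(sample_text, return_tensors="pt")
with torch.no_grad():
    # Traced HF models return a tuple; the logits come first.
    logits = scripted(encoded["input_ids"], encoded["attention_mask"])[0]
print(logits.argmax(dim=-1))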
Example #5
    train_split, valid_split = dataset.split_dataset(test_perc=0.1)
    #train_dataset = Dataset(train_split)
    valid_dataset = Dataset(valid_split)
    LABELS_TO_ID = dataset.label_to_id

    model_config = config.SenseModelParameters(model_name=args.config_name,
                                               hidden_size=args.embed_dim,
                                               num_classes=len(LABELS_TO_ID),
                                               freeze_weights=False,
                                               context_layers=(-1, ))

    configuration = config.Configuration(
        model_parameters=model_config,
        model=args.model,
        save_path=args.output_dir,
        sequence_max_len=args.seq_len,
        batch_size=args.batch_size,
        epochs=args.epochs,
        device=torch.device(args.device),
        tokenizer=tokenizer,
    )

    valid_data_loader = SmartParaphraseDataloader.build_batches(
        valid_dataset, 16, mode="sequence", config=configuration)
    autoconfig = AutoConfig.from_pretrained(
        args.pretrained_model_path,
        output_attentions=True,
    )
    autoconfig.num_labels = len(LABELS_TO_ID)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.pretrained_model_path, config=autoconfig)
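With the validation loader and classifier in place, an evaluation pass would typically follow. A minimal sketch; the batch layout of SmartParaphraseDataloader is project-specific, so batches are assumed here to be dicts of tensors with input_ids, attention_mask and labels keys:

import torch

model.to(args.device).eval()
correct = total = 0
with torch.no_grad():
    for batch in valid_data_loader:
        # Assumed batch layout: dict of tensors including the labels.
        labels = batch.pop("labels").to(args.device)
        batch = {k: v.to(args.device) for k, v in batch.items()}
        logits = model(**batch).logits
        correct += (logits.argmax(dim=-1) == labels).sum().item()
        total += labels.size(0)
print(f"Validation accuracy: {correct / total:.4f}")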
    """
                        type=float,
                        dest="corpus_percentage",
                        default=0.01)
    parser.add_argument('--nq', type=int, dest="num_queries", default=10)
    parser.add_argument('--topk', type=int, dest="topk", default=3)
    parser.add_argument('--sbert', dest="use_sbert", action="store_true")
    args = parser.parse_args()

    random.seed(43)

    model_config = config.ModelParameters(model_name="eval_sentence_mining")

    configuration_teacher = config.Configuration(
        model_parameters=model_config,
        model=args.teacher_model,
        save_path="./results",
        batch_size=16,
        device=torch.device(args.device),
        tokenizer=transformers.AutoTokenizer.from_pretrained(
            args.teacher_model, use_fast=True))

    configuration_student = config.Configuration(
        model_parameters=model_config,
        model=args.student_model,
        save_path="./results",
        batch_size=16,
        device=torch.device(args.device),
        tokenizer=transformers.AutoTokenizer.from_pretrained(
            args.student_model, use_fast=True))

    teacher_model = SentenceTransformerWrapper.load_pretrained(
        args.teacher_model, params=configuration_teacher)
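The script then mines sentences with the teacher model. SentenceTransformerWrapper's API is not shown here, so as a stand-in the sketch assumes an encode() method that maps a list of sentences to a (n, dim) tensor of embeddings; the corpus and queries are toy data:

import torch
import torch.nn.functional as F

corpus = ["how do I learn python?", "best pizza in naples",
          "what does torch.jit.trace do?"]
queries = ["learning python quickly"]

# encode() is an assumed method returning a (n, dim) tensor of embeddings.
corpus_emb = F.normalize(teacher_model.encode(corpus), dim=-1)
query_emb = F.normalize(teacher_model.encode(queries), dim=-1)

# Cosine similarity reduces to a dot product on normalized vectors.
scores = query_emb @ corpus_emb.T
top = torch.topk(scores, k=min(args.topk, len(corpus)), dim=-1)
for qi, (vals, idxs) in enumerate(zip(top.values, top.indices)):
    hits = [(corpus[i], round(v.item(), 3)) for v, i in zip(vals, idxs)]
    print(queries[qi], "->", hits)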