Ejemplo n.º 1
0
def setup_bert_embedder_large(request):
    """Build a 1024-dimensional ``BertEmbedder`` and a two-sentence batch.

    Args:
        request: Object whose ``param`` attribute is indexed as
            ``(bert_type, aggregation_type)``. Presumably a pytest fixture
            request; the fixture decorator is not visible here.

    Returns:
        A ``(embedder, iter_dict)`` tuple, where ``iter_dict`` maps
        ``"raw_instance"`` to the list of sample sentences.
    """
    params = request.param

    embedder = BertEmbedder(
        emb_dim=1024,
        dropout_value=0.0,
        aggregation_type=params[1],
        bert_type=params[0],
    )

    sample_batch = {
        "raw_instance": [
            "Lets start by talking politics",
            "there are radical ways to test your code",
        ]
    }
    return embedder, sample_batch
Ejemplo n.º 2
0
def get_bow_bert_emb_lc_gensect_infer(dirname: str):
    """Assemble a ``ClassificationInference`` client for a saved experiment.

    Two JSON files are read from ``dirname``:

    * ``config.json`` supplies ``EMBEDDING_DIMENSION``, ``NUM_CLASSES``,
      ``BERT_TYPE``, ``DEVICE`` and ``MODEL_SAVE_DIR``.
    * ``test_dataset_params.json`` supplies the keyword arguments that are
      forwarded to ``GenericSectDataset``.

    Args:
        dirname: Path of the experiment directory holding both JSON files.

    Returns:
        A ``ClassificationInference`` instance built from the model, the
        ``best_model.pt`` path under ``MODEL_SAVE_DIR`` and the dataset.
    """
    experiment_dir = pathlib.Path(dirname)

    with open(experiment_dir.joinpath("config.json"), "r") as config_file:
        config = json.load(config_file)

    with open(
        experiment_dir.joinpath("test_dataset_params.json"), "r"
    ) as params_file:
        dataset_kwargs = json.load(params_file)

    # Keys are read in the same order as before so that a missing key
    # surfaces as the same KeyError.
    embedding_dim = config["EMBEDDING_DIMENSION"]
    num_classes = config["NUM_CLASSES"]
    bert_type = config["BERT_TYPE"]
    device_name = config["DEVICE"]
    checkpoint_path = os.path.join(config["MODEL_SAVE_DIR"], "best_model.pt")

    # The embedder is created first, then wrapped by the encoder.
    bow_encoder = BOW_Encoder(
        embedder=BertEmbedder(
            emb_dim=embedding_dim,
            dropout_value=0.0,
            aggregation_type="average",
            bert_type=bert_type,
            device=torch.device(device_name),
        ),
        emb_dim=embedding_dim,
        aggregation_type="average",
    )

    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=embedding_dim,
        num_classes=num_classes,
        classification_layer_bias=True,
    )

    return ClassificationInference(
        model=classifier,
        model_filepath=checkpoint_path,
        dataset=GenericSectDataset(**dataset_kwargs),
    )
Ejemplo n.º 3
0
def setup_bert_embedder(request):
    """Build a ``BertEmbedder`` and two sample ``Line`` objects.

    Args:
        request: Object whose ``param`` attribute unpacks into exactly
            ``(bert_type, aggregation_type)``. Presumably a pytest fixture
            request; the fixture decorator is not visible here.

    Returns:
        A ``(embedder, lines)`` tuple, where ``lines`` is a list with one
        ``Line`` per sample sentence.
    """
    bert_type, aggregation_type = request.param

    embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type=aggregation_type,
        bert_type=bert_type,
    )

    sentences = (
        "Lets start by talking politics",
        "there are radical ways to test your code",
    )
    return embedder, [Line(text=sentence) for sentence in sentences]
Ejemplo n.º 4
0
def build_sectlabel_bow_bert(dirname: str):
    """Assemble a ``ClassificationInference`` client for the sectLabel data.

    The train/dev/test files ``sectLabel.train``, ``sectLabel.dev`` and
    ``sectLabel.test`` are located under the module-level ``DATA_DIR``. The
    model is a ``SimpleClassifier`` over a ``BOW_Encoder`` that wraps a
    ``bert-base-uncased`` ``BertEmbedder`` placed on the CPU device.

    Args:
        dirname: Experiment directory; the checkpoint path handed to the
            inference client is ``<dirname>/checkpoints/best_model.pt``.

    Returns:
        The configured ``ClassificationInference`` instance.
    """
    experiment_dir = pathlib.Path(dirname)
    data_root = pathlib.Path(DATA_DIR)

    datasets = TextClassificationDatasetManager(
        train_filename=str(data_root / "sectLabel.train"),
        dev_filename=str(data_root / "sectLabel.dev"),
        test_filename=str(data_root / "sectLabel.test"),
    )

    bert = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type="bert-base-uncased",
        device=torch.device("cpu"),
    )
    bow_encoder = BOW_Encoder(embedder=bert, aggregation_type="average")

    classifier = SimpleClassifier(
        encoder=bow_encoder,
        encoding_dim=768,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=datasets,
    )

    checkpoint_path = str(experiment_dir / "checkpoints" / "best_model.pt")
    return ClassificationInference(
        model=classifier,
        model_filepath=checkpoint_path,
        datasets_manager=datasets,
    )
Ejemplo n.º 5
0
    args = parser.parse_args()

    DATA_PATH = pathlib.Path(DATA_DIR)
    train_file = DATA_PATH.joinpath("sectLabel.train")
    dev_file = DATA_PATH.joinpath("sectLabel.dev")
    test_file = DATA_PATH.joinpath("sectLabel.test")

    data_manager = TextClassificationDatasetManager(
        train_filename=str(train_file),
        dev_filename=str(dev_file),
        test_filename=str(test_file),
    )

    embedder = BertEmbedder(
        dropout_value=0.0,
        aggregation_type="average",
        bert_type=args.bert_type,
        device=torch.device(args.device),
    )

    encoder = BOW_Encoder(embedder=embedder,
                          aggregation_type="average",
                          device=args.device)

    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=768,
        num_classes=23,
        classification_layer_bias=True,
        datasets_manager=data_manager,
        device=args.device,
    )
Ejemplo n.º 6
0
    with open(os.path.join(EXP_DIR_PATH, "test_dataset_params.json"),
              "w") as fp:
        json.dump(test_dataset_params, fp)

    VOCAB_SIZE = train_dataset.word_vocab.get_vocab_len()
    NUM_CLASSES = train_dataset.get_num_classes()
    config["VOCAB_SIZE"] = VOCAB_SIZE
    config["NUM_CLASSES"] = NUM_CLASSES
    with open(os.path.join(EXP_DIR_PATH, "config.json"), "w") as fp:
        json.dump(config, fp)

    embedder = BertEmbedder(
        emb_dim=EMBEDDING_DIMENSION,
        dropout_value=0.0,
        aggregation_type="average",
        bert_type=BERT_TYPE,
        device=torch.device(DEVICE),
    )

    encoder = BOW_Encoder(emb_dim=EMBEDDING_DIMENSION,
                          embedder=embedder,
                          aggregation_type="average")
    model = SimpleClassifier(
        encoder=encoder,
        encoding_dim=EMBEDDING_DIMENSION,
        num_classes=NUM_CLASSES,
        classification_layer_bias=True,
    )

    optimizer = optim.Adam(params=model.parameters(), lr=LEARNING_RATE)
Ejemplo n.º 7
0
        train_filename=train_filename,
        dev_filename=dev_filename,
        test_filename=test_filename,
        column_names=["POS", "DEP", "NER"],
        train_only="ner",
        namespace_vocab_options={
            "tokens": {"preprocessing_pipeline": [instance_preprocessing.lowercase]}
        },
    )

    embedder = WordEmbedder(
        embedding_type=args.emb_type, datasets_manager=data_manager, device=args.device
    )

    bert_embedder = BertEmbedder(
        datasets_manager=data_manager, device=args.device, bert_type=args.bert_type
    )

    embedder = ConcatEmbedders([embedder, bert_embedder])

    lstm2seqencoder = Lstm2SeqEncoder(
        embedder=embedder,
        dropout_value=args.dropout,
        hidden_dim=args.hidden_dim,
        bidirectional=args.bidirectional,
        combine_strategy=args.combine_strategy,
        rnn_bias=True,
        device=args.device,
        num_layers=args.num_layers,
        add_projection_layer=args.add_projection_layer,
    )