def setUp(self):
    config_static = str(
        pathlib.Path(__file__).parent.absolute().joinpath(
            "test_configs/phrase_context_static_config.json"))
    config_contextualized = str(
        pathlib.Path(__file__).parent.absolute().joinpath(
            "test_configs/phrase_context_contextualized_config.json"))
    with open(config_static, 'r') as f:
        self.config_static = json.load(f)
    with open(config_contextualized, 'r') as f:
        self.config_contextualized = json.load(f)
    # build one dataset from static embeddings and one from contextualized
    # (BERT) embeddings; get_datasets returns (train, valid, test)
    _, _, self.static_set = training_utils.get_datasets(self.config_static)
    _, _, self.bert_set = training_utils.get_datasets(
        self.config_contextualized)
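
# For reference, a minimal sketch of the structure such a config file might
# have. Every key below is an assumption inferred from how configs are
# consumed elsewhere in this codebase (config["model_path"],
# config["train_data_path"], config["data_loader"]["separator"], ...);
# the actual test configs may contain different or additional fields.
_example_config = {
    "seed": 42,
    "model_path": "models",
    "save_name": "static_model",
    "train_data_path": "data/train.txt",
    "validation_data_path": "data/dev.txt",
    "test_data_path": "data/test.txt",
    "data_loader": {
        "separator": " ",
        "modifier": "modifier",
        "head": "head",
        "phrase": "phrase",
    },
}
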
def setUp(self):
    config_static = str(
        pathlib.Path(__file__).parent.absolute().joinpath(
            "test_configs/simple_phrase_config.json"))
    with open(config_static, 'r') as f:
        self.config_static = json.load(f)
    _, _, self._static_dataset = training_utils.get_datasets(
        self.config_static)
    # draw a single batch of two instances to run the models on
    self._data = DataLoader(self._static_dataset, batch_size=2)
    self._batch = next(iter(self._data))
    self._batch["device"] = "cpu"
    # model dimensions shared by the tests below
    self.input_dim = 600
    self.hidden_dim = 6
    self.labels = 6
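
# Hypothetical illustration (not part of the original tests): input_dim=600
# plausibly corresponds to concatenating a 300-dim modifier embedding with a
# 300-dim head embedding, matching the 300-dim static embeddings used in the
# other tests here. This correspondence is an assumption, not confirmed by
# the source.
import torch

modifier_vec = torch.rand(2, 300)  # batch of 2 modifier embeddings
head_vec = torch.rand(2, 300)  # batch of 2 head embeddings
assert torch.cat([modifier_vec, head_vec], dim=1).shape == (2, 600)
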
def setUp(self):
    config_static = str(
        pathlib.Path(__file__).parent.absolute().joinpath(
            "test_configs/simple_phrase_config.json"))
    data_pretrain = str(
        pathlib.Path(__file__).parent.absolute().joinpath(
            "data_pretraining/train.txt"))
    embeddings = str(
        pathlib.Path(__file__).parent.absolute().joinpath(
            "embeddings/german-structgram-mincount-30-ctx-10-dims-300.fifu"))
    with open(config_static, 'r') as f:
        self.config_static = json.load(f)
    _, _, self.simple_phrase_test = training_utils.get_datasets(
        self.config_static)
    self.data_loader = DataLoader(dataset=self.simple_phrase_test,
                                  batch_size=4)
    self.pretrain_dataset = StaticRankingDataset(data_path=data_pretrain,
                                                 embedding_path=embeddings,
                                                 separator=" ",
                                                 head="head",
                                                 mod="modifier",
                                                 phrase="phrase")
    self.pretrain_loader = DataLoader(dataset=self.pretrain_dataset,
                                      batch_size=4)
    # multiclass classifier trained from scratch
    self.model_multiclass = MatrixTwoWordClassifier(input_dim=300,
                                                    hidden_dim=100,
                                                    label_nr=3,
                                                    dropout_rate=0.1,
                                                    normalize_embeddings=True)
    # pretraining model whose weights are later transferred
    self.model_pretrain = MatrixPretrain(input_dim=300,
                                         dropout_rate=0.1,
                                         normalize_embeddings=True)
    # train and save the classifier so the transfer model can load it
    self.train_matrix_classifier()
    self.model_transfer = MatrixTransferClassifier(
        input_dim=300,
        hidden_dim=100,
        label_nr=3,
        dropout_rate=0.1,
        normalize_embeddings=True,
        pretrained_model="models/matrix_classifier")
    # train and save the pretraining model for the transfer ranker
    self.train_matrix_pretrain()
    self.model_transfer_rank = MatrixTransferRanker(
        dropout_rate=0.1,
        normalize_embeddings=True,
        pretrained_model="models/matrix_pretrain")
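
# A minimal sketch of what a helper like train_matrix_classifier could look
# like. Everything here is an assumption except the save path
# "models/matrix_classifier" consumed by MatrixTransferClassifier above: the
# model's forward signature, the batch keys, and the checkpoint format are
# all hypothetical and would need to match the real classes.
import torch


def train_matrix_classifier(self):
    optimizer = torch.optim.Adam(self.model_multiclass.parameters(), lr=0.01)
    loss_fn = torch.nn.CrossEntropyLoss()
    for batch in self.data_loader:
        batch["device"] = "cpu"
        optimizer.zero_grad()
        # assumed: the model consumes the batch dict and returns logits
        output = self.model_multiclass(batch)
        # assumed: each batch carries a "label" tensor
        loss = loss_fn(output, batch["label"])
        loss.backward()
        optimizer.step()
    torch.save(self.model_multiclass.state_dict(), "models/matrix_classifier")
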
with open(argp.path_to_config, 'r') as f:
    config = json.load(f)

# paths for storing predictions and evaluation results
prediction_path_dev = str(
    Path(config["model_path"]).joinpath(config["save_name"] +
                                        "_dev_predictions.npy"))
prediction_path_test = str(
    Path(config["model_path"]).joinpath(config["save_name"] +
                                        "_test_predictions.npy"))
eval_path_dev = str(
    Path(config["model_path"]).joinpath(config["save_name"] +
                                        "_evaluation_dev.txt"))
eval_path_test = str(
    Path(config["model_path"]).joinpath(config["save_name"] +
                                        "_evaluation_test.txt"))

dataset_train, dataset_valid, dataset_test = get_datasets(config)
# load the validation data in a single batch
valid_loader = DataLoader(dataset_valid,
                          batch_size=len(dataset_valid),
                          shuffle=False)
# load the test data in a single batch
test_loader = DataLoader(dataset_test,
                         batch_size=len(dataset_test),
                         shuffle=False)

if argp.ranking:
    rank_path_dev = config["model_path"] + "_dev_ranks.txt"
    rank_path_test = config["model_path"] + "_test_ranks.txt"
    labels = extract_all_labels(
        training_data=config["train_data_path"],
model_path, "_reconstructed_rep", "dev") prediction_path_test_reconstructed, rank_path_test_reconstructed = get_save_path( model_path, "_reconstructed_rep", "test") logging.config.dictConfig(create_config(log_file)) logger = logging.getLogger("train") logger.info( "Training a joint model with the following parameter for the first dataset: %s" % str(config_1)) logger.info("The following parameter for the second dataset: %s" % str(config_2)) # set random seed np.random.seed(config_1["seed"]) # create two PretrainCompModel datasets dataset_train_1, dataset_valid_1, dataset_test_1 = get_datasets(config_1) dataset_train_2, dataset_valid_2, dataset_test_2 = get_datasets(config_2) assert type(dataset_train_1) == StaticRankingDataset and type( dataset_train_2 ) == StaticRankingDataset, "the dataset type is invalid for this kind of training" labels_dataset_1 = extract_all_words( training_data=config_1["train_data_path"], validation_data=config_1["validation_data_path"], test_data=config_1["test_data_path"], separator=config_1["data_loader"]["separator"], modifier=config_1["data_loader"]["modifier"], head=config_1["data_loader"]["head"], phrase=config_1["data_loader"]["phrase"]) labels_dataset_2 = extract_all_labels(