def __init__(self, args):
    """Set up the run: open the log file, validate/log the command-line
    args, and build the ordered PARAMS dictionary of run settings.

    Args:
        args: parsed command-line namespace; must provide logfileaddress,
              data_folder, ann_set and ann_type (see __validate_args__).
    """
    self.GLOBAL_BEST_DEVEL_PRED_RESULTS = []
    self.args = args
    # Log handle lives for the whole run; lp() presumably writes through it.
    self._LogFileHandler = open(args.logfileaddress, "wt")
    self.lp("Program started ...")
    self.__validate_args__()

    self.PARAMS = collections.OrderedDict()
    # All three annotation files share the layout
    # <data_folder>/<ann_set>/<ann_type>/<ann_type>-<split>-annotations.txt
    prefix = args.data_folder + '/' + args.ann_set + '/' + args.ann_type + '/' + args.ann_type
    self.PARAMS["train_filename"] = prefix + '-train-annotations.txt'
    self.PARAMS["devel_filename"] = prefix + '-devel-annotations.txt'
    self.PARAMS["test_filename"] = prefix + '-test-annotations.txt'
    self.PARAMS["X_lower_row_len"] = 1    # Lower text length threshold
    self.PARAMS["X_upper_row_len"] = 400  # Upper text length threshold
    self.PARAMS["X_used_row_len"] = -1    # placeholder; set for real in __LoadData__
    # default size of the used word embeddings when no pre-created
    # embeddings model is given
    self.PARAMS["default_embeddings_dim"] = 300

    # BUG FIX: was '"" * 80', which yields an empty string instead of a
    # banner; '*' matches the closing banner below and the other methods.
    MSG = ["*" * 80, "PARAMETERS:", "-" * 20]
    for key in self.PARAMS.keys():
        MSG.append(GF.NVLR(key, 20) + " : " + str(self.PARAMS[key]))
    MSG.append("*" * 80)
    self.lp(MSG)
def __validate_args__(self):
    """Log every command-line argument (sorted by name) so each run's
    configuration is recorded in the log file."""
    self.lp("Validating args ...")
    D = self.args.__dict__
    # BUG FIX: was '"" * 80', which yields an empty string instead of a
    # banner; '*' matches the closing banner below and the other methods.
    MSG = ["*" * 80, "Command-Line args:", "-" * 20]
    for key in sorted(D.keys()):
        MSG.append(GF.NVLR(key, 20) + " : " + str(D[key]))
    MSG.append("*" * 80)
    self.lp(MSG)
def __LoadData__(self):
    """Load the train/devel/test annotation sets, pad them to a shared row
    length, log the dataset statistics, and stash the handlers plus the
    computed maxima on self / self.PARAMS."""
    self.lp("Fetching information about the data set ...")

    # Load the three splits in fixed order: train, devel, test.
    handlers = []
    for filename_key in ("train_filename", "devel_filename", "test_filename"):
        handler = X_y_dataHandler(ann_set=self.args.ann_set, include_o_labels=0)
        handler.load_data_set(self.PARAMS[filename_key])
        handlers.append(handler)
    train_data_obj, devel_data_obj, test_data_obj = handlers

    # Vocabulary / label maxima must cover every split.
    X_word_max_value = max(h.get_X_max_word_value() for h in handlers)
    X_lemma_max_value = max(h.get_X_max_lemma_value() for h in handlers)
    X_pos_max_value = max(h.get_X_max_pos_value() for h in handlers)
    y_max_value = max(h.get_y_max_value() for h in handlers)

    # Padded row length = longest row, clamped to [lower, upper].
    X_data_max_row_len = max(h.get_X_max_len() for h in handlers)
    X_used_row_len = min(
        max(X_data_max_row_len, self.PARAMS["X_lower_row_len"]),
        self.PARAMS["X_upper_row_len"])

    for h in handlers:
        h.make_numpy_arrays(X_used_row_len, y_max_value,
                            padding_side=self.args.padding_side)

    # Need to check again due to potential removal of the O label column
    y_max_value = max(h.get_y_max_value() for h in handlers)

    train_data_size = train_data_obj.get_size()
    devel_data_size = devel_data_obj.get_size()
    test_data_size = test_data_obj.get_size()

    MSG = [
        "*" * 80,
        "Information about data:",
        "-" * 30,
        GF.NVLR('word max value', 40) + ": " + str(X_word_max_value),
        GF.NVLR('lemma max value', 40) + ": " + str(X_lemma_max_value),
        GF.NVLR('pos max value', 40) + ": " + str(X_pos_max_value),
        GF.NVLR('used row length', 40) + ": " + str(X_used_row_len),
        GF.NVLR('max row length', 40) + ": " + str(X_data_max_row_len),
        GF.NVLR('max value', 40) + ": " + str(y_max_value),
        GF.NVLR('Train data', 40) + ": " + self.PARAMS["train_filename"],
        GF.NVLR('Train size', 40) + ": " + str(train_data_size),
        "",
        GF.NVLR('Devel data', 40) + ": " + self.PARAMS["devel_filename"],
        GF.NVLR('Devel size', 40) + ": " + str(devel_data_size),
        "",
        GF.NVLR('Test data', 40) + ": " + self.PARAMS["test_filename"],
        GF.NVLR('Test size', 40) + ": " + str(test_data_size),
        "",
    ]
    self.lp(MSG)

    self.train_data_obj = train_data_obj
    self.devel_data_obj = devel_data_obj
    self.test_data_obj = test_data_obj
    self.PARAMS["X_word_max_value"] = X_word_max_value
    self.PARAMS["X_lemma_max_value"] = X_lemma_max_value
    self.PARAMS["X_pos_max_value"] = X_pos_max_value
    self.PARAMS["X_used_row_len"] = X_used_row_len
    self.PARAMS["y_max_value"] = y_max_value