def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
    """Train a parser for ``options.epochs`` epochs, saving a model checkpoint
    and predicting/evaluating every dev file after each epoch.

    :param cls: parser class; must provide ``options_hook``,
        ``get_data_formats``, a ``(options, data_train)`` constructor,
        ``train``, ``predict`` and ``save``.
    :param options: namespace carrying title, output, data_format,
        conll_train, conll_dev, epochs, max_save, model and is_train.
    :param data_train: pre-loaded training sentences; loaded from
        ``options.conll_train`` when None.
    :param data_dev: dict mapping dev file name -> sentences; loaded from
        ``options.conll_dev`` when None.
    :param data_test: unused here; kept for signature compatibility with
        sibling implementations.
    """
    set_proc_name(options.title)
    ensure_dir(options.output)
    path = os.path.join(
        options.output,
        "{}_{}_train.log".format(options.title, int(time.time())))
    log_to_file(path)
    logger.name = options.title
    cls.options_hook(options)
    DataFormatClass = cls.get_data_formats()[options.data_format]
    if data_train is None:
        data_train = DataFormatClass.from_file(options.conll_train)
    if data_dev is None:
        data_dev = {i: DataFormatClass.from_file(i, False)
                    for i in options.conll_dev}
    try:
        os.makedirs(options.output)
    except OSError:
        pass  # output directory already exists
    parser = cls(options, data_train)
    # fixed seed so the shuffling order is reproducible across runs
    random_obj = random.Random(1)
    for epoch in range(options.epochs):
        logger.info('Starting epoch %d', epoch)
        random_obj.shuffle(data_train)
        options.is_train = True
        parser.train(data_train)

        # save model and delete models older than the max_save window
        for i in range(0, epoch - options.max_save):
            path = os.path.join(
                options.output, os.path.basename(options.model)) + str(i + 1)
            if os.path.exists(path):
                os.remove(path)
        path = os.path.join(
            options.output, os.path.basename(options.model)) + str(epoch + 1)
        parser.save(path)

        def predict(sentences, gold_file, output_file):
            # run the freshly-trained parser on sentences, write the result
            # to output_file and score it against gold_file
            options.is_train = False
            with open(output_file, "w") as f_output:
                if hasattr(DataFormatClass, "file_header"):
                    f_output.write(DataFormatClass.file_header + "\n")
                for i in parser.predict(sentences):
                    f_output.write(i.to_string())
            DataFormatClass.evaluate_with_external_program(gold_file,
                                                           output_file)

        for file_name, file_content in data_dev.items():
            try:
                prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
            except ValueError:
                # dev file name has no extension
                prefix = os.path.basename(file_name)
                suffix = ""
            dev_output = os.path.join(
                options.output,
                '{}_epoch_{}.{}'.format(prefix, epoch + 1, suffix))
            predict(file_content, file_name, dev_output)
def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
    """Load train/dev/test data and delegate to
    ``cls.repeat_train_and_validate``.

    :param cls: parser class providing ``options_hook``,
        ``get_data_formats`` and ``repeat_train_and_validate``.
    :param options: namespace with title, output, data_format, conll_train,
        conll_dev and conll_test fields.
    :param data_train: pre-loaded training sentences; loaded from
        ``options.conll_train`` when None.
    :param data_dev: dict mapping dev file name -> sentences; loaded from
        ``options.conll_dev`` when None.
    :param data_test: pre-loaded test sentences; loaded from
        ``options.conll_test`` when None and that option is set.
    :return: whatever ``cls.repeat_train_and_validate`` returns.
    """
    # set_proc_name is only meaningful (and safe) on Linux
    if sys.platform.startswith("linux"):
        set_proc_name(options.title)
    ensure_dir(options.output)
    path = os.path.join(
        options.output,
        "{}_{}_train.log".format(options.title, int(time.time())))
    log_to_file(path)
    logger.name = options.title
    cls.options_hook(options)
    DataFormatClass = cls.get_data_formats()[options.data_format]
    if data_train is None:
        data_train = DataFormatClass.from_file(options.conll_train)
    if data_dev is None:
        data_dev = {i: DataFormatClass.from_file(i, False)
                    for i in options.conll_dev}
    # BUG FIX: the original `else: data_test = None` discarded a
    # caller-supplied data_test whenever options.conll_test was unset;
    # only load when nothing was passed in, and never reset the argument.
    if data_test is None and options.conll_test is not None:
        data_test = DataFormatClass.from_file(options.conll_test, False)
    try:
        os.makedirs(options.output)
    except OSError:
        pass  # output directory already exists
    return cls.repeat_train_and_validate(data_train, data_dev, data_test,
                                         options)
def train_parser(cls, options, data_train=None, data_dev=None, data_test=None):
    """Train ``cls`` on the training data, checkpointing and predicting all
    dev files after every epoch; with ``options.epochs == 0`` just predict
    once with the freshly-constructed parser.

    :param cls: parser class providing ``DataType``, a
        ``(options, data_train)`` constructor, ``train``, ``save`` and
        ``predict_and_output``.
    :param options: namespace with title, output, conll_train, conll_dev,
        epochs, max_save and model fields.
    :param data_train: pre-loaded training sentences; loaded from
        ``options.conll_train`` when None.
    :param data_dev: dict mapping dev file name -> sentences; loaded from
        ``options.conll_dev`` when None.
    :param data_test: unused here; kept for signature compatibility with
        sibling implementations.
    """
    set_proc_name(options.title)
    ensure_dir(options.output)
    path = os.path.join(
        options.output,
        "{}_{}_train.log".format(options.title, int(time.time())))
    log_to_file(path)
    logger.name = options.title
    logger.info('Options:\n%s', pformat(options.__dict__))
    if data_train is None:
        data_train = cls.DataType.from_file(options.conll_train)
    if data_dev is None:
        data_dev = {i: cls.DataType.from_file(i, False)
                    for i in options.conll_dev}
    try:
        os.makedirs(options.output)
    except OSError:
        pass  # output directory already exists
    parser = cls(options, data_train)
    # fixed seed so the shuffling order is reproducible across runs
    random_obj = random.Random(1)

    def do_predict(epoch):
        # predict every dev file into "<prefix>_epoch_<epoch>.<suffix>"
        for file_name, dev_sentences in data_dev.items():
            try:
                prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
            except ValueError:
                # BUG FIX: use the basename here too — the original kept the
                # full path, so an extension-less dev file in a subdirectory
                # produced a nested (possibly nonexistent) output path.
                prefix = os.path.basename(file_name)
                suffix = ""
            dev_output = os.path.join(
                options.output,
                '{}_epoch_{}.{}'.format(prefix, epoch, suffix))
            cls.predict_and_output(parser, options, dev_sentences, dev_output)

    if options.epochs == 0:
        print("Predict directly.")
        do_predict(0)
    for epoch in range(options.epochs):
        logger.info('Starting epoch %d', epoch)
        random_obj.shuffle(data_train)
        parser.train(data_train)

        # save model and delete models older than the max_save window
        for i in range(0, epoch - options.max_save):
            path = os.path.join(
                options.output, os.path.basename(options.model)) + str(i + 1)
            if os.path.exists(path):
                os.remove(path)
        path = os.path.join(
            options.output, os.path.basename(options.model)) + str(epoch + 1)
        parser.save(path)
        do_predict(epoch)
def get_lib():
    """Build the xEisner native library for this host and load it via ctypes.

    The build runs cmake/make inside a per-host build directory (so several
    machines can share one checkout); the resulting shared object is loaded
    and returned as a ctypes library handle.
    """
    node_name = platform.node()
    xeisner_src = os.path.join(os.path.dirname(__file__), "../libs", "xEisner")
    # one build tree per host name to avoid clobbering other machines' builds
    host_build_dir = os.path.join(xeisner_src, "build-{}".format(node_name))
    ensure_dir(host_build_dir)
    shared_obj = os.path.join(host_build_dir, "libxEisner.so")
    print("Building xEisner...")
    builder = subprocess.Popen(
        "MAX_SENTENCE_SIZE=128 cmake ../ -DCMAKE_BUILD_TYPE=Release && make -j4",
        shell=True,
        cwd=host_build_dir)
    builder.communicate()
    assert builder.returncode == 0
    assert os.path.exists(shared_obj)
    return ctypes.cdll.LoadLibrary(shared_obj)
def __init__(
        self,
        model,
        hrg_statistics,  # type: HRGStatistics
        options):
    """Build the scorer's feature vocabulary and scoring MLP.

    :param model: parameter collection passed to the superclass.
    :param hrg_statistics: corpus statistics exposing ``nonterminals``
        (a Counter), ``structural_edges`` and ``categories``.
    :param options: namespace with activation, lstm_dims, span_lstm_layers,
        hrg_mlp_dims and conflict_output_dir fields.
    """
    super(StructuredPeceptronHRGScorer, self).__init__(model)
    self.options = options
    self.activation = nn.activations[options.activation]
    # Feature vocabulary: the 300 most frequent nonterminals plus every
    # structural edge label and category.
    frequent_nonterminals = [word for word, count
                             in hrg_statistics.nonterminals.most_common(300)]
    self.edge_labels = (frequent_nonterminals
                        + list(hrg_statistics.structural_edges)
                        + list(hrg_statistics.categories))
    self.possible_features = [("Edge", label) for label in self.edge_labels]
    logger.info("Consider {} features as graph embedding".format(
        len(self.possible_features)))
    for direction_feature in ("head_left", "head_right"):
        self.possible_features.append(direction_feature)
    # Map each feature to its position in the embedding vector.
    self.feature_index = {}
    for position, feature in enumerate(self.possible_features):
        self.feature_index[feature] = position
    input_dim = (options.lstm_dims * 2 * options.span_lstm_layers
                 + len(self.possible_features) + 1)
    dense_dims = [input_dim] + options.hrg_mlp_dims + [1]
    # don't use bias in last transform
    use_bias = [True] * (len(dense_dims) - 2) + [False]
    self.dense_layer = nn.DenseLayers(self, dense_dims, self.activation,
                                      use_bias)
    self.count_scale = self.add_parameters((1, ))
    self.count_scale_2 = self.add_parameters((1, ))
    if self.options.conflict_output_dir:
        ensure_dir(self.options.conflict_output_dir)
def k_fold_validation(train_file,
                      dev_file,
                      op,
                      FormatClass,
                      project_name,
                      outdir_prefix,
                      scheduler,
                      k=5,
                      prevent_redundant_preparation=True,
                      header=None):
    """Split *train_file* into ``k`` folds and schedule ``k`` training tasks,
    each trained on every fold but one.

    Writes ``<prefix>.<i>.<ext>`` (fold i alone) and
    ``<prefix>.except-<i>.<ext>`` (all folds but i) into the project
    directory, then registers one task per held-out fold with *scheduler*.

    :param train_file: path to the full training file.
    :param dev_file: path to the dev file passed through to every task.
    :param op: dict of base options; copied per task.
    :param FormatClass: data format class providing ``from_file``; sentences
        must provide ``to_string``.
    :param project_name: subdirectory name under *outdir_prefix*.
    :param outdir_prefix: root output directory.
    :param scheduler: object providing ``add_options(name, options, dir)``.
    :param k: number of folds.
    :param prevent_redundant_preparation: skip data preparation when the
        done-marker file already exists.
    :param header: optional header line written at the top of every split.
    """
    train_file_basename = os.path.basename(train_file)
    train_file_prefix, _, ext = train_file_basename.rpartition(".")
    train_sents = FormatClass.from_file(train_file)
    project_dir = os.path.join(outdir_prefix, project_name)
    ensure_dir(project_dir)
    train_file_i = os.path.join(project_dir, train_file_prefix + ".{}." + ext)
    train_file_except_i = os.path.join(
        project_dir, train_file_prefix + ".except-{}." + ext)
    data_preparation_done_file = os.path.join(
        project_dir, "." + train_file_basename + ".done")

    # do data preparation (unless a previous run already finished it)
    if not prevent_redundant_preparation or not os.path.exists(
            data_preparation_done_file):
        folds = _split_into_folds(train_sents, k)
        # FIX: the original held 2k bare file handles in lists and closed
        # them manually, leaking all of them on any mid-write exception.
        # Writing one output file at a time inside `with` produces
        # byte-identical files with at most one handle open.
        for i in range(k):
            _write_fold_file(train_file_i.format(i), [folds[i]], header)
            _write_fold_file(
                train_file_except_i.format(i),
                [fold for j, fold in enumerate(folds) if j != i], header)
        with open(data_preparation_done_file, "w") as f:
            f.write("Done!")
        logger.info("{}-fold data preparation done!".format(k))
    else:
        logger.info("No need to prepare {}-fold data.".format(k))

    # create training tasks, one per held-out fold
    for i in range(k):
        op_i = dict(op)
        op_i["train"] = train_file_except_i.format(i)
        op_i["dev"] = dev_file
        scheduler.add_options("except-{}".format(i), op_i, project_dir)


def _split_into_folds(sents, k):
    """Partition *sents* into k contiguous, nearly equal-sized folds."""
    folds = []
    for i in range(k):
        start = int(i * len(sents) / k)
        end = int((i + 1) * len(sents) / k)
        folds.append(sents[start:end])
    return folds


def _write_fold_file(path, folds, header):
    """Write the sentences of *folds* (in order) to *path*, preceded by
    *header* when given."""
    with open(path, "w") as f:
        if header is not None:
            f.write(header + "\n")
        for fold in folds:
            for sent in fold:
                f.write(sent.to_string())