import itertools
import logging
import os
import re
import subprocess
import sys

import tensorflow as tf

import loader
from loader import augment_with_pretrained, calculate_global_maxes
from loader import update_tag_scheme, prepare_dataset
from loader import word_mapping, char_mapping, tag_mapping
# from model import Model
from model_tensorflow import Model
from utils import models_path, evaluate, eval_script, eval_temp
from utils import read_args, form_parameters_dict

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("main")

# Read parameters from command line
opts = read_args()

# Parse parameters
parameters = form_parameters_dict(opts)

# Check parameters validity
assert os.path.isfile(opts.train)
assert os.path.isfile(opts.dev)
assert os.path.isfile(opts.test)
assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
assert 0. <= parameters['dropout'] < 1.0
assert parameters['t_s'] in ['iob', 'iobes']
assert not parameters['all_emb'] or parameters['pre_emb']
assert not parameters['pre_emb'] or parameters['word_dim'] > 0
assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

# Check evaluation script / folders
if not os.path.isfile(eval_script):
    raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
def train_a_single_configuration(datasets_root,
                                 crf, lr_method, batch_size, sparse_updates_enabled,
                                 dropout,
                                 char_dim, char_lstm_dim,
                                 morpho_tag_dim, morpho_tag_lstm_dim,
                                 morpho_tag_type, morpho_tag_column_index,
                                 word_dim, word_lstm_dim, cap_dim,
                                 skip_testing, max_epochs,
                                 train_filepath, dev_filepath, test_filepath,
                                 yuret_train_filepath, yuret_test_filepath,
                                 train_with_yuret, test_with_yuret,
                                 use_golden_morpho_analysis_in_word_representation,
                                 embeddings_filepath,
                                 integration_mode, active_models,
                                 multilayer, shortcut_connections,
                                 reload, dynet_gpu,
                                 _run):
    """
    Example invocation:

    python train.py --pre_emb ../../data/we-300.txt
        --train dataset/gungor.ner.train.only_consistent
        --dev dataset/gungor.ner.dev.only_consistent
        --test dataset/gungor.ner.test.only_consistent
        --word_dim 300 --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100
        --crf 1 [email protected] --maximum-epochs 50
        --char_dim 200 --char_lstm_dim 200 --char_bidirect 1
        --overwrite-mappings 1 --batch-size 1
        --morpho_tag_dim 100 --integration_mode 2
    """
    execution_part = "python main.py --command train --overwrite-mappings 1 "
    if sparse_updates_enabled == 0:
        execution_part += "--disable_sparse_updates "
    if dynet_gpu == 1:
        execution_part += "--dynet-gpu 1 "
    if train_with_yuret == 1:
        execution_part += "--train_with_yuret "
    if use_golden_morpho_analysis_in_word_representation == 1:
        execution_part += "--use_golden_morpho_analysis_in_word_representation "

    if word_dim == 0 or not embeddings_filepath:
        embeddings_part = ""
    else:
        embeddings_part = "--pre_emb %s/%s " % (datasets_root, embeddings_filepath)

    print(train_filepath, dev_filepath, test_filepath, skip_testing, max_epochs)

    always_constant_part = "-T %s/%s " \
                           "-d %s/%s " \
                           "-t %s/%s " \
                           "%s" \
                           "%s" \
                           "--yuret_train %s/%s " \
                           "--yuret_test %s/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--maximum-epochs %d " % (datasets_root, train_filepath,
                                                     datasets_root, dev_filepath,
                                                     datasets_root, test_filepath,
                                                     "--train_with_yuret " if train_with_yuret else "",
                                                     "--test_with_yuret " if test_with_yuret else "",
                                                     datasets_root, yuret_train_filepath,
                                                     datasets_root, yuret_test_filepath,
                                                     embeddings_part,
                                                     skip_testing,
                                                     max_epochs)

    commandline_args = always_constant_part + \
        "--crf %d " \
        "--lr_method %s " \
        "--batch-size %d " \
        "--dropout %1.1lf " \
        "--char_dim %d " \
        "--char_lstm_dim %d " \
        "--morpho_tag_dim %d " \
        "--morpho_tag_lstm_dim %d " \
        "--morpho_tag_type %s " \
        "--morpho-tag-column-index %d " \
        "--word_dim %d " \
        "--word_lstm_dim %d " \
        "--cap_dim %d " \
        "--integration_mode %d " \
        "--active_models %d " \
        "--multilayer %d " \
        "--shortcut_connections %d " \
        "--reload %d" % (crf, lr_method, batch_size, dropout,
                         char_dim, char_lstm_dim,
                         morpho_tag_dim, morpho_tag_lstm_dim,
                         morpho_tag_type, morpho_tag_column_index,
                         word_dim, word_lstm_dim, cap_dim,
                         integration_mode, active_models,
                         multilayer, shortcut_connections,
                         reload)

    # tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print _run
    print _run.info
    print subprocess.check_output(["id"])
    print subprocess.check_output(["pwd"])

    opts = read_args(args_as_a_list=commandline_args.split(" "))
    print opts
    parameters = form_parameters_dict(opts)
    print parameters

    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print model_path

    task_names = ["NER", "MORPH", "YURET"]
    for task_name in task_names:
        _run.info["%s_dev_f_score" % task_name] = dict()
        _run.info["%s_test_f_score" % task_name] = dict()

    _run.info['starting'] = 1

    dummy_prefix = ""
    full_commandline = dummy_prefix + execution_part + commandline_args

    print full_commandline
    process = subprocess.Popen(full_commandline.split(" "),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)

    def record_metric(epoch, label, value):
        # A label can receive several values within one epoch (partial
        # updates before the epoch finishes), so values accumulate in a
        # list keyed by the epoch number.
        epoch_str = str(epoch)
        if epoch_str in _run.info[label]:
            _run.info[label][epoch_str].append(value)
        else:
            _run.info[label][epoch_str] = [value]

    def capture_information(line):
        """
        Expected trainer output format:

        NER Epoch: %d Best dev and accompanying test score, best_dev, best_test: %lf %lf
        """
        for task_name in task_names:
            m = re.match(r"^%s Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$" % task_name,
                         line)
            if m:
                epoch = int(m.group(1))
                best_dev = float(m.group(2))
                best_test = float(m.group(3))
                record_metric(epoch, "%s_dev_f_score" % task_name, best_dev)
                record_metric(epoch, "%s_test_f_score" % task_name, best_test)

    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
        capture_information(line)
        sys.stdout.flush()

    return model_path
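# Hedged sanity check for the log-line pattern consumed by
# capture_information() above. The sample line is fabricated for
# illustration and only assumes the format quoted in the docstring; it is
# not captured trainer output.
def _demo_best_score_regex():
    sample = ("NER Epoch: 3 Best dev and accompanying test score, "
              "best_dev, best_test: 91.23 89.47")
    m = re.match(r"^NER Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$", sample)
    assert m is not None
    return int(m.group(1)), float(m.group(2)), float(m.group(3))  # (3, 91.23, 89.47)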
def run_a_single_configuration_without_fabric(crf, lr_method, dropout,
                                              char_dim, char_lstm_dim,
                                              morpho_tag_dim, morpho_tag_lstm_dim,
                                              morpho_tag_type, morpho_tag_column_index,
                                              word_dim, word_lstm_dim, cap_dim,
                                              separate_bilstms,
                                              skip_testing, max_epochs,
                                              train_filepath, dev_filepath, test_filepath,
                                              embeddings_filepath,
                                              reload,
                                              _run):
    """
    Example invocation:

    python train.py --pre_emb ../../data/we-300.txt
        --train dataset/tr.train --dev dataset/tr.test --test dataset/tr.test
        --word_dim 300 --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100
        --crf 1 --lr_method=sgd-lr_0.01 --maximum-epochs 100
        --char_dim 200 --char_lstm_dim 200 --char_bidirect 1
        --morpho_tag_dim 100 --morpho_tag_lstm_dim 100 --morpho_tag_type char
        --overwrite-mappings 1 --batch-size 5
    """
    from sacred.observers import MongoObserver  # kept from the original script; unused here

    execution_part = "python train.py "

    if word_dim == 0:
        embeddings_part = ""
    else:
        embeddings_part = "--pre_emb ../../datasets/%s " % embeddings_filepath

    print(train_filepath, dev_filepath, test_filepath, skip_testing, max_epochs)

    always_constant_part = "-T ../../datasets/%s " \
                           "-d ../../datasets/%s " \
                           "-t ../../datasets/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--maximum-epochs %d " % (train_filepath,
                                                     dev_filepath,
                                                     test_filepath,
                                                     embeddings_part,
                                                     skip_testing,
                                                     max_epochs)

    commandline_args = always_constant_part + \
        "--crf %d " \
        "--lr_method %s " \
        "--dropout %1.1lf " \
        "--char_dim %d " \
        "--char_lstm_dim %d " \
        "--morpho_tag_dim %d " \
        "--morpho_tag_lstm_dim %d " \
        "--morpho_tag_type %s " \
        "--morpho-tag-column-index %d " \
        "--word_dim %d " \
        "--word_lstm_dim %d " \
        "--cap_dim %d " \
        "--separate-bilstms %d " \
        "--reload %d" % (crf, lr_method, dropout,
                         char_dim, char_lstm_dim,
                         morpho_tag_dim, morpho_tag_lstm_dim,
                         morpho_tag_type, morpho_tag_column_index,
                         word_dim, word_lstm_dim, cap_dim,
                         separate_bilstms,
                         reload)

    tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print _run
    print _run.info
    print subprocess.check_output(["id"])
    print subprocess.check_output(["pwd"])

    opts = read_args(commandline_args.split(" "))
    # print opts
    parameters = form_parameters_dict(opts)
    # print parameters

    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print model_path

    _run.info['costs'] = dict()
    _run.info['best_performances'] = dict()
    _run.info['starting'] = 1

    dummy_prefix = ""

    print dummy_prefix + execution_part + commandline_args
    process = subprocess.Popen(
        (dummy_prefix + execution_part + commandline_args).split(" "),
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    def record_metric(_run, epoch, samples, label, value):
        if str(epoch) in _run.info[label]:
            _run.info[label][str(epoch)].append(value)
        else:
            _run.info[label][str(epoch)] = [value]

    for line in iter(process.stdout.readline, ''):
        sys.stdout.write(line)
        m = re.match(r"^Epoch (\d+): (\d+) Samples read. Avg. cost: ([^,]+), "
                     r"Scores on dev: ([^,]+), (.+)$",
                     line)
        if m:
            epoch = int(m.group(1))
            samples = int(m.group(2))
            epoch_avg_cost = float(m.group(3))
            if skip_testing == 1 or dev_filepath == test_filepath:
                epoch_performance = float(m.group(4))
            else:
                epoch_performance = float(m.group(5))
            record_metric(_run, epoch, samples, "costs", epoch_avg_cost)
            record_metric(_run, epoch, samples, "best_performances", epoch_performance)
        sys.stdout.flush()

    # for epoch in range(max_epochs):
    #     epoch_cost = subprocess.check_output(
    #         ("tail -1 %s" % os.path.join("models", model_path,
    #                                      "epoch-%08d" % epoch,
    #                                      "epoch_cost.txt")).split(" "))
    #     best_performances = subprocess.check_output(
    #         ("cat %s" % os.path.join("models", model_path,
    #                                  "epoch-%08d" % epoch,
    #                                  "best_performances.txt")).split(" "))
    #     print "EPOCHCOST: " + epoch_cost
    #     _run.info['costs'][str(epoch)] = float(epoch_cost.strip())
    #     print "BESTPERF: " + best_performances
    #     if skip_testing == 1 or dev_filepath == test_filepath:
    #         _run.info['best_performances'][str(epoch)] = float(best_performances.split(" ")[0])
    #     else:
    #         _run.info['best_performances'][str(epoch)] = float(best_performances.split(" ")[1])

    return model_path
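# Hedged sanity check for the per-epoch progress pattern parsed above; the
# sample line is fabricated for illustration, assuming the format encoded in
# the regex rather than real train.py output.
def _demo_epoch_progress_regex():
    sample = ("Epoch 2: 1500 Samples read. Avg. cost: 0.8431, "
              "Scores on dev: 88.10, 86.55")
    m = re.match(r"^Epoch (\d+): (\d+) Samples read. Avg. cost: ([^,]+), "
                 r"Scores on dev: ([^,]+), (.+)$", sample)
    assert m is not None
    # group(3) is the average cost; group(4) is the dev score and group(5)
    # the remainder (test score), used when dev and test files differ.
    return float(m.group(3)), float(m.group(4)), float(m.group(5))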
def main(argv=None):  # pylint: disable=unused-argument
    # if tf.gfile.Exists(FLAGS.eval_dir):
    #     tf.gfile.DeleteRecursively(FLAGS.eval_dir)
    # tf.gfile.MakeDirs(FLAGS.eval_dir)

    # Read parameters from command line
    opts = read_args(evaluation=True)

    # Parse parameters
    parameters = form_parameters_dict(opts)

    # Check parameters validity
    assert os.path.isfile(opts.train)
    assert os.path.isfile(opts.dev)
    assert os.path.isfile(opts.test)
    assert parameters['char_dim'] > 0 or parameters['word_dim'] > 0
    assert 0. <= parameters['dropout'] < 1.0
    assert parameters['t_s'] in ['iob', 'iobes']
    assert not parameters['all_emb'] or parameters['pre_emb']
    assert not parameters['pre_emb'] or parameters['word_dim'] > 0
    assert not parameters['pre_emb'] or os.path.isfile(parameters['pre_emb'])

    # Check evaluation script / folders
    if not os.path.isfile(eval_script):
        raise Exception('CoNLL evaluation script not found at "%s"' % eval_script)
    if not os.path.exists(eval_temp):
        os.makedirs(eval_temp)
    if not os.path.exists(models_path):
        os.makedirs(models_path)

    event_logs_path = os.path.join(eval_temp, "eval_logs")
    # if not os.path.exists(event_logs_path):
    #     os.makedirs(event_logs_path)

    # Initialize model
    model = MainTaggerModel(parameters=parameters, models_path=models_path,
                            overwrite_mappings=opts.overwrite_mappings)
    print "MainTaggerModel location: %s" % model.model_path

    # Data parameters
    lower = parameters['lower']
    zeros = parameters['zeros']
    tag_scheme = parameters['t_s']

    max_sentence_lengths = {}
    max_word_lengths = {}

    # Load sentences
    train_sentences, max_sentence_lengths['train'], max_word_lengths['train'] = \
        loader.load_sentences(opts.train, lower, zeros)
    dev_sentences, max_sentence_lengths['dev'], max_word_lengths['dev'] = \
        loader.load_sentences(opts.dev, lower, zeros)
    test_sentences, max_sentence_lengths['test'], max_word_lengths['test'] = \
        loader.load_sentences(opts.test, lower, zeros)

    global_max_sentence_length, global_max_char_length = \
        calculate_global_maxes(max_sentence_lengths, max_word_lengths)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, tag_scheme)
    update_tag_scheme(dev_sentences, tag_scheme)
    update_tag_scheme(test_sentences, tag_scheme)

    # Create a dictionary / mapping of words
    # If we use pretrained embeddings, we add them to the dictionary.
    if parameters['pre_emb']:
        dico_words_train = word_mapping(train_sentences, lower)[0]
        dico_words, word_to_id, id_to_word = augment_with_pretrained(
            dico_words_train.copy(),
            parameters['pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences + test_sentences])
            ) if not parameters['all_emb'] else None
        )
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences, lower)
        dico_words_train = dico_words

    # Create a dictionary and a mapping for words / POS tags / tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences)
    dico_tags, tag_to_id, id_to_tag = tag_mapping(train_sentences)

    if opts.overwrite_mappings:
        print 'Saving the mappings to disk...'
        model.save_mappings(id_to_word, id_to_char, id_to_tag)

    model.reload_mappings()

    # Index data
    train_buckets, train_stats, train_unique_words = prepare_dataset(
        train_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length,
        lower
    )
    dev_buckets, dev_stats, dev_unique_words = prepare_dataset(
        dev_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length,
        lower
    )
    test_buckets, test_stats, test_unique_words = prepare_dataset(
        test_sentences, word_to_id, char_to_id, tag_to_id,
        global_max_sentence_length, global_max_char_length,
        lower
    )

    print "%i / %i / %i sentences in train / dev / test." % (
        len(train_stats), len(dev_stats), len(test_stats))
    print "%i / %i / %i words in train / dev / test." % (
        sum([x[0] for x in train_stats]),
        sum([x[0] for x in dev_stats]),
        sum([x[0] for x in test_stats]))
    print "%i / %i / %i longest sentences in train / dev / test." % (
        max([x[0] for x in train_stats]),
        max([x[0] for x in dev_stats]),
        max([x[0] for x in test_stats]))
    print "%i / %i / %i shortest sentences in train / dev / test." % (
        min([x[0] for x in train_stats]),
        min([x[0] for x in dev_stats]),
        min([x[0] for x in test_stats]))

    for i, label in [[2, 'char']]:
        print "%i / %i / %i total %s in train / dev / test." % (
            sum([sum(x[i]) for x in train_stats]),
            sum([sum(x[i]) for x in dev_stats]),
            sum([sum(x[i]) for x in test_stats]),
            label)
        print "%i / %i / %i max. %s lengths in train / dev / test." % (
            max([max(x[i]) for x in train_stats]),
            max([max(x[i]) for x in dev_stats]),
            max([max(x[i]) for x in test_stats]),
            label)
        print "%i / %i / %i min. %s lengths in train / dev / test." % (
            min([min(x[i]) for x in train_stats]),
            min([min(x[i]) for x in dev_stats]),
            min([min(x[i]) for x in test_stats]),
            label)

    print "Max. sentence lengths: %s" % max_sentence_lengths
    print "Max. char lengths: %s" % max_word_lengths

    for label, bin_stats, n_unique_words in [['train', train_stats, train_unique_words],
                                             ['dev', dev_stats, dev_unique_words],
                                             ['test', test_stats, test_unique_words]]:
        # Per-split size estimate: input ids are int32, embeddings float32.
        int32_items = len(bin_stats) * (
            max_sentence_lengths[label] * (5 + max_word_lengths[label]) + 1)
        float32_items = n_unique_words * parameters['word_dim']
        total_size = int32_items + float32_items
        logging.info("Input ids size of the %s dataset is %d" % (label, int32_items))
        logging.info("Word embeddings (unique: %d) size of the %s dataset is %d" % (
            n_unique_words, label, float32_items))
        logging.info("Total size of the %s dataset is %d" % (label, total_size))

    batch_size = 5

    # Build the model
    cost, train_step, tag_scores, tag_ids, word_ids, \
        crf_transition_params, sentence_lengths, enqueue_op, placeholders = model.build(
            max_sentence_length_scalar=global_max_sentence_length,
            max_word_length_scalar=global_max_char_length,
            batch_size_scalar=batch_size,
            **parameters)

    FLAGS = tf.app.flags.FLAGS
    tf.app.flags.DEFINE_string('eval_dir', event_logs_path,
                               """Directory where to write event logs.""")
    tf.app.flags.DEFINE_string('eval_data', 'test',
                               """Either 'test' or 'train_eval'.""")
    tf.app.flags.DEFINE_string('checkpoint_dir', model.model_path,
                               """Directory where to read model checkpoints.""")
    tf.app.flags.DEFINE_integer('eval_interval_secs', 60 * 5,
                                """How often to run the eval.""")
    tf.app.flags.DEFINE_integer('num_examples', 10000,
                                """Number of examples to run.""")
    tf.app.flags.DEFINE_boolean('run_once', False,
                                """Whether to run eval only once.""")

    evaluate(model, dev_buckets, test_buckets, FLAGS, opts, id_to_tag, batch_size,
             placeholders, enqueue_op, tag_scores, tag_ids, word_ids,
             crf_transition_params, sentence_lengths, FLAGS.eval_dir, tag_scheme)
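# Hedged worked example of the per-split size estimate logged in main()
# above (toy numbers chosen for illustration only).
def _demo_size_estimate():
    n_sentences, max_sent_len, max_word_len = 100, 50, 20
    n_unique_words, word_dim = 5000, 300
    int32_items = n_sentences * (max_sent_len * (5 + max_word_len) + 1)  # 125100
    float32_items = n_unique_words * word_dim  # 1500000
    return int32_items + float32_items  # 1625100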
def train_a_single_configuration(lang_name, datasets_root,
                                 crf, lr_method, batch_size, sparse_updates_enabled,
                                 dropout,
                                 char_dim, char_lstm_dim,
                                 morpho_tag_dim, morpho_tag_lstm_dim,
                                 morpho_tag_type, morpho_tag_column_index,
                                 word_dim, word_lstm_dim, cap_dim,
                                 skip_testing, starting_epoch_no, maximum_epochs,
                                 file_format, debug,
                                 ner_train_file, ner_dev_file, ner_test_file,
                                 md_train_file, md_dev_file, md_test_file,
                                 use_golden_morpho_analysis_in_word_representation,
                                 embeddings_filepath,
                                 integration_mode, active_models,
                                 multilayer, shortcut_connections,
                                 reload, model_path, model_epoch_path,
                                 dynet_gpu,
                                 _run):
    """
    Example invocation:

    python train.py --pre_emb ../../data/we-300.txt
        --train dataset/gungor.ner.train.only_consistent
        --dev dataset/gungor.ner.dev.only_consistent
        --test dataset/gungor.ner.test.only_consistent
        --word_dim 300 --word_lstm_dim 200 --word_bidirect 1 --cap_dim 100
        --crf 1 [email protected] --maximum-epochs 50
        --char_dim 200 --char_lstm_dim 200 --char_bidirect 1
        --overwrite-mappings 1 --batch-size 1
        --morpho_tag_dim 100 --integration_mode 2
    """
    execution_part = "python main.py --command train --overwrite-mappings 1 "
    if sparse_updates_enabled == 0:
        execution_part += "--disable_sparse_updates "
    if dynet_gpu == 1:
        execution_part += "--dynet-gpu 1 "
    if use_golden_morpho_analysis_in_word_representation == 1:
        execution_part += "--use_golden_morpho_analysis_in_word_representation "
    execution_part += "--debug " + str(debug) + " "

    if word_dim == 0 or not embeddings_filepath:
        embeddings_part = ""
    else:
        embeddings_part = "--pre_emb %s/%s " % (datasets_root, embeddings_filepath)

    always_constant_part = "--lang_name %s --file_format %s " \
                           "--ner_train_file %s/%s/%s " \
                           "%s" \
                           "--ner_test_file %s/%s/%s " \
                           "--md_train_file %s/%s/%s " \
                           "%s" \
                           "--md_test_file %s/%s/%s " \
                           "%s" \
                           "--skip-testing %d " \
                           "--tag_scheme iobes " \
                           "--starting-epoch-no %d " \
                           "--maximum-epochs %d " % (
                               lang_name, file_format,
                               datasets_root, lang_name, ner_train_file,
                               ("--ner_dev_file %s/%s/%s " % (datasets_root, lang_name, ner_dev_file))
                               if ner_dev_file else "",
                               datasets_root, lang_name, ner_test_file,
                               datasets_root, lang_name, md_train_file,
                               ("--md_dev_file %s/%s/%s " % (datasets_root, lang_name, md_dev_file))
                               if md_dev_file else "",
                               datasets_root, lang_name, md_test_file,
                               embeddings_part,
                               skip_testing,
                               starting_epoch_no,
                               maximum_epochs)

    if reload == 1:
        reload_part = "--reload %d --model_path %s --model_epoch_path %s " % (
            reload, model_path, model_epoch_path)
    else:
        reload_part = "--reload 0 "

    commandline_args = always_constant_part + \
        "--crf %d " \
        "--lr_method %s " \
        "--batch-size %d " \
        "--dropout %1.1lf " \
        "--char_dim %d " \
        "--char_lstm_dim %d " \
        "--morpho_tag_dim %d " \
        "--morpho_tag_lstm_dim %d " \
        "--morpho_tag_type %s " \
        "--morpho-tag-column-index %d " \
        "--word_dim %d " \
        "--word_lstm_dim %d " \
        "--cap_dim %d " \
        "--integration_mode %d " \
        "--active_models %d " \
        "--multilayer %d " \
        "--shortcut_connections %d " \
        "%s" % (crf, lr_method, batch_size, dropout,
                char_dim, char_lstm_dim,
                morpho_tag_dim, morpho_tag_lstm_dim,
                morpho_tag_type, morpho_tag_column_index,
                word_dim, word_lstm_dim, cap_dim,
                integration_mode, active_models,
                multilayer, shortcut_connections,
                reload_part)

    # tagger_root = "/media/storage/genie/turkish-ner/code/tagger"

    print(_run)
    print(_run.info)
    print(subprocess.check_output(["id"]))
    print(subprocess.check_output(["pwd"]))

    opts = read_args(args_as_a_list=commandline_args.split(" "))
    print(opts)
    parameters = form_parameters_dict(opts)
    print(parameters)

    # model_path = get_name(parameters)
    model_path = get_model_subpath(parameters)
    print(model_path)

    task_names = ["NER", "MORPH"]
    for task_name in task_names:
        _run.info["%s_dev_f_score" % task_name] = dict()
        _run.info["%s_test_f_score" % task_name] = dict()
    _run.info["avg_loss"] = dict()

    _run.info['starting'] = 1

    dummy_prefix = ""
    full_commandline = dummy_prefix + execution_part + commandline_args
    print(full_commandline)

    process = subprocess.Popen(full_commandline.split(" "),
                               stdout=subprocess.PIPE,
                               stderr=subprocess.STDOUT)

    def record_metric(epoch, label, value):
        """
        Each label can have multiple values in an epoch, i.e. metrics
        reported before the epoch has finished, so values are accumulated
        in a list keyed by the epoch number.

        :param epoch: epoch number reported by the trainer
        :param label: metric name in _run.info
        :param value: metric value to record
        """
        epoch_str = str(epoch)
        if epoch_str in _run.info[label]:
            _run.info[label][epoch_str].append(value)
        else:
            _run.info[label][epoch_str] = [value]

    def capture_information(line):
        """
        Expected trainer output formats:

        NER Epoch: %d Best dev and accompanying test score, best_dev, best_test: %lf %lf
        Epoch %d Avg. loss over training set: %lf
        MainTaggerModel location: ./models/model-00000227
        LOG: model_epoch_dir_path: {}
        """
        for task_name in task_names:
            m = re.match(r"^.*%s Epoch: (\d+) .* best_dev, best_test: (.+) (.+)$" % task_name,
                         line)
            if m:
                epoch = int(m.group(1))
                best_dev = float(m.group(2))
                best_test = float(m.group(3))
                record_metric(epoch, "%s_dev_f_score" % task_name, best_dev)
                record_metric(epoch, "%s_test_f_score" % task_name, best_test)

        m = re.match(r"^.*Epoch (\d+) Avg. loss over training set: (.+)$", line)
        if m:
            epoch = int(m.group(1))
            avg_loss_over_training_set = float(m.group(2))
            record_metric(epoch, "avg_loss", avg_loss_over_training_set)

        m = re.match(r"^.*MainTaggerModel location: (.+)$", line)
        if m:
            _run.info["model_dir_path"] = m.group(1)

        m = re.match(r"^.*LOG: model_epoch_dir_path: (.+)$", line)
        if m:
            _run.info["model_epoch_dir_path"] = m.group(1)

    for raw_line in process.stdout:
        line = raw_line.decode("utf8")
        sys.stdout.write(line)
        capture_information(line)
        sys.stdout.flush()

    return model_path
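# Hedged sketch of record_metric's accumulation semantics, using a plain
# dict in place of sacred's _run.info store (illustration only).
def _demo_record_metric():
    info = {"avg_loss": {}}
    for epoch, value in [(0, 1.9), (0, 1.4), (1, 1.1)]:
        info["avg_loss"].setdefault(str(epoch), []).append(value)
    return info  # {"avg_loss": {"0": [1.9, 1.4], "1": [1.1]}}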