def return_simple_data(self, debug, data, model, name, train):
    data_paths = self._prepare_paths(data, debug)
    if name == "train":
        train = morpho_dataset.MorphoDataset(
            data_paths[0], embeddings=None, bert=model,
            lemma_re_strip=r"(?<=.)(?:`|_|-[^0-9]).*$",
            lemma_rule_min=2, simple=True)
    if name == "dev":
        if os.path.exists(data_paths[1]):
            dev = morpho_dataset.MorphoDataset(data_paths[1], train=train, shuffle_batches=False,
                                               bert=model, simple=True)
        else:
            dev = None
        return dev
    if name == "test":
        if os.path.exists(data_paths[2]):
            test = morpho_dataset.MorphoDataset(data_paths[2], train=train, shuffle_batches=False,
                                                bert=model, simple=True)
        else:
            test = None
        return test
    return train
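A minimal usage sketch of the dispatcher above (the `loader` object and the argument values are illustrative; only the call pattern comes from the function itself — load the "train" split first, then pass it as `train` when requesting "dev" or "test"):

# Illustrative only: `loader` stands for an instance of the surrounding class.
train_data = loader.return_simple_data(debug=False, data="czech-pdt", model=None,
                                       name="train", train=None)
dev_data = loader.return_simple_data(debug=False, data="czech-pdt", model=None,
                                     name="dev", train=train_data)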
                    default=64, type=int, help="Word embedding dimension.")
args = parser.parse_args()

# Create logdir name
args.logdir = "logs/{}-{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
              for key, value in sorted(vars(args).items()))))
if not os.path.exists("logs"):
    os.mkdir("logs")  # TF 1.6 will do this by itself

# Load the data
train = morpho_dataset.MorphoDataset("czech-cac-train.txt", max_sentences=5000)
dev = morpho_dataset.MorphoDataset("czech-cac-dev.txt", train=train, shuffle_batches=False)

# Construct the network
network = Network(threads=args.threads)
network.construct(args,
                  len(train.factors[train.FORMS].words),
                  len(train.factors[train.FORMS].alphabet),
                  len(train.factors[train.TAGS].words))

# Train
for i in range(args.epochs):
    network.train_epoch(train, args.batch_size)
    accuracy = network.evaluate("dev", dev, args.batch_size)
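In the log-directory name above, each option is abbreviated with re.sub("(.)[^_]*_?", r"\1", key), which keeps the first character of every underscore-separated part of the option name. A small illustrative check (the option names are just examples):

import re

for key in ["we_dim", "batch_size", "threads"]:
    print(key, "->", re.sub("(.)[^_]*_?", r"\1", key))
# we_dim -> wd, batch_size -> bs, threads -> t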
                    default=1, type=int, help="Maximum number of threads to use.")
args = parser.parse_args()

# Create logdir name
args.logdir = "logs/{}-{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
              for key, value in sorted(vars(args).items()))))
if not os.path.exists("logs"):
    os.mkdir("logs")  # TF 1.6 will do this by itself

# Load the data
home = expanduser('~')
train = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-train.txt", lowercase=True)
dev = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-dev.txt", train=train,
                                   shuffle_batches=False, lowercase=True)
test = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-test.txt", train=train,
                                    shuffle_batches=False, lowercase=True)
#train = morpho_dataset.MorphoDataset(home + "/data/cs/czech-pdt-train.txt", lowercase=False)
#train = morpho_dataset.MorphoDataset("czech-pdt-train.txt")
#dev = morpho_dataset.MorphoDataset("czech-pdt-dev.txt", train=train, shuffle_batches=False)
#test = morpho_dataset.MorphoDataset("czech-pdt-test.txt", train=train, shuffle_batches=False)

batches = len(train.sentence_lens) // args.batch_size
        param['logdir'] = logdir
        param['epochs'] = args.epochs
        param['threads'] = args.threads
        param = namedtuple('Params', param.keys())(*param.values())
        break
    num_retry += 1
    if num_retry > n_params:
        exit(111)

os.makedirs(param.logdir)
print("=====================================================")
print(param.logdir)
print("=====================================================")

# Load the data
train = morpho_dataset.MorphoDataset("czech-pdt-train.txt")
dev = morpho_dataset.MorphoDataset("czech-pdt-dev.txt", train=train, shuffle_batches=False)
test = morpho_dataset.MorphoDataset("czech-pdt-test.txt", train=train, shuffle_batches=False)

analyzer_dictionary = MorphoAnalyzer("czech-pdt-analysis-dictionary.txt")
analyzer_guesser = MorphoAnalyzer("czech-pdt-analysis-guesser.txt")

# Construct the network
network = get_model(param.name)(param,
                                len(train.factors[train.FORMS].words),
                                len(train.factors[train.FORMS].alphabet),
                                len(train.factors[train.TAGS].words))
class Network: def __init__(self, threads, seed=42): # Create an empty graph and a session graph = tf.Graph() graph.seed = seed self.session = tf.Session(graph = graph, config=tf.ConfigProto(inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)) def construct(self, args, source_chars, target_chars, bow, eow): with self.session.graph.as_default(): if args.recodex: tf.get_variable_scope().set_initializer(tf.glorot_uniform_initializer(seed=42)) # Inputs self.sentence_lens = tf.placeholder(tf.int32, [None], name="sentence_lens") self.source_ids = tf.placeholder(tf.int32, [None, None], name="source_ids") self.source_seqs = tf.placeholder(tf.int32, [None, None], name="source_seqs") self.source_seq_lens = tf.placeholder(tf.int32, [None], name="source_seq_lens") self.target_ids = tf.placeholder(tf.int32, [None, None], name="target_ids") self.target_seqs = tf.placeholder(tf.int32, [None, None], name="target_seqs") self.target_seq_lens = tf.placeholder(tf.int32, [None], name="target_seq_lens") # Append EOW after target_seqs target_seqs = tf.reverse_sequence(self.target_seqs, self.target_seq_lens, 1) target_seqs = tf.pad(target_seqs, [[0, 0], [1, 0]], constant_values=eow) target_seq_lens = self.target_seq_lens + 1 target_seqs = tf.reverse_sequence(target_seqs, target_seq_lens, 1) # Encoder # TODO: Generate source embeddings for source chars, of shape [source_chars, args.char_dim]. # TODO: Embed the self.source_seqs using the source embeddings. # TODO: Using a GRU with dimension args.rnn_dim, process the embedded self.source_seqs # using bidirectional RNN. Store the summed fwd and bwd outputs in `source_encoded` # and the summed fwd and bwd states into `source_states`. # Index the unique words using self.source_ids and self.target_ids. sentence_mask = tf.sequence_mask(self.sentence_lens) source_encoded = tf.boolean_mask(tf.nn.embedding_lookup(source_encoded, self.source_ids), sentence_mask) source_states = tf.boolean_mask(tf.nn.embedding_lookup(source_states, self.source_ids), sentence_mask) source_lens = tf.boolean_mask(tf.nn.embedding_lookup(self.source_seq_lens, self.source_ids), sentence_mask) target_seqs = tf.boolean_mask(tf.nn.embedding_lookup(target_seqs, self.target_ids), sentence_mask) target_lens = tf.boolean_mask(tf.nn.embedding_lookup(target_seq_lens, self.target_ids), sentence_mask) # Decoder # TODO: Generate target embeddings for target chars, of shape [target_chars, args.char_dim]. # TODO: Embed the target_seqs using the target embeddings. # TODO: Generate a decoder GRU with wimension args.rnn_dim. # TODO: Create a `decoder_layer` -- a fully connected layer with # target_chars neurons used in the decoder to classify into target characters. # Attention # TODO: Generate three fully connected layers without activations: # - `source_layer` with args.rnn_dim units # - `state_layer` with args.rnn_dim units # - `weight_layer` with 1 unit def with_attention(inputs, states): # Generate the attention # TODO: Project source_encoded using source_layer. # TODO: Change shape of states from [a, b] to [a, 1, b] and project it using state_layer. # TODO: Sum the two above projections, apply tf.tanh and project the result using weight_layer. # The result has shape [x, y, 1]. # TODO: Apply tf.nn.softmax to the latest result, using axis corresponding to source characters. # TODO: Multiply the source_encoded by the latest result, and sum the results with respect # to the axis corresponding to source characters. This is the final attention. 
# TODO: Return concatenation of inputs and the computed attention. # The DecoderTraining will be used during training. It will output logits for each # target character. class DecoderTraining(tf.contrib.seq2seq.Decoder): @property def batch_size(self): return # TODO: Return size of the batch, using for example source_states size @property def output_dtype(self): return tf.float32 # Type for logits of target characters @property def output_size(self): return target_chars # Length of logits for every output def initialize(self, name=None): finished = # TODO: False if target_lens > 0, True otherwise states = # TODO: Initial decoder state to use inputs = # TODO: Call with_attention on the embedded BOW characters of shape [self.batch_size]. # You can use tf.fill to generate BOWs of appropriate size. return finished, inputs, states def step(self, time, inputs, states, name=None): outputs, states = # TODO: Run the decoder GRU cell using inputs and states. outputs = # TODO: Apply the decoder_layer on outputs. next_input = # TODO: Next input is with_attention called on words with index `time` in target_embedded. finished = # TODO: False if target_lens > time + 1, True otherwise. return outputs, states, next_input, finished output_layer, _, _ = tf.contrib.seq2seq.dynamic_decode(DecoderTraining()) self.predictions_training = tf.argmax(output_layer, axis=2, output_type=tf.int32) # The DecoderPrediction will be used during prediction. It will # directly output the predicted target characters. class DecoderPrediction(tf.contrib.seq2seq.Decoder): @property def batch_size(self): return # TODO: Return size of the batch, using for example source_states size @property def output_dtype(self): return tf.int32 # Type for predicted target characters @property def output_size(self): return 1 # Will return just one output def initialize(self, name=None): finished = # TODO: False of shape [self.batch_size]. states = # TODO: Initial decoder state to use. inputs = # TODO: Call with_attention on the embedded BOW characters of shape [self.batch_size]. # You can use tf.fill to generate BOWs of appropriate size. return finished, inputs, states def step(self, time, inputs, states, name=None): outputs, states = # TODO: Run the decoder GRU cell using inputs and states. outputs = # TODO: Apply the decoder_layer on outputs. outputs = # TODO: Use tf.argmax to choose most probable class (supply parameter `output_type=tf.int32`). next_input = # TODO: Embed `outputs` using target_embeddings and pass it to with_attention. 
finished = # TODO: True where outputs==eow, False otherwise # Use tf.equal for the comparison, Python's '==' is not overloaded return outputs, states, next_input, finished self.predictions, _, self.prediction_lens = tf.contrib.seq2seq.dynamic_decode( DecoderPrediction(), maximum_iterations=tf.reduce_max(source_lens) + 10) # Training weights = tf.sequence_mask(target_lens, dtype=tf.float32) loss = tf.losses.sparse_softmax_cross_entropy(target_seqs, output_layer, weights=weights) global_step = tf.train.create_global_step() self.training = tf.train.AdamOptimizer().minimize(loss, global_step=global_step, name="training") # Summaries accuracy_training = tf.reduce_all(tf.logical_or( tf.equal(self.predictions_training, target_seqs), tf.logical_not(tf.sequence_mask(target_lens))), axis=1) self.current_accuracy_training, self.update_accuracy_training = tf.metrics.mean(accuracy_training) minimum_length = tf.minimum(tf.shape(self.predictions)[1], tf.shape(target_seqs)[1]) accuracy = tf.logical_and( tf.equal(self.prediction_lens, target_lens), tf.reduce_all(tf.logical_or( tf.equal(self.predictions[:, :minimum_length], target_seqs[:, :minimum_length]), tf.logical_not(tf.sequence_mask(target_lens, maxlen=minimum_length))), axis=1)) self.current_accuracy, self.update_accuracy = tf.metrics.mean(accuracy) self.current_loss, self.update_loss = tf.metrics.mean(loss, weights=tf.reduce_sum(weights)) self.reset_metrics = tf.variables_initializer(tf.get_collection(tf.GraphKeys.METRIC_VARIABLES)) summary_writer = tf.contrib.summary.create_file_writer(args.logdir, flush_millis=10 * 1000) self.summaries = {} with summary_writer.as_default(), tf.contrib.summary.record_summaries_every_n_global_steps(10): self.summaries["train"] = [tf.contrib.summary.scalar("train/loss", self.update_loss), tf.contrib.summary.scalar("train/accuracy", self.update_accuracy_training)] with summary_writer.as_default(), tf.contrib.summary.always_record_summaries(): for dataset in ["dev", "test"]: self.summaries[dataset] = [tf.contrib.summary.scalar(dataset + "/loss", self.current_loss), tf.contrib.summary.scalar(dataset + "/accuracy", self.current_accuracy)] # Initialize variables self.session.run(tf.global_variables_initializer()) with summary_writer.as_default(): tf.contrib.summary.initialize(session=self.session, graph=self.session.graph) def train_epoch(self, train, batch_size): import sys while not train.epoch_finished(): sentence_lens, _, charseq_ids, charseqs, charseq_lens = train.next_batch(batch_size, including_charseqs=True) self.session.run(self.reset_metrics) predictions, _, _ = self.session.run( [self.predictions_training, self.training, self.summaries["train"]], {self.sentence_lens: sentence_lens, self.source_ids: charseq_ids[train.FORMS], self.target_ids: charseq_ids[train.LEMMAS], self.source_seqs: charseqs[train.FORMS], self.target_seqs: charseqs[train.LEMMAS], self.source_seq_lens: charseq_lens[train.FORMS], self.target_seq_lens: charseq_lens[train.LEMMAS]}) form, gold_lemma, system_lemma = "", "", "" for i in range(charseq_lens[train.FORMS][0]): form += train.factors[train.FORMS].alphabet[charseqs[train.FORMS][0][i]] for i in range(charseq_lens[train.LEMMAS][0]): gold_lemma += train.factors[train.LEMMAS].alphabet[charseqs[train.LEMMAS][0][i]] system_lemma += train.factors[train.LEMMAS].alphabet[predictions[0][i]] print("Gold form: {}, gold lemma: {}, predicted lemma: {}".format(form, gold_lemma, system_lemma), file=sys.stderr) def evaluate(self, dataset_name, dataset, batch_size): self.session.run(self.reset_metrics) 
        while not dataset.epoch_finished():
            sentence_lens, _, charseq_ids, charseqs, charseq_lens = dataset.next_batch(
                batch_size, including_charseqs=True)
            self.session.run([self.update_accuracy, self.update_loss],
                             {self.sentence_lens: sentence_lens,
                              self.source_ids: charseq_ids[train.FORMS],
                              self.target_ids: charseq_ids[train.LEMMAS],
                              self.source_seqs: charseqs[train.FORMS],
                              self.target_seqs: charseqs[train.LEMMAS],
                              self.source_seq_lens: charseq_lens[train.FORMS],
                              self.target_seq_lens: charseq_lens[train.LEMMAS]})
        return self.session.run([self.current_accuracy, self.summaries[dataset_name]])[0]


if __name__ == "__main__":
    import argparse
    import datetime
    import os
    import re

    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
    parser.add_argument("--char_dim", default=64, type=int, help="Character embedding dimension.")
    parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
    parser.add_argument("--recodex", default=False, action="store_true", help="ReCodEx mode.")
    parser.add_argument("--rnn_dim", default=64, type=int, help="Dimension of the encoder and the decoder.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create logdir name
    args.logdir = "logs/{}-{}-{}".format(
        os.path.basename(__file__),
        datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
        ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
                  for key, value in sorted(vars(args).items()))))
    if not os.path.exists("logs"):
        os.mkdir("logs")  # TF 1.6 will do this by itself

    # Load the data
    train = morpho_dataset.MorphoDataset("czech-cac-train.txt", max_sentences=5000)
    dev = morpho_dataset.MorphoDataset("czech-cac-dev.txt", train=train, shuffle_batches=False)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args,
                      len(train.factors[train.FORMS].alphabet),
                      len(train.factors[train.LEMMAS].alphabet),
                      train.factors[train.LEMMAS].alphabet_map["<bow>"],
                      train.factors[train.LEMMAS].alphabet_map["<eow>"])

    # Train
    for i in range(args.epochs):
        network.train_epoch(train, args.batch_size)
        accuracy = network.evaluate("dev", dev, args.batch_size)
        print("{:.2f}".format(100 * accuracy))
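The TODO comments in the template above describe additive (Bahdanau-style) attention over the encoded source characters. A hedged sketch of how the with_attention helper could be filled in under those instructions — written against TF 1.x, assuming source_encoded has already been masked to shape [words, source_chars, rnn_dim]; this is an illustration, not the reference solution (older TF 1.x releases spell the softmax axis argument dim, which is used here):

# Sketch only: the three layers are named after the TODOs in the template.
source_layer = tf.layers.Dense(args.rnn_dim)
state_layer = tf.layers.Dense(args.rnn_dim)
weight_layer = tf.layers.Dense(1)

def with_attention(inputs, states):
    # Project the encoded source characters and the current decoder state.
    projected_source = source_layer(source_encoded)                      # [words, source_chars, rnn_dim]
    projected_state = state_layer(tf.expand_dims(states, 1))             # [words, 1, rnn_dim]
    # Additive attention: score, normalize over source characters, weighted sum.
    scores = weight_layer(tf.tanh(projected_source + projected_state))   # [words, source_chars, 1]
    weights = tf.nn.softmax(scores, dim=1)
    attention = tf.reduce_sum(source_encoded * weights, axis=1)          # [words, rnn_dim]
    # Return the concatenation of the inputs and the computed attention.
    return tf.concat([inputs, attention], axis=1)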
                    type=int, help="Maximum number of threads to use.")
args = parser.parse_args()

# Create logdir name
args.logdir = "logs/{}-{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
              for key, value in sorted(vars(args).items()))))
if not os.path.exists("logs"):
    os.mkdir("logs")  # TF 1.6 will do this by itself

# Load the data
if not args.recodex:
    train = morpho_dataset.MorphoDataset(
        "../19_lemmatizer_noattn/czech-cac-train.txt", max_sentences=5000)
    dev = morpho_dataset.MorphoDataset(
        "../19_lemmatizer_noattn/czech-cac-dev.txt", train=train, shuffle_batches=False)
else:
    train = morpho_dataset.MorphoDataset("czech-cac-train.txt", max_sentences=5000)
    dev = morpho_dataset.MorphoDataset("czech-cac-dev.txt", train=train, shuffle_batches=False)

# Construct the network
network = Network(threads=args.threads)
network.construct(args,
                  len(train.factors[train.FORMS].alphabet),
                  len(train.factors[train.LEMMAS].alphabet),
# Dump passed options to allow future prediction.
with open("{}/options.json".format(args.logdir), mode="w") as options_file:
    json.dump(vars(args), options_file, sort_keys=True)

# Postprocess args
args.epochs = [(int(epochs), float(lr)) for epochs, lr in
               (epochs_lr.split(":") for epochs_lr in args.epochs.split(","))]

# Load the data
seq2seq = args.decoding == "seq2seq"
train = morpho_dataset.MorphoDataset(
    args.train_data,
    max_sentences=args.max_sentences,
    seq2seq=seq2seq,
    bert_embeddings_filename=args.bert_embeddings_train,
    flair_filename=args.flair_train,
    elmo_filename=args.elmo_train)

if args.dev_data:
    dev = morpho_dataset.MorphoDataset(
        args.dev_data,
        train=train,
        shuffle_batches=False,
        seq2seq=seq2seq,
        bert_embeddings_filename=args.bert_embeddings_dev,
        flair_filename=args.flair_dev,
        elmo_filename=args.elmo_dev)

test = morpho_dataset.MorphoDataset(
    args.test_data,
    train=train,
        else:
            f.read(binary_len)  # skip
    return we  #, word_to_index (optional)
    #sess.run(cnn.W.assign(initW))


if __name__ == "__main__":
    import numpy as np
    import tensorflow as tf
    from tensorflow.contrib import learn
    import morpho_dataset

    train = morpho_dataset.MorphoDataset("/home/liefe/data/cs/train.txt", lowercase=True)

    # To read as text
    #file = 'word2vec_cs.txt'
    #we, index_to_word, word_to_index = get_params(file)
    #print(we)
    #print(index_to_word[14])
    #print(word_to_index['odkazy'])

    # Read bin file
    with open('wv_we', 'wb') as f:
        file = 'word2vec_cs.bin'
        we = load(file)
        print(we.shape)
        #print(index_to_word[14])

        idx = train.factors[train.FORMS].words_map.get('odkazy')
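The else: f.read(binary_len)  # skip branch above is the tail of a word2vec binary-format reader. For reference, a self-contained sketch of that format — the function name, and returning the word list alongside the matrix, are illustrative choices, not the repository's actual load():

import numpy as np

def load_word2vec_bin(path):
    """Read a word2vec .bin file: a "<vocab_size> <dim>" header line, then for
    each word its text, a space, and <dim> float32 values."""
    with open(path, "rb") as f:
        vocab_size, dim = map(int, f.readline().split())
        binary_len = np.dtype(np.float32).itemsize * dim
        vectors = np.zeros((vocab_size, dim), dtype=np.float32)
        words = []
        for i in range(vocab_size):
            word = b""
            while True:                      # read the word up to the separating space
                ch = f.read(1)
                if ch == b" ":
                    break
                if ch != b"\n":              # skip a possible leading newline
                    word += ch
            words.append(word.decode("utf-8", errors="replace"))
            vectors[i] = np.frombuffer(f.read(binary_len), dtype=np.float32)
    return vectors, words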
                    type=float, help="Norm for gradient clipping.")
args = parser.parse_args()

# Create logdir name
args.logdir = "logs/{}-{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
              for key, value in sorted(vars(args).items()))))
if not os.path.exists("logs"):
    os.mkdir("logs")  # TF 1.6 will do this by itself

# Load the data
train = morpho_dataset.MorphoDataset("esp.train")
dev = morpho_dataset.MorphoDataset("esp.testa", train=train, shuffle_batches=False)
test = morpho_dataset.MorphoDataset("esp.testb", train=train, shuffle_batches=False)

print(len(train.factors[train.FORMS].words),
      len(train.factors[train.FORMS].alphabet),
      len(train.factors[train.NE].words))
print(train.factors[train.NE].words)

# Construct the network
network = Network(threads=args.threads)
default="GRU", type=str, help="RNN cell type.") parser.add_argument("--rnn_cell_dim", default=100, type=int, help="RNN cell dimension.") parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.") args = parser.parse_args() # Load the data print("Loading the data.", file=sys.stderr) data_train = morpho_dataset.MorphoDataset(args.data_train, add_bow_eow=True) data_dev = morpho_dataset.MorphoDataset(args.data_dev, add_bow_eow=True, train=data_train) data_test = morpho_dataset.MorphoDataset(args.data_test, add_bow_eow=True, train=data_train) bow_char = data_train.alphabet.index("<bow>") eow_char = data_train.alphabet.index("<eow>") # Construct the network print("Constructing the network.", file=sys.stderr) expname = "lemmatizer-{}{}-bs{}-epochs{}".format(args.rnn_cell, args.rnn_cell_dim, args.batch_size, args.epochs)
def main(args): import argparse import datetime import json import os import re np.random.seed(42) tf.random.set_seed(42) #command_line = " ".join(sys.argv[1:]) # Parse arguments parser = argparse.ArgumentParser() # parser.add_argument("--threads", default=4, type=int, help="Maximum number of threads to use.") parser.add_argument("--accu", default=1, type=int, help="accumulate batch size") parser.add_argument("--batch_size", default=64, type=int, help="Batch size.") parser.add_argument("--bert", default=None, type=str, help="Bert model for embeddings") parser.add_argument("--bert_model", default=None, type=str, help="Bert model for training") parser.add_argument("--beta_2", default=0.99, type=float, help="Adam beta 2") parser.add_argument("--char_dropout", default=0, type=float, help="Character dropout") parser.add_argument("--checkp", default=None, type=str, help="Checkpoint name") parser.add_argument("--cle_dim", default=256, type=int, help="Character-level embedding dimension.") parser.add_argument("--cont", default=0, type=int, help="load finetuned model and continue training?") parser.add_argument("--debug", default=0, type=int, help="debug on small dataset") parser.add_argument("--dropout", default=0.5, type=float, help="Dropout") parser.add_argument("--embeddings", default=None, type=str, help="External embeddings to use.") parser.add_argument("--epochs", default="40:1e-3,20:1e-4", type=str, help="Epochs and learning rates.") parser.add_argument("--exp", default=None, type=str, help="Experiment name.") parser.add_argument("--factor_layers", default=1, type=int, help="Per-factor layers.") parser.add_argument("--factors", default="Lemmas,Tags", type=str, help="Factors to predict.") parser.add_argument("--fine_lr", default=0, type=float, help="Learning rate for bert layers") parser.add_argument("--label_smoothing", default=0.00, type=float, help="Label smoothing.") parser.add_argument("--layers", default=None, type=str, help="Which layers should be used") parser.add_argument("--lemma_re_strip", default=r"(?<=.)(?:`|_|-[^0-9]).*$", type=str, help="RE suffix to strip from lemma.") parser.add_argument("--lemma_rule_min", default=2, type=int, help="Minimum occurences to keep a lemma rule.") # parser.add_argument("--min_epoch_batches", default=300, type=int, help="Minimum number of batches per epoch.") parser.add_argument("--predict", default=None, type=str, help="Predict using the passed model.") parser.add_argument("--rnn_cell", default="LSTM", type=str, help="RNN cell type.") parser.add_argument("--rnn_cell_dim", default=512, type=int, help="RNN cell dimension.") parser.add_argument("--rnn_layers", default=3, type=int, help="RNN layers.") parser.add_argument("--test_only", default=None, type=str, help="Only test evaluation") parser.add_argument( "--warmup_decay", default=None, type=str, help= "Type i or c. 
Number of warmup steps, than will be applied inverse square root decay" ) parser.add_argument("--we_dim", default=512, type=int, help="Word embedding dimension.") parser.add_argument("--word_dropout", default=0.2, type=float, help="Word dropout") parser.add_argument("data", type=str, help="Input data") args = parser.parse_args(args) args.debug = args.debug == 1 args.cont = args.cont == 1 # Postprocess args args.factors = args.factors.split(",") args.epochs = [(int(epochs), float(lr)) for epochs, lr in (epochs_lr.split(":") for epochs_lr in args.epochs.split(","))] if args.warmup_decay is not None: print("decay is not none") print(args.warmup_decay) args.warmup_decay = args.warmup_decay.split(":") args.decay_type = args.warmup_decay[0] args.warmup_decay = int(args.warmup_decay[1]) else: args.decay_type = None args.bert_load = None name = None if args.bert or args.bert_model: if args.bert_model: print("před parsovanim") print(args.bert_model) args.bert_model = args.bert_model.split(":") if len(args.bert_model) > 1: args.bert_load = args.bert_model[0] print(args.bert_load) print("load") args.bert_model = args.bert_model[1] else: args.bert_model = args.bert_model[0] name = args.bert_model elif args.bert: args.bert = args.bert.split(":") if len(args.bert) > 1: args.bert_load = args.bert[0] print(args.bert_load) print("load") args.bert = args.bert[1] else: args.bert = args.bert[0] name = args.bert if name is not None and "robeczech" in name: sys.path.append(name) import tokenizer.robeczech_tokenizer # TODO vyřešit # tf.config.threading.set_inter_op_parallelism_threads(args.threads) # tf.config.threading.set_intra_op_parallelism_threads(args.threads) # tf.config.set_soft_device_placement(True) if args.predict is None: # Create logdir name if args.exp is None: args.exp = "{}-{}".format( os.path.basename(__file__), datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S")) do_not_log = { "exp", "legtomma_re_strip", "predict", "threads", "bert_model", "bert" } args.logdir = "models/{}".format( args.exp # ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), # re.sub("[^,]*/", "", value) if type(value) == str else value) # for key, value in sorted(vars(args).items()) if key not in do_not_log)) ) if not os.path.exists("models"): os.mkdir("models") if not os.path.exists(args.logdir): os.mkdir(args.logdir) # Dump passed options with open("{}/options.json".format(args.logdir), mode="w") as options_file: json.dump(vars(args), options_file, sort_keys=True) # Load embeddings if args.embeddings: with np.load(args.embeddings, allow_pickle=True) as embeddings_npz: args.embeddings_words = embeddings_npz["words"] args.embeddings_data = embeddings_npz["embeddings"] args.embeddings_size = args.embeddings_data.shape[1] # Nechceme to vsechno dohromady if args.bert and args.bert_model: warnings.warn( "embeddings and whole bert model training are both selected.") model_bert = None if args.bert or args.bert_model: model_bert = BertModel(name, args) if args.predict: # Load training dataset maps from the checkpoint saved = args.exp args.train = morpho_dataset.MorphoDataset.load_mappings( "models/{}/mappings.pickle".format(saved)) # To je ulozeno v # models/jmeno experimentu a checkpoints, predict bude jmneo modelu, v data bude cele jeno vcetne test.txt # Load input data predict = morpho_dataset.MorphoDataset(args.data, train=args.train, shuffle_batches=False, bert=model_bert) else: # Load input data data_paths = [None] * 3 if args.debug: print("DEBUG MODE") data_paths[0] = "{}-train-small.txt".format(args.data) 
data_paths[1] = "{}-dev-small.txt".format(args.data) data_paths[2] = "{}-test-small.txt".format(args.data) else: data_paths[0] = "{}-train.txt".format(args.data) data_paths[1] = "{}-dev.txt".format(args.data) data_paths[2] = "{}-test.txt".format(args.data) args.train = morpho_dataset.MorphoDataset( data_paths[0], embeddings=args.embeddings_words if args.embeddings else None, bert=model_bert, lemma_re_strip=args.lemma_re_strip, lemma_rule_min=args.lemma_rule_min) if os.path.exists(data_paths[1]): args.dev = morpho_dataset.MorphoDataset(data_paths[1], train=args.train, shuffle_batches=False, bert=model_bert) else: args.dev = None if os.path.exists(data_paths[2]): args.test = morpho_dataset.MorphoDataset(data_paths[2], train=args.train, shuffle_batches=False, bert=model_bert) else: args.test = None print(args.bert_load) print("again") # TODO nacitat velikost args.bert_size = 768 if args.decay_type != None: args.steps_in_epoch = math.floor( len(args.train.factors[1].word_strings) / (args.batch_size * args.accu)) network = Network( args=args, num_words=len(args.train.factors[args.train.FORMS].words), num_chars=len(args.train.factors[args.train.FORMS].alphabet), factor_words=dict( (factor, len(args.train.factors[args.train.FACTORS_MAP[factor]].words)) for factor in args.factors), model=model_bert) if args.debug: ... # tf.keras.utils.plot_model(network.outer_model, "my_first_model_with_shape_info.svg", show_shapes=True) if args.fine_lr > 0: args.lr_split = len(network.outer_model.trainable_variables) - len( network.model.trainable_variables) # print("model variables:") # print(str(network.model.trainable_variables)) # print("outer model variables:") # print(str(network.outer_model.trainable_variables)) network.args = args if args.predict: # network.saver_inference.restore(network.session, "{}/checkpoint-inference".format(args.predict)) network.outer_model.load_weights(args.predict) network.predict(predict, args, open(saved + "_vystup", "w"), compare=True) else: log_file = open("{}/log".format(args.logdir), "w") for factor in args.factors: print("{}: {}".format( factor, len(args.train.factors[args.train.FACTORS_MAP[factor]].words)), file=log_file, flush=True) print("Tagging with args:", "\n".join(("{}: {}".format(key, value) for key, value in sorted(vars(args).items()) if key not in [ "embeddings_data", "embeddings_words", "train", "test", "dev" ])), flush=True) def test_eval(predict=None): metrics = network.evaluate(args.test, "test", args, predict) metrics_log = ", ".join( ("{}: {:.2f}".format(metric, 100 * metrics[metric]) for metric in metrics)) for f in [sys.stderr, log_file]: print("Test, epoch {}, lr {}, {}".format( epoch + 1, learning_rate, metrics_log), file=f, flush=True) for i, (epochs, learning_rate) in enumerate(args.epochs): tf.summary.experimental.set_step(0) epoch = 0 test_eval() for epoch in range(epochs): network.train_epoch(args.train, args, learning_rate) if args.dev: print("evaluate") metrics = network.evaluate(args.dev, "dev", args) metrics_log = ", ".join( ("{}: {:.2f}".format(metric, 100 * metrics[metric]) for metric in metrics)) for f in [sys.stderr, log_file]: print("Dev, epoch {}, lr {}, {}".format( epoch + 1, learning_rate, metrics_log), file=f, flush=True) if args.cont and test: test_eval() args.train.save_mappings("{}/mappings.pickle".format(args.logdir)) if args.checkp: checkp = args.checkp else: checkp = args.logdir.split("/")[1] network.outer_model.save_weights('./checkpoints/' + checkp) output_file = args.logdir.split("/")[1] print(output_file) if args.test: 
test_eval(predict=open("./" + output_file + "_vysledky", "w"))
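Several options of this script pack two values into one colon-separated string. A summary of the parsing done in main() above, with illustrative values (the checkpoint and model names are made up):

# --epochs "40:1e-3,20:1e-4"  ->  args.epochs = [(40, 0.001), (20, 0.0001)]
# --warmup_decay "i:8000"     ->  args.decay_type = "i", args.warmup_decay = 8000
# --bert_model "some.ckpt:some-bert-model"
#                             ->  args.bert_load = "some.ckpt", args.bert_model = "some-bert-model"
# --bert "some-bert-model"    ->  args.bert_load stays None, args.bert = "some-bert-model"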
class Network: MAX_GEN_LEN = 99 EMBEDDING_SIZE = 100 ALIGNMENT_SIZE = 100 def __init__(self, encoder, decoder, rnn_cell, rnn_cell_dim, chars_size, words_size, tags_size, bow_char, eow_char, logdir, expname, threads=1, seed=42): # Create an empty graph and a session graph = tf.Graph() graph.seed = seed self.session = tf.Session( graph=graph, config=tf.ConfigProto( inter_op_parallelism_threads=threads, intra_op_parallelism_threads=threads)) timestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S") self.summary_writer = tf.summary.FileWriter("{}/{}-{}".format(logdir, timestamp, expname), flush_secs=10) # Construct the graph with self.session.graph.as_default(): if rnn_cell == "LSTM": rnn_cell = tf.contrib.rnn.LSTMCell(rnn_cell_dim) elif rnn_cell == "GRU": rnn_cell = tf.contrib.rnn.GRUCell(rnn_cell_dim) else: raise ValueError("Unknown rnn_cell {}".format(rnn_cell)) self.global_step = tf.Variable(0, dtype=tf.int64, trainable=False, name="global_step") self.sentence_lens = tf.placeholder(tf.int32, [None], name="sent_lens") self.lemma_ids = tf.placeholder(tf.int32, [None, None], name="lemma_ids") self.lemmas = tf.placeholder(tf.int64, [None, None], name="lemmas") self.lemma_lens = tf.placeholder(tf.int32, [None], name="lemma_lens") self.tag_ids = tf.placeholder(tf.int32, [None, None], name="tag_ids") self.tags = tf.placeholder(tf.int64, [None, None], name="tags") self.tag_lens = tf.placeholder(tf.int32, [None], name="tag_lens") self.form_ids = tf.placeholder(tf.int32, [None, None], name="form_ids") self.forms = tf.placeholder(tf.int64, [None, None], name="forms") self.form_lens = tf.placeholder(tf.int32, [None], name="form_lens") self.alphabet_len = chars_size self.word_vocab_len = words_size self.tag_vocab_len = tags_size self.dummy_inputs = tf.zeros([tf.shape(self.sentence_lens)[0], self.MAX_GEN_LEN], name="inference_shape") self.char_embedding_matrix = tf.get_variable( "char_embeddings", [self.alphabet_len, self.EMBEDDING_SIZE], initializer=tf.random_normal_initializer(stddev=0.01), dtype=tf.float32) self.we_lookup_matrix = tf.get_variable( "we_lookup_matrix", [self.word_vocab_len, self.EMBEDDING_SIZE], initializer=tf.random_normal_initializer(stddev=0.01), dtype=tf.float32, trainable=True) self.tag_lookup_matrix = tf.get_variable( "tag_lookup_matrix", [self.tag_vocab_len, self.EMBEDDING_SIZE], initializer=tf.random_normal_initializer(stddev=0.01), dtype=tf.float32, trainable=True) # Encode words with tf.variable_scope("encoder"): self.char_embeddings = tf.nn.embedding_lookup(self.char_embedding_matrix, self.lemmas) ch_rnn_cell = tf.contrib.rnn.GRUCell(rnn_cell_dim) hidden_states, final_states = tf.nn.bidirectional_dynamic_rnn( cell_fw=ch_rnn_cell, cell_bw=ch_rnn_cell, inputs=self.char_embeddings, sequence_length=self.lemma_lens, dtype=tf.float32, scope="char_BiRNN") self.sentence_mask = tf.sequence_mask(self.sentence_lens) # Create decoder input self.we_encoder_matrix = tf_layers.linear( tf.concat(axis=1, values=final_states), self.EMBEDDING_SIZE, scope="we_encoder_matrix") self.encoder_output = tf.nn.embedding_lookup(self.we_encoder_matrix, self.lemma_ids) self.encoder_output = tf.reshape( tf.boolean_mask(self.encoder_output, self.sentence_mask), [-1, self.EMBEDDING_SIZE], name="encoder_output_flat") # Encode tags self.tags_embedded = tf.nn.embedding_lookup(self.tag_lookup_matrix, self.tag_ids) self.tags_embedded = tf.reshape( tf.boolean_mask(self.tags_embedded, self.sentence_mask), [-1, self.EMBEDDING_SIZE], name="tag_embeddings_flat") # Combine encoder_output with tag embedding 
self.encoder_output = tf_layers.linear( tf.concat(axis=1, values=[self.encoder_output, self.tags_embedded]), self.EMBEDDING_SIZE, scope="encoder_output_with_tags") # Create annotations for attention self.annot_matrix = tf_layers.linear( tf.concat(axis=2, values=hidden_states), self.EMBEDDING_SIZE, scope="annot_matrix") self.annotations = tf.nn.embedding_lookup(self.annot_matrix, self.lemma_ids) self.annotations = tf.reshape( tf.boolean_mask(self.annotations, self.sentence_mask), [-1, tf.shape(self.annot_matrix)[1], self.EMBEDDING_SIZE], name="annotations_flat") # Reshape form values self.forms_flat = tf.nn.embedding_lookup(self.forms, self.form_ids) self.forms_flat = tf.reshape( tf.boolean_mask(self.forms_flat, self.sentence_mask), [-1, tf.shape(self.forms)[1]], name="forms_flat") self.forms_flat_lens = tf.nn.embedding_lookup(self.form_lens, self.form_ids) self.forms_flat_lens = tf.reshape( tf.boolean_mask(self.forms_flat_lens, self.sentence_mask), [-1], name="lemmas_flat_lens") self.attention_fn = None if decoder in ["individual", "individual_attention", "combined_attention", "combined_attention_birnn"]: if decoder in ["individual_attention", "combined_attention", "combined_attention_birnn"]: #self.attention_fn = self.attention_fn_builder(self.annotations) if decoder == "combined_attention": word_embeddings = tf.nn.embedding_lookup(self.we_lookup_matrix, self.lemma_ids) word_embeddings = tf.reshape( tf.boolean_mask(word_embeddings, self.sentence_mask), [-1, self.EMBEDDING_SIZE], name="word_embeddings_flat") self.encoder_output = tf_layers.linear( tf.concat(axis=1, values=[self.encoder_output, word_embeddings]), self.EMBEDDING_SIZE, scope="combined_encoder_output") if decoder == "combined_attention_rnn": else: raise ValueError("Unknown decoder ({}).".format(decoder)) # Decoder training with tf.variable_scope("decoder"): if decoder == "individual": self.training_logits, states = tf_seq2seq.rnn_decoder( decoder_inputs=self.forms_flat, initial_state=self.encoder_output, cell=rnn_cell) else: self.training_logits, states = tf_seq2seq.attention_decoder( decoder_inputs=self.forms_flat, initial_state=self.encoder_output, attention_states=self.annotations, cell=rnn_cell) #self.training_logits, states = tf_seq2seq.dynamic_rnn_decoder( #cell=rnn_cell, #decoder_fn=self.decoder_fn_train( # self.encoder_output, # self.output_fn_builder(), # self.input_fn_builder(self.char_embedding_matrix, self.attention_fn)), #inputs=tf.expand_dims(self.forms_flat, -1), #sequence_length=self.forms_flat_lens) # Decoder inference with tf.variable_scope("decoder", reuse=True): if decoder == "individual": self.training_logits, states = tf_seq2seq.rnn_decoder( decoder_inputs=self.dummy_inputs, initial_state=self.encoder_output, cell=rnn_cell, loop_function=decoder_fn) else: self.training_logits, states = tf_seq2seq.attention_decoder( decoder_inputs=self.dummy_inputs, initial_state=self.encoder_output, attention_states=self.annotations, cell=rnn_cell, loop_function=decoder_fn) #self.inference_logits, states = tf_seq2seq.dynamic_rnn_decoder( #cell=rnn_cell, #decoder_fn=self.decoder_fn_inference( # self.encoder_output, # self.output_fn_builder(), # self.input_fn_builder(self.char_embedding_matrix, self.attention_fn), #bow_char, #eow_char, #self.MAX_GEN_LEN)) self.predictions = tf.argmax(self.inference_logits, 2) loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.training_logits, labels=self.forms_flat[:,1:])) self.training = tf.train.AdamOptimizer().minimize(loss, global_step=self.global_step) 
self.forms_flat = tf.cond( tf.reduce_max(self.forms_flat_lens) > self.MAX_GEN_LEN, lambda: tf.slice(self.forms_flat, [0, 0], [-1, self.MAX_GEN_LEN]), lambda: self.forms_flat) self.pred_padded = tf.pad( self.predictions, [[0,0],[0, self.MAX_GEN_LEN - tf.shape(self.predictions)[1]]], mode="CONSTANT") self.forms_padded = tf.pad( self.forms_flat, [[0,0],[0, self.MAX_GEN_LEN - tf.shape(self.forms_flat)[1] + 1]], mode="CONSTANT") self.char_accuracy = tf_metrics.accuracy(self.pred_padded, self.forms_padded[:,1:]) self.word_accuracy = tf.reduce_mean(tf.reduce_min(tf.cast(tf.equal(self.pred_padded, self.forms_padded[:,1:]), tf.float32), axis=1)) self.summary = {} for dataset_name in ["train", "dev"]: self.summary[dataset_name] = tf.summary.merge([tf.summary.scalar(dataset_name+"/loss", loss), tf.summary.scalar(dataset_name+"/char_accuracy", self.char_accuracy), tf.summary.scalar(dataset_name+"/word_accuracy", self.word_accuracy)]) # Initialize variables self.session.run(tf.global_variables_initializer()) if self.summary_writer: self.summary_writer.add_graph(self.session.graph) # Simple decoder for training def decoder_fn_train(self, encoder_state, output_fn, input_fn, name=None): def decoder_fn(time, cell_state, next_id, cell_output, context_state): cell_output = output_fn(cell_output) reuse = True if cell_state is None: # first call, return encoder_state cell_state = encoder_state reuse = None next_input = input_fn(tf.squeeze(next_id, [1]), cell_state, reuse) return (None, cell_state, next_input, cell_output, context_state) return decoder_fn # TODO: Beam search # Simple decoder for inference def decoder_fn_inference(self, encoder_state, output_fn, input_fn, beginning_of_word="<bow>", end_of_word="<eow>", maximum_length=MAX_GEN_LEN): batch_size = tf.shape(encoder_state)[0] def decoder_fn(time, cell_state, cell_input, cell_output, context_state): cell_output = output_fn(cell_output) if cell_state is None: cell_state = encoder_state next_id = tf.tile([beginning_of_word], [batch_size]) done = tf.zeros([batch_size], dtype=tf.bool) else: next_id = tf.argmax(cell_output, 1) done = tf.equal(next_id, end_of_word) done = tf.cond( tf.greater_equal(time, maximum_length), # return true if time >= maxlen lambda: tf.ones([batch_size], dtype=tf.bool), lambda: done) next_input = input_fn(next_id, cell_state, True) return (done, cell_state, next_input, cell_output, context_state) return decoder_fn def decoder_fn_builder(self, encoder_state, output_fn, input_fn, beginning_of_word="<bow>", end_of_word="<eow>", maximum_length=MAX_GEN_LEN): def decoder_fn(cell_output, i): cell_output = output_fn(cell_output) next_input = tf.argmax(cell_output, 1) next_input = input_fn(next_input) return decoder_fn # TODO: dropout def attention_fn_builder(self, annotations): def attention_fn(state): batch_size = tf.shape(state)[0] annot_len = tf.shape(annotations)[1] annot_dim = annotations.get_shape().as_list()[2] state_dim = state.get_shape().as_list()[1] e_dim = self.ALIGNMENT_SIZE a = tf.reshape(annotations, [-1, annot_dim]) U = tf.get_variable( "annot_weight", shape=[annot_dim, e_dim], initializer=tf.random_normal_initializer(stddev=0.1), trainable=True) U_b = tf.get_variable( "annot_bias", shape=[e_dim], initializer=tf.constant_initializer(0.1)) W = tf.get_variable( "state_weight", shape=[state_dim, e_dim], initializer=tf.random_normal_initializer(stddev=0.1), trainable=True) W_b = tf.get_variable( "state_bias", shape=[e_dim], initializer=tf.constant_initializer(0.1)) v = tf.get_variable( "lin_combo", shape=[e_dim, 1], 
initializer=tf.random_normal_initializer(stddev=0.1), trainable=True) w_res = tf.matmul(state, W) + W_b w_res = tf.tile(tf.reshape(w_res, [-1, 1]), [1, annot_len]) u_res = tf.matmul(a, U) + U_b u_res = tf.reshape(u_res, [-1, annot_len]) e = tf.matmul(tf.tanh(tf.reshape(w_res + u_res, [-1, e_dim])), v) e = tf.reshape(e, [batch_size, -1]) alpha = tf.nn.softmax(e) alpha = tf.tile(tf.reshape(alpha, [-1, 1]), [1, annot_dim]) c = tf.multiply(alpha, a) c = tf.reduce_sum(tf.reshape(c, [batch_size, -1, annot_dim]), 1) C = tf.get_variable( "attention_weight", shape=[state_dim, state_dim], initializer=tf.random_normal_initializer(stddev=0.1), trainable=True) C_b = tf.get_variable( "attention_bias", shape=[state_dim], initializer=tf.constant_initializer(0.1)) return tf.add(tf.matmul(c, C), C_b) return attention_fn # Output function builder (makes logits out of rnn outputs) def output_fn_builder(self): def output_fn(cell_output): if cell_output is None: return tf.zeros([self.alphabet_len], tf.float32) # only used for shape inference else: return tf_layers.linear( cell_output, num_outputs=self.alphabet_len, scope="decoder_output") return output_fn # Input function builder (makes rnn input from word id and cell state) def input_fn_builder(self, embeddings): def input_fn(next_id): return tf.nn.embedding_lookup(embeddings, next_id) return input_fn # Input function builder (makes rnn input from word id and cell state) #def input_fn_builder(self, embeddings, attention_fn=None): # def input_fn(next_id, cell_state, reuse=True): # if attention_fn is not None: # with tf.variable_scope("attention", reuse=reuse): # return tf.add( # tf.nn.embedding_lookup(embeddings, next_id), # attention_fn(cell_state)) # else: # return tf.nn.embedding_lookup(embeddings, next_id) # # return input_fn @property def training_step(self): return self.session.run(self.global_step) def train(self, sentence_lens, forms, form_ids, form_lens, tags, tag_ids, tag_lens, lemmas, lemma_ids, lemma_lens): try: _, summary, pred = self.session.run([self.training, self.summary, self.predictions], {self.sentence_lens: sentence_lens, self.forms: forms, self.form_ids: form_ids, self.form_lens: form_lens, self.tags: tags, self.tag_ids: tag_ids, self.tag_lens: tag_lens, self.lemmas: lemmas, self.lemma_ids: lemma_ids, self.lemma_lens: lemma_lens}) except Exception as e: import pdb; pdb.set_trace() raise e self.summary_writer.add_summary(summary["train"], self.training_step) def evaluate(self, sentence_lens, forms, form_ids, form_lens, tags, tag_ids, tag_lens, lemmas, lemma_ids, lemma_lens): try: ch_acc, w_acc, summary, pred = self.session.run([self.char_accuracy, self.word_accuracy, self.summary, self.predictions], {self.sentence_lens: sentence_lens, self.forms: forms, self.form_ids: form_ids, self.form_lens: form_lens, self.tags: tags, self.tag_ids: tag_ids, self.tag_lens: tag_lens, self.lemmas: lemmas, self.lemma_ids: lemma_ids, self.lemma_lens: lemma_lens}) except Exception as e: import pdb; pdb.set_trace() raise e self.summary_writer.add_summary(summary["dev"], self.training_step) return ch_acc, w_acc def predict(self, sentence_lens, lemmas, lemma_ids, lemma_lens, tags, tag_ids, tag_lens): predictions = self.session.run([self.predictions], {self.sentence_lens: sentence_lens, self.lemmas: lemmas, self.lemma_ids: lemma_ids, self.lemma_lens: lemma_lens, self.tags: tags, self.tag_ids: tag_ids, self.tag_lens: tag_lens}) return predictions if __name__ == "__main__": # Fix random seed np.random.seed(42) # Parse arguments import argparse parser = 
argparse.ArgumentParser() parser.add_argument("--batch_size", default=64, type=int, help="Batch size.") parser.add_argument("--data_train", default="data/en-train-gen.txt", type=str, help="Training data file.") parser.add_argument("--data_dev", default="data/en-dev.txt", type=str, help="Development data file.") parser.add_argument("--data_test", default="data/en-test-gen.txt", type=str, help="Testing data file.") parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.") parser.add_argument("--logdir", default="logs", type=str, help="Logdir name.") parser.add_argument("--rnn_cell", default="GRU", type=str, help="RNN cell type.") parser.add_argument("--rnn_cell_dim", default=100, type=int, help="RNN cell dimension.") parser.add_argument("--encoder", default="simple", type=str, help="Which encoder should we use.") parser.add_argument("--decoder", default="individual", type=str, help="Which decoder should we use.") parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.") args = parser.parse_args() # Load the data print("Loading the data.", file=sys.stderr) data_train = morpho_dataset.MorphoDataset(args.data_train, add_bow_eow=True) data_dev = morpho_dataset.MorphoDataset(args.data_dev, add_bow_eow=True, train=data_train) data_test = morpho_dataset.MorphoDataset(args.data_test, add_bow_eow=True, train=data_train) bow_char = data_train.alphabet.index("<bow>") eow_char = data_train.alphabet.index("<eow>") # Construct the network print("Constructing the network.", file=sys.stderr) expname = "generator-{}{}-bs{}-epochs{}".format(args.rnn_cell, args.rnn_cell_dim, args.batch_size, args.epochs) network = Network(rnn_cell=args.rnn_cell, encoder=args.encoder, decoder=args.decoder, rnn_cell_dim=args.rnn_cell_dim, chars_size=len(data_train.alphabet), words_size=len(data_train.factors[data_train.FORMS]['words']), tags_size=len(data_train.factors[data_train.TAGS]['words']), bow_char=bow_char, eow_char=eow_char, logdir=args.logdir, expname=expname, threads=args.threads) # Train best_dev_ch_acc = 0 best_dev_w_acc = 0 test_predictions = None for epoch in range(args.epochs): print("Training epoch {}".format(epoch + 1), file=sys.stderr) while not data_train.epoch_finished(): sentence_lens, form_ids, charseq_ids, charseqs, charseq_lens = \ data_train.next_batch(args.batch_size, including_charseqs=True) network.train( sentence_lens, charseqs[data_train.FORMS], charseq_ids[data_train.FORMS], charseq_lens[data_train.FORMS], charseqs[data_train.TAGS], charseq_ids[data_train.TAGS], charseq_lens[data_train.TAGS], charseqs[data_train.LEMMAS], charseq_ids[data_train.LEMMAS], charseq_lens[data_train.LEMMAS]) sentence_lens, form_ids, charseq_ids, charseqs, charseq_lens = data_dev.whole_data_as_batch(including_charseqs=True) dev_ch_acc, dev_w_acc = network.evaluate( sentence_lens, charseqs[data_train.FORMS], charseq_ids[data_train.FORMS], charseq_lens[data_train.FORMS], charseqs[data_train.TAGS], charseq_ids[data_train.TAGS], charseq_lens[data_train.TAGS], charseqs[data_train.LEMMAS], charseq_ids[data_train.LEMMAS], charseq_lens[data_train.LEMMAS]) print("Development ch_acc after epoch {} is {:.2f}, w_acc is {:.2f}.".format(epoch + 1, 100. * dev_ch_acc, 100. 
* dev_w_acc), file=sys.stderr) if dev_w_acc > best_dev_w_acc or (dev_w_acc == best_dev_w_acc and dev_ch_acc > best_dev_ch_acc): best_dev_w_acc = dev_w_acc best_dev_ch_acc = dev_ch_acc sentence_lens, form_ids, charseq_ids, charseqs, charseq_lens = data_test.whole_data_as_batch(including_charseqs=True) test_predictions = network.predict( sentence_lens, charseqs[data_train.LEMMAS], charseq_ids[data_train.LEMMAS], charseq_lens[data_train.LEMMAS], charseqs[data_train.TAGS], charseq_ids[data_train.TAGS], charseq_lens[data_train.TAGS]) # Print test predictions test_forms = data_test.factors[data_test.FORMS]['strings'] # We use strings instead of words, because words can be <unk> test_predictions = list(test_predictions) for i in range(len(data_test.sentence_lens)): for j in range(data_test.sentence_lens[i]): form = '' pred = test_predictions.pop(0) for k in range(len(pred)): if pred[k] == eow_char: break form += data_test.alphabet[pred[k]] print("{}\t{}\t_".format(test_forms[i][j], form)) print() print("Final best dev set accuracy: {:.2f}".format(100. * best_dev_w_acc))
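For reference, attention_fn_builder in the class above implements standard additive (Bahdanau-style) attention. In terms of its variables (annotations a_j, decoder state s, parameters U, U_b, W, W_b, v, C, C_b), each call computes

    e_j   = v^T tanh(W s + W_b + U a_j + U_b)
    alpha = softmax(e)
    c     = sum_j alpha_j * a_j
    attention(s) = C c + C_b

i.e. a context vector c formed as the softmax-weighted sum of the annotations, followed by a final linear projection.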
# np.random.seed(42)

# Parse arguments
# parser = argparse.ArgumentParser()
# parser.add_argument("--batch_size", default=10, type=int, help="Batch size.")
# parser.add_argument("--epochs", default=10, type=int, help="Number of epochs.")
# parser.add_argument("--threads", default=8, type=int, help="Maximum number of threads to use.")
parser = argparse.ArgumentParser()
parser.add_argument("best", type=str, help="Gold dev file.")
parser.add_argument("prediction", type=str, help="Prediction file.")
args = parser.parse_args()

analyzer_dictionary = MorphoAnalyzer("../18_tagger_sota/czech-pdt-analysis-dictionary.txt")
analyzer_guesser = MorphoAnalyzer("../18_tagger_sota/czech-pdt-analysis-guesser.txt")

prediction = morpho_dataset.MorphoDataset(args.prediction)

dir = os.path.dirname(args.prediction)
f = os.path.basename(args.prediction)
with open("{}/a_{}.txt".format(dir, f), "w", encoding="utf-8") as test_file:
    forms = prediction.factors[prediction.FORMS].strings
    tags = prediction.factors[prediction.LEMMAS].strings
    for s in range(len(forms)):
        for j in range(len(forms[s])):
            print("{}\t{}\t_".format(forms[s][j],
                                     analyze(forms[s][j], tags[s][j],
                                             analyzer_dictionary, analyzer_guesser)),
                  file=test_file)
        print("", file=test_file)

print("Original")  # "Puvodni"
os.system('python morpho_eval.py ' + args.best + " " + args.prediction)
print("+Analyzer")
                    default=8, type=int, help="Maximum number of threads to use.")
args = parser.parse_args()

# Create logdir name
args.logdir = "logs/{}-{}-{}".format(
    os.path.basename(__file__),
    datetime.datetime.now().strftime("%Y-%m-%d_%H%M%S"),
    ",".join(("{}={}".format(re.sub("(.)[^_]*_?", r"\1", key), value)
              for key, value in sorted(vars(args).items()))))
if not os.path.exists("logs"):
    os.mkdir("logs")  # TF 1.6 will do this by itself

# Load the data
train = morpho_dataset.MorphoDataset(
    "../18_tagger_sota/czech-pdt-train.txt", max_sentences=1000)
dev = morpho_dataset.MorphoDataset("../18_tagger_sota/czech-pdt-dev.txt",
                                   train=train, shuffle_batches=False)
test = morpho_dataset.MorphoDataset("../18_tagger_sota/czech-pdt-test.txt",
                                    train=train, shuffle_batches=False)

analyzer_dictionary = MorphoAnalyzer(
    "../18_tagger_sota/czech-pdt-analysis-dictionary.txt")
analyzer_guesser = MorphoAnalyzer(
    "../18_tagger_sota/czech-pdt-analysis-guesser.txt")

# Construct the network
network = Network(threads=args.threads)
network.construct(args,
                  len(train.factors[train.FORMS].alphabet),
#train = morpho_dataset.MorphoDataset("/home/liefe/data/cs/train.txt", lowercase=True) #train = morpho_dataset.MorphoDataset("/afs/ms/u/l/liefe/data/cs/train.txt", lowercase=True) # To read as text #file = 'word2vec_cs.txt' #we, index_to_word, word_to_index = get_params(file) #print(we) #print(index_to_word[14]) #print(word_to_index['odkazy']) # Read bin file model_file = sys.argv[1] train_file = sys.argv[2] home = expanduser('~') train_file = home + '/data/cs/' + train_file train = morpho_dataset.MorphoDataset(train_file, lowercase=False) #train = morpho_dataset.MorphoDataset(train_file, lowercase=True) # Save file in numpy format #with open(model_file, 'wb') as f: #file = '/home/liefe/py/wv_data/word2vec_cs64.bin' #model = load_text(model_file) # read text file model = load_bin(model_file) # read text file print 'model shape: ' print model.shape #print(index_to_word[14]) #print('done emebedding..testing') idx = train.factors[train.FORMS].words_map.get('odkazy') print 'odkazy'
#train = morpho_dataset.MorphoDataset("/home/liefe/data/cs/train.txt", lowercase=True) #train = morpho_dataset.MorphoDataset("/afs/ms/u/l/liefe/data/cs/train.txt", lowercase=True) # To read as text #file = 'word2vec_cs.txt' #we, index_to_word, word_to_index = get_params(file) #print(we) #print(index_to_word[14]) #print(word_to_index['odkazy']) # Read bin file model_file = sys.argv[1] train_file = sys.argv[2] home = expanduser('~') train_file = home + '/data/cs/' + train_file train = morpho_dataset.MorphoDataset(train_file, lowercase=True) # Save file in numpy format #with open(model_file, 'wb') as f: #file = '/home/liefe/py/wv_data/word2vec_cs64.bin' model = load_text(model_file) # read text file print('model shape: ', model.shape) #print(index_to_word[14]) print('done emebedding..testing') idx = train.factors[train.FORMS].words_map.get('odkazy') print('odkazy: {}, we={}'.format(idx, model[idx,:])) print('saving model') np.save(model_file + '_embedded', model)