def __generate_word_matrix(self, index_lookup):
    """
    Generate a BOW matrix with rows and columns corresponding to documents and words, respectively.

    @param index_lookup: A dictionary with keys for the attributes, mapping each word
    to the column that should be incremented in the word matrix.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    length = len(batches)
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        bag_of_words_matrix = zeros([len(docs_list), len(index_lookup)])
        row = 0
        for doc in docs_list:
            for token in doc:
                try:
                    col = index_lookup[token]
                    bag_of_words_matrix[row, col] += 1
                except KeyError:  # Word is not found in the dictionary.
                    continue
            row += 1
        # Serialize bag of words.
        s.dump(bag_of_words_matrix.tolist(), open(env_paths.get_bow_matrix_path(self.training, batch), "wb"))
        print 'Processed ' + str(processed) + ' of ' + str(length) + ' batches'
        processed += 1
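# A minimal, self-contained sketch of the counting step above: `index_lookup`
# maps each known word to a fixed column, and out-of-vocabulary tokens are
# skipped. The toy vocabulary and documents here are illustrative, not from
# the corpus.
from numpy import zeros

def _bow_demo():
    index_lookup = {'cat': 0, 'dog': 1, 'fish': 2}
    docs = [['cat', 'cat', 'dog'], ['fish', 'bird']]  # 'bird' is out of vocabulary
    matrix = zeros([len(docs), len(index_lookup)])
    for row, doc in enumerate(docs):
        for token in doc:
            try:
                matrix[row, index_lookup[token]] += 1
            except KeyError:
                continue
    return matrix  # [[2., 1., 0.], [0., 0., 1.]]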
def __generate_input_data(self):
    """
    Generate the input data for the DBN so that it can be visualised.
    """
    if not len(self.input_data) == 0:
        return
    try:
        # Use cached data if it has been serialized in a previous run.
        self.input_data = s.load(open('output/input_data.p', 'rb'))
        self.class_indices = s.load(open('output/class_indices.p', 'rb'))
        if self.classes_to_visualise is not None:
            self.__filter_input_data(self.classes_to_visualise)
    except:
        self.input_data = generate_input_data_list(training=False) if self.testing else generate_input_data_list()
        self.class_indices = get_all_class_indices(training=False) if self.testing else get_all_class_indices()
        if self.classes_to_visualise is not None:
            self.__filter_input_data(self.classes_to_visualise)
        s.dump([inp.tolist() for inp in self.input_data], open('output/input_data.p', 'wb'))
        s.dump(self.class_indices, open('output/class_indices.p', 'wb'))
    self.legend = get_class_names_for_class_indices(list(set(sorted(self.class_indices))))
def __generate_output_data(self):
    """
    Generate the output data of the DBN so that it can be visualised.
    """
    if not len(self.output_data) == 0:
        return
    try:
        # Use cached data if it has been serialized in a previous run.
        self.output_data = s.load(open('output/output_data.p', 'rb'))
        self.class_indices = s.load(open('output/class_indices.p', 'rb'))
        if self.classes_to_visualise is not None:
            self.__filter_output_data(self.classes_to_visualise)
    except:
        self.output_data = generate_output_for_test_data(
            image_data=self.image_data, binary_output=self.binary_output
        ) if self.testing else generate_output_for_train_data(
            image_data=self.image_data, binary_output=self.binary_output)
        self.class_indices = get_all_class_indices(training=False) if self.testing else get_all_class_indices()
        if self.classes_to_visualise is not None:
            self.__filter_output_data(self.classes_to_visualise)
        s.dump([out.tolist() for out in self.output_data], open('output/output_data.p', 'wb'))
        s.dump(self.class_indices, open('output/class_indices.p', 'wb'))
    self.legend = get_class_names_for_class_indices(list(set(sorted(self.class_indices))))
def __init__(self, testing=True, binary_output=False):
    """
    @param testing: Should be True if test data is to be plotted. Otherwise False.
    @param binary_output: If the output of the DBN must be binary.
    """
    if not check_for_data():  # Bug fix: call the function; testing the function object is always truthy.
        print 'No DBN data or testing data.'
        return
    self.status = -1
    self.output = []
    self.testing = testing
    self.binary_output = binary_output
    try:
        self.output_data = s.load(open('output/output_data.p', 'rb'))
        self.class_indices = s.load(open('output/class_indices.p', 'rb'))
    except:
        self.output_data = generate_output_for_test_data(
            binary_output=self.binary_output) if testing else generate_output_for_train_data(
            binary_output=self.binary_output)
        self.class_indices = get_all_class_indices(training=False) if testing else get_all_class_indices()
        s.dump([out.tolist() for out in self.output_data], open('output/output_data.p', 'wb'))
        s.dump(self.class_indices, open('output/class_indices.p', 'wb'))
    self.output_data = np.array(self.output_data)
def compare_real_data_to_reconstructed_data():
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    class_indices = s.load(open(env_paths.get_class_indices_path(False, batches[0]).replace(".0", ""), "rb"))
    batch = batches[0]
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Pick one document per class, up to 10 classes.
    class_to_doc = {}  # Renamed from `dict` to avoid shadowing the builtin.
    for i in range(len(class_indices)):
        idx = class_indices[i]
        if idx in class_to_doc.keys():
            continue
        class_to_doc[idx] = data[i]
        if len(class_to_doc) >= 10:
            break
    print class_to_doc.keys()
    data_points = class_to_doc.values()
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # Append the bias unit to the input vector.
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
def onTurnSelected(self, evt):
    turn = evt.attr1
    # Only load if db does not know about this turn.
    if turn not in db.db.turns or not db.db.turns[turn]:
        serialization.load(turn, self)
    self.map.turn = turn
    self.map.update()
    log.info('update info panel with turn %s' % (self.map.turn,))
    self.info_panel.update(self.map.turn)
def load_rbm_weights():
    """
    Load the weight matrices and biases from the RBM pretraining.

    @return: the weight matrices, hidden biases and visible biases of the RBM pretraining.
    """
    weights = [array(w) for w in s.load(open(env_paths.get_rbm_weights_path(), "rb"))]
    hid_bias = [array(b) for b in s.load(open(env_paths.get_rbm_hidden_biases_path(), "rb"))]
    vis_bias = [array(b) for b in s.load(open(env_paths.get_rbm_visible_biases_path(), "rb"))]
    return weights, hid_bias, vis_bias
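# Hedged usage sketch: unpacking the pretrained RBM parameters layer by layer.
# Assumes the serialized lists are ordered from the first to the last RBM layer.
def print_rbm_layer_shapes():
    weights, hid_bias, vis_bias = load_rbm_weights()
    for i, (w, hb, vb) in enumerate(zip(weights, hid_bias, vis_bias)):
        print('layer %d: weights %s, hidden bias %s, visible bias %s'
              % (i, w.shape, hb.shape, vb.shape))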
def get_document_class(row, batch, training=True):
    """
    The class of a document corresponding to a row in a batch.

    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    class_indices_for_batch = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    class_names_for_batch = s.load(open(env_paths.get_class_names_path(training), "rb"))
    return class_names_for_batch[class_indices_for_batch[row]]
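# Hedged example tying the lookup helpers together: for the first batch in the
# batch list, print each document's name next to its class. Assumes the helpers
# get_batch_list and get_document_names defined elsewhere in this module.
def print_batch_summary(training=True):
    batch = get_batch_list(training)[0]
    names = get_document_names(batch, training)
    for row in range(len(names)):
        print('%s -> %s' % (names[row], get_document_class(row, batch, training)))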
def __set_attributes(self):
    """
    Set the attributes, i.e. the list of words acting as columns in the bag of words matrix.

    @return: The generated list of words acting as attributes for the BOWs.
    """
    batches = s.load(open(env_paths.get_batches_path(self.training), "rb"))
    length = len(batches)
    attributes = []
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        tmp_attributes = list(set(sorted(list(chain(*docs_list)))))  # Retrieve each distinct word of the docs list.
        attributes += tmp_attributes
        attributes = list(set(sorted(attributes)))  # Deduplicate so that no word occurs twice.
        if self.acceptance_lst is not None:
            attributes = list(set(attributes).intersection(self.acceptance_lst))  # Only consider words in the acceptance list.
        print "Processed attribute " + str(processed) + " of " + str(length) + " batches"
        processed += 1

    # Count word frequencies to find the attributes of the most common words.
    d = dict.fromkeys(attributes)
    processed = 1
    for batch in batches:
        docs_list = s.load(open(env_paths.get_doc_list_path(self.training, batch), "rb"))
        words = list(chain(*docs_list))
        for w in words:
            try:
                if d[w] is None:
                    d[w] = 1
                else:
                    d[w] += 1
            except KeyError:
                continue
        print "Processed summing " + str(processed) + " of " + str(length) + " batches"
        processed += 1
    # Keep only the self.max_words_matrix most frequent attributes.
    sorted_att = sorted(d.items(), key=lambda x: x[1])
    sorted_att = sorted_att[len(sorted_att) - self.max_words_matrix:]
    attributes = [elem[0] for elem in sorted_att]

    # Serialize attributes.
    s.dump(attributes, open(env_paths.get_attributes_path(self.training), "wb"))
    return attributes
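# A compact sketch of the same top-N selection using collections.Counter;
# equivalent in spirit to the frequency-count loop above (illustrative only,
# without the batched loading and acceptance-list filtering).
from collections import Counter

def top_n_words(docs_list, n):
    counts = Counter(w for doc in docs_list for w in doc)
    return [w for w, _ in counts.most_common(n)]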
def get_attributes(training=True):
    """
    Get the attributes.

    @param training: is this the training set or the test set.
    """
    return s.load(open(env_paths.get_attributes_path(training), "rb"))
def get_bag_of_words_matrix(batch, training=True):
    """
    Retrieve the bag of words matrix for a batch.

    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    return array(s.load(open(env_paths.get_bow_matrix_path(training, int(batch)), "rb")))
def get_batch_list(training=True):
    """
    Retrieve the list containing the batch numbers.

    @param training: is this the training set or the test set.
    """
    return s.load(open(env_paths.get_batches_path(training), "rb"))
def get_weights():
    """
    Retrieve the weights from the generated DBN.

    @return: Weights of the DBN.
    """
    return [array(w) for w in s.load(open(env_paths.get_dbn_weight_path(), "rb"))]
def load_model(model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    with open(model_path) as f:
        model = load(f.read())
    if SACRED_OBSERVE_FILES:
        _run.add_resource(model_path)
    return model
def load_dbn_weights():
    """
    Load the weight matrices from the finetuning.

    @return: the weight matrices of the finetuning.
    """
    return [array(w) for w in s.load(open(env_paths.get_dbn_weight_path(), "rb"))]
def get_all_document_names(training=True):
    batches = get_batch_list(training)
    doc_names_collected = []
    for batch in batches:
        doc_names_collected += list(s.load(open(env_paths.get_doc_names_path(training, int(batch)), "rb")))
    return doc_names_collected
def load_metadata(save_dir, _log, _run):
    filename = os.path.join(save_dir, METADATA_FILENAME)
    _log.info('Loading metadata from %s', filename)
    with open(filename) as f:
        metadata = load(f.read())
    if SACRED_OBSERVE_FILES:
        _run.add_resource(filename)
    return metadata
def load_model(model_path, _log, _run):
    _log.info('Loading model from %s', model_path)
    with open(model_path) as f:
        model = load(f.read())
    assert isinstance(model, HMMSummarizer), 'model is not an HMM summarizer'
    if SAVE_FILES:
        _run.add_resource(model_path)
    return model
def compare_real_data_to_reconstructed_data_random():
    weights = s.load(open(env_paths.get_dbn_weight_path(), "rb"))
    batches = s.load(open(env_paths.get_batches_path(train=False), "rb"))
    batch = choice(batches)  # Make sure to pick a batch at random.
    data = data_processing.get_bag_of_words_matrix(batch, training=False)
    # Choose 10 data points at random.
    data_points = []
    indices = random.randint(0, len(data), 10)
    for idx in indices:
        data_points.append(data[idx])
    output_data_points = []
    for d in data_points:
        d = append(d, 1.)  # Append the bias unit to the input vector.
        out = generate_output_data(d, weights)
        output_data_points.append(out)
    visualise_data_points(data_points, output_data_points)
def get_document_names(batch, training=True):
    """
    Get document names.

    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    names = s.load(open(env_paths.get_doc_names_path(training, batch), "rb"))
    return names
def get_document_name(row, batch, training=True):
    """
    The name of the document corresponding to a row in a batch.

    @param row: row in the bag of words matrix in batch.
    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    return s.load(open(env_paths.get_doc_names_path(training, batch), "rb"))[row]
def get_class_indices(batch, training=True):
    """
    Get all class indices of the documents in a batch.

    @param batch: the number of the batch.
    @param training: is this the training set or the test set.
    """
    # Bug fix: the path must be wrapped in open() before being passed to s.load.
    indices = s.load(open(env_paths.get_class_indices_path(training, batch), "rb"))
    return indices
def __get_input_data__(self, batch_index, first_layer):
    """
    Retrieve the word-count matrix from HDD.

    @param batch_index: Index of the batch.
    @return: The word-count matrix corresponding to the batch_index.
    """
    if first_layer:
        return DataPreparation.data_processing.get_bag_of_words_matrix(self.batches[batch_index])
    return array(s.load(open(env_paths.get_rbm_output_path(self.num_vis, batch_index, self.layer_index - 1), "rb")))
def get_all_class_indices(training=True):
    """
    Get all class indices for all batches in one list.

    @param training: is this the training set or the test set.
    """
    batches = get_batch_list(training)
    indices_collected = []
    for batch in batches:
        indices_collected += list(s.load(open(env_paths.get_class_indices_path(training, int(batch)), "rb")))
    return indices_collected
def handle_load():
    '''
    Tries to load, telling the user if the savefile does not exist.

    :returns: tuple (dmap, player) - deserialized DungeonMap and Player,
              or None if the load was unsuccessful
    '''
    if not save_exists():
        dlog.debug('Tried to load when savefile does not exist')
        olog.info('You haven\'t saved it yet!')
        return None
    try:
        deserialized = load()
        dlog.debug('changed state to loaded:')
        olog.info('Loaded your game!\n')
        return deserialized
    except UnpicklingError:
        olog.info('Could not load the save, savefile corrupted')
        return None
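# Hedged usage sketch: unpack the result of handle_load, falling back to a new
# game when loading fails. new_game() is a hypothetical helper, not from the source.
def start():
    loaded = handle_load()
    if loaded is None:
        dmap, player = new_game()  # hypothetical fallback, assumed to exist
    else:
        dmap, player = loaded
    return dmap, player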
def load_large_batch(batch):
    return array(s.load(open(env_paths.get_dbn_large_batch_data_path(batch), 'rb')))
def load_large_batches_lst():
    return s.load(open(env_paths.get_dbn_batches_lst_path(), 'rb'))
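# Hedged usage sketch combining the two helpers above: iterate over the
# serialized batch list and yield each large-batch matrix in turn.
def iter_large_batches():
    for batch in load_large_batches_lst():
        yield load_large_batch(batch)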
def evaluate(
    _log,
    _run,
    max_length=None,
    artifacts_dir="artifacts",
    load_params="model.pth",
    word_emb_path="wiki.id.vec",
    device="cpu",
):
    """Evaluate a trained self-attention graph-based parser."""
    if max_length is None:
        max_length = {}
    artifacts_dir = Path(artifacts_dir)

    samples = {}
    try:
        samples["dev"] = list(read_samples(which="dev", max_length=max_length.get("dev")))
    except FileNotFoundError:
        _log.info("Dev set is not found, skipping")
    samples["test"] = list(read_samples(which="test", max_length=max_length.get("test")))
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh, n_toks)

    path = artifacts_dir / "vocab.yml"
    _log.info("Loading source vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab.keys():
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocab with target words")
    old_n_words = len(vocab["words"])
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = artifacts_dir / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = artifacts_dir / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    if len(vocab["words"]) > old_n_words:
        _log.info("Creating extended word embedding layer")
        if word_emb_path:
            kv = KeyedVectors.load_word2vec_format(word_emb_path)
            assert model.word_emb.embedding_dim == kv.vector_size
        else:
            _log.warning(
                "Word embedding file not specified; any extra target words will be treated as unks"
            )
            kv = None
        with torch.no_grad():
            model.word_emb = torch.nn.Embedding.from_pretrained(
                extend_word_embedding(
                    model.word_emb.weight,
                    vocab["words"],
                    kv,
                    vocab["words"].index(vocab.UNK_TOKEN),
                ))
    model.to(device)

    dev_accs = {}
    for wh in samples:
        _log.info("Evaluating on %s", wh)
        state = run_eval(model, vocab, samples[wh])
        accs = state["counts"].accs
        if wh == "dev":
            dev_accs = accs
        print_accs(accs, on=wh, run=_run)

        if "type2counts" in state:
            _log.info("Type-wise accuracies:")
            for type_, c in state["type2counts"].items():
                for key, acc in c.accs.items():
                    metric_name = f"{wh}_{type_}_{key}"
                    _log.info(f"{metric_name}: {acc:.2%}")
                    _run.log_scalar(metric_name, acc)
                for suffix in ("", "_nopunct"):
                    metric_name = f"{wh}_{type_}_n_arcs{suffix}"
                    _log.info("%s: %d", metric_name, getattr(c, f"n_arcs{suffix}"))
                    _run.log_scalar(metric_name, getattr(c, f"n_arcs{suffix}"))

    return dev_accs.get("las_nopunct")
def get_all_class_names():
    """
    Get all class names for the training set.
    """
    return s.load(open(env_paths.get_class_names_path(train=True), "rb"))
def __read_docs_from_filesystem(self):
    """
    Read all docs and assign them to batches, so that each doc category is represented equally across batches.
    """
    docs_names = []
    docs_names_split = []
    class_indices = []
    class_indices_split = []
    class_names = []
    batches = []
    print "Generating class indices and docs names list."
    doc_count = 0
    for folder in self.paths:
        docs_names_split.append([])
        class_indices_split.append([])
        class_names.append(folder.split("/")[len(folder.split("/")) - 1])
        if self.trainingset_size == None:
            # If data processing should be done on all data in the specified folders.
            docs = os.listdir(folder)
        elif not self.trainingset_size == None and self.trainingset_attributes == None:
            # If data processing should be done on parts of the docs in the specified folders - for training and testing purposes.
            docs = os.listdir(folder)[:int(len(os.listdir(folder)) * self.trainingset_size)]
        else:
            # If data processing should be done on a test set.
            docs = os.listdir(folder)[int(len(os.listdir(folder)) * self.trainingset_size):]
        for doc in docs:
            if doc.endswith(".p"):
                # Append the name of the document to the list containing document names.
                docs_names_split[-1].append(folder + "/" + doc)
                class_indices_split[-1].append(len(class_names) - 1)
                doc_count += 1
    if len(docs_names_split) == 0:  # Check if docs have been stemmed.
        print "Documents have not been stemmed. Please stem documents in order to create bag of words matrices."
        return 0

    # Ensure that batches contain an even amount of docs from each category.
    print "Arranging the documents."
    if doc_count < self.batchsize:
        print "Number of documents must be greater than batchsize. Please revise the batchsize."
        return 0
    number_of_batches = doc_count / self.batchsize
    number_of_classes = len(self.paths)
    batches_collected_class_indices = []
    batches_collected_docs_names = []
    # Calculate fraction of category in each batch.
    d = {}
    for i in range(len(class_indices_split)):
        d[i] = float(len(class_indices_split[i])) / number_of_batches
    count = 0
    for i in range(number_of_batches):
        batch_class_indices = []
        batch_docs_names = []
        d_tmp = array([int(v) for v in d.values()])
        while True:
            if (len(batch_class_indices) == self.batchsize) and (not doc_count - count < self.batchsize) or (count == doc_count):
                break
            if len(d_tmp[d_tmp > 0]) == 0:
                break
            for j in range(number_of_classes):
                if (len(batch_class_indices) == self.batchsize) and (not doc_count - count < self.batchsize) or (count == doc_count):
                    break
                if len(class_indices_split[j]) > 0 and d_tmp[j] != 0:
                    batch_class_indices.append(class_indices_split[j].pop(0))
                    batch_docs_names.append(docs_names_split[j].pop(0))
                    d_tmp[j] -= 1
                    count += 1
        batches_collected_class_indices.append(batch_class_indices)
        batches_collected_docs_names.append(batch_docs_names)

    for i in range(number_of_batches):
        bsize = self.batchsize if i < number_of_batches - 1 else self.batchsize + (doc_count % self.batchsize)
        batch_class_indices = batches_collected_class_indices[i]
        batch_docs_names = batches_collected_docs_names[i]
        if len(batch_class_indices) < bsize:
            while True:
                if len(batch_class_indices) == bsize:
                    break
                for j in range(number_of_classes):
                    if len(batch_class_indices) == bsize:
                        break
                    if len(class_indices_split[j]) > 0:
                        batch_class_indices.append(class_indices_split[j].pop(0))
                        batch_docs_names.append(docs_names_split[j].pop(0))
        # Shuffle the batch.
        batch_class_indices_shuf = []
        batch_docs_names_shuf = []
        index_shuf = range(len(batch_class_indices))
        shuffle(index_shuf)
        for k in index_shuf:
            batch_class_indices_shuf.append(batch_class_indices[k])
            batch_docs_names_shuf.append(batch_docs_names[k])
        # Append batch to full lists.
        class_indices += batch_class_indices_shuf
        docs_names += batch_docs_names_shuf

    print "Reading and saving docs from file system"
    count = 0
    class_indices_batch = []
    docs_names_batch = []
    docs_list = []
    for i in xrange(len(class_indices)):
        if not count == 0 and (count % self.batchsize) == 0:
            # Save the batch if batchsize is reached or if the last document has been read.
            if not (len(class_indices) - count) < self.batchsize:
                print "Read ", str(count), " of ", len(class_indices)
                self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
                batches.append(count)
                # Reset the lists.
                docs_list = []
                docs_names_batch = []
                class_indices_batch = []
        d = s.load(open(docs_names[i], "rb"))
        docs_list.append(d)
        docs_names_batch.append(docs_names[i])
        class_indices_batch.append(class_indices[i])
        count += 1
    # Save the remaining docs.
    if len(docs_list) > 0:
        print "Read ", str(count), " of ", len(class_indices)
        self.__save_batch_loading_docs(count, docs_list, docs_names_batch, class_indices_batch)
        batches.append(count)

    s.dump(class_names, open(env_paths.get_class_names_path(self.training), "wb"))
    s.dump(batches, open(env_paths.get_batches_path(self.training), "wb"))
    return 1
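# A minimal sketch of the balancing idea above, assuming roughly equal-sized
# classes: interleave documents class by class so that every batch sees each
# category. Illustrative only; the function above additionally handles uneven
# class sizes and a larger final batch.
def interleave_round_robin(per_class_docs, batchsize):
    batches, current = [], []
    pools = [list(docs) for docs in per_class_docs]
    while any(pools):
        for pool in pools:
            if pool:
                current.append(pool.pop(0))
            if len(current) == batchsize:
                batches.append(current)
                current = []
    if current:
        batches.append(current)
    return batches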
def __init__(self, parent):
    sz = int(config.options['window']['width']), int(config.options['window']['height'])
    wx.Frame.__init__(self, parent, -1,
                      "dcLord (%s): Divide & Conquer client (www.the-game.ru)" % (version.getVersion(),),
                      style=wx.DEFAULT_FRAME_STYLE | wx.NO_FULL_REPAINT_ON_RESIZE,
                      size=sz)
    if int(config.options['window']['is_maximized']) == 1:
        self.Maximize()

    #import_raw.processAllUnpacked()
    #self.map.turn = db.db.max_turn

    self.log_dlg = wx.TextCtrl(self, 1, style=wx.TE_MULTILINE)
    self.log_dlg.Disable()
    self.log_dlg.SetBackgroundColour('WHITE')

    serialization.load(ev_cb=self)

    self.info_panel = planet_window.InfoPanel(self)
    self.object_filter = object_filter.FilterPanel(self)
    self.planet_filter = object_filter.FilterFrame(self)
    #self.unit_list = unit_list.UnitPrototypeListWindow(self, 0)
    self.history = history.HistoryPanel(self)
    #self.area_list = area_panel.AreaListWindow(self)

    self.sync_path = config.options['data']['sync_path']
    self.info_panel.turn = db.getTurn()
    print 'db max turn is %s' % (db.getTurn(),)

    self.map = map.Map(self)
    self.map.turn = db.getTurn()
    self.map.set_planet_filter(self.planet_filter)
    print 'map turn is set to %s' % (self.map.turn,)
    self.map.update()

    self.started = False
    self.actions_queue = []
    self.pf = None

    if self.map.turn != 0:
        self.log('loaded data for turn %d' % (self.map.turn,))

    self.pending_actions = request.RequestMaker()
    self._mgr = wx.aui.AuiManager(self)
    self.command_selected_user = False

    info = wx.aui.AuiPaneInfo()
    info.CenterPane()
    info.Fixed()
    info.DefaultPane()
    info.Resizable(True)

    self._mgr.AddPane(self.map, info)
    self._mgr.AddPane(self.history, wx.RIGHT, "Turn")
    self._mgr.AddPane(self.info_panel, wx.RIGHT, "Info")
    self._mgr.AddPane(self.planet_filter, wx.LEFT, "Planets")
    self._mgr.AddPane(self.object_filter, wx.LEFT, "Filter")
    #self._mgr.AddPane(self.unit_list, wx.RIGHT, "Units")
    self._mgr.AddPane(self.log_dlg, wx.BOTTOM, "Log")
    #self._mgr.AddPane(self.area_list, wx.RIGHT, "Areas")
    #self.map.set_planet_fileter(self.planet_filter)
    self._mgr.Update()

    #TODO: load from data
    self.manual_control_units = set()  # unit id
    self.manual_control_units.add(7906)
    self.manual_control_units.add(7291)  # probes over Othes planets

    #TODO: load from file
    self.exclude_fleet_names = []  # busy, taken, etc...

    #p = config.options['window']['pane-info']
    #if p:
    #    print 'load p %s' % (p,)
    #    self._mgr.LoadPerspective(p)

    self.recv_data_callback = {}
    self.makeMenu()

    self.Bind(event.EVT_DATA_DOWNLOAD, self.onDownloadRawData)
    self.Bind(event.EVT_MAP_UPDATE, self.onMapUpdate)
    self.Bind(event.EVT_USER_SELECT, self.onSelectUser)
    self.Bind(event.EVT_ACTIONS_REPLY, self.onActionsReply)
    self.Bind(event.EVT_SELECT_OBJECT, self.info_panel.selectObject)
    self.Bind(event.EVT_TURN_SELECTED, self.onTurnSelected)
    self.Bind(event.EVT_LOG_APPEND, self.onLog)

    #import_raw.processAllUnpacked()
    #serialization.save()
    #todo - restore previous state
    #self.Maximize()
    self.history.updateTurns(self.map.turn)
def main():  # noqa: C901
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", help="environment ID", type=str, default="Walker2d-v2")
    parser.add_argument("-f", "--folder", help="Log folder", type=str, default="logs")
    parser.add_argument("--algo", help="RL Algorithm", default="ppo", type=str, required=False, choices=list(ALGOS.keys()))
    parser.add_argument("-n", "--n-timesteps", help="number of timesteps", default=1000, type=int)
    parser.add_argument("--num-threads", help="Number of threads for PyTorch (-1 to use default)", default=-1, type=int)
    parser.add_argument("--n-envs", help="number of environments", default=1, type=int)
    parser.add_argument("--exp-id", help="Experiment ID (default: 0: latest, -1: no exp folder)", default=0, type=int)
    parser.add_argument("--verbose", help="Verbose mode (0: no output, 1: INFO)", default=1, type=int)
    parser.add_argument(
        "--no-render", action="store_true", default=False, help="Do not render the environment (useful for tests)"
    )
    parser.add_argument("--deterministic", action="store_true", default=False, help="Use deterministic actions")
    parser.add_argument(
        "--load-best", action="store_true", default=False, help="Load best model instead of last model if available"
    )
    parser.add_argument(
        "--load-checkpoint",
        type=int,
        help="Load checkpoint instead of last model if available, "
        "you must pass the number of timesteps corresponding to it",
    )
    parser.add_argument("--stochastic", action="store_true", default=False, help="Use stochastic actions")
    parser.add_argument(
        "--norm-reward", action="store_true", default=False, help="Normalize reward if applicable (trained with VecNormalize)"
    )
    parser.add_argument("--seed", help="Random generator seed", type=int, default=0)
    parser.add_argument("--reward-log", help="Where to log reward", default="", type=str)
    parser.add_argument(
        "--gym-packages",
        type=str,
        nargs="+",
        default=[],
        help="Additional external Gym environment package modules to import (e.g. gym_minigrid)",
    )
    parser.add_argument(
        "--env-kwargs", type=str, nargs="+", action=StoreDict, help="Optional keyword argument to pass to the env constructor"
    )
    args = parser.parse_args()

    # Go through custom gym packages to let them register in the global registry
    for env_module in args.gym_packages:
        importlib.import_module(env_module)

    env_id = args.env
    algo = args.algo
    folder = args.folder

    if args.exp_id == 0:
        args.exp_id = get_latest_run_id(os.path.join(folder, algo), env_id)
        print(f"Loading latest experiment, id={args.exp_id}")

    # Sanity checks
    if args.exp_id > 0:
        log_path = os.path.join(folder, algo, f"{env_id}_{args.exp_id}")
    else:
        log_path = os.path.join(folder, algo)

    assert os.path.isdir(log_path), f"The {log_path} folder was not found"

    found = False
    for ext in ["zip"]:
        model_path = os.path.join(log_path, f"{env_id}.{ext}")
        found = os.path.isfile(model_path)
        if found:
            break

    if args.load_best:
        model_path = os.path.join(log_path, "best_model.zip")
        found = os.path.isfile(model_path)

    if args.load_checkpoint is not None:
        model_path = os.path.join(log_path, f"rl_model_{args.load_checkpoint}_steps.zip")
        found = os.path.isfile(model_path)

    if not found:
        raise ValueError(f"No model found for {algo} on {env_id}, path: {model_path}")

    off_policy_algos = ["qrdqn", "dqn", "ddpg", "sac", "her", "td3", "tqc"]

    if algo in off_policy_algos:
        args.n_envs = 1

    set_random_seed(args.seed)

    if args.num_threads > 0:
        if args.verbose > 1:
            print(f"Setting torch.num_threads to {args.num_threads}")
        th.set_num_threads(args.num_threads)

    is_atari = ExperimentManager.is_atari(env_id)

    stats_path = os.path.join(log_path, env_id)
    hyperparams, stats_path = get_saved_hyperparams(stats_path, norm_reward=args.norm_reward, test_mode=True)

    # Load env_kwargs if existing
    env_kwargs = {}
    args_path = os.path.join(log_path, env_id, "args.yml")
    if os.path.isfile(args_path):
        with open(args_path, "r") as f:
            loaded_args = yaml.load(f, Loader=yaml.UnsafeLoader)  # pytype: disable=module-attr
            if loaded_args["env_kwargs"] is not None:
                env_kwargs = loaded_args["env_kwargs"]
    # Overwrite with command line arguments
    if args.env_kwargs is not None:
        env_kwargs.update(args.env_kwargs)

    log_dir = args.reward_log if args.reward_log != "" else None

    env = create_test_env(
        env_id,
        n_envs=args.n_envs,
        stats_path=stats_path,
        seed=args.seed,
        log_dir=log_dir,
        should_render=not args.no_render,
        hyperparams=hyperparams,
        env_kwargs=env_kwargs,
    )

    kwargs = dict(seed=args.seed)
    if algo in off_policy_algos:
        # Dummy buffer size as we don't need memory to enjoy the trained agent
        kwargs.update(dict(buffer_size=1))

    # Check if we are running python 3.8+
    # we need to patch saved model under python 3.6/3.7 to load them
    newer_python_version = sys.version_info.major == 3 and sys.version_info.minor >= 8

    custom_objects = {}
    if newer_python_version:
        custom_objects = {
            "learning_rate": 0.0,
            "lr_schedule": lambda _: 0.0,
            "clip_range": lambda _: 0.0,
        }

    model = load(ALGOS[algo], model_path, env=env, custom_objects=custom_objects, **kwargs)

    obs = env.reset()

    # Deterministic by default except for atari games
    stochastic = args.stochastic or (is_atari and not args.deterministic)
    deterministic = not stochastic

    state = None
    episode_reward = 0.0
    episode_rewards, episode_lengths = [], []
    ep_len = 0
    # For HER, monitor success rate
    successes = []
    try:
        for _ in range(args.n_timesteps):
            action, state = model.predict(obs, state=state, deterministic=deterministic)
            obs, reward, done, infos = env.step(action)
            if not args.no_render:
                env.render("human")

            episode_reward += reward[0]
            ep_len += 1

            if args.n_envs == 1:
                # For atari the return reward is not the atari score
                # so we have to get it from the infos dict
                if is_atari and infos is not None and args.verbose >= 1:
                    episode_infos = infos[0].get("episode")
                    if episode_infos is not None:
                        print(f"Atari Episode Score: {episode_infos['r']:.2f}")
                        print("Atari Episode Length", episode_infos["l"])

                if done and not is_atari and args.verbose > 0:
                    # NOTE: for env using VecNormalize, the mean reward
                    # is a normalized reward when `--norm_reward` flag is passed
                    print(f"Episode Reward: {episode_reward:.2f}")
                    print("Episode Length", ep_len)
                    episode_rewards.append(episode_reward)
                    episode_lengths.append(ep_len)
                    episode_reward = 0.0
                    ep_len = 0
                    state = None

                # Reset also when the goal is achieved when using HER
                if done and infos[0].get("is_success") is not None:
                    if args.verbose > 1:
                        print("Success?", infos[0].get("is_success", False))

                    if infos[0].get("is_success") is not None:
                        successes.append(infos[0].get("is_success", False))
                        episode_reward, ep_len = 0.0, 0
    except KeyboardInterrupt:
        pass

    if args.verbose > 0 and len(successes) > 0:
        print(f"Success rate: {100 * np.mean(successes):.2f}%")

    if args.verbose > 0 and len(episode_rewards) > 0:
        print(f"{len(episode_rewards)} Episodes")
        print(f"Mean reward: {np.mean(episode_rewards):.2f} +/- {np.std(episode_rewards):.2f}")

    if args.verbose > 0 and len(episode_lengths) > 0:
        print(f"Mean episode length: {np.mean(episode_lengths):.2f} +/- {np.std(episode_lengths):.2f}")

    # Workaround for https://github.com/openai/gym/issues/893
    if not args.no_render:
        if args.n_envs == 1 and "Bullet" not in env_id and not is_atari and isinstance(env, VecEnv):
            # DummyVecEnv
            # Unwrap env
            while isinstance(env, VecEnvWrapper):
                env = env.venv
            if isinstance(env, DummyVecEnv):
                env.envs[0].env.close()
            else:
                env.close()
        else:
            # SubprocVecEnv
            env.close()
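# Hedged usage note for the script above: with the defaults from the argument
# parser, an invocation might look like
#   python enjoy.py --algo ppo --env Walker2d-v2 -f logs -n 1000
# The script filename is an assumption; the flags come from the parser above.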
def finetune(
    _log,
    _run,
    _rnd,
    max_length=None,
    artifacts_dir="ft_artifacts",
    overwrite=False,
    load_from="artifacts",
    load_params="model.pth",
    device="cpu",
    word_emb_path="wiki.id.vec",
    freeze=False,
    projective=False,
    multiroot=True,
    batch_size=32,
    lr=1e-5,
    l2_coef=1.0,
    max_epoch=5,
):
    """Finetune a trained model with self-training."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    samples = {
        wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
        for wh in ["train", "dev", "test"]
    }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh, n_toks)

    path = Path(load_from) / "vocab.yml"
    _log.info("Loading vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab:
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocabulary with target words")
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = Path(load_from) / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = Path(load_from) / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    _log.info("Creating extended word embedding layer")
    kv = KeyedVectors.load_word2vec_format(word_emb_path)
    assert model.word_emb.embedding_dim == kv.vector_size
    with torch.no_grad():
        model.word_emb = torch.nn.Embedding.from_pretrained(
            extend_word_embedding(model.word_emb.weight, vocab["words"], kv))

    path = artifacts_dir / "model.yml"
    _log.info("Saving model metadata to %s", path)
    path.write_text(dump(model), encoding="utf8")

    model.word_emb.requires_grad_(not freeze)
    model.tag_emb.requires_grad_(not freeze)
    model.to(device)

    for wh in ["train"]:
        for i, s in enumerate(samples[wh]):
            s["_id"] = i

        runner = Runner()
        runner.state.update({"st_heads": [], "st_types": [], "_ids": []})
        runner.on(
            Event.BATCH,
            [
                batch2tensors(device, vocab),
                set_train_mode(model, training=False),
                compute_total_arc_type_scores(model, vocab),
                predict_batch(projective, multiroot),
            ],
        )

        @runner.on(Event.BATCH)
        def save_st_trees(state):
            state["st_heads"].extend(state["pred_heads"].tolist())
            state["st_types"].extend(state["pred_types"].tolist())
            state["_ids"].extend(state["batch"]["_id"].tolist())
            state["n_items"] = state["batch"]["words"].numel()

        n_toks = sum(len(s["words"]) for s in samples[wh])
        ProgressBar(total=n_toks, unit="tok").attach_on(runner)

        _log.info("Computing ST trees for %s set", wh)
        with torch.no_grad():
            runner.run(BucketIterator(samples[wh], lambda s: len(s["words"]), batch_size))

        assert len(runner.state["st_heads"]) == len(samples[wh])
        assert len(runner.state["st_types"]) == len(samples[wh])
        assert len(runner.state["_ids"]) == len(samples[wh])
        for i, st_heads, st_types in zip(runner.state["_ids"], runner.state["st_heads"],
                                         runner.state["st_types"]):
            assert len(samples[wh][i]["words"]) == len(st_heads)
            assert len(samples[wh][i]["words"]) == len(st_types)
            samples[wh][i]["st_heads"] = st_heads
            samples[wh][i]["st_types"] = st_types

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    finetuner = Runner()
    origin_params = {name: p.clone().detach() for name, p in model.named_parameters()}
    finetuner.on(
        Event.BATCH,
        [
            batch2tensors(device, vocab),
            set_train_mode(model),
            compute_l2_loss(model, origin_params),
        ],
    )

    @finetuner.on(Event.BATCH)
    def compute_loss(state):
        bat = state["batch"]
        words, tags, heads, types = bat["words"], bat["tags"], bat["st_heads"], bat["st_types"]
        mask = bat["mask"]

        arc_scores, type_scores = model(words, tags, mask, heads)
        arc_scores = arc_scores.masked_fill(~mask.unsqueeze(2), -1e9)  # mask padding heads
        type_scores[..., vocab["types"].index(vocab.PAD_TOKEN)] = -1e9

        # remove root
        arc_scores, type_scores = arc_scores[:, :, 1:], type_scores[:, 1:]
        heads, types, mask = heads[:, 1:], types[:, 1:], mask[:, 1:]

        arc_scores = rearrange(arc_scores, "bsz slen1 slen2 -> (bsz slen2) slen1")
        heads = heads.reshape(-1)
        arc_loss = torch.nn.functional.cross_entropy(arc_scores, heads, reduction="none")

        type_scores = rearrange(type_scores, "bsz slen ntypes -> (bsz slen) ntypes")
        types = types.reshape(-1)
        type_loss = torch.nn.functional.cross_entropy(type_scores, types, reduction="none")

        arc_loss = arc_loss.masked_select(mask.reshape(-1)).mean()
        type_loss = type_loss.masked_select(mask.reshape(-1)).mean()
        loss = arc_loss + type_loss + l2_coef * state["l2_loss"]

        state["loss"] = loss
        state["stats"] = {
            "arc_ppl": arc_loss.exp().item(),
            "type_ppl": type_loss.exp().item(),
            "l2_loss": state["l2_loss"].item(),
        }
        state["extra_stats"] = {"arc_loss": arc_loss.item(), "type_loss": type_loss.item()}

    finetuner.on(
        Event.BATCH,
        [get_n_items(), update_params(opt), log_grads(_run, model), log_stats(_run)],
    )

    @finetuner.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])
        state["dev_accs"] = accs

    @finetuner.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return
        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"])
        print_accs(eval_state["counts"].accs, on="test", run=_run, step=state["n_iters"])

    finetuner.on(Event.EPOCH_FINISHED, save_state_dict("model", model, under=artifacts_dir))
    EpochTimer().attach_on(finetuner)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(finetuner)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"], bucket_key, batch_size, shuffle_bucket=True, rng=_rnd),
        rng=_rnd,
    )

    _log.info("Starting finetuning")
    try:
        finetuner.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return finetuner.state["dev_accs"]["las_nopunct"]
def finetune(
    corpus,
    _log,
    _run,
    _rnd,
    max_length=None,
    artifacts_dir="ft_artifacts",
    load_samples_from=None,
    overwrite=False,
    load_src=None,
    src_key_as_lang=False,
    main_src=None,
    device="cpu",
    word_emb_path="wiki.id.vec",
    freeze=False,
    thresh=0.95,
    projective=False,
    multiroot=True,
    batch_size=32,
    save_samples=False,
    lr=1e-5,
    l2_coef=1.0,
    max_epoch=5,
):
    """Finetune a trained model with PPTX."""
    if max_length is None:
        max_length = {}
    if load_src is None:
        load_src = {"src": ("artifacts", "model.pth")}
        main_src = "src"
    elif main_src not in load_src:
        raise ValueError(f"{main_src} not found in load_src")

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    if load_samples_from:
        _log.info("Loading samples from %s", load_samples_from)
        with open(load_samples_from, "rb") as f:
            samples = pickle.load(f)
    else:
        samples = {
            wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
            for wh in ["train", "dev", "test"]
        }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh, n_toks)

    kv = KeyedVectors.load_word2vec_format(word_emb_path)

    if load_samples_from:
        _log.info("Skipping non-main src because samples are processed and loaded")
        srcs = []
    else:
        srcs = [src for src in load_src if src != main_src]
        if src_key_as_lang and corpus["lang"] in srcs:
            _log.info("Removing %s from src parsers because it's the tgt", corpus["lang"])
            srcs.remove(corpus["lang"])
    srcs.append(main_src)

    for src_i, src in enumerate(srcs):
        _log.info("Processing src %s [%d/%d]", src, src_i + 1, len(srcs))
        load_from, load_params = load_src[src]

        path = Path(load_from) / "vocab.yml"
        _log.info("Loading %s vocabulary from %s", src, path)
        vocab = load(path.read_text(encoding="utf8"))
        for name in vocab:
            _log.info("Found %d %s", len(vocab[name]), name)

        _log.info("Extending %s vocabulary with target words", src)
        vocab.extend(chain(*samples.values()), ["words"])
        _log.info("Found %d words now", len(vocab["words"]))

        samples_ = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

        path = Path(load_from) / "model.yml"
        _log.info("Loading %s model from metadata %s", src, path)
        model = load(path.read_text(encoding="utf8"))

        path = Path(load_from) / load_params
        _log.info("Loading %s model parameters from %s", src, path)
        model.load_state_dict(torch.load(path, "cpu"))

        _log.info("Creating %s extended word embedding layer", src)
        assert model.word_emb.embedding_dim == kv.vector_size
        with torch.no_grad():
            model.word_emb = torch.nn.Embedding.from_pretrained(
                extend_word_embedding(model.word_emb.weight, vocab["words"], kv))
        model.to(device)

        for wh in ["train", "dev"]:
            if load_samples_from:
                assert all("pptx_mask" in s for s in samples[wh])
                continue

            for i, s in enumerate(samples_[wh]):
                s["_id"] = i

            runner = Runner()
            runner.state.update({"pptx_masks": [], "_ids": []})
            runner.on(
                Event.BATCH,
                [
                    batch2tensors(device, vocab),
                    set_train_mode(model, training=False),
                    compute_total_arc_type_scores(model, vocab),
                ],
            )

            @runner.on(Event.BATCH)
            def compute_pptx_ambiguous_arcs_mask(state):
                assert state["batch"]["mask"].all()
                scores = state["total_arc_type_scores"]
                pptx_mask = compute_ambiguous_arcs_mask(scores, thresh, projective, multiroot)
                state["pptx_masks"].extend(pptx_mask)
                state["_ids"].extend(state["batch"]["_id"].tolist())
                state["n_items"] = state["batch"]["words"].numel()

            n_toks = sum(len(s["words"]) for s in samples_[wh])
            ProgressBar(total=n_toks, unit="tok").attach_on(runner)

            _log.info("Computing PPTX ambiguous arcs mask for %s set with source %s", wh, src)
            with torch.no_grad():
                runner.run(BucketIterator(samples_[wh], lambda s: len(s["words"]), batch_size))

            assert len(runner.state["pptx_masks"]) == len(samples_[wh])
            assert len(runner.state["_ids"]) == len(samples_[wh])
            for i, pptx_mask in zip(runner.state["_ids"], runner.state["pptx_masks"]):
                samples_[wh][i]["pptx_mask"] = pptx_mask.tolist()

            _log.info("Computing (log) number of trees stats on %s set", wh)
            report_log_ntrees_stats(samples_[wh], "pptx_mask", batch_size, projective, multiroot)

            _log.info("Combining the ambiguous arcs mask")
            assert len(samples_[wh]) == len(samples[wh])
            for i in range(len(samples_[wh])):
                pptx_mask = torch.tensor(samples_[wh][i]["pptx_mask"])
                assert pptx_mask.dim() == 3
                if "pptx_mask" in samples[wh][i]:
                    old_mask = torch.tensor(samples[wh][i]["pptx_mask"])
                else:
                    old_mask = torch.zeros(1, 1, 1).bool()
                samples[wh][i]["pptx_mask"] = (old_mask | pptx_mask).tolist()

    assert src == main_src
    _log.info("Main source is %s", src)

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    path = artifacts_dir / "model.yml"
    _log.info("Saving model metadata to %s", path)
    path.write_text(dump(model), encoding="utf8")

    if save_samples:
        path = artifacts_dir / "samples.pkl"
        _log.info("Saving samples to %s", path)
        with open(path, "wb") as f:
            pickle.dump(samples, f)

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}
    for wh in ["train", "dev"]:
        _log.info("Computing (log) number of trees stats on %s set", wh)
        report_log_ntrees_stats(samples[wh], "pptx_mask", batch_size, projective, multiroot)

    model.word_emb.requires_grad_(not freeze)
    model.tag_emb.requires_grad_(not freeze)

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    finetuner = Runner()
    origin_params = {name: p.clone().detach() for name, p in model.named_parameters()}
    finetuner.on(
        Event.BATCH,
        [
            batch2tensors(device, vocab),
            set_train_mode(model),
            compute_l2_loss(model, origin_params),
            compute_total_arc_type_scores(model, vocab),
        ],
    )

    @finetuner.on(Event.BATCH)
    def compute_loss(state):
        mask = state["batch"]["mask"]
        pptx_mask = state["batch"]["pptx_mask"].bool()
        scores = state["total_arc_type_scores"]
        pptx_loss = compute_aatrn_loss(scores, pptx_mask, mask, projective, multiroot)
        pptx_loss /= mask.size(0)
        loss = pptx_loss + l2_coef * state["l2_loss"]

        state["loss"] = loss
        state["stats"] = {
            "pptx_loss": pptx_loss.item(),
            "l2_loss": state["l2_loss"].item(),
        }
        state["extra_stats"] = {"loss": loss.item()}
        state["n_items"] = mask.long().sum().item()

    finetuner.on(Event.BATCH, [update_params(opt), log_grads(_run, model), log_stats(_run)])

    @finetuner.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])
        pptx_loss = eval_state["mean_pptx_loss"]
        _log.info("dev_pptx_loss: %.4f", pptx_loss)
        _run.log_scalar("dev_pptx_loss", pptx_loss, step=state["n_iters"])
        state["dev_accs"] = accs

    @finetuner.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return
        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"], compute_loss=False)
        print_accs(eval_state["counts"].accs, on="test", run=_run, step=state["n_iters"])

    finetuner.on(Event.EPOCH_FINISHED, save_state_dict("model", model, under=artifacts_dir))
    EpochTimer().attach_on(finetuner)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(finetuner)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"], bucket_key, batch_size, shuffle_bucket=True, rng=_rnd),
        rng=_rnd,
    )

    _log.info("Starting finetuning")
    try:
        finetuner.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return finetuner.state["dev_accs"]["las_nopunct"]
def finetune(
    _log,
    _run,
    _rnd,
    max_length=None,
    artifacts_dir="ft_artifacts",
    overwrite=False,
    load_from="artifacts",
    load_params="model.pth",
    device="cpu",
    word_emb_path="wiki.id.vec",
    freeze=False,
    thresh=0.95,
    projective=False,
    multiroot=True,
    batch_size=32,
    lr=1e-5,
    l2_coef=1.0,
    max_epoch=5,
):
    """Finetune a trained model with PPT."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    samples = {
        wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
        for wh in ["train", "dev", "test"]
    }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh, n_toks)

    path = Path(load_from) / "vocab.yml"
    _log.info("Loading vocabulary from %s", path)
    vocab = load(path.read_text(encoding="utf8"))
    for name in vocab:
        _log.info("Found %d %s", len(vocab[name]), name)

    _log.info("Extending vocabulary with target words")
    vocab.extend(chain(*samples.values()), ["words"])
    _log.info("Found %d words now", len(vocab["words"]))

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    path = Path(load_from) / "model.yml"
    _log.info("Loading model from metadata %s", path)
    model = load(path.read_text(encoding="utf8"))

    path = Path(load_from) / load_params
    _log.info("Loading model parameters from %s", path)
    model.load_state_dict(torch.load(path, "cpu"))

    _log.info("Creating extended word embedding layer")
    kv = KeyedVectors.load_word2vec_format(word_emb_path)
    assert model.word_emb.embedding_dim == kv.vector_size
    with torch.no_grad():
        model.word_emb = torch.nn.Embedding.from_pretrained(
            extend_word_embedding(model.word_emb.weight, vocab["words"], kv))

    path = artifacts_dir / "model.yml"
    _log.info("Saving model metadata to %s", path)
    path.write_text(dump(model), encoding="utf8")

    model.word_emb.requires_grad_(not freeze)
    model.tag_emb.requires_grad_(not freeze)
    model.to(device)

    for wh in ["train", "dev"]:
        for i, s in enumerate(samples[wh]):
            s["_id"] = i

        runner = Runner()
        runner.state.update({"ppt_masks": [], "_ids": []})
        runner.on(
            Event.BATCH,
            [
                batch2tensors(device, vocab),
                set_train_mode(model, training=False),
                compute_total_arc_type_scores(model, vocab),
            ],
        )

        @runner.on(Event.BATCH)
        def compute_ppt_ambiguous_arcs_mask(state):
            assert state["batch"]["mask"].all()
            scores = state["total_arc_type_scores"]
            ppt_mask = compute_ambiguous_arcs_mask(scores, thresh, projective, multiroot)
            state["ppt_masks"].extend(ppt_mask.tolist())
            state["_ids"].extend(state["batch"]["_id"].tolist())
            state["n_items"] = state["batch"]["words"].numel()

        n_toks = sum(len(s["words"]) for s in samples[wh])
        ProgressBar(total=n_toks, unit="tok").attach_on(runner)

        _log.info("Computing PPT ambiguous arcs mask for %s set", wh)
        with torch.no_grad():
            runner.run(BucketIterator(samples[wh], lambda s: len(s["words"]), batch_size))

        assert len(runner.state["ppt_masks"]) == len(samples[wh])
        assert len(runner.state["_ids"]) == len(samples[wh])
        for i, ppt_mask in zip(runner.state["_ids"], runner.state["ppt_masks"]):
            samples[wh][i]["ppt_mask"] = ppt_mask

        _log.info("Computing (log) number of trees stats on %s set", wh)
        report_log_ntrees_stats(samples[wh], "ppt_mask", batch_size, projective, multiroot)

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    finetuner = Runner()
    origin_params = {name: p.clone().detach() for name, p in model.named_parameters()}
    finetuner.on(
        Event.BATCH,
        [
            batch2tensors(device, vocab),
            set_train_mode(model),
            compute_l2_loss(model, origin_params),
            compute_total_arc_type_scores(model, vocab),
        ],
    )

    @finetuner.on(Event.BATCH)
    def compute_loss(state):
        mask = state["batch"]["mask"]
        ppt_mask = state["batch"]["ppt_mask"].bool()
        scores = state["total_arc_type_scores"]
        ppt_loss = compute_aatrn_loss(scores, ppt_mask, mask, projective, multiroot)
        ppt_loss /= mask.size(0)
        loss = ppt_loss + l2_coef * state["l2_loss"]

        state["loss"] = loss
        state["stats"] = {
            "ppt_loss": ppt_loss.item(),
            "l2_loss": state["l2_loss"].item(),
        }
        state["extra_stats"] = {"loss": loss.item()}
        state["n_items"] = mask.long().sum().item()

    finetuner.on(Event.BATCH, [update_params(opt), log_grads(_run, model), log_stats(_run)])

    @finetuner.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])
        ppt_loss = eval_state["mean_ppt_loss"]
        _log.info("dev_ppt_loss: %.4f", ppt_loss)
        _run.log_scalar("dev_ppt_loss", ppt_loss, step=state["n_iters"])
        state["dev_accs"] = accs

    @finetuner.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if state["epoch"] != max_epoch:
            return
        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"], compute_loss=False)
        print_accs(eval_state["counts"].accs, on="test", run=_run, step=state["n_iters"])

    finetuner.on(Event.EPOCH_FINISHED, save_state_dict("model", model, under=artifacts_dir))
    EpochTimer().attach_on(finetuner)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(finetuner)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"], bucket_key, batch_size, shuffle_bucket=True, rng=_rnd),
        rng=_rnd,
    )

    _log.info("Starting finetuning")
    try:
        finetuner.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return finetuner.state["dev_accs"]["las_nopunct"]
def train(
    _log,
    _run,
    _rnd,
    artifacts_dir="artifacts",
    overwrite=False,
    max_length=None,
    load_types_vocab_from=None,
    batch_size=16,
    device="cpu",
    lr=0.001,
    patience=5,
    max_epoch=1000,
):
    """Train a self-attention graph-based parser."""
    if max_length is None:
        max_length = {}

    artifacts_dir = Path(artifacts_dir)
    _log.info("Creating artifacts directory %s", artifacts_dir)
    artifacts_dir.mkdir(exist_ok=overwrite)

    samples = {
        wh: list(read_samples(which=wh, max_length=max_length.get(wh)))
        for wh in ["train", "dev", "test"]
    }
    for wh in samples:
        n_toks = sum(len(s["words"]) for s in samples[wh])
        _log.info("Read %d %s samples and %d tokens", len(samples[wh]), wh, n_toks)

    _log.info("Creating vocabulary")
    vocab = Vocab.from_samples(chain(*samples.values()))
    if load_types_vocab_from:
        path = Path(load_types_vocab_from)
        _log.info("Loading types vocab from %s", path)
        vocab["types"] = load(path.read_text(encoding="utf8"))["types"]

    _log.info("Vocabulary created")
    for name in vocab:
        _log.info("Found %d %s", len(vocab[name]), name)

    path = artifacts_dir / "vocab.yml"
    _log.info("Saving vocabulary to %s", path)
    path.write_text(dump(vocab), encoding="utf8")

    samples = {wh: list(vocab.stoi(samples[wh])) for wh in samples}

    model = make_model(vocab)
    model.to(device)

    _log.info("Creating optimizer")
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode="max", factor=0.5)

    trainer = Runner()
    trainer.state.update({"dev_larcs_nopunct": -1, "dev_uarcs_nopunct": -1})
    trainer.on(Event.BATCH, [batch2tensors(device, vocab), set_train_mode(model)])

    @trainer.on(Event.BATCH)
    def compute_loss(state):
        bat = state["batch"]
        words, tags, heads, types = bat["words"], bat["tags"], bat["heads"], bat["types"]
        mask = bat["mask"]

        arc_scores, type_scores = model(words, tags, mask, heads)
        arc_scores = arc_scores.masked_fill(~mask.unsqueeze(2), -1e9)  # mask padding heads
        type_scores[..., vocab["types"].index(Vocab.PAD_TOKEN)] = -1e9

        # remove root
        arc_scores, type_scores = arc_scores[:, :, 1:], type_scores[:, 1:]
        heads, types, mask = heads[:, 1:], types[:, 1:], mask[:, 1:]

        arc_scores = rearrange(arc_scores, "bsz slen1 slen2 -> (bsz slen2) slen1")
        heads = heads.reshape(-1)
        arc_loss = torch.nn.functional.cross_entropy(arc_scores, heads, reduction="none")

        type_scores = rearrange(type_scores, "bsz slen ntypes -> (bsz slen) ntypes")
        types = types.reshape(-1)
        type_loss = torch.nn.functional.cross_entropy(type_scores, types, reduction="none")

        arc_loss = arc_loss.masked_select(mask.reshape(-1)).mean()
        type_loss = type_loss.masked_select(mask.reshape(-1)).mean()
        loss = arc_loss + type_loss

        state["loss"] = loss
        arc_loss, type_loss = arc_loss.item(), type_loss.item()
        state["stats"] = {
            "arc_ppl": math.exp(arc_loss),
            "type_ppl": math.exp(type_loss),
        }
        state["extra_stats"] = {"arc_loss": arc_loss, "type_loss": type_loss}
        state["n_items"] = bat["mask"].long().sum().item()

    trainer.on(Event.BATCH, [update_params(opt), log_grads(_run, model), log_stats(_run)])

    @trainer.on(Event.EPOCH_FINISHED)
    def eval_on_dev(state):
        _log.info("Evaluating on dev")
        eval_state = run_eval(model, vocab, samples["dev"])
        accs = eval_state["counts"].accs
        print_accs(accs, run=_run, step=state["n_iters"])
        scheduler.step(accs["las_nopunct"])

        if eval_state["counts"].larcs_nopunct > state["dev_larcs_nopunct"]:
            state["better"] = True
        elif eval_state["counts"].larcs_nopunct < state["dev_larcs_nopunct"]:
            state["better"] = False
        elif eval_state["counts"].uarcs_nopunct > state["dev_uarcs_nopunct"]:
            state["better"] = True
        else:
            state["better"] = False

        if state["better"]:
            _log.info("Found new best result on dev!")
            state["dev_larcs_nopunct"] = eval_state["counts"].larcs_nopunct
            state["dev_uarcs_nopunct"] = eval_state["counts"].uarcs_nopunct
            state["dev_accs"] = accs
            state["dev_epoch"] = state["epoch"]
        else:
            _log.info("Not better, the best so far is epoch %d:", state["dev_epoch"])
            print_accs(state["dev_accs"])
            print_accs(state["test_accs"], on="test")

    @trainer.on(Event.EPOCH_FINISHED)
    def maybe_eval_on_test(state):
        if not state["better"]:
            return
        _log.info("Evaluating on test")
        eval_state = run_eval(model, vocab, samples["test"])
        state["test_accs"] = eval_state["counts"].accs
        print_accs(state["test_accs"], on="test", run=_run, step=state["n_iters"])

    trainer.on(
        Event.EPOCH_FINISHED,
        [
            maybe_stop_early(patience=patience),
            save_state_dict("model", model, under=artifacts_dir, when="better"),
        ],
    )

    EpochTimer().attach_on(trainer)
    n_tokens = sum(len(s["words"]) for s in samples["train"])
    ProgressBar(stats="stats", total=n_tokens, unit="tok").attach_on(trainer)

    bucket_key = lambda s: (len(s["words"]) - 1) // 10
    trn_iter = ShuffleIterator(
        BucketIterator(samples["train"], bucket_key, batch_size, shuffle_bucket=True, rng=_rnd),
        rng=_rnd,
    )

    _log.info("Starting training")
    try:
        trainer.run(trn_iter, max_epoch)
    except KeyboardInterrupt:
        _log.info("Interrupt detected, training will abort")
    else:
        return trainer.state["dev_accs"]["las_nopunct"]