def train(self, iteration_count):
    """Train the model for `iteration_count` iterations, logging summaries to TensorBoard."""
    tensorboard_dir = 'tensorboard/dir'
    merged = tf.summary.merge_all()
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    writer = tf.summary.FileWriter(tensorboard_dir)

    dictionary, inverse_dictionary, dictionary_len = read_dictionary(
        self.__data_manager.dictionary_path)

    with self.__session.as_default():
        print('Training')
        for i in range(self.step, iteration_count + self.step):
            iter_loss = 0
            rs = 0
            for batch_y, batch_dt, batch_x in self.__data_manager.train_batches:
                op, loss_value, rs = self.__session.run(
                    [self.__optimizer, self.__cost, merged],
                    feed_dict={
                        self.__inputs: batch_x,
                        self.__seq_len: [self.__max_char_count] * self.__data_manager.batch_size,
                        self.__targets: batch_dt
                    })

                if i % 50 == 0:
                    # Decode a couple of samples to monitor the predictions.
                    for j in range(2):
                        [decoded, acc] = self.__session.run(
                            [self.__decoded, self.__acc],
                            feed_dict={
                                self.__inputs: batch_x,
                                self.__seq_len: [self.__max_char_count] * self.__data_manager.batch_size,
                                self.__targets: batch_dt
                            })
                        print(batch_y[j])
                        print(acc)
                        print(ground_truth_to_word(decoded[j], inverse_dictionary))

                iter_loss += loss_value

            # Write the summary of the last batch for this iteration.
            writer.add_summary(rs, i)

            if i % 500 == 0:
                self.__saver.save(self.__session,
                                  self.__save_path,
                                  global_step=self.step)

            print('[{}] Iteration loss: {}'.format(self.step, iter_loss))
            self.step += 1

        writer.add_graph(self.__session.graph)

    return None
def make_dict(paths: List[str], path_dict: str):
    """
    Write a dictionary from word to senses, from the given input files.

    :param paths: files to read.
    :param path_dict: where to save the dictionary.
    :return:
    """
    bnwn_map = utils.read_dictionary(const.BN2WN_MAP)
    word_synset_map = utils.compute_word_sysnet_map(paths, bnwn_map)
    utils.write_dictionary(path_dict, word_synset_map)
def main(use_cuda=False):
    print('Hello!')
    config = load_config()
    # `sents` is expected to be loaded elsewhere in the original script.
    dataset = read_corpus(sents)
    char_fpi = read_dictionary(is_char=True)
    word_fpi = read_dictionary(is_char=False)

    char_lexicon, char_emb_layer = get_lexicon(
        config['token_embedder']['char_dim'], char_fpi, use_cuda)
    word_lexicon, word_emb_layer = get_lexicon(
        config['token_embedder']['word_dim'], word_fpi, use_cuda)

    b_w, b_c, b_l, b_m, recover_ind = create_batches(
        dataset, word_lexicon, char_lexicon, config,
        batch_size=64, shuffle=True, use_cuda=use_cuda)

    token_embedder = ConvTokenEmbedder(config, word_emb_layer, char_emb_layer, use_cuda)
    for w, c, m in zip(b_w, b_c, b_m):
        token_embedding = token_embedder(w, c, m[0].size())
        print(token_embedding)
def semcor(path_input: str, path_output: str, map_path: str, label: bool):
    """
    Preprocess the SemCor dataset and write it to a single text file.

    :param path_input: SemCor path.
    :param path_output: where to save the preprocessed file.
    :param map_path: the path to the gold key dictionary.
    :param label: if True, parse the SemCor set as labels.
    :return:
    """
    # load maps
    semcor_map = utils.read_dictionary(map_path)

    with open(path_output, mode="w", encoding="utf8") as out:
        root = cElementTree.parse(path_input).getroot()
        for sentence in root.findall(".//sentence"):
            out.write(_extract_sentence(sentence, semcor_map, label) + "\n")
def evaluate_keras(model) -> None:
    """
    Evaluate the model, printing a classification report (accuracy, precision, recall and F1).

    :param model: the trained model.
    :return:
    """
    x_test, y_test = utils.load_datasets(config.CRISIS_EVAL_DIR,
                                         config.NORMAL_EVAL_DIR,
                                         limit=8000)
    x_test = preprocess.clear_tweets(x_test)

    vocab = utils.read_dictionary(config.TRAIN_VOCAB)
    x_test = preprocess.compute_x(x_test, vocab, max_len=100)[:, :, 0]

    y_pred = model.predict(x_test, batch_size=256, verbose=1)
    y_pred = [1 if y > 0.5 else 0 for y in y_pred]

    cr = classification_report(y_test, y_pred, target_names=["normal", "crisis"])
    print("Classification report:\n", cr)
def convert_synsets(lines: List, mode: str) -> List[List[str]]:
    """
    Convert every WordNet synset in the given lines to the chosen synset type.

    :param lines: list of token lists to convert.
    :param mode: type of synset:
                 bn: BabelNet synsets
                 dom: WordNet domains
                 lex: LexNames labels
    :return: the converted lines.
    """
    wn2bn_map = utils.read_dictionary(const.WN2BN_MAP)
    mode_map = utils.load_mode_map(mode)

    for line in lines:
        for i, word in enumerate(line):
            if "_wn:" in word:
                lemma, _, synset = word.rpartition("_")
                line[i] = _convert_synset(lemma, synset, mode, mode_map, wn2bn_map)
            elif mode == "lex" or mode == "dom":
                line[i] = "other"
    return lines
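# `_convert_synset` is defined elsewhere in the original project. Below is a
# hypothetical sketch of a helper with this call signature, assuming `wn2bn_map`
# maps WordNet synsets to BabelNet ids and `mode_map` maps BabelNet ids to
# coarse-grained labels; every detail here is an assumption, not the original code.
def _convert_synset(lemma, synset, mode, mode_map, wn2bn_map):
    bn_synset = wn2bn_map.get(synset)
    if bn_synset is None:
        # No mapping found: fall back to the bare lemma.
        return lemma
    if mode == "bn":
        return lemma + "_" + bn_synset
    # For "dom"/"lex", map the BabelNet id to its coarse-grained label.
    return mode_map.get(bn_synset, "other")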
def __load_data(self):
    """
    Load all the images in the folder.
    """
    print('Loading data')

    examples = []
    count = 0
    skipped = 0

    # for f in os.listdir(self.examples_picture_path):
    #     if len(f.split('_')[0]) > self.max_char_count:
    #         continue
    #     arr, initial_len = resize_image(
    #         os.path.join(self.examples_path, f),
    #         self.max_image_width
    #     )

    with open(self.examples_label_path, 'r') as f:  # path to target_label.txt
        for line in f.readlines():
            address = line.split("__")[0]
            label = line.split("__")[1]
            if len(label) > self.max_char_count:
                continue
            if list(label)[0] == '#':
                continue
            img = cv2.imread(address, cv2.IMREAD_GRAYSCALE)
            arr, initial_len = resize_image(img, self.max_image_width)
            dictionary, _, dictionary_len = read_dictionary(self.dictionary_path)
            examples.append((arr, label, label_to_array(label, dictionary)))
            count += 1

    dictionary_len = dictionary_len + 1  # +1, presumably to reserve an index for the CTC blank label
    return examples, len(examples), dictionary_len
import torch
import os
import models
import utils
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
from torch.autograd import Variable

transform = transforms.Compose([transforms.ToTensor()])
classes_mapper = utils.read_dictionary('output/classes.txt')
keys = list(classes_mapper.keys())


def image_deal(image):
    image = image.resize((100, 100))
    plt.imshow(image)
    plt.show()
    image = transform(image)
    image = Variable(torch.unsqueeze(image, dim=0).float(), requires_grad=False)
    return image


def get_state_dict(path):
    # `model` is expected to exist at module level; see the usage sketch below.
    state_dict = torch.load(path)
    model.load_state_dict(state_dict)
    return model
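# A minimal usage sketch tying the helpers above together. The constructor
# `models.Net()`, the checkpoint and image paths, and the assumption that the
# order of `keys` matches the class indices are all hypothetical, not taken
# from the original script.
if __name__ == '__main__':
    model = models.Net()                         # hypothetical constructor from the local `models` module
    model = get_state_dict('output/model.pth')   # hypothetical checkpoint path
    model.eval()

    image = Image.open('example.jpg')            # hypothetical input image
    x = image_deal(image)

    with torch.no_grad():
        output = model(x)
        pred = int(torch.argmax(output, dim=1))

    print('Predicted class:', keys[pred])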
def _predict(input_path: str, output_path: str, resources_path: str, task: int = None):
    """
    Wrapper function for all the prediction functions.

    :param input_path: the path of the input file to predict, in the same format as Raganato's framework.
    :param output_path: the path of the output file (where you save your predictions).
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param task: 0 for BN, 1 for DOM, 2 for LEX.
    :return:
    """
    print("Loading", input_path.split("/")[-1])
    sentences = load_test(input_path)

    # Load all the mapping files.
    word_index = utils.read_dictionary(
        os.path.join(resources_path, "vocabs/label_vocab_bn.txt"))
    word_index_dom = utils.read_dictionary(
        os.path.join(resources_path, "vocabs/label_vocab_dom.txt"))
    word_index_lex = utils.read_dictionary(
        os.path.join(resources_path, "vocabs/label_vocab_lex.txt"))
    outputs_size = [len(word_index), len(word_index_dom), len(word_index_lex)]

    lemma2syn = utils.read_dictionary(
        os.path.join(resources_path, "mapping/lemma2wordnet.txt"))
    wn2bn = utils.read_dictionary(
        os.path.join(resources_path, "mapping/wordnet2babelnet.txt"))

    bn2coarse = None
    coarse_index = None
    if task != 0:
        # if task != 0, DOM or LEX prediction.
        coarse_index = word_index_dom if task == 1 else word_index_lex
        coarse_path = (os.path.join(resources_path, "mapping/babelnet2wndomains.tsv")
                       if task == 1
                       else os.path.join(resources_path, "mapping/babelnet2lexnames.tsv"))
        bn2coarse = utils.read_dictionary(coarse_path)

    print("Loading weights...")
    model = models.keras_model(
        hidden_size=256,
        dropout=0.6,
        recurrent_dropout=0.5,
        learning_rate=0.0003,
        outputs_size=outputs_size,
        elmo=True,
        mtl=True,
    )
    model.load_weights(os.path.join(resources_path, "model.h5"))

    with open(output_path, mode="w", encoding="utf8") as out_file:
        for s in tqdm(sentences):
            line = [list(l.keys())[0] for l in s]
            ids = preprocess.get_ids(s)
            pos = preprocess.get_pos(s)
            lemmas = preprocess.get_lemmas(s)

            line_input = TextSequence.compute_x_elmo([line], pad=False)
            pred = model.predict(line_input)[task]
            labels = _get_labels(
                pred,
                lemmas,
                ids,
                pos,
                lemma2syn,
                wn2bn,
                word_index,
                coarse_index,
                bn2coarse,
            )
            out_file.writelines(k + " " + v.rsplit("_")[-1] + "\n"
                                for k, v in labels.items())
    return
def make_trie(dictionary_filename):
    """Build a trie containing every word read from the given dictionary file."""
    words = read_dictionary(dictionary_filename)
    return reduce(trie_put, words, tree())
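# `tree` and `trie_put` are not shown above. A minimal sketch of what they might
# look like, assuming the common defaultdict-based trie idiom; the helper bodies
# and the `_END` sentinel below are assumptions, not the original implementation.
from collections import defaultdict
from functools import reduce


def tree():
    # Autovivifying nested dict: accessing a missing key creates a new subtree.
    return defaultdict(tree)


_END = '_end_'  # sentinel marking the end of a complete word


def trie_put(trie, word):
    # Insert one word into the trie and return the (mutated) trie,
    # so it composes with reduce() as in make_trie above.
    node = trie
    for char in word:
        node = node[char]
    node[_END] = True
    return trie


# Usage mirroring make_trie: fold a word list into a single trie.
words = ['cat', 'car', 'dog']
trie = reduce(trie_put, words, tree())
print('found' if trie['c']['a']['r'].get(_END) else 'missing')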
def main(path_input: str, path_output: str, check_synset: bool = False):
    # read bn to wn mapping file
    bnwn_map = utils.read_dictionary(const.BN2WN_MAP)
    # write a file with only sentences, each annotated word is replaced with the sense
    write_sentences(path_input, path_output, bnwn_map, check_synset)