Code Example #1
File: crnn.py Project: ucassjy/SimpleOCR
    def train(self, iteration_count):
        tensorboard_dir = 'tensorboard/dir'
        merged = tf.summary.merge_all()
        if not os.path.exists(tensorboard_dir):
            os.makedirs(tensorboard_dir)
        writer = tf.summary.FileWriter(tensorboard_dir)
        dictionary, inverse_dictionary, dictionary_len = read_dictionary(
            self.__data_manager.dictionary_path)
        with self.__session.as_default():
            print('Training')

            for i in range(self.step, iteration_count + self.step):
                iter_loss = 0
                rs = 0

                for batch_y, batch_dt, batch_x in self.__data_manager.train_batches:
                    op, loss_value, rs = self.__session.run(
                        [self.__optimizer, self.__cost, merged],
                        feed_dict={
                            self.__inputs:
                            batch_x,
                            self.__seq_len: [self.__max_char_count] *
                            self.__data_manager.batch_size,
                            self.__targets:
                            batch_dt
                        })

                    if i % 50 == 0:
                        for j in range(2):
                            [decoded, acc] = self.__session.run(
                                [self.__decoded, self.__acc],
                                feed_dict={
                                    self.__inputs:
                                    batch_x,
                                    self.__seq_len: [self.__max_char_count] *
                                    self.__data_manager.batch_size,
                                    self.__targets:
                                    batch_dt
                                })
                            print(batch_y[j])
                            print(acc)
                            print(
                                ground_truth_to_word(decoded[j],
                                                     inverse_dictionary))

                    iter_loss += loss_value
                    writer.add_summary(rs, i)

                if i % 500 == 0:
                    self.__saver.save(self.__session,
                                      self.__save_path,
                                      global_step=self.step)

                print('[{}] Iteration loss: {}'.format(self.step, iter_loss))

                self.step += 1
            writer.add_graph(self.__session.graph)
        return None
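The read_dictionary helper called above is not part of this snippet. A minimal sketch of a compatible implementation, assuming the dictionary file simply lists one character per line (both the file format and the body are assumptions, not the project's actual code):

def read_dictionary(dictionary_path):
    # Sketch (assumed format): one character per line; build char -> index
    # and index -> char maps plus the dictionary size.
    with open(dictionary_path, 'r', encoding='utf8') as f:
        chars = [line.rstrip('\n') for line in f if line.rstrip('\n')]
    dictionary = {char: idx for idx, char in enumerate(chars)}
    inverse_dictionary = {idx: char for char, idx in dictionary.items()}
    return dictionary, inverse_dictionary, len(dictionary)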
Code Example #2
File: parse.py Project: Riccorl/sense-embedding
def make_dict(paths: List[str], path_dict: str):
    """
    Write a dictionary from word to senses, from the given input files.
    :param paths: files to read.
    :param path_dict: where to save the dictionary.
    :return:
    """
    bnwn_map = utils.read_dictionary(const.BN2WN_MAP)
    word_synset_map = utils.compute_word_sysnet_map(paths, bnwn_map)
    utils.write_dictionary(path_dict, word_synset_map)
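utils.read_dictionary and utils.write_dictionary are project helpers that are not shown in these examples. A minimal sketch of the pattern their usage suggests, assuming a plain text file with one tab-separated key/value pair per line (the file format is an assumption, not the project's actual code):

def read_dictionary(path):
    # Sketch (assumed format): one "key<TAB>value" pair per line.
    with open(path, encoding="utf8") as file:
        return dict(line.rstrip("\n").split("\t", 1) for line in file if line.strip())


def write_dictionary(path, dictionary):
    # Sketch: mirror of read_dictionary, one "key<TAB>value" pair per line.
    with open(path, mode="w", encoding="utf8") as file:
        for key, value in dictionary.items():
            file.write("{}\t{}\n".format(key, value))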
Code Example #3
def main(use_cuda=False):
    print('Hello!')
    config = load_config()
    dataset = read_corpus(sents)
    char_fpi = read_dictionary(is_char=True)
    word_fpi = read_dictionary(is_char=False)
    char_lexicon, char_emb_layer = get_lexicon(
        config['token_embedder']['char_dim'], char_fpi, use_cuda)
    word_lexicon, word_emb_layer = get_lexicon(
        config['token_embedder']['word_dim'], word_fpi, use_cuda)
    b_w, b_c, b_l, b_m, recover_ind = create_batches(dataset,
                                                     word_lexicon,
                                                     char_lexicon,
                                                     config,
                                                     batch_size=64,
                                                     shuffle=True,
                                                     use_cuda=use_cuda)
    token_embedder = ConvTokenEmbedder(config, word_emb_layer, char_emb_layer,
                                       use_cuda)
    for w, c, m in zip(b_w, b_c, b_m):
        token_embedding = token_embedder(w, c, m[0].size())
        print(token_embedding)
Code Example #4
def semcor(path_input: str, path_output: str, map_path: str, label: bool):
    """
    Preprocess the SemCor dataset and write it to a single text file.
    :param path_input: the SemCor path.
    :param path_output: where to save the preprocessed file.
    :param map_path: the path to the gold key dictionary.
    :param label: if True, parse the SemCor set as labels.
    :return:
    """
    # load maps
    semcor_map = utils.read_dictionary(map_path)
    with open(path_output, mode="w", encoding="utf8") as out:
        root = cElementTree.parse(path_input).getroot()
        for sentence in root.findall(".//sentence"):
            out.write(_extract_sentence(sentence, semcor_map, label) + "\n")
Code Example #5
def evaluate_keras(model) -> None:
    """
    Evaluate the model, printing a classification report (accuracy, precision, recall and F1).
    :param model: the trained model.
    :return:
    """
    x_test, y_test = utils.load_datasets(config.CRISIS_EVAL_DIR,
                                         config.NORMAL_EVAL_DIR,
                                         limit=8000)
    x_test = preprocess.clear_tweets(x_test)
    vocab = utils.read_dictionary(config.TRAIN_VOCAB)
    x_test = preprocess.compute_x(x_test, vocab, max_len=100)[:, :, 0]
    y_pred = model.predict(x_test, batch_size=256, verbose=1)
    y_pred = [1 if y > 0.5 else 0 for y in y_pred]
    cr = classification_report(y_test, y_pred, target_names=["normal", "crisis"])
    print("Classification report : \n", cr)
Code Example #6
def convert_synsets(lines: List, mode: str) -> List[List[str]]:
    """
    Convert every WordNet synset to the synset type of the given mode.
    :param lines: List of strings to convert
    :param mode: type of synset:
        bn: BabelNet synset
        dom: WordNet domains
        lex: LexNames labels
    :return:
    """
    wn2bn_map = utils.read_dictionary(const.WN2BN_MAP)
    mode_map = utils.load_mode_map(mode)

    for line in lines:
        for i, word in enumerate(line):
            if "_wn:" in word:
                lemma, _, synset = word.rpartition("_")
                line[i] = _convert_synset(lemma, synset, mode, mode_map,
                                          wn2bn_map)
            elif mode == "lex" or mode == "dom":
                line[i] = "other"
    return lines
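For reference, the rpartition("_") call above splits an annotated token at its last underscore, so multi-word lemmas survive intact; for example (the token below is made up for illustration):

>>> "hot_dog_wn:07676602n".rpartition("_")
('hot_dog', '_', 'wn:07676602n')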
Code Example #7
File: data_manager.py Project: ucassjy/SimpleOCR
    def __load_data(self):
        """
            Load all the images in the folder
        """

        print('Loading data')

        examples = []
        count = 0
        skipped = 0
        # for f in os.listdir(self.examples_picture_path):
        #     if len(f.split('_')[0]) > self.max_char_count:
        #         continue
        #     arr, initial_len = resize_image(
        #         os.path.join(self.examples_path, f),
        #         self.max_image_width
        #     )
        with open(self.examples_label_path,
                  'r') as f:  # Address of target_label.txt
            for line in f.readlines():
                address = line.split("__")[0]
                label = line.split("__")[1].rstrip('\n')
                if len(label) > self.max_char_count:
                    continue
                if label.startswith('#'):
                    continue
                img = cv2.imread(address, cv2.IMREAD_GRAYSCALE)
                arr, initial_len = resize_image(img, self.max_image_width)
                dictionary, _, dictionary_len = read_dictionary(
                    self.dictionary_path)

                examples.append((arr, label, label_to_array(label,
                                                            dictionary)))
                count += 1
                dictionary_len = dictionary_len + 1  #!
        return examples, len(examples), dictionary_len
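label_to_array is defined elsewhere in the project; judging from how it is called here, it most likely encodes a label as the dictionary indices of its characters. A minimal sketch under that assumption:

def label_to_array(label, dictionary):
    # Sketch (assumption): map each character of the label to its index in
    # the dictionary so the CTC targets can be built from it.
    return [dictionary[char] for char in label]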
Code Example #8
import torch
import os
import models
import utils
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
from torch.autograd import Variable

transform = transforms.Compose([transforms.ToTensor()])
classes_mapper = utils.read_dictionary('output/classes.txt')
keys = list(classes_mapper.keys())


def image_deal(image):
    # Resize to the network input size, display the image, and convert it
    # to a (1, C, H, W) float tensor.
    image = image.resize((100, 100))
    plt.imshow(image)
    plt.show()
    image = transform(image)
    image = Variable(torch.unsqueeze(image, dim=0).float(),
                     requires_grad=False)
    return image


def get_state_dict(path):
    # Note: `model` is expected to be constructed elsewhere in the original
    # file (e.g. from the `models` module) before this helper is called.
    state_dict = torch.load(path)
    model.load_state_dict(state_dict)
    return model
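The snippet above only sets up preprocessing and weight loading. A hypothetical end-to-end prediction built on those pieces might look like the following (the predict helper itself, and the assumption that the order of classes_mapper keys matches the model's output indices, are not part of the original file):

def predict(image_path, model):
    # Hypothetical glue code: preprocess one image, run the model,
    # and map the argmax back to a class name via classes_mapper.
    image = image_deal(Image.open(image_path))
    with torch.no_grad():
        probs = F.softmax(model(image), dim=1)
    return keys[int(probs.argmax(dim=1))]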
Code Example #9
File: predict.py Project: Riccorl/elmo-wsd
def _predict(input_path: str,
             output_path: str,
             resources_path: str,
             task: int = None):
    """
    Wrapper function for all the prediction functions.
    :param input_path: the path of the input file to predict in the same format as Raganato's framework.
    :param output_path: the path of the output file (where you save your predictions)
    :param resources_path: the path of the resources folder containing your model and stuff you might need.
    :param task:    0 for BN
                    1 for DOM
                    2 for LEX
    :return:
    """

    print("Loading", input_path.split("/")[-1])
    sentences = load_test(input_path)

    # Loads all the mapping files.
    word_index = utils.read_dictionary(
        os.path.join(resources_path, "vocabs/label_vocab_bn.txt"))
    word_index_dom = utils.read_dictionary(
        os.path.join(resources_path, "vocabs/label_vocab_dom.txt"))
    word_index_lex = utils.read_dictionary(
        os.path.join(resources_path, "vocabs/label_vocab_lex.txt"))
    outputs_size = [len(word_index), len(word_index_dom), len(word_index_lex)]
    lemma2syn = utils.read_dictionary(
        os.path.join(resources_path, "mapping/lemma2wordnet.txt"))
    wn2bn = utils.read_dictionary(
        os.path.join(resources_path, "mapping/wordnet2babelnet.txt"))

    bn2coarse = None
    coarse_index = None
    if task != 0:
        # if task != 0, DOM or LEX prediction.
        coarse_index = word_index_dom if task == 1 else word_index_lex
        coarse_path = (os.path.join(resources_path,
                                    "mapping/babelnet2wndomains.tsv")
                       if task == 1 else os.path.join(
                           resources_path, "mapping/babelnet2lexnames.tsv"))
        bn2coarse = utils.read_dictionary(coarse_path)

    print("Loading weights...")
    model = models.keras_model(
        hidden_size=256,
        dropout=0.6,
        recurrent_dropout=0.5,
        learning_rate=0.0003,
        outputs_size=outputs_size,
        elmo=True,
        mtl=True,
    )
    model.load_weights(os.path.join(resources_path, "model.h5"))

    with open(output_path, mode="w", encoding="utf8") as out_file:
        for s in tqdm(sentences):
            line = [list(l.keys())[0] for l in s]
            ids = preprocess.get_ids(s)
            pos = preprocess.get_pos(s)
            lemmas = preprocess.get_lemmas(s)
            line_input = TextSequence.compute_x_elmo([line], pad=False)
            pred = model.predict(line_input)[task]
            labels = _get_labels(
                pred,
                lemmas,
                ids,
                pos,
                lemma2syn,
                wn2bn,
                word_index,
                coarse_index,
                bn2coarse,
            )
            out_file.writelines(k + " " + v.rsplit("_")[-1] + "\n"
                                for k, v in labels.items())
    return
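A hypothetical invocation, with placeholder paths for the Raganato-style input file and the resources folder (the paths are not from the original project):

_predict("data/semeval2007.data.xml",
         "output/semeval2007_dom.key",
         "resources",
         task=1)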
Code Example #10
from functools import reduce


def make_trie(dictionary_filename):
    words = read_dictionary(dictionary_filename)
    return reduce(trie_put, words, tree())
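tree and trie_put are not shown in this snippet. A minimal sketch of helpers that would satisfy the reduce call above, assuming an autovivifying nested-dict trie with an end-of-word marker (the bodies and the marker are assumptions):

from collections import defaultdict


def tree():
    # An autovivifying nested dictionary: missing keys become sub-tries.
    return defaultdict(tree)


def trie_put(trie, word):
    # Walk or create one node per character, then mark the end of the word.
    node = trie
    for char in word:
        node = node[char]
    node['$end$'] = True
    return trie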
Code Example #11
def main(path_input: str, path_output: str, check_synset: bool = False):
    # read bn to wn mapping file
    bnwn_map = utils.read_dictionary(const.BN2WN_MAP)
    # write a file containing only sentences, where each annotated word is replaced with its sense
    write_sentences(path_input, path_output, bnwn_map, check_synset)