Example #1
    def parse_file(self,
                   dataset,
                   graph_outputs,
                   sess,
                   output_dir=None,
                   output_filename=None,
                   print_time=True):
        """"""

        probability_tensors = graph_outputs.probabilities
        input_filename = dataset.conllu_files[0]
        graph_outputs.restart_timer()
        for i, indices in enumerate(dataset.batch_iterator(shuffle=False)):
            with Timer('Parsing batch %d' % i):
                tokens, lengths = dataset.get_tokens(indices)
                feed_dict = dataset.set_placeholders(indices)
                probabilities = sess.run(probability_tensors,
                                         feed_dict=feed_dict)
                predictions = graph_outputs.probs_to_preds(
                    probabilities, lengths)
                tokens.update({
                    vocab.field: vocab[predictions[vocab.field]]
                    for vocab in self.output_vocabs
                })
                graph_outputs.cache_predictions(tokens, indices)

        with Timer('Dumping predictions'):
            if output_dir is None and output_filename is None:
                graph_outputs.print_current_predictions()
            else:
                input_dir, input_filename = os.path.split(input_filename)
                if output_dir is None:
                    output_dir = os.path.join(self.save_dir, 'parsed',
                                              input_dir)
                elif output_filename is None:
                    output_filename = input_filename

                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                output_filename = os.path.join(output_dir, output_filename)
                with codecs.open(output_filename, 'w', encoding='utf-8') as f:
                    graph_outputs.dump_current_predictions(f)
        if print_time:
            print('\033[92mParsing 1 file took {:0.1f} seconds\033[0m'.format(
                time.time() - graph_outputs.time))
        return
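The output-path handling above defaults the directory to <save_dir>/parsed/<input_dir> and the filename to the input file's own name. A minimal, self-contained sketch of that fallback logic (handling each default independently; the paths are made up for illustration):

import os

def resolve_output_path(input_filename, save_dir,
                        output_dir=None, output_filename=None):
    # Default the directory to <save_dir>/parsed/<input_dir> and the
    # filename to the input file's own name, as parse_file does above.
    input_dir, input_name = os.path.split(input_filename)
    if output_dir is None:
        output_dir = os.path.join(save_dir, 'parsed', input_dir)
    if output_filename is None:
        output_filename = input_name
    return os.path.join(output_dir, output_filename)

print(resolve_output_path('data/en_ewt-ud-test.conllu', 'saves/model'))
# -> saves/model/parsed/data/en_ewt-ud-test.conllu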
Example #2
    def parse(self, conllu_files, output_dir=None, output_filename=None):
        """"""

        with Timer('Building dataset'):
            parseset = conllu_dataset.CoNLLUDataset(conllu_files,
                                                    self.vocabs,
                                                    config=self._config)

        if output_filename:
            assert len(
                conllu_files
            ) == 1, "output_filename can only be specified for one input file"
        factored_deptree = None
        factored_semgraph = None
        for vocab in self.output_vocabs:
            if vocab.field == 'deprel':
                factored_deptree = vocab.factorized
            elif vocab.field == 'semrel':
                factored_semgraph = vocab.factorized
        with Timer('Building TF'):
            with tf.variable_scope(self.classname, reuse=False):
                parse_graph = self.build_graph(reuse=True)
                parse_outputs = DevOutputs(*parse_graph,
                                           load=False,
                                           factored_deptree=factored_deptree,
                                           factored_semgraph=factored_semgraph,
                                           config=self._config)
            parse_tensors = parse_outputs.accuracies
            all_variables = set(tf.global_variables())
            non_save_variables = set(tf.get_collection('non_save_variables'))
            save_variables = all_variables - non_save_variables
            saver = tf.train.Saver(list(save_variables), max_to_keep=1)

        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        config.allow_soft_placement = True
        with tf.Session(config=config) as sess:
            with Timer('Initializing non_save variables'):
                print(list(non_save_variables))
                sess.run(tf.variables_initializer(list(non_save_variables)))
            with Timer('Restoring save variables'):
                saver.restore(sess, tf.train.latest_checkpoint(self.save_dir))
            if len(conllu_files) == 1 or output_filename is not None:
                with Timer('Parsing file'):
                    self.parse_file(parseset,
                                    parse_outputs,
                                    sess,
                                    output_dir=output_dir,
                                    output_filename=output_filename)
            else:
                with Timer('Parsing files'):
                    self.parse_files(parseset,
                                     parse_outputs,
                                     sess,
                                     output_dir=output_dir)
        return
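parse() splits the graph's variables into those restored from the checkpoint and those tagged in the 'non_save_variables' collection, which are only initialized. A minimal sketch of that pattern, assuming TensorFlow 1.x and a hypothetical checkpoint under saves/model:

import tensorflow as tf

# One variable to restore from the checkpoint, and one scratch counter that
# is excluded from saving via a custom collection.
weights = tf.get_variable('weights', shape=[3], initializer=tf.zeros_initializer())
counter = tf.Variable(0, trainable=False, name='counter')
tf.add_to_collection('non_save_variables', counter)

save_variables = set(tf.global_variables()) - set(tf.get_collection('non_save_variables'))
saver = tf.train.Saver(list(save_variables), max_to_keep=1)

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.allow_soft_placement = True
with tf.Session(config=config) as sess:
    # Initialize only the non-saved variables, then restore the rest.
    sess.run(tf.variables_initializer(tf.get_collection('non_save_variables')))
    saver.restore(sess, tf.train.latest_checkpoint('saves/model'))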
Example #3
    def parse_files(self,
                    dataset,
                    graph_outputs,
                    sess,
                    output_dir=None,
                    print_time=True):
        """"""

        probability_tensors = graph_outputs.probabilities
        graph_outputs.restart_timer()
        for input_filename in dataset.conllu_files:
            for i, indices in enumerate(dataset.batch_iterator(shuffle=False)):
                with Timer('batch {}'.format(i)):
                    tokens, lengths = dataset.get_tokens(indices)
                    feed_dict = dataset.set_placeholders(indices)
                    probabilities = sess.run(probability_tensors,
                                             feed_dict=feed_dict)
                    predictions = graph_outputs.probs_to_preds(
                        probabilities, lengths)
                    tokens.update({
                        vocab.field: vocab[predictions[vocab.field]]
                        for vocab in self.output_vocabs
                    })
                    graph_outputs.cache_predictions(tokens, indices)

            input_dir, input_filename = os.path.split(input_filename)
            if output_dir is None:
                file_output_dir = os.path.join(self.save_dir, 'parsed',
                                               input_dir)
            else:
                file_output_dir = output_dir
            if not os.path.exists(file_output_dir):
                os.makedirs(file_output_dir)
            output_filename = os.path.join(file_output_dir, input_filename)
            with codecs.open(output_filename, 'w', encoding='utf-8') as f:
                graph_outputs.dump_current_predictions(f)

            # Load the next conllu file
            dataset.load_next()

        if print_time:
            n_files = len(dataset.conllu_files)
            print(
                '\033[92mParsing {} file{} took {:0.1f} seconds\033[0m'.format(
                    n_files, 's' if n_files > 1 else '',
                    time.time() - graph_outputs.time))
        return
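Both parse_file and parse_files convert the per-batch probabilities into index predictions and then map each output field back to string labels through its vocab. A toy, self-contained sketch of that mapping (ToyVocab and its label set are invented for illustration):

import numpy as np

class ToyVocab(object):
    # Hypothetical stand-in for an output vocab: indexing it with an array
    # of predicted indices returns the corresponding string labels.
    field = 'deprel'
    _idx2str = ['root', 'nsubj', 'obj', 'nmod']

    def __getitem__(self, indices):
        return [self._idx2str[i] for i in indices]

probabilities = np.array([[0.1, 0.7, 0.1, 0.1],
                          [0.8, 0.1, 0.05, 0.05]])
predictions = {'deprel': probabilities.argmax(axis=-1)}  # roughly what probs_to_preds does
vocab = ToyVocab()
tokens = {'form': ['John', 'runs']}
tokens.update({vocab.field: vocab[predictions[vocab.field]]})
print(tokens['deprel'])  # ['nsubj', 'root']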
Example #4
    def __init__(self, input_networks=set(), config=None):
        """"""

        with Timer('Initializing the network (including pretrained vocab)'):
            self._config = config
            #os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

            self._input_networks = input_networks
            input_network_classes = set(
                input_network.classname
                for input_network in self._input_networks)
            assert input_network_classes == set(
                self.input_network_classes
            ), 'Not all input networks were passed in to {}'.format(
                self.classname)

            extant_vocabs = {}
            for input_network in self.input_networks:
                for vocab in input_network.vocabs:
                    if vocab.classname in extant_vocabs:
                        assert vocab is extant_vocabs[
                            vocab.
                            classname], "Two input networks have different instances of {}".format(
                                vocab.classname)
                    else:
                        extant_vocabs[vocab.classname] = vocab

            if 'IDIndexVocab' in extant_vocabs:
                self._id_vocab = extant_vocabs['IDIndexVocab']
            else:
                self._id_vocab = vocabs.IDIndexVocab(config=config)
                extant_vocabs['IDIndexVocab'] = self._id_vocab

            self._input_vocabs = []
            for input_vocab_classname in self.input_vocab_classes:
                if input_vocab_classname in extant_vocabs:
                    self._input_vocabs.append(
                        extant_vocabs[input_vocab_classname])
                else:
                    VocabClass = getattr(vocabs, input_vocab_classname)
                    vocab = VocabClass(config=config)
                    vocab.load() or vocab.count(self.train_conllus)
                    self._input_vocabs.append(vocab)
                    extant_vocabs[input_vocab_classname] = vocab

            self._output_vocabs = []
            for output_vocab_classname in self.output_vocab_classes:
                if output_vocab_classname in extant_vocabs:
                    self._output_vocabs.append(
                        extant_vocabs[output_vocab_classname])
                else:
                    VocabClass = getattr(vocabs, output_vocab_classname)
                    vocab = VocabClass(config=config)
                    vocab.load() or vocab.count(self.train_conllus)
                    self._output_vocabs.append(vocab)
                    extant_vocabs[output_vocab_classname] = vocab

            self._throughput_vocabs = []
            for throughput_vocab_classname in self.throughput_vocab_classes:
                if throughput_vocab_classname in extant_vocabs:
                    self._throughput_vocabs.append(
                        extant_vocabs[throughput_vocab_classname])
                else:
                    VocabClass = getattr(vocabs, throughput_vocab_classname)
                    vocab = VocabClass(config=config)
                    vocab.load() or vocab.count(self.train_conllus)
                    self._throughput_vocabs.append(vocab)
                    extant_vocabs[throughput_vocab_classname] = vocab

            with tf.variable_scope(self.classname, reuse=False):
                self.global_step = tf.Variable(0.,
                                               trainable=False,
                                               name='Global_step')
            self._vocabs = set(extant_vocabs.values())
        return
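Each vocab in __init__ is built with the same `vocab.load() or vocab.count(self.train_conllus)` idiom: counting the training data only runs when no cached vocabulary can be loaded. A small sketch of that short-circuit pattern with a made-up vocab class:

class ToyCountVocab(object):
    def load(self):
        # Pretend no cached vocabulary exists on disk.
        return False

    def count(self, conllu_files):
        print('counting tokens in {} file(s)'.format(len(conllu_files)))
        return True

vocab = ToyCountVocab()
vocab.load() or vocab.count(['train.conllu'])  # count() runs only because load() returned False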