Esempio n. 1
0
    def _create_embeddings(self, embeddings_set, vocabs):
        """Build one embedding model per vocabulary found in ``vocabs``.

        The ``'word'`` vocabulary is loaded from a pre-trained file when the
        ``word_embeddings`` config section carries a ``label`` (looked up in
        ``embeddings_set``); otherwise it is randomly initialized with the
        section's ``dsz``.  The ``'char'`` vocabulary is randomly initialized
        when the ``charsz`` config value is positive.  Every other vocabulary
        key must have an entry in the ``extended_embed_info`` config section.

        :param embeddings_set: Mapping from embedding label to a dict with at
            least ``file`` and ``dsz`` entries (``sha1`` is optional).
        :param vocabs: Mapping from feature name to its vocabulary.
        :return: A tuple ``(embeddings, out_vocabs)`` where ``embeddings`` maps
            feature name to its embedding model and ``out_vocabs`` maps feature
            name to that model's ``vocab`` attribute.
        :raises Exception: If a vocabulary other than ``'word'``/``'char'`` has
            no entry in ``extended_embed_info``.
        """
        unif = self.config_params['unif']
        keep_unused = self.config_params.get('keep_unused', False)

        # Created unconditionally so that configurations without a 'word'
        # vocabulary (char-only or extended-features-only) do not hit an
        # undefined name further down.
        embeddings = {}

        if 'word' in vocabs:
            embeddings_section = self.config_params['word_embeddings']
            embed_label = embeddings_section.get('label', None)

            if embed_label is not None:
                embed_info = embeddings_set[embed_label]
                embed_file = embed_info['file']
                embed_dsz = embed_info['dsz']
                embed_sha1 = embed_info.get('sha1', None)
                embeddings['word'] = Task._create_embeddings_from_file(embed_file, embed_dsz, embed_sha1,
                                                                       self.data_download_cache, vocabs['word'],
                                                                       unif=unif, keep_unused=keep_unused)
            else:
                dsz = embeddings_section['dsz']
                embeddings['word'] = baseline.RandomInitVecModel(dsz, vocabs['word'], unif_weight=unif)

        if 'char' in vocabs:
            charsz = self.config_params.get('charsz', -1)
            if charsz > 0:
                embeddings['char'] = baseline.RandomInitVecModel(charsz, vocabs['char'], unif_weight=unif)

        extended_embed_info = self.config_params.get('extended_embed_info', {})
        for key, vocab in vocabs.items():
            if key in extended_embed_info:
                print('Adding extended feature embeddings {}'.format(key))
                ext_info = extended_embed_info[key]
                ext_embed = ext_info.get("embedding", None)
                ext_emb_dsz = ext_info.get("dsz", None)
                if ext_embed is not None:
                    # The reader class is chosen from the file extension alone.
                    EmbeddingT = baseline.GloVeModel if ext_embed.endswith('.txt') else baseline.Word2VecModel
                    print("using {} to read external embedding file {}".format(EmbeddingT, ext_embed))
                    embeddings[key] = EmbeddingT(ext_embed, known_vocab=vocab, unif_weight=unif, keep_unused=False)
                else:
                    print("randomly initializing external feature with dimension {}".format(ext_emb_dsz))
                    embeddings[key] = baseline.RandomInitVecModel(ext_emb_dsz, vocab, unif_weight=unif)
            elif key not in ['word', 'char']:
                # The message names the config section that is actually read above.
                raise Exception("Error: must specify a field '{}' in 'extended_embed_info' dictionary for embedding dim size".format(key))

        out_vocabs = {key: value.vocab for key, value in embeddings.items()}
        return embeddings, out_vocabs
Esempio n. 2
0
 def _initialize_embedding(self, dimensions_size, vocab):
     """Return a ``baseline.RandomInitVecModel`` built for ``vocab``.

     :param dimensions_size: Forwarded as the first constructor argument
         (presumably the embedding dimension -- confirm against baseline).
     :param vocab: Vocabulary forwarded to the model constructor.
     :return: The newly constructed ``RandomInitVecModel``.
     """
     # The third positional argument is passed as ``False`` exactly as before.
     random_vectors = baseline.RandomInitVecModel(dimensions_size, vocab, False)
     return random_vectors
Esempio n. 3
0
    def _run(self, sess, model_file, embeddings_set, output_dir,
             model_version):
        """Rebuild the classifier graph and export it as a TF SavedModel.

        The graph takes serialized ``tf.Example`` protos, parses the
        ``FIELD_NAME`` string feature, maps each raw post through the
        preprocessing function returned by ``self._preproc_post_creator()``,
        builds the classifier on top, restores the trained weights from
        ``model_file`` and writes the result under
        ``<output_dir>/<model_version>``.

        Two signatures are exported: ``'predict_text'`` (predict method) and
        the default serving signature (classify method).

        :param sess: TensorFlow session used to build, restore and export.
        :param model_file: Base path of the trained model (vocab, labels and
            checkpoint are read from it by helper methods).
        :param embeddings_set: Mapping from embedding label to its settings;
            only the ``dsz`` of the configured word embedding label is read.
        :param output_dir: Directory under which the export is written.
        :param model_version: Version used as the export sub-directory name.
        """
        self.word2index, vocab = ClassifyTensorFlowExporter.read_vocab(
            model_file)
        labels = self.load_labels(model_file)
        # Make the TF example, network input
        serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
        feature_configs = {
            FIELD_NAME: tf.FixedLenFeature(shape=[], dtype=tf.string),
        }
        tf_example = tf.parse_example(serialized_tf_example, feature_configs)
        raw_posts = tf_example[FIELD_NAME]

        # Apply the preprocessing function to every raw post in the batch.
        dense = tf.map_fn(self._preproc_post_creator(),
                          raw_posts,
                          dtype=tf.int64)
        word_embeddings = self.task.config_params["word_embeddings"]
        dsz = embeddings_set[word_embeddings["label"]]["dsz"]
        # Randomly initialized vectors are enough here: the trained weights
        # are loaded by ``restore_model`` below.
        init_vectors = baseline.RandomInitVecModel(dsz, vocab, False)
        print(len(init_vectors.weights), len(vocab), init_vectors.vsz)
        # Work on a shallow copy so that graph tensors and the session are not
        # written back into the task's shared configuration dictionary.
        model_params = dict(self.task.config_params["model"])
        model_params["x"] = dense
        model_params["pkeep"] = 1
        model_params["sess"] = sess
        print(model_params)
        model = baseline.tf.classify.create_model({'word': init_vectors},
                                                  labels, **model_params)
        softmax_output = tf.nn.softmax(model.logits)

        # Rank every label by score and translate the indices to label strings.
        values, indices = tf.nn.top_k(softmax_output, len(labels))
        class_tensor = tf.constant(model.labels)
        table = tf.contrib.lookup.index_to_string_table_from_tensor(
            class_tensor)
        classes = table.lookup(tf.to_int64(indices))
        self.restore_model(sess, model_file)
        output_path = os.path.join(tf.compat.as_bytes(output_dir),
                                   tf.compat.as_bytes(str(model_version)))

        print('Exporting trained model to %s' % output_path)
        builder = tf.saved_model.builder.SavedModelBuilder(output_path)

        # Build the signature_def_map.
        classify_inputs_tensor = tf.saved_model.utils.build_tensor_info(
            serialized_tf_example)
        classes_output_tensor = tf.saved_model.utils.build_tensor_info(classes)
        scores_output_tensor = tf.saved_model.utils.build_tensor_info(values)

        classification_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classify_inputs_tensor
                },
                outputs={
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classes_output_tensor,
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    scores_output_tensor
                },
                method_name=tf.saved_model.signature_constants.
                CLASSIFY_METHOD_NAME))

        # The predict signature takes the already-parsed string tensor rather
        # than the serialized tf.Example placeholder.
        predict_inputs_tensor = tf.saved_model.utils.build_tensor_info(
            raw_posts)
        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={'tokens': predict_inputs_tensor},
                outputs={
                    'classes': classes_output_tensor,
                    'scores': scores_output_tensor
                },
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME))

        # The lookup table created above must be initialized when the
        # exported model is loaded.
        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'predict_text':
                prediction_signature,
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
            },
            legacy_init_op=legacy_init_op)

        builder.save()
        print('Successfully exported model to %s' % output_dir)
Esempio n. 4
0
    def _run(self, sess, model_file, embeddings_set, output_dir,
             model_version):
        """Rebuild the tagger graph and export it as a TF SavedModel.

        The graph takes serialized ``tf.Example`` protos, parses the
        ``FIELD_NAME`` string feature, maps each raw post through the
        preprocessing function returned by ``self._preproc_post_creator()``
        (producing word ids, char ids and lengths), builds the tagger on top,
        restores the trained weights from ``model_file`` and writes the result
        under ``<output_dir>/<model_version>``.

        Two signatures are exported: ``'tag_text'`` (predict method) and the
        default serving signature (classify method).

        :param sess: TensorFlow session used to build, restore and export.
        :param model_file: Base path of the trained model (vocabs, labels and
            checkpoint are read from it by helper methods).
        :param embeddings_set: Mapping from embedding label to its settings;
            only the ``dsz`` of the configured word embedding label is read.
        :param output_dir: Directory under which the export is written.
        :param model_version: Version used as the export sub-directory name.
        """
        self.word2index, vocab_word = TaggerTensorFlowExporter.read_vocab(
            model_file, 'word')
        self.char2index, vocab_char = TaggerTensorFlowExporter.read_vocab(
            model_file, 'char')
        # ASCII 'A'-'Z' and 'a'-'z'; stored on ``self`` -- presumably for use
        # by the preprocessing function (confirm in _preproc_post_creator).
        upchars = tf.constant([chr(i) for i in range(65, 91)])
        self.lchars = tf.constant([chr(i) for i in range(97, 123)])
        self.upchars_lut = tf.contrib.lookup.index_table_from_tensor(
            mapping=upchars, num_oov_buckets=1, default_value=-1)

        labels = self.load_labels(model_file)
        # Make the TF example, network input
        serialized_tf_example = tf.placeholder(tf.string, name='tf_example')
        feature_configs = {
            FIELD_NAME: tf.FixedLenFeature(shape=[], dtype=tf.string),
        }
        tf_example = tf.parse_example(serialized_tf_example, feature_configs)
        raw_posts = tf_example[FIELD_NAME]

        # Run for each post
        x, xch, lengths = tf.map_fn(self._preproc_post_creator(),
                                    raw_posts,
                                    dtype=(tf.int64, tf.int64, tf.int32),
                                    back_prop=False)

        word_embeddings = self.task.config_params["word_embeddings"]
        dsz = embeddings_set[word_embeddings["label"]]["dsz"]
        char_dsz = self.task.config_params["charsz"]
        # Randomly initialized vectors are enough here: the trained weights
        # are loaded by ``restore_model`` below.
        embeddings = {
            'word': baseline.RandomInitVecModel(dsz, vocab_word, False),
            'char': baseline.RandomInitVecModel(char_dsz, vocab_char, False),
        }
        # WARNING: This can be a bug if the user defaults the values (-1)
        # for conll, the mxlen=124, for idr, the mxlen is forced to a max BPTT
        # for twpos, the mxlen=38
        # this should probably be fixed by serializing the mxlen of the model
        # or rereading it from the tensor from file
        mxlen = self.task.config_params['preproc']['mxlen']
        mxwlen = self.task.config_params['preproc']['mxwlen']

        # Work on a shallow copy so that graph tensors and the session are not
        # written back into the task's shared configuration dictionary.
        model_params = dict(self.task.config_params["model"])
        model_params["x"] = x
        model_params["xch"] = xch
        model_params["lengths"] = lengths
        model_params["pkeep"] = 1
        model_params["sess"] = sess
        model_params["maxs"] = mxlen
        model_params["maxw"] = mxwlen
        print(model_params)
        model = baseline.tf.tagger.create_model(labels, embeddings,
                                                **model_params)
        # NOTE(review): presumably needed so that the CRF transition matrix
        # ``model.A`` exists before decoding/restoring -- confirm.
        model.create_loss()

        softmax_output = tf.nn.softmax(model.probs)
        values, indices = tf.nn.top_k(softmax_output, 1)

        # With a CRF the label indices come from the decode below; the
        # exported scores remain the softmax top-1 values computed above.
        if model.crf is True:
            # NOTE(review): the sequence length is hard-coded to ``mxlen``
            # for a single example; ``lengths`` may be the better argument
            # for real batches -- confirm before changing.
            indices, _ = tf.contrib.crf.crf_decode(
                model.probs, model.A,
                tf.constant([mxlen]))  ## We are assuming the batchsz is 1 here

        # Invert the label -> index mapping into an index-ordered list.
        list_of_labels = [''] * len(labels)
        for label, idval in labels.items():
            list_of_labels[idval] = label

        class_tensor = tf.constant(list_of_labels)
        table = tf.contrib.lookup.index_to_string_table_from_tensor(
            class_tensor)
        classes = table.lookup(tf.to_int64(indices))
        self.restore_model(sess, model_file)
        output_path = os.path.join(tf.compat.as_bytes(output_dir),
                                   tf.compat.as_bytes(str(model_version)))

        print('Exporting trained model to %s' % output_path)
        builder = tf.saved_model.builder.SavedModelBuilder(output_path)

        # Build the signature_def_map.
        classify_inputs_tensor = tf.saved_model.utils.build_tensor_info(
            serialized_tf_example)
        classes_output_tensor = tf.saved_model.utils.build_tensor_info(classes)
        scores_output_tensor = tf.saved_model.utils.build_tensor_info(values)

        classification_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={
                    tf.saved_model.signature_constants.CLASSIFY_INPUTS:
                    classify_inputs_tensor
                },
                outputs={
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_CLASSES:
                    classes_output_tensor,
                    tf.saved_model.signature_constants.CLASSIFY_OUTPUT_SCORES:
                    scores_output_tensor
                },
                method_name=tf.saved_model.signature_constants.
                CLASSIFY_METHOD_NAME))

        # The predict signature takes the already-parsed string tensor rather
        # than the serialized tf.Example placeholder.
        predict_inputs_tensor = tf.saved_model.utils.build_tensor_info(
            raw_posts)
        prediction_signature = (
            tf.saved_model.signature_def_utils.build_signature_def(
                inputs={'tokens': predict_inputs_tensor},
                outputs={
                    'classes': classes_output_tensor,
                    'scores': scores_output_tensor
                },
                method_name=tf.saved_model.signature_constants.
                PREDICT_METHOD_NAME))

        # The lookup tables created above must be initialized when the
        # exported model is loaded.
        legacy_init_op = tf.group(tf.tables_initializer(),
                                  name='legacy_init_op')
        builder.add_meta_graph_and_variables(
            sess, [tf.saved_model.tag_constants.SERVING],
            signature_def_map={
                'tag_text':
                prediction_signature,
                tf.saved_model.signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY:
                classification_signature,
            },
            legacy_init_op=legacy_init_op)

        builder.save()
        print('Successfully exported model to %s' % output_dir)