Example 1
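The snippets below are shown without their import headers. A plausible set of imports and helpers they rely on is sketched here; the exact module paths, the DTYPE constant and the divide_chunks helper are assumptions based on the allenai bilm-tf and simple_elmo code bases, not part of the original examples:

import json
import logging
import os
import sys
import zipfile
from zipfile import ZipExtFile

import h5py
import numpy as np
import tensorflow as tf
from sklearn import preprocessing

# Assumed to come from the allenai bilm-tf package (module paths may differ).
# Note: Example 4 relies on a modified Batcher (limit=...) and weight_layers
# (use_layers=...), as found in the simple_elmo package rather than vanilla bilm-tf.
from bilm import Batcher, BidirectionalLanguageModel, weight_layers
from bilm.data import UnicodeCharsVocabulary

DTYPE = 'float32'  # assumption: the dtype constant used by bilm-tf

def divide_chunks(data, n):
    # assumed helper: yield successive batches of at most n sentences
    for i in range(0, len(data), n):
        yield data[i:i + n]
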
def dump_token_embeddings(vocab_file, options_file, weight_file, outfile):
    """
    Given an input vocabulary file, dump all the token embeddings to the
    outfile.  The result can be used as the embedding_weight_file when
    constructing a BidirectionalLanguageModel.
    """

    if isinstance(options_file, ZipExtFile):
        options = json.load(options_file)
        options_file.seek(0)
    else:
        with open(options_file, 'r') as of:
            options = json.load(of)

    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.compat.v1.placeholder('int32',
                                               shape=(None, None,
                                                      max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    embedding_op = model(ids_placeholder)['token_embeddings']

    n_tokens = vocab.size
    embed_dim = int(embedding_op.shape[2])

    embeddings = np.zeros((n_tokens, embed_dim), dtype=DTYPE)

    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    with tf.compat.v1.Session(config=config) as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        for k in range(n_tokens):
            token = vocab.id_to_word(k)
            char_ids = batcher.batch_sentences(
                [[token]])[0, 1, :].reshape(1, 1, -1)
            embeddings[k, :] = sess.run(embedding_op,
                                        feed_dict={ids_placeholder: char_ids})

    with h5py.File(outfile, 'w') as fout:
        ds = fout.create_dataset('embedding',
                                 embeddings.shape,
                                 dtype='float32',
                                 data=embeddings)
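A minimal usage sketch for the function above (file names are hypothetical). The dumped file can then be reused as embedding_weight_file when constructing a BidirectionalLanguageModel, as the docstring says; the use_character_inputs / embedding_weight_file keyword names are how the bilm-tf API is usually called and should be treated as an assumption.

dump_token_embeddings('vocab.txt', 'options.json', 'model.hdf5',
                      'token_embeddings.hdf5')

# Reusing the dump so the char-CNN does not recompute token representations:
model = BidirectionalLanguageModel('options.json', 'model.hdf5',
                                   use_character_inputs=False,
                                   embedding_weight_file='token_embeddings.hdf5')
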
Example 2
def dump_bilm_embeddings(vocab_file, dataset_file, options_file, weight_file,
                         outfile):

    if isinstance(options_file, ZipExtFile):
        options = json.load(options_file)
        options_file.seek(0)
    else:
        with open(options_file, 'r') as of:
            options = json.load(of)

    max_word_length = options['char_cnn']['max_characters_per_token']

    vocab = UnicodeCharsVocabulary(vocab_file, max_word_length)
    batcher = Batcher(vocab_file, max_word_length)

    ids_placeholder = tf.compat.v1.placeholder('int32',
                                               shape=(None, None,
                                                      max_word_length))
    model = BidirectionalLanguageModel(options_file, weight_file)
    ops = model(ids_placeholder)

    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    with tf.compat.v1.Session(config=config) as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        sentence_id = 0
        with open(dataset_file, 'r') as fin, h5py.File(outfile, 'w') as fout:
            for line in fin:
                sentence = line.strip().split()
                char_ids = batcher.batch_sentences([sentence])
                embeddings = sess.run(ops['lm_embeddings'],
                                      feed_dict={ids_placeholder: char_ids})
                ds = fout.create_dataset('{}'.format(sentence_id),
                                         embeddings.shape[1:],
                                         dtype='float32',
                                         data=embeddings[0, :, :, :])

                sentence_id += 1
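A short, hypothetical usage example: each input sentence is written as a separate HDF5 dataset keyed by its line number, so the dump can be read back like this (the shape comment is an expectation, not taken from the original source):

dump_bilm_embeddings('vocab.txt', 'dataset.txt', 'options.json',
                     'model.hdf5', 'elmo_embeddings.hdf5')

with h5py.File('elmo_embeddings.hdf5', 'r') as fin:
    first_sentence = fin['0'][...]
    # Expected shape: (n_layers, n_tokens, embedding_dim)
    print(first_sentence.shape)
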
Example 3
    def load(self, directory, top=False, max_batch_size=96):
        # Loading a pre-trained ELMo model:
        # You can call load with top=True to use only the top ELMo layer
        """
        :param directory: directory or a ZIP archive with an ELMo model
        ('model.hdf5', 'options.json' and 'vocab.txt*' files must be present)
        :param top: use only top ELMo layer
        :param max_batch_size: the maximum allowable batch size during inference
        :return: ELMo batcher, character id placeholders, op object
        """
        if not os.path.exists(directory):
            raise SystemExit("Error: model not found!")
        self.batch_size = max_batch_size
        if os.path.isfile(directory) and directory.endswith(".zip"):
            message = """
            Assuming the model is a ZIP archive downloaded from the NLPL vector repository.
            Loading a model from a ZIP archive directly is slower than from the extracted files,
            but does not require additional disk space
            and allows loading from directories without write permissions.
            """
            self.logger.info(message)
            if sys.version_info < (3, 7):
                raise SystemExit(
                    "Error: loading ELMo from ZIP archives requires Python >= 3.7."
                )
            zf = zipfile.ZipFile(directory)
            vocab_file = zf.open("vocab.txt")
            options_file = zf.open("options.json")
            weight_file = zf.open("model.hdf5")
            m_options = json.load(options_file)
            options_file.seek(0)
        else:
            # We have all the files already extracted in a separate directory
            if os.path.isfile(os.path.join(directory, "vocab.txt.gz")):
                vocab_file = os.path.join(directory, "vocab.txt.gz")
            elif os.path.isfile(os.path.join(directory, "vocab.txt")):
                vocab_file = os.path.join(directory, "vocab.txt")
            else:
                raise SystemExit(
                    "Error: no vocabulary file found in the model.")
            options_file = os.path.join(directory, "options.json")
            weight_file = os.path.join(directory, "model.hdf5")
            with open(options_file, 'r') as of:
                m_options = json.load(of)

        self.logger.info(f"Loading model from {directory}...")
        max_chars = m_options['char_cnn']['max_characters_per_token']
        self.max_chars = max_chars

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, max_chars)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.compat.v1.placeholder(
            'int32', shape=(None, None, max_chars))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file,
                                          weight_file,
                                          max_batch_size=max_batch_size)
        self.vector_size = int(bilm.options['lstm']['projection_dim'] * 2)

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 use_top_only=top)

        return self.batcher, self.sentence_character_ids, self.elmo_sentence_input, self.batch_size
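The values returned by this load() are typically consumed as in the sketch below (the model path, the sentences and the surrounding ElmoModel instance are hypothetical); it mirrors what get_elmo_vectors() in Example 4 does internally:

elmo = ElmoModel()  # the class this method belongs to, see Example 4
batcher, char_ids_ph, elmo_input, batch_size = elmo.load('/path/to/model', top=False)

with tf.compat.v1.Session() as sess:
    sess.run(tf.compat.v1.global_variables_initializer())
    sentences = [['Hello', 'world'], ['Another', 'test', 'sentence']]
    char_ids = batcher.batch_sentences(sentences)
    vectors = sess.run(elmo_input['weighted_op'],
                       feed_dict={char_ids_ph: char_ids})
    # vectors: (n_sentences, max_sentence_length, 2 * projection_dim)
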
Example 4
class ElmoModel:
    """
    Embeddings from Language Models (ELMo)
    """
    def __init__(self):
        self.batcher = None
        self.sentence_character_ids = None
        self.elmo_sentence_input = None
        self.batch_size = None
        self.max_chars = None
        self.vector_size = None

        # We do not use eager execution from TF 2.0
        tf.compat.v1.disable_eager_execution()

        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.INFO)
        self.logger = logging.getLogger(__name__)

    def load(self, directory, top=False, max_batch_size=96):
        # Loading a pre-trained ELMo model:
        # You can call load with top=True to use only the top ELMo layer
        """
        :param directory: directory or a ZIP archive with an ELMo model
        ('model.hdf5', 'options.json' and 'vocab.txt*' files must be present)
        :param top: use only top ELMo layer
        :param max_batch_size: the maximum allowable batch size during inference
        :return: ELMo batcher, character id placeholders, op object
        """
        if not os.path.exists(directory):
            raise SystemExit("Error: model not found!")
        self.batch_size = max_batch_size
        if os.path.isfile(directory) and directory.endswith(".zip"):
            message = """
            Assuming the model is a ZIP archive downloaded from the NLPL vector repository.
            Loading a model from a ZIP archive directly is slower than from the extracted files,
            but does not require additional disk space
            and allows loading from directories without write permissions.
            """
            self.logger.info(message)
            if sys.version_info < (3, 7):
                raise SystemExit(
                    "Error: loading ELMo from ZIP archives requires Python >= 3.7."
                )
            zf = zipfile.ZipFile(directory)
            vocab_file = zf.open("vocab.txt")
            options_file = zf.open("options.json")
            weight_file = zf.open("model.hdf5")
            m_options = json.load(options_file)
            options_file.seek(0)
        else:
            # We have all the files already extracted in a separate directory
            if os.path.isfile(os.path.join(directory, "vocab.txt.gz")):
                vocab_file = os.path.join(directory, "vocab.txt.gz")
            elif os.path.isfile(os.path.join(directory, "vocab.txt")):
                vocab_file = os.path.join(directory, "vocab.txt")
            else:
                raise SystemExit(
                    "Error: no vocabulary file found in the model.")
            options_file = os.path.join(directory, "options.json")
            weight_file = os.path.join(directory, "model.hdf5")
            with open(options_file, 'r') as of:
                m_options = json.load(of)

        self.logger.info(f"Loading model from {directory}...")
        max_chars = m_options['char_cnn']['max_characters_per_token']
        self.max_chars = max_chars

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, max_chars)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.compat.v1.placeholder(
            'int32', shape=(None, None, max_chars))

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(options_file,
                                          weight_file,
                                          max_batch_size=max_batch_size)
        self.vector_size = int(bilm.options['lstm']['projection_dim'] * 2)

        # Get ops to compute the LM embeddings.
        sentence_embeddings_op = bilm(self.sentence_character_ids)

        # Get an op to compute ELMo (weighted average of the internal biLM layers)
        self.elmo_sentence_input = weight_layers('input',
                                                 sentence_embeddings_op,
                                                 use_top_only=top)

        return self.batcher, self.sentence_character_ids, self.elmo_sentence_input, self.batch_size

    def get_elmo_vectors(self, texts):
        """
        :param texts: list of sentences (lists of words)
        :return: embedding matrix for all sentences
        (number of sentences by max word count by vector size)
        """

        max_text_length = max([len(t) for t in texts])

        # Creating the matrix which will eventually contain all embeddings from all batches:
        final_vectors = np.zeros(
            (len(texts), max_text_length, self.vector_size))

        with tf.compat.v1.Session() as sess:
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())

            # Running batches:
            chunk_counter = 0
            for chunk in divide_chunks(texts, self.batch_size):
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Texts in the current batch: {len(chunk)}")

                # Compute ELMo representations.
                elmo_vectors = sess.run(
                    self.elmo_sentence_input['weighted_op'],
                    feed_dict={self.sentence_character_ids: sentence_ids})
                # Updating the full matrix:
                first_row = self.batch_size * chunk_counter
                last_row = first_row + elmo_vectors.shape[0]
                final_vectors[first_row:last_row,
                              :elmo_vectors.shape[1], :] = elmo_vectors
                chunk_counter += 1

            return final_vectors

    def get_elmo_vector_average(self, texts):
        """
        :param texts: list of sentences (lists of words)
        :return: matrix of averaged embeddings for all sentences
        """
        average_vectors = np.zeros((len(texts), self.vector_size))

        counter = 0

        with tf.compat.v1.Session() as sess:
            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())

            # Running batches:
            for chunk in divide_chunks(texts, self.batch_size):
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Sentences in this batch: {len(chunk)}")

                # Compute ELMo representations.
                elmo_vectors = sess.run(
                    self.elmo_sentence_input['weighted_op'],
                    feed_dict={self.sentence_character_ids: sentence_ids})

                self.logger.debug(
                    f"ELMo sentence input shape: {elmo_vectors.shape}")

                for sentence in range(len(chunk)):
                    sent_vec = np.zeros(
                        (elmo_vectors.shape[1], elmo_vectors.shape[2]))
                    for nr, word_vec in enumerate(elmo_vectors[sentence, :, :]):
                        sent_vec[nr, :] = word_vec
                    semantic_fingerprint = np.sum(sent_vec, axis=0)
                    semantic_fingerprint = np.divide(semantic_fingerprint,
                                                     sent_vec.shape[0])
                    query_vec = preprocessing.normalize(
                        semantic_fingerprint.reshape(1, -1), norm='l2')
                    average_vectors[counter] = query_vec.reshape(-1)
                    counter += 1

        return average_vectors
class ElmoModel:
    """
    Embeddings from Language Models (ELMo)
    """

    def __init__(self):
        self.batcher = None
        self.sentence_character_ids = None
        self.elmo_sentence_input = None
        self.sentence_embeddings_op = None
        self.batch_size = None
        self.max_chars = None
        self.vector_size = None
        self.n_layers = None

        # We do not use eager execution from TF 2.0
        tf.compat.v1.disable_eager_execution()

        logging.basicConfig(
            format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
        )
        self.logger = logging.getLogger(__name__)

    def load(self, directory, max_batch_size=32, limit=100):
        # Loading a pre-trained ELMo model.
        # Layer selection (top / average / all) happens later, via the `layers`
        # argument of get_elmo_vectors() and get_elmo_vector_average().
        """
        :param directory: directory or a ZIP archive with an ELMo model
        ('*.hdf5' and 'options.json' files must be present)
        :param max_batch_size: the maximum allowable batch size during inference
        :param limit: cache only the first <limit> words from the vocabulary file
        :return: ELMo batcher, character id placeholders, op object
        """
        if not os.path.exists(directory):
            raise SystemExit(f"Error: path  not found for {directory}!")
        self.batch_size = max_batch_size
        self.logger.info(f"Loading model from {directory}...")

        if os.path.isfile(directory) and directory.endswith(".zip"):
            message = """
            Assuming the model is a ZIP archive downloaded from the NLPL vector repository.
            Loading a model from a ZIP archive directly is slower than from the extracted files,
            but does not require additional disk space
            and allows loading from directories without write permissions.
            """
            self.logger.info(message)
            if sys.version_info < (3, 7):
                raise SystemExit(
                    "Error: loading models from ZIP archives requires Python >= 3.7."
                )
            zf = zipfile.ZipFile(directory)
            vocab_file = zf.open("vocab.txt")
            options_file = zf.open("options.json")
            weight_file = zf.open("model.hdf5")
            m_options = json.load(options_file)
            options_file.seek(0)
        elif os.path.isdir(directory):
            # We have all the files already extracted in a separate directory
            if os.path.isfile(os.path.join(directory, "vocab.txt.gz")):
                vocab_file = os.path.join(directory, "vocab.txt.gz")
            elif os.path.isfile(os.path.join(directory, "vocab.txt")):
                vocab_file = os.path.join(directory, "vocab.txt")
            else:
                self.logger.info("No vocabulary file found in the model.")
                vocab_file = None
            if os.path.exists(os.path.join(directory, "model.hdf5")):
                weight_file = os.path.join(directory, "model.hdf5")
            else:
                weight_files = [
                    fl for fl in os.listdir(directory) if fl.endswith(".hdf5")
                ]
                if not weight_files:
                    raise SystemExit(
                        f"Error: no HDF5 model files found in the {directory} directory!"
                    )
                weight_file = os.path.join(directory, weight_files[0])
                self.logger.info(
                    f"No model.hdf5 file found. Using {weight_file} as a model file."
                )
            options_file = os.path.join(directory, "options.json")
            with open(options_file, "r") as of:
                m_options = json.load(of)
        else:
            raise SystemExit(
                "Error: either provide a path to a directory with the model "
                "or to the model in a ZIP archive."
            )

        max_chars = m_options["char_cnn"]["max_characters_per_token"]
        self.max_chars = max_chars
        if m_options["char_cnn"]["n_characters"] == 261:
            raise SystemExit(
                "Error: invalid number of characters in the options.json file: 261. "
                "Set n_characters to 262 for inference."
            )

        # Create a Batcher to map text to character ids.
        self.batcher = Batcher(vocab_file, max_chars, limit=limit)

        # Input placeholders to the biLM.
        self.sentence_character_ids = tf.compat.v1.placeholder(
            "int32", shape=(None, None, max_chars)
        )

        # Build the biLM graph.
        bilm = BidirectionalLanguageModel(
            options_file, weight_file, max_batch_size=max_batch_size
        )

        self.vector_size = int(bilm.options["lstm"]["projection_dim"] * 2)
        self.n_layers = bilm.options["lstm"]["n_layers"] + 1

        # Get ops to compute the LM embeddings.
        self.sentence_embeddings_op = bilm(self.sentence_character_ids)

        return "The model is now loaded."

    def get_elmo_vectors(self, texts, warmup=True, layers="average"):
        """
        :param texts: list of sentences (lists of words)
        :param warmup: warm up the model before actual inference (by running it over the 1st batch)
        :param layers: ["top", "average", "all"].
        Yield the top ELMo layer, the average of all layers, or all layers as they are.
        :return: embedding tensor for all sentences
        (number of sentences by max word count by vector size;
        with layers="all", an extra layer dimension follows the sentence axis)
        """
        max_text_length = max([len(t) for t in texts])

        # Creating the matrix which will eventually contain all embeddings from all batches:
        if layers == "all":
            final_vectors = np.zeros((len(texts), self.n_layers, max_text_length, self.vector_size))
        else:
            final_vectors = np.zeros((len(texts), max_text_length, self.vector_size))

        with tf.compat.v1.Session() as sess:
            # Get an op to compute ELMo vectors (a function of the internal biLM layers)
            self.elmo_sentence_input = weight_layers("input", self.sentence_embeddings_op,
                                                     use_layers=layers)

            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())

            if warmup:
                self.warmup(sess, texts)

            # Running batches:
            chunk_counter = 0
            for chunk in divide_chunks(texts, self.batch_size):
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Texts in the current batch: {len(chunk)}")

                # Compute ELMo representations.
                elmo_vectors = sess.run(
                    self.elmo_sentence_input["weighted_op"],
                    feed_dict={self.sentence_character_ids: sentence_ids},
                )
                # Updating the full matrix:
                first_row = self.batch_size * chunk_counter
                last_row = first_row + elmo_vectors.shape[0]
                if layers == "all":
                    final_vectors[first_row:last_row, :, : elmo_vectors.shape[2], :] = elmo_vectors
                else:
                    final_vectors[first_row:last_row, : elmo_vectors.shape[1], :] = elmo_vectors
                chunk_counter += 1

            return final_vectors

    def get_elmo_vectors_no_context(self, texts, layers="average"):
        """
        :param texts: list of sentences (lists of words)
        :param layers: ["top", "average", "all"].
        Yield the top ELMo layer, the average of all layers, or all layers as they are.
        :return: embedding tensor for all sentences
        (number of sentences by max word count by vector size;
        with layers="all", an extra layer dimension follows the sentence axis)
        """
        max_text_length = max([len(t) for t in texts])

        # Creating the matrix which will eventually contain all embeddings from all batches:
        if layers == "all":
            final_vectors = np.zeros((len(texts), self.n_layers, max_text_length, self.vector_size))
        else:
            final_vectors = np.zeros((len(texts), max_text_length, self.vector_size))

        with tf.compat.v1.Session() as sess:
            # Get an op to compute ELMo vectors (a function of the internal biLM layers)
            self.elmo_sentence_input = weight_layers("input", self.sentence_embeddings_op,
                                                     use_layers=layers)

            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())


            # Running batches:
            chunk_counter = 0
            for chunk in divide_chunks(texts, self.batch_size):

                self.warmup(sess, texts)
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Texts in the current batch: {len(chunk)}")

                # Compute ELMo representations.
                elmo_vectors = sess.run(
                    self.elmo_sentence_input["weighted_op"],
                    feed_dict={self.sentence_character_ids: sentence_ids},
                )
                # Updating the full matrix:
                first_row = self.batch_size * chunk_counter
                last_row = first_row + elmo_vectors.shape[0]
                if layers == "all":
                    final_vectors[first_row:last_row, :, : elmo_vectors.shape[2], :] = elmo_vectors
                else:
                    final_vectors[first_row:last_row, : elmo_vectors.shape[1], :] = elmo_vectors
                chunk_counter += 1

            return final_vectors

    def get_elmo_vectors_shit(self, texts_in, layers="average"):
        """
        :param texts_in: list of sentences (lists of words)
        :param layers: ["top", "average", "all"].
        Yield the top ELMo layer, the average of all layers, or all layers as they are.
        :return: list with one embedding matrix per sentence
        ([number of used layers by] max word count by vector size)
        """
        result = []
        max_text_length = max([len(t) for t in texts_in])

        # Creating the matrix which will eventually contain all embeddings from all batches:
        if layers == "all":
            final_vectors = np.zeros((len(texts_in), self.n_layers, max_text_length, self.vector_size))
        else:
            final_vectors = np.zeros((len(texts_in), max_text_length, self.vector_size))

        with tf.compat.v1.Session() as sess:
            # Get an op to compute ELMo vectors (a function of the internal biLM layers)
            self.elmo_sentence_input = weight_layers("input", self.sentence_embeddings_op,
                                                     use_layers=layers)

            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())
            for texts_pre in texts_in:
                texts = [texts_pre]
                for i in range(10):
                    self.warmup(sess, texts)
                # Running batches:
                chunk_counter = 0
                for chunk in divide_chunks(texts, self.batch_size):


                    # Converting sentences to character ids:
                    sentence_ids = self.batcher.batch_sentences(chunk)
                    self.logger.info(f"Texts in the current batch: {len(chunk)}")

                    # Compute ELMo representations.
                    elmo_vectors = sess.run(
                        self.elmo_sentence_input["weighted_op"],
                        feed_dict={self.sentence_character_ids: sentence_ids},
                    )
                    # Updating the full matrix:
                    first_row = self.batch_size * chunk_counter
                    last_row = first_row + elmo_vectors.shape[0]
                    if layers == "all":
                        final_vectors[first_row:last_row, :, : elmo_vectors.shape[2], :] = elmo_vectors
                    else:
                        final_vectors[first_row:last_row, : elmo_vectors.shape[1], :] = elmo_vectors
                    chunk_counter += 1

                # Copy the row: final_vectors[0] is overwritten on the next iteration,
                # and appending a bare view would make all results equal to the last one.
                result.append(final_vectors[0].copy())
        return result


    def get_elmo_vector_average(self, texts, warmup=True, layers="average"):
        """
        :param texts: list of sentences (lists of words)
        :param warmup: warm up the model before actual inference (by running it over the 1st batch)
        :param layers: ["top", "average", "all"].
        Yield the top ELMo layer, the average of all layers, or all layers as they are.
        :return: matrix of averaged embeddings for all sentences
        """

        if layers == "all":
            average_vectors = np.zeros((len(texts), self.n_layers, self.vector_size))
        else:
            average_vectors = np.zeros((len(texts), self.vector_size))

        counter = 0

        with tf.compat.v1.Session() as sess:
            # Get an op to compute ELMo vectors (a function of the internal biLM layers)
            self.elmo_sentence_input = weight_layers("input", self.sentence_embeddings_op,
                                                     use_layers=layers)

            # It is necessary to initialize variables once before running inference.
            sess.run(tf.compat.v1.global_variables_initializer())

            if warmup:
                self.warmup(sess, texts)

            # Running batches:
            for chunk in divide_chunks(texts, self.batch_size):
                # Converting sentences to character ids:
                sentence_ids = self.batcher.batch_sentences(chunk)
                self.logger.info(f"Texts in the current batch: {len(chunk)}")

                # Compute ELMo representations.
                elmo_vectors = sess.run(
                    self.elmo_sentence_input["weighted_op"],
                    feed_dict={self.sentence_character_ids: sentence_ids},
                )

                self.logger.debug(f"ELMo sentence input shape: {elmo_vectors.shape}")

                if layers == "all":
                    elmo_vectors = elmo_vectors.reshape((len(chunk), elmo_vectors.shape[2],
                                                         self.n_layers, self.vector_size))
                for sentence in range(len(chunk)):
                    if layers == "all":
                        sent_vec = np.zeros((elmo_vectors.shape[1], self.n_layers,
                                             self.vector_size))
                    else:
                        sent_vec = np.zeros((elmo_vectors.shape[1], self.vector_size))
                    for nr, word_vec in enumerate(elmo_vectors[sentence]):
                        sent_vec[nr] = word_vec
                    semantic_fingerprint = np.sum(sent_vec, axis=0)
                    semantic_fingerprint = np.divide(
                        semantic_fingerprint, sent_vec.shape[0]
                    )
                    query_vec = semantic_fingerprint / np.linalg.norm(
                        semantic_fingerprint
                    )

                    average_vectors[counter] = query_vec
                    counter += 1

        return average_vectors

    def warmup(self, sess, texts):
        for chunk0 in divide_chunks(texts, self.batch_size):
            #self.logger.info(f"Warming up ELMo on {len(chunk0)} sentences...")
            sentence_ids = self.batcher.batch_sentences(chunk0)
            _ = sess.run(
                self.elmo_sentence_input["weighted_op"],
                feed_dict={self.sentence_character_ids: sentence_ids},
            )
            break
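
Finally, a hypothetical end-to-end sketch for the second ElmoModel variant defined above (the model path and the sentences are made up):

model = ElmoModel()
model.load('/path/to/elmo_model', max_batch_size=32)

sentences = [['this', 'is', 'a', 'test'], ['another', 'one']]

# Per-token contextual vectors: (n_sentences, max_sentence_length, vector_size)
token_vectors = model.get_elmo_vectors(sentences, layers='average')

# One averaged, L2-normalised vector per sentence: (n_sentences, vector_size)
sentence_vectors = model.get_elmo_vector_average(sentences, layers='average')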