Example #1
def get_RandomForest_model(model_name,
                           embedding_dim=embedding.embedding_dim,
                           train_data=None,
                           load_model=True,
                           train_model=False,
                           save_model=True,
                           test_data=None,
                           build_params=None,
                           train_params=None,
                           **kwargs):
    """
       Creates a new instance of RF_classi
       Parameters:
       :param model_name: (str) the name of the model to create, or the name of the model to load.
       :param embedding_dim: (int) dimension of the embedding space
       :param train_data : (np.ndarray) the training data in a single matrix (like the one produced by
               the embedding.pipeline.build_embedding_matrix method
       :param load_model: (bool) whether to load the model from file.
       :param train_model: (bool) whether to train the model.
       :param save_model: (bool) whether to save the model
       :param test_data: (np.ndarray) if not None, the model will be tested against this test data.
       :param build_params: (dict) dictionary of parameters to pass to build the model
               Example : {c:1,
                         kernel:'rbf',
                         }
       :param train_params: (dict) dictionary of parameters to pass to build the model
               Example : {validation_split:0.2}
       :return: an instance of RF_classi class
       """

    vocabulary = kwargs.get("vocabulary") or standard_vocab_name
    vocab_dim = get_vocab_dimension(vocabulary)

    load_embedding = kwargs.get("load_embedding", True)
    embedding_name = kwargs.get("embedding_location", "glove_emb.npz")
    generator_mode = kwargs.get("generator_mode", False)
    max_len = kwargs.get("max_len", 100)
    glove_embedding, embedding_matrix = None, None
    if load_embedding:
        glove_embedding = get_glove_embedding(
            vocabulary_file=vocabulary,
            load_from_file=True,
            # no need to reload the Stanford embedding when the embedding matrix is loaded from file
            load_Stanford=False,
            file_name=embedding_name,
            train=False,
            save=False)
        embedding_matrix = glove_embedding.embedding_matrix

    ourRF = RF_classi(embedding_dim, model_name)

    ourRF.build(**(build_params or {}))
    if load_model: ourRF.load()

    if train_model:
        x_train, y_train = None, None
        if not generator_mode:
            x_train = train_data[:, 0:-1]
            y_train = train_data[:, -1]

        generator_params = {
            "embedding": glove_embedding,
            "input_files": [
                replaced_train_negative_location,
                replaced_train_positive_location
            ],
            "input_entries": full_dimension,
            "max_len": max_len
        }

        ourRF.train(x_train,
                    y_train,
                    generator_mode=generator_mode,
                    **generator_params,
                    **(train_params or {}))
    if save_model: ourRF.save()

    if test_data is not None:
        x_test = test_data[:, 0:-1]
        y_test = test_data[:, -1]
        ourRF.test(x_test, y_test)
    return ourRF
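
A minimal usage sketch for this factory. The model name, the toy data matrix, and the empty parameter dicts are illustrative assumptions; in practice train_data comes from embedding.pipeline.build_embedding_matrix:

import numpy as np

# Toy stand-in for an embedding matrix: 200 rows of 50 features plus a final label column.
train_matrix = np.hstack([np.random.rand(200, 50),
                          np.random.randint(0, 2, size=(200, 1))])

rf = get_RandomForest_model("rf_demo",              # hypothetical model name
                            embedding_dim=50,
                            train_data=train_matrix,
                            load_model=False,
                            train_model=True,
                            save_model=False,
                            build_params={},        # fall back to RF_classi defaults
                            train_params={})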
Example #2
def get_convolutional_model(model_name,
                            embedding_dim=embedding.embedding_dim,
                            train_data=None,
                            load_model=False,
                            train_model=False,
                            save_model=False,
                            test_data=None,
                            build_params=None,
                            train_params=None,
                            **kwargs):
    """
    Creates a new instance of convolutional_NN
    Parameters:
    :param model_name: (str) the name of the model to create, or the name of the model to load.
    :param embedding_dim: (int) dimension of the embedding space
    :param train_data : (np.ndarray) the training data in a single matrix (like the one produced by
            the embedding.pipeline.build_embedding_matrix method
    :param load_model: (bool) whether to load the model from file.
    :param train_model: (bool) whether to train the model.
    :param save_model: (bool) whether to save the model
    :param test_data: (np.ndarray) if not None, the model will be tested against this test data.
    :param build_params: (dict) dictionary of parameters to pass to build the model
    :param train_params: (dict) dictionary of parameters to pass to build the model
    :param kwargs: additional arguments
        Arguments accepted:
        - :arg load_embedding: (bool) whether to load an embedding matrix into the classifier
            (if false, the classifier will learn the embedding from scratch)
        - :arg embedding_location: (str) - only used if the above parameter is true- path to the
            file that stores the embedding matrix
        - :arg vocabulary: (str) vocabulary in use
    :return: an instance of Vanilla_NN class
    """
    vocabulary = kwargs.get("vocabulary") or standard_vocab_name
    vocab_dim = get_vocab_dimension(vocabulary)
    # --------------------
    # Opening pre-trained embedding matrix
    load_embedding = kwargs.get("load_embedding")
    embedding_name = kwargs.get("embedding_location", "glove_emb.npz")
    generator_mode = kwargs.get("generator_mode", False)
    glove_embedding, embedding_matrix = None, None
    if load_embedding:
        glove_embedding = get_glove_embedding(
            vocabulary_file=vocabulary,
            load_from_file=True,
            # no need to reload the Stanford embedding when the embedding matrix is loaded from file
            load_Stanford=False,
            file_name=embedding_name,
            train=False,
            save=False)
        embedding_matrix = glove_embedding.embedding_matrix
    # -------------------
    # Building the model
    convolutional = convolutional_NN(embedding_dimension=embedding_dim,
                                     vocabulary_dimension=vocab_dim,
                                     name=model_name,
                                     embedding_matrix=embedding_matrix)
    convolutional.build(**(build_params or {}))
    if load_model: convolutional.load()
    # ----------------
    # Training, testing and saving
    if train_model:
        x_train, y_train = None, None
        if not generator_mode:
            x_train = train_data[:, 0:-1]
            y_train = train_data[:, -1]

        convolutional.train(x_train,
                            y_train,
                            generator_mode=generator_mode,
                            **(train_params or {}))

    if save_model: convolutional.save()
    if test_data is not None:
        idx2word = load_inverse_vocab(vocabulary)
        x_test = test_data[:, 0:-1]
        y_test = test_data[:, -1]
        convolutional.test(x_test, y_test, idx2word=idx2word)

    return convolutional
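
A hedged usage sketch for the convolutional factory, re-using a data matrix like train_matrix from the Example #1 sketch. It assumes a GloVe matrix was previously saved as "glove_emb.npz" (the function's default) and that an "epochs" key is accepted by convolutional_NN.train; both are assumptions, not confirmed by the source:

cnn = get_convolutional_model("cnn_demo",                 # hypothetical model name
                              train_data=train_matrix,    # see the Example #1 sketch
                              train_model=True,
                              save_model=True,
                              build_params={},            # rely on convolutional_NN defaults
                              train_params={"epochs": 5}, # assumed .train keyword
                              load_embedding=True,
                              embedding_location="glove_emb.npz",
                              generator_mode=False)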
Example #3
def get_ET_model(model_name,
                 train_data=None,
                 load_model=False,
                 train_model=False,
                 save_model=False,
                 test_data=None,
                 build_params=None,
                 train_params=None,
                 **kwargs):
    """
    Creates a new instance of convolutional_NN
    Parameters:
    :param model_name: (str) the name of the model to create, or the name of the model to load.
    :param train_data : (np.ndarray) the training data in a single matrix (like the one produced by
            the embedding.pipeline.build_embedding_matrix method
    :param load_model: (bool) whether to load the model from file.
    :param train_model: (bool) whether to train the model.
    :param save_model: (bool) whether to save the model
    :param test_data: (np.ndarray) if not None, the model will be tested against this test data.
    :param build_params: (dict) dictionary of parameters to pass to build the model
    :param train_params: (dict) dictionary of parameters to pass to build the model
    """
    number_of_embeddings = kwargs.get("number_of_embeddings")
    vocabularies = kwargs.get("vocabularies")
    embedding_locations = kwargs.get("embedding_locations")
    assert vocabularies is not None and number_of_embeddings is not None and embedding_locations is not None, \
        "Usage error: to use the (m)ET network you need to specify the embeddings and vocabularies to use."
    # --------------------
    # Opening pre-trained embedding matrix
    embeddings = []
    for i in range(number_of_embeddings):
        # Note: get_glove_embedding can load several different embeddings, one per vocabulary
        emb = get_glove_embedding(
            vocabulary_file=vocabularies[i],
            load_from_file=True,
            # no need to reload the Stanford embedding when the embedding matrix is loaded from file
            load_Stanford=False,
            file_name=embedding_locations[i],
            train=False,
            save=False)
        embedding_matrix = emb.embedding_matrix
        embeddings.append(embedding_matrix)
    # -------------------
    # Building the model
    if number_of_embeddings == 1:
        my_transformer = etransformer_NN(
            embedding_dimension=embeddings[0].shape[1],
            vocabulary_dimension=get_vocab_dimension(vocabularies[0]),
            embedding_matrices=embeddings[0],
            number_of_embeddings=number_of_embeddings,
            name=model_name)
    else:
        my_transformer = metransformer_NN(
            embedding_dimension=-1,
            embedding_matrices=embeddings,
            number_of_embeddings=number_of_embeddings,
            name=model_name)
    my_transformer.build(**(build_params or {}))
    if load_model: my_transformer.load()
    # ----------------
    # Training, testing and saving
    if train_model:
        x_train = train_data[:, 0:-1]
        y_train = train_data[:, -1]
        my_transformer.train(x_train,
                             y_train,
                             generator_mode=False,
                             **(train_params or {}))
    if save_model: my_transformer.save()
    if test_data is not None:
        idx2word = None
        if number_of_embeddings == 1:
            idx2word = load_inverse_vocab(vocabularies[0])
        x_test = test_data[:, 0:-1]
        y_test = test_data[:, -1]
        my_transformer.test(x_test, y_test, idx2word=idx2word)
    return my_transformer
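
The number_of_embeddings switch is easiest to see in a call. A sketch assuming two vocabularies with matching saved embedding files (all file names hypothetical), which routes to metransformer_NN:

et = get_ET_model("met_demo",                             # hypothetical model name
                  train_data=train_matrix,                # see the Example #1 sketch
                  train_model=True,
                  build_params={},
                  train_params={},
                  number_of_embeddings=2,                 # > 1 selects metransformer_NN
                  vocabularies=["vocab_a.pkl", "vocab_b.pkl"],      # hypothetical files
                  embedding_locations=["emb_a.npz", "emb_b.npz"])   # hypothetical files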
Example #4
def get_recurrent_model(model_name,
                        embedding_dim=embedding.embedding_dim,
                        train_data=None,
                        load_model=False,
                        train_model=False,
                        save_model=False,
                        test_data=None,
                        build_params=None,
                        train_params=None,
                        **kwargs):
    """
    Creates a new instance of recurrent_NN
    Parameters:
    :param model_name: (str) the name of the model to create, or the name of the model to load.
    :param embedding_dim: (int) dimension of the embedding space
    :param train_data : (np.ndarray) the training data in a single matrix (like the one produced by
            the embedding.pipeline.build_embedding_matrix method
    :param load_model: (bool) whether to load the model from file.
    :param train_model: (bool) whether to train the model.
    :param save_model: (bool) whether to save the model
    :param test_data: (np.ndarray) if not None, the model will be tested against this test data.
    :param build_params: (dict) dictionary of parameters to pass to build the model
            ```
            >>> Example : build_params = {"activation":'relu', \
                                            "loss":"binary_crossentropy",\
                                            "metrics":None,\
                                            "cell_type":"LSTM",\
                                            "num_layers":3,\
                                            "hidden_size":64,\
                                            "train_embedding":False,\
                                            "use_attention":False, \
                                            "optimizer":"rmsprop"}
            ```
    :param train_params: (dict) dictionary of parameters to pass to build the model
            >>> Example : {"epochs":10, \
                            "batch_size":32, \
                            "validation_split":0.2}

    :param kwargs: additional arguments
        Arguments accepted:
        - :arg load_embedding: (bool) whether to load an embedding matrix into the classifier
            (if false, the classifier will learn the embedding from scratch)
        - :arg embedding_location: (str) - only used if the above parameter is true- path to the
            file that stores the embedding matrix
        - :arg vocabulary: (str) vocabulary in use
    :return: an instance of Vanilla_NN class
    """
    vocabulary = kwargs.get("vocabulary") or standard_vocab_name
    vocab_dim = get_vocab_dimension(vocabulary)
    # --------------------
    # Opening pre-trained embedding matrix
    load_embedding = kwargs.get("load_embedding")
    embedding_name = kwargs.get("embedding_location", "glove_emb.npz")
    generator_mode = kwargs.get("generator_mode", True)
    max_len = kwargs.get("max_len", 100)
    glove_embedding, embedding_matrix = None, None
    if load_embedding:
        glove_embedding = get_glove_embedding(
            vocabulary_file=vocabulary,
            load_from_file=True,
            # no need to reload the Stanford embedding when the embedding matrix is loaded from file
            load_Stanford=False,
            file_name=embedding_name,
            train=False,
            save=False)
        embedding_matrix = glove_embedding.embedding_matrix
    # -------------------
    # Building the model
    build_params = build_params or {}
    use_attention = build_params.get("use_attention")
    recurrent_fun = attention_NN if use_attention else recurrent_NN
    recurrent = recurrent_fun(embedding_dimension=embedding_dim,
                              vocabulary_dimension=vocab_dim,
                              name=model_name,
                              embedding_matrix=embedding_matrix)
    recurrent.build(**build_params)
    if load_model: recurrent.load()
    # ----------------
    # Training, testing and saving
    if train_model:
        x_train, y_train = None, None
        if not generator_mode:
            x_train = train_data[:, 0:-1]
            y_train = train_data[:, -1]

        generator_params = {
            "embedding": glove_embedding,
            "input_files": [train_negative_location, train_positive_location],
            "input_entries": full_dimension,
            "max_len": max_len
        }

        recurrent.train(x_train,
                        y_train,
                        generator_mode=generator_mode,
                        **generator_params,
                        **(train_params or {}))
    if save_model: recurrent.save()
    if test_data is not None:
        idx2word = load_inverse_vocab(vocabulary)
        x_test = test_data[:, 0:-1]
        y_test = test_data[:, -1]
        recurrent.test(x_test, y_test, idx2word=idx2word)
    # ---------------
    # Visualization
    visualize_attention = kwargs.get("visualize_attention", train_model)
    sentence_pos = "I'm loving this project, let's keep on working guys!"
    sentence_neg = "I hate bugs, but not as much as I hate cooking."
    if visualize_attention and use_attention:
        # Note: visualization can only be used with the attention model
        # 1. get the vectorised representation of the sentence
        sentence_pos_vec = no_embeddings(sentence_pos,
                                         embedding=glove_embedding)
        sentence_neg_vec = no_embeddings(sentence_neg,
                                         embedding=glove_embedding)
        # 2. get the attention plot
        recurrent.visualize_attention(sentence_pos, sentence_pos_vec)
        recurrent.visualize_attention(sentence_neg, sentence_neg_vec)

    return recurrent
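
A sketch of building the attention variant and triggering the attention plots. build_params follows the docstring example; the model name, epoch count, and data matrix are illustrative assumptions:

rnn = get_recurrent_model("att_demo",                     # hypothetical model name
                          train_data=train_matrix,        # see the Example #1 sketch
                          train_model=True,
                          build_params={"cell_type": "LSTM",
                                        "num_layers": 2,
                                        "hidden_size": 64,
                                        "use_attention": True},  # selects attention_NN
                          train_params={"epochs": 10},
                          load_embedding=True,            # the attention plots need glove_embedding
                          generator_mode=False,
                          visualize_attention=True)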