Ejemplo n.º 1
0
def predict(sentence, path_to_model, path_to_data, text_column_name,
            vocab_size):
    """Classify a sentence with a saved model and return the rounded score.

    The vocabulary is rebuilt from the dataset on every call (through
    ``create_dictionary``), the sentence is converted to an integer
    sequence with it, post-padded with ``maxlen=140`` and fed to the
    loaded model.

    Args:
        sentence (str) : The sentence to be analysed.
        path_to_model (str) : The path to the saved model; it is passed to
            ``load_model`` unchanged.
        path_to_data (str) : The path to the CSV dataset, relative to the
            current working directory. A leading path separator is
            accepted and ignored.
        text_column_name (str) : The name of the column of the dataset
            with the text to be classified.
        vocab_size (int) : The vocabulary size forwarded to
            ``create_dictionary``.

    Returns:
        int : The first model output rounded to the nearest integer.
        1 - cyber abusive, 0 - not cyber abusive.
    """
    model = load_model(path_to_model)

    # Join instead of concatenating strings: plain ``os.getcwd() + path``
    # only produced a valid path when ``path_to_data`` started with a
    # separator. Stripping that separator keeps such callers working while
    # also accepting "data/file.csv" and "../data/file.csv".
    relative_path = path_to_data.lstrip(os.sep + (os.altsep or ""))
    data = pd.read_csv(os.path.join(os.getcwd(), relative_path))

    corpus_vocabulary = create_dictionary(data[text_column_name], vocab_size)

    # pd.Series wraps a single string into a one-element series, giving the
    # tokenizer the array of texts it expects.
    texts = pd.Series(sentence).values
    test_sequences = corpus_vocabulary.texts_to_sequences(texts)

    padded_test = keras.preprocessing.sequence.pad_sequences(test_sequences,
                                                             padding='post',
                                                             maxlen=140)

    # Only the first prediction is used; round() turns it into an int.
    sentiment_score = round(model.predict(padded_test).item(0))

    print(f"This sentence has a sentiment score of: {sentiment_score}")

    return sentiment_score
Ejemplo n.º 2
0
def create_dataset_vocabulary():
    """Build the vocabulary for the Dataturks example dataset.

    Reads ``../data/external/dataturks/example.csv`` (resolved against the
    current working directory) and passes its ``content`` column to
    ``create_dictionary`` with a vocabulary size of 10000.

    Returns:
        Whatever ``create_dictionary`` returns for that column.
    """
    dataset = pd.read_csv("../data/external/dataturks/example.csv")
    comments = dataset["content"]
    return create_dictionary(comments, 10000)
Ejemplo n.º 3
0
def create_dataset_vocabulary(read_in_dataset):
    """Build the vocabulary from an already loaded dataset.

    Args:
        read_in_dataset: A mapping-like dataset (presumably a pandas
            DataFrame - confirm against callers) exposing a ``content``
            column.

    Returns:
        Whatever ``create_dictionary`` returns for the ``content`` column
        with a vocabulary size of 10000.
    """
    comments = read_in_dataset['content']
    return create_dictionary(comments, 10000)
Ejemplo n.º 4
0
def build(path_to_data, text_column_name, label_column_name, hyperparameters):
    """Train the sentiment analyser on a CSV dataset, save it and return it.

    The function builds the corpus vocabulary, turns the training texts
    into post-padded integer sequences (``maxlen=140``), trains a small
    embedding + dense network on them and always saves the fitted model as
    ``../models/dataturks <timestamp>.h5`` relative to the current working
    directory.

    Args:
        path_to_data (str) : The path to the CSV dataset, relative to the
            current working directory. A leading path separator is
            accepted and ignored.
        text_column_name (str) : The name of the column of the dataset
            with the text to be classified.
        label_column_name (str) : The name of the column of labels.
        hyperparameters (dict) : The hyperparameters for the model. The
            keys read here are "vocab_size", "epoch", "batch_size" and
            "verbose".

    Returns:
        model : The sentiment analyser model, fit to the training data.
    """
    import datetime as dt

    # Join instead of concatenating strings: plain ``os.getcwd() + path``
    # only produced a valid path when ``path_to_data`` started with a
    # separator. Stripping that separator keeps such callers working.
    relative_path = path_to_data.lstrip(os.sep + (os.altsep or ""))
    data = pd.read_csv(os.path.join(os.getcwd(), relative_path))

    corpus_vocabulary = create_dictionary(data[text_column_name],
                                          hyperparameters["vocab_size"])

    # Only the training part of the split is used in this function.
    train, _ = split(data)

    x_train = train[text_column_name]
    y_train = train[label_column_name]

    train_sequences = corpus_vocabulary.texts_to_sequences(x_train.values)
    padded_train = keras.preprocessing.sequence.pad_sequences(train_sequences,
                                                              padding="post",
                                                              maxlen=140)

    model = keras.Sequential()
    model.add(keras.layers.Embedding(hyperparameters["vocab_size"], 40))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(4, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    model.summary()

    model.compile(optimizer="adam",
                  loss="binary_crossentropy",
                  metrics=["acc"])

    # The first quarter of the training rows is held out for validation.
    split_val = len(x_train) // 4

    x_val = padded_train[:split_val]
    partial_x_train = padded_train[split_val:]

    # Positional slicing, so labels stay aligned with the padded rows.
    y_val = y_train.iloc[:split_val]
    partial_y_train = y_train.iloc[split_val:]

    model.fit(
        partial_x_train,
        partial_y_train,
        epochs=hyperparameters["epoch"],
        batch_size=hyperparameters["batch_size"],
        validation_data=(x_val, y_val),
        verbose=hyperparameters["verbose"],
    )

    # The previous ``os.getcwd() + "../models/..."`` glued ".." onto the
    # last component of the working directory (e.g. "/proj../models"),
    # which is not the sibling "models" directory.
    # NOTE: the timestamp contains ':' characters, which are not valid in
    # Windows file names.
    now = str(dt.datetime.now())
    model_path = os.path.join(os.getcwd(), "..", "models",
                              "dataturks " + now + ".h5")
    model.save(model_path)
    print("Model saved.")

    return model