# NOTE: imports reconstructed from the calls below; the library versions
# (standalone Keras with a TF-1.x backend, gensim < 4.0) are assumed from the
# API names used.
import numpy as np

from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import BernoulliNB, ComplementNB, MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (AdaBoostClassifier, ExtraTreesClassifier,
                              GradientBoostingClassifier,
                              RandomForestClassifier, VotingClassifier)

from keras.models import Sequential
from keras.layers import (LSTM, Bidirectional, Conv1D, Dense, Dropout,
                          Embedding, GlobalMaxPooling1D, MaxPooling1D)
from keras.optimizers import Adam
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec

import utils
# create_embedding is a project-local helper used by the NN examples below
# (its definition is not shown here).


def LinearSVC_classification(train, test, train_labels, test_labels, res={}):
    """
    Trains and evaluates a LinearSVC.

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :param res: dict collecting each fitted model and its accuracy
    :return: accuracy score and fitted model --> also saves a report in the
             "Results" folder
    """
    print("Classifying with LinearSVC...")

    linear_svc = LinearSVC()
    linear_svc.fit(train, train_labels)

    prediction = linear_svc.predict(test)
    utils.report_and_confmat(test_labels, prediction, "LinearSVC")
    score = linear_svc.score(test, test_labels)

    res["LinearSVC"] = {
        "model": linear_svc,
        "accuracy": score,
        "name": "LinearSVC"
    }
    print("LinearSVC ended...")
    return score, linear_svc
def MultinomialNB_classification(train,
                                 test,
                                 train_labels,
                                 test_labels,
                                 res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    multiNB = MultinomialNB()
    multiNB.fit(train, train_labels)

    prediction = multiNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "MultinomialNB")
    score = multiNB.score(test, test_labels)

    res["MultinomialNB"] = {
        "model": multiNB,
        "accuracy": score,
        "name": "MultinomialNB"
    }
    print("Multinomial ended...")
    return score, multiNB
def ExtraTrees_classification(train,
                              test,
                              train_labels,
                              test_labels,
                              res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with ExtraTrees...")

    extra = ExtraTreesClassifier()
    extra.fit(train, train_labels)
    prediction = extra.predict(test)

    utils.report_and_confmat(test_labels, prediction, "ExtraTrees")
    score = extra.score(test, test_labels)

    res["ExtraTrees"] = {
        "model": extra,
        "accuracy": score,
        "name": "ExtraTreesClassifier"
    }
    print("ExtraTrees ended...")
    return score, extra
def AdaBoost_classification(train, test, train_labels, test_labels, res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with AdaBoost...")

    # Use the linear SVC as base estimator because it has worked best so far.
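    # LinearSVC provides no predict_proba, so the discrete 'SAMME' variant
    # (which only needs hard class predictions) is required here instead of
    # the default 'SAMME.R'.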
    Linsvc = LinearSVC()

    adab = AdaBoostClassifier(base_estimator=Linsvc,
                              algorithm='SAMME',
                              n_estimators=50)
    adab.fit(train, train_labels)

    prediction = adab.predict(test)
    utils.report_and_confmat(test_labels, prediction, "AdaBoost")
    score = adab.score(test, test_labels)
    print("Adaboost ended...")
    res["AdaBoostClassifier"] = {
        "model": adab,
        "accuracy": score,
        "name": "AdaBoostClassifier"
    }

    return score, adab
def LogisticRegression_classification(train,
                                      test,
                                      train_labels,
                                      test_labels,
                                      res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """

    print("Classifying with LogisticRegression...")

    # TODO: compare the different solvers ('newton-cg' is one of those that
    # support the multinomial objective).
    reg = LogisticRegression(max_iter=250,
                             multi_class='multinomial',
                             solver='newton-cg')
    reg.fit(train, train_labels)

    prediction = reg.predict(test)
    utils.report_and_confmat(test_labels, prediction, "LogisticReg")
    score = reg.score(test, test_labels)

    res["LogisticRegression"] = {
        "model": reg,
        "accuracy": score,
        "name": "LogisticRegression"
    }
    print("Logistic Regression ended...")
    return score, reg
def random_forest_classification(train,
                                 test,
                                 train_labels,
                                 test_labels,
                                 res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data iterable/list
    :param train_labels: training labels
    :param test_labels: testing labels
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with Random Forest Classifier...")
    rand = RandomForestClassifier(n_estimators=70, max_depth=None)
    rand.fit(train, train_labels)

    prediction = rand.predict(test)
    utils.report_and_confmat(test_labels, prediction, "Random Forest")
    score = rand.score(test, test_labels)

    res["RandomForestClassifier"] = {
        "model": rand,
        "accuracy": score,
        "name": "RandomForestClassifier"
    }
    print("RandomForset ended...")
    return score, rand
def GradientBoosting_classification(train,
                                    test,
                                    train_labels,
                                    test_labels,
                                    res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with Gradient Boosting...")

    gradb = GradientBoostingClassifier(n_estimators=100)
    gradb.fit(train, train_labels)

    prediction = gradb.predict(test)
    utils.report_and_confmat(test_labels, prediction, "GradientBoosting")
    score = gradb.score(test, test_labels)
    res["GradientBoostingClassifier"] = {
        "model": gradb,
        "accuracy": score,
        "name": "GradientBoostingClassifier"
    }
    print("GradientBoosting ended...")

    return score, gradb
def BernoulliNB_classification(train, test, train_labels, test_labels, res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with Bernoulli Nive Bayes...")

    bernNB = BernoulliNB(alpha=0.7)
    bernNB.fit(train, train_labels)

    prediction = bernNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "BernoulliNB")
    score = bernNB.score(test, test_labels)
    res["BernoulliNB"] = {
        "model": bernNB,
        "accuracy": score,
        "name": "BernoulliNB"
    }
    print("Bernoulli ended...")

    return score, bernNB
def ComplementNB_classification(train,
                                test,
                                train_labels,
                                test_labels,
                                res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with Complement Nive Bayes...")

    complNB = ComplementNB()
    complNB.fit(train, train_labels)

    prediction = complNB.predict(test)
    utils.report_and_confmat(test_labels, prediction, "ComplementNB")
    score = complNB.score(test, test_labels)

    res["ComplementNB"] = {
        "model": complNB,
        "accuracy": score,
        "name": "ComplementNB"
    }
    print("Complement ended...")
    return score, complNB
def lstm_classification(train,
                        valid,
                        labels_train,
                        labels_valid,
                        save_path,
                        num_classes,
                        num_epochs=10):
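    """Trains an LSTM on word embeddings and reports on the validation data."""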

    train_lab = utils.labels_for_NN(labels_train)
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 750
    # create_embedding (project helper) returns the embedding matrix, the
    # vocabulary, and the padded index sequences for train and validation data.
    embedding_matrix, vocab, train_we, test_we = create_embedding(train, valid)
    VOCAB_SIZE = len(vocab)

    model = Sequential()
    model.add(
        Embedding(VOCAB_SIZE,
                  EMBEDDING_DIM,
                  input_length=MAX_SEQUENCE_LENGTH,
                  weights=[embedding_matrix]))
    model.add(LSTM(512))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
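    # categorical_crossentropy (used below) expects one-hot targets, which is
    # presumably what labels_for_NN produced above.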
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    ## Fit train data
    # NOTE: batch_size is assumed here; it was presumably a module-level
    # constant in the original (80 matches the word2vec variant below).
    batch_size = 80
    history = model.fit(train_we,
                        np.array(train_lab),
                        validation_split=0.2,
                        epochs=num_epochs,
                        batch_size=batch_size)
    utils.plot_history(history)

    # If the TF-IDF matrix or the bag-of-words features do not work well, use
    # this instead (requires keras.preprocessing.text.Tokenizer):
    # tokenizer = Tokenizer(num_words=VOCAB_SIZE)
    # sequences = tokenizer.texts_to_sequences(valid)
    # data_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    # list_prediction_proba = model.predict(data_test)

    list_prediction_proba = model.predict(test_we)

    # Predicted class per sample: index of the highest probability (first
    # occurrence if there are ties).
    predizione = np.argmax(list_prediction_proba, axis=1)

    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "TINY_lstm_" + str(EMBEDDING_DIM))
def conv_classification(train,
                        valid,
                        labels_train,
                        labels_valid,
                        save_path,
                        num_classes,
                        num_epochs=10):
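    """Trains a 1-D CNN on word embeddings and reports on the validation data."""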

    train_lab = utils.labels_for_NN(labels_train)
    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 750
    embedding_matrix, vocab, train_we, test_we = create_embedding(train, valid)
    VOCAB_SIZE = len(vocab)

    model = Sequential()
    model.add(
        Embedding(VOCAB_SIZE,
                  EMBEDDING_DIM,
                  input_length=MAX_SEQUENCE_LENGTH,
                  weights=[embedding_matrix]))
    model.add(Dropout(0.2))
    model.add(Conv1D(512, 7, activation='relu'))
    model.add(GlobalMaxPooling1D())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))

    # NB binary classification --> binary_crossentropy,
    #    multi-class classification --> categorical_crossentropy
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # NOTE: batch_size is assumed (see lstm_classification above).
    batch_size = 80
    history = model.fit(train_we,
                        np.array(train_lab),
                        validation_split=0.2,
                        epochs=num_epochs,
                        batch_size=batch_size)
    utils.plot_history(history)

    list_prediction_proba = model.predict(test_we)

    # Predicted class per sample (argmax of the probabilities).
    predizione = np.argmax(list_prediction_proba, axis=1)

    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "TINY_conv_1_layer" + str(EMBEDDING_DIM))
def bi_lstm_classification(train,
                           valid,
                           labels_train,
                           labels_valid,
                           save_path,
                           num_classes,
                           num_epochs=10):
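    """Trains a bidirectional LSTM on word embeddings and reports on the validation data."""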

    train_lab = utils.labels_for_NN(labels_train)

    EMBEDDING_DIM = 300
    MAX_SEQUENCE_LENGTH = 750
    embedding_matrix, vocab, train_we, test_we = create_embedding(train, valid)
    VOCAB_SIZE = len(vocab)

    model = Sequential()
    model.add(
        Embedding(VOCAB_SIZE,
                  EMBEDDING_DIM,
                  input_length=MAX_SEQUENCE_LENGTH,
                  weights=[embedding_matrix]))
    model.add(Bidirectional(LSTM(512, return_sequences=False)))
    model.add(Dropout(0.2))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    ## Fit train data
    # NOTE: batch_size is assumed (see lstm_classification above).
    batch_size = 80
    history = model.fit(train_we,
                        np.array(train_lab),
                        validation_split=0.2,
                        epochs=num_epochs,
                        batch_size=batch_size)
    utils.plot_history(history)

    list_prediction_proba = model.predict(test_we)

    # Predicted class per sample (argmax of the probabilities).
    predizione = np.argmax(list_prediction_proba, axis=1)

    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "TINY_bilstm" + str(EMBEDDING_DIM))
def VotingClassifier_classification(train,
                                    test,
                                    train_labels,
                                    test_labels,
                                    res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with Voting classifier...")

    cl1 = LogisticRegression(max_iter=250, multi_class='auto')
    cl6 = MultinomialNB()
    cl3 = AdaBoostClassifier(base_estimator=cl1,
                             algorithm='SAMME',
                             n_estimators=150)
    cl4 = GradientBoostingClassifier()
    cl5 = ComplementNB()
    cl8 = RandomForestClassifier(n_estimators=70, max_depth=None)
    cl9 = ExtraTreesClassifier()

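    # Soft voting averages the estimators' predicted class probabilities, so
    # every estimator here must implement predict_proba (LinearSVC, for
    # instance, does not, which is why it is left out).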
    vote = VotingClassifier(estimators=[('LogisticReg', cl1),
                                        ('AdaBoost', cl3), ('GradBoost', cl4),
                                        ('ComplementNB', cl5),
                                        ('MultinomialNB', cl6),
                                        ('RandomForest', cl8),
                                        ('ExtraTree', cl9)],
                            voting='soft')
    vote.fit(train, train_labels)

    prediction = vote.predict(test)
    utils.report_and_confmat(test_labels, prediction, "VotingClass")
    score = vote.score(test, test_labels)
    print("Voting ended...")
    res["VotingClassifier"] = {
        "model": vote,
        "accuracy": score,
        "name": "VotingClassifier"
    }

    return score, vote
def SVC_classification(train, test, train_labels, test_labels, res={}):
    """

    :param train: training data, iterable/list
    :param test: testing data, iterable/list
    :param train_labels: training labels, iterable/list
    :param test_labels: testing labels, iterable/list
    :return: / --> Saves data in folder "Results"
    """
    print("Classifying with SVC...")

    svc = SVC(kernel='poly', gamma='scale')
    svc.fit(train, train_labels)

    prediction = svc.predict(test)
    utils.report_and_confmat(test_labels, prediction, "SVC")
    score = svc.score(test, test_labels)

    res["SVC"] = {"model": svc, "accuracy": score, "name": "SVC"}
    print("SVC ended...")
    return score, svc
def word2vec_classification(train,
                            valid,
                            labels_train,
                            labels_valid,
                            save_path,
                            num_classes,
                            num_epochs=10):
    """
    Calls all the classifiers functions in order to choose and save the best one.
    :param train: training data, iterable/list
    :param valid: testing data, iterable/list
    :param label_train: training labels, iterable/list
    :param label_test: testing labels, iterable/list
    :param num_classes: number of classes in training data, integer
    :param num_epochs=10: number of epochs to perform, integer
    :param save_path: (fixed to Models directory)
    :return: /
    """
    train_lab = utils.labels_for_NN(labels_train)

    train_tokens = []
    for sentence in train:
        train_tokens.append(sentence.split())

    test_tokens = []
    for sentence in valid:
        test_tokens.append(sentence.split())

    # Dimension of the embedding vector representing the words
    EMBEDDING_DIM = 300

    # USING GENSIM: it needs a list of training TOKENS and builds the
    # vocabulary itself. NOTE: iter= and size= are the gensim 3.x argument
    # names (epochs= and vector_size= in gensim >= 4.0).
    model = word2vec.Word2Vec(train_tokens,
                              iter=10,
                              min_count=10,
                              size=EMBEDDING_DIM,
                              workers=4)
    VOCAB_SIZE = len(model.wv.vocab)
    MAX_SEQUENCE_LENGTH = 750

    # Compute training embedding
    train_sequences = utils.convert_data_to_index(train_tokens, model.wv)
    test_sequences = utils.convert_data_to_index(test_tokens, model.wv)

    # Pad the vectors so they're all the same length
    train_data = pad_sequences(train_sequences,
                               maxlen=MAX_SEQUENCE_LENGTH,
                               padding="pre",
                               truncating="post")
    test_data = pad_sequences(test_sequences,
                              maxlen=MAX_SEQUENCE_LENGTH,
                              padding="pre",
                              truncating="post")

    # Getting the embedding matrix, a lookup table that translates a known word into a vector
    embedding_matrix = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
    for i in range(len(model.wv.vocab)):
        embedding_vector = model.wv[model.wv.index2word[i]]
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    # Build the network: a first convolutional part followed by a recurrent
    # part (LSTM). NB: the network is very small and basic because of strict
    # system requirements.
    model = Sequential()
    model.add(
        Embedding(VOCAB_SIZE,
                  EMBEDDING_DIM,
                  input_length=MAX_SEQUENCE_LENGTH,
                  weights=[embedding_matrix]))
    model.add(Conv1D(512, 5, activation='sigmoid'))
    # A *global* max pooling here would flatten the sequence to 2-D and break
    # the LSTM input; local max pooling keeps the (batch, steps, features)
    # shape. pool_size=5 is an assumed value.
    model.add(MaxPooling1D(pool_size=5))
    model.add(Bidirectional(LSTM(600, return_sequences=False)))
    model.add(Dense(100, activation='sigmoid'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy',
                  optimizer=Adam(lr=0.002,
                                 clipnorm=.25,
                                 beta_1=0.7,
                                 beta_2=0.99),
                  metrics=['acc'])

    # Train the network
    model.fit(train_data,
              train_lab,
              validation_split=0.2,
              epochs=num_epochs,
              batch_size=80)

    # Make predictions
    list_prediction_proba = model.predict(test_data)

    # Compute report and confusion matrix
    # Predicted class per sample: index of the highest probability (first
    # occurrence if there are ties).
    predizione = np.argmax(list_prediction_proba, axis=1)
    utils.report_and_confmat(labels_train, labels_valid, predizione, save_path,
                             "word2vec_")