Example #1
0
def train_mlp():
    """Train and evaluate an MLP using leave-one-project-out splits.

    The dataset is loaded with ``load_from_file``. For every distinct value
    of the ``project`` column, a fresh model is trained on the comments of
    all *other* projects; the held-out project's rows are split 50/50 into
    an evaluation set and a validation set. A report produced by
    ``classification`` is printed for each held-out project.

    Returns:
        list: The objects returned by ``model.fit``, one per held-out
        project, in the order given by ``np.unique(df['project'])``.
    """
    df = load_from_file('technical_debt_dataset.csv', amount=100000)

    # Width of the one-hot encoding over the *whole* dataset. Every fold
    # must use this width so labels always match the output layer.
    num_classes = to_categorical(df['category_id']).shape[1]
    histories = []

    for project in np.unique(df['project']):
        print("Running for test project " + str(project))
        train_df = df[df['project'] != project]
        test_df = df[df['project'] == project]
        print(f"Train data = {len(train_df)}  |  Test data = {len(test_df)}")

        # Fit the vectorizer on training text only; its vocabulary size
        # determines the input width of the network.
        # NOTE(review): the argument 10 is presumably min_df -- confirm
        # against create_vectorizer.
        tfidf = create_vectorizer(10)
        tfidf.fit(train_df['commenttext'])
        X_v = tfidf.transform(train_df['commenttext']).toarray()
        X_test = tfidf.transform(test_df['commenttext']).toarray()
        input_dim = X_v.shape[1]

        # One-hot encode with an explicit class count. Without it the width
        # is inferred from the largest label present in this fold, which
        # breaks whenever a fold lacks the highest category id.
        y = to_categorical(train_df['category_id'], num_classes=num_classes)
        y_test = to_categorical(test_df['category_id'],
                                num_classes=num_classes)

        # Half of the held-out project is used for final evaluation, the
        # other half as validation data during fitting.
        X_test, X_val, y_test, y_val = train_test_split(X_test,
                                                        y_test,
                                                        random_state=42,
                                                        train_size=0.5)

        model = Sequential()
        model.add(layers.Dense(25, activation='elu', input_dim=input_dim))
        # NOTE(review): 'sigmoid' combined with categorical_crossentropy is
        # unusual for single-label multi-class output ('softmax' is the
        # conventional choice). Left unchanged so results stay comparable
        # -- confirm intent.
        model.add(layers.Dense(num_classes, activation='sigmoid'))

        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=[get_f1])

        print("Begin model fitting")
        history = model.fit(X_v,
                            y,
                            epochs=15,
                            verbose=False,
                            batch_size=35,
                            validation_data=(X_val, y_val))
        histories.append(history)
        print("Model fitting complete")

        report = classification(model, X_test, y_test)
        print(report)

    return histories
Example #2
0
def train_naive_bayes(X_train, y_train, min_df):
    """Fit a vectorizer + multinomial naive Bayes pipeline.

    Args:
        X_train: Training samples handed to the pipeline's ``fit``.
        y_train: Training labels handed to the pipeline's ``fit``.
        min_df: Forwarded unchanged to ``create_vectorizer``.

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'clf'``.
    """
    from sklearn.pipeline import Pipeline

    ## Vectorizer first, classifier second.
    steps = [
        ('tfidf', create_vectorizer(min_df)),
        ('clf', MultinomialNB()),
    ]
    classifier = Pipeline(steps)
    classifier.fit(X_train, y_train)
    return classifier
Example #3
0
def train_decision_tree(X_train, y_train, min_df):
    """Fit a vectorizer + decision tree pipeline.

    Args:
        X_train: Training samples handed to the pipeline's ``fit``.
        y_train: Training labels handed to the pipeline's ``fit``.
        min_df: Forwarded unchanged to ``create_vectorizer``.

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'dtc'``.
    """
    from sklearn.pipeline import Pipeline

    ## Vectorizer first, classifier second.
    vectorizer_step = ('tfidf', create_vectorizer(min_df))
    tree_step = ('dtc', DecisionTreeClassifier())

    classifier = Pipeline([vectorizer_step, tree_step])
    classifier.fit(X_train, y_train)
    return classifier
Example #4
0
def train_logistic_regression(X_train, y_train, min_df):
    """Fit a vectorizer + logistic regression pipeline.

    Args:
        X_train: Training samples handed to the pipeline's ``fit``.
        y_train: Training labels handed to the pipeline's ``fit``.
        min_df: Forwarded unchanged to ``create_vectorizer``.

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'lg'``.
    """
    from sklearn.pipeline import Pipeline

    ## Vectorizer first, classifier second. The classifier is seeded
    ## with random_state=0.
    steps = [
        ('tfidf', create_vectorizer(min_df)),
        ('lg', LogisticRegression(random_state=0)),
    ]
    classifier = Pipeline(steps)
    classifier.fit(X_train, y_train)
    return classifier
Example #5
0
def train_knn(X_train, y_train, amount_neighbors, min_df):
    """Fit a vectorizer + k-nearest-neighbours pipeline.

    Args:
        X_train: Training samples handed to the pipeline's ``fit``.
        y_train: Training labels handed to the pipeline's ``fit``.
        amount_neighbors: Passed as ``n_neighbors`` to
            ``KNeighborsClassifier``.
        min_df: Forwarded unchanged to ``create_vectorizer``.

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'knn'``.
    """
    from sklearn.pipeline import Pipeline

    ## Vectorizer first, classifier second.
    vectorizer = create_vectorizer(min_df)
    neighbours = KNeighborsClassifier(n_neighbors=amount_neighbors)

    classifier = Pipeline([('tfidf', vectorizer), ('knn', neighbours)])
    classifier.fit(X_train, y_train)
    return classifier
Example #6
0
File: SVM.py Project: Oli-26/Thesis
def train_svm(X_train, y_train, min_df):
    """Fit a vectorizer + support vector classifier pipeline.

    Args:
        X_train: Training samples handed to the pipeline's ``fit``.
        y_train: Training labels handed to the pipeline's ``fit``.
        min_df: Forwarded unchanged to ``create_vectorizer``.

    Returns:
        The fitted ``Pipeline`` with steps ``'tfidf'`` and ``'svm'``.
    """
    ## Create bag of words.
    tfidf = create_vectorizer(min_df)

    ## Pipe functions together to create pipeable model.
    from sklearn.pipeline import Pipeline
    model = Pipeline([('tfidf', tfidf), ('svm', svm.SVC())])

    model.fit(X_train, y_train)
    return model