def train_mlp():
    """Leave-one-project-out evaluation of a small MLP text classifier.

    For every project in the dataset, trains a TF-IDF + dense-network
    classifier on all other projects' comments and evaluates it on the
    held-out project, printing a classification report per fold.

    Returns:
        list: Keras ``History`` objects, one per held-out project.
    """
    df = load_from_file('technical_debt_dataset.csv', amount=100000)
    # One-hot encode on the FULL dataset first so every fold trains a
    # network with the same number of output units.
    Global_y = to_categorical(df['category_id'])
    num_classes = Global_y.shape[1]
    unique = np.unique(df['project'])
    Histories = []
    for project in unique:
        print("Running for test project " + str(project))
        newDF = df[df['project'] != project]
        test = df[df['project'] == project]
        print("Train data = " + str(len(newDF)) + " | Test data = " + str(len(test)))
        X = newDF['commenttext']
        y = newDF['category_id']
        X_test = test['commenttext']
        y_test = test['category_id']
        # Create vectorizer for words; it determines the input shape of the
        # predictor network. Fit on the training folds only (no leakage).
        tfidf = create_vectorizer(10)
        tfidf.fit(X)
        X_v = tfidf.transform(X).toarray()
        X_test = tfidf.transform(X_test).toarray()
        input_dim = X_v.shape[1]
        # BUG FIX: pin num_classes explicitly. Without it to_categorical()
        # infers the class count from each fold's labels, which breaks with a
        # shape mismatch whenever a fold is missing the highest category id.
        y = to_categorical(y, num_classes=num_classes)
        y_test = to_categorical(y_test, num_classes=num_classes)
        # Split the held-out project 50/50 into a test set and a validation
        # set used to monitor fitting.
        X_test, X_val, y_test, y_val = train_test_split(
            X_test, y_test, random_state=42, train_size=0.5)
        model = Sequential()
        model.add(layers.Dense(25, activation='elu', input_dim=input_dim))
        # BUG FIX: use softmax, not sigmoid, on the output layer — the
        # categories are mutually exclusive and the loss is
        # categorical_crossentropy, which expects a probability distribution.
        model.add(layers.Dense(num_classes, activation='softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='adam',
                      metrics=[get_f1])
        print("Begin model fitting")
        history = model.fit(X_v, y, epochs=15, verbose=False, batch_size=35,
                            validation_data=(X_val, y_val))
        Histories.append(history)
        print("Model fitting complete")
        report = classification(model, X_test, y_test)
        print(report)
    return Histories
def train_naive_bayes(X_train, y_train, min_df):
    """Fit a TF-IDF + multinomial naive Bayes pipeline.

    Args:
        X_train: Training documents (raw text).
        y_train: Training labels.
        min_df: Minimum document frequency forwarded to create_vectorizer().

    Returns:
        The fitted sklearn Pipeline.
    """
    from sklearn.pipeline import Pipeline

    # Bag-of-words features feed straight into the classifier stage.
    vectorizer = create_vectorizer(min_df)
    pipeline = Pipeline([('tfidf', vectorizer), ('clf', MultinomialNB())])
    pipeline.fit(X_train, y_train)
    return pipeline
def train_decision_tree(X_train, y_train, min_df):
    """Fit a TF-IDF + decision-tree pipeline.

    Args:
        X_train: Training documents (raw text).
        y_train: Training labels.
        min_df: Minimum document frequency forwarded to create_vectorizer().

    Returns:
        The fitted sklearn Pipeline.
    """
    from sklearn.pipeline import Pipeline

    # Chain vectorization and classification so callers can pass raw text.
    vectorizer = create_vectorizer(min_df)
    pipeline = Pipeline([('tfidf', vectorizer), ('dtc', DecisionTreeClassifier())])
    pipeline.fit(X_train, y_train)
    return pipeline
def train_logistic_regression(X_train, y_train, min_df):
    """Fit a TF-IDF + logistic-regression pipeline.

    Args:
        X_train: Training documents (raw text).
        y_train: Training labels.
        min_df: Minimum document frequency forwarded to create_vectorizer().

    Returns:
        The fitted sklearn Pipeline.
    """
    from sklearn.pipeline import Pipeline

    # Fixed random_state keeps repeated fits reproducible.
    vectorizer = create_vectorizer(min_df)
    classifier = LogisticRegression(random_state=0)
    pipeline = Pipeline([('tfidf', vectorizer), ('lg', classifier)])
    pipeline.fit(X_train, y_train)
    return pipeline
def train_knn(X_train, y_train, amount_neighbors, min_df):
    """Fit a TF-IDF + k-nearest-neighbors pipeline.

    Args:
        X_train: Training documents (raw text).
        y_train: Training labels.
        amount_neighbors: Number of neighbors (k) for the KNN classifier.
        min_df: Minimum document frequency forwarded to create_vectorizer().

    Returns:
        The fitted sklearn Pipeline.
    """
    from sklearn.pipeline import Pipeline

    # Chain vectorization and classification so callers can pass raw text.
    vectorizer = create_vectorizer(min_df)
    classifier = KNeighborsClassifier(n_neighbors=amount_neighbors)
    pipeline = Pipeline([('tfidf', vectorizer), ('knn', classifier)])
    pipeline.fit(X_train, y_train)
    return pipeline
def train_svm(X_train, y_train, min_df):
    """Fit a TF-IDF + support-vector-machine pipeline.

    Args:
        X_train: Training documents (raw text).
        y_train: Training labels.
        min_df: Minimum document frequency forwarded to create_vectorizer().

    Returns:
        The fitted sklearn Pipeline.
    """
    ## Create bag of words.
    tfidf = create_vectorizer(min_df)
    ## Pipe functions together to create pipeable model.
    # FIX: dropped the unused KNeighborsClassifier import — a copy-paste
    # leftover from train_knn(); this function only uses svm.SVC.
    from sklearn.pipeline import Pipeline
    model = Pipeline([('tfidf', tfidf), ('svm', svm.SVC())])
    model.fit(X_train, y_train)
    return model