def algorithm_analysis(): """ Prepares a classifier based on non-validated data, and evaluates its performance in the valdated portion of the dataset. :return: Dictionary of classifiers """ raw_dataframe = preprocessing.load_original_dataframe() issues_train, issues_train_std, priorities_train, issues_test, issues_test_std, priorities_test = get_training_datasets( raw_dataframe) unfiltered_dataframe = preprocessing.filter_issues_dataframe(raw_dataframe, priority_changer=False) filtered_dataframe = preprocessing.filter_issues_dataframe(raw_dataframe, priority_changer=True) train, test = train_test_split(unfiltered_dataframe, test_size=0.2, random_state=0) test_valid = preprocessing.filter_issues_dataframe(test, priority_changer=True) issues_test_valid, issues_test_valid_std, priorities_test_valid = prepare_for_classification(test_valid, issues_train) issues_valid, issues_valid_std, priorities_valid = prepare_for_classification(filtered_dataframe, issues_train) results = [] estimators = {} for algorithm, grid_search in selector.get_algorithms(): training_set = issues_train_std test_set = issues_test_std test_valid_set = issues_test_valid_std valid_set = issues_valid_std print "Current algorithm: ", algorithm if algorithm == "RandomForest": training_set = issues_train test_set = issues_test test_valid_set = issues_test_valid valid_set = issues_valid optimal_estimator, best_params = tuning.parameter_tuning(grid_search, training_set, priorities_train) estimators[algorithm] = optimal_estimator result = selector.analyse_performance(optimal_estimator, best_params, grid_search, algorithm, training_set, priorities_train, test_set, priorities_test) print "Evaluating on the valid portion of the test dataset ..." assigner.evaluate_performance(prefix=algorithm, classifier=optimal_estimator, issues_test_std=test_valid_set, priority_test=priorities_test_valid) print "Evaluating on the complete valid dataset ..." 
assigner.evaluate_performance(prefix=algorithm, classifier=optimal_estimator, issues_test_std=valid_set, priority_test=priorities_valid) if result: results.append(result + ("ALL", len(unfiltered_dataframe.index))) selector.write_results("all_experiment_results.csv", results) return estimators
def predict_priority():
    """
    Builds a random forest, prepares a training set from the filtered issues and hands
    both to assigner.train_and_predict together with the complete dataframe.

    :return: None.
    """
    # These values are the outcome of the experiments.
    class_label = "Severe"
    estimators = 51
    depth = 21

    # Alternative left disabled in the original:
    # SVC(kernel='linear', C=1.0, class_weight='balanced')
    forest = RandomForestClassifier(n_estimators=estimators, max_depth=depth,
                                    random_state=0, n_jobs=-1)

    full_dataframe = preprocessing.load_original_dataframe()
    filtered_dataframe = preprocessing.filter_issues_dataframe(full_dataframe)

    # Only numerical features are encoded: no nominal features and no text feature.
    training_issues, training_labels = preprocessing.encode_and_split(
        issues_dataframe=filtered_dataframe,
        class_label=class_label,
        numerical_features=preprocessing.NUMERICAL_FEATURES,
        nominal_features=[],
        text_feature=None)

    assigner.train_and_predict(forest, full_dataframe, training_issues, training_labels,
                               class_label, preprocessing.NUMERICAL_FEATURES, [])
def predict_priority():
    """
    Trains a classifier on the filtered issues and runs it on the original dataset
    through assigner.train_and_predict.

    :return: None.
    """
    # Label and forest settings selected from the experiments.
    label = "Severe"
    forest_settings = dict(n_estimators=51, max_depth=21, random_state=0, n_jobs=-1)
    model = RandomForestClassifier(**forest_settings)
    # Disabled alternative: SVC(kernel='linear', C=1.0, class_weight='balanced')

    everything = preprocessing.load_original_dataframe()
    training_rows = preprocessing.filter_issues_dataframe(everything)

    encoded = preprocessing.encode_and_split(issues_dataframe=training_rows,
                                             class_label=label,
                                             numerical_features=preprocessing.NUMERICAL_FEATURES,
                                             nominal_features=[],
                                             text_feature=None)
    features, labels = encoded

    assigner.train_and_predict(model, everything, features, labels, label,
                               preprocessing.NUMERICAL_FEATURES, [])
def main(): original_dataframe = preprocessing.load_original_dataframe() issues_dataframe = preprocessing.filter_issues_dataframe( original_dataframe) # Plotting projects figure, axes = plt.subplots(1, 1) issues_dataframe['Git Repository'].value_counts(normalize=True).plot( kind='bar', ax=axes) plt.show() issues_dataframe, encoded_priorities = preprocessing.encode_and_split( issues_dataframe, preprocessing.CLASS_LABEL, preprocessing.NUMERICAL_FEATURES, preprocessing.NOMINAL_FEATURES) # Plotting priorities figure, axes = plt.subplots(1, 1) encoded_priorities.value_counts(normalize=True, sort=True).plot(kind='bar', ax=axes) plt.show() issues_train, issues_test, priority_train, priority_test = train_test_split( issues_dataframe, encoded_priorities, test_size=0.2, random_state=0) print len(issues_train.index), " issues on the train set." issues_train_std, issues_test_std = preprocessing.escale_numerical_features( preprocessing.NUMERICAL_FEATURES, issues_train, issues_test) logit_classifier = select_features_l1(issues_train_std, priority_train, issues_test_std, priority_test) knn_classifier = sequential_feature_selection(issues_train_std, priority_train, issues_test_std, priority_test) print "Building Random Forest Classifier ..." rforest_classifier = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1) rforest_classifier.fit(issues_train, priority_train) forest_classifier = feature_importance_with_forest(issues_train, priority_train, issues_test, priority_test) rforest_classifier = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1) train_and_predict(rforest_classifier, original_dataframe, issues_dataframe, encoded_priorities, preprocessing.CLASS_LABEL, preprocessing.NUMERICAL_FEATURES, preprocessing.NOMINAL_FEATURES)
def main():
    """
    Initial execution point: takes the "RandomForest" estimator produced by
    algorithm_analysis() and passes it, with the training and test datasets, to
    assigner.feature_importance_with_forest.

    :return: None.
    """
    tuned_estimators = algorithm_analysis()
    best_forest = tuned_estimators["RandomForest"]

    raw_dataframe = preprocessing.load_original_dataframe()
    datasets = get_training_datasets(raw_dataframe)

    # The two *_std members of the result are not needed for the forest.
    issues_train, _, priorities_train, issues_test, _, priorities_test = datasets

    assigner.feature_importance_with_forest(best_forest, issues_train, priorities_train,
                                            issues_test, priorities_test)
def main():
    """
    Initial execution point. Runs the algorithm analysis separately for every repository
    returned by get_all_repositories and writes the collected results to
    "project_experiment_results.csv".

    :return: None
    """
    original_dataframe = preprocessing.load_original_dataframe()
    repositories = get_all_repositories(original_dataframe)
    results = []

    for repository in repositories:
        print "Working on repository ", repository, " ..."
        project_dataframe = preprocessing.filter_issues_dataframe(
            original_dataframe, repository=repository)

        # Threshold chosen considering the scikit-learn cheat sheet:
        # http://scikit-learn.org/stable/tutorial/machine_learning_map/
        minimum_threshold = 50
        issues_found = len(project_dataframe.index)
        print issues_found, " issues found on repository ", repository

        # Strictly greater: a repository with exactly minimum_threshold issues is skipped.
        if issues_found > minimum_threshold:
            # The Git Repository feature is not needed since the dataframe is already
            # filtered by repository.
            nominal_features = []
            issues, priorities = preprocessing.encode_and_split(
                project_dataframe, assigner.CLASS_LABEL,
                assigner.NUMERICAL_FEATURES, nominal_features)

            # A falsy return value means there is nothing to analyse for this repository.
            train_test = preprocessing.train_test_encode(
                repository, issues, priorities)
            if train_test:
                issues_train_std, priority_train, issues_test_std, priority_test = train_test
                result = run_algorithm_analysis(issues_train_std, priority_train,
                                                issues_test_std, priority_test,
                                                repository, issues_found)
                if result:
                    results.extend(result)
        # NOTE(review): this else is assumed to pair with the threshold check above;
        # the original indentation was lost, so confirm against version control.
        else:
            print "Issues corresponding to repository ", repository, " are not enough for analysis."

    write_results("project_experiment_results.csv", results)
def algorithm_analysis(): """ Executes the analysis for finding the optimal class label and algorithm configuration. :return: None. """ consolidated_results = [] try: minimum_records = 50 class_labels = [ 'Severe' # 'Blocker' # , 'Non-Severe', 'Trivial', 'Critical' ] original_dataframe = preprocessing.load_original_dataframe() repositories = [] # valid_dataframe = preprocessing.filter_issues_dataframe(original_dataframe) # repositories.append(("VALID", valid_dataframe)) for repo_name in selector.get_all_repositories(original_dataframe): project_dataframe = preprocessing.filter_issues_dataframe( original_dataframe, repository=repo_name) repositories.append((repo_name, project_dataframe)) for class_label in class_labels: for repository_name, dataframe in repositories: print "Using ", class_label, " as the class feature." print "Working on repository ", repository_name, " with ", len( dataframe.index), "Issues" if len(dataframe.index) >= minimum_records: results = execute_analysis( dataframe, class_label + "-" + repository_name, class_label) consolidated_results.extend(results) else: print "Not enough issues for analysis: ", len( dataframe.index) finally: selector.write_results("Binary_Classification.csv", consolidated_results) winsound.Beep(2500, 1000)
def main():
    """
    Initial execution point. For each repository found in the original dataframe, filters
    its issues and, when there are enough of them, runs run_algorithm_analysis. All
    results are written to "project_experiment_results.csv" at the end.

    :return: None
    """
    original_dataframe = preprocessing.load_original_dataframe()
    repositories = get_all_repositories(original_dataframe)
    results = []

    for repository in repositories:
        print "Working on repository ", repository, " ..."
        project_dataframe = preprocessing.filter_issues_dataframe(original_dataframe, repository=repository)

        # Threshold selected taking the scikit-learn cheat sheet into account:
        # http://scikit-learn.org/stable/tutorial/machine_learning_map/
        minimum_threshold = 50
        issues_found = len(project_dataframe.index)
        print issues_found, " issues found on repository ", repository

        # The comparison is strict, so exactly minimum_threshold issues is not enough.
        if issues_found > minimum_threshold:
            # The Git Repository feature is not needed, since the issues were already
            # filtered by repository.
            nominal_features = []
            issues, priorities = preprocessing.encode_and_split(project_dataframe, assigner.CLASS_LABEL,
                                                                assigner.NUMERICAL_FEATURES, nominal_features)

            # The analysis only runs when train_test_encode returns a truthy value.
            train_test = preprocessing.train_test_encode(repository, issues, priorities)
            if train_test:
                issues_train_std, priority_train, issues_test_std, priority_test = train_test
                result = run_algorithm_analysis(issues_train_std, priority_train, issues_test_std, priority_test,
                                                repository, issues_found)
                if result:
                    results.extend(result)
        # NOTE(review): the else is assumed to belong to the threshold check; the original
        # indentation is not available here, so verify before relying on this.
        else:
            print "Issues corresponding to repository ", repository, " are not enough for analysis."

    write_results("project_experiment_results.csv", results)
def main(): original_dataframe = preprocessing.load_original_dataframe() issues_dataframe = preprocessing.filter_issues_dataframe(original_dataframe) # Plotting projects figure, axes = plt.subplots(1, 1) issues_dataframe['Git Repository'].value_counts(normalize=True).plot(kind='bar', ax=axes) plt.show() issues_dataframe, encoded_priorities = preprocessing.encode_and_split(issues_dataframe, preprocessing.CLASS_LABEL, preprocessing.NUMERICAL_FEATURES, preprocessing.NOMINAL_FEATURES) # Plotting priorities figure, axes = plt.subplots(1, 1) encoded_priorities.value_counts(normalize=True, sort=True).plot(kind='bar', ax=axes) plt.show() issues_train, issues_test, priority_train, priority_test = train_test_split(issues_dataframe, encoded_priorities, test_size=0.2, random_state=0) print len(issues_train.index), " issues on the train set." issues_train_std, issues_test_std = preprocessing.escale_numerical_features(preprocessing.NUMERICAL_FEATURES, issues_train, issues_test) logit_classifier = select_features_l1(issues_train_std, priority_train, issues_test_std, priority_test) knn_classifier = sequential_feature_selection(issues_train_std, priority_train, issues_test_std, priority_test) print "Building Random Forest Classifier ..." rforest_classifier = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1) rforest_classifier.fit(issues_train, priority_train) forest_classifier = feature_importance_with_forest(issues_train, priority_train, issues_test, priority_test) rforest_classifier = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1) train_and_predict(rforest_classifier, original_dataframe, issues_dataframe, encoded_priorities, preprocessing.CLASS_LABEL, preprocessing.NUMERICAL_FEATURES, preprocessing.NOMINAL_FEATURES)
def algorithm_analysis(): """ Executes the analysis for finding the optimal class label and algorithm configuration. :return: None. """ consolidated_results = [] try: minimum_records = 50 class_labels = [ 'Severe' # 'Blocker' # , 'Non-Severe', 'Trivial', 'Critical' ] original_dataframe = preprocessing.load_original_dataframe() repositories = [] # valid_dataframe = preprocessing.filter_issues_dataframe(original_dataframe) # repositories.append(("VALID", valid_dataframe)) for repo_name in selector.get_all_repositories(original_dataframe): project_dataframe = preprocessing.filter_issues_dataframe(original_dataframe, repository=repo_name) repositories.append((repo_name, project_dataframe)) for class_label in class_labels: for repository_name, dataframe in repositories: print "Using ", class_label, " as the class feature." print "Working on repository ", repository_name, " with ", len(dataframe.index), "Issues" if len(dataframe.index) >= minimum_records: results = execute_analysis(dataframe, class_label + "-" + repository_name, class_label) consolidated_results.extend(results) else: print "Not enough issues for analysis: ", len(dataframe.index) finally: selector.write_results("Binary_Classification.csv", consolidated_results) winsound.Beep(2500, 1000)
def algorithm_analysis(): """ Prepares a classifier based on non-validated data, and evaluates its performance in the valdated portion of the dataset. :return: Dictionary of classifiers """ raw_dataframe = preprocessing.load_original_dataframe() issues_train, issues_train_std, priorities_train, issues_test, issues_test_std, priorities_test = get_training_datasets( raw_dataframe) unfiltered_dataframe = preprocessing.filter_issues_dataframe( raw_dataframe, priority_changer=False) filtered_dataframe = preprocessing.filter_issues_dataframe( raw_dataframe, priority_changer=True) train, test = train_test_split(unfiltered_dataframe, test_size=0.2, random_state=0) test_valid = preprocessing.filter_issues_dataframe(test, priority_changer=True) issues_test_valid, issues_test_valid_std, priorities_test_valid = prepare_for_classification( test_valid, issues_train) issues_valid, issues_valid_std, priorities_valid = prepare_for_classification( filtered_dataframe, issues_train) results = [] estimators = {} for algorithm, grid_search in selector.get_algorithms(): training_set = issues_train_std test_set = issues_test_std test_valid_set = issues_test_valid_std valid_set = issues_valid_std print "Current algorithm: ", algorithm if algorithm == "RandomForest": training_set = issues_train test_set = issues_test test_valid_set = issues_test_valid valid_set = issues_valid optimal_estimator, best_params = tuning.parameter_tuning( grid_search, training_set, priorities_train) estimators[algorithm] = optimal_estimator result = selector.analyse_performance(optimal_estimator, best_params, grid_search, algorithm, training_set, priorities_train, test_set, priorities_test) print "Evaluating on the valid portion of the test dataset ..." assigner.evaluate_performance(prefix=algorithm, classifier=optimal_estimator, issues_test_std=test_valid_set, priority_test=priorities_test_valid) print "Evaluating on the complete valid dataset ..." 
assigner.evaluate_performance(prefix=algorithm, classifier=optimal_estimator, issues_test_std=valid_set, priority_test=priorities_valid) if result: results.append(result + ("ALL", len(unfiltered_dataframe.index))) selector.write_results("all_experiment_results.csv", results) return estimators