Example #1
 def get_configurations(self, top_summaries):
     self.logs.general(
         "{0} | {1} | 9/11 | Getting Configurations ...".format(
             self.metric, self.project_name))
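     # For every selection method, drop the score columns from the top
     # summary, replace NaNs with None, and turn each remaining row into a
     # parameter dict that can be fed to estimator.set_params().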
     configurations = {
         method: list(
             map(
                 lambda x: x[1].to_dict(),
                 top_summary.drop(
                     EstimatorSelectionHelper.get_scores_info(),
                     axis=1).where(pd.notnull(top_summary),
                                   None).iterrows()))
         for method, top_summary in top_summaries.items()
     }
     self.logs.success("{0} | {1} | 9/11 | Got Configurations.".format(
         self.metric, self.project_name))
     return configurations
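A minimal usage sketch for Example #1 follows; the selector object and the per-method summaries are assumptions based on how the method is written, not part of the original code.

# Hypothetical call site for get_configurations -- selector is assumed to be
# an instance of the class owning the method, and each top summary is a
# DataFrame produced by EstimatorSelectionHelper.score_summary().
top_summaries = {
    'chi2_10p': summary_chi2[:10],            # assumed per-method summaries
    'linear_svc': summary_linear_svc[:10],
}
configurations = selector.get_configurations(top_summaries)
# configurations['chi2_10p'] is now a list of parameter dicts (NaNs mapped to
# None), each ready to be passed to estimator.set_params(**config).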
Example #2
def execute(project):
    dataset_dir = Config.get_work_dir_path(
        os.path.join("paper", "datasets", "designite", project.github()))
    Path(dataset_dir).mkdir(parents=True, exist_ok=True)
    training_path = os.path.join(dataset_dir, "training.csv")
    testing_path = os.path.join(dataset_dir, "testing.csv")

    training_df = pd.read_csv(training_path).dropna().astype(int)
    testing_df = pd.read_csv(testing_path).dropna().astype(int)

    original_training_y = training_df.pop('Bugged').values
    original_training_X = training_df.values

    testing_y = testing_df.pop('Bugged').values
    testing_X = testing_df.values

    selection_methods = {
        'chi2_10p': SelectPercentile(chi2, percentile=10),
        'chi2_20p': SelectPercentile(chi2, percentile=20),
        'chi2_50p': SelectPercentile(chi2, percentile=50),
        'mutual_info_classif_10p': SelectPercentile(mutual_info_classif,
                                                    percentile=10),
        'mutual_info_classif_20p': SelectPercentile(mutual_info_classif,
                                                    percentile=20),
        'mutual_info_classif_50p': SelectPercentile(mutual_info_classif,
                                                    percentile=50),
        'f_classif_10': SelectPercentile(f_classif, percentile=10),
        'f_classif_20': SelectPercentile(f_classif, percentile=20),
        'f_classif_50': SelectPercentile(f_classif, percentile=50),
        'linear_svc': SelectFromModel(LinearSVC(C=0.01, penalty="l1",
                                                dual=False))
    }

    features = training_df.columns
    selector = FeatureSelectionHelper(selection_methods, features)
    selector.select(original_training_X, original_training_y)
    features = selector.get_selected_features()
    data = selector.get_selected_dataset()

    columns = [
        'estimator', 'configuration', 'feature_selection', 'precision',
        'recall', 'f1-measure', 'auc-roc', 'brier score'
    ]
    scores = pd.DataFrame(columns=columns)

    for method_name, training_X in data.items():
        training_y = original_training_y
        oversample = SMOTE()
        training_X, training_y = oversample.fit_resample(
            training_X, training_y)

        models = {
            'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
            # 'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
            # 'LogisticRegression': LogisticRegression(),
            # 'BernoulliNaiveBayes': BernoulliNB(),
            # 'K-NearestNeighbor': KNeighborsClassifier(),
            # 'DecisionTree': DecisionTreeClassifier(),
            # 'RandomForest': RandomForestClassifier(),
            # 'SupportVectorMachine': SVC(),
            # 'MultilayerPerceptron': MLPClassifier()
        }
        params = {
            'LinearDiscriminantAnalysis': {},
            # 'QuadraticDiscriminantAnalysis': {},
            # 'LogisticRegression': {'C': list(np.logspace(-4, 4, 3))},
            # 'BernoulliNaiveBayes': {},
            # 'K-NearestNeighbor': {},
            # 'DecisionTree': {'criterion': ['gini', 'entropy'], },
            # 'RandomForest': {'n_estimators': [10, 100]},
            # 'SupportVectorMachine': {'C': [0.1, 100]},
            # 'MultilayerPerceptron': {'hidden_layer_sizes': [(17, 8, 17)],
            #                          'activation': ['tanh', 'relu']}
        }

        helper = EstimatorSelectionHelper(models, params)
        helper.fit(training_X, training_y, scoring='f1')
        summary = helper.score_summary()
        top_summary = summary[:10]
        top_summary_iter = top_summary.drop(EstimatorSelectionHelper.get_scores_info(), axis=1) \
            .where(pd.notnull(top_summary), None) \
            .iterrows()

        models_info = list(map(lambda x: x[1].to_dict(), top_summary_iter))

        selected_testing_X = testing_df[testing_df.columns.intersection(
            features[method_name])].values

        predictions = []
        for model_info in models_info:
            # TODO Cloning also copies the parameters. That is not what I want.
            estimator = clone(models[model_info['estimator']])
            params = {
                key: val
                for key, val in model_info.items()
                if not (val is None or key == 'estimator')
            }
            estimator.set_params(**params)
            estimator.fit(training_X, training_y)
            prediction_y = estimator.predict(selected_testing_X)
            predictions.append(prediction_y)
            scores_dict = {
                'estimator': model_info['estimator'],
                'configuration': str(params),
                'feature_selection': method_name,
                'precision': precision_score(testing_y, prediction_y),
                'recall': recall_score(testing_y, prediction_y),
                'f1-measure': f1_score(testing_y, prediction_y),
                'auc-roc': roc_auc_score(testing_y, prediction_y),
                'brier score': brier_score_loss(testing_y, prediction_y)
            }
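            # Note: DataFrame.append was removed in pandas 2.0; on newer
            # pandas, wrap scores_dict in a one-row DataFrame and use
            # pd.concat instead.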
            scores = scores.append(scores_dict, ignore_index=True)
    scores_dir = Config.get_work_dir_path(
        os.path.join("paper", "scores", "designite", project.github()))
    Path(scores_dir).mkdir(parents=True, exist_ok=True)
    scores_path = os.path.join(scores_dir, "scores.csv")
    feature_selection_dir = os.path.join(scores_dir, 'feature_selection')
    features_dir = os.path.join(feature_selection_dir, 'features')
    training_dir = os.path.join(feature_selection_dir, 'training_x')
    Path(features_dir).mkdir(parents=True, exist_ok=True)
    Path(training_dir).mkdir(parents=True, exist_ok=True)
    for method_name, training_X in data.items():
        features_path = os.path.join(features_dir, method_name + ".csv")
        pd.DataFrame({
            'Features': features[method_name]
        }).to_csv(features_path, index=False)
        training_x_path = os.path.join(training_dir, method_name + ".csv")
        pd.DataFrame(data=training_X,
                     columns=features[method_name]).to_csv(training_x_path,
                                                           index=False)

    training_y_path = os.path.join(scores_dir, "training_y.csv")
    testing_x_path = os.path.join(scores_dir, "testing_x.csv")
    testing_y_path = os.path.join(scores_dir, "testing_y.csv")
    prediction_y_path = os.path.join(scores_dir, "prediction_y.csv")
    prediction_real_y_path = os.path.join(scores_dir, "prediction_real_y.csv")
    summary_path = os.path.join(scores_dir, "summary.csv")
    scores.to_csv(scores_path, index=False)
    pd.DataFrame(data=training_y, columns=['Bugged']).to_csv(training_y_path,
                                                             index=False)
    pd.DataFrame(data=testing_X,
                 columns=training_df.columns).to_csv(testing_x_path,
                                                     index=False)
    pd.DataFrame(data=testing_y, columns=['Bugged']).to_csv(testing_y_path,
                                                            index=False)
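    # models_info and predictions below hold the results of the last
    # feature-selection method iterated above, so only that method's
    # predictions are written out.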
    columns = list(map(lambda x: str(x), models_info))
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=columns).to_csv(prediction_y_path, index=False)
    predictions.append(testing_y)
    columns.append("real")
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=columns).to_csv(prediction_real_y_path, index=False)
    summary.to_csv(summary_path, index=False)
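Example #2's execute takes a project object whose github() method yields the directory name under which the datasets and scores live. A hypothetical driver might look like this; the Project class here is an assumption, not part of the original code.

class Project:
    # Hypothetical stand-in for the project objects passed to execute();
    # only the github() accessor used by the snippet is modelled.
    def __init__(self, github_name):
        self._github_name = github_name

    def github(self):
        return self._github_name


execute(Project("commons-lang"))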
Example #3
def execute(project):
    dataset_dir = Config.get_work_dir_path(
        os.path.join("paper", "datasets", "traditional_designite",
                     project.github()))
    Path(dataset_dir).mkdir(parents=True, exist_ok=True)
    training_path = os.path.join(dataset_dir, "training.csv")
    testing_path = os.path.join(dataset_dir, "testing.csv")

    training_df = pd.read_csv(training_path).dropna().replace({
        'True': 1,
        'False': 0
    })
    testing_df = pd.read_csv(testing_path).dropna().replace({
        'True': 1,
        'False': 0
    })

    training_y = training_df.pop('Bugged').values
    training_X = training_df.values
    training_X = preprocessing.scale(training_X)

    oversample = SMOTE()
    training_X, training_y = oversample.fit_resample(training_X, training_y)

    models = {
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'LogisticRegression': LogisticRegression(),
        'BernoulliNaiveBayes': BernoulliNB(),
        'K-NearestNeighbor': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'RandomForest': RandomForestClassifier(),
        'SupportVectorMachine': SVC(),
        # 'MultilayerPerceptron': MLPClassifier()
    }
    params = {
        'LinearDiscriminantAnalysis': {},
        'QuadraticDiscriminantAnalysis': {},
        'LogisticRegression': {
            'C': list(np.logspace(-4, 4, 3))
        },
        'BernoulliNaiveBayes': {},
        'K-NearestNeighbor': {},
        'DecisionTree': {
            'criterion': ['gini', 'entropy'],
        },
        'RandomForest': {
            'n_estimators': [10, 100]
        },
        'SupportVectorMachine': {
            'C': [0.1, 100]
        },
        # 'MultilayerPerceptron': {'hidden_layer_sizes': [(55, 27, 55)],
        #                         'activation': ['tanh', 'relu']}
    }

    helper = EstimatorSelectionHelper(models, params)
    helper.fit(training_X, training_y, scoring='f1')
    summary = helper.score_summary()
    top_summary = summary[:10]
    top_summary_iter = top_summary.drop(EstimatorSelectionHelper.get_scores_info(), axis=1)\
                                  .where(pd.notnull(top_summary), None)\
                                  .iterrows()

    testing_y = testing_df.pop('Bugged').values
    testing_X = preprocessing.scale(testing_df.values)
    models_info = list(map(lambda x: x[1].to_dict(), top_summary_iter))

    columns = [
        'estimator', 'configuration', 'precision', 'recall', 'f1-measure',
        'auc-roc', 'brier score'
    ]
    scores = pd.DataFrame(columns=columns)
    predictions = []
    for model_info in models_info:
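        # The estimator instance from the models dict is reused directly here
        # (not cloned), so successive configurations mutate the same object.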
        estimator = models[model_info['estimator']]
        params = {
            key: val
            for key, val in model_info.items()
            if not (val is None or key == 'estimator')
        }
        estimator.set_params(**params)
        estimator.fit(training_X, training_y)
        prediction_y = estimator.predict(testing_X)
        predictions.append(prediction_y)
        scores_dict = {
            'estimator': model_info['estimator'],
            'configuration': str(params),
            'precision': precision_score(testing_y, prediction_y),
            'recall': recall_score(testing_y, prediction_y),
            'f1-measure': f1_score(testing_y, prediction_y),
            'auc-roc': roc_auc_score(testing_y, prediction_y),
            'brier score': brier_score_loss(testing_y, prediction_y)
        }
        scores = scores.append(scores_dict, ignore_index=True)
    scores_dir = Config.get_work_dir_path(
        os.path.join("paper", "scores", "traditional_designite",
                     project.github()))
    Path(scores_dir).mkdir(parents=True, exist_ok=True)
    scores_path = os.path.join(scores_dir, "scores.csv")
    training_x_path = os.path.join(scores_dir, "training_x.csv")
    training_y_path = os.path.join(scores_dir, "training_y.csv")
    testing_x_path = os.path.join(scores_dir, "testing_x.csv")
    testing_y_path = os.path.join(scores_dir, "testing_y.csv")
    prediction_y_path = os.path.join(scores_dir, "prediction_y.csv")
    prediction_real_y_path = os.path.join(scores_dir, "prediction_real_y.csv")
    summary_path = os.path.join(scores_dir, "summary.csv")
    scores.to_csv(scores_path, index=False)
    pd.DataFrame(data=training_X,
                 columns=training_df.columns).to_csv(training_x_path,
                                                     index=False)
    pd.DataFrame(data=training_y, columns=['Bugged']).to_csv(training_y_path,
                                                             index=False)
    pd.DataFrame(data=testing_X,
                 columns=training_df.columns).to_csv(testing_x_path,
                                                     index=False)
    pd.DataFrame(data=testing_y, columns=['Bugged']).to_csv(testing_y_path,
                                                            index=False)
    columns = list(map(lambda x: str(x), models_info))
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=columns).to_csv(prediction_y_path, index=False)
    predictions.append(testing_y)
    columns.append("real")
    pd.DataFrame(data=np.array(predictions).transpose(),
                 columns=columns).to_csv(prediction_real_y_path, index=False)
    summary.to_csv(summary_path, index=False)
Example #4
 def get_summary(self, X, y):
     helper = EstimatorSelectionHelper(self.models, self.params)
     helper.fit(X, y)
     return helper.score_summary()
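None of the snippets show their imports. A plausible set that resolves the library names they use is sketched below; Config, EstimatorSelectionHelper, and FeatureSelectionHelper are project-local helpers whose import paths are not shown in the original code.

import os
from pathlib import Path

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.base import clone
from sklearn.discriminant_analysis import (LinearDiscriminantAnalysis,
                                           QuadraticDiscriminantAnalysis)
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import (SelectFromModel, SelectPercentile,
                                        chi2, f_classif, mutual_info_classif)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (brier_score_loss, f1_score, precision_score,
                             recall_score, roc_auc_score)
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.tree import DecisionTreeClassifier

# Config, EstimatorSelectionHelper and FeatureSelectionHelper are helpers
# local to the original project; their import paths are not reproduced here.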