Example #1
def retrieve_labelled_instances(dataset, refactoring: LowLevelRefactoring):
    """
    This method retrieves all the labelled instances for a given refactoring and dataset.
    It performs the following pipeline:
      1. Get all refactored and non-refactored instances from the db.
      2. Merge them into a single dataset, using 1=true and 0=false as labels.
      3. Remove possible NAs (the data collection process is tough; bad data might have made it through).
      4. Shuffle the dataset (good practice).
      5. Balance the dataset (if configured).
      6. Scale the feature values (if configured).
      7. Perform feature reduction (if configured).

    :param dataset: a string containing the name of the dataset to be retrieved
    :param refactoring: the refactoring object, containing the refactoring to be retrieved
    :return:
        features: an array with the features of the instances
        x: a dataframe with the feature values
        y: the label (1=true, a refactoring has happened, 0=false, no refactoring has happened)
        scaler: the scaler object used in the scaling process.
    """

    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(dataset)

    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(dataset)

    log("raw number of refactoring instances: {}".format(refactored_instances.shape[0]))
    log("raw number of not refactoring instances: {}".format(non_refactored_instances.shape[0]))

    # if there's still a row with NAs, drop it, as it'll cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    log("refactoring instance (after dropping NA)s: {}".format(refactored_instances.shape[0]))
    log("not refactoring instances (after dropping NA)s: {}".format(non_refactored_instances.shape[0]))

    assert refactored_instances.shape[0] > 0, "No refactorings found"

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # if it's a test run, we reduce the sample randomly
    if TEST:
        refactored_instances = refactored_instances.sample(frac=0.1)
        non_refactored_instances = non_refactored_instances.sample(frac=0.1)

    # now, combine both datasets (with both TRUE and FALSE predictions)
    assert non_refactored_instances.shape[1] == refactored_instances.shape[1], "number of columns differs between the two datasets"
    merged_dataset = pd.concat([refactored_instances, non_refactored_instances])

    # separate the x from the y (as required by the scikit-learn API)
    x = merged_dataset.drop("prediction", axis=1)
    y = merged_dataset["prediction"]

    # class-level refactorings are the only ones with process and ownership metrics
    if USE_PROCESS_AND_AUTHORSHIP_METRICS and refactoring.refactoring_level() != 'class':
        x = x.drop(["authorOwnership", "bugFixCount", "linesAdded", "linesDeleted", "qtyMajorAuthors",
                    "qtyMinorAuthors", "qtyOfAuthors", "qtyOfCommits", "refactoringsInvolved"], axis=1)

    # the number of default fields and methods is always 0,
    # so remove them from the data
    x = x.drop(["classNumberOfDefaultFields", "classNumberOfDefaultMethods"], axis=1)

    # balance the dataset, as we have far more non-refactored examples than refactored ones
    # for now, we basically perform undersampling
    if BALANCE_DATASET:
        log("instances before balancing: {}".format(Counter(y)))
        x, y = perform_balancing(x, y)
        assert x.shape[0] == y.shape[0], "Balancing did not work, x and y have different shapes."
        log("instances after balancing: {}".format(Counter(y)))

    # apply some scaling to speed up the algorithm
    scaler = None
    if SCALE_DATASET:
        x, scaler = perform_scaling(x)

    # let's reduce the number of features in the set
    if FEATURE_REDUCTION:
        x = perform_feature_reduction(x, y)

    return x.columns.values, x, y, scaler
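
# The helper perform_balancing() used above is not part of this listing. Below
# is a minimal sketch of what it might look like, assuming imbalanced-learn's
# RandomUnderSampler (matching the "undersampling" mentioned in the comments);
# the real project may use a different strategy, so treat this as an assumption.
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

def perform_balancing_sketch(x: pd.DataFrame, y: pd.Series):
    # under-sample the majority class so both labels appear equally often
    balanced_x, balanced_y = RandomUnderSampler(random_state=42).fit_resample(x, y)
    return balanced_x, balanced_y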
Example #2
def retrieve_ordered_labelled_instances(dataset, refactoring: LowLevelRefactoring):
    """
    This method retrieves all the labelled instances for a given refactoring and dataset.
    It performs the same pipeline as above, but the training data always comes
    before the test data, split at ORDERED_DATA_TEST_SPLIT%:

    :param dataset: a string containing the name of the dataset to be retrieved
    :param refactoring: the refactoring object, containing the refactoring to be retrieved
    :return:
        features: an array with the features of the instances
        x_train: a dataframe with the feature values
        y_train: the label (1=true, a refactoring has happened, 0=false, no refactoring has happened)
        x_test: same, but for test
        y_test: same, but for test
        scaler: the scaler object used in the scaling process.
    """

    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(dataset)

    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(dataset)

    log("raw number of refactoring instances: {}".format(refactored_instances.shape[0]))
    log("raw number of not refactoring instances: {}".format(non_refactored_instances.shape[0]))

    # if there's still a row with NAs, drop it, as it'll cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    log("refactoring instance (after dropping NA)s: {}".format(refactored_instances.shape[0]))
    log("not refactoring instances (after dropping NA)s: {}".format(non_refactored_instances.shape[0]))

    assert refactored_instances.shape[0] > 0, "No refactorings found"

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # if it's a test run, we reduce the sample randomly
    if TEST:
        refactored_instances = refactored_instances.sample(frac=0.1)
        non_refactored_instances = non_refactored_instances.sample(frac=0.1)

    # now, combine both datasets (with both TRUE and FALSE predictions)
    assert non_refactored_instances.shape[1] == refactored_instances.shape[1], "number of columns differs between the two datasets"

    # class-level refactorings are the only ones with process and ownership metrics
    if USE_PROCESS_AND_AUTHORSHIP_METRICS and refactoring.refactoring_level() != 'class':
        refactored_instances = refactored_instances.drop(["authorOwnership", "bugFixCount", "linesAdded", "linesDeleted", "qtyMajorAuthors",
                    "qtyMinorAuthors", "qtyOfAuthors", "qtyOfCommits", "refactoringsInvolved"], axis=1)

        non_refactored_instances = non_refactored_instances.drop(["authorOwnership", "bugFixCount", "linesAdded", "linesDeleted", "qtyMajorAuthors",
                    "qtyMinorAuthors", "qtyOfAuthors", "qtyOfCommits", "refactoringsInvolved"], axis=1)

    # the number of default fields and methods is always 0,
    # so remove them from the data
    refactored_instances = refactored_instances.drop(["classNumberOfDefaultFields", "classNumberOfDefaultMethods"], axis=1)
    non_refactored_instances = non_refactored_instances.drop(["classNumberOfDefaultFields", "classNumberOfDefaultMethods"],
                                                     axis=1)

    # split both refactored and non-refactored instances into train and test
    # note: the data is not shuffled, so we keep the ordering we get from the database
    r_x = refactored_instances.drop("prediction", axis=1)
    r_y = refactored_instances["prediction"]

    # apply some scaling to speed up the algorithm
    scaler = None
    if SCALE_DATASET:
        r_x, scaler = perform_scaling(r_x)

    split_line = int((1.0 - ORDERED_DATA_TEST_SPLIT) * len(r_x))
    r_x_train = r_x.iloc[:split_line]
    r_x_test = r_x.iloc[split_line:]
    r_y_train = r_y.iloc[:split_line]
    r_y_test = r_y.iloc[split_line:]

    # now for the non-refactored data
    nr_x = non_refactored_instances.drop("prediction", axis=1)
    nr_y = non_refactored_instances["prediction"]

    if SCALE_DATASET:
        nr_x = pd.DataFrame(scaler.transform(nr_x), columns=nr_x.columns)

    split_line_nr = int((1.0 - ORDERED_DATA_TEST_SPLIT) * len(nr_x))
    nr_x_train = nr_x.iloc[:split_line_nr]
    nr_x_test = nr_x.iloc[split_line_nr:]
    nr_y_train = nr_y.iloc[:split_line_nr]
    nr_y_test = nr_y.iloc[split_line_nr:]

    # now combine the refactoring and non-refactoring data
    merged_x_train = pd.concat([r_x_train, nr_x_train])
    merged_y_train = pd.concat([r_y_train, nr_y_train])
    merged_x_test = pd.concat([r_x_test, nr_x_test])
    merged_y_test = pd.concat([r_y_test, nr_y_test])

    # balance the datasets, as we have far more non-refactored examples than refactored ones
    # for now, we basically perform undersampling
    if BALANCE_DATASET:
        log("train instances before balancing: {}".format(Counter(merged_y_train)))
        merged_x_train, merged_y_train = perform_balancing(merged_x_train, merged_y_train)
        assert merged_x_train.shape[0] == merged_y_train.shape[0], "Undersampling did not work, x and y have different shapes."
        log("train instances after balancing: {}".format(Counter(merged_y_train)))

        # for the test set, we always apply random undersampling
        log("test instances before balancing: {}".format(Counter(merged_y_test)))
        merged_x_test, merged_y_test = perform_balancing(merged_x_test, merged_y_test, "random")
        assert merged_x_test.shape[0] == merged_y_test.shape[0], \
            "Balancing did not work, x and y have different shapes."
        log("test instances after balancing: {}".format(Counter(merged_y_test)))

    # TODO: let's reduce the number of features in the set
    # if FEATURE_REDUCTION:
    #     x = perform_feature_reduction(x, y)

    return r_x.columns.values, merged_x_train, merged_y_train, merged_x_test, merged_y_test, scaler
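
# Scaling is handled by helpers that are not shown in this listing: example #2
# fits a scaler on the (training) refactored data and then reuses it on the
# non-refactored data, and the later examples call perform_fit_scaling() /
# perform_scaling(). A minimal sketch, assuming scikit-learn's MinMaxScaler;
# the actual scaler type and the helper signatures are assumptions.
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def perform_fit_scaling_sketch(x: pd.DataFrame):
    # fit a new scaler and return the scaled frame together with the scaler
    scaler = MinMaxScaler()
    scaled = pd.DataFrame(scaler.fit_transform(x), columns=x.columns, index=x.index)
    return scaled, scaler

def perform_scaling_sketch(x: pd.DataFrame, scaler):
    # reuse an already-fitted scaler (e.g. the one fitted on the training data)
    return pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)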
def check_model_performance(refactoring_level, counts_function,
                            get_refactored_function,
                            get_non_refactored_function):

    log("Starting cross model analysis at " + refactoring_level)

    counts = counts_function("")

    for d1 in DATASETS:  # d1 being the model we load
        for d2 in DATASETS:  # d2 being the dataset we'll try to predict
            if d1 == d2 or d1 == '' or d2 == '':
                continue

            for refactoring_name in counts["refactoring"].values:
                refactored_instances = get_refactored_function(
                    refactoring_name, d2)
                non_refactored_instances = get_non_refactored_function(d2)

                # if there's still a row with NAs, drop it, as it'll cause a failure later on.
                refactored_instances = refactored_instances.dropna()
                non_refactored_instances = non_refactored_instances.dropna()

                # set the prediction variable as true and false in the datasets
                refactored_instances["prediction"] = 1
                non_refactored_instances["prediction"] = 0
                merged_dataset = pd.concat(
                    [refactored_instances, non_refactored_instances])

                # shuffle the array
                # (not strictly necessary, though, as this dataset is used entirely for testing)
                merged_dataset = shuffle(merged_dataset)

                # separate the x from the y (as required by the scikit-learn API)
                x = merged_dataset.drop("prediction", axis=1)
                y = merged_dataset["prediction"]

                # drop process and ownership metrics, if not class level
                if refactoring_level != 'class-level':
                    x = x.drop([
                        "authorOwnership", "bugFixCount", "linesAdded",
                        "linesDeleted", "qtyMajorAuthors", "qtyMinorAuthors",
                        "qtyOfAuthors", "qtyOfCommits", "refactoringsInvolved"
                    ], axis=1)

                # drop 'default fields' and 'default methods' as
                # they were not properly collected during the collection phase
                x = x.drop([
                    "classNumberOfDefaultFields", "classNumberOfDefaultMethods"
                ], axis=1)

                # balance the datasets
                balanced_x, balanced_y = perform_balancing(x, y)
                log("instances after balancing: {}".format(
                    Counter(balanced_y)))

                for model_name in MODELS:
                    try:
                        log("Refactoring %s, model %s, dataset 1 %s, dataset 2 %s"
                            % (refactoring_name, model_name, d1, d2))

                        # scale it (as in the training of the model)
                        # using the scaler that was generated during training time
                        scaler = load_scaler("models", model_name, d1,
                                             refactoring_name)
                        balanced_x_2 = scaler.transform(balanced_x)

                        model_under_eval = load_model("models", model_name, d1,
                                                      refactoring_name)

                        if model_name == 'deep-learning':
                            y_predicted = model_under_eval.predict_classes(
                                balanced_x_2)
                        else:
                            y_predicted = model_under_eval.predict(
                                balanced_x_2)

                        results = metrics.classification_report(
                            balanced_y, y_predicted, output_dict=True)

                        log(results)
                        log("CSV," + d1 + "," + d2 + "," + refactoring_name +
                            "," + model_name + "," +
                            str(results["macro avg"]["precision"]) + "," +
                            str(results["macro avg"]["recall"]))
                        # TODO: log more info, like the entire confusion matrix

                    except Exception as e:
                        log("An error occurred while working on refactoring " +
                            refactoring_name + " model " + model_name)
                        log(e)
                        log(traceback.format_exc())
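
# load_scaler() and load_model() above retrieve the artifacts that were
# persisted at training time. A minimal sketch of how they could work, assuming
# joblib-serialized files; the directory layout and file naming below are
# purely hypothetical (the real helpers are not shown in this listing), and a
# Keras 'deep-learning' model would likely need its own loading routine.
import joblib

def load_scaler_sketch(directory, model_name, dataset, refactoring_name):
    # hypothetical naming scheme for the persisted scaler
    return joblib.load(f"{directory}/{model_name}-{dataset}-{refactoring_name}-scaler.joblib")

def load_model_sketch(directory, model_name, dataset, refactoring_name):
    # hypothetical naming scheme for the persisted model
    return joblib.load(f"{directory}/{model_name}-{dataset}-{refactoring_name}.joblib")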
def retrieve_labelled_instances(datasets: Iterable[str],
                                refactoring: LowLevelRefactoring,
                                is_training_data: bool = True,
                                scaler=None):
    """
    This method retrieves all the labelled instances
    for a given refactoring and dataset.
    It performs the following pipeline:
      1. Get all refactored and non-refactored instances from the db.
      2. Merge them into a single dataset,
      using 1=true and 0=false as labels.
      3. Remove possible NAs
      (the data collection process is tough;
      bad data might have made it through).
      4. Shuffle the dataset (good practice).
      5. Balance the dataset (if configured).
      6. Scale the feature values (if configured).

    :param datasets: an iterable with the names of the datasets to be retrieved
    :param refactoring: the refactoring object,
    containing the refactoring to be retrieved
    :param is_training_data: is this training data? If so, the training sample
    may be reduced to TRAINING_SAMPLE_RATIO
    :param scaler: a predefined scaler, for this data

    :return:
        x: a dataframe with the feature values
        y: the label (1=true, a refactoring has happened,
        0=false, no refactoring has happened)
        ids: instance ids, to query the actual data from the database
        scaler: the scaler object used in the scaling process.
    """
    log(f"---- Retrieve labeled instances for dataset: {datasets} and the\
             refactoring {refactoring.name()}")

    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(datasets)
    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(
        datasets)

    log(f"raw number of refactoring instances:\
             {refactored_instances.shape[0]}")
    log(f"raw number of non-refactoring with K={refactoring.commit_threshold()}\
             instances: {non_refactored_instances.shape[0]}")

    # if there's still a row with NAs, drop it, as it'll cause a failure
    # later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    # test if any refactorings were found for the given refactoring type
    if refactored_instances.shape[0] == 0:
        log(f"No refactorings found for refactoring type:\
                 {refactoring.name()}")
        return None, None, None

    # test if any non-refactoring instances were found for the given threshold
    if non_refactored_instances.shape[0] == 0:
        log(f"No non-refactorings found for threshold: "
            f"{refactoring.commit_threshold()}")
        return None, None, None

    log("refactoring instances (after dropping NA)s: {}".format(
        refactored_instances.shape[0]))
    log("non-refactoring instances (after dropping NA)s: {}".format(
        non_refactored_instances.shape[0]))

    assert non_refactored_instances.shape[0] > 0, \
        "Found no non-refactoring instances for level: " + refactoring.level()

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # reduce the number of training samples, if specified, while keeping the
    # specified balance
    if is_training_data and \
            0 < TRAINING_SAMPLE_RATIO < 1 and\
            not BALANCE_DATASET:
        refactored_instances, non_refactored_instances = sample_reduction(
            refactored_instances, non_refactored_instances,
            TRAINING_SAMPLE_RATIO)

    refactored_instances = refactored_instances.drop_duplicates()
    non_refactored_instances = non_refactored_instances.drop_duplicates()
    log("refactoring instances (after dropping duplicates)s: {}".format(
        refactored_instances.shape[0]))
    log("non-refactoring instances (after dropping duplicates)s: {}".format(
        non_refactored_instances.shape[0]))
    # now, combine both datasets (with both TRUE and FALSE predictions)
    if non_refactored_instances.shape[1] != refactored_instances.shape[1]:
        raise ValueError("Number of columns differs between the two datasets.")
    merged_dataset = pd.concat(
        [refactored_instances, non_refactored_instances])
    # do we want to try the models without some metrics, e.g. process and
    # authorship metrics?
    merged_dataset = merged_dataset.drop(DROP_METRICS, axis=1)

    # Remove all instances with a -1 value
    # in the process and authorship metrics.
    # ToDo: do this after the feature reduction, to simplify the query and to
    # avoid dropping instances that are only affected by faulty process and
    # authorship metrics outside the feature set
    if DROP_FAULTY_PROCESS_AND_AUTHORSHIP_METRICS and \
            not DROP_PROCESS_AND_AUTHORSHIP_METRICS:
        log("Instance count before dropping faulty process metrics: {}".format(
            len(merged_dataset.index)))
        metrics = [
            metric for metric in PROCESS_AND_AUTHORSHIP_METRICS
            if metric in merged_dataset.columns.values
        ]
        query = " and ".join(["%s != -1" % metric for metric in metrics])
        merged_dataset = merged_dataset.query(query)
        log("Instance count after dropping faulty process metrics: {}".format(
            len(merged_dataset.index)))

    # separate the x from the y (as required by the scikit-learn API)
    y = merged_dataset["prediction"]
    x = merged_dataset.drop("prediction", axis=1)
    # balance the dataset, as we have far more non-refactored examples
    # than refactored ones
    # for now, we basically perform undersampling
    if BALANCE_DATASET:
        log("instances before balancing: {}".format(Counter(y)))
        x, y = perform_balancing(x, y)
        assert x.shape[0] == y.shape[0], \
            "Balancing did not work, x and y have different shapes."

        log("instances after balancing: {}".format(Counter(y)))

    # shuffle the data after balancing it, because some of the samplers order
    # the data while balancing it

    # apply some scaling to speed up the algorithm
    if SCALE_DATASET and scaler is None:
        x, scaler = perform_fit_scaling(x)
    elif SCALE_DATASET and scaler is not None:
        x = perform_scaling(x, scaler)

    log(f"Got {x.shape[0]} instances with {x.shape[1]}\
        features for the dataset: {datasets}\
        at threshold {refactoring.commit_threshold()}.")
    return x, y, scaler
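
# sample_reduction() is called above when TRAINING_SAMPLE_RATIO is set, but it
# is not shown in this listing. A minimal sketch of one way it could work,
# based on the surrounding comment: shrink both frames by the same ratio so the
# existing balance between refactored and non-refactored instances is
# preserved. Treat the signature and the random_state as assumptions.
import pandas as pd

def sample_reduction_sketch(refactored: pd.DataFrame,
                            non_refactored: pd.DataFrame,
                            ratio: float):
    # sample the same fraction from both frames to keep the class ratio intact
    return (refactored.sample(frac=ratio, random_state=42),
            non_refactored.sample(frac=ratio, random_state=42))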
def retrieve_labelled_instances(dataset,
                                refactoring: LowLevelRefactoring,
                                is_training_data: bool = True,
                                scaler=None,
                                allowed_features=None):
    log("---- Retrieve labeled instances for dataset: %s" % dataset)

    # get all refactoring examples we have in our dataset
    refactored_instances = refactoring.get_refactored_instances(dataset)
    # load non-refactoring examples
    non_refactored_instances = refactoring.get_non_refactored_instances(
        dataset)

    log(
        "raw number of refactoring instances: {}".format(
            refactored_instances.shape[0]), False)
    log(
        "raw number of non-refactoring instances: {}".format(
            non_refactored_instances.shape[0]), False)

    # if there's still a row with NAs, drop it, as it'll cause a failure later on.
    refactored_instances = refactored_instances.dropna()
    non_refactored_instances = non_refactored_instances.dropna()

    # test if any refactorings were found for the given refactoring type
    if refactored_instances.shape[0] == 0:
        log("No refactorings found for refactoring type: " +
            refactoring.name())
        return None, None, None, None
    # test if any non-refactoring instances were found for the given refactoring type
    if non_refactored_instances.shape[0] == 0:
        log("No non-refactorings found for refactoring type: " +
            refactoring.name())
        return None, None, None, None

    log(
        "refactoring instances (after dropping NAs): {}".format(
            refactored_instances.shape[0]), False)
    log(
        "non-refactoring instances (after dropping NAs): {}".format(
            non_refactored_instances.shape[0]), False)

    assert non_refactored_instances.shape[0] > 0, \
        "Found no non-refactoring instances for level: " + refactoring.refactoring_level()

    # set the prediction variable as true and false in the datasets
    refactored_instances["prediction"] = 1
    non_refactored_instances["prediction"] = 0

    # if it's a test run, we reduce the sample randomly
    if TEST:
        refactored_instances = refactored_instances.sample(frac=0.1)
        non_refactored_instances = non_refactored_instances.sample(frac=0.1)

    # now, combine both datasets (with both TRUE and FALSE predictions)
    if non_refactored_instances.shape[1] != refactored_instances.shape[1]:
        raise ValueError("Number of columns differs between the two datasets.")
    merged_dataset = pd.concat(
        [refactored_instances, non_refactored_instances])

    # just to be sure, shuffle the dataset
    merged_dataset = merged_dataset.sample(frac=1, random_state=42)

    # do we want to try the models without some metrics, e.g. process and authorship metrics?
    merged_dataset = merged_dataset.drop(DROP_METRICS, axis=1)

    # separate the x from the y (as required by the scikit-learn API)
    x = merged_dataset.drop("prediction", axis=1)
    y = merged_dataset["prediction"]

    # balance the dataset, as we have far more non-refactored examples than refactored ones
    # for now, we basically perform undersampling
    if is_training_data and BALANCE_DATASET:
        log("instances before balancing: {}".format(Counter(y)), False)
        x, y = perform_balancing(x, y)
        assert x.shape[0] == y.shape[0], \
            "Balancing did not work, x and y have different shapes."
        log("instances after balancing: {}".format(Counter(y)), False)

    # apply some scaling to speed up the algorithm
    if SCALE_DATASET and scaler is None:
        x, scaler = perform_fit_scaling(x)
    elif SCALE_DATASET and scaler is not None:
        x = perform_scaling(x, scaler)

    # let's reduce the number of features in the set
    if is_training_data and FEATURE_REDUCTION and allowed_features is None:
        x = perform_feature_reduction(x, y)
    # enforce the specified feature set
    elif allowed_features is not None:
        drop_list = [
            column for column in x.columns.values
            if column not in allowed_features
        ]
        x = x.drop(drop_list, axis=1)
        assert x.shape[1] == len(allowed_features), \
            "Incorrect number of features for dataset " + dataset

    # Remove all instances with a -1 value in the process and authorship metrics.
    # This is done after the feature reduction, to simplify the query and to avoid
    # dropping instances that are only affected by faulty process and authorship
    # metrics outside the feature set.
    if DROP_FAULTY_PROCESS_AND_AUTHORSHIP_METRICS and not DROP_PROCESS_AND_AUTHORSHIP_METRICS:
        log(
            "Instance count before dropping faulty process metrics: {}".format(
                len(merged_dataset.index)), False)
        metrics = [
            metric for metric in PROCESS_AND_AUTHORSHIP_METRICS
            if metric in x.columns.values
        ]
        query = " and ".join(["%s != -1" % metric for metric in metrics])
        merged_dataset = merged_dataset.query(query)
        log(
            "Instance count after dropping faulty process metrics: {}".format(
                len(merged_dataset.index)), False)

    log("Got %d instances with %d features for the dataset: %s." %
        (x.shape[0], x.shape[1], dataset))
    return x.columns.values, x, y, scaler
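
# perform_feature_reduction() is called in several of these examples but not
# defined here. A minimal sketch of one plausible implementation, using
# recursive feature elimination with a small random forest; the actual
# estimator, the number of features kept, and the signature are assumptions.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

def perform_feature_reduction_sketch(x: pd.DataFrame, y, n_features=15):
    # rank the features with RFE and keep only the selected columns,
    # preserving the dataframe structure for the caller
    selector = RFE(RandomForestClassifier(n_estimators=10, random_state=42),
                   n_features_to_select=n_features)
    selector.fit(x, y)
    return x.loc[:, selector.support_]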