Exemple #1
0
def tnb(source, target, n_rep=12):
    """
    TNB: Transfer Naive Bayes

    Pairs every target project with every differently named source project.
    For each pair, instance weights are derived from the target's min/max
    details, the weighted train/test sets are handed to ``predict_defects``
    and the scores reported by ``abcd`` (threshold 0.4) are collected over
    ``n_rep`` repetitions. One summary table per target is printed.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :return: result: dict of target name -> pandas.DataFrame holding, per
             source, the integer-truncated mean/std of Pd, Pf and AUC,
             ordered by descending mean AUC
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)",
               "AUC (Mean)", "AUC (Std)"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, auc_runs = [], [], []
            for _ in xrange(n_rep):
                lo, hi, _test_mass = target_details(tgt)
                weights = get_weights(maxs=hi, mins=lo, train_set=src, test_set=tgt)
                _train, __test = weight_training(weights=weights,
                                                 training_instance=src,
                                                 test_instance=tgt)
                actual, predicted, distribution = predict_defects(train=_train, test=__test)

                # Only Pd, Pf and AUC are reported; the remaining scores are
                # unpacked so a change in abcd's return arity is still caught.
                p_d, p_f, _p_r, _rc, _f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution, threshold=0.4)

                pd_runs.append(p_d)
                pf_runs.append(p_f)
                auc_runs.append(int(auroc))

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.std(pd_runs)),
                          int(np.mean(pf_runs)), int(np.std(pf_runs)),
                          int(np.mean(auc_runs)), int(np.std(auc_runs))])

        # row[-2] is "AUC (Mean)": best-performing source first.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[-2], reverse=True),
                                 columns=columns)
        print(tabulate(stats,
                       headers=columns,
                       showindex="never",
                       tablefmt="fancy_grid"))

        result.update({tgt_name: stats})

    return result
Exemple #2
0
def tca_plus(source, target, verbose=False, n_rep=12):
    """
    TCA: Transfer Component Analysis

    Pairs every target project with every differently named source project.
    For each pair the data is normalised with ``smart_norm`` (using the
    values returned by ``get_dcv``), mapped with ``map_transform`` and scored
    through ``predict_defects`` / ``abcd`` over ``n_rep`` repetitions.

    :param source: mapping of project name -> value accepted by
                   ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param verbose: when true, print the target name and a summary table for
                    every target; when false the function is silent
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :return: result: dict of target name -> pandas.DataFrame with the
             integer-truncated mean Pd, Pf, Prec, F1, G and AUC per source,
             ordered by source name
    """
    columns = ["Name", "Pd", "Pf", "Prec", "F1", "G", "AUC"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        if verbose:
            print("{}  \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path)
            tgt = list2dataframe(tgt_path)
            pd_runs, pf_runs, pr_runs = [], [], []
            f1_runs, g_runs, auc_runs = [], [], []
            dcv_src, dcv_tgt = get_dcv(src, tgt)

            for _ in xrange(n_rep):
                norm_src, norm_tgt = smart_norm(src, tgt, dcv_src, dcv_tgt)
                _train, __test = map_transform(norm_src, norm_tgt)
                actual, predicted, distribution = predict_defects(
                    train=_train, test=__test)
                p_d, p_f, p_r, _rc, f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution)

                pd_runs.append(p_d)
                pf_runs.append(p_f)
                # Precision and F1 get their own scores (not Pf again).
                pr_runs.append(p_r)
                f1_runs.append(f_1)
                g_runs.append(_g)
                auc_runs.append(int(auroc))

            # One value per declared column, in column order.
            stats.append([
                src_name,
                int(np.mean(pd_runs)),
                int(np.mean(pf_runs)),
                int(np.mean(pr_runs)),
                int(np.mean(f1_runs)),
                int(np.mean(g_runs)),
                int(np.mean(auc_runs)),
            ])

        # row[0] is the source name: rows are ordered alphabetically.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[0]),
                                 columns=columns)

        if verbose:
            print(
                tabulate(
                    stats,
                    headers=columns,
                    showindex="never",
                    tablefmt="fancy_grid"))

        result.update({tgt_name: stats})
    return result
Exemple #3
0
def tca_plus(source, target, verbose=False, n_rep=12):
    """
    TCA: Transfer Component Analysis

    Pairs every target project with every differently named source project.
    For each pair the data is normalised with ``smart_norm`` (using the
    values returned by ``get_dcv``), mapped with ``map_transform`` and scored
    through ``predict_defects`` / ``abcd`` over ``n_rep`` repetitions. Only
    Pd and Pf are summarised.

    :param source: mapping of project name -> value accepted by
                   ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param verbose: when true, print the name of each target as it is
                    processed; when false the function is silent
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :return: result: dict of target name -> pandas.DataFrame with the
             integer-truncated mean/std of Pd and Pf per source, ordered by
             source name
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        if verbose:
            print("{}  \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path)
            tgt = list2dataframe(tgt_path)
            pd_runs, pf_runs = [], []
            dcv_src, dcv_tgt = get_dcv(src, tgt)

            for _ in xrange(n_rep):
                norm_src, norm_tgt = smart_norm(src, tgt, dcv_src, dcv_tgt)
                _train, __test = map_transform(norm_src, norm_tgt)
                actual, predicted, distribution = predict_defects(train=_train, test=__test)
                # All eight scores are unpacked so a change in abcd's return
                # arity is still caught, although only Pd and Pf are kept.
                p_d, p_f, _p_r, _rc, _f_1, _e_d, _g, _auroc = abcd(
                    actual, predicted, distribution)

                pd_runs.append(p_d)
                pf_runs.append(p_f)

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.std(pd_runs)),
                          int(np.mean(pf_runs)), int(np.std(pf_runs))])

        # row[0] is the source name: rows are ordered alphabetically.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[0]),
                                 columns=columns)
        result.update({tgt_name: stats})
    return result
Exemple #4
0
def seer(source, target, n_rep=20, n_redo=5):
    """
    seer: Causal Inference Learning

    Pairs every target project with every differently named source project.
    ``metrics_match`` proposes, for target columns, a source column and a
    score; matches scoring above 1 are kept, best first, using each source
    column at most once. The matched columns (plus the last column of each
    frame) are then passed to ``predict_defects`` ``n_rep`` times.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of prediction repeats per source/target pair
    :param n_redo: forwarded unchanged to ``metrics_match``
    :return: result: A dictionary of target name -> pandas.DataFrame with the
             mean/std (rounded to 2 decimals) of Pd, Pf and G per source,
             ordered by source name
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)",
               "G (Mean)", "G (Std)"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, g_runs = [], [], []

            matched_src = metrics_match(src, tgt, n_redo)

            # The matching is fixed for the pair, so the column selection is
            # done once instead of on every repetition.
            all_columns = [(tgt_col, match[0], match[1])
                           for tgt_col, match in matched_src.iteritems()
                           if match[1] > 1]
            all_columns = sorted(all_columns, key=lambda x: x[-1], reverse=True)  # Sort descending

            # Keep only the best-scoring match for every source column.
            target_columns = []
            source_columns = []
            for tgt_col, src_col, _score in all_columns:
                if src_col not in source_columns:
                    target_columns.append(tgt_col)
                    source_columns.append(src_col)

            for _ in xrange(n_rep):
                _train = src[source_columns + [src.columns[-1]]]
                __test = tgt[target_columns + [tgt.columns[-1]]]

                actual, predicted = predict_defects(train=_train, test=__test)
                p_d, p_f, _p_r, _rc, _f_1, e_d, _g = abcd(actual, predicted)
                pd_runs.append(p_d)
                pf_runs.append(p_f)
                # NOTE(review): the "G" columns are fed with e_d, not _g --
                # kept as in the original; confirm this is intended.
                g_runs.append(e_d)

            stats.append([src_name,
                          round(np.mean(pd_runs), 2), round(np.std(pd_runs), 2),
                          round(np.mean(pf_runs), 2), round(np.std(pf_runs), 2),
                          round(np.mean(g_runs), 2), round(np.std(g_runs), 2)])

        # row[0] is the source name: rows are ordered alphabetically.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[0]),
                                 columns=columns)
        result.update({tgt_name: stats})
    return result
Exemple #5
0
def tca_plus(source, target, n_rep=12):
    """
    TCA: Transfer Component Analysis

    Pairs every target project with every differently named source project.
    For each pair the data is normalised with ``smart_norm`` (using the
    values returned by ``get_dcv``), mapped with ``map_transform`` and scored
    through ``predict_defects`` / ``abcd`` over ``n_rep`` repetitions. The
    target and source names are printed as progress, followed by one summary
    table per target.

    Errors raised by ``predict_defects`` propagate to the caller.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :return: result: dict of target name -> pandas.DataFrame holding, per
             source, the integer-truncated mean/std of Pd, Pf and AUC,
             ordered by descending mean AUC
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)",
               "AUC (Mean)", "AUC (Std)"]
    result = dict()

    for tgt_name, tgt_path in target.iteritems():
        stats = []
        print("{}  \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            print("{}  \r".format(src_name[0].upper() + src_name[1:]))
            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, auc_runs = [], [], []
            dcv_src, dcv_tgt = get_dcv(src, tgt)

            for _ in xrange(n_rep):
                norm_src, norm_tgt = smart_norm(src, tgt, dcv_src, dcv_tgt)
                _train, __test = map_transform(norm_src, norm_tgt)

                # No try/except here: a failed prediction must not be scored
                # with values left over from a previous repetition.
                actual, predicted, distribution = predict_defects(train=_train, test=__test)

                p_d, p_f, _p_r, _rc, _f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution)

                pd_runs.append(p_d)
                pf_runs.append(p_f)
                auc_runs.append(int(auroc))

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.std(pd_runs)),
                          int(np.mean(pf_runs)), int(np.std(pf_runs)),
                          int(np.mean(auc_runs)), int(np.std(auc_runs))])

        # row[-2] is "AUC (Mean)": best-performing source first.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[-2], reverse=True),
                                 columns=columns)
        print(tabulate(stats,
                       headers=columns,
                       showindex="never",
                       tablefmt="fancy_grid"))

        result.update({tgt_name: stats})
    return result
Exemple #6
0
def vcb(source, target, n_rep=12):
    """
    VCB transfer learner

    Pairs every target project with every differently named source project.
    On each of ``n_rep`` repetitions ``weight_training`` is run on the pair,
    its ``clf_w`` and ``classifiers`` outputs are handed to
    ``predict_defects`` together with the target data, an effort plot is
    saved under ``<root>/plot/plots/<target name>`` (file name: the source
    name) and the scores from ``abcd`` are collected. One summary table per
    target is printed.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``; the resulting
                   frame must contain a ``$loc`` column
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :return: result: dict of target name -> pandas.DataFrame holding, per
             source, the integer-truncated mean/std of Pd, Pf and AUC,
             ordered by descending mean AUC
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)",
               "AUC (Mean)", "AUC (Std)"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, auc_runs = [], [], []
            for _ in xrange(n_rep):
                _train, clf_w, classifiers = weight_training(train=src, test=tgt)
                actual, predicted, distribution = predict_defects(tgt, clf_w, classifiers)

                # Express LOC as a percentage of the largest module. 100.0
                # forces true division, so an integer-typed column is not
                # floor-divided under Python 2.
                loc = tgt["$loc"].values
                loc = loc * 100.0 / np.max(loc)
                recall, loc, _au_roc = get_curve(loc, actual, predicted, distribution)
                effort_plot(recall, loc,
                            save_dest=os.path.abspath(os.path.join(root, "plot", "plots", tgt_name)),
                            save_name=src_name)

                # The reported AUC is the one computed by abcd, not get_curve.
                p_d, p_f, _p_r, _rc, _f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution)

                pd_runs.append(p_d)
                pf_runs.append(p_f)
                auc_runs.append(int(auroc))

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.std(pd_runs)),
                          int(np.mean(pf_runs)), int(np.std(pf_runs)),
                          int(np.mean(auc_runs)), int(np.std(auc_runs))])

        # row[-2] is "AUC (Mean)": best-performing source first.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[-2], reverse=True),
                                 columns=columns)
        print(tabulate(stats,
                       headers=columns,
                       showindex="never",
                       tablefmt="fancy_grid"))

        result.update({tgt_name: stats})
    return result
Exemple #7
0
def tca_plus(source, target, verbose=True, n_rep=12):
    """
    TCA: Transfer Component Analysis

    Every target CSV is paired with every differently named source CSV. Each
    pair is normalised, stripped of columns containing NaN, transformed and
    scored ``n_rep`` times; the mean of each score is tabulated per source.

    :param source: mapping of project name -> path read by pandas.read_csv
    :param target: mapping of project name -> path read by pandas.read_csv
    :param verbose: print the target name and its summary table when true
    :param n_rep: number of repeats
    :return: result: dict of target name -> pandas.DataFrame with columns
             Name, Pd, Pf, Prec, F1, G, AUC, ordered by descending G
    """
    header = ["Name", "Pd", "Pf", "Prec", "F1", "G", "AUC"]
    result = dict()

    for tgt_name, tgt_path in target.iteritems():
        if verbose:
            print("{}  \r".format(tgt_name[0].upper() + tgt_name[1:]))

        rows = []
        for src_name, src_path in source.iteritems():
            if src_name == tgt_name:
                continue

            src = pandas.read_csv(src_path)
            tgt = pandas.read_csv(tgt_path)
            # One bucket per reported score, in header order:
            # Pd, Pf, Prec, F1, G, AUC.
            buckets = ([], [], [], [], [], [])
            dcv_src, dcv_tgt = get_dcv(src, tgt)

            for _ in xrange(n_rep):
                norm_src, norm_tgt = smart_norm(src, tgt, dcv_src, dcv_tgt)

                clean_src = norm_src.dropna(axis=1)
                clean_tgt = norm_tgt.dropna(axis=1)
                _train, __test = map_transform(clean_src, clean_tgt)

                actual, predicted, distribution = predict_defects(train=_train, test=__test)
                p_d, p_f, p_r, rc, f_1, e_d, _g, auroc = abcd(actual, predicted, distribution)

                run_scores = (p_d, p_f, p_r, f_1, _g, int(auroc))
                for bucket, score in zip(buckets, run_scores):
                    bucket.append(score)

            rows.append([src_name] + [int(np.mean(bucket)) for bucket in buckets])

        # Position -2 is the G column; highest G comes first.
        rows.sort(key=lambda row: row[-2], reverse=True)
        stats = pandas.DataFrame(rows, columns=header)

        if verbose:
            print(tabulate(stats,
                           headers=header,
                           showindex="never",
                           tablefmt="fancy_grid"))

        result[tgt_name] = stats

    return result
Exemple #8
0
def bellw(source, target, n_rep=12, verbose=False):
    """
    bellw: baseline transfer run (presumably the "bellwether" learner --
    TODO confirm the expansion of the name)

    Pairs every target project with every differently named source project.
    Each pair is passed through ``weight_training`` and scored with
    ``predict_defects`` / ``abcd`` over ``n_rep`` repetitions.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :param verbose: when true, print the target name and a summary table for
                    every target; when false the function is silent
    :return: result: dict of target name -> pandas.DataFrame with the
             integer-truncated mean Pd, Pf, Prec, F1, G and AUC per source,
             ordered by descending G
    """
    columns = ["Name", "Pd", "Pf", "Prec", "F1", "G", "AUC"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        if verbose:
            print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)

            pd_runs, pf_runs, pr_runs = [], [], []
            f1_runs, g_runs, auc_runs = [], [], []
            for _ in xrange(n_rep):
                _train, __test = weight_training(test_instance=tgt, training_instance=src)
                actual, predicted, distribution = predict_defects(train=_train, test=__test)
                p_d, p_f, p_r, _rc, f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution)

                pd_runs.append(p_d)
                pf_runs.append(p_f)
                pr_runs.append(p_r)
                f1_runs.append(f_1)
                g_runs.append(_g)
                auc_runs.append(int(auroc))

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.mean(pf_runs)),
                          int(np.mean(pr_runs)), int(np.mean(f1_runs)),
                          int(np.mean(g_runs)), int(np.mean(auc_runs))])

        # row[-2] is "G": best-performing source first.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[-2], reverse=True),
                                 columns=columns)

        if verbose:
            print(tabulate(stats,
                           headers=columns,
                           showindex="never",
                           tablefmt="fancy_grid"))

        result.update({tgt_name: stats})

    return result
Exemple #9
0
def tca_plus(source, target, n_rep=12):
    """
    TCA: Transfer Component Analysis

    Pairs every target project with every differently named source project.
    For each pair the data is normalised with ``smart_norm`` (using the
    values returned by ``get_dcv``), mapped with ``map_transform`` and scored
    through ``predict_defects`` / ``abcd`` over ``n_rep`` repetitions. One
    summary table per target is printed.

    :param source: mapping of project name -> value accepted by
                   ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of repeats (at least 1: the mean of zero runs is NaN
                  and cannot be converted to ``int``)
    :return: result: dict of target name -> pandas.DataFrame holding, per
             source, the integer-truncated mean/std of Pd, Pf and AUC,
             ordered by source name
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)",
               "AUC (Mean)", "AUC (Std)"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        print("{}  \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path)
            tgt = list2dataframe(tgt_path)
            pd_runs, pf_runs, auc_runs = [], [], []
            dcv_src, dcv_tgt = get_dcv(src, tgt)

            for _ in xrange(n_rep):
                norm_src, norm_tgt = smart_norm(src, tgt, dcv_src, dcv_tgt)
                _train, __test = map_transform(norm_src, norm_tgt)
                actual, predicted, distribution = predict_defects(train=_train, test=__test)
                p_d, p_f, _p_r, _rc, _f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution)

                pd_runs.append(p_d)
                pf_runs.append(p_f)
                # The AUC columns are reported below, so the value returned
                # by abcd has to be recorded on every repetition.
                auc_runs.append(int(auroc))

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.std(pd_runs)),
                          int(np.mean(pf_runs)), int(np.std(pf_runs)),
                          int(np.mean(auc_runs)), int(np.std(auc_runs))])

        # row[0] is the source name: rows are ordered alphabetically.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[0]),
                                 columns=columns)

        print(tabulate(stats,
                       headers=columns,
                       showindex="never",
                       tablefmt="fancy_grid"))

        result.update({tgt_name: stats})
    return result
Exemple #10
0
def tnb(source, target, verbose=False, n_rep=12):
    """
    TNB: Transfer Naive Bayes

    Every target CSV is paired with every differently named source CSV. On
    each repetition the weights are rebuilt from the target details, the
    weighted train/test sets are scored, and the mean of each score is
    tabulated per source.

    :param source: mapping of project name -> path read by pandas.read_csv
    :param target: mapping of project name -> path read by pandas.read_csv
    :param verbose: print the target name and its summary table when true
    :param n_rep: number of repeats
    :return: result: dict of target name -> pandas.DataFrame with columns
             Name, Pd, Pf, Prec, F1, G, AUC, ordered by descending G
    """
    header = ["Name", "Pd", "Pf", "Prec", "F1", "G", "AUC"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        if verbose:
            print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))

        rows = []
        for src_name, src_path in source.iteritems():
            if src_name == tgt_name:
                continue

            src = pandas.read_csv(src_path)
            tgt = pandas.read_csv(tgt_path)
            # One bucket per reported score, in header order:
            # Pd, Pf, Prec, F1, G, AUC.
            buckets = ([], [], [], [], [], [])
            for _ in xrange(n_rep):
                lo, hi, test_mass = target_details(tgt)
                weights = get_weights(maxs=hi, mins=lo,
                                      train_set=src, test_set=tgt)
                _train, __test = weight_training(weights=weights,
                                                 training_instance=src,
                                                 test_instance=tgt)
                actual, predicted, distribution = predict_defects(train=_train,
                                                                  test=__test)
                p_d, p_f, p_r, rc, f_1, e_d, _g, auroc = abcd(actual,
                                                              predicted,
                                                              distribution)

                run_scores = (p_d, p_f, p_r, f_1, _g, int(auroc))
                for bucket, score in zip(buckets, run_scores):
                    bucket.append(score)

            rows.append([src_name] + [int(np.mean(bucket)) for bucket in buckets])

        # Position -2 is the G column; highest G comes first.
        rows.sort(key=lambda row: row[-2], reverse=True)
        stats = pandas.DataFrame(rows, columns=header)

        if verbose:
            print(tabulate(stats,
                           headers=header,
                           showindex="never",
                           tablefmt="fancy_grid"))

        result[tgt_name] = stats

    return result
Exemple #11
0
def seer(source, target, n_rep=20, n_redo=5):
    """
    seer: Causal Inference Learning

    Pairs every target project with every differently named source project.
    ``metrics_match`` proposes, for target columns, a source column and a
    score; matches scoring above 1 are kept, best first, using each source
    column at most once. The matched columns (plus the last column of each
    frame) are then passed to ``predict_defects`` ``n_rep`` times.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of prediction repeats per source/target pair
    :param n_redo: forwarded unchanged to ``metrics_match``
    :return: result: A dictionary of target name -> pandas.DataFrame with the
             mean/std (rounded to 2 decimals) of Pd, Pf and G per source,
             ordered by source name
    """
    columns = [
        "Name", "Pd (Mean)", "Pd (Std)", "Pf (Mean)", "Pf (Std)",
        "G (Mean)", "G (Std)"
    ]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, g_runs = [], [], []

            matched_src = metrics_match(src, tgt, n_redo)

            # The matching is fixed for the pair, so the column selection is
            # done once instead of on every repetition.
            all_columns = [(tgt_col, match[0], match[1])
                           for tgt_col, match in matched_src.iteritems()
                           if match[1] > 1]
            all_columns = sorted(all_columns,
                                 key=lambda x: x[-1],
                                 reverse=True)  # Sort descending

            # Keep only the best-scoring match for every source column.
            target_columns = []
            source_columns = []
            for tgt_col, src_col, _score in all_columns:
                if src_col not in source_columns:
                    target_columns.append(tgt_col)
                    source_columns.append(src_col)

            for _ in xrange(n_rep):
                _train = src[source_columns + [src.columns[-1]]]
                __test = tgt[target_columns + [tgt.columns[-1]]]

                actual, predicted = predict_defects(train=_train,
                                                    test=__test)
                p_d, p_f, _p_r, _rc, _f_1, e_d, _g = abcd(actual, predicted)
                pd_runs.append(p_d)
                pf_runs.append(p_f)
                # NOTE(review): the "G" columns are fed with e_d, not _g --
                # kept as in the original; confirm this is intended.
                g_runs.append(e_d)

            stats.append([
                src_name,
                round(np.mean(pd_runs), 2),
                round(np.std(pd_runs), 2),
                round(np.mean(pf_runs), 2),
                round(np.std(pf_runs), 2),
                round(np.mean(g_runs), 2),
                round(np.std(g_runs), 2),
            ])

        # row[0] is the source name: rows are ordered alphabetically.
        stats = pandas.DataFrame(
            sorted(stats, key=lambda row: row[0]),
            columns=columns)
        result.update({tgt_name: stats})
    return result
Exemple #12
0
def seer(source, target, n_rep=20, n_redo=5):
    """
    seer: Causal Inference Learning

    Pairs every target project with every differently named source project.
    ``metrics_match`` proposes, for target columns, a source column and a
    score; matches scoring above 1 are kept, best first, using each source
    column at most once. Only column names that appear on both the target
    and the source side of the kept matches are used: those columns (plus
    the last column of each frame) go through ``map_transform`` and
    ``predict_defects`` ``n_rep`` times. One summary table per target is
    printed.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``
    :param n_rep: number of prediction repeats per source/target pair (at
                  least 1: the mean of zero runs is NaN and cannot be
                  converted to ``int``)
    :param n_redo: forwarded unchanged to ``metrics_match``
    :return: result: A dictionary of target name -> pandas.DataFrame holding,
             per source, the integer-truncated mean/std of Pd, Pf and AUC,
             ordered by descending mean AUC
    """
    columns = ["Name", "Pd (Mean)", "Pd (Std)",
               "Pf (Mean)", "Pf (Std)",
               "AUC (Mean)", "AUC (Std)"]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, auc_runs = [], [], []

            matched_src = metrics_match(src, tgt, n_redo)

            # The matching is fixed for the pair, so the column selection is
            # done once instead of on every repetition.
            all_columns = [(tgt_col, match[0], match[1])
                           for tgt_col, match in matched_src.iteritems()
                           if match[1] > 1]
            all_columns = sorted(all_columns, key=lambda x: x[-1], reverse=True)  # Sort descending

            # Keep only the best-scoring match for every source column.
            target_columns = []
            source_columns = []
            for tgt_col, src_col, _score in all_columns:
                if src_col not in source_columns:
                    target_columns.append(tgt_col)
                    source_columns.append(src_col)
            selected_col = list(set(target_columns).intersection(source_columns))

            for _ in xrange(n_rep):
                _train, __test = map_transform(src[selected_col + [src.columns[-1]]],
                                               tgt[selected_col + [tgt.columns[-1]]])

                actual, predicted, distribution = predict_defects(train=_train, test=__test)
                p_d, p_f, _p_r, _rc, _f_1, _e_d, _g, auroc = abcd(
                    actual, predicted, distribution)
                pd_runs.append(p_d)
                pf_runs.append(p_f)
                auc_runs.append(int(auroc))

            stats.append([src_name,
                          int(np.mean(pd_runs)), int(np.std(pd_runs)),
                          int(np.mean(pf_runs)), int(np.std(pf_runs)),
                          int(np.mean(auc_runs)), int(np.std(auc_runs))])

        # row[-2] is "AUC (Mean)": best-performing source first.
        stats = pandas.DataFrame(sorted(stats, key=lambda row: row[-2], reverse=True),
                                 columns=columns)
        print(tabulate(stats,
                       headers=columns,
                       showindex="never",
                       tablefmt="fancy_grid"))

        result.update({tgt_name: stats})
    return result
Exemple #13
0
def tnb(source, target, verbose=False, n_rep=12):
    """
    TNB: Transfer Naive Bayes

    Pairs every target project with every differently named source project.
    For each pair the source is re-weighted with weights derived from the
    target's min/max details, the target attributes are min-max scaled
    column by column, ``predict_defects`` is run once, an effort plot is
    saved under ``<root>/plot/plots/<target name>`` (file name: the source
    name) and Pd / Pf / AUC are recorded.

    Each pair is evaluated exactly once, so every "(Std)" column is 0.

    :param source: mapping of project name -> object whose ``.data`` is
                   accepted by ``list2dataframe``
    :param target: mapping of the same form as ``source``; the resulting
                   frame must contain a ``$loc`` column
    :param verbose: when true, print the target name and a summary table for
                    every target
    :param n_rep: number of repeats -- currently unused, kept for interface
                  compatibility
    :return: result: dict of target name -> pandas.DataFrame holding, per
             source, the integer-truncated mean/std of Pd, Pf and AUC,
             ordered by source name
    """
    columns = [
        "Name", "Pd (Mean)", "Pd (Std)", "Pf (Mean)", "Pf (Std)",
        "AUC (Mean)", "AUC (Std)"
    ]
    result = dict()
    for tgt_name, tgt_path in target.iteritems():
        stats = []
        if verbose:
            print("{} \r".format(tgt_name[0].upper() + tgt_name[1:]))
        for src_name, src_path in source.iteritems():
            # A project is never used to predict itself.
            if src_name == tgt_name:
                continue

            src = list2dataframe(src_path.data)
            tgt = list2dataframe(tgt_path.data)
            pd_runs, pf_runs, auc_runs = [], [], []

            lo, hi, _test_mass = target_details(tgt)
            weights = get_weights(maxs=hi,
                                  mins=lo,
                                  train_set=src,
                                  test_set=tgt)
            _train = weight_training(weights=weights,
                                     training_instance=src)

            # Min-max scale every column except the last one, which is
            # copied over unchanged.
            features = tgt[tgt.columns[:-1]]
            __test = (features - features.min()) / (features.max() - features.min())
            __test[tgt.columns[-1]] = tgt[tgt.columns[-1]]

            actual, predicted, distribution = predict_defects(train=_train,
                                                              test=__test)

            # Express LOC as a percentage of the largest module. 100.0
            # forces true division, so an integer-typed column is not
            # floor-divided under Python 2.
            loc = tgt["$loc"].values
            loc = loc * 100.0 / np.max(loc)
            recall, loc, au_roc = get_curve(loc, actual, predicted)
            effort_plot(recall,
                        loc,
                        save_dest=os.path.abspath(
                            os.path.join(root, "plot", "plots", tgt_name)),
                        save_name=src_name)
            p_d, p_f, _p_r, _rc, _f_1, _e_d, _g = abcd(actual, predicted,
                                                       distribution)

            pd_runs.append(p_d)
            pf_runs.append(p_f)
            # The reported AUC is the one computed by get_curve.
            auc_runs.append(int(au_roc))
            stats.append([
                src_name,
                int(np.mean(pd_runs)),
                int(np.std(pd_runs)),
                int(np.mean(pf_runs)),
                int(np.std(pf_runs)),
                int(np.mean(auc_runs)),
                int(np.std(auc_runs)),
            ])

        # row[0] is the source name: rows are ordered alphabetically.
        stats = pandas.DataFrame(
            sorted(stats, key=lambda row: row[0]),
            columns=columns)
        if verbose:
            print(
                tabulate(stats,
                         headers=columns,
                         showindex="never",
                         tablefmt="fancy_grid"))

        result.update({tgt_name: stats})

    return result