def PlotMimicProbabilities(inputfile, outputfile, topk, size):
    data = pd.read_csv(inputfile)
    data = data[data['size'] == size].head(topk)
    y_ser = []
    ser = u.YSeries(data['n2'],
                    line_color='k',
                    xvalues=data['iters'],
                    points_marker='*',
                    plot_legend_label="P(X = 1 | parent = 0)")
    y_ser.append(ser)
    ser = u.YSeries(data['root_node_prob'],
                    line_color='r',
                    xvalues=data['iters'],
                    points_marker='x',
                    plot_legend_label="P(X_root = 1)")
    y_ser.append(ser)
    ser = u.YSeries(data['n1'],
                    line_color='g',
                    xvalues=data['iters'],
                    points_marker='o',
                    plot_legend_label="P(X = 1 | parent = 1)")
    y_ser.append(ser)

    u.SaveDataPlotWithLegends(y_ser,
                              filename=outputfile,
                              y1_axis_name="probabilities",
                              x_axis_name="iterations")
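
# A minimal illustrative call (names hypothetical); the input CSV is assumed
# to carry the 'size', 'iters', 'n1', 'n2' and 'root_node_prob' columns read
# above:
# PlotMimicProbabilities("mimic_stats.csv", "mimic_probs.png", topk=50, size=20)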
def PlotAdaboostPerIterationCurves(file_template,
                                   filter,
                                   plot_output_file,
                                   iters,
                                   y_axis_name='F-Measure'):
    ts = [20, 30, 40, 50, 60, 70, 80, 90, 100]
    colors = u.GetColorCombinations()
    y = []
    for _ts in ts:
        data = pd.read_csv(file_template.format(str(_ts)))
        data = u.FilterRows(data, filter)
        data = data.set_index('iter')
        train_data = u.FilterRows(data, lambda x: x['istrain'] == 1)
        test_data = u.FilterRows(data, lambda x: x['istrain'] == 0)
        train_y = []
        test_y = []
        for iter in iters:
            train_y.append(train_data.loc[iter]['m'])
            test_y.append(test_data.loc[iter]['m'])
        c = colors.pop()
        y.append(
            u.YSeries(train_y,
                      points_marker='o',
                      line_color=c['color'],
                      plot_legend_label=str(_ts) + "-train"))
        y.append(
            u.YSeries(test_y,
                      points_marker='x',
                      line_color=c['color'],
                      plot_legend_label=str(_ts) + "-validation"))
    u.SaveDataPlotWithLegends(y,
                              iters,
                              plot_output_file,
                              x_axis_name="num of iters/weak learners",
                              y1_axis_name=y_axis_name)
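
# Hypothetical usage: the template is formatted with each train size, and each
# per-size CSV must carry 'iter', 'istrain' and 'm' columns (the 'prune'
# filter column is an assumption for illustration):
# PlotAdaboostPerIterationCurves("adaboost_ts-{0}/metrics.csv",
#                                lambda x: x['prune'] == False,
#                                "plots/adaboost_iters.png",
#                                iters=[10, 20, 50, 100])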
def PlotCrossValidationCurves2(dataset_instance_root,
                               plot_output_file,
                               x_axis_name,
                               y_axis_name,
                               title,
                               parameter_value_getter_fn,
                               cv_results_file_getter_fn,
                               cv_save_file=None,
                               should_plot=lambda x: True,
                               label_maker=lambda x: x):
    grid = ParameterGrid([{
        'marker': ['o', 'x', 'd', '^', '+', 'v', '8', 's', 'p', '>', '<'],
        'color': [
            'orange', 'red', 'blue', 'green', 'black', 'saddlebrown', 'violet',
            'darkcyan', 'maroon', 'lightcoral'
        ]
    }])
    combinations = [p for p in grid]
    random.seed(30)
    random.shuffle(combinations)
    param_dict = {}
    x_value_dict = {}
    for parameter_value_dataset in u.Get_Subdirectories(dataset_instance_root):
        cv_file_path = cv_results_file_getter_fn(parameter_value_dataset)
        if not os.path.isfile(cv_file_path):
            continue
        cv_results = pd.read_csv(cv_file_path)
        parameter_value = parameter_value_getter_fn(parameter_value_dataset)
        for i in range(len(cv_results)):
            #param_dict = {param1 : series_1}
            param = cv_results.iloc[i]['params']
            s = pd.Series(
                {parameter_value: cv_results.iloc[i]['mean_test_score']})
            if param in param_dict:
                # Series.append was removed in pandas 2.0; concat is the
                # equivalent operation.
                param_dict[param] = pd.concat([param_dict[param], s])
            else:
                param_dict[param] = s
    yseries = []
    x = []
    for name, value in param_dict.items():
        if not should_plot(name):
            continue
        theme = combinations.pop()
        y = u.YSeries(value.sort_index().values,
                      points_marker=theme['marker'],
                      line_color=theme['color'],
                      plot_legend_label=name)
        yseries.append(y)
        x = value.sort_index().index
    # Note: the per-parameter series assembled in the loop above are discarded
    # below; the plot is rebuilt from the transposed frame's first column.
    transpose_data = pd.DataFrame(param_dict).transpose()
    x_values = transpose_data.index.values
    x_values = list(map(label_maker, x_values))
    col = transpose_data.columns[0]
    y_values = transpose_data[col]
    yseries = [u.YSeries(y_values)]
    u.SaveDataPlotWithLegends(yseries, x_values, plot_output_file, True,
                              x_axis_name, y_axis_name)
    if (cv_save_file is not None):
        pd.DataFrame(param_dict).transpose().to_csv(cv_save_file)
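
# Sketch of a call (paths and getters hypothetical); each subdirectory of the
# root is one parameter value and holds a cv results CSV with 'params' and
# 'mean_test_score' columns:
# PlotCrossValidationCurves2(
#     "datasets/instance0", "plots/cv.png", "parameter value", "mean CV score",
#     "cross validation",
#     parameter_value_getter_fn=lambda d: os.path.basename(d),
#     cv_results_file_getter_fn=lambda d: os.path.join(d, "cvresults.csv"))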
def PlotClusteringMetrics(rootfolder, data, k, dim='raw', p=2):
    # Note: k is accepted for signature parity but not used below.
    filter = lambda x: x['dim_red_method'] == dim and x['p'] == p
    filtered_data = u.FilterRows(data, filter)
    metrics = ["ami_raw", "ami_true", "sc", "bic"]
    gmm_data = filtered_data.loc[filtered_data['clustering'] == "gmm", :]
    kmeans_data = filtered_data.loc[filtered_data['clustering'] == "kmeans", :]
    d = {"kmeans": ('o', 'b', 'kmeans'), "gmm": ('x', 'r', 'gmm')}
    for metric in metrics:
        outputfile = u.PreparePath(
            rootfolder +
            "/plots/metrics/{0}_{1}_p={2}.png".format(metric, dim, str(p)))
        kmeans_ser = u.YSeries(kmeans_data[metric], xvalues=kmeans_data["k"],
                               points_marker=d["kmeans"][0],
                               line_color=d["kmeans"][1],
                               plot_legend_label=d["kmeans"][2])
        gmm_ser = u.YSeries(gmm_data[metric], xvalues=gmm_data["k"],
                            points_marker=d["gmm"][0],
                            line_color=d["gmm"][1],
                            plot_legend_label=d["gmm"][2])
        u.SaveDataPlotWithLegends([kmeans_ser, gmm_ser],
                                  x_axis_name="number of clusters",
                                  y1_axis_name=metric, filename=outputfile)
def PlotClusteringMetricsForDimsAndBic(rootfolder, data, dims, dim_reds, k,
                                       metrics=["ami_raw", "ami_true", "sc", "bic"]):
    colors = {"ica": 'r', 'pca': 'b', 'rp': 'g', 'mi': 'k', 'raw': 'orange'}
    markers = {"kmeans": 'o', "gmm": 'x'}
    for _k in dims:
        for metric in metrics:
            for dim_red in dim_reds:
                ser = []
                outputfile = u.PreparePath(
                    rootfolder + "/plots/metrics/dr_{0}_p={1}_{2}.png".format(
                        metric, str(_k), dim_red))
                for mthd in ["kmeans", "gmm"]:
                    d = data.loc[(data['dim_red_method'] == dim_red) &
                                 (data['p'] == _k) &
                                 (data['clustering'] == mthd), :]
                    ser.append(
                        u.YSeries(d[metric], xvalues=d['k'],
                                  line_color=colors[dim_red],
                                  points_marker=markers[mthd],
                                  plot_legend_label="{0}-{1}".format(
                                      dim_red, mthd)))
                u.SaveDataPlotWithLegends(ser, x_axis_name="k",
                                          y1_axis_name=metric,
                                          filename=outputfile)
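
# Both clustering plot helpers consume the aggregated metrics frame written by
# RunExperiments below; a hypothetical invocation:
# data = pd.read_csv("results/dataset1/clustering.csv")
# PlotClusteringMetrics("results/dataset1", data, k=None)
# PlotClusteringMetricsForDimsAndBic("results/dataset1", data, dims=[2, 5, 10],
#                                    dim_reds=["pca", "ica", "rp", "mi"], k=None)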
def PlotLossCurvesForNeuralNets(metrics_file, output_file_template):
    metrics = pd.read_csv(metrics_file)
    es = [False, True]
    y = []
    for _es in es:
        colors = u.GetColorCombinations(4)
        filter = lambda x: x['earlystopping'] == _es
        data = FilterRows(metrics, filter)
        train_data = FilterRows(
            data,
            lambda x: x['istrain'] == 1).set_index('train_split_percent_used')
        for label in train_data.index:
            yvalues = [
                float(x)
                for x in train_data.loc[label]['loss_curve'].split(';')
            ]
            xvalues = np.arange(len(yvalues)) + 1
            y.append(
                u.YSeries(yvalues,
                          points_marker='.',
                          legend_marker='o',
                          line_color=colors.pop()['color'],
                          plot_legend_label=str(label),
                          xvalues=xvalues))
        filename = output_file_template.format(str(_es))
        u.SaveDataPlotWithLegends(y,
                                  None,
                                  filename,
                                  x_axis_name="epochs",
                                  y1_axis_name="train loss",
                                  x_limits=[1, 200])
        y.clear()
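
# Hypothetical usage; each 'loss_curve' cell holds ';'-separated per-epoch
# losses and the template receives the earlystopping flag:
# PlotLossCurvesForNeuralNets("nnet_metrics.csv", "plots/loss_es-{0}.png")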
def PlotSupportVectorsOverlap(root, output_file, data_file=None):
    file_template = root + '/i-0_t-80_T-20/i-0_t-80_ts-{0}/svm/cvresults/cvresults.model'

    y = []
    x = []
    for i in np.arange(30, 110, 10):
        file1 = file_template.format(str(i - 10))
        file2 = file_template.format(str(i))
        s1 = u.ReadBinaryFile(file1).support_
        s2 = u.ReadBinaryFile(file2).support_
        _y = len(set(s1).intersection(s2)) / len(s1)
        _x = i
        y.append(_y)
        x.append(_x)
    outputfile = root + "/" + output_file
    u.SaveDataPlotWithLegends(
        [u.YSeries(y)],
        x,
        outputfile,
        x_axis_name="Train size % used",
        y1_axis_name="Common support vectors fraction wrt previous size %",
        y_limits=[0, 1])
    if (data_file is not None):
        pd.DataFrame({
            'size %': x,
            'overlap': y
        }).to_csv(root + '/' + data_file, index=False)
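
# Hypothetical usage; the pickled models under the root are assumed to expose
# a sklearn-style 'support_' attribute, as read above:
# PlotSupportVectorsOverlap("results/svm_run", "sv_overlap.png",
#                           data_file="sv_overlap.csv")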
def Plot(rootfolder, cols_to_plot_dict):
    data = pd.read_csv(rootfolder + "/stats_agg.csv")
    sizes = data['size'].unique()
    algos = ['rhc', 'sa', 'mimic', 'ga']
    algo_decoration = {
        'mimic': ('r', 'o', 'mimic'),
        'ga': ('g', 's', 'genetic algo'),
        'sa': ('b', '+', 'sim annealing'),
        'rhc': ('k', '*', 'rhc')
    }
    for col in cols_to_plot_dict.keys():
        y_ser = []
        for algo in algos:
            x = data[data['algo'] == algo].loc[:, 'size']
            y = data[data['algo'] == algo].loc[:, col]
            legend_label = algo_decoration[algo][2]
            marker = algo_decoration[algo][1]
            color = algo_decoration[algo][0]
            yseries = u.YSeries(y,
                                points_marker=marker,
                                line_color=color,
                                xvalues=x,
                                plot_legend_label=legend_label)
            y_ser.append(yseries)
        y_axis_name = cols_to_plot_dict[col]
        x_axis_name = 'size'
        savepath = u.PreparePath(rootfolder + "/plots/" + col + ".png")
        u.SaveDataPlotWithLegends(y_ser,
                                  filename=savepath,
                                  y1_axis_name=y_axis_name,
                                  x_axis_name=x_axis_name)
# Helper used alongside Plot; it assumes an algo_decoration mapping like the
# one defined in Plot is available in the enclosing scope.
def f(data, name):
    x = data['iters']
    y = data['fn_value']
    deco = algo_decoration[name]
    return u.YSeries(y,
                     xvalues=x,
                     points_marker='.',
                     plot_legend_label=deco[2],
                     legend_marker='o',
                     line_color=deco[0])
def TestPlotting():
    y1 = u.YSeries(np.arange(10)**2,
                   line_style='-',
                   points_marker='o',
                   line_color='r',
                   plot_legend_label='x^2')
    y2 = u.YSeries(np.arange(10),
                   line_style='-',
                   points_marker='x',
                   line_color='b',
                   plot_legend_label='x')
    x = np.arange(10)
    fig, ax = u.SaveDataPlotWithLegends([y1, y2],
                                        x,
                                        r"c:/temp/testfig.png",
                                        dispose_fig=False,
                                        x_axis_name="x values",
                                        y1_axis_name="y values",
                                        title="x square")
    plt.show()
def PlotTempVariationCurvesForSa(rootfolder, algoname, temperatures):
    """

    """
    rhcdata = u.FilterRows(
        pd.read_csv(rootfolder + "/" + algoname + "/stats_agg.csv"),
        lambda x: x['algo'] == 'rhc')
    y_Ser = []
    y_Ser.append(
        u.YSeries(rhcdata['converged_iters'],
                  xvalues=rhcdata['size'],
                  points_marker="*",
                  line_color="k",
                  plot_legend_label="rhc"))
    data_dict = {}
    deco = {
        '0': ("r", "x"),
        '90': ("b", "o"),
        '95': ("g", "+"),
        '99': ("orange", ">")
    }
    for t in temperatures:
        path = rootfolder + "/" + algoname + "_" + t
        CompteStats(path, ["sa.csv"])
        data_dict[t] = pd.read_csv(path + "/stats_agg.csv")
        y_Ser.append(
            u.YSeries(data_dict[t]['converged_iters'],
                      xvalues=data_dict[t]['size'],
                      points_marker=deco[t][1],
                      line_color=deco[t][0],
                      plot_legend_label="sa_" + t))
    outputfile = rootfolder + "/" + algoname + "/plots/sa_temperatures.png"
    u.SaveDataPlotWithLegends(y_Ser,
                              y1_axis_name="iterations to converge",
                              x_axis_name="size",
                              filename=outputfile)
Ejemplo n.º 12
0
def GetQRewardSeriesToPlot(x, xvalues, key_to_plot):
    markers = u.GetAllMarkers()
    colors = u.GetColorCombinations()
    # Fall back to the first marker if 'param' matches none of the cases below.
    marker = markers[0]
    if (x['exp_strategy'] == 'boltzmann'):
        line_color = 'b'
        if (x['param'] == 0.1):
            marker = markers[0]
        elif (x['param'] == 1):
            marker = markers[1]
        elif (x['param'] == 10):
            marker = markers[2]
        elif (x['param'] == 100):
            marker = markers[3]
    else:
        line_color = 'r'
        if (x['param'] == 0.01):
            marker = markers[0]
        elif (x['param'] == 0.05):
            marker = markers[1]
        elif (x['param'] == 0.1):
            marker = markers[2]
        elif (x['param'] == 0.2):
            marker = markers[3]

    cum_rewards = np.array([float(a) for a in x['cum_rewards'].split(';')])
    avg_rewards = np.array([float(a) for a in x['avg_rewards'].split(';')])
    completion = np.array(
        [float(a) for a in x['ran_to_completion'].split(';')])
    episode_len = cum_rewards / avg_rewards
    xvalues = xvalues[xvalues < cum_rewards.size]
    if (key_to_plot == "ar"):
        y = avg_rewards[xvalues]
    elif (key_to_plot == "cr"):
        y = cum_rewards[xvalues]
    elif (key_to_plot == "goal"):
        y = completion[xvalues]
    elif (key_to_plot == "len"):
        y = episode_len[xvalues]
    else:
        raise ValueError("unknown key_to_plot: " + key_to_plot)

    ser = u.YSeries(y,
                    xvalues=xvalues,
                    points_marker=marker,
                    plot_legend_label=x['exp_strategy'] + "-" +
                    str(x['param']),
                    line_color=line_color)
    return ser
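
# Typically applied row-wise over a Q-learning results frame (hypothetical):
# qdata = pd.read_csv("qlearning.csv")
# series = [GetQRewardSeriesToPlot(row, np.arange(0, 2000, 10), "ar")
#           for _, row in qdata.iterrows()]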
def PlotCrossValidationCurvesForWeka(cv_file,
                                     model_complexity_param_name,
                                     metric_name,
                                     plt_output_file,
                                     title,
                                     x_axis_name,
                                     y_axis_name,
                                     rows_filter_fn=None):
    data = pd.read_csv(cv_file)
    if (rows_filter_fn is not None):
        data = FilterRows(data, rows_filter_fn)
    metric_vals = data[[model_complexity_param_name, metric_name
                        ]].set_index(model_complexity_param_name).sort_index()
    x = metric_vals.index
    y = metric_vals[metric_name]
    y = u.YSeries(y)
    u.SaveDataPlotWithLegends([y], x, plt_output_file, True, x_axis_name,
                              y_axis_name, title)
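
# e.g. plotting F-measure against a model-complexity sweep exported from Weka
# (column names hypothetical):
# PlotCrossValidationCurvesForWeka("cv.csv", "prune_confidence", "f_measure",
#                                  "plots/weka_cv.png", "J48 CV curve",
#                                  "pruning confidence", "F-Measure")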
def ScatterPlot(X, Y_df, output):
    Y = Y_df
    markers = u.GetMarkerColorCombinations(10)
    labels = np.unique(Y)
    y_ser = []
    i = 0
    for label in labels:
        label_data = X[Y == label]
        ser = u.YSeries(
            label_data[:, 0],
            xvalues=label_data[:, 1] if label_data.shape[1] > 1 else
            np.arange(label_data.shape[0]) + 1,
            line_style=".",
            points_marker=markers[i]["marker"],
            line_color=markers[i]["color"],
            plot_legend_label=label)
        plt.scatter(ser.values,
                    ser.xvalues,
                    c=markers[i]["color"],
                    marker=markers[i]["marker"],
                    label=label)
        y_ser.append(ser)
        i = i + 1
    # Persist the figure; without this the 'output' argument was unused.
    plt.savefig(output)
    plt.show()
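
# Hypothetical usage with 2-D features and integer labels:
# ScatterPlot(X_2d, labels, "plots/scatter.png")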
def NNetAnalysis(output_root,
                 output_file_prefix,
                 metrics_file,
                 iters_to_ignore,
                 y_axis_name="F-Measure"):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) &
                    (x['total_iter'] > iters_to_ignore))

        def train_earlystopping_filter(x):
            return x['earlystopping'] & (x['istrain'] == 1)

        def train_no_earlystopping_filter(x):
            return (x['earlystopping'] == False) & (x['istrain'] == 1)

        def test_earlystopping_filter(x):
            return x['earlystopping'] & (x['istrain'] == 0)

        def test_no_earlystopping_filter(x):
            return (x['earlystopping'] == False) & (x['istrain'] == 0)

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'earlystopping', 'istrain'])
        x = data_agg[dataset_type].unique()

        def MissingValuesHandler(curr_values_frame, keyCol, valueCol,
                                 required_values):
            data = dict(
                zip(curr_values_frame[keyCol], curr_values_frame[valueCol]))
            y = []
            for v in required_values:
                if (v in data):
                    y.append(data[v])
                else:
                    y.append(0)
            return y

        for k, v in col_funcs.items():
            for agg in v:
                mvh = lambda df: MissingValuesHandler(df, dataset_type, k + "_"
                                                      + agg, x)
                y_train_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, train_earlystopping_filter)),
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train_with_earlystopping")
                y_train_no_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, train_no_earlystopping_filter)),
                    line_color='r',
                    points_marker='x',
                    plot_legend_label="Train_without_earlystopping")
                y_test_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, test_earlystopping_filter)),
                    line_color='b',
                    points_marker='o',
                    plot_legend_label="Validation_with_earlystopping")
                y_no_test_earlystopping = u.YSeries(
                    mvh(FilterRows(data_agg, test_no_earlystopping_filter)),
                    line_color='b',
                    points_marker='x',
                    plot_legend_label="Validation_without_earlystopping")

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    [
                        y_test_earlystopping, y_no_test_earlystopping,
                        y_train_no_earlystopping, y_train_earlystopping
                    ], x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k],
                    'Neural Nets Performance ({0})'.format(agg))
    return data_agg
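
# The NNet/AdaBoost/Svm/DecisionTree/Knn analysis helpers below all share this
# call shape (paths hypothetical):
# NNetAnalysis("plots/nnets", "nn", "nnet_metrics.csv", iters_to_ignore=10)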
def AdaBoostAnalysis(output_root, output_file_prefix, metrics_file):
    data_all = pd.read_csv(metrics_file)
    dataset_types = [
        'train_split_percent_used', 'imbalance_perc', 'noise_perc'
    ]
    col_funcs = {
        'p': ['mean', 'std'],
        'r': ['mean', 'std'],
        'm': ['mean', 'std']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        dataset_types[1]: 'Fraction of positives to negatives',
        dataset_types[2]: 'Noise %',
        'modelbuildtimesecs': 'Time to build AdaBoost model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]))

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(
            data,
            col_funcs=col_funcs,
            gpby=[dataset_type, 'prune', 'istrain', 'iter'])
        for metric, v in col_funcs.items():
            for agg in v:
                iterations = np.sort(data_agg['iter'].unique())
                prune_vals = data_agg['prune'].unique()
                dataset_type_values = data_agg[dataset_type].unique()
                for type_val in dataset_type_values:
                    for prune_val in prune_vals:
                        metric_col = metric + "_" + agg
                        y_test = []
                        y_train = []
                        for i in iterations:
                            filtered_data = data_agg[
                                (data_agg['prune'] == prune_val)
                                & (data_agg['iter'] == i) &
                                (data_agg[dataset_type] == type_val)]
                            train_data = filtered_data[filtered_data['istrain']
                                                       == 1]
                            assert (len(train_data) == 1)
                            y_train.append(train_data[metric_col].iloc[0])

                            test_data = filtered_data[filtered_data['istrain']
                                                      == 0]
                            assert (len(test_data) == 1)
                            y_test.append(test_data[metric_col].iloc[0])
                        # now we can plot since we have test and train values for each iter
                        output_file_name = u.PreparePath(
                            "{4}/{0}.{1}.prune-{5}.{6}-{7}.{2}.{3}.png".format(
                                output_file_prefix, dataset_type, metric, agg,
                                output_root, prune_val, dataset_type,
                                type_val))
                        y_train_series = u.YSeries(y_train,
                                                   line_color='r',
                                                   plot_legend_label='train')
                        y_test_series = u.YSeries(y_test,
                                                  line_color='b',
                                                  plot_legend_label='test')
                        # '~' on a Python bool is always truthy, so the old
                        # "~os.path.isfile(...)" test always passed; use 'not'.
                        if (not os.path.isfile(output_file_name)):
                            u.SaveDataPlotWithLegends(
                                [y_train_series, y_test_series], iterations,
                                output_file_name, True, "num of iterations",
                                mapping_output_words[metric],
                                "AdaBoost Performance ({0})".format(agg))
                        print(output_file_name)
def SvmAnalysis(output_root,
                output_file_prefix,
                metrics_file,
                dataset_filter_fn=None,
                y_axis_name="F-Measure"):
    def ComputeTotalSupportVectors(s):
        return np.array([int(t) for t in s.split(';')]).sum()

    data_all = pd.read_csv(metrics_file)
    data_all['numsupportvectors'] = data_all['numsupportvectors'].apply(
        ComputeTotalSupportVectors)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)',
        'numsupportvectors': 'Number of Support Vectors'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return ~np.isnan(x[dataset_type])

        def train_filter(x):
            return (x['istrain'] == 1)

        def test_filter(x):
            return (x['istrain'] == 0)

        if (dataset_filter_fn is not None):
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, v in col_funcs.items():
            for agg in v:
                y_train = u.YSeries(FilterRows(data_agg,
                                               train_filter)[k + "_" + agg],
                                    line_color='r',
                                    points_marker='o',
                                    plot_legend_label="Train")
                y_test = u.YSeries(FilterRows(data_agg,
                                              test_filter)[k + "_" + agg],
                                   line_color='b',
                                   points_marker='o',
                                   plot_legend_label='validation')
                # 'numsupportvectors' is not in col_funcs above, so in practice
                # only the 'modelbuildtimesecs' half of this test can fire.
                if ((k == 'numsupportvectors') or (k == 'modelbuildtimesecs')):
                    y_series = [y_train]
                else:
                    y_series = [y_test, y_train]

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k],
                    'SVM Performance ({0})'.format(agg))
    return data_agg
def DecisionTreeAnalysis(output_root,
                         output_file_prefix,
                         metrics_file,
                         dataset_filter_fn=None,
                         plt_title="Decision Trees Performance",
                         y_axis_name='F-Measure'):

    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelbuildtimesecs': ['mean']
    }

    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': y_axis_name,
        dataset_types[0]: 'Train size % used',
        'modelbuildtimesecs': 'Time to build model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return ~np.isnan(x[dataset_type])

        def train_prune_filter(x):
            return x['prune'] & (x['istrain'] == 1)

        def train_no_prune_filter(x):
            return (x['prune'] == False) & (x['istrain'] == 1)

        def test_prune_filter(x):
            return x['prune'] & (x['istrain'] == 0)

        def test_no_prune_filter(x):
            return (x['prune'] == False) & (x['istrain'] == 0)

        if (dataset_filter_fn is not None):
            data_all = FilterRows(data_all, dataset_filter_fn)
        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'prune', 'istrain'])
        x = data_agg[dataset_type].unique()

        for k, v in col_funcs.items():
            for agg in v:
                y_train_prune = u.YSeries(
                    FilterRows(data_agg, train_prune_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='o',
                    plot_legend_label="Train_with_pruning")
                y_train_no_prune = u.YSeries(
                    FilterRows(data_agg, train_no_prune_filter)[k + "_" + agg],
                    line_color='r',
                    points_marker='x',
                    plot_legend_label="Train_without_pruning")
                y_test_prune = u.YSeries(
                    FilterRows(data_agg, test_prune_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='o',
                    plot_legend_label="Validation_with_pruning")
                y_no_test_prune = u.YSeries(
                    FilterRows(data_agg, test_no_prune_filter)[k + "_" + agg],
                    line_color='b',
                    points_marker='x',
                    plot_legend_label="Validation_without_pruning")

                if (len(y_train_prune.values) == 0):
                    y_no_test_prune.plot_legend_label = "Validation"
                    y_train_no_prune.plot_legend_label = "Train"
                    if ((k == 'modelbuildtimesecs')):
                        y_series = [y_train_no_prune]
                    else:
                        y_series = [y_no_test_prune, y_train_no_prune]
                else:
                    if ((k == 'modelbuildtimesecs')):
                        y_series = [y_train_no_prune, y_train_prune]
                    else:
                        y_series = [
                            y_test_prune, y_no_test_prune, y_train_no_prune,
                            y_train_prune
                        ]

                output_file_name = u.PreparePath(
                    "{3}/{0}.{4}.{1}.{2}.png".format(output_file_prefix, k,
                                                     agg, output_root,
                                                     dataset_type))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k], plt_title)
    return data_agg
def PlotPiViConvergenceForSmallAndLargeMdp(outputfolder, datafile, gamma):
    data = pd.read_csv(datafile)
    decorations = {1: 'g', 10: 'k', 10000: 'r'}
    pi_sweeps = [1, 10, 10000]
    def PlotForMdp(mdp_name, file_prefix):
        # The large and small MDP blocks were identical except for the mdp
        # name and output file prefix, so they share this helper.
        ser = []
        ser1 = []
        vi_added = False
        for sweep in pi_sweeps:
            data_vi = u.FilterRows(
                data, lambda x: (x['mdp'] == mdp_name) &
                (x['solver'] == 'vi') & (x['gamma'] == gamma))
            data_pi = u.FilterRows(
                data, lambda x:
                (x['mdp'] == mdp_name) & (x['solver'] == 'pi') &
                (x['gamma'] == gamma) & (x['maxSweepsPerIteration'] == sweep))
            assert (len(data_vi) == 1)
            assert (len(data_pi) == 1)

            data_vi_qchange = np.array(
                [float(s) for s in data_vi.iloc[0]['cum_rewards'].split(';')])
            data_vi_value = np.array([
                float(s)
                for s in data_vi.iloc[0]['ran_to_completion'].split(';')
            ])
            data_pi_qchange = np.array(
                [float(s) for s in data_pi.iloc[0]['cum_rewards'].split(';')])
            data_pi_value = np.array([
                float(s)
                for s in data_pi.iloc[0]['ran_to_completion'].split(';')
            ])
            if not vi_added:
                ser.append(
                    u.YSeries(data_vi_qchange,
                              xvalues=np.arange(len(data_vi_qchange)) + 1,
                              line_color='b',
                              plot_legend_label='VI'))
                ser1.append(
                    u.YSeries(data_vi_value,
                              xvalues=np.arange(len(data_vi_value)) + 1,
                              line_color='b',
                              plot_legend_label='VI'))
            ser.append(
                u.YSeries(data_pi_qchange,
                          xvalues=np.arange(len(data_pi_qchange)) + 1,
                          line_color=decorations[sweep],
                          plot_legend_label='PI_' + str(sweep)))
            ser1.append(
                u.YSeries(data_pi_value,
                          xvalues=np.arange(len(data_pi_value)) + 1,
                          line_color=decorations[sweep],
                          plot_legend_label='PI_' + str(sweep)))
            vi_added = True

        outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                                   "_qchange_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser,
                                  filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Max change in state value")
        outputfile = u.PreparePath(outputfolder + "/plots/" + file_prefix +
                                   "_value_gamma=" + str(gamma) + ".png")
        u.SaveDataPlotWithLegends(ser1,
                                  filename=outputfile,
                                  x_axis_name="iterations",
                                  y1_axis_name="Total value across states")

    PlotForMdp('LargeMdpRwTraps50', 'large')
    PlotForMdp('SmallMdpRwTraps', 'small')
def RunExperiments(X,Y,rootfolder,clusters,dims,compute_acc=None):
    datasets = {}
    datasets["raw"] = (X,Y)
    err_series = []
    decorations = {}
    decorations["pca"] = ("o","r","pca")
    decorations["ica"] = ("x","b","ica")
    decorations["rp"] = ("+","g","rp")
    decorations["mi"] = ("o","k","mi")
    flags = [True,True,True,True]
    nn_output_lines = []
    nn_output_file = rootfolder + "/nn.csv"
    if(compute_acc is not None):
        h,l = CreateOutputLineForNN(RunNeuralNetwork(X,Y,10,compute_acc,False),"raw")
        nn_output_lines.append(h)
        nn_output_lines.append(l)

    best_bic = None
    ################### PCA #####################
    if(flags[0]):
        pca_results = PerformPca(X,Y,dims,0)
        pca_var_explained_plot = u.PreparePath(rootfolder + "/plots/pca/var.png")
        recons_err_plot = u.PreparePath(rootfolder + "/plots/err.png")
        # Shared by the PCA/ICA/RP blocks below; assumes flags[0] is True.
        recons_err_dict = []
        var_y = []
        err_y = []

        for dim in dims:
            key = "pca_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(pca_results["{0}data".format(key)]),Y)
            err_y.append(pca_results[key+"reconstruction_error"])
            var_y = pca_results[key+"explained_var_ratio"]
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"pca")
            #    #nn_output_lines.append(h)
            #    nn_output_lines.append(l)

        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["pca"][0],line_color=decorations["pca"][1],plot_legend_label=decorations["pca"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="dimensions",y1_axis_name="% explained variance",filename=pca_var_explained_plot)

    ################### ICA #####################

    if(flags[1]):
        ica_kt_plot = u.PreparePath(rootfolder + "/plots/ica/kt.png")
        err_y = []
        ica_results = PerformIca(X,Y,dims,0)
        for dim in dims:
            key = "ica_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(ica_results[key+"data"]),Y)
            err_y.append(ica_results[key+"reconstruction_error"])
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"ica")
            #    nn_output_lines.append(l)

        var_y = ica_results["ica_kt_all"]
        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        recons_err_dict.append(ser)
        ser = u.YSeries(var_y,xvalues = np.arange(len(var_y)) + 1,points_marker=decorations["ica"][0],line_color=decorations["ica"][1],plot_legend_label=decorations["ica"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="components",y1_axis_name="kurtosis",filename=ica_kt_plot)

    ################### RP #####################
    if(flags[2]):
        rp_runs_plot = u.PreparePath(rootfolder + "/plots/rp/runs.png")
        err_y = []
        runs = 10
        rp_results = PerformRandomProjections(X,Y,dims,runs)
        runs_series = []
        markers = u.GetColorCombinations(10)
        i=0
        for dim in dims:
            key = "rp_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(rp_results[key+"data"]),Y)
            err_y.append(rp_results[key+"reconstruction_error"])
            runs_ser = u.YSeries(rp_results[key+"reconstruction_errors_all"],xvalues=np.arange(runs)+1,points_marker = "o",line_color = markers[i]["color"],plot_legend_label="proj dims = "+str(dim))
            runs_series.append(runs_ser)
            i = i + 1
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"rp")
            #    nn_output_lines.append(l)

        ser = u.YSeries(err_y,xvalues = dims,points_marker=decorations["rp"][0],line_color=decorations["rp"][1],plot_legend_label=decorations["rp"][2])
        recons_err_dict.append(ser)
        u.SaveDataPlotWithLegends(runs_series,x_axis_name="run number",y1_axis_name="reconstruction err",filename=rp_runs_plot)

        u.SaveDataPlotWithLegends(recons_err_dict,x_axis_name="dimensions",y1_axis_name="reconstruction_error",filename=recons_err_plot)

    ###################### MI Feature Selection #########################
    if(flags[3]):
        mi_results = PerformMiBasedFeatureSelection(X,Y,dims,10)
        mi_plot = u.PreparePath(rootfolder + "/plots/mi/scores.png")
        for dim in dims:
            key = "mi_{0}_".format(str(dim))
            datasets[key] = (DoStandardScalingNumpyArray(mi_results[key+"data"]),Y)
            #if(compute_acc is not None and dim == 2):
            #    h,l = CreateOutputLineForNN(RunNeuralNetwork(datasets[key][0],datasets[key][1],10,compute_acc),"mi")
            #    nn_output_lines.append(l)
        ser = u.YSeries(mi_results["scores"],xvalues = np.arange(len(mi_results["scores"])) + 1,points_marker=decorations["mi"][0],line_color=decorations["mi"][1],plot_legend_label=decorations["mi"][2])
        u.SaveDataPlotWithLegends([ser],x_axis_name="feature number",y1_axis_name="mutual information", filename=mi_plot)

    ###################### CLUSTERING #########################
    clustering_output_file = rootfolder + "/clustering.csv"
    clustering_plots_output_root = u.PreparePath(rootfolder + "/plots")
    lines = []
    lines.append("clustering,dim_red_method,k,p,ami_raw,ami_true,sc,bic")
    raw_clustering_results = {}
    best_bic_raw_clustering = {}
    curr_best_bic = {}
    actual_labels = Y
    for dim in dims:
        for algo in ["raw","ica","rp","mi","pca"]:
            raw_data_plot_done = False
            key = "{0}_{1}_".format(algo,str(dim))
            if(algo == "raw"):
                key = "raw"
            dataset = datasets[key]
            for cluster in clusters:
                for mthd in ["kmeans","gmm"]:
                    raw_key = "{0}_{1}".format(str(cluster),mthd)
                    print("doing clustering for dim = {0} {1} k = {2} {3}".format(str(dim),algo,str(cluster), mthd))
                    c_key = "{0}_{1}_predicted".format(mthd,str(cluster))
                    c_key1 = "{0}_{1}_".format(mthd,str(cluster))
                    if(algo == "raw" and raw_key in raw_clustering_results):
                        results = raw_clustering_results[raw_key]
                    else:
                        #if(algo == "raw" and cluster == 2 and compute_acc):
                        #    results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        #    h,l = CreateOutputLineForNN(RunNeuralNetwork(results[c_key.replace("predicted","new_data")],dataset[1],10,compute_acc),mthd)
                        #    nn_output_lines.append(l)
                        #else:
                        results = RunClustering(dataset[0],dataset[1],[cluster],0,[mthd],dim)[mthd]
                        if(algo == "raw"):
                           raw_clustering_results[raw_key] = results
                        if(compute_acc):
                            mthd_key = mthd+algo if algo == "raw" else mthd+algo+str(cluster)+str(dim)
                            if((mthd_key not in curr_best_bic) or (curr_best_bic[mthd_key] > results[c_key1+"bic"])):
                                curr_best_bic[mthd_key] = results[c_key1+"bic"]
                                best_bic_raw_clustering[mthd_key] = (results[c_key1+"new_data"],dataset[1],results[c_key1+"metrics"]["ami"],results[c_key1+"bic"])
                                print("new best {0} {1}".format(c_key1,str(results[c_key1+"bic"])))

                    clustering_prediction_file = u.PreparePath(rootfolder + "/clustering_output/mthd={0}_k={1}_d={2}_algo={3}.csv".format(mthd,str(cluster),str(dim),algo))
                    np.savetxt(clustering_prediction_file,results[c_key])
                    bic = c_key.replace("predicted","bic")
                    bic = results[bic]
                    act = ComputeClusteringMetrics(actual_labels,results[c_key],dataset[0])
                    raw = ComputeClusteringMetrics(raw_clustering_results[raw_key][c_key],results[c_key],dataset[0])
                    line = "{0},{1},{2},{3},{4},{5},{6},{7}".format(mthd,algo,str(cluster),str(dim),str(raw["ami"]),str(act["ami"]),str(raw["sl"]),str(bic))
                    print(line)
                    plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_{3}.png".format(mthd,str(cluster),algo,str(dim))
                    #if(mthd == "gmm"):
                    #    prob_output_file = rootfolder + "/{0}_{1}_{2}_{3}.csv".format(mthd,str(cluster),algo,str(dim))
                    #    np.savetxt(prob_output_file,results[c_key.replace("predicted","prob")],delimiter=",")
                    ScatterPlotForClustering(results[c_key],actual_labels,plot_output_file)
                    if(dim == 2 and algo != "raw"):
                        if(raw_data_plot_done == False):
                            plot_output_file = clustering_plots_output_root + "/{0}_{1}_data.png".format(mthd,algo)
                            ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],np.zeros_like(actual_labels),actual_labels,plot_output_file)
                            raw_data_plot_done = True
                        plot_output_file = clustering_plots_output_root + "/{0}_{1}_{2}_data.png".format(mthd,str(cluster),algo)
                        ScatterPlotForClusteringData(dataset[0][:,0],dataset[0][:,1],results[c_key],actual_labels,plot_output_file)
                    lines.append(line)

    #if(compute_acc):
    #    keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","gmmpca":"pca","gmmica":"ica","gmmrp":"rp","gmmmi":"mi"}
    #    for key in keys_to_output.keys():
    #        if("raw" not in key):
    #            curr_best = None
    #            for cluster in clusters:
    #                datakey = key+str(cluster)
    #                if(curr_best is None or best_bic_raw_clustering[datakey][2] > curr_best):
    #                    curr_best = best_bic_raw_clustering[datakey][2]
    #                    _X = best_bic_raw_clustering[datakey][0]
    #                    _Y = best_bic_raw_clustering[datakey][1]
    #        else:
    #            _X = best_bic_raw_clustering[key][0]
    #            _Y = best_bic_raw_clustering[key][1]

    #        h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
    #        nn_output_lines.append(l)
    #    u.WriteTextArrayToFile(nn_output_file,nn_output_lines)

    if(compute_acc):
        keys_to_output = {"kmeansraw":"kmeans","gmmraw":"gmm","pca":"pca","ica":"ica","rp":"rp","mi":"mi"}
        for key in keys_to_output.keys():
            if("raw" not in key):
                dim_best_val = None
                dim_result = None
                for dim in dims:
                    best = {} # {x,y,p,k,bic,ami}
                    for cluster_mthd in ["kmeans","gmm"]:
                        for cluster in clusters:
                            datakey = cluster_mthd+key+str(cluster)+str(dim)
                            # Compare AMI against the stored AMI (tuple index
                            # 5); the original compared against index 4, which
                            # per the comment above holds the BIC.
                            if(cluster_mthd not in best or best_bic_raw_clustering[datakey][2] > best[cluster_mthd][5]):
                                best[cluster_mthd] = (best_bic_raw_clustering[datakey][0],best_bic_raw_clustering[datakey][1],dim,cluster,best_bic_raw_clustering[datakey][3],best_bic_raw_clustering[datakey][2])
                    curr_val = (best["kmeans"][5] + best["gmm"][5]) / 2
                    if(dim_best_val is None or dim_best_val < curr_val):
                        dim_best_val = curr_val
                        dim_result = best

                _X = dim_result["gmm"][0]
                _Y = dim_result["gmm"][1]
            else:
                _X = best_bic_raw_clustering[key][0]
                _Y = best_bic_raw_clustering[key][1]

            h,l = CreateOutputLineForNN(RunNeuralNetwork(_X,_Y,10,compute_acc,scale=False if "gmmraw" == key else True),keys_to_output[key])
            nn_output_lines.append(l)
        u.WriteTextArrayToFile(nn_output_file,nn_output_lines)

    u.WriteTextArrayToFile(clustering_output_file,lines)
def KnnAnalysis(output_root, output_file_prefix, metrics_file):
    data_all = pd.read_csv(metrics_file)
    dataset_types = ['train_split_percent_used']
    col_funcs = {
        'p': ['mean'],
        'r': ['mean'],
        'm': ['mean'],
        'modelevaltimesecs': ['mean']
    }

    # Unlike AdaBoostAnalysis, dataset_types holds a single entry here, so only
    # that key is mapped (indexing dataset_types[1] or [2] would raise).
    mapping_output_words = {
        'p': 'Precision',
        'r': 'Recall',
        'm': 'F-Measure',
        dataset_types[0]: 'Train size % used',
        'modelevaltimesecs': 'Time to run Knn model (sec)'
    }

    for dataset_type in dataset_types:

        def filter_query(x):
            return (~np.isnan(x[dataset_type]) & (x['istrain'] == 0))

        def distance_weights_filter(x):
            return x['weights'] == 'distance'

        def uniform_weights_filter(x):
            return x['weights'] == 'uniform'

        data = FilterRows(data_all, filter_query)
        data_agg = GetAggMetrics(data,
                                 col_funcs=col_funcs,
                                 gpby=[dataset_type, 'weights', 'neighbors'])
        x = data_agg[dataset_type].unique()
        for k, v in col_funcs.items():
            for agg in v:
                data_for_distance_based_weighting = FilterRows(
                    data_agg, distance_weights_filter)
                nneighbors = [5, 10, 20, 50]
                marker_and_color_map = {
                    5: ('g', 'o'),
                    10: ('r', '+'),
                    20: ('b', 'x'),
                    50: ('k', 'd')
                }
                y_series = []
                for n in nneighbors:
                    d = data_for_distance_based_weighting[
                        data_for_distance_based_weighting['neighbors'] == n]
                    y = u.YSeries(d[k + "_" + agg],
                                  line_color=marker_and_color_map[n][0],
                                  points_marker=marker_and_color_map[n][1],
                                  plot_legend_label="k = " + str(n))
                    y_series.append(y)
                output_file_name = u.PreparePath(
                    "{4}/{0}.{1}.weighted.{2}.{3}.png".format(
                        output_file_prefix, dataset_type, k, agg, output_root))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k],
                    'K Nearest Neighbor ({0})'.format(agg))

                data_for_uniform_weighting = FilterRows(
                    data_agg, uniform_weights_filter)
                y_series = []
                for n in nneighbors:
                    d = data_for_uniform_weighting[
                        data_for_uniform_weighting['neighbors'] == n]
                    y = u.YSeries(d[k + "_" + agg],
                                  line_color=marker_and_color_map[n][0],
                                  points_marker=marker_and_color_map[n][1],
                                  plot_legend_label="k = " + str(n))
                    y_series.append(y)
                output_file_name = u.PreparePath(
                    "{4}/{0}.{1}.uniform.{2}.{3}.png".format(
                        output_file_prefix, dataset_type, k, agg, output_root))
                f, ax = u.SaveDataPlotWithLegends(
                    y_series, x, output_file_name, True,
                    mapping_output_words[dataset_type],
                    mapping_output_words[k],
                    'K Nearest Neighbor ({0})'.format(agg))
    return data_agg
def NeuralNetworkResults(rootfolder):
    data, ga = ReadNNetResultsFile10k(rootfolder)
    ga = ga.set_index('size')
    #data.loc[:,data.columns != 'loss'].to_csv(r'c:\temp\nnets10knew.csv')
    #ga.loc[:,data.columns != 'loss'].to_csv(r'c:\temp\nnets_ga.csv')
    algos = ['ga', 'rhc', 'sa', 'bp']
    algo_decoration = {
        'bp': ('r', 'o', 'backprop'),
        'ga': ('g', 's', 'genetic algo'),
        'sa': ('b', '+', 'sim annealing'),
        'rhc': ('k', '*', 'rhc')
    }
    y_ser = []
    time_y_ser = []
    loss_y_ser = []

    size_for_loss_curves = {20: [], 90: [], 100: []}
    size_for_loss_ga = {20: [], 50: [], 100: []}

    for algo in algos:
        filtered_data = data[data['algo'] == algo].set_index('size')
        train_ser = []
        valid_ser = []
        x = []
        time = []
        for size in [20, 30, 40, 50, 60, 70, 80, 90, 100]:
            x.append(size)
            train_ser.append(filtered_data.loc[size]['train_f1'])
            valid_ser.append(filtered_data.loc[size]['valid_f1'])
            time.append(filtered_data.loc[size]['time'])

            if (size in size_for_loss_curves):
                y_vals = np.array(filtered_data.loc[size]['loss'].split(';'),
                                  dtype=float)
                x_vals = np.arange(y_vals.size) + 1
                _ser = u.YSeries(y_vals,
                                 xvalues=x_vals,
                                 line_color=algo_decoration[algo][0],
                                 points_marker='.',
                                 legend_marker='o',
                                 plot_legend_label=algo_decoration[algo][2])
                size_for_loss_curves[size].append(_ser)

            if (algo == "ga" and size in size_for_loss_ga):
                ga_y_vals = np.array(
                    filtered_data.loc[size]['loss'].split(';'), dtype=float)
                bad_ga_y_vals = np.array(ga.loc[size]['loss'].split(';'),
                                         dtype=float)[0:10000]
                _ser = u.YSeries(ga_y_vals,
                                 xvalues=np.arange(ga_y_vals.size) + 1,
                                 line_color='b',
                                 points_marker='.',
                                 legend_marker='o',
                                 plot_legend_label='tournament selection')
                size_for_loss_ga[size].append(_ser)
                _ser = u.YSeries(bad_ga_y_vals,
                                 xvalues=np.arange(bad_ga_y_vals.size) + 1,
                                 line_color='r',
                                 points_marker='.',
                                 legend_marker='o',
                                 plot_legend_label='roulette wheel')
                size_for_loss_ga[size].append(_ser)

        y_ser.append(
            u.YSeries(train_ser,
                      xvalues=x,
                      line_color=algo_decoration[algo][0],
                      points_marker='x',
                      plot_legend_label=algo_decoration[algo][2] + "-train"))
        y_ser.append(
            u.YSeries(valid_ser,
                      xvalues=x,
                      line_color=algo_decoration[algo][0],
                      points_marker='o',
                      plot_legend_label=algo_decoration[algo][2] + "-valid"))
        time_y_ser.append(
            u.YSeries(time,
                      xvalues=x,
                      line_color=algo_decoration[algo][0],
                      points_marker=algo_decoration[algo][1],
                      plot_legend_label=algo_decoration[algo][2]))
        x_axis_name = 'trainset size %'

    y_axis_name = 'f-measure'
    plot_file = u.PreparePath(rootfolder + "/plot10k/learning_curves.png")
    time_plot_file = u.PreparePath(rootfolder + "/plot10k/time.png")
    u.SaveDataPlotWithLegends(y_ser,
                              filename=plot_file,
                              x_axis_name=x_axis_name,
                              y1_axis_name=y_axis_name)
    u.SaveDataPlotWithLegends(time_y_ser,
                              filename=time_plot_file,
                              x_axis_name=x_axis_name,
                              y1_axis_name="Time (MilliSec)")
    for key in size_for_loss_curves.keys():
        loss_plot_file = u.PreparePath(
            rootfolder + "/plot10k/loss_curves_{0}.png".format(str(key)))
        u.SaveDataPlotWithLegends(size_for_loss_curves[key],
                                  filename=loss_plot_file,
                                  title="Size % : " + str(key),
                                  x_axis_name='iters',
                                  y1_axis_name="Loss")

    for key in size_for_loss_ga.keys():
        loss_plot_file = u.PreparePath(
            rootfolder + "/plot10k/loss_curves_ga_{0}.png".format(str(key)))
        u.SaveDataPlotWithLegends(size_for_loss_ga[key],
                                  filename=loss_plot_file,
                                  title="Size % : " + str(key),
                                  x_axis_name='iters',
                                  y1_axis_name="Loss")