def do_plot(flair_stats, filename):
    """Plot monthly /r/bugs submission counts and save the figure.

    Two series are drawn on the same axes: the total number of submissions
    per month and the number of submissions carrying the "new" flair.

    Args:
        flair_stats: Mapping of month -> mapping of flair name -> number of
            submissions with that flair. Months are plotted in sorted key
            order.
        filename: Path the rendered figure is written to.
    """
    months = sorted(flair_stats)
    # The monthly total is the sum over every flair of that month.
    total = [sum(flair_stats[month].values()) for month in months]
    # A month without any "new"-flaired submission counts as zero instead of
    # raising KeyError.
    new_flaired = [flair_stats[month].get('new', 0) for month in months]

    sns.set_style('whitegrid')

    total_plot_color = sns.xkcd_rgb["denim blue"]
    ignored_plot_color = sns.xkcd_rgb["orange red"]

    ax = sns.pointplot(x=months, y=total, color=total_plot_color)
    sns.pointplot(x=months, y=new_flaired, color=ignored_plot_color, ax=ax)

    # Proxy artists so the legend shows one colored patch per series.
    total_patch = mpatches.Patch(color=total_plot_color)
    ignored_patch = mpatches.Patch(color=ignored_plot_color)

    ax.set(ylabel="Number of bugreports", xlabel="Month")
    ax.set_title('/r/bugs statistics by month:\nReddit admins consistently ignore half of bugreports', y=1.02)
    # Use the Axes/Figure API directly: the ``sns.plt`` alias is not available
    # in current seaborn releases.
    ax.legend([total_patch, ignored_patch],
              ['Total number of bugreports',
               'Number of ignored bugreports (submissions with "new" flair)'],
              loc="lower left")

    ax.figure.savefig(filename)
Esempio n. 2
0
def plot_topk_all_models():
    '''
    Draw a line plot of top-k accuracy: for each value of k, the proportion
    of trials in which the model ranked the correct sketch category at
    position k or better, with one line per adaptor.
    '''
    topk = load_all_topk_predictions()
    sns.set_context('poster')
    plt.figure(figsize=(8, 8))

    # One RGB tuple per hue level.
    line_colors = [
        (0.2, 0.2, 0.2),
        (0.8, 0.3, 0.3),
        (0.3, 0.3, 0.8),
        (0.5, 0.5, 0.5),
        (0.6, 0.2, 0.6),
    ]
    pointplot_options = dict(
        data=topk,
        x='k',
        y='prop',
        hue='adaptor',
        palette=line_colors,
        markers='.',
        ci='sd',
        join=True,
    )
    sns.pointplot(**pointplot_options)

    plt.ylabel('proportion', fontsize=24)
    plt.xlabel('k', fontsize=24)
    plt.title('% correct within top k')
    plt.ylim([0, 1.1])
    plt.xlim([0, 18])
    plt.tight_layout()
    plt.legend(bbox_to_anchor=(1.0, 0.9))
Esempio n. 3
0
def classifier_selection_time(data, filename):
    """Plot training and testing time per classifier and save the figure.

    Args:
        data: DataFrame with one row per classifier. Columns are selected
            with ``DataFrame.filter`` using the patterns
            ``classifier|training time`` and ``classifier|testing time``;
            each selection must yield exactly two columns (classifier, time).
        filename: Path the figure is saved to.
    """

    def _timing_frame(pattern, dataset_label):
        # Build the long-form (classifier, time, Dataset) frame for one set.
        frame = data.filter(regex=pattern, axis=1).copy()
        frame.columns = ["classifier", "time"]
        # A scalar broadcasts to every row. A positional pd.Series would be
        # aligned on the index and produce NaN whenever ``data`` does not
        # carry a default 0..n-1 index.
        frame["Dataset"] = dataset_label
        return frame

    training_time = _timing_frame("classifier|training time", "Training")
    testing_time = _timing_frame("classifier|testing time", "Testing")

    time_data = pd.concat(
        (testing_time, training_time), ignore_index=True, axis=0)

    with sns.axes_style("ticks"):
        fig, ax = plt.subplots()

        sns.pointplot(x="classifier", y="time", hue="Dataset", data=time_data,
                      palette="Set1", dodge=True, ax=ax)

        ax.set_xticklabels(ax.xaxis.get_ticklabels(),
                           rotation=params.X_TICK_ROTATION)
        ax.set_xlabel("Classifier")
        ax.set_ylabel("Average Time (msec / example)")

        sns.despine()
        fig.set_size_inches(params.FIGSIZE)
        plt.tight_layout()
        plt.savefig(filename)
Esempio n. 4
0
def classifier_selection_error(data, filename):
    """Plot training and testing error per classifier and save the figure.

    Errors are rounded to three decimals for plotting. The rounding is done
    on copies, so the caller's ``data`` is left untouched.

    Args:
        data: DataFrame with one row per classifier. Columns are selected
            with ``DataFrame.filter`` using the patterns
            ``classifier|training error`` and ``classifier|testing error``;
            each selection must yield exactly two columns (classifier, error).
        filename: Path the figure is saved to.
    """

    def _error_frame(pattern, dataset_label):
        # Build the long-form (classifier, error, Dataset) frame for one set.
        frame = data.filter(regex=pattern, axis=1).copy()
        frame.columns = ["classifier", "error"]
        frame["error"] = frame["error"].round(decimals=3)
        # A scalar broadcasts to every row. A positional pd.Series would be
        # aligned on the index and produce NaN whenever ``data`` does not
        # carry a default 0..n-1 index.
        frame["Dataset"] = dataset_label
        return frame

    training_error = _error_frame("classifier|training error", "Training")
    testing_error = _error_frame("classifier|testing error", "Testing")

    error_data = pd.concat(
        (testing_error, training_error), ignore_index=True, axis=0)

    with sns.axes_style("ticks"):
        fig, ax = plt.subplots()

        sns.pointplot(x="classifier", y="error", hue="Dataset", data=error_data,
                      palette="Set1", dodge=True, ax=ax)

        ax.set_xticklabels(ax.xaxis.get_ticklabels(),
                           rotation=params.X_TICK_ROTATION)
        ax.set_xlabel("Classifier")
        ax.set_ylabel("Mean Error")

        sns.despine()
        fig.set_size_inches(params.FIGSIZE)
        plt.tight_layout()
        plt.savefig(filename)
def skill_correlations(runs=50, n_clusters=5):
    """Plot how many students are needed to recover concepts vs. skill correlation.

    For every run, skill correlation and clustering function (taken from the
    module-level ``clusterings``), the number of students is increased until
    the clustering reaches a rand index of at least 0.9; that student count
    is recorded. Combinations that never reach 0.9 contribute no row. The
    recorded counts are drawn as a point plot on a log-scaled y axis.

    Relies on the module-level names ``data``, ``similarity``, ``euclid``,
    ``rand_index`` and ``clusterings``.

    Args:
        runs: Number of repetitions of the whole experiment.
        n_clusters: Number of concepts generated and clusters requested.
    """
    student_counts = [10, 20, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000,
                      5000, 10000, 20000, 50000, 100000, 200000, 500000,
                      1000000]
    correlations = list(np.arange(0, 0.9, 0.1)) + [0.85]

    results = []
    for run in range(runs):
        for skill_correlation in correlations:
            for clustering in clusterings:
                for students in student_counts:
                    answers, items = data(n_students=students, n_items=20, n_concepts=n_clusters, skill_correlation=skill_correlation)
                    true_cluster_names = list(items['concept'].unique())
                    X = similarity(answers)
                    items_ids = X.index
                    # ``.at`` replaces DataFrame.get_value, which no longer
                    # exists in current pandas.
                    ground_truth = np.array([true_cluster_names.index(items.at[item, 'concept']) for item in items_ids])

                    labels = clustering(X, n_clusters, euclid=euclid)
                    rand = rand_index(ground_truth, labels)

                    print(run, skill_correlation, clustering.__name__, students, '===', rand)
                    if rand >= 0.9:
                        # Smallest tested student count that is good enough.
                        results.append([students, clustering.__name__, rand, skill_correlation])
                        break

    results = pd.DataFrame(results, columns=['students', 'clustering', 'rand_index', 'skill_correlation'])

    print(results)
    f, ax = plt.subplots(figsize=(7, 7))
    ax.set(yscale="log")
    sns.pointplot(data=results, x='skill_correlation', y='students', hue='clustering', ax=ax)
def main(argv):
    """Plot write bandwidth per number of writers from a results CSV.

    The CSV path is read from ``sys.argv[1]``; the ``argv`` parameter is kept
    for interface compatibility but is not consulted. The plot is written to
    ``max_bandwidth_stats.png`` and the process then exits with status 0.

    Args:
        argv: Command-line arguments (unused).
    """
    # Marker and line styles, repeated so there are enough for many schemes.
    markers = 10 * ['o', '^', 'x']
    lines = 10 * ['-', '--', '-.']

    infile = sys.argv[1]

    resframe = pd.read_csv(infile)

    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print("Summary of all results found:")
    print(resframe)

    fig, ax = plt.subplots()

    sns.pointplot(x='Writers', y='Write Bandwidth (MiB/s)',
                  data=resframe, hue='Scheme', scale=0.75, markers=markers,
                  linestyles=lines, ax=ax)
    # ``bottom`` is the supported keyword; ``ymin`` was removed from
    # matplotlib.
    ax.set_ylim(bottom=0)

    ax.set_ylabel('Max. Write Bandwidth / MiB/s')
    ax.set_xlabel('Writers')
    ax.legend()
    plt.savefig('max_bandwidth_stats.png')
    plt.clf()

    sys.exit(0)
def cumm_plot(data, **kwargs):
    """Draw a per-type cumulative-rank point plot of ``Opt1Value``.

    Adds an ``order`` column to ``data``: within each ``new_type`` group the
    rows, ordered by ``Opt1Value``, are assigned evenly spaced values from 0
    to 1. ``data`` is then sorted in place by ``new_type`` and plotted.

    Args:
        data: DataFrame with ``Opt1Value`` and ``new_type`` columns; it is
            modified in place.
        **kwargs: Forwarded to ``sb.pointplot``.
    """
    # ``sort_values`` replaces DataFrame.sort, which no longer exists in
    # current pandas.
    data['order'] = (
        data.sort_values('Opt1Value')
        .groupby(['new_type'])['Opt1Value']
        .transform(lambda score: np.linspace(0, 1, len(score)))
    )
    data.sort_values('new_type', inplace=True)

    # Keyword x/y: current seaborn rejects positional plot variables.
    sb.pointplot(x='Opt1Value', y='order', data=data, hue='new_type', **kwargs)
Esempio n. 8
0
def rolling_success_diff(answers, last_count=4, filters=None, only_last=True):
    """Compare each user's rolling success rate with their overall success rate.

    For every value in ``filters`` the answers are narrowed with
    ``filter_users(answers, min_answer_count=...)``. Users with fewer than
    ``last_count`` answers are skipped. With ``only_last`` one row per user
    is recorded (last rolling mean minus overall mean); otherwise one rounded
    row per rolling window is recorded and the user's final row is flagged
    with ``leave = 1``.

    Returns:
        DataFrame with columns ``rolling_success_diff``, ``min_answers`` and
        ``leave``; the matching plot is drawn as a side effect.
    """
    if filters is None:
        filters = [None]

    rows = []
    for min_answers in filters:
        filtered = filter_users(answers, min_answer_count=min_answers)
        for _, user_answers in filtered.groupby('user'):
            overall_mean = user_answers['correct'].mean()
            if len(user_answers) < last_count:
                continue
            windows = user_answers['correct'].rolling(last_count, last_count).mean()
            for x in windows:
                if np.isnan(x):
                    continue
                if not only_last:
                    rows.append([np.round(x - overall_mean, 1), min_answers, 0])
            if only_last:
                # ``x`` still holds the value of the final rolling window.
                rows.append([x - overall_mean, min_answers, 1])
            else:
                rows[-1][-1] = 1

    result = pd.DataFrame(rows, columns=['rolling_success_diff', 'min_answers', 'leave'])
    if only_last:
        for min_answers in filters:
            selected = result.loc[result['min_answers'] == min_answers, 'rolling_success_diff']
            sns.distplot(selected, label=str(min_answers))
        plt.legend(loc=1)
    else:
        axis = sns.pointplot(data=result, x='rolling_success_diff', y='leave', hue='min_answers')
        axis.set(ylim=(0, 0.2))
    return result
def _make_all_pred_plots(big_d, time_d, fig_name):
    """Draw a 2x3 grid of per-field point plots and save it under ``fig_dir``.

    The first five panels read from ``big_d`` (x column ``n_vehicles``); any
    later panel reads from ``time_d`` (x column ``vehicles``). Per-panel
    legends are removed and replaced by a single figure-level legend.

    Uses the module-level names ``fields``, ``titles``, ``clrs``,
    ``vehicles``, ``fancy_preds``, ``fig_dir`` and ``prettify``.
    """
    fig, grid = plt.subplots(2, 3, figsize=(11, 5))
    panels = np.ravel(grid)
    letters = "abcdef"
    for i, (field, panel) in enumerate(zip(fields, panels)):
        panel_title = "({}) {}".format(letters[i], titles[i])
        x_column, frame = ("n_vehicles", big_d) if i < 5 else ("vehicles", time_d)
        ax = sns.pointplot(x=x_column, y=field, hue="predictions",
                           data=frame, palette=clrs, ax=panel)
        ax.set_xticklabels(vehicles)
        ax.set_title(panel_title, fontsize=13, y=-0.45)
        pretty_name = prettify(field)
        ax.set_ylabel(pretty_name)
        ax.set_xlabel("Number of Vehicles")
        if "%" in pretty_name:
            # Percentage fields are stored as fractions; relabel ticks as %.
            ax.set_ylim([0, 1])
            ax.set_yticklabels(
                ['{:3.0f}%'.format(x * 100) for x in ax.get_yticks()])
        # The handles of the last panel feed the shared legend below.
        handles = ax.get_legend_handles_labels()[0]
        ax.legend().remove()
    fig.subplots_adjust(wspace=0.4, hspace=0.58)
    shared_legend = fig.legend(
        handles,
        fancy_preds,
        loc="lower center", fancybox=True,
        bbox_to_anchor=(0.46, 0.96),
        title="Number of Samples", markerscale=2.5, ncol=4)
    fig.savefig(
        fig_dir + fig_name,
        bbox_inches='tight', bbox_extra_artists=(shared_legend,))
    plt.close()
Esempio n. 10
0
def calibration_curve_plotter(y_test, prob_pos, n_bins=10):
    """Plot a calibration curve and a histogram of predicted probabilities.

    The upper panel shows, per probability bin, the mean of ``y_test``
    against the bin center, next to a "perfect calibration" reference whose
    value equals the bin center. The lower panel shows the histogram of
    ``prob_pos``. The Brier score is included in the legend label.

    Args:
        y_test: True binary labels.
        prob_pos: Predicted probabilities of the positive class, in [0, 1].
        n_bins: Number of equal-width probability bins.
    """
    brier = brier_score_loss(y_test, prob_pos, pos_label=1)

    plt.figure(0, figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    edges = np.linspace(0.0, 1.0, n_bins + 1)
    # One center per real bin; the right-most edge does not start a bin.
    centers = edges[:-1] + 0.5 / n_bins
    # np.digitize places a probability of exactly 1.0 past the last edge;
    # clip so it falls into the last real bin instead of a phantom one.
    binids = np.clip(np.digitize(prob_pos, edges) - 1, 0, n_bins - 1)

    df = pd.DataFrame({"true": y_test})
    df["Bin center"] = centers[binids]
    df[""] = "Model calibration: (%1.5f)" % brier

    df2 = pd.DataFrame({"true": centers, "Bin center": centers})
    df2[""] = "Perfect calibration"

    df = pd.concat([df, df2])

    sns.pointplot(x="Bin center", y="true", data=df, order=centers, hue="", ax=ax1)

    # Use the same number of bins as the calibration curve.
    ax2.hist(prob_pos, range=(0, 1), bins=n_bins, label="Model", histtype="step", lw=2)

    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.set_title("Calibration plots")

    ax2.set_xlabel("Predicted Probability")
    ax2.set_ylabel("Count")

    plt.tight_layout()
Esempio n. 11
0
def make_interval_comp_plots(df):
    """Plot ``comp_time`` against ``interval`` and save it under ``figs/``.

    Uses the module-level names ``aux_fig_size``, ``intervals`` and
    ``prettify``.
    """
    _, axis = plt.subplots(1, 1, figsize=aux_fig_size)
    line_color = sns.xkcd_rgb["bright red"]
    sns.pointplot(data=df, x="interval", y="comp_time", color=line_color,
                  ax=axis)
    axis.set_xlabel("Step Size [s]")
    axis.set_ylabel(prettify("comp_time"))
    axis.set_xticklabels(intervals)
    plt.savefig("figs/interval-comp_time.png", bbox_inches="tight")
    plt.close()
Esempio n. 12
0
def test_activity_timecourse_with_inlay():
	"""Render a session timecourse with an atlas ROI inlay and save it as PNG."""
	import pandas as pd
	import matplotlib.pyplot as plt
	import samri.plotting.maps as maps
	import seaborn as sns
	from os import path

	# Style elements
	palette = ["#56B4E9", "#E69F00"]

	# Test data lives relative to this file.
	this_dir = path.dirname(path.realpath(__file__))
	data_path = path.join(path.join(this_dir, "../../tests/data"), 'drs_activity.csv')
	df = pd.read_csv(data_path)

	df = df.rename(columns={'t': 'Mean t-Statistic'})
	session_names = {
		'ofM': 'naïve',
		'ofMaF': 'acute',
		'ofMcF1': 'chronic/2w',
		'ofMcF2': 'chronic/4w',
		'ofMpF': 'post',
	}
	df['Session'] = df['Session'].map(session_names)

	# Axes rectangles as [left, bottom, width, height] in figure coordinates.
	left, width = 0.06, 0.9
	bottom, height = 0.06, 0.9
	session_coordinates = [left, bottom, width, height]
	roi_coordinates = [left + 0.02, bottom + 0.7, 0.3, 0.2]

	plt.figure(1)

	# Main axes: per-session point plot, one line per treatment.
	session_ax = plt.axes(session_coordinates)
	sns.pointplot(
		data=df,
		x='Session',
		y='Mean t-Statistic',
		hue='treatment',
		units='subject',
		order=['naïve', 'acute', 'chronic/2w', 'chronic/4w', 'post'],
		palette=palette,
		dodge=True,
		ci=95,
		ax=session_ax,
	)

	# Inlay axes: atlas label drawn on top of the main axes.
	inlay_ax = plt.axes(roi_coordinates)
	maps.atlas_label(
		"/usr/share/mouse-brain-atlases/dsurqec_200micron_roi-dr.nii",
		scale=0.3,
		color="#E69F00",
		ax=inlay_ax,
		annotate=False,
		alpha=0.8,
	)

	plt.savefig('_activity_timecourse_with_inlay.png')
Esempio n. 13
0
    def plot(self):
        """Plot the pointwise Pareto-k importance-sampling indices.

        One unconnected point is drawn per observation unit: the x axis is
        the index of ``self.pointwise`` and the y axis is its ``pareto_k``
        value.
        """
        pointwise = self.pointwise
        pareto_k = pointwise.pareto_k
        units = pointwise.index
        seaborn.pointplot(x=units, y=pareto_k, join=False)
Esempio n. 14
0
def make_demand_comp_plots(df):
    """Plot ``comp_time`` against ``demand`` per capacity and save under ``figs/``.

    Uses the module-level names ``aux_fig_size``, ``dem_clrs`` and
    ``prettify``.
    """
    _, axis = plt.subplots(1, 1, figsize=aux_fig_size)
    sns.pointplot(data=df, x="demand", y="comp_time", hue="capacity",
                  palette=dem_clrs, ax=axis)
    axis.set_ylabel(prettify("comp_time"))
    axis.set_xlabel("Nominal Number of Requests")
    axis.set_xticklabels(["x0.5", "x1", "x2"])
    # Keep seaborn's handles but relabel the legend entries.
    legend_handles = axis.get_legend_handles_labels()[0]
    axis.legend(legend_handles, [1, 4], title="Capacity")
    plt.savefig("figs/demand-comp_time.png", bbox_inches="tight")
    plt.close()
def individual_graph(transparencies_1, transparencies_2, condition,
                     subject_number, display_graph=True):
    """Plot two per-trial contrast series for one subject and save the figure.

    The first series is drawn in red, the second in the default color. The
    figure is saved to ``Subject <subject_number>/<condition>.png``.

    Args:
        transparencies_1: Sequence of values for the first series, one per
            trial.
        transparencies_2: Sequence of values for the second series, one per
            trial.
        condition: Condition name, used in the title and the file name.
        subject_number: Subject identifier, used in the output directory name.
        display_graph: When true, show the figure after it has been saved.
    """
    # Trial numbers start at 1 and follow the length of each series instead
    # of a hard-coded 80.
    trials_1 = list(range(1, len(transparencies_1) + 1))
    trials_2 = list(range(1, len(transparencies_2) + 1))

    # Keyword x/y: current seaborn rejects positional plot variables.
    sns.pointplot(x=trials_1, y=transparencies_1, color='red')
    ax = sns.pointplot(x=trials_2, y=transparencies_2)
    ax.set(xlabel="Trial", ylabel="Contrast",
           title="{} Condition".format(condition))

    # Save before showing so the file never depends on the state of the
    # figure after the window has been closed.
    fig = ax.get_figure()
    fig.savefig("Subject {}/{}.png".format(subject_number, condition))
    if display_graph:
        plt.show()
    plt.cla()
def plotexp(res):
    """Save point plots of log evidence and runtime against number of topics.

    The output names are derived from the base name of
    ``res['args']['filename']`` with all dots removed, and the files are
    written into ``DATA_DIR``.
    """
    data = res['res']['data']
    basename = os.path.basename(res['args']['filename']).replace('.', '')

    # (y column, output file suffix) for each figure, in drawing order.
    figures = (
        ('Log evidence', '_nt_evidence.png'),
        ('Runtime', '_nt_runtime.png'),
    )
    for y_column, suffix in figures:
        plt.figure(facecolor='white', tight_layout=True, figsize=(4.5, 3.5),
                   dpi=300)
        sns.pointplot(x='Number of topics', y=y_column, data=data)
        plt.savefig(os.path.join(DATA_DIR, basename + suffix), dpi=300)
Esempio n. 17
0
def make_avg_plots_with_preds(big_d):
    """Save one point plot per field, split by number of predictions.

    Only rows with capacity 4 and waiting time 300 are plotted. Each field in
    the module-level ``fields`` (plus ``n_shared_per_passenger``) gets its
    own 13x10 inch figure saved as ``figs/avg-with-preds-<field>.png``.

    Uses the module-level names ``fields``, ``vehicles``, ``prettify`` and
    ``make_pred_title``.

    Args:
        big_d: DataFrame with ``capacity``, ``waiting_time``, ``vehicles``,
            ``predictions`` and the plotted field columns.
    """
    cap, wt = 4, 300
    d = big_d.query("capacity == @cap and waiting_time == @wt")
    for field in fields + ["n_shared_per_passenger"]:
        # Create the figure inside the loop: it is closed at the end of every
        # iteration, so a single figure created up front would only give the
        # first plot the requested size.
        fig, ax = plt.subplots(figsize=(13, 10))
        sns.pointplot(x="vehicles", y=field, hue="predictions", data=d, ax=ax)
        ax.set_xticklabels(vehicles)
        pretty_name = prettify(field)
        ax.set_ylabel(pretty_name)
        if "%" in pretty_name:
            # Percentage fields are stored as fractions; relabel ticks as %.
            ax.set_ylim([0, 1])
            vals = ax.get_yticks()
            ax.set_yticklabels(['{:3.0f}%'.format(x * 100) for x in vals])
        ax.set_xlabel("Num Vehicles")
        handles, _ = ax.get_legend_handles_labels()
        ax.legend(
            handles,
            ["No R.B.", 0, 100, 200, 300, 400],
            loc="center left", fancybox=True,
            shadow=True, bbox_to_anchor=(1, 0.5),
            title="Predictions", markerscale=3)
        ax.set_title(make_pred_title(wt, cap))
        fig.savefig(
            "figs/avg-with-preds-{}.png".format(field),
            bbox_inches='tight')
        plt.close(fig)
Esempio n. 18
0
def _plotWeekdayByMonthStats(stats, stat_name):
    """Draw ``stat_name`` per weekday with one line per month.

    The data comes from ``_prepareWeekdayByMonthStats(stats)``; weekdays are
    ordered by the module-level ``dayOfWeekOrder`` and the y label is looked
    up in ``NAMES``.

    Returns:
        The axes object returned by ``sns.pointplot``.
    """
    prepared = _prepareWeekdayByMonthStats(stats)

    axes = sns.pointplot(x="day", y=stat_name, hue="month",
                         order=dayOfWeekOrder, data=prepared)
    axes.set(xlabel='')
    axes.set_ylabel(NAMES[stat_name])
    return axes
Esempio n. 19
0
def draw_graph(plot_data, rankingSystem, numberOfUv, hue):
    """Plot world rank per year and save it to ``resources/images/topuv.png``.

    The ``world_rank`` column of ``plot_data`` is converted to int in place
    before plotting; the current axes and figure are cleared and closed
    afterwards.
    """
    plot_data['world_rank'] = plot_data['world_rank'].astype(int)
    sns.pointplot(data=plot_data, x='year', y='world_rank', hue=hue)

    heading = "Top " + str(numberOfUv) + " university by " + rankingSystem
    pylab.title(heading, fontsize=26)
    for set_tick_style in (pylab.xticks, pylab.yticks):
        set_tick_style(fontsize=20)
    pylab.ylabel("World Rank", fontsize=26)
    pylab.xlabel("Year", fontsize=26)
    pylab.savefig('resources/images/topuv.png')

    # Reset pylab state so the next plot starts from a clean figure.
    pylab.cla()
    pylab.clf()
    pylab.close()
Esempio n. 20
0
def compare_groups(df, x, mpu=False, order=None, hue=None, plot=True, table=True):
    """Compare mean predicted aggression and recipient scores across groups.

    Args:
        df: DataFrame with ``pred_aggression_score`` and
            ``pred_recipient_score`` columns plus the grouping columns.
        x: Column to group by (x axis of the plots).
        mpu: When true, each score is first passed through the module-level
            helper ``mpg(df, score, ['user_text', x])`` before plotting
            (presumably a per-user aggregation; confirm against ``mpg``).
        order: Optional order of the ``x`` categories.
        hue: Optional second grouping column.
        plot: Draw the two point plots side by side.
        table: Print the table of group means.
    """
    agg = 'pred_aggression_score'
    rec = 'pred_recipient_score'

    if table:
        group_keys = [x, hue] if hue else [x]
        # Select columns with a list: tuple-style ``[agg, rec]`` selection on
        # a groupby is not supported by current pandas.
        print(df.groupby(group_keys)[[agg, rec]].mean())

    if plot:
        fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=False, figsize=(12, 6))

        # Both plots are drawn on the axes created above. No extra
        # ``plt.figure()`` calls, which only produced blank figures and moved
        # the "current figure" away from this one.
        for score, ax in ((agg, ax1), (rec, ax2)):
            plot_data = mpg(df, score, ['user_text', x]) if mpu else df
            sns.pointplot(x=x, y=score, data=plot_data, order=order,
                          hue=hue, ax=ax)
Esempio n. 21
0
def qualitative_times(df,
	ax=None,
	x="relative_date",
	y="weight",
	unit="Animal_id",
	condition="treatment",
	err_style="unit_traces",
	order=None,
	bp_style=True,
	save_as='',
	legend_title=False,
	palette=QUALITATIVE_COLORSET,
	renames={},
	model='',
	print_model=False,
	print_anova=False,
	anova_type=3,
	groups=None,
	ci=95,
	):
	"""Plot a timecourse over qualitative time points (e.g. sessions).

	Values listed in `renames` (column -> {old value -> new value}) are
	replaced in `df` in place before plotting. The legend title is blanked
	unless `legend_title` is truthy, and the figure is saved when `save_as`
	is given.
	"""

	if bp_style:
		for style_name in (u'seaborn-darkgrid', 'ggplot'):
			plt.style.use(style_name)

	if renames:
		for column in renames:
			replacements = renames[column]
			for old_value in replacements:
				df.loc[df[column] == old_value, column] = replacements[old_value]

	plot_options = dict(
		data=df,
		x=x,
		y=y,
		hue=condition,
		units=unit,
		order=order,
		palette=sns.color_palette(palette),
		dodge=True,
		ci=ci,
		ax=ax,
		)
	ax = sns.pointplot(**plot_options)

	ax.set_ylabel(y)

	if not legend_title:
		ax.legend().set_title('')

	if save_as:
		target = path.abspath(path.expanduser(save_as))
		plt.savefig(target, bbox_inches='tight')
def line_plot(data, title="", x_title="", y_title="", legend_label="", group_labels=None):
    """Show a point plot built from a 2-D array.

    Column 0 of ``data`` holds the y values, column 1 the x values (cast to
    int) and column 2 the group of each point. The group column is named
    ``legend_label`` and used as hue. ``group_labels`` is accepted but not
    used.
    """
    frame = DataFrame({
        'x': data[:, 1].astype(int),
        'y': data[:, 0].astype(float),
        legend_label: data[:, 2],
    })

    sns.set(style="whitegrid")
    groups = np.unique(frame[legend_label])
    sns.pointplot(data=frame, x="x", y="y", hue=legend_label, hue_order=groups)

    plt.title(title, fontsize=25)
    plt.ylabel(y_title, fontsize=12)
    plt.xlabel(x_title, fontsize=12)
    plt.show()
def students(runs=15):
    """Plot clustering quality (rand index) against difficulty shift.

    For every run and difficulty shift a data set is generated, each
    similarity measure from the module-level ``similarities`` is applied, the
    items are clustered and the rand index against the true concepts is
    recorded. The results are printed and drawn with one line per similarity.

    Relies on the module-level names ``data``, ``clustering``,
    ``similarities``, ``rand_index``, ``n_students``, ``n_items``,
    ``n_clusters``, ``skill_correlation`` and ``missing``.

    Args:
        runs: Number of repetitions of the whole experiment.
    """
    results = []
    for run in range(runs):
        for difficulty_shift in np.arange(-1, 1.1, 0.2):
            answers, items = data(n_students=n_students, n_items=n_items, n_concepts=n_clusters, skill_correlation=skill_correlation, difficulty_shift=difficulty_shift, missing=missing)
            true_cluster_names = list(items['concept'].unique())
            for similarity, euclid, similarity_name in similarities:
                X = similarity(answers)
                items_ids = X.index
                # ``.at`` replaces DataFrame.get_value, which no longer
                # exists in current pandas.
                ground_truth = np.array([true_cluster_names.index(items.at[item, 'concept']) for item in items_ids])

                labels = clustering(X, n_clusters, euclid=euclid)
                rand = rand_index(ground_truth, labels)
                results.append([n_students, clustering.__name__, rand, skill_correlation, difficulty_shift, similarity_name])
                print(run, n_students, similarity_name, rand)

    results = pd.DataFrame(results, columns=['students', 'clustering', 'rand_index', 'skill_correlation', 'difficulty_shift', 'similarity'])
    print(results)

    plt.figure(figsize=(16, 24))
    sns.pointplot(data=results, x='difficulty_shift', y='rand_index', hue='similarity')
Esempio n. 24
0
def make_interval_plots(df):
    """Save one point plot per field against ``interval`` under ``figs/``.

    Iterates over the module-level ``fields`` plus
    ``n_shared_per_passenger``; also uses ``aux_fig_size`` and ``prettify``.
    """
    line_color = sns.xkcd_rgb["bright red"]
    for field in tqdm(fields + ["n_shared_per_passenger"]):
        _, axis = plt.subplots(1, 1, figsize=aux_fig_size)
        axis = sns.pointplot(data=df, x="interval", y=field,
                             color=line_color, ax=axis)
        pretty_name = prettify(field)
        axis.set_xlabel("Step Size [s]")
        axis.set_ylabel(pretty_name)
        if "%" in pretty_name:
            # Percentage fields are stored as fractions; relabel ticks as %.
            axis.set_ylim(0, 1)
            axis.set_yticklabels(
                ['{:3.0f}%'.format(x * 100) for x in axis.get_yticks()])

        plt.savefig("figs/interval-{}.png".format(field), bbox_inches="tight")
        plt.close()
Esempio n. 25
0
def process_graph(university):
    """Plot the yearly world rank of the given universities per ranking source.

    Ranks from the Times, Shanghai and CWUR data sets are combined into one
    point plot (one line per source) that is saved to
    ``resources/images/university.png``.

    Args:
        university: Sequence of university names to select; the first entry
            is used in the plot title.
    """
    # Local import keeps this block self-contained; it is a no-op when pandas
    # is already imported at module level.
    import pandas as pd

    def _ranks_for(frame, name_column, source):
        # Rows of the requested universities, tagged with their source. The
        # explicit copy avoids assigning into a slice of the original frame.
        selected = frame[frame[name_column].isin(university)]
        ranks = selected[['world_rank', 'year']].copy()
        ranks['source'] = source
        return ranks

    times_plot_data = _ranks_for(Data.get_time_data(), 'university_name', 'Times')
    shanghai_plot_data = _ranks_for(Data.get_shanghai_data(), 'university_name', 'Shanghai')
    cwur_plot_data = _ranks_for(Data.get_cwur_data(), 'institution', 'CWUR')

    # Rank ranges such as "101-150" are reduced to their first number.
    for ranged in (times_plot_data, shanghai_plot_data):
        ranged['world_rank'] = ranged['world_rank'].str.split('-').str[0]

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    plot_data = pd.concat([times_plot_data, shanghai_plot_data, cwur_plot_data])
    plot_data['world_rank'] = plot_data['world_rank'].astype(int)

    sns.set(style="ticks", color_codes=True)
    plt.rcParams['figure.figsize'] = 16, 12
    ax = sns.pointplot(x='year', y='world_rank', hue='source', data=plot_data)

    # Styling
    plt.title(university[0] + " Ranking", fontsize=26)
    plt.xticks(fontsize=20)
    plt.yticks(fontsize=20)
    plt.ylabel("World Rank", fontsize=26)
    plt.xlabel("Year", fontsize=26)
    plt.tight_layout()
    plt.legend(loc='upper left', fontsize=20)
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.get_xaxis().tick_bottom()
    ax.get_yaxis().tick_left()

    # Save the file, then reset pyplot state for the next plot.
    plt.savefig('resources/images/university.png')
    plt.cla()
    plt.clf()
    plt.close()
Esempio n. 26
0
def compare2(y, y_hat1, y_hat2):
    """Show the MAE of blending two predictions for every blend weight.

    For 101 evenly spaced weights ``theta`` in [0, 1] the predictions are
    blended with weights ``[theta, 1 - theta]`` via ``blend2`` and scored
    with ``mae``. The curve is shown with a vertical line at the best weight,
    which is also reported in the x label.

    Args:
        y: True values.
        y_hat1: First set of predictions (weight ``theta``).
        y_hat2: Second set of predictions (weight ``1 - theta``).
    """
    # ``sns.plt`` is not available in current seaborn; import pyplot locally
    # so the block does not depend on a module-level alias.
    import matplotlib.pyplot as plt

    thetas = np.linspace(0, 1, num=101)
    maes = np.array([
        mae(y, blend2([y_hat1, y_hat2], [theta, 1 - theta]))
        for theta in thetas
    ])
    min_i = np.argmin(maes)

    df = pd.DataFrame({'mae': maes, 'theta': thetas})
    sns.set_style('darkgrid')
    ax = sns.pointplot(x='theta', y='mae', data=df)
    ax.set_xticks([])
    # The x axis is categorical, so position ``min_i`` is thetas[min_i].
    ax.axvline(x=min_i)
    lab = 'BEST (theta: ' + str(thetas[min_i]) + ', MAE: ' + str(maes[min_i]) + ')'
    ax.set(xlabel=lab, ylabel='MAE')
    plt.show()
Esempio n. 27
0
def make_demand_plots(df):
    """Save one point plot per field against ``demand`` under ``figs/``.

    Each plot has one line per capacity. Iterates over the module-level
    ``fields`` plus ``n_shared_per_passenger``; also uses ``aux_fig_size``,
    ``dem_clrs`` and ``prettify``.
    """
    for field in tqdm(fields + ["n_shared_per_passenger"]):
        _, axis = plt.subplots(1, 1, figsize=aux_fig_size)
        axis = sns.pointplot(data=df, x="demand", y=field, hue="capacity",
                             palette=dem_clrs, ax=axis)
        pretty_name = prettify(field)
        axis.set_xlabel("Nominal Number of Requests")
        axis.set_ylabel(pretty_name)
        axis.set_xticklabels(["x0.5", "x1", "x2"])
        if "%" in pretty_name:
            # Percentage fields are stored as fractions; relabel ticks as %.
            axis.set_ylim(0, 1)
            axis.set_yticklabels(
                ['{:3.0f}%'.format(x * 100) for x in axis.get_yticks()])
        # Keep seaborn's handles but relabel the legend entries.
        legend_handles = axis.get_legend_handles_labels()[0]
        axis.legend(legend_handles, [1, 4], title="Capacity")
        plt.savefig("figs/demand-{}.png".format(field), bbox_inches="tight")
        plt.close()
Esempio n. 28
0
def compare_multiway(y, y_hat):
    """Show the MAE of blending several predictions for every first weight.

    For 101 evenly spaced weights ``theta`` in [0, 1] the first prediction
    gets weight ``theta`` and the remaining weight ``1 - theta`` is split
    evenly among the other predictions. Each blend (via ``blend``) is scored
    with ``mae``; the curve is shown with a vertical line at the best weight,
    which is also reported in the x label.

    Args:
        y: True values.
        y_hat: Sequence of at least two sets of predictions.
    """
    # ``sns.plt`` is not available in current seaborn; import pyplot locally
    # so the block does not depend on a module-level alias.
    import matplotlib.pyplot as plt

    n_others = len(y_hat) - 1
    thetas = np.linspace(0, 1, num=101)
    maes = []
    for theta in thetas:
        # Weight shared by every prediction except the first one.
        theta2 = (1 - theta) / n_others
        blended = blend(y_hat, [theta] + list(np.repeat(theta2, n_others)))
        maes.append(mae(y, blended))
    maes = np.array(maes)
    min_i = np.argmin(maes)

    df = pd.DataFrame({'mae': maes, 'theta': thetas})
    sns.set_style('darkgrid')
    ax = sns.pointplot(x='theta', y='mae', data=df)
    ax.set_xticks([])
    # The x axis is categorical, so position ``min_i`` is thetas[min_i].
    ax.axvline(x=min_i)
    lab = 'BEST (theta: ' + str(thetas[min_i]) + ', MAE: ' + str(maes[min_i]) + ')'
    ax.set(xlabel=lab, ylabel='MAE')
    plt.show()
Esempio n. 29
0
def save_solvers_cmp(is_power_point=False):
    """Plot test error per epoch for several solvers and save the figure.

    Reads every ``../output/cifar10classifier_resnet32_<solver>.csv`` whose
    solver is one of adadelta, adagrad, adam, momentum or rmsprop, and writes
    the comparison to ``../figures/resnet.solvers.png``.

    Args:
        is_power_point: Use a larger legend (marker scale and font size)
            suited to slides.
    """
    # ``sns.plt`` is not available in current seaborn; import pyplot locally
    # so the block does not depend on a module-level alias.
    import matplotlib.pyplot as plt

    solvers = {"adadelta", "adagrad", "adam", "momentum", "rmsprop"}
    dfs = []
    for filename in glob.glob("../output/cifar10classifier_resnet32_*.csv"):
        # The solver name is the last "_"-separated part of the file name.
        target = filename.split("_")[-1].split(".csv")[0]
        if target not in solvers:
            continue
        df = pd.read_csv(filename)
        df["train_error"] = 1 - df["train_accuracy"]
        df["test_error"] = 1 - df["test_accuracy"]
        dfs.append(df)
    total_df = pd.concat(dfs)
    total_df["name"] = total_df["name"].str.split("_").str.get(-1).str.replace("Momentum", "Nesterov(Original Paper)")

    ax = sns.pointplot(x="epoch", y="test_error", hue="name", data=total_df, scale=0.2)
    if is_power_point:
        ax.legend(loc="lower left", markerscale=9.0, fontsize=20)
    else:
        ax.legend(loc="lower left", markerscale=3.0)
    ax.set(ylim=(0, 0.2))
    # Label only every tenth epoch to keep the axis readable.
    ax.set_xticklabels([i if i % 10 == 0 else "" for i in range(200)])
    ax.set(xlabel='epoch', ylabel='error(%)')

    fig = ax.get_figure()
    fig.savefig("../figures/resnet.solvers.png")
    plt.close(fig)
Esempio n. 30
0
def twentyMins(ultilive, nattys):
    """Plot tweets per twenty-minute slot for two sources and save the figure.

    The plotted window starts on day 16 at 00:00 and covers 288 slots of
    twenty minutes (four days). Tweets outside that window are ignored. The
    figure is written to ``ByTwenty.png``.

    Args:
        ultilive: DataFrame of "Ultiworld Live" tweets with numeric ``day``,
            ``hour`` and ``minute`` columns.
        nattys: DataFrame of "#NationalsTX" tweets with the same columns.
    """
    first_day = 16
    slots_per_hour = 3
    slots_per_day = 24 * slots_per_hour
    n_slots = 288

    def _slot_counts(tweets):
        # Count tweets per twenty-minute slot of the plotted window.
        counts = np.zeros(n_slots)
        for _, tweet in tweets.iterrows():
            # ``minute // 20`` is the slot within the hour (0, 1 or 2).
            slot = int((tweet['day'] - first_day) * slots_per_day
                       + tweet['hour'] * slots_per_hour
                       + tweet['minute'] // 20)
            # Skip tweets outside the window: a negative index would silently
            # wrap around to the end of the array.
            if 0 <= slot < n_slots:
                counts[slot] += 1
        return counts

    ulti_twentyMins = _slot_counts(ultilive)
    natty_twentyMins = _slot_counts(nattys)

    sns.set(style='darkgrid', context='poster')
    plt.figure(figsize=(20, 15))

    # Long-form frame: one row per (slot, source).
    dfTime = pd.DataFrame({
        'Time': np.tile(np.arange(n_slots), 2),
        'Tweets': np.concatenate([ulti_twentyMins, natty_twentyMins]),
        'Source': ["Ultiworld Live"] * n_slots + ["#NationalsTX"] * n_slots,
    })

    # Keyword x/y/hue: current seaborn rejects positional plot variables.
    s = sns.pointplot(x='Time', y='Tweets', hue='Source', data=dfTime,
                      palette="Paired")
    s.set_title("Tweets per Twenty Minutes")
    s.set_xticks([72, 144, 216, 287])
    s.set_xticklabels(['17th', '18th', '19th', '20th'])
    s.set_xlabel("Day")

    s.axis([0, 288, 0, 50])

    s.figure.savefig("ByTwenty.png")
Esempio n. 31
0
# Plot the results. The first panel shows every individual score as a strip
# plot, overlaid with a point plot of the average performance of each
# pipeline across sessions and subjects.

fig, axes = plt.subplots(1, 2, figsize=[8, 4], sharey=True)

# Options shared by both layers drawn on the first panel.
_score_options = dict(
    data=results,
    x="pipeline",
    y="score",
    ax=axes[0],
    zorder=1,
    palette="Set1",
)
sns.stripplot(jitter=True, alpha=0.5, **_score_options)
sns.pointplot(**_score_options)

axes[0].set_ylabel("ROC AUC")
axes[0].set_ylim(0.5, 1)

##############################################################################
# The second plot is a paired scatter plot. Each point represents the score
# of a single session. An algorithm outperforms the other if most of the
# points are in its quadrant.

paired = results.pivot_table(
    index=["subject", "session"], columns="pipeline", values="score"
).reset_index()

sns.regplot(data=paired, x="CSP+LDA", y="RG+LR", ax=axes[1], fit_reg=False)
def plot_performance_graph(metric='AUROC',
                           evaluation_set='dev',
                           title="",
                           file_name="",
                           data=None,
                           color_map=None):
    """
    Plot the graphs onto a multi-subplot grid using seaborn and save the
    result as a PNG. The grid dimensions come from the module-level
    ``file_tree``.
    Args:
        metric - the metric to plot for the y axis
        evaluation_set - whether to plot the dev set or test set
        title - the main title of the large graph
        file_name - the name of the file to save the graph
        data - nested mapping (outer key -> inner key -> evaluation set ->
            dataframe) holding the values to plot
        color_map - maps each key of ``data`` to the color used for it
    """
    fig, axes = plt.subplots(len(file_tree),
                             len(file_tree["DaG"]),
                             figsize=(25, 15),
                             sharey='row')

    # Column of the dataframes that holds the plotted metric.
    y_column = metric if metric == "AUROC" else "AUPRC"

    for row_ind, outer_key in enumerate(data):
        for col_ind, inner_key in enumerate(data[outer_key]):
            cell = axes[row_ind][col_ind]
            entry = data[outer_key][inner_key]

            if metric == "AUROC":
                cell.set_ylim([0.5, 1])

            if metric == "AUPR":
                cell.set_ylim([0, 0.7])

            if len(entry) == 0:
                # Data not available yet: leave a placeholder note.
                lower, upper = cell.get_ylim()
                cell.annotate("Coming Soon!!",
                              (0.2, (lower + upper) / 2),
                              color="red",
                              fontsize=20)
            else:
                frame = entry[evaluation_set]
                sns.pointplot(data=frame,
                              x="num_lfs",
                              y=y_column,
                              hue="label",
                              ax=cell,
                              ci="sd",
                              scale=1.2,
                              markers=["^", "o"])

                # remove x axis labels
                cell.set_xlabel('')
                cell.get_legend().remove()

                # Fragile: relies on the order in which seaborn adds the
                # point collections and error-bar lines to the axes.
                for idx, item in enumerate(cell.get_children()):
                    if idx in (0, 1):
                        # Point collections: the first point takes the outer
                        # key's color, all others the inner key's color.
                        point_colors = [
                            color_map[outer_key] if index == 0
                            else color_map[inner_key]
                            for index in range(len(frame.num_lfs.unique()))
                        ]
                        item.set_edgecolor(point_colors)
                        item.set_facecolor(point_colors)
                    elif isinstance(item, plt.Line2D):
                        if idx in (2, 9):
                            item.set_linestyle('dashed')
                            item.set_color("black")
                            item.set_alpha(0.25)
                        else:
                            item.set_color(color_map[inner_key])

            # only set first column and first row titles
            if row_ind == 0:
                cell.set_title(inner_key, color=color_map[inner_key])

            if col_ind == 0:
                cell.set_ylabel(outer_key, color=color_map[outer_key])
            else:
                cell.set_ylabel('')

    # Uniform font sizes across every subplot.
    for subplot in axes.flat:
        subplot.title.set_fontsize(30)
        subplot.yaxis.label.set_fontsize(24)
        subplot.xaxis.label.set_fontsize(24)
        for tick in subplot.get_yticklabels() + subplot.get_xticklabels():
            tick.set_fontsize(23)

    if "label" in data["DaG"]["DaG"]["dev"].columns:
        legend_ax = axes.flatten()[3]
        legend_ax.legend(loc='upper center',
                         bbox_to_anchor=(2.54, 0.8),
                         fontsize=20)
        leg = legend_ax.get_legend()
        # Draw the first two legend markers as black outlines on white.
        for handle_index in (0, 1):
            leg.legendHandles[handle_index].set_edgecolor('black')
            leg.legendHandles[handle_index].set_facecolor('white')

    fig.text(0.5, 0.89, 'Label Sources', ha='center', fontsize=30)
    fig.text(0.5,
             0.04,
             'Number of Additional Label Functions',
             ha='center',
             fontsize=30)
    fig.text(0.04,
             0.5,
             f'Predicted Relations ({metric})',
             va='center',
             rotation='vertical',
             fontsize=25)
    fig.suptitle(title, fontsize=30)
    fig.text(0.69,
             0.02,
             '0-Only Uses Relation Specific Databases.',
             fontsize=27)
    plt.subplots_adjust(top=0.85)
    plt.savefig(file_name, format='png')
Esempio n. 33
0
def plot_roi_per_session(
    df,
    x='Session',
    y='Mean t-Statistic',
    condition='treatment',
    unit='subject',
    ci=90,
    palette=["#56B4E9", "#E69F00"],
    dodge=True,
    order=[],
    feature_map=True,
    roi_left=0.02,
    roi_bottom=0.74,
    roi_width=0.3,
    roi_height=0.2,
    roi_anat='/usr/share/mouse-brain-atlases/dsurqec_40micron_masked.nii',
    roi_threshold=None,
    cut_coords=None,
    samri_style=True,
    renames=[],
    save_as='',
    ax=None,
    fig=None,
):
    """Draw a point plot of `y` over `x`, optionally with an inset showing the ROI.

    Parameters
    ----------
    df : pandas.DataFrame or path-like
        Long-format data. A path is user-expanded, made absolute and loaded
        with `pandas.read_csv`; anything else is used as given.
    x, y, condition, unit : str
        Column names forwarded to `seaborn.pointplot` as `x`, `y`, `hue` and
        `units` respectively. `y` is also used as the y-axis label.
    ci, dodge, order
        Forwarded unchanged to `seaborn.pointplot`.
    palette : list
        Colors, converted with `seaborn.color_palette`.
    feature_map : bool or str
        If a string, it is handed to `maps.stat` / `maps.atlas_label` for the
        inset. If otherwise truthy, the first unique value of `df['feature']`
        is used; when `df` has no such column no inset is drawn.
    roi_left, roi_bottom, roi_width, roi_height : float
        Inset axes rectangle; `roi_left`/`roi_bottom` are offsets added to the
        `figure.subplot.left`/`figure.subplot.bottom` rcParams.
    roi_anat : str
        Template passed to `maps.stat`.
    roi_threshold, cut_coords
        When both are truthy the inset is drawn with `maps.stat`, otherwise
        with `maps.atlas_label`.
    samri_style : bool
        Apply the 'seaborn-darkgrid' and then the 'ggplot' matplotlib styles.
    renames : mapping
        `{column: {old_value: new_value}}`; applied to `df` in place.
    save_as : str
        If non-empty, the figure is saved to this (user-expanded) path.
    ax : matplotlib Axes, optional
        Axes to draw the point plot on; a new one is created when omitted.
    fig : matplotlib Figure, optional
        Returned as is; `plt.figure(1)` is used when omitted.

    Returns
    -------
    tuple
        `(fig, ax)` where `ax` is the axes returned by `seaborn.pointplot`.
    """

    if samri_style:
        plt.style.use(u'seaborn-darkgrid')
        plt.style.use('ggplot')

    # Treat `df` as a path when it can be interpreted as one. Python 3 raises
    # TypeError (not AttributeError) from `expanduser` for non-path objects
    # such as a DataFrame, so both must be tolerated.
    try:
        df_path = path.abspath(path.expanduser(df))
    except (AttributeError, TypeError):
        pass
    else:
        import pandas as pd
        df = pd.read_csv(df_path)

    # definitions for the axes
    # NOTE(review): the `top`/`right` rcParams are used as height/width of the
    # axes rectangle; kept as in the original, confirm this is intended.
    height = rcParams['figure.subplot.top']
    bottom = rcParams['figure.subplot.bottom']
    left = rcParams['figure.subplot.left']
    width = rcParams['figure.subplot.right']

    session_coordinates = [left, bottom, width, height]

    roi_coordinates = [
        left + roi_left, bottom + roi_bottom, roi_width, roi_height
    ]

    if not fig:
        fig = plt.figure(1)

    if renames:
        for key in renames:
            for subkey in renames[key]:
                df.loc[df[key] == subkey, key] = renames[key][subkey]

    if not ax:
        ax1 = plt.axes(session_coordinates)
    else:
        ax1 = ax
    ax = sns.pointplot(
        x=x,
        y=y,
        units=unit,
        data=df,
        hue=condition,
        dodge=dodge,
        palette=sns.color_palette(palette),
        order=order,
        ax=ax1,
        ci=ci,
    )
    ax.set_ylabel(y)

    if isinstance(feature_map, str):
        ax2 = plt.axes(roi_coordinates)
        if roi_threshold and cut_coords:
            # The map to draw is `feature_map` itself; the original referenced
            # the name `feature`, which is not defined in this branch.
            maps.stat(
                feature_map,
                cut_coords=cut_coords,
                template=roi_anat,
                annotate=False,
                scale=0.3,
                show_plot=False,
                interpolation=None,
                threshold=roi_threshold,
                draw_colorbar=False,
                ax=ax2,
            )
        else:
            maps.atlas_label(
                feature_map,
                scale=0.3,
                color="#E69F00",
                ax=ax2,
                annotate=False,
                alpha=0.8,
            )
    elif feature_map:
        try:
            features = df['feature'].unique()
        except KeyError:
            # No feature column: nothing to highlight, skip the inset.
            pass
        else:
            if len(features) > 1:
                print(
                    'WARNING: The features list contains more than one feature. We will highlight the first one in the list. This may be incorrect.'
                )
            feature = features[0]
            ax2 = plt.axes(roi_coordinates)
            if path.isfile(feature):
                if roi_threshold and cut_coords:
                    maps.stat(
                        stat_maps=feature,
                        cut_coords=cut_coords,
                        template=roi_anat,
                        annotate=False,
                        scale=0.3,
                        show_plot=False,
                        interpolation=None,
                        threshold=roi_threshold,
                        draw_colorbar=False,
                        ax=ax2,
                    )
                else:
                    maps.atlas_label(
                        feature,
                        scale=0.3,
                        color="#E69F00",
                        ax=ax2,
                        annotate=False,
                        alpha=0.8,
                    )
            else:
                # `feature` is not a file: use it as a label name and take the
                # atlas and mapping from the first value of the matching columns.
                atlas = df['atlas'].unique()[0]
                mapping = df['mapping'].unique()[0]
                if isinstance(feature, str):
                    feature = [feature]
                maps.atlas_label(
                    atlas,
                    scale=0.3,
                    color="#E69F00",
                    ax=ax2,
                    mapping=mapping,
                    label_names=feature,
                    alpha=0.8,
                    annotate=False,
                )

    if save_as:
        plt.savefig(path.abspath(path.expanduser(save_as)),
                    bbox_inches='tight')

    return fig, ax
Esempio n. 34
0
# Show each observation with a scatterplot; keep the axes it draws on so the
# mean markers and the legend below act on the same plot.
ax = sns.stripplot(x="measurement",
                   y="value",
                   hue="species",
                   data=iris,
                   dodge=True,
                   jitter=True,
                   alpha=.25,
                   zorder=1)

# Show the conditional means
sns.pointplot(x="measurement",
              y="value",
              hue="species",
              data=iris,
              dodge=.532,
              join=False,
              palette="dark",
              markers="d",
              scale=.75,
              ci=None,
              ax=ax)

# Improve the legend: the first three handles are skipped so that only the
# later ones (added by the second plotting call) are listed.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles[3:],
          labels[3:],
          title="species",
          handletextpad=0,
          columnspacing=1,
          loc="lower right",
          ncol=3,
          frameon=True)
import seaborn as sb
from matplotlib import pyplot as plt

# Load the dataset seaborn registers under the name 'titanic'.
df = sb.load_dataset('titanic')
# Point plot of `survived` per `sex`, one line per `class`.
sb.pointplot(x="sex", y="survived", hue="class", data=df)
plt.show()
Esempio n. 36
0
##############################################################################
# Plot Results
# ------------
#
# Here we plot the results.

fig, ax = plt.subplots(facecolor="white", figsize=[8, 4])

n_subs = len(dataset.subject_list)

if n_subs > 1:
    # Average within each (pipeline, subject, data_size) cell so that the
    # error bars below reflect variation across subjects.
    # numeric_only=True keeps this working on pandas >= 2.0, where averaging
    # non-numeric columns raises instead of silently dropping them.
    r = (results.groupby(["pipeline", "subject", "data_size"])
         .mean(numeric_only=True)
         .reset_index())
else:
    r = results

sns.pointplot(data=r,
              x="data_size",
              y="score",
              hue="pipeline",
              ax=ax,
              palette="Set1")

errbar_meaning = "subjects" if n_subs > 1 else "permutations"
title_str = f"Errorbar shows Mean-CI across {errbar_meaning}"
ax.set_xlabel("Amount of training samples")
ax.set_ylabel("ROC AUC")
ax.set_title(title_str)
fig.tight_layout()
plt.show()
Esempio n. 37
0
    for time in times:
        data_dicts.append({
            "Number of samples": num_samples,
            "Convergence time (s)": time,
            "Inference method": "DP/VI"
        })
# Flatten each {num_samples: [times...]} mapping into one record per timing.
# `.items()` works on Python 2 and 3 (`.iteritems()` exists on Python 2 only).
for num_samples, times in SciClone_times.items():
    for time in times:
        data_dicts.append({
            "Number of samples": num_samples,
            "Convergence time (s)": time,
            "Inference method": "SciClone (VI)"
        })
for num_samples, times in PyClone_times.items():
    for time in times:
        data_dicts.append({
            "Number of samples": num_samples,
            "Convergence time (s)": time,
            "Inference method": "PyClone (DP/MCMC)"
        })

data = pd.DataFrame(data_dicts)
ax = sns.pointplot(x="Number of samples",
                   y="Convergence time (s)",
                   hue="Inference method",
                   data=data,
                   capsize=0.1,
                   markers=['x', 'o', '^'],
                   linestyles=['--', '--', '--'])
# Save through the figure owning the axes; the `sns.plt` alias no longer
# exists in current seaborn releases.
ax.figure.savefig('time_comparisons.png')
Esempio n. 38
0
df_M.head(2)

# <a id='eda'></a>
# ## Exploratory Data Analysis
#
# > After trimming and cleaning the data, we move on to exploration: compute statistics and create visualizations to find patterns in the data and answer the research questions.
# ### Research Question 1 (Top 20 movies based on its Profit)

# In[51]:

# Revenue sorted from highest to lowest, kept as a one-column DataFrame.
info = pd.DataFrame(df_M['revenue'].sort_values(ascending=False))
# Titles are attached by index alignment, so they follow the sorted order.
info['original_title'] = df_M['original_title']
data = list(map(str, (info['original_title'])))
# First 20 titles and their revenues.
x = list(data[:20])
y = list(info['revenue'][:20])
# Revenue goes on the horizontal axis, titles on the vertical axis.
ax = sns.pointplot(x=y, y=x)
# NOTE(review): these settings are applied after the plot was created, so the
# figure size presumably only affects figures made later -- confirm.
sns.set(rc={'figure.figsize': (10, 10)})
# NOTE(review): the title says "Profit" but the plotted column is 'revenue'.
ax.set_title("Top 20 Movies has high Profit", fontsize=15)
ax.set_xlabel("revenue", fontsize=15)
sns.set_style("darkgrid")

# The movie Avatar has the highest profit in the dataset

# ### Research Question 2 (Which movie Has Highest /  Lowest Profit and budget)

# In[52]:

#calculate Profit for each of the movie
#add a new column Profit for each of the movie
df_M['Profit'] = df_M['revenue'] - df_M['budget']
# Produces Pandas Series
plant_growth.groupby('group')['weight'].mean()

#%%
# Produces Pandas DataFrame
plant_growth.groupby('group')[['weight']].mean()

#%%
# Easy and flexible
plant_growth.groupby(['group']).agg({'weight': ['mean', 'std']})

#%%
# plot the data:
sns.boxplot(x="group", y="weight", data=plant_growth)
sns.catplot(x="group", y="weight", data=plant_growth)
sns.pointplot(x="group", y="weight", data=plant_growth, join=False)
sns.catplot(x="group", y="weight", data=plant_growth, kind="point")

#%%
# base R plotting functions:
""" boxplot()
hist()
plot()
plot(density()) """

#%%
# specify the model
# NOTE(review): the import lines below sit inside a string literal, so they
# are not executed here; `ols` must already be in scope from elsewhere.
""" import statsmodels.api as sm
from statsmodels.formula.api import ols """
# Model given by the formula "weight ~ group" on the plant_growth data.
model = ols("weight ~ group", data=plant_growth)
results = model.fit()
Esempio n. 40
0
# In[6]:

train, test = pd.read_csv("../input/train.csv"), pd.read_csv(
    "../input/test.csv")
test_ids = test[["PassengerId"]]

# In[7]:

train.head()

# We will plot various features with their relation to survival rate to have an idea of correlations

# In[8]:

# Three side-by-side panels, all split by Sex.
fig, axs = plt.subplots(ncols=3, figsize=(16, 5))
# Panel 1: Survived against Embarked.
sns.pointplot(x="Embarked", y="Survived", hue="Sex", data=train, ax=axs[0])
# Panel 2: Survived against Pclass.
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=train, ax=axs[1])
# Panel 3: Age distribution for each value of Survived.
sns.violinplot(x="Survived", y="Age", hue="Sex", data=train, ax=axs[2])

# We can already see some (strong) correlation between sex, age, Pclass, embarked and survival rate

# In[9]:

data_age = [
    train[train.Survived == 1].Age.dropna(),
    train[train.Survived == 0].Age.dropna()
]
labels = ["Survived", "Not survived"]
fig = ff.create_distplot(data_age, labels, bin_size=2, show_rug=False)
pyo.iplot(fig)
Esempio n. 41
0
                'Number of articles':
                articles_count,
                'Recall std error':
                Micro_Recall_Values_std_error
            })

        # stop after certain number of articles
        if (articles_count > max_count): break

# visualize Recall values
Articles_Recall_Values.boxplot()

# visualize standard errors values: stack the macro and micro records into
# one long table, tagged by recall type
Recall_std_errors = pd.DataFrame(Macro_Recall_st_errors)
Recall_std_errors['Recall Type'] = 'Macro Recall'

temp_df = pd.DataFrame(Micro_Recall_st_errors)
temp_df['Recall Type'] = 'Micro Recall'

# pd.concat replaces DataFrame.append (removed in pandas 2.0); like append
# without ignore_index, it keeps the original row labels.
Recall_std_errors = pd.concat([Recall_std_errors, temp_df])

plt.figure(figsize=(8, 6))
# `col` is a facet-grid argument and not a pointplot parameter, so it is not
# passed here; the two recall types are already separated by `hue`.
fig = sns.pointplot(x='Number of articles',
                    y='Recall std error',
                    hue='Recall Type',
                    data=Recall_std_errors)

fig.set(xlabel="Number of Articles")
fig.set(ylabel="Recall Standard Error")
Esempio n. 42
0
# just by eyeballing, normal distrib does not capture tails
# nbinom seems like best description, but params are not very intuitive...
import scipy.stats as stats
fig, axes = plt.subplots(len(strains2plot))
fit = []
for ax, (strain, group) in zip(axes.ravel(), parts[(parts.mass_norm>=2)&(parts.corrwideal>=0.5)&(parts.strain.isin(strains2plot))].groupby('strain')):
    values = group.mass_norm.values
    fit.append((strain, stats.probplot(values, dist="geom", sparams=(0.3), plot=ax)[1]))

# Summary plot
# Rows with mass_norm >= 8 and corrwideal >= 0.5, restricted to the strains
# listed in strains2plot.
df2plot =  parts[(parts.mass_norm>=8)&(parts.corrwideal>=0.5)&(parts.strain.isin(strains2plot))]
# NOTE(review): `order` is not used by the plotting calls below, which pass
# order=strains2plot instead.
order = sorted(df2plot.CTDr.unique())
# Only the figure/axes creation happens inside the style context.
with sns.axes_style(*style):
    fig, ax = plt.subplots(figsize=(9, 6))
# pooled: median per strain, ci=99, markers not joined
sns.pointplot(x='strain', y='mass_norm', order=strains2plot,
        data=df2plot, join=False,
        estimator=np.median, ci=99, ax=ax)
# each image: one point per (mov_name, strain) group median
# NOTE(review): this filter uses mass_norm >= 7 while the pooled data above
# uses >= 8 -- confirm the difference is intended.
sns.stripplot(x='strain', y='mass_norm', order=strains2plot,
        data=parts[(parts.mass_norm>=7)&(parts.corrwideal>=0.5)&(parts.strain.isin(strains2plot))].groupby(['mov_name', 'strain']).median().reset_index(),
        ax=ax, size=12, alpha=0.25)
# y-limits run from 0.2 below the first y tick to the last y tick.
ax.set(ylabel='Median TS intensity (a.u.)', xlabel='CTD repeats', ylim=(ax.get_yticks()[0]-0.2, ax.get_yticks()[-1]))
sns.despine(left=False, bottom=False)
plt.tight_layout()

# check peaks found
from skimage import io
import trackpy as tp
imname = '03052019_yQC21_255u100%int480_150msExp_30-45minPosGal_6_w2GFPlow'
imname = '03272019_TL47pQC99_255u100%int480_150msExp_30-50minPosGal_13_w2GFPlow'
Esempio n. 43
0
# Set the tick spacing of the axis.
# `ax` is the current axes instance.
ax = plt.gca()
# Locator that places a major tick at every multiple of 1.
x_major_locator = MultipleLocator(1)
# Use that locator for the major ticks of the x axis.
ax.xaxis.set_major_locator(x_major_locator)

sns.set_context(context="poster", font_scale=0.1)
plt.plot(df["date"], df["high"], c='green')
# Write a text label above each position.
# NOTE(review): the labels are the value counts of `high`, not the plotted
# temperatures themselves -- confirm this is intended.
for x, y in zip(np.arange(len(df["date"].value_counts())),
                df["high"].value_counts()):
    plt.text(x, y, y, ha="center", va="bottom")
plt.savefig("weather.jpg")
plt.show()

# x and y are passed by keyword: current seaborn releases no longer accept
# them positionally.
sns.pointplot(x=df["date"], y=df["high"])
sns.set_context(context="poster", font_scale=0.1)
plt.savefig("2.jpg")
plt.show()

# NOTE(review): `ymbol_size` looks like a typo for `symbol_size`, and
# `is_label_show` receives a Series rather than a bool; both are left
# unchanged until checked against the charting library in use.
line.add("气温",
         df["date"],
         df["high"],
         ymbol_size=2,
         is_step=False,
         is_label_show=df['date'])
line.render("zhexian.html")  # write the chart to an HTML file
# print(df_new.head(10))
Esempio n. 44
0
#     <li>linestyles : string or list of strings, optional</li>
#     <li>color : matplotlib color, optional</li>
#     <li>palette : palette name, list, or dict, optional</li>
#     <li>ax : matplotlib Axes, optional</li>
# </ul>
#
#

# In[ ]:

# Point plot of math and reading scores for the 'group B' rows.
data['Race/Ethnicity'].unique()
group_b = data[(data['Race/Ethnicity'] == 'group B')]
# One x position per row, starting at 1. Derived from the data instead of the
# hard-coded np.arange(1, 191), which only matched a group of exactly 190 rows.
group_b_index = np.arange(1, len(group_b) + 1)
f, ax1 = plt.subplots(figsize=(25, 10))
sns.pointplot(x=group_b_index,
              y=group_b.Math_Score,
              color='lime',
              alpha=0.8)
sns.pointplot(x=group_b_index,
              y=group_b.Reading_Score,
              color='red',
              alpha=0.5)
plt.xlabel('Group B index State')
plt.ylabel('Frequency')
plt.title('Group B Math Score & Reading_Score')
plt.xticks(rotation=90)
plt.grid()
plt.show()

# In[ ]:
# Graph individual features against survival on a 2x3 grid of axes.
fig, saxis = plt.subplots(2, 3, figsize=(16, 12))

# Top row: bar plots of Survived per category.
sns.barplot(x='Embarked', y='Survived', data=data1, ax=saxis[0, 0])
sns.barplot(x='Pclass',
            y='Survived',
            order=[1, 2, 3],
            data=data1,
            ax=saxis[0, 1])
sns.barplot(x='Isalone',
            y='Survived',
            order=[1, 0],
            data=data1,
            ax=saxis[0, 2])

# Bottom row: point plots of Survived per fare bin, age bin and family size.
sns.pointplot(x='FareBin', y='Survived', data=data1, ax=saxis[1, 0])
sns.pointplot(x='AgeBin', y='Survived', data=data1, ax=saxis[1, 1])
sns.pointplot(x='FamilyMembers', y='Survived', data=data1, ax=saxis[1, 2])

# In[ ]:

#graph distribution of qualitative data: Pclass
#we know class mattered in survival, now let's compare class and a 2nd feature
fig, (axis1, axis2, axis3) = plt.subplots(1, 3, figsize=(14, 12))

sns.boxplot(x='Pclass', y='Fare', hue='Survived', data=data1, ax=axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')

sns.violinplot(x='Pclass',
               y='Age',
               hue='Survived',
# NOTE(review): `part` is presumably the male Year/Sex count series built just
# before this snippet -- confirm against the preceding lines.
part.loc[:,'M'].plot()
plt.title('Variation of Male Athletes over time')
part = WomenOverTime.groupby('Year')['Sex'].value_counts()
plt.figure(figsize=(20, 10))
part.loc[:,'F'].plot()
plt.title('Variation of Female Athletes over time')

# In every seaborn call below x and y are passed by keyword: current seaborn
# releases reject them as positional arguments.
plt.figure(figsize=(20, 10))
sns.boxplot(x='Year', y='Age', data=MenOverTime)
plt.title('Variation of Age for Male Athletes over time')
MenOverTime.loc[MenOverTime['Age'] > 80].head(10)
plt.figure(figsize=(20, 10))
sns.boxplot(x='Year', y='Age', data=WomenOverTime)
plt.title('Variation of Age for Female Athletes over time')
WomenOverTime.loc[WomenOverTime['Year'] == 1904]

plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Weight', data=MenOverTime)
plt.title('Variation of Weight for Male Athletes over time')
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Weight', data=WomenOverTime)
plt.title('Variation of Weight for Female Athletes over time')
womenInOlympics.loc[womenInOlympics['Year'] < 1924].head(20)

plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Height', data=MenOverTime, palette='Set2')
plt.title('Variation of Height for Male Athletes over time')
plt.figure(figsize=(20, 10))
sns.pointplot(x='Year', y='Height', data=WomenOverTime, palette='Set2')
plt.title('Variation of Height for Female Athletes over time')
WomenOverTime.loc[(WomenOverTime['Year'] > 1924) & (WomenOverTime['Year'] < 1952)].head(10)
MenOverTime.head(5)
# Subset of the male athletes whose region is Italy.
itMenOverTime = MenOverTime.loc[MenOverTime['region'] == 'Italy']
itMenOverTime.head(5)
Esempio n. 47
0
# Mean response time per target location, in group order.
means = dataTarLoc.groupby(['cond_tarLocation'])['responseTime'].mean().values
dataTarLocER.accuracy = (1-dataTarLocER.accuracy)*100  # make accuracy error rate

# Plotting: a tall response-time panel (5 of 6 rows) above a short
# error-rate panel (1 of 6 rows) sharing the x axis.
fig1 = plt.figure(figsize=(3.25, 6), dpi=100)
ax1 = plt.subplot2grid((6, 1), (0, 0), rowspan=5, colspan=1)
sns.violinplot(x='cond_tarLocation', y='responseTime', data=dataTarLoc, cut=vioCut, saturation=vioSat, linewidth=vioLw, palette=pTarLoc)
sns.swarmplot(x="cond_tarLocation", y="responseTime", data=dataTarLoc, color=swaCol, alpha=swaAlp, linewidth=swaLwE, edgecolor=swaColE)
# Overlay the means; plotting `means` directly keeps x and y the same length
# for any number of locations (the original hard-coded exactly three values).
ax1.plot(range(len(means)), means, color=lpColor, marker=lpMarker, markersize=lpMarkerS,
        markeredgecolor=lpMarkerEC, markeredgewidth=lpMarkerEW, lw=lpLw, ls=lpLs, zorder=3)
ax1.set_xlabel('')
ax1.set_ylabel("Response Time [in ms]")
ax2 = plt.subplot2grid((6, 1), (5, 0), rowspan=1, colspan=1, sharex=ax1)
# ax2 was just created and is the current axes; it is named explicitly here.
sns.pointplot(x='cond_tarLocation', y='accuracy', data=dataTarLocER, color=lpColor, markers=lpMarker, ci=ci, ax=ax2)
ax2.axes.get_xaxis().set_visible(False)
ax2.axes.get_xaxis().set_ticks([])
ax2.yaxis.set_ticks(np.arange(3, 11, 2))
plt.ylabel('Error Rate\n [in %]')
sns.despine(offset=10, trim=True)
plt.show()

###  Figure 2 ###

# Data selection: RT
dataTarLocGrad = pd.pivot_table(data[(data.cond_disPresent == 'absent') & (data.RTquicker200 == 0)],
                                values='responseTime', index='subject_nr', columns='TarDistanceFromColor')
dataTarLocGrad = pd.melt(
    dataTarLocGrad.reset_index(),
    id_vars='subject_nr',
Esempio n. 48
0
def model_(x_train, y_train, x_test, y_test, boost_type='lgb'):
    """Fit a boosted classifier, pick a probability threshold and report results.

    The model is fitted on the training data. For thresholds 0.00, 0.01, ...,
    0.99, F1 and accuracy are computed on both the training and test data.
    The threshold with the highest test F1 is then used for the printed
    confusion matrices. Two figures (F1 and accuracy against threshold) are
    shown, and the ten largest feature importances are displayed.

    Relies on the module-level names `feature` (feature names matching
    `model.feature_importances_`) and `display`.

    Parameters
    ----------
    x_train, y_train : training features and binary labels.
    x_test, y_test : held-out features and binary labels.
    boost_type : {'lgb', 'xgb'}
        'lgb' builds an `lgb.LGBMClassifier`, 'xgb' an `XGBClassifier`.

    Returns
    -------
    tuple
        `(model, best_prob)`: the fitted model and the selected threshold.

    Raises
    ------
    ValueError
        If `boost_type` is neither 'lgb' nor 'xgb'. Previously this left
        `model` unbound and failed later with a NameError.
    """
    if boost_type not in ('lgb', 'xgb'):
        raise ValueError(
            f"Unsupported boost_type {boost_type!r}; expected 'lgb' or 'xgb'")

    tStart = time.time()
    if boost_type == 'lgb':
        model = lgb.LGBMClassifier(
            boosting_type='gbdt',
            objective='binary',
            learning_rate=0.01,
            n_estimators=9000,
            max_depth=8,
            min_child_weight=5,
            scale_pos_weight=9,  # refer: 70
            subsample=0.7,
            colsample_bytree=0.7,
            subsample_freq=1,
            n_jobs=-1)
    else:
        model = XGBClassifier(learning_rate=0.025,
                              tree_method='gpu_hist',
                              n_estimators=6000,
                              max_depth=9,
                              min_child_weight=1,
                              gamma=0,
                              subsample=0.8,
                              colsample_bytree=0.8,
                              objective='binary:logistic',
                              nthread=-1,
                              scale_pos_weight=11,
                              seed=27)
    print('--' * 25)
    print('Start training ...')
    model.fit(x_train, y_train)
    # Probability of the positive class.
    yp_train = model.predict_proba(x_train)[:, 1]
    yp_valid = model.predict_proba(x_test)[:, 1]
    print(
        f'Use time: { np.int_((time.time()-tStart)/60)  } mins\nCaluate prob ...'
    )

    ## probability tune
    # Rows of `mat`: 0 threshold, 1 train F1, 2 test F1, 3 train accuracy,
    # 4 test accuracy; one column per threshold.
    mat = np.zeros([5, 100])
    for threshold in range(100):
        y_pred_train = np.int_(yp_train > threshold * 0.01)
        y_pred_valid = np.int_(yp_valid > threshold * 0.01)
        mat[0, threshold] = round(threshold * 0.01, 2)
        mat[1, threshold] = f1_score(y_train, y_pred_train)
        mat[2, threshold] = f1_score(y_test, y_pred_valid)
        mat[3, threshold] = (y_train == y_pred_train).mean()
        mat[4, threshold] = (y_test == y_pred_valid).mean()

    # Fig1 for F1 (red: train, blue: test)
    sns.pointplot(x=mat[0, :], y=mat[1, :], color='r')
    sns.pointplot(x=mat[0, :], y=mat[2, :], color='b')
    plt.title(f'{boost_type} F1 performance', color='r')
    plt.show()

    # Fig2 for acc, skipping the first ten thresholds
    sns.pointplot(x=mat[0, 10:], y=mat[3, 10:], color='r')
    sns.pointplot(x=mat[0, 10:], y=mat[4, 10:], color='b')
    plt.title(f'{boost_type} Acc performance', color='r')
    plt.show()
    print('--' * 20)

    # result for best probability: the threshold maximising test F1
    best_idx = np.argmax(mat[2, :])
    best_prob = round(best_idx * 0.01, 2)
    print('Valid Result:\nprob: {}, F1 : {}, acc : {}'.\
          format(best_prob, mat[2, best_idx].round(3), mat[4, best_idx].round(3)))
    print('--' * 20)

    # confusion matrix
    y_pred_train = np.int_(yp_train > best_prob)
    y_pred_valid = np.int_(yp_valid > best_prob)
    print('Train confusion matrix')
    display(
        pd.crosstab(y_train, y_pred_train, margins=True, margins_name="Total"))
    print('--' * 20)
    print('Valid confusion matrix')
    display(
        pd.crosstab(y_test, y_pred_valid, margins=True, margins_name="Total"))
    print('--' * 20)

    print('Feature Importance (Top 10)')
    display(pd.DataFrame({'feature':feature,'gain':model.feature_importances_}).\
        sort_values(by='gain',ascending=False).iloc[0:10,:])
    print('--' * 25)
    return model, best_prob
pd.cut(data_train['Age'], bins=5)
#[(0.34, 16.336] < (16.336, 32.252] < (32.252, 48.168] < (48.168, 64.084] < (64.084, 80.0]]
pd.cut(data_test['Age'], bins=5)
#[(0.0942, 15.336] < (15.336, 30.502] < (30.502, 45.668] < (45.668, 60.834] < (60.834, 76.0]]

# Now we will create age bins for the full dataset
ages = data_full['Age']
# Two sentinel ages (0 and 80) are added so the 8 equal-width bins span that
# range; pd.concat replaces Series.append, which pandas 2.0 removed.
ages = pd.concat([ages, pd.Series([0, 80])])
bins = pd.cut(ages, bins=8, labels=[1, 2, 3, 4, 5, 6, 7, 8])

# Drop the two sentinel entries again before storing the bin numbers.
data_full['AgeBin'] = bins[:-2].astype(float)

fig = plt.figure(figsize=(15, 8))
sns.pointplot(x='AgeBin',
              y='Survived',
              ci=95.0,
              hue='Sex',
              data=data_full,
              dodge=True)

# From all this it looks like it would be good to create a dummy for kids, since AgeBin does not provide much differentiation except in the AgeBin == 1 category.

# In[ ]:

# IsKid is 1 for rows in the first age bin and 0 otherwise.
data_full['IsKid'] = 0
data_full.loc[data_full['AgeBin'] == 1, 'IsKid'] = 1

# ** Dealing with missing values in the age feature **
#
# Age is the feature that has the most missing values after the cabin feature. In this case I want to fill the agebin with the mode of the agebin based on the title of the person. This especially seems to make sense for Master/Mr and Mrs/Miss where the title allows us to extract information about age based instead of just picking at random.

# In[ ]:
Esempio n. 50
0
    t_p1 = (start_hrf + t_p) * x_bins / max_val_x
    r_t1 = (start_hrf + r_t) * x_bins / max_val_x
    #
    d_p2 = d_p1 + sec_hdrf * x_bins / max_val_x
    t_p2 = t_p1 + sec_hdrf * x_bins / max_val_x
    r_t2 = r_t1 + sec_hdrf * x_bins / max_val_x

    y_vl_min = df_all_by_subj.Decoding_error.min()
    y_vl_max = df_all_by_subj.Decoding_error.max()

    range_hrf = [float(5) / x_bins, float(6) / x_bins]  #
    paper_rc = {'lines.linewidth': 2, 'lines.markersize': 2}
    sns.set_context("paper", rc=paper_rc)
    sns.pointplot(x='timepoint',
                  y='Decoding_error',
                  hue='ROI',
                  data=df_all_by_subj,
                  size=5,
                  aspect=1.5)
    ##all subj visual
    paper_rc = {'lines.linewidth': 0.25, 'lines.markersize': 0.5}
    sns.set_context("paper", rc=paper_rc)
    for a in ['visual', 'ips']:
        if a == 'visual':
            c = 'b'
        elif a == 'ips':
            c = 'darkorange'
        for s in df_all_by_subj.subj.unique():
            sns.pointplot(
                x='timepoint',
                y='Decoding_error',
                data=df_all_by_subj.loc[(df_all_by_subj['ROI'] == a)
Esempio n. 51
0
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

if __name__ == '__main__':
    exp_name = 'output/exp_p3_w4_s4_deadline'

    # Experiment-wide metrics: one CSV with 'epoch', 'accuracy' and
    # 'duration' columns.
    general_file = f'{exp_name}-general_data.csv'
    print(f'Loading data file: {general_file}')
    df = pd.read_csv(general_file)
    print(df)

    # One figure per metric, each plotted against the epoch.
    for metric, heading in (('accuracy', 'Accuracy per epoch'),
                            ('duration', 'Train time per epoch')):
        plt.figure()
        sns.pointplot(data=df, x='epoch', y=metric)
        plt.title(heading)
        plt.show()

    # Per-client epoch logs, read in client order and stacked into one frame.
    client_files = [
        f'{exp_name}_client1_epochs.csv',
        f'{exp_name}_client2_epochs.csv',
        f'{exp_name}_client3_epochs.csv',
        f'{exp_name}_client4_epochs.csv',
    ]
    dfs = []
    for file in client_files:
        dfs.append(pd.read_csv(file))
    client_df = pd.concat(dfs, ignore_index=True)
Esempio n. 52
0
sns.countplot(x='Gamma', hue='Segmentation', data=s2_tads_all)
plt.xticks(rotation=90)
plt.show()


# Let's compare the lengths of TADs depending on gamma with these two segmentation methods.

# In[16]:

s2_tads_all['Length'] = s2_tads_all['End']-s2_tads_all['Start']
s2_tads_all['Length'] = s2_tads_all['Length'].astype(int)


# In[17]:

# Point plot of Length per Gamma, drawn above the individual points (zorder 15 vs 1).
sns.pointplot(x='Gamma', y='Length', hue='Segmentation', data=s2_tads_all, zorder=15)
sns.stripplot(x='Gamma', y='Length', hue='Segmentation', data=s2_tads_all, jitter=True, zorder=1, alpha=0.5)
# Both calls use the same hue, so only the first two legend handles are kept.
handles, labels = plt.gca().get_legend_handles_labels() # done explicitly because the default legend behaves oddly here
plt.legend(handles[:2], labels[:2], loc='upper right')
plt.xticks(rotation=90)
plt.yscale('log')
plt.show()


# And again, let's look at the Score in the same way.

# In[18]:

sns.pointplot(x='Gamma', y='Score', hue='Segmentation', data=s2_tads_all, zorder=15)
sns.stripplot(x='Gamma', y='Score', hue='Segmentation', data=s2_tads_all, jitter=True, zorder=1, alpha=0.5)
handles, labels = plt.gca().get_legend_handles_labels()
Esempio n. 53
0
    'world_rank', 'year'
]]

# Tag every table with the ranking it came from.
times_plot_data['source'] = 'Times'
shanghai_plot_data['source'] = 'Shanghai'
cwur_plot_data['source'] = 'CWUR'

# parse the first number in rank for data ranges (e.g. "201-300" -> "201")
times_plot_data['world_rank'] = times_plot_data['world_rank'].str.split(
    '-').str[0]
shanghai_plot_data['world_rank'] = shanghai_plot_data['world_rank'].str.split(
    '-').str[0]

# pd.concat replaces the chained DataFrame.append calls (removed in pandas
# 2.0); row order and row labels stay the same.
plot_data = pd.concat(
    [times_plot_data, shanghai_plot_data, cwur_plot_data])
plot_data['world_rank'] = plot_data['world_rank'].astype(int)
ax = sns.pointplot(x='year', y='world_rank', hue='source', data=plot_data)

# Styling

plt.title(my_university_name[0] + " Ranking", fontsize=26)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.ylabel("World Rank", fontsize=26)
plt.xlabel("Year", fontsize=26)
plt.tight_layout()
plt.legend(loc='upper left', fontsize=20)
# Hide the top/right frame lines and keep ticks on the bottom/left only.
ax.spines["top"].set_visible(False)
ax.spines["right"].set_visible(False)
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
Esempio n. 54
0
plt.figure(figsize=(12, 8))
sns.countplot(x="pickup_hour", data=train_df)
plt.ylabel('Count', fontsize=12)
plt.xlabel('pick up hour', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

# The distribution shows the car demand with pick up hour time. After mid night less number's of trips are taken. Now let us see how the trip duration changes with respect to trip time.

# In[ ]:

# Median trip duration for each pickup hour.
grouped_df = train_df.groupby('pickup_hour')['trip_duration'].aggregate(
    np.median).reset_index()
plt.figure(figsize=(12, 8))
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments.
sns.pointplot(x=grouped_df.pickup_hour.values,
              y=grouped_df.trip_duration.values,
              alpha=0.8,
              color=color[3])
plt.ylabel('median trip duration', fontsize=12)
plt.xlabel('pick up hour', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()

# In[ ]:

# Group by day
grouped_df = train_df.groupby('day_week')['trip_duration'].aggregate(
    np.median).reset_index()
plt.figure(figsize=(12, 8))
sns.pointplot(grouped_df.day_week.values,
              grouped_df.trip_duration.values,
              alpha=0.8,
Esempio n. 55
0
temp_series = order_products_prior_df['department'].value_counts()
labels = (np.array(temp_series.index))
sizes = (np.array((temp_series / temp_series.sum()) * 100))
plt.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=200)
plt.title("Departments distribution", fontsize=15)
plt.show()

print 'press any key to continue'
raw_input('')

# Mean of `reordered` per department, i.e. the reorder ratio.
grouped_df = order_products_prior_df.groupby(
    ["department"])["reordered"].aggregate("mean").reset_index()

plt.figure(figsize=(12, 8))
# x and y are passed by keyword: current seaborn releases reject them as
# positional arguments.
sns.pointplot(x=grouped_df['department'].values,
              y=grouped_df['reordered'].values,
              alpha=0.8,
              color=color[2])
plt.ylabel('Reorder ratio', fontsize=12)
plt.xlabel('Department', fontsize=12)
plt.title("Department wise reorder ratio", fontsize=15)
plt.xticks(rotation='vertical')
plt.show()

print 'press any key to continue'
raw_input('')

grouped_df = order_products_prior_df.groupby(
    ["department_id", "aisle"])["reordered"].aggregate("mean").reset_index()
fig, ax = plt.subplots(figsize=(12, 20))
ax.scatter(grouped_df.reordered.values, grouped_df.department_id.values)
for i, txt in enumerate(grouped_df.aisle.values):
# sys.exit(0)

# s = '1_fs__k' if '1_fs__k' in parameters else 'jmi__k_feat'
# s = 'jmi__k_feat'
s = 'clf__n_neighbors'
# s = 'nb__alpha'
# s = 'clf3__gr__group'
# s = 'clf1__fs__k'
# s = 'fs__k'
# s = 'clf3__fs__k'
# s = 'clf3__xgb__n_estimators'

# One figure per tuned parameter: median `result` for each value of that
# parameter, with one line per value of the parameter named by `s`.
for p in parameters:
    plt.figure()
    sns.pointplot(data=df_results,
                  x=p,
                  y='result',
                  hue=s,
                  estimator=np.median)
    plt.ylim(0.6, 0.8)

if '1_clf__C' in parameters:
    plt.figure()
    plt.subplot(311)
    sns.pointplot(x='1_clf__C',
                  y='result_1',
                  data=df_results,
                  estimator=np.median,
                  color='r')
    plt.legend('patient 1')
    plt.grid()
    plt.subplot(312)
    sns.pointplot(x='2_clf__C',
                  y='result_2',
Esempio n. 57
0
# d = rs.normal(size=1000)
# f, axes = plt.subplots(2, 2, figsize=(7, 10), sharex=False)
# sns.distplot(d, kde=False, color="b", ax=axes[0, 0])
# sns.distplot(d, hist=False, rug=True, color="r", ax=axes[0, 1])
# sns.distplot(d, hist=False, color="g", kde_kws={"shade": True}, ax=axes[1, 0])
# sns.distplot(d, color="m", ax=axes[1, 1])
# plt.show()

#eg4 箱型图
# iris = sns.load_dataset("data",data_home="/Users/fangchi/PycharmProjects/python_project/marchineLearning/ch3/3.4")
# sns.boxplot(x = iris['萼片_长度'],y = iris['品种'])
# plt.show()

#eg5
# iris = sns.load_dataset("data",data_home="/Users/fangchi/PycharmProjects/python_project/marchineLearning/ch3/3.4")
# sns.jointplot("萼片_长度", "花瓣_长度", iris)
# plt.show()

# eg6: point plot of the local "data" dataset
# (column '萼片_长度' = sepal length, column '品种' = variety).
iris = sns.load_dataset(
    "data",
    data_home="/Users/fangchi/PycharmProjects/python_project/marchineLearning/ch3/3.4",
)

plt.figure(figsize=(12, 8))

# Pass the raw arrays: first column on the x axis, second on the y axis.
sns.pointplot(
    iris['萼片_长度'].values,
    iris['品种'].values,
    alpha=0.8,
    color='blue',
)

# Axis labels reuse the original column names.
plt.ylabel('品种', fontsize=12)
plt.xlabel('萼片_长度', fontsize=12)
plt.xticks(rotation='vertical')
plt.show()
Esempio n. 58
0
        
       
    fs = []
    for k in range(1):
        for i in ['fc','fs_5dis','fs_6dis']:
            fs+=[i for j in range(fs_ref.shape[1])]
        
    ref = []
    for k in ['Rep_50']:
        ref += [k for i in range(3*fs_ref.shape[1])]
    
    data = pd.DataFrame({'icc':icc, 'icc_msr':icc_msr, 'icc_mse':icc_mse, 'fs':fs, 'ref': ref})
        
    plt.figure(figsize=(20, 10))
    sns.pointplot(x="ref", y="icc_msr", data=data, hue= 'fs', dodge=0.53, join=False, palette="dark",markers="d", scale=.75, ci='sd',capsize = 0.07)
    sns.stripplot(x="ref", y="icc_msr", data=data, hue= 'fs', size = 3, dodge=0.45, alpha = 0.05).set_title('Edge-wise ICC MSr')
    pt.half_violinplot(x="ref", y="icc_msr", data=data, hue= 'fs',scale = "area",inner = None, offset = 0.03, saturation=0.5)
    plt.legend(ncol=2)       
    plt.savefig(plotd+'icc_msr.png')######
    plt.close()

    plt.figure(figsize=(20, 10))
    sns.pointplot(x="ref", y="icc_mse", data=data, hue= 'fs', dodge=0.53, join=False, palette="dark",markers="d", scale=.75, ci='sd',capsize = 0.07)
    sns.stripplot(x="ref", y="icc_mse", data=data, hue= 'fs', size = 3, dodge=0.45, alpha = 0.05).set_title('Edge-wise ICC MSe')
    pt.half_violinplot(x="ref", y="icc_mse", data=data, hue= 'fs',scale = "area",inner = None, offset = 0.03, saturation=0.5)

    t1,p1 = stats.ttest_ind(icc_mse[0:int(len(icc_mse)/3)],icc_mse[int(len(icc_mse)/3):int(len(icc_mse)/3*2)], nan_policy ='omit', equal_var=False)
    t2,p2 = stats.ttest_ind(icc_mse[0:int(len(icc_mse)/3)],icc_mse[int(len(icc_mse)/3*2):], nan_policy ='omit', equal_var=False)

    plt.text(-0.15,0,'T: '+str(round(t1,5))+'\n'+'P: '+str(round(p1,5)),fontsize=18)
Esempio n. 59
0
    def numerical_pca_egv(df, conf_dict, col1, col2, col3):
        """Run a PCA on the numerical columns of ``df`` and show three plots.

        The columns listed in ``conf_dict['NumericalColumns']`` are
        standardized with ``StandardScaler`` and decomposed into as many
        principal components as there are numerical columns.  Three figures
        are then displayed, each with ``plt.show()``:

        1. a scatter plot of component ``col2`` against component ``col3``,
           colored by ``col1``;
        2. a variable factor map: one arrow per numerical column showing its
           loading on the two chosen components, plus a unit circle;
        3. a scree plot of ``pca.explained_variance_ratio_``.

        Args:
            df: DataFrame containing the numerical columns and ``col1``.
                It is only read; its index is left untouched.
            conf_dict: Mapping with a ``'NumericalColumns'`` entry naming the
                columns to feed into the PCA.
            col1: Column name used as the hue of the scatter plot.  If it is
                not one of the numerical columns it is copied over from ``df``.
            col2: 1-based number of the component on the x axis (anything
                accepted by ``int()`` and ``str()``, e.g. ``1`` or ``'1'``).
            col3: 1-based number of the component on the y axis.

        Returns:
            None.  The function only draws figures.
        """
        numeric_cols = conf_dict['NumericalColumns']
        n_components = len(numeric_cols)

        # Standardize so every column contributes on the same scale.
        scaled = StandardScaler().fit_transform(df[numeric_cols])
        df2 = pd.DataFrame(scaled, columns=numeric_cols)

        pca = PCA(n_components=n_components)
        reduced = pca.fit_transform(df2)

        # Append the principal components for each entry to the dataframe.
        for i in range(n_components):
            df2['PC' + str(i + 1)] = reduced[:, i]

        if col1 not in numeric_cols:
            # df2 was built from a bare array and has a fresh 0..n-1 index, so
            # copy the hue column by position.  This avoids resetting the
            # caller's index in place just to make the two frames align.
            df2[col1] = df[col1].values

        # Show the points in terms of the two selected PCs.  x/y are passed by
        # keyword because newer seaborn releases no longer accept them
        # positionally.
        # NOTE(review): `size` was renamed `height` in newer seaborn releases;
        # switch it once the project's minimum seaborn version is confirmed.
        sns.lmplot(x='PC' + str(col2),
                   y='PC' + str(col3),
                   hue=col1,
                   data=df2,
                   fit_reg=False,
                   scatter=True,
                   size=7)
        plt.show()

        # Plot a variable factor map for the two selected dimensions.
        (fig, ax) = plt.subplots(figsize=(8, 8))
        x_row = int(col2) - 1  # row of pca.components_ for the x axis
        y_row = int(col3) - 1  # row of pca.components_ for the y axis
        for i, name in enumerate(numeric_cols):
            load_x = pca.components_[x_row, i]
            load_y = pca.components_[y_row, i]
            # Arrow from the origin to the column's loading on both components.
            ax.arrow(0, 0, load_x, load_y, head_width=0.05, head_length=0.08)
            # Offset the label slightly so it does not sit on the arrow head.
            plt.text(load_x + 0.05, load_y + 0.05, name)

        an = np.linspace(0, 2 * np.pi, 100)
        plt.plot(np.cos(an), np.sin(an))  # Add a unit circle for scale
        plt.axis('equal')
        ax.set_title('Variable factor compass')
        plt.show()

        # Do a scree plot (components are labelled 0..n_components-1).
        ind = np.arange(0, n_components)
        (fig, ax) = plt.subplots(figsize=(8, 6))
        sns.pointplot(x=ind, y=pca.explained_variance_ratio_, ax=ax)
        ax.set_title('Scree plot')
        ax.set_xticks(ind)
        ax.set_xticklabels(ind)
        ax.set_xlabel('Component Number')
        ax.set_ylabel('Explained Variance')
        plt.show()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
### matplotlib inline

# Load the training and test tables from the input directory.
data_train = pd.read_csv('../input/train.csv')
data_test = pd.read_csv('../input/test.csv')

# Leftover preview call: the sampled rows are neither stored nor printed.
data_train.sample(3)

# Survival by port of embarkation, split by sex.
sns.barplot(data=data_train, x="Embarked", y="Survived", hue="Sex")

# Survival by passenger class, split by sex, with one color, marker and
# line style per hue level.
sns.pointplot(
    data=data_train,
    x="Pclass",
    y="Survived",
    hue="Sex",
    palette=dict(male="blue", female="pink"),
    markers=["*", "o"],
    linestyles=["-", "--"],
)


def simplify_ages(df):
    df.Age = df.Age.fillna(-0.5)
    bins = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    group_names = [
        'Unknown', 'Baby', 'Child', 'Teenager', 'Student', 'Young Adult',
        'Adult', 'Senior'
    ]
    categories = pd.cut(df.Age, bins, labels=group_names)
    df.Age = categories