def get_hara_stats(df):
    # Computes summary statistics of the Haralick features.
    # Takes a dataframe with Haralick features and breeds.
    x = list(range(1,14))
    xs = []
    haraFG = []
    breed = []
    for _, a in df.iterrows():
        xs.append(x)
        haraFG.append(a.fgHaralick)
        breed.append([a.breed] * 13)
    
    newDF = pd.DataFrame(columns=['Haralick feature', 'Haralick FG feature value', 'breed'])
    newDF['Haralick feature'] = np.array(xs).flatten()
    newDF['Haralick FG feature value'] = np.array(haraFG).flatten()
    newDF['breed'] = np.array(breed).flatten()
    stds = []
    for i in x:
        stds.append(newDF[newDF['Haralick feature']==i]['Haralick FG feature value'].std()
                    / newDF[newDF['Haralick feature']==i]['Haralick FG feature value'].mean())
    
    data = np.vstack((np.array(x), np.array(stds))).T
    pltDF = pd.DataFrame(columns=['Haralick feature', 'relative standard deviation'], data=data)
    sns.lmplot(x='Haralick feature', y='relative standard deviation', data=pltDF, fit_reg=False)
    plt.xticks(x)
    plt.show()
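# For reference: the std/mean loop in get_hara_stats can be written as a single
# groupby. A minimal sketch (same column names as above; not part of the original):
def relative_std_per_feature(newDF):
    # std/mean of the FG feature value within each Haralick feature index
    return (newDF.groupby('Haralick feature')['Haralick FG feature value']
                 .agg(lambda s: s.std() / s.mean()))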
Example No. 2
 def finalLayer(self, X, y, epochs=1):
     print "Final Layer" 
     V = self.predict(X)
     softmax = Layers.SoftmaxLayer(self.Layers[-1].W.shape[1], y.shape[1]) 
     
     #########################
     # Final layer of THE MODEL #
     #########################
     batchsize = X.shape[0]
     softmax, dA_avg_perm = Trainer().train([softmax], V, y, epochs, batchsize)
     self.Layers.append(softmax[0])
     
     ################
     # Format
     ################
     LLdata = [float(L) for L in dA_avg_perm]
     LLiter = [float(it) for it in range(epochs)]
     dfpredata = pd.DataFrame( LLdata )
     dfpredata.columns = ['dA_avg_costs']
     dfpredata['iter'] = LLiter
     
     ############
     # Plot the cost: the cost we minimize during training is the
     # negative log likelihood.
     ############
     plt.figure()
     sns.lmplot('iter', 'dA_avg_costs', data=dfpredata, fit_reg=False)
     plt.xlabel('epoch', fontsize=14)
     plt.ylabel('softmax error', fontsize=14)
     plt.title('softmax_train_epochs_'+str(epochs), fontsize=9)
     plt.savefig('grid_searchResults/'+'softmax_train_'+str(self.item)+'.png')
Example No. 3
 def fine_tune(self, X, y, epochs=1):
     print "Fine Tunning" 
     #########################
     # Fine Tuning THE MODEL  #
     #########################
     batchsize = X.shape[0]
     self.Layers, dA_avg_perm = Trainer().train(self.Layers, X, y, epochs, batchsize)
     
     ################
     # Format
     ################
     LLdata = [float(L) for L in dA_avg_perm ]
     LLiter = [float(it) for it in range(epochs)]
     dfinedata = pd.DataFrame( LLdata )
     dfinedata.columns = ['dA_avg_costs']
     dfinedata['iter'] = LLiter
     
     ############
     # Plot the cost: the cost we minimize during training is the
     # negative log likelihood.
     ############
     plt.figure()
     sns.lmplot('iter', 'dA_avg_costs', data=dfinedata, fit_reg=False)
     plt.xlabel('epoch', fontsize=14)
     plt.ylabel('finetune error', fontsize=14)
     plt.title('fine_tune_structure_'+str([self.structure])+'_train_epochs_'+str(epochs), fontsize=9)
     plt.savefig('grid_searchResults/'+'fine_tune_structure_'+str(self.item)+'.png')
Example No. 4
 def visualize_data(self):
     """
     Transforms the DataFrame to two dimensions and visualizes the data. The first tags are used as labels.
     :return:
     """
     logging.debug("Preparing visualization of DataFrame")
     # Reduce dimensionality to 2 features for visualization purposes
     X_visualization = self.reduce_dimensionality(self.X, n_features=2)
     df = self.prepare_dataframe(X_visualization)
     # Set the X and Y coordinates for each article
     df['X coordinate'] = df['coordinates'].apply(lambda x: x[0])
     df['Y coordinate'] = df['coordinates'].apply(lambda x: x[1])
     # Create a list of markers, each tag has its own marker
     n_tags_first = len(self.df['tags_first'].unique())
     markers_choice_list = ['o', 's', '^', '.', 'v', '<', '>', 'D']
     markers_list = [markers_choice_list[i % 8] for i in range(n_tags_first)]
     # Create scatter plot
     sns.lmplot("X coordinate",
                "Y coordinate",
                hue="tags_first",
                data=df,
                fit_reg=False,
                markers=markers_list,
                scatter_kws={"s": 150})
     # Adjust borders and add title
     sns.set(font_scale=2)
     sns.plt.title('Visualization of TMT articles in a 2-dimensional space')
     sns.plt.subplots_adjust(right=0.80, top=0.90, left=0.12, bottom=0.12)
     # Show plot
     sns.plt.show()
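# The reduce_dimensionality helper used above is not shown in this excerpt; a
# minimal standalone sketch (assuming a PCA projection, which the class may or
# may not actually use):
from sklearn.decomposition import PCA

def reduce_dimensionality(X, n_features=2):
    # Project X onto its first n_features principal components.
    return PCA(n_components=n_features).fit_transform(X)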
Example No. 5
def deal_rs():
    data_set = pd.read_csv('datas/result.csv')
    data_set.columns = ['AvH', 'AvD', 'AvA', 'Hc', 'Dc', 'Ac', 'R', 'P']

    sns.set(style='ticks')
    sns.lmplot(x='R', y='P', data=data_set)
    sns.plt.show()
Example No. 6
def view_timeline(df, x="unix_time", y="rate", plt=plt):
    asset = df.symbol.values[0]
    plt.figure(1, figsize=(15,15))
    sns.lmplot(x=x, y=y, hue="type", data=df, palette=dict(sell="r", buy="g"))
    plt.ylim(df[y].min(), df[y].max())
    plt.title(y+' over time ('+asset+')')
    plt.show()
Example No. 7
def plot_compare_median_consensus(output_dir, df_order, metric, type='ts', DISPLAY=0):
    plt.figure()


    if type =='ts':
        #sb.tsplot(data=df_order, value=metric,time='order',unit="algorithm",condition="algorithm",err_style="unit_traces")

        ax = sb.boxplot(x=metric, y="algorithm", data=df_order,
                 whis=np.inf, color="c")

        # Add in points to show each observation
        sb.stripplot(x=metric, y="algorithm", data=df_order,
                jitter=True, size=3, color=".3", linewidth=0)
        ax.set_xscale("log")
        sb.despine(trim=True)

       # plt.xlabel('images sorted by the average neuron distance of the median reconstruction')
        plt.savefig(output_dir + '/ts_compare_median_with_consensus_'+metric+'.png', format='png')


    if type =='lm':
        sb.lmplot(x="order", y=metric, hue="algorithm", data=df_order)
        plt.xlabel('images sorted by the average neuron distance of the median reconstruction')
        plt.savefig(output_dir + '/lm_compare_median_with_consensus_'+metric+'.lm.png', format='png')

    if DISPLAY:
         plt.show()
    plt.close()
Example No. 8
def vcf_stats(vcfin, outdir, sample):
    inp = vcf.Reader(open(vcfin))
    variants = list()
    ref = list()
    alt = list()
    basename = os.path.splitext(os.path.basename(vcfin))[0]
    genotype = {'0/0': 'Homozygous Reference', '0/1': 'Heterozygous', '1/1': 'Homozygous Alternate', '1/2': 'Non-reference Heterozygous'}
    for lines in inp:
        try:
            var = {'Chrom':lines.CHROM,'Pos':lines.POS, 'Ref':lines.REF, 'Alt': ','.join([str(alt) for alt in lines.ALT]),
                 'Sample':sample, 'Genotype': genotype[lines.genotype(sample)['GT']],'Depth_at_reference':lines.genotype(sample)['AD'][0],
                'Depth_at_alternate':lines.genotype(sample)['AD'][1]}
            variants.append(var)
        except KeyError:
            continue
    variants = pd.DataFrame(variants)
    plt.figure()
    sns.set(style='ticks', context='talk')
    sns.lmplot('Depth_at_reference','Depth_at_alternate',hue='Genotype', data=variants, fit_reg=False)
    plt.xlim([0,max([max(variants.Depth_at_reference), max(variants.Depth_at_alternate)])])
    plt.ylim([0,max([max(variants.Depth_at_reference), max(variants.Depth_at_alternate)])])
    plt.xlabel('Depth at reference allele')
    plt.ylabel('Depth at alternate allele')
    plt.title('Allelic depth distribution')
    plt.savefig(basename+'_allele_depth')
    plt.close()
    return 
Example No. 9
def plot_building_temp():
    sns.set_context("paper", font_scale=1.5)
    b = 'AZ0000FF'
    s = 'KTUS'
    filelist = glob.glob(os.getcwd() + '/csv_FY/testWeather/{0}*.csv'.format(b))
    dfs = [pd.read_csv(csv) for csv in filelist]
    col = 'eui_gas'
    dfs2 = [df[[col, 'month', 'year']] for df in dfs]
    df3 = (pd.concat(dfs2))

    temp = pd.read_csv(os.getcwd() + '/csv_FY/weather/weatherData_meanTemp.csv')
    temp['year'] = temp['Unnamed: 0'].map(lambda x: float(x[:4]))
    temp['month'] = temp['Unnamed: 0'].map(lambda x: float(x[5:7]))
    temp.set_index(pd.DatetimeIndex(temp['Unnamed: 0']), inplace=True)
    temp = temp[[s, 'month', 'year']]
    joint2 = pd.merge(df3, temp, on = ['year', 'month'], how = 'inner')
    joint2.to_csv(os.getcwd() + '/csv_FY/testWeather/test_temp.csv', index=False)

    sns.lmplot(s, col, data=joint2, col='year', fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + '/csv_FY/testWeather/plot/scatter_temp_byyear.png', dpi=150)
    plt.close()

    joint2 = joint2[(2012 < joint2['year']) & (joint2['year'] < 2015)]
    sns.regplot(s, col, data=joint2, fit_reg=False)
    plt.xlim((joint2[s].min() - 10, joint2[s].max() + 10))
    plt.ylim((0, joint2[col].max() + 0.1))
    P.savefig(os.getcwd() + '/csv_FY/testWeather/plot/scatter_temp_1314.png', dpi=150)
    plt.close()
Example No. 10
    def plot_scatter_n_accuracy_joint(self, data_objects, labels, label_self, markers):
        """Make plot from this and other data objects.

        Args:
            data_objects ([Data]): Other Data objects to include in plot.
            labels ([str]): Labels to use for Data_objects.
            label_self (str): Label to use for this Data object.
            markers ([str]): Matplotlib marker styles, one per dataset.

        Returns: Axis object.

        """
        dataframes = [self.df] + [data.df for data in data_objects]
        labels = [label_self] + labels

        statistics = []
        for df, label in zip(dataframes, labels):
            acc = df.groupby('worker')['correct'].mean()
            n = df.groupby('worker')['question'].count()
            df_new = pd.concat([acc, n], axis=1)
            df_new['dataset'] = label
            statistics.append(df_new)

        df = pd.concat(statistics, axis=0)
        sns.lmplot('question', 'correct', data=df, hue='dataset',
                   markers=markers, fit_reg=False)
        plt.xlabel('Number of questions answered')
        plt.ylabel('Accuracy')
        plt.xlim((0, None))
        plt.ylim((0, 1))
        plt.title('')
        return plt.gca()
Example No. 11
def plot_data(data, has_label=True):
	import numpy as np
	import pandas as pd
	import seaborn as sns
	from sklearn.manifold import TSNE
	from sklearn.decomposition import PCA

	if not has_label:
		data = data.copy()
		data['label'] = np.zeros(len(data))

	LIMIT = 4000
	if data.shape[0] > LIMIT:
		dt = data.sample(n=LIMIT, replace=False)
		X = dt.iloc[:, :-1]
		labels = dt.iloc[:, -1]
	else:
		X = data.iloc[:, :-1]
		labels = data.iloc[:, -1]

	tsne_model = TSNE(n_components=2, random_state=0)
	np.set_printoptions(suppress=True)
	points1 = tsne_model.fit_transform(X)
	df1 = pd.DataFrame(data=np.column_stack([points1,labels]), columns=["x","y","class"])
	sns.lmplot("x", "y", data=df1, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	sns.plt.title('t-SNE')

	pca = PCA(n_components=2)
	pca.fit(X)
	points2 = pca.transform(X)
	df2 = pd.DataFrame(data=np.column_stack([points2,labels]), columns=["x","y","class"])
	sns.lmplot("x", "y", data=df2, hue='class', fit_reg=False, palette=sns.color_palette('colorblind'))
	sns.plt.title('PCA')
Example No. 12
def fig2(ppl, fname):
  '''For each contact, plot number of characters sent and received. (UNUSED)'''
  sns.lmplot("lensent", "lenrec",ppl) 
  plt.xlabel('Characters Sent')
  plt.ylabel('Characters Received')
  sns.despine()
  savefig(fname)
Example No. 13
def show_examples(idxs, printStd=True):
    # prints example dataset from supplied indexs, idxs
    # and plots the foreground haralick features
    x = list(range(1,14))
    xs = []
    hara = []
    breed = []
    for idx in idxs:
        a = hNt.iloc[idx]
        xs.append(x)
        hara.append(np.log(abs(a.fgHaralick)))
        breed.append([a.breed] * 13)
        
        if printStd:
            print('breed:', a.breed)
            print('filename:', a.file)
            print('foreground Haralick:', a.fgHaralick)
            print('background Haralick:', a.bgHaralick)
    
    newDF = pd.DataFrame(columns=['Haralick feature', 'log(Haralick feature value)', 'breed'])
    newDF['Haralick feature'] = np.array(xs).flatten()
    newDF['log(Haralick feature value)'] = np.array(hara).flatten()
    newDF['breed'] = np.array(breed).flatten()
    newDF.sort_values(by='breed', inplace=True)
    sns.lmplot(x='Haralick feature', y='log(Haralick feature value)', data=newDF, fit_reg=False, hue='breed')
    plt.xticks(x)
    plt.show()
Example No. 15
def relationship_spearman_size_source(dir, model="logreg3", feats="ecfps1", dset="bcrp"):
    """
    Plots the relationship between the size of the source vs the average relevant Spearman corr coeff. One point per
    source on the plot.
    """
    small_dict = defaultdict(list)
    # list all spearman files
    for f in glob.glob(op.join(dir, "spearmans_*")):
        if "relfeats" in op.basename(f):
            source = op.basename(f).partition("_lso_relfeats_")[2].partition("_logreg")[0]
            print(source)
            small_dict["source"].append(source)
            small_dict["size"].append(len(ManysourcesDataset(dset).mols().sources2molids([source])))
            with open(f, "rb") as reader:
                dict_spearman = pickle.load(reader)
                spearmans = [v[0] for v in dict_spearman.values()]
                small_dict["average spearman"].append(np.mean(spearmans))
    df = pd.DataFrame.from_dict(small_dict)
    import seaborn

    seaborn.set_style("ticks")
    seaborn.set_context("talk")
    seaborn.lmplot(
        "size",
        "average spearman",
        data=df,
        scatter_kws={"marker": "o", "color": "slategray"},
        line_kws={"linewidth": 1, "color": "seagreen"},
    )
    plt.show()
Example No. 16
    def _corr(self, sel, suffix):
        formula = 'model_accuracy ~ human_accuracy'
        logreg = smf.logit(formula=formula, data=sel).fit()
        summ = logreg.summary()
        if self.html is None:
            print(summ)
        else:
            summ = summ.as_html().replace('class="simpletable"',
                                          'class="simpletable table"')

        sel = sel.rename(columns={'human_accuracy': 'human accuracy',
                                  'model_accuracy': 'model accuracy'})

        sns.lmplot('human accuracy', 'model accuracy', data=sel, x_jitter=.01,
                    y_jitter=.05, logistic=True, truncate=True)

        bins = np.digitize(sel['human accuracy'], np.arange(.05,1,.1))
        #bins[bins==11] = 10
        count = sel['model accuracy'].groupby(bins).count()
        mean = sel['model accuracy'].groupby(bins).mean()
        sns.plt.scatter(.1*mean.index, mean, s=10*count, c='.15',
                        linewidths=0, alpha=.8)
        sns.plt.title(models.NICE_NAMES[self.model_name])
        sns.plt.xlim([-.1, 1.1])
        sns.plt.ylim([-.1, 1.1])
        self.show(pref='corr_sil', suffix=self.model_name + '_' + suffix,
                  caption=suffix + summ)
Example No. 17
    def aucplot(cls, df):

        seaborn.lmplot(data=df,
                       x="b",
                       y="auc",
                       lowess=True,
                       size=5,
                       aspect=2) 
Example No. 18
def plot(data, total, title, width=800.0, unit='', dosort=True,
		target=None, target2=None):
	"""A HTML bar plot given a dictionary and max value."""
	if len(data) > 30 and target is not None:
		df = pandas.DataFrame(index=data)
		df[title] = pandas.Series(data, index=df.index)
		df[target.name] = target.loc[df.index]
		if target2 is not None:
			df[target2.name] = target2.loc[df.index]
		if numpy.issubdtype(target.dtype, numpy.number):
			if target2 is None:
				seaborn.jointplot(target.name, title, data=df, kind='reg')
			else:
				seaborn.lmplot(target.name, title, data=df, hue=target2.name)
		else:  # X-axis is categorical
			df.sort_values(by=target.name, inplace=True)
			if target2 is None:
				seaborn.barplot(target.name, title, data=df)
			else:
				seaborn.barplot(target.name, title, data=df, hue=target2.name)
			fig = plt.gcf()
			fig.autofmt_xdate()
		# Convert to D3, SVG, javascript etc.
		# import mpld3
		# result = mpld3.fig_to_html(plt.gcf(), template_type='general',
		# 		use_http=True)

		# Convert to PNG
		figfile = io.BytesIO()
		plt.savefig(figfile, format='png')
		result = '<div><img src="data:image/png;base64, %s"/></div>' % (
				base64.b64encode(figfile.getvalue()).decode('utf8'))
		plt.clf()
		return result

	result = ['<div class=barplot>',
			('<text style="font-family: sans-serif; font-size: 16px; ">'
			'%s</text>' % title)]
	if target is not None:
		data = OrderedDict([(key, data[key]) for key in
				target.sort_values().index if key in data])
	keys = {key.split('_')[0] if '_' in key else key[0] for key in data}
	color = {}
	if len(keys) <= 5:
		color.update(zip(keys, range(1, 6)))
	keys = list(data)
	if dosort:
		keys.sort(key=data.get, reverse=True)
	for key in keys:
		result.append('<br><div style="width:%dpx;" class=b%d></div>'
				'<span>%s: %g %s</span>' % (
				int(round(width * data[key] / total)) if data[key] else 0,
				color.get(key.split('_')[0] if '_' in key else key[0], 1)
					if data[key] else 0,
				htmlescape(key), data[key], unit,))
	result.append('</div>\n')
	return '\n'.join(result)
Example No. 19
def plot_complexity(models=ALL_MODELS, save_path='../resources/cached_model_grid_scores.csv'):
    grid = pd.read_csv(save_path)    
    grid = grid[grid['model_names'].isin(models)]

    plt.figure(figsize=(12,12));
    sns.lmplot(data=grid, x='time_to_train', y='params', 
               hue='model_names', fit_reg=False, legend=False);
    plt.legend(bbox_to_anchor=(1.05, 1), loc='lower right', borderaxespad=0.);
    plt.show();
Example No. 20
def plotScatterLabelled(data, x_param, y_param, huey, output_path, output_directory, output_filename):
	sns.lmplot(x_param, y_param, data, hue=huey, fit_reg=False);
	output_ = "%s/%s/%s" % (output_path, output_directory, output_filename)
	try:
		plt.savefig(output_)
	except IOError:
		os.makedirs('%s/%s/' % (output_path, output_directory))
		plt.savefig(output_)	
	plt.close()
Example No. 21
def grafico_l2(conjunto, xl=None, yl=None, titulox="", tituloy="", titulo="", filename="", tamanho=5):
    a = np.array(conjunto[0].map(_dic_cruzes))
    b = np.array(conjunto[1].map(_dic_cruzes))
    c = DataFrame([a, b]).transpose()
    c.columns = ["A", "B"]
    sns.lmplot("A", "B", c, x_jitter=0.2, y_jitter=0.3, size=tamanho)
    plt.title(titulo, fontsize=16)
    sns.axlabel(titulox, tituloy, fontsize=fontetamanho)
    plt.savefig(filename)
Example No. 22
def seabornScatterPlot(data,xName,yName,titleIn):
    '''
        seabornScatterPlot plots a scatter plot using seaborn.

    :param data: DataFrame containing the columns to plot
    :param xName: name of the x axis column
    :param yName: name of the y axis column
    :param titleIn: plot title
    '''
    sns.lmplot(xName, yName, data, palette="Set1", fit_reg=False);
    plt.title(titleIn);
Example No. 23
def lm_plot(df, dep_var, indep_var, grpby, units):
    if grpby:
        seaborn.lmplot(x=indep_var, y=dep_var, data=df, hue=grpby, fit_reg=False)
    else:
        seaborn.lmplot(x=indep_var, y=dep_var, data=df,fit_reg=False)
    
    #seaborn.lmplot(x=indep_var, y=dep_var, data=df, fit_reg=False)
    #would be great to figure out how to remove '_cat'    
    plt.xlabel(indep_var)
    plt.ylabel(dep_var + ", " + units)   
    plt.title("Scatterplot of " + dep_var + " versus " + indep_var)  
    plt.savefig(wd + "Scatterplot_" + dep_var + "_vs_"+ indep_var + '.png')
    plt.close()
Example No. 24
def plot_avg_width_exp(mark="H3K4me3"):
    def get_90quantile(arr):
        arr.sort()
        return arr[int(0.9*len(arr))]
    gene_id, EID_list, exp_matrix = get_gene_exp_matrix()
    _, len_dict = get_len_num(mark)
    quantile_arr = np.array([get_90quantile(len_dict[EID]) for EID in EID_list])
    gene_avg = np.mean(exp_matrix, axis=0)
    print(quantile_arr)
    print(gene_avg)
    result = pd.DataFrame({'quantile90':quantile_arr,"gene_avg":gene_avg})
    sns.lmplot('quantile90','gene_avg',result)
    plt.show()
Example No. 25
 def plot_scatter_n_accuracy(self):
     ax = plt.gca()
     acc = self.df.groupby('worker')['correct'].mean()
     n = self.df.groupby('worker')['question'].count()
     condition = self.df.groupby('worker')['condition'].first()
     df = pd.concat([acc, n, condition], axis=1)
     sns.lmplot('question', 'correct', data=df, hue='condition',
                fit_reg=False)
     plt.xlabel('Number of questions answered')
     plt.ylabel('Accuracy')
     plt.xlim((0, None))
     plt.ylim((0, 1))
     plt.title('')
     return ax
Example No. 26
    def createScatter(self, event):
        dlg = GraphDialog(self.parent, "Scatterplot Input", ("X", "Y"),
                size=(700, 200), groups=False)
        regress = wx.CheckBox(dlg, label="Add Regression Polynomial?")
        regress.SetValue(True)
        jitter = wx.CheckBox(dlg, label="Jitter?")
        jitter.SetValue(False)
        dlg.Add(jitter)
        ci = dlg.AddSpinCtrl("Confidence (>=100 for None)", 0, 101, 95)
        order = dlg.AddSpinCtrl("Polynomial Degree", 1, 10, 1)

        regress.Bind(wx.EVT_CHECKBOX, 
            lambda e: ci.Enable(regress.GetValue()) and order.Enable(regress.GetValue()))
        dlg.Add(regress)

        if dlg.ShowModal() == wx.ID_OK:
            ds = dlg.GetName()
            dlg.Destroy()
            regress, ci = regress.GetValue(), ci.GetValue()
            order, jitter = order.GetValue(), jitter.GetValue()

            data = self.parent.data[list({b for bs in ds for b in bs})].astype(float)
            snData = pd.DataFrame()
            for x, y in ds: # Deals with silly SNS stuff
                d = {"x":data[x], "y":data[y], "group":np.repeat(y, len(data[x]))}
                d = pd.DataFrame(d)
                snData = snData.append(d, ignore_index=True)

            if jitter:
                xjitter = snData["x"].std() / 4
                yjitter = snData["y"].std() / 4
            else:
                xjitter, yjitter = 0, 0

            try:
                if ci < 100 and regress:
                    sns.lmplot("x", "y", snData, hue="group", ci=ci, order=order, 
                            x_jitter=xjitter, y_jitter=yjitter)
                else:
                    sns.lmplot("x", "y", snData, fit_reg=regress, ci=None, order=order,
                            x_jitter=xjitter, y_jitter=yjitter)
                plt.show()
            except np.RankWarning:
                dlg = wx.MessageDialog(self.parent, "Polynomial Degree Too High",
                        style = wx.OK | wx.ICON_ERROR)
                dlg.ShowModal()
                dlg.Destroy()
                plt.show()
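# Note: the except np.RankWarning branch above only fires if warnings are
# promoted to exceptions beforehand, e.g. with the following (an assumption;
# the original module may configure this elsewhere):
import warnings
import numpy as np
warnings.simplefilter('error', np.RankWarning)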
Example No. 27
File: grapher.py  Project: rht/bssim
 def latmeanbw(self):
     # take log of bw array for better sizing
     self.load_block_times()
     normbws = np.array(self.df.bandwidths) 
     g = sns.lmplot("latencies", "means", data=self.df[['latencies', 'means']], scatter_kws={"s": np.log2(normbws) * 10, "alpha" : .5})
     g.set(ylim=(0, 400))
     g = self.with_title(g)
Example No. 28
File: grapher.py  Project: rht/bssim
    def bttime(self):
        print('loading block time vs time...')
        # get block_time rows for most recent run
        self.cur.execute('SELECT timestamp, time, runid FROM block_times where runid=(select max(runid) from runs)')
        rows = self.cur.fetchall()
        rid = (rows[0][2],)

        # get tuple reflecting run config to show under graph
        self.cur.execute('SELECT * FROM runs where runid=?', rid)
        config = self.cur.fetchone()
        config = list(map(str, config))
        names = [i[0] for i in self.cur.description]
        desc = str(list(zip(names, config)))

        timestamps = []
        times = []
        for ts, time, rid in rows:
            timestamps.append(ts)
            times.append(time)
        
        timedf = pd.DataFrame.from_dict({'timestamps' : timestamps, 'times' : times})
        # change nanosecond timestamps to seconds
        timedf['timestamps'] = timedf['timestamps'].astype(float) / (1000 * 1000)
        g = sns.lmplot("timestamps", "times", data=timedf)
        print(desc)
        g.ax.set_title(self.wl)
        g.set_axis_labels("time (seconds)", "block times (ms)")
Example No. 29
File: grapher.py  Project: rht/bssim
    def latdur(self):
        print('latency vs duration')
        filtered = util.lock_float_field(self.df, 'bandwidths', self.bws)
        if filtered is None:
            return self.latmeanbw()

        g = sns.lmplot("latencies", "durations", data=filtered[['latencies', 'durations', 'bandwidths']].astype(float), col='bandwidths')
Example No. 30
def draw_boundary(power, l):
    """
    power: polynomial power for mapped feature
    l: lambda constant
    """
    density = 1000
    threshhold = 2 * 10**-3

    final_theta = feature_mapped_logistic_regression(power, l)
    x, y = find_decision_boundary(density, power, final_theta, threshhold)

    df = pd.read_csv('ex2data2.txt', names=['test1', 'test2', 'accepted'])
    sns.lmplot('test1', 'test2', hue='accepted', data=df, size=6, fit_reg=False, scatter_kws={"s": 100})

    plt.scatter(x, y, c='r', s=10)
    plt.title('Decision boundary')
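# The helper find_decision_boundary used above is not part of this excerpt. A
# minimal sketch, assuming a feature_mapping(x1, x2, power) helper (hypothetical
# here) that returns the polynomial feature matrix as an ndarray:
import numpy as np

def find_decision_boundary(density, power, theta, threshhold):
    # Evaluate the decision value on a dense grid and keep the points where
    # |x . theta| falls below the threshold, i.e. points near the boundary.
    t1 = np.linspace(-1, 1.5, density)
    t2 = np.linspace(-1, 1.5, density)
    grid = np.array([(u, v) for u in t1 for v in t2])
    mapped = feature_mapping(grid[:, 0], grid[:, 1], power)  # hypothetical helper
    near_boundary = np.abs(mapped @ theta) < threshhold
    return grid[near_boundary, 0], grid[near_boundary, 1]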
Example No. 31
sns.boxplot(data=[data1, data2], whis=np.inf)
plt.show()

sns.violinplot(data=[data1, data2])
plt.show()

# Let's try a finer bandwidth.
sns.violinplot(data=data2, bw=0.01)
plt.show()


# Seaborn ships with sample datasets.
tips = sns.load_dataset("tips")
tips.head()

sns.lmplot("total_bill", "tip", tips, size=10)
plt.show()


# Parameters can be changed per plot.
sns.lmplot("total_bill", "tip", tips,
           scatter_kws={'marker': 'o', 'color': 'indianred', 's': 10},
           line_kws={'linewidth': 1, 'color': 'blue'})

plt.show()

# You can also fit a fourth-order regression curve.
sns.lmplot("total_bill", "tip", tips, order=4,
           scatter_kws={"marker": "o", "color": "indianred", "s": 8},
           line_kws={"linewidth": 1, "color": "blue"})
plt.show()
Example No. 32
    return data_mean, lower, upper


if __name__ == "__main__":
    df = pd.read_csv('./salaries.csv')

    data = df.values.T[1]
    boots = []
    for i in range(100, 100000, 1000):
        boot = boostrap(data, data.shape[0], i)
        boots.append([i, boot[0], "mean"])
        boots.append([i, boot[1], "lower"])
        boots.append([i, boot[2], "upper"])

    df_boot = pd.DataFrame(boots,
                           columns=['Boostrap Iterations', 'Mean', "Value"])
    sns_plot = sns.lmplot(df_boot.columns[0],
                          df_boot.columns[1],
                          data=df_boot,
                          fit_reg=False,
                          hue="Value")

    sns_plot.axes[0, 0].set_ylim(0, )
    sns_plot.axes[0, 0].set_xlim(0, 100000)

    sns_plot.savefig("bootstrap_confidence.png", bbox_inches='tight')
    sns_plot.savefig("bootstrap_confidence.pdf", bbox_inches='tight')

    #print ("Mean: %f")%(np.mean(data))
    #print ("Var: %f")%(np.var(data))
Example No. 33
    def learn(self, message):
        try:
            for i in range(len(self.connections)):
                print('start learning')

                # First, load the required file
                learningFileName = 'learning_1_saved.sav'
                df = joblib.load(learningFileName)

                # Plot the DataFrame (df) loaded from that file
                sb.lmplot('x',
                          'y',
                          data=df,
                          fit_reg=False,
                          scatter_kws={"s": 150},
                          hue="cluster")
                plt.title('Before')

                # Get the last index number of the existing df
                lastIndex = len(df.index) - 1
                # Extract the two numbers from the message (its format implies
                # "<prefix>_<x>,<y>.") and append the new row with an initial
                # cluster label of 0.
                Xtext = message[message.index('_') + 1:message.index(',')]
                Ytext = message[message.index(',') + 1:message.index('.')]
                inputX = int(Xtext)
                inputY = int(Ytext)
                df.loc[lastIndex + 1] = [inputX, inputY, 0]

                print('Received input: ', inputX, ', ', inputY)
                # Re-run the clustering
                newpoints = df.values
                kmeans = KMeans(n_clusters=5).fit(newpoints)

                # Attach the new 'cluster' labels and plot the learning result.
                df['cluster'] = kmeans.labels_
                sb.lmplot('x',
                          'y',
                          data=df,
                          fit_reg=False,
                          scatter_kws={"s": 150},
                          hue="cluster")
                plt.title('After')

                # Read the center of each cluster
                clusterData = kmeans.cluster_centers_

                # Sort clusterData along the X axis
                sortedCluster = clusterData[clusterData[:, 0].argsort()]

                # Drop the third column (the cluster number)
                finalCutCluster = np.delete(sortedCluster, np.s_[2], axis=1)

                # Save the final results to disk (learning file + cluster centers)
                joblib.dump(df, learningFileName)
                np.save('clusterCenter', finalCutCluster)
                print("Learning is complete!")

                # Finally, send a success message
                newMessage = "complete\n"
                print(type(newMessage))
                self.connections[i].sendall(newMessage.encode())
                print("(", newMessage, ") has been sent to client")
        except:
            pass
Example No. 34
def plot_iris(iris, col1, col2):
    sns.lmplot(x=col1, y=col2, data=iris, hue="Species", fit_reg=False)
    plt.xlabel(col1)
    plt.ylabel(col2)
    plt.title("Iris species shown by colour")
    plt.show()
Example No. 35
g.add_legend();

# Difference between M & F split fractions
sns.kdeplot(data.split_frac[data.gender=='M'], label='men', shade=True)
sns.kdeplot(data.split_frac[data.gender=='W'], label='women', shade=True)
plt.xlabel('split_frac');    

# Bimodal distribution among M & F : Violinplot
# sns.violinplot?  # IPython help lookup (notebook syntax)
sns.violinplot('gender', 'split_frac', data=data, palette=['lightblue', 'lightpink']);

# Violin plot as a function of gender and age
data['age_dec'] = data.age.map(lambda age: 10 * (age//10))
data.head()
sns.violinplot('age_dec', 'split_frac', data=data, palette=['lightblue', 'lightpink']); # OR
sns.violinplot('age_dec', 'split_frac', hue='gender', data=data, palette=['lightblue', 'lightpink']); # OR
sns.violinplot('age_dec', 'split_frac', hue='gender', data=data, split=True, inner='quartile', 
               palette=['lightblue', 'lightpink']); # OR
#
men = (data.gender == 'M')
women = (data.gender == 'W')
with sns.axes_style(style=None):
    sns.violinplot("age_dec", "split_frac", hue="gender", data=data, split=True, inner="quartile",
                   palette=["lightblue", "lightpink"]);
# Elder aged
(data.age>80).sum()    

# regplot to fit a linear regression to the data  automatically
g = sns.lmplot('final_sec', 'split_frac', col='gender', data=data,markers=".", scatter_kws=dict(color='c'))
g.map(plt.axhline, y=0.1, color="k", ls=":");
Example No. 36
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
#%matplotlib inline
customers = pd.read_csv(r'H:\Ecommerce prediction\Datasets\dt1.csv')

customers.head()

sns.set_palette("GnBu_d")
sns.set_style('whitegrid')
sns.jointplot(x='Time on Website', y='Yearly Amount Spent', data=customers)

sns.jointplot(x='Time on App', y='Yearly Amount Spent', data=customers)
sns.jointplot(x='Time on App',
              y='Length of Membership',
              kind="hex",
              data=customers)
sns.pairplot(customers)

sns.lmplot(x='Length of Membership', y='Yearly Amount Spent', data=customers)

plt.show()
Example No. 37
# We print the min and max of each value.
columns = df_dataset.columns
for num in range(1, 31):
    print('Min value of ', columns[num], ' is', min(df_dataset[columns[num]]),
          ' and max value is ', max(df_dataset[columns[num]]), '\n')

# Find the optimal number of neighbors
find_best_K(10, dataset)
# See how the model is evaluated depending on the number of folds.
fold_num_and_accuracy(11, dataset)

df_dataset.dtypes

# Plot the tumor features, classifying tumors as benign or malignant.
df = pd.read_csv(
    'C:/Users/user/Desktop/ERGASIES_&_ARXEIA/Διαχείριση_Γνώσης_2/data.csv')
sns.lmplot(x='radius_mean', y='texture_mean', hue='diagnosis', data=df)
sns.lmplot(x='perimeter_mean', y='smoothness_mean', hue='diagnosis', data=df)
sns.lmplot(x='area_mean', y='compactness_mean', hue='diagnosis', data=df)

# Make predictions on data whose class we already know.
k_nearest_neighbors(dataset, dataset[0:10], 5)  # 1 error.
k_nearest_neighbors(dataset, dataset[0:20], 5)  # 3 errors.
k_nearest_neighbors(dataset, dataset[0:100], 5)  # Out of 100 we get 10 errors.
k_nearest_neighbors(dataset, dataset[0:200], 5)  # 16 errors.

add_new_patient_data_and_predict(df_dataset, dataset, 10)
add_new_patient_data_and_predict(df_dataset, dataset, 20)
add_new_patient_data_and_predict(df_dataset, dataset, 30)
Example No. 38
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
from sklearn.datasets import load_boston

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')
boston = load_boston()
boston_df = DataFrame(boston['data'])
boston_df.columns = boston['feature_names']
boston_df['Price'] = boston['target']
print(boston_df.head())

# plt.hist(boston['target'],bins=50)
# plt.scatter(boston['data'][:,5],boston['target'])
# plt.xlabel('Prices in $1000s')
# plt.ylabel('Number of houses')
# plt.show()

sns.lmplot('RM', 'Price', data=boston_df)
plt.show()
Example No. 39
# sns.lmplot('Flour', 'Sugar', data=recipes, hue='Type', palette='Set1', fit_reg=False, scatter_kws={"s": 70});
# plt.plot(xx, yy, linewidth=2, color='black')
# plt.plot(xx, yy_down, 'k--')
# plt.plot(xx, yy_up, 'k--')
# plt.show()

# create function to predict muffin or cupcake


def muffin_or_cupcake(flour, sugar):
    if (modal.predict([[flour, sugar]])) == 0:
        print("Muffin")
    else:
        print("Cupcake")


muffin_or_cupcake(10, 55)

# plotting Predicted data

sns.lmplot('Flour',
           'Sugar',
           data=recipes,
           hue='Type',
           palette='Set1',
           fit_reg=False,
           scatter_kws={"s": 70})
plt.plot(xx, yy, linewidth=2, color='black')
plt.plot(10, 55, 'yo', markersize=9)
plt.show()
Example No. 40
# Import Data
# ===========
#
# Load data from Google Trends.

data = pd.read_csv('data/GoogleTrendsData.csv',
                   index_col='Date',
                   parse_dates=True)
data.head()

# Show DJIA vs. debt related query volume.
display_charts(data,
               chart_type="stock",
               title="DJIA vs. Debt Query Volume",
               secondary_y="debt")
seaborn.lmplot("debt", "djia", data=data, size=7)

# Detect if search volume is increasing or decreasing in
# any given week by forming a moving average and testing if the current value
# crosses the moving average of the past 3 weeks.
#
# Let's first compute the moving average.

data['debt_mavg'] = data.debt.rolling(window=3, center=False).mean()
data.head()

# Since we want to see if the current value is above the moving average of the
# *preceding* weeks, we have to shift the moving average timeseries forward by one.

data['debt_mavg'] = data.debt_mavg.shift(1)
data.head()
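# The crossing test itself is not shown in this excerpt; a minimal sketch of
# the signal described above (current volume vs. the shifted moving average):
data['signal'] = data['debt'] > data['debt_mavg']  # True = rising interest
data.head()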
Example No. 41
def association_userInput():

    print("\n===============================================================================")
    print("a. Fixed Acidity")
    print("b. Volatile Acidity")
    print("c. Citric Acid")
    print("d. Residual Sugar")
    print("e. Chlorides")
    print("f. Free Sulfur")
    print("g. Dioxide")
    print("h. Total Sulfur Dioxide")
    print("i. Density")
    print("j. pH")
    print("k. Sulphates")
    print("l. Alcohol")
    print("m. Quality")
    print("===============================================================================")

    print("\nPlease select two characteristics from above to test an association for (enter the letter)")
    print("Note: If one of the characteristics you want to test for is quality, it is recommended you choose this characteristic for characteristic 1.")

    characteristics = {
        "a": "fixed acidity", "b": "volatile acidity", "c": "citric acid",
        "d": "residual sugar", "e": "chlorides", "f": "free sulfur",
        "g": "dioxide", "h": "total sulfur dioxide", "i": "density",
        "j": "pH", "k": "sulphates", "l": "alcohol", "m": "quality",
    }

    while True:
        choice1 = input("\nCharacteristic 1: ").lower().strip()
        if choice1 in characteristics:
            choice1 = characteristics[choice1]
            break
        print("\nYou must select only one menu choice from above by typing the letter. Please try again.")

    while True:
        choice2 = input("\nCharacteristic 2: ").lower().strip()
        if choice2 in characteristics:
            choice2 = characteristics[choice2]
            break
        print("\nYou must select only one menu choice from above by typing the letter. Please try again.")


    while True:
        wine_choice = input("\nWould you like to test for red or white wine? (enter 'red' or 'white'): ").strip().lower()

        if wine_choice == "red":
            try:
                WineCharX = choice1
                WineCharY = choice2
                allWines = pd.read_csv('winequality-both.csv', sep=',', header=0)
                red = allWines.loc[allWines['type'] == 'red', :]

                getCorr = scipy.stats.pearsonr(red[WineCharX], red[WineCharY])
                correlation = str(getCorr[0])
                pValue = str(getCorr[1])
                print("\nFor red wine, the correlation between " + WineCharX + " and " + WineCharY + " is: " + correlation)
                print("With p-value of: " + pValue)

                seaborn.lmplot(x=WineCharX, y=WineCharY, data=red)
                plt.xlabel(WineCharX)
                plt.ylabel(WineCharY)
                plt.title("Red Wine: " + WineCharX + " X " + WineCharY)
                plt.show()

            except KeyError:
                print("\nError. Please check that your spelling is correct of the wine characteristic you wish to test.")
            break

        if wine_choice == "white":
            try:
                WineCharX = choice1
                WineCharY = choice2
                allWines = pd.read_csv('winequality-both.csv', sep=',', header=0)
                white = allWines.loc[allWines['type'] == 'white', :]

                getCorr = scipy.stats.pearsonr(white[WineCharX], white[WineCharY])
                correlation = str(getCorr[0])
                pValue = str(getCorr[1])
                print("\nFor white wine, the correlation between " + WineCharX + " and " + WineCharY + " is: " + correlation)
                print("With p-value of: " + pValue)

                seaborn.lmplot(x=WineCharX, y=WineCharY, data=white)
                plt.xlabel(WineCharX)
                plt.ylabel(WineCharY)
                plt.title("White Wine: " + WineCharX + " X " + WineCharY)
                plt.show()

            except KeyError:
                print("\nError. Please check that your spelling is correct of the wine characteristic you wish to test.")
            break

        if wine_choice not in ("red", "white"):
            print("\nYou must enter either 'red' or 'white' based on which wine you want to test associations for. Please try again.")

    while True:
        after = input("\nWould you like to test more associations or return to the main menu? (enter 'test' or 'main'): ").lower().strip()
        if after == "test":
            association_userInput()
            break
        if after == "main":
            break
        else:
            print("\nYou must enter either 'test' or 'main' based on what you want to do. Please try again.")
Example No. 42
medals_all = round(
    medals_all.groupby(['NOC', 'Country']).Medal_Perc.mean(),
    2).reset_index()  #remove season
medals_all.columns = ['NOC', 'Country', 'Medal_Perc']  # remove season
host_medals = games_total_df[['Year', 'Host_NOC',
                              'Host_Medal_Perc']]  #remove season, games
host_medals.columns = ['Year', 'NOC', 'Host_Medal_Perc']  #remove season, games
host_difference = pd.merge(host_medals, medals_all, how='left')

print(host_difference)
print(noc_total_df)

# Plot of difference with hosting
facet = sns.lmplot(data=host_difference,
                   x='Medal_Perc',
                   y='Host_Medal_Perc',
                   robust=True,
                   palette=['C1'])
plt.plot([0, 15], [0, 15], 'black', linewidth=2, linestyle='dashed')
facet.ax.set_xticks(np.arange(0, 15, 2.5))
facet.ax.set_yticks(np.arange(0, 36, 2.5))
plt.text(8, 7, 'x=y')
facet.ax.ticklabel_format(useOffset=False)
facet.ax.set_xlim(left=0)
facet.ax.set_ylim(bottom=0)
plt.title('The difference in percentage of medals won by host countries')
plt.show()

# Get the top 20 countries
noc_colors = sns.color_palette("Paired", n_colors=11)
noc_colors[-1] = (0.0, 0.0, 0.0)
Example No. 43
def main():
    # input_dir = "/Users/odedkushnir/Google Drive/Studies/PhD/Stretch_analysis"
    mutation_lst = ["A>G", "T>C", "G>A", "C>T"] # ["A>G", "T>C", "G>A", "C>T", "A>C", "T>G", "A>T", "T>A", "G>C", "C>G", "C>A", "G>T"]
    input_dir = "C:/Users/odedku/Stretch_analysis"#.format(mutation.replace(">", ""))
    for mutation in mutation_lst:
        # mutation = "A>G"
        mutation_in_stretch = 13
        output_dir = input_dir + "_{0}".format(mutation.replace(">", ""))
        try:
            os.mkdir(output_dir)
        except OSError:
            print("Creation of the directory {0} failed".format(output_dir))
        else:
            print("Successfully created the directory {0}".format(output_dir))

        prefix = "20201012_q38/all_parts.blast"
        p2_1 = pd.read_table(input_dir + "/p2_1/{0}".format(prefix), sep="\t")
        p2_2 = pd.read_table(input_dir + "/p2_2/{0}".format(prefix), sep="\t")
        p5_1 = pd.read_table(input_dir + "/p5_1/{0}".format(prefix), sep="\t")
        p5_2 = pd.read_table(input_dir + "/p5_2/{0}".format(prefix), sep="\t")
        p8_1 = pd.read_table(input_dir + "/p8_1/{0}".format(prefix), sep="\t")
        p8_2 = pd.read_table(input_dir + "/p8_2/{0}".format(prefix), sep="\t")
        p10_1 = pd.read_table(input_dir + "/p10_1/{0}".format(prefix), sep="\t")
        p10_2 = pd.read_table(input_dir + "/p10_2/{0}".format(prefix), sep="\t")
        p12_1 = pd.read_table(input_dir + "/p12_1/{0}".format(prefix), sep="\t")
        p12_2 = pd.read_table(input_dir + "/p12_2/{0}".format(prefix), sep="\t")
        barcode_data = pd.read_csv(input_dir + "/barcode/PrimerID_barcode_Results.csv")
        # Dictionary of passage and number of PrimerID
        data_dict = {"p2_1": [p2_1, 23507], "p2_2": [p2_2, 38726], "p5_1": [p5_1, 17903], "p5_2": [p5_2, 12395],
                     "p8_1": [p8_1, 8666], "p8_2": [p8_2, 9990], "p10_1": [p10_1, 6068], "p10_2": [p10_2, 40623],
                     "p12_1": [p12_1, 9668], "p12_2": [p12_2, 11110]}
        control_id = 27962
        """NOT from memory"""
        passage_lst = glob.glob(input_dir + "/p*")
        for passage in passage_lst:
            passage_num = passage.split("\\")[-1]
            try:
                os.mkdir(output_dir + "/{0}".format(passage_num))
                os.mkdir(output_dir + "/{0}/20201012_q38".format(passage_num))
            except OSError:
                print("Creation of the directory {0}/{1}/20201012_q38 failed".format(output_dir, passage_num))
            else:
                print("Successfully created the directory {0}/{1}/20201012_q38".format(output_dir, passage_num))
        create_crosstab_df(input_dir, output_dir, prefix, data_dict, control_id, mutation, mutation_in_stretch)

        """from memory"""
        passage_lst = glob.glob(input_dir + "/p*")
        crosstab_lst = []
        for passage in passage_lst:
            passage_num = passage.split("\\")[-1]
            crosstab_df = pd.read_pickle(output_dir + "/{0}/20201012_q38/corsstab_df.pkl".format(passage_num))
            crosstab_lst.append(crosstab_df)
        """Creation of the final tables and figs"""
        crosstab_df_all = pd.concat(crosstab_lst, axis=1)
        crosstab_df_all = crosstab_df_all[
            ["Control", "p2_1", "p2_2", "p5_1", "p5_2", "p8_1", "p8_2", "p10_1", "p10_2", "p12_1", "p12_2"]]
        crosstab_df_all = crosstab_df_all.iloc[0:4, 9:]
        crosstab_df_all = crosstab_df_all.transpose()
        crosstab_df_all["Stretch_percentage"] = crosstab_df_all["No._of_reads_with_stretch_{0}".format(mutation)] / \
                                                (crosstab_df_all["No._of_reads_with_stretch_{0}".format(mutation)] +
                                                 crosstab_df_all["No._of_reads_without_stretch_{0}".format(mutation)])
        crosstab_df_all["Stretch_percentage"] = crosstab_df_all["Stretch_percentage"] * 100
        crosstab_df_all.reset_index(inplace=True, drop=False)
        crosstab_df_all = crosstab_df_all.rename(columns={"index": "Sample"})
        crosstab_df_all = crosstab_df_all.merge(barcode_data, on="Sample", how="inner")
        crosstab_df_all["Hyper mutation read frequency/sequenced genome"] = crosstab_df_all["Stretch_percentage"] / \
                                                                            crosstab_df_all["PrimerID_barcode"]
        crosstab_df_all["Hyper mutation read frequency/sequenced genome"] = crosstab_df_all[
            "Hyper mutation read frequency/sequenced genome"].astype(float)
        crosstab_df_all["passage"] = np.where(crosstab_df_all["Sample"] != "Control",
                                              crosstab_df_all.apply(lambda x: str(x["Sample"]).split("_")[0].split("p")[-1],
                                                                    axis=1), 0)
        crosstab_df_all["replica"] = np.where(crosstab_df_all["Sample"] != "Control",
                                              crosstab_df_all.apply(lambda x: str(x["Sample"]).split("_")[-1], axis=1), 1)
        crosstab_df_all["passage"] = crosstab_df_all["passage"].astype(int)
        crosstab_df_all.to_csv(output_dir + "/crosstab_df_all.csv", sep=",")
        mean_crosstab_df_all = crosstab_df_all.groupby("passage", as_index=False).mean()
        mean_crosstab_df_all["sem"] = crosstab_df_all.groupby("passage", as_index=False).sem()[
            "Hyper mutation read frequency/sequenced genome"]
        mean_crosstab_df_all["PrimerID_barcode"] = round(mean_crosstab_df_all["PrimerID_barcode"])
        mean_crosstab_df_all.to_csv(output_dir + "/mean_crosstab_df_all.csv", sep=",")

        try:
            os.mkdir(output_dir + "/figs")
        except OSError:
            print("Creation of the directory {0}/figs failed".format(output_dir))
        else:
            print("Successfully created the directory {0}/figs".format(output_dir))
        slope1, intercept1, r_value1, p_value1, std_err1 = stats.linregress(crosstab_df_all['passage'],
                                                                            crosstab_df_all[
                                                                                'Stretch_percentage'])
        fig1 = sns.lmplot(x="passage", y="Stretch_percentage", data=crosstab_df_all, fit_reg=True,
                          line_kws={'label': "Linear Reg"}, )
        fig1.set(xlabel="Passage", ylabel="Stretch Percentage [%]", xlim=(0, 12))
        ax = fig1.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_1 = "y={0:.3g}x+{1:.3g}\nstderr={2:.3g} Rsq={3:.3g}".format(slope1, intercept1, std_err1, r_value1 ** 2)
        L_labels[0].set_text(label_line_1)
        plt.savefig(output_dir + "/figs/points.png", dpi=300)


        slope2, intercept2, r_value2, p_value2, std_err2 = stats.linregress(mean_crosstab_df_all['passage'],
                                                                            mean_crosstab_df_all[
                                                                                'Stretch_percentage'])
        fig2 = sns.lmplot(x="passage", y="Stretch_percentage", data=mean_crosstab_df_all, fit_reg=True,
                          line_kws={'label': "Linear Reg"}, )
        fig2.set(xlabel="Passage", ylabel="Stretch Percentage [%]", xlim=(0, 12))
        ax = fig2.axes[0, 0]
        ax.legend()
        leg = ax.get_legend()
        leg._loc = 2
        L_labels = leg.get_texts()
        label_line_2 = "y={0:.3g}x+{1:.3g}\nstderr={2:.3g} Rsq={3:.3g}".format(slope2, intercept2, std_err2, r_value2 ** 2)
        L_labels[0].set_text(label_line_2)
        plt.savefig(output_dir + "/figs/mean.png", dpi=300)
Example No. 44
df['Hour'] = df['timeStamp'].apply(lambda time: time.hour)
df['Month'] = df['timeStamp'].apply(lambda time: time.month)
df['Day of Week'] = df['timeStamp'].apply(lambda time: time.dayofweek)

dmap = {0:'Mon',1:'Tue',2:'Wed',3:'Thu',4:'Fri',5:'Sat',6:'Sun'}
df['Day of Week'] = df['Day of Week'].map(dmap)

sns.countplot(x='Day of Week',data=df,hue='Reason',palette='viridis')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

sns.countplot(x='Month',data=df,hue='Reason',palette='viridis')
byMonth = df.groupby('Month').count()
byMonth.head()
byMonth['twp'].plot()

sns.lmplot(x='Month',y='twp',data=byMonth.reset_index())
df['Date']=df['timeStamp'].apply(lambda t: t.date())

df.groupby('Date').count()['twp'].plot()
plt.tight_layout()

df[df['Reason']=='Traffic'].groupby('Date').count()['twp'].plot()
plt.title('Traffic')
plt.tight_layout()

df[df['Reason']=='Fire'].groupby('Date').count()['twp'].plot()
plt.title('Fire')
plt.tight_layout()

df[df['Reason']=='EMS'].groupby('Date').count()['twp'].plot()
plt.title('EMS')
Example No. 45
catMTCARS = ['gear','cyl','am','carb','vs']
mtcars[catMTCARS] = mtcars[catMTCARS].astype('category')

plt.figure(figsize=(5,2))
sns.countplot(data=mtcars, x='gear')

plt.figure(figsize=(3,5))
sns.countplot(data=mtcars, x='gear')
#---
#needs one numeric
g = sns.catplot(data=mtcars, x='gear', y='mpg', hue='am')
g.fig.set_figheight(6)
g.fig.set_figheight(3)

#---
# for sns.lmplot(), use the size and aspect arguments to control the figure size
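# e.g., a minimal sketch ('height' replaces the older 'size' argument in
# recent seaborn releases):
sns.lmplot(data=mtcars, x='wt', y='mpg', height=4, aspect=1.5)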

#
sns.catplot(data=mtcars, x='gear', y='mpg',  hue='am', height=5, aspect=1/1)


#
sns.countplot(data=mtcars, x='gear')
plt.gcf().set_size_inches(4, 3)

#
fig, ax = plt.subplots()
# the size of A4 paper
fig.set_size_inches(5, 4)
sns.violinplot(data=mtcars[['mpg','wt']], inner="points", ax=ax)    
sns.despine()
Example No. 46
def single_data(df):
    sns.lmplot('square', 'price', df, fit_reg=True)
    plt.show()
    print(df.head())
    print(df.info())
Example No. 47
geo = geo[f1]

geo # Dataframe of regions

lifeEx # Dataframe of life expectancy 

dataset1 = lifeEx.merge(geo, how = 'inner',on ='CountryCode')
type(dataset1)

stats.columns = ['CountryName', 'CountryCode', 'BirthRate', 'InternetUsers','IncomeGroup']
type(dataset1)

merged_dataset = stats.merge(dataset1, how = 'inner',on ='CountryCode')
merged_dataset

# Create the Final Dataset 
final_dataset = merged_dataset[['CountryName_x','CountryCode','BirthRate','InternetUsers','IncomeGroup',1960,2013,'Region']]
final_dataset.columns = ['CountryName','CountryCode','BirthRate','InternetUsers','IncomeGroup','Year_1960','Year_2013','Region']
final_dataset

# Visualizations

# Regression Plot : Life Expectancy in 1960 vs BirthRate per Region
vis3 = sns.lmplot(x='BirthRate',y='Year_1960', data = final_dataset ,fit_reg=False, hue = 'Region', height = 10, aspect = 1)

# Regression Plot : Life Expectancy in 2013 vs BirthRate per Region
vis3 = sns.lmplot(x='BirthRate',y='Year_2013', data = final_dataset ,fit_reg=False, hue = 'Region', height = 10, aspect = 1)

# Regression Plot : BirthRate vs Internet Users per Region 
vis3 = sns.lmplot(x='BirthRate',y='InternetUsers', data = final_dataset ,fit_reg=False, hue = 'Region', height = 10, aspect = 1)
Example No. 48
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
# %matplotlib inline

sns.lmplot('weight', 'mpg', data=df, fit_reg=False, aspect=1, size=5, hue='cylinders', col='origin')
plt.show()
Example No. 49
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import numpy as np

sns.set()

df = pd.read_csv("times_bt_podas_mejor_caso.csv")
df_p = pd.read_csv("times_bt_podas_peor_caso.csv")

# Nano -> Milli
df['time'] /= 1000000.0
df_p['time'] /= 1000000.0
df['peor'] = df_p['time']

df.plot('n', ['time', 'peor'], title='Mejor vs Peor')

r = np.corrcoef(df['time'], df['peor'])[0, 1]
print(r)
#out: r = 0.9366349410059747

ax1 = sns.lmplot(x='time', y='peor', data=df)
plt.xlabel("N")
plt.ylabel("tiempo")

plt.show()
Example No. 50
        vectors_set.append(
            [np.random.normal(3.0, 0.5),
             np.random.normal(1.0, 0.5)])

import matplotlib.pyplot as plt
# data manipulation package
import pandas as pd
# visualization package
import seaborn as sns

# Plot the random data
df = pd.DataFrame({
    "x": [v[0] for v in vectors_set],
    "y": [v[1] for v in vectors_set]
})
sns.lmplot("x", "y", data=df, fit_reg=False, size=6)
plt.show()

# K-means algorithm grouping the data into 4 clusters
import tensorflow as tf

# Create a constant tensor from the random data
vectors = tf.constant(vectors_set)
# To pick K points at random from the input, let TensorFlow shuffle the data
# and take the first K as the initial centroids (stored as a 2D tensor)
k = 4
centroids = tf.Variable(tf.slice(tf.random_shuffle(vectors), [0, 0], [k, -1]))
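
# The same initialization sketched in plain NumPy (an illustration, not part
# of the TensorFlow graph): shuffle the points and take the first k.
idx = np.random.permutation(len(vectors_set))[:k]
init_centroids = np.array(vectors_set)[idx]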

# Check the tensor shapes
print(vectors.get_shape())
print(centroids.get_shape())
Ejemplo n.º 51
0
# No outlier, no operation needed for Item_Weight
#full.loc[full['Item_Weight'].isin(outlier),'Item_Weight']=full['Item_Weight'].mean()

# Item_Outlet_Sales
BoxPlot = plt.boxplot(full[0:8522]['Item_Outlet_Sales'])
outlier = BoxPlot['fliers'][0].get_data()[1]
full.loc[full['Item_Outlet_Sales'].isin(outlier), 'Item_Outlet_Sales'] = full[0:8522]['Item_Outlet_Sales'].mean()
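
# What the boxplot fliers correspond to, sketched with the usual 1.5*IQR
# rule (a sanity check on the same column, not part of the pipeline):
s = full[0:8522]['Item_Outlet_Sales']
q1, q3 = s.quantile(0.25), s.quantile(0.75)
iqr = q3 - q1
manual_outliers = s[(s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)]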
    

#-----------------Step 5:Exploration analysis of data---------------------------------------------------------

# Create a copy of the train portion of full and assign it to full1
full1 = full[0:8522].copy()

# Analysing the relation between Item_Weight & Item_Outlet_Sales
sns.lmplot(x='Item_Weight', y='Item_Outlet_Sales', data=full1)

# Analysing the relation between Item_MRP & Item_Outlet_Sales
sns.lmplot(x='Item_MRP', y='Item_Outlet_Sales', data=full1)

# Analysing the relation between Item_Visibility & Item_Outlet_Sales
full2 = full1[(full1['Item_MRP'] >= 240) & (full1['Item_MRP'] <= 241)]

sns.lmplot(x='Item_Visibility', y='Item_Outlet_Sales', data=full2)

# Analysing the relation between Item_Id & Item_Outlet_Sales
# Retrieve the numeric part of Item_Identifier and create a new column
full1['Item_Id'] = full1['Item_Identifier'].str[3:].astype(int)
full2 = full1[(full1['Item_MRP'] >= 240) & (full1['Item_MRP'] <= 241)]
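
# The plot for this last step presumably follows the same pattern as the
# lmplots above (a sketch):
sns.lmplot(x='Item_Id', y='Item_Outlet_Sales', data=full2)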
       
Ejemplo n.º 52
0
        print("Grafica para ver que genero de pelicula obtuvo mas likes en facebook.\n")
        print(separador)
        df.groupby('genres')['movie_facebook_likes'].sum().plot(kind='barh',legend='Reverse',color="green")
        plt.xlabel("Suma de likes")
        plt.show()
    elif opcion =="2":
        print("Grafica para ver el promedio de ganancias.\n")
        print(separador)
        df.gross.groupby(df.genres).mean().plot(kind='pie',cmap="Paired")
        plt.axis("equal")
        plt.ylabel("")
        plt.title("Promedio de ganancias")
        plt.show()
    elif opcion =="3":
        print("Grafica para comparar el presupuesto con la calificacion de la pelicula.\n")
        print(separador)
        df.groupby('budget')['imdb_score'].sum().plot(kind='bar',legend='Reverse',color="Black")
        plt.xlabel("Presupuesto")
        plt.ylabel("Calificación")
        plt.show()
    elif opcion =="4":
        print("Grafica de Dispercion para ver la pelicula con mas likes.\n")
        print(separador)
        sns.lmplot(x="num",y="movie_facebook_likes",data=df,fit_reg=False,hue="num",legend=False,palette="Paired")
        plt.show()
    elif opcion =="5":
        darInicio=False
    else:
        print("Debes de elegir una opción valida\n ")
else:
    print("Programa Terminado.")
Ejemplo n.º 53
0
        https://en.wikipedia.org/wiki/Median_absolute_deviation 
        http://stackoverflow.com/questions/8930370/where-can-i-find-mad-mean-absolute-deviation-in-scipy
    """
    arr = np.ma.array(
        arr).compressed()  # should be faster to not use masked arrays.
    med = np.median(arr)
    return np.median(np.abs(arr - med))


#-------------------------------------------------------------------------------
# Main program.
#-------------------------------------------------------------------------------
if __name__ == "__main__":
    df = pd.read_csv('./customers.csv')
    print((df.columns))
    sns_plot = sns.lmplot(x=df.columns[0], y=df.columns[1], data=df, fit_reg=False)

    sns_plot.axes[0, 0].set_ylim(0, )
    sns_plot.axes[0, 0].set_xlim(0, )

    sns_plot.savefig("s_scaterplot.png", bbox_inches='tight')
    sns_plot.savefig("s_scaterplot.pdf", bbox_inches='tight')

    data = df.values.T[1]

    print((("Mean: %f") % (np.mean(data))))
    print((("Median: %f") % (np.median(data))))
    print((("Var: %f") % (np.var(data))))
    print((("std: %f") % (np.std(data))))
    print((("MAD: %f") % (mad(data))))
Ejemplo n.º 54
0
#fig, axes = plt.subplots(1, 3)

# plot learning rate vs CRPS
#ax = sns.lmplot(x="hp_learning_rate", y="metric_CRPS", hue="task", data=df,)
#ax = sns.scatterplot(data=df, x='hp_learning_rate', y='metric_CRPS', hue='task')
#ax.set(xscale="log")
#ax.set_xlabel("x (learning rate)")
#ax.set_ylabel("y")

height = 4
aspect = 1.2
ax = sns.lmplot(x="hp_learning_rate",
                y="metric_CRPS",
                hue="task",
                ci=None,
                data=df,
                height=height,
                aspect=aspect,
                legend_out=False,
                fit_reg=False)
ax.set(xscale="log", yscale="log")
ax.ax.set_ylim(0.02, )
ax.ax.set_xlabel("x (learning rate)")
ax.ax.set_ylabel("y")

plt.tight_layout()
plt.savefig("y_plot.jpg")
plt.show()

# plot learning rate vs CRPS mapped through psi = Phi^{-1} o F
for task in df.task.unique():
Ejemplo n.º 55
0
from sklearn import svm
import pandas
import matplotlib.pyplot as plt
import seaborn as sns
df = pandas.read_csv('flowers.csv')
df.columns = ['X1', 'X2', 'X3', 'X4', 'Y']
df.head()

from sklearn.model_selection import train_test_split
support = svm.SVC()
X = df.values[:, 0:2]
Y = df.values[:, 4]
#print(Y)
trainX, testX, trainY, testY = train_test_split(X, Y, test_size=0.3)

support.fit(trainX, trainY)
print('Accuracy: \n', support.score(testX, testY))
pred = support.predict(testX)
print("!")

sns.set_context("notebook", font_scale=1.1)
sns.set_style("ticks")
sns.lmplot(x='X1', y='X2', scatter=True, fit_reg=False, data=df, hue='Y')
plt.ylabel('X2')
plt.xlabel('X1')
plt.show()
Ejemplo n.º 56
0
                  ress_HC_meta]).reset_index(drop=True)
sns.pairplot(ress, hue="Label")

#------------------------------------------------------------------------------
from scipy.stats import spearmanr
import seaborn as sns

automl = AutoML_Regression()

lasso_best, _, _, _ = automl.XGBoost(X_train, y_train, X_test, y_test)
lasso_best.fit(X_train, y_train)
y_pred = lasso_best.predict(X_test)

dt = {"True RRS_Brooding": y_test, "Predicted RRS_Brooding": y_pred}
df = pd.DataFrame(dt)
g = sns.lmplot(x="True RRS_Brooding", y="Predicted RRS_Brooding", data=df)
g.set(ylim=(min(y_test), max(y_test)))
g.set(xlim=(min(y_test), max(y_test)))
plt.text(-3.9,
         max(y_test) - 1, r'MSE = %.2f' % (mean_squared_error(y_test, y_pred)))
plt.text(-3.9,
         max(y_test) - 2, r'Corr = %.2f' % (spearmanr(y_test, y_pred)[0]))

plt.scatter(y_pred, y_test, s=8)
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k')
plt.xlim(min(y_test), max(y_test))
plt.ylim(min(y_test), max(y_test))
plt.ylabel('True RRS_Brooding')
plt.xlabel('Predicted RRS_Brooding')
#plt.text(s='Random Forest without Forward variable', x=1,
#            y=2, fontsize=12, multialignment='center')
Ejemplo n.º 57
0
testje.rolling(3).mean().plot(figsize=(20,10), linewidth=5, fontsize=20)
plt.xlabel('Date', fontsize=20)


# In[29]:
testje.diff().plot(figsize=(20,10),linewidth=5,fontsize=20)
plt.xlabel('Date',fontsize=20)


# In[30]:
import seaborn as sns
sns.set()


# In[31]:
sns.lmplot(x='Br_Mean', y='Gr_Mean', fit_reg=False, data=tableJoin, hue='OBJECTID')


# In[32]:
tableJoin.corr()


# In[33]:
tableJoin.groupby(['OBJECTID']).corr()
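
# A sketch pulling out just the Br_Mean-Gr_Mean correlation per group,
# rather than the full per-group correlation matrix:
print(tableJoin.groupby('OBJECTID').apply(lambda g: g['Br_Mean'].corr(g['Gr_Mean'])))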


# In[34]:
test[["Br_Mean","Gr_Mean","We_Mean"]].diff().plot(figsize=(20,10),linewidth=5,fontsize=20)
plt.xlabel('Date',fontsize=20)

Ejemplo n.º 58
0
plot = sns.catplot(x="BldgType", y="SalePrice", data=df, kind="boxen")
plot.savefig(path)

# Still, the type of a dwelling seems like it should be important information.
# Investigate whether BldgType produces a significant interaction with either of the following:

# GrLivArea  - Above ground living area
# MoSold     - Month sold

feature = "GrLivArea"
path = "../../../data/kaggleTutorials/output/figures/miE3.png"
plot = sns.lmplot(
    x=feature,
    y="SalePrice",
    hue="BldgType",
    col="BldgType",
    data=df,
    scatter_kws={"edgecolor": 'w'},
    col_wrap=3,
    height=4,
)
plot.savefig(path)
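
# A numeric companion to the faceted plot (a sketch): if the GrLivArea vs
# SalePrice slope differs by BldgType, the per-group correlations differ too.
print(df.groupby("BldgType").apply(lambda g: g["GrLivArea"].corr(g["SalePrice"])))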

print(mi_scores.head(10))

# Do you recognize the themes here? Location, size, and quality.
# You needn't restrict development to only these top features,
# but you do now have a good place to start.
# Combining these top features with other related features,
# especially those you've identified as creating interactions,
# is a good strategy for coming up with a highly informative set of features to train your model on.
Ejemplo n.º 59
0
# Boxplot for tip by sex
sns.boxplot(x='sex', y='tip', data=tips)
plt.show()

# Scatter plot of total_bill and tip
sns.regplot(x='total_bill', y='tip', data=tips)
plt.show()

############################################
# Facet plots in Seaborn

import seaborn as sns
import matplotlib.pyplot as plt

# Scatter plot of total_bill and tip faceted by smoker and colored by sex
sns.lmplot(x='total_bill', y='tip', data=tips, hue='sex', col='smoker')
plt.show()

# FacetGrid of time and smoker colored by sex
facet = sns.FacetGrid(tips, col="time", row='smoker', hue='sex')

# Map the scatter plot of total_bill and tip onto the FacetGrid
facet.map(plt.scatter, 'total_bill', 'tip')
facet.add_legend()  # with hue, FacetGrid needs an explicit legend
plt.show()

############################################
# Univariate and Bivariate plots in Matplotlib

import matplotlib.pyplot as plt

# Time series of CAT closing price
X = pd.to_datetime(CATdf.Date)
y = CATdf.Close

#plot
plt.plot(X, y)
plt.gcf().autofmt_xdate()
plt.show()

# In[32]:

#Linear plot of Volume and NetUpDown colored by HLcat
#illustrates low-volatility days are more likely to finish net positive
#also higher volume on low- and high-volatility days is more likely to finish net positive

sns.lmplot(x='Volume', y='NetUpDown', data=CATdf, hue='HLcat')

# In[33]:

#Graph DJIA Close with HLdifference and Volume for insight

index = pd.read_csv('djia_df_cat.csv')

index.Date = pd.to_datetime(index.Date)
plt.figure(figsize=(10, 8))
plt.plot(index.Date, index.Close, label="DJIA closing price")
plt.plot(index.Date, index.HLdifference * 10, label="HLDifference")
#scale volume for readability
plt.plot(index.Date, index.Volume / 100000, label="Volume")
plt.legend()
plt.title("DJIA stocks")