예제 #1
0
def aggregate_by_month(coll):
    """Count the documents of ``coll`` per month; print the table and a bar chart.

    Every document returned by ``coll.find()`` is read through
    ``doc['created']['timestamp']``, parsed with the module-level ``DT_FRMT``.
    """
    documents = list(coll.find())
    stamps = [datetime.strptime(doc['created']['timestamp'], DT_FRMT)
              for doc in documents]
    frame = pd.DataFrame(
        dict(month=[month(stamp) for stamp in stamps],
             count=[1] * len(stamps)),
        index=stamps)
    # Every row carries a count of 1, so counting the non-zero entries of a
    # group gives the number of records in that month.
    month_count = frame.groupby('month', as_index=False).aggregate(np.count_nonzero)
    print(month_count)

    chart = ggplot(aes(x='month', y='count'), data=month_count)
    chart = chart + geom_bar(stat='identity')
    chart = chart + labs(title='By Count')
    chart = chart + ylab('Num Records')
    print(chart)
	def timeseriesplots(self):
		"""Save line plots of the 'Base' and 'Remote' columns against time.

		The data comes from importSPOD(datafolder, 1, minTime, maxTime); the
		figures are written to figfolder as 'Base.png' and 'Remote.png'.
		"""
		rawdat = importSPOD(datafolder, 1, minTime, maxTime)
		# A 10-second time grid, padded forward to 1-second resolution.
		coarse_grid = pd.date_range(minTime, maxTime, freq='10s')
		stamps = pd.Series(coarse_grid, index=coarse_grid)
		rawdat['timeStamp'] = stamps.resample('1s', fill_method = 'pad')

		mpl.rcParams['axes.xmargin'] = .25
		font = {'weight' : 'bold', 'size' : 6}
		mpl.rc('font', **font)

		# (column to plot, y-axis label, output file name)
		panels = (
			('Base', 'Base Sensor (V)', 'Base.png'),
			('Remote', 'Remote Sensor (V)', 'Remote.png'),
		)
		for column, axis_label, outfile in panels:
			plot = ggplot(aes(x='timeStamp', y=column), data=rawdat)
			plot = plot + geom_line(color='blue')
			plot = plot + ylab(axis_label)
			plot = plot + xlab('') + ylim(0,5.1)
			plot = plot + scale_x_date(labels='%m/%d %H:00', breaks=date_breaks('6 hours'))
			ggsave(plot = plot, filename = figfolder+outfile, width = 8, height = 3)
예제 #3
0
def test_scale():
    """Smoke-test axis scales: named continuous axes, ylim and a date scale."""
    meat = _build_meat_df()
    beef_by_date = ggplot(aes(x='date', y='beef'), data=meat)

    named_axes = (beef_by_date + geom_point()
                  + scale_x_continuous("This is the X")
                  + scale_y_continuous("Squared", limits=[0, 1500]))
    print(named_axes)

    clipped = beef_by_date + geom_point() + ylim(0, 1500)
    print(clipped)

    line_plot = ggplot(aes(x='date', y='beef'), data=meat) + geom_line()
    dated = line_plot + scale_x_date(labels="%Y-%m-%d")
    print(dated)
예제 #4
0
def plotHist(arr, category, save_dir):
    """Save histograms of the non-zero values of ``arr`` and of their natural log.

    Two PNG files, 'original-<category>.png' and 'log-Scale-<category>.png',
    are written into ``save_dir``; spaces in ``category`` become hyphens.
    """

    def _hyphenate(text):
        # Report and replace spaces before the text is used in a file name.
        if ' ' not in text:
            return text
        print('{0} has space\n'.format(text))
        return text.replace(' ', '-')

    nonzero = [value for value in arr if value != 0]
    # The maximum is not used below; the call is kept because it still
    # raises ValueError when no non-zero value is left.
    maxi = max(nonzero)

    raw_col = 'original-' + _hyphenate(category)
    log_col = 'log-Scale-' + _hyphenate(category)
    df = pd.DataFrame(pd.Series(nonzero), columns = [raw_col])
    df[log_col] = np.log(df[raw_col])

    # Build both histograms first, then save them in the same order.
    histograms = [(column, ggplot(aes(x = column), data = df) + geom_histogram())
                  for column in (raw_col, log_col)]
    size = dict(width = 6, height = 5.5, dpi = 75)  # low dpi keeps saving fast
    for column, histogram in histograms:
        ggsave(plot = histogram, filename = column + ".png", path = save_dir, **size)
예제 #5
0
def lineplot_compare(hr_by_team_year_sf_la_csv):
	"""Plot home runs per year with one coloured series per team.

	Args:
		hr_by_team_year_sf_la_csv: path of a CSV file that provides the
			columns 'yearID', 'HR' and 'teamID'.

	Returns:
		The ggplot object (points joined by lines, coloured by 'teamID').
		Earlier revisions built the plot and discarded it, returning None.
	"""
	dataframe = pandas.read_csv(hr_by_team_year_sf_la_csv)
	# General form: ggplot(data, aes(xvar, yvar, color=category_var)).
	# Mapping color to 'teamID' separates the categories from each other.
	gg = ggplot(dataframe, aes(x='yearID', y='HR', color='teamID')) + geom_point() + geom_line()
	return gg
예제 #6
0
def test_geom_rect():
    """Compare three geom_rect renderings against their reference images."""
    rects = pd.DataFrame({
        'xmin': [1,3,5],
        'xmax': [2, 3.5, 7],
        'ymin': [1, 4, 6],
        'ymax': [5, 5, 9],
        'fill': ['blue', 'red', 'green'],
        'quality': ['good', 'bad', 'ugly'],
        'alpha': [0.1, 0.5, 0.9],
        'texture': ['hard', 'soft', 'medium']})

    # Every rectangle aesthetic mapped to a column.
    full_mapping = aes(xmin='xmin', xmax='xmax', ymin='ymin', ymax='ymax',
                       colour='quality', fill='fill', alpha='alpha',
                       linetype='texture')
    styled = ggplot(rects, full_mapping)
    styled += geom_rect(size=5)
    assert_same_ggplot(styled, 'geom_rect')

    # Upper bounds given as expressions on the lower bounds.
    unit_mapping = aes(xmin='xmin', xmax='xmin + 1', ymin='ymin',
                       ymax='ymin + 1')
    unit_boxes = ggplot(rects, unit_mapping)
    unit_boxes += geom_rect()
    assert_same_ggplot(unit_boxes, 'geom_rect_plus')

    # Rectangles layered over points, with the mapping set on the geom.
    layered = ggplot(rects, aes(x='xmin', y='ymin'))
    layered += geom_point(size=100, colour='red', alpha=0.5)
    rect_mapping = aes(fill='fill', xmin='xmin', xmax='xmin + 1', ymin=0,
                       ymax='ymax')
    layered += geom_rect(rect_mapping, alpha=0.1)
    assert_same_ggplot(layered, 'geom_rect_with_point')
예제 #7
0
def test_stat_function():
    """Smoke-test stat_function with plain callables, list args and dict args."""
    np.random.seed(7776)

    # Standard normal density drawn over a kernel density estimate.
    dnorm = lambda x : (1.0 / np.sqrt(2 * np.pi)) * (np.e ** (-.5 * (x ** 2)))
    sample = DataFrame({'x':np.random.normal(size=100)})
    print(ggplot(sample, aes(x='x')) + geom_density() + stat_function(fun=dnorm,n=200))

    # Two numpy functions on the same axes.
    grid = DataFrame({'x':np.arange(10)})
    trig = ggplot(grid, aes(x='x'))
    trig = trig + stat_function(fun=np.sin,color="red")
    trig = trig + stat_function(fun=np.cos,color="blue")
    print(trig)

    # Test when args = list
    def to_the_power_of(n,p):
        return n ** p
    x = np.random.randn(100)
    y = x ** 3 + np.random.randn(100)
    cubic = DataFrame({'x':x,'y':y})
    print(ggplot(aes(x='x',y='y'),cubic) + geom_point()
          + stat_function(fun=to_the_power_of,args=[3]))

    # Test when args = dict
    def dnorm(x,mean,var):
        return scipy.stats.norm(mean,var).pdf(x)
    support = DataFrame({'x':np.arange(-5,6)})
    curves = (("blue", 0.0, 0.2),
              ("red", 0.0, 1.0),
              ("yellow", 0.0, 5.0),
              ("green", -2.0, 0.5))
    densities = ggplot(aes(x='x'),data=support)
    for colour, mean, var in curves:
        densities = densities + stat_function(fun=dnorm,color=colour,
                                              args={'mean':mean,'var':var})
    print(densities)
예제 #8
0
def main():
    """Load the Yelp restaurant data and print three exploratory scatter plots.

    The plots are stars vs. review count for all businesses, and for the
    Las Vegas subset longitude vs. latitude and stars vs. distance from the
    town centre. Input locations are hard-coded below.
    """
    # Set system variables
    root = r'/Users/DC-MBP/Desktop/final-project'
    temp = os.path.join(root, 'Temp')
    data = r'/Users/DC-MBP/Desktop/yelp-api'
    data_file = 'yelp_academic_dataset_business.json'

    # Set regression formula (not used by the code below)
    rf = 'stars ~ review_count + state + Caters + Attire + BYOB + Alcohol'

    # Create data file
    df_business = process_data_restaurant(data, data_file)

    # Create Vegas data file.
    # Take an explicit copy: adding a column to a filtered slice triggers
    # SettingWithCopyWarning and may not write to the frame that is plotted.
    df_vegas = df_business[df_business.city == "Las Vegas"].copy()
    # Distance from the town centre at 36.175, -115.136389, measured in
    # plain degrees of latitude/longitude.
    df_vegas['distance'] = np.sqrt(np.power(df_vegas.latitude-36.175,2) +
                                   np.power(df_vegas.longitude+115.136389,2))

    # Create visualizations
    p1 = ggplot(aes(y='stars', x='review_count'),data=df_business)
    print(p1 + geom_point())

    p2 = ggplot(aes(y='latitude', x='longitude'), data=df_vegas)
    print(p2 + geom_point())

    p3 = ggplot(aes(y='stars', x='distance'), data=df_vegas)
    print(p3 + geom_point())

    print('End')
예제 #9
0
def wrapper(name):
    """Train logistic regression on dataset ``name`` and show its diagnostics.

    Prints the training and validation classification errors at threshold
    0.5, renders the error-vs-decision-boundary curves and the decision
    boundaries for both splits, then evaluates ``GridL`` over lambda =
    0..100 and prints one line plot per pair of returned curves.
    All plots are rendered with an empty title.

    Args:
        name: dataset name passed to ``loadData``.
    """
    Xt, Yt = loadData(name, 'train')
    Xv, Yv = loadData(name, 'validate')
    w = Train(Xt, Yt, 0)

    # A single formatted string prints the same under Python 2 and 3.
    print('Classification Error (TR):  %s %s'
          % (classifyErr(LRPredict(w, Xt), Yt, 0.5), name))
    print('Classification Error (VAL)::  %s %s'
          % (classifyErr(LRPredict(w, Xv), Yv, 0.5), name))

    splits = ((Xt, Yt), (Xv, Yv))
    for X, Y in splits:
        plotCEDB(w, X, Y, '')
    for X, Y in splits:
        plotDecisionBoundary(w, X, Y, LRPredict, [0.5], '')

    lambdas = array(linspace(0,100,101))
    tErr, tClass, vErr, vClass = GridL(Xt, Yt, Xv, Yv, lambdas)

    def _long_format(train_values, val_values):
        # One row per (lambda, split) so the split can be mapped to colour.
        frame = pd.DataFrame({'TR': pd.Series(train_values),
                              'VAL': pd.Series(val_values),
                              'Lambda': pd.Series(lambdas)})
        return pd.melt(frame, id_vars=['Lambda'])

    # First the classification curves, then the error/loss curves.
    for frame in (_long_format(tClass, vClass), _long_format(tErr, vErr)):
        # The original `print p = ggplot(...)` was a syntax error; build the
        # plot first and print it afterwards.
        plot = (ggplot(frame, aes(x='Lambda', y='value', color='variable'))
                + geom_line(size=4) + ggtitle('') + ylab('Error')
                + theme_matplotlib(rc=pltsize, matplotlib_defaults=False))
        print(plot)
def test_stat_vhabline_functions():
    """Check the arity rules for callables given to stat_abline/vline/hline."""
    def fn_x(x):
        return 1
    def fn_y(y):
        return 1
    def fn_xy(x, y):
        return 1

    def expect_error(plot, build_stat):
        # The stat is created inside the context so that an error raised at
        # construction time is accepted as well.
        with assert_raises(GgplotError):
            print(plot + build_stat())

    # needs y aesthetic
    x_only = ggplot(aes(x='wt'), mtcars)
    expect_error(x_only, lambda: stat_abline(slope=fn_xy))
    expect_error(x_only, lambda: stat_abline(intercept=fn_xy))

    gg = ggplot(aes(x='wt', y='mpg'), mtcars)
    # Functions with 2 args, no problem
    print(gg + stat_abline(slope=fn_xy, intercept=fn_xy))

    # slope function should take 2 args
    expect_error(gg, lambda: stat_abline(slope=fn_x, intercept=fn_xy))
    # intercept function should take 2 args
    expect_error(gg, lambda: stat_abline(slope=fn_xy, intercept=fn_y))

    # intercept functions of stat_vline / stat_hline should take 1 arg
    expect_error(gg, lambda: stat_vline(xintercept=fn_xy))
    expect_error(gg, lambda: stat_hline(yintercept=fn_xy))
예제 #11
0
def plot_sed(tmp,phot = None, fname = None, ignore = None, err = None):
    '''
    Build a log-log plot of the templates in ``tmp.df`` and store it on ``tmp.sed``.

    tmp    : object with a ``df`` frame; ``df.wav`` gives the wavelengths and
             every column after the first one is plotted as a template.
    phot   : optional pandas Series of fluxes whose index values are keys of
             the module-level ``dict_wav``; drawn as points labelled 'Data'.
    fname  : unused (saving to file is disabled).
    ignore : optional iterable of template column names to leave out.
    err    : when not None the templates form the base layer and the
             photometry is added on top; the values of ``err`` are not read.

    Raises TypeError when ``phot`` is given but is not a pandas Series.
    '''
    wav = tmp.df.wav
    cols = list(tmp.df.columns[1:])
    if ignore is not None:
        for name in ignore:
            cols.remove(name)

    # One long-format frame per template, stacked into a single frame.
    log_wav = np.log10(wav)
    df_plot = pd.concat([
        pd.DataFrame({'log wav(um)': log_wav,
                      'log flux': np.log10(tmp.df.loc[:, col]),
                      'template': [col] * len(wav)})
        for col in cols])

    mapping = aes(x='log wav(um)', y='log flux', color='template')
    if phot is None:
        plt_out = ggplot(df_plot, mapping) + geom_line()
    else:
        if not isinstance(phot, pd.Series):
            # Previously only a message was printed and the function then
            # failed on an unbound name.
            raise TypeError('phot should be in pandas series')
        # Must be a DataFrame (not a plain dict) to serve as plot data, and
        # must exist for both branches below.
        df_phot = pd.DataFrame({
            'log wav(um)': np.log10(np.asarray([dict_wav[x] for x in phot.index])),
            'log flux': np.log10(phot.values.astype(float)),
            'template': ['Data'] * len(phot)})
        if err is None:
            plt_out = ggplot(df_phot, mapping) + geom_point() + geom_line(df_plot)
        else:
            plt_out = ggplot(df_plot, mapping) + geom_line() + geom_point(data = df_phot)

    # The result belongs on the object passed in; `self` does not exist in
    # this module-level function.
    tmp.sed = plt_out
예제 #12
0
def generateBathroomTilePlot(bl_vs_change_json):
    """Print a tile plot of per-region change for the 'increasing_low' group.

    Reads the JSON file into a frame, keeps the rows whose 'rid' is listed in
    GROUPS['increasing_low']['N'], ranks each row by the mean of its
    '<region>_bl' columns and plots the '<region>_change' values as tiles
    (x: region, y: rank, fill: change).
    """
    summary_regions = ['ctx-lh-parsorbitalis','ctx-rh-parsorbitalis','ctx-rh-lateralorbitofrontal',
                       'ctx-lh-lateralorbitofrontal','ctx-rh-frontalpole','ctx-rh-parstriangularis',
                       'ctx-lh-frontalpole','ctx-lh-parstriangularis','ctx-lh-caudalanteriorcingulate',
                       'ctx-rh-rostralmiddlefrontal','ctx-lh-caudalmiddlefrontal',
                       'ctx-rh-caudalanteriorcingulate','ctx-rh-rostralanteriorcingulate',
                       'ctx-lh-rostralmiddlefrontal','ctx-rh-caudalmiddlefrontal',
                       'ctx-lh-superiorparietal','ctx-rh-isthmuscingulate',
                       'ctx-lh-rostralanteriorcingulate','ctx-rh-parsopercularis',
                       'ctx-rh-superiorparietal','ctx-lh-parsopercularis',
                       'ctx-rh-medialorbitofrontal','ctx-lh-isthmuscingulate',
                       'ctx-lh-supramarginal','ctx-lh-inferiorparietal','ctx-rh-supramarginal',
                       'ctx-lh-superiorfrontal','ctx-rh-superiorfrontal','ctx-rh-middletemporal',
                       'ctx-lh-middletemporal','ctx-rh-inferiorparietal','ctx-rh-superiortemporal',
                       'ctx-lh-posteriorcingulate','ctx-lh-precuneus','ctx-lh-medialorbitofrontal',
                       'ctx-lh-superiortemporal','ctx-rh-posteriorcingulate','ctx-rh-precuneus']
    # Position of each region in the list above; used to order the x axis.
    ordering = {region: position for position, region in enumerate(summary_regions)}
    rank_by = summary_regions # could take subset of cortical summary regions

    df = pd.read_json(bl_vs_change_json)
    subjects = GROUPS['increasing_low']['N']
    df = df[df['rid'].isin(subjects)]

    baseline_keys = ["%s_bl" % region for region in rank_by]
    change_keys = ["%s_change" % region for region in summary_regions]
    df['rank'] = df[baseline_keys].mean(axis=1)

    df = df[['rid', 'rank'] + change_keys]
    df_long = pd.melt(df,id_vars=['rank'],value_vars=change_keys)

    # Strip the '_change' suffix and prefix the zero-padded list position so
    # the labels sort in list order.
    labels = []
    for column in df_long['variable']:
        region = column.replace('_change','')
        labels.append('%s_%s' % (str(ordering[region]).zfill(2),region))
    df_long['variable'] = labels

    tiles = ggplot(aes(x='variable',y='rank'),data=df_long) + geom_tile(aes(fill='value'))
    print(tiles + theme(axis_text_x=element_text(angle=270,size=8), axis_text_y=element_text(size=6)))
예제 #13
0
def plot_weather_data(turnstile_weather):
	"""
	Build three bar charts of total 'ENTRIESn_hourly' from the turnstile data.

	Returns a tuple (by hour, by 'UNIT', by date) of ggplot objects.
	"""

	def entry_totals(column):
		# Sum of ENTRIESn_hourly for each distinct value of `column`.
		subset = turnstile_weather.loc[:, [column, 'ENTRIESn_hourly']]
		return subset.groupby([column], as_index = False).sum()

	# Subway ridership by time of day
	by_hour = entry_totals('Hour')
	hour_plot = ggplot(by_hour, aes('Hour'))
	hour_plot = hour_plot + geom_bar(aes(x = 'Hour', weight = 'ENTRIESn_hourly'), binwidth = 1)
	hour_plot = hour_plot + scale_x_continuous(limits = (0, 23))

	# Subway ridership by subway station
	by_unit = entry_totals('UNIT')
	unit_plot = ggplot(by_unit, aes(x = 'UNIT'))
	unit_plot = unit_plot + geom_bar(aes(x = 'UNIT', weight ='ENTRIESn_hourly'))

	# Subway ridership, total per day
	by_date = entry_totals('DATEn')
	# Convert DATEn to datetime, then keep only the date part.
	by_date['DATEn'] = pandas.to_datetime(by_date['DATEn'])
	by_date['DATEn'] = [stamp.date() for stamp in by_date['DATEn']]
	date_plot = ggplot(by_date, aes('DATEn'))
	date_plot = date_plot + geom_bar(aes(x = 'DATEn', weight = 'ENTRIESn_hourly'))
	date_plot = date_plot + scale_x_date()

	return hour_plot, unit_plot, date_plot
예제 #14
0
def plot_year_doy(df, title, palette='RdYlGn'):
    """Scatter day-of-year against year, coloured by 'clear' when available.

    Args:
        df: DataFrame with 'year' and 'doy' columns and optionally a 'clear'
            column. When 'clear' is present a 'Percent Clear' column is
            added to the frame that was passed in.
        title: plot title.
        palette: name of the diverging brewer palette for the colour scale.

    Returns:
        The assembled ggplot object.
    """
    if 'clear' in df.columns:
        # Floor to a multiple of 20 and right-align the label to width 3.
        pct_clear = ((df['clear'] // 20) * 20).astype(np.uint8)
        df['Percent Clear'] = [str(v).rjust(3) for v in pct_clear]

        # HACK to get all values shown: add one padding row per missing
        # label. The rows are built in a separate frame and concatenated
        # once, because chained assignment such as df['year'][-1:] = ... may
        # write to a temporary copy and leave df unchanged.
        need = ['  0', ' 20', ' 40', ' 60', ' 80', '100']
        present = set(df['Percent Clear'])
        to_add = [v for v in need if v not in present]
        if to_add and len(df):
            filler = pd.concat([df.iloc[:1]] * len(to_add))
            # NaN coordinates, presumably so the rows add a colour level
            # without drawing a point.
            filler['year'] = np.nan
            filler['doy'] = np.nan
            filler['Percent Clear'] = to_add
            df = pd.concat([df, filler])

        plot = ggplot(aes('year', 'doy', color='Percent Clear'), df)
        plot = plot + scale_color_brewer(type='diverging', palette=palette)

    else:
        plot = ggplot(aes('year', 'doy'), df)

    return(plot + geom_point(size=50) +
           xlim(df.year.min() - 1, df.year.max() + 1) +
           ylim(0, 366) +
           xlab('Year') +
           ylab('Day of Year') +
           ggtitle(title))
def entries_histogram(turnstile_weather, fog=False):
    '''
    Histogram of transformed hourly entries for one value of the fog flag.

    The 'ENTRIESn_hourly' column of ``turnstile_weather`` is overwritten in
    place with ``boxcox1p`` of each value before plotting.

    fog : when true, the rows with fog == 0 are plotted (red outline, titled
          "non-foggy days"); otherwise the rows with fog == 1 are plotted
          (blue outline, titled "foggy days").

    Returns the ggplot object.

    Open question left by the author: how strongly are fog and rain
    correlated?
    '''
    entries = turnstile_weather['ENTRIESn_hourly']
    turnstile_weather['ENTRIESn_hourly'] = entries.map(boxcox1p)

    if fog:
        fog_value, outline, heading = 0, 'red', 'Hourly ridership on non-foggy days'
    else:
        fog_value, outline, heading = 1, 'blue', 'Hourly ridership on foggy days'

    subset = turnstile_weather[turnstile_weather['fog'] == fog_value]
    plot = ggplot(subset, aes(x='ENTRIESn_hourly'))
    plot = plot + geom_histogram(color=outline)
    plot = plot + ggtitle(heading)
    plot = plot + xlab('log(Hourly entries+1)')
    plot = plot + ylab('Frequency')
    return plot
예제 #16
0
파일: analysis.py 프로젝트: nairboon/bnrl
def main(parameters):
    """Plot run statistics for every scenario/algorithm pair and export a summary.

    Reads ``mydata/<label>/<scenario>_<algorithm>.txt`` through
    ``average_run``, where ``<label>`` is the last command-line argument.
    Writes ``avg_runs.png``, ``all_runs.png`` and ``final_comp.csv`` into
    ``mydata/<label>`` and finally runs ``result.R`` on that directory.

    Args:
        parameters: mapping with "scenarios" and "algorithms" (iterables of
            strings) and "AcceptableScore" (passed to ``average_run``).
    """
    import subprocess

    label = sys.argv[-1]   # Sumatra appends the label to the command line
    subdir = os.path.join("mydata", label)

    names, runs, averages = [], [], []
    run_frames = []
    summary_rows = []

    for scenario in parameters["scenarios"]:
        for algorithm in parameters["algorithms"]:
            name = scenario + "_" + algorithm
            fn = os.path.join(subdir, "%s_%s.txt" % (scenario, algorithm))
            da, ap = average_run(parameters["AcceptableScore"], fn)

            # Track the last average of THIS run. The accumulated list is
            # shared by all runs, so indexing it with [-1] would report the
            # previous run's value (or raise) when `ap` has no rows.
            last_avg = float("nan")
            for _, row in ap.iterrows():
                names.append(name)
                runs.append(row["n"])
                averages.append(row["avg"])
                last_avg = row["avg"]

            run_frames.append(da)
            summary_rows.append(dict(Algorithm=algorithm, Task=scenario, Steps=last_avg))

    # showing that we have enough runs
    df = pd.DataFrame({"Name": names, "Runs": runs, "Avg": averages})
    p = (ggplot(aes(x='Runs', y="Avg"), data=df) + geom_point() + geom_line()
         + facet_wrap("Name"))
    ggsave(p, os.path.join(subdir, "avg_runs.png"))

    # plotting all runs; frames are concatenated once because
    # DataFrame.append was removed in pandas 2.0
    if run_frames:
        all_df = pd.concat(run_frames)
    else:
        all_df = pd.DataFrame({"i": [], "Name": []})
    all_plot = (ggplot(aes(x='i', y='avg', colour="Name"), data=all_df)
                + geom_point() + geom_line())
    ggsave(all_plot, os.path.join(subdir, "all_runs.png"))

    # final comparison is done in R
    final_df = pd.DataFrame(summary_rows, columns=["Algorithm", "Task", "Steps"])
    final_df.to_csv(os.path.join(subdir, "final_comp.csv"), index=False)

    proc = subprocess.Popen(['/usr/bin/Rscript', 'result.R', subdir],
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            universal_newlines=True)
    # communicate() already waits for the process to finish.
    stdout, stderr = proc.communicate()
    print("%s %s" % (stdout, stderr))
    print("done %s" % subdir)
예제 #17
0
def test_factor():
    """Smoke-test 'factor(...)' expressions used as aesthetics on mtcars."""
    by_cyl = ggplot(mtcars, aes(x='wt', y='mpg', colour='factor(cyl)',
                                size='mpg', linetype='factor(cyl)'))
    print(by_cyl + geom_line())
    print(by_cyl + geom_point())
    print(by_cyl + geom_line() + geom_point())

    labelled = by_cyl + geom_point() + geom_line(color='lightblue')
    labelled = labelled + ggtitle("Beef: It's What's for Dinner")
    labelled = labelled + xlab("Date") + ylab("Head of Cattle Slaughtered")
    print(labelled)

    cyl_counts = ggplot(aes(x='factor(cyl)'), data=mtcars)
    print(cyl_counts + geom_bar())
예제 #18
0
def plot_cost_history(alpha, cost_history):
    """Print a scatter plot of ``cost_history`` against the iteration number.

    ``alpha`` only appears in the plot title, formatted with three decimals.
    """
    iterations = range(len(cost_history))
    cost_df = pandas.DataFrame({
        'Cost_History': cost_history,
        'Iteration': iterations
    })
    chart = ggplot(cost_df, aes('Iteration', 'Cost_History'))
    chart = chart + geom_point()
    chart = chart + ggtitle('Cost History for alpha = %.3f' % alpha )
    print(chart)
def plot_weather_data(turnstile_weather):
    '''
    Bar chart of the total subway entries for each value of the rain flag.

    turnstile_weather : DataFrame with at least the columns 'rain' and
                        'ENTRIESn_hourly'.

    Prints the first rows of the input as a quick look at the data and
    returns the ggplot object.

    Earlier revisions also built three intermediate plots whose results were
    overwritten before use; the first of them referenced an undefined name
    (``turnstile_master``) and made every call fail, so they are gone.
    '''
    # `.ix` no longer exists in current pandas; head() gives the preview.
    print(turnstile_weather.head(10))

    # Total entries per value of 'rain'.
    df = turnstile_weather[['rain', 'ENTRIESn_hourly']].groupby('rain', as_index = False).sum()

    plot = ggplot(df,aes(x='rain')) \
          + geom_bar(aes(weight='ENTRIESn_hourly'),fill='orange',binwidth = 0.5) \
          + ggtitle('NYC Subway ridership by rain') + xlab('rain') + ylab('Entries')
    return plot


    '''
예제 #20
0
def plotCEDB(w, X, Y, title):
    """Print a line plot of classification error against the decision boundary.

    The error of ``LRPredict(w, X)`` against ``Y`` is evaluated with
    ``classifyErr`` at 101 evenly spaced thresholds from 0 to 1. The plot is
    themed with the module-level ``pltsize`` settings.
    """
    thresholds = array(linspace(0,1,101))
    predictions = LRPredict(w,X)
    errors = zeros(len(thresholds))
    for position, threshold in enumerate(thresholds):
        errors[position] = classifyErr(predictions, Y, threshold)

    DF = pd.DataFrame({'Decision Boundary': pd.Series(thresholds),
                       'Classification Error': pd.Series(errors)})
    chart = ggplot(DF, aes(x='Decision Boundary', y='Classification Error'))
    chart = chart + geom_line(size=4) + ggtitle(title)
    chart = chart + theme_matplotlib(rc=pltsize, matplotlib_defaults=False)
    print(chart)
예제 #21
0
파일: test_colors.py 프로젝트: Xbar/ggplot
def test_assign_colors():
    """
    Test how colors are assigned to different column types.

    For every column type a point plot coloured by that column is drawn, and
    the 'color_mapping' column of the plot data is compared with the output
    of the matching assign_*_colors helper: the continuous helper for the
    integer and float columns, the discrete helper for the boolean and
    string columns.
    """

    df = pd.DataFrame({"values": np.arange(10),
                       "int_col": np.arange(10),
                       # Divide by a float so the column is fractional under
                       # Python 2 integer division as well.
                       "num_col": np.arange(10) / 2.0,
                       "bool_col": np.random.randn(10) > 0,
                       "char_col": ["a", "b"] * 5})

    color_mapping_col = 'color_mapping'

    cases = [
        ("int_col", assign_continuous_colors),   # integer column
        ("num_col", assign_continuous_colors),   # numeric column
        ("bool_col", assign_discrete_colors),    # bool column
        ("char_col", assign_discrete_colors),    # char column
    ]
    for color_col, assign_colors in cases:
        gg = ggplot(df, aes(x="values", y="values", color=color_col))
        gg += geom_point()
        gg.draw()

        # The expectation is computed against the plot under test; the
        # numeric and char cases used to pass the plot of the previous case.
        new_data = assign_colors(df, gg, color_col)
        expected_cols = new_data[color_mapping_col]
        actual_cols = gg.data[color_mapping_col]
        assert_true((actual_cols == expected_cols).all())
예제 #22
0
def rebound_list_draw():
    """Print the module-level ``rblist2`` and a scatter plot of 'open' by 'index'.

    ``rblist2`` is loaded into a frame with the columns 'index', 'open' and
    'year'.
    """
    print(rblist2)
    rebounds = DataFrame(rblist2, columns=['index','open','year'])
    chart = ggplot(aes(x='index', y='open'), data=rebounds)
    chart = chart + geom_point(color='lightblue', size = 9)
    chart = chart + ggtitle("Rebound")
    chart = chart + xlab("Date")
    chart = chart + ylab("Open")
    print(chart)
def plot_weather_data(turnstile_weather):
    '''
    Print a bar chart of the average ENTRIESn_hourly for each hour of the day.

    turnstile_weather : DataFrame with the columns 'Hour' and
                        'ENTRIESn_hourly'.

    The averages come from a SQL query that pandasql runs against the local
    frame ``df``. Nothing is returned.
    '''
    # The query refers to this local by name, so it has to stay `df`.
    df = turnstile_weather[['Hour', 'ENTRIESn_hourly']]

    q = """
        SELECT Hour AS hour,
               sum(ENTRIESn_hourly)/count(*) AS hourlyentries
        FROM df
        GROUP BY hour
        """

    #Execute SQL command against the pandas frame
    hourly_avg = pandasql.sqldf(q.lower(), locals())

    hour_ticks = list(range(24))
    hour_labels = ['12AM', '1AM', '2AM', '3AM', '4AM', '5AM',
                   '6AM', '7AM', '8AM', '9AM', '10AM', '11AM',
                   '12PM', '1PM', '2PM', '3PM', '4PM', '5PM',
                   '6PM', '7PM', '8PM', '9PM', '10PM', '11PM']

    chart = ggplot(hourly_avg, aes('hour', 'hourlyentries'))
    chart = chart + geom_bar(fill = '#cc2127', stat='bar')
    chart = chart + scale_x_continuous(name="Hour",
                                       breaks=hour_ticks,
                                       labels=hour_labels)
    chart = chart + ggtitle("Average ENTRIESn_hourly by Hour")
    chart = chart + ylab("ENTRIESn_hourly")
    print(chart)
예제 #24
0
def test_geoms():
    """Smoke-test density, histogram, point, vline and area geoms."""
    df = _build_testing_df()

    density_base = ggplot(aes(x="x", color="c"), data=df)
    print(density_base + geom_density() + xlab("x label") + ylab("y label"))

    xy_base = ggplot(aes(x="x", y="y", shape="cat2", color="cat"), data=df)
    print(xy_base + geom_histogram())
    print(xy_base + geom_histogram() + ggtitle("My Histogram"))
    print(xy_base + geom_point())
    print(xy_base + geom_point() + geom_vline(x=50, ymin=-10, ymax=10))

    band_base = ggplot(aes(x='x', ymax='y', ymin='z', color="cat2"), data=df)
    print(band_base + geom_area())
예제 #25
0
def graph_data():
    """
    Print a scatter plot of the features in 'data/features.csv'.

    'event_counts' is on the x axis, 'std' on the y axis and the points are
    coloured by 'bot'. Uses the ggplot package for Python
    (ggplot.yhathq.com).
    """
    features = pd.read_table('data/features.csv', sep=',')
    scatter = ggplot(features, aes(x='event_counts', y='std', color='bot'))
    print(scatter + geom_point())
예제 #26
0
 def plot_sed(self,phot = None, fname = None, ignore = None, err = None):
     '''
     Build a log-log plot of the templates in ``self.df`` and store it on ``self.sed``.

     phot   : optional pandas Series of fluxes whose index values are keys of
              the module-level ``dict_wav``; drawn as points labelled 'Data'
              and also stored on ``self.phot`` as a frame.
     fname  : unused (saving to file is disabled).
     ignore : optional iterable of template column names to leave out.
     err    : optional pandas Series of flux errors matching ``phot``; when
              given, point ranges from flux - 0.95*err to flux + 0.95*err
              are drawn.

     Raises TypeError when ``phot`` (or ``err``) is given but is not a
     pandas Series.
     '''
     wav = self.df.wav
     cols = list(self.df.columns[1:])
     if ignore is not None:
         for name in ignore:
             cols.remove(name)

     def _template_frame(col):
         # Long-format frame for one template; the range bounds equal the
         # flux itself, so a template carries a zero-length range.
         log_flux = np.log10(self.df.loc[:,col])
         return pd.DataFrame({'log wav(um)':np.log10(wav),
                              'log flux':log_flux,
                              'logf_l':log_flux,
                              'logf_h':log_flux,
                              'template':[col] * len(wav)})

     df_plot = pd.concat([_template_frame(col) for col in cols])

     if phot is None:
         plt_out=ggplot(df_plot,aes(x='log wav(um)',y='log flux',color='template'))+geom_line()
     elif err is None:
         print('No error bars')
         if not isinstance(phot, pd.Series):
             # Previously only a message was printed and the method then
             # failed on the unbound plt_out.
             raise TypeError('phot should be in pandas series')
         df_phot = pd.DataFrame({'log wav(um)':np.log10(np.asarray([dict_wav[x] for x in phot.index])),
                     'log flux':np.log10(phot.values.astype(float)),
                     'template':['Data'] * len(phot)})
         self.phot = df_phot
         plt_out=ggplot(df_phot,aes(x='log wav(um)', y='log flux',color='template'))+\
                 geom_point()+geom_line(df_plot)+\
                 ylim(min(df_phot['log flux'])-1.5,max(df_phot['log flux'])+0.5)+\
                 xlim(-0.7,1.5)
     else:
         if not isinstance(phot, pd.Series) or not isinstance(err, pd.Series):
             # Same as above: fail with a clear error instead of an unbound
             # df_phot further down.
             raise TypeError('phot and err should be in pandas series with band names as index')
         flux = phot.values.astype(float)
         half_range = 0.95*err.values.astype(float)
         df_phot = pd.DataFrame({'log wav(um)':np.log10(np.asarray([dict_wav[x] for x in phot.index]).astype(float)),
                     'log flux':np.log10(flux),
                     'logf_l':np.log10(flux-half_range),
                     'logf_h':np.log10(flux+half_range),
                     'template':['Data'] * len(phot)})
         plt_out=ggplot(df_phot,aes(x='log wav(um)',y='log flux',ymax='logf_h',ymin='logf_l',color='template'))+\
                 geom_point()+geom_pointrange()+geom_line(df_plot)+\
                 ylim(min(df_phot['log flux'])-1.5,max(df_phot['log flux'])+0.5)+\
                 xlim(-0.7,1.5)
         self.phot = df_phot
     self.sed = plt_out
예제 #27
0
def test_diamond():
    """Render three faceted plots of the ``diamonds`` data set by printing them.

    Covers a wrapped scatter on the first 4 rows, a gridded scatter on the
    first 1000 rows, and a gridded density plot on the full frame.
    """
    # Scatter on a 4-row sample, one panel per 'cut'.
    wrapped = (
        ggplot(aes(x='x', y='y', colour='z'), data=diamonds.head(4))
        + geom_point()
        + scale_colour_gradient(low="white", high="red")
        + facet_wrap("cut")
    )
    print(wrapped)

    # Same scatter on 1000 rows, laid out on a cut x clarity grid.
    gridded = (
        ggplot(aes(x='x', y='y', colour='z'), data=diamonds.head(1000))
        + geom_point()
        + scale_colour_gradient(low="white", high="red")
        + facet_grid("cut", "clarity")
    )
    print(gridded)

    # Density of 'carat' over the whole data set on the same grid.
    density_base = ggplot(aes(x='carat'), data=diamonds)
    print(density_base + geom_density() + facet_grid("cut", "clarity"))
예제 #28
0
def test_facet_grid():
    """Render a scatter and a density plot on a 2x2 ``facet_grid`` by printing them."""
    # Restrict to two cuts and two clarities so only 2x2 facets are drawn
    # and the test stays fast (N=53940 -> N=7916).
    wanted_cut = diamonds["cut"].isin(["Ideal", "Good"])
    wanted_clarity = diamonds["clarity"].isin(["SI2", "VS1"])
    subset = diamonds[wanted_cut & wanted_clarity]

    scatter = (
        ggplot(aes(x='x', y='y', colour='z'), data=subset)
        + geom_point()
        + scale_colour_gradient(low="white", high="red")
        + facet_grid("cut", "clarity")
    )
    print(scatter)

    density_base = ggplot(aes(x='carat'), data=subset)
    print(density_base + geom_density() + facet_grid("cut", "clarity"))
def plot_trip(trip_data_frame, details=tuple(), **kwargs):
    """Scatter-plot a trip's (x, y) points, re-drawing flagged points in red.

    When ``check_trip_data_quality`` accepts the frame, the points are plotted
    as they are.  Otherwise every row that fails ``check_velocity`` or
    ``check_angular_velocity`` is drawn a second time as a large red point.

    Args:
        trip_data_frame: DataFrame with ``x`` and ``y`` columns.  On the
            low-quality path it must also provide ``check_velocity`` and
            ``check_angular_velocity`` (expected to be boolean flags, since
            they are inverted with ``~``).
        details: Unused; kept so existing callers keep working.
        **kwargs: Forwarded to ``geom_point`` for the base layer.

    Returns:
        The assembled ggplot object.
    """
    if check_trip_data_quality(trip_data_frame):
        return ggplot(trip_data_frame, aes(x='x', y='y')) + geom_point(**kwargs)

    d = trip_data_frame.copy()
    bad = ~d.check_velocity | ~d.check_angular_velocity
    # Keep the coordinate on flagged rows and use 0 everywhere else.
    # The previous `.ix[...] = .ix[...]` form relied on an indexer that pandas
    # has removed, and assigned a frame whose column labels ('x', 'y') differ
    # from the targets ('bad_x', 'bad_y'); pandas aligns such assignments by
    # label, which yields NaN instead of the coordinates.
    d['bad_x'] = d['x'].where(bad, 0)
    d['bad_y'] = d['y'].where(bad, 0)
    return (ggplot(d, aes('x', 'y')) + geom_point(**kwargs) +
            geom_point(aes('bad_x', 'bad_y'), color='red', size=90))
def plot_dayofweek_data(filename):
    """
    Scatter plot of ridership by day-of-week.

    Reads the CSV at ``filename`` and plots ``ENTRIESn_hourly`` against
    ``day_week``, labelling x ticks 0..6 as Mon..Sun.  The plot is printed;
    nothing is returned.
    """
    turnstile_weather = pandas.read_csv(filename)
    plot = (
        ggplot(turnstile_weather, aes("day_week", "ENTRIESn_hourly"))
        + geom_point(size=5.0, color="red")
        + xlab("Day of the week")
        + ggtitle("Ridership by day of the week")
        + scale_x_continuous(
            breaks=[0, 1, 2, 3, 4, 5, 6],
            labels=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"],
        )
        + ylim(0)
    )
    # A single parenthesised argument works as a print statement on Python 2
    # and as a function call on Python 3.
    print(plot)
예제 #31
0
from __future__ import print_function
from ggplot import *

print(
    ggplot(meat, aes(x='date', y='beef')) + stat_smooth() +
    scale_x_date(labels=date_format('%Y')))
print(
    ggplot(meat, aes(x='date', y='beef')) +
    stat_smooth(method='ma', window=12) +
    scale_x_date(labels=date_format('%Y')))
print()

print('TN = {}'.format(TN))
print('FP = {}'.format(FP))
print('FN = {}'.format(FN))
print('TP = {}'.format(TP))
print()

FPR, TPR, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])

roc_auc = auc(FPR, TPR)

# ROC Curve (using Python)
plt.figure()
plt.plot(FPR, TPR, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (FPR)')
plt.ylabel('True Positive Rate (TPR)')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()

# ROC Curve (using R)
# Compile it IPython notebook
from ggplot import *
df = pd.DataFrame(dict(fpr=FPR, tpr=TPR))
ggplot(df, aes(x='fpr',
               y='tpr')) + geom_line() + geom_abline(linetype='dashed')
예제 #33
0
#!/usr/bin/env python
# encoding: utf-8
"""
@version: python3.7
@author: JYFelt
@license: Apache Licence 
@contact: [email protected]
@site: https://blog.csdn.net/weixin_38034182
@software: PyCharm
@file: ggplot_demo.py
@time: 2019/8/15 17:16
"""
from ggplot import *

p = ggplot(mtcars, aes('mpg', 'wt', color='factor(cyl)')) + geom_point() + ggtitle('mtcars')
print(p)
예제 #34
0
import pandas as pd

from ggplot import *
from datetime import datetime

workingDirectory = "/home/owen/Dropbox/graphMining/"

# DataFrame.from_csv has been removed from pandas; read_csv takes the same
# separator/header/index options.
df = pd.read_csv(workingDirectory + "trafficData2.tsv", sep='\t', header=0, index_col=False)
# datetime.strptime parses one string at a time, so apply it per row instead
# of passing the whole column (which raises TypeError).
# NOTE(review): the parsed values are kept in `timestamp` only, as before; the
# plot below still reads the unparsed df['timestamp'] column - confirm intent.
timestamp = df.timestamp.map(lambda s: datetime.strptime(s, '%Y-%m-%d %X'))

p = ggplot(aes(x = 'timestamp'), data = df)
p + geom_histogram(binwidth = 30)
예제 #35
0
파일: test_stats.py 프로젝트: ssydyc/ggplot
from ggplot import *

# Linear-model smoother of price against carat.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(ggplot(diamonds, aes('carat', 'price')) + stat_smooth(method='lm'))

# Density estimate of price.
print(ggplot(diamonds, aes('price')) + stat_density())
예제 #36
0
# <markdowncell>

# We're going to be using a RFC at first, so I'm swtiching the 'days on disabled list' metric to a simple injured boolean.

# <codecell>

data.columns

# <codecell>

data['InjuredBool'] = data['Days'] >= 1

# <codecell>

injury_days_chart = ggplot(aes(x='playerid', y='Days'),
                           data=data) + geom_point()
injury_days_chart

# <codecell>

X_cols = [col for col in data.columns if col not in ['InjuredBool', 'Days']]
X = data[X_cols]
y = data.InjuredBool

# <codecell>

objects = []
for each_col in X:
    if X[each_col].dtype == 'object':
        objects.append(each_col)
					try:
						qual=float(qual)
					except:
						continue
					else:
						if vcf_list[6]=='PASS' and  qual>= 25:
							try:
								POS=int(vcf_list[1])
							except:
								continue
							if POS <= window*n:
								type_info+=('\t' + vcf_list[7])
								heho_info+=('\t' + vcf_list[9].split(':')[0])
							else:
								window_point=window*n - window/2
								row=VariantStat(del_pattern, ins_pattern, HE_pattern, HO_pattern, type_info, heho_info, outputfilename, window_point)
								pos_list.append(row[0]), heho_list.append(row[1]), del_list.append(row[2]), ins_list.append(row[3])
								type_info=vcf_list[7]
								heho_info=vcf_list[9].split(':')[0]
								n+=1
						
	# Plotting using ggplot for Python
	plotting_data=DataFrame({'pos':pos_list, 'heho':heho_list, 'del':del_list, 'ins':ins_list})
	heho_p=ggplot(aes(x = 'pos', y = 'heho'), data= plotting_data) + geom_point() + ggtitle('heterozygous / (heterozygous + homozygous)') + scale_x_continuous('Position', breaks = [0, 1e+08, 2e+08, 3e+08], labels = ['0', '100Mb', '200Mb', '300Mb']) + xlim(low=0, high=3.7e8) + scale_y_continuous('Percent (%)', breaks = [25, 50, 75, 100]) + ylim(low = 0, high = 100)# + theme(axis.title.x = element_text())
	heho_p.save('heho_ratio_%s.png' % TD(window, 'readable'), dpi = 300)#, width = 8.43, height = 5.28, dpi = 300)#, limitsize = TRUE)
	del_p=ggplot(aes(x = 'pos', y = 'del'), data= plotting_data) + geom_point() + ggtitle('Deletion') + ylab('Count') + scale_x_continuous('Position', breaks = [0, 1e+08, 2e+08, 3e+08], labels = ['0', '100Mb', '200Mb', '300Mb']) + xlim(low = 0, high = 3.7e8) + scale_y_continuous('Count', breaks = [400, 800, 1200]) + ylim(low = 0)
	del_p.save('del_stat_%s.png' % TD(window, 'readable'), dpi = 300)#, width = 8.43, height = 5.28, dpi = 300)#, limitsize = TRUE)
	ins_p=ggplot(aes(x = 'pos', y = 'ins'), data= plotting_data) + geom_point() + ggtitle('Insertion') + ylab('Count') + scale_x_continuous('Position', breaks = [0, 1e+08, 2e+08, 3e+08], labels = ['0', '100Mb', '200Mb', '300Mb']) + xlim(low = 0, high = 3.7e8) + scale_y_continuous('Count', breaks = [400, 800, 1200]) + ylim(low = 0)
	ins_p.save('ins_stat_%s.png' % TD(window, 'readable'), dpi = 300)#, width = 8.43, height = 5.28, dpi = 300)#, limitsize = TRUE)

예제 #38
0
import pandas as pd
from ggplot import *

import os  # used below; this snippet never imports it explicitly

# Collect one (size, error) pair from every test-*/train-*/result.filtered file
# found under the current directory.
lst = []
for test_d in (d for d in os.listdir('.') if d.startswith('test-')):
    for fill in os.listdir(test_d):
        if not fill.startswith('train-'):
            continue
        # The second dash-separated field of the directory name is the size.
        num = int(fill.split('-')[1])
        with open(os.path.join(test_d, fill, 'result.filtered'), "rt") as fd:
            #fd.readline()
            # The value after the first ':' on the first line is the error.
            error = float(fd.readline().split(':')[1].strip())
            lst.append((num, error))

# Dump the pairs, sorted by size, as CSV.
with open("match.csv", "wt") as fd:
    fd.write("size,error\n")
    for size, error in sorted(lst):
        fd.write("%d, %f\n" % (size, error))

# NOTE(review): the data is written to "match.csv" but read back from
# "test.csv" - confirm whether a different, pre-existing file is intended.
dataframe = pd.read_csv("test.csv")

gg = ggplot(aes(x='size', y='error'), data=dataframe) + \
    geom_point(color='lightblue') + \
    stat_smooth(span=.15, color='black', se=True) + \
    ggtitle("Germline data") + \
    xlab("References count") + \
    ylab("Error rate")

print(gg)

plt.figure(figsize=(12,12))

sns.jointplot(x=train_df['taxamount'].values, y=train_df['logerror'].values, size=10, color='g')

plt.ylabel('Log Error', fontsize=12)

plt.xlabel('Tax Amount', fontsize=12)

plt.title("Tax Amount Vs Log error", fontsize=15)

plt.show()
from ggplot import *

# Each plot is wrapped in parentheses: a backslash continuation followed by a
# blank line ends the statement on a dangling '+', which is a syntax error.

# Log error against construction year, with a smoother.
(ggplot(aes(x='yearbuilt', y='logerror'), data=train_df)
 + geom_point(color='steelblue', size=1)
 + stat_smooth())

# Property locations coloured by log error.
(ggplot(aes(x='latitude', y='longitude', color='logerror'), data=train_df)
 + geom_point()
 + scale_color_gradient(low = 'red', high = 'blue'))

# Tax amount against finished area, coloured by log error.
(ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df)
 + geom_point(alpha=0.7)
 + scale_color_gradient(low = 'pink', high = 'blue'))
ggplot(aes(x='finishedsquarefeet12', y='taxamount', color='logerror'), data=train_df) + \
예제 #40
0
#clf=svc

clf.fit(X, y)

# print(...) with a single argument behaves the same on Python 2 and 3.
print("Accuracy of the model -")
print(clf.score(X, y))      # score on the data the model was fitted on
print(clf.score(X_1, y_1))  # score on the second data set (X_1, y_1)

#---------------------------  ROC CURVE ------------------------------------------------

#clf.probability=True

# Column 1 of predict_proba is used as the score fed to roc_curve.
preds = clf.predict_proba(X_1)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_1, preds)
df = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
graph = ggplot(df, aes(x='fpr', y='tpr')) + geom_line(
    color="blue", size=3) + geom_abline(linetype='dashed')
print(graph)

#-------------AUC Curve

auc = metrics.auc(fpr, tpr)
print(auc)

#-------------------------------Precision and Recall----------------------------
print("Calculating Precision and Recall...")
y_2 = clf.predict(X_1)
y_3 = np.array(y_1)

false_pos = 0
false_neg = 0
예제 #41
0
from ggplot import *

# Line plot of beef production over time.
# print(...) with one argument is valid on both Python 2 and Python 3.
print(ggplot(aes(x='date', y='beef'), data=meat) +
      geom_line())

plt.show(1)
# For visualization keep 2 principal components
print(pca.explained_variance_ratio_[0:2])  #variance explained by first two PCs

#x = ctdf.loc[:, ~ctdf.columns.isin(['SubjectId', 'Age'])]
firstTwoPCs = pd.DataFrame(data=pca.components_[:, :2], columns=['PC1', 'PC2'])
pcScores = pd.DataFrame(data=np.dot(x, firstTwoPCs), columns=['PC1', 'PC2'])
ages = ctdf.loc[:, ['Age']].reset_index()
pcScores = pd.concat([pcScores, ages['Age']], axis=1)
pcScores.head()

# In[13]:

from ggplot import *
#from ggplot import scale_fill_brewer

chart = ggplot(pcScores, aes(x='PC1', y='PC2', color='Age')) + geom_point(
    size=75, alpha=0.8) + ggtitle(
        "First and Second Principal Components colored by digit")
chart

# In[ ]:

import time

from sklearn.manifold import TSNE

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(features.values)

print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))
def plot_weather_data(version=1, nbins=18):
    '''
    Compare hourly-entry distributions on rainy and dry days.

    Loads the turnstile/weather data through ``read_csv_data(version)``, shows
    a matplotlib figure with overlaid histograms of ``ENTRIESn_hourly`` for
    ``rain == 0`` and ``rain == 1``, and returns a ggplot drawing, per bin,
    the difference between the rainy and dry bin probabilities.

    Args:
        version: Passed through unchanged to ``read_csv_data``.
        nbins: Number of equally spaced bin *edges* between the minimum and
            maximum of ``ENTRIESn_hourly`` (so there are ``nbins - 1`` bins).

    Returns:
        A ggplot object made of one rectangle per bin, filled by the sign of
        the probability difference.

    Side effects:
        Calls ``plt.show()`` for the histogram figure before returning.
    '''

    turnstile_weather = read_csv_data(version)

    # bins
    entries_max = turnstile_weather.ENTRIESn_hourly.max()
    entries_min = turnstile_weather.ENTRIESn_hourly.min()
    bins = np.linspace(entries_min, entries_max, nbins)

    # probabilities for number of entries per hour on dry days
    dry = turnstile_weather.ENTRIESn_hourly[turnstile_weather.rain == 0]
    hist_dry, _ = np.histogram(dry, bins)
    prob_dry = hist_dry / float(np.sum(hist_dry))

    # probabilities for number of entries per hour on rainy days
    rainy = turnstile_weather.ENTRIESn_hourly[turnstile_weather.rain == 1]
    hist_rain, _ = np.histogram(rainy, bins)
    prob_rain = hist_rain / float(np.sum(hist_rain))

    # plot histograms on rainy and dry days
    plt.hist(np.array(dry), bins=bins, color='w', label='dry')
    plt.hist(np.array(rainy), bins=bins, label='rainy')

    plt.legend()
    plt.title('Histograms of number of entries per hour on rainy and dry days')
    plt.xlabel('Number of entries per hour')
    plt.xlim(0, 20000)
    plt.ylabel('Count')
    plt.ylim(-1000, 55000)
    plt.show()

    # probabilities when rainy - probabilities when dry
    prob_diff = prob_rain - prob_dry

    # improvised bar plot
    # geom_bar with stat='identity' does not seem to work

    # data frame for ggplot: each bin becomes a rectangle spanning
    # [xmin, xmax]; negative differences extend below 0 (ymin) and positive
    # ones above 0 (ymax), the other bound being 0.
    df = DataFrame({
        'xmin': bins[:-1],
        'xmax': bins[1:],
        'ymin': prob_diff * ((prob_diff < 0).astype(float)),
        'ymax': prob_diff * ((prob_diff > 0).astype(float)),
        'sign': np.sign(prob_diff)
    })

    # plot difference in probabilities on rainy and dry days
    plot = ggplot(df,aes(xmin='xmin',xmax='xmax',ymin='ymin',ymax='ymax',fill='sign')) + \
    geom_rect() + \
    geom_hline(yintercept=0,color='black') + \
    ggtitle('Probabilty of entries/hr when rainy - probability of entries/hr when dry') + \
    xlab('E = Entries/hr') + \
    ylab('P(E | rainy) - P(E | dry)')

    return plot
예제 #44
0
                       verbose=False)
            xs, drs, acts = [], [], []  # reset array memory

        if episode_number % 20 == 0:
            check_weight_1 = np.asarray(kmodel.get_weights()[1])[0].sum()
            check_weight_2 = np.asarray(kmodel.get_weights()[3])[0].sum()
            print('Weight check 1: {}, weight check 2: {}'.format(
                check_weight_1, check_weight_2))

        if episode_number % save_model_freq == 0:
            kmodel.save_weights("Models/" + model_name + ".h5")
            game_history_plot = pd.DataFrame(game_history,
                                             columns=["Ep", "Score"])
            game_history_plot["EMA100"] = game_history_plot["Score"].ewm(
                span=100).mean()
            newplot = (ggplot(aes(x="Ep", y="Score"), data=game_history_plot) +
                       geom_point(color="green") +
                       geom_line(aes(x="Ep", y="EMA100"), color="blue") +
                       geom_hline(y=0, color="darkorange") +
                       ggtitle(model_name))

            newplot.save("Plots/" + model_name + ".png")

            if running_reward > running_best:
                running_best = running_reward
                kmodel.save_weights("Models/BEST_" + model_name + ".h5")

        reward_sum = 0
        observation = env.reset()  # reset env
        prev_x = None
예제 #45
0
def test_scale_facet_wrap_visual():
    """Compare a price histogram wrapped by 'cut' against the stored image for each scales mode."""
    base = ggplot(aes(x="price"), data=diamonds) + geom_histogram()
    # (value passed as `scales`, name handed to assert_same_ggplot)
    cases = (
        ("free", "free"),
        ("free_x", "free_x"),
        ("free_y", "free_y"),
        (None, "none"),
    )
    for scales, reference in cases:
        assert_same_ggplot(base + facet_wrap("cut", scales=scales), reference)
예제 #46
0
####### Import Packages #########
import os
import numpy as np
import pandas as pd
from ggplot import *

####### Set Simulation Parameters #########
os.chdir("/Users/bradley/SpeedTest")
np.random.seed(123)           # set the seed to ensure reproducibility
N = 1000             # set number of agents in economy
gamma = .5           # set Cobb-Douglas relative preference for consumption
tau = .2             # set tax rate

####### Draw Income Data and Optimal Consumption and Leisure #########
epsilon = np.random.normal(size=N)                                               # draw unobserved non-labor income
wage = 10+np.random.normal(size=N)                                               # draw observed wage
consump = gamma*(1-tau)*wage + gamma*epsilon                     # Cobb-Douglas demand for c
leisure = (1.0-gamma) + ((1.0-gamma)*epsilon)/((1.0-tau)*wage)  # Cobb-Douglas demand for l

####### Organize, Describe, and Export Data #########
df = pd.DataFrame()
df['consump'] = consump
df['leisure'] = leisure
df['wage'] = wage
df['epsilon'] = epsilon
plot_c = ggplot(aes(x='wage',y='consump'),data=df) + stat_smooth()
ggsave(plot_c,"plot_c.svg")
df.to_csv("consump_leisure.csv", index=False)

예제 #47
0
파일: themes.py 프로젝트: tacaswell/ggplot
from ggplot import *

p = ggplot(mtcars, aes('cyl')) + geom_bar()
print(p)
print(p + theme_bw())
print(p + theme_xkcd())
print(p + theme_matplotlib())
plt.show(1)
예제 #48
0
# `import ggplot` binds the package, and calling a module raises
# "TypeError: 'module' object is not callable"; import the plot class itself.
from ggplot import ggplot, aes, meat, geom_line, stat_smooth

# Beef production over time with a blue smoother.
ggplot(aes(x='date', y='beef'), data=meat) +\
    geom_line() +\
    stat_smooth(colour='blue', span=0.2)
''' ggplot(diamonds, aes(x='carat', y='price', color='cut')) +\
    geom_point() +\
    scale_color_brewer(type='diverging', palette=4) +\
    xlab("Carats") + ylab("Price") + ggtitle("Diamonds")
    
ggplot(diamonds, aes(x='price', fill='cut')) +\
    geom_density(alpha=0.25) +\
    facet_wrap("clarity") '''
예제 #49
0
# Missing values per column, largest first.
NA_Count = pd.DataFrame({'Sum of NA': df.isnull().sum()}).sort_values(by=['Sum of NA'], ascending=False)
# Share of rows missing in each column: divide by the row count (shape[0]).
# Dividing by shape[1], the number of columns, does not give a share of rows.
NA_Count['Percentage'] = NA_Count['Sum of NA'] / df.shape[0]

print(sum(NA_Count['Percentage']))

from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2, random_state=42)

cat = ['waterfront', 'view', 'condition', 'grade']
con = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'yr_built',
       'yr_renovated', 'sqft_living15', 'sqft_lot15']

from ggplot import *

lonlat = ggplot(train, aes(x='long', y='lat', color='price')) + geom_point() + scale_color_gradient(low='white',
                                                                                                    high='red') + ggtitle(
    'Color Map of Price')
print(lonlat)

lonprice = ggplot(train, aes(x='long', y='price')) + geom_point() + ggtitle('Price VS Longitude')
print(lonprice)

bedroom_price = ggplot(train, aes(x='bedrooms', y='price')) + geom_point() + ggtitle('Price VS No. of bedrooms')
print(bedroom_price)

year_built_price = ggplot(train, aes(x='yr_built', y='price')) + geom_point() + ggtitle('Price VS year built')
print(year_built_price)


def centralize_long(lon):
예제 #50
0
from ggplot import *

ggplot(aes(x='date', y='beef'), data=meat) +\
    geom_line() +\
    stat_smooth(colour='blue', span=0.2)
예제 #51
0
history=model.fit({'seq_input':train['seq']},train_output_dict,
	validation_data=({'seq_input':val['seq']},val_output_dict),
	nb_epoch=200,
	batch_size=100,
	callbacks=[early_stopping,checkpoint,reduce_lr],
	verbose=1)


with open('%s/history.pkl'%(log_dir),'wb') as f:
	pickle.dump([history.history],f)
with open('%s/history.pkl'%(log_dir),'rb') as f:
	x=pickle.load(f)


# Plot the learning curve:
history=pd.DataFrame(x[0])
history['epoch']=(range(1,history.shape[0]+1))
history_melt=pd.melt(history,id_vars=['epoch'],value_vars=['loss','val_loss'],var_name='type',value_name='loss')

p1=ggplot(history_melt,aes('epoch','loss',color='type'))+geom_line()+theme_bw()
p1.save(filename='%s/learning_curve.png'%(fig_dir))


# Plot prediction vs ground truth: 
pred=model.predict({'seq_input':test['seq'],'reg_input':test['reg']},batch_size=100,verbose=1)
plt.scatter(pred,test['expr'])
plt.savefig("%s/pred_vs_obs.png"%(fig_dir))
output=np.column_stack((test['expr'], pred[:,0]))
np.savetxt("%s/prediction.txt"%(out_dir), output,delimiter='\t')
예제 #52
0
def box_plot(var):
    """Print a boxplot of ``price`` against the column named ``var``.

    Reads the module-level ``train`` data (presumably a DataFrame holding
    ``price`` and the requested column - confirm against the caller).

    Args:
        var: Column name used for the x axis and in the plot title.

    Returns:
        None; the plot is only printed.
    """
    pt = ggplot(train, aes(x=var, y='price')) + geom_boxplot() + theme_bw() + ggtitle(
        'Boxplot of ' + var + ' and price')
    print(pt)
'''

#TSNE 2d
n_sne = int(size)

time_start = time.time()
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=400)
tsne_results = tsne.fit_transform(df.loc[rndperm[:n_sne], feat_cols].values)
print('t-SNE done! Time elapsed: {} seconds'.format(time.time() - time_start))

df_tsne = df.loc[rndperm[:n_sne], :].copy()
df_tsne['x-tsne'] = tsne_results[:, 0]
df_tsne['y-tsne'] = tsne_results[:, 1]

#plot t-SNE 2d
chart2 = ggplot( df_tsne, aes(x='x-tsne', y='y-tsne', color='label') ) \
        + geom_point(size=20,alpha=0.7) \
        + ggtitle("tSNE dimensions colored by digit")
#write chart in console to visulize chart2
chart2
'''
#TSNE 3d
n_sne = int(size)

time_start = time.time()
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=400)
tsne_results = tsne.fit_transform(df.loc[rndperm[:n_sne],feat_cols].values)


print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))
# Create a dataframe
df=pd.DataFrame({"Animal":["dog","dolphin","chicken","ant","spider"],
                    "Legs":[4,0,2,6,8]})
df.head()


#####################################################################################
# ggplot examples

# pip.main() is no longer available (pip >= 10 moved it to a private module).
# The supported way to install from a running program is to run pip as a
# subprocess of the current interpreter.
import subprocess
import sys

subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'ggplot'])
#from ggplot import ggplot, aes, geom_bar, geom_line, stat_smooth
from ggplot import *

# bar chart
ggplot(df, aes(x="Animal", weight="Legs")) + geom_bar(fill='blue')


# line chart with smoothing
ggplot(aes(x='date', y='beef'), data=meat) + geom_line() + stat_smooth(colour='blue', span=0.2)


# scatter points
ggplot(diamonds, aes(x='carat', y='price', color='cut')) +\
    geom_point() +\
    scale_color_brewer(type='diverging', palette=4) +\
    xlab("Carats") + ylab("Price") + ggtitle("Diamonds")


# density and facets
ggplot(diamonds, aes(x='price', fill='cut')) +\
예제 #55
0
from pandas import *
from ggplot import *
import pprint
import csv
import itertools

import ggplot as gg
import numpy as np
import pandas as pd
from datetime import datetime, date, time

# Use the explicitly imported `pd` alias: `from pandas import *` does not bind
# the package name `pandas` itself.
turnstile_weather=pd.read_csv("C:/move - bwlee/Data Analysis/Nano/\
Intro to Data Science/project/code/turnstile_data_master_with_weather.csv")

# Entries against exits, coloured by hour.
# NOTE(review): this plot is never shown and `plot` is rebound below.
plot=ggplot(turnstile_weather,aes(x='ENTRIESn_hourly',y='EXITSn_hourly',color='Hour')) \
+ geom_point() \
+ scale_color_brewer(type='diverging', palette=4) \
+ xlab("Entries") \
+ ylab("Exits")\
+ ggtitle("Entries vs Exists by hour")
#print plot

# One column per rain condition; the two selections have disjoint row labels,
# so each row is NaN in one column before fillna(0) replaces it.
df = DataFrame({"rain": turnstile_weather[turnstile_weather['rain']==1]['ENTRIESn_hourly'], \
  "no_rain": turnstile_weather[turnstile_weather['rain'] == 0]['ENTRIESn_hourly']}).fillna(0)
# Long format: 'variable' holds the condition name, 'value' the entries.
df = melt(df)
plot = ggplot(aes(x='value', color='variable'), data=df) \
  + geom_histogram(binwidth=400) \
  + scale_y_log() \
  + ylab("Frequency") \
  + xlab("Entries Per Hour")\
  + ggtitle("Entries Per Hour vs Frequency")
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from ggplot import *

# Histogram with a non-ASCII axis label; print(...) with one argument is
# valid on both Python 2 and Python 3.
print(ggplot(mtcars, aes(x='mpg')) + geom_histogram() + xlab("Scrüm"))
예제 #57
0
def test_scale_facet_wrap_internals():
    """Check tick and tick-label layout of ``facet_wrap`` for each ``scales`` mode.

    A price histogram wrapped by 'cut' is printed once per mode ("free",
    "free_x", "free_y", None).  After each print the current matplotlib figure
    is fetched with ``plt.gcf()`` and its axes are inspected, so the order of
    the print / gcf pairs matters.
    """
    def convertText(t):
        """Return a float for the text value of a matplotlib Text object."""
        try:
            return float(t.get_text())
        except ValueError:
            # Non-numeric label (e.g. empty text): don't raise here, return 0
            # and let the comparison in the caller fail the assertion.
            return 0

    def empty(t):
        """Return True if the Text object is an empty string."""
        return len(t.get_text().strip()) == 0

    p = ggplot(aes(x="price"), data=diamonds) + geom_histogram()
    # Only p2 has the new measures for column!
    p2 = p + facet_wrap("cut", scales="free")
    print(p2)

    # FIXME: n_high is the number of columns, not rows, because n_high and
    # n_wide are being passed backwards to plt.subplot in ggplot.py
    columns = p2.n_high

    fig = plt.gcf()

    # When the scales are free, every plot should have x and y labels. Don't
    # test the tick values because each plot is free to set its own.
    for ax in fig.axes:
        assert_true(len(ax.get_xticklabels()) > 0)
        assert_true(len(ax.get_yticklabels()) > 0)

    print(p + facet_wrap("cut", scales="free_x"))
    fig = plt.gcf()

    yticks = fig.axes[0].get_yticks()
    for pos, ax in enumerate(fig.axes):
        # When only the x-axis is free, all plots should have the same y scale
        assert_true(all(ax.get_yticks() == yticks))

        if pos % columns == 0:
            # Only plots in the first column should have y labels
            assert_true(
                all(list(map(convertText, ax.get_yticklabels())) == yticks))
        else:
            # Plots in all other columns should have no labels
            assert_true(all(map(empty, ax.get_yticklabels())))

        # Every plot should have labels on its x-axis
        assert_true(len(ax.get_xticklabels()) > 0)

    print(p + facet_wrap("cut", scales="free_y"))
    fig = plt.gcf()

    xticks = fig.axes[0].get_xticks()
    subplots = len(fig.axes)
    for pos, ax in enumerate(fig.axes):
        assert_true(all(ax.get_xticks() == xticks))

        if subplots - pos > columns:
            # Only the bottom plot of each column gets x labels. So only the
            # last N plots (where N = number of columns) get labels.
            assert_true(all(map(empty, ax.get_xticklabels())))
        else:
            assert_true(
                all(list(map(convertText, ax.get_xticklabels())) == xticks))

        # All plots should have y labels
        assert_true(len(ax.get_yticklabels()) > 0)

    print(p + facet_wrap("cut", scales=None))
    fig = plt.gcf()

    xticks = fig.axes[0].get_xticks()
    yticks = fig.axes[0].get_yticks()
    # Count the axes of *this* figure instead of reusing the previous count.
    subplots = len(fig.axes)
    for pos, ax in enumerate(fig.axes):
        # Every plot should have the same x and y scales
        assert_true(all(ax.get_xticks() == xticks))
        assert_true(all(ax.get_yticks() == yticks))

        # Repeat the tests for labels from both free_x and free_y
        if subplots - pos > columns:
            assert_true(all(map(empty, ax.get_xticklabels())))
        else:
            assert_true(
                all(list(map(convertText, ax.get_xticklabels())) == xticks))

        if pos % columns == 0:
            assert_true(
                all(list(map(convertText, ax.get_yticklabels())) == yticks))
        else:
            assert_true(all(map(empty, ax.get_yticklabels())))
예제 #58
0
#seaborn
import seaborn as sns
xt1 = pd.crosstab(mtcarsDF.cyl, mtcarsDF.gear)
xt1
sns.heatmap(xt1, cmap='YlGnBu', annot=True, cbar=False)
xt2 = pd.crosstab(index=mtcarsDF.gear, columns=[mtcarsDF.am, mtcarsDF.vs], rownames=['Gear'] , colnames =['AM','VS'])
xt2
sns.heatmap(xt2)
sns.heatmap(xt2, cmap='YlGnBu', annot=True, cbar=False)


#ggplot
#pip install ggplot
from ggplot import *
ggplot(data=mtcarsDF, mapping= aes(x='wt', y='mpg')) + geom_point(colour='r')
#error tslib https://github.com/yhat/ggpy/issues/662

#%% save to/from excel
mtcarsDF.to_csv('mtcars.csv') #check the folder in working dir tab
mtcarsDF.to_excel('mtcars.xlsx', sheet_name='mtcars1')
mtcarsDF.to_clipboard() #clipboard, paste it anywhere



import matplotlib.pyplot as plt
#scatter plot
plt.scatter(x=mtcarsDF.wt, y=mtcarsDF.mpg)
plt.scatter(x='wt', y='mpg', data=mtcarsDF)
plt.scatter(x='wt', y='mpg', data=mtcarsDF, label='MTCars : wt vs mpg')
예제 #59
0
def lineplot_compare():
    """Print a line chart of HR by yearID, one coloured line per teamID.

    The data is read from 'hr_by_team_year_sf_la.csv' in the working directory.
    """
    home_runs = pandas.read_csv('hr_by_team_year_sf_la.csv')
    chart = ggplot(home_runs, aes(x='yearID', y='HR', color='teamID'))
    chart = chart + geom_line()
    print(chart)
예제 #60
0
# print(len(baby_unisex_names)) #10221
baby_unisex_names_str = ', '.join(
    str(unisex_name) for unisex_name in baby_unisex_names)
print("%r is %r" % ("Unisex names: ", str(baby_unisex_names_str)))

# b) Calculate the share of unisex names, relative to all other names
# count occurences of unisex names vs total names

list_all_baby_unisex = list(
    data_frame_baby[data_frame_baby['Name'].isin(baby_unisex_names)]['Name'])
total_baby_unisex = len(list_all_baby_unisex)
print(total_baby_unisex)
print(list_all_baby_unisex)
# data_frame.groupby(['occupation', 'gender']).size()
# “group by” involves split-apply-combine:
#     Splitting the data into groups based on some criteria.
#     Applying a function to each group independently.
#     Combining the results into a data structure.
#

# c) for a unisex name, plot name vs time with legend being gender
# data_frame_baby[data_frame_baby['Gender'] == 'F']
# data_frame_baby.loc[data_frame_baby["Gender"]  == 'F', ["Name"]] #1081683 rows x 1 columns

# data_frame_baby[data_frame_baby['Gender'] == 'Unisex']x, y = np.random.random((2, num))


ggplot(aes(x='x', y='y', color='gender'), data=df) +\
geom_point(size=50) +\
theme_bw()