def boxplot_all(self, prop, show=None, fig=None):
    """Draw one box per condition for property ``prop`` across all logs.

    Values are collected from ``self.all_logs(filter_5=True)`` through
    ``get_class_value(log, prop)``; logs for which that returns ``None`` are
    skipped. Boxes follow the order given by
    ``EcoSonic_VP.condition_order_from_vp_id(1)``.

    Args:
        prop: Property identifier, forwarded to ``get_class_value`` and to
            ``boxplot``.
        show: Forwarded unchanged to ``boxplot``.
        fig: Forwarded unchanged to ``boxplot``.
    """
    values = {}
    for log in self.all_logs(filter_5=True):
        val = get_class_value(log, prop)
        if val is not None:
            values.setdefault(log.condition, []).append(val)

    conditions = EcoSonic_VP.condition_order_from_vp_id(1)
    # A condition without any usable value gets an empty series instead of
    # aborting the whole plot with a KeyError.
    data = [values.get(cond, []) for cond in conditions]
    boxplot(data, conditions, prop, 'condition', show, fig)
Beispiel #2
0
def main():
    """Parse the ping log, optionally restrict it to a date range, and plot it.

    Accepted command-line forms (read from ``sys.argv``):

        <begin>         plot everything starting from ``begin``
        -1 <end>        no begin value; ``end`` is passed as the end argument
        <begin> <end>   both values are passed to ``statistics.createFrame``

    With any other argument count all real pings are plotted.
    """
    if DOWNLOAD:
        print("-- Downloading fresh log file from the server...")
        progression_run(downloader.download, (LOG_NAME,), 0.1)

    # Written without a newline (like the former trailing-comma print) so
    # later output continues on this line; flushed so the message is visible
    # before the parse starts. Single-argument print() calls below behave the
    # same on Python 2 and Python 3.
    sys.stdout.write("-- Parsing Log File... ")
    sys.stdout.flush()
    pings = dict()
    progression_run(parsing, (LOG_NAME, pings), 0.2)
    real_pings = pings['real']
    empty_pings = pings['empty']

    print("-- There are %d real pings :)" % len(real_pings))
    mood = ":(" if len(empty_pings) > 0 else ""
    print("-- There are %d empty pings %s" % (len(empty_pings), mood))

    args = sys.argv[1:]
    if len(args) == 1:
        # One argument: plot everything starting from that date.
        real_pings = statistics.createFrame(real_pings, args[0])
    elif len(args) == 2:
        # '-1' as the first argument means "no begin value".
        begin = None if args[0] == '-1' else args[0]
        real_pings = statistics.createFrame(real_pings, begin, args[1])

    print("-- Generating Plots...")
    plots.pointoverTime(real_pings)
    plots.pointoverPingNum(real_pings)
    plots.statisticalPlot(real_pings)
    plots.boxplot(real_pings, statistics.segDataHours, 'hours')
    plots.boxplot(real_pings, statistics.segDataWeekdays, 'weekdays')
    plots.boxplot(real_pings, statistics.segDataMonths, 'months')
    plt.show()
Beispiel #3
0
        # Save one diagnostic plot of each kind for `df`. Every output goes to
        # its own sub-folder of .\png\plots\ and uses `datasetname` as the
        # file stem.
        # NOTE(review): the paths hard-code Windows separators, and nothing
        # visible here creates the target folders — confirm they exist.

        # The stem-and-leaf display is written as a .txt file; all other
        # plots below are saved as .png.
        plots.stemleaf(df,
                       title='Stem and Leaf',
                       save=True,
                       savepath='.\\png\\plots\\stemleaf\\' + datasetname +
                       '.txt')

        # NOTE(review): close=True presumably closes the figure after saving;
        # the exact semantics live in the `plots` module.
        plots.histogram(df,
                        save=True,
                        savepath='.\\png\\plots\\histogram\\' + datasetname +
                        '.png',
                        close=True)

        plots.boxplot(df,
                      save=True,
                      savepath='.\\png\\plots\\boxplot\\' + datasetname +
                      '.png',
                      close=True)

        plots.scattermatrix(df,
                            save=True,
                            savepath='.\\png\\plots\\scattermatrix\\' +
                            datasetname + '.png',
                            close=True)

        plots.heatmap(df,
                      save=True,
                      savepath='.\\png\\plots\\heatmap\\' + datasetname +
                      '.png',
                      close=True)
Beispiel #4
0
import numpy as np

# Load the training set and take a first look at it.
# NOTE(review): the bare expressions below only display something in an
# interactive session (REPL / notebook); in a plain script they are no-ops.
df = pd.read_csv("train.csv")
df.describe()
df.head()
df.columns  # 7 columns are numeric, 5 are categorical
df.isnull().sum()  # Age and Cabin have missing values
df.dtypes

df.Name

# matplotlib takes the data series themselves as x / y. Passing the whole
# DataFrame first and column names after it (as before) binds the names to
# unrelated positional parameters (y, s, bins, notch) and fails.
plt.scatter(df['Survived'], df['Age'])
plt.scatter(df.index, df['Age'])

# Missing values are dropped before binning / computing quartiles.
# NOTE(review): Cabin is presumably a categorical (string) column; a bar
# chart of df['Cabin'].value_counts() may be closer to what was intended.
plt.hist(df['Cabin'].dropna(), bins=30)
plt.boxplot(df['Age'].dropna())

import plotly.plotly as py
from plotly.graph_objs import *

# NOTE(review): a 'surface' trace normally expects a 2-D z grid; these are
# three 1-D columns, so a 3-D scatter trace may be what was meant — confirm.
data = {'x': df.Age.values,
        'y': df.Fare.values,
        'z': df.Survived.values,
        'type': 'surface'}

fig = Figure(data=data)
py.plot(fig)


# Age vs. Fare, coloured by passenger class.
# NOTE(review): ggplot / aes / geom_point are not imported in the visible
# part of this file — confirm where they come from.
ggplot(df,aes('Age','Fare',color='factor(Pclass)'))+\
geom_point(shape=df.Survived)
Beispiel #5
0
def _print_section(title: str) -> None:
    """Print ``title`` indented by four spaces between two dashed rules."""
    rule = '----------------------------------------------------------------------'
    print(rule)
    print('{0}{1}'.format('    ', title))
    print(rule)


def eda(filepath: str,
        features=None,
        targets=None,
        removeOutliers: bool = False,
        datasetname: str = ''):
    """Run a basic exploratory data analysis on a pickled DataFrame.

    Prints the shape, column names, first/last ten rows and a statistical
    summary, standardizes the numeric columns in place (zero mean, unit
    standard deviation), reports rows having any numeric value more than
    three standard deviations from the mean, and saves box plot, histogram,
    scatter matrix and heat map through the ``plots`` module.

    Args:
        filepath: Path of the pickle file holding the DataFrame.
        features: Feature column name(s). ``None`` selects every column that
            is not a target. A single string is treated as one column name.
        targets: Target column name(s). ``None`` means no targets. A single
            string is treated as one column name.
        removeOutliers: When True, outlier rows are dropped and detection is
            repeated until a pass finds none; when False, a single pass only
            reports the count.
        datasetname: If non-empty, plots go to ``.\\png\\<datasetname>\\eda\\``;
            otherwise to ``.\\png\\``.

    Returns:
        The DataFrame with standardized numeric columns (and with outlier
        rows removed when ``removeOutliers`` is True).
    """
    # NOTE(security): unpickling executes arbitrary code embedded in the
    # file; only load pickles from trusted sources.
    with open(filepath, 'rb') as fl:
        df = pk.load(fl)

    # process inputs: accept None, a single column name, or a sequence
    if targets is None:
        targets = []
    elif isinstance(targets, str):
        targets = [targets]
    if features is None:
        excluded = set(targets)
        features = [col for col in df.columns if col not in excluded]
    elif isinstance(features, str):
        features = [features]
    # NOTE(review): `features` / `targets` are not used further below yet.

    # examine the data
    _print_section('Shape of dataset:')
    print('{0}Number of Rows: {1}'.format('    ', df.shape[0]))
    print('{0}Number of Columns: {1}'.format('    ', df.shape[1]))
    print('', end='\n\n\n')

    _print_section('Column names:')
    for col in df.columns:
        print('{0}{1}'.format('    ', col))
    print('', end='\n\n\n')

    _print_section('First 10 rows:')
    print(df.head(10))
    print('', end='\n\n\n')

    _print_section('Last 10 rows:')
    print(df.tail(10))
    print('', end='\n\n\n')

    _print_section('Statistical Summary:')
    print(df.describe())
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # infer data types of the input DataFrame
    # ----------------------------------------------------------------------
    colNumeric = dfutl.numericColumns(df)

    # ----------------------------------------------------------------------
    # mean centering and scaling: standardize or normalize
    # ----------------------------------------------------------------------
    dfNumeric = df.loc[:, colNumeric]
    df.loc[:, colNumeric] = (dfNumeric - dfNumeric.mean()) / dfNumeric.std()
    dfNumeric = df.loc[:, colNumeric]

    # ----------------------------------------------------------------------
    # outlier detection
    # ----------------------------------------------------------------------
    # use z-score filtering
    # samples that are more than 3 standard deviations away from mean are to
    # be discarded
    _print_section('Outlier Detection:')
    numouttotal = 0
    passNum = 0
    while True:
        # a row is an outlier if any of its numeric z-scores lies outside
        # [-3, 3]
        zscores = stats.zscore(dfNumeric)
        idx = np.logical_not(np.logical_or(zscores < -3, zscores > 3))
        idxrows = np.all(idx, axis=1)
        numout = int(np.sum(np.logical_not(idxrows)))

        print('{0}Pass {1} detected {2} outliers'.format(
            '    ', passNum, numout))
        # report-only mode stops after one pass; removal mode stops once a
        # pass finds nothing left to remove
        if not removeOutliers or numout == 0:
            break

        # remove outliers and continue with the reduced data set
        df = df.loc[idxrows, :]
        dfNumeric = df.loc[:, colNumeric]
        numouttotal += numout
        passNum += 1
    if removeOutliers:
        print('{0}Total number of outliers: {1}'.format('    ', numouttotal))
    print('', end='\n\n\n')

    # ----------------------------------------------------------------------
    # visualization
    # ----------------------------------------------------------------------
    plt.close('all')

    save = True
    if len(datasetname) > 0:
        savepath = '.\\png\\{0}\\eda\\'.format(datasetname)
    else:
        savepath = '.\\png\\'
    # the plots are always saved, so the target folder must exist in both
    # cases
    os.makedirs(savepath, exist_ok=True)

    plots.boxplot(dfNumeric, save=save, savepath=savepath)
    plots.histogram(df, tightLayout=True, save=save, savepath=savepath)
    plots.scattermatrix(dfNumeric, save=save, savepath=savepath)
    plots.heatmap(dfNumeric, correlation=0.5, save=save, savepath=savepath)

    plt.close('all')

    return df
Beispiel #6
0
    # Numerical summaries of data
    print(df.describe())

    # Save one diagnostic plot of each kind for `df` under .\visual\, using
    # fixed iris_* file names.
    # NOTE(review): the paths hard-code Windows separators, and nothing
    # visible here creates the .\visual\ folder — confirm it exists.

    # The stem-and-leaf display is written as a .txt file; the plots below
    # are saved as .png.
    plots.stemleaf(df
        ,title = 'Stem and Leaf'
        ,save = True
        ,savepath = '.\\visual\\iris_stemleaf.txt')

    # NOTE(review): close = True presumably closes the figure after saving;
    # the exact semantics live in the `plots` module.
    plots.histogram(df
        ,save = True
        ,savepath = '.\\visual\\iris_histogram.png'
        ,close = True)

    plots.boxplot(df
        ,save = True
        ,savepath = '.\\visual\\iris_boxplot.png'
        ,close = True)

    plots.scattermatrix(df
        ,save = True
        ,savepath = '.\\visual\\iris_scattermatrix.png'
        ,close = True)

    plots.heatmap(df
        ,save = True
        ,savepath = '.\\visual\\iris_heatmap.png'
        ,close = True)

    plots.probplot(df
        ,save = True
        ,savepath = '.\\visual\\iris_probplot.png'