def run(self):
		df = pd.read_csv(self.input().path)

		# assuming the host (IP) identifies a unique user; otherwise, just adjust the id concatenation
		df['id'] = df['Date_Time'].map(str) + df['URL']
		df['Rank'] = df.groupby(['Host'])['id'].rank(ascending=True)

		df['Date_Time'] = pd.to_datetime(df['Date_Time'])
		df['time_diff'] = df.groupby('Host')['Date_Time'].diff()
		df['time_diff'] = df['time_diff'].fillna(0)

		df['year'] = pd.DatetimeIndex(df['Date_Time']).year
		df['month'] = pd.DatetimeIndex(df['Date_Time']).month
		df['day'] = pd.DatetimeIndex(df['Date_Time']).day
		df['hour'] = pd.DatetimeIndex(df['Date_Time']).hour

		df['day_of_week'] = df['Date_Time'].dt.dayofweek
		days = {0:'Lunes',1:'Martes',2:'Miercoles',3:'Jueves',4:'Viernes',5:'Sabado',6:'Domingo'}
		df['day_of_week'] = df['day_of_week'].apply(lambda x: days[x])


		df['dif_seg_clicks'] = df['time_diff'].apply(lambda x: x  / np.timedelta64(1,'s')).astype('int64') % (24*60)

		# TODO: move this into functions
		def subconjunto(x, par):
		    # keep only clicks separated by at most `par` minutes
		    return x[x.dif_seg_clicks < par * 60 + 1]

		df1=subconjunto(df,self.par)
		pd.save(df1,self.output().path)
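
# pd.save/pd.load throughout these examples are the legacy module-level pickle
# helpers from pandas < 0.12; DataFrame.to_pickle and pd.read_pickle replace
# them (Example #25 below branches on pd.__version__ for exactly this reason).
# A version-agnostic sketch of the save step:
import pandas as pd

def save_frame(df, path):
    try:
        df.to_pickle(path)    # pandas >= 0.12
    except AttributeError:
        pd.save(df, path)     # legacy helper on 0.7-era pandas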
Example #2
def addreport(request, pk):
    therapist = request.user.getTherapist()
    patient = get_object_or_404(Patient, pk=pk)
    if patient.Therapist != therapist:
        return HttpResponseRedirect(reverse('access-restricted'))
    if request.method == "POST":
        form = AddReportForm(request.POST, request.FILES)
        if form.is_valid():
            pd = Patient_Data(user_ID=patient,
                              Name=form.cleaned_data['Name'],
                              FilePath=form.cleaned_data['FilePath'])
            pd.save()
            return HttpResponseRedirect(
                reverse("view-report", kwargs={'pk': pk}))
    else:
        form = AddReportForm()
    context = {
        'heading': 'Add Report',
        "name": therapist.Name,
        "nUnread": therapist.getNumberOfUnreadLogs,
        "sidebarOptions": therapistPatientOptions,
        "patient": patient,
        "form": form
    }
    return render(request, 'sehatagahiapp/add-report.html', context)
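
# AddReportForm is not shown in this example; a minimal form consistent with
# the two cleaned_data fields the view reads. The field types and length limit
# are assumptions, not the project's actual definition:
from django import forms

class AddReportForm(forms.Form):
    Name = forms.CharField(max_length=200)   # hypothetical length limit
    FilePath = forms.FileField()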
	def run(self):
		df=pd.read_pickle(self.input().path)
		#df=pd.read_pickle(self.output().path)
		df=df.drop_duplicates(['Host', 'Date_Time','URL'])
		df = df.sort(['Host', 'Date_Time','Response_Code'], ascending=[1,1,0])
		df["Date_Time"] = df["Date_Time"].map(lambda x: str(x)[0:20])
		df = df.drop(df.index[[len(df)-1]])

		print df.head()

		pd.save(df,self.output().path)  
def hysplit_pandas_generator(aerofilt_dir):
    #tool for extracting data from hysplit trajectory files and putting it into
    #a pandas dataframe for storage in float16 format
    import pandas as pan
    import os, sys

    data_cats = ('delta_t', 'lat', 'lon', 'alt', 'press')

    startdir = os.getcwd()

    os.chdir(aerofilt_dir)

    hysplit_files = os.listdir(os.getcwd())

    print 'Generating Hysplit dataframe ...'

    for h in hysplit_files:

        if pickle_test(h, hysplit_files):

            #import hysplit text file
            [head, data] = hysplit_import(h)

            #create dictionary with {varname: array} based on column names

            output_data = [
                data[:, 8], data[:, 9], data[:, 10], data[:, 11], data[:, 12]
            ]

            output_dict = dict(zip(data_cats, output_data))

            #create datetime index

            dates = []

            for n in range(0, len(data[:, 2])):
                # datetime needs plain ints; the HYSPLIT arrays are float
                yr = int(data[n, 2])
                mn = int(data[n, 3])
                dy = int(data[n, 4])
                hr = int(data[n, 5])
                dates.append(pan.datetime(yr, mn, dy, hr))

            ind = pan.DatetimeIndex(dates)

            df_out = pan.DataFrame(output_dict, index=ind)

            #save it as a .pickle file

            savename = h.split('.')[0]

            pan.save(df_out, savename + '.pickle')

    os.chdir(startdir)
    print '... Done'
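
# The date loop above calls pan.datetime once per row; with pandas >= 0.18 the
# same index can be built in one vectorized call. A sketch, assuming the same
# HYSPLIT column layout (year, month, day, hour in columns 2-5) and four-digit
# years:
import pandas as pan

def hysplit_dates(data):
    # pan.to_datetime accepts a frame keyed by year/month/day/hour
    parts = pan.DataFrame({'year': data[:, 2].astype(int),
                           'month': data[:, 3].astype(int),
                           'day': data[:, 4].astype(int),
                           'hour': data[:, 5].astype(int)})
    return pan.DatetimeIndex(pan.to_datetime(parts))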
	def run(self):
		df=pd.read_pickle(self.input().path)
		#
		df['Date_Time'] =  pd.to_datetime(df['Date_Time'], format='%d/%b/%Y:%H:%M:%S')
		df['id'] = df['Date_Time'].map(str) + df['Response_Code'].map(str) + df['URL']
		df['Rank'] = df.groupby(['Host'])['id'].rank(ascending=True)
		#df['Date_Time'] = pd.to_datetime(df['Date_Time'])
		df = df.sort(['Host', 'Date_Time','Rank'], ascending=[1,1,0])
		df['time_diff'] = df.groupby('Host')['Date_Time'].diff()
		df['time_diff'] = df['time_diff'].fillna(0)

		print df.head()

		pd.save(df,self.output().path)  
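
# df.sort with a column list is the pre-0.17 pandas API; a sketch of the same
# ordering and per-host gap computation against a current pandas:
import pandas as pd

def order_clicks(df):
    df = df.sort_values(['Host', 'Date_Time', 'Rank'],
                        ascending=[True, True, False])
    df['time_diff'] = df.groupby('Host')['Date_Time'].diff()
    df['time_diff'] = df['time_diff'].fillna(pd.Timedelta(0))
    return df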
Example #8
def traceproc(aerofilt_dir):
    import hysplit_tools as tools
    import os, sys
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()

    topdir = aerofilt_dir

    os.chdir(topdir)

    data_files = os.listdir(os.getcwd())

    d_mean = []
    d_std = []
    t_mean = []
    t_std = []
    endpos_mean = []
    endpos_std = []
    start_time = []
    station = []

    #run through all location folders
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            tracefile = f + 'traceback'
            #open traceback file
            trace_df = pan.load(tracefile)

            #create a separate dict of lists for each day and put those into a
            #list called dictlist

            dates = trace_df.index
            keys = trace_df.columns

            by = lambda x: lambda y: getattr(y, x)

            trace_mean = trace_df.groupby([by('month'), by('day')]).mean()
            trace_std = trace_df.groupby([by('month'), by('day')]).std()

            os.chdir('..')

    # NOTE: df_out is never constructed in this snippet; its assembly from the
    # per-folder trace_mean/trace_std is elided.
    pan.save(df_out, 'Hyproc.pickle')

    os.chdir(startdir)
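
# The nested lambda above groups rows by attributes of their timestamps; read
# directly off the index, the same daily aggregation is shorter. A sketch,
# assuming trace_df carries a DatetimeIndex and numeric columns:
def daily_stats(trace_df):
    grouped = trace_df.groupby([trace_df.index.month, trace_df.index.day])
    return grouped.mean(), grouped.std()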
	def run(self):
		df=pd.read_pickle(self.input().path)
		#
		df['year'] = pd.DatetimeIndex(df['Date_Time']).year
		df['month'] = pd.DatetimeIndex(df['Date_Time']).month
		df['day'] = pd.DatetimeIndex(df['Date_Time']).day
		df['hour'] = pd.DatetimeIndex(df['Date_Time']).hour
		df["date"] =  pd.DatetimeIndex(df['Date_Time']).date
		df['day_of_week'] = df['Date_Time'].dt.dayofweek
		days = {0:'Lunes',1:'Martes',2:'Miercoles',3:'Jueves',4:'Viernes',5:'Sabado',6:'Domingo'}
		df['day_of_week'] = df['day_of_week'].apply(lambda x: days[x])
		df['dif_seg_clicks'] = df['time_diff'].apply(lambda x: x  / np.timedelta64(1,'s')).astype('int64') % (24*60)
		df.loc[df.dif_seg_clicks == 0, ['dif_seg_clicks']] = 1
		print self.input().path

		pd.save(df,self.output().path)  

		df.to_csv('%s.csv' % self.input().path)
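
# Dividing by np.timedelta64(1, 's') converts the click gaps to seconds; the
# .dt accessor (pandas >= 0.15) expresses the same conversion directly. A
# sketch, assuming time_diff is a timedelta64 column:
def click_gap_seconds(time_diff):
    return time_diff.dt.total_seconds().astype('int64')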
Example #10
def main():

    # parse arguments
    parser = ArgumentParser()
    parser.add_argument("-J", "--jobname", dest="job_name",
        type=str, help="job name")
    parser.add_argument("-SF", "--sequencefile", dest="sequence_file",
        type=str, help="sequence file")
    parser.add_argument("-FF", "--featurefile", dest="feature_file",
        type=str, help="feature file")
    parser.add_argument("-NC", "--numchunks", dest="num_chunks",
        type=int, help="number of chunks")
    parser.add_argument("-CP", "--chunkprefix", dest="chunk_prefix",
        type=str, help="input chunk prefix")
    parser.add_argument("-OF", "--output", dest="output_file",
        type=str, help="output file")
    parser.add_argument("-DC", "--delete_chunks", dest="delete_chunks",
        type=int, help="delete chunks? (0 = no, 1 = yes (default)",
        default=1)
    args = parser.parse_args()

    # load sequences
    seqrecords = load_sequences(args.sequence_file)
    seqids = [seq.id for seq in seqrecords]

    # load features
    ff = open(args.feature_file, 'r').readlines()
    ff = [fi.strip() for fi in ff]

    # initialize dataframe
    X = pd.DataFrame(columns=ff, index=seqids, dtype='bool')
    
    # load chunks and place in appropriate spot
    for i in range(1, args.num_chunks+1):
        chunk_file = '%s.%d.pkl' % (args.chunk_prefix, i)
        chunk = pd.load(chunk_file) 
        X.ix[chunk.index, chunk.columns] = chunk.as_matrix()
        if args.delete_chunks:
            os.remove(chunk_file)

    # save
    pd.save(X, args.output_file)
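
# pd.load and DataFrame.ix are both long gone; a sketch of the same chunk
# assembly against a current pandas, keeping the snippet's file naming:
import pandas as pd

def assemble_chunks(X, chunk_prefix, num_chunks):
    for i in range(1, num_chunks + 1):
        chunk = pd.read_pickle('%s.%d.pkl' % (chunk_prefix, i))
        # .loc replaces .ix; to_numpy() replaces as_matrix()
        X.loc[chunk.index, chunk.columns] = chunk.to_numpy()
    return X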
Example #12
def aeronet_pandas_generator(aerofile, aerofilt_dir):
    #tool for extracting values of interest from aeronet data file, putting it into a
    #dictionary and saving it as a pandas dataframe
    import os, sys
    import pandas as pan
    import numpy as np

    startdir = os.getcwd()

    print 'Generating Aeronet dataframe ...'

    output_folder = os.path.split(aerofilt_dir)[1]

    keylist = ['Inflection_Point[um]','VolCon-T','EffRad-T','VolMedianRad-T','StdDev-T',\
                'VolCon-F','EffRad-F','VolMedianRad-F','StdDev-F','VolCon-C','EffRad-C',\
               'VolMedianRad-C','StdDev-C',\
               '0.050000','0.065604','0.086077','0.112939','0.148184','0.194429','0.255105',\
               '0.334716','0.439173','0.576227','0.756052','0.991996','1.301571','1.707757',\
               '2.240702','2.939966','3.857452','5.061260','6.640745','8.713145','11.432287',\
               '15.000000']

    filename = 'Aerostats_' + output_folder + '.pickle'

    newdict = aeronet_extract(aerofile, keylist)

    #convert dates to datetime index
    dates = newdict['Date']
    del newdict['Date']

    df_out = pan.DataFrame(newdict, index=dates)

    os.chdir(aerofilt_dir)

    pan.save(df_out, filename)

    os.chdir(startdir)

    print '... Done'
Example #13
def CreateDataFrames(examples, features, outNum):
    """ Translates a CSV file into a pandas.DataFrame
    
    Arguments:
        examples    - Name of the CSV containing the data

        features    - Name of the file containing the column names for the CSV

        outNum      - Number to label the output files with

    Returns:
        (train, test, tune), each pandas.DataFrame's containing data sets for
        training, testing, and tuning broken up as 70 / 20 / 10 percent of the
        data.
    """

    values = np.genfromtxt(examples, delimiter=',')

    fh = open(features, 'rb')
    reader = csv.reader(fh)
    feats = reader.next()
    fh.close()

    df = pd.DataFrame(values, columns=feats)

    # Turn the features into categorical ones
    Categoricalize(df, feats)

    # Turn labels into +/- 1
    df['Winner'] = np.sign(df['Winner'] - 1.5)

    # Following code adapted from PA01
    nsamples = df.shape[0]
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))

    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples)  # 0-based, matching the frame's index
    np.random.shuffle(all_indices)
    test_indices = all_indices[:ntest]
    tune_indices = all_indices[ntest:(ntest+ntune)]
    train_indices = all_indices[(ntest+ntune):]

    train = df.ix[train_indices,:]
    tune = df.ix[tune_indices,:]
    test = df.ix[test_indices,:]

    pd.save(train, 'data/train/train' + outNum + '.pdat')
    pd.save(tune, 'data/train/tune' + outNum + '.pdat')
    pd.save(test, 'data/test/test' + outNum + '.pdat')

    return train, tune, test
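
# The shuffled-index split above amounts to a single np.split call; a sketch
# with the same fixed seed and 20/10/70 test/tune/train proportions:
import numpy as np

def split_indices(nsamples, seed=1):
    rng = np.random.RandomState(seed)
    idx = rng.permutation(nsamples)
    ntest = int(np.floor(0.2 * nsamples))
    ntune = int(np.floor(0.1 * nsamples))
    test, tune, train = np.split(idx, [ntest, ntest + ntune])
    return train, tune, test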
Example #14
def aeroproc(aerofilt_dir):
    import hysplit_tools as tools
    import os, sys
    import scipy.io
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()

    topdir = aerofilt_dir

    os.chdir(topdir)

    data_files = os.listdir(os.getcwd())

    total_mean = []
    total_std = []
    fine_mean = []
    fine_std = []
    coarse_mean = []
    coarse_std = []
    inpoint = []
    numdist_mean = []
    numdist_std = []
    station = []
    date = []
    diameters = []

    #run through all location folders
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            aerofile = 'Aerostats_' + f
            #open traceback file
            aerodict = scipy.io.loadmat(aerofile)

            #create a separate dict of lists for each day and put those into a
            #list called dictlist

            dates = aerodict['Date']

            keys = aerodict.keys()

            oldday = dates[0][-2]

            tempdict = dict()
            dictlist = []

            #ignore these keys that come attached to the dictionary from loadmat
            rejectlist = ['__globals__', '__header__', '__version__']

            for k, v in aerodict.iteritems():
                if k == 'Diameters':
                    tempdict[k] = []
                    tempdict[k].append(v)
                elif k == 'Numdist':
                    tempdict[k] = []
                    tempdict[k].append(v[:, 0])
                elif k not in rejectlist:
                    tempdict[k] = []
                    tempdict[k].append(v[0])

            for n in range(1, len(dates)):
                newday = dates[n][-2]
                if newday == oldday:
                    for k, v in aerodict.iteritems():
                        if k == 'Diameters':
                            tempdict[k].append(v)
                        elif k == 'Numdist':
                            tempdict[k].append(v[:, n])
                        elif k not in rejectlist:
                            tempdict[k].append(v[n])
                    oldday = newday
                else:
                    dictlist.append(tempdict)
                    tempdict = dict()
                    for k, v in aerodict.iteritems():
                        if k == 'Diameters':
                            tempdict[k] = []
                            tempdict[k].append(v)
                        elif k == 'Numdist':
                            tempdict[k] = []
                            tempdict[k].append(v[:, n])
                        elif k not in rejectlist:
                            tempdict[k] = []
                            tempdict[k].append(v[n])
                    oldday = newday

            dictlist.append(tempdict)
            #generate mean daily values for each element of the dictionaires

            for line in dictlist:
                total_mean.append([np.mean(line['EffRad-T']),np.mean(line['VolMedianRad-T']),\
                               np.mean(line['VolCon-T']),np.mean(line['StdDev-T'])])
                total_std.append([
                    np.std(line['EffRad-T']),
                    np.std(line['VolMedianRad-T']),
                    np.std(line['VolCon-T']),
                    np.std(line['StdDev-T'])
                ])

                fine_mean.append([
                    np.mean(line['EffRad-F']),
                    np.mean(line['VolMedianRad-F']),
                    np.mean(line['VolCon-F']),
                    np.mean(line['StdDev-F'])
                ])
                fine_std.append([
                    np.std(line['EffRad-F']),
                    np.std(line['VolMedianRad-F']),
                    np.std(line['VolCon-F']),
                    np.std(line['StdDev-F'])
                ])

                coarse_mean.append([
                    np.mean(line['EffRad-C']),
                    np.mean(line['VolMedianRad-C']),
                    np.mean(line['VolCon-C']),
                    np.mean(line['StdDev-C'])
                ])
                coarse_std.append([
                    np.std(line['EffRad-C']),
                    np.std(line['VolMedianRad-C']),
                    np.std(line['VolCon-C']),
                    np.std(line['StdDev-C'])
                ])

                inpoint.append(np.mean(line['Inflection_Point[um]']))

                numdist_mean.append(np.mean(line['Numdist'], axis=0))
                numdist_std.append(np.std(line['Numdist'], axis=0))

                station.append(f)

                date.append(np.mean(line['Date'], axis=0))

                diameters.append(line['Diameters'])

            os.chdir('..')

    output_dict = {'numdist_mean':numdist_mean,'numdist_std':numdist_std,\
                   'total_mean':total_mean,'total_std':total_std,'fine_mean':fine_mean,\
                   'fine_std':fine_std,'coarse_mean':coarse_mean,'coarse_std':coarse_std,\
                   'inpoint':inpoint,'diameters':diameters}

    #convert dates to datetime objects: CURRENTLY ONLY WORKS FOR MARCH/APRIL!

    ind_date = []

    for d in date:
        year = int(d[0])
        month = int(d[1])
        day = int(d[2])
        hour = int(d[3])

        if hour == 24:
            hour = 0
            if month == 3:
                if day == 31:
                    day = 1
                    month += 1
                else:
                    day += 1
            else:
                if day == 30:
                    day = 1
                    month += 1
                else:
                    day += 1

            print 'Date is %i %i %i' % (month, day, hour)

        ind_date.append(pan.datetime(year, month, day, hour))

    #create multi-index tuples

    # pair each station entry with its own date (not the cartesian product)
    ind = list(zip(station, ind_date))

    multi = pan.MultiIndex.from_tuples(ind, names=['Station', 'Date'])

    df = pan.DataFrame(output_dict, index=multi)

    pan.save(df, 'Aeroproc.pickle')

    os.chdir(startdir)
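
# station and ind_date are parallel per-row lists, so the index is built
# pairwise; MultiIndex.from_arrays expresses that directly. A sketch:
import pandas as pan

def station_date_index(station, ind_date):
    return pan.MultiIndex.from_arrays([station, ind_date],
                                      names=['Station', 'Date'])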
def traceproc(aerofilt_dir):
    import hysplit_tools_v2 as tools
    import os, sys
    import scipy.io
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()

    topdir = aerofilt_dir

    os.chdir(topdir)

    data_files = os.listdir(os.getcwd())

    d_mean = []
    d_std = []
    t_mean = []
    t_std = []
    endpos_mean = []
    endpos_std = []
    start_time = []
    station = []

    #run through all location folders
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            tracefile = f+'traceback'
            #open traceback file
            tracedict = scipy.io.loadmat(tracefile)

            #create a separate dict of lists for each day and put those into a
            #list called dictlist

            dates = tracedict['start_date']
            keys = tracedict.keys()

            oldday = dates[0][2]

            tempdict = dict()

            dictlist = []

            #ignore these keys that come attached to the dictionary from loadmat
            rejectlist = ['__globals__','__header__','__version__']

            for k,v in tracedict.iteritems():
                if k not in rejectlist:
                    tempdict[k] = []
                    tempdict[k].append(v[0])

            for n in range(1,len(dates)):
                newday = dates[n][2]
                if newday == oldday:
                    for k,v in tracedict.iteritems():
                        if k not in rejectlist:
                            tempdict[k].append(v[n])
                    oldday = newday
                else:
                    dictlist.append(tempdict)
                    tempdict = dict()
                    for k,v in tracedict.iteritems():
                        if k not in rejectlist:
                            tempdict[k] = []
                            tempdict[k].append(v[n])               
                    oldday = newday

            dictlist.append(tempdict)
            
            #generate mean daily values for each element of the dictionaires
            
            for line in dictlist:
                d_mean.append(np.mean(line['delta_d']))
                d_std.append(np.std(line['delta_d']))
                t_mean.append(np.mean(line['delta_t']))
                t_std.append(np.std(line['delta_t']))

                endpos_mean.append(np.mean(line['end_loc'],axis=0))
                endpos_std.append(np.std(line['end_loc'],axis=0))
                start_time.append(line['start_date'][0])
                station.append(str(line['station'][0]))

            os.chdir('..')

    output_dict = {'d_mean':d_mean,'d_std':d_std,'t_mean':t_mean,'t_std':t_std,\
                   'endpos_mean':endpos_mean,'endpos_std':endpos_std}

    
    #convert dates to datetime objects
    
    ind_date = []
    
    for date in start_time:
        year = int(date[0])
        month = int(date[1])
        day = int(date[2])
        hour = int(date[3])
        ind_date.append(pan.datetime(year,month,day,hour))

    #create multi-index tuples 

    # pair each station entry with its own date (not the cartesian product)
    ind = list(zip(station, ind_date))

    multi = pan.MultiIndex.from_tuples(ind, names=['Station','Date'])

    df = pan.DataFrame(output_dict, index = multi)

    pan.save(df,'Hyproc.pickle')

    os.chdir(startdir)
Example #16
    multi_survival_d['log_pred'] = e['m_multi_survival_log_pred']
    multi_survival_d['RMSE'] = e['m_multi_survival_RMSE']
    multi_survival_d['MAE'] = e['m_multi_survival_MAE']
    multi_survival_d['log_likelihood'] = e['m_multi_survival_log_likelihood']
    multi_survival_d['prediction'] = e['m_multi_survival_predictions']

    exs_df = exs_df.append(gauss_d, ignore_index=True)
    exs_df = exs_df.append(stut_d, ignore_index=True)
    exs_df = exs_df.append(survival_d, ignore_index=True)
    exs_df = exs_df.append(laplace_stut_d, ignore_index=True)
    exs_df = exs_df.append(laplace_survival_d, ignore_index=True)
    exs_df = exs_df.append(multi_gauss_d, ignore_index=True)
    exs_df = exs_df.append(multi_stut_d, ignore_index=True)
    exs_df = exs_df.append(multi_survival_d, ignore_index=True)

pd.save(exs_df, filename)

def print_results(df):
    means_multi_stut_log_pred = df['m_multi_stut_log_pred'].apply(np.mean)
    means_stut_log_pred = df['m_stut_log_pred'].apply(np.mean)
    means_gauss_log_pred = df['m_gauss_log_pred'].apply(np.mean)
    means_multi_gauss_log_pred = df['m_multi_gauss_log_pred'].apply(np.mean)

    means_multi_stut_RMSE = df['m_multi_stut_RMSE'].apply(np.mean)
    means_stut_RMSE = df['m_stut_RMSE'].apply(np.mean)
    means_gauss_RMSE = df['m_gauss_RMSE'].apply(np.mean)
    means_multi_gauss_RMSE = df['m_multi_gauss_RMSE'].apply(np.mean)

    means_multi_stut_MAE = df['m_multi_stut_MAE'].apply(np.mean)
    means_stut_MAE = df['m_stut_MAE'].apply(np.mean)
    means_gauss_MAE = df['m_gauss_MAE'].apply(np.mean)
Example #17
    def put(self, name, obj):
        filename = self.get_filename(name)
        pd.save(obj, filename)
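
    # A put this small usually has a symmetric get; a hypothetical counterpart
    # (get_filename is the class's own helper, pd.load the legacy reader):
    def get(self, name):
        filename = self.get_filename(name)
        return pd.load(filename)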
Example #19
    def put(self, name, obj):
        self.cache_dir_check()

        filename = self.get_filename(name)
        pd.save(obj, filename)
        return filename
Example #20
name = "Gemm"
filename = "Gemm.dat"
if len(sys.argv[1:]) > 0:
    name = sys.argv[1]
    filename = name + ".dat"
if len(sys.argv[1:]) > 1:
    filename = sys.argv[2]

if len(sys.argv[1:]) > 2:
    (NB, MB, VP, nW) = map(int, sys.argv[3].split(':'))
else:
    NB = 256
    MB = 256
    VP = 96
    nW = 2

if len(sys.argv[1:]) > 3:
    sizes = map(int, sys.argv[4:])
else:
    sizes = [10, 20, 40, 60, 80, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300]

print "running with params mB=%d, nB=%d, nVP=%d nWRK=%d sizes: %s" %(MB, NB, VP, nW, sizes)
pdata = performance_test(name, sizes, NB, MB, VP, nW)
series = Series(pdata)
print "GFlops/s: %s" % name
print series

if filename[0] != "-":
    pandas.save(series, filename)
def traceback(aerofilt_dir):
    import hysplit_tools as tools
    import numpy as np
    import pandas as pan
    import os, sys
    import math
    import datetime as dt
    import scipy.io  # used below by loadmat

    Tman_coord = [39.0,84.0]
    Tman_a = 800 #km semi-major axis
    Tman_b = 300 #km semi-minor axis
    Tman_tilt = 70 #degrees between North and major axis

    Gobi_coord = [43.0,106.0]
    Gobi_a = 1000 #km semi_major axis
    Gobi_b = 500
    Gobi_tilt = 65 #degrees between North and major axis

    G_maxlon = 118
    G_minlon = 95

    G_maxlat = 49
    G_minlat = 37

    T_maxlon = 93
    T_minlon = 75

    T_maxlat = 43
    T_minlat = 35


    #load .mat file

    [aerofilt_topdir, station_id] = os.path.split(aerofilt_dir)
    

    # start_date is used as the index below, so it is not part of the columns
    data_keys = ('station', 'start_loc', 'end_loc', 'delta_t', 'delta_d', 'desert_tag')
    startdir = os.getcwd()
    os.chdir(aerofilt_dir)
    output_filename = station_id+'traceback'

    hysplit_files = os.listdir(os.getcwd())

    data_index = []
    start_date = []
    start_loc = []
    end_loc = []
    delta_t = []
    station = []
    delta_d = []
    desert_tag = []
    files = 0
    tracks = 0
    G = 0
    T = 0

    print 'Traceback is Processing %s folder' %station_id
    
    for h in hysplit_files:

        filetest = h.split('.')

        try:
            if filetest[1] == 'mat' and not('Aerostats' in h or 'traceback' in h):
                files += 1
                hysplit_dict = scipy.io.loadmat(h)

                lat = hysplit_dict['lat']
                lon = hysplit_dict['lon']

                data_index = []
                d_total = 0

                tracks = tracks + len(lat)
                
                for n in range(1,len(lat)):

                    [d_temp,theta_temp] = tools.haversine(lat[n-1],lon[n-1],lat[n],lon[n])
                    d_total = d_total + d_temp
                    
                    if G_minlat < lat[n] < G_maxlat:
                        if G_minlon < lon[n] < G_maxlon:

                            [G_range,G_theta] = tools.haversine(lat[n],lon[n],Gobi_coord[0],Gobi_coord[1])

                            G_rad = tools.ellipserad(Gobi_a,Gobi_b,G_theta,Gobi_tilt)

                            if G_range < G_rad:
                                data_index.append(n)
                                delta_d.append(d_total)
                                desert_tag.append('Gobi')
                                G += 1
                                
                    if T_minlat < lat[n] < T_maxlat:
                        if T_minlon < lon[n] < T_maxlon:

                            [T_range,T_theta] = tools.haversine(lat[n],lon[n],Tman_coord[0],Tman_coord[1])
                            T_rad = tools.ellipserad(Tman_a,Tman_b,T_theta,Tman_tilt)

                            if T_range < T_rad:
                                data_index.append(n)
                                delta_d.append(d_total)
                                desert_tag.append('Tman')
                                T += 1
                        
                for n in range(0,len(data_index)):
                    tempdate = dt.datetime(hysplit_dict['year'][0],hysplit_dict['month'][0],hysplit_dict['day'][0],\
                                     hysplit_dict['hour'][0])
                    start_date.append(tempdate)
                    start_loc.append((lat[0],lon[0]))
                    end_loc.append((lat[data_index[n]],lon[data_index[n]]))
                    delta_t.append(hysplit_dict['delta_t'][data_index[n]])
                    station.append(station_id)
            
        except IndexError:
            pass

    output_data = (station,start_loc,end_loc,delta_t,delta_d,desert_tag)

    output_dict = dict(zip(data_keys,output_data))

    df_out = pan.DataFrame(output_dict, index = start_date)

    pan.save(df_out, output_filename)
           
    os.chdir(startdir)

    print 'Out of %i files, %i total tracks checked' %(files,tracks)
    G_percent = 100.0*G/tracks
    T_percent = 100.0*T/tracks
    print '%0.3f percent of tracks passed over Gobi (or %i total)' %(G_percent, G)
    print '%0.3f percent of tracks passed over Taklimikan (or %i total)' %(T_percent, T)
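
# traceback leans on tools.haversine for great-circle distance; a minimal
# self-contained version for reference, assuming the same return convention of
# (distance in km, initial bearing in degrees) that the snippet unpacks:
import math

def haversine(lat1, lon1, lat2, lon2, r_earth=6371.0):
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlam = math.radians(lon2 - lon1)
    a = (math.sin(dphi / 2) ** 2
         + math.cos(p1) * math.cos(p2) * math.sin(dlam / 2) ** 2)
    d = 2 * r_earth * math.asin(math.sqrt(a))
    theta = math.degrees(math.atan2(
        math.sin(dlam) * math.cos(p2),
        math.cos(p1) * math.sin(p2) - math.sin(p1) * math.cos(p2) * math.cos(dlam)))
    return d, theta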
Example #22
def main():
    
    #############################################
    # Set up the data as per the first Practicum
    #############################################
    
    spam_values = np.genfromtxt('../input_data/spambase.data', delimiter=',')
    fl = open('../input_data/spambase.names', 'r')
    lines = [line.strip() for line in fl] # J : strip from beginning and ending whitespace
    fl.close()
    
    colnames = [line.partition(':')[0] for line in lines if not (len(line) == 0 or line[0] == '|' or line[0] == '1')]
    colnames.append('spam')
    
    spam_df = pd.DataFrame(spam_values,columns=colnames)
    spam_df['spam']=2*spam_df['spam']-1
    
    # J: DataFrame.shape is a tuple; its first element is the number of rows
    nsamples = spam_df.shape[0]
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))
    
    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples) 
    # J: important to shuffle so that you don't know which portion is training, which is testing and which is tuning data
    np.random.shuffle(all_indices) 
    test_indices = all_indices[:ntest] # J: Get shuffled test indices first
    tune_indices = all_indices[ntest:(ntest+ntune)] # J: tune indices second
    train_indices = all_indices[(ntest+ntune):] # J: train indices (the majority) last
    
    # J : now that the "*indices" arrays have been shuffled, you can actually draw the relevant data through
    # DataFrame.ix. The second argument includes all columns, labels included.
    spam_train = spam_df.ix[train_indices,:]
    spam_tune = spam_df.ix[tune_indices,:]
    spam_test = spam_df.ix[test_indices,:]
    
    pd.save(spam_train, '../proc_data/training_data/spam_train.pdat')
    pd.save(spam_tune, '../proc_data/training_data/spam_tune.pdat')
    pd.save(spam_test, '../proc_data/testing_data/spam_test.pdat')
    
    
    #######################################################################
    # See how features are sorted according to their Information Gain score
    #######################################################################
    
    # atestTree = DecisionTree(spam_train, 5, True)
    # print atestTree.__sortFeatures__(spam_train, spam_train.columns)
    
    ###############################################
    #  Training classifiers and saving them on disk
    ###############################################
    
    # Already trained those two, it took about 4 hours total. 
     
#    majVoteTree = DecTree.DecisionTree(spam_train, 5, False)
#    print "Tuning a majority vote classifier on all depths between 1 and 15 inclusive."
#    majVoteTree.tune(spam_tune,1, 15)
#    print "Saving this classifier to disk."
#    majVoteTree.dump("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
#    
#    IGTree = DecTree.DecisionTree(spam_train, 5, True)
#    print "Tuning an information gain classifier on all depths between 1 and 15 inclusive."
#    IGTree.tune(spam_tune,1, 15)
#    print "Saving this classifier to disk."
#    IGTree.dump("../proc_data/dtreeWithIG_1_to_15.pyobj")

    HectorsKNN = KNN(spam_train, spam_train['spam'], 5)
    print "Tuning Hector's KNN classifier for all values of K between 1 and 41 inclusive:"
    HectorsKNN.tune(spam_tune, spam_tune['spam'], k=range(1,42,2))
    print "Saving this classifier to disk."
    HectorsKNN.dump("../proc_data/HectorsKNN_1_to_41.pyobj") 
    
    ###########################################
    # Playing with stored classifiers
    ###########################################
    
    # Part 1: A decision tree classifier trained with Majority Vote, depths 1 to 10

#    print "Loading a decision tree trained with Majority Vote for depths 1 to 10..."
#    majVoteTree = load("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
#    print "According to the tuning set, the optimal depth for this tree is: " + str(majVoteTree.depth)
#    classifications = majVoteTree.classify(spam_test)
#    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
#    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
#    print "We will now test all different hyper-parameters found during tuning on the test data:"
#    majVoteTree.classifyWithAllDepths(spam_test)
#    print "\n===========================================================\n"
#    
#    # Part 2: A decision tree classifier trained with Information Gain, depths 1 to 10
#    
#    print "Loading a decision tree trained with Information Gain for depths 1 to 10..."
#    IGTree = load("../proc_data/dtreeWithIG_1_to_15.pyobj")
#    print "According to the tuning set, the optimal depth for this tree is: " + str(IGTree.depth)
#    classifications = IGTree.classify(spam_test)
#    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
#    print 'For this depth, the error on the test set was %0.3f' % testErrorRate
#    print "We will now test all different hyper-parameters found during tuning on the test data:"
#    IGTree.classifyWithAllDepths(spam_test)
    
    # Part 3: Hector's KNN-classifier
    
    print "Reloading Hector's classifier from disk:"
    HectorsKNN = load("../proc_data/HectorsKNN_1_to_41.pyobj")
    print "According to the tuning set, the optimal K for this classifier is: " + str(HectorsKNN.k) + "."
    classifications = HectorsKNN.classify(spam_test)
    testErrorRate = np.mean ( (spam_test['spam'].values * classifications) < 0)
    print 'For this value of K, the error on the test set was %0.3f' % testErrorRate
    print "We will now test all different hyper-parameters found during tuning on the test data:"
    HectorsKNN.classifyWithAllK(spam_test)
    
    # Part 4: Weighted Features KNN
    
    print "Exiting..."
Example #25
    data[name] = data[name][
            (data[name].index.get_level_values(1).month >= monthrange[0]) & 
            (data[name].index.get_level_values(1).month <= monthrange[1])]

    # Turn daily max temperatures column into 2d array (date by site)
    values = data[name]['Value']
    values = values.unstack(level=0)
    # limit to sites with at least the amount of data listed in the settings
    values = values.loc[:, values.count() > 0.25*len(values)]

    # apply filter to full dataframe
    if pd.__version__ == '0.7.3':
        tuples = list(itertools.product(values.columns,values.index.tolist()))
        reduced_index = pd.MultiIndex.from_tuples(tuples)
    else:
        reduced_index = pd.MultiIndex.from_product([values.columns,
            values.index.tolist()])
    data[name] = data[name].reindex(reduced_index)

    print "Writing "+join(settings['outputfolder'],
        name+settings['outputsuffix']+'.pkl')
    if pd.__version__ == '0.7.3':
        pd.save(data[name], join(settings['outputfolder'],
            name+settings['outputsuffix']+'.pkl'))
    else:
        data[name].to_pickle(join(settings['outputfolder'],
            name+settings['outputsuffix']+'.pkl'))
    
    # free up some memory
    del data[name]
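
# The reading side has the same version split as the write above: pd.load on
# 0.7-era pandas versus pd.read_pickle on anything current. A version-agnostic
# loader sketch:
import pandas as pd

def load_pickle(path):
    try:
        return pd.read_pickle(path)  # pandas >= 0.12
    except AttributeError:
        return pd.load(path)         # legacy module-level helper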