def run(self):
    df = pd.read_csv(self.input().path)
    # Assumes each host (IP) is a unique user; if not, only the key
    # concatenation below needs to be adapted.
    df['id'] = df['Date_Time'].map(str) + df['URL']
    df['Rank'] = df.groupby(['Host'])['id'].rank(ascending=True)
    df['Date_Time'] = pd.to_datetime(df['Date_Time'])
    # Time elapsed between consecutive clicks of the same host.
    df['time_diff'] = df.groupby('Host')['Date_Time'].diff()
    df['time_diff'] = df['time_diff'].fillna(pd.Timedelta(0))
    df['year'] = pd.DatetimeIndex(df['Date_Time']).year
    df['month'] = pd.DatetimeIndex(df['Date_Time']).month
    df['day'] = pd.DatetimeIndex(df['Date_Time']).day
    df['hour'] = pd.DatetimeIndex(df['Date_Time']).hour
    df['day_of_week'] = df['Date_Time'].dt.dayofweek
    days = {0: 'Lunes', 1: 'Martes', 2: 'Miercoles', 3: 'Jueves',
            4: 'Viernes', 5: 'Sabado', 6: 'Domingo'}
    df['day_of_week'] = df['day_of_week'].map(days)
    df['dif_seg_clicks'] = (df['time_diff'] / np.timedelta64(1, 's')).astype('int64') % (24 * 60)

    # TODO: move this into helper functions.
    def subconjunto(x, par):
        # Keep clicks separated by at most `par` minutes.
        # Bug fix: the original filtered the outer `df` instead of the argument.
        return x[x.dif_seg_clicks < par * 60 + 1]

    df1 = subconjunto(df, self.par)
    # pd.save() was removed from pandas; to_pickle is the replacement.
    df1.to_pickle(self.output().path)
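# A minimal, self-contained sketch of the inter-click-gap technique used
# above: groupby + diff on a datetime column, converted to whole seconds.
# The data here is made up for illustration only.
import pandas as pd
import numpy as np

log = pd.DataFrame({
    'Host': ['a', 'a', 'b', 'a'],
    'Date_Time': pd.to_datetime(['2020-01-01 00:00:00', '2020-01-01 00:00:30',
                                 '2020-01-01 00:01:00', '2020-01-01 00:02:00']),
})
log = log.sort_values(['Host', 'Date_Time'])
gap = log.groupby('Host')['Date_Time'].diff().fillna(pd.Timedelta(0))
log['gap_seconds'] = (gap / np.timedelta64(1, 's')).astype('int64')
print(log)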
def addreport(request, pk):
    therapist = request.user.getTherapist()
    patient = get_object_or_404(Patient, pk=pk)
    if patient.Therapist != therapist:
        return HttpResponseRedirect(reverse('access-restricted'))
    if request.method == "POST":
        form = AddReportForm(request.POST, request.FILES)
        if form.is_valid():
            report = Patient_Data(user_ID=patient,
                                  Name=form.cleaned_data['Name'],
                                  FilePath=form.cleaned_data['FilePath'])
            report.save()
            return HttpResponseRedirect(reverse("view-report", kwargs={'pk': pk}))
    else:
        form = AddReportForm()
    context = {
        'heading': 'Add Report',
        "name": therapist.Name,
        "nUnread": therapist.getNumberOfUnreadLogs,
        "sidebarOptions": therapistPatientOptions,
        "patient": patient,
        "form": form,
    }
    return render(request, 'sehatagahiapp/add-report.html', context)
def run(self):
    df = pd.read_pickle(self.input().path)
    # df = pd.read_pickle(self.output().path)
    df = df.drop_duplicates(['Host', 'Date_Time', 'URL'])
    # DataFrame.sort() was removed from pandas; sort_values is the replacement.
    df = df.sort_values(['Host', 'Date_Time', 'Response_Code'],
                        ascending=[True, True, False])
    df["Date_Time"] = df["Date_Time"].map(lambda x: str(x)[0:20])
    # Drop the trailing (incomplete) record.
    df = df.drop(df.index[[len(df) - 1]])
    print(df.head())
    df.to_pickle(self.output().path)
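# A quick illustration (toy data) of the drop_duplicates + sort_values
# pattern used above, since DataFrame.sort() no longer exists in pandas.
import pandas as pd

toy = pd.DataFrame({'Host': ['a', 'a', 'a'],
                    'Date_Time': ['t1', 't1', 't2'],
                    'Response_Code': [200, 200, 404],
                    'URL': ['/x', '/x', '/y']})
toy = toy.drop_duplicates(['Host', 'Date_Time', 'URL'])
toy = toy.sort_values(['Host', 'Date_Time', 'Response_Code'],
                      ascending=[True, True, False])
print(toy)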
def hysplit_pandas_generator(aerofilt_dir):
    # Tool for extracting data from HYSPLIT trajectory files and putting it
    # into a pandas DataFrame for storage in float16 format.
    import os
    import datetime as dt
    import pandas as pan

    data_cats = ('delta_t', 'lat', 'lon', 'alt', 'press')
    startdir = os.getcwd()
    os.chdir(aerofilt_dir)
    hysplit_files = os.listdir(os.getcwd())
    print('Generating Hysplit dataframe ...')
    for h in hysplit_files:
        if pickle_test(h, hysplit_files):
            # Import the HYSPLIT text file.
            [head, data] = hysplit_import(h)
            # Create a dictionary {varname: array} based on column positions.
            output_data = [data[:, 8], data[:, 9], data[:, 10],
                           data[:, 11], data[:, 12]]
            output_dict = dict(zip(data_cats, output_data))
            # Build a datetime index from the year/month/day/hour columns;
            # datetime() requires ints, so cast the array values.
            dates = []
            for n in range(0, len(data[:, 2])):
                yr = int(data[n, 2])
                mn = int(data[n, 3])
                dy = int(data[n, 4])
                hr = int(data[n, 5])
                dates.append(dt.datetime(yr, mn, dy, hr))
            ind = pan.DatetimeIndex(dates)
            df_out = pan.DataFrame(output_dict, index=ind)
            # Save it as a .pickle file (pan.save was removed from pandas).
            savename = h.split('.')[0]
            df_out.to_pickle(savename + '.pickle')
    os.chdir(startdir)
    print('... Done')
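# An alternative sketch for the datetime-index construction above: pandas
# can assemble timestamps directly from year/month/day/hour columns, which
# avoids the explicit Python loop. Toy values, for illustration only.
import pandas as pd

parts = pd.DataFrame({'year': [2010, 2010], 'month': [3, 3],
                      'day': [1, 2], 'hour': [6, 18]})
ind = pd.DatetimeIndex(pd.to_datetime(parts))
print(ind)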
def run(self):
    df = pd.read_pickle(self.input().path)
    # df['Date_Time'] = pd.to_datetime(df['Date_Time'], format='%d/%b/%Y:%H:%M:%S')
    # Cast Response_Code to str so the concatenation cannot fail on ints.
    df['id'] = df['Date_Time'].map(str) + df['Response_Code'].astype(str) + df['URL']
    df['Rank'] = df.groupby(['Host'])['id'].rank(ascending=True)
    # df['Date_Time'] = pd.to_datetime(df['Date_Time'])
    # DataFrame.sort() was removed from pandas; sort_values is the replacement.
    df = df.sort_values(['Host', 'Date_Time', 'Rank'],
                        ascending=[True, True, False])
    df['time_diff'] = df.groupby('Host')['Date_Time'].diff()
    df['time_diff'] = df['time_diff'].fillna(pd.Timedelta(0))
    print(df.head())
    df.to_pickle(self.output().path)
def traceproc(aerofilt_dir):
    import os
    import pandas as pan

    startdir = os.getcwd()
    os.chdir(aerofilt_dir)
    data_files = os.listdir(os.getcwd())
    # Run through all location folders.
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            tracefile = f + 'traceback'
            # Open the traceback file; pan.load was removed from pandas.
            trace_df = pan.read_pickle(tracefile)
            # Group by (month, day) attributes of the index. Note: the
            # original called trace_df.index() and trace_df.columns(), but
            # index and columns are attributes, not methods.
            by = lambda x: lambda y: getattr(y, x)
            trace_mean = trace_df.groupby([by('month'), by('day')]).mean()
            trace_std = trace_df.groupby([by('month'), by('day')]).std()
            # The original saved an undefined `df_out`; presumably the daily
            # means and standard deviations were meant to be stored together.
            df_out = pan.concat({'mean': trace_mean, 'std': trace_std}, axis=1)
            df_out.to_pickle('Hyproc.pickle')
            os.chdir('..')
    os.chdir(startdir)
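# A small sketch of the (month, day) grouping used in traceproc, written
# with modern pandas idioms: group directly on attributes of a
# DatetimeIndex. Toy data for illustration.
import numpy as np
import pandas as pd

idx = pd.date_range('2010-03-01', periods=6, freq='12h')
demo = pd.DataFrame({'delta_d': np.arange(6.0)}, index=idx)
daily_mean = demo.groupby([demo.index.month, demo.index.day]).mean()
daily_std = demo.groupby([demo.index.month, demo.index.day]).std()
print(daily_mean)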
def run(self):
    df = pd.read_pickle(self.input().path)
    df['year'] = pd.DatetimeIndex(df['Date_Time']).year
    df['month'] = pd.DatetimeIndex(df['Date_Time']).month
    df['day'] = pd.DatetimeIndex(df['Date_Time']).day
    df['hour'] = pd.DatetimeIndex(df['Date_Time']).hour
    df["date"] = pd.DatetimeIndex(df['Date_Time']).date
    df['day_of_week'] = df['Date_Time'].dt.dayofweek
    days = {0: 'Lunes', 1: 'Martes', 2: 'Miercoles', 3: 'Jueves',
            4: 'Viernes', 5: 'Sabado', 6: 'Domingo'}
    df['day_of_week'] = df['day_of_week'].map(days)
    df['dif_seg_clicks'] = (df['time_diff'] / np.timedelta64(1, 's')).astype('int64') % (24 * 60)
    # Avoid zero-length gaps downstream.
    df.loc[df.dif_seg_clicks == 0, 'dif_seg_clicks'] = 1
    print(self.input().path)
    df.to_pickle(self.output().path)
    df.to_csv('%s.csv' % self.input().path)
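# Toy illustration of the weekday-name mapping above: Series.map with a
# dict is both simpler and faster than apply with a lambda.
import pandas as pd

dow = pd.Series(pd.to_datetime(['2020-01-06', '2020-01-11']).dayofweek)
days = {0: 'Lunes', 1: 'Martes', 2: 'Miercoles', 3: 'Jueves',
        4: 'Viernes', 5: 'Sabado', 6: 'Domingo'}
print(dow.map(days))  # Lunes, Sabado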
def main():
    # Parse arguments.
    parser = ArgumentParser()
    parser.add_argument("-J", "--jobname", dest="job_name", type=str,
                        help="job name")
    parser.add_argument("-SF", "--sequencefile", dest="sequence_file",
                        type=str, help="sequence file")
    parser.add_argument("-FF", "--featurefile", dest="feature_file",
                        type=str, help="feature file")
    parser.add_argument("-NC", "--numchunks", dest="num_chunks", type=int,
                        help="number of chunks")
    parser.add_argument("-CP", "--chunkprefix", dest="chunk_prefix", type=str,
                        help="input chunk prefix")
    parser.add_argument("-OF", "--output", dest="output_file", type=str,
                        help="output file")
    parser.add_argument("-DC", "--delete_chunks", dest="delete_chunks",
                        type=int, default=1,
                        help="delete chunks? (0 = no, 1 = yes (default))")
    args = parser.parse_args()

    # Load sequences.
    seqrecords = load_sequences(args.sequence_file)
    seqids = [seq.id for seq in seqrecords]

    # Load features.
    with open(args.feature_file, 'r') as fh:
        ff = [fi.strip() for fi in fh.readlines()]

    # Initialize the full boolean feature matrix.
    X = pd.DataFrame(columns=ff, index=seqids, dtype='bool')

    # Load chunks and place each in its spot; pd.load and DataFrame.ix were
    # removed from pandas, so use read_pickle and .loc instead.
    for i in range(1, args.num_chunks + 1):
        chunk_file = '%s.%d.pkl' % (args.chunk_prefix, i)
        chunk = pd.read_pickle(chunk_file)
        X.loc[chunk.index, chunk.columns] = chunk.values
        if args.delete_chunks:
            os.remove(chunk_file)

    # Save the assembled matrix (pd.save is gone; to_pickle replaces it).
    X.to_pickle(args.output_file)
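# A self-contained sketch of the chunk-reassembly pattern above: write two
# partial pickles, then place each into a preallocated frame with .loc.
# File names here are illustrative only.
import pandas as pd

full = pd.DataFrame(False, index=['s1', 's2'], columns=['f1', 'f2'], dtype='bool')
pd.DataFrame({'f1': [True]}, index=['s1']).to_pickle('chunk.1.pkl')
pd.DataFrame({'f2': [True]}, index=['s2']).to_pickle('chunk.2.pkl')
for i in (1, 2):
    chunk = pd.read_pickle('chunk.%d.pkl' % i)
    full.loc[chunk.index, chunk.columns] = chunk.values
print(full)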
def aeronet_pandas_generator(aerofile, aerofilt_dir):
    # Tool for extracting values of interest from an AERONET data file,
    # putting them into a dictionary, and saving a pandas DataFrame.
    import os
    import pandas as pan

    startdir = os.getcwd()
    print('Generating Aeronet dataframe ...')
    output_folder = os.path.split(aerofilt_dir)[1]
    keylist = ['Inflection_Point[um]', 'VolCon-T', 'EffRad-T', 'VolMedianRad-T',
               'StdDev-T', 'VolCon-F', 'EffRad-F', 'VolMedianRad-F', 'StdDev-F',
               'VolCon-C', 'EffRad-C', 'VolMedianRad-C', 'StdDev-C',
               '0.050000', '0.065604', '0.086077', '0.112939', '0.148184',
               '0.194429', '0.255105', '0.334716', '0.439173', '0.576227',
               '0.756052', '0.991996', '1.301571', '1.707757', '2.240702',
               '2.939966', '3.857452', '5.061260', '6.640745', '8.713145',
               '11.432287', '15.000000']
    filename = 'Aerostats_' + output_folder + '.pickle'
    newdict = aeronet_extract(aerofile, keylist)
    # Convert dates to a datetime index.
    dates = newdict['Date']
    del newdict['Date']
    df_out = pan.DataFrame(newdict, index=dates)
    os.chdir(aerofilt_dir)
    # Bug fix: the original pickled the raw dict instead of the DataFrame
    # it had just built.
    df_out.to_pickle(filename)
    os.chdir(startdir)
    print('... Done')
def CreateDataFrames(examples, features, outNum):
    """
    Translates a CSV file into a pandas.DataFrame

    Arguments:
        examples - Name of the CSV containing the data
        features - Name of the file containing the column names for the CSV
        outNum   - Number to label the output files with

    Returns:
        (train, test, tune), each a pandas.DataFrame containing data sets for
        training, testing, and tuning, broken up as 70 / 20 / 10 percent of
        the data.
    """
    values = np.genfromtxt(examples, delimiter=',')
    # csv.reader needs text mode in Python 3, and reader.next() is now next(reader).
    with open(features, 'r') as fh:
        reader = csv.reader(fh)
        feats = next(reader)
    df = pd.DataFrame(values, columns=feats)
    # Turn the features into categorical ones.
    Categoricalize(df, feats)
    # Turn labels into +/- 1.
    df['Winner'] = np.sign(df['Winner'] - 1.5)
    # Following code adapted from PA01. Slice bounds must be ints.
    nsamples = df.shape[0]
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))
    # We want to make this reproducible, so we seed the random number generator.
    np.random.seed(1)
    # Bug fix: the original shifted the indices by +1, which would run past
    # the end of the default RangeIndex.
    all_indices = np.arange(nsamples)
    np.random.shuffle(all_indices)
    test_indices = all_indices[:ntest]
    tune_indices = all_indices[ntest:(ntest + ntune)]
    train_indices = all_indices[(ntest + ntune):]
    # DataFrame.ix was removed from pandas; iloc is the positional equivalent.
    train = df.iloc[train_indices, :]
    tune = df.iloc[tune_indices, :]
    test = df.iloc[test_indices, :]
    # pd.save is gone; to_pickle replaces it.
    train.to_pickle('data/train/train' + outNum + '.pdat')
    tune.to_pickle('data/train/tune' + outNum + '.pdat')
    test.to_pickle('data/test/test' + outNum + '.pdat')
    return train, tune, test
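# A compact, reproducible variant of the 70/20/10 split above, using the
# modern NumPy Generator API. Toy frame for illustration.
import numpy as np
import pandas as pd

demo = pd.DataFrame({'x': range(10)})
rng = np.random.default_rng(1)
order = rng.permutation(len(demo))
ntest, ntune = int(0.2 * len(demo)), int(0.1 * len(demo))
test = demo.iloc[order[:ntest]]
tune = demo.iloc[order[ntest:ntest + ntune]]
train = demo.iloc[order[ntest + ntune:]]
print(len(train), len(tune), len(test))  # 7 1 2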
def aeroproc(aerofilt_dir):
    import os
    import datetime as dt
    import scipy.io
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()
    topdir = aerofilt_dir
    os.chdir(topdir)
    data_files = os.listdir(os.getcwd())
    total_mean = []
    total_std = []
    fine_mean = []
    fine_std = []
    coarse_mean = []
    coarse_std = []
    inpoint = []
    numdist_mean = []
    numdist_std = []
    station = []
    date = []
    diameters = []
    # Run through all location folders.
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            aerofile = 'Aerostats_' + f
            # Open the stats file.
            aerodict = scipy.io.loadmat(aerofile)
            # Create a separate dict of lists for each day and put those into
            # a list called dictlist.
            dates = aerodict['Date']
            oldday = dates[0][-2]
            tempdict = dict()
            dictlist = []
            # Ignore the keys that loadmat attaches to the dictionary.
            rejectlist = ['__globals__', '__header__', '__version__']
            # dict.iteritems() is Python 2 only; items() replaces it.
            for k, v in aerodict.items():
                if k == 'Diameters':
                    tempdict[k] = [v]
                elif k == 'Numdist':
                    tempdict[k] = [v[:, 0]]
                elif k not in rejectlist:
                    tempdict[k] = [v[0]]
            for n in range(1, len(dates)):
                newday = dates[n][-2]
                if newday == oldday:
                    for k, v in aerodict.items():
                        if k == 'Diameters':
                            tempdict[k].append(v)
                        elif k == 'Numdist':
                            tempdict[k].append(v[:, n])
                        elif k not in rejectlist:
                            tempdict[k].append(v[n])
                    oldday = newday
                else:
                    dictlist.append(tempdict)
                    tempdict = dict()
                    for k, v in aerodict.items():
                        if k == 'Diameters':
                            tempdict[k] = [v]
                        elif k == 'Numdist':
                            tempdict[k] = [v[:, n]]
                        elif k not in rejectlist:
                            tempdict[k] = [v[n]]
                    oldday = newday
            dictlist.append(tempdict)
            # Generate mean daily values for each element of the dictionaries.
            for line in dictlist:
                total_mean.append([np.mean(line['EffRad-T']),
                                   np.mean(line['VolMedianRad-T']),
                                   np.mean(line['VolCon-T']),
                                   np.mean(line['StdDev-T'])])
                total_std.append([np.std(line['EffRad-T']),
                                  np.std(line['VolMedianRad-T']),
                                  np.std(line['VolCon-T']),
                                  np.std(line['StdDev-T'])])
                fine_mean.append([np.mean(line['EffRad-F']),
                                  np.mean(line['VolMedianRad-F']),
                                  np.mean(line['VolCon-F']),
                                  np.mean(line['StdDev-F'])])
                fine_std.append([np.std(line['EffRad-F']),
                                 np.std(line['VolMedianRad-F']),
                                 np.std(line['VolCon-F']),
                                 np.std(line['StdDev-F'])])
                coarse_mean.append([np.mean(line['EffRad-C']),
                                    np.mean(line['VolMedianRad-C']),
                                    np.mean(line['VolCon-C']),
                                    np.mean(line['StdDev-C'])])
                coarse_std.append([np.std(line['EffRad-C']),
                                   np.std(line['VolMedianRad-C']),
                                   np.std(line['VolCon-C']),
                                   np.std(line['StdDev-C'])])
                inpoint.append(np.mean(line['Inflection_Point[um]']))
                numdist_mean.append(np.mean(line['Numdist'], axis=0))
                numdist_std.append(np.std(line['Numdist'], axis=0))
                station.append(f)
                date.append(np.mean(line['Date'], axis=0))
                diameters.append(line['Diameters'])
            os.chdir('..')
    output_dict = {'numdist_mean': numdist_mean, 'numdist_std': numdist_std,
                   'total_mean': total_mean, 'total_std': total_std,
                   'fine_mean': fine_mean, 'fine_std': fine_std,
                   'coarse_mean': coarse_mean, 'coarse_std': coarse_std,
                   'inpoint': inpoint, 'diameters': diameters}
    # Convert dates to datetime objects: CURRENTLY ONLY WORKS FOR MARCH/APRIL!
    ind_date = []
    for d in date:
        year = int(d[0])
        month = int(d[1])
        day = int(d[2])
        hour = int(d[3])
        if hour == 24:
            hour = 0
        if month == 3:
            if day == 31:
                day = 1
                month += 1
            else:
                day += 1
        else:
            if day == 30:
                # Bug fix: the original read `day - 1`, a no-op; by symmetry
                # with the March branch this should roll over to the next month.
                day = 1
                month += 1
            else:
                day += 1
        print('Date is %i %i %i' % (month, day, hour))
        ind_date.append(dt.datetime(year, month, day, hour))
    # Create multi-index tuples. Bug fix: the original took the Cartesian
    # product of station and date, which inflates the index; the two lists
    # are parallel, so zip them instead.
    ind = list(zip(station, ind_date))
    multi = pan.MultiIndex.from_tuples(ind, names=['Station', 'Date'])
    df = pan.DataFrame(output_dict, index=multi)
    df.to_pickle('Aeroproc.pickle')
    os.chdir(startdir)
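# A tiny sketch of the zip-based MultiIndex construction used above, showing
# why zip (parallel pairs) rather than a Cartesian product is wanted when
# each row already carries its own (station, date) pair. Toy values.
import datetime as dt
import pandas as pd

stations = ['A', 'B']
dates = [dt.datetime(2010, 3, 1, 6), dt.datetime(2010, 3, 2, 6)]
multi = pd.MultiIndex.from_tuples(list(zip(stations, dates)),
                                  names=['Station', 'Date'])
print(pd.DataFrame({'inpoint': [0.5, 0.7]}, index=multi))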
def traceproc(aerofilt_dir):
    import os
    import datetime as dt
    import scipy.io
    import numpy as np
    import pandas as pan

    startdir = os.getcwd()
    topdir = aerofilt_dir
    os.chdir(topdir)
    data_files = os.listdir(os.getcwd())
    d_mean = []
    d_std = []
    t_mean = []
    t_std = []
    endpos_mean = []
    endpos_std = []
    start_time = []
    station = []
    # Run through all location folders.
    for f in data_files:
        if os.path.isdir(f):
            os.chdir(f)
            tracefile = f + 'traceback'
            # Open the traceback file.
            tracedict = scipy.io.loadmat(tracefile)
            # Create a separate dict of lists for each day and put those into
            # a list called dictlist.
            dates = tracedict['start_date']
            oldday = dates[0][2]
            tempdict = dict()
            dictlist = []
            # Ignore the keys that loadmat attaches to the dictionary.
            rejectlist = ['__globals__', '__header__', '__version__']
            for k, v in tracedict.items():
                if k not in rejectlist:
                    tempdict[k] = [v[0]]
            for n in range(1, len(dates)):
                newday = dates[n][2]
                if newday == oldday:
                    for k, v in tracedict.items():
                        if k not in rejectlist:
                            tempdict[k].append(v[n])
                    oldday = newday
                else:
                    dictlist.append(tempdict)
                    tempdict = dict()
                    for k, v in tracedict.items():
                        if k not in rejectlist:
                            tempdict[k] = [v[n]]
                    oldday = newday
            dictlist.append(tempdict)
            # Generate mean daily values for each element of the dictionaries.
            for line in dictlist:
                d_mean.append(np.mean(line['delta_d']))
                d_std.append(np.std(line['delta_d']))
                t_mean.append(np.mean(line['delta_t']))
                t_std.append(np.std(line['delta_t']))
                endpos_mean.append(np.mean(line['end_loc'], axis=0))
                endpos_std.append(np.std(line['end_loc'], axis=0))
                start_time.append(line['start_date'][0])
                station.append(str(line['station'][0]))
            os.chdir('..')
    output_dict = {'d_mean': d_mean, 'd_std': d_std,
                   't_mean': t_mean, 't_std': t_std,
                   'endpos_mean': endpos_mean, 'endpos_std': endpos_std}
    # Convert dates to datetime objects.
    ind_date = []
    for date in start_time:
        year = int(date[0])
        month = int(date[1])
        day = int(date[2])
        hour = int(date[3])
        ind_date.append(dt.datetime(year, month, day, hour))
    # Create multi-index tuples; station and date are parallel lists, so zip
    # them rather than taking the Cartesian product as the original did.
    ind = list(zip(station, ind_date))
    multi = pan.MultiIndex.from_tuples(ind, names=['Station', 'Date'])
    df = pan.DataFrame(output_dict, index=multi)
    df.to_pickle('Hyproc.pickle')
    os.chdir(startdir)
    multi_survival_d['log_pred'] = e['m_multi_survival_log_pred']
    multi_survival_d['RMSE'] = e['m_multi_survival_RMSE']
    multi_survival_d['MAE'] = e['m_multi_survival_MAE']
    multi_survival_d['log_likelihood'] = e['m_multi_survival_log_likelihood']
    multi_survival_d['prediction'] = e['m_multi_survival_predictions']
    # DataFrame.append was removed from pandas; build the frame with a single
    # concat over the per-model dictionaries instead.
    exs_df = pd.concat([exs_df,
                        pd.DataFrame([gauss_d, stut_d, survival_d,
                                      laplace_stut_d, laplace_survival_d,
                                      multi_gauss_d, multi_stut_d,
                                      multi_survival_d])],
                       ignore_index=True)
    exs_df.to_pickle(filename)


def print_results(df):
    means_multi_stut_log_pred = df['m_multi_stut_log_pred'].apply(np.mean)
    means_stut_log_pred = df['m_stut_log_pred'].apply(np.mean)
    means_gauss_log_pred = df['m_gauss_log_pred'].apply(np.mean)
    means_multi_gauss_log_pred = df['m_multi_gauss_log_pred'].apply(np.mean)
    means_multi_stut_RMSE = df['m_multi_stut_RMSE'].apply(np.mean)
    means_stut_RMSE = df['m_stut_RMSE'].apply(np.mean)
    means_gauss_RMSE = df['m_gauss_RMSE'].apply(np.mean)
    means_multi_gauss_RMSE = df['m_multi_gauss_RMSE'].apply(np.mean)
    means_multi_stut_MAE = df['m_multi_stut_MAE'].apply(np.mean)
    means_stut_MAE = df['m_stut_MAE'].apply(np.mean)
    means_gauss_MAE = df['m_gauss_MAE'].apply(np.mean)
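# Minimal demonstration (toy dicts) of the append-to-concat change above:
# appending row dicts one at a time no longer exists in pandas 2.x, but a
# list of dicts converts to rows directly.
import pandas as pd

base = pd.DataFrame([{'model': 'gauss', 'RMSE': 0.9}])
more = pd.DataFrame([{'model': 'stut', 'RMSE': 0.8}])
results = pd.concat([base, more], ignore_index=True)
print(results)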
def put(self, name, obj):
    filename = self.get_filename(name)
    # pd.save was removed from pandas; to_pickle is the replacement.
    obj.to_pickle(filename)
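# Round-trip sketch for the cache put above, assuming `obj` is a pandas
# object; read_pickle restores whatever to_pickle wrote.
import pandas as pd

pd.Series([1, 2, 3]).to_pickle('cached.pkl')
restored = pd.read_pickle('cached.pkl')
print(restored)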
def put(self, name, obj):
    self.cache_dir_check()
    filename = self.get_filename(name)
    # pd.save was removed from pandas; to_pickle is the replacement.
    obj.to_pickle(filename)
    return filename
name = "Gemm" filename = "Gemm.dat" if len(sys.argv[1:]) > 0: name = sys.argv[1] filename = name + ".dat" if len(sys.argv[1:]) > 1: filename = sys.argv[2] if len(sys.argv[1:]) > 2: (NB, MB, VP, nW) = map(lambda x: int(x), sys.argv[3].split(':')) else: NB = 256 MB = 256 VP = 96 nW = 2 if len(sys.argv[1:]) > 3: sizes = map(lambda x: int(x), sys.argv[4:]) else: sizes = [10, 20, 40, 60, 80, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000, 1100, 1200, 1300] print "running with params mB=%d, nB=%d, nVP=%d nWRK=%d sizes: %s" %(MB, NB, VP, nW, sizes) pdata = performance_test(name, sizes, NB, MB, VP, nW) series = Series(pdata) print "GFlops/s: %s" % name print series if filename[0] != "-": pandas.save(series, filename)
def traceback(aerofilt_dir):
    import os
    import datetime as dt
    import scipy.io  # missing from the original imports, though loadmat is used below
    import pandas as pan
    import hysplit_tools as tools

    Tman_coord = [39.0, 84.0]
    Tman_a = 800    # km, semi-major axis
    Tman_b = 300    # km, semi-minor axis
    Tman_tilt = 70  # degrees between North and major axis
    Gobi_coord = [43.0, 106.0]
    Gobi_a = 1000   # km, semi-major axis
    Gobi_b = 500
    Gobi_tilt = 65  # degrees between North and major axis
    G_maxlon = 118
    G_minlon = 95
    G_maxlat = 49
    G_minlat = 37
    T_maxlon = 93
    T_minlon = 75
    T_maxlat = 43
    T_minlat = 35
    # Load the .mat files.
    [aerofilt_topdir, station_id] = os.path.split(aerofilt_dir)
    # Bug fix: the original data_keys included 'start_date', but the matching
    # output_data tuple did not, so every later column was shifted by one.
    # start_date is used as the index instead.
    data_keys = ('station', 'start_loc', 'end_loc', 'delta_t', 'delta_d',
                 'desert_tag')
    startdir = os.getcwd()
    os.chdir(aerofilt_dir)
    output_filename = station_id + 'traceback'
    hysplit_files = os.listdir(os.getcwd())
    start_date = []
    start_loc = []
    end_loc = []
    delta_t = []
    station = []
    delta_d = []
    desert_tag = []
    files = 0
    tracks = 0
    G = 0
    T = 0
    print('Traceback is processing %s folder' % station_id)
    for h in hysplit_files:
        filetest = h.split('.')
        try:
            if filetest[1] == 'mat' and not ('Aerostats' in h or 'traceback' in h):
                files += 1
                hysplit_dict = scipy.io.loadmat(h)
                lat = hysplit_dict['lat']
                lon = hysplit_dict['lon']
                data_index = []
                d_total = 0
                tracks = tracks + len(lat)
                for n in range(1, len(lat)):
                    [d_temp, theta_temp] = tools.haversine(lat[n - 1], lon[n - 1],
                                                           lat[n], lon[n])
                    d_total = d_total + d_temp
                    if G_minlat < lat[n] < G_maxlat and G_minlon < lon[n] < G_maxlon:
                        [G_range, G_theta] = tools.haversine(lat[n], lon[n],
                                                             Gobi_coord[0],
                                                             Gobi_coord[1])
                        G_rad = tools.ellipserad(Gobi_a, Gobi_b, G_theta, Gobi_tilt)
                        if G_range < G_rad:
                            data_index.append(n)
                            delta_d.append(d_total)
                            desert_tag.append('Gobi')
                            G += 1
                    if T_minlat < lat[n] < T_maxlat and T_minlon < lon[n] < T_maxlon:
                        [T_range, T_theta] = tools.haversine(lat[n], lon[n],
                                                             Tman_coord[0],
                                                             Tman_coord[1])
                        T_rad = tools.ellipserad(Tman_a, Tman_b, T_theta, Tman_tilt)
                        if T_range < T_rad:
                            data_index.append(n)
                            delta_d.append(d_total)
                            desert_tag.append('Tman')
                            T += 1
                for n in range(0, len(data_index)):
                    tempdate = dt.datetime(int(hysplit_dict['year'][0]),
                                           int(hysplit_dict['month'][0]),
                                           int(hysplit_dict['day'][0]),
                                           int(hysplit_dict['hour'][0]))
                    start_date.append(tempdate)
                    start_loc.append((lat[0], lon[0]))
                    end_loc.append((lat[data_index[n]], lon[data_index[n]]))
                    delta_t.append(hysplit_dict['delta_t'][data_index[n]])
                    station.append(station_id)
        except IndexError:
            # Files without an extension produce a one-element filetest.
            pass
    output_data = (station, start_loc, end_loc, delta_t, delta_d, desert_tag)
    output_dict = dict(zip(data_keys, output_data))
    df_out = pan.DataFrame(output_dict, index=start_date)
    df_out.to_pickle(output_filename)
    os.chdir(startdir)
    print('Out of %i files, %i total tracks checked' % (files, tracks))
    G_percent = 100.0 * G / tracks
    T_percent = 100.0 * T / tracks
    print('%0.3f percent of tracks passed over Gobi (or %i total)' % (G_percent, G))
    print('%0.3f percent of tracks passed over Taklimikan (or %i total)' % (T_percent, T))
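# hysplit_tools is not shown in this file. As an illustrative stand-in, here
# is a minimal haversine returning (distance in km, initial bearing in
# degrees), matching how tools.haversine is unpacked above. This is an
# assumption about its contract, not the actual library code.
import math

def haversine(lat1, lon1, lat2, lon2):
    R = 6371.0  # mean Earth radius, km
    p1, p2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlmb = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2) ** 2 + math.cos(p1) * math.cos(p2) * math.sin(dlmb / 2) ** 2
    d = 2 * R * math.asin(math.sqrt(a))
    y = math.sin(dlmb) * math.cos(p2)
    x = math.cos(p1) * math.sin(p2) - math.sin(p1) * math.cos(p2) * math.cos(dlmb)
    theta = math.degrees(math.atan2(y, x)) % 360
    return [d, theta]

print(haversine(39.0, 84.0, 43.0, 106.0))  # rough Tman-to-Gobi distance and bearing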
def main():
    #############################################
    # Set up the data as per the first Practicum
    #############################################
    spam_values = np.genfromtxt('../input_data/spambase.data', delimiter=',')
    fl = open('../input_data/spambase.names', 'r')
    lines = [line.strip() for line in fl]  # J: strip leading and trailing whitespace
    fl.close()
    colnames = [line.partition(':')[0] for line in lines
                if not (len(line) == 0 or line[0] == '|' or line[0] == '1')]
    colnames.append('spam')
    spam_df = pd.DataFrame(spam_values, columns=colnames)
    spam_df['spam'] = 2 * spam_df['spam'] - 1
    # J: DataFrame.shape is a tuple whose first entry is the number of samples
    nsamples = spam_df.shape[0]
    # Slice bounds must be ints, so cast the floor results.
    ntest = int(np.floor(.2 * nsamples))
    ntune = int(np.floor(.1 * nsamples))
    # we want to make this reproducible so we seed the random number generator
    np.random.seed(1)
    all_indices = np.arange(nsamples)
    # J: important to shuffle so that you don't know which portion is training,
    # which is testing and which is tuning data
    np.random.shuffle(all_indices)
    test_indices = all_indices[:ntest]  # J: get shuffled test indices first
    tune_indices = all_indices[ntest:(ntest + ntune)]  # J: tune indices second
    train_indices = all_indices[(ntest + ntune):]  # J: train indices (the majority) last
    # J: now that the "*indices" arrays have been shuffled, draw the relevant
    # rows positionally. DataFrame.ix was removed from pandas; iloc replaces it.
    spam_train = spam_df.iloc[train_indices, :]
    spam_tune = spam_df.iloc[tune_indices, :]
    spam_test = spam_df.iloc[test_indices, :]
    # pd.save is gone; to_pickle is the replacement.
    spam_train.to_pickle('../proc_data/training_data/spam_train.pdat')
    spam_tune.to_pickle('../proc_data/training_data/spam_tune.pdat')
    spam_test.to_pickle('../proc_data/testing_data/spam_test.pdat')

    #######################################################################
    # See how features are sorted according to their Information Gain score
    #######################################################################
    # atestTree = DecisionTree(spam_train, 5, True)
    # print(atestTree.__sortFeatures__(spam_train, spam_train.columns))

    ###############################################
    # Training classifiers and saving them on disk
    ###############################################
    # Already trained those two, it took about 4 hours total.
    # majVoteTree = DecTree.DecisionTree(spam_train, 5, False)
    # print("Tuning a majority vote classifier on all depths between 1 and 15 inclusive.")
    # majVoteTree.tune(spam_tune, 1, 15)
    # print("Saving this classifier to disk.")
    # majVoteTree.dump("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    #
    # IGTree = DecTree.DecisionTree(spam_train, 5, True)
    # print("Tuning an information gain classifier on all depths between 1 and 15 inclusive.")
    # IGTree.tune(spam_tune, 1, 15)
    # print("Saving this classifier to disk.")
    # IGTree.dump("../proc_data/dtreeWithIG_1_to_15.pyobj")

    HectorsKNN = KNN(spam_train, spam_train['spam'], 5)
    print("Tuning Hector's KNN classifier for all values of K between 1 and 41 inclusive:")
    HectorsKNN.tune(spam_tune, spam_tune['spam'], k=range(1, 42, 2))
    print("Saving this classifier to disk.")
    HectorsKNN.dump("../proc_data/HectorsKNN_1_to_41.pyobj")

    ###########################################
    # Playing with stored classifiers
    ###########################################
    # Part 1: A decision tree classifier trained with Majority Vote, depths 1 to 10
    # print("Loading a decision tree trained with Majority Vote for depths 1 to 10...")
    # majVoteTree = load("../proc_data/dtreeWithMajVote_1_to_15.pyobj")
    # print("According to the tuning set, the optimal depth for this tree is: " + str(majVoteTree.depth))
    # classifications = majVoteTree.classify(spam_test)
    # testErrorRate = np.mean((spam_test['spam'].values * classifications) < 0)
    # print('For this depth, the error on the test set was %0.3f' % testErrorRate)
    # print("We will now test all different hyper-parameters found during tuning on the test data:")
    # majVoteTree.classifyWithAllDepths(spam_test)
    # print("\n===========================================================\n")
    #
    # Part 2: A decision tree classifier trained with Information Gain, depths 1 to 10
    # print("Loading a decision tree trained with Information Gain for depths 1 to 10...")
    # IGTree = load("../proc_data/dtreeWithIG_1_to_15.pyobj")
    # print("According to the tuning set, the optimal depth for this tree is: " + str(IGTree.depth))
    # classifications = IGTree.classify(spam_test)
    # testErrorRate = np.mean((spam_test['spam'].values * classifications) < 0)
    # print('For this depth, the error on the test set was %0.3f' % testErrorRate)
    # print("We will now test all different hyper-parameters found during tuning on the test data:")
    # IGTree.classifyWithAllDepths(spam_test)

    # Part 3: Hector's KNN classifier
    print("Reloading Hector's classifier from disk:")
    HectorsKNN = load("../proc_data/HectorsKNN_1_to_41.pyobj")
    print("According to the tuning set, the optimal K for this classifier is: "
          + str(HectorsKNN.k) + ".")
    classifications = HectorsKNN.classify(spam_test)
    testErrorRate = np.mean((spam_test['spam'].values * classifications) < 0)
    print('For this value of K, the error on the test set was %0.3f' % testErrorRate)
    print("We will now test all different hyper-parameters found during tuning on the test data:")
    HectorsKNN.classifyWithAllK(spam_test)

    # Part 4: Weighted Features KNN
    print("Exiting...")
data[name] = data[name][
    (data[name].index.get_level_values(1).month >= monthrange[0])
    & (data[name].index.get_level_values(1).month <= monthrange[1])]
# Turn the daily max temperature column into a 2-D array (date by site).
values = data[name]['Value']
values = values.unstack(level=0)
# Limit to sites with at least 25% data coverage.
values = values.loc[:, values.count() > 0.25 * len(values)]
# Apply the filter to the full dataframe. The 0.7.3 branches keep
# compatibility with a pandas release that predates MultiIndex.from_product
# and DataFrame.to_pickle.
if pd.__version__ == '0.7.3':
    tuples = list(itertools.product(values.columns, values.index.tolist()))
    reduced_index = pd.MultiIndex.from_tuples(tuples)
else:
    reduced_index = pd.MultiIndex.from_product([values.columns,
                                                values.index.tolist()])
data[name] = data[name].reindex(reduced_index)
outpath = join(settings['outputfolder'], name + settings['outputsuffix'] + '.pkl')
print("Writing " + outpath)
if pd.__version__ == '0.7.3':
    pd.save(data[name], outpath)
else:
    data[name].to_pickle(outpath)
# Free up some memory.
del data[name]
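# A toy illustration of the unstack + coverage-filter step above: pivot a
# (site, date) MultiIndexed series into a date-by-site table, then keep only
# columns strictly above 25% populated.
import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_product([['siteA', 'siteB'],
                                  pd.date_range('2000-06-01', periods=4)])
vals = pd.Series([20.0, 21.0, 19.5, 22.0, np.nan, np.nan, np.nan, 18.0],
                 index=idx)
table = vals.unstack(level=0)
table = table.loc[:, table.count() > 0.25 * len(table)]
print(table.columns.tolist())  # ['siteA']: siteB, at exactly 25%, is dropped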