def proc_light_curve_pl(subset_index):
    """Score every light curve in one subset by its strongly negative residuals.

    For each record of the subset, the first half of ``record['data']`` is
    used as the time axis and the second half as the flux.  Samples whose
    flux is NaN are dropped, a smooth curve is fitted with
    ``kernel_regress_cross_validation``, and the residuals are standardized
    with a robust scale (1.4826 * MAD).  The score is the L1 norm of the
    standardized residuals that are not ``>= -sqrt(2 * log(n))``.

    Relies on module-level names: ``s`` (list of subset names), ``np``,
    ``math`` and ``kernel_regress_cross_validation``.

    Args:
        subset_index: position of the subset inside ``s``.

    Returns:
        A list of ``(id, score)`` tuples, one per light curve; an empty list
        when ``subset_index`` is at or past the end of ``s``.
    """
    import DAL

    curves = DAL.create('lightcurves')

    if subset_index >= len(s):
        return []

    subset_filename = s[subset_index]
    subset_number = subset_filename[:-9]
    # Single-argument form: prints "<filename> <number>" exactly as before.
    print("%s %s" % (subset_filename, subset_number))

    scores = []
    for record in curves.iter(subset_filename):
        curve_id = record['id']
        samples = record['data']

        # First half of the record is time, second half is flux.
        half = int(len(samples) / 2)
        times = np.array(samples[:half], dtype='float32')
        fluxes = np.array(samples[half:], dtype='float32')

        # Keep only the samples whose flux is a number.
        valid = np.logical_not(np.isnan(fluxes))
        times = times[valid]
        fluxes = fluxes[valid]

        fit = kernel_regress_cross_validation(times, fluxes)
        fitted = fit[0]
        bandwidth = fit[2]  # read as in the original; not used afterwards

        residuals = fluxes - fitted
        # 1.4826 * MAD
        robust_sigma = 1.4826 * np.median(abs(residuals - np.median(residuals)))
        standardized = (residuals - residuals.mean()) / robust_sigma

        threshold = math.sqrt(2 * math.log(len(times)))
        below = standardized[np.logical_not(standardized >= -1 * threshold)]

        scores.append((curve_id, np.linalg.norm(below, ord=1)))

    return scores
def gist(index):
    """Return the standardized color GIST descriptor of one tiny image.

    The image is fetched by id from the ``tinyimages`` dataset, reshaped to
    32x32x3 in Fortran order, converted to an image object and passed to
    ``leargist.color_gist``.  The descriptor is then standardized to zero
    mean and unit standard deviation over its own entries.

    Args:
        index: id of the image, as accepted by ``tinyimages.byid``.

    Returns:
        A plain ``list`` with the standardized descriptor values (the
        original author notes a length of 960 -- not checked here).
    """
    import math

    import numpy as np
    # `import scipy` alone does not load the `misc` subpackage, so
    # `scipy.misc.toimage` could fail with AttributeError unless some other
    # module happened to import it first.  Import the subpackage explicitly.
    # NOTE(review): `scipy.misc.toimage` was removed in SciPy 1.2; this code
    # needs an older SciPy (with PIL) to run.
    import scipy.misc

    import DAL
    import leargist

    tinyimages = DAL.create('tinyimages')
    pixels = tinyimages.byid(index).reshape(32, 32, 3, order="F").copy()
    img = scipy.misc.toimage(pixels)

    vec = np.array(leargist.color_gist(img))

    # Normalization: (x - mean) / standard deviation.
    # NOTE(review): a constant descriptor gives sd == 0 and NaN output.
    mu = np.mean(vec)
    sd = math.sqrt(np.var(vec))
    standarized_gist = (vec - mu) / sd

    return list(standarized_gist)
# Setup for the light-curve analysis: imports plus the module-level globals
# `lightcurves` and `s` that the light-curve functions in this file read.
import re
import fractions
import math
import scipy
from numpy import arange,array,ones,linalg
from pylab import plot,show
# NOTE(review): Python only accepts a __future__ import as the first statement
# of a module; this placement works only if statements are compiled one at a
# time (e.g. as notebook cells) -- confirm how this file is meant to be run.
from __future__ import division
from IPython.parallel import Client
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import matplotlib.pyplot as plt
# DAL is not imported in this block; it must already be in scope here.
lightcurves = DAL.create('lightcurves')
# `s` is the list of subset names; other code indexes it by subset position.
s = lightcurves.subsets()
# The bare string below is disabled exploratory code (it has no effect).
''' dat = [] for i in lightcurves.iter(s[101]): name = i['id'] #print type(name),name == 3096237 if name == 3096237: break lc = i['data'] time = lc[:int(len(lc)/2)] #modified - first half time flux = lc[int(len(lc)/2):] #modified - first half flux dat.append( (name, time, flux) ) #modified '''
# Report how many subsets are available.
print len(s)
print "files:"
############################### ## Project 1 problem 1 ## Stat 376 ## Zhengjian Song ############################### # Verify cluster connection and connect to the data set import DAL from IPython.parallel import Client rc = Client() #print len(rc) dview = rc[:] tinyimages = DAL.create('tinyimages') # Search the image set by keywords: car and bicycle car_ids = tinyimages.search('car', 2000) bicycle_ids = tinyimages.search('bicycle', 2000) # Ground truth, constructed manually # 100 in each category car_true = [12025562,12025563,12025564,12025565,12025567,12025568,12025569,12025571,12025572,12025573,12025574,12025576,12025580,12025583,12025584,12025585,12025586,12025587,12025588,12025589,12025590,12025591,12025592,12025593,12025594,12025597,12025599,12025601,12025602,12025603,12025604,12025605,12025606,12025611,12025615,12025618,12025620,12025624,12025627,12025628,12025630,12025631,12025632,12025633,12025634,12025635,12025636,12025661,12025663,12025664,12025668,12025670,12025671,12025674,12025675,12025676,12025677,12025678,12025679,12025682,12025683,12025684,12025686,12025687,12025688,12025689,12025690,12025691,12025692,12025693,12025701,12025702,12025706,12025707,12025709,12025711,12025712,12025713,12025717,12025718,12025725,12025726,12025727,12025747,12025749,12025752,12025753,12025757,12025768,12025769,12025771,12025772,12025773,12025774,12025776,12025778,12025779,12025780,12025781,12025782] bicycle_true = 
[7112211,7112212,7112213,7112214,7112215,7112216,7112218,7112219,7112220,7112223,7112224,7112225,7112226,7112227,7112228,7112229,7112231,7112232,7112234,7112235,7112237,7112239,7112240,7112241,7112243,7112244,7112245,7112246,7112247,7112248,7112249,7112250,7112251,7112253,7112259,7112260,7112261,7112262,7112263,7112265,7112266,7112268,7112270,7112272,7112273,7112275,7112276,7112281,7112285,7112287,7112297,7112300,7112301,7112302,7112303,7112304,7112309,7112310,7112312,7112320,7112325,7112328,7112329,7112330,7112335,7112339,7112341,7112342,7112343,7112344,7112347,7112348,7112349,7112350,7112351,7112352,7112355,7112358,7112359,7112360,7112361,7112362,7112365,7112366,7112371,7112374,7112380,7112381,7112386,7112394,7112403,7112409,7112413,7112414,7112416,7112421,7112424,7112426,7112427,7112431] # Print the results images = tinyimages.byid(car_ids[0:300]) print "\"cars\"" tinyimages.display(images) images = tinyimages.byid(car_true) print "verified cars" tinyimages.display(images) print "\n"
# Setup for the crime-data project: imports plus module-level globals
# (crime tables, the dimensions K/N/T, and two initially empty filename lists).
import numpy
import scipy
from numpy import arange,array,ones,linalg
from pylab import plot,show
# NOTE(review): Python only accepts a __future__ import as the first statement
# of a module; this placement works only if statements are compiled one at a
# time (e.g. as notebook cells) -- confirm how this file is meant to be run.
from __future__ import division
from IPython.parallel import Client
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import matplotlib.pyplot as plt
import numpy as np
# DAL is not imported in this block; it must already be in scope here.
crime = DAL.create('crime')
crime_list = crime.get_crime_list()
# the following can be slow; do this once at the beginning
# of the program and use this data structure throughout
crime_counts = crime.get_crime_counts()
region_list = crime.get_region_list()
K = 10 #number of crime types
# Assumes crime_counts holds exactly K entries per region -- TODO confirm.
N = int(len(crime_counts)/K) #number of regions
# Length of the entry stored under key (100, 0); presumably the number of
# time steps per series -- verify against the dataset.
T = len(crime_counts.get((100,0)))
print K,N,T
print crime_list
# Start out empty; nothing in this block fills them.
proj3_1_filename = []
proj3_2_filename = []
def _report_level(wave_level):
    """Print the detected level followed by a separator line."""
    print("%s %s" % ("LEVEL: ", wave_level))
    print("===" * 10)


def _is_unbalanced(signal):
    """Tell whether two windows of ``signal`` differ by more than a factor of 2.

    The windows are ``[n/4, n*3/8 + n/4)`` and everything after index
    ``n*3/8 + n/4`` (that index itself belongs to neither window).
    """
    n = len(signal)
    start = n // 4
    split = n * 3 // 8 + n // 4
    # The +1 presumably keeps two near-empty windows from failing the test.
    left_sum = np.sum(signal[start:split]) + 1
    right_sum = np.sum(signal[split + 1:]) + 1
    return left_sum > 2 * right_sum or right_sum > 2 * left_sum


def _rectify_dips(signal, mask):
    """Replace each run of flagged samples by a flat step.

    A run is opened where an unflagged sample is followed by at least three
    flagged ones, and closed on its last flagged sample.  On closing, the
    whole run is set to the largest ``signal`` value seen since it opened
    (0 when the opening was never seen).  Only indices 4 .. len-5 are
    scanned, as in the original code.

    Returns:
        ``(re_signal, counts_open)``: the stepped signal and the number of
        runs that were opened.
    """
    re_signal = np.zeros(len(signal))
    is_open = False
    signal_strength = 0
    counts_open = 0
    for i in range(4, len(signal) - 4):
        # start to receive
        if (mask[i - 1] == 0 and mask[i] == 1
                and mask[i + 1] == 1 and mask[i + 2] == 1):
            is_open = True
            counts_open += 1
        if is_open and signal[i] > signal_strength:
            signal_strength = signal[i]
        # start to refuse
        if (mask[i - 2] == 1 and mask[i - 1] == 1
                and mask[i] == 1 and mask[i + 1] == 0):
            j = i
            while j >= 0 and mask[j] > 0:
                re_signal[j] = signal_strength
                j -= 1
            signal_strength = 0
            is_open = False
    return re_signal, counts_open


def _peel_level(re_signal):
    """Split off the upper level of ``re_signal`` and validate both parts.

    Looks for the first empty band of width 0.1 (upper edge sliding from 1.0
    down in steps of 0.025), treats everything above it as the upper level
    and the remainder as the lower level.  Both parts must pass the balance
    test and the lower part must be non-negligible.

    Returns:
        The lower part divided by its maximum, or ``None`` when the split is
        rejected (the reason is printed).
    """
    zone_width = 0.1
    up_zone = 1
    for upper in np.arange(1.0, 0.12, -0.025):
        lower = upper - zone_width
        in_zone = np.logical_and(re_signal <= upper, re_signal >= lower)
        if np.sum(in_zone * 1) == 0:
            up_zone = upper - 0.01
            break

    is_upper = np.logical_not(re_signal <= up_zone)
    level_mask = (np.zeros(len(re_signal)) + 1) * is_upper
    up_signal = re_signal * level_mask
    down_signal = re_signal - up_signal

    if _is_unbalanced(up_signal):
        print("unbalanced up signal")
        return None

    if down_signal.max() <= 0.0001:
        # no signal, it is very possible to be outliers from white noise.
        print("no signal down_side ")
        return None

    down_signal = down_signal / abs(down_signal.max())
    if _is_unbalanced(down_signal):
        # The original printed "unbalanced up signal" here (copy-paste).
        print("unbalanced down signal")
        return None

    return down_signal


def wave_filter(t_id):
    """Count the distinct dip levels in the light curve with id ``t_id``.

    The light curve is located through the ``reversed_dict`` lookup (id ->
    subset index), fitted with ``nad_wat_robust`` (bandwidth 0.8), and its
    residuals are standardized with 1.4826 * MAD.  Samples that are not
    ``>= -0.9 * sqrt(2 * log(n))`` are treated as signal, turned into flat
    steps and then split into levels.  Two matplotlib figures are shown and
    progress is printed along the way.

    Relies on module-level names: ``s`` (list of subset names), ``np``,
    ``math``, ``plt`` and ``nad_wat_robust``.

    Args:
        t_id: id of the light curve to analyse.

    Returns:
        The number of levels found (0 to 3).  If the id cannot be mapped to
        a valid subset, the raw lookup result is returned instead (``None``
        or the out-of-range index), as in the original code.

    Raises:
        IndexError: the mapped subset holds no light curve with this id.
    """
    # The original assigned `reversed_dict` as a local, so its
    # `try: reversed_dict` probe always failed and the checkpoint was
    # reloaded on every call.  Declaring it global makes the cache work.
    global reversed_dict

    import DAL
    from DAL.datasets.checkpoint import Checkpoint

    lightcurves = DAL.create('lightcurves')

    try:
        reversed_dict
    except NameError:
        reversed_dict = Checkpoint().load("reversed_dict", t="obj")

    # Map the id to its subset.  `None` (unknown id) is tested explicitly
    # instead of relying on Python 2 ordering `None` below every number.
    subset_index = reversed_dict.get(t_id)
    if subset_index is None or subset_index < 0 or subset_index >= len(s):
        return subset_index

    sample = None
    for record in lightcurves.iter(s[subset_index]):
        name = record['id']
        if name == t_id:
            print(name)
            lc = record['data']
            half = len(lc) // 2
            # First half of the record is time, second half is flux.
            sample = (name, lc[:half], lc[half:])
            break
    if sample is None:
        # Same exception type as the original `sample_data[0]`, with context.
        raise IndexError("light curve %s not found in subset %s"
                         % (t_id, subset_index))

    print("-" * 10)
    print("%s %s" % ("id:", sample[0]))

    X = np.array(sample[1], dtype='float32')
    Y = np.array(sample[2], dtype='float32')
    valid = np.logical_not(np.isnan(Y))
    X = X[valid]
    Y = Y[valid]

    h = 0.8  # bandwidth; must be chosen carefully
    res = nad_wat_robust(X, Y, h)
    Y_hat = res[0]

    plt.figure(figsize=(10, 6))
    plt.plot(X, Y, '.k')
    plt.plot(X, Y_hat, '-b')
    plt.title("id:" + str(sample[0])
              + ", kernel regression, boxcar , bandwidth=" + str(h))
    plt.show()

    resids = Y - Y_hat
    sigma = 1.4826 * np.median(abs(resids - np.median(resids)))  # 1.4826 * MAD
    resids_standard = (resids - resids.mean()) / sigma
    beta = math.sqrt(2 * math.log(len(X)))

    resids_truc = resids_standard[np.logical_not(resids_standard >= -1 * beta)]
    # NOTE(review): the label says "norm1" but the 2-norm is computed; the
    # text is left unchanged -- confirm which one is intended.
    norm_one_sum = np.linalg.norm(resids_truc, ord=2)
    print("%s %s" % ("norm1 of residuals ", norm_one_sum))

    plt.figure(figsize=(10, 6))
    plt.plot(X, resids_standard, '.k')
    plt.plot(X, np.zeros(len(X)) - beta, '-r')
    plt.show()

    # Detect the different levels.
    wave_level = 0
    # 0.9 * beta lets more potential signals in.
    outliers = np.logical_not(resids_standard >= -1 * 0.9 * beta)
    # mask[i] == 1 for a signal sample, 0 otherwise; len(mask) == len(X).
    mask = (np.zeros(len(X)) + 1) * outliers
    if mask.sum() <= 3:
        # no signal, it is very possible to be outliers from white noise.
        _report_level(wave_level)
        return wave_level

    # Keep only the signal samples and reflect them above the x-axis.
    signal = -1 * (resids_standard * mask)
    s_min = signal[signal > 0].min()  # smallest non-zero value
    s_max = signal.max()
    if abs(s_min - s_max) <= 0.0001:
        # it is almost impossible to happen, just in case
        _report_level(wave_level)
        return wave_level
    signal = (signal - s_min) / abs(s_max - s_min)

    re_signal, counts_open = _rectify_dips(signal, mask)
    if re_signal.max() <= 0.0001:
        # no signal, it is very possible to be outliers from white noise.
        print("no signal after filtering")
        _report_level(wave_level)
        return wave_level

    # re-scale after filter
    re_signal = re_signal / abs(re_signal.max() - re_signal.min())
    if _is_unbalanced(re_signal):
        print("unbalanced signal ---")
        _report_level(wave_level)
        # NOTE(review): the source lost its indentation; this check is
        # reconstructed as nested under the unbalanced branch -- confirm.
        if counts_open > 5:
            return wave_level

    # confirm that there is at least one level of signals
    wave_level = 1

    lower_level = _peel_level(re_signal)
    if lower_level is None:
        return wave_level
    wave_level += 1

    # Repeat the split on what is left below the first level.
    re_signal = lower_level / abs(lower_level.max() - lower_level.min())
    if _peel_level(re_signal) is None:
        return wave_level
    wave_level += 1

    return wave_level
# Setup for the tweet ("wishes") analysis: imports, cluster client, and the
# start of the vocabulary pass over a slice of the dataset's subsets.
import time
import string
import re
import fractions
import numpy
import scipy
import time
from numpy import arange,array,ones,linalg
from pylab import plot,show
# NOTE(review): Python only accepts a __future__ import as the first statement
# of a module; this placement works only if statements are compiled one at a
# time (e.g. as notebook cells) -- confirm how this file is meant to be run.
from __future__ import division
from IPython.parallel import Client
import matplotlib.pyplot as plt
rc = Client()
dview = rc[:]
# DAL is not imported in this block; it must already be in scope here.
wishes=DAL.create('wishes')
# The slice [13:18] selects five subsets (the loop below labels each one a "day").
data = wishes.subsets()[13:18] #use only one recent week of twitter data to get the vocabulary,
# Clean the raw text data with filters
# Create a very long string comprising the first 5 days of twitter data
dictionary= {}
# NOTE(review): indentation was reconstructed from a collapsed source line;
# the `else` is taken to pair with the `if` (stop at the first tweet without
# a 'text' field) -- confirm.  `text` and `dictionary` are not used further
# within the visible part of this block.
for i in range(len(data)):
    print 'day', i
    # Lower-cased text of this subset's tweets, concatenated with no separator.
    text = ""
    for tweet in wishes.iter(data[i]):
        if tweet.has_key('text'):
            lower = tweet['text'].lower()
            text += lower
        else:
            #print i
            break