def proc_light_curve_pl(subset_index):
    import DAL
    #from DAL.datasets.checkpoint import Checkpoint
    lightcurves = DAL.create('lightcurves')
    #checkpoint = Checkpoint()
    if(subset_index >= len(s)):
        return []
    subset_filename = s[subset_index]
    subset_number = subset_filename[:-9]
    print subset_filename , subset_number
    ret = []
    ccc = 0
    for i in lightcurves.iter(subset_filename):
        ccc += 1
        #if(ccc >= 5):
        #    break
        #print "--"*10
        #t1 = time.clock()
        name = i['id']
        lc = i['data']
        time_x = lc[:int(len(lc)/2)]  #modified - first half time
        flux_y = lc[int(len(lc)/2):]  #modified - first half flux
        X = np.array(time_x, dtype = 'float32')
        Y = np.array(flux_y, dtype = 'float32')
        X = X[np.logical_not(np.isnan(Y))]
        Y = Y[np.logical_not(np.isnan(Y))]
        
        res = kernel_regress_cross_validation(X, Y)
        Y_hat = res[0]
        Y_band = res[2]
        
        resids = Y- Y_hat
        sigma = 1.4826* np.median(abs(resids- np.median(resids))) # 1.4826 * MAD
        resids_standard = (resids - resids.mean() ) / sigma
        beta = math.sqrt(2*math.log(len(X)))
        resids_truc = resids_standard[np.logical_not(resids_standard >=  -1*beta)]
        norm_one_sum = np.linalg.norm(resids_truc, ord=1)
        ret.append((name, norm_one_sum))
        '''
        print subset_index, name, Y_band, norm_one_sum
        print "temp_spent ",time.clock() - t1
        
        fig = plt.figure(figsize=(10,6))
        plt.plot(X, Y, '.k')
        plt.plot(X, Y_hat, '-b')
        title_string = "id:" + str(sample[0]) +", kernel regression, boxcar , bandwidth="+str(h)
        plt.title(title_string)
        plt.show()
        '''
        
    #checkpoint.store("proj4_1_"+str(subset_index), obj=ret) #also supports fp=<file pointer> and s=<string>
    return ret
def gist(index):
    """Return the standardised colour GIST descriptor of one tiny image.

    The image with id ``index`` is fetched from the ``tinyimages`` data
    set, reshaped to 32 x 32 x 3 (Fortran order), converted to an image
    object and passed to ``leargist.color_gist``.  The descriptor is then
    centred on its mean and divided by its standard deviation.

    :param index: id of the image in the ``tinyimages`` data set.
    :returns: list with the standardised descriptor values (the original
        author's note says the descriptor has 960 entries).
    """
    import DAL
    # ``import scipy`` alone does not guarantee that the ``scipy.misc``
    # submodule is loaded, so import the submodule explicitly.
    # NOTE(review): ``scipy.misc.toimage`` only exists in old SciPy releases.
    import scipy.misc
    import leargist
    import math
    import numpy as np
    tinyimages = DAL.create('tinyimages')
    pixels = tinyimages.byid(index).reshape(32, 32, 3, order="F").copy()
    img = scipy.misc.toimage(pixels)
    vec = np.array(leargist.color_gist(img))
    mu = np.mean(vec)
    sd = math.sqrt(np.var(vec))
    # normalization: zero mean, unit variance
    standarized_gist = (vec - mu) / sd
    return list(standarized_gist)
import re
import fractions
import math
import scipy
from numpy import arange,array,ones,linalg
from pylab import plot,show
from __future__ import division
from IPython.parallel import Client
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import matplotlib.pyplot as plt

# Handle on the light-curve data set and the sequence of its subsets.
# proc_light_curve_pl and wave_filter read the global ``s`` and index into it.
# NOTE(review): no module-level ``import DAL`` is visible above this line in
# the file; confirm DAL is already in scope when this statement runs.
lightcurves = DAL.create('lightcurves')
s = lightcurves.subsets()
# The bare string literal below is disabled scratch code (never executed): it
# collected (id, time, flux) tuples from subset s[101] until id 3096237.
'''
dat = []
for i in lightcurves.iter(s[101]):
    name = i['id']
    #print type(name),name == 3096237
    if name == 3096237:
        break
    lc = i['data']
    time = lc[:int(len(lc)/2)]  #modified - first half time
    flux = lc[int(len(lc)/2):]  #modified - first half flux
    dat.append( (name, time, flux) ) #modified
'''
# Report how many subsets are available
print len(s)
print "files:"
###############################
## Project 1 problem 1
## Stat 376
## Zhengjian Song
###############################

# Verify cluster connection and connect to the data set
import DAL
from IPython.parallel import Client
# Connect to the IPython parallel cluster and take a view over all engines
rc = Client()
#print len(rc)
dview = rc[:]
# Handle on the tiny-images data set
tinyimages = DAL.create('tinyimages')

# Search the image set by keyword ('car' and 'bicycle'); the second argument
# is 2000 for both searches (presumably the maximum number of hits - TODO confirm)
car_ids = tinyimages.search('car', 2000)
bicycle_ids = tinyimages.search('bicycle', 2000)

# Ground truth: image ids verified by hand to really show the category.
# According to the original note there are 100 ids in each category.
car_true = [12025562,12025563,12025564,12025565,12025567,12025568,12025569,12025571,12025572,12025573,12025574,12025576,12025580,12025583,12025584,12025585,12025586,12025587,12025588,12025589,12025590,12025591,12025592,12025593,12025594,12025597,12025599,12025601,12025602,12025603,12025604,12025605,12025606,12025611,12025615,12025618,12025620,12025624,12025627,12025628,12025630,12025631,12025632,12025633,12025634,12025635,12025636,12025661,12025663,12025664,12025668,12025670,12025671,12025674,12025675,12025676,12025677,12025678,12025679,12025682,12025683,12025684,12025686,12025687,12025688,12025689,12025690,12025691,12025692,12025693,12025701,12025702,12025706,12025707,12025709,12025711,12025712,12025713,12025717,12025718,12025725,12025726,12025727,12025747,12025749,12025752,12025753,12025757,12025768,12025769,12025771,12025772,12025773,12025774,12025776,12025778,12025779,12025780,12025781,12025782]
bicycle_true = [7112211,7112212,7112213,7112214,7112215,7112216,7112218,7112219,7112220,7112223,7112224,7112225,7112226,7112227,7112228,7112229,7112231,7112232,7112234,7112235,7112237,7112239,7112240,7112241,7112243,7112244,7112245,7112246,7112247,7112248,7112249,7112250,7112251,7112253,7112259,7112260,7112261,7112262,7112263,7112265,7112266,7112268,7112270,7112272,7112273,7112275,7112276,7112281,7112285,7112287,7112297,7112300,7112301,7112302,7112303,7112304,7112309,7112310,7112312,7112320,7112325,7112328,7112329,7112330,7112335,7112339,7112341,7112342,7112343,7112344,7112347,7112348,7112349,7112350,7112351,7112352,7112355,7112358,7112359,7112360,7112361,7112362,7112365,7112366,7112371,7112374,7112380,7112381,7112386,7112394,7112403,7112409,7112413,7112414,7112416,7112421,7112424,7112426,7112427,7112431]
 
# Show the results: first the images for the first 300 ids returned by the
# 'car' keyword search, then the hand-verified car images for comparison.
images = tinyimages.byid(car_ids[0:300])
print "\"cars\""
tinyimages.display(images)
images = tinyimages.byid(car_true)
print "verified cars"
tinyimages.display(images)
print "\n"
import numpy
import scipy
from numpy import arange,array,ones,linalg
from pylab import plot,show
from __future__ import division
from IPython.parallel import Client
import matplotlib.pyplot as plt

from mpl_toolkits.mplot3d import Axes3D
from matplotlib import cm
from matplotlib.ticker import LinearLocator, FormatStrFormatter
import matplotlib.pyplot as plt
import numpy as np


# Handle on the crime data set and the list of crime types
crime = DAL.create('crime')
crime_list = crime.get_crime_list()
# the following can be slow; do this once at the beginning
# of the program and use this data structure throughout
crime_counts = crime.get_crime_counts()
region_list = crime.get_region_list()
K = 10 #number of crime types (hard-coded; NOTE(review): confirm it matches len(crime_list))
# assumes crime_counts holds exactly K entries per region - TODO confirm
N = int(len(crime_counts)/K) #number of regions
# length of the entry stored under key (100, 0); presumably the number of
# time steps per series - TODO confirm
T = len(crime_counts.get((100,0)))

# Report the dimensions and the crime types
print K,N,T
print crime_list


# Module-level lists, created empty here; nothing in the visible code fills
# them (presumably checkpoint file names for project 3 - TODO confirm)
proj3_1_filename = []
proj3_2_filename = []
def wave_filter(t_id):

    import DAL
    from DAL.datasets.checkpoint import Checkpoint
    #from DAL.datasets.checkpoint import Checkpoint
    lightcurves = DAL.create('lightcurves')
   
    try:
        reversed_dict
    except:
        checkpoint = Checkpoint()
        reversed_dict = checkpoint.load("reversed_dict", t = "obj")
    
    #get the date by id
    sample_data = []
    subset_index = reversed_dict.get(t_id)
    if(subset_index < 0 or subset_index>= len(s)):
        return subset_index
    for i in lightcurves.iter(s[subset_index]):
        name = i['id']
        if name == t_id:
            print name
            lc = i['data']
            time = lc[:int(len(lc)/2)]  #modified - first half time
            flux = lc[int(len(lc)/2):]  #modified - first half flux
            sample_data.append((name, time, flux))
            break
    
    
    for sample in [sample_data[0]]:
        print "-"*10
        print "id:", sample[0]
        X = np.array(sample[1], dtype = 'float32')
        Y = np.array(sample[2], dtype = 'float32')
        X = X[np.logical_not(np.isnan(Y))]
        Y = Y[np.logical_not(np.isnan(Y))]
        h = 0.8 # be carefully chosen
        #res = nad_wat(X, Y, 0.1)
        #fit the curve
        res = nad_wat_robust(X,Y, h)
        Y_hat = res[0]

        ###################################
        fig = plt.figure(figsize=(10,6))
        plt.plot(X, Y, '.k')
        plt.plot(X, Y_hat, '-b')
        title_string = "id:" + str(sample[0]) +", kernel regression, boxcar , bandwidth="+str(h)
        plt.title(title_string)
        plt.show()
        ###################################
        
        resids = Y- Y_hat
        sigma = 1.4826* np.median(abs(resids- np.median(resids))) # 1.4826 * MAD
        resids_standard = (resids - resids.mean() ) / sigma
        beta = math.sqrt(2*math.log(len(X)))
        
        #################################################
        resids_truc = resids_standard[np.logical_not(resids_standard >=  -1*beta)]
        norm_one_sum = np.linalg.norm(resids_truc, ord =2 ) # norm 2
        print "norm1 of residuals ", norm_one_sum
        #norm_ones.append(norm_one_sum)
        fig = plt.figure(figsize=(10,6))
        plt.plot(X, resids_standard, '.k')
        plt.plot(X, np.zeros(len(X)) - beta, '-r' )
        plt.show()
        ###############################
        
        ## detect the different levels, if there are more levels, just think they are binary stars.
        wave_level = 0
        outliers = np.logical_not(resids_standard >=  -1*0.9*beta) # let more potential signals in
        
        #scale the outlier residuals to range(0,1)
        mask = (np.zeros(len(X))+1)* outliers # len (mask) == len(X) , if signal, mask[i] == 1,, not signal, mask[i] == 0
        
        if(mask.sum() <= 3): # no signal, it is very possible to be outliers from white noise.
            # mark this to be white noise
            print "LEVEL: ", wave_level
            print "==="*10
            return wave_level
            
        # get the signal out
        signal = resids_standard * mask

        # rescale the signal to process
        signal = -1 * signal # get them reflected above the x-axis
        s_min = signal[ signal > 0].min() # the min non-zero 
        s_max = signal.max() # the max
        
        if(abs(s_min - s_max) <= 0.0001):
            #it is almost impossible to happen, just in case
            print "LEVEL: ", wave_level
            print "==="*10
            return wave_level
        
        #scaled signal
        signal = signal - s_min
        signal = signal / abs(s_max - s_min)
        

        #stat the area in each cell.
        #resharp the signals into rect
        total_num_signal = np.sum(mask)
        re_signal = np.zeros(len(signal))
        
        is_open = False
        signal_strength = 0
        counts_open = 0
        
        for i in range(4, len(X)-4):
            # start to receive
            if( mask[i-1] == 0 and mask[i] == 1 and mask[i+1] == 1 and mask[i+2] == 1):
                is_open = True
                counts_open += 1
                
            if(is_open == True):
                if(signal[i] > signal_strength):
                    signal_strength = signal[i]
            # start to refuse
            if( mask[i-2] == 1 and mask[i-1] == 1 and mask[i] == 1 and mask[i+1] == 0):
                j = i
                while(j >= 0 and mask[j] > 0):
                    re_signal[j] = signal_strength
                    j = j -1
                signal_strength = 0
                is_open = False
        
        if(re_signal.max() <= 0.0001): # no signal, it is very possible to be outliers from white noise.
            # mark this to be white noise
            print "no signal after filtering"
            print "LEVEL: ", wave_level
            print "==="*10
            
            return wave_level
            
        #re-scale after filter
        re_signal = re_signal / abs(re_signal.max() - re_signal.min())
        
        left_sum = 0
        right_sum = 0
        left_sum = np.sum(re_signal[int(len(X)/4): int(len(X)*3/8)+int(len(X)/4)])+1
        right_sum = np.sum(re_signal[int(len(X)*3/8)+int(len(X)/4)+1:])+1
        if(left_sum > 2* right_sum or right_sum > 2* left_sum ):
            print "unbalanced signal ---"
            print "LEVEL: ", wave_level
            print "==="*10
            if np.sum(counts_open > 5):
                return wave_level
        
        
        # confirm that there is at least one level of signals
        wave_level = 1 
        
        ########################################################
        #get the upper level of signal
        
        #find the safe zone between waves
        zone_width = 0.1
        up_bound = np.arange(1.0, 0.12, -0.025)
        down_bound = up_bound - zone_width
        
        up_zone = 1
        down_zone = 1
        for i in range(len(up_bound)):
            ones = np.zeros(len(X))+1
            is_in_zone = np.logical_and(re_signal <= up_bound[i], re_signal >= down_bound[i])
            num_in_zone = np.sum(is_in_zone * 1)
            if(num_in_zone == 0):
                up_zone = up_bound[i]-0.01
                down_zone = down_bound[i]+0.01
                break

        
        is_upper = np.logical_not(re_signal <= up_zone) # let more potential signals in
        #scale the outlier residuals to range(0,1)
        mask = (np.zeros(len(X))+1)* is_upper # len (mask) == len(X) , if signal, mask[i] == 1,, not signal, mask[i] == 0
        
        up_signal = re_signal * mask
        down_signal = re_signal - up_signal
        
        
        up_is_true_signal = True
        down_is_true_signal = True 
        # testing the upside is singel
        left_sum = 0
        right_sum = 0
        left_sum = np.sum(up_signal[int(len(X)/4): int(len(X)*3/8)+int(len(X)/4)])+1
        right_sum = np.sum(up_signal[int(len(X)*3/8)+int(len(X)/4)+1:])+1
        if(left_sum > 2* right_sum or right_sum > 2* left_sum ):
            print "unbalanced up signal"
            up_is_true_signal = False
            return wave_level
            
        # testing the downside is singel
        if(down_signal.max() <= 0.0001): # no signal, it is very possible to be outliers from white noise.
            # mark this to be white noise
            print "no signal down_side "
            down_is_true_signal = False
            return wave_level
    
        down_signal = down_signal / abs(down_signal.max())
        left_sum = 0
        right_sum = 0
        left_sum = np.sum(down_signal[int(len(X)/4): int(len(X)*3/8)+int(len(X)/4)])+1
        right_sum = np.sum(down_signal[int(len(X)*3/8)+int(len(X)/4)+1:])+1
        if(left_sum > 2* right_sum or right_sum > 2* left_sum ):
            print "unbalanced up signal"
            down_is_true_signal = False
            return wave_level
     
        if(up_is_true_signal and down_is_true_signal):
            wave_level += 1
        
        
        #############################################
        #continue if you want to get more levels
        #############################################
        re_signal = down_signal
        
        #re-scale after filter
        re_signal = re_signal / abs(re_signal.max() - re_signal.min())
        
    
        ########################################################
        #get the upper level of signal
        
        #find the safe zone between waves
        zone_width = 0.1
        up_bound = np.arange(1.0, 0.12, -0.025)
        down_bound = up_bound - zone_width
        
        up_zone = 1
        down_zone = 1
        for i in range(len(up_bound)):
            ones = np.zeros(len(X))+1
            is_in_zone = np.logical_and(re_signal <= up_bound[i], re_signal >= down_bound[i])
            num_in_zone = np.sum(is_in_zone * 1)
            if(num_in_zone == 0):
                up_zone = up_bound[i]-0.01
                down_zone = down_bound[i]+0.01
                break

        is_upper = np.logical_not(re_signal <= up_zone) # let more potential signals in
        #scale the outlier residuals to range(0,1)
        mask = (np.zeros(len(X))+1)* is_upper # len (mask) == len(X) , if signal, mask[i] == 1,, not signal, mask[i] == 0
        
        up_signal = re_signal * mask
        down_signal = re_signal - up_signal
        
        
        up_is_true_signal = True
        down_is_true_signal = True 
        # testing the upside is singel
        left_sum = 0
        right_sum = 0
        left_sum = np.sum(up_signal[int(len(X)/4): int(len(X)*3/8)+int(len(X)/4)])+1
        right_sum = np.sum(up_signal[int(len(X)*3/8)+int(len(X)/4)+1:])+1
        if(left_sum > 2* right_sum or right_sum > 2* left_sum ):
            print "unbalanced up signal"
            up_is_true_signal = False
            return wave_level
            
        # testing the downside is singel
        if(down_signal.max() <= 0.0001): # no signal, it is very possible to be outliers from white noise.
            # mark this to be white noise
            print "no signal down_side "
            down_is_true_signal = False
            return wave_level
    
        down_signal = down_signal / abs(down_signal.max())
        left_sum = 0
        right_sum = 0
        left_sum = np.sum(down_signal[int(len(X)/4): int(len(X)*3/8)+int(len(X)/4)])+1
        right_sum = np.sum(down_signal[int(len(X)*3/8)+int(len(X)/4)+1:])+1
        if(left_sum > 2* right_sum or right_sum > 2* left_sum ):
            print "unbalanced up signal"
            down_is_true_signal = False
            return wave_level
     
        if(up_is_true_signal and down_is_true_signal):
            wave_level += 1
        
        
        return wave_level
import time
import string
import re
import fractions
import numpy
import scipy
import time
from numpy import arange,array,ones,linalg
from pylab import plot,show
from __future__ import division
from IPython.parallel import Client
import matplotlib.pyplot as plt 

# Connect to the IPython parallel cluster and take a view over all engines
rc = Client()
dview = rc[:]
# Handle on the 'wishes' (tweet) data set
wishes=DAL.create('wishes')
data = wishes.subsets()[13:18] # subsets 13..17 (five of them, one per day) are used to build the vocabulary

# Clean the raw text data with filters
# Create one very long string per day from the five selected days of twitter data
dictionary= {}
for i in range(len(data)):
    print 'day', i
    text = ""
    for tweet in wishes.iter(data[i]):
        if tweet.has_key('text'):
            lower = tweet['text'].lower()
            text += lower
        else:
            #print i
            break