def test_zs(self):
    """Compare ``stats.zs`` with hand-computed reference values.

    not in R, so tested by using
    (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4)
    """
    expected = [
        -1.3416407864999,
        -0.44721359549996,
        0.44721359549996,
        1.3416407864999,
    ]
    actual = stats.zs(self.testcase)
    # Argument order matches the original call: reference first, result second.
    assert_array_almost_equal(expected, actual, decimal=12)
def standardize(self, data):
    """Return a copy of ``data`` with each column replaced by its z-scores.

    ``data`` must support row iteration and ``[:, i]`` column assignment
    (for example a 2-D NumPy array).  The input itself is not modified:
    the standardized columns are written into a deep copy, which is printed
    and then returned.

    NOTE(review): the values are stored in a copy of ``data``, so if ``data``
    is an integer-typed array the z-scores would be truncated on assignment;
    callers presumably pass floating-point data -- confirm.
    """
    import scipy.stats as st

    # ``zs`` is not available in current SciPy.  ``zscore`` with its default
    # arguments yields the same values for a single 1-D column, so prefer it
    # and fall back to the legacy name only where ``zscore`` is missing.
    zscore = getattr(st, "zscore", None) or st.zs

    newdata = copy.deepcopy(data)
    for i, col in enumerate(zip(*data)):
        newdata[:, i] = zscore(col)

    # A single parenthesized argument prints identically under the Python 2
    # print statement and the Python 3 print function.
    print(newdata)
    return newdata
def standardize(self, data):
    """Standardize every column of ``data`` and return the result as a copy.

    ``data`` has to be iterable by rows and accept ``[:, i]`` column
    assignment (for example a 2-D NumPy array).  A deep copy receives the
    z-scored columns; that copy is printed and returned while ``data`` is
    left as it was.

    NOTE(review): the copy keeps whatever element type ``data`` has, so an
    integer-typed array would truncate the z-scores on assignment; callers
    presumably pass floating-point data -- confirm.
    """
    import scipy.stats as st

    # Current SciPy has no ``zs``; ``zscore`` with default arguments gives
    # the same numbers for one 1-D column.  Keep the legacy function as a
    # fallback for installations that do not provide ``zscore``.
    zscore = getattr(st, "zscore", None) or st.zs

    newdata = copy.deepcopy(data)
    for i, col in enumerate(zip(*data)):
        newdata[:, i] = zscore(col)

    # With one parenthesized argument this prints the same text in
    # Python 2 (print statement) and Python 3 (print function).
    print(newdata)
    return newdata
def test_zs(self):
    """Verify ``stats.zs`` on ``self.testcase`` against precomputed z-scores.

    not in R, so tested by using
    (testcase[i]-mean(testcase,axis=0))/sqrt(var(testcase)*3/4)
    """
    reference = [
        -1.3416407864999,
        -0.44721359549996,
        0.44721359549996,
        1.3416407864999,
    ]
    computed = stats.zs(self.testcase)
    # Reference values go first, the computed result second, as before.
    assert_array_almost_equal(reference, computed, decimal=12)
def residual_nllf(v): # Normalize the scores so that we are assuming z = stats.zs(v)
period = 350 _df, _adf = .1, '10%' desvio = 2 yi, xi = 0, 0 for i in range(base.shape[1] - 1): yn = base.iloc[:, i].name xn = base.iloc[:, i].name if yn == atvA: yi = i if xn == atvB: xi = i period = period * -1 y = base.iloc[period:, yi].values y = zs(y) x = base.iloc[period:, xi].values x = zs(x) ynn = base.iloc[:, yi].name xnn = base.iloc[:, xi].name x = sm.add_constant(x) model = sm.OLS(y, x).fit() adf = ts.adfuller(model.resid, 1) std = statistics.stdev(model.resid) if model.resid[-1] > desvio * std or model.resid[-1] < desvio * std * -1: print('Pode Operar!') print(ynn, xnn) if adf[1] < _df:
def main(): top = 50 #max: 8921 with filters database = "calorie_king_social_networking_2010" server = "tarraco.chem-eng.northwestern.edu" user = "******" passwd = "n1ckuDB!" db = Connection(server, database, user, passwd) db.execute( "DROP TABLE IF EXISTS gaps_by_frequency") #i remove the (old) table db.execute( """ CREATE TABLE gaps_by_frequency ( file_index INT, ck_id CHAR (20), start_date INT, end_date INT, start_day INT, end_day INT, days_gap INT, zscore_gap FLOAT ) """ ) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones #query="""describe gaps_by_frequency""" #db.execute ("DROP TABLE IF EXISTS animal") # query="""show tables""" query = """select * from gaps_by_frequency""" # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES (1, 'reptile',7, 4,1,20,18, 2.,3.) ") # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES ("+str(1)+", 'reptile',"+str(1)+", "+str(1)+","+str(1)+","+str(1)+","+str(1)+", "+str(1.)+","+str(1.)+") ") #query="""show tables""" # query="""select * from gaps_by_frequency""" # result1 = db.query(query) # is a list of dict. # for r1 in result1: # print r1 list_all_average_frequencies = [] histogram_all_freq_no_averaged = [0] * 1000 num_events_all_freq_no_averaged = 0. for index_file in range(top): index_file += 1 list_average_frequencies_one_user = [] histogram_idiv = [0] * 1000 num_events_indiv = 0. 
#input file: file_name = "temporal_series/most_weigh_ins/weigh_in_time_serie_days" + str( index_file) + "_top50" #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_filters" file = open(file_name + ".dat", 'r') list_lines_file = file.readlines() list_dates = [] list_days = [] list_frequencies = [] cont = 0 for line in list_lines_file: if cont > 0: # i skip the first line,cos it doesnt have an associated freq. list = line.split(" ") ck_id = list[10] print line try: list_frequencies.append(float(list[9])) #frequency list_days.append(float(list[4])) #relative day list_dates.append(list[7]) #dates except IndexError: list_frequencies.append(float(0.0)) #frequency list_days.append(float(list[4])) #day list_dates.append(list[7]) #dates cont += 1 print list_dates print "\n\n" list_zscores = stats.zs(list_frequencies) for i in range(len(list_zscores)): if list_zscores[ i] >= 3.0: # statistically significant gap if zs>=3 std if list_frequencies[ i] > 15.: # dont consider it a gap if it is shorter than 2weeks if i > 2: #or happens for the very second measurement print "on file", index_file, "between days:", list_days[ i - 1], "-", list_days[ i], "there is a gap. freq:", list_frequencies[ i], "zscore:", list_zscores[i] time_gap = list_days[i] - list_days[i - 1] db.execute( "INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, zscore_gap) VALUES (" + str(index_file) + ", " + str(ck_id) + "," + str(list_dates[i - 1]) + ", " + str(list_dates[i]) + "," + str(list_days[i - 1]) + "," + str(list_days[i]) + "," + str(time_gap) + ", " + str(list_zscores[i]) + " ") print "\n", "on file", index_file, "mean freq:", np.asanyarray( list_frequencies).mean( axis=0), "std:", np.asanyarray(list_frequencies).std( axis=0, ddof=0) raw_input()
#!/usr/bin/env python
"""Z-score the value column of a two-column, tab-separated file.

Reads ``K562_H3K27me_mono-tri.txt`` (rows of ``id<TAB>value``), converts all
values to z-scores computed over the whole column, and writes
``id<TAB>zscore`` rows to ``K562_H3K27me_mono-tri.zscored.tester.txt``.
"""
import csv

from numpy import array

try:
    # ``zs`` is not available in current SciPy; ``zscore`` with default
    # arguments gives the same result for a 1-D array.
    from scipy.stats import zscore as zs
except ImportError:
    # Installations without ``zscore`` still provide the legacy name.
    from scipy.stats import zs

ids = []
vals = []
# Binary modes are kept from the original script (Python 2 csv convention).
# The ``with`` blocks make sure both files are closed, and the output
# flushed, even if a row fails to parse.
with open("K562_H3K27me_mono-tri.txt", "rb") as infile:
    for thisid, val in csv.reader(infile, delimiter="\t"):
        ids.append(thisid)
        vals.append(float(val))

newvals = zs(array(vals))

with open("K562_H3K27me_mono-tri.zscored.tester.txt", "wb") as outfile:
    writer = csv.writer(outfile, delimiter="\t")
    writer.writerows(zip(ids, newvals))
def main(): top = 8924 #max: 8924 for the files with filters (>=10 days, >=10weigh-ins >= 1/30 weigh-ins per day). max:50 for the top50 longest time series (no filter) zscore_threshold = 1. # it is a statistically significant gap if zs>=3 std min_freq = 10. # to consider something a gap database = "calorie_king_social_networking_2010" server = "tarraco.chem-eng.northwestern.edu" user = "******" passwd = "n1ckuDB!" db = Connection(server, database, user, passwd) db.execute( "DROP TABLE IF EXISTS gaps_by_frequency") #i remove the old table #i create a new table in an existing DB db.execute( """ CREATE TABLE gaps_by_frequency ( file_index INT, ck_id CHAR (36), index_start_day INT, index_end_day INT, start_day INT, end_day INT, days_gap INT, zscore_gap FLOAT, average_freq FLOAT ) """ ) # if i use triple quotation marks, i can have jumps of line no problem, but not with single ones #query="""describe gaps_by_frequency""" #db.execute ("DROP TABLE IF EXISTS animal") # query="""show tables""" query = """select * from gaps_by_frequency""" # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES (1, 'reptile',7, 4,1,20,18, 2.,3.) ") # db.execute ("INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, std_freq, zscore_gap) VALUES ("+str(1)+", 'reptile',"+str(1)+", "+str(1)+","+str(1)+","+str(1)+","+str(1)+", "+str(1.)+","+str(1.)+") ") #query="""show tables""" # query="""select * from gaps_by_frequency""" # result1 = db.query(query) # is a list of dict. # for r1 in result1: # print r1 list_all_average_frequencies = [] histogram_all_freq_no_averaged = [0] * 1000 num_events_all_freq_no_averaged = 0. for index_file in range(top): index_file += 1 print "\n\n", index_file list_average_frequencies_one_user = [] histogram_idiv = [0] * 1000 num_events_indiv = 0. 
#input file: #file_name="temporal_series/most_weigh_ins/weigh_in_time_serie_days"+str(index_file)+"_top50" file_name = "temporal_series/most_weigh_ins/weigh_in_time_serie_days" + str( index_file) + "_filters" # OJO!!!!!!!! EN ESTE ARCHIVO, EL DIA (RELATIVO AL PRIMERO) ES LA COLUNMA 4, NO LA 0 !!!!!!!!!! file = open(file_name + ".dat", 'r') list_lines_file = file.readlines() list_dates = [] list_days = [] list_frequencies = [] cont = 0 for line in list_lines_file: if cont > 0: # i skip the first line,cos it doesnt have an associated freq. list = line.split(" ") ck_id = list[8].strip("\n") try: list_frequencies.append(float(list[7])) #frequency list_days.append(float( list[4])) #relative day to the sign-up date list_dates.append(list[5]) #dates except IndexError: list_frequencies.append(float(0.0)) #frequency list_days.append(float(list[4])) #day list_dates.append(list[5]) #dates cont += 1 average_freq = np.mean(list_frequencies) list_zscores = stats.zs(list_frequencies) # OJO!!!!!!!!! list_zscores[0] (o tb list_frequencies[0]) corresponde a la diff entre la primera y la segunda entrada de list_days, por lo que en realindad #hay un desfase de una unidad entre los indices de las dos listas num_gaps = 0 for i in range(len(list_zscores)): if list_zscores[ i] >= zscore_threshold: # it is a statistically significant gap if zs>= zscore_threshold if list_frequencies[ i] > min_freq: # dont consider it a gap if it is shorter than x days if i >= 1: #because of the python thing about list[-1]=last_element_of_list) print " between days:", list_days[i - 1], "-", list_days[ i], "there is a gap. 
freq:", list_frequencies[ i], "zscore:", list_zscores[ i], "average freq: ", average_freq, ck_id, "on file", index_file time_gap = list_days[i] - list_days[i - 1] # db.execute (""" # INSERT INTO gaps_by_frequency (file_index, ck_id, start_date, end_date, start_day, end_day, days_gap, zscore_gap) #VALUES (%s, %s, %s,%s, %s, %s,%s, %s,%s, %s) #""", str(index_file), str(ck_id),str(list_dates[i-1]), str(list_dates[i]),str(list_days[i-1]),str(list_days[i]),str(time_gap), str(list_zscores[i]), str(np.asanyarray(list_frequencies).mean(axis=0)), str(np.asanyarray(list_frequencies).std(axis=0, ddof=0))) NO FUNCIONA!! db.execute( """ INSERT INTO gaps_by_frequency (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, zscore_gap, average_freq) VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) """, str(index_file), str(ck_id), str(list_days[i - 1]), str(list_days[i]), i, i + 1, str(time_gap), str(list_zscores[i]), str(average_freq)) # note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. (also, remember that it starts ato 0 index) num_gaps += 1 # if ck_id== "34214d9b-3fae-43d5-a961-bf7a94e22a3c" : # for ii in range(len(list_zscores)): # print list_days[ii],list_frequencies[ii],list_zscores[ii] # raw_input() # print str(ck_id),str(list_days[i-1]),str(list_days[i]),i,i+1,str(time_gap), str(list_zscores[i]), str(average_freq) else: # for the very first point time_gap = list_days[i] db.execute( """ INSERT INTO gaps_by_frequency (file_index, ck_id, start_day, end_day, index_start_day, index_end_day, days_gap, zscore_gap, average_freq) VALUES (%s, %s, %s, %s,%s, %s, %s, %s, %s) """, str(index_file), str(ck_id), str(0), str(list_days[i]), i, i + 1, str(time_gap), str(list_zscores[i]), str(average_freq)) # note: to get the index (of the point) for the days, it is i+1, because i corresponds to the serie of freq. 
(also, remember that it starts ato 0 index) num_gaps += 1 print "on file", index_file, "mean freq:", np.asanyarray( list_frequencies).mean( axis=0), "std:", np.asanyarray(list_frequencies).std(axis=0, ddof=0)