def _read_sorted_series(entry):
    """Load one variable from a data file as a sorted list of pairs.

    ``entry`` is ``['variable', 'filename', 'station_ID', 'kind']``.  The
    columns ``timestamp`` and ``variable`` are read from the node
    ``root.s<station_ID>.<kind>`` of the HDF5 file ``filename``.

    When the variable column is not one-dimensional the user is asked which
    plate (column) to use.  If the variable has an entry in ``low_limit``,
    values outside ``[low_limit, high_limit]`` are dropped; the program
    exits when nothing valid remains.

    Returns a list of ``(timestamp, value)`` tuples in sorted order.
    """
    name = entry[0]

    data_file = openFile(str(entry[1]), 'r')
    try:
        # Walk the node path attribute by attribute instead of eval()-ing a
        # string built from the caller's input.
        node = data_file.root
        for part in ('s%s.%s' % (str(entry[2]), str(entry[3]))).split('.'):
            node = getattr(node, part)
        timestamps = node.col('timestamp')
        values = node.col(str(name))
    finally:
        # Close the input file even when a node or column is missing.
        data_file.close()

    if len(values.shape) != 1:
        print('There are %d plates with an individual %s value.'
              % (values.shape[1], str(name)))
        plate_number = int(question.digit_plate(
            "Enter the plate number that you want to you use in your correlation analysis ( e.g. '1' ): ",
            values.shape[1]))
        values = values[:, plate_number - 1]
    values = values.tolist()

    # Pair every timestamp with its value and sort the pairs.
    series = sorted(zip(timestamps, values))

    if name in low_limit:
        lower = low_limit[name]
        upper = high_limit[name]
        valid = [(stamp, value) for stamp, value in series
                 if lower <= value <= upper]

        if len(series) != len(valid):
            print('Removed %d rows of bad %s data.'
                  % (len(series) - len(valid), name))

        if len(valid) == 0:
            print('Exit. In your data file there is no valid %s data' % (name))
            exit()

        series = valid

    return series


def _append_rows(row, values1, values2):
    """Append one table row per ``(values1[k], values2[k])`` pair."""
    for value1, value2 in zip(values1, values2):
        row['variable1'] = value1
        row['variable2'] = value2
        row.append()


def interpolate(var1, var2):
    """Interpolate to make two variable lists equal length.

    ``var1`` and ``var2`` are lists with one entry per data file; every
    entry has the form ``['variable', 'filename', 'station_ID', 'kind']``.
    Entry ``i`` of ``var1`` is combined with entry ``i`` of ``var2``.

    For each pair of files the longer series keeps its own values and the
    shorter one is linearly interpolated (``np.interp``) onto the timestamps
    of the longer one.  When both series are equally long no interpolation
    is done and only rows whose timestamps are equal at the same position
    are kept.

    The result is written to the table ``/correlation/table`` (columns
    ``variable1`` and ``variable2``) of a new HDF5 file in the current
    directory.

    Returns the name of the HDF5 file that was created.
    """
    print('')
    print('Interpolating and creating new table...')
    print('')

    # Row layout of the output table.
    class Variable1(IsDescription):
        variable1 = Float64Col()
        variable2 = Float64Col()

    # Derive the date interval from the first file name of var1: strip the
    # 'data_s<station>_' prefix, split on ' -' and drop the '.h5' suffix.
    stripped_name = var1[0][1].replace('data_s' + str(var1[0][2]) + '_', '')
    start_date, _, remainder = stripped_name.partition(' -')
    end_date = remainder[1:].replace('.h5', '')

    filename = ('interpolated_table_' + str(var1[0][0])
                + '_station' + str(var1[0][2])
                + '_with_' + str(var2[0][0])
                + '_station' + str(var2[0][2])
                + '_' + start_date + '_' + end_date + '.h5')

    # NOTE: disabled alternative file-name scheme, kept for reference.
    """
    intermediate1 = var1[0][1].replace('data_s' + str(var1[0][2]) + '_', '')
    station_id_and_date_interval1 = intermediate1.replace('.h5', '')

    intermediate2 = var1[-1][1].replace('data_', '')
    station_id_and_date_interval2 = intermediate2.replace('.h5', '')

    filename = 'cor_' + str(var1[0][0]) + '_' + str(var1[0][2]) + '_' + str(var2[0][0]) + '_' + str(var2[0][2]) + '_' + station_id_and_date_interval1 + '_' + station_id_and_date_interval2 + '.h5'
    """

    # Make the new table.
    data_cor = openFile(filename, 'w')
    try:
        group_variable1 = data_cor.createGroup("/", 'correlation')
        table_variable1 = data_cor.createTable(group_variable1, 'table',
                                               Variable1)
        particle = table_variable1.row

        for index, entry1 in enumerate(var1):
            entry2 = var2[index]

            series1 = _read_sorted_series(entry1)
            series2 = _read_sorted_series(entry2)

            if len(series1) > len(series2):
                # Series 1 is the reference; interpolate series 2 onto it.
                stamps1, values1 = zip(*series1)
                stamps2, raw2 = zip(*series2)
                values2 = np.interp(stamps1, stamps2, raw2)
                _append_rows(particle, values1, values2)
            elif len(series1) < len(series2):
                # Series 2 is the reference; interpolate series 1 onto it.
                stamps1, raw1 = zip(*series1)
                stamps2, values2 = zip(*series2)
                values1 = np.interp(stamps2, stamps1, raw1)
                _append_rows(particle, values1, values2)
            else:
                print('')
                print('No interpolation necessary')
                print('')

                # Keep only the rows whose timestamps coincide.  Local names
                # are used on purpose: rebinding var1/var2 here would break
                # the next pass of the file loop.
                matched = [(value1, value2)
                           for (stamp1, value1), (stamp2, value2)
                           in zip(series1, series2)
                           if stamp1 == stamp2]
                if matched:
                    values1, values2 = zip(*matched)
                    _append_rows(particle, values1, values2)
                else:
                    print('No matching timestamps found; no rows were added.')

            table_variable1.flush()
    finally:
        # Always close the output file, also on errors and on exit().
        data_cor.close()

    return filename
def least_squares_fit(filename, variable1, variable2): with tables.openFile(filename, 'r') as data: # fetch values variable 1 and 2 variable_1 = data.root.correlation.table.col('variable1') variable_2 = data.root.correlation.table.col('variable2') y_axis = query_yes_no("Do you want to plot %s on the y-axis?" % variable1[0][0]) if len(variable_1.shape) != 1: print 'There are %d plates with an individual %s value.' % (variable_1.shape[1], variable1[0][0]) plate_number1 = int(question.digit_plate("Enter the plate number that you want to you use in your correlation analysis ( e.g. '1' ): ", variable_1.shape[1])) variable_1 = variable_1[:, plate_number1 - 1] if len(variable_2.shape) != 1: print 'There are %d plates with an individual %s value.' % (variable_2.shape[1], variable2[0][0]) plate_number2 = int(question.digit_plate("Enter the plate number that you want to you use in your correlation analysis ( e.g. '1' ): ", variable_2.shape[1])) variable_2 = variable_2[:, plate_number2 - 1] if y_axis == True: y = variable_1 # e.g. 'event_rates' x = variable_2 # e.g. 'barometric pressure' x, y = lose_nans(x, y) elif y_axis == False: x = variable_1 # e.g. 'event_rates' y = variable_2 # e.g. 'barometric pressure' else: print 'weird' del variable_1, variable_2 # Apply a linear least square fit: # a line, ``y = mx + c``, through the data-points: # We can rewrite the line equation as ``y = Ap``, where ``A = [[x 1]]`` # and ``p = [[m], [c]]``. 
Now use `lstsq` to solve for `p`: A = np.vstack([x, np.ones(len(x))]).T a, b = np.linalg.lstsq(A, y)[0] del A if y_axis == True: print '' print "The equation for the linear fit line is: ( y = a * x + b ) y = " + str(a) + " * x + " + str(b) print '' print "or '" + variable1[0][0] + "' = " + str(a) + " * '" + variable2[0][0] + "' + " + str(b) elif y_axis == False: print '' print "The equation for the linear fit line is: ( y = a * x + b ) y = " + str(a) + " * x + " + str(b) print '' print "or '" + variable2[0][0] + "' = " + str(a) + " * '" + variable1[0][0] + "' + " + str(b) # Calculate sample pearson correlation coefficient cor_coef = np.corrcoef([x, y])[0, 1] absolute_cor_coef = abs(cor_coef) print '' pearson_text = "The Pearson correlation coefficient between '%s' and '%s' is: %s" % (variable1[0][0], variable2[0][0], str(cor_coef)) print pearson_text print '' if absolute_cor_coef < 0.1: correlation = 'NO' elif 0.1 <= absolute_cor_coef <= 0.3: correlation = 'a SMALL' elif 0.3 <= absolute_cor_coef <= 0.5: correlation = 'a MEDIUM' elif 0.5 <= absolute_cor_coef <= 1: correlation = 'a STRONG' else: correlation = '' if cor_coef >= 0.1: pos_neg = ' POSITIVE' elif cor_coef <= -0.1: pos_neg = ' NEGATIVE' else: pos_neg = '' conclusion = "For this sample you have found %s%s correlation between '%s' and '%s'." 
% (correlation, pos_neg, variable1[0][0], variable2[0][0]) print conclusion """ # calculate chi squared list_exp = array([a*i + b for i in x]) begin3 = datetime.now() chi2, p = chisquare(y,list_exp) end3 = datetime.now() print end3 - begin3 combo = zip(y,list_exp) begin = datetime.now() ch2 = 0 for i in combo: ch2 = ch2 + (i[0]-i[1]-0.5)**2/i[1] print 'chi squared is ', ch2 end = datetime.now() print end - begin print '' print 'chi squared:', chi2 print 'associated p-value: ', p print '' degrees_of_freedom = (len(x) - 1) print 'chi squared divided by the number of measurements: ', chi2/degrees_of_freedom chi2_prob = chisqprob(chi2,degrees_of_freedom) # probability value associated with the provided chi-square value and degrees of freedom print 'probability value associated with the provided chi-square value and degrees of freedom:', chi2_prob """ # Plot the data along with the fitted line: if(len(x) > 500000): x, y = downsample(x, y) plt.plot(x, y, 'o', label='Original data', markersize=1) plt.plot(x, a * x + b, 'r', label='Fitted line') if y_axis == True: plt.ylabel(variable1[0][0] + ' (' + units[variable1[0][0]] + ')') plt.xlabel(variable2[0][0] + ' (' + units[variable2[0][0]] + ')') elif y_axis == False: plt.ylabel(variable2[0][0] + ' (' + units[variable2[0][0]] + ')') plt.xlabel(variable1[0][0] + ' (' + units[variable1[0][0]] + ')') tit = "Fit line: ( y = ax + b ) y = " + str(a) + " * x + " + str(b) plt.legend() plt.title(tit) start_date_interval, stop_date_interval = get_date_interval_from_file_names(variable1, variable2) inter_filename = filename.replace('.h5', '') fname = inter_filename + ' ' + start_date_interval + '_' + stop_date_interval plt.savefig(fname + ".png") plt.show() fit_info = open(fname + '.txt', 'w') fit_info.write(tit) fit_info.write("%s\n" % ('')) fit_info.write(str(pearson_text)) fit_info.write("%s\n" % ('')) fit_info.write(str(conclusion)) fit_info.close """ # calculate mean y value mean_y = sum(y) / len(y) print 'mean_y = ', mean_y 
relative_deviation_from_mean_y_list = [] #relative deviation of the cosmic ray intensity (deltaI/I) from the mean intensity. for i in range(len(y)): deviation_of_mean_y = y[i] - mean_y relative_deviation_from_mean_y = deviation_of_mean_y/mean_y relative_deviation_from_mean_y_list.append(relative_deviation_from_mean_y) plt.plot(x,relative_deviation_from_mean_y_list,'o',markersize=1) plt.ylabel('deltaMPV_p/<MPV_p>') plt.xlabel('Outside temperature (degrees Celsius)') tit = "Correlation between the Relative deviation of the MPV of the pulseheight (3h intervals) from the mean MPV value with the outside temperature." plt.title(tit) fname = 'Correlation between relative deviation of the MPV of the pulseheights (3h intervals) from the mean MPV value with T_out' plt.savefig(fname +".png") plt.show() """ """