コード例 #1
0
def _read_sorted_series(entry):
    """Load one data series as a list of (timestamp, value) sorted by timestamp.

    ``entry`` is indexed as ``[variable, filename, station_ID, kind]``. The
    columns ``'timestamp'`` and ``variable`` are read from the node
    ``root.s<station_ID>.<kind>`` of the file ``filename``.

    If the value column is not one-dimensional the user is asked (through
    ``question.digit_plate``) which plate to use; the answer is 1-based.

    If ``variable`` is a key of the module-level ``low_limit`` mapping, rows
    whose value lies outside ``[low_limit[variable], high_limit[variable]]``
    are dropped. When that leaves no rows at all the program exits.
    """
    name, path, station, kind = entry[0], entry[1], entry[2], entry[3]

    data = openFile(str(path), 'r')
    try:
        # Plain attribute lookup replaces the former eval() of a string-built
        # expression; it resolves the same node without executing text.
        node = getattr(getattr(data.root, 's%s' % str(station)), str(kind))
        timestamps = node.col('timestamp')
        values = node.col(str(name))
    finally:
        data.close()

    if len(values.shape) != 1:
        print('There are %d plates with an individual %s value.' % (values.shape[1], str(name)))
        plate_number = int(question.digit_plate("Enter the plate number that you want to you use in your correlation analysis ( e.g. '1' ): ", values.shape[1]))
        values = values[:, plate_number - 1]
    values = values.tolist()

    # Pair every timestamp with its value and order the pairs by timestamp.
    series = sorted(zip(timestamps, values))

    if name in low_limit:
        low, high = low_limit[name], high_limit[name]
        valid = [(t, v) for t, v in series if low <= v <= high]
        removed = len(series) - len(valid)
        if removed:
            print('Removed %d rows of bad %s data.' % (removed, name))
            if not valid:
                print('Exit. In your data file there is no valid %s data' % (name))
                exit()
        series = valid

    return series


def _append_rows(table, row, values1, values2):
    """Append the paired values as rows of ``table`` and flush them."""
    for value1, value2 in zip(values1, values2):
        row['variable1'] = value1
        row['variable2'] = value2
        row.append()
    table.flush()


def interpolate(var1, var2):
    """Interpolate to make two variable lists equal length.

    ``var1`` and ``var2`` are lists of entries of the form
    ``['variable', 'filename', 'station_ID', 'kind']``. Entry ``i`` of
    ``var1`` is combined with entry ``i`` of ``var2``:

    * if the two series differ in length, the shorter one is linearly
      interpolated (``np.interp``) onto the timestamps of the longer one;
    * if they have the same length, they are compared position by position
      and only the positions with identical timestamps are kept.

    The resulting value pairs are written to the table
    ``/correlation/table`` (columns ``variable1`` and ``variable2``) of a
    newly created file whose name is derived from the first entry of both
    lists.

    Returns the name of the file that was written.

    Raises ``ValueError`` when the series differ in length and one of them
    is empty, because there is then nothing to interpolate from.
    """

    print('')
    print('Interpolating and creating new table...')
    print('')

    # Row layout of the output table.
    class Variable1(IsDescription):
        variable1 = Float64Col()
        variable2 = Float64Col()

    # Build the output file name from the first entry of both lists; the
    # date interval is taken from the first input file name.
    first1, first2 = var1[0], var2[0]
    date_part = first1[1].replace('data_s' + str(first1[2]) + '_', '')
    start_date, _, remainder = date_part.partition(' -')
    end_date = remainder[1:].replace('.h5', '')
    filename = ('interpolated_table_' + str(first1[0]) + '_station' + str(first1[2]) +
                '_with_' + str(first2[0]) + '_station' + str(first2[2]) +
                '_' + start_date + '_' + end_date + '.h5')

    data_cor = openFile(filename, 'w')
    try:
        group = data_cor.createGroup("/", 'correlation')
        table = data_cor.createTable(group, 'table', Variable1)
        row = table.row

        # NOTE: var1/var2 must not be rebound inside this loop; every
        # iteration still needs the original entry lists.
        for index, entry1 in enumerate(var1):
            series1 = _read_sorted_series(entry1)
            series2 = _read_sorted_series(var2[index])
            length1, length2 = len(series1), len(series2)

            if length1 == length2:
                print('')
                print('No interpolation necessary')
                print('')
                # Keep only the positions where both timestamps agree.
                matched = [(v1, v2)
                           for (t1, v1), (t2, v2) in zip(series1, series2)
                           if t1 == t2]
                if matched:
                    values1, values2 = zip(*matched)
                else:
                    values1, values2 = (), ()
            else:
                if not length1 or not length2:
                    raise ValueError(
                        'Cannot interpolate entry %d: one of the two series is empty' % index)
                if length1 > length2:
                    # Evaluate series 2 at the timestamps of series 1.
                    x, values1 = zip(*series1)
                    xp, fp = zip(*series2)
                    values2 = np.interp(x, xp, fp)
                else:
                    # Evaluate series 1 at the timestamps of series 2.
                    xp, fp = zip(*series1)
                    x, values2 = zip(*series2)
                    values1 = np.interp(x, xp, fp)

            _append_rows(table, row, values1, values2)
    finally:
        # Close the output file on errors and on exit() as well.
        data_cor.close()

    return filename
コード例 #2
0
def least_squares_fit(filename, variable1, variable2):

    with tables.openFile(filename, 'r') as data:
        # fetch values variable 1 and 2
        variable_1 = data.root.correlation.table.col('variable1')
        variable_2 = data.root.correlation.table.col('variable2')

    y_axis = query_yes_no("Do you want to plot %s on the y-axis?" % variable1[0][0])

    if len(variable_1.shape) != 1:
        print 'There are %d plates with an individual %s value.' % (variable_1.shape[1], variable1[0][0])
        plate_number1 = int(question.digit_plate("Enter the plate number that you want to you use in your correlation analysis ( e.g. '1' ): ", variable_1.shape[1]))
        variable_1 = variable_1[:, plate_number1 - 1]

    if len(variable_2.shape) != 1:
        print 'There are %d plates with an individual %s value.' % (variable_2.shape[1], variable2[0][0])
        plate_number2 = int(question.digit_plate("Enter the plate number that you want to you use in your correlation analysis ( e.g. '1' ): ", variable_2.shape[1]))
        variable_2 = variable_2[:, plate_number2 - 1]

    if y_axis == True:
        y = variable_1 # e.g. 'event_rates'
        x = variable_2 # e.g. 'barometric pressure'
        x, y = lose_nans(x, y)

    elif y_axis == False:
        x = variable_1 # e.g. 'event_rates'
        y = variable_2 # e.g. 'barometric pressure'
    else:
        print 'weird'
    del variable_1, variable_2


    # Apply a linear least square fit:
    # a line, ``y = mx + c``, through the data-points:

    # We can rewrite the line equation as ``y = Ap``, where ``A = [[x 1]]``
    # and ``p = [[m], [c]]``.  Now use `lstsq` to solve for `p`:

    A = np.vstack([x, np.ones(len(x))]).T

    a, b = np.linalg.lstsq(A, y)[0]
    del A

    if y_axis == True:
        print ''
        print "The equation for the linear fit line is: ( y = a * x + b )   y = " + str(a) + " * x + " + str(b)
        print ''
        print "or     '" + variable1[0][0] + "' = " + str(a) + " * '" + variable2[0][0] + "' + " + str(b)
    elif y_axis == False:
        print ''
        print "The equation for the linear fit line is: ( y = a * x + b )   y = " + str(a) + " * x + " + str(b)
        print ''
        print "or     '" + variable2[0][0] + "' = " + str(a) + " * '" + variable1[0][0] + "' + " + str(b)

    # Calculate sample pearson correlation coefficient
    cor_coef = np.corrcoef([x, y])[0, 1]

    absolute_cor_coef = abs(cor_coef)
    print ''
    pearson_text = "The Pearson correlation coefficient between '%s' and '%s' is: %s" % (variable1[0][0], variable2[0][0], str(cor_coef))
    print pearson_text
    print ''

    if absolute_cor_coef < 0.1:
        correlation = 'NO'
    elif 0.1 <= absolute_cor_coef <= 0.3:
        correlation = 'a SMALL'
    elif 0.3 <= absolute_cor_coef <= 0.5:
        correlation = 'a MEDIUM'
    elif 0.5 <= absolute_cor_coef <= 1:
        correlation = 'a STRONG'
    else:
        correlation = ''

    if cor_coef >= 0.1:
        pos_neg = ' POSITIVE'
    elif cor_coef <= -0.1:
        pos_neg = ' NEGATIVE'
    else:
        pos_neg = ''

    conclusion = "For this sample you have found %s%s correlation between '%s' and '%s'." % (correlation, pos_neg, variable1[0][0], variable2[0][0])
    print conclusion

    """
    # calculate chi squared
    list_exp = array([a*i + b for i in x])

    begin3 = datetime.now()
    chi2, p = chisquare(y,list_exp)
    end3 = datetime.now()
    print end3 - begin3

    combo = zip(y,list_exp)

    begin = datetime.now()

    ch2 = 0

    for i in combo:
        ch2 = ch2 + (i[0]-i[1]-0.5)**2/i[1]

    print 'chi squared is ', ch2
    end = datetime.now()
    print end - begin



    print ''
    print 'chi squared:', chi2
    print 'associated p-value: ', p
    print ''

    degrees_of_freedom = (len(x) - 1)

    print 'chi squared divided by the number of measurements: ', chi2/degrees_of_freedom

    chi2_prob = chisqprob(chi2,degrees_of_freedom) # probability value associated with the provided chi-square value and degrees of freedom

    print 'probability value associated with the provided chi-square value and degrees of freedom:', chi2_prob
    """

    # Plot the data along with the fitted line:

    if(len(x) > 500000):
        x, y = downsample(x, y)

    plt.plot(x, y, 'o', label='Original data', markersize=1)
    plt.plot(x, a * x + b, 'r', label='Fitted line')

    if y_axis == True:
        plt.ylabel(variable1[0][0] + ' (' + units[variable1[0][0]] + ')')
        plt.xlabel(variable2[0][0] + ' (' + units[variable2[0][0]] + ')')
    elif y_axis == False:
        plt.ylabel(variable2[0][0] + ' (' + units[variable2[0][0]] + ')')
        plt.xlabel(variable1[0][0] + ' (' + units[variable1[0][0]] + ')')

    tit = "Fit line: ( y = ax + b )   y = " + str(a) + " * x + " + str(b)

    plt.legend()
    plt.title(tit)

    start_date_interval, stop_date_interval = get_date_interval_from_file_names(variable1, variable2)
    inter_filename = filename.replace('.h5', '')
    fname = inter_filename + ' ' + start_date_interval + '_' + stop_date_interval

    plt.savefig(fname + ".png")
    plt.show()

    fit_info = open(fname + '.txt', 'w')
    fit_info.write(tit)
    fit_info.write("%s\n" % (''))
    fit_info.write(str(pearson_text))
    fit_info.write("%s\n" % (''))
    fit_info.write(str(conclusion))
    fit_info.close

    """
    # calculate mean y value
    mean_y = sum(y) / len(y)
    print 'mean_y = ', mean_y

    relative_deviation_from_mean_y_list = []
    #relative deviation of the cosmic ray intensity (deltaI/I) from the mean intensity.

    for i in range(len(y)):
        deviation_of_mean_y = y[i] - mean_y
        relative_deviation_from_mean_y = deviation_of_mean_y/mean_y
        relative_deviation_from_mean_y_list.append(relative_deviation_from_mean_y)

    plt.plot(x,relative_deviation_from_mean_y_list,'o',markersize=1)

    plt.ylabel('deltaMPV_p/<MPV_p>')
    plt.xlabel('Outside temperature (degrees Celsius)')

    tit = "Correlation between the Relative deviation of the MPV of the pulseheight (3h intervals) from the mean MPV value with the outside temperature."
    plt.title(tit)

    fname = 'Correlation between relative deviation of the MPV of the pulseheights (3h intervals) from the mean MPV value with T_out'
    plt.savefig(fname +".png")

    plt.show()
    """
    """