def test_correlation(self):
        """Exercise statistics.correlation on a hand-computed pair, a constant input and a larger sample.

        NOTE(review): `statistics` is presumably a project-local module; the
        second assertion expects 0 for a constant input -- confirm that module's
        contract.
        """
        # covariance: [3, 5] [2, 10] => [-1 1] [-4 4] => 8
        # stddev:    sqrt(2) sqrt(32)
        self.assertEqual(8 / math.sqrt(2) / math.sqrt(32), statistics.correlation([3, 5], [2, 10]))
        # First input has no spread ([3, 3]); the expected result is exactly 0.
        self.assertEqual(0, statistics.correlation([3, 3], [2, 10]))

        # Larger paired sample: both lists are index-aligned and must stay the same length.
        num_friends = [100, 49, 41, 40, 25, 21, 21, 19, 19, 18, 18, 16, 15, 15, 15, 15, 14, 14, 13, 13, 13, 13, 12, 12,
                       11, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
                       9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
                       7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5,
                       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
                       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
                       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        daily_minutes = [1, 68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42, 31.22, 34.76, 54.01, 38.79, 47.59,
                         49.1, 27.66, 41.03, 36.73, 48.65, 28.12, 46.62, 35.57, 32.98, 35, 26.07, 23.77, 39.73, 40.57,
                         31.65, 31.21, 36.32, 20.45, 21.93, 26.02, 27.34, 23.49, 46.94, 30.5, 33.8, 24.23, 21.4, 27.94,
                         32.24, 40.57, 25.07, 19.42, 22.39, 18.42, 46.96, 23.72, 26.41, 26.97, 36.76, 40.32, 35.02,
                         29.47, 30.2, 31, 38.11, 38.18, 36.31, 21.03, 30.86, 36.07, 28.66, 29.08, 37.28, 15.28, 24.17,
                         22.31, 30.17, 25.53, 19.85, 35.37, 44.6, 17.23, 13.47, 26.33, 35.02, 32.09, 24.81, 19.33,
                         28.77, 24.26, 31.98, 25.73, 24.86, 16.28, 34.51, 15.23, 39.72, 40.8, 26.06, 35.76, 34.76,
                         16.13, 44.04, 18.03, 19.65, 32.62, 35.59, 39.43, 14.18, 35.24, 40.13, 41.82, 35.45, 36.07,
                         43.67, 24.61, 20.9, 21.9, 18.79, 27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 13.82, 33.2, 25,
                         33.1, 36.65, 18.63, 14.87, 22.2, 36.81, 25.53, 24.62, 26.25, 18.21, 28.08, 19.42, 29.79, 32.8,
                         35.99, 28.32, 27.79, 35.88, 29.06, 36.28, 14.1, 36.63, 37.49, 26.9, 18.58, 38.48, 24.48, 18.95,
                         33.55, 14.24, 29.04, 32.51, 25.63, 22.22, 19, 32.73, 15.16, 13.9, 27.2, 32.01, 29.27, 33,
                         13.74, 20.42, 27.32, 18.23, 35.35, 28.48, 9.08, 24.62, 20.12, 35.26, 19.92, 31.02, 16.49,
                         12.16, 30.7, 31.22, 34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42, 17.44, 10.12, 24.42, 9.82,
                         23.39, 30.93, 15.03, 21.67, 31.09, 33.29, 22.61, 26.89, 23.48, 8.38, 27.81, 32.35, 23.84]
        # Only checked to two decimal places: the expected value 0.25 is a rounded figure.
        self.assertAlmostEqual(0.25, statistics.correlation(num_friends, daily_minutes), places=2)
def test_correlation():
    """Check the matrix form of statistics.correlation on perfectly (anti-)correlated rows.

    Each case pairs a two-row input with the 2x2 matrix the call is expected
    to return (up to np.allclose tolerance).
    """
    cases = [
        # identical rows given as a plain list of arrays
        ([np.arange(0, 5), np.arange(0, 5)], np.ones([2, 2])),
        # identical rows given as a single 2-D array
        (np.array([np.arange(0, 5), np.arange(0, 5)]), np.ones([2, 2])),
        # second row is the negation of the first
        ([np.arange(0, 5), -1 * np.arange(0, 5)],
         np.array([[1, -1], [-1, 1]])),
    ]
    for rows, expected in cases:
        assert np.allclose(statistics.correlation(rows), expected)
def run_two_demension_data_process():
    """Scatter-plot two noisy functions of the same random sample and print their correlations.

    Draws 1000 values ``xs`` with ``random_normal`` and derives ``ys1`` (x plus
    half-scaled noise) and ``ys2`` (minus x plus half-scaled noise). Both
    series are plotted against ``xs`` and ``correlation(xs, ys)`` is printed
    for each. Returns None.
    """
    xs = [random_normal() for _ in range(1000)]
    ys1 = [x + random_normal() / 2 for x in xs]
    ys2 = [-x + random_normal() / 2 for x in xs]

    plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
    plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
    # xs is the first argument of scatter(), i.e. the horizontal axis, so the
    # 'xs' label belongs on the x axis (it was previously put on the y axis).
    plt.xlabel('xs')
    plt.legend(loc=9)
    plt.title("Diferent distribution")
    plt.grid()
    plt.show()

    print(correlation(xs, ys1))
    print(correlation(xs, ys2))
Beispiel #4
0
def run_analysis():
    """Print per-feature statistics and the strongest / weakest correlated feature pair.

    Loads ``./winequality.csv`` through ``load_data`` (a mapping of feature
    name to list of values), prints mean, median and standard deviation for
    every feature in name order, then examines every unordered pair of
    features once. "Strongest" is the pair whose correlation has the largest
    magnitude, "weakest" the pair whose correlation is closest to zero.
    Returns None.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    # print() with several arguments joins them with single spaces.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # str.format fills each {} in order; the specs limit the number of
        # digits printed for the float values.
        print('"{}". Mean: {:3.2f}, Median: {:.2f}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))

    # Placeholders that are reported unchanged if no pair beats the seeds.
    strongest_pair = ["aaa", "bbb"]
    high_correlation = 0.0
    weakest_pair = ["aaa", "bbb"]
    low_correlation = 1.0

    keys = list(data)
    for i, key1 in enumerate(keys):
        # Only keys after position i: each unordered pair is visited once.
        for key2 in keys[i + 1:]:
            # Compute the coefficient a single time per pair instead of once
            # per comparison and once more per assignment.
            current = correlation(data[key1], data[key2])
            if abs(current) > abs(high_correlation):  # far from 0
                strongest_pair = sorted([key1, key2])
                high_correlation = current
            if abs(current) < abs(low_correlation):  # close to 0
                weakest_pair = sorted([key1, key2])
                low_correlation = current

    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                        high_correlation))
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(
              *weakest_pair, low_correlation))  # * unpacks the pair into arguments
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """Return ``(alpha, beta)`` of the least-squares line ``y = alpha + beta * x``.

    The slope is the correlation of x and y scaled by the ratio of their
    standard deviations; the intercept makes the line pass through the
    point of means.
    """
    r = correlation(x, y)
    spread_y = standard_deviation(y)
    spread_x = standard_deviation(x)
    slope = r * spread_y / spread_x
    intercept = mean(y) - slope * mean(x)
    return intercept, slope
    def test_correlation(self):
        """Exercise statistics.correlation on a hand-computed pair, a constant input and a larger sample.

        NOTE(review): `statistics` is presumably a project-local module; the
        second assertion expects 0 for a constant input -- confirm that module's
        contract.
        """
        # covariance: [3, 5] [2, 10] => [-1 1] [-4 4] => 8
        # stddev:    sqrt(2) sqrt(32)
        self.assertEqual(8 / math.sqrt(2) / math.sqrt(32),
                         statistics.correlation([3, 5], [2, 10]))
        # First input has no spread ([3, 3]); the expected result is exactly 0.
        self.assertEqual(0, statistics.correlation([3, 3], [2, 10]))

        # Larger paired sample: both lists are index-aligned and must stay the same length.
        num_friends = [
            100, 49, 41, 40, 25, 21, 21, 19, 19, 18, 18, 16, 15, 15, 15, 15,
            14, 14, 13, 13, 13, 13, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 10,
            10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
            9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7,
            7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
            6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
            5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
            4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
            3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
            1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
        ]
        daily_minutes = [
            1, 68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42, 31.22,
            34.76, 54.01, 38.79, 47.59, 49.1, 27.66, 41.03, 36.73, 48.65,
            28.12, 46.62, 35.57, 32.98, 35, 26.07, 23.77, 39.73, 40.57, 31.65,
            31.21, 36.32, 20.45, 21.93, 26.02, 27.34, 23.49, 46.94, 30.5, 33.8,
            24.23, 21.4, 27.94, 32.24, 40.57, 25.07, 19.42, 22.39, 18.42,
            46.96, 23.72, 26.41, 26.97, 36.76, 40.32, 35.02, 29.47, 30.2, 31,
            38.11, 38.18, 36.31, 21.03, 30.86, 36.07, 28.66, 29.08, 37.28,
            15.28, 24.17, 22.31, 30.17, 25.53, 19.85, 35.37, 44.6, 17.23,
            13.47, 26.33, 35.02, 32.09, 24.81, 19.33, 28.77, 24.26, 31.98,
            25.73, 24.86, 16.28, 34.51, 15.23, 39.72, 40.8, 26.06, 35.76,
            34.76, 16.13, 44.04, 18.03, 19.65, 32.62, 35.59, 39.43, 14.18,
            35.24, 40.13, 41.82, 35.45, 36.07, 43.67, 24.61, 20.9, 21.9, 18.79,
            27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 13.82, 33.2, 25, 33.1,
            36.65, 18.63, 14.87, 22.2, 36.81, 25.53, 24.62, 26.25, 18.21,
            28.08, 19.42, 29.79, 32.8, 35.99, 28.32, 27.79, 35.88, 29.06,
            36.28, 14.1, 36.63, 37.49, 26.9, 18.58, 38.48, 24.48, 18.95, 33.55,
            14.24, 29.04, 32.51, 25.63, 22.22, 19, 32.73, 15.16, 13.9, 27.2,
            32.01, 29.27, 33, 13.74, 20.42, 27.32, 18.23, 35.35, 28.48, 9.08,
            24.62, 20.12, 35.26, 19.92, 31.02, 16.49, 12.16, 30.7, 31.22,
            34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42, 17.44, 10.12, 24.42,
            9.82, 23.39, 30.93, 15.03, 21.67, 31.09, 33.29, 22.61, 26.89,
            23.48, 8.38, 27.81, 32.35, 23.84
        ]
        # Only checked to two decimal places: the expected value 0.25 is a rounded figure.
        self.assertAlmostEqual(0.25,
                               statistics.correlation(num_friends,
                                                      daily_minutes),
                               places=2)
def calculate_correlation(l1,l2):

    import sys
    sys.path.append('/home/people/tc/svn/tc_sandbox/misc/')
    import statistics
    r = statistics.correlation(l1,l2)

    return r
Beispiel #8
0
def calculate_correlation(l1, l2):

    import sys
    sys.path.append('/home/people/tc/svn/tc_sandbox/misc/')
    import statistics
    r = statistics.correlation(l1, l2)

    return r
Beispiel #9
0
def least_squares_fit(x, y):
    """Return ``(alpha, beta)`` for the least-squares line through the training values.

    alpha is the intercept and beta the coefficient of x.
    """
    # Coefficient of x: beta = correlation(x, y) * stddev(y) / stddev(x)
    r_xy = correlation(x, y)
    sigma_y = standard_deviation(y)
    sigma_x = standard_deviation(x)
    coefficient = r_xy * sigma_y / sigma_x
    # The fitted line passes through (mean(x), mean(y)).
    offset = mean(y) - coefficient * mean(x)
    return offset, coefficient
Beispiel #10
0
def corr(x, y, n):
    """Compare ``s.correlation`` against ``np.corrcoef`` for the first ``n`` data-set pairs.

    For each index ``i`` in ``range(n)`` the coefficient of ``x[i]`` and
    ``y[i]`` is computed both ways; ``printpass()`` is called when the two
    agree within a small absolute tolerance, ``printfail()`` otherwise, and
    both values are then reported through ``printYE``.

    :param x: indexable collection of samples
    :param y: indexable collection of samples, paired with ``x`` by index
    :param n: number of pairs to check
    """
    tolerance = 1e-6
    for i in range(n):
        corrnp = np.corrcoef(x[i], y[i])
        reference = corrnp[1][0]  # off-diagonal entry of the 2x2 matrix
        corrs = s.correlation(x[i], y[i])
        # Compare the floats directly. Truncating both with int() mapped
        # every value strictly between -1 and 1 to 0, so almost any two
        # coefficients compared equal.
        if abs(reference - corrs) <= tolerance:
            printpass()
        else:
            printfail()
        printYE(corrs, reference)
Beispiel #11
0
def run_analysis():
    """
    Analyse the data set stored in './winequality.csv'.
    * prints mean, median and standard deviation of every feature
    * prints the pair of features with the largest correlation magnitude and
      the pair whose correlation is closest to zero, with the values
    :return: None
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    # Features as (name, values) tuples in name order; used for both passes.
    arranged_data = sorted(data.items())

    print('Number of features:', len(data))
    summary_template = '"{}". Mean: {:3.2f}, Median: {:3.2f}, Std: {:3.4f}'
    for feature_name, list_of_values in arranged_data:
        print(summary_template.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))

    # Seeds: any non-zero magnitude beats 0.0, any magnitude below 1 beats 1.0.
    high_correlation = 0.0
    low_correlation = 1.0
    strongest_pair = ("aaa", "bbb")
    weakest_pair = ("aaa", "bbb")

    for outer_index, (outer_name, outer_values) in enumerate(arranged_data):
        # Pair the outer feature with every feature that precedes it, so
        # each unordered pair is evaluated exactly once.
        for inner_name, inner_values in arranged_data[:outer_index]:
            temp = correlation(inner_values, outer_values)
            if abs(temp) > abs(high_correlation):
                strongest_pair = (min(outer_name, inner_name),
                                  max(outer_name, inner_name))
                high_correlation = temp
            if abs(temp) < abs(low_correlation):
                weakest_pair = (min(outer_name, inner_name),
                                max(outer_name, inner_name))
                low_correlation = temp

    first_strong, second_strong = strongest_pair[0], strongest_pair[1]
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:3.4f}'.format(first_strong, second_strong,
                                         high_correlation))

    # The star unpacks the pair into two positional arguments.
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:3.4f}'.format(*weakest_pair, low_correlation))
Beispiel #12
0
def correlation(x, y):
    """Return Pearson's correlation coefficient for x and y.

    Pearson's correlation coefficient takes values between -1 and +1.
    It measures the strength and direction of the linear relationship
    between x and y, where +1 means very strong, positive linear relationship,
    -1 very strong, negative linear relationship, and 0 no linear relationship.

    The element type of ``x[0]`` selects the implementation: inputs that are
    not ``SecureObject`` instances are handled like the standard library's
    ``statistics.correlation``; ``SecureFixedPoint`` inputs are handled with
    the ``runtime`` primitives.

    Raises ``statistics.StatisticsError`` if the inputs differ in length,
    have fewer than two points, or (plain path) at least one is constant.
    Raises ``TypeError`` for secure types other than fixed-point.
    """
    n = len(x)
    if len(y) != n:
        # Message names this function (it previously said "covariance").
        raise statistics.StatisticsError(
            'correlation requires that both inputs '
            'have same number of data points')

    if n < 2:
        raise statistics.StatisticsError(
            'correlation requires at least two data points')

    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        # Compare the whole version tuple; testing only .minor would give the
        # wrong answer for any release whose major version is not 3.
        if sys.version_info >= (3, 10):
            return statistics.correlation(x, y)

        # inline code of statistics.correlation() copied from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar)**2.0 for xi in x)
        syy = fsum((yi - ybar)**2.0 for yi in y)
        try:
            return sxy / sqrt(sxx * syy)

        except ZeroDivisionError:
            raise statistics.StatisticsError(
                'at least one of the inputs is constant') from None

    if issubclass(sectype, SecureFixedPoint):
        # Same formula as above, expressed with the secure runtime's sum and
        # inner product; the two square roots are taken separately.
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        xxbar = [xi - xbar for xi in x]
        yybar = [yi - ybar for yi in y]
        sxy = runtime.in_prod(xxbar, yybar)
        sxx = runtime.in_prod(xxbar, xxbar)
        syy = runtime.in_prod(yybar, yybar)
        return sxy / (_fsqrt(sxx) * _fsqrt(syy))

    raise TypeError('secure fixed-point type required')
Beispiel #13
0
 def test_plain(self):
     """Check that each statistics function under test matches the standard library on plain ints."""
     def f():
         # A fresh generator on every call: each function consumes its own copy.
         return (i * j for i in range(-1, 2, 1) for j in range(2, -2, -1))

     one_sample = (
         (mean, statistics.mean),
         (variance, statistics.variance),
         (stdev, statistics.stdev),
         (pvariance, statistics.pvariance),
         (pstdev, statistics.pstdev),
         (mode, statistics.mode),
         (median, statistics.median),
         (quantiles, statistics.quantiles),
     )
     for ours, reference in one_sample:
         self.assertEqual(ours(f()), reference(f()))

     # quantiles once more with non-default keyword arguments
     self.assertEqual(quantiles(f(), n=6, method='inclusive'),
                      statistics.quantiles(f(), n=6, method='inclusive'))

     # Two-sample functions: pair the data with its own reversal.
     x = list(f())
     y = list(reversed(x))
     two_sample = (
         (covariance, statistics.covariance),
         (correlation, statistics.correlation),
         (linear_regression, statistics.linear_regression),
     )
     for ours, reference in two_sample:
         self.assertEqual(ours(x, y), reference(x, y))
Beispiel #14
0
def run_analysis():
    """
    Run analysis on the data in './winequality.csv' and print the results.

    Prints mean, median and standard deviation per feature, then the pair of
    features whose correlation has the largest magnitude ("strongest") and
    the pair whose correlation is closest to zero ("weakest").
    :return: None
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    # print() with several arguments joins them with single spaces.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # str.format fills the numbered fields; the specs limit the digits
        # printed for the float values.
        print('"{0}". Mean: {1:.2f}, Median: {2:.2f}, Std: {3:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))

    # Seeds chosen so that the first real pair always replaces them:
    # any magnitude is >= 0 and any finite magnitude is < infinity.
    strongest_pair = ("aaa", "bbb")
    high_correlation = 0.0
    weakest_pair = ("aaa", "bbb")
    low_correlation = float('inf')

    keys = sorted(data)
    for index, key1 in enumerate(keys):
        # Only later keys: skips self-pairs (compared by value, not identity)
        # and visits each unordered pair once. key1 < key2, so the pair is
        # already in sorted order for printing.
        for key2 in keys[index + 1:]:
            current_correlation = correlation(data[key1], data[key2])

            # Strength is the magnitude, so strong negative relationships
            # count as well.
            if abs(current_correlation) > abs(high_correlation):
                strongest_pair = (key1, key2)
                high_correlation = current_correlation
            # Independent check: a pair may update both records.
            if abs(current_correlation) < abs(low_correlation):
                weakest_pair = (key1, key2)
                low_correlation = current_correlation

    print('The strongest linear relationship is between: "{0}","{1}". '
          'The value is: {2:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                         high_correlation))
    print('The weakest linear relationship is between: "{0}","{1}". '
          'The value is: {2:.4f}'.format(
              weakest_pair[0], weakest_pair[1],
              low_correlation))
Beispiel #15
0
def calculate_correlation_of_all_lists(data):
    """
    Compute the correlation for every ordered pair of distinct headers.

    Headers are taken in sorted order. Each pair appears twice in the result
    (once per ordering of the two indices), always with its names sorted.
    :param data: dictionary of headers and list values
    :return: list of ``[[name_a, name_b], correlation]`` entries
    """
    ordered_features = sorted(data.items())
    results = []
    for row, (first_name, first_values) in enumerate(ordered_features):
        for col, (second_name, second_values) in enumerate(ordered_features):
            if row != col:  # a feature is never paired with itself
                value = correlation(first_values, second_values)
                results.append([sorted([first_name, second_name]), value])
    return results
Beispiel #16
0
def run_analysis():
    """
    Print per-feature statistics and the strongest / weakest correlated pair.

    Loads './winequality.csv' through ``load_data``, prints mean, median and
    standard deviation per feature, then evaluates every unordered pair of
    features once. Both "strongest" and "weakest" are judged by the
    magnitude of the correlation. Returns None.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    # print() with several arguments joins them with single spaces.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # str.format fills each {} in order; the specs limit the digits
        # printed for the float values.
        print('"{}". Mean: {:3.2f}, Median: {:.2f}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values), variance(list_of_values)**0.5))

    # Materialise the items once instead of rebuilding the list on every
    # loop iteration.
    features = list(data.items())

    high_correlation = None  # None until the first pair has been seen
    low_correlation = None
    strongest_pair = []
    weakest_pair = []

    for index, (feature_name_1, list_of_values_1) in enumerate(features):
        for feature_name_2, list_of_values_2 in features[index + 1:]:
            cor = correlation(list_of_values_1, list_of_values_2)
            if low_correlation is None or abs(cor) < abs(low_correlation):
                low_correlation = cor
                weakest_pair = sorted([feature_name_1, feature_name_2])
            # Magnitude, like the weakest check: a strong negative
            # relationship is still a strong relationship.
            if high_correlation is None or abs(cor) > abs(high_correlation):
                high_correlation = cor
                strongest_pair = sorted([feature_name_1, feature_name_2])

    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(strongest_pair[0], strongest_pair[1], high_correlation))

    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(*weakest_pair, low_correlation))  # * unpacks the pair into arguments
Beispiel #17
0
def run_analysis():
    """Print per-feature statistics and the strongest / weakest correlated feature pair.

    Loads './winequality.csv' through ``load_data``, prints mean, median and
    standard deviation per feature in name order, then evaluates every
    unordered pair of features once. "Strongest" is the largest correlation
    magnitude, "weakest" the magnitude closest to zero. Returns None.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    features = sorted(data.items())

    # print() with several arguments joins them with single spaces.
    print('Number of features:', len(data))
    for feature_name, list_of_values in features:
        # str.format fills each {} in order; the specs limit the digits
        # printed for the float values.
        print('"{}". Mean: {:3.2f}, Median: {:.2f}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))

    low_correlation = 1
    high_correlation = 0
    weakest_pair = [0, 0]
    strongest_pair = [0, 0]
    for position, (feature_name1, list_of_values1) in enumerate(features):
        # Only later features: each unordered pair is computed once, and
        # because `features` is sorted the pair is already in name order.
        for feature_name2, list_of_values2 in features[position + 1:]:
            corr = correlation(list_of_values1, list_of_values2)
            if abs(corr) < abs(low_correlation):
                weakest_pair = [feature_name1, feature_name2]
                low_correlation = corr
            # Independent `if` (not `elif`): a pair that sets a new minimum
            # must still be considered for the maximum. The former `elif`
            # only worked because every pair was visited a second time.
            if abs(corr) > abs(high_correlation):
                strongest_pair = [feature_name1, feature_name2]
                high_correlation = corr

    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                        high_correlation))

    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(
              weakest_pair[0], weakest_pair[1],
              low_correlation))
Beispiel #18
0
def run_analysis():
    """Print per-feature statistics and the strongest / weakest correlated feature pair.

    Loads './winequality.csv' through ``load_data``, prints mean, median and
    standard deviation per feature in name order, then evaluates every
    unordered pair of features once. "Strongest" is the largest correlation
    magnitude, "weakest" the magnitude closest to zero. With fewer than two
    features the placeholder pair ("aaa", "bbb") is reported. Returns None.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    # print() with several arguments joins them with single spaces.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # str.format fills each {} in order; the specs limit the digits
        # printed for the float values.
        print('"{}". Mean: {:3.2f}, Median: {:0.2f}, Std: {:0.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))

    # Seeds chosen so that the first real pair always replaces them.
    strongest_pair = ("aaa", "bbb")
    high_correlation = 0.0
    weakest_pair = ("aaa", "bbb")
    low_correlation = float('inf')

    names = sorted(data)
    for index, key_1 in enumerate(names):
        # Later names only, so the pair is already sorted for printing.
        for key_2 in names[index + 1:]:
            # Correlate the value lists; passing the key strings themselves
            # (as before) never looked at the data.
            current = correlation(data[key_1], data[key_2])
            if abs(current) > abs(high_correlation):
                strongest_pair = (key_1, key_2)
                high_correlation = current
            if abs(current) < abs(low_correlation):
                weakest_pair = (key_1, key_2)
                low_correlation = current

    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(strongest_pair[0], strongest_pair[1],
                                    high_correlation))

    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(
              *weakest_pair, low_correlation))  # * unpacks the pair into arguments
Beispiel #19
0
def run_analysis():
    """Print debugging output, per-feature statistics and placeholder correlation results.

    Loads './winequality.csv' through ``load_data``. The strongest / weakest
    pairs printed at the end are fixed values, not computed from the data.
    Returns None.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)

    # --- debugging output for one hand-picked pair of features ---
    print("corra")
    free_so2 = data["free sulfur dioxide"]
    total_so2 = data["total sulfur dioxide"]
    print(correlation(free_so2, total_so2))
    # NOTE(review): numpy.correlate is a cross-correlation of two sequences,
    # not a correlation coefficient -- confirm this comparison is intended.
    print(numpy.correlate(free_so2, total_so2))

    # --- per-feature summary ---
    # print() with several arguments joins them with single spaces.
    print('Number of features:', len(data))
    row_template = '"{}". Mean: {:3.2f}, Median: {}, Std: {:.4f}'
    for feature_name, list_of_values in sorted(data.items()):
        # Arguments are evaluated in the order mean, median, variance.
        print(row_template.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))

    # --- correlation report (hard-coded values) ---
    strongest_pair = ("aaa", "bbb")
    high_correlation = -0.9
    first_name, second_name = strongest_pair[0], strongest_pair[1]
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(first_name, second_name,
                                    high_correlation))

    weakest_pair = ("free sulfur dioxide", "total sulfur dioxide")
    low_correlation = 0.1
    # The star unpacks the pair into two positional arguments.
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(*weakest_pair, low_correlation))
    def main(
        self,
        l_wts,
        d_pred,
        l_xtics,
    ):
        """Process each structure in ``l_wts``, correlate experimental and predicted values, then plot.

        For every entry of ``l_wts`` a sub-directory of the current working
        directory is created (if missing) and entered, ``self.pre_whatif`` is
        run, and ``d_pred`` is updated by ``self.parse_chemical_shifts``.
        Correlations between experimental and predicted values are collected
        in a local list that is not used afterwards (the prints that read it
        are commented out). Finally one gnuplot histogram is written per
        titratable group. Returns None.

        NOTE(review): Python 2 code (print statement, ``dict.keys() + list``).
        """

        import os, sys
        sys.path.append('/home/people/tc/svn/tc_sandbox/misc/')
        import gnuplot, statistics

        ## parse experimental data
        d_exp = self.dic2csv(l_xtics)

        ## get cwd
        dir_main = os.getcwd()

        l_r = []

        for pdb in l_wts:

            print pdb, l_wts.index(pdb)

            # work inside a per-structure directory below the starting directory
            if not os.path.isdir('%s/%s' % (dir_main, pdb)):
                os.mkdir('%s/%s' % (dir_main, pdb))

            os.chdir('%s/%s' % (dir_main, pdb))

            self.pre_whatif(pdb)

            # for these two entries the monomer file is copied as the protonated file
            if pdb in [
                    '2vb1',
                    '1vdp',
            ]:
                os.system('cp %s_monomer.pdb %s_protonated.pdb' % (pdb, pdb))
##            else:
##                self.whatif(pdb)

##            self.calculate_chemical_shifts(pdb)

## parse computational predictions
            d_pred = self.parse_chemical_shifts(pdb, d_pred)

            ## calculate correlation coefficients
            # l_exp / l_pred are reset per structure but keep growing across
            # the titratable groups of that structure.
            l_exp = []
            l_pred = []
            for titgrp in d_exp.keys():
                # titgrp is a one-character residue symbol followed by the residue number
                res_number = int(titgrp[1:])
                res_symbol = titgrp[0]
                res_name = self.d_ressymbol2resname[res_symbol]
                for nucleus in d_exp[titgrp].keys():
                    cs_exp = d_exp[titgrp][nucleus]
                    l_exp += [cs_exp]
                    # the part of the key before 'N-HN' selects the prediction;
                    # raises ValueError if the key does not contain 'N-HN'
                    index = nucleus.index('N-HN')
                    cs_pred = d_pred['%s%i' %
                                     (res_name,
                                      res_number)][nucleus[:index]][-1]
                    l_pred += [cs_pred]
                # computed over everything accumulated so far for this structure
                r = statistics.correlation(l_exp, l_pred)
                l_r += [r]
##                print titgrp,r

##            print sum(l_r)/len(l_r), min(l_r), max(l_r)

## change from local dir to main dir
        os.chdir(dir_main)

        ## plots
        # every experimental group plus 'E35', which is always added
        for titgrp1 in d_exp.keys() + ['E35']:
            res_number = int(titgrp1[1:])
            res_symbol = titgrp1[0]
            res_name = self.d_ressymbol2resname[res_symbol]
            titgrp3 = '%s%i' % (res_name, res_number)
            prefix = 'delta_cs_%s' % (titgrp3)
            ylabel = '{/Symbol D}{/Symbol w}_H'
            title = titgrp3
            gnuplot.histogram(
                d_pred[titgrp3],
                prefix,
                l_xtics,
                ylabel=ylabel,
                title=title,
                ##                l_plotdatafiles=['E34.txt'],
            )

        return
    # two dimensions
    # NOTE(review): the enclosing function header is not visible here; this is
    # Python 2 code (print statement below).
    # xs is a random sample; ys1 follows x and ys2 follows -x, each with
    # half-scaled noise added.
    xs = [random_normal() for _ in range(1000)]
    ys1 = [x + random_normal() / 2 for x in xs]
    ys2 = [-x + random_normal() / 2 for x in xs]

    plot_histogram(ys1, 0.5, "ys1")
    plot_histogram(ys2, 0.5, "ys2")

    # both series against the same xs, on one set of axes
    plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
    plt.scatter(xs, ys2, marker='.', color='red', label='ys2')
    plt.legend(loc=9)
    plt.title("Very different joint distributions")
    plt.show()

    print correlation(xs, ys1), correlation(xs, ys2)

    # scatterplot matrix
    # prepare data
    def make_row():
        """Return one synthetic row [v0, v1, v2, v3] of inter-dependent values."""
        v0 = random_normal()
        v1 = -5 * v0 + random_normal()  # negatively correlated to v0
        v2 = v0 + v1 + 5 * random_normal(
        )  # positively correlated to both v0 and v1
        v3 = 6 if v2 > -2 else 0  # depends exclusively on v2
        return [v0, v1, v2, v3]

    # 100 rows of 4 values each
    data = [make_row() for _ in range(100)]

    # plot it
    # only the second value returned by shape() is kept
    _, num_columns = shape(data)
Beispiel #22
0
def eval_corr(v1, v2):
    """Return the evaluation text for the correlation of v1 and v2, encoded with the output encoding."""
    coefficient = statistics.correlation(v1, v2)
    evaluation = statistics.eval_correlation(coefficient)
    return evaluation.encode(encoding_out())


if __name__ == "__main__":
    # Command-line entry point: read a tab-separated file and print a
    # correlation table for every pair of columns.
    # NOTE(review): Python 2 code (print statement, str.decode, list-returning
    # map() and dict.keys()).
    # Only Shift-JIS encoded TSV input is supported
    parser = argparse.ArgumentParser()
    parser.add_argument('file', metavar='FILE', help=u'tab split file. need all columns.')
    parser.add_argument('-e', '--eval', action="store_true", help=u'correlation value to evaluation text.')
    parser.add_argument('-d', '--delimiter', metavar='DELIMITER', default='\t', help=u'output delimiter.')
    opt = parser.parse_args()

    # Read the data from the file
    data = read_data(opt.file.decode(encoding_in()))
    columns = data[0]
    # Transpose the remaining rows into one list per column
    data = map(list, zip(*data[1:]))
    # Collect the column vectors in a dict keyed by column name
    # (the first column is dropped from both names and values)
    data_set = dict(zip(columns[1:], data[1:]))

    names = data_set.keys()
    # header row: '-' in the corner, then the column names
    print opt.delimiter.join(['-'] + names)
    for y in names:
        if opt.eval:
            # Evaluation text ('-' on the diagonal)
            records = [y] + ['-' if x == y else eval_corr(data_set[y], data_set[x]) for x in names]
        else:
            # Correlation coefficient ('1' on the diagonal)
            records = [y] + ['1' if x == y else str(statistics.correlation(data_set[y], data_set[x])) for x in names]
        print opt.delimiter.join(records)
Beispiel #23
0
        X = remove_projection(X, component)

    return components


def transform_vector(v, components):
    """Return the list of ``dot(v, w)`` for every ``w`` in ``components``, in order."""
    coordinates = []
    for w in components:
        coordinates.append(dot(v, w))
    return coordinates


def transform(X, components):
    """Apply ``transform_vector`` to every row of ``X`` and return the results as a list."""
    transformed_rows = []
    for x_i in X:
        transformed_rows.append(transform_vector(x_i, components))
    return transformed_rows


if __name__ == "__main__":
    # Demo entry point. NOTE(review): Python 2 code (print statement, csv file
    # opened in binary mode); xs, ys1 and ys2 must exist at module level.
    compare_two_distributions()
    print "correlation(xs, ys1)", correlation(xs, ys1)
    print "correlation(xs, ys2)", correlation(xs, ys2)
    #make_scatterplot_matrix()

    # safe parsing

    data = []

    with open("comma_delimited_stock_prices.csv", "rb") as f:
        reader = csv.reader(f)
        # one parser per column; None in the parser list presumably means
        # "leave the field as is" -- verify against parse_rows_with
        for line in parse_rows_with(reader,
                                    [dateutil.parser.parse, None, float]):
            # rows containing a None field are dropped, all others are kept
            if any(x is None for x in line):
                pass
            else:
                data.append(line)
 def matrix_entry(i, j):
     """Return the correlation between columns ``i`` and ``j`` of ``data``."""
     column_i = get_column(data, i)
     column_j = get_column(data, j)
     return correlation(column_i, column_j)
import statistics as stat
import csv
def get_data(filename):
    """Read the second and third columns of a CSV file as floats.

    The first row is treated as a header and skipped. An empty file yields
    two empty lists.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A tuple ``(x, y)`` of two lists of floats: ``x`` holds column index 1
        and ``y`` holds column index 2 of every row after the header.

    Raises:
        ValueError: if a cell in either column is not a valid float.
        IndexError: if a row after the header has fewer than three columns.
    """
    import sys

    x = []
    y = []
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3.
    if sys.version_info[0] < 3:
        csvfile = open(filename, 'rb')
    else:
        csvfile = open(filename, 'r', newline='')
    with csvfile:
        reader = csv.reader(csvfile)
        # Built-in next() works on Python 2.6+ and 3 (reader.next() exists
        # only on Python 2); the default avoids StopIteration on an empty file.
        next(reader, None)
        for row in reader:
            x.append(float(row[1]))
            y.append(float(row[2]))
    return x, y
# Load the two numeric columns of the football CSV and print their correlation.
# NOTE(review): the third argument "population" is interpreted by the local
# `statistics` module (imported as `stat`); its meaning is not visible here.
x,y = get_data('../../data/football.csv')
print stat.correlation(x,y,"population")
def least_squares_fit(x,y):
    """Return (alpha, beta) for the fitted line y = alpha + beta * x.

    beta is correlation(x, y) * standard_deviation(y) / standard_deviation(x);
    alpha is chosen so the line passes through (mean(x), mean(y)).
    """
    r = correlation(x, y)
    spread_y = standard_deviation(y)
    spread_x = standard_deviation(x)
    slope = r * spread_y / spread_x
    intercept = mean(y) - slope * mean(x)
    return intercept, slope
def least_squares_fit(x, y):
    """Compute the least-squares intercept and slope for training data.

    Returns the pair (alpha, beta), where beta is the slope and alpha the
    intercept of the line alpha + beta * x.
    """
    # slope: correlation scaled by the ratio of the two standard deviations
    scaled_correlation = correlation(x, y) * standard_deviation(y)
    beta = scaled_correlation / standard_deviation(x)
    # intercept: makes the line pass through the point of means
    mean_y = mean(y)
    alpha = mean_y - beta * mean(x)
    return alpha, beta
Beispiel #28
0
#Cumulative averages
X1_cummean = cumsum( X1 ) / ( 1 + arange( len( X1 )))
X2_cummean = cumsum( X2 ) / ( 1 + arange( len( X1 )))
pylab.figure( 2 )
pylab.plot( X1_cummean,"b" )
pylab.title("Empirical mean of X_1")

pylab.figure( 3 )
pylab.plot( X2_cummean,"r" )
pylab.title("Empirical mean of X_2")
pylab.show()        

#Autocorrelation
X1_sd = sqrt( var( X1 ) )
X2_sd = sqrt( var( X2 ) )
X1_autocorr = statistics.correlation( X1[1: ], X1[:-1])
X2_autocorr = statistics.correlation( X2[1: ], X2[:-1])
print "The autocorrelation of X_1 is", X1_autocorr
print "The autocorrelation of X_2 is", X2_autocorr

#Effective sample size
X1_ess = n * ( 1 - X1_autocorr ) / ( 1 + X1_autocorr )
X2_ess = n * ( 1 - X2_autocorr ) / ( 1 + X2_autocorr )
print "The effective sample size of X_1 is", X1_ess
print "The effective sample size of X_2 is", X2_ess

#Task 4: Repeat with sigma_prop = 0.1, sigma_prop = 10

#Task 5: Repeat with sigma = array([[4,2.8],[2.8,4]])
Beispiel #29
0
	# Gather (price, temperature) samples for every state. Samples with a
	# positive price are kept and plotted; every sample also gets a
	# (temperature, flag) entry with flag 1 for price > 0 and 0 otherwise.
	prices = []
	temps = []
	produced_or_not = []
	for state in states:
		# NOTE(review): pairs the two dicts' .values() positionally --
		# assumes both iterate in matching order; confirm.
		for pdata, tdata in zip(apples_info[state].values(), state_temperatures[state].values()):
			for price, temp in zip(pdata, tdata):
				if price > 0:
					prices.append(price)
					temps.append(temp)
					plt.scatter(temp, price, c='r')
					produced_or_not.append((temp,1))
				else:
					produced_or_not.append((temp,0))

	# NOTE(review): the regression takes prices as x and temps as y, while the
	# scatter plot puts temperature on the x axis -- confirm this is intended.
	result = stats.linregress(prices, temps)
	correlation = st.correlation(prices, temps)
	# Split the (temperature, flag) pairs into two parallel sequences.
	temps2, produced = zip(*produced_or_not)
	correlation2 = st.correlation(temps2, produced)

	print "Correlation: ", correlation
	print result
	print "SD prices:", np.std(prices)
	print "SD temps:", np.std(temps)
	print "Correlation 2:", correlation2
	# print temps
	plt.title("Effects of temperature on apple production")
	# plt.xlabel("Price in pounds")
	# plt.ylabel("Temperature")
	plt.xlabel("Temperature")
	plt.ylabel("Price in Pounds")
	plt.show()
def main():
    """Compare apo->holo displacement with computed modes for several PDB pairs.

    For every apo/holo pair listed in ``d_apo2holo`` this function:

    1. parses both structures with ``parse_coords`` and trims the two
       coordinate lists to a common range;
    2. obtains a transformation (``get_transformation_matrix``) and the
       apo->holo vector (``get_apo_holo_vector``);
    3. runs ``goodvibes_ligand.main`` on the apo structure unless its probe
       file already exists (in which case the ligand lines are appended to a
       copy of that file and the pair is skipped);
    4. picks the mode with the largest absolute factor, writes per-residue
       amplitudes of that mode and of ``eigenvectors[6]`` to
       ``amplitudes_<apo><holo>.txt``, plots them with gnuplot, and writes the
       factors to ``facs_eigvals_<apo><holo>.txt``.

    Returns None. Output goes to stdout and to files in the working directory.
    """

    ## dictionary of apo and holo structures (from what script???)
    d_apo2holo = {
        ## conformational selection
        ## RNase A, 1kf3 high resolution
##        '1kf3': {'holo': '1rpg','ligand':'CPA',},
        '1kf5': {
            'holo': '1eow','ligand':'U2G',
            'site':[
                11,43,44,
##                119,120,121,122, ## terminal flexible residues...
                ],
            'title':'Ribonuclease (RNase) A',
            },
        ## CypA, highest resolution room
##        '3k0n': {'holo': '1cwa','ligand':['DAL','MLE','MVA','BMT','ABA','SAR',],'site':[18-1,54-1,59-1,62-1,71-1,100-1,101-1,102-1,110-1,112-1,120-1,121-1,125-1,163-1,],'title':'Peptidyl-prolyl isomerase A (CypA)',},
        '1w8v': {'holo': '1w8m','ligand':['E1P',],'site':[54,62,100,101,112,125,],'title':'Peptidyl-prolyl isomerase A (CypA)',},
        ## DHFR
        '1ra9': {'holo': '1ra2','ligand':'FOL','site':[4,5,6,26,27,30,31,56,93,112,],'title':'Dihydrofolate reductase (DHFR)',},
        ## AdK
        '2rh5': {'holo': '2rgx','ligand':'AP5','coords_apo':[0,202],'coords_holo':[0,202],'site':[8,9,10,11,12,13,14,30,31,34,57,58,63,81,84,88,119,120,123,134,137,149,160,188,189,190,],'title':'Adenylate kinase (AdK)',},
        ## PKA
        '3iia': {'holo': '3pna', 'ligand':'CMP', 'coords_apo':[4,133],'coords_holo':[0,129],'site':[
            182-112, 198-112, 199-112, 200-112, 201-112, 202-112, 209-112, 211-112,
            ],
                 'title':'Protein Kinase A (PKA)',
                 },
        ## induced fit
        ## PEPCK
        '2qew': {'holo': '3dt4', 'ligand':'OXL', 'coords_apo':[1,620],'coords_holo':[0,619],'site':[240,260,307,401],'title':'PEPCK',},
        ## beta-lactoglobulin
        '3npo': {'holo': '3nq3', 'ligand':'DKA','site':[53,104,106,],'title':'beta-lactoglobulin',},
        }

    for pdb_apo in d_apo2holo.keys():

        pdb_holo = d_apo2holo[pdb_apo]['holo']

##        continue ## tmp!!!
        print pdb_apo, pdb_holo

        ##
        ## parse coordinates
        ##
        d_mmCIF_apo, l_coords_alpha_apo = parse_coords(pdb_apo)
        d_mmCIF_holo, l_coords_alpha_holo = parse_coords(pdb_holo)
        # Explicit slice bounds from the table take precedence over the
        # automatic alignment below.
        if 'coords_apo' in d_apo2holo[pdb_apo].keys():
            l_coords_alpha_apo = l_coords_alpha_apo[
                d_apo2holo[pdb_apo]['coords_apo'][0]:d_apo2holo[pdb_apo]['coords_apo'][1]
                ]
            l_coords_alpha_holo = l_coords_alpha_holo[
                d_apo2holo[pdb_apo]['coords_holo'][0]:d_apo2holo[pdb_apo]['coords_holo'][1]
                ]
        else:
            ## sequential alignment of coordinates
            # index of the first entry that is not '?'
            index1_seq_apo = next((i for i,v in enumerate(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
            index1_seq_holo = next((i for i,v in enumerate(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
            ## last non-?
            index2_seq_apo = len(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
            index2_seq_holo = len(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
            ## first common non-?
            index1_coord_apo = max(0,index1_seq_holo-index1_seq_apo)
            index1_coord_holo = max(0,index1_seq_apo-index1_seq_holo)
            ## last common non-?
            index2_coord_apo = len(l_coords_alpha_apo)+min(0,index2_seq_holo-index2_seq_apo)
            index2_coord_holo = len(l_coords_alpha_holo)+min(0,index2_seq_apo-index2_seq_holo)
            l_coords_alpha_apo = l_coords_alpha_apo[index1_coord_apo:index2_coord_apo]
            l_coords_alpha_holo = l_coords_alpha_holo[index1_coord_holo:index2_coord_holo]

        # Special case: drop 7 holo entries starting at index 461 for this pair.
        if pdb_apo == '2qew' and pdb_holo == '3dt4':
##            l_coords_alpha_holo = l_coords_alpha_holo[:459]+l_coords_alpha_holo[466:]
            l_coords_alpha_holo = l_coords_alpha_holo[:461]+l_coords_alpha_holo[461+7:]

        if len(l_coords_alpha_apo) != len(l_coords_alpha_holo):
            print pdb_apo, pdb_holo
            print len(l_coords_alpha_apo), len(l_coords_alpha_holo)
            # NOTE(review): `stop` is not defined in this function; presumably
            # it is an intentional NameError used to abort -- confirm.
            stop

##        if pdb_holo == '1eow':
##            print l_coords_alpha_holo[d_apo2holo[pdb_apo]['site'][0]]
##            print pdb_holo
##            stop

        tv1, rm, tv2, l_coords_alpha_apo, l_coords_alpha_holo = get_transformation_matrix(
            l_coords_alpha_apo,
            l_coords_alpha_holo,
            )

        vector_apo2holo = get_apo_holo_vector(
            d_mmCIF_apo, l_coords_alpha_apo,
            d_mmCIF_holo, l_coords_alpha_holo,
            tv1, rm, tv2,
            )

        chain_apo = ''.join(d_mmCIF_apo['_entity_poly.pdbx_strand_id'])
        chain_holo = ''.join(d_mmCIF_holo['_entity_poly.pdbx_strand_id'])

        # NOTE(review): the '1cwa' branch does not assign lines_ligand_apo,
        # which is read further down when the probe file exists. No active
        # table entry currently has holo '1cwa'.
        if pdb_holo == '1cwa':
            ligand_pos_holo = numpy.array([3.307729,36.55456,17.45886])
            ligand_pos_apo = numpy.dot(ligand_pos_holo-tv1,rm)+tv2
        else:
            ligand_pos_apo, ligand_pos_holo, lines_ligand_apo = get_ligand_pos(
                d_mmCIF_holo,
                tv1, rm, tv2,
                d_apo2holo[pdb_apo]['ligand'],
                pdb_holo,
                )

        dist_max = 6
        dist_min = 3
##        print len(vector_apo2holo), len(l_coords_alpha_apo)
##        stop
        # Single-iteration loop (the holo entry is commented out); fn, d and
        # l_factors set here are read after the loop.
        for pdb, chain, l_coords_alpha, ligand_pos in [
##            [pdb_holo,chain_holo,l_coords_alpha_holo,],
            [pdb_apo,chain_apo,l_coords_alpha_apo,ligand_pos_apo],
            ]:
##            l_coords_protein_alpha = []
##            for i in range(len(l_coords_alpha)):
##                l_coords_protein_alpha += [l_coords_alpha[i][0]]
##                l_coords_protein_alpha += [l_coords_alpha[i][1]]
##                l_coords_protein_alpha += [l_coords_alpha[i][2]]
            fn = '/home/tc/UCD/GV_ligand_binding_site_identification/%s_%s_probe.pdb' %(pdb,chain,)
            # An existing probe file means the calculation is skipped.
            if os.path.isfile(fn):
                continue
            d = goodvibes_ligand.main(
                pdb,chain,
                dist_max,dist_min,
                v_apoholo=vector_apo2holo,
                l_coords_protein_alpha = l_coords_alpha,
##                l_coords_probe = [ligand_pos],
                )
##            d = goodvibes_ligand.main(
##                pdb,chain,
##                dist_max,dist_min,
##                v_apoholo=vector_apo2holo,
##                l_coords_protein_alpha = l_coords_alpha,
##                l_coords_probe = [ligand_pos],
##                )
            l_factors = d['l_factors']
##            l_factors_perturbed = d['l_factors_probe']

        # Probe file already present: write "<fn>2" = probe file + ligand
        # lines, then move on to the next pair without any further analysis.
        if os.path.isfile(fn):
            fd = open(fn,'r')
            lines = fd.readlines()
            fd.close()
            lines += lines_ligand_apo
            fd = open(fn+'2','w')
            fd.writelines(lines)
            fd.close()
            continue

        eigenvectors = d['eigenvectors']
        l_factors_abs = [abs(factor) for factor in l_factors]
        # index of the mode whose factor has the largest absolute value
        mode_max_contribution = l_factors_abs.index(max(l_factors_abs))
        print mode_max_contribution

        print d['l_overlaps']

        v1 = vector_apo2holo
        eigenvector = v2 = eigenvectors[mode_max_contribution]
        # |v1 . v2| / (|v1| |v2|): absolute cosine between the two vectors
        overlap_max = abs(numpy.dot(v1,v2))/math.sqrt(numpy.dot(v1,v1)*numpy.dot(v2,v2))
        print 'mode_max_contribution', mode_max_contribution
        print 'overlap_max', overlap_max

        ## write amplitudes
        lines = []
        l1 = []
        l2 = []
        # Walk the eigenvector three components at a time and take the
        # Euclidean norm of each triple; eigenvectors[6] is labelled
        # "mode 7" in the plot below.
        for i in range(0,len(eigenvector),3):
            amplitude = math.sqrt(eigenvector[i+0]**2+eigenvector[i+1]**2+eigenvector[i+2]**2)
            amplitude7 = math.sqrt(eigenvectors[6][i+0]**2+eigenvectors[6][i+1]**2+eigenvectors[6][i+2]**2)
            l1 += [amplitude]
            l2 += [amplitude7]
            # i is a multiple of 3, so i/3 is the index of the triple; listed
            # site indices get their amplitude in column 3, all others -1.
            if i/3 in d_apo2holo[pdb_apo]['site']:
                bool_site = amplitude
            else:
                bool_site = -1
            lines += ['%s %s %s\n' %(amplitude,amplitude7,bool_site,)]
        fd = open('amplitudes_%s%s.txt' %(pdb_apo,pdb_holo,),'w')
        fd.writelines(lines)
        fd.close()

        # correlation between the two amplitude profiles, shown in the title
        r = statistics.correlation(l1,l2)

        # per-structure lower bound of the plotted x range
        xmin = 0
        if pdb_holo == '1w8m':
            xmin = 2
        if pdb_holo == '3dt4':
            xmin = 5
        if pdb_holo == '1eow':
            xmin = 1
        # NOTE(review): the ylabel string below has no closing double quote.
        lines = [
            'set terminal png\n',
            'set output "%s%s_amplitudes.png"\n' %(pdb_apo,pdb_holo,),
            'set size 1,1\n',
            'set xlabel "residue index"\n',
            'set ylabel "amplitude (a.u.)\n',
            'set title "%s (r = %.2f)"\n' %(d_apo2holo[pdb_apo]['title'],r,),
            'set key out\n',
            'f(x) = %s\n' %(sum(l1)/len(l1)),
            'plot [%s:][0:]"amplitudes_%s%s.txt" u 1 t "mode %i" w l, "amplitudes_%s%s.txt" u 2 t "mode 7" w l, "amplitudes_%s%s.txt" u 3 t "binding site" ps 1 pt 7, f(x) t "average amplitude"\n' %(
##            'plot [%s:][0:]"amplitudes_%s%s.txt" u 1 t "mode %i" w l, "amplitudes_%s%s.txt" u 2 t "mode 7" w l, "amplitudes_%s%s.txt" u 3 t "binding site" ps 1 pt 7\n' %(
                xmin,pdb_apo,pdb_holo,mode_max_contribution+1, pdb_apo,pdb_holo, pdb_apo,pdb_holo,
                ),
            ]
        fd = open('gnuplot.settings','w')
        fd.writelines(lines)
        fd.close()
        os.system('/usr/bin/gnuplot gnuplot.settings')

        # One line per mode: 1-based index, factor, absolute factor.
        s = ''
        for i in range(len(l_factors)):
            s += '%s %s %s\n' %(i+1, l_factors[i],abs(l_factors[i]),)
        fd = open('facs_eigvals_%s%s.txt' %(pdb_apo,pdb_holo,),'w')
        fd.write(s)
        fd.close()

##        s = ''
##        for i in range(len(l_factors_perturbed)):
##            s += '%s %s %s\n' %(i+1, l_factors_perturbed[i],abs(l_factors_perturbed[i]),)
##        fd = open('facs_eigvals_%s%s_perturbed.txt' %(pdb_apo,pdb_holo,),'w')
##        fd.write(s)
##        fd.close()
    
    return
Beispiel #31
0
def calculate_averages_and_plot(cwd,):

    import statistics

    print 'calculate averages and plot'

    for topology in ['NEUNEU','NEUCHA','CHANEU','CHACHA',]:

        print 'protonation state', topology

        fd = open('energies_%s.txt' %(topology),'r')
        lines = fd.readlines()
        fd.close()

        lines2 = []
        l_asp52 = []
        l_protein = []
        l_chloride = []
        l_water = []
        l_sum = []
        l_sum_excl_ions = []
        for line in lines:
            i = int(line.split()[0])
            if i % 1000 == 0:
                print 'average', topology, i
    ##        if i < 100:
    ##            continue
            l_asp52 += [float(line.split()[1])]
            l_protein += [float(line.split()[2])]
            l_chloride += [float(line.split()[3])]
            l_water += [float(line.split()[4])]
            l_sum += [
                float(line.split()[1])+
                float(line.split()[2])+
                float(line.split()[3])+
                float(line.split()[4])
                ]
            l_sum_excl_ions += [
                float(line.split()[1])+
                float(line.split()[2])+
                float(line.split()[4])
                ]

            lines2 += [
                '%i %f %f %f %f %f %f\n' %(
                    i, ## 1
                    sum(l_asp52)/len(l_asp52),
                    sum(l_protein)/len(l_protein),
                    sum(l_chloride)/len(l_chloride),
                    sum(l_water)/len(l_water),
                    sum(l_sum)/len(l_sum), ## 13
                    sum(l_sum_excl_ions)/len(l_sum_excl_ions), ## 12
                    )
                ]

        fd = open('energies_averages_%s.txt' %(topology),'w')
        fd.writelines(lines2)
        fd.close()

        fd = open('energies_averages_%s.txt' %(topology),'r')
        lines2 = fd.readlines()
        fd.close()
        average = float(lines2[-1].split()[5])
        print '******** average', average
        ## calculate rmsd
        l_diff = []
        for i in range(len(lines)):
            Sum = float(line.split()[1])+float(line.split()[2])+float(line.split()[3])+float(line.split()[4])
            l_diff += [Sum-average]
        rmsd = statistics.do_rmsd(l_diff)
        print '******** rmsd', rmsd

        if len(l_sum) > 0:
            print 'INCLUDING IONS'
            print 'correl asp52', statistics.correlation(l_sum,l_asp52)
            print 'correl protein', statistics.correlation(l_sum,l_protein)
            print 'correl chloride', statistics.correlation(l_sum,l_chloride)
            print 'correl water', statistics.correlation(l_sum,l_water)
            print 'EXCLUDING IONS'
            print 'correl asp52', statistics.correlation(l_sum_excl_ions,l_asp52)
            print 'correl protein', statistics.correlation(l_sum_excl_ions,l_protein)
            print 'correl chloride', statistics.correlation(l_sum_excl_ions,l_chloride)
            print 'correl water', statistics.correlation(l_sum_excl_ions,l_water)

    ##
    ## combined plot 3 (4 conformational states x 4 protonation states and their averages)
    ##
    ## orange=black, blue=green, yellow=red, grey=purple
    ## *NEUNEUCHA*NEU = *NEUNEUCHA*CHA (ion,water,protein)
    ## *CHANEUCHA*NEU = *CHANEUCHA*CHA (ion,water,protein)
    ## *NEUCHACHA*CHA = *NEUCHACHA*NEU (ion,water)
    ## *CHACHACHA*NEU = *CHACHACHA*CHA (ion,water)
    ## overlaps - water, ions, (protein)
    ## CHACHANEUCHA not overlap when protein
    for s_col,title,suffix,y1,y2 in [
        ['$2+$3+$4+$5','energies of 4 conformational states at 4 different protonation states (all terms)','1all',-700,250,],
        ['$2+$3+$5','energies of 4 conformational states at 4 different protonation states (excluding ions)','1exclions',-700,250,],
        ['$3+$4+$5','energies of 4 conformational states at 4 different protonation states (excluding Asp52)','1exclasp52',-700,250,],
        ['$2','energies of 4 conformational states at 4 different protonation states (Asp52)','2asp52',-700,250,],
        ['$2','energies of 4 conformational states at 4 different protonation states (Asp52)','2asp52_zoom1',-80,23,],
        ['$2','energies of 4 conformational states at 4 different protonation states (Asp52)','2asp52_zoom2',23,160,],
        ['$3','energies of 4 conformational states at 4 different protonation states (protein)','3protein',-700,250,],
        ['$4','energies of 4 conformational states at 4 different protonation states (ions)','4ions',-700,250,],
        ['$5','energies of 4 conformational states at 4 different protonation states (water)','5water',-700,250,],
        ['$5','energies of 4 conformational states at 4 different protonation states (water)','5water_zoom',-80,40,],
        ]:
        print 'combined plot 16 states', suffix
        lines = [
            'set terminal postscript eps enhanced color "Helvetica" 32\n',
            'set size 3,3\n',
            'set output "combined_16states.ps"\n',
            'set xlabel "t / ps"\n',
            'set ylabel "E / kT"\n',
            'set title "%s"\n' %(title),
            ]
    ##    line = 'plot [0:][%s:%s]' %(Min,Max,)
        line = 'plot [0:30000][%s:%s]' %(y1,y2,)
        ## data points
        for i_state in range(16):
            state = l_states[i_state]
    ##        pt = [6,7,4,5,12,13][i_state % 4]
            if i_state < 8:
                pt = 7
                ps = 1
            else:
                pt = 5
                ps = 1
            ## data points
            line += '"../%s/energies_%s.txt" u 1:(%s) lc rgb "#%6s" ps %i pt %i t "%s", ' %(state[:6],state[-6:],s_col,d_colors[state]['pc'],ps,pt,state,)
        ## lines
        for i_state in range(16):
            if i_state < 8:
                if i_state in [0,1,4,5,]:
                    lw = 16
                else:
                    lw = 12
            else:
                lw = 4
            state = l_states[i_state]
            ## lines (averages)
            line += '"../%s/energies_averages_%s.txt" u 1:(%s) w l lt 1 lc rgb "#%6s" lw %i t "%s", ' %(state[:6],state[-6:],s_col,d_colors[state]['lc'],lw,state,)
        line = line[:-2]+'\n'
        lines += [line]
        
        fd = open('gnu.set','w')
        fd.writelines(lines)
        fd.close()

        os.system('gnuplot gnu.set')
        os.system('convert combined_16states.ps combined_16states_%s.png' %(suffix))

    ##
    ## combined plot 2 (2 states with individual terms and their averages)
    ##
    for combination in [['CHACHA','CHANEU',],['NEUCHA','NEUNEU',],]:

        print 'combined plot', combination
        lines = [
            'set terminal postscript eps enhanced color "Helvetica" 32\n',
            'set size 3,3\n',
            'set output "combined.ps"\n',
            'set xlabel "t / ps"\n',
            'set ylabel "E / kT"\n',
            'set title "%s"\n' %('%s v %s' %(combination[0],combination[1],)),
            ]
        line = 'plot [0:][-500:100]'
        ## data points
        line += '"../%s/energies_%s.txt" u 1:%s lc 3 t "%s protein", ' %(combination[0],combination[0],'($3)',combination[0],)
        line += '"../%s/energies_%s.txt" u 1:%s lc 4 t "%s protein", ' %(combination[1],combination[1],'($3)',combination[1],)
        line += '"../%s/energies_%s.txt" u 1:%s lc 5 t "%s water", ' %(combination[0],combination[0],'($5)',combination[0],)
        line += '"../%s/energies_%s.txt" u 1:%s lc 6 t "%s water", ' %(combination[1],combination[1],'($5)',combination[1],)
        line += '"../%s/energies_%s.txt" u 1:%s lc 1 t "%s Asp52", ' %(combination[0],combination[0],'($2)',combination[0],)
        line += '"../%s/energies_%s.txt" u 1:%s lc 2 t "%s Asp52", ' %(combination[1],combination[1],'($2)',combination[1],)
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 3 lw 16 t "%s protein average", ' %(combination[0],combination[0],'($3)',combination[0],)
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 4 lw 16 t "%s protein average", ' %(combination[1],combination[1],'($3)',combination[1],)
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 5 lw 16 t "%s water average", ' %(combination[0],combination[0],'($5)',combination[0],)
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 6 lw 16 t "%s water average", ' %(combination[1],combination[1],'($5)',combination[1],)
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 1 lw 16 t "%s Asp52 average", ' %(combination[0],combination[0],'($2)',combination[0],)
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 2 lw 16 t "%s Asp52 average", ' %(combination[1],combination[1],'($2)',combination[1],)
    ##    line += '"../NEUCHA/energies_NEUCHA.txt" u 1:%s lc 7 t "NEUCHA Asp52", ' %('($2)',)
    ##    line += '"../NEUNEU/energies_NEUNEU.txt" u 1:%s lc 8 t "NEUNEU Asp52", ' %('($2)',)
    ##    line += '"../NEUCHA/energies_NEUCHA.txt" u 1:%s lc 9 t "NEUCHA protein", ' %('($3)',)
    ##    line += '"../NEUNEU/energies_NEUNEU.txt" u 1:%s lc 10 t "NEUNEU protein", ' %('($3)',)
        line = line[:-2]+'\n'
        lines += [line]
        
        fd = open('gnu.set','w')
        fd.writelines(lines)
        fd.close()

        os.system('gnuplot gnu.set')
        os.system('convert combined.ps combined_%s_v_%s.png' %(combination[0],combination[1],))

    return
def least_squares_fit(x, y):
    """Closed-form alpha (intercept) and beta (slope) of a simple linear fit.

    Returns the tuple (alpha, beta).
    """
    rho = correlation(x, y)
    sigma_y = standard_deviation(y)
    sigma_x = standard_deviation(x)
    beta = rho * sigma_y / sigma_x
    y_bar = mean(y)
    x_bar = mean(x)
    alpha = y_bar - beta * x_bar
    return alpha, beta
Beispiel #33
0
    # The first row holds the column names.
    columns = fields[0]
    # Transpose the remaining rows into one list per column.
    data = map(list, zip(*fields[1:]))
    # Collect the column vectors in a dict keyed by column name
    # (the first column is dropped from both names and data).
    return dict(zip(columns[1:], data[1:]))


if __name__ == "__main__":
    # Command-line entry point: read two tab-separated tables and print the
    # correlation of every column of the first against every column of the
    # second. Only Shift-JIS encoded TSV input is supported.
    parser = argparse.ArgumentParser()
    parser.add_argument('file', metavar='FILE', nargs=2, help=u'tab split file. need all columns.')
    parser.add_argument('-e', '--eval', action="store_true", help=u'correlation value to evaluation text.')
    parser.add_argument('-d', '--delimiter', metavar='DELIMITER', default='\t', help=u'output delimiter.')
    opt = parser.parse_args()

    # Read both tables from their files.
    data_set1 = dataset(read_data(opt.file[0].decode(encoding_in())))
    data_set2 = dataset(read_data(opt.file[1].decode(encoding_in())))

    keys1 = data_set1.keys()
    keys2 = data_set2.keys()
    # Header row: a '-' placeholder followed by the second table's column names.
    print opt.delimiter.join(['-'] + keys2)
    for y in keys1:
        if opt.eval:
            # Evaluation text instead of the raw number ('-' when names match).
            records = [y] + ['-' if x == y else eval_corr(data_set1[y], data_set2[x]) for x in keys2]
        else:
            # Raw correlation coefficient ('1' when names match).
            records = [y] + ['1' if x == y else str(statistics.correlation(data_set1[y], data_set2[x])) for x in keys2]
        print opt.delimiter.join(records)
Beispiel #34
0
pylab.figure(1)
pylab.plot(X2, 'r')
pylab.title("Sample path of X_2")

# Cumulative averages
X1_cummean = cumsum(X1) / (1 + arange(len(X1)))
X2_cummean = cumsum(X2) / (1 + arange(len(X1)))
pylab.figure(2)
pylab.plot(X1_cummean, "b")
pylab.title("Empirical mean of X_1")

pylab.figure(3)
pylab.plot(X2_cummean, "r")
pylab.title("Empirical mean of X_2")

pylab.show()

# Autocorrelation
X1_sd = sqrt(var(X1))
X2_sd = sqrt(var(X2))
X1_autocorr = statistics.correlation(X1[1: ], X1[:-1])
X2_autocorr = statistics.correlation(X2[1: ], X2[:-1])
print "The autocorrelation of X1 is", X1_autocorr
print "The autocorrelation of X2 is", X2_autocorr

# Effective Sample size
X1_ess = n * (1 - X1_autocorr) / (1 + X1_autocorr)
X2_ess = n * (1 - X2_autocorr) / (1 + X2_autocorr)
print "The effective sample size of X_1 is", X1_ess
print "The effective sample size of X_2 is", X2_ess
    for _ in range(num_components):
        component = first_principal_component(X)
        components.append(component)
        X = remove_projection(X, component)

    return components

def transform_vector(v, components):
    """Return a list holding dot(v, w) for every w in components, in order."""
    coordinates = []
    for axis in components:
        coordinates.append(dot(v, axis))
    return coordinates

def transform(X, components):
    """Return a list with transform_vector(x_i, components) for each x_i in X."""
    result = []
    for point in X:
        result.append(transform_vector(point, components))
    return result

if __name__ == "__main__":
    # Demo driver: print the correlation of xs with each y series, then parse
    # the stock-price CSV and report the rows that failed to parse.

    print("correlation(xs, ys1)", correlation(xs, ys1))
    print("correlation(xs, ys2)", correlation(xs, ys2))

    # safe parsing

    data = []

    # newline='' lets the csv module handle line endings itself.
    with open("comma_delimited_stock_prices.csv", "r", encoding='utf8', newline='') as f:
        reader = csv.reader(f)
        # One parser per column; None is passed for the second column.
        # NOTE(review): parse_rows_with is defined elsewhere -- presumably a
        # None parser leaves the field as-is and a failed parse yields None.
        for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
            data.append(line)

    # Print every row that contains at least one None field.
    for row in data:
        if any(x is None for x in row):
            print(row)
# Second series: the negated x plus half of a random_normal() draw.
ys2 = [-x + random_normal() / 2 for x in xs]

# Overlay both series against xs in a single scatter plot.
plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
plt.xlabel('xs')
plt.ylabel('ys')
plt.legend(loc=9)
plt.title("Very Different Joint Distributions")
# plt.show()

# Save the figure to disk, then clear the current axes for later plots.
plt.savefig('im/working_scatter.png')
plt.gca().clear()

from statistics import correlation

# Sanity checks: ys1 should correlate at about +0.9 with xs, ys2 at about -0.9.
assert 0.89 < correlation(xs, ys1) < 0.91
assert -0.91 < correlation(xs, ys2) < -0.89

from src.scratch_dir import Matrix, Vector, make_matrix


def correlation_matrix(data: List[Vector]) -> Matrix:
    """
    Build the square matrix of pairwise correlations: entry (i, j) is
    correlation(data[i], data[j]), and the matrix has len(data) rows
    and len(data) columns.
    """
    return make_matrix(
        len(data),
        len(data),
        lambda i, j: correlation(data[i], data[j]),
    )
    def main(
        self,l_wts,d_pred,l_xtics,
        ):
        """Collect predicted chemical shifts per structure and plot histograms.

        For every entry of ``l_wts`` a sub-directory of the current working
        directory is created if missing and entered, ``self.pre_whatif`` is
        run, predictions are merged into ``d_pred`` through
        ``self.parse_chemical_shifts``, and correlation coefficients between
        experimental and predicted values are appended to a local list.
        Afterwards one ``gnuplot.histogram`` call is made per group in the
        experimental data, plus 'E35'.

        Args:
            l_wts: sequence of structure identifiers (used as directory names).
            d_pred: predictions dictionary, passed to and replaced by the
                return value of ``self.parse_chemical_shifts``.
            l_xtics: passed to ``self.dic2csv`` and ``gnuplot.histogram``.

        Returns:
            None. Changes the working directory while running and restores the
            starting directory before plotting.
        """

        import os, sys
        sys.path.append('/home/people/tc/svn/tc_sandbox/misc/')
        import gnuplot, statistics

        ## parse experimental data
        d_exp = self.dic2csv(l_xtics)

        ## get cwd
        dir_main = os.getcwd()

        l_r = []

        for pdb in l_wts:

            print pdb, l_wts.index(pdb)

            if not os.path.isdir('%s/%s' %(dir_main,pdb)):
                os.mkdir('%s/%s' %(dir_main,pdb))

            os.chdir('%s/%s' %(dir_main,pdb))

            self.pre_whatif(pdb)

            # For these two entries the monomer file is copied as the
            # protonated file.
            if pdb in ['2vb1','1vdp',]:
                os.system('cp %s_monomer.pdb %s_protonated.pdb' %(pdb,pdb))
##            else:
##                self.whatif(pdb)

##            self.calculate_chemical_shifts(pdb)

            ## parse computational predictions
            d_pred = self.parse_chemical_shifts(pdb,d_pred)

            ## calculate correlation coefficients
            l_exp = []
            l_pred = []
            for titgrp in d_exp.keys():
                # titgrp is a one-letter symbol followed by a number
                res_number = int(titgrp[1:])
                res_symbol = titgrp[0]
                res_name = self.d_ressymbol2resname[res_symbol]
                for nucleus in d_exp[titgrp].keys():
                    cs_exp = d_exp[titgrp][nucleus]
                    l_exp += [cs_exp]
                    # the prediction key is the part of the nucleus key that
                    # precedes 'N-HN'; the last stored value is used
                    index = nucleus.index('N-HN')
                    cs_pred = d_pred['%s%i' %(res_name,res_number)][nucleus[:index]][-1]
                    l_pred += [cs_pred]
                # NOTE(review): l_exp / l_pred are not reset per titgrp, so r
                # is computed over all values accumulated so far -- confirm.
                r = statistics.correlation(l_exp,l_pred)
                l_r += [r]
##                print titgrp,r

##            print sum(l_r)/len(l_r), min(l_r), max(l_r)

        ## change from local dir to main dir
        os.chdir(dir_main)

        ## plots
        for titgrp1 in d_exp.keys()+['E35']:
            res_number = int(titgrp1[1:])
            res_symbol = titgrp1[0]
            res_name = self.d_ressymbol2resname[res_symbol]
            titgrp3 = '%s%i' %(res_name,res_number)
            prefix = 'delta_cs_%s' %(titgrp3)
            ylabel = '{/Symbol D}{/Symbol w}_H'
            title = titgrp3
            gnuplot.histogram(
                d_pred[titgrp3],prefix,l_xtics,
                ylabel=ylabel,title=title,
##                l_plotdatafiles=['E34.txt'],
                )

        return
def main():
    """Run the demo sequence: histograms, correlations, scatterplot matrix,
    CSV parsing, a progress-bar prime sieve and a principal-component check.

    Figures are saved under ``im/``. The function relies on names that are
    not defined in its body (``plt``, ``plot_histogram``, ``random_normal``,
    ``xs``, ``ys1``, ``ys2``, ``StockPrice``, ``try_parse_row``, ``tqdm``,
    ``de_mean``, ``pca_data``, ``first_principal_component``); presumably
    they are provided at module level.
    """

    # I don't know why this is necessary
    plt.gca().clear()
    plt.close()

    import random
    from probability import inverse_normal_cdf

    # fixed seed so the generated samples are reproducible
    random.seed(0)

    # uniform between -100 and 100
    uniform = [200 * random.random() - 100 for _ in range(10000)]

    # normal distribution with mean 0, standard deviation 57
    normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

    plot_histogram(uniform, 10, "Uniform Histogram")

    plt.savefig('im/working_histogram_uniform.png')
    plt.gca().clear()
    plt.close()

    plot_histogram(normal, 10, "Normal Histogram")

    plt.savefig('im/working_histogram_normal.png')
    plt.gca().clear()

    from statistics import correlation

    print(correlation(xs, ys1))  # about 0.9
    print(correlation(xs, ys2))  # about -0.9

    from typing import List

    # Just some random data to show off correlation scatterplots
    num_points = 100

    def random_row() -> List[float]:
        # Four values per row, each built from the previous ones plus noise;
        # the last is a 6/0 indicator of whether row[2] exceeds -2.
        row = [0.0, 0, 0, 0]
        row[0] = random_normal()
        row[1] = -5 * row[0] + random_normal()
        row[2] = row[0] + row[1] + 5 * random_normal()
        row[3] = 6 if row[2] > -2 else 0
        return row

    random.seed(0)
    # each row has 4 points, but really we want the columns
    corr_rows = [random_row() for _ in range(num_points)]

    # transpose rows into columns
    corr_data = [list(col) for col in zip(*corr_rows)]

    # corr_data is a list of four 100-d vectors
    num_vectors = len(corr_data)
    fig, ax = plt.subplots(num_vectors, num_vectors)

    for i in range(num_vectors):
        for j in range(num_vectors):

            # Scatter column_j on the x-axis vs column_i on the y-axis,
            if i != j:
                ax[i][j].scatter(corr_data[j], corr_data[i])

            # unless i == j, in which case show the series name.
            else:
                ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                                  xycoords='axes fraction',
                                  ha="center",
                                  va="center")

            # Then hide axis labels except left and bottom charts
            if i < num_vectors - 1: ax[i][j].xaxis.set_visible(False)
            if j > 0: ax[i][j].yaxis.set_visible(False)

    # Fix the bottom right and top left axis labels, which are wrong because
    # their charts only have text in them
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    # plt.show()

    plt.savefig('im/working_scatterplot_matrix.png')
    plt.gca().clear()
    plt.close()
    plt.clf()

    import csv

    data: List[StockPrice] = []

    # Keep the rows that try_parse_row accepts; report the ones it rejects.
    with open("comma_delimited_stock_prices.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            maybe_stock = try_parse_row(row)
            if maybe_stock is None:
                print(f"skipping invalid row: {row}")
            else:
                data.append(maybe_stock)

    # NOTE(review): List was already imported from typing earlier in this function.
    from typing import List

    def primes_up_to(n: int) -> List[int]:
        # Trial division against the primes found so far, with a tqdm
        # progress bar whose description shows the running prime count.
        primes = [2]

        with tqdm.trange(3, n) as t:
            for i in t:
                # i is prime if no smaller prime divides it.
                i_is_prime = not any(i % p == 0 for p in primes)
                if i_is_prime:
                    primes.append(i)

                t.set_description(f"{len(primes)} primes")

        return primes

    my_primes = primes_up_to(100_000)

    # Check the first principal component of the de-meaned data against
    # the expected direction.
    de_meaned = de_mean(pca_data)
    fpc = first_principal_component(de_meaned)
    assert 0.923 < fpc[0] < 0.925
    assert 0.382 < fpc[1] < 0.384
def plot(d_mmCIF_main,d_rmsds,):
    """Relate pairwise RMSDs to differences in crystallographic metadata and plot them.

    Python 2 code: it uses print statements and sorts the list returned by
    dict.keys() in place.

    Every pair (pdb1, pdb2) of keys of d_rmsds, with pdb1 before pdb2 in
    sorted order, is visited once.  For each pair the space group, ambient
    temperature, crystal-growth pH, starting model and high-resolution limit
    are looked up with core.parse_mmCIF_item and combined with
    d_rmsds[pdb1][pdb2].

    Parameters:
        d_mmCIF_main: mapping indexed by the first four characters of each
            key of d_rmsds; each value is handed to core.parse_mmCIF_item.
        d_rmsds: nested mapping; d_rmsds[pdb1][pdb2] is read for every
            visited pair.

    Side effects:
        Writes histo_<prefix>.txt, tmp.txt and <prefix>.gnuplotdata, runs
        gnuplot and convert through os.system, calls gnuplot.scatter_plot_2d
        and gnuplot.contour_plot, and prints diagnostics, the two category
        dictionaries and the three correlations.

    Returns:
        None.

    NOTE(review): `method`, `spacegroup` and `n_mutations_max` are read below
    but never assigned in this function; presumably module-level globals -
    confirm.
    """

    # Sorted key list; pairs are formed from positions in this list.
    l_pdbs = d_rmsds.keys()
    l_pdbs.sort()

    # "x rmsd\n" text rows that are later written to the .gnuplotdata files.
    l_temperature = []
    l_ph = []
    l_resolution = []
    # Filled through append_to_dictionary; read below as d[k1][k2] -> list of rmsds.
    d_spacegroup = {}
    d_starting_model = {}

    # Parallel [x values, rmsd values] lists fed to statistics.correlation.
    l_correl_T = [[],[],]
    l_correl_pH = [[],[],]
    l_correl_resol_max = [[],[],]

    # Bin value -> number of pairs.
    d_histo_pH = {}
    d_histo_T = {}
    d_histo_resol = {}

    for i1 in range(len(l_pdbs)-1):
        pdb1 = l_pdbs[i1]
        spacegroup1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_symmetry.space_group_name_H-M',)
        T1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_diffrn.ambient_temp',)
        pH1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_exptl_crystal_grow.pH',)
        starting_model1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_refine.pdbx_starting_model',)
        resolution1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_refine.ls_d_res_high',)

        # Only partners that sort after pdb1, so each pair is handled once.
        for i2 in range(i1+1,len(l_pdbs)):
            pdb2 = l_pdbs[i2]
            spacegroup2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_symmetry.space_group_name_H-M',)
            T2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_diffrn.ambient_temp',)
            pH2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_exptl_crystal_grow.pH',)
            starting_model2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_refine.pdbx_starting_model',)
            resolution2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_refine.ls_d_res_high',)

            rmsd = d_rmsds[pdb1][pdb2]
            # Report pairs whose rmsd exceeds 1.
            if rmsd > 1:
                print pdb1, pdb2, rmsd

            # Temperature: used only when both values are truthy.
            if T1 and T2:
                T_diff = abs(float(T2)-float(T1))
                l_temperature += ['%s %s\n' %(T_diff,rmsd),]
                l_correl_T[0] += [T_diff]
                l_correl_T[1] += [rmsd]

                # Histogram key is the difference rounded to the nearest multiple of 10.
                print T_diff, 10*round(T_diff/10.,0)
                if not 10*round(T_diff/10.,0) in d_histo_T.keys():
                    d_histo_T[10*round(T_diff/10.,0)] = 0
                d_histo_T[10*round(T_diff/10.,0)] += 1

            # pH: used only when both values are truthy; histogram key is the raw difference.
            if pH1 and pH2:
                pH_diff = abs(float(pH2)-float(pH1))
                l_ph += ['%s %s\n' %(pH_diff,rmsd),]
                l_correl_pH[0] += [pH_diff]
                l_correl_pH[1] += [rmsd]

                if not pH_diff in d_histo_pH.keys():
                    d_histo_pH[pH_diff] = 0
                d_histo_pH[pH_diff] += 1

            # NOTE(review): max() is applied to the values exactly as returned by
            # core.parse_mmCIF_item.  If those are strings the comparison is
            # lexicographic ('N/A' sorts above digits, '10.0' below '2.0') - confirm
            # that this is intended.
            resolution_max = max(resolution1,resolution2,)
            # The text row is recorded even when the value is 'N/A'.
            l_resolution += ['%s %s\n' %(resolution_max,rmsd),]
            if resolution_max != 'N/A':
                l_correl_resol_max[0] += [float(resolution_max)]
                l_correl_resol_max[1] += [rmsd]

                # Histogram key is the resolution rounded to a whole number.
                if not round(float(resolution_max),0) in d_histo_resol.keys():
                    d_histo_resol[round(float(resolution_max),0)] = 0
                d_histo_resol[round(float(resolution_max),0)] += 1

            d_spacegroup = append_to_dictionary(d_spacegroup,spacegroup1,spacegroup2,rmsd,)
            d_starting_model = append_to_dictionary(d_starting_model,starting_model1,starting_model2,rmsd,)

    # Correlation of each property difference with the rmsd; printed at the end.
    r1 = statistics.correlation(l_correl_T[0],l_correl_T[1],)
    r2 = statistics.correlation(l_correl_pH[0],l_correl_pH[1],)
    r3 = statistics.correlation(l_correl_resol_max[0],l_correl_resol_max[1],)

    ##
    ## plot histograms
    ##
    for prefix,d in [
        ['deltapH',d_histo_pH,],
        ['deltaT',d_histo_T,],
        ['maxresolution',d_histo_resol,],
        ]:
        
        # One "bin count" row per histogram key, in ascending key order.
        l = []
        l_diffs = d.keys()
        l_diffs.sort()
        for diff in l_diffs:
            l += ['%s %s\n' %(diff,d[diff],)]
        fd = open('histo_%s.txt' %(prefix),'w')
        fd.writelines(l)
        fd.close()

        # gnuplot script; every histogram is rendered to the same gnuplot.ps.
        # NOTE(review): the xlabel and ylabel strings have no closing double
        # quote before the newline - confirm gnuplot accepts this.
        l = [
            'set terminal postscript eps enhanced color "Helvetica"\n',
            'set output "gnuplot.ps"\n',
            'set size 3,3\n',
            'set style data histogram\n',
            'set xtics rotate\n',
            'set xlabel "%s\n' %(prefix),
            'set ylabel "count\n',
            'plot "histo_%s.txt" u 2:xtic(1) t ""\n' %(prefix)
            ]
        fd = open('tmp.txt','w')
        fd.writelines(l)
        fd.close()

        # Render the script, then convert the PostScript output to a per-prefix PNG.
        os.system('gnuplot tmp.txt')
        os.system('convert gnuplot.ps histo_%s.png' %(prefix))

    ##
    ## plot rmsd as a function of each property (2d)
    ##
    for prefix,data,xlabel in [
        ['pH',l_ph,'pH diff',],
        ['Temperature',l_temperature,'T diff',],
        ['resolution',l_resolution,'maximum resolution',],
        ]:
        # `method` is not assigned in this function (see the docstring note).
        prefix += method
        fd = open('%s.gnuplotdata' %(prefix),'w')
        fd.writelines(data)
        fd.close()
        gnuplot.scatter_plot_2d(
            prefix,xlabel=xlabel,ylabel='RMSD %s' %(method,),
##            averages=True,
            regression=True,
            )

    ##
    ## plot rmsd as a function of each property (contour)
    ##
    for d,prefix in [
        [d_spacegroup,'spacegroup',],
        [d_starting_model,'startingmodel',],
        ]:

        # Map each category (sorted) to a tic position at index + 0.5.
        d_tics = {}
        l_tics = d.keys()
        l_tics.sort()
        for i in range(len(l_tics)):
            d_tics[l_tics[i]] = i+.5
        # z1 / z2 track the smallest / largest average found; they start at 9 and 0.
        z1 = 9
        z2 = 0

        l_data = []
        for x in range(len(l_tics)):
            k1 = l_tics[x]
            for y in range(len(l_tics)):
                k2 = l_tics[y]
                # Category pairs without data get the placeholder value 9.
                if not k2 in d[k1].keys():
                    average = 9
                else:
                    l_rmsds = d[k1][k2]
                    average = sum(l_rmsds)/len(l_rmsds)
                    if average < z1:
                        z1 = average
                    if average > z2:
                        z2 = average
                l_data += ['%s %s %s\n' %(x,y,average,)]
            # y still holds its last value from the inner loop, so this row
            # lies one step past the last category.
            l_data += ['%s %s %s\n' %(x,y+1,1,)]
            l_data += ['\n']
        # x likewise keeps its last value from the loop above; these rows add
        # one more line one step past the last category.
        # NOTE(review): x and y are unbound here when l_tics is empty.
        for y in range(len(l_tics)):
            l_data += ['%s %s %s\n' %(x+1,y,1,)]
        l_data += ['%s %s %s\n' %(x+1,y+1,1,)]
        l_data += ['\n']
        gnuplot.contour_plot(
            prefix,l_data,
            title='%s %s' %(prefix,method,),zlabel='RMSD %s' %(method),
            d_xtics = d_tics, d_ytics = d_tics,
            palette = '0 1 0 0, 0.9999 0 0 1, 0.9999 1 1 1, 1 1 1 1',
            z1 = z1, z2 = z2+0.1,
            bool_remove = False,
            )
        # `spacegroup` and `n_mutations_max` are not assigned in this function
        # (see the docstring note).
        os.system('convert %s.ps %s_spacegroup%s_mutations%s_atoms%s.png' %(prefix,prefix,spacegroup.replace(' ',''),n_mutations_max,method,))
##        os.remove('%s.ps' %(prefix,))

    print d_spacegroup
    print d_starting_model

    print r1
    print r2
    print r3

    return
 def correlation_ij(i: int, j: int) -> float:
     """Return correlation() applied to data[i] and data[j]."""
     first, second = data[i], data[j]
     return correlation(first, second)
Beispiel #41
0
def main():
    """Print monthly crime and weather summaries for Chicago and Louisiana, then chart them.

    For each place the four yearly series are printed, added up month by
    month and divided by 4; the same is done for the hard-coded weather
    readings.  Three correlations are printed (Chicago vs Louisiana crime,
    and crime vs weather for each place) and five matplotlib figures are
    shown one after another.

    NOTE: the "Correlation = ..." figures in the chart titles are literal
    text; they are not filled in from the correlations computed here.
    """

    def sum_four(first, second, third, fourth):
        # Element-wise (first + second) + (third + fourth).
        return map(add, map(add, first, second), map(add, third, fourth))

    def per_year(totals):
        # Each monthly total divided by the four years it covers.
        return [month_total/4 for month_total in totals]

    def announce(heading, *rows):
        # Heading, one line per row, then an empty line.
        print(heading)
        for row in rows:
            print(row)
        print('')

    # Total count of crime for each month: Louisiana first, then Chicago.
    louisiana_counts = [total(year1), total(year2), total(year3), total(year4)]
    chicago_counts = [total(chicago_year1), total(chicago_year2),
                      total(chicago_year3), total(chicago_year4)]

    # Chicago: yearly series, their sum and their four-year average.
    chicago_1, chicago_2, chicago_3, chicago_4 = [
        values(count) for count in chicago_counts]
    announce('Chicago crime Per month',
             chicago_1, chicago_2, chicago_3, chicago_4)
    chicago_sum = sum_four(chicago_1, chicago_2, chicago_3, chicago_4)
    announce('Sum crimes per month for all four years in Chicago', chicago_sum)
    new_chicago_avg = per_year(chicago_sum)
    announce('Average crime per month for all four years in Chicago',
             new_chicago_avg)

    # Louisiana: same three reports.
    louisiana_1, louisiana_2, louisiana_3, louisiana_4 = [
        values(count) for count in louisiana_counts]
    announce('Louisiana crime Per month',
             louisiana_1, louisiana_2, louisiana_3, louisiana_4)
    louisiana_sum = sum_four(louisiana_1, louisiana_2, louisiana_3, louisiana_4)
    announce('Sum crimes per month for all four years in Louisiana',
             louisiana_sum)
    new_louisiana_avg = per_year(louisiana_sum)
    announce('Average crime per month for all four years in Louisiana',
             new_louisiana_avg)

    # Hard-coded monthly weather readings, one list per year.
    louisiana_weather = [
        [46, 46, 42, 59, 66, 69, 73, 73, 73, 56, 47, 48],
        [32, 44, 46, 57, 62, 72, 71, 72, 72, 61, 46, 43],
        [37, 38, 55, 56, 61, 71, 70, 72, 72, 58, 42, 48],
        [39, 41, 55, 63, 67, 71, 73, 70, 70, 58, 55, 53],
    ]
    louisiana_weather_total = per_year(sum_four(*louisiana_weather))
    announce('Average weather from 2012 - 2015 for Louisiana',
             louisiana_weather_total)

    chicago_weather = [
        [18, 19, 22, 35, 47, 53, 64, 58, 58, 40, 29, 29],
        [10, 9, 21, 33, 47, 55, 60, 59, 59, 43, 28, 18],
        [17, 8, 24, 35, 48, 59, 58, 64, 64, 43, 25, 26],
        [17, 20, 33, 34, 50, 59, 60, 59, 59, 42, 35, 33],
    ]
    chicago_weather_total = per_year(sum_four(*chicago_weather))
    announce('Average weather from 2012 - 2015 for Chicago',
             chicago_weather_total)

    months = range(0, 12)

    # Each heading is printed before its correlation is computed.
    print('Correlation between between all the years for Chicago and Louisiana')
    crime_vs_crime = statistics.correlation(new_chicago_avg, new_louisiana_avg)
    print(crime_vs_crime)
    print('')
    print('Correlation between the total crime of Louisiana and the Weather for the same period of time')
    louisiana_vs_weather = statistics.correlation(new_louisiana_avg, louisiana_weather_total)
    print(louisiana_vs_weather)
    print('')
    print('Correlation between the total crime of Chicago and the Weather for the same period of time')
    chicago_vs_weather = statistics.correlation(new_chicago_avg, chicago_weather_total)
    print(chicago_vs_weather)

    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',' Dec')

    def chart(curves, heading, legend_corner, **heading_options):
        # One figure: a marked line per (series, colour, caption), then the
        # shared axis decoration, then display.
        for series, colour, caption in curves:
            plt.plot(months, series, marker='o', color=colour, label=caption)
        plt.title(heading, **heading_options)
        plt.xticks(np.arange(12), month_names)
        plt.xlabel("Months")
        plt.ylabel('Crimes')
        plt.legend(loc=legend_corner, fontsize='x-small')
        plt.show()

    chart(
        [(new_louisiana_avg, 'purple', "Louisiana 2012-2015"),
         (new_chicago_avg, 'green', "Chicago 2012-2015")],
        "Osnaldy Vasquez\nCrime Correlation between Chicago and Louisiana for 2012 - 2015\nCorrelation = 0.706490238246",
        4, fontsize='medium')

    chart(
        [(louisiana_1, 'red', "Louisiana 2012"),
         (louisiana_2, 'purple', "Louisiana 2013"),
         (louisiana_3, 'pink', "Louisiana 2014"),
         (louisiana_4, 'green', "Louisiana 2015")],
        "Osnaldy Vasquez\nCrime for Louisiana 2012 - 2015",
        4)

    chart(
        [(chicago_1, 'red', "Chicago 2012"),
         (chicago_2, 'purple', "Chicago 2013"),
         (chicago_3, 'pink', "Chicago 2014"),
         (chicago_4, 'green', "Chicago 2015")],
        "Osnaldy Vasquez\nCrime for Chicago 2012 - 2015",
        4)

    chart(
        [(new_louisiana_avg, 'blue', "Crime 2012-2015"),
         (louisiana_weather_total, 'red', " Weather 2012-2015")],
        "Osnaldy Vasquez\nCorrelation between the weather and the crime for Louisiana\nCorrelation = 0.696352921783",
        2, fontsize='medium')

    chart(
        [(new_chicago_avg, 'green', "Crime 2012-2015"),
         (chicago_weather_total, 'pink', " Weather 2012-2015")],
        "Osnaldy Vasquez\nCorrelation between the weather and the crime for Chicago\nCorrelation = 0.647656528",
        2, fontsize='medium')
    for _ in range(num_components):
        component = first_principal_component(X)
        components.append(component)
        X = remove_projection(X, component)
        
    return components

def transform_vector(v, components):
    """Return [dot(v, w) for each w in components], in the order given."""
    coordinates = []
    for component in components:
        coordinates.append(dot(v, component))
    return coordinates
    
def transform(X, components):
    """Apply transform_vector to every row of X and return the results as a list."""
    transformed = []
    for row in X:
        transformed.append(transform_vector(row, components))
    return transformed

if __name__ == "__main__":

    # Python 2 script section (print statements).
    # xs, ys1 and ys2 are not assigned in this section; presumably they are
    # defined earlier at module level - confirm.
    print "correlation(xs, ys1)", correlation(xs, ys1)
    print "correlation(xs, ys2)", correlation(xs, ys2)

    # safe parsing

    # Rows yielded by parse_rows_with, kept in file order.
    data = []

    # Binary mode is what the Python 2 csv module expects.
    with open("comma_delimited_stock_prices.csv", "rb") as f:
        reader = csv.reader(f)
        # One entry per column: dateutil.parser.parse, None, float.
        # Presumably None means "leave the field as is" - confirm against
        # parse_rows_with.
        for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
            data.append(line)

    # Echo every row that contains a None (presumably a field that could not
    # be parsed - confirm against parse_rows_with).
    for row in data:
        if any(x is None for x in row):
            print row
Beispiel #43
0
def eval_corr(v1, v2):
    """Correlate v1 with v2, pass the result to statistics.eval_correlation,
    and return that value encoded with the encoding named by encoding_out()."""
    coefficient = statistics.correlation(v1, v2)
    verdict = statistics.eval_correlation(coefficient)
    return verdict.encode(encoding_out())
def columns_correlation(matrix, i, j):
    """Return the correlation between columns i and j of matrix."""
    column_i = get_column(matrix, i)
    column_j = get_column(matrix, j)
    return correlation(column_i, column_j)
Beispiel #45
0
 def matrix_entry(i, j):
     """Return the correlation between columns i and j of the enclosing `data`."""
     column_i = get_column(data, i)
     column_j = get_column(data, j)
     return correlation(column_i, column_j)
Beispiel #46
0
 def matrix_entry(i, j):
     """Return ind.correlation of columns i and j of the enclosing `data`,
     each fetched with mat.get_collumn."""
     column_i = mat.get_collumn(data, i)
     column_j = mat.get_collumn(data, j)
     return ind.correlation(column_i, column_j)
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """Return (alpha, beta) for the line y = beta * x + alpha fitted to x and y.

    beta is correlation(x, y) scaled by the ratio of the standard deviation
    of y to that of x; alpha is chosen so the line passes through
    (mean(x), mean(y)).
    """
    r = correlation(x, y)
    spread_y = standart_deviation(y)
    spread_x = standart_deviation(x)
    slope = r * spread_y / spread_x
    intercept = mean(y) - slope * mean(x)
    return (intercept, slope)
    # two dimensions
    xs = [random_normal() for _ in range(1000)]
    ys1 = [x + random_normal() / 2 for x in xs]
    ys2 = [-x + random_normal() / 2 for x in xs]

    plot_histogram(ys1, 0.5, "ys1")
    plot_histogram(ys2, 0.5, "ys2")

    plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
    plt.scatter(xs, ys2, marker='.', color='red', label='ys2')
    plt.legend(loc=9)
    plt.title("Very different joint distributions")
    plt.show()

    print correlation(xs, ys1), correlation(xs, ys2)

    # scatterplot matrix
    # prepare data
    def make_row():
        """Build one row of four values, each derived from the ones before it."""
        base = random_normal()
        # -5 times the first value plus fresh noise, so it moves against it.
        opposed = -5 * base + random_normal()
        # Sum of the first two values plus noise scaled by 5.
        blended = base + opposed + 5 * random_normal()
        # Two-level value decided only by the third value.
        if blended > -2:
            level = 6
        else:
            level = 0
        return [base, opposed, blended, level]
    data = [make_row() for _ in range(100)]

    # plot it
    _, num_columns = shape(data)
    fig, ax = plt.subplots(num_columns, num_columns)
def columns_correlation(matrix, i, j):
    """Return the correlation between columns i and j of matrix."""
    column_i = get_column(matrix, i)
    column_j = get_column(matrix, j)
    return correlation(column_i, column_j)
Beispiel #50
0
def _run_gnuplot_script(commands, convert_command):
    """Write commands to gnu.set, run gnuplot on it, then run convert_command."""
    with open('gnu.set', 'w') as fd:
        fd.writelines(commands)
    os.system('gnuplot gnu.set')
    os.system(convert_command)


def calculate_averages_and_plot(cwd, ):
    """Compute running energy averages per protonation state and plot them.

    For each of the four topologies (NEUNEU, NEUCHA, CHANEU, CHACHA):

    * energies_<topology>.txt is read from the current directory.  Each row
      holds a step number followed by four energy terms, stored here as
      Asp52, protein, chloride and water.
    * energies_averages_<topology>.txt is written with, per row, the step
      number and the running averages of the four terms, of their sum, and
      of their sum without the chloride term.
    * the final average of the sum, the rmsd of the per-row sums around it
      (statistics.do_rmsd) and the correlation of each term with both sums
      are printed.

    Two families of gnuplot figures are then produced through gnu.set and
    converted to PNG with `convert`: one per column selection for 16 states,
    and one per pair of topologies.

    Parameters:
        cwd: not used by this function.

    Returns:
        None.

    Notes:
        * `os`, `l_states` and `d_colors` are read but not assigned here;
          presumably module-level names - confirm.
        * `statistics` is imported locally and must provide do_rmsd and
          correlation.
        * print is always called with one pre-formatted string, so the output
          is the same under the Python 2 print statement and the Python 3
          print function.
    """

    import statistics

    print('calculate averages and plot')

    for topology in ['NEUNEU', 'NEUCHA', 'CHANEU', 'CHACHA']:

        print('protonation state %s' % (topology,))

        with open('energies_%s.txt' % (topology), 'r') as fd:
            lines = fd.readlines()

        lines2 = []
        l_asp52 = []
        l_protein = []
        l_chloride = []
        l_water = []
        l_sum = []
        l_sum_excl_ions = []
        # Running totals in output-column order (asp52, protein, chloride,
        # water, sum, sum excluding ions), so each row's averages cost O(1)
        # instead of re-summing every list.
        totals = [0.0] * 6
        for count, line in enumerate(lines, 1):
            fields = line.split()
            i = int(fields[0])
            if i % 1000 == 0:
                print('average %s %s' % (topology, i))
            asp52 = float(fields[1])
            protein = float(fields[2])
            chloride = float(fields[3])
            water = float(fields[4])
            e_sum = asp52 + protein + chloride + water
            e_sum_excl_ions = asp52 + protein + water

            l_asp52.append(asp52)
            l_protein.append(protein)
            l_chloride.append(chloride)
            l_water.append(water)
            l_sum.append(e_sum)
            l_sum_excl_ions.append(e_sum_excl_ions)

            row = (asp52, protein, chloride, water, e_sum, e_sum_excl_ions)
            for column, value in enumerate(row):
                totals[column] += value
            lines2.append(
                '%i %f %f %f %f %f %f\n'
                % ((i,) + tuple(running / count for running in totals))
            )

        with open('energies_averages_%s.txt' % (topology), 'w') as fd:
            fd.writelines(lines2)

        if not l_sum:
            # Nothing to average or correlate for an empty energies file.
            print('no energies found for %s' % (topology,))
            continue

        # Sixth column of the last row: the average of the summed energy over
        # the whole run, taken exactly as it was written to the averages file.
        average = float(lines2[-1].split()[5])
        print('******** average %s' % (average,))
        ## calculate rmsd
        # Deviation of every row's own summed energy from the overall average.
        l_diff = [e_sum - average for e_sum in l_sum]
        rmsd = statistics.do_rmsd(l_diff)
        print('******** rmsd %s' % (rmsd,))

        terms = [
            ('asp52', l_asp52),
            ('protein', l_protein),
            ('chloride', l_chloride),
            ('water', l_water),
        ]
        print('INCLUDING IONS')
        for label, series in terms:
            print('correl %s %s' % (label, statistics.correlation(l_sum, series)))
        print('EXCLUDING IONS')
        for label, series in terms:
            print('correl %s %s' % (
                label, statistics.correlation(l_sum_excl_ions, series)))

    ##
    ## combined plot 3 (4 conformational states x 4 protonation states and their averages)
    ##
    ## orange=black, blue=green, yellow=red, grey=purple
    ## *NEUNEUCHA*NEU = *NEUNEUCHA*CHA (ion,water,protein)
    ## *CHANEUCHA*NEU = *CHANEUCHA*CHA (ion,water,protein)
    ## *NEUCHACHA*CHA = *NEUCHACHA*NEU (ion,water)
    ## *CHACHACHA*NEU = *CHACHACHA*CHA (ion,water)
    ## overlaps - water, ions, (protein)
    ## CHACHANEUCHA not overlap when protein
    title_template = (
        'energies of 4 conformational states at 4 different protonation states (%s)'
    )
    # (gnuplot column expression, what the title names, file suffix, y range)
    for s_col, plotted, suffix, y1, y2 in [
        ('$2+$3+$4+$5', 'all terms', '1all', -700, 250),
        ('$2+$3+$5', 'excluding ions', '1exclions', -700, 250),
        ('$3+$4+$5', 'excluding Asp52', '1exclasp52', -700, 250),
        ('$2', 'Asp52', '2asp52', -700, 250),
        ('$2', 'Asp52', '2asp52_zoom1', -80, 23),
        ('$2', 'Asp52', '2asp52_zoom2', 23, 160),
        ('$3', 'protein', '3protein', -700, 250),
        ('$4', 'ions', '4ions', -700, 250),
        ('$5', 'water', '5water', -700, 250),
        ('$5', 'water', '5water_zoom', -80, 40),
    ]:
        print('combined plot 16 states %s' % (suffix,))
        lines = [
            'set terminal postscript eps enhanced color "Helvetica" 32\n',
            'set size 3,3\n',
            'set output "combined_16states.ps"\n',
            'set xlabel "t / ps"\n',
            'set ylabel "E / kT"\n',
            'set title "%s"\n' % (title_template % (plotted,)),
        ]

        curves = []
        ## data points
        for i_state in range(16):
            state = l_states[i_state]
            # First eight states use point type 7, the rest point type 5.
            pt = 7 if i_state < 8 else 5
            ps = 1
            curves.append(
                '"../%s/energies_%s.txt" u 1:(%s) lc rgb "#%6s" ps %i pt %i t "%s"' % (
                    state[:6],
                    state[-6:],
                    s_col,
                    d_colors[state]['pc'],
                    ps,
                    pt,
                    state,
                )
            )
        ## lines (averages)
        for i_state in range(16):
            state = l_states[i_state]
            # Line width: 16 for states 0, 1, 4, 5; 12 for the other states
            # below 8; 4 for the rest.
            if i_state >= 8:
                lw = 4
            elif i_state in (0, 1, 4, 5):
                lw = 16
            else:
                lw = 12
            curves.append(
                '"../%s/energies_averages_%s.txt" u 1:(%s) w l lt 1 lc rgb "#%6s" lw %i t "%s"' % (
                    state[:6],
                    state[-6:],
                    s_col,
                    d_colors[state]['lc'],
                    lw,
                    state,
                )
            )
        lines.append(
            'plot [0:30000][%s:%s]' % (y1, y2) + ', '.join(curves) + '\n'
        )

        _run_gnuplot_script(
            lines,
            'convert combined_16states.ps combined_16states_%s.png' % (suffix),
        )

    ##
    ## combined plot 2 (2 states with individual terms and their averages)
    ##
    # (gnuplot column, line colour of the first topology, legend label); the
    # second topology of a pair uses the next colour number.
    term_styles = [
        ('($3)', 3, 'protein'),
        ('($5)', 5, 'water'),
        ('($2)', 1, 'Asp52'),
    ]
    for combination in [
        ['CHACHA', 'CHANEU'],
        ['NEUCHA', 'NEUNEU'],
    ]:

        print('combined plot %s' % (combination,))
        lines = [
            'set terminal postscript eps enhanced color "Helvetica" 32\n',
            'set size 3,3\n',
            'set output "combined.ps"\n',
            'set xlabel "t / ps"\n',
            'set ylabel "E / kT"\n',
            'set title "%s"\n' % ('%s v %s' % (
                combination[0],
                combination[1],
            )),
        ]

        curves = []
        ## data points
        for column, first_lc, label in term_styles:
            for offset, topology in enumerate(combination):
                curves.append(
                    '"../%s/energies_%s.txt" u 1:%s lc %i t "%s %s"' % (
                        topology,
                        topology,
                        column,
                        first_lc + offset,
                        topology,
                        label,
                    )
                )
        ## lines (averages)
        for column, first_lc, label in term_styles:
            for offset, topology in enumerate(combination):
                curves.append(
                    '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc %i lw 16 t "%s %s average"' % (
                        topology,
                        topology,
                        column,
                        first_lc + offset,
                        topology,
                        label,
                    )
                )
        lines.append('plot [0:][-500:100]' + ', '.join(curves) + '\n')

        _run_gnuplot_script(
            lines,
            'convert combined.ps combined_%s_v_%s.png' % (
                combination[0],
                combination[1],
            ),
        )

    return