def test_correlation(self):
    """correlation() matches a hand-computed Pearson r, returns 0 for a
    constant input, and gives ~0.25 on the sample friends/minutes data."""
    # covariance: [3, 5] [2, 10] => [-1 1] [-4 4] => 8
    # stddev: sqrt(2) sqrt(32)
    self.assertEqual(8 / math.sqrt(2) / math.sqrt(32),
                     statistics.correlation([3, 5], [2, 10]))
    self.assertEqual(0, statistics.correlation([3, 3], [2, 10]))
    # Sample data (presumably the "Data Science from Scratch" example —
    # friend counts paired with minutes spent on the site per day).
    num_friends = [100, 49, 41, 40, 25, 21, 21, 19, 19, 18, 18, 16, 15, 15, 15,
                   15, 14, 14, 13, 13, 13, 13, 12, 12, 11, 10, 10, 10, 10, 10,
                   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9,
                   9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8,
                   8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 6,
                   6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
                   6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4,
                   4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3,
                   3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2,
                   2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1,
                   1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
    daily_minutes = [1, 68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42,
                     31.22, 34.76, 54.01, 38.79, 47.59, 49.1, 27.66, 41.03,
                     36.73, 48.65, 28.12, 46.62, 35.57, 32.98, 35, 26.07,
                     23.77, 39.73, 40.57, 31.65, 31.21, 36.32, 20.45, 21.93,
                     26.02, 27.34, 23.49, 46.94, 30.5, 33.8, 24.23, 21.4,
                     27.94, 32.24, 40.57, 25.07, 19.42, 22.39, 18.42, 46.96,
                     23.72, 26.41, 26.97, 36.76, 40.32, 35.02, 29.47, 30.2,
                     31, 38.11, 38.18, 36.31, 21.03, 30.86, 36.07, 28.66,
                     29.08, 37.28, 15.28, 24.17, 22.31, 30.17, 25.53, 19.85,
                     35.37, 44.6, 17.23, 13.47, 26.33, 35.02, 32.09, 24.81,
                     19.33, 28.77, 24.26, 31.98, 25.73, 24.86, 16.28, 34.51,
                     15.23, 39.72, 40.8, 26.06, 35.76, 34.76, 16.13, 44.04,
                     18.03, 19.65, 32.62, 35.59, 39.43, 14.18, 35.24, 40.13,
                     41.82, 35.45, 36.07, 43.67, 24.61, 20.9, 21.9, 18.79,
                     27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 13.82, 33.2,
                     25, 33.1, 36.65, 18.63, 14.87, 22.2, 36.81, 25.53, 24.62,
                     26.25, 18.21, 28.08, 19.42, 29.79, 32.8, 35.99, 28.32,
                     27.79, 35.88, 29.06, 36.28, 14.1, 36.63, 37.49, 26.9,
                     18.58, 38.48, 24.48, 18.95, 33.55, 14.24, 29.04, 32.51,
                     25.63, 22.22, 19, 32.73, 15.16, 13.9, 27.2, 32.01, 29.27,
                     33, 13.74, 20.42, 27.32, 18.23, 35.35, 28.48, 9.08,
                     24.62, 20.12, 35.26, 19.92, 31.02, 16.49, 12.16, 30.7,
                     31.22, 34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42,
                     17.44, 10.12, 24.42, 9.82, 23.39, 30.93, 15.03, 21.67,
                     31.09, 33.29, 22.61, 26.89, 23.48, 8.38, 27.81, 32.35,
                     23.84]
    # Only two decimal places: the exact value depends on float rounding.
    self.assertAlmostEqual(0.25,
                           statistics.correlation(num_friends, daily_minutes),
                           places=2)
def test_correlation():
    """Correlation matrix: identical rows give all ones; a negated row
    flips the off-diagonal sign. Accepts list-of-arrays or a 2-D ndarray."""
    ramp = np.arange(0, 5)
    all_ones = np.ones([2, 2])
    # list-of-arrays input
    assert np.allclose(statistics.correlation([ramp, ramp]), all_ones)
    # 2-D ndarray input
    assert np.allclose(statistics.correlation(np.array([ramp, ramp])),
                       all_ones)
    # perfectly anti-correlated rows
    assert np.allclose(statistics.correlation([ramp, -1 * ramp]),
                       np.array([[1, -1], [-1, 1]]))
def run_two_demension_data_process():
    """Scatter-plot two synthetic series with opposite correlation to xs,
    then print both correlation coefficients.

    Relies on module-level helpers: random_normal(), correlation(), plt.
    Side effects: shows a matplotlib window and prints two floats.
    """
    xs = [random_normal() for _ in range(1000)]
    ys1 = [x + random_normal() / 2 for x in xs]   # positively correlated with xs
    ys2 = [-x + random_normal() / 2 for x in xs]  # negatively correlated with xs
    plt.scatter(xs, ys1, marker='.', color='black', label='ys1')
    plt.scatter(xs, ys2, marker='.', color='gray', label='ys2')
    # NOTE(review): this labels the y-axis 'xs'; possibly plt.xlabel was meant.
    plt.ylabel('xs')
    plt.legend(loc=9)
    plt.title("Different distribution")  # fixed typo: "Diferent" -> "Different"
    plt.grid()
    plt.show()
    print(correlation(xs, ys1))
    print(correlation(xs, ys2))
def run_analysis():
    """Load the wine-quality CSV, print per-feature summary statistics, then
    report the strongest and weakest linearly related feature pairs.

    Relies on module-level helpers: load_data, mean, median, variance,
    correlation. Side effects: printing only.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)
    # first way of printing. Everything casted to string, and spaces put
    # automatically between passed values.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # second way of printing: str.format fills {} with its arguments and
        # the format specs limit the number of digits for floats.
        print('"{}". Mean: {:3.2f}, Median: {:.2f}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    strongest_pair = ["aaa", "bbb"]
    high_correlation = 0.0
    weakest_pair = ["aaa", "bbb"]
    low_correlation = 1.0
    # compares correlation between all keys in data dictionary
    for i, keys1 in enumerate(data):
        for j, keys2 in enumerate(data):
            if j <= i:
                continue
            # PERF FIX: compute the correlation once per pair instead of up to
            # three times (it was recomputed in each test and each assignment).
            corr = correlation(data[keys1], data[keys2])
            if abs(corr) > abs(high_correlation):
                # strong correlation - far from 0; store names alphabetically
                strongest_pair = sorted([keys1, keys2])
                high_correlation = corr
            if abs(corr) < abs(low_correlation):
                # weak correlation - close to 0; store names alphabetically
                weakest_pair = sorted([keys1, keys2])
                low_correlation = corr
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                        high_correlation))
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(
              *weakest_pair, low_correlation))  # * converts list to arguments.
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]:
    """Fit y ~ alpha + beta * x by ordinary least squares.

    Returns the (alpha, beta) pair: beta is r(x, y) scaled by the ratio of
    standard deviations, and alpha makes the line pass through the means.
    """
    slope = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    intercept = mean(y) - slope * mean(x)
    return intercept, slope
def test_correlation(self):
    """correlation() matches a hand-computed Pearson r, returns 0 for a
    constant input, and gives ~0.25 on the sample friends/minutes data."""
    # covariance: [3, 5] [2, 10] => [-1 1] [-4 4] => 8
    # stddev: sqrt(2) sqrt(32)
    self.assertEqual(8 / math.sqrt(2) / math.sqrt(32),
                     statistics.correlation([3, 5], [2, 10]))
    self.assertEqual(0, statistics.correlation([3, 3], [2, 10]))
    # Sample data (presumably the "Data Science from Scratch" example —
    # friend counts paired with minutes spent on the site per day).
    num_friends = [
        100, 49, 41, 40, 25, 21, 21, 19, 19, 18, 18, 16, 15, 15, 15, 15, 14,
        14, 13, 13, 13, 13, 12, 12, 11, 10, 10, 10, 10, 10, 10, 10, 10, 10,
        10, 10, 10, 10, 10, 10, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
        9, 9, 9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7, 7, 7, 7,
        7, 7, 7, 7, 7, 7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
        6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
        5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
    ]
    daily_minutes = [
        1, 68.77, 51.25, 52.08, 38.36, 44.54, 57.13, 51.4, 41.42, 31.22,
        34.76, 54.01, 38.79, 47.59, 49.1, 27.66, 41.03, 36.73, 48.65, 28.12,
        46.62, 35.57, 32.98, 35, 26.07, 23.77, 39.73, 40.57, 31.65, 31.21,
        36.32, 20.45, 21.93, 26.02, 27.34, 23.49, 46.94, 30.5, 33.8, 24.23,
        21.4, 27.94, 32.24, 40.57, 25.07, 19.42, 22.39, 18.42, 46.96, 23.72,
        26.41, 26.97, 36.76, 40.32, 35.02, 29.47, 30.2, 31, 38.11, 38.18,
        36.31, 21.03, 30.86, 36.07, 28.66, 29.08, 37.28, 15.28, 24.17, 22.31,
        30.17, 25.53, 19.85, 35.37, 44.6, 17.23, 13.47, 26.33, 35.02, 32.09,
        24.81, 19.33, 28.77, 24.26, 31.98, 25.73, 24.86, 16.28, 34.51, 15.23,
        39.72, 40.8, 26.06, 35.76, 34.76, 16.13, 44.04, 18.03, 19.65, 32.62,
        35.59, 39.43, 14.18, 35.24, 40.13, 41.82, 35.45, 36.07, 43.67, 24.61,
        20.9, 21.9, 18.79, 27.61, 27.21, 26.61, 29.77, 20.59, 27.53, 13.82,
        33.2, 25, 33.1, 36.65, 18.63, 14.87, 22.2, 36.81, 25.53, 24.62,
        26.25, 18.21, 28.08, 19.42, 29.79, 32.8, 35.99, 28.32, 27.79, 35.88,
        29.06, 36.28, 14.1, 36.63, 37.49, 26.9, 18.58, 38.48, 24.48, 18.95,
        33.55, 14.24, 29.04, 32.51, 25.63, 22.22, 19, 32.73, 15.16, 13.9,
        27.2, 32.01, 29.27, 33, 13.74, 20.42, 27.32, 18.23, 35.35, 28.48,
        9.08, 24.62, 20.12, 35.26, 19.92, 31.02, 16.49, 12.16, 30.7, 31.22,
        34.65, 13.13, 27.51, 33.2, 31.57, 14.1, 33.42, 17.44, 10.12, 24.42,
        9.82, 23.39, 30.93, 15.03, 21.67, 31.09, 33.29, 22.61, 26.89, 23.48,
        8.38, 27.81, 32.35, 23.84
    ]
    # Only two decimal places: the exact value depends on float rounding.
    self.assertAlmostEqual(0.25,
                           statistics.correlation(num_friends, daily_minutes),
                           places=2)
def calculate_correlation(l1,l2): import sys sys.path.append('/home/people/tc/svn/tc_sandbox/misc/') import statistics r = statistics.correlation(l1,l2) return r
def calculate_correlation(l1, l2): import sys sys.path.append('/home/people/tc/svn/tc_sandbox/misc/') import statistics r = statistics.correlation(l1, l2) return r
def least_squares_fit(x, y):
    """Given training values for x and y, return the least-squares
    (alpha, beta) for the model y ~ alpha + beta * x."""
    # slope: r(x, y) scaled by the ratio of the standard deviations
    slope = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    # intercept: makes the fitted line pass through the mean point
    return mean(y) - slope * mean(x), slope
def corr(x, y, n):
    # Compare the statistics-module correlation against numpy's corrcoef for
    # the first n (x[i], y[i]) pairs, reporting via the project's print helpers.
    for i in range(n):
        corrnp = np.corrcoef(x[i], y[i])  # 2x2 matrix; [1][0] is the r value
        corrs = s.correlation(x[i], y[i])
        # NOTE(review): int() truncates toward zero, so any two r values in
        # (-1, 1) both become 0 and compare equal — this check effectively
        # only distinguishes r == +/-1. Consider a tolerance comparison
        # (e.g. np.isclose); confirm the intent with the author.
        if (int(corrnp[1][0]) == int(corrs)):
            printpass()
        else:
            printfail()
            printYE(corrs, corrnp[1][0])
def run_analysis():
    """
    run analysis on the file in file_path
    * prints the mean, median and std for all features in the file
    * prints the highest correlated pair (of features) and the least
      correlated pair, and the value of the correlation
    :return: void
    """
    data = load_data('./winequality.csv')
    # plain print: values are stringified and space-separated automatically
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # str.format fills {} placeholders; the specs bound float digits
        print('"{}". Mean: {:3.2f}, Median: {:3.2f}, Std: {:3.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    high_correlation = 0.0
    low_correlation = 1.0
    strongest_pair = ("aaa", "bbb")
    weakest_pair = ("aaa", "bbb")
    arranged_data = sorted(data.items())  # (feature, values) pairs, by name
    # visit every unordered pair exactly once (outer index strictly greater)
    for outer_index, (outer_name, outer_values) in enumerate(arranged_data):
        for inner_index, (inner_name, inner_values) in enumerate(arranged_data):
            if outer_index <= inner_index:
                continue
            r = correlation(inner_values, outer_values)
            if abs(r) > abs(high_correlation):
                strongest_pair = (min(outer_name, inner_name),
                                  max(outer_name, inner_name))
                high_correlation = r
            if abs(r) < abs(low_correlation):
                weakest_pair = (min(outer_name, inner_name),
                                max(outer_name, inner_name))
                low_correlation = r
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:3.4f}'.format(strongest_pair[0], strongest_pair[1],
                                         high_correlation))
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:3.4f}'.format(
              *weakest_pair, low_correlation))  # * converts list to arguments.
def correlation(x, y):
    """Return Pearson's correlation coefficient for x and y.

    Pearson's correlation coefficient takes values between -1 and +1.
    It measures the strength and direction of the linear relationship
    between x and y, where +1 means very strong, positive linear relationship,
    -1 very strong, negative linear relationship, and 0 no linear relationship.
    """
    n = len(x)
    if len(y) != n:
        raise statistics.StatisticsError(
            'covariance requires that both inputs '
            'have same number of data points')
    if n < 2:
        raise statistics.StatisticsError(
            'covariance requires at least two data points')
    sectype = type(x[0])  # all elts of x assumed of same type
    if not issubclass(sectype, SecureObject):
        # Plain (non-secure) data: delegate to the stdlib when it has
        # correlation() (added in Python 3.10), else use the copied code below.
        if sys.version_info.minor >= 10:
            return statistics.correlation(x, y)

        # inline code of statistics.correlation() copied from Python 3.10.0:
        xbar = fsum(x) / n
        ybar = fsum(y) / n
        sxy = fsum((xi - xbar) * (yi - ybar) for xi, yi in zip(x, y))
        sxx = fsum((xi - xbar)**2.0 for xi in x)
        syy = fsum((yi - ybar)**2.0 for yi in y)
        try:
            return sxy / sqrt(sxx * syy)
        except ZeroDivisionError:
            raise statistics.StatisticsError(
                'at least one of the inputs is constant') from None

    if issubclass(sectype, SecureFixedPoint):
        # Secure fixed-point path: same formula, evaluated with the MPC
        # runtime's sum/inner-product primitives and the secure sqrt helper.
        xbar = runtime.sum(x) / n
        ybar = runtime.sum(y) / n
        xxbar = [xi - xbar for xi in x]
        yybar = [yi - ybar for yi in y]
        sxy = runtime.in_prod(xxbar, yybar)
        sxx = runtime.in_prod(xxbar, xxbar)
        syy = runtime.in_prod(yybar, yybar)
        return sxy / (_fsqrt(sxx) * _fsqrt(syy))

    # Secure integers (or other secure types) are not supported here.
    raise TypeError('secure fixed-point type required')
def test_plain(self):
    """On plain (non-secure) data, every wrapped statistic must agree exactly
    with its stdlib statistics counterpart."""
    def fresh():
        # new generator each call: each statistic consumes its own copy
        return (i * j for i in range(-1, 2, 1) for j in range(2, -2, -1))

    self.assertEqual(mean(fresh()), statistics.mean(fresh()))
    self.assertEqual(variance(fresh()), statistics.variance(fresh()))
    self.assertEqual(stdev(fresh()), statistics.stdev(fresh()))
    self.assertEqual(pvariance(fresh()), statistics.pvariance(fresh()))
    self.assertEqual(pstdev(fresh()), statistics.pstdev(fresh()))
    self.assertEqual(mode(fresh()), statistics.mode(fresh()))
    self.assertEqual(median(fresh()), statistics.median(fresh()))
    self.assertEqual(quantiles(fresh()), statistics.quantiles(fresh()))
    self.assertEqual(quantiles(fresh(), n=6, method='inclusive'),
                     statistics.quantiles(fresh(), n=6, method='inclusive'))
    x = list(fresh())
    y = list(reversed(x))
    self.assertEqual(covariance(x, y), statistics.covariance(x, y))
    self.assertEqual(correlation(x, y), statistics.correlation(x, y))
    self.assertEqual(linear_regression(x, y),
                     statistics.linear_regression(x, y))
def run_analysis():
    """
    Run analysis on data then prints information
    :param path: None
    :return: None
    """
    file_path = './winequality.csv'
    data = load_data(file_path)
    # first way of printing. Everything casted to string, and spaces put
    # automatically between passed values.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # second way of printing: str.format fills the {} placeholders; the
        # format specs bound the number of digits printed for the floats.
        print('"{0}". Mean: {1:.2f}, Median: {2:.2f}, Std: {3:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    # find the strongest / weakest correlated feature pairs
    keys = data.keys()
    strongest_pair = ("aaa", "bbb")
    high_correlation = -0.9
    weakest_pair = ("aaa", "bbb")
    low_correlation = 0.1
    for key1 in keys:
        for key2 in keys:
            current_correlation = correlation(data[key1], data[key2])
            # BUG FIX: compare keys by value (!=) instead of identity
            # (`is not`) — identity of equal strings is an interning
            # implementation detail and must not be relied on.
            if current_correlation > high_correlation and key1 != key2:
                strongest_pair = (key1, key2)
                high_correlation = current_correlation
            elif fabs(current_correlation) < fabs(low_correlation):
                weakest_pair = (key1, key2)
                low_correlation = current_correlation
    # sort each pair alphabetically before printing
    strongest_pair = sorted(strongest_pair)
    weakest_pair = sorted(weakest_pair)
    print('The strongest linear relationship is between: "{0}","{1}". '
          'The value is: {2:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                         high_correlation))
    print('The weakest linear relationship is between: "{0}","{1}". '
          'The value is: {2:.4f}'.format(
              weakest_pair[0], weakest_pair[1],
              low_correlation))  # * converts list to arguments.
def calculate_correlation_of_all_lists(data):
    """
    calculate the correlation for each to pairs of headers from data
    :param data: dictionary of headers and list values
    :return: returns list of all the pairs and their correlations
    """
    headers = []
    value_lists = []
    for feature_name, list_of_values in sorted(data.items()):
        headers.append(feature_name)
        value_lists.append(list_of_values)
    correlations = []
    # every ordered pair (i, j) with i != j — note each unordered pair
    # therefore appears twice in the result, with the same sorted name pair
    for i, first in enumerate(value_lists):
        for j, second in enumerate(value_lists):
            if i == j:
                continue
            pair = [headers[i], headers[j]]
            correlations.append([sorted(pair), correlation(first, second)])
    return correlations
def run_analysis():
    """
    the function that used to calculate for the main function purpose
    """
    file_path = './winequality.csv'
    data = load_data(file_path)
    # first way of printing. Everything casted to string, and spaces put
    # automatically between passed values.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # second way of printing: str.format fills {} with its arguments and
        # the format specs limit the number of digits for the floats.
        print('"{}". Mean: {:3.2f}, Median: {:.2f}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    # here you should compute correlations. Be careful, pair should be sorted
    # before printing
    min_1 = 2   # smaller than any |r|, so first pair always initializes it
    max_1 = -2  # smaller than any r, so first pair always initializes it
    strongest_pair = []
    weakest_pair = []
    # PERF FIX: materialize data.items() once — the original rebuilt
    # list(data.items()) inside both loops on every single iteration.
    items = list(data.items())
    for index in range(len(items) - 1):
        feature_name_1, list_of_values_1 = items[index]
        for index_2 in range(index + 1, len(items)):
            feature_name_2, list_of_values_2 = items[index_2]
            cor = correlation(list_of_values_1, list_of_values_2)
            if abs(cor) < abs(min_1):
                min_1 = cor
                weakest_pair = sorted([feature_name_1, feature_name_2])
            # NOTE(review): the strongest check is on the signed value while
            # the weakest uses abs() — a strong negative correlation is never
            # reported as strongest. Preserved as-is; confirm intent.
            if cor > max_1:
                max_1 = cor
                strongest_pair = sorted([feature_name_1, feature_name_2])
    high_correlation = max_1
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                        high_correlation))
    low_correlation = min_1
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(
              *weakest_pair, low_correlation))  # * converts list to arguments.
def run_analysis():
    """Print per-feature summary statistics for the wine-quality data, then
    the strongest and weakest correlated feature pairs.

    Relies on module-level helpers: load_data, mean, median, variance,
    correlation. Side effects: printing only.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)
    # first way of printing. Everything casted to string, and spaces put
    # automatically between passed values.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # second way of printing: str.format fills {} with its arguments and
        # the format specs limit the number of digits for the floats.
        print('"{}". Mean: {:3.2f}, Median: {:.2f}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    low_correlation = 1
    high_correlation = 0
    weakest_pair = [0, 0]
    strongest_pair = [0, 0]
    for feature_name1, list_of_values1 in sorted(data.items()):
        for feature_name2, list_of_values2 in sorted(data.items()):
            if feature_name1 != feature_name2:
                corr = correlation(list_of_values1, list_of_values2)
                if abs(corr) < abs(low_correlation):
                    weakest_pair[0] = feature_name1
                    weakest_pair[1] = feature_name2
                    low_correlation = corr
                # BUG FIX: this was `elif`, so a pair that was simultaneously
                # the new weakest AND the new strongest only updated one of
                # the two; the checks are independent and must both run.
                if abs(corr) > abs(high_correlation):
                    strongest_pair[0] = feature_name1
                    strongest_pair[1] = feature_name2
                    high_correlation = corr
    weakest_pair.sort()
    strongest_pair.sort()
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(strongest_pair[0], strongest_pair[1],
                                        high_correlation))
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {:.4f}'.format(
              weakest_pair[0], weakest_pair[1],
              low_correlation))  # * converts list to arguments.
def run_analysis():
    """Print per-feature summary statistics for the wine-quality data and the
    extreme pairwise correlation values.

    Relies on module-level helpers: load_data, mean, median, variance,
    correlation. Side effects: printing only.
    """
    file_path = './winequality.csv'
    data = load_data(file_path)
    # first way of printing. Everything casted to string, and spaces put
    # automatically between passed values.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # second way of printing: str.format fills {} with its arguments and
        # the format specs limit the number of digits for the floats.
        print('"{}". Mean: {:3.2f}, Median: {:0.2f}, Std: {:0.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    correlations = []
    for key_1 in data.keys():
        for key_2 in data.keys():
            if key_1 != key_2:
                # BUG FIX: correlate the value LISTS, not the key strings —
                # the original passed key_1/key_2 (feature names) directly.
                correlations.append(correlation(data[key_1], data[key_2]))
    correlations.sort()
    min_correlations = correlations[0]
    max_correlations = correlations[len(correlations) - 1]
    # here you should compute correlations. Be careful, pair should be sorted
    # before printing
    # NOTE(review): the pairs below are still hard-coded placeholders; the
    # computed min/max correlations above are never attached to their feature
    # names — unfinished exercise code, preserved as-is.
    strongest_pair = ("aaa", "bbb")
    high_correlation = -0.9
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(strongest_pair[0], strongest_pair[1],
                                    high_correlation))
    weakest_pair = ("aaa", "bbb")
    low_correlation = 0.1
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(
              *weakest_pair, low_correlation))  # * converts list to arguments.
def run_analysis():
    # Debug/exploratory version: prints a couple of specific correlations
    # before the per-feature summary; the strongest-pair search is unfinished.
    file_path = './winequality.csv'
    data = load_data(file_path)
    print("corra")
    # print(correlation([1,2,4,5,8],[5,20,40,80,100]))
    # print(variance([1,2,4,5,8])**0.5)
    # print(variance([5,20,40,80,100])**0.5)
    print(
        correlation(data["free sulfur dioxide"], data["total sulfur dioxide"]))
    # NOTE(review): numpy.correlate computes a sliding cross-correlation (dot
    # products), not Pearson's r — its output is not comparable to the line
    # above; numpy.corrcoef would be. Confirm which was intended.
    print(
        numpy.correlate(data["free sulfur dioxide"],
                        data["total sulfur dioxide"]))
    # print(correlation(data["total sulfur dioxide"]))
    # first way of printing. Everything casted to string, and spaces put
    # automatically between passed values.
    print('Number of features:', len(data))
    for feature_name, list_of_values in sorted(data.items()):
        # second way of printing. We print single string after format
        # function. Format function fills {} with values passed as arguments.
        # It has nice applications for better printing, like limiting number
        # of digits for floats or other formatting tools.
        print('"{}". Mean: {:3.2f}, Median: {}, Std: {:.4f}'.format(
            feature_name, mean(list_of_values), median(list_of_values),
            variance(list_of_values)**0.5))
    # here you should compute correlations. Be careful, pair should be sorted
    # before printing
    strongest_pair = ("aaa", "bbb")  # placeholder — never computed
    high_correlation = -0.9
    print('The strongest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(strongest_pair[0], strongest_pair[1],
                                    high_correlation))
    weakest_pair = ("free sulfur dioxide", "total sulfur dioxide")
    low_correlation = 0.1
    print('The weakest linear relationship is between: "{}","{}". '
          'The value is: {}'.format(
              *weakest_pair, low_correlation))  # * converts list to arguments.
def main(
    self,
    l_wts,
    d_pred,
    l_xtics,
):
    # Python 2 pipeline: for each wild-type PDB id, prepare a working
    # directory, run the WHAT IF preprocessing, parse predicted chemical
    # shifts, correlate them with the experimental values, and finally emit
    # gnuplot histograms per titratable group.
    import os, sys
    sys.path.append('/home/people/tc/svn/tc_sandbox/misc/')
    import gnuplot, statistics
    ## parse experimental data
    d_exp = self.dic2csv(l_xtics)
    ## get cwd
    dir_main = os.getcwd()
    l_r = []
    for pdb in l_wts:
        print pdb, l_wts.index(pdb)
        # one working subdirectory per structure
        if not os.path.isdir('%s/%s' % (dir_main, pdb)):
            os.mkdir('%s/%s' % (dir_main, pdb))
        os.chdir('%s/%s' % (dir_main, pdb))
        self.pre_whatif(pdb)
        # these two structures already have a protonated monomer available
        if pdb in [
            '2vb1', '1vdp',
        ]:
            os.system('cp %s_monomer.pdb %s_protonated.pdb' % (pdb, pdb))
##        else:
##            self.whatif(pdb)
##            self.calculate_chemical_shifts(pdb)
        ## parse computational predictions
        d_pred = self.parse_chemical_shifts(pdb, d_pred)
        ## calculate correlation coefficients
        l_exp = []
        l_pred = []
        for titgrp in d_exp.keys():
            # titgrp is e.g. 'E35': one-letter residue symbol + number
            res_number = int(titgrp[1:])
            res_symbol = titgrp[0]
            res_name = self.d_ressymbol2resname[res_symbol]
            for nucleus in d_exp[titgrp].keys():
                cs_exp = d_exp[titgrp][nucleus]
                l_exp += [cs_exp]
                # predicted shift: latest value for the matching nucleus
                index = nucleus.index('N-HN')
                cs_pred = d_pred['%s%i' % (res_name, res_number)][nucleus[:index]][-1]
                l_pred += [cs_pred]
        r = statistics.correlation(l_exp, l_pred)
        l_r += [r]
##        print titgrp,r
##    print sum(l_r)/len(l_r), min(l_r), max(l_r)
        ## change from local dir to main dir
        os.chdir(dir_main)
    ## plots
    for titgrp1 in d_exp.keys() + ['E35']:
        res_number = int(titgrp1[1:])
        res_symbol = titgrp1[0]
        res_name = self.d_ressymbol2resname[res_symbol]
        titgrp3 = '%s%i' % (res_name, res_number)
        prefix = 'delta_cs_%s' % (titgrp3)
        ylabel = '{/Symbol D}{/Symbol w}_H'  # gnuplot enhanced-text: delta-omega
        title = titgrp3
        gnuplot.histogram(
            d_pred[titgrp3],
            prefix,
            l_xtics,
            ylabel=ylabel,
            title=title,
##            l_plotdatafiles=['E34.txt'],
        )
    return
# two dimensions xs = [random_normal() for _ in range(1000)] ys1 = [x + random_normal() / 2 for x in xs] ys2 = [-x + random_normal() / 2 for x in xs] plot_histogram(ys1, 0.5, "ys1") plot_histogram(ys2, 0.5, "ys2") plt.scatter(xs, ys1, marker='.', color='black', label='ys1') plt.scatter(xs, ys2, marker='.', color='red', label='ys2') plt.legend(loc=9) plt.title("Very different joint distributions") plt.show() print correlation(xs, ys1), correlation(xs, ys2) # scatterplot matrix # prepare data def make_row(): v0 = random_normal() v1 = -5 * v0 + random_normal() # negatively correlated to v0 v2 = v0 + v1 + 5 * random_normal( ) # positively correlated to both v0 and v1 v3 = 6 if v2 > -2 else 0 # depends exclusively on v2 return [v0, v1, v2, v3] data = [make_row() for _ in range(100)] # plot it _, num_columns = shape(data)
def eval_corr(v1, v2):
    # Map the correlation coefficient of v1/v2 to an evaluation label and
    # encode it as bytes for output (Python 2 encoding dance).
    return statistics.eval_correlation(statistics.correlation(v1, v2)).encode(encoding_out())


if __name__ == "__main__":
    # Only Shift-JIS, tsv supported
    parser = argparse.ArgumentParser()
    parser.add_argument('file', metavar='FILE', help=u'tab split file. need all columns.')
    parser.add_argument('-e', '--eval', action="store_true", help=u'correlation value to evaluation text.')
    parser.add_argument('-d', '--delimiter', metavar='DELIMITER', default='\t', help=u'output delimiter.')
    opt = parser.parse_args()
    # read records from the file
    data = read_data(opt.file.decode(encoding_in()))
    columns = data[0]
    # transpose the data rows into per-column lists
    data = map(list, zip(*data[1:]))
    # map column name -> vector of its values (first column is skipped)
    data_set = dict(zip(columns[1:], data[1:]))
    names = data_set.keys()
    # header row of the correlation matrix
    print opt.delimiter.join(['-'] + names)
    for y in names:
        if opt.eval:
            # evaluation labels instead of raw coefficients
            records = [y] + ['-' if x == y else eval_corr(data_set[y], data_set[x]) for x in names]
        else:
            # raw correlation coefficients ('1' on the diagonal)
            records = [y] + ['1' if x == y else str(statistics.correlation(data_set[y], data_set[x])) for x in names]
        print opt.delimiter.join(records)
        # tail of a PCA-style routine whose definition starts before this
        # chunk — presumably iteratively deflating X; verify against the
        # enclosing function.
        X = remove_projection(X, component)
    return components


def transform_vector(v, components):
    # project v onto each principal component
    return [dot(v, w) for w in components]


def transform(X, components):
    # project every row of X into component space
    return [transform_vector(x_i, components) for x_i in X]


if __name__ == "__main__":
    compare_two_distributions()
    # Python 2 print statements
    print "correlation(xs, ys1)", correlation(xs, ys1)
    print "correlation(xs, ys2)", correlation(xs, ys2)
    #make_scatterplot_matrix()

    # safe parsing
    data = []
    with open("comma_delimited_stock_prices.csv", "rb") as f:
        reader = csv.reader(f)
        # rows whose fields fail to parse come back with None entries
        for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]):
            if any(x is None for x in line):
                pass  # skip malformed rows
            else:
                data.append(line)
def matrix_entry(i, j):
    """Correlation between columns i and j of the surrounding data matrix."""
    column_i = get_column(data, i)
    column_j = get_column(data, j)
    return correlation(column_i, column_j)
import statistics as stat
import csv


def get_data(filename):
    # Read a CSV file and return columns 1 and 2 as parallel float lists.
    # Python 2 style: binary mode open, reader.next() to skip the header.
    x = []
    y = []
    with open(filename, 'rb') as csvfile:
        reader = csv.reader(csvfile)
        reader.next()  # skip header row
        for row in reader:
            x.append(float(row[1]))
            y.append(float(row[2]))
    return x,y


x,y = get_data('../../data/football.csv')
# NOTE(review): a three-argument correlation() implies a custom statistics
# module (the stdlib one takes exactly two sequences) — verify which module
# the sys.path resolves 'statistics' to here.
print stat.correlation(x,y,"population")
def least_squares_fit(x, y):
    """Simple linear regression: return (alpha, beta) minimizing the squared
    error of y ~ alpha + beta * x."""
    slope = correlation(x,y) * standard_deviation(y) / standard_deviation(x)
    intercept = mean(y) - slope * mean(x)
    return intercept, slope
def least_squares_fit(x, y):
    """Given training values for x and y, return the least-squares
    (alpha, beta) of the line y ~ alpha + beta * x."""
    # slope from the correlation scaled by the spread ratio
    fitted_beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x)
    # intercept chosen so the line passes through the point of means
    fitted_alpha = mean(y) - fitted_beta * mean(x)
    return fitted_alpha, fitted_beta
#Cumulative averages X1_cummean = cumsum( X1 ) / ( 1 + arange( len( X1 ))) X2_cummean = cumsum( X2 ) / ( 1 + arange( len( X1 ))) pylab.figure( 2 ) pylab.plot( X1_cummean,"b" ) pylab.title("Empirical mean of X_1") pylab.figure( 3 ) pylab.plot( X2_cummean,"r" ) pylab.title("Empirical mean of X_2") pylab.show() #Autocorrelation X1_sd = sqrt( var( X1 ) ) X2_sd = sqrt( var( X2 ) ) X1_autocorr = statistics.correlation( X1[1: ], X1[:-1]) X2_autocorr = statistics.correlation( X2[1: ], X2[:-1]) print "The autocorrelation of X_1 is", X1_autocorr print "The autocorrelation of X_2 is", X2_autocorr #Effective sample size X1_ess = n * ( 1 - X1_autocorr ) / ( 1 + X1_autocorr ) X2_ess = n * ( 1 - X2_autocorr ) / ( 1 + X2_autocorr ) print "The effective sample size of X_1 is", X1_ess print "The effective sample size of X_2 is", X2_ess #Task 4: Repeat with sigma_prop = 0.1, sigma_prop = 10 #Task 5: Repeat with sigma = array([[4,2.8],[2.8,4]])
# Collect (price, temperature) pairs across states; prices of 0 are treated
# as "no production" and only contribute to the produced_or_not indicator.
prices = []
temps = []
produced_or_not = []
for state in states:
    for pdata, tdata in zip(apples_info[state].values(), state_temperatures[state].values()):
        for price, temp in zip(pdata, tdata):
            if price > 0:
                prices.append(price)
                temps.append(temp)
                plt.scatter(temp, price, c='r')  # one point per observation
                produced_or_not.append((temp,1))
            else:
                produced_or_not.append((temp,0))
result = stats.linregress(prices, temps)
correlation = st.correlation(prices, temps)
# point-biserial-style correlation between temperature and produced flag
temps2, produced = zip(*produced_or_not)
correlation2 = st.correlation(temps2, produced)
print "Correlation: ", correlation
print result
print "SD prices:", np.std(prices)
print "SD temps:", np.std(temps)
print "Correlation 2:", correlation2
# print temps
plt.title("Effects of temperature on apple production")
# plt.xlabel("Price in pounds")
# plt.ylabel("Temperature")
plt.xlabel("Temperature")
plt.ylabel("Price in Pounds")
plt.show()
def main():
    # Python 2 analysis pipeline: for each apo/holo structure pair, align the
    # C-alpha coordinates, compute the apo->holo displacement vector, run a
    # normal-mode calculation, and plot per-residue amplitudes correlated
    # against mode 7 via gnuplot. Heavy filesystem/OS side effects throughout.
    ## dictionary of apo and holo structures (from what script???)
    d_apo2holo = {
        ## conformational selection
        ## RNase A, 1kf3 high resolution
##        '1kf3': {'holo': '1rpg','ligand':'CPA',},
        '1kf5': {
            'holo': '1eow','ligand':'U2G',
            'site':[
                11,43,44,
##                119,120,121,122, ## terminal flexible residues...
            ],
            'title':'Ribonuclease (RNase) A',
        },
        ## CypA, highest resolution room
##        '3k0n': {'holo': '1cwa','ligand':['DAL','MLE','MVA','BMT','ABA','SAR',],'site':[18-1,54-1,59-1,62-1,71-1,100-1,101-1,102-1,110-1,112-1,120-1,121-1,125-1,163-1,],'title':'Peptidyl-prolyl isomerase A (CypA)',},
        '1w8v': {'holo': '1w8m','ligand':['E1P',],'site':[54,62,100,101,112,125,],'title':'Peptidyl-prolyl isomerase A (CypA)',},
        ## DHFR
        '1ra9': {'holo': '1ra2','ligand':'FOL','site':[4,5,6,26,27,30,31,56,93,112,],'title':'Dihydrofolate reductase (DHFR)',},
        ## AdK
        '2rh5': {'holo': '2rgx','ligand':'AP5','coords_apo':[0,202],'coords_holo':[0,202],'site':[8,9,10,11,12,13,14,30,31,34,57,58,63,81,84,88,119,120,123,134,137,149,160,188,189,190,],'title':'Adenylate kinase (AdK)',},
        ## PKA
        '3iia': {'holo': '3pna', 'ligand':'CMP', 'coords_apo':[4,133],'coords_holo':[0,129],'site':[
            182-112, 198-112, 199-112, 200-112, 201-112, 202-112, 209-112, 211-112,
        ], 'title':'Protein Kinase A (PKA)', },
        ## induced fit
        ## PEPCK
        '2qew': {'holo': '3dt4', 'ligand':'OXL', 'coords_apo':[1,620],'coords_holo':[0,619],'site':[240,260,307,401],'title':'PEPCK',},
        ## beta-lactoglobulin
        '3npo': {'holo': '3nq3', 'ligand':'DKA','site':[53,104,106,],'title':'beta-lactoglobulin',},
    }
    for pdb_apo in d_apo2holo.keys():
        pdb_holo = d_apo2holo[pdb_apo]['holo']
##        continue ## tmp!!!
        print pdb_apo, pdb_holo
        ##
        ## parse coordinates
        ##
        d_mmCIF_apo, l_coords_alpha_apo = parse_coords(pdb_apo)
        d_mmCIF_holo, l_coords_alpha_holo = parse_coords(pdb_holo)
        if 'coords_apo' in d_apo2holo[pdb_apo].keys():
            # explicit coordinate ranges provided for this pair
            l_coords_alpha_apo = l_coords_alpha_apo[
                d_apo2holo[pdb_apo]['coords_apo'][0]:d_apo2holo[pdb_apo]['coords_apo'][1]
                ]
            l_coords_alpha_holo = l_coords_alpha_holo[
                d_apo2holo[pdb_apo]['coords_holo'][0]:d_apo2holo[pdb_apo]['coords_holo'][1]
                ]
        else:
            ## sequential alignment of coordinates
            # first residue with observed coordinates (pdb_mon_id != '?')
            index1_seq_apo = next((i for i,v in enumerate(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
            index1_seq_holo = next((i for i,v in enumerate(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id']) if v != '?'))
            ## last non-?
            index2_seq_apo = len(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_apo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
            index2_seq_holo = len(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])-next((i for i,v in enumerate(reversed(d_mmCIF_holo['_pdbx_poly_seq_scheme.pdb_mon_id'])) if v != '?'))
            ## first common non-?
            index1_coord_apo = max(0,index1_seq_holo-index1_seq_apo)
            index1_coord_holo = max(0,index1_seq_apo-index1_seq_holo)
            ## last common non-?
            index2_coord_apo = len(l_coords_alpha_apo)+min(0,index2_seq_holo-index2_seq_apo)
            index2_coord_holo = len(l_coords_alpha_holo)+min(0,index2_seq_apo-index2_seq_holo)
            l_coords_alpha_apo = l_coords_alpha_apo[index1_coord_apo:index2_coord_apo]
            l_coords_alpha_holo = l_coords_alpha_holo[index1_coord_holo:index2_coord_holo]
        if pdb_apo == '2qew' and pdb_holo == '3dt4':
            # manual removal of a 7-residue insertion for this pair
##            l_coords_alpha_holo = l_coords_alpha_holo[:459]+l_coords_alpha_holo[466:]
            l_coords_alpha_holo = l_coords_alpha_holo[:461]+l_coords_alpha_holo[461+7:]
        if len(l_coords_alpha_apo) != len(l_coords_alpha_holo):
            print pdb_apo, pdb_holo
            print len(l_coords_alpha_apo), len(l_coords_alpha_holo)
            # deliberate crash on length mismatch: 'stop' is undefined (NameError)
            stop
##        if pdb_holo == '1eow':
##            print l_coords_alpha_holo[d_apo2holo[pdb_apo]['site'][0]]
##            print pdb_holo
##            stop
        # superpose holo onto apo: translation/rotation/translation
        tv1, rm, tv2, l_coords_alpha_apo, l_coords_alpha_holo = get_transformation_matrix(
            l_coords_alpha_apo, l_coords_alpha_holo,
            )
        vector_apo2holo = get_apo_holo_vector(
            d_mmCIF_apo, l_coords_alpha_apo, d_mmCIF_holo, l_coords_alpha_holo,
            tv1, rm, tv2,
            )
        chain_apo = ''.join(d_mmCIF_apo['_entity_poly.pdbx_strand_id'])
        chain_holo = ''.join(d_mmCIF_holo['_entity_poly.pdbx_strand_id'])
        if pdb_holo == '1cwa':
            # hard-coded ligand position for 1cwa, transformed into apo frame
            ligand_pos_holo = numpy.array([3.307729,36.55456,17.45886])
            ligand_pos_apo = numpy.dot(ligand_pos_holo-tv1,rm)+tv2
        else:
            ligand_pos_apo, ligand_pos_holo, lines_ligand_apo = get_ligand_pos(
                d_mmCIF_holo, tv1, rm, tv2, d_apo2holo[pdb_apo]['ligand'], pdb_holo,
                )
        dist_max = 6
        dist_min = 3
##        print len(vector_apo2holo), len(l_coords_alpha_apo)
##        stop
        for pdb, chain, l_coords_alpha, ligand_pos in [
##            [pdb_holo,chain_holo,l_coords_alpha_holo,],
            [pdb_apo,chain_apo,l_coords_alpha_apo,ligand_pos_apo],
            ]:
##            l_coords_protein_alpha = []
##            for i in range(len(l_coords_alpha)):
##                l_coords_protein_alpha += [l_coords_alpha[i][0]]
##                l_coords_protein_alpha += [l_coords_alpha[i][1]]
##                l_coords_protein_alpha += [l_coords_alpha[i][2]]
            fn = '/home/tc/UCD/GV_ligand_binding_site_identification/%s_%s_probe.pdb' %(pdb,chain,)
            # skip structures whose probe file already exists
            if os.path.isfile(fn):
                continue
            d = goodvibes_ligand.main(
                pdb,chain,
                dist_max,dist_min,
                v_apoholo=vector_apo2holo,
                l_coords_protein_alpha = l_coords_alpha,
##                l_coords_probe = [ligand_pos],
                )
##            d = goodvibes_ligand.main(
##                pdb,chain,
##                dist_max,dist_min,
##                v_apoholo=vector_apo2holo,
##                l_coords_protein_alpha = l_coords_alpha,
##                l_coords_probe = [ligand_pos],
##                )
            l_factors = d['l_factors']
##            l_factors_perturbed = d['l_factors_probe']
            # NOTE(review): a second isfile(fn) check after the earlier
            # `continue` — only reachable if the file appeared during the
            # goodvibes run; appends the ligand lines and skips plotting.
            if os.path.isfile(fn):
                fd = open(fn,'r')
                lines = fd.readlines()
                fd.close()
                lines += lines_ligand_apo
                fd = open(fn+'2','w')
                fd.writelines(lines)
                fd.close()
                continue
            eigenvectors = d['eigenvectors']
            # mode with the largest |factor| contributes most to the motion
            l_factors_abs = [abs(factor) for factor in l_factors]
            mode_max_contribution = l_factors_abs.index(max(l_factors_abs))
            print mode_max_contribution
            print d['l_overlaps']
            v1 = vector_apo2holo
            eigenvector = v2 = eigenvectors[mode_max_contribution]
            # normalized |dot| overlap between displacement and mode
            overlap_max = abs(numpy.dot(v1,v2))/math.sqrt(numpy.dot(v1,v1)*numpy.dot(v2,v2))
            print 'mode_max_contribution', mode_max_contribution
            print 'overlap_max', overlap_max
            ## write amplitudes
            lines = []
            l1 = []
            l2 = []
            for i in range(0,len(eigenvector),3):
                # per-residue amplitude = norm of the (x, y, z) components
                amplitude = math.sqrt(eigenvector[i+0]**2+eigenvector[i+1]**2+eigenvector[i+2]**2)
                amplitude7 = math.sqrt(eigenvectors[6][i+0]**2+eigenvectors[6][i+1]**2+eigenvectors[6][i+2]**2)
                l1 += [amplitude]
                l2 += [amplitude7]
                # third column marks binding-site residues; -1 means non-site
                if i/3 in d_apo2holo[pdb_apo]['site']:
                    bool_site = amplitude
                else:
                    bool_site = -1
                lines += ['%s %s %s\n' %(amplitude,amplitude7,bool_site,)]
            fd = open('amplitudes_%s%s.txt' %(pdb_apo,pdb_holo,),'w')
            fd.writelines(lines)
            fd.close()
            # correlation between the dominant mode and mode 7 amplitudes
            r = statistics.correlation(l1,l2)
            # per-structure x-range offsets for the gnuplot output
            xmin = 0
            if pdb_holo == '1w8m':
                xmin = 2
            if pdb_holo == '3dt4':
                xmin = 5
            if pdb_holo == '1eow':
                xmin = 1
            lines = [
                'set terminal png\n',
                'set output "%s%s_amplitudes.png"\n' %(pdb_apo,pdb_holo,),
                'set size 1,1\n',
                'set xlabel "residue index"\n',
                'set ylabel "amplitude (a.u.)\n',
                'set title "%s (r = %.2f)"\n' %(d_apo2holo[pdb_apo]['title'],r,),
                'set key out\n',
                'f(x) = %s\n' %(sum(l1)/len(l1)),
                'plot [%s:][0:]"amplitudes_%s%s.txt" u 1 t "mode %i" w l, "amplitudes_%s%s.txt" u 2 t "mode 7" w l, "amplitudes_%s%s.txt" u 3 t "binding site" ps 1 pt 7, f(x) t "average amplitude"\n' %(
##                'plot [%s:][0:]"amplitudes_%s%s.txt" u 1 t "mode %i" w l, "amplitudes_%s%s.txt" u 2 t "mode 7" w l, "amplitudes_%s%s.txt" u 3 t "binding site" ps 1 pt 7\n' %(
                    xmin,pdb_apo,pdb_holo,mode_max_contribution+1,
                    pdb_apo,pdb_holo,
                    pdb_apo,pdb_holo,
                    ),
                ]
            fd = open('gnuplot.settings','w')
            fd.writelines(lines)
            fd.close()
            os.system('/usr/bin/gnuplot gnuplot.settings')
            # dump mode factors and their magnitudes
            s = ''
            for i in range(len(l_factors)):
                s += '%s %s %s\n' %(i+1, l_factors[i],abs(l_factors[i]),)
            fd = open('facs_eigvals_%s%s.txt' %(pdb_apo,pdb_holo,),'w')
            fd.write(s)
            fd.close()
##            s = ''
##            for i in range(len(l_factors_perturbed)):
##                s += '%s %s %s\n' %(i+1, l_factors_perturbed[i],abs(l_factors_perturbed[i]),)
##            fd = open('facs_eigvals_%s%s_perturbed.txt' %(pdb_apo,pdb_holo,),'w')
##            fd.write(s)
##            fd.close()
    return
def calculate_averages_and_plot(cwd,): import statistics print 'calculate averages and plot' for topology in ['NEUNEU','NEUCHA','CHANEU','CHACHA',]: print 'protonation state', topology fd = open('energies_%s.txt' %(topology),'r') lines = fd.readlines() fd.close() lines2 = [] l_asp52 = [] l_protein = [] l_chloride = [] l_water = [] l_sum = [] l_sum_excl_ions = [] for line in lines: i = int(line.split()[0]) if i % 1000 == 0: print 'average', topology, i ## if i < 100: ## continue l_asp52 += [float(line.split()[1])] l_protein += [float(line.split()[2])] l_chloride += [float(line.split()[3])] l_water += [float(line.split()[4])] l_sum += [ float(line.split()[1])+ float(line.split()[2])+ float(line.split()[3])+ float(line.split()[4]) ] l_sum_excl_ions += [ float(line.split()[1])+ float(line.split()[2])+ float(line.split()[4]) ] lines2 += [ '%i %f %f %f %f %f %f\n' %( i, ## 1 sum(l_asp52)/len(l_asp52), sum(l_protein)/len(l_protein), sum(l_chloride)/len(l_chloride), sum(l_water)/len(l_water), sum(l_sum)/len(l_sum), ## 13 sum(l_sum_excl_ions)/len(l_sum_excl_ions), ## 12 ) ] fd = open('energies_averages_%s.txt' %(topology),'w') fd.writelines(lines2) fd.close() fd = open('energies_averages_%s.txt' %(topology),'r') lines2 = fd.readlines() fd.close() average = float(lines2[-1].split()[5]) print '******** average', average ## calculate rmsd l_diff = [] for i in range(len(lines)): Sum = float(line.split()[1])+float(line.split()[2])+float(line.split()[3])+float(line.split()[4]) l_diff += [Sum-average] rmsd = statistics.do_rmsd(l_diff) print '******** rmsd', rmsd if len(l_sum) > 0: print 'INCLUDING IONS' print 'correl asp52', statistics.correlation(l_sum,l_asp52) print 'correl protein', statistics.correlation(l_sum,l_protein) print 'correl chloride', statistics.correlation(l_sum,l_chloride) print 'correl water', statistics.correlation(l_sum,l_water) print 'EXCLUDING IONS' print 'correl asp52', statistics.correlation(l_sum_excl_ions,l_asp52) print 'correl protein', 
statistics.correlation(l_sum_excl_ions,l_protein) print 'correl chloride', statistics.correlation(l_sum_excl_ions,l_chloride) print 'correl water', statistics.correlation(l_sum_excl_ions,l_water) ## ## combined plot 3 (4 conformational states x 4 protonation states and their averages) ## ## orange=black, blue=green, yellow=red, grey=purple ## *NEUNEUCHA*NEU = *NEUNEUCHA*CHA (ion,water,protein) ## *CHANEUCHA*NEU = *CHANEUCHA*CHA (ion,water,protein) ## *NEUCHACHA*CHA = *NEUCHACHA*NEU (ion,water) ## *CHACHACHA*NEU = *CHACHACHA*CHA (ion,water) ## overlaps - water, ions, (protein) ## CHACHANEUCHA not overlap when protein for s_col,title,suffix,y1,y2 in [ ['$2+$3+$4+$5','energies of 4 conformational states at 4 different protonation states (all terms)','1all',-700,250,], ['$2+$3+$5','energies of 4 conformational states at 4 different protonation states (excluding ions)','1exclions',-700,250,], ['$3+$4+$5','energies of 4 conformational states at 4 different protonation states (excluding Asp52)','1exclasp52',-700,250,], ['$2','energies of 4 conformational states at 4 different protonation states (Asp52)','2asp52',-700,250,], ['$2','energies of 4 conformational states at 4 different protonation states (Asp52)','2asp52_zoom1',-80,23,], ['$2','energies of 4 conformational states at 4 different protonation states (Asp52)','2asp52_zoom2',23,160,], ['$3','energies of 4 conformational states at 4 different protonation states (protein)','3protein',-700,250,], ['$4','energies of 4 conformational states at 4 different protonation states (ions)','4ions',-700,250,], ['$5','energies of 4 conformational states at 4 different protonation states (water)','5water',-700,250,], ['$5','energies of 4 conformational states at 4 different protonation states (water)','5water_zoom',-80,40,], ]: print 'combined plot 16 states', suffix lines = [ 'set terminal postscript eps enhanced color "Helvetica" 32\n', 'set size 3,3\n', 'set output "combined_16states.ps"\n', 'set xlabel "t / ps"\n', 'set ylabel 
"E / kT"\n', 'set title "%s"\n' %(title), ] ## line = 'plot [0:][%s:%s]' %(Min,Max,) line = 'plot [0:30000][%s:%s]' %(y1,y2,) ## data points for i_state in range(16): state = l_states[i_state] ## pt = [6,7,4,5,12,13][i_state % 4] if i_state < 8: pt = 7 ps = 1 else: pt = 5 ps = 1 ## data points line += '"../%s/energies_%s.txt" u 1:(%s) lc rgb "#%6s" ps %i pt %i t "%s", ' %(state[:6],state[-6:],s_col,d_colors[state]['pc'],ps,pt,state,) ## lines for i_state in range(16): if i_state < 8: if i_state in [0,1,4,5,]: lw = 16 else: lw = 12 else: lw = 4 state = l_states[i_state] ## lines (averages) line += '"../%s/energies_averages_%s.txt" u 1:(%s) w l lt 1 lc rgb "#%6s" lw %i t "%s", ' %(state[:6],state[-6:],s_col,d_colors[state]['lc'],lw,state,) line = line[:-2]+'\n' lines += [line] fd = open('gnu.set','w') fd.writelines(lines) fd.close() os.system('gnuplot gnu.set') os.system('convert combined_16states.ps combined_16states_%s.png' %(suffix)) ## ## combined plot 2 (2 states with individual terms and their averages) ## for combination in [['CHACHA','CHANEU',],['NEUCHA','NEUNEU',],]: print 'combined plot', combination lines = [ 'set terminal postscript eps enhanced color "Helvetica" 32\n', 'set size 3,3\n', 'set output "combined.ps"\n', 'set xlabel "t / ps"\n', 'set ylabel "E / kT"\n', 'set title "%s"\n' %('%s v %s' %(combination[0],combination[1],)), ] line = 'plot [0:][-500:100]' ## data points line += '"../%s/energies_%s.txt" u 1:%s lc 3 t "%s protein", ' %(combination[0],combination[0],'($3)',combination[0],) line += '"../%s/energies_%s.txt" u 1:%s lc 4 t "%s protein", ' %(combination[1],combination[1],'($3)',combination[1],) line += '"../%s/energies_%s.txt" u 1:%s lc 5 t "%s water", ' %(combination[0],combination[0],'($5)',combination[0],) line += '"../%s/energies_%s.txt" u 1:%s lc 6 t "%s water", ' %(combination[1],combination[1],'($5)',combination[1],) line += '"../%s/energies_%s.txt" u 1:%s lc 1 t "%s Asp52", ' %(combination[0],combination[0],'($2)',combination[0],) 
line += '"../%s/energies_%s.txt" u 1:%s lc 2 t "%s Asp52", ' %(combination[1],combination[1],'($2)',combination[1],) line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 3 lw 16 t "%s protein average", ' %(combination[0],combination[0],'($3)',combination[0],) line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 4 lw 16 t "%s protein average", ' %(combination[1],combination[1],'($3)',combination[1],) line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 5 lw 16 t "%s water average", ' %(combination[0],combination[0],'($5)',combination[0],) line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 6 lw 16 t "%s water average", ' %(combination[1],combination[1],'($5)',combination[1],) line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 1 lw 16 t "%s Asp52 average", ' %(combination[0],combination[0],'($2)',combination[0],) line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 2 lw 16 t "%s Asp52 average", ' %(combination[1],combination[1],'($2)',combination[1],) ## line += '"../NEUCHA/energies_NEUCHA.txt" u 1:%s lc 7 t "NEUCHA Asp52", ' %('($2)',) ## line += '"../NEUNEU/energies_NEUNEU.txt" u 1:%s lc 8 t "NEUNEU Asp52", ' %('($2)',) ## line += '"../NEUCHA/energies_NEUCHA.txt" u 1:%s lc 9 t "NEUCHA protein", ' %('($3)',) ## line += '"../NEUNEU/energies_NEUNEU.txt" u 1:%s lc 10 t "NEUNEU protein", ' %('($3)',) line = line[:-2]+'\n' lines += [line] fd = open('gnu.set','w') fd.writelines(lines) fd.close() os.system('gnuplot gnu.set') os.system('convert combined.ps combined_%s_v_%s.png' %(combination[0],combination[1],)) return
def least_squares_fit(x, y): """numerical 'perfect' determination of alpha, beta for linear regression""" beta = correlation(x, y) * standard_deviation(y) / standard_deviation(x) alpha = mean(y) - beta * mean(x) return alpha, beta
columns = fields[0] # 列名毎のリストにする data = map(list, zip(*fields[1:])) # 列名毎にベクトルを辞書にまとめる return dict(zip(columns[1:], data[1:])) if __name__ == "__main__": # Shift-JIS, tsvのみ対応 parser = argparse.ArgumentParser() parser.add_argument('file', metavar='FILE', nargs=2, help=u'tab split file. need all columns.') parser.add_argument('-e', '--eval', action="store_true", help=u'correlation value to evaluation text.') parser.add_argument('-d', '--delimiter', metavar='DELIMITER', default='\t', help=u'output delimiter.') opt = parser.parse_args() # ファイルから読み込み data_set1 = dataset(read_data(opt.file[0].decode(encoding_in()))) data_set2 = dataset(read_data(opt.file[1].decode(encoding_in()))) keys1 = data_set1.keys() keys2 = data_set2.keys() print opt.delimiter.join(['-'] + keys2) for y in keys1: if opt.eval: # 可視化 records = [y] + ['-' if x == y else eval_corr(data_set1[y], data_set2[x]) for x in keys2] else: # 相関係数 records = [y] + ['1' if x == y else str(statistics.correlation(data_set1[y], data_set2[x])) for x in keys2] print opt.delimiter.join(records)
pylab.figure(1) pylab.plot(X2, 'r') pylab.title("Sample path of X_2") # Cumulative averages X1_cummean = cumsum(X1) / (1 + arange(len(X1))) X2_cummean = cumsum(X2) / (1 + arange(len(X1))) pylab.figure(2) pylab.plot(X1_cummean, "b") pylab.title("Empirical mean of X_1") pylab.figure(3) pylab.plot(X2_cummean, "r") pylab.title("Empirical mean of X_2") pylab.show() # Autocorrelation X1_sd = sqrt(var(X1)) X2_sd = sqrt(var(X2)) X1_autocorr = statistics.correlation(X1[1: ], X1[:-1]) X2_autocorr = statistics.correlation(X2[1: ], X2[:-1]) print "The autocorrelation of X1 is", X1_autocorr print "The autocorrelation of X2 is", X2_autocorr # Effective Sample size X1_ess = n * (1 - X1_autocorr) / (1 + X1_autocorr) X2_ess = n * (1 - X2_autocorr) / (1 + X2_autocorr) print "The effective sample size of X_1 is", X1_ess print "The effective sample size of X_2 is", X2_ess
for _ in range(num_components): component = first_principal_component(X) components.append(component) X = remove_projection(X, component) return components def transform_vector(v, components): return [dot(v, w) for w in components] def transform(X, components): return [transform_vector(x_i, components) for x_i in X] if __name__ == "__main__": print("correlation(xs, ys1)", correlation(xs, ys1)) print("correlation(xs, ys2)", correlation(xs, ys2)) # safe parsing data = [] with open("comma_delimited_stock_prices.csv", "r", encoding='utf8', newline='') as f: reader = csv.reader(f) for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]): data.append(line) for row in data: if any(x is None for x in row): print(row)
ys2 = [-x + random_normal() / 2 for x in xs] plt.scatter(xs, ys1, marker='.', color='black', label='ys1') plt.scatter(xs, ys2, marker='.', color='gray', label='ys2') plt.xlabel('xs') plt.ylabel('ys') plt.legend(loc=9) plt.title("Very Different Joint Distributions") # plt.show() plt.savefig('im/working_scatter.png') plt.gca().clear() from statistics import correlation assert 0.89 < correlation(xs, ys1) < 0.91 assert -0.91 < correlation(xs, ys2) < -0.89 from src.scratch_dir import Matrix, Vector, make_matrix def correlation_matrix(data: List[Vector]) -> Matrix: """ Returns the len(data) x len(data) matrix whose (i, j)-th entry is the correlation between data[i] and data[j] """ def correlation_ij(i: int, j: int) -> float: return correlation(data[i], data[j]) return make_matrix(len(data), len(data), correlation_ij)
def main( self,l_wts,d_pred,l_xtics, ): import os, sys sys.path.append('/home/people/tc/svn/tc_sandbox/misc/') import gnuplot, statistics ## parse experimental data d_exp = self.dic2csv(l_xtics) ## get cwd dir_main = os.getcwd() l_r = [] for pdb in l_wts: print pdb, l_wts.index(pdb) if not os.path.isdir('%s/%s' %(dir_main,pdb)): os.mkdir('%s/%s' %(dir_main,pdb)) os.chdir('%s/%s' %(dir_main,pdb)) self.pre_whatif(pdb) if pdb in ['2vb1','1vdp',]: os.system('cp %s_monomer.pdb %s_protonated.pdb' %(pdb,pdb)) ## else: ## self.whatif(pdb) ## self.calculate_chemical_shifts(pdb) ## parse computational predictions d_pred = self.parse_chemical_shifts(pdb,d_pred) ## calculate correlation coefficients l_exp = [] l_pred = [] for titgrp in d_exp.keys(): res_number = int(titgrp[1:]) res_symbol = titgrp[0] res_name = self.d_ressymbol2resname[res_symbol] for nucleus in d_exp[titgrp].keys(): cs_exp = d_exp[titgrp][nucleus] l_exp += [cs_exp] index = nucleus.index('N-HN') cs_pred = d_pred['%s%i' %(res_name,res_number)][nucleus[:index]][-1] l_pred += [cs_pred] r = statistics.correlation(l_exp,l_pred) l_r += [r] ## print titgrp,r ## print sum(l_r)/len(l_r), min(l_r), max(l_r) ## change from local dir to main dir os.chdir(dir_main) ## plots for titgrp1 in d_exp.keys()+['E35']: res_number = int(titgrp1[1:]) res_symbol = titgrp1[0] res_name = self.d_ressymbol2resname[res_symbol] titgrp3 = '%s%i' %(res_name,res_number) prefix = 'delta_cs_%s' %(titgrp3) ylabel = '{/Symbol D}{/Symbol w}_H' title = titgrp3 gnuplot.histogram( d_pred[titgrp3],prefix,l_xtics, ylabel=ylabel,title=title, ## l_plotdatafiles=['E34.txt'], ) return
def main():
    """Demo driver: histograms of uniform vs normal samples, correlation
    printouts, a scatterplot matrix of four correlated series, CSV parsing
    with row validation, a tqdm progress-bar example, and PCA assertions.

    NOTE(review): depends on module-level names (plt, xs, ys1, ys2,
    plot_histogram, random_normal, StockPrice, try_parse_row, tqdm,
    de_mean, pca_data, first_principal_component) defined elsewhere in
    the file — confirm before running in isolation.
    """
    # I don't know why this is necessary
    plt.gca().clear()
    plt.close()

    import random
    from probability import inverse_normal_cdf
    random.seed(0)

    # uniform between -100 and 100
    uniform = [200 * random.random() - 100 for _ in range(10000)]

    # normal distribution with mean 0, standard deviation 57
    normal = [57 * inverse_normal_cdf(random.random()) for _ in range(10000)]

    plot_histogram(uniform, 10, "Uniform Histogram")
    plt.savefig('im/working_histogram_uniform.png')
    plt.gca().clear()
    plt.close()

    plot_histogram(normal, 10, "Normal Histogram")
    plt.savefig('im/working_histogram_normal.png')
    plt.gca().clear()

    from statistics import correlation
    print(correlation(xs, ys1))      # about 0.9
    print(correlation(xs, ys2))      # about -0.9

    from typing import List

    # Just some random data to show off correlation scatterplots
    num_points = 100

    def random_row() -> List[float]:
        # four columns: v0 random; v1 anti-correlated with v0;
        # v2 correlated with both; v3 a step function of v2
        row = [0.0, 0, 0, 0]
        row[0] = random_normal()
        row[1] = -5 * row[0] + random_normal()
        row[2] = row[0] + row[1] + 5 * random_normal()
        row[3] = 6 if row[2] > -2 else 0
        return row

    random.seed(0)
    # each row has 4 points, but really we want the columns
    corr_rows = [random_row() for _ in range(num_points)]
    corr_data = [list(col) for col in zip(*corr_rows)]

    # corr_data is a list of four 100-d vectors
    num_vectors = len(corr_data)
    fig, ax = plt.subplots(num_vectors, num_vectors)

    for i in range(num_vectors):
        for j in range(num_vectors):
            # Scatter column_j on the x-axis vs column_i on the y-axis,
            if i != j:
                ax[i][j].scatter(corr_data[j], corr_data[i])
            # unless i == j, in which case show the series name.
            else:
                ax[i][j].annotate("series " + str(i), (0.5, 0.5),
                                  xycoords='axes fraction',
                                  ha="center", va="center")
            # Then hide axis labels except left and bottom charts
            if i < num_vectors - 1:
                ax[i][j].xaxis.set_visible(False)
            if j > 0:
                ax[i][j].yaxis.set_visible(False)

    # Fix the bottom right and top left axis labels, which are wrong because
    # their charts only have text in them
    ax[-1][-1].set_xlim(ax[0][-1].get_xlim())
    ax[0][0].set_ylim(ax[0][1].get_ylim())

    # plt.show()
    plt.savefig('im/working_scatterplot_matrix.png')
    plt.gca().clear()
    plt.close()
    plt.clf()

    import csv

    data: List[StockPrice] = []

    # parse each CSV row, keeping only rows that validate
    with open("comma_delimited_stock_prices.csv") as f:
        reader = csv.reader(f)
        for row in reader:
            maybe_stock = try_parse_row(row)
            if maybe_stock is None:
                print(f"skipping invalid row: {row}")
            else:
                data.append(maybe_stock)

    from typing import List

    def primes_up_to(n: int) -> List[int]:
        # trial division with a tqdm progress bar showing the running count
        primes = [2]
        with tqdm.trange(3, n) as t:
            for i in t:
                # i is prime if no smaller prime divides it.
                i_is_prime = not any(i % p == 0 for p in primes)
                if i_is_prime:
                    primes.append(i)
                t.set_description(f"{len(primes)} primes")
        return primes

    my_primes = primes_up_to(100_000)

    # PCA sanity checks on the first principal component direction
    de_meaned = de_mean(pca_data)
    fpc = first_principal_component(de_meaned)
    assert 0.923 < fpc[0] < 0.925
    assert 0.382 < fpc[1] < 0.384
def plot(d_mmCIF_main,d_rmsds,):
    """For every pair of PDB entries, relate their structural RMSD to
    differences in experimental conditions (temperature, pH, resolution,
    space group, starting model), then write gnuplot histograms, 2D
    scatter plots and contour plots.

    NOTE(review): relies on module-level names core, gnuplot, statistics,
    os, append_to_dictionary, method, spacegroup, n_mutations_max —
    confirm at module scope. Python 2 (dict.keys() returns a sortable
    list; print statements).
    """
    l_pdbs = d_rmsds.keys()
    l_pdbs.sort()
    l_temperature = []
    l_ph = []
    l_resolution = []
    d_spacegroup = {}
    d_starting_model = {}
    # each l_correl_* holds two parallel lists: [property diffs, rmsds]
    l_correl_T = [[],[],]
    l_correl_pH = [[],[],]
    l_correl_resol_max = [[],[],]
    d_histo_pH = {}
    d_histo_T = {}
    d_histo_resol = {}
    # all unordered pairs (i1 < i2)
    for i1 in range(len(l_pdbs)-1):
        pdb1 = l_pdbs[i1]
        spacegroup1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_symmetry.space_group_name_H-M',)
        T1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_diffrn.ambient_temp',)
        pH1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_exptl_crystal_grow.pH',)
        starting_model1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_refine.pdbx_starting_model',)
        resolution1 = core.parse_mmCIF_item(d_mmCIF_main[pdb1[:4]],'_refine.ls_d_res_high',)
        for i2 in range(i1+1,len(l_pdbs)):
            pdb2 = l_pdbs[i2]
            spacegroup2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_symmetry.space_group_name_H-M',)
            T2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_diffrn.ambient_temp',)
            pH2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_exptl_crystal_grow.pH',)
            starting_model2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_refine.pdbx_starting_model',)
            resolution2 = core.parse_mmCIF_item(d_mmCIF_main[pdb2[:4]],'_refine.ls_d_res_high',)
            rmsd = d_rmsds[pdb1][pdb2]
            # flag unusually dissimilar pairs
            if rmsd > 1:
                print pdb1, pdb2, rmsd
            if T1 and T2:
                T_diff = abs(float(T2)-float(T1))
                l_temperature += ['%s %s\n' %(T_diff,rmsd),]
                l_correl_T[0] += [T_diff]
                l_correl_T[1] += [rmsd]
                print T_diff, 10*round(T_diff/10.,0)
                # histogram bucket: temperature diff rounded to nearest 10
                if not 10*round(T_diff/10.,0) in d_histo_T.keys():
                    d_histo_T[10*round(T_diff/10.,0)] = 0
                d_histo_T[10*round(T_diff/10.,0)] += 1
            if pH1 and pH2:
                pH_diff = abs(float(pH2)-float(pH1))
                l_ph += ['%s %s\n' %(pH_diff,rmsd),]
                l_correl_pH[0] += [pH_diff]
                l_correl_pH[1] += [rmsd]
                if not pH_diff in d_histo_pH.keys():
                    d_histo_pH[pH_diff] = 0
                d_histo_pH[pH_diff] += 1
            # worse (larger) of the two resolutions; may be the string 'N/A'
            resolution_max = max(resolution1,resolution2,)
            l_resolution += ['%s %s\n' %(resolution_max,rmsd),]
            if resolution_max != 'N/A':
                l_correl_resol_max[0] += [float(resolution_max)]
                l_correl_resol_max[1] += [rmsd]
                if not round(float(resolution_max),0) in d_histo_resol.keys():
                    d_histo_resol[round(float(resolution_max),0)] = 0
                d_histo_resol[round(float(resolution_max),0)] += 1
            d_spacegroup = append_to_dictionary(d_spacegroup,spacegroup1,spacegroup2,rmsd,)
            d_starting_model = append_to_dictionary(d_starting_model,starting_model1,starting_model2,rmsd,)
    # correlation of RMSD against each property difference
    r1 = statistics.correlation(l_correl_T[0],l_correl_T[1],)
    r2 = statistics.correlation(l_correl_pH[0],l_correl_pH[1],)
    r3 = statistics.correlation(l_correl_resol_max[0],l_correl_resol_max[1],)

    ##
    ## plot histograms
    ##
    for prefix,d in [
        ['deltapH',d_histo_pH,],
        ['deltaT',d_histo_T,],
        ['maxresolution',d_histo_resol,],
        ]:
        l = []
        l_diffs = d.keys()
        l_diffs.sort()
        for diff in l_diffs:
            l += ['%s %s\n' %(diff,d[diff],)]
        fd = open('histo_%s.txt' %(prefix),'w')
        fd.writelines(l)
        fd.close()
        l = [
            'set terminal postscript eps enhanced color "Helvetica"\n',
            'set output "gnuplot.ps"\n',
            'set size 3,3\n',
            'set style data histogram\n',
            'set xtics rotate\n',
            'set xlabel "%s\n' %(prefix),
            'set ylabel "count\n',
            'plot "histo_%s.txt" u 2:xtic(1) t ""\n' %(prefix)
            ]
        fd = open('tmp.txt','w')
        fd.writelines(l)
        fd.close()
        os.system('gnuplot tmp.txt')
        os.system('convert gnuplot.ps histo_%s.png' %(prefix))

    ##
    ## plot rmsd as a function of each property (2d)
    ##
    for prefix,data,xlabel in [
        ['pH',l_ph,'pH diff',],
        ['Temperature',l_temperature,'T diff',],
        ['resolution',l_resolution,'maximum resolution',],
        ]:
        # `method` is presumably a module-level setting — confirm
        prefix += method
        fd = open('%s.gnuplotdata' %(prefix),'w')
        fd.writelines(data)
        fd.close()
        gnuplot.scatter_plot_2d(
            prefix,xlabel=xlabel,ylabel='RMSD %s' %(method,),
##            averages=True,
            regression=True,
            )

    ##
    ## plot rmsd as a function of each property (contour)
    ##
    for d,prefix in [
        [d_spacegroup,'spacegroup',],
        [d_starting_model,'startingmodel',],
        ]:
        d_tics = {}
        l_tics = d.keys()
        l_tics.sort()
        for i in range(len(l_tics)):
            d_tics[l_tics[i]] = i+.5
        # z1/z2 track the observed min/max cell average for the palette range
        z1 = 9
        z2 = 0
        l_data = []
        for x in range(len(l_tics)):
            k1 = l_tics[x]
            for y in range(len(l_tics)):
                k2 = l_tics[y]
                if not k2 in d[k1].keys():
                    # sentinel value for missing pairs
                    average = 9
                else:
                    l_rmsds = d[k1][k2]
                    average = sum(l_rmsds)/len(l_rmsds)
                    # NOTE(review): z1/z2 updates placed inside the else so
                    # the sentinel 9 does not inflate the range — confirm
                    # against the original file's indentation.
                    if average < z1:
                        z1 = average
                    if average > z2:
                        z2 = average
                l_data += ['%s %s %s\n' %(x,y,average,)]
                l_data += ['%s %s %s\n' %(x,y+1,1,)]
                l_data += ['\n']
            for y in range(len(l_tics)):
                l_data += ['%s %s %s\n' %(x+1,y,1,)]
                l_data += ['%s %s %s\n' %(x+1,y+1,1,)]
                l_data += ['\n']
        gnuplot.contour_plot(
            prefix,l_data,
            title='%s %s' %(prefix,method,),zlabel='RMSD %s' %(method),
            d_xtics = d_tics,
            d_ytics = d_tics,
            palette = '0 1 0 0, 0.9999 0 0 1, 0.9999 1 1 1, 1 1 1 1',
            z1 = z1, z2 = z2+0.1,
            bool_remove = False,
            )
        os.system('convert %s.ps %s_spacegroup%s_mutations%s_atoms%s.png' %(prefix,prefix,spacegroup.replace(' ',''),n_mutations_max,method,))
##        os.remove('%s.ps' %(prefix,))

    print d_spacegroup
    print d_starting_model
    print r1
    print r2
    print r3
    return
def correlation_ij(i: int, j: int) -> float: return correlation(data[i], data[j])
def main():
    """Monthly crime totals for Louisiana and Chicago (2012-2015),
    4-year averages, correlations against average temperatures, and a
    series of matplotlib line plots.

    NOTE(review): Python 2 semantics are load-bearing here — `map`
    returns a list and `i/4` is integer division for int inputs.
    Depends on module-level names total, values, year1..year4,
    chicago_year1..4, add, statistics, plt, np.
    """
    #This will get the total count of crime for each month
    #count of Crimes for Louisiana
    l1 = total(year1)
    l2 = total(year2)
    l3 = total(year3)
    l4 = total(year4)
    #count of Crimes for Chicago
    c1 = total(chicago_year1)
    c2 = total(chicago_year2)
    c3 = total(chicago_year3)
    c4 = total(chicago_year4)
    chicago_1 = values(c1)
    chicago_2 = values(c2)
    chicago_3 = values(c3)
    chicago_4 = values(c4)
    print 'Chicago crime Per month'
    print chicago_1
    print chicago_2
    print chicago_3
    print chicago_4
    print
    # element-wise sums over the four years (Python 2 map -> list)
    newarr1 = map(add, chicago_1, chicago_2)
    newarr2 = map(add, chicago_3, chicago_4)
    newarr3 = map(add, newarr1, newarr2)
    print 'Sum crimes per month for all four years in Chicago'
    print newarr3
    print
    new_chicago_avg = []
    for i in newarr3:
        new_chicago_avg.append(i/4)
    print 'Average crime per month for all four years in Chicago'
    print new_chicago_avg
    print
    #Values of crimes for each month of Louisiana
    louisiana_1 = values(l1)
    louisiana_2 = values(l2)
    louisiana_3 = values(l3)
    louisiana_4 = values(l4)
    print 'Louisiana crime Per month'
    print louisiana_1
    print louisiana_2
    print louisiana_3
    print louisiana_4
    print
    newarr4 = map(add, louisiana_1, louisiana_2)
    newarr5 = map(add, louisiana_3, louisiana_4)
    newarr6 = map(add, newarr4, newarr5)
    print 'Sum crimes per month for all four years in Louisiana'
    print newarr6
    print
    new_louisiana_avg = []
    for i in newarr6:
        new_louisiana_avg.append(i/4)
    print 'Average crime per month for all four years in Louisiana'
    print new_louisiana_avg
    print
    # hard-coded monthly average temperatures, Louisiana 2012-2015
    weather_l_1 = [46, 46, 42, 59, 66, 69, 73, 73, 73, 56, 47, 48]
    weather_l_2 = [32, 44, 46, 57, 62, 72, 71, 72, 72, 61, 46, 43]
    weather_l_3 = [37, 38, 55, 56, 61, 71, 70, 72, 72, 58, 42, 48]
    weather_l_4 = [39, 41, 55, 63, 67, 71, 73, 70, 70, 58, 55, 53]
    suml = map(add, weather_l_1, weather_l_2)
    suml1 = map(add, weather_l_3, weather_l_4)
    total_l = map(add, suml, suml1)
    louisiana_weather_total = []
    for i in total_l:
        louisiana_weather_total.append(i/4)
    print 'Average weather from 2012 - 2015 for Louisiana'
    print louisiana_weather_total
    print
    # hard-coded monthly average temperatures, Chicago 2012-2015
    weather_c_1 = [18, 19, 22, 35, 47, 53, 64, 58, 58, 40, 29, 29]
    weather_c_2 = [10, 9, 21, 33, 47, 55, 60, 59, 59, 43, 28, 18]
    weather_c_3 = [17, 8, 24, 35, 48, 59, 58, 64, 64, 43, 25, 26]
    weather_c_4 = [17, 20, 33, 34, 50, 59, 60, 59, 59, 42, 35, 33]
    sum2 = map(add, weather_c_1, weather_c_2)
    sum3 = map(add, weather_c_3, weather_c_4)
    total_c = map(add, sum2, sum3)
    chicago_weather_total = []
    for i in total_c:
        chicago_weather_total.append(i/4)
    print 'Average weather from 2012 - 2015 for Chicago'
    print chicago_weather_total
    print
    # x axis: month indices 0..11
    m = range(0,12)
    print 'Correlation between between all the years for Chicago and Louisiana'
    lou_chi_2012_2015 = statistics.correlation(new_chicago_avg, new_louisiana_avg)
    print lou_chi_2012_2015
    print
    print 'Correlation between the total crime of Louisiana and the Weather for the same period of time'
    louisiana_weather_correlation = statistics.correlation(new_louisiana_avg, louisiana_weather_total)
    print louisiana_weather_correlation
    print
    print 'Correlation between the total crime of Chicago and the Weather for the same period of time'
    chicago_weather_correlation = statistics.correlation(new_chicago_avg, chicago_weather_total)
    print chicago_weather_correlation
    # NOTE(review): the correlation values embedded in the titles below are
    # hard-coded from a previous run, not the freshly computed values.
    plt.plot(m, new_louisiana_avg, marker = 'o',color='purple', label="Louisiana 2012-2015")
    plt.plot(m, new_chicago_avg, marker= 'o', color='green', label="Chicago 2012-2015")
    plt.title("Osnaldy Vasquez\nCrime Correlation between Chicago and Louisiana for 2012 - 2015\nCorrelation = 0.706490238246", fontsize= 'medium')
    plt.xticks(np.arange(12), ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',' Dec'))
    plt.xlabel("Months")
    plt.ylabel('Crimes')
    plt.legend(loc = 4,fontsize = 'x-small')
    plt.show()
    #
    plt.plot(m, louisiana_1, marker = 'o',color='red', label="Louisiana 2012")
    plt.plot(m, louisiana_2, marker = 'o',color='purple', label="Louisiana 2013")
    plt.plot(m, louisiana_3, marker = 'o',color='pink', label="Louisiana 2014")
    plt.plot(m, louisiana_4, marker= 'o', color='green', label="Louisiana 2015")
    plt.title("Osnaldy Vasquez\nCrime for Louisiana 2012 - 2015")
    plt.xticks(np.arange(12), ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',' Dec'))
    plt.xlabel("Months")
    plt.ylabel('Crimes')
    plt.legend(loc = 4,fontsize = 'x-small')
    plt.show()
    #
    plt.plot(m, chicago_1, marker = 'o',color='red', label="Chicago 2012")
    plt.plot(m, chicago_2, marker = 'o',color='purple', label="Chicago 2013")
    plt.plot(m, chicago_3, marker = 'o',color='pink', label="Chicago 2014")
    plt.plot(m, chicago_4, marker= 'o', color='green', label="Chicago 2015")
    plt.title("Osnaldy Vasquez\nCrime for Chicago 2012 - 2015")
    plt.xticks(np.arange(12), ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',' Dec'))
    plt.xlabel("Months")
    plt.ylabel('Crimes')
    plt.legend(loc = 4,fontsize = 'x-small')
    plt.show()
    plt.plot(m, new_louisiana_avg, marker = 'o',color='blue', label="Crime 2012-2015")
    plt.plot(m, louisiana_weather_total, marker= 'o', color='red', label=" Weather 2012-2015")
    plt.title("Osnaldy Vasquez\nCorrelation between the weather and the crime for Louisiana\nCorrelation = 0.696352921783", fontsize= 'medium')
    plt.xticks(np.arange(12), ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',' Dec'))
    plt.xlabel("Months")
    plt.ylabel('Crimes')
    plt.legend(loc = 2,fontsize = 'x-small')
    plt.show()
    plt.plot(m, new_chicago_avg, marker = 'o',color='green', label="Crime 2012-2015")
    plt.plot(m, chicago_weather_total, marker= 'o', color='pink', label=" Weather 2012-2015")
    plt.title("Osnaldy Vasquez\nCorrelation between the weather and the crime for Chicago\nCorrelation = 0.647656528", fontsize= 'medium')
    plt.xticks(np.arange(12), ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',' Dec'))
    plt.xlabel("Months")
    plt.ylabel('Crimes')
    plt.legend(loc = 2,fontsize = 'x-small')
    plt.show()
for _ in range(num_components): component = first_principal_component(X) components.append(component) X = remove_projection(X, component) return components def transform_vector(v, components): return [dot(v, w) for w in components] def transform(X, components): return [transform_vector(x_i, components) for x_i in X] if __name__ == "__main__": print "correlation(xs, ys1)", correlation(xs, ys1) print "correlation(xs, ys2)", correlation(xs, ys2) # safe parsing data = [] with open("comma_delimited_stock_prices.csv", "rb") as f: reader = csv.reader(f) for line in parse_rows_with(reader, [dateutil.parser.parse, None, float]): data.append(line) for row in data: if any(x is None for x in row): print row
def eval_corr(v1, v2): return statistics.eval_correlation(statistics.correlation(v1, v2)).encode(encoding_out())
def columns_correlation(matrix, i, j): return correlation(get_column(matrix, i), get_column(matrix, j))
def matrix_entry(i, j): return ind.correlation(mat.get_collumn(data, i), mat.get_collumn(data, j))
def least_squares_fit(x: Vector, y: Vector) -> Tuple[float, float]: beta = correlation(x, y) * standart_deviation(y) / standart_deviation(x) alpha = mean(y) - beta * mean(x) return alpha, beta
# two dimensions xs = [random_normal() for _ in range(1000)] ys1 = [x + random_normal() / 2 for x in xs] ys2 = [-x + random_normal() / 2 for x in xs] plot_histogram(ys1, 0.5, "ys1") plot_histogram(ys2, 0.5, "ys2") plt.scatter(xs, ys1, marker='.', color='black', label='ys1') plt.scatter(xs, ys2, marker='.', color='red', label='ys2') plt.legend(loc=9) plt.title("Very different joint distributions") plt.show() print correlation(xs, ys1), correlation(xs, ys2) # scatterplot matrix # prepare data def make_row(): v0 = random_normal() v1 = -5 * v0 + random_normal() # negatively correlated to v0 v2 = v0 + v1 + 5 * random_normal() # positively correlated to both v0 and v1 v3 = 6 if v2 > -2 else 0 # depends exclusively on v2 return [v0, v1, v2, v3] data = [make_row() for _ in range(100)] # plot it _, num_columns = shape(data) fig, ax = plt.subplots(num_columns, num_columns)
def calculate_averages_and_plot(cwd, ):
    """For each protonation state, read energies_<state>.txt, write running
    averages to energies_averages_<state>.txt, print the RMSD and the
    correlation of each energy term with the total, then generate gnuplot
    scripts (gnu.set) and shell out to gnuplot/convert to produce combined
    plot images.

    NOTE(review): `cwd` is unused in this body — confirm whether callers
    rely on it. Indentation below is reconstructed from a whitespace-mangled
    source; the plotting sections are placed after the per-topology loop.
    """
    import statistics
    print 'calculate averages and plot'
    for topology in [
            'NEUNEU',
            'NEUCHA',
            'CHANEU',
            'CHACHA',
    ]:
        print 'protonation state', topology
        fd = open('energies_%s.txt' % (topology), 'r')
        lines = fd.readlines()
        fd.close()
        lines2 = []
        l_asp52 = []
        l_protein = []
        l_chloride = []
        l_water = []
        l_sum = []
        l_sum_excl_ions = []
        # columns: 0=time index, 1=Asp52, 2=protein, 3=chloride, 4=water
        # NOTE(review): line.split() is recomputed many times per line —
        # hoisting it once per iteration would be cheaper
        for line in lines:
            i = int(line.split()[0])
            if i % 1000 == 0:
                print 'average', topology, i
##            if i < 100:
##                continue
            l_asp52 += [float(line.split()[1])]
            l_protein += [float(line.split()[2])]
            l_chloride += [float(line.split()[3])]
            l_water += [float(line.split()[4])]
            l_sum += [
                float(line.split()[1]) + float(line.split()[2]) +
                float(line.split()[3]) + float(line.split()[4])
            ]
            l_sum_excl_ions += [
                float(line.split()[1]) + float(line.split()[2]) +
                float(line.split()[4])
            ]
            # running (cumulative) averages up to and including this frame
            lines2 += [
                '%i %f %f %f %f %f %f\n' % (
                    i,  ## 1
                    sum(l_asp52) / len(l_asp52),
                    sum(l_protein) / len(l_protein),
                    sum(l_chloride) / len(l_chloride),
                    sum(l_water) / len(l_water),
                    sum(l_sum) / len(l_sum),  ## 13
                    sum(l_sum_excl_ions) / len(l_sum_excl_ions),  ## 12
                )
            ]
        fd = open('energies_averages_%s.txt' % (topology), 'w')
        fd.writelines(lines2)
        fd.close()
        fd = open('energies_averages_%s.txt' % (topology), 'r')
        lines2 = fd.readlines()
        fd.close()
        # final running average of the total (column 5 of the averages file)
        average = float(lines2[-1].split()[5])
        print '******** average', average
        ## calculate rmsd
        # NOTE(review): likely bug — the loop below reads `line`, which is
        # left over from the previous loop (always the LAST input line),
        # instead of lines[i]; every l_diff entry is therefore identical.
        l_diff = []
        for i in range(len(lines)):
            Sum = float(line.split()[1]) + float(line.split()[2]) + float(
                line.split()[3]) + float(line.split()[4])
            l_diff += [Sum - average]
        rmsd = statistics.do_rmsd(l_diff)
        print '******** rmsd', rmsd
        if len(l_sum) > 0:
            print 'INCLUDING IONS'
            print 'correl asp52', statistics.correlation(l_sum, l_asp52)
            print 'correl protein', statistics.correlation(l_sum, l_protein)
            print 'correl chloride', statistics.correlation(l_sum, l_chloride)
            print 'correl water', statistics.correlation(l_sum, l_water)
            print 'EXCLUDING IONS'
            print 'correl asp52', statistics.correlation(
                l_sum_excl_ions, l_asp52)
            print 'correl protein', statistics.correlation(
                l_sum_excl_ions, l_protein)
            print 'correl chloride', statistics.correlation(
                l_sum_excl_ions, l_chloride)
            print 'correl water', statistics.correlation(
                l_sum_excl_ions, l_water)
    ##
    ## combined plot 3 (4 conformational states x 4 protonation states and their averages)
    ##
    ## orange=black, blue=green, yellow=red, grey=purple
    ## *NEUNEUCHA*NEU = *NEUNEUCHA*CHA (ion,water,protein)
    ## *CHANEUCHA*NEU = *CHANEUCHA*CHA (ion,water,protein)
    ## *NEUCHACHA*CHA = *NEUCHACHA*NEU (ion,water)
    ## *CHACHACHA*NEU = *CHACHACHA*CHA (ion,water)
    ## overlaps - water, ions, (protein)
    ## CHACHANEUCHA not overlap when protein
    # Each tuple: gnuplot column expression, plot title, output suffix,
    # y-axis min, y-axis max.
    for s_col, title, suffix, y1, y2 in [
        [
            '$2+$3+$4+$5',
            'energies of 4 conformational states at 4 different protonation states (all terms)',
            '1all', -700, 250,
        ],
        [
            '$2+$3+$5',
            'energies of 4 conformational states at 4 different protonation states (excluding ions)',
            '1exclions', -700, 250,
        ],
        [
            '$3+$4+$5',
            'energies of 4 conformational states at 4 different protonation states (excluding Asp52)',
            '1exclasp52', -700, 250,
        ],
        [
            '$2',
            'energies of 4 conformational states at 4 different protonation states (Asp52)',
            '2asp52', -700, 250,
        ],
        [
            '$2',
            'energies of 4 conformational states at 4 different protonation states (Asp52)',
            '2asp52_zoom1', -80, 23,
        ],
        [
            '$2',
            'energies of 4 conformational states at 4 different protonation states (Asp52)',
            '2asp52_zoom2', 23, 160,
        ],
        [
            '$3',
            'energies of 4 conformational states at 4 different protonation states (protein)',
            '3protein', -700, 250,
        ],
        [
            '$4',
            'energies of 4 conformational states at 4 different protonation states (ions)',
            '4ions', -700, 250,
        ],
        [
            '$5',
            'energies of 4 conformational states at 4 different protonation states (water)',
            '5water', -700, 250,
        ],
        [
            '$5',
            'energies of 4 conformational states at 4 different protonation states (water)',
            '5water_zoom', -80, 40,
        ],
    ]:
        print 'combined plot 16 states', suffix
        # gnuplot preamble; l_states and d_colors are module-level —
        # presumably 16 state names and their point/line colors; confirm.
        lines = [
            'set terminal postscript eps enhanced color "Helvetica" 32\n',
            'set size 3,3\n',
            'set output "combined_16states.ps"\n',
            'set xlabel "t / ps"\n',
            'set ylabel "E / kT"\n',
            'set title "%s"\n' % (title),
        ]
##        line = 'plot [0:][%s:%s]' %(Min,Max,)
        line = 'plot [0:30000][%s:%s]' % (
            y1,
            y2,
        )
        ## data points
        for i_state in range(16):
            state = l_states[i_state]
##            pt = [6,7,4,5,12,13][i_state % 4]
            # first 8 states: filled circles; last 8: filled squares
            if i_state < 8:
                pt = 7
                ps = 1
            else:
                pt = 5
                ps = 1
            ## data points
            line += '"../%s/energies_%s.txt" u 1:(%s) lc rgb "#%6s" ps %i pt %i t "%s", ' % (
                state[:6],
                state[-6:],
                s_col,
                d_colors[state]['pc'],
                ps,
                pt,
                state,
            )
        ## lines
        for i_state in range(16):
            # heavier lines for the first 8 states, heaviest for 0/1/4/5
            if i_state < 8:
                if i_state in [
                        0, 1, 4, 5,
                ]:
                    lw = 16
                else:
                    lw = 12
            else:
                lw = 4
            state = l_states[i_state]
            ## lines (averages)
            line += '"../%s/energies_averages_%s.txt" u 1:(%s) w l lt 1 lc rgb "#%6s" lw %i t "%s", ' % (
                state[:6],
                state[-6:],
                s_col,
                d_colors[state]['lc'],
                lw,
                state,
            )
        # drop the trailing ", " and terminate the plot command
        line = line[:-2] + '\n'
        lines += [line]
        fd = open('gnu.set', 'w')
        fd.writelines(lines)
        fd.close()
        os.system('gnuplot gnu.set')
        os.system('convert combined_16states.ps combined_16states_%s.png' % (suffix))
    ##
    ## combined plot 2 (2 states with individual terms and their averages)
    ##
    for combination in [
        [
            'CHACHA',
            'CHANEU',
        ],
        [
            'NEUCHA',
            'NEUNEU',
        ],
    ]:
        print 'combined plot', combination
        lines = [
            'set terminal postscript eps enhanced color "Helvetica" 32\n',
            'set size 3,3\n',
            'set output "combined.ps"\n',
            'set xlabel "t / ps"\n',
            'set ylabel "E / kT"\n',
            'set title "%s"\n' % ('%s v %s' % (
                combination[0],
                combination[1],
            )),
        ]
        line = 'plot [0:][-500:100]'
        ## data points
        # columns: $2=Asp52, $3=protein, $5=water (see reader loop above)
        line += '"../%s/energies_%s.txt" u 1:%s lc 3 t "%s protein", ' % (
            combination[0],
            combination[0],
            '($3)',
            combination[0],
        )
        line += '"../%s/energies_%s.txt" u 1:%s lc 4 t "%s protein", ' % (
            combination[1],
            combination[1],
            '($3)',
            combination[1],
        )
        line += '"../%s/energies_%s.txt" u 1:%s lc 5 t "%s water", ' % (
            combination[0],
            combination[0],
            '($5)',
            combination[0],
        )
        line += '"../%s/energies_%s.txt" u 1:%s lc 6 t "%s water", ' % (
            combination[1],
            combination[1],
            '($5)',
            combination[1],
        )
        line += '"../%s/energies_%s.txt" u 1:%s lc 1 t "%s Asp52", ' % (
            combination[0],
            combination[0],
            '($2)',
            combination[0],
        )
        line += '"../%s/energies_%s.txt" u 1:%s lc 2 t "%s Asp52", ' % (
            combination[1],
            combination[1],
            '($2)',
            combination[1],
        )
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 3 lw 16 t "%s protein average", ' % (
            combination[0],
            combination[0],
            '($3)',
            combination[0],
        )
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 4 lw 16 t "%s protein average", ' % (
            combination[1],
            combination[1],
            '($3)',
            combination[1],
        )
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 5 lw 16 t "%s water average", ' % (
            combination[0],
            combination[0],
            '($5)',
            combination[0],
        )
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 6 lw 16 t "%s water average", ' % (
            combination[1],
            combination[1],
            '($5)',
            combination[1],
        )
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 1 lw 16 t "%s Asp52 average", ' % (
            combination[0],
            combination[0],
            '($2)',
            combination[0],
        )
        line += '"../%s/energies_averages_%s.txt" u 1:%s w l lt 1 lc 2 lw 16 t "%s Asp52 average", ' % (
            combination[1],
            combination[1],
            '($2)',
            combination[1],
        )
##        line += '"../NEUCHA/energies_NEUCHA.txt" u 1:%s lc 7 t "NEUCHA Asp52", ' %('($2)',)
##        line += '"../NEUNEU/energies_NEUNEU.txt" u 1:%s lc 8 t "NEUNEU Asp52", ' %('($2)',)
##        line += '"../NEUCHA/energies_NEUCHA.txt" u 1:%s lc 9 t "NEUCHA protein", ' %('($3)',)
##        line += '"../NEUNEU/energies_NEUNEU.txt" u 1:%s lc 10 t "NEUNEU protein", ' %('($3)',)
        # drop the trailing ", " and terminate the plot command
        line = line[:-2] + '\n'
        lines += [line]
        fd = open('gnu.set', 'w')
        fd.writelines(lines)
        fd.close()
        os.system('gnuplot gnu.set')
        os.system('convert combined.ps combined_%s_v_%s.png' % (
            combination[0],
            combination[1],
        ))
    return