def main(data_choice='random_vertical_boundary'):
    # For Moons dataset, noise = 0.05 is added.
    data_train, data_test, true_labels_train, true_labels_test = generate_data(
        data_choice, num_points=1024, split=True)

    true_labels_test_flip = []
    for label in true_labels_test:
        if label == 0:
            true_labels_test_flip.append(1)
        elif label == 1:
            true_labels_test_flip.append(0)
        else:
            raise ValueError('This label does not exist')

    plot_params_train = {'colors': ['blue', 'magenta'], 'alpha': 0.5, 'size': 80}
    scatter(data_train, true_labels_train, **plot_params_train, show=False)

    plot_params_test = {'colors': ['blue', 'magenta'], 'alpha': 1, 'size': 40, 'linewidth': 1.5}
    scatter(data_test, true_labels_test, true_labels_test_flip, **plot_params_test, show=False)

    plt.show()
def main(encoding='denseangle_param'):
    qc_name = '1q-qvm'
    qc = get_qc(qc_name)
    num_shots = 1024
    device_qubits = qc.qubits()
    classifier_qubits = device_qubits

    if encoding.lower() == 'wavefunction_param':
        params = np.array([0.45811744, 0.2575122, 0.52902198])
    else:
        params = np.random.rand(3)

    n_layers = 1

    if encoding.lower() == 'denseangle_param':
        encoding_choice = 'denseangle_param'
        init_encoding_params = [np.pi, 2 * np.pi]
    elif encoding.lower() == 'wavefunction_param':
        encoding_choice = 'wavefunction_param'
        init_encoding_params = [0]
    elif encoding.lower() == 'superdenseangle_param':
        encoding_choice = 'superdenseangle_param'
        init_encoding_params = [np.pi, 2 * np.pi]
    else:
        raise NotImplementedError

    '''
    # Generate Grid of datapoints to determine and visualise ideal decision boundary
    '''
    data_choice = 'full_vertical_boundary'
    num_grid_points = 2000
    data_grid, grid_true_labels = generate_data(data_choice, num_grid_points)
    data_grid, grid_true_labels = remove_zeros(data_grid, grid_true_labels)

    predicted_labels_grid = ClassificationCircuit(classifier_qubits, data_grid).make_predictions(
        params, n_layers, encoding_choice, init_encoding_params, num_shots, qc)

    plot_params = {'colors': ['blue', 'orange'], 'alpha': 1, 'size': 70}
    scatter(data_grid, predicted_labels_grid, **plot_params)
    plt.show()
def scatter(self, clusters=None, components=[1, 2, 3], limit=500, figsize=(16, 5), s=5):
    """ Generates a scatter plot in feature space of the clustered data. """
    from scipy.special import comb  # scipy.misc.comb was removed in SciPy 1.0
    from itertools import combinations

    components = [c - 1 for c in components]
    feats, col_array = self._scatter_helper(clusters, limit)
    N_plots = int(comb(len(components), 2, exact=True))

    fig, axes = plt.generate_axes(N_plots, ncols=3, num=1, figsize=figsize)
    for ax, (x, y) in zip(axes, combinations(components, 2)):
        ax.clear()  # Clear the axes before replotting
        plt.scatter(feats[:, x], feats[:, y], colors=col_array, ax=ax, s=s)
        ax.set_xlabel("Component {}".format(x + 1))
        ax.set_ylabel("Component {}".format(y + 1))
    fig.tight_layout()
    return fig
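# Note on the scatter method above: scipy.misc.comb was removed in SciPy 1.0,
# hence the switch to scipy.special.comb. A minimal, self-contained sanity
# check of the pair count (names here are illustrative, not from the module):
from itertools import combinations
from scipy.special import comb

components = [0, 1, 2]  # zero-based component indices, as in the method above
n_plots = int(comb(len(components), 2, exact=True))
assert n_plots == len(list(combinations(components, 2)))  # 3 pairs: (0,1), (0,2), (1,2)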
def find_outliers(data):
    """
    Finds the outliers in the dataset
    :param data:
    :return:
    """
    scatter(data, ['salary', 'bonus'])

    # Let's find out the possible outliers
    print('Finding possible outliers...')
    potential_outliers = []
    for person in data:
        if data[person]['salary'] > 800000 or data[person]['bonus'] > 6000000 or \
                (data[person]['salary'] == 0 and data[person]['bonus'] == 0 and
                 data[person]['total_payments'] == 0 and
                 data[person]['from_poi_to_this_person'] == 0 and
                 data[person]['total_stock_value'] == 0 and
                 data[person]['from_this_person_to_poi'] == 0 and
                 data[person]['from_messages'] == 'NaN' and
                 data[person]['to_messages'] == 'NaN'):
            potential_outliers.append(person)

    # There is one key which is not an actual name:
    potential_outliers.append('THE TRAVEL AGENCY IN THE PARK')
    print('Found {:,} potential outliers, "{}"'.format(len(potential_outliers),
                                                       ', '.join(potential_outliers)))

    # Let's examine now the potential outliers
    outliers = []
    for potential_outlier in potential_outliers:
        if data[potential_outlier]["poi"]:
            print(' -> "{}" is excluded for being a POI'.format(potential_outlier))
        elif data[potential_outlier]['from_poi_to_this_person'] + \
                data[potential_outlier]['from_this_person_to_poi'] > 100:
            print(' -> "{}" is excluded for having high interactions with a POI'.format(potential_outlier))
        else:
            outliers.append(potential_outlier)

    print('Found {:,} actual outliers, "{}"'.format(len(outliers), ', '.join(outliers)))
    return outliers
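# A hypothetical two-record sketch of the salary/bonus threshold logic in
# find_outliers above; the field names follow the Enron dataset, but the
# people and values here are invented for illustration.
toy_data = {
    'PERSON A': {'salary': 1000000, 'bonus': 100000},
    'PERSON B': {'salary': 250000, 'bonus': 500000},
}
flagged = [p for p, v in toy_data.items()
           if v['salary'] > 800000 or v['bonus'] > 6000000]
print(flagged)  # ['PERSON A'] -- only the salary threshold trips here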
def create_new_features(data):
    """
    Adds additional features to the dataset
    :param data:
    :return:
    """
    print("Adding features...")
    for person in data:
        from_poi_to_this_person = data[person]['from_poi_to_this_person']
        to_messages = data[person]['to_messages']
        from_messages = data[person]['from_messages']
        from_this_person_to_poi = data[person]['from_this_person_to_poi']
        salary = data[person]['salary']
        bonus = data[person]['bonus']

        # Let's add some interesting ratios
        data[person]['from_poi_to_this_person_ratio'] = from_poi_to_this_person / float(to_messages)
        data[person]['from_this_person_to_poi_ratio'] = from_this_person_to_poi / float(from_messages)
        if salary != 0:
            data[person]['bonus_over_salary_ratio'] = bonus / float(salary)
        else:
            data[person]['bonus_over_salary_ratio'] = 0

    # Let's render some charts to help us understand these new features
    scatter(data, ['from_poi_to_this_person_ratio', 'from_this_person_to_poi_ratio'])
    histogram(data, 'bonus_over_salary_ratio')
    return data
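# Sketch of the ratio features on a single hypothetical record (all values
# invented); it mirrors the arithmetic in create_new_features above.
record = {'from_poi_to_this_person': 10, 'to_messages': 200,
          'from_this_person_to_poi': 5, 'from_messages': 50,
          'salary': 250000, 'bonus': 500000}
record['from_poi_to_this_person_ratio'] = record['from_poi_to_this_person'] / float(record['to_messages'])
record['from_this_person_to_poi_ratio'] = record['from_this_person_to_poi'] / float(record['from_messages'])
record['bonus_over_salary_ratio'] = record['bonus'] / float(record['salary']) if record['salary'] != 0 else 0
print(record['from_poi_to_this_person_ratio'],  # 0.05
      record['from_this_person_to_poi_ratio'],  # 0.1
      record['bonus_over_salary_ratio'])        # 2.0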
assert (abs(xlo - xlotest) < 1e-8)
assert (abs(xhi - xhitest) < 1e-8)

# initialize plotting parameters
nplot = 1
nrow = 2
ncol = 2
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(nrow, ncol, nplot)

# plot the population distribution
plots.scatter(x, y, fig=fig, ax=ax,
              title='Two-sided Confidence Interval',
              xlabel='height', ylabel='f(x)',
              linewidth=2, markersize=0)

# fill the areas corresponding to the significance level of a
# 2-sided confidence interval
xfill = x[x <= xlo]
yfill = y[x <= xlo]
ax.fill_between(xfill, yfill, color=plots.BLUE)

xfill = x[x >= xhi]
yfill = y[x >= xhi]
ax.fill_between(xfill, yfill, color=plots.BLUE)

nplot = nplot + 1
def plots():
    plot1 = scatter()
    plot2 = hist()
    return render_template('plots.html', plot_1=plot1, plot_2=plot2)
import pandas as pd
from plotnine import *
import plots as plt
import numpy as np

df = pd.read_csv("train.csv")
df.describe()
df.head()
df.columns  # 7 columns are numeric, 5 are categorical
df.isnull().sum()  # Age and Cabin have missing values
df.dtypes
df.Name

plt.scatter(df, 'Survived', 'Age')
plt.scatter(df, df.index, 'Age')
plt.hist(df, 'Cabin', 30)
plt.boxplot(df, 'Age')

import plotly.plotly as py
from plotly.graph_objs import *

data = {'x': df.Age.values,
        'y': df.Fare.values,
        'z': df.Survived.values,
        'type': 'surface'}
fig = Figure(data=data)
py.plot(fig)
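# Note on the snippet above: plotly.plotly now lives in the separate
# chart_studio package, and three scattered columns render more naturally as a
# 3-D scatter than as a 'surface'. A hedged offline sketch (values invented):
import plotly.graph_objects as go

fig3d = go.Figure(data=[go.Scatter3d(x=[22, 38, 26],        # Age
                                     y=[7.25, 71.28, 7.92],  # Fare
                                     z=[0, 1, 1],            # Survived
                                     mode='markers')])
fig3d.show()  # renders locally; no chart_studio account needed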
# - Percentages of wine types in the data
print 100 * df.groupby(y).size() / float(len(df))

# - Means (averages) by wine type
print df.groupby(y).mean()

# - Standard deviations by wine type
print df.groupby(y).std()

# We could use all the predictors in a model.
# But to keep things simple, let's start with two variables only.
# (Things are easier to plot if you have only two dimensions.)
# Choose variables that separate the target classes (wine type) well.
# For example...
# Hint: Try other predictors. Look at the histograms by wine type.

# Plot scatter plot of the chosen vars, colors specify wine type.
plots.scatter(y, chemvars(df), 'fixed_acidity', 'chlorides', alpha=0.2, ymax=0.4)

# Our function allows transformations, see the code in plots.py
# Logarithm compresses high values and opens up the scale at the lower end.
plots.scatter(y, chemvars(df), 'fixed_acidity', 'chlorides', np.log, np.log, alpha=0.2, ymax=0.4)

# Notice: The second part starts here.
# Fit a logistic regression model using two variables.
# More info on logistic regression: http://en.wikipedia.org/wiki/Logistic_regression
# In a regression model, you should have a free constant in the linear combination.
# It is called the intercept.
# (Think for a while about a logistic regression with no variables, but either with an
# intercept or without. The latter always gives p=0.5; the former adjusts to the prior
# class probabilities of the data. A small sketch of this follows below.)
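# A minimal sketch of the intercept remark above (sklearn assumed, not part of
# the original script): with no informative variables, a logistic model with an
# intercept predicts the base rate of the data, while one without is stuck at 0.5.
import numpy as np
from sklearn.linear_model import LogisticRegression

y01 = np.array([0] * 75 + [1] * 25)
X0 = np.zeros((100, 1))                      # no information at all
with_b = LogisticRegression().fit(X0, y01)
no_b = LogisticRegression(fit_intercept=False).fit(X0, y01)
print(with_b.predict_proba(X0[:1]))          # close to the prior [0.75, 0.25]
print(no_b.predict_proba(X0[:1]))            # [0.5, 0.5]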
def main(train=False, encoding_choice='denseangle_param', retrain=False, data_choice='moons', noise=False):
    ### Firstly, generate the dataset:
    '''
    # We use the transpose of the (scaled to unit square) Moons dataset in order to see a non-linear decision boundary
    '''
    data_train, data_test, true_labels_train, true_labels_test = generate_data(data_choice, num_points=500, split=True)
    # data_train, true_labels_train = remove_zeros(data_train, true_labels_train)
    # data_test, true_labels_test = remove_zeros(data_test, true_labels_test)

    ### Next, generate correct classification parameters for the dataset (perfect classification):
    '''
    # Define parameters of model. Start with DenseAngle encoding with fixed parameters.
    '''
    qc_name = '1q-qvm'
    qc = get_qc(qc_name)
    num_shots = 1024
    qubits = qc.qubits()
    init_params = np.random.rand(3)

    if encoding_choice.lower() == 'wavefunction_param':
        init_encoding_params = [0]  # Generalised wavefunction encoding initialised to wavefunction encoding
    else:
        init_encoding_params = [np.pi, 2 * np.pi]

    if train:
        optimiser = 'Powell'
        params, result_unitary_param = train_classifier(qc, num_shots, init_params, encoding_choice,
                                                        init_encoding_params, optimiser,
                                                        data_train, true_labels_train)
        print('The optimised parameters are:', result_unitary_param.x)
        print('These give a cost of:',
              ClassificationCircuit(qubits, data_train).build_classifier(
                  result_unitary_param.x, encoding_choice, init_encoding_params,
                  num_shots, qc, true_labels_train))
        ideal_params = result_unitary_param.x
    else:
        if data_choice.lower() == 'moons':
            ### Ideal parameters for the trained model. The simple model can achieve about 90% classification.
            '''
            # 90% classification parameters for dense angle encoding
            '''
            if encoding_choice.lower() == 'denseangle_param':
                ideal_params = [2.19342064, 1.32972029, -0.18308298]

            ### Ideal parameters for the trained model. The simple model can achieve about 75% classification.
            '''
            # 73% classification parameters for superdense angle encoding
            '''
            if encoding_choice.lower() == 'superdenseangle_param':
                ideal_params = [-0.27365492, 0.83278854, 3.00092961]

            ### Ideal parameters for the trained model.
            '''
            # 85% classification parameters for wavefunction encoding
            '''
            if encoding_choice.lower() == 'wavefunction':
                ideal_params = [0.81647273, 0.41996708, 2.20603541]
            if encoding_choice.lower() == 'wavefunction_param':
                ideal_params = [0.81647273, 0.41996708, 2.20603541]

        elif data_choice.lower() == 'random_vertical_boundary':
            if encoding_choice.lower() == 'superdenseangle_param':
                ideal_params = [1.606422245361118, 0.23401504261014927, 5.694226283697996]

        elif data_choice.lower() == 'random_diagonal_boundary':
            ### Ideal parameters for the trained model. The simple model can achieve about 90% classification.
            '''
            # 90% classification parameters for dense angle encoding
            '''
            if encoding_choice.lower() == 'denseangle_param':
                ideal_params = [0.8579214, 1.22952647, 4.99408074]

            '''
            # Classification parameters for superdense angle encoding
            '''
            if encoding_choice.lower() == 'superdenseangle_param':
                ideal_params = [2.0101407, 1.05916291, 1.14570489]

            ### Ideal parameters for the trained model. The simple model can achieve about 97% classification.
            '''
            # 97% classification parameters for wavefunction encoding
            '''
            if encoding_choice.lower() == 'wavefunction':
                ideal_params = [0.69409285, 0.0862859, 0.42872711]
            if encoding_choice.lower() == 'wavefunction_param':
                ideal_params = [0.69409285, 0.0862859, 0.42872711]

    print('These give a cost of:',
          ClassificationCircuit(qubits, data_test).build_classifier(
              ideal_params, encoding_choice, init_encoding_params, num_shots, qc, true_labels_test))
    predicted_labels_ideal = ClassificationCircuit(qubits, data_test).make_predictions(
        ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
    # nisqai.visual.scatter(data_test, true_labels_test, predicted_labels)

    ### Overlay decision boundary
    '''
    # Generate grid of datapoints to determine and visualise the ideal decision boundary
    '''
    num_points = 400
    data_grid, grid_true_labels = generate_data('full_vertical_boundary', num_points)
    data_grid, grid_true_labels = remove_zeros(data_grid, grid_true_labels)

    predicted_labels = ClassificationCircuit(qubits, data_test).make_predictions(
        ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
    plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
    scatter(data_test, true_labels_test, predicted_labels, **plot_params)

    predicted_labels_grid = ClassificationCircuit(qubits, data_grid).make_predictions(
        ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
    plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
    scatter(data_grid, predicted_labels_grid, **plot_params)
    plt.show()

    ### Define noise parameters
    '''
    # Define noise parameters to add to the model to determine how classification is affected.
    '''
    if noise:
        noise_choice = 'amp_damp_before_measurement'
        noise_values = 0.3

    ### Add noise to circuit and classify
    '''
    # Add noise to the circuit, and determine the number of points classified differently
    # (not mis-classified, since we can't achieve perfect classification)
    '''
    if noise:
        noisy_predictions, number_classified_same = generate_noisy_classification(
            ideal_params, noise_choice, noise_values, encoding_choice, init_encoding_params,
            qc, num_shots, data_test, predicted_labels_ideal)
        print('The proportion classified differently after noise is:', 1 - number_classified_same)

    ### Overlay decision boundary
    '''
    # Generate grid of datapoints to determine and visualise the ideal decision boundary WITH noise added
    '''
    if noise:
        print(noise_choice)
        predicted_labels = ClassificationCircuit(qubits, data_test, noise_choice, noise_values).make_predictions(
            ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_test, true_labels_test, predicted_labels, **plot_params)

        predicted_labels_grid = ClassificationCircuit(qubits, data_grid, noise_choice, noise_values).make_predictions(
            ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()

    ### Retrain circuit with noise
    '''
    # Given the noise in the circuit, train the parameters of the encoding unitary to account for the noise.
    # The parameterised unitary parameters are fixed as the ideal ones learned above.
    '''
    if retrain:
        if encoding_choice.lower() == 'wavefunction_param':
            optimiser = 'L-BFGS-B'
        else:
            optimiser = 'Powell'

        if noise:
            encoding_params, result_encoding_param = train_classifier_encoding(
                qc, noise_choice, noise_values, num_shots, ideal_params, encoding_choice,
                init_encoding_params, optimiser, data_train, true_labels_train)
            print('The optimised encoding parameters with noise are:', result_encoding_param.x)
            ideal_encoding_params = result_encoding_param.x
        else:
            encoding_params, result_encoding_param = train_classifier_encoding(
                qc, None, None, num_shots, ideal_params, encoding_choice,
                init_encoding_params, optimiser, data_train, true_labels_train)
            print('The optimised encoding parameters without noise are:', result_encoding_param.x)
            ideal_encoding_params = result_encoding_param.x
    else:
        ### Ideal ENCODING parameters for the trained model. The simple model can achieve about
        ### 90% classification with noise, 93% without noise.
        # (The original inter-branch docstrings are kept as comments here, since a bare
        # string between an if-block and its elif is a syntax error.)
        # 90% classification parameters for dense angle encoding
        if data_choice.lower() == 'moons' and encoding_choice.lower() == 'denseangle_param' and noise:
            ideal_encoding_params = [2.23855329, 7.57781576]
        # 93% classification parameters for dense angle encoding without noise
        elif data_choice.lower() == 'moons' and encoding_choice.lower() == 'denseangle_param':
            ideal_encoding_params = [3.05615259, 7.61215138]  # No noise

        ### Ideal ENCODING parameters for the trained model.
        # NO NOISE - 74-77% classification parameters with training for superdense angle encoding
        # NOISE - classification parameters for superdense angle encoding
        # (0.3 amp damp = 20% different classification - 69% accuracy with noise before encoding training)
        if data_choice.lower() == 'moons' and encoding_choice.lower() == 'superdenseangle_param' and noise:
            ideal_encoding_params = [3.31296568, 6.34142188]
        elif data_choice.lower() == 'moons' and encoding_choice.lower() == 'superdenseangle_param':
            ideal_encoding_params = [2.86603822, 6.14328274]  # No noise

        ### Ideal ENCODING parameters for the trained model.
        # NO NOISE - 82-84% classification parameters with training for generalised wavefunction encoding
        # NOISE - classification parameters (0.3 amp damp = 20% different classification - 78% accuracy
        # with noise before encoding training)
        print(data_choice.lower(), encoding_choice.lower())
        if data_choice.lower() == 'moons' and encoding_choice.lower() == 'wavefunction_param' and noise:
            ideal_encoding_params = [0.02884417]
        elif data_choice.lower() == 'moons' and encoding_choice.lower() == 'wavefunction_param':
            ideal_encoding_params = [0.01582773]  # No noise

    if noise:
        print('These give a cost with the noisy circuit of:',
              ClassificationCircuit(qubits, data_test, noise_choice, noise_values).build_classifier(
                  ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc, true_labels_test))
    else:
        print('These give a cost with the ideal circuit of:',
              ClassificationCircuit(qubits, data_test).build_classifier(
                  ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc, true_labels_test))

    ### Add noise to circuit and classify
    '''
    # Using the learned encoding parameters, check again the proportion classified differently
    '''
    if noise:
        noisy_predictions, number_classified_same = generate_noisy_classification(
            ideal_params, noise_choice, noise_values, encoding_choice, ideal_encoding_params,
            qc, num_shots, data_test, predicted_labels)
        print('The proportion classified differently after noise with the learned encoding is:',
              1 - number_classified_same)

    ### Overlay decision boundary
    '''
    # Generate grid of datapoints to determine and visualise the ideal decision boundary WITH/WITHOUT noise added
    '''
    if noise:
        predicted_labels = ClassificationCircuit(qubits, data_test, noise_choice, noise_values).make_predictions(
            ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_test, true_labels_test, predicted_labels, **plot_params)

        predicted_labels_grid = ClassificationCircuit(qubits, data_grid, noise_choice, noise_values).make_predictions(
            ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()
    else:
        predicted_labels = ClassificationCircuit(qubits, data_test).make_predictions(
            ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_test, true_labels_test, predicted_labels, **plot_params)

        predicted_labels_grid = ClassificationCircuit(qubits, data_grid).make_predictions(
            ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()
xs1 = np.array(range(x1, x2 + 1))
probs = stats.binom.pmf(xs1, n, p)

xs2 = np.arange(x1, x2, 1e-1)
proba = stats.norm.pdf((xs2 - (n * p)) / (math.sqrt((n * p) * (1 - p))))

# ----------------------------------------
# plotting
# ----------------------------------------
probs = np.array(probs).round(4)
fig, ax1 = plots.barplot(xs1, probs,
                         title='Normal Approximation of Binomial Distribution; p = {0:.8}, n = {1}'.format(p, n),
                         align='edge',
                         edgecolor=edgecolor,
                         show=False, close=False)

ax2 = ax1.twinx()
fig, ax2 = plots.scatter(xs2 + 0.5, proba, fig=fig, ax=ax2, markersize=0, linewidth=2)
ax2.set_title('')
print('')

# ----------------------------------------
# sample calculations
# ----------------------------------------
p = 1e-5   # probability of E occurring
n = 16e6   # in this many trials
x = 150    # contains this many occurrences of E
h = 1      # step size

xs1 = np.arange(0, x + h, 1)
probs = stats.binom.pmf(xs1, n, p)
probcum = probs.sum()
print('Probability that at most {0} occurrences of E occur in {1} trials is P(X <= {2}) = {3:.8}'
      .format(x, n, x, probcum))
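# Side note on the approximation above: standardising before stats.norm.pdf
# drops the 1/sigma Jacobian, so the plotted curve has the right shape but not
# the pmf's scale (hence the twin y-axis). A small check with assumed n, p:
n_chk, p_chk, x_chk = 100, 0.5, 52
mu_chk = n_chk * p_chk
sigma_chk = math.sqrt(n_chk * p_chk * (1 - p_chk))
shape_only = stats.norm.pdf((x_chk - mu_chk) / sigma_chk)          # what the plot uses
true_density = stats.norm.pdf(x_chk, loc=mu_chk, scale=sigma_chk)  # proper density
assert abs(true_density - shape_only / sigma_chk) < 1e-12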
# ----------------------------------------------------------------------
mu0 = 175
mua = 179

xmin = min(mu0, mua) - (5 * sigma / math.sqrt(n))
xmax = max(mu0, mua) + (5 * sigma / math.sqrt(n))
x = np.linspace(xmin, xmax, 500)
y0 = stats.norm.pdf(x, loc=mu0, scale=sigma / math.sqrt(n))
ya = stats.norm.pdf(x, loc=mua, scale=sigma / math.sqrt(n))
ymin = min(y0.min(), ya.min())
ymax = max(y0.max(), ya.max())

# plot both distributions
fig, ax = plots.scatter(x, y0,
                        ylim=(0, max(y0.max(), ya.max())),
                        xlabel='height', ylabel='f(x)',
                        markersize=0, linewidth=2, color=plots.BLUE)
plots.scatter(x, ya, fig=fig, ax=ax,
              ylim=(0, max(y0.max(), ya.max())),
              xlabel='height', ylabel='f(x)',
              markersize=0, linewidth=2, color=plots.RED)

# find the acceptance region and fill it
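# The excerpt cuts off before filling the acceptance region; a hedged sketch of
# how the two-sided cutoffs could be computed from the null distribution
# (alpha is chosen here purely for illustration; the real script sets its own):
alpha = 0.05
se = sigma / math.sqrt(n)
xlo_acc = stats.norm.ppf(alpha / 2, loc=mu0, scale=se)
xhi_acc = stats.norm.ppf(1 - alpha / 2, loc=mu0, scale=se)
# accept H0: mu = mu0 when the sample mean falls in [xlo_acc, xhi_acc]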
xs = np.arange(xstart, xend, h)
pdfvals = stats.expon.pdf(xs, 0, 1 / lamb).round(8)

# ----------------------------------------
# plotting
# ----------------------------------------
xs = np.array(xs).round(4)
pdfvals = np.array(pdfvals).round(2)
fig, ax1 = plots.barplot(xs, pdfvals,
                         title='Exponential Distribution; lambda = {0:.8}'.format(lamb),
                         align='edge',
                         edgecolor=edgecolor,
                         width=h,
                         show=False, close=False)

ax2 = ax1.twinx()
# draw the curve on the twin axis (the original passed ax1 here, which would
# discard the twinx; ax2 matches the pattern used in the other snippets)
fig, ax2 = plots.scatter(xs, pdfvals, fig=fig, ax=ax2,
                         ylim=ax1.get_ylim(), markersize=0, linewidth=2)
ax2.set_title('')

# ----------------------------------------
# sample calculations 1
# ----------------------------------------
prob1 = stats.expon.cdf(x, 0, 1 / lamb)
prob1calc = 1 - math.exp(-x * lamb)
prob2 = 1 - prob1
assert (0.082 - round(prob2, 3) == 0)
assert (round(prob1 - prob1calc, 8) == 0)
print('{0}The probability that event E will occur in <= {1:.2} units is {2:.8}'
      .format(space, x, prob1))

x1 = 2 / 60
x2 = 3 / 60
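# Continuation sketch (not from the original script): the probability that E
# occurs between x1 and x2 time units is the CDF difference, using the same
# stats.expon parameterisation and the x1, x2 just defined above.
prob_between = stats.expon.cdf(x2, 0, 1 / lamb) - stats.expon.cdf(x1, 0, 1 / lamb)
print('The probability that event E occurs in [{0:.4}, {1:.4}] units is {2:.8}'
      .format(x1, x2, prob_between))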
def file_size_analysis(major_extensions_file):
    trep = get_valid_repos()
    rep_size = pd.read_csv(major_extensions_file)

    print('avg file mean', rep_size.avg_size.mean() / KILOBYTE)
    print('std file mean', rep_size.std_size.mean() / KILOBYTE)
    print('avg capped file mean', rep_size.capped_avg_file.mean() / KILOBYTE)
    print('std capped file mean', rep_size.capped_std_file.mean() / KILOBYTE)
    print('std capped file mean/avg capped file mean',
          rep_size.capped_std_file.mean() / rep_size.capped_avg_file.mean())

    treps = pd.merge(trep, rep_size, on='repo_name')

    print(rep_size.capped_avg_file.describe())
    size_25_q = rep_size.capped_avg_file.quantile(0.25)
    print("size 25 quantile", size_25_q, "in kb", size_25_q / KILOBYTE)
    size_75_q = rep_size.capped_avg_file.quantile(0.75)
    print("size 75 quantile", size_75_q, "in kb", size_75_q / KILOBYTE)

    treps['size_group'] = treps.apply(
        lambda x: 'Lower 25' if x.capped_avg_file < size_25_q
        else "top 25" if x.capped_avg_file > size_75_q
        else "Middle", axis=1)

    print('top 10 prob', 1.0 * len(treps[treps.quality_group == 'Top 10']) / len(treps))
    top_10_in_l25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.size_group == 'Lower 25')]) / len(treps[treps.size_group == 'Lower 25'])
    print('top 10 prob in lower 25', top_10_in_l25)
    top_10_in_t25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.size_group == 'top 25')]) / len(treps[treps.size_group == 'top 25'])
    print('top 10 prob in top 25', top_10_in_t25)
    print("short files lift ", top_10_in_l25 / top_10_in_t25 - 1)

    group_by_size = treps.groupby(['size_group'], as_index=False).agg({'y2019_ccp': 'mean'})
    print(group_by_size)

    print("all files")
    print(treps.groupby('quality_group').agg({
        'capped_avg_file': 'mean',
        'avg_size': 'mean',
        'files': 'sum',
        'repo_name': 'count'
    }))

    for i in lang_name:
        print(i, " files")
        print(treps[(treps.major_extension_ratio > DOMINANT_RATE)
                    & (treps.major_extension == lang_extension[i])].groupby('quality_group').agg({
                        'capped_avg_file': 'mean',
                        'avg_size': 'mean',
                        'files': 'sum',
                        'repo_name': 'count'
                    }))

    print("Size controlled by developer groups")
    pretty_print(pair_analysis_by_dev_num_group(treps, 'size_group', 'y2019_ccp'))
    print("Size controlled by project age")
    pretty_print(pair_analysis_by_age_group(treps, 'size_group', 'y2019_ccp'))

    scatter(treps,
            first_metric='y2019_ccp',
            second_metric='capped_avg_file',
            output_file=os.path.join(FIGURES_PATH, r'ccp_vs_length_scatter.html'),
            mode='markers',
            opacity=0.9)
    pair_analysis_by_bins_to_file(treps, 'y2019_ccp', 'capped_avg_file',
                                  output_file=os.path.join(DATA_PATH, 'ccp_vs_length_bins.csv'),
                                  bins=10)

    return treps
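# Minimal pandas sketch of the 25/75-quantile size grouping used above
# (toy frame; the column name matches, everything else is invented):
import pandas as pd

toy = pd.DataFrame({'capped_avg_file': [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]})
q25 = toy.capped_avg_file.quantile(0.25)
q75 = toy.capped_avg_file.quantile(0.75)
toy['size_group'] = toy.capped_avg_file.apply(
    lambda v: 'Lower 25' if v < q25 else 'top 25' if v > q75 else 'Middle')
print(toy.size_group.value_counts())  # two 'Lower 25', two 'top 25', four 'Middle'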
def plots():
    plot1 = scatter()
    return render_template('plots.html', plot_1=plot1)
                xlabel=['height'], ylabel=['count'])

# create array of x values for calculating pdf values
xmin = dfPop.loc[:, 'height'].min()
xmax = dfPop.loc[:, 'height'].max()
x = np.linspace(xmin, xmax, 500)

# plot normal probability density function with population mean and variance
pdf = pdfnorm(x, mu, sigma)
ax = ax.twinx()
plots.scatter(x, pdf, fig=fig, ax=ax,
              ylim=(pdf.min(), pdf.max()),
              title='', markersize=0, linewidth=2, color=plots.RED)
nplot = nplot + 1

# NOTE that the shape of the population distribution is normal.

# histogram of sample
ax = fig.add_subplot(nrow, ncol, nplot)
plots.histogram(dfSamp, fig=fig, ax=ax,
                numBins=numBins,
                title='Single Sample',
def main(train=False, encoding='denseangle_param', ideal=False, noise=False, analytic=False, compare=False):
    """
    # Find optimal parameters for a linear decision boundary and add noise
    """
    ### Firstly, generate the dataset:
    '''
    # We use the transpose of the (scaled to unit square) Moons dataset in order to see a non-linear decision boundary
    '''
    data_vertical_train, data_vertical_test, true_labels_train, true_labels_test = generate_data(
        'random_vertical_boundary', num_points=500, split=True)

    ### Next, generate correct classification parameters for the dataset (perfect classification):
    '''
    # Define parameters of model. Start with DenseAngle encoding with fixed parameters.
    '''
    qc_name = '1q-qvm'
    qc = get_qc(qc_name)
    num_shots = 1024
    device_qubits = qc.qubits()
    classifier_qubits = device_qubits
    n_layers = 1
    init_params = np.random.rand(3)

    if encoding.lower() == 'denseangle_param':
        encoding_choice = 'denseangle_param'
        init_encoding_params = [np.pi, 2 * np.pi]
    elif encoding.lower() == 'wavefunction' or encoding.lower() == 'wavefunction_param':
        encoding_choice = 'wavefunction_param'
        init_encoding_params = [0]

    optimiser = 'Powell'

    if train:
        ### Train model, and check the classification result of the ideal parameters found
        '''
        # Train model using scipy.optimize
        '''
        params, result_unitary_param = train_classifier(qc, num_shots, init_params, encoding_choice,
                                                        init_encoding_params, optimiser,
                                                        data_vertical_train, true_labels_train)
        print('The optimised parameters are:', result_unitary_param.x)
        print('These give a cost of:',
              ClassificationCircuit(classifier_qubits, data_vertical_train).build_classifier(
                  result_unitary_param.x, n_layers, encoding_choice, init_encoding_params,
                  num_shots, qc, true_labels_train))
        ideal_params_vertical = result_unitary_param.x
    else:
        ### Ideal parameters for the trained model, learned previously.
        ### The simple model can achieve about 90% classification.
        if encoding_choice.lower() == 'denseangle_param':
            '''
            # 100% classification parameters (modulo points on the boundary)
            '''
            # ideal_params_vertical = [3.8208, 1.525, 0.0808]
            ideal_params_vertical = [1.67814786, 1.56516469, 1.77820848]
        elif encoding_choice.lower() == 'wavefunction_param':
            '''
            # 78% classification parameters (modulo points on the boundary)
            '''
            ideal_params_vertical = [2.2921198, 0.61375299, -5.15252796]

    plt.rcParams.update({
        "font.size": 20,
        "font.serif": "Computer Modern Roman"
    })

    ### Overlay decision boundary
    '''
    # Generate grid of datapoints to determine and visualise the ideal decision boundary
    '''
    data_choice = 'full_vertical_boundary'
    num_grid_points = 1000
    data_grid, grid_true_labels = generate_data(data_choice, num_grid_points)
    data_grid, grid_true_labels = remove_zeros(data_grid, grid_true_labels)

    if ideal:
        predicted_labels_test = ClassificationCircuit(classifier_qubits, data_vertical_test, qc).make_predictions(
            ideal_params_vertical, n_layers, encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_vertical_test, true_labels_test, predicted_labels_test, **plot_params)

        predicted_labels_grid = ClassificationCircuit(classifier_qubits, data_grid, qc).make_predictions(
            ideal_params_vertical, n_layers, encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()

    ### Define noise parameters
    '''
    # Define noise parameters to add to the model to determine how classification is affected.
    '''
    noise_choice = 'amp_damp_before_measurement'
    noise_values = 0.4

    ### Add noise to circuit and classify
    '''
    # Add noise to the circuit, and determine the number of points classified differently
    # (not mis-classified, since we can't achieve perfect classification)
    '''
    if noise:
        ### Overlay decision boundary
        '''
        # Generate grid of datapoints to determine and visualise the ideal decision boundary WITH noise added
        '''
        predicted_labels_test_noise = ClassificationCircuit(classifier_qubits, data_vertical_test, qc,
                                                            noise_choice, noise_values).make_predictions(
            ideal_params_vertical, n_layers, encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_vertical_test, true_labels_test, predicted_labels_test_noise, **plot_params)

        predicted_labels_grid_noise = ClassificationCircuit(classifier_qubits, data_grid, qc,
                                                            noise_choice, noise_values).make_predictions(
            ideal_params_vertical, n_layers, encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid_noise, **plot_params)
        plt.show()

    '''
    # Define function to compute points which will remain correctly classified after noise is added
    '''
    def correct_function(data_point, params, encoding_choice, encoding_params):
        [alpha_1, alpha_2, alpha_3] = params
        [x_1, x_2] = data_point

        if encoding_choice.lower() == 'denseangle_param':
            [theta, phi] = encoding_params
            function = (np.sin(alpha_2))**2 * (np.cos(theta * x_1))**2 \
                + (np.cos(alpha_2))**2 * (np.sin(theta * x_1))**2 \
                + ((1 / 2) * (np.sin(2 * alpha_2) * np.sin(2 * theta * x_1)
                              * np.exp(-1j * (2 * alpha_3 + phi * x_2)))).real
        elif encoding_choice.lower() == 'wavefunction_param':
            [theta] = encoding_params
            l2_norm = np.linalg.norm(np.array([x_1, x_2]))**2
            function = (np.sin(alpha_2)**2) * (x_1**2 / l2_norm) \
                + (np.cos(alpha_2)**2) * (x_2**2 / l2_norm) \
                + ((1 / (2 * l2_norm)) * (np.sin(2 * alpha_2) * x_1 * x_2
                                          * np.exp(-1j * (2 * alpha_3)))).real
        return function

    def compute_analytic_misclassifed_condition(data, params, encoding_choice, encoding_params,
                                                noise_strength, true_labels):
        correct_classification_labels = []
        for ii, data_point in enumerate(data):
            function = correct_function(data_point, params, encoding_choice, encoding_params)

            if true_labels[ii] == 0:
                # If the datapoint was zero originally, it will be correctly classified regardless of noise
                correct_classification_labels.append(0)
            else:
                # If the data point was classified as 1, it stays correctly classified iff this condition is met
                if function > 1 / (2 * (1 - noise_strength)):
                    correct_classification_labels.append(0)
                else:
                    correct_classification_labels.append(1)

        # proportion of points that remain correctly classified under noise
        number_robust = 1 - sum(correct_classification_labels) / len(correct_classification_labels)
        return np.array(correct_classification_labels), number_robust

    def plot_number_misclassified_amp_damp(ideal_params, num_shots, num_points, qc, noise_values):
        points_noise_inc = []
        data_vertical_train, data_vertical_test, true_labels_train, true_labels_test = generate_data(
            'random_vertical_boundary', num_points=num_points, split=True)

        interval = 0.2
        encoding_choice = 'denseangle_param'
        theta = np.arange(0, 2 * np.pi, interval)
        phi = np.arange(0, 2 * np.pi, interval)
        X, Y = np.meshgrid(theta, phi)

        noise_choice = 'amp_damp_before_measurement'

        test_acc_ideal = np.zeros((theta.shape[0], phi.shape[0]), dtype=float)
        test_acc_noise = np.zeros((theta.shape[0], phi.shape[0]), dtype=float)
        number_robust = np.zeros((theta.shape[0], phi.shape[0]), dtype=float)

        for ii, t in enumerate(theta):
            for jj, p in enumerate(phi):
                temp_encoding_params = [t, p]

                # Classification with these encoding parameters *without* noise
                ideal_predictions, test_acc_ideal[ii, jj] = generate_noisy_classification(
                    ideal_params, 1, None, None, encoding_choice, temp_encoding_params, qc,
                    classifier_qubits, num_shots, data_vertical_test, true_labels_test)

                # Classification with these encoding parameters *with* noise
                noisy_predictions, test_acc_noise[ii, jj] = generate_noisy_classification(
                    ideal_params, 1, noise_choice, noise_values, encoding_choice, temp_encoding_params, qc,
                    classifier_qubits, num_shots, data_vertical_test, true_labels_test)

                # Number expected to be robust under the analytic condition
                correct_classification_labels, number_robust[ii, jj] = compute_analytic_misclassifed_condition(
                    data_vertical_test, ideal_params_vertical, encoding_choice, temp_encoding_params,
                    noise_values, true_labels_test)

                print('Theta, Phi is:', t, p)
                print('Test accuracy ideal:', test_acc_ideal[ii, jj])
                print('Test accuracy with noise:', test_acc_noise[ii, jj])
                print('Proportion robust:', number_robust[ii, jj])

        max_acc_indices_ideal = np.unravel_index(np.argmax(test_acc_ideal, axis=None), test_acc_ideal.shape)
        max_acc_indices = np.unravel_index(np.argmax(test_acc_noise, axis=None), test_acc_noise.shape)
        max_robust_indices = np.unravel_index(np.argmax(number_robust, axis=None), number_robust.shape)

        plt.rcParams.update({"font.size": 14, "font.family": "serif"})

        # ----------------------
        # Uncomment below for 3d plots
        # ----------------------
        # fig = plt.figure(figsize=plt.figaspect(0.33))

        # ax1 = fig.add_subplot(1, 3, 1, projection='3d')
        # surf1 = ax1.plot_surface(X, Y, test_acc_ideal, cmap=cm.coolwarm_r, linewidth=0, antialiased=False)
        # # ax1.set_zlim(0.45, 1.01)
        # cbar1 = fig.colorbar(surf1)
        # cbar1.ax.set_ylabel('Test Accuracy')

        # ax2 = fig.add_subplot(1, 3, 2, projection='3d')
        # surf2 = ax2.plot_surface(X, Y, test_acc_noise, cmap=cm.coolwarm_r, linewidth=0, antialiased=False)
        # # ax2.set_zlim(0.45, 1.01)
        # cbar2 = fig.colorbar(surf2)
        # cbar2.ax.set_ylabel('Test Accuracy')

        # ax3 = fig.add_subplot(1, 3, 3, projection='3d')
        # surf3 = ax3.plot_surface(X, Y, number_robust, cmap=cm.PuOr, linewidth=0, antialiased=False)
        # cbar3 = fig.colorbar(surf3)
        # cbar3.ax.set_ylabel('Proportion robust')

        # ax1.set_ylabel(r'$\theta (rads)$')
        # ax1.set_xlabel(r'$\phi (rads)$')
        # ax1.set_title('Best accuracy ideal: ' + str(round(test_acc_ideal[max_acc_indices_ideal], 2))
        #               + '\nBest accuracy with noise: ' + str(round(test_acc_noise[max_acc_indices_ideal], 2))
        #               + '\nRobustness: ' + str(round(number_robust[max_acc_indices_ideal], 2)) + '\n'
        #               + r'$[\theta, \phi]$ = ' + '[' + str(round(theta[max_acc_indices_ideal[0]], 2)) + ', '
        #               + str(round(phi[max_acc_indices_ideal[1]], 2)) + ']')

        # ax2.set_ylabel(r'$\theta (rads)$')
        # ax2.set_xlabel(r'$\phi (rads)$')
        # ax2.set_title('Best accuracy with noise: ' + str(round(test_acc_noise[max_acc_indices], 2))
        #               + '\nBest accuracy ideal: ' + str(round(test_acc_ideal[max_acc_indices], 2))
        #               + '\nRobustness: ' + str(round(number_robust[max_acc_indices], 2)) + '\n'
        #               + r'$[\theta, \phi]$ = ' + '[' + str(theta[max_acc_indices[0]]) + ', '
        #               + str(round(phi[max_acc_indices[1]], 2)) + ']')

        # ax3.set_ylabel(r'$\theta (rads)$')
        # ax3.set_xlabel(r'$\phi (rads)$')
        # ax3.set_title('Max. robustness: ' + str(round(number_robust[max_robust_indices], 2))
        #               + '\nBest accuracy with noise: ' + str(round(test_acc_noise[max_robust_indices], 2))
        #               + '\nBest accuracy ideal: ' + str(round(test_acc_ideal[max_robust_indices], 2)) + '\n'
        #               + r'$[\theta, \phi]$ = ' + '[' + str(theta[max_robust_indices[0]]) + ', '
        #               + str(phi[max_robust_indices[1]]) + ']')

        ## 2D PLOTS
        fig, ax = plt.subplots(1, 3)

        im0 = ax[0].imshow(test_acc_ideal, cmap=cm.coolwarm_r, extent=[0, 2 * np.pi, 2 * np.pi, 0])
        divider = make_axes_locatable(ax[0])
        cax = divider.append_axes('right', size='5%', pad=0.1)
        cbar0 = fig.colorbar(im0, cax=cax, orientation='vertical')
        cbar0.ax.set_ylabel('Test Accuracy')

        im1 = ax[1].imshow(test_acc_noise, cmap=cm.coolwarm_r, extent=[0, 2 * np.pi, 2 * np.pi, 0])
        divider = make_axes_locatable(ax[1])
        cax = divider.append_axes('right', size='5%', pad=0.1)
        cbar1 = fig.colorbar(im1, cax=cax, orientation='vertical')
        cbar1.ax.set_ylabel('Test Accuracy')

        im2 = ax[2].imshow(number_robust, cmap=cm.PuOr, extent=[0, 2 * np.pi, 2 * np.pi, 0])
        divider = make_axes_locatable(ax[2])
        cax = divider.append_axes('right', size='5%', pad=0.1)
        cbar2 = fig.colorbar(im2, cax=cax, orientation='vertical')
        cbar2.ax.set_ylabel('Proportion robust')

        ax[0].set_title('Best accuracy ideal: ' + str(round(test_acc_ideal[max_acc_indices_ideal], 2))
                        + '\nBest accuracy with noise: ' + str(round(test_acc_noise[max_acc_indices_ideal], 2))
                        + '\nRobustness: ' + str(round(number_robust[max_acc_indices_ideal], 2)) + '\n'
                        + r'$[\theta, \phi]$ = ' + '[' + str(round(theta[max_acc_indices_ideal[0]], 2)) + ', '
                        + str(round(phi[max_acc_indices_ideal[1]], 2)) + ']')

        ax[1].set_title('Best accuracy with noise: ' + str(round(test_acc_noise[max_acc_indices], 2))
                        + '\nBest accuracy ideal: ' + str(round(test_acc_ideal[max_acc_indices], 2))
                        + '\nRobustness: ' + str(round(number_robust[max_acc_indices], 2)) + '\n'
                        + r'$[\theta, \phi]$ = ' + '[' + str(theta[max_acc_indices[0]]) + ', '
                        + str(round(phi[max_acc_indices[1]], 2)) + ']')

        ax[2].set_title('Max. robustness: ' + str(round(number_robust[max_robust_indices], 2))
                        + '\nBest accuracy with noise: ' + str(round(test_acc_noise[max_robust_indices], 2))
                        + '\nBest accuracy ideal: ' + str(round(test_acc_ideal[max_robust_indices], 2)) + '\n'
                        + r'$[\theta, \phi]$ = ' + '[' + str(theta[max_robust_indices[0]]) + ', '
                        + str(phi[max_robust_indices[1]]) + ']')
        return

    if analytic:
        correct_classification_labels, number_robust = compute_analytic_misclassifed_condition(
            data_grid, ideal_params_vertical, encoding_choice, init_encoding_params,
            noise_values, grid_true_labels)
        plot_params = {'colors': ['blue', 'black'], 'alpha': 0.3}
        scatter(data_grid, correct_classification_labels, **plot_params)
        plt.show()

    if compare:
        plot_number_misclassified_amp_damp(ideal_params_vertical, num_shots, 500, qc, noise_values)
        plt.show()
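# Quick tabulation of the analytic robustness threshold 1 / (2 * (1 - p)) used
# in compute_analytic_misclassifed_condition above: as the amplitude-damping
# strength p grows toward 0.5, the threshold reaches 1 and no class-1 point
# can remain correctly classified.
for p_damp in [0.0, 0.1, 0.2, 0.3, 0.4]:
    print(p_damp, 1 / (2 * (1 - p_damp)))  # 0.5, 0.556, 0.625, 0.714, 0.833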
st.plotly_chart(plot.question1(base))

st.markdown("## Final Data Set")
st.write(base)
st.write(base.shape)
st.plotly_chart(
    plot.scatter_poor_rich(base.copy(),
                           x='SP_DYN_TFRT_IN', x_name='Fertility Rate',
                           y='NY_GDP_PCAP_CD', y_name='GDP per capita'))
st.plotly_chart(
    plot.scatter(base.copy(),
                 x='SP_DYN_TFRT_IN', x_name='Fertility Rate',
                 y='NY_GDP_PCAP_CD', y_name='GDP per capita'))
st.plotly_chart(
    plot.world_map(base.copy(), y='SP_DYN_TFRT_IN', y_name='Fertility Rate'))

st.markdown("## Features")
with st.echo():
    # GET DATA PER COLUMN
    na_percent = []
    na_total = []
    minimum = []
    maximum = []
    for col in base.columns:
        na_percent.append(
def file_length_per_language(major_extensions_file, commits_per_user_file, image_file):
    ext = pd.read_csv(major_extensions_file)
    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]
    trep = get_valid_repos()
    major = pd.merge(trep, dominant, left_on='repo_name', right_on='repo_name')

    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year == 2019]
    trepu = pd.merge(major, users_per_project, on='repo_name')

    trepu['commit_per_user'] = trepu.apply(
        lambda x: x.y2019_commits / x.users if x.users > 0 else None, axis=1)
    trepu['commit_per_user_above_11'] = trepu.apply(
        lambda x: x.users_above_11_commits / x.users_above_11 if x.users_above_11 > 0 else None, axis=1)
    trepu['commit_per_user_cap'] = trepu.apply(
        lambda x: x.users_capped_commit / x.users if x.users > 0 else None, axis=1)
    trepu['commit_per_user_above_11_cap'] = trepu.apply(
        lambda x: x.commits_above_11_500_cap / x.users_above_11 if x.users_above_11 > 0 else None, axis=1)

    agg_lang = trepu[trepu.major_extension.isin(language_extensions)].groupby(
        'major_extension', as_index=False).agg({
            'repo_name': 'count',
            'y2019_ccp': {'mean', 'std'},
            'commit_per_user_above_11_cap': {'mean', 'std'}
        })
    agg_lang.columns = agg_lang.columns.droplevel()
    # Note: 'langauge' is a misspelling, but it is used consistently as a column
    # name throughout this function, so it is kept as-is.
    agg_lang.columns = [u'langauge', u'projects', u'ccp_mean', u'ccp_std', u'speed_mean', u'speed_std']

    agg_lang_quality = trepu[trepu.major_extension.isin(language_extensions)].groupby(
        ['major_extension', 'quality_group'], as_index=False).agg({
            'repo_name': 'count',
            'commit_per_user_above_11_cap': {'mean', 'std'}
        })
    agg_lang_quality.columns = agg_lang_quality.columns.droplevel()
    """
    agg_lang_quality = agg_lang_quality.rename(columns={
        'major_extension': u'langauge',
        'std': u'speed_std',
        'mean': u'speed_mean',
        'count': u'projects'
    })
    """
    agg_lang_quality.columns = [u'langauge', u'quality_group', u'projects', u'speed_mean', u'speed_std']

    all_speed_mean = []
    all_speed_std = []
    top_speed_mean = []
    top_speed_std = []
    other_speed_mean = []
    other_speed_std = []
    ccp_mean = []
    ccp_std = []
    for i in language_extensions:
        top_speed_mean.append(round(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Top 10')].iloc[0].speed_mean))
        top_speed_std.append(round(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Top 10')].iloc[0].speed_std
            / math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                         & (agg_lang_quality.quality_group == 'Top 10')].iloc[0].projects)))
        other_speed_mean.append(round(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Others')].iloc[0].speed_mean))
        other_speed_std.append(round(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Others')].iloc[0].speed_std
            / math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                         & (agg_lang_quality.quality_group == 'Others')].iloc[0].projects)))
        ccp_mean.append(round(100 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean))
        ccp_std.append(100 * round(
            agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std
            / math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)))
        all_speed_mean.append(round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        all_speed_std.append(round(
            agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std
            / math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)))

    trace0 = go.Bar(x=lang_name, y=all_speed_mean, name='Speed',
                    error_y=dict(type='data', array=all_speed_std, visible=True))
    trace1 = go.Bar(x=lang_name, y=top_speed_mean, name='Top Speed',
                    error_y=dict(type='data', array=top_speed_std, visible=True))
    trace2 = go.Bar(x=lang_name, y=other_speed_mean, name='Other Speed',
                    error_y=dict(type='data', array=other_speed_std, visible=True))
    trace3 = go.Bar(x=lang_name, y=ccp_mean, name='CCP',
                    error_y=dict(type='data', array=ccp_std, visible=True))
    data = [trace0, trace1, trace2, trace3]
    layout = go.Layout(
        barmode='group',
        title='Speed and CCP per language',
        xaxis=dict(title='Language',
                   titlefont=dict(family='Courier New, monospace', size=24, color='#7f7f7f')),
        yaxis=dict(title='Commit per developer, CCP',
                   titlefont=dict(family='Courier New, monospace', size=24, color='#7f7f7f')))
    fig = go.Figure(data=data, layout=layout)
    plot(fig, image='png', image_filename=image_file, output_type='file')

    print(r"\begin{tabular}{| l| l| l| l| l| l|}")
    print(r" \hline ")
    Title = r" Metric & Projects & CCP & Speed & Top Speed & Others Speed \\ \hline"
    print(Title)
    for i in agg_lang.sort_values('ccp_mean').langauge.tolist():
        Line = str(lang_by_extension(i))
        Line = Line + " & " + str(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)
        Line = Line + " & " + str(round(1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean, 2))
        Line = Line + r" $\pm$ " + str(round(
            1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std
            / math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects), 3))
        Line = Line + " & " + str(int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(int(
            agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std
            / math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)))
        Line = Line + " & " + str(int(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Top 10')].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(int(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Top 10')].iloc[0].speed_std
            / math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                         & (agg_lang_quality.quality_group == 'Top 10')].iloc[0].projects)))
        Line = Line + " & " + str(int(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Others')].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(int(
            agg_lang_quality[(agg_lang_quality.langauge == i)
                             & (agg_lang_quality.quality_group == 'Others')].iloc[0].speed_std
            / math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                         & (agg_lang_quality.quality_group == 'Others')].iloc[0].projects)))
        Line = Line + r" \\ \hline"
        print(Line)

    scatter(trepu,
            first_metric='y2019_ccp',
            second_metric='commit_per_user_above_11_cap',
            output_file=os.path.join(FIGURES_PATH, r'ccp_vs_speed_scatter.html'),
            mode='markers',
            opacity=0.9)
    pair_analysis_by_bins_to_file(trepu, 'y2019_ccp', 'commit_per_user_above_11_cap',
                                  output_file=os.path.join(DATA_PATH, 'ccp_vs_speed_bins.csv'),
                                  bins=10)
maxRuntime = max(runtime)
avgRuntime = sum(runtime) / float(len(runtime))

plots.hist(
    runtime,
    minRuntime,
    maxRuntime,
    "Runtime (s)",
    "Number of Queries",
    "$min = %s$ $max = %s$ $avg = %s$" % (minRuntime, maxRuntime, avgRuntime),
    "runtime.png",
    ylog=True,
)
plots.scatter(
    runtime,
    "Query",
    "Runtime (s)",
    "$min = %s$ $max = %s$ $avg = %s$" % (minRuntime, maxRuntime, avgRuntime),
    "runtime_scatter.png",
)
plots.scatter(
    runtime,
    "Query",
    "Runtime (s)",
    "$min = %s$ $max = %s$ $avg = %s$" % (minRuntime, maxRuntime, avgRuntime),
    "runtime_scatter_ylog.png",
    ylog=True,
)

for name in timePctPerOperator.iterkeys():
    # if an operator doesn't exist in a query, its pct is 0
    timePctPerOperator[name].extend([0.0] * (numQueries - len(timePctPerOperator[name])))