def main(data_choice='random_vertical_boundary'):

    # For Moons dataset, noise = 0.05 is added.

    data_train, data_test, true_labels_train, true_labels_test = generate_data(
        data_choice, num_points=1024, split=True)
    true_labels_test_flip = []
    for label in true_labels_test:
        if label == 0: true_labels_test_flip.append(1)
        elif label == 1: true_labels_test_flip.append(0)
        else: raise ValueError('This label does not exist')

    plot_params_train = {
        'colors': ['blue', 'magenta'],
        'alpha': 0.5,
        'size': 80
    }
    scatter(data_train, true_labels_train, **plot_params_train, show=False)
    plot_params_test = {
        'colors': ['blue', 'magenta'],
        'alpha': 1,
        'size': 40,
        'linewidth': 1.5
    }
    scatter(data_test,
            true_labels_test,
            true_labels_test_flip,
            **plot_params_test,
            show=False)
    plt.show()
def main(encoding='denseangle_param'):

    qc_name = '1q-qvm'
    qc = get_qc(qc_name)
    num_shots = 1024
    device_qubits = qc.qubits()
    classifier_qubits = device_qubits
    if encoding.lower() == 'wavefunction_param':
        params = np.array([0.45811744, 0.2575122, 0.52902198])
    else:
        params = np.random.rand(3)
    n_layers = 1
    if encoding.lower() == 'denseangle_param':
        encoding_choice = 'denseangle_param'
        init_encoding_params = [np.pi, 2*np.pi]
    elif encoding.lower() == 'wavefunction_param':
        encoding_choice = 'wavefunction_param'
        init_encoding_params = [0]
    elif encoding.lower() == 'superdenseangle_param':
        encoding_choice = 'superdenseangle_param'
        init_encoding_params = [np.pi, 2*np.pi]
    else: raise NotImplementedError
    '''
        # Generate Grid of datapoints to determine and visualise ideal decision boundary
    '''
    data_choice = 'full_vertical_boundary'
    num_grid_points = 2000
    data_grid, grid_true_labels = generate_data(data_choice, num_grid_points)
    data_grid, grid_true_labels = remove_zeros(data_grid, grid_true_labels)

    predicted_labels_grid = ClassificationCircuit(classifier_qubits, data_grid).make_predictions(params, n_layers, encoding_choice, init_encoding_params, \
                                                                num_shots, qc)
    plot_params = {'colors': ['blue', 'orange'], 'alpha': 1, 'size': 70}
    scatter(data_grid, predicted_labels_grid, **plot_params)
    plt.show()
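# A minimal usage sketch (an assumption, not part of the original snippets):
# both entry points above are named `main`, so the second definition shadows
# the first at module scope and only the grid-prediction demo runs here.
if __name__ == '__main__':
    main(encoding='denseangle_param')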
Example 3
    def scatter(self,
                clusters=None,
                components=[1, 2, 3],
                limit=500,
                figsize=(16, 5),
                s=5):
        """ Generates a scatter plot in feature space of the clustered data.
        """
        from scipy.special import comb  # scipy.misc.comb was removed; special.comb is the current location
        from itertools import combinations

        components = [c - 1 for c in components]
        feats, col_array = self._scatter_helper(clusters, limit)
        N_plots = int(comb(len(components), 2, exact=True))

        fig, axes = plt.generate_axes(N_plots, ncols=3, num=1, figsize=figsize)
        for ax, (x, y) in zip(axes, combinations(components, 2)):
            ax.clear()  # Clear the axes before replotting
            plt.scatter(feats[:, x], feats[:, y], colors=col_array, ax=ax, s=s)
            ax.set_xlabel("Component {}".format(x + 1))
            ax.set_ylabel("Component {}".format(y + 1))

        fig.tight_layout()

        return fig
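# Hypothetical usage sketch (the `clusterer` object and file name are
# illustrative assumptions): plot the first three feature-space components for
# two clusters and save the returned figure.
#
#   fig = clusterer.scatter(clusters=[0, 1], components=[1, 2, 3], limit=300)
#   fig.savefig('clusters_feature_space.png')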
def find_outliers(data):
    """
    Finds the outliers in the dataset

    :param data:
    :return:
    """
    scatter(data, ['salary', 'bonus'])

    # Let's find out the possible outliers
    print('Finding possible outliers...')
    potential_outliers = []
    for person in data:
        if data[person]['salary'] > 800000 or data[person]['bonus'] > 6000000 or \
                (data[person]['salary'] == 0 and data[person]['bonus'] == 0 and
                 data[person]['total_payments'] == 0 and data[person]['from_poi_to_this_person'] == 0 and
                 data[person]['total_stock_value'] == 0 and data[person]['from_this_person_to_poi'] == 0 and
                 data[person]['from_messages'] == 'NaN' and data[person]['to_messages'] == 'NaN'):
            potential_outliers.append(person)

    # There is one key which is not an actual name:
    potential_outliers.append('THE TRAVEL AGENCY IN THE PARK')

    print('Found {:,} potential outliers, "{}"'.format(len(potential_outliers), ', '.join(potential_outliers)))
    # Let's examine now the potential outliers
    outliers = []
    for potential_outlier in potential_outliers:
        if data[potential_outlier]["poi"]:
            print('  -> "{}" is excluded for being a POI'.format(potential_outlier))
        elif data[potential_outlier]['from_poi_to_this_person'] + data[potential_outlier]['from_this_person_to_poi'] > 100:
            print('  -> "{}" is excluded for having high interactions with a POI'.format(potential_outlier))
        else:
            outliers.append(potential_outlier)

    print('Found {:,} actual outliers, "{}"'.format(len(outliers), ', '.join(outliers)))

    return outliers
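# A minimal sketch of how the returned keys might be applied (assumes the same
# dict-of-dicts layout used above; remove_outliers is a hypothetical helper,
# not part of the original code):
def remove_outliers(data, outliers):
    for key in outliers:
        data.pop(key, None)
    return data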
Example 6
def create_new_features(data):
    """
    Adds additional features to the dataset

    :param data:
    :return:
    """
    print("Adding features...")
    for person in data:
        from_poi_to_this_person = data[person]['from_poi_to_this_person']
        to_messages = data[person]['to_messages']
        from_messages = data[person]['from_messages']
        from_this_person_to_poi = data[person]['from_this_person_to_poi']

        salary = data[person]['salary']
        bonus = data[person]['bonus']

        # Let's add some interesting ratios
        data[person][
            'from_poi_to_this_person_ratio'] = from_poi_to_this_person / float(
                to_messages)
        data[person][
            'from_this_person_to_poi_ratio'] = from_this_person_to_poi / float(
                from_messages)

        if salary != 0:
            data[person]['bonus_over_salary_ratio'] = bonus / float(salary)
        else:
            data[person]['bonus_over_salary_ratio'] = 0

    # Let's render some charts to help us understand this new features
    scatter(data,
            ['from_poi_to_this_person_ratio', 'from_this_person_to_poi_ratio'])
    histogram(data, 'bonus_over_salary_ratio')

    return data
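# The ratios above divide by raw e-mail counts, which in this dataset can be
# the string 'NaN' (see the checks in find_outliers). A hedged helper sketch
# that guards against that case; `safe_ratio` is a hypothetical name, not part
# of the original code.
import math

def safe_ratio(numerator, denominator):
    """Return numerator/denominator, or 0.0 when either value is missing ('NaN') or zero."""
    try:
        ratio = float(numerator) / float(denominator)
    except (TypeError, ValueError, ZeroDivisionError):
        return 0.0
    return 0.0 if math.isnan(ratio) else ratio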
Example 7
    assert (abs(xlo - xlotest) < 1e-8)
    assert (abs(xhi - xhitest) < 1e-8)

    # initialize plotting parameters
    nplot = 1
    nrow = 2
    ncol = 2
    fig = plt.figure(figsize=figsize)
    ax = fig.add_subplot(nrow, ncol, nplot)

    # plot the population distribution
    plots.scatter(x,
                  y,
                  fig=fig,
                  ax=ax,
                  title='Two-sided Confidence Interval',
                  xlabel='height',
                  ylabel='f(x)',
                  linewidth=2,
                  markersize=0)

    # fill the areas corresponding to the significance level of a
    # 2-sided confidence interval
    xfill = x[x <= xlo]
    yfill = y[x <= xlo]
    ax.fill_between(xfill, yfill, color=plots.BLUE)
    xfill = x[x >= xhi]
    yfill = y[x >= xhi]
    ax.fill_between(xfill, yfill, color=plots.BLUE)
    nplot = nplot + 1
Example 8
def plots():
    plot1 = scatter()
    plot2 = hist()
    return render_template('plots.html', plot_1=plot1, plot_2=plot2)
Example 9
import pandas as pd
from plotnine import *
import plots as plt
import numpy as np

df = pd.read_csv("train.csv")
df.describe()
df.head()
df.columns # 7 col are num, 5 are cat
df.isnull().sum() #age and cabin missing values
df.dtypes

df.Name
    
plt.scatter(df,'Survived','Age')
plt.scatter(df,df.index,'Age')

plt.hist(df,'Cabin',30)
plt.boxplot(df,'Age')

import plotly.plotly as py
from plotly.graph_objs import *

data = {'x': df.Age.values,
        'y': df.Fare.values,
        'z': df.Survived.values,
        'type': 'surface'}

fig = Figure(data=data)
py.plot(fig)
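# Note: `plotly.plotly` has since moved to the separate chart_studio package,
# and a 'surface' trace expects a 2-D z grid. A hedged, purely local sketch of
# plotting the same three 1-D columns as a 3-D scatter with the current Plotly
# API (column names as above):
import plotly.graph_objects as go

fig3d = go.Figure(data=go.Scatter3d(x=df.Age, y=df.Fare, z=df.Survived,
                                    mode='markers', marker=dict(size=3)))
fig3d.show()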
# - Percentages of wine types in the data
print(100 * df.groupby(y).size() / float(len(df)))
# - Means (averages) by wine type
print(df.groupby(y).mean())
# - Standard deviations by wine type
print(df.groupby(y).std())

# We could use all the predictors in a model. 
# But to keep things simple, let's start with two variables only.
# (Things are easier to plot if you have only two dimensions.)
# Choose variables that separate the target classes (wine type) well.
# For example...
# Hint: Try other predictors. Look at the histograms by wine type.

# Plot scatter plot of the chosen vars, colors specify wine type.
plots.scatter(y, chemvars(df), 'fixed_acidity', 'chlorides', alpha=0.2, ymax=0.4)
# Our function allows transformations, see the code in plots.py
# Logarithm compresses high values and opens up the scale at the lower end.
plots.scatter(y, chemvars(df), 'fixed_acidity', 'chlorides', np.log, np.log, alpha=0.2, ymax=0.4)



# Notice: The second part starts here.

# Fit a logistic regression model using two variables.
# More info for logistic regression: http://en.wikipedia.org/wiki/Logistic_regression
# In a regression model, you should have a free constant in the linear combination.
# It is called intercept. 
# (Think for a while a logistic regression with no variables, but either with an intercept
#  or without. The latter always gives p=0.5, the former adjusts to the prior class probabilities
#  of the data.)
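# A minimal sketch of the fit described above (assumes scikit-learn is
# available and that chemvars(df) returns a DataFrame of the numeric
# predictors; the column names match the scatter calls above).
from sklearn.linear_model import LogisticRegression

X = chemvars(df)[['fixed_acidity', 'chlorides']]
model = LogisticRegression(fit_intercept=True)   # free constant: the intercept
model.fit(X, y)
print(model.intercept_, model.coef_)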
Example 12
def main(train=False, encoding_choice='denseangle_param', retrain=False, data_choice='moons', noise=False):
    
    ### First, generate the dataset:
    '''
    # We use the transpose of the (scaled to unit square) Moons dataset in order to see a non-linear decision boundary
    '''
    data_train, data_test, true_labels_train, true_labels_test = generate_data(data_choice, num_points=500, split=True)

    # data_train, true_labels_train   = remove_zeros(data_train, true_labels_train)
    # data_test, true_labels_test     = remove_zeros(data_test, true_labels_test)

    ### Next, generate correct classification parameters for dataset (perfect classification):
    '''
    # Define parameters of model. Start with DenseAngle encoding with fixed parameters.
    '''

    qc_name = '1q-qvm'
    qc = get_qc(qc_name)
    num_shots = 1024
    qubits = qc.qubits()
    init_params = np.random.rand(3)
    if encoding_choice.lower() == 'wavefunction_param':
        init_encoding_params = [ 0 ] # Generalized Wavefunction Encoding initialised to Wavefunction encoding 
    else: 
        init_encoding_params = [np.pi, 2*np.pi]

    if train:
         
        optimiser = 'Powell' 
        params, result_unitary_param = train_classifier(qc, num_shots, init_params, encoding_choice, init_encoding_params, optimiser, data_train, true_labels_train)

        print('The optimised parameters are:', result_unitary_param.x)
        print('These give a cost of:', ClassificationCircuit(qubits, data_train).build_classifier(result_unitary_param.x, encoding_choice, init_encoding_params, num_shots, qc, true_labels_train))
        ideal_params =  result_unitary_param.x
    else:
        if data_choice.lower() == 'moons':
            ### Define Ideal parameters for trained model. Simple model can achieve classification of about 90%
            '''
            # 90% Classification parameters for dense angle encoding
            '''
            if encoding_choice.lower() == 'denseangle_param': ideal_params= [ 2.19342064 , 1.32972029, -0.18308298]

            ### Define Ideal parameters for trained model. Simple model can achieve classification of about 75%
            '''
            # 73% Classification parameters for superdense angle encoding
            '''
            if encoding_choice.lower() == 'superdenseangle_param': ideal_params =  [-0.27365492,  0.83278854,  3.00092961]

            ### Define Ideal parameters for trained model. Simple model can achieve classification of about 85%
            '''
            # 85% Classification parameters for wavefunction encoding
            '''
            if encoding_choice.lower() == 'wavefunction': ideal_params = [0.81647273, 0.41996708, 2.20603541]
            if encoding_choice.lower() == 'wavefunction_param': ideal_params = [0.81647273, 0.41996708, 2.20603541]
        
        elif data_choice.lower() == 'random_vertical_boundary':
            if encoding_choice.lower() == 'superdenseangle_param': ideal_params = [1.606422245361118, 0.23401504261014927, 5.694226283697996]
        
        elif data_choice.lower() == 'random_diagonal_boundary':

            ### Define Ideal parameters for trained model. Simple model can achieve classification of about 90%
            '''
            # 90% Classification parameters for dense angle encoding
            '''
            if encoding_choice.lower() == 'denseangle_param':   ideal_params = [0.8579214,  1.22952647, 4.99408074]
            
            ### Define Ideal parameters for trained model. Simple model can achieve classification of about  %
            '''
            # % Classification parameters for superdense angle encoding
            '''
            if encoding_choice.lower() == 'superdenseangle_param': ideal_params = [2.0101407,  1.05916291, 1.14570489]
            
            ### Define Ideal parameters for trained model. Simple model can achieve classification of about 97%
            '''
            # 97% Classification parameters for wavefunction encoding
            '''
            if encoding_choice.lower() == 'wavefunction':           ideal_params = [0.69409285, 0.0862859, 0.42872711]
            if encoding_choice.lower() == 'wavefunction_param':     ideal_params = [0.69409285, 0.0862859, 0.42872711]

        
    print('These give a cost of:', ClassificationCircuit(qubits, data_test).build_classifier(ideal_params, encoding_choice, init_encoding_params, num_shots, qc, true_labels_test))
    predicted_labels_ideal = ClassificationCircuit(qubits, data_test).make_predictions(ideal_params, encoding_choice, init_encoding_params, num_shots, qc)

    # nisqai.visual.scatter(data_test, true_labels_test, predicted_labels)


    ### Overlay decision boundary
    '''
    # Generate Grid of datapoints to determine and visualise ideal decision boundary
    '''
    num_points = 400
    data_grid, grid_true_labels = generate_data('full_vertical_boundary', num_points)
    data_grid, grid_true_labels = remove_zeros(data_grid, grid_true_labels)

    predicted_labels = ClassificationCircuit(qubits, data_test).make_predictions(ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
    plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
    scatter(data_test, true_labels_test, predicted_labels, **plot_params)

    predicted_labels_grid = ClassificationCircuit(qubits, data_grid).make_predictions(ideal_params, encoding_choice, init_encoding_params, num_shots, qc)

    plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}

    scatter(data_grid, predicted_labels_grid, **plot_params)
    plt.show()


    ## Define noise parameters
    '''
    # Define noise parameters to add to model to determine how classification is affected.
    '''
    if noise: 
        noise_choice = 'amp_damp_before_measurement'
        noise_values = 0.3

    ### Add noise to circuit and classify
    '''
    # Add noise to circuit, and determine number of points classified differently (not mis-classified since we can't achieve perfect classification)
    '''
    if noise:
        noisy_predictions, number_classified_same = generate_noisy_classification(ideal_params, noise_choice, noise_values, encoding_choice, init_encoding_params, qc, num_shots, data_test, predicted_labels_ideal)
        print('The proportion classified differently after noise is:', 1- number_classified_same)

    ## Overlay decision boundary
    '''
    # Generate Grid of datapoints to determine and visualise ideal decision boundary WITH noise added
    '''

    if noise:
        print(noise_choice)
        predicted_labels = ClassificationCircuit(qubits, data_test, noise_choice, noise_values).make_predictions(ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_test, true_labels_test, predicted_labels, **plot_params)

        predicted_labels_grid = ClassificationCircuit(qubits, data_grid, noise_choice, noise_values).make_predictions(ideal_params, encoding_choice, init_encoding_params, num_shots, qc)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)

        plt.show()

    ### Retrain circuit with noise
    '''
    # Given the noise in the circuit, train the parameters of encoding unitary to account for noise. Parameterised unitary parameters are fixed as the ideal ones learned.
    '''
    if retrain:

        if encoding_choice.lower() == 'wavefunction_param': optimiser = 'L-BFGS-B' 
        else:                                               optimiser = 'Powell' 

        if noise:
            encoding_params, result_encoding_param = train_classifier_encoding(qc, noise_choice, noise_values, num_shots, ideal_params, encoding_choice, init_encoding_params, optimiser, data_train, true_labels_train)
            print('The optimised encoding parameters with noise are:', result_encoding_param.x)
            ideal_encoding_params = result_encoding_param.x
        else:
            encoding_params, result_encoding_param = train_classifier_encoding(qc, None, None, num_shots, ideal_params, encoding_choice, init_encoding_params, optimiser, data_train, true_labels_train)
            print('The optimised encoding parameters without noise are:', result_encoding_param.x)
            ideal_encoding_params = result_encoding_param.x
    else:
        ### Define Ideal ENCODING parameters for trained model. Simple model can achieve classification of about 90% with noise, 93% without noise
        '''
        # 90% Classification parameters for dense angle encoding
        '''
        if data_choice.lower() == 'moons' and encoding_choice.lower() == 'denseangle_param' and noise:  
            ideal_encoding_params = [2.23855329, 7.57781576]
            '''
            # 93% Classification parameters for dense angle encoding without noise
            '''
        elif data_choice.lower() == 'moons' and encoding_choice.lower() == 'denseangle_param':
            ideal_encoding_params =  [3.05615259, 7.61215138]  # No noise

        ### Define Ideal ENCODING parameters for trained model. Simple model can achieve classification of about 90%
        '''
        # NO NOISE  - 74-77% Classification parameters with training for superdense angle encoding  
        # NOISE     - Classification parameters for superdense angle encoding (0.3 amp damp = 20% different classification - 69% accuracy with noise before encoding training)
        #             With learned encoding - 
        '''
        if data_choice.lower() == 'moons' and encoding_choice.lower() == 'superdenseangle_param' and noise: 
            ideal_encoding_params =  [3.31296568, 6.34142188]

        elif data_choice.lower() == 'moons' and encoding_choice.lower() == 'superdenseangle_param':
            ideal_encoding_params = [2.86603822, 6.14328274] # No noise
        
        ### Define Ideal ENCODING parameters for trained model. Simple model can achieve classification of about 90%
        '''
        # NO NOISE  - 82-84% Classification parameters with training for generalised wavefunction encoding  
        # NOISE     - Classification parameters for superdense angle encoding (0.3 amp damp = 20% different classification - 78% accuracy with noise before encoding training)
        #             With learned encoding - 
        '''
        print(data_choice.lower(), encoding_choice.lower())
        if data_choice.lower() == 'moons' and encoding_choice.lower() == 'wavefunction_param' and noise: 
            ideal_encoding_params =  [0.02884417]
        elif data_choice.lower() == 'moons' and encoding_choice.lower() == 'wavefunction_param':
            ideal_encoding_params = [0.01582773] # No noise
            

    if noise:
        print('These give a cost with the noisy circuit of:',\
            ClassificationCircuit(qubits, data_test, noise_choice, noise_values).build_classifier(ideal_params, encoding_choice, ideal_encoding_params , num_shots, qc, true_labels_test) )
    else:       
        print('These give a cost with the ideal circuit of:',\
            ClassificationCircuit(qubits, data_test).build_classifier(ideal_params, encoding_choice, ideal_encoding_params , num_shots, qc, true_labels_test) )

    ### Add noise to circuit and classify
    '''
    # Using learned encoding parameters, check again proportion misclassified
    '''
    if noise:
        noisy_predictions, number_classified_same = generate_noisy_classification(ideal_params, noise_choice, noise_values, encoding_choice, ideal_encoding_params, qc, num_shots, data_test, predicted_labels)
        print('The proportion classified differently after noise with learned encoding is:', 1 - number_classified_same)

    ## Overlay decision boundary
    '''
    # Generate Grid of datapoints to determine and visualise ideal decision boundary WITH/WITHOUT noise added
    '''
    if noise:
        predicted_labels = ClassificationCircuit(qubits, data_test, noise_choice, noise_values).make_predictions(ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)

        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_test,  true_labels_test, predicted_labels, **plot_params)

        predicted_labels_grid = ClassificationCircuit(qubits, data_grid, noise_choice, noise_values).make_predictions(ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()
    else:
        predicted_labels = ClassificationCircuit(qubits, data_test).make_predictions(ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_test, true_labels_test,  predicted_labels, **plot_params)
        
        predicted_labels_grid = ClassificationCircuit(qubits, data_grid).make_predictions(ideal_params, encoding_choice, ideal_encoding_params, num_shots, qc)
        
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()
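# The encodings used above map each 2-D data point onto a single qubit. A
# hedged numpy sketch of the parameterised dense angle encoding implied by
# init_encoding_params = [theta, phi] (it matches the analytic expression in
# correct_function further below); an illustration, not the library's routine.
import numpy as np

def dense_angle_state(x, theta=np.pi, phi=2 * np.pi):
    x1, x2 = x
    return np.array([np.cos(theta * x1),
                     np.exp(1j * phi * x2) * np.sin(theta * x1)])

state = dense_angle_state([0.3, 0.7])
print(np.abs(state) ** 2)   # measurement probabilities the classifier thresholds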
Example 13
    xs1 = np.array(range(x1, x2 + 1))
    probs = stats.binom.pmf(xs1, n, p)
    xs2 = np.arange(x1, x2, 1e-1)
    proba = stats.norm.pdf((xs2 - (n * p))/(math.sqrt((n * p) * (1 - p))))

    # ----------------------------------------
    # plotting
    # ----------------------------------------
    probs = np.array(probs).round(4)
    fig, ax1 = plots.barplot(xs1, probs
        ,title = 'Normal Approximation of Binomial Distribution; p = {0:.8}, n = {1}'.format(p, n)
        ,align = 'edge'
        ,edgecolor = edgecolor
        ,show = False, close = False)
    ax2 = ax1.twinx()
    fig, ax2 = plots.scatter(xs2 + 0.5, proba, fig = fig, ax = ax2, markersize = 0, linewidth = 2)
    ax2.set_title('')
    print('')

    # ----------------------------------------
    # sample calculations
    # ----------------------------------------
    p = 1e-5        # probability of E occurring
    n = 16e6        # in this many trials
    x = 150         # contains this many occurrences of E
    h = 1           # step size
    xs1 = np.arange(0, x + h, 1)
    probs = stats.binom.pmf(xs1, n, p)
    probcum = probs.sum()
    print('Probability that at most {0} occurrences of E occur in {1} trials is P(X <= {2}) = {3:.8}'\
        .format(x, n, x, probcum))
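    # A quick numeric check of the approximation named in the plot title above
    # (assumes scipy is available): the continuity-corrected normal CDF with
    # mu = n*p and sigma = sqrt(n*p*(1-p)) should be close to the exact sum.
    mu = n * p
    sigma = math.sqrt(n * p * (1 - p))
    approx = stats.norm.cdf((x + 0.5 - mu) / sigma)
    print('Normal approximation of P(X <= {0}) = {1:.8} (exact {2:.8})'
        .format(x, approx, probcum))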
Example 14
    # ----------------------------------------------------------------------
    mu0 = 175
    mua = 179
    xmin = min(mu0, mua) - (5 * sigma / math.sqrt(n))
    xmax = max(mu0, mua) + (5 * sigma / math.sqrt(n))
    x = np.linspace(xmin, xmax, 500)
    y0 = stats.norm.pdf(x, loc=mu0, scale=sigma / math.sqrt(n))
    ya = stats.norm.pdf(x, loc=mua, scale=sigma / math.sqrt(n))
    ymin = min(y0.min(), ya.min())
    ymax = max(y0.max(), ya.max())

    # plot both distributions
    fig, ax = plots.scatter(x,
                            y0,
                            ylim=(0, max(y0.max(), ya.max())),
                            xlabel='height',
                            ylabel='f(x)',
                            markersize=0,
                            linewidth=2,
                            color=plots.BLUE)
    plots.scatter(x,
                  ya,
                  fig=fig,
                  ax=ax,
                  ylim=(0, max(y0.max(), ya.max())),
                  xlabel='height',
                  ylabel='f(x)',
                  markersize=0,
                  linewidth=2,
                  color=plots.RED)

    # find the acceptance region and fill it
Example 15
    xs = np.arange(xstart, xend, h)
    pdfvals = stats.expon.pdf(xs, 0, 1 / lamb).round(8)

    # ----------------------------------------
    # plotting
    # ----------------------------------------
    xs = np.array(xs).round(4)
    pdfvals = np.array(pdfvals).round(2)
    fig, ax1 = plots.barplot(xs, pdfvals
        ,title = 'Exponential Distribution; lambda = {0:.8}'.format(lamb)
        ,align = 'edge'
        ,edgecolor = edgecolor
        ,width = h
        ,show = False, close = False)
    ax2 = ax1.twinx()
    fig, ax2 = plots.scatter(xs, pdfvals, fig = fig, ax = ax2
        ,ylim = ax1.get_ylim(), markersize = 0, linewidth = 2)
    ax2.set_title('')

    # ----------------------------------------
    # sample calculations 1
    # ----------------------------------------
    prob1 = stats.expon.cdf(x, 0, 1 / lamb)
    prob1calc = 1 - math.exp(-x * lamb)
    prob2 = 1 - prob1
    assert(0.082 - round(prob2, 3) == 0)
    assert(round(prob1 - prob1calc, 8) == 0)
    print('{0}The probability that event E will occur in <= {1:.2} units is {2:.8}'\
        .format(space, x, prob1))

    x1 = 2 / 60
    x2 = 3 / 60
Example 16
def file_size_analysis(major_extensions_file):

    trep = get_valid_repos()

    rep_size = pd.read_csv(major_extensions_file)
    print('avg file mean', rep_size.avg_size.mean() / KILOBYTE)
    print('std file mean', rep_size.std_size.mean() / KILOBYTE)
    print('avg capped file mean', rep_size.capped_avg_file.mean() / KILOBYTE)
    print('std capped file mean', rep_size.capped_std_file.mean() / KILOBYTE)
    print('avg capped file mean', rep_size.capped_avg_file.mean() / KILOBYTE)
    print('std capped file mean/avg capped file mean',
          rep_size.capped_std_file.mean() / rep_size.capped_avg_file.mean())

    treps = pd.merge(trep, rep_size, on='repo_name')
    print(rep_size.capped_avg_file.describe())

    size_25_q = rep_size.capped_avg_file.quantile(0.25)
    print("size 25 quantile", size_25_q, "in kb", size_25_q / KILOBYTE)
    size_75_q = rep_size.capped_avg_file.quantile(0.75)
    print("size 75 quantile", size_75_q, "in kb", size_75_q / KILOBYTE)

    treps['size_group'] = treps.apply(
        lambda x: 'Lower 25' if x.capped_avg_file < size_25_q else "top 25"
        if x.capped_avg_file > size_75_q else "Middle",
        axis=1)

    print('top 10 prob',
          1.0 * len(treps[treps.quality_group == 'Top 10']) / len(treps))
    top_10_in_l25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.size_group == 'Lower 25')]) / len(
                                        treps[treps.size_group == 'Lower 25'])
    print('top 10 prob in lower 25', top_10_in_l25)
    top_10_in_t25 = 1.0 * len(treps[(treps.quality_group == 'Top 10')
                                    & (treps.size_group == 'top 25')]) / len(
                                        treps[treps.size_group == 'top 25'])
    print('top 10 prob in top 25', top_10_in_t25)
    print("short files lift ", top_10_in_l25 / top_10_in_t25 - 1)

    group_by_size = treps.groupby(['size_group'],
                                  as_index=False).agg({'y2019_ccp': 'mean'})
    print(group_by_size)

    print("all files")
    print(
        treps.groupby('quality_group').agg({
            'capped_avg_file': 'mean',
            'avg_size': 'mean',
            'files': 'sum',
            'repo_name': 'count'
        }))

    for i in lang_name:
        print(i, " files")
        print(treps[(treps.major_extension_ratio > DOMINANT_RATE)
                    & (treps.major_extension == lang_extension[i])].groupby(
                        'quality_group').agg({
                            'capped_avg_file': 'mean',
                            'avg_size': 'mean',
                            'files': 'sum',
                            'repo_name': 'count'
                        }))

    print("Size controled by developer groups")
    pretty_print(
        pair_analysis_by_dev_num_group(treps, 'size_group', 'y2019_ccp'))

    print("Size controled by project age")
    pretty_print(pair_analysis_by_age_group(treps, 'size_group', 'y2019_ccp'))

    scatter(treps,
            first_metric='y2019_ccp',
            second_metric='capped_avg_file',
            output_file=os.path.join(FIGURES_PATH,
                                     r'ccp_vs_length_scatter.html'),
            mode='markers',
            opacity=0.9)
    pair_analysis_by_bins_to_file(treps,
                                  'y2019_ccp',
                                  'capped_avg_file',
                                  output_file=os.path.join(
                                      DATA_PATH, 'ccp_vs_length_bins.csv'),
                                  bins=10)

    return treps
Example 17
def plots():
    plot1 = scatter()
    return render_template('plots.html', plot_1=plot1)
Example 18
                    xlabel=['height'],
                    ylabel=['count'])

    # create array of x values for calculating pdf values
    xmin = dfPop.loc[:, 'height'].min()
    xmax = dfPop.loc[:, 'height'].max()
    x = np.linspace(xmin, xmax, 500)

    # plot normal probability density function with population mean and variance
    pdf = pdfnorm(x, mu, sigma)
    ax = ax.twinx()
    plots.scatter(x,
                  pdf,
                  fig=fig,
                  ax=ax,
                  ylim=(pdf.min(), pdf.max()),
                  title='',
                  markersize=0,
                  linewidth=2,
                  color=plots.RED)
    nplot = nplot + 1

    # NOTE that the shape of the population distribution is normal.

    # histogram of sample
    ax = fig.add_subplot(nrow, ncol, nplot)
    plots.histogram(dfSamp,
                    fig=fig,
                    ax=ax,
                    numBins=numBins,
                    title='Single Sample',
def main(train=False,
         encoding='denseangle_param',
         ideal=False,
         noise=False,
         analytic=False,
         compare=False):
    """
    # Find optimal parameters for linear decision boundary and add noise
    """

    ### First, generate the dataset:
    '''
    # We use the transpose of the (scaled to unit square) Moons dataset in order to see a non-linear decision boundary
    '''
    data_vertical_train, data_vertical_test, true_labels_train, true_labels_test = generate_data(
        'random_vertical_boundary', num_points=500, split=True)

    ### Next, generate correct classification parameters for dataset (perfect classification):
    '''
    # Define parameters of model. Start with DenseAngle encoding with fixed parameters.
    '''

    qc_name = '1q-qvm'
    qc = get_qc(qc_name)
    num_shots = 1024
    device_qubits = qc.qubits()
    classifier_qubits = device_qubits
    n_layers = 1
    init_params = np.random.rand(3)
    if encoding.lower() == 'denseangle_param':
        encoding_choice = 'denseangle_param'
        # init_encoding_params = [np.pi, 2*np.pi]
        init_encoding_params = [np.pi, 2 * np.pi]

    elif encoding.lower() == 'wavefunction' or encoding.lower(
    ) == 'wavefunction_param':
        encoding_choice = 'wavefunction_param'
        init_encoding_params = [0]

    optimiser = 'Powell'

    if train:
        ### Train model, and check classification result of ideal parameters found
        '''
        # Train model using scipy.optimize
        '''
        params, result_unitary_param = train_classifier(
            qc, num_shots, init_params, encoding_choice, init_encoding_params,
            optimiser, data_vertical_train, true_labels_train)
        print('The optimised parameters are:', result_unitary_param.x)
        print('These give a cost of:', ClassificationCircuit(classifier_qubits, data_vertical_train).build_classifier(result_unitary_param.x, n_layers, \
                                                                            encoding_choice, init_encoding_params, num_shots, qc, true_labels_train))
        ideal_params_vertical = result_unitary_param.x
    else:
        ### Define Ideal parameters for trained model learned from a previous run. Simple model can achieve classification of about 90%

        if encoding_choice.lower() == 'denseangle_param':
            '''
            # 100% Classification parameters (modulo points on the boundary)
            '''
            # ideal_params_vertical = [3.8208,1.525,0.0808]
            ideal_params_vertical = [1.67814786, 1.56516469, 1.77820848]
        elif encoding_choice.lower() == 'wavefunction_param':
            '''
            # 78% Classification parameters (modulo points on the boundary)
            '''
            ideal_params_vertical = [2.2921198, 0.61375299, -5.15252796]

    plt.rcParams.update({
        "font.size": 20,
        "font.serif": "Computer Modern Roman"
    })

    ### Overlay decision boundary
    '''
    # Generate Grid of datapoints to determine and visualise ideal decision boundary
    '''
    data_choice = 'full_vertical_boundary'
    num_grid_points = 1000
    data_grid, grid_true_labels = generate_data(data_choice, num_grid_points)
    data_grid, grid_true_labels = remove_zeros(data_grid, grid_true_labels)

    if ideal:

        predicted_labels_test = ClassificationCircuit(classifier_qubits, data_vertical_test, qc).make_predictions(ideal_params_vertical,  n_layers, \
                                                                                        encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_vertical_test, true_labels_test, predicted_labels_test,
                **plot_params)

        predicted_labels_grid = ClassificationCircuit(classifier_qubits, data_grid, qc).make_predictions(ideal_params_vertical, n_layers,\
                                                                             encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid, **plot_params)
        plt.show()

    ### Define noise parameters
    '''
        # Define noise parameters to add to model to determine how classification is affected.
    '''

    noise_choice = 'amp_damp_before_measurement'
    noise_values = 0.4

    ### Add noise to circuit and classify
    '''
        # Add noise to circuit, and determine number of points classified differently (not mis-classified since we can't achieve perfect classification)
    '''

    if noise:
        ## Overlay decision boundary
        '''
        # Generate Grid of datapoints to determine and visualise ideal decision boundary WITH noise added
        '''
        predicted_labels_test_noise = ClassificationCircuit(classifier_qubits, data_vertical_test, qc,\
                     noise_choice, noise_values).make_predictions(ideal_params_vertical, n_layers, encoding_choice, init_encoding_params, num_shots)
        plot_params = {'colors': ['blue', 'orange'], 'alpha': 1}
        scatter(data_vertical_test, true_labels_test,
                predicted_labels_test_noise, **plot_params)

        predicted_labels_grid_noise = ClassificationCircuit(classifier_qubits, data_grid, qc,\
                                                            noise_choice, noise_values).make_predictions(ideal_params_vertical, n_layers, \
                                                            encoding_choice, init_encoding_params, num_shots)

        plot_params = {'colors': ['red', 'green'], 'alpha': 0.2}
        scatter(data_grid, predicted_labels_grid_noise, **plot_params)
        plt.show()
    '''
    # Define function to compute points which will remain correctly classified after noise is added
    '''
    def correct_function(data_point, params, encoding_choice, encoding_params):
        [alpha_1, alpha_2, alpha_3] = params
        [x_1, x_2] = data_point

        if encoding_choice.lower() == 'denseangle_param':
            [theta, phi] = encoding_params
            function = (np.sin(alpha_2) )**2 * ( np.cos(theta * x_1) )**2  + (np.cos(alpha_2))**2 * (np.sin(theta * x_1))**2 \
                        + ((1/2)*(np.sin(2 * alpha_2) * np.sin(2 * theta * x_1) * np.exp(-1j*(2 * alpha_3 + phi * x_2)))).real
        elif encoding_choice.lower() == 'wavefunction_param':
            [theta] = encoding_params
            l2_norm = np.linalg.norm(np.array([x_1, x_2]))**2
            function = (np.sin(alpha_2)**2 ) * ( x_1**2/(l2_norm) )  + (np.cos(alpha_2)**2) * (x_2**2/(l2_norm)) \
                        + ((1/(2*l2_norm))*(np.sin(2 * alpha_2) * (x_1) * (x_2) * np.exp(-1j*(2 * alpha_3)))).real

        return function

    def compute_analytic_misclassifed_condition(data, params, encoding_choice,
                                                encoding_params,
                                                noise_strength, true_labels):
        correct_classification_labels = []
        for ii, data_point in enumerate(data):

            function = correct_function(data_point, params, encoding_choice,
                                        encoding_params)
            if true_labels[ii] == 0:
                correct_classification_labels.append(
                    0
                )  # If datapoint was zero originally, it will be correctly classified regardless of noise

            else:
                if function > 1 / (
                        2 * (1 - noise_strength)
                ):  # If data point was classified as 1, it will be correctly classified if condition is met.
                    correct_classification_labels.append(0)

                else:
                    correct_classification_labels.append(1)
        number_robust = 1 - sum(correct_classification_labels) / len(
            correct_classification_labels)  # proportion of points that remain correctly classified
        return np.array(correct_classification_labels), number_robust

    def plot_number_misclassified_amp_damp(ideal_params, num_shots, num_points,
                                           qc, noise_values):

        points_noise_inc = []

        data_vertical_train, data_vertical_test, true_labels_train, true_labels_test = generate_data('random_vertical_boundary',\
                                                                                                     num_points=num_points, split=True)
        interval = 0.2
        encoding_choice = 'denseangle_param'
        theta = np.arange(0, 2 * np.pi, interval)
        phi = np.arange(0, 2 * np.pi, interval)
        X, Y = np.meshgrid(theta, phi)
        noise_choice = 'amp_damp_before_measurement'
        test_acc_ideal = np.zeros((theta.shape[0], phi.shape[0]), dtype=float)

        test_acc_noise = np.zeros((theta.shape[0], phi.shape[0]), dtype=float)
        number_robust = np.zeros((theta.shape[0], phi.shape[0]), dtype=float)

        for ii, t in enumerate(theta):
            for jj, p in enumerate(phi):
                temp_encoding_params = [t, p]

                # Classification of encoding parameters *without* noise
                ideal_predictions, test_acc_ideal[ii,jj]  = generate_noisy_classification(ideal_params, 1, None, None,\
                                                                                                    encoding_choice, temp_encoding_params, qc,\
                                                                                                    classifier_qubits, num_shots, data_vertical_test, true_labels_test)

                # Learned encoding parameters *with* noise
                noisy_predictions, test_acc_noise[ii,jj]  = generate_noisy_classification(ideal_params, 1, noise_choice, noise_values,\
                                                                                                    encoding_choice, temp_encoding_params, qc,\
                                                                                                    classifier_qubits, num_shots, data_vertical_test, true_labels_test)
                # Number expected to be robust under analytic condition
                correct_classification_labels, number_robust[ii, jj] = compute_analytic_misclassifed_condition(data_vertical_test, ideal_params_vertical,\
                                                                                                                encoding_choice, temp_encoding_params,\
                                                                                                                noise_values, true_labels_test)

                print('Theta, Phi is:', t, p)
                print('Test accuracy ideal:', test_acc_ideal[ii, jj])
                print('Test accuracy with noise:', test_acc_noise[ii, jj])
                print('Proportion robust:', number_robust[ii, jj])

        max_acc_indices_ideal = np.unravel_index(
            np.argmax(test_acc_ideal, axis=None), test_acc_ideal.shape)
        max_acc_indices = np.unravel_index(
            np.argmax(test_acc_noise, axis=None), test_acc_noise.shape)
        max_robust_indices = np.unravel_index(
            np.argmax(number_robust, axis=None), number_robust.shape)

        plt.rcParams.update({"font.size": 14, "font.family": "serif"})

        # ----------------------
        # Uncomment below for 3d plots
        # ----------------------

        # fig = plt.figure(figsize=plt.figaspect(0.33))
        # ax1 = fig.add_subplot(1, 3, 1, projection='3d')
        # surf1 = ax1.plot_surface(X, Y, test_acc_ideal, cmap=cm.coolwarm_r,linewidth=0, antialiased=False)
        # # ax1.set_zlim(0.45, 1.01)
        # cbar1 =fig.colorbar(surf1)
        # cbar1.ax.set_ylabel('Test Accuracy')

        # ax2 = fig.add_subplot(1, 3, 2, projection='3d')
        # surf2 = ax2.plot_surface(X, Y, test_acc_noise, cmap=cm.coolwarm_r, linewidth=0, antialiased=False)
        # # ax2.set_zlim(0.45, 1.01)
        # cbar2 = fig.colorbar(surf2)
        # cbar2.ax.set_ylabel('Test Accuracy')

        # ax3 = fig.add_subplot(1, 3, 3, projection='3d')

        # surf3 = ax3.plot_surface(X, Y, number_robust, cmap=cm.PuOr, linewidth=0, antialiased=False)
        # cbar3 = fig.colorbar(surf3)
        # cbar3.ax.set_ylabel('Proportion robust')

        # ax1.set_ylabel(r'$\theta (rads)$')
        # ax1.set_xlabel(r'$\phi (rads)$' )
        # ax1.set_title(  'Best accuracy ideal: '             + str( round( test_acc_ideal[max_acc_indices_ideal] , 2) ) \
        #                 + '\nBest accuracy with noise: '    + str( round( test_acc_noise[max_acc_indices_ideal] , 2) ) \
        #                 + '\nRobustness: '                  + str( round( number_robust[max_acc_indices_ideal]  , 2) ) + '\n' \
        #                 + r'$[\theta, \phi]$ = '            + '['+str(round(theta [ max_acc_indices_ideal[0] ], 2) )+ ', ' + str( round( phi [ max_acc_indices_ideal[1] ] , 2) ) + ']' )

        # ax2.set_ylabel(r'$\theta (rads)$')
        # ax2.set_xlabel(r'$\phi (rads)$' )
        # ax2.set_title(  'Best accuracy with noise: '    + str( round( test_acc_noise[max_acc_indices]   , 2) ) \
        #                 + '\nBest accuracy ideal: '     + str( round( test_acc_ideal[max_acc_indices]   , 2) ) \
        #                 + '\nRobustness: '               + str( round( number_robust[max_acc_indices]    , 2) ) + '\n' \
        #                 + r'$[\theta, \phi]$ = '        + '['+str(theta [ max_acc_indices[0] ])+ ', ' + str(round( phi [ max_acc_indices[1] ], 2) ) + ']' )

        # ax3.set_ylabel(r'$\theta (rads)$')
        # ax3.set_xlabel(r'$\phi (rads)$' )
        # ax3.set_title('Max. robustness: '               + str( round( number_robust[max_robust_indices]  , 2) ) \
        #                 +'\nBest accuracy with noise: ' + str( round( test_acc_noise[max_robust_indices] , 2) ) \
        #                 +'\nBest accuracy ideal: '      + str( round( test_acc_ideal[max_robust_indices] , 2) ) + '\n'\
        #                 +r'$[\theta, \phi]$ = '         + '[' + str(theta [ max_robust_indices[0] ]) \
        #                                                 + ', ' + str(phi [ max_robust_indices[1] ] ) + ']' )

        ## 2D PLOTS
        fig, ax = plt.subplots(1, 3)
        im0 = ax[0].imshow(test_acc_ideal,
                           cmap=cm.coolwarm_r,
                           extent=[0, 2 * np.pi, 2 * np.pi, 0])
        divider = make_axes_locatable(ax[0])
        cax = divider.append_axes('right', size='5%', pad=0.1)
        cbar0 = fig.colorbar(im0, cax=cax, orientation='vertical')

        cbar0.ax.set_ylabel('Test Accuracy')

        im1 = ax[1].imshow(test_acc_noise,
                           cmap=cm.coolwarm_r,
                           extent=[0, 2 * np.pi, 2 * np.pi, 0])
        divider = make_axes_locatable(ax[1])
        cax = divider.append_axes('right', size='5%', pad=0.1)
        cbar1 = fig.colorbar(im1, cax=cax, orientation='vertical')

        cbar1.ax.set_ylabel('Test Accuracy')

        im2 = ax[2].imshow(number_robust,
                           cmap=cm.PuOr,
                           extent=[0, 2 * np.pi, 2 * np.pi, 0])
        divider = make_axes_locatable(ax[2])
        cax = divider.append_axes('right', size='5%', pad=0.1)
        cbar2 = fig.colorbar(im2, cax=cax, orientation='vertical')

        cbar2.ax.set_ylabel('Proportion robust')

        ax[0].set_title(  'Best accuracy ideal: '             + str( round( test_acc_ideal[max_acc_indices_ideal] , 2) ) \
                        + '\nBest accuracy with noise: '    + str( round( test_acc_noise[max_acc_indices_ideal] , 2) ) \
                        + '\nRobustness: '                  + str( round( number_robust[max_acc_indices_ideal]  , 2) ) + '\n' \
                        + r'$[\theta, \phi]$ = '            + '['+str(round(theta [ max_acc_indices_ideal[0] ], 2) )+ ', ' + str( round( phi [ max_acc_indices_ideal[1] ] , 2) ) + ']' )

        ax[1].set_title(  'Best accuracy with noise: '    + str( round( test_acc_noise[max_acc_indices]   , 2) ) \
                        + '\nBest accuracy ideal: '     + str( round( test_acc_ideal[max_acc_indices]   , 2) ) \
                        + '\nRobustness: '               + str( round( number_robust[max_acc_indices]    , 2) ) + '\n' \
                        + r'$[\theta, \phi]$ = '        + '['+str(theta [ max_acc_indices[0] ])+ ', ' + str(round( phi [ max_acc_indices[1] ], 2) ) + ']' )

        ax[2].set_title('Max. robustness: '               + str( round( number_robust[max_robust_indices]  , 2) ) \
                        +'\nBest accuracy with noise: ' + str( round( test_acc_noise[max_robust_indices] , 2) ) \
                        +'\nBest accuracy ideal: '      + str( round( test_acc_ideal[max_robust_indices] , 2) ) + '\n'\
                        +r'$[\theta, \phi]$ = '         + '[' + str(theta [ max_robust_indices[0] ]) \
                                                        + ', ' + str(phi [ max_robust_indices[1] ] ) + ']' )

        return

    if analytic:
        correct_classification_labels, number_robust = compute_analytic_misclassifed_condition(data_grid, ideal_params_vertical,\
                                                                                                encoding_choice, init_encoding_params,\
                                                                                                noise_values, grid_true_labels)
        plot_params = {'colors': ['blue', 'black'], 'alpha': 0.3}

        scatter(data_grid, correct_classification_labels, **plot_params)
        plt.show()

    if compare:
        plot_number_misclassified_amp_damp(ideal_params_vertical, num_shots,
                                           500, qc, noise_values)
        plt.show()
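# A hedged usage sketch for this entry point (the flag combination is chosen
# for illustration): draw the ideal decision boundary, the noisy one, and the
# analytic robustness overlay in one run.
if __name__ == '__main__':
    main(train=False, encoding='denseangle_param', ideal=True, noise=True,
         analytic=True, compare=False)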
Example 20
st.plotly_chart(plot.question1(base))

st.markdown("## Final Data Set")
st.write(base)
st.write(base.shape)

st.plotly_chart(
    plot.scatter_poor_rich(base.copy(),
                           x='SP_DYN_TFRT_IN',
                           x_name='Fertility Rate',
                           y='NY_GDP_PCAP_CD',
                           y_name='GDP per capita'))
st.plotly_chart(
    plot.scatter(base.copy(),
                 x='SP_DYN_TFRT_IN',
                 x_name='Fertility Rate',
                 y='NY_GDP_PCAP_CD',
                 y_name='GDP per capita'))
st.plotly_chart(
    plot.world_map(base.copy(), y='SP_DYN_TFRT_IN', y_name='Fertility Rate'))

st.markdown("## Features")
with st.echo():
    # GET DATA PER COLUMN
    na_percent = []
    na_total = []
    minimum = []
    maximum = []

    for col in base.columns:
        na_percent.append(
def file_length_per_language(major_extensions_file, commits_per_user_file,
                             image_file):

    ext = pd.read_csv(major_extensions_file)

    dominant = ext[ext.major_extension_ratio > DOMINANT_RATE]

    trep = get_valid_repos()

    major = pd.merge(trep, dominant, left_on='repo_name', right_on='repo_name')

    users_per_project = pd.read_csv(commits_per_user_file)
    users_per_project = users_per_project[users_per_project.year == 2019]
    trepu = pd.merge(major, users_per_project, on='repo_name')

    trepu['commit_per_user'] = trepu.apply(lambda x: x.y2019_commits / x.users
                                           if x.users > 0 else None,
                                           axis=1)
    trepu['commit_per_user_above_11'] = trepu.apply(
        lambda x: x.users_above_11_commits / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)

    trepu['commit_per_user_cap'] = trepu.apply(
        lambda x: x.users_capped_commit / x.users if x.users > 0 else None,
        axis=1)
    trepu['commit_per_user_above_11_cap'] = trepu.apply(
        lambda x: x.commits_above_11_500_cap / x.users_above_11
        if x.users_above_11 > 0 else None,
        axis=1)

    agg_lang = trepu[trepu.major_extension.isin(language_extensions)].groupby(
        'major_extension', as_index=False).agg({
            'repo_name': 'count',
            'y2019_ccp': {'mean', 'std'},
            'commit_per_user_above_11_cap': {'mean', 'std'}
        })

    agg_lang.columns = agg_lang.columns.droplevel()
    agg_lang.columns = [
        u'langauge', u'projects', u'ccp_mean', u'ccp_std', u'speed_mean',
        u'speed_std'
    ]

    agg_lang_quality = trepu[trepu.major_extension.isin(
        language_extensions)].groupby(['major_extension', 'quality_group'],
                                      as_index=False).agg({
                                          'repo_name': 'count',
                                          'commit_per_user_above_11_cap':
                                          {'mean', 'std'}
                                      })
    agg_lang_quality.columns = agg_lang_quality.columns.droplevel()
    """
    agg_lang_quality = agg_lang_quality.rename(columns={
        'major_extension' : u'langauge'
        , 'std': u'speed_std'
        , 'mean': u'speed_mean'
        , 'count': u'projects'
    })
    """
    agg_lang_quality.columns = [
        u'langauge', u'quality_group', u'projects', u'speed_mean', u'speed_std'
    ]

    all_speed_mean = []
    all_speed_std = []

    top_speed_mean = []
    top_speed_std = []
    other_speed_mean = []
    other_speed_std = []
    ccp_mean = []
    ccp_std = []
    for i in language_extensions:
        top_speed_mean.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_mean))
        top_speed_std.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Top 10')].iloc[0].projects)))
        other_speed_mean.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_mean))
        other_speed_std.append(
            round(
                agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Others')].iloc[0].projects)))
        ccp_mean.append(
            round(100 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean))
        ccp_std.append(100 * round(
            agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std /
            math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects)))
        all_speed_mean.append(
            round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        all_speed_std.append(
            round(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std /
                  math.sqrt(agg_lang[(agg_lang.langauge
                                      == i)].iloc[0].projects)))

    trace0 = go.Bar(x=lang_name,
                    y=all_speed_mean,
                    name='Speed',
                    error_y=dict(type='data',
                                 array=all_speed_std,
                                 visible=True))
    trace1 = go.Bar(x=lang_name,
                    y=top_speed_mean,
                    name='Top Speed',
                    error_y=dict(type='data',
                                 array=top_speed_std,
                                 visible=True))

    trace2 = go.Bar(x=lang_name,
                    y=other_speed_mean,
                    name='Other Speed',
                    error_y=dict(type='data',
                                 array=other_speed_std,
                                 visible=True))

    trace3 = go.Bar(x=lang_name,
                    y=ccp_mean,
                    name='CCP',
                    error_y=dict(type='data', array=ccp_std, visible=True))
    data = [trace0, trace1, trace2, trace3]

    layout = go.Layout(
        barmode='group',
        title='Speed and CCP per language',
        xaxis=dict(title='Language',
                   titlefont=dict(family='Courier New, monospace',
                                  size=24,
                                  color='#7f7f7f')),
        yaxis=dict(title='Commit per developer, CCP',
                   titlefont=dict(family='Courier New, monospace',
                                  size=24,
                                  color='#7f7f7f')))

    fig = go.Figure(data=data, layout=layout)
    plot(fig, image='png', image_filename=image_file, output_type='file')

    print(r"\begin{tabular}{| l| l| l| l| l| l|}")
    print(r"   \hline ")
    Title = r" Metric & Projects & CCP & Speed & Top Speed & Others Speed  \\ \hline"
    print(Title)
    for i in agg_lang.sort_values('ccp_mean').langauge.tolist():
        Line = str(lang_by_extension(i))

        Line = Line + " & " + str(agg_lang[(agg_lang.langauge
                                            == i)].iloc[0].projects)

        Line = Line + " & " + str(
            round(1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_mean, 2))
        Line = Line + r" $\pm$ " + str(
            round(
                1 * agg_lang[(agg_lang.langauge == i)].iloc[0].ccp_std /
                math.sqrt(agg_lang[(agg_lang.langauge == i)].iloc[0].projects),
                3))

        Line = Line + " & " + str(
            int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(
            int(agg_lang[(agg_lang.langauge == i)].iloc[0].speed_std /
                math.sqrt(agg_lang[
                    (agg_lang.langauge == i)].iloc[0].projects)))

        Line = Line + " & " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Top 10'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Top 10')].iloc[0].projects)))
        Line = Line + " & " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_mean))
        Line = Line + r" $\pm$ " + str(
            int(agg_lang_quality[(agg_lang_quality.langauge == i)
                                 & (agg_lang_quality.quality_group == 'Others'
                                    )].iloc[0].speed_std /
                math.sqrt(agg_lang_quality[(agg_lang_quality.langauge == i)
                                           & (agg_lang_quality.quality_group ==
                                              'Others')].iloc[0].projects)))

        Line = Line + r" \\ \hline"
        print(Line)

    scatter(trepu,
            first_metric='y2019_ccp',
            second_metric='commit_per_user_above_11_cap',
            output_file=os.path.join(FIGURES_PATH,
                                     r'ccp_vs_speed_scatter.html'),
            mode='markers',
            opacity=0.9)

    pair_analysis_by_bins_to_file(trepu,
                                  'y2019_ccp',
                                  'commit_per_user_above_11_cap',
                                  output_file=os.path.join(
                                      DATA_PATH, 'ccp_vs_speed_bins.csv'),
                                  bins=10)
Example 22
maxRuntime = max(runtime)
avgRuntime = sum(runtime) / float(len(runtime))
plots.hist(
    runtime,
    minRuntime,
    maxRuntime,
    "Runtime (s)",
    "Number of Queries",
    "$min = %s$ $max = %s$ $avg = %s$" % (minRuntime, maxRuntime, avgRuntime),
    "runtime.png",
    ylog=True,
)
plots.scatter(
    runtime,
    "Query",
    "Runtime (s)",
    "$min = %s$ $max = %s$ $avg = %s$" % (minRuntime, maxRuntime, avgRuntime),
    "runtime_scatter.png",
)
plots.scatter(
    runtime,
    "Query",
    "Runtime (s)",
    "$min = %s$ $max = %s$ $avg = %s$" % (minRuntime, maxRuntime, avgRuntime),
    "runtime_scatter_ylog.png",
    ylog=True,
)

for name in timePctPerOperator:
    # if an operator doesn't exist in a query, its pct is 0
    timePctPerOperator[name].extend([0.0] * (numQueries - len(timePctPerOperator[name])))
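# The loop above pads each operator's per-query percentage list to a common
# length. A minimal sketch of a follow-up plot, reusing the plots.scatter call
# pattern from above (the per-operator output filename is a hypothetical
# choice, not from the original script):
for name, pcts in timePctPerOperator.items():
    plots.scatter(
        pcts,
        "Query",
        "Time share of %s" % name,
        "$min = %s$ $max = %s$" % (min(pcts), max(pcts)),
        "time_pct_%s.png" % name,
    )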