def check_ks_of_expression(expression):
    '''
    Do weighted two-sample KS tests on original and reweighted distributions
    for the given pandas-eval ``expression`` and draw a bar chart comparing
    the different weighting methods.

    Original version taken from hep-ml package.

    NOTE(review): relies on module-level globals — original_test, target_test,
    original_weights_test, bins_weights_test, gb_weights_test,
    originalCollisions, targetCollisions, folding_weights, and (for the bar
    chart) neuralReweight / adaReweight / lbfgsReweight, which are not
    computed in this function; presumably defined elsewhere — verify.
    '''
  
    # Evaluate the expression on the original and target test DataFrames
    # (engine='python' allows arbitrary Python syntax in the expression).
    col_original = original_test.eval(expression, engine='python')
    col_target = target_test.eval(expression, engine='python')
    # Target sample is unweighted: use unit weights.
    w_target = np.ones(len(col_target), dtype='float')
    # NOTE(review): w_foldingTarget is built but never used below — the
    # folding branch constructs its own unit-weight array inline.
    w_foldingTarget = np.ones(len(targetCollisions['p3_energy']),dtype='float')

    

    # Weighted KS distance for each reweighting strategy (lower is better).
    noReweight = ks_2samp_weighted(col_original, col_target, weights1=original_weights_test, weights2=w_target)
    binReweight = ks_2samp_weighted(col_original, col_target, weights1=bins_weights_test, weights2=w_target)
    gbReweight = ks_2samp_weighted(col_original, col_target, weights1=gb_weights_test, weights2=w_target)
    #Folding reweight uses whole dataset, so hardwire in for now
    foldingReweight = ks_2samp_weighted(originalCollisions.eval('p3_momentum * p4_momentum * p3_energy * p4_energy * p3_theta * p4_theta'), targetCollisions.eval('p3_momentum * p4_momentum * p3_energy * p4_energy * p3_theta * p4_theta'), weights1=folding_weights, weights2=np.ones(len(targetCollisions['p3_energy']),dtype='float'))


    print('No Reweight   KS:', noReweight)
    print('Bins Reweight KS:', binReweight)
    print('GB Reweight   KS:', gbReweight)
    print('Folding Reweight   KS:', foldingReweight)
    

    # Green = unweighted baseline, blue = classical reweighters,
    # red = neural-network based reweighters (computed elsewhere).
    plt.bar(['No Weights','Bin Reweighting','GB Weights','Folding Weights','NN Weights', 'Ada NN','lbfgs NN'],[noReweight,binReweight,gbReweight,foldingReweight,neuralReweight,adaReweight,lbfgsReweight],color=['green','blue','blue','blue','red','red','red'])
Ejemplo n.º 2
0
def check_reweighter(n_dimensions, n_samples, reweighter):
    """
    Sanity-check a reweighter on synthetic multivariate-normal data.

    Draws an 'original' sample from N(mean_original, I) and a 'target'
    sample from a shifted, correlated Gaussian, fits the reweighter, and
    asserts that the predicted weights move both the weighted means and the
    per-dimension weighted KS distances closer to the target.

    :param n_dimensions: dimensionality of the generated samples
    :param n_samples: size of the target sample (the original sample gets
        n_samples + 1 events so the two samples deliberately differ in length)
    :param reweighter: object exposing fit()/predict_weights()
        (e.g. a hep_ml.reweight reweighter)
    :raises AssertionError: if reweighting fails to improve agreement
    """
    mean_original = numpy.random.normal(size=n_dimensions)
    cov_original = numpy.diag([1.] * n_dimensions)

    # Target mean is itself a random draw around the original mean; target
    # covariance is shrunk (x0.4) with uniform positive correlations (+0.2).
    # Use the public numpy.random.multivariate_normal API instead of the
    # private legacy numpy.random.mtrand module path.
    mean_target = numpy.random.multivariate_normal(mean=mean_original, cov=cov_original)
    cov_target = cov_original * 0.4 + numpy.ones([n_dimensions, n_dimensions]) * 0.2

    original = numpy.random.multivariate_normal(mean=mean_original, cov=cov_original, size=n_samples + 1)
    original_weight = numpy.ones(n_samples + 1)

    target = numpy.random.multivariate_normal(mean=mean_target, cov=cov_target, size=n_samples)
    target_weight = numpy.ones(n_samples)

    reweighter.fit(original, target, original_weight=original_weight, target_weight=target_weight)
    new_weights = reweighter.predict_weights(original, original_weight=original_weight)

    # Weighted mean of the reweighted original should be closer to the
    # target mean than the unweighted one.
    av_orig = numpy.average(original, weights=original_weight, axis=0)
    print('WAS', av_orig)
    av_now = numpy.average(original, weights=new_weights, axis=0)
    print('NOW:', av_now)
    av_ideal = numpy.average(target, weights=target_weight, axis=0)
    print('IDEAL:', av_ideal)

    print('COVARIATION')
    print('WAS', weighted_covar(original, original_weight))
    print('NOW', weighted_covar(original, new_weights))
    print('IDEAL', weighted_covar(target, target_weight))

    assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), 'deviation is wrong'
    # Per-dimension weighted KS distance must strictly improve.
    for dim in range(n_dimensions):
        diff1 = ks_2samp_weighted(original[:, dim], target[:, dim], original_weight, target_weight)
        diff2 = ks_2samp_weighted(original[:, dim], target[:, dim], new_weights, target_weight)
        print('KS', diff1, diff2)
        assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2)
Ejemplo n.º 3
0
def check_reweighter(n_dimensions, n_samples, reweighter, folding=False):
    """
    Sanity-check a reweighter on synthetic multivariate-normal data.

    Fits the reweighter on an 'original' Gaussian sample against a shifted,
    correlated 'target' sample, then asserts that the predicted weights move
    both the weighted means and the per-dimension weighted KS distances
    closer to the target.

    :param n_dimensions: dimensionality of the generated samples
    :param n_samples: size of the target sample (the original sample gets
        n_samples + 1 events so the two samples deliberately differ in length)
    :param reweighter: object exposing fit()/predict_weights()
        (e.g. a hep_ml.reweight reweighter)
    :param folding: if True, additionally exercise predict_weights with a
        custom vote_function (FoldingReweighter-style API)
    :raises AssertionError: if reweighting fails to improve agreement
    """
    mean_original = numpy.random.normal(size=n_dimensions)
    cov_original = numpy.diag([1.] * n_dimensions)

    # Target mean is a random draw around the original mean; target
    # covariance is shrunk (x0.4) with uniform positive correlations (+0.2).
    mean_target = numpy.random.mtrand.multivariate_normal(mean=mean_original,
                                                          cov=cov_original)
    cov_target = cov_original * 0.4 + numpy.ones([n_dimensions, n_dimensions
                                                  ]) * 0.2

    original = numpy.random.mtrand.multivariate_normal(mean=mean_original,
                                                       cov=cov_original,
                                                       size=n_samples + 1)
    original_weight = numpy.ones(n_samples + 1)

    target = numpy.random.mtrand.multivariate_normal(mean=mean_target,
                                                     cov=cov_target,
                                                     size=n_samples)
    target_weight = numpy.ones(n_samples)

    reweighter.fit(original,
                   target,
                   original_weight=original_weight,
                   target_weight=target_weight)
    # Collect one weight array per prediction mode to validate below.
    new_weights_array = []
    new_weights_array.append(
        reweighter.predict_weights(original, original_weight=original_weight))
    if folding:

        def mean_vote(x):
            # Aggregate per-fold weight predictions by simple averaging.
            return numpy.mean(x, axis=0)

        new_weights_array.append(
            reweighter.predict_weights(original,
                                       original_weight=original_weight,
                                       vote_function=mean_vote))

    for new_weights in new_weights_array:
        # Weighted mean of the reweighted original should be closer to the
        # target mean than the unweighted one.
        av_orig = numpy.average(original, weights=original_weight, axis=0)
        print('WAS', av_orig)
        av_now = numpy.average(original, weights=new_weights, axis=0)
        print('NOW:', av_now)
        av_ideal = numpy.average(target, weights=target_weight, axis=0)
        print('IDEAL:', av_ideal)

        print('COVARIANCE')
        print('WAS', weighted_covariance(original, original_weight))
        print('NOW', weighted_covariance(original, new_weights))
        print('IDEAL', weighted_covariance(target, target_weight))

        assert numpy.all(
            abs(av_now - av_ideal) < abs(av_orig -
                                         av_ideal)), 'averages are wrong'
        # Per-dimension weighted KS distance must strictly improve.
        for dim in range(n_dimensions):
            diff1 = ks_2samp_weighted(original[:, dim], target[:, dim],
                                      original_weight, target_weight)
            diff2 = ks_2samp_weighted(original[:, dim], target[:, dim],
                                      new_weights, target_weight)
            print('KS', diff1, diff2)
            assert diff2 < diff1, 'Differences {} {}'.format(diff1, diff2)
Ejemplo n.º 4
0
def check_ks_of_expression(expression):
    """Print weighted KS distances for *expression* before and after
    gradient-boosted reweighting.

    Evaluates the pandas expression on the module-level test DataFrames
    (original_test / target_test) and compares the original test weights
    against the GB-reweighted ones; the target is always unit-weighted.
    """
    series_orig = original_test.eval(expression, engine='python')
    series_tgt = target_test.eval(expression, engine='python')
    unit_weights = numpy.ones(len(series_tgt), dtype='float')
    print('Variable: %s' % expression)
    ks_before = ks_2samp_weighted(series_orig, series_tgt,
                                  weights1=original_weights_test,
                                  weights2=unit_weights)
    print('No reweight   KS:', ks_before)
    ks_after = ks_2samp_weighted(series_orig, series_tgt,
                                 weights1=gb_weights_test,
                                 weights2=unit_weights)
    print('GB Reweight   KS:', ks_after)
Ejemplo n.º 5
0
def draw_distributions_weighted(original, target, new_original_weights,
                                target_sWeights, filename):
    """Histogram each column of *original* (with the new weights) against
    *target* (with its sWeights), print the weighted KS distance per column,
    and save the figure to *filename*.

    Uses the module-level globals ``columns`` and ``hist_settings``.
    """
    fig = plt.figure()

    for panel, column in enumerate(columns, start=1):
        # Clip the x-range to the target's central 99.98% to hide outliers.
        xlim = numpy.percentile(numpy.hstack([target[column]]), [0.01, 99.99])

        axis = plt.subplot(2, 3, panel)
        axis.hist(original[column], weights=new_original_weights,
                  range=xlim, **hist_settings)
        axis.hist(target[column], weights=target_sWeights,
                  range=xlim, **hist_settings)
        axis.set_title(column)

        ks_value = ks_2samp_weighted(original[column], target[column],
                                     weights1=new_original_weights,
                                     weights2=target_sWeights)
        print('KS over %s = %s' % (column, ks_value))

    fig.savefig(filename)
Ejemplo n.º 6
0
def draw_distributions(myoriginal, mytarget, new_original_weights, targetwts):
    """Plot reweighted-original vs. target histograms for the first six
    columns and return the average weighted KS distance over them.

    Fix: the original sliced ``columns[0:len]`` — ``len`` there is the
    builtin function, so the slice raises TypeError at runtime.  The 2x3
    subplot grid (and the commented-out continuation that started at
    ``columns[6:...]``) shows the first six columns were intended.

    :param myoriginal: DataFrame of original (e.g. MC) events
    :param mytarget: DataFrame of target (e.g. data) events
    :param new_original_weights: per-event weights for *myoriginal*
    :param targetwts: per-event weights for *mytarget*
    :return: mean weighted KS distance over the plotted columns
    """
    sum_ks = 0
    ctr = 0
    plt.figure(figsize=[15, 7])
    for panel, column in enumerate(columns[0:6], 1):
        ctr = ctr + 1
        # Clip the plotting range to the target's central 99.98%.
        xlim = numpy.percentile(numpy.hstack([mytarget[column]]),
                                [0.01, 99.99])
        plt.subplot(2, 3, panel)
        plt.hist(myoriginal[column],
                 weights=new_original_weights,
                 range=xlim,
                 **hist_settings)
        plt.hist(mytarget[column],
                 weights=targetwts,
                 range=xlim,
                 **hist_settings)
        plt.title(column)
        myks = ks_2samp_weighted(myoriginal[column],
                                 mytarget[column],
                                 weights1=new_original_weights,
                                 weights2=targetwts)
        sum_ks = sum_ks + myks
    plt.draw()
    # Guard against an empty column list to avoid ZeroDivisionError.
    avg_ks = sum_ks / ctr if ctr else 0.0
    print('average of KS distances = ', avg_ks)
    return avg_ks
Ejemplo n.º 7
0
def draw_distributions(original, target, new_original_weights, splot_weights):
    """Overlay weighted-MC and sPlot-weighted data histograms for every
    branch in the module-level ``used_branch`` list, print the weighted KS
    distance per branch, and save the canvas as ``compare_show.pdf``.
    """
    hist1_settings = {'bins': 20, 'density': True, 'alpha': 0.7}
    plt.figure(figsize=[15, 7])
    for panel, column in enumerate(used_branch, 1):
        # Trim the plotting range to the target's 0.01-99.99 percentiles.
        xlim = numpy.percentile(numpy.hstack([target[column]]), [0.01, 99.99])
        plt.subplot(2, 3, panel)
        plt.hist(original[column], weights=new_original_weights, range=xlim,
                 **hist1_settings, label="MC(weighted)")
        plt.hist(target[column], weights=splot_weights, range=xlim,
                 **hist1_settings, label="Data(splot)")
        handles, labels = plt.gca().get_legend_handles_labels()
        plt.legend(loc='best')
        plt.title(column)
        ks_value = ks_2samp_weighted(original[column], target[column],
                                     weights1=new_original_weights,
                                     weights2=splot_weights)
        print('KS over ', column, ' = ', ks_value)
    plt.savefig('compare_show.pdf')
    plt.show()
def draw_distributions(original, target, new_original_weights, evaluation_method = 'ks'):
    """Draw histograms of target data and reweighted Monte-Carlo data.

    Theta columns are plotted in degrees rather than radians; all other
    columns are plotted as-is.  After plotting, a per-column agreement
    metric is printed.

    Fix: the KL branch previously passed the full ``np.histogram`` return
    value — the tuple ``(counts, bin_edges)`` — straight to ``entropy``,
    which expects a 1-D probability array.  It now passes only the counts,
    and evaluates both histograms on the target's bin edges so the two
    distributions share the same support.

    :param original: DataFrame of reweighted Monte-Carlo events
    :param target: DataFrame of target data events
    :param new_original_weights: per-event weights for *original*
    :param evaluation_method: 'ks' (Kolmogorov-Smirnov) or
        'kl' (Kullback-Leibler)
    """
    plt.figure(figsize=[15, 8])  # swap these around
    for id, column in enumerate(columns, 1):
        xlim = np.percentile(np.hstack([target[column]]), [0.01, 99.99])
        plt.subplot(2, 3, id)  # and these around to change how hists are stacked

        # Plot angles in degrees rather than radians
        if column in ('p3_theta', 'p4_theta', 'p5_theta', 'p6_theta'):
            plt.hist(original[column] * (180 / math.pi), weights=new_original_weights,
                     range=xlim * (180 / math.pi), **hist_settings)
            plt.hist(target[column] * (180 / math.pi),
                     range=xlim * (180 / math.pi), **hist_settings)
            plt.title(column)
        else:
            plt.hist(original[column], weights=new_original_weights,
                     range=xlim, **hist_settings)
            plt.hist(target[column], range=xlim, **hist_settings)
            plt.title(column)

        if evaluation_method == 'ks':
            print('KS over ', column, ' = ', ks_2samp_weighted(original[column], target[column], 
                                            weights1=new_original_weights, weights2=np.ones(len(target), dtype=float)))
        elif evaluation_method == 'kl':
            # np.histogram returns (counts, bin_edges); entropy() needs the
            # counts only.  Reuse the target's bin edges for the original so
            # both histograms are defined on identical bins.
            target_counts, bin_edges = np.histogram(target[column], density=True, bins=20)
            original_counts, _ = np.histogram(original[column], density=True,
                                              weights=new_original_weights, bins=bin_edges)
            print('KL over ', column, ' = ', entropy(original_counts, target_counts))
Ejemplo n.º 9
0
def check_reweighter(n_dimensions, n_samples, reweighter):
    """Validate *reweighter* on synthetic multivariate-normal samples.

    Generates an original sample and a shifted, correlated target sample,
    fits the reweighter, and asserts that the predicted weights bring both
    the weighted means and the per-dimension weighted KS distances closer
    to the target.
    """
    mu_orig = numpy.random.normal(size=n_dimensions)
    sigma_orig = numpy.diag([1.] * n_dimensions)

    mu_tgt = numpy.random.mtrand.multivariate_normal(mean=mu_orig,
                                                     cov=sigma_orig)
    sigma_tgt = sigma_orig * 0.4 + numpy.ones(
        [n_dimensions, n_dimensions]) * 0.2

    sample_orig = numpy.random.mtrand.multivariate_normal(mean=mu_orig,
                                                          cov=sigma_orig,
                                                          size=n_samples + 1)
    w_orig = numpy.ones(n_samples + 1)

    sample_tgt = numpy.random.mtrand.multivariate_normal(mean=mu_tgt,
                                                         cov=sigma_tgt,
                                                         size=n_samples)
    w_tgt = numpy.ones(n_samples)

    reweighter.fit(sample_orig, sample_tgt,
                   original_weight=w_orig, target_weight=w_tgt)
    w_new = reweighter.predict_weights(sample_orig, original_weight=w_orig)

    # Reweighted mean should land between the unweighted and ideal means.
    av_orig = numpy.average(sample_orig, weights=w_orig, axis=0)
    print('WAS', av_orig)
    av_now = numpy.average(sample_orig, weights=w_new, axis=0)
    print('NOW:', av_now)
    av_ideal = numpy.average(sample_tgt, weights=w_tgt, axis=0)
    print('IDEAL:', av_ideal)

    print('COVARIATION')
    print('WAS', weighted_covar(sample_orig, w_orig))
    print('NOW', weighted_covar(sample_orig, w_new))
    print('IDEAL', weighted_covar(sample_tgt, w_tgt))

    assert numpy.all(abs(av_now - av_ideal) < abs(av_orig - av_ideal)), \
        'deviation is wrong'
    # The weighted KS distance must strictly improve in every dimension.
    for axis in range(n_dimensions):
        ks_before = ks_2samp_weighted(sample_orig[:, axis],
                                      sample_tgt[:, axis], w_orig, w_tgt)
        ks_after = ks_2samp_weighted(sample_orig[:, axis],
                                     sample_tgt[:, axis], w_new, w_tgt)
        print('KS', ks_before, ks_after)
        assert ks_after < ks_before, 'Differences {} {}'.format(ks_before,
                                                                ks_after)
Ejemplo n.º 10
0
def test_ks2samp_fast(size=1000):
    """Cross-check the fast weighted-KS implementation against scipy's
    ks_2samp on a uniform sample and a random ~50% subsample."""
    y1 = RandomState().uniform(size=size)
    # Keep roughly half of the points as the second sample.
    y2 = y1[RandomState().uniform(size=size) > 0.5]
    # Reference value from the unweighted two-sample KS test (statistic only).
    a = ks_2samp(y1, y2)[0]
    # NOTE(review): 'prepare_distibution' (sic) and the F1= keyword match an
    # older hep_ml internal API; do not 'fix' the spelling without also
    # updating the corresponding import.
    prep_data, prep_weights, prep_F = prepare_distibution(y1, numpy.ones(len(y1)))
    b = _ks_2samp_fast(prep_data, y2, prep_weights, numpy.ones(len(y2)), F1=prep_F)
    # Computed twice with identical inputs to confirm determinism.
    c = _ks_2samp_fast(prep_data, y2, prep_weights, numpy.ones(len(y2)), F1=prep_F)
    # KS is invariant under a global rescaling of the weights (/3 and /4).
    d = ks_2samp_weighted(y1, y2, numpy.ones(len(y1)) / 3, numpy.ones(len(y2)) / 4)
    assert numpy.allclose(a, b, rtol=1e-2, atol=1e-3)
    assert numpy.allclose(b, c)
    assert numpy.allclose(b, d)
    print('ks2samp is ok')
Ejemplo n.º 11
0
def test_ks2samp_fast(size=1000):
    """Cross-check the fast weighted-KS implementation against scipy's
    ks_2samp on a uniform sample and a random ~50% subsample."""
    sample = RandomState().uniform(size=size)
    subsample = sample[RandomState().uniform(size=size) > 0.5]
    reference = ks_2samp(sample, subsample)[0]
    data_p, weights_p, cdf_p = prepare_distribution(sample,
                                                    numpy.ones(len(sample)))
    unit_sub = numpy.ones(len(subsample))
    fast_a = _ks_2samp_fast(data_p, subsample, weights_p, unit_sub, cdf1=cdf_p)
    # Same call twice: the fast path must be deterministic.
    fast_b = _ks_2samp_fast(data_p, subsample, weights_p, unit_sub, cdf1=cdf_p)
    # KS is invariant under global rescaling of weights, hence /3 and /4.
    weighted = ks_2samp_weighted(sample, subsample,
                                 numpy.ones(len(sample)) / 3,
                                 numpy.ones(len(subsample)) / 4)
    assert numpy.allclose(reference, fast_a, rtol=1e-2, atol=1e-3)
    assert numpy.allclose(fast_a, fast_b)
    assert numpy.allclose(fast_a, weighted)
    print('ks2samp is ok')
Ejemplo n.º 12
0
def ks_test(original, target, variables, original_weights):
    """
    Compute the weighted two-sample KS distance for each variable.

    :param original: DataFrame of original (e.g. MC) events
    :param target: DataFrame of target (e.g. data) events
    :param variables: iterable of column names to test
    :param original_weights: per-event weights for *original*
        (the target sample is given unit weights)
    :return: list of KS distances, one per variable in order
    """
    # The previous version carried two unused loop counters (`id` from
    # enumerate and a manual `i`); a plain loop over the variables is
    # equivalent and clearer.
    ksresults = []
    for column in variables:
        ks = ks_2samp_weighted(original[column],
                               target[column],
                               weights1=original_weights,
                               weights2=numpy.ones(len(target), dtype=int))
        ksresults.append(ks)
    return ksresults
def averageKS(original, target, weights):
    """
    Average the weighted KS distance over every column of a data set
    after applying weights to the original sample.

    original(DataFrame): original collisions
    target(DataFrame): target collisions
    weights(array): weights for the original data
    """
    unit_target_weights = np.ones(len(target), dtype=float)
    distances = [
        ks_2samp_weighted(original[column], target[column],
                          weights1=weights, weights2=unit_target_weights)
        for column in original.columns
    ]
    return sum(distances) / len(original.columns)
Ejemplo n.º 14
0
def print_statistics(names,
                     original,
                     target,
                     original_weights=None,
                     target_weights=None):
    """Print the weighted KS distance for each named column, followed by a
    separator line.

    Weight arrays that are not provided default to unit weights of the
    matching sample length.
    """
    if original_weights is None:
        original_weights = numpy.ones(len(original))
    if target_weights is None:
        target_weights = numpy.ones(len(target))

    for n in names:
        ks_value = ks_2samp_weighted(original[n],
                                     target[n],
                                     weights1=original_weights,
                                     weights2=target_weights)
        print('KS over %s = %s' % (n, ks_value))

    print('========')
Ejemplo n.º 15
0
def test_ks2samp(n_samples1=100, n_samples2=100):
    """
    checking that KS can be computed with ROC curve
    """
    data1 = numpy.random.normal(size=n_samples1)
    weights1 = numpy.random.random(size=n_samples1)
    data2 = numpy.random.normal(size=n_samples2)
    weights2 = numpy.random.random(size=n_samples2)

    # Printed before and after to check weights1 is not mutated in between.
    print(weights1.sum(), 'SUM')

    KS = ks_2samp_weighted(data1, data2, weights1=weights1, weights2=weights2)

    # alternative way to check: the weighted KS statistic equals the maximum
    # gap between the (symmetrized) ROC-curve coordinates of the pooled,
    # labelled sample.
    labels = [0] * len(data1) + [1] * len(data2)
    data = numpy.concatenate([data1, data2])
    weights = numpy.concatenate([weights1, weights2])
    from sklearn.metrics import roc_curve
    fpr, tpr, _ = roc_curve(labels, data, sample_weight=weights)
    KS2 = numpy.max(numpy.abs(symmetrize(fpr) - symmetrize(tpr)))
    print(KS, KS2)
    print(weights1.sum(), 'SUM')
    assert numpy.allclose(KS, KS2), 'different values of KS'
Ejemplo n.º 16
0
def test_ks2samp(n_samples1=100, n_samples2=100):
    """Check that the weighted KS statistic agrees with the value obtained
    from an ROC curve over the pooled, labelled sample."""
    sample_a = numpy.random.normal(size=n_samples1)
    wts_a = numpy.random.random(size=n_samples1)
    sample_b = numpy.random.normal(size=n_samples2)
    wts_b = numpy.random.random(size=n_samples2)

    print(wts_a.sum(), 'SUM')

    ks_direct = ks_2samp_weighted(sample_a, sample_b,
                                  weights1=wts_a, weights2=wts_b)

    # Alternative computation: KS equals the maximum gap between the
    # symmetrized ROC-curve coordinates of the pooled sample.
    labels = [0] * len(sample_a) + [1] * len(sample_b)
    pooled = numpy.concatenate([sample_a, sample_b])
    pooled_weights = numpy.concatenate([wts_a, wts_b])
    from sklearn.metrics import roc_curve
    fpr, tpr, _ = roc_curve(labels, pooled, sample_weight=pooled_weights)
    ks_via_roc = numpy.max(numpy.abs(symmetrize(fpr) - symmetrize(tpr)))
    print(ks_direct, ks_via_roc)
    print(wts_a.sum(), 'SUM')
    assert numpy.allclose(ks_direct, ks_via_roc), 'different values of KS'