Beispiel #1
0
def calculate_ndvi_and_cloud_percent_for_the_parcel(df_ext, cloud_categories):
    """Build a per-reference NDVI table from band B04/B08 extraction rows.

    Works on a copy of ``df_ext`` so the caller's frame is not modified.

    Parameters
    ----------
    df_ext : pandas.DataFrame
        Must provide the columns read here: ``date_part`` (epoch timestamps),
        ``hist``, ``band``, ``mean``, ``count``, ``std`` and ``reference``.
    cloud_categories
        Passed through unchanged to ``get_cloudyness``.

    Returns
    -------
    pandas.DataFrame
        The B04 and B08 rows merged on ``reference`` with the extra columns
        ``ndvi``, ``utm_number`` and ``ndvi_std``; an empty DataFrame when
        either band has no rows.

    Notes
    -----
    As in the original, this also changes the global pandas display options
    (precision 3, engineering float format) before returning a result.
    """
    df = df_ext.copy()
    # Convert the epoch timestamp to a (local-time) datetime.
    df['date_part'] = df['date_part'].map(datetime.datetime.fromtimestamp)
    # get_cloudyness returns a sequence; element 1 is stored as the cloud percentage.
    df['cloud_pct'] = df['hist'].apply(lambda s: get_cloudyness(s, cloud_categories)[1])

    bands = ['B04', 'B08']
    is_band0 = df['band'] == bands[0]
    is_band1 = df['band'] == bands[1]
    # NDVI needs both bands; without them there is nothing to compute.
    if not (is_band0.any() and is_band1.any()):
        return pd.DataFrame()

    # Treat each band separately.
    kept_columns = ['date_part', 'mean', 'count', 'std', 'cloud_pct', 'reference']
    df0 = df.loc[is_band0, kept_columns]
    df1 = df.loc[is_band1, kept_columns]

    # Merge back into one DataFrame based on reference, which should be unique.
    dff = pd.merge(df0, df1, on='reference', suffixes=(bands[0], bands[1]))
    red = dff[f"mean{bands[0]}"]
    nir = dff[f"mean{bands[1]}"]
    dff['ndvi'] = (nir - red) / (nir + red)
    dff['utm_number'] = dff['reference'].apply(get_utm_number_from_reference)

    dff['ndvi_std'] = dff.apply(
        lambda x: calculate_ndvi_std_from_band_mean_and_std(
            x.meanB04, x.meanB08, x.stdB04, x.stdB08),
        axis=1)

    # Use the fully qualified option name: the bare 'precision' pattern is
    # ambiguous on pandas versions that also define 'styler.format.precision'.
    pd.set_option('display.precision', 3)
    pd.set_eng_float_format(accuracy=3)
    return dff
Beispiel #2
0
def t_test(data=None, independent=None, dependent=None):
    """Run an independent two-sample t-test and summarise it in a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both variables.
    independent : column label
        Grouping column; it must contain exactly two distinct values.
    dependent : column label
        Numeric column compared between the two groups.

    Returns
    -------
    pandas.DataFrame or None
        A one-column (``Value``) frame with the difference between means, its
        standard error, the t statistic, the p-value, the 95% confidence
        interval of the difference and Cohen's d. ``None`` (after printing an
        explanation) when the independent variable does not have exactly two
        groups.

    Notes
    -----
    Sets the global pandas float display format as a side effect.
    Group statistics come from ``parammct``, whose result is read by the row
    labels ``'Mean'``, ``'SD'`` and ``'n'``.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)
    independent_groups = pd.unique(data[independent])
    if len(independent_groups) > 2:
        print('There are more than 2 groups in the independent variable')
        print('t-test is not the correct statistical test to run in that circumstance,')
        print('consider running an ANOVA')
        return
    if len(independent_groups) < 2:
        # Previously this fell through to an IndexError further down.
        print('There are fewer than 2 groups in the independent variable')
        return

    mct = parammct(data=data, independent=independent, dependent=dependent)

    in_first_group = data[independent] == independent_groups[0]
    in_second_group = data[independent] == independent_groups[1]
    t_test_value, p_value = stats.ttest_ind(data[dependent][in_first_group],
                                            data[dependent][in_second_group])

    # Read the two groups by position (.iloc): plain [0]/[1] on a
    # label-indexed Series is deprecated and breaks for integer group labels.
    means, sds, sizes = mct.loc['Mean'], mct.loc['SD'], mct.loc['n']
    mean0, mean1 = means.iloc[0], means.iloc[1]
    sd0, sd1 = sds.iloc[0], sds.iloc[1]
    n0, n1 = sizes.iloc[0], sizes.iloc[1]

    degrees_of_freedom = n0 + n1 - 2
    difference_mean = np.abs(mean0 - mean1)
    pooled_sd = np.sqrt(((n0 - 1) * sd0 ** 2 + (n1 - 1) * sd1 ** 2) /
                        degrees_of_freedom)
    sedifference = pooled_sd * np.sqrt((1 / n0) + (1 / n1))

    # The interval half-width must use the critical t value. Using the
    # observed t statistic (as before) makes t * SE equal the difference
    # itself, so the interval always collapsed to [0, 2 * difference].
    t_critical = stats.t.ppf(0.975, degrees_of_freedom)
    margin = t_critical * sedifference
    difference_mean_cilower = difference_mean - margin
    difference_mean_ciupper = difference_mean + margin

    cohend = difference_mean / pooled_sd
    t_test_result = pd.DataFrame(
        [difference_mean, sedifference, t_test_value, p_value,
         difference_mean_cilower, difference_mean_ciupper, cohend],
        index=['Difference between means', 'SE difference', 't-test', 'p-value',
               'Lower bound difference CI', 'Upper bound difference CI', 'Cohen\'s d'],
        columns=['Value'])

    return t_test_result
Beispiel #3
0
def chi_square(data=None, variable1=None, variable2=None):
    """Display a contingency table and a chi-square test for two columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both variables.
    variable1, variable2
        Column names; each is converted with ``str`` before use.

    Returns
    -------
    None
        Results are shown with ``display``. The function returns early,
        showing nothing, when ``input_check_categorical_categorical`` returns
        a truthy value or when either column has fewer than two distinct
        values.

    Notes
    -----
    Sets the global pandas float display format as a side effect. The test is
    run without Yates' continuity correction (``correction=False``).
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)

    variable1 = str(variable1)
    variable2 = str(variable2)
    if input_check_categorical_categorical(data, variable1, variable2):
        return

    problem_found = False
    for name in (variable1, variable2):
        n_categories = len(pd.unique(data[name]))
        if n_categories < 2:
            # Report the column *name*; the previous message printed the
            # array of unique values, which did not identify the column.
            print(name, 'has less than two categories. It has:', n_categories)
            problem_found = True
    if problem_found:
        return

    # pd.crosstab already returns a DataFrame, so no extra wrapping is needed.
    contingency_table = pd.crosstab(data[variable1], data[variable2])
    display(Markdown('**Contingency Table**'))
    display(contingency_table)

    chi2_test = stats.chi2_contingency(contingency_table, correction=False)

    chi2_result = pd.Series(
        [chi2_test[0], chi2_test[1], chi2_test[2], chi2_test[3]],
        index=['Chi-square value', 'p-value', 'Degrees of freedom', 'Expected frequencies'])
    chi2_result = pd.DataFrame(chi2_result, columns=['Value'])
    display(Markdown('**Results Chi-square test**'))
    display(chi2_result)
Beispiel #4
0
def logistic_reg(data=None, independent=None, dependent=None):
    """Fit and display a one-predictor logistic regression with an intercept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both variables. It is no longer modified: the constant
        column is added to a private copy of the predictor.
    independent, dependent
        Column names; each is converted with ``str`` before use. The
        dependent column must have exactly two distinct values.

    Returns
    -------
    None
        The regression summary, the coefficient confidence intervals and a
        plot of actual values against predicted probabilities are displayed.
        The function returns early when ``input_check_categorical`` returns a
        truthy value or the dependent variable is not binary.

    Notes
    -----
    Sets the global pandas float display format as a side effect.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)

    independent = str(independent)
    dependent = str(dependent)
    if input_check_categorical(data, independent, dependent):
        return

    n_categories = len(pd.unique(data[dependent]))
    if n_categories != 2:
        print('Dependent variable must have two categories')
        print(dependent, 'variable has', n_categories, 'categories')
        return

    # Build the design matrix on a copy so the caller's DataFrame does not
    # silently gain an 'interceptant' column.
    design = data[[independent]].copy()
    design['interceptant'] = 1
    regression = sm.Logit(data[dependent], design).fit()
    display(regression.summary())
    display(Markdown('**Coefficients confidence intervals**'))
    display(regression.conf_int())

    predicted_values = regression.predict()
    plt.plot(data[independent], data[dependent], 'o', label='Actual values')
    plt.plot(data[independent], predicted_values, 'ok', label='Predicted probabilities')
    plt.xlabel(independent, fontsize=14)
    plt.ylabel('Probability ' + dependent, fontsize=14)
    plt.ylim(-0.05, 1.05)
    plt.legend()
    plt.show()
def printStats(players):
    """Print a table of the (up to) 25 players with the highest ``totalP``.

    Parameters
    ----------
    players : dict
        Maps a key to a player object exposing ``name``, ``totalP``, ``avgP``,
        ``medianP``, ``stddevP``, ``totalT``, ``avgT`` and ``stddevT``.

    The table has one row per player and the columns ``T, A, M, V, TT, AT,
    VT`` (the attributes above, in that order), sorted by ``T`` descending.
    Sets the global pandas float display format as a side effect.
    """
    ranked = sorted(players.values(), key=lambda p: p.totalP)
    # Best player first; the slice keeps at most the top 25.
    d = {
        player.name: [player.totalP, player.avgP, player.medianP,
                      player.stddevP, player.totalT,
                      player.avgT, player.stddevT]
        for player in reversed(ranked[-25:])
    }
    pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    idx = ["T", "A", "M", "V", "TT", "AT", "VT"]
    table = pd.DataFrame(d, index=idx).transpose()
    # DataFrame.sort was removed from pandas; sort_values is its replacement.
    # print(...) with a single argument also behaves the same on Python 2.
    print(table.sort_values("T", ascending=False))
Beispiel #6
0
def tukey(data=None, independent=None, dependent=None):
    """Display a Tukey HSD comparison of ``dependent`` across ``independent``.

    Both column names are converted with ``str``. Nothing is shown when
    ``input_check_numerical_categorical`` returns a truthy value; otherwise
    the summary table is displayed and the simultaneous-interval plot is
    drawn. Always returns ``None`` and sets the global pandas float display
    format as a side effect.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)

    group_column, value_column = str(independent), str(dependent)
    if input_check_numerical_categorical(data, group_column, value_column):
        return None

    comparison = multi.MultiComparison(data[value_column], data[group_column])
    hsd = comparison.tukeyhsd()
    display(hsd.summary())
    hsd.plot_simultaneous()
    return None
Beispiel #7
0
def estimate_relative_error_in_nominal_capacitance(df):
    """Summarise the relative error of measured vs. nominal capacitance.

    For every row without missing values the relative error
    ``(C - test_capacitor) / test_capacitor`` is computed, then described
    (count, mean, std, quartiles, ...) per distinct ``test_capacitor`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``C`` and ``test_capacitor``.

    Returns
    -------
    pandas.DataFrame
        One row per ``test_capacitor`` value, one column per ``describe``
        statistic.

    Notes
    -----
    Prints an overall estimate and the per-capacitor mean/std in percent, and
    sets the global pandas float display format as a side effect.
    """
    cleaned_df = df.dropna()
    nominal = cleaned_df['test_capacitor']
    relative_error = (cleaned_df['C'] - nominal) / nominal
    # Compute the ratio first and group afterwards: reading the grouping
    # column inside groupby.apply is deprecated in recent pandas.
    C_relative_error = relative_error.groupby(nominal).describe()
    pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    print(
        'Estimated relative error in nominal capacitance values = %.1f%% '
        ' +/-%.1f%%' % (C_relative_error['mean'].mean() * 100,
                        C_relative_error['mean'].std() * 100))
    # Python 2 print statements replaced by calls so the block parses on Python 3.
    print(C_relative_error[['mean', 'std']] * 100)
    print()

    return C_relative_error
def estimate_relative_error_in_nominal_capacitance(df):
    """Describe, per nominal capacitor, the relative error of measured ``C``.

    Rows containing missing values are dropped; the relative error is
    ``(C - test_capacitor) / test_capacitor``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``C`` and ``test_capacitor``.

    Returns
    -------
    pandas.DataFrame
        ``describe`` statistics of the relative error, indexed by
        ``test_capacitor``.

    Notes
    -----
    Prints the overall estimate and the per-capacitor mean/std (in percent)
    and sets the global pandas float display format as a side effect.
    """
    cleaned_df = df.dropna()
    relative_error = ((cleaned_df['C'] - cleaned_df['test_capacitor']) /
                      cleaned_df['test_capacitor'])
    # Group the precomputed ratio instead of touching the grouping column
    # inside groupby.apply, which recent pandas deprecates.
    C_relative_error = (relative_error
                        .groupby(cleaned_df['test_capacitor'])
                        .describe())
    pd.set_eng_float_format(accuracy=1, use_eng_prefix=True)
    mean_of_means = C_relative_error['mean'].mean() * 100
    spread_of_means = C_relative_error['mean'].std() * 100
    print('Estimated relative error in nominal capacitance values = %.1f%% '
          ' +/-%.1f%%' % (mean_of_means, spread_of_means))
    # Print calls (not Python 2 print statements) keep this valid on Python 3.
    print(C_relative_error[['mean', 'std']] * 100)
    print()

    return C_relative_error
Beispiel #9
0
def anova(data=None, independent=None, dependent=None):
    """Run a one-way ANOVA (type II) and return the annotated ANOVA table.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both variables.
    independent, dependent
        Column names; each is converted with ``str`` and used verbatim in the
        formula ``"<dependent> ~ <independent>"``.

    Returns
    -------
    pandas.DataFrame or None
        The ``anova_lm`` table with ``PR(>F)`` renamed to ``p`` and an added
        ``Eta squared`` column; the ``Residual`` row gets an empty string in
        the ``F``, ``p`` and ``Eta squared`` columns. ``None`` when
        ``input_check_numerical_categorical`` returns a truthy value.

    Notes
    -----
    Sets the global pandas float display format as a side effect.
    """
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=False)

    independent = str(independent)
    dependent = str(dependent)
    if input_check_numerical_categorical(data, independent, dependent):
        return

    formula = dependent + ' ~ ' + independent
    model = ols(formula, data=data).fit()
    aov_table = sm.stats.anova_lm(model, typ=2)
    aov_table.rename(columns={'PR(>F)': 'p'}, inplace=True)

    # Read the effect/residual rows by position with .iloc: plain [0]/[1] on
    # these label-indexed Series is deprecated positional fallback.
    f_value = aov_table['F'].iloc[0]
    p_value = aov_table['p'].iloc[0]
    effect_ss = aov_table['sum_sq'].iloc[0]
    residual_ss = aov_table['sum_sq'].iloc[1]
    eta_sq = effect_ss / (effect_ss + residual_ss)

    row_labels = [independent, 'Residual']
    aov_table['F'] = pd.Series([f_value, ''], index=row_labels)
    aov_table['p'] = pd.Series([p_value, ''], index=row_labels)
    aov_table['Eta squared'] = pd.Series([eta_sq, ''], index=row_labels)

    return aov_table
Beispiel #10
0
import textwrap

import numpy as np
import pandas as pd

from matplotlib import pylab as plt
from matplotlib.gridspec import GridSpec
from matplotlib import ticker
from matplotlib.ticker import MaxNLocator
from mpl_toolkits.axes_grid.anchored_artists import AnchoredText

from .kplot import tprop
from .. import tfind
from .. import tval

# Show pandas floats in engineering notation (3 decimals, SI prefixes).
pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
# The 'axes.color_cycle' rcParam was removed from Matplotlib; 'axes.prop_cycle'
# is its replacement. The string form is parsed by Matplotlib itself, so no
# extra import is needed here.
plt.rc('axes', prop_cycle="cycler('color', ['RoyalBlue', 'Tomato'])")
plt.rc('font', size=8)
def print_traceback(f):
    """
    Decorator so that we can fail gracefully from a plotting mishap.

    The wrapped function's return value is passed through. If it raises an
    ``Exception``, the traceback is printed and also written (wrapped at 50
    characters) into the top-left corner of the current axes; the wrapper
    then returns ``None`` instead of propagating the error.
    """
    # Imported locally because the module's import block does not provide them.
    import functools
    import traceback

    @functools.wraps(f)
    def wrapper_function(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except Exception:
            ax = plt.gca()
            error = traceback.format_exc()
            print(error)
            error = textwrap.fill(error, 50)
            ax.text(0, 1, error, transform=ax.transAxes, va='top')

    # Without this return the decorator replaced every function with None.
    return wrapper_function
 def write_csv(self):
     """Write ``self.result`` to ``self.outfile`` as UTF-8 CSV without the index.

     When ``self.outfile`` is falsy nothing is written and a message is
     logged through ``utils.log`` instead. On the writing path the global
     pandas float display format is switched to engineering notation first.
     """
     if not self.outfile:
         utils.log('No outfile specified!')
         return
     pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
     self.result.to_csv(self.outfile, encoding='utf-8', index=False)
                       sep=' ',
                       index=False,
                       header=False,
                       mode='a',
                       encoding='utf-8',
                       float_format='%.4f',
                       index_label=None)
    # data.to_pickle(des,compression='zip')
    print('*' * 40)


if __name__ == "__main__":
    # Announce when the run started.
    # Alternative format: time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time()))
    print('program  started at:', time.asctime(time.localtime(time.time())))

    # Engineering notation with 7 decimals and SI prefixes for displayed floats.
    pd.set_eng_float_format(7, True)
    # Fully qualified key: the bare 'precision' pattern is ambiguous on pandas
    # versions that also define 'styler.format.precision'.
    pd.set_option('display.precision', 7)
    # pd.set_option('chop_threshold', .5)

    # source_addr = "/home/gjj/PycharmProjects/ADA/netsData/hackingData/GANdata/from_raw_change_scaler/2/data/Attack_free_dataset2.pkl"
    # dire_addr = "/home/gjj/PycharmProjects/ADA/netsData/hackingData/GANdata/from_raw_change_scaler/"
    # dire_addr = "/home/gjj/PycharmProjects/ADA/netsData/hackingData/GANdata/from_raw_change_scaler/2/data"

    # print('program  start at:', time.strftime('%Y-%m-%d,%H:%M:%S', time.localtime(time.time())))
    # # print('data from :%s'%source_addr)
    # print()
    # os.chdir(os.path.dirname(dire_addr))
    # dire_url = os.path.join(dire_addr, 'Attack_free_dataset_64.txt')
    # print("\ncurrent at:{}".format(os.getcwd()))
    # print()
    # pkl to txt
Beispiel #13
0
def _summarize_channel_widths(frame):
    """Return ``describe()`` for several rows, the single row for one, else None."""
    if len(frame) > 1:
        return frame.describe()
    if len(frame) == 1:
        return frame.iloc[0]
    return None


def main(routing_hdf_path, net_file_namebase):
    """Build a Markdown-style text summary of the routing results of one net-file.

    Parameters
    ----------
    routing_hdf_path
        Path of the HDF file; converted with ``str`` and opened read-only.
    net_file_namebase : str
        Name of the node under the HDF root whose ``route_states`` table is
        loaded.

    Returns
    -------
    tuple
        ``(report_text, routing_results)`` where ``routing_results`` is the
        ``route_states`` table as a ``pandas.DataFrame``.

    Notes
    -----
    While the report is built, pandas floats are displayed in engineering
    notation. The previous ``display.float_format`` and
    ``display.column_space`` values are always restored afterwards, also
    when the previous float format was ``None`` or an error occurs.
    """
    option_keys = ('display.float_format', 'display.column_space')
    format_opts = dict((k, pd.get_option(k)) for k in option_keys)
    # Format floats to:
    #
    #   * Avoid small float values being displayed as zero _(e.g.,
    #     critical-path-delay)_.
    #   * Use engineering postfix to make it easier to compare values
    #     at-a-glance _(e.g., `u` for micro, `n` for nano, etc.)_.
    pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
    try:
        # Load the `route_states` table from the HDF file into a DataFrame;
        # the file is closed even if reading fails.
        h5f = ts.open_file(str(routing_hdf_path), 'r')
        try:
            net_file_routings = getattr(h5f.root, net_file_namebase)
            data = np.array([v.fetch_all_fields()
                             for v in net_file_routings.route_states],
                            dtype=net_file_routings.route_states.dtype)
        finally:
            h5f.close()
        routing_results = pd.DataFrame(data)

        indent = 4 * ' '
        rule = '\n' + 70 * '-' + '\n'
        # Each entry becomes one "printed" item: its text followed by a newline.
        # Collecting plain strings replaces the Python 2-only `print >>` /
        # `StringIO.StringIO` combination and works on Python 2 and 3 alike.
        lines = ['# [%s] Routing results summary #\n' % net_file_namebase]

        def add_summary(title, frame):
            lines.append(title)
            summary = _summarize_channel_widths(frame)
            if summary is None:
                # Previously an empty table crashed on an unbound variable.
                lines.append(indent + '(no data)')
            else:
                lines.append(prefix_lines(summary, indent))

        add_summary('## Minimum routable channel-width summary ##\n',
                    min_success_data(routing_results))
        lines.append(rule)
        # The max-failed summary is now chosen from its own row count; it
        # used to test the length of the min-success data (copy-paste slip).
        add_summary('## Maximum unroutable channel-width summary ##\n',
                    max_failed_data(routing_results))

        incomplete_routing_searches = np.where(
            min_success_max_failed_channel_width_diff(routing_results) != 1)[0]
        if len(incomplete_routing_searches):
            lines.append('Incomplete routings:')
            lines.append('\n'.join(
                '  * `%s`' % pformat(routing_results['block_positions_sha1'][i])
                for i in incomplete_routing_searches))

        lines.append(rule)

        lines.append('## Missing routability result routing configurations'
                     ' ##\n')
        lines.append('\n'.join(
            '  * `%s`' % pformat(v)
            for v in missing_routability_result_configs(routing_results)))

        lines.append('\n' + 70 * '-' + '\n\n')

        # Stringify while the engineering float format is still active.
        report = '\n'.join(str(item) for item in lines) + '\n'
    finally:
        # Restore unconditionally: skipping None left the engineering format
        # installed whenever no float format had been set before.
        for k, v in format_opts.items():
            pd.set_option(k, v)
    return report, routing_results
Beispiel #14
0
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# In[2]:


# Display pandas floats in engineering notation with 4 digits after the
# decimal point (applies globally to everything shown below).
pd.set_eng_float_format(accuracy=4)


# In[3]:


# Load the training tables. NOTE(review): these are absolute Windows paths on
# one specific machine, so the script only runs where that layout exists.
train = pd.read_csv(r'E:\Users\quadr\Documents\datascience-arquivos\Kaggle\Ashrae_Energy_Prediction\train.csv')
w_train = pd.read_csv(r'E:\Users\quadr\Documents\datascience-arquivos\Kaggle\Ashrae_Energy_Prediction\weather_train.csv')


# In[4]:


# Load the building metadata table from the same directory.
b_meta = pd.read_csv(r'E:\Users\quadr\Documents\datascience-arquivos\Kaggle\Ashrae_Energy_Prediction\building_metadata.csv')

Beispiel #15
0
 def eng():
     """Configure global pandas float display settings.

     After the call, floats are displayed with thousands separators and five
     decimals, and ``display.precision`` is 7.
     """
     import pandas as pd
     # Kept from the original; note that the float_format assignment below
     # replaces the engineering formatter installed by this call.
     pd.set_eng_float_format(accuracy=3, use_eng_prefix=True)
     # '{:, .5f}' was an invalid format spec (ValueError as soon as a float
     # was displayed); the thousands separator must directly precede the
     # precision.
     pd.options.display.float_format = '{:,.5f}'.format
     # Fully qualified key: the bare 'precision' pattern is ambiguous on
     # pandas versions that also define 'styler.format.precision'.
     pd.set_option('display.precision', 7)
Beispiel #16
0
def set_format():
    """Switch pandas' global float display to engineering notation.

    Two digits are shown after the decimal point and exponents are written
    as ``E+NN`` rather than as SI prefixes.
    """
    decimals = 2
    with_si_prefix = False
    pd.set_eng_float_format(decimals, with_si_prefix)