Example #1
0
    def test_info_duplicate_columns(self):
        io = StringIO()

        # it works!
        frame = DataFrame(np.random.randn(1500, 4),
                          columns=['a', 'a', 'b', 'b'])
        frame.info(buf=io)
def test_survey_simulation():
    """Run a survey simulation for 2011 and summarise ``revenu_disponible``.

    When the calculation raises ``NaNCreationError``, the first ten rows
    named by the error are replayed in a debug simulation for the failing
    column, and the original error is then re-raised.
    """
    from pandas import DataFrame

    year = 2011
    input_data_frame = get_input_data_frame(year)
    tax_benefit_system_class = openfisca_france.FranceTaxBenefitSystem()
    survey_scenario = SurveyScenario().init_from_data_frame(
        input_data_frame = input_data_frame,
        tax_benefit_system_class = tax_benefit_system_class,
        year = year,
        )
    simulation = survey_scenario.new_simulation()
    try:
        revenu_disponible = DataFrame({"revenu_disponible": simulation.calculate('revenu_disponible')})
    except NaNCreationError as error:
        index = error.index
        entity = error.entity
        column_name = error.column_name
        # Restrict the input to the first ten offending rows to keep the
        # debug run small.
        input_data_frame_debug = filter_input_data_frame(
            simulation.input_data_frame,
            entity,
            index[:10],
            )
        survey_scenario_debug = SurveyScenario()
        simulation_debug = survey_scenario_debug.new_simulation(
            debug = True,
            input_data_frame = input_data_frame_debug,
            tax_benefit_system_class = tax_benefit_system_class,
            year = year,
            )
        simulation_debug.calculate(column_name)
        # Without this, execution fell through to the summary below with
        # ``revenu_disponible`` unbound, replacing the real error by a
        # NameError.
        raise

    # ``info`` prints the summary itself and returns None, so it is not
    # wrapped in a print call.
    revenu_disponible.info()
    print('finished')
Example #3
0
    def test_info(self):
        io = StringIO()
        self.frame.info(buf=io)
        self.tsframe.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))

        frame.info()
        frame.info(verbose=False)
Example #4
0
class GetDtypeCounts(object):
    """Timings for ``get_dtype_counts`` and ``info`` on a very wide frame."""
    # 2807
    def setup(self):
        """Create the 10 x 10000 random frame used by both timings."""
        values = np.random.randn(10, 10000)
        self.df = DataFrame(values)

    def time_frame_get_dtype_counts(self):
        """Call ``get_dtype_counts`` on the wide frame; the result is discarded."""
        frame = self.df
        frame.get_dtype_counts()

    def time_info(self):
        """Call ``info`` on the wide frame; the summary goes to standard output."""
        frame = self.df
        frame.info()
Example #5
0
    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]], columns=["a", "a"])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        self.assertEqual("a    1 non-null int64\n", lines[3])
        self.assertEqual("a    1 non-null float64\n", lines[4])
Example #6
0
    def test_info_duplicate_columns_shows_correct_dtypes(self):
        # GH11761
        io = StringIO()

        frame = DataFrame([[1, 2.0]],
                          columns=['a', 'a'])
        frame.info(buf=io)
        io.seek(0)
        lines = io.readlines()
        assert 'a    1 non-null int64\n' == lines[3]
        assert 'a    1 non-null float64\n' == lines[4]
Example #7
0
    def test_info(self):
        io = StringIO()
        self.frame.info(buf=io)
        self.tsframe.info(buf=io)

        frame = DataFrame(np.random.randn(5, 3))

        import sys
        sys.stdout = StringIO()
        frame.info()
        frame.info(verbose=False)
        sys.stdout = sys.__stdout__
def read_spss(spss_file_path):
    """Dump an SPSS file to standard output and return its rows as a DataFrame.

    The file is read twice: first with ``returnHeader=True`` to print every
    record, then without it to build the frame.

    :param spss_file_path: path of the SPSS file to read.
    :return: a ``DataFrame`` built from the records of the second read.
    """
    print(spss_file_path)

    with SavReader(spss_file_path, returnHeader=True) as reader:
        for record in reader:
            print(record)

    # Use the reader as a context manager here too, so the second handle is
    # closed instead of being left to the garbage collector.
    with s.SavReader(spss_file_path) as reader:
        data_frame = DataFrame(list(reader))
    print(data_frame.info())

    return data_frame
Example #9
0
 def test_info_shows_column_dtypes(self):
     dtypes = ["int64", "float64", "datetime64[ns]", "timedelta64[ns]", "complex128", "object", "bool"]
     data = {}
     n = 10
     for i, dtype in enumerate(dtypes):
         data[i] = np.random.randint(2, size=n).astype(dtype)
     df = DataFrame(data)
     buf = StringIO()
     df.info(buf=buf)
     res = buf.getvalue()
     for i, dtype in enumerate(dtypes):
         name = "%d    %d non-null %s" % (i, n, dtype)
         assert name in res
Example #10
0
    def test_to_string_unicode_columns(self):
        df = DataFrame({u'\u03c3' : np.arange(10.)})

        buf = StringIO()
        df.to_string(buf=buf)
        buf.getvalue()

        buf = StringIO()
        df.info(buf=buf)
        buf.getvalue()

        result = self.frame.to_string()
        self.assert_(isinstance(result, unicode))
Example #11
0
    def test_info_memory_usage_qualified(self):

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=[1, 2, 3])
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=list('ABC'))
        df.info(buf=buf)
        assert '+' in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), range(3)]))
        df.info(buf=buf)
        assert '+' not in buf.getvalue()

        buf = StringIO()
        df = DataFrame(1, columns=list('ab'),
                       index=pd.MultiIndex.from_product(
                           [range(3), ['foo', 'bar']]))
        df.info(buf=buf)
        assert '+' in buf.getvalue()
Example #12
0
    def test_info_categorical_column(self):

        # make sure it works
        n = 2500
        df = DataFrame({'int64': np.random.randint(100, size=n)})
        df['category'] = Series(np.array(list('abcdefghij')).take(
            np.random.randint(0, 10, size=n))).astype('category')
        df.isna()
        buf = StringIO()
        df.info(buf=buf)

        df2 = df[df['category'] == 'd']
        buf = StringIO()
        df2.info(buf=buf)
Example #13
0
    def test_info_max_cols(self):
        df = DataFrame(np.random.randn(10, 5))
        for len_, verbose in [(5, None), (5, False), (10, True)]:
            # For verbose always      ^ setting  ^ summarize ^ full output
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, verbose in [(10, None), (5, False), (10, True)]:

            # max_cols no exceeded
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, verbose=verbose)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

        for len_, max_cols in [(10, 5), (5, 4)]:
            # setting truncates
            with option_context('max_info_columns', 4):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)

            # setting wouldn't truncate
            with option_context('max_info_columns', 5):
                buf = StringIO()
                df.info(buf=buf, max_cols=max_cols)
                res = buf.getvalue()
                self.assertEqual(len(res.strip().split('\n')), len_)
Example #14
0
 def test_info_shows_column_dtypes(self):
     dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
               'complex128', 'object', 'bool']
     data = {}
     n = 10
     for i, dtype in enumerate(dtypes):
         data[i] = np.random.randint(2, size=n).astype(dtype)
     df = DataFrame(data)
     buf = StringIO()
     df.info(buf=buf)
     res = buf.getvalue()
     for i, dtype in enumerate(dtypes):
         name = '%d    %d non-null %s' % (i, n, dtype)
         assert name in res
Example #15
0
    def test_info_wide(self):
        from pandas import set_option, reset_option
        io = StringIO()
        df = DataFrame(np.random.randn(5, 101))
        df.info(buf=io)

        io = StringIO()
        df.info(buf=io, max_cols=101)
        rs = io.getvalue()
        self.assertTrue(len(rs.splitlines()) > 100)
        xp = rs

        set_option('display.max_info_columns', 101)
        io = StringIO()
        df.info(buf=io)
        self.assertEqual(rs, xp)
        reset_option('display.max_info_columns')
Example #16
0
# Draw 1000 names from ``names`` and 1000 birth counts.
# NOTE(review): the ``low=``/``high=`` keywords suggest ``random`` is
# numpy.random rather than the stdlib module -- confirm against the imports.
random_names = [ names[random.randint(low=0, high = len(names))] for i in range(1000) ]
births = [ random.randint(low=0, high=1000) for i in range(1000) ]

#print random_names[:10]
#print births[:10]

# Pair each name with its birth count.
dataset = zip(random_names,births)

df = DataFrame(data=dataset, columns=['Names','Births'])
#print df[:10]

# Write the frame without index or header, then read it back with explicit
# column names.
df.to_csv("births1880.txt",index=False,header=False)

df = read_csv(r'./births1880.txt',names=["Names","Births"])
print "df.info over all info of df"
print df.info()
print "df.head - first 5 rows"
print df.head()
# The CSV file is only needed for the round trip above.
import os
os.remove(r'./births1880.txt')

uqNames = df['Names'].unique()

print "df['names'].unique()"
print uqNames

print "df.names.describe()"
print df['Names'].describe()

# ``df`` is rebound to the groupby object here, so the final print shows that
# object rather than a frame.
df = df.groupby("Names")  #group by name
print df
Example #17
0
#plt.show()

# Add a per-row prediction for horizon t = 1 using the row's 'frequency',
# 'recency' and 'T' values, then show the five rows with the largest value.
# ``bgf`` and ``data`` are defined outside this excerpt.
t = 1
data['predicted_purchases'] = data.apply(lambda r: bgf.conditional_expected_number_of_purchases_up_to_time(t, r['frequency'], r['recency'], r['T']), axis = 1)
print data.sort('predicted_purchases').tail(5)

from lifetimes.datasets import load_transaction_data
from lifetimes.utils import summary_data_from_transaction_data

# Inspect the bundled transaction data set.
transaction_data = load_transaction_data()
print transaction_data.head()
print type(transaction_data)
print transaction_data.columns

print data.columns
print data.head()

# Same prediction as above with horizon t = 10; the column is overwritten.
t = 10
data['predicted_purchases'] = data.apply(lambda r: bgf.conditional_expected_number_of_purchases_up_to_time(t, r['frequency'], r['recency'], r['T']), axis = 1)
print data

# Small hand-written frame built from a list of dicts.
from pandas import DataFrame
d = [{'id': 1, 'R':23, 'F':12, 'M':12.5}, {'id': 2,'R':43, 'F':1, 'M':120.5}, {'id': 3,'R':203, 'F':2, 'M':19.5}]
test = DataFrame(d)
print test
print test.info()
print test['R']

# ``GammaGammaFitter`` is not imported within this excerpt.
ggf = GammaGammaFitter(penalizer_coef=0)
Example #18
0
    def test_info_memory_usage(self):
        """Exercise the memory-usage line of ``info`` and ``memory_usage``.

        Covers the ``memory_usage`` argument of ``info`` (True, False,
        'deep'), the trailing ``+`` on the reported figure, deep versus
        shallow sizes, a frame with duplicate column labels, and the
        difference between ``memory_usage(deep=True)`` and ``sys.getsizeof``.
        """
        # Ensure memory usage is displayed, when asserted, on the last line
        dtypes = ['int64', 'float64', 'datetime64[ns]', 'timedelta64[ns]',
                  'complex128', 'object', 'bool']
        data = {}
        n = 10
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        # The same buffer is reused and never reset below, so every ``info``
        # call appends to it; only the last line (``res[-1]``), which comes
        # from the most recent call, is inspected.
        buf = StringIO()
        # display memory usage case
        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        self.assertTrue("memory usage: " in res[-1])
        # do not display memory usage case
        df.info(buf=buf, memory_usage=False)
        res = buf.getvalue().splitlines()
        self.assertTrue("memory usage: " not in res[-1])

        df.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        # memory usage is a lower bound, so print it as XYZ+ MB
        self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

        df.iloc[:, :5].info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        # excluded column with object dtype, so estimate is accurate
        self.assertFalse(re.match(r"memory usage: [^+]+\+", res[-1]))

        # A string-labelled index also makes the figure end in '+' ...
        df_with_object_index = pd.DataFrame({'a': [1]}, index=['foo'])
        df_with_object_index.info(buf=buf, memory_usage=True)
        res = buf.getvalue().splitlines()
        self.assertTrue(re.match(r"memory usage: [^+]+\+", res[-1]))

        # ... unless 'deep' is requested, in which case no '+' is expected.
        df_with_object_index.info(buf=buf, memory_usage='deep')
        res = buf.getvalue().splitlines()
        self.assertTrue(re.match(r"memory usage: [^+]+$", res[-1]))

        # The deep total must exceed the shallow one for this frame.
        self.assertTrue(df_with_object_index.memory_usage(index=True,
                                                          deep=True).sum()
                        > df_with_object_index.memory_usage(index=True).sum())

        df_object = pd.DataFrame({'a': ['a']})
        self.assertTrue(df_object.memory_usage(deep=True).sum()
                        > df_object.memory_usage().sum())

        # Test a DataFrame with duplicate columns
        dtypes = ['int64', 'int64', 'int64', 'float64']
        data = {}
        n = 100
        for i, dtype in enumerate(dtypes):
            data[i] = np.random.randint(2, size=n).astype(dtype)
        df = DataFrame(data)
        # Relabel the columns with the dtype names, which repeats 'int64'.
        df.columns = dtypes

        # Ensure df size is as expected
        # (cols * rows * bytes) + index size
        df_size = df.memory_usage().sum()
        exp_size = len(dtypes) * n * 8 + df.index.nbytes
        self.assertEqual(df_size, exp_size)

        # Ensure number of cols in memory_usage is the same as df
        size_df = np.size(df.columns.values) + 1  # index=True; default
        self.assertEqual(size_df, np.size(df.memory_usage()))

        # assert deep works only on object
        self.assertEqual(df.memory_usage().sum(),
                         df.memory_usage(deep=True).sum())

        # test for validity
        # (the results below are discarded; the calls only need to succeed)
        DataFrame(1, index=['a'], columns=['A']
                  ).memory_usage(index=True)
        DataFrame(1, index=['a'], columns=['A']
                  ).index.nbytes
        df = DataFrame(
            data=1,
            index=pd.MultiIndex.from_product(
                [['a'], range(1000)]),
            columns=['A']
        )
        df.index.nbytes
        df.memory_usage(index=True)
        df.index.values.nbytes

        # sys.getsizeof will call the .memory_usage with
        # deep=True, and add on some GC overhead
        diff = df.memory_usage(deep=True).sum() - sys.getsizeof(df)
        self.assertTrue(abs(diff) < 100)
Example #19
0
        random_status = [status[np.randint(low=0,high=len(status))] for i in range(len(rng))]
        
        # State pool
        states = ['GA','FL','fl','NY','NJ','TX']
        
        # Make a random list of states
        random_states = [states[np.randint(low=0,high=len(states))] for i in range(len(rng))]
        
        Output.extend(zip(random_states, random_status, data, rng))
    
    return Output

# Now that we have a function to generate our test data, let's create some data and stick it into a dataframe
dataset = CreateDataSet(4)
df = DataFrame(data=dataset, columns=['State','Status','CustomerCount','StatusDate'])
df.info()
df.head()

# Save results to excel
df.to_excel('Lesson3.xlsx', index=False)
print('Done')

# Grab data from excel
# read_excel?

# Raw string: in a normal literal the backslashes start escape sequences, and
# "\U" is rejected outright by Python 3. The value is unchanged on Python 2.
Location = r"C:\Users\ABaker\Documents\Python Scripts\Lesson3.xlsx"

# Parse a specific sheet
df = read_excel(Location, 0, index_col='StatusDate')
df.dtypes
df.head()
%pylab inline
import seaborn
import matplotlib.dates as md
from matplotlib import pyplot as plt
from sklearn import preprocessing
from zoo.pipeline.api.keras.layers import Dense, Dropout, LSTM
from zoo.pipeline.api.keras.models import Sequential

# Load the NYC taxi data set. Only a missing file gets the download hint;
# the error is re-raised because nothing below can run without ``df``.
# Other failures (for example parse errors) propagate unchanged instead of
# being reported as a missing file.
dataset_path = "/home/cdsw/nyc_taxi.csv"
try:
    df = pd.read_csv(dataset_path)
except FileNotFoundError:
    print("nyc_taxi.csv doesn't exist")
    print("you can run $ANALYTICS_ZOO_HOME/bin/data/NAB/nyc_taxi/get_nyc_taxi.sh to download nyc_taxi.csv")
    raise

print(df.info())

# check the timestamp format and frequency
print(df['timestamp'].head(10))

# check the mean of passenger number
print(df['value'].mean())

# change the type of timestamp column for plotting
df['datetime'] = pd.to_datetime(df['timestamp'])

# visualisation of anomaly throughout time (viz 1)
fig, ax = plt.subplots(figsize=(12, 5))

ax.plot(df['datetime'], df['value'], color='blue', linewidth=0.6)
ax.set_title('NYC taxi passengers throughout time')
Example #21
0
import numpy as np
import pandas as pd
from pandas import Series
from pandas import DataFrame

import extract_2

# ``soup_dict`` comes from the project-local ``extract_2`` module.
soup = extract_2.soup_dict

# Build a frame from it and transpose, swapping rows and columns.
data = DataFrame(soup)
data = data.T

# Overview of the frame, then the frequency of each 'source_register' value.
print data.info()
print data['source_register'].value_counts()

def after_str(data):
    """Replace labelled sizes in ``data['target_size']`` by the number after the label.

    For each known leading label (for example ``'Total '``), the entries
    containing it are printed (last five) and then overwritten in place with
    the digits that directly follow the label. An entry that contains the
    label without digits after it becomes missing, as before.

    :param data: frame with a string column ``'target_size'``; modified in place.
    """
    patterns = ['Planned Sample Size: ', 'UK Sample Size: ', 'More than ', 'Total ']
    for pattern in patterns:
        matched = data['target_size'][data['target_size'].str.contains(pattern)]
        print(matched.tail())
        if matched.empty:
            continue
        # findall yields [(label, digits)] per entry; keep the digits.
        extracted = matched.str.findall('(' + pattern + ')([0-9]+)').str[0].str[1]
        # One label-aligned assignment on the frame itself. The previous
        # element-wise ``data['target_size'].ix[i] = ...`` was a chained
        # assignment that can write to a temporary copy.
        data.loc[matched.index, 'target_size'] = extracted

def before_str(data):
    """Print the last entries of ``data['target_size']`` matching each trailing label.

    NOTE(review): the original also built a ``'([0-9]+)(label)'`` pattern and
    kept the matches in a local without using either; the function looks
    truncated (compare ``after_str``). Only the visible behaviour, printing,
    is kept here.

    :param data: frame with a string column ``'target_size'``.
    """
    # The second pattern is a regular expression with an escaped parenthesis,
    # so it is written as a raw string to avoid an invalid "\(" escape.
    patterns = [' patients to be recruited', r' \(212 by end of recruitment']
    for pattern in patterns:
        matched = data['target_size'][data['target_size'].str.contains(pattern)]
        print(matched.tail())
def create_fip(year = None):
    """Create the 'fipDat' table containing the 'fip individuals' for ``year``.

    Dependants ('personnes à charge', pac) are decoded from the ``anaisenf``
    string of each tax form, de-duplicated, and matched against the
    individuals stored under ``indivim_<year>``. Two tables are written to
    the temporary store: ``pacIndiv_<year>`` (matched dependants) and
    ``fipDat_<year>`` (dependants found only on tax forms).

    :param year: survey year; must not be None.
    """
    assert year is not None
    # fip: 'fichier d'imposition des personnes' (personal tax file)
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    temporary_store = TemporaryStore.create(file_name = "erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    # Each pac takes 5 characters (1 letter + 4 digits of the birth year).
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    # Placeholder values; every column is overwritten in the loop below.
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    # TODO: add box I: "of which children holding the disability card"
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), "Certains type de PAC sont inconnus"
    # TODO: find a more explicit message

#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filter to work on F & G only

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: rows with distinct declar/naia pairs and twins are kept,
    #       the others (both F and G) are removed
    log.info(u"longueur fip {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    # Rows that are neither F/G nor H/I were never flagged: keep them.
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "Il y a des valeurs manquantes de la variable naia"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    # Matching keys: (birth year, first 29 characters of the declaration id).
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "Les dtypes de pac.naia {} et indvifip.naia {} sont différents".format(pac.naia.dtype, indivifip.naia.dtype)

    # fip keeps the tax-form dependants matching neither key of ``pac``.
    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
#   We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
            log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
            # Both merges are empty: fall back to the (empty) first result so
            # that ``pacInd`` is bound and still carries the expected columns.
            # It was left undefined before, raising NameError just below.
            pacInd = pac_ind1
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))

    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident (ménage) for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
# indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

# # TODO: declar2 is not handled for now
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DataFrame
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

## TODO: actrec issue for fip children aged 16 to 20: we do not know whether they are students or employees
## TODO: issue with the birth months of FIP children: see whether these values can be recovered. Alexis: clearly not

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    # Decrement noi of duplicated (noi, ident) rows until every pair is unique.
    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
# In[71]:

# Preview the submission frame (defined outside this excerpt).
submission.head()


# In[75]:

# Write the submission without the index column.
submission.to_csv('submit1_KMeans.csv',index=False)


# The 'Solution' column must also be converted from float to int,
# otherwise the submission obtains a score of 0.

# In[82]:

# Column summary before the conversion.
submission.info()


# In[91]:

submission['Solution']=submission['Solution'].astype(int)


# In[92]:

# Column summary after the conversion.
submission.info()


# In[93]:

# Frequency of each 'Solution' value.
submission['Solution'].value_counts()
    # Accumulate, for every sampling moment between ``firststart`` and
    # ``laststart``, the usage of the tasks whose [starttime, endtime)
    # interval contains that moment.
    # NOTE(review): the ``break`` assumes ``samples_dicts`` iterates in
    # increasing moment order -- confirm where it is built.
    for moment in samples_dicts:
        if moment < firststart:
            continue
        if moment >laststart:
            break
        task_usage_moment_df = task_usage_df[(task_usage_df['starttime'] <= moment) &
                                             (moment < task_usage_df['endtime'])]
        # print task_usage_moment_df.info()
        samples_dicts[moment]['cpu_usage'] += sum(task_usage_moment_df['cpu_usage'])
        samples_dicts[moment]['mem_usage'] += sum(task_usage_moment_df['mem_usage'])
        samples_dicts[moment]['disk_io_time'] += sum(task_usage_moment_df['disk_io_time'])
        samples_dicts[moment]['disk_space'] += sum(task_usage_moment_df['mean_local_disk_space'])
        samples_dicts[moment]['number_of_running_task'] += len(task_usage_moment_df['cpu_usage'])
    # Every time the counter reaches 50, write the samples gathered so far
    # to CSV and reset the counter.
    if (totalreadfile == 50):
        samples_df = DataFrame(samples_dicts.values())
        print samples_df.info()
        try:
            samples_df.to_csv(path.join(results_directory,'machine_usage_sampling_machineid_'+str(machine_id)+'_interval_'+str(interval)
                                            +'.csv'),index=False)
        # NOTE(review): the bare except also catches KeyboardInterrupt and
        # SystemExit. The message is Vietnamese without diacritics, probably
        # "could not write the csv file".
        except:
            print 'khong ghi duoc file csv'
        totalreadfile = 0

# Final write of the accumulated samples.
# list(...) keeps this working where dict.values() returns a view, not a list.
samples_df = DataFrame(list(samples_dicts.values()))
# ``info`` prints the summary itself and returns None, so it is not wrapped
# in a print call.
samples_df.info()
try:
    samples_df.to_csv(
        path.join(
            results_directory,
            'machine_usage_sampling_machineid_' + str(machine_id)
            + '_interval_' + str(interval) + '.csv'),
        index=False)
except Exception:
    # Still best-effort, but KeyboardInterrupt and SystemExit are no longer
    # swallowed as they were by the bare ``except:``.
    print('khong ghi duoc file csv')
# 4.13.2016
# @totallygloria


import json
from pandas import DataFrame, Series
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

path = 'ch02/usagov_bitly_data2012-03-16-1331923249.txt'
# One JSON document per line. The context manager closes the file, which
# the bare ``open`` inside the comprehension never did.
with open(path, 'rb') as source:
    records = [json.loads(line) for line in source]

frame = DataFrame(records)

# ``info`` prints the summary itself and returns None, so it is not wrapped
# in a print call.
frame.info()

tz_counts = frame['tz'].value_counts()
cy_counts = frame['cy'].value_counts()
l_counts = frame['l'].value_counts()

print(tz_counts[:10])
# print cy_counts[:20]
# print l_counts[:20]

# Label missing and empty time zones explicitly before counting again.
clean_tz = frame['tz'].fillna('Missing')

clean_tz[clean_tz == ''] = 'TZ Unknown'

tz_counts = clean_tz.value_counts()