Code Example #1
 def remove(self, path):
     s = HDFStore(self.path)
     if path in s:
         print("removing %s" % path)
         s.remove(path)
         s.flush(fsync=True)
     s.close()
Code Example #2
 def put(self, path, obj):
     s = HDFStore(self.path)
     if path in s:
         print "updating %s" % path
         s.remove(path)
     s[path] = obj
     s.close()
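
The two methods above read like part of a small wrapper class that keeps a single HDF5 file path on the instance. Below is a minimal, self-contained sketch of how such a wrapper could be assembled and exercised; the class name H5Cache, the file name and the sample data are assumptions for illustration, not taken from the original snippets (PyTables must be installed for HDFStore to work).

import pandas as pd
from pandas import HDFStore


class H5Cache:
    """Tiny wrapper that stores and removes pandas objects in one HDF5 file."""

    def __init__(self, path):
        self.path = path

    def put(self, path, obj):
        # overwrite the key if it already exists
        s = HDFStore(self.path)
        if path in s:
            print("updating %s" % path)
            s.remove(path)
        s[path] = obj
        s.close()

    def remove(self, path):
        s = HDFStore(self.path)
        if path in s:
            print("removing %s" % path)
            s.remove(path)
        s.close()


cache = H5Cache("example_cache.h5")  # hypothetical file name
cache.put("prices", pd.DataFrame({"x": [1, 2, 3]}))
cache.remove("prices")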
Code Example #3
 def _put(self, path, obj):
     s = HDFStore(self.path)
     if path in s:
         print("updating %s" % path)
         s.remove(path)
         s.close()
     s = HDFStore(self.path)
     s[path] = obj
     s.flush(fsync=True)
     s.close()
Code Example #4
    def aggregate(hdf_store_loc,
                  file_pattern,
                  headerfile=None,
                  remove_part_files=False):
        df = None

        store = HDFStore(hdf_store_loc)
        store_keys = [w.replace('/', '') for w in store.keys()]

        print(
            f'Aggregating part files in {hdf_store_loc} for {file_pattern} into a single file'
        )

        for key in store_keys:
            if re.match(file_pattern.replace('*', '.+'), key):
                print(
                    f'********************* Key : {key} Matches pattern : {file_pattern.replace("*",".+")}'
                )
                #thisdf = pd.read_hdf(store_loc, key)
                thisdf = store.select(key)

                if df is None:
                    df = thisdf
                else:
                    # for gz files that do not have headers, assign headers.
                    try:
                        # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent
                        df = pd.concat([df, thisdf], ignore_index=True, sort=True)
                    except Exception as e:
                        print(f'Error while joining data: {e}')

                if remove_part_files:
                    store.remove(key)

        try:
            #df.to_hdf(store_loc, key=file_pattern.replace('*',''))
            store.put(key=file_pattern.replace('*', ''), value=df)
        except Exception as e:
            print(
                f'Exception while combining file for {file_pattern}: exception {e}'
            )

        store.close()
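
The core trick in the aggregation above is turning a shell-style pattern such as part_* into a regular expression before matching it against the store keys. A stand-alone sketch of just that step, with a made-up key list; fnmatch.fnmatch would achieve the same matching without hand-rolling the regex.

import re

# hypothetical key list, as returned by HDFStore.keys() with the leading '/' stripped
store_keys = ['part_2021_01', 'part_2021_02', 'summary']
file_pattern = 'part_*'

regex = file_pattern.replace('*', '.+')
matching = [k for k in store_keys if re.match(regex, k)]
print(matching)  # ['part_2021_01', 'part_2021_02']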
Code Example #5
File: of2liam.py  Project: TaxIPP-Life/til-core
def main(period=None):
    temps = time.clock()
    input_tab = "C:/openfisca/output/liam/" + "LiamLeg.h5"
    output_tab = "C:/Myliam2/Model/SimulTest.h5"

    store = HDFStore(input_tab)
    goal = HDFStore(output_tab)

    name_convertion = {"ind": "person", "foy": "declar", "men": "menage", "fam": "menage"}
    # work on all the tables first, then select each year
    # step 1

    for ent in ("ind", "men", "foy", "fam"):
        dest = name_convertion[ent]
        tab_in = store[ent]
        tab_out = goal["entities/" + dest]
        # work out which variables to keep
        # TODO: move this up to the of_on_liam level, but here it is convenient
        # because of the other table
        ident = "id" + ent
        if ent == "ind":
            ident = "noi"
        # keep the initial values
        to_remove = [x for x in tab_in.columns if x in tab_out.columns]
        # drop the identifiers except the one that will become id
        list_id = ["idmen", "idfoy", "idfam", "id", "quifoy", "quifam", "quimen", "noi"]
        list_id.remove(ident)
        to_remove = to_remove + [x for x in tab_in.columns if x in list_id]
        # do not forget to keep period
        to_remove.remove("period")
        tab_in = tab_in.drop(to_remove, axis=1)
        tab_in = tab_in.rename(columns={ident: "id"})
        tab_out = merge(tab_in, tab_out, how="right", on=["id", "period"], sort=False)
        goal.remove("entities/" + dest)
        goal.append("entities/" + dest, tab_out)
    #        new_tab = np.array(tab_out.to_records())

    store.close()
    goal.close()
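
The reusable idea in this script is pruning overlapping columns before the merge so that the join does not produce duplicate _x/_y columns. A small stand-alone sketch of that pattern with invented frames (the column names here are only illustrative):

import pandas as pd
from pandas import merge

tab_in = pd.DataFrame({'id': [1, 2], 'period': [2010, 2010], 'age': [30, 40], 'sexe': [0, 1]})
tab_out = pd.DataFrame({'id': [1, 2], 'period': [2010, 2010], 'sexe': [0, 1], 'salaire': [100, 200]})

# drop from tab_in every column already present in tab_out, except the merge keys
to_remove = [c for c in tab_in.columns if c in tab_out.columns and c not in ('id', 'period')]
tab_in = tab_in.drop(to_remove, axis=1)

merged = merge(tab_in, tab_out, how='right', on=['id', 'period'], sort=False)
print(merged)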
Code Example #6
File: test_file_handling.py  Project: Aathi410/Pro123
def test_multiple_open_close(setup_path):
    # gh-4409: open & close multiple times

    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        # single
        store = HDFStore(path)
        assert "CLOSED" not in store.info()
        assert store.is_open

        store.close()
        assert "CLOSED" in store.info()
        assert not store.is_open

    with ensure_clean_path(setup_path) as path:

        if pytables._table_file_open_policy_is_strict:
            # multiples
            store1 = HDFStore(path)
            msg = (
                r"The file [\S]* is already opened\.  Please close it before "
                r"reopening in write mode\."
            )
            with pytest.raises(ValueError, match=msg):
                HDFStore(path)

            store1.close()
        else:

            # multiples
            store1 = HDFStore(path)
            store2 = HDFStore(path)

            assert "CLOSED" not in store1.info()
            assert "CLOSED" not in store2.info()
            assert store1.is_open
            assert store2.is_open

            store1.close()
            assert "CLOSED" in store1.info()
            assert not store1.is_open
            assert "CLOSED" not in store2.info()
            assert store2.is_open

            store2.close()
            assert "CLOSED" in store1.info()
            assert "CLOSED" in store2.info()
            assert not store1.is_open
            assert not store2.is_open

            # nested close
            store = HDFStore(path, mode="w")
            store.append("df", df)

            store2 = HDFStore(path)
            store2.append("df2", df)
            store2.close()
            assert "CLOSED" in store2.info()
            assert not store2.is_open

            store.close()
            assert "CLOSED" in store.info()
            assert not store.is_open

            # double closing
            store = HDFStore(path, mode="w")
            store.append("df", df)

            store2 = HDFStore(path)
            store.close()
            assert "CLOSED" in store.info()
            assert not store.is_open

            store2.close()
            assert "CLOSED" in store2.info()
            assert not store2.is_open

    # ops on a closed store
    with ensure_clean_path(setup_path) as path:

        df = tm.makeDataFrame()
        df.to_hdf(path, "df", mode="w", format="table")

        store = HDFStore(path)
        store.close()

        msg = r"[\S]* file is not open!"
        with pytest.raises(ClosedFileError, match=msg):
            store.keys()

        with pytest.raises(ClosedFileError, match=msg):
            "df" in store

        with pytest.raises(ClosedFileError, match=msg):
            len(store)

        with pytest.raises(ClosedFileError, match=msg):
            store["df"]

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.get("df")

        with pytest.raises(ClosedFileError, match=msg):
            store.append("df2", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.put("df3", df)

        with pytest.raises(ClosedFileError, match=msg):
            store.get_storer("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.remove("df2")

        with pytest.raises(ClosedFileError, match=msg):
            store.select("df")

        msg = "'HDFStore' object has no attribute 'df'"
        with pytest.raises(AttributeError, match=msg):
            store.df
Code Example #7
 def remove(self, path):
     s = HDFStore(self.path)
     if path in s:
         print("removing %s" % path)
         s.remove(path)
     s.close()
Code Example #8
File: helper.py  Project: xmduhan/contest
def removeDataFrame(dfName):
    dbName = 'db.h5'
    s = HDFStore(dbName)
    if s.get_node(dfName):
        s.remove(dfName)
    s.close()
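
get_node returns None when the key is absent, which is why it can double as a guard here; the membership test used in the earlier examples is an equivalent check. A minimal sketch (the store name is only illustrative):

from pandas import HDFStore

s = HDFStore('db.h5')
if 'df' in s:  # same effect as: if s.get_node('df'):
    s.remove('df')
s.close()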
Code Example #9
File: utils.py  Project: alybel/fintf
def load_from_store_or_yahoo(start=None, end=None, symbol=None):
    append = False
    hdf = HDFStore(settings.storage_path)
    today = dt.datetime.today().date()

    yahoo_symbol = symbol
    symbol = clean_symbol(symbol)

    # in this case, data earlier than what is in the store is requested; the table needs to be rewritten
    if symbol in hdf:
        df = hdf[symbol]
        start_store = df.index.min()
        if isinstance(start, str):
            start = dt.datetime.strptime(start, '%Y-%m-%d')
        if start_store.date() > start:
            hdf.remove(symbol)
            lprint('start date was earlier than the oldest date in the storage. storage needs to be rewritten.')

    if symbol in hdf:
        df = hdf[symbol]
        end_store = df.index.max()

        # check if today is a weekend day
        weekday = dt.datetime.today().weekday()
        last_trading_day = today
        if weekday in [5, 6]:
            correction = 1 if weekday == 5 else 2
            last_trading_day = today - dt.timedelta(correction)

        # if the last trading day equals the max date in the store, then do not reload data
        if last_trading_day == end_store.date():
            lprint('loaded %s data from storage.' % symbol)
            return df

        # if the last entry in the store is older than the last trading day, load only the difference
        end = today + dt.timedelta(1)
        start = end_store
        append = True

    # if no store was found, use the start and end from above
    df = None
    count = 0
    while df is None and count < 10:
        try:
            df = get_yahoo_data(start=start, end=end, symbol=yahoo_symbol)
        except RemoteDataError:
            time.sleep(10 + int(np.random.rand() * 10))
        count += 1

    if df is None:
        raise Exception('Even after 10 trials data could not be loaded from yahoo')

    # remove blanks in the header
    df.columns = [x.replace(' ', '_') for x in df.columns]

    # store or append to hdf5 storage

    if symbol in hdf:
        # drop duplicates
        exist_df = hdf[symbol]
        df = df[~df.index.isin(exist_df.index)]

    if append:
        hdf.append(symbol, df, format='table', data_columns=True)
    else:
        df.drop_duplicates(inplace=True)
        hdf.put(symbol, df, format='table', data_columns=True)
    if not df.index.is_unique:
        lprint('index of %s is not unique' % symbol)
    return df
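
The storage step at the end of this loader relies on the difference between put, which (re)writes a key, and append, which adds rows to an existing table-format key. A minimal sketch of that pattern in isolation, with made-up data and file name:

import pandas as pd
from pandas import HDFStore

hdf = HDFStore('prices.h5')  # hypothetical store
old = pd.DataFrame({'close': [1.0, 2.0]},
                   index=pd.date_range('2024-01-01', periods=2))
new = pd.DataFrame({'close': [3.0]},
                   index=pd.date_range('2024-01-03', periods=1))

hdf.put('spy', old, format='table', data_columns=True)     # first write (or full rewrite)
hdf.append('spy', new, format='table', data_columns=True)  # later incremental writes
print(hdf['spy'])
hdf.close()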
Code Example #10
File: DataTable_from_liam.py  Project: AnneDy/Til
# there is something odd with people who marry and then divorce
# take the opportunity to handle conj = 0 or conj = -1 properly
        # if we do not stop here, it means there is no problem!!
        print(year, ent, diff1)
        for k in diff1:           

            pd.set_option('display.max_columns', 30)
            listind = table['ind'][table['ind'][ident]==k]
            print(listind)
            for indiv in np.unique(listind['id']):
                print(table['ind'].loc[table['ind']['id']==indiv,['id','period','sexe','idmen','quimen','idfoy','quifoy','conj','mere','pere']])
                pdb.set_trace()   
        
            
for year in years:
    goal.remove('survey_'+str(year))
    for ent in ('ind','men','foy','fam'):
        tab = table[ent].loc[table[ent]['period']==year]
        key = 'survey_'+str(year) + '/'+ent     
        goal.put(key, tab) 
#    if year == 2010:
#        pdb.set_trace()
#        tab = table[ent].ix[table[ent]['period']==year]
#        tab[:5]
#        len(tab['idfam'])
#        len(np.unique(tab['idfam']))
#        list_qui = tab['idfam']
#        double = list_qui.value_counts()[list_qui.value_counts()>1]
#        tabind = table['ind'].ix[table['ind']['period']==year]
        
        
Code Example #11
    def read_data(source_dir,
                  hdf_store_loc,
                  ignore_file_pattern='',
                  data_dict=None):
        # create a dictionary of columns that should be of type 'str'
        str_cols_dict = {
            config.CHURN_ACTIVITIES_FILE: [
                'Opportunity', 'Created By', 'Account ID', 'Company / Account',
                'Contact', 'Lead', 'Priority', 'Activity Type',
                'Task/Event Record Type', 'Task Subtype', 'Event Subtype',
                'Subject', 'Call Result', 'Topics Discussed', 'Comments',
                'Full Comments', 'Follow Up Subject', 'Follow Up Notes',
                'Name of Value Prop', 'Activity ID', 'Assigned', 'Date',
                'Product Name', 'Assigned Role', 'Assigned Role Display',
                'Created Date', 'Start', 'End', 'ECR Id', 'Parent ECR-ID'
            ],
            config.CHURN_RISKS_FILE: [
                'Opportunity ID', 'Opportunity Name', 'Sales Type', 'Stage',
                'Level 1 Product', 'Level 2 Product', 'Expected Close Date',
                'Subscription Start Date', 'Amount (converted) Currency',
                'Agreement Number', 'Account Name: ECR Id',
                'Account Name: Account Name', 'Risk ID', 'Risk Name',
                'Risk Type', 'Severity', 'Status', 'Created Date', 'Comments',
                'Competitor: Account Name'
            ],
            config.CHURN_PRODUCTS_FILE: [
                'Product Level 1', 'Product Level 2', 'Product Level 3',
                'Product Level 4'
            ],
            config.SIS_MAPPING_FILE: [
                'ACCOUNT_NAME', 'SIS_ID', 'HQ_SIS_ID', 'OLD_SIS_ID',
                'OLD_HQ_SIS_ID', 'CRM_ID', 'CRM_HQ_ID'
            ],
            config.ACCOUNT_ASSIGNMENT_PREFIX: [
                'BUSINESS_DIVISION', 'COUNTRY', 'CUSTOMER_NAME', 'ECRID',
                'LEVEL_12', 'LEVEL_13', 'LEVEL_14', 'LEVEL_15',
                'ORGANIZATION_TYPE', 'PROVINCE', 'SIZE', 'STATE', 'TERRITORY',
                'TERRITORY_OWNER', 'TERRITORY_TYPE', 'TIER'
            ],
            config.CONTRACTS_JOURNALS_PREFIX: [
                'Agreement End Date', 'Agreement Number',
                'Agreement Start Date', 'Business Division (Agreement SIS)',
                'Business Indicator', 'Calculated New/Renewal',
                'Country Name (Agreement SIS)', 'Division',
                'HQ SIS Id (Agreement SIS)', 'Invoice Date', 'Invoice Num',
                'Name  (Agreement SIS)', 'Parent Agreement Number',
                'Payment Term', 'Payment Term Description',
                'Payment Term Type', 'Product Line Level 1',
                'Product Line Level 2', 'Product Line Level 3',
                'Product Line Level 4', 'Product Revenue Type', 'RSO',
                'Renewal Exp Complete Date', 'SIS Id  (Agreement SIS)',
                'Saleable Product Name (Source)',
                'Sales Division (Agreement SIS)', 'Sales Type', 'Status',
                'Status Change Date', 'Subregion Grouping',
                'Subscription End Date', 'Subscription Start Date', 'WIP Flag'
            ],
            config.CONTRACTS_OTHER_PREFIX: [
                'Agreement End Date', 'Agreement Number',
                'Agreement Start Date', 'Business Division (Agreement SIS)',
                'Business Indicator', 'Calculated New/Renewal',
                'Country Name (Agreement SIS)', 'Division',
                'HQ SIS Id (Agreement SIS)', 'Invoice Date', 'Invoice Num',
                'Name  (Agreement SIS)', 'Parent Agreement Number',
                'Payment Term', 'Payment Term Description',
                'Payment Term Type', 'Product Line Level 1',
                'Product Line Level 2', 'Product Line Level 3',
                'Product Line Level 4', 'Product Revenue Type', 'RSO',
                'Renewal Exp Complete Date', 'SIS Id  (Agreement SIS)',
                'Saleable Product Name (Source)',
                'Sales Division (Agreement SIS)', 'Sales Type', 'Status',
                'Status Change Date', 'Subregion Grouping',
                'Subscription End Date', 'Subscription Start Date', 'WIP Flag'
            ],
            config.CANCELLATIONS_FILE: [
                'Source System', 'SIS Id  (Agreement SIS)', 'Source System',
                'HQ SIS Id (Agreement SIS)', 'Name  (Agreement SIS)',
                'Business Division (Agreement SIS)',
                'Sales Division (Agreement SIS)', 'Division', 'RSO',
                'Subregion Grouping', 'Country Name (Agreement SIS)',
                'WIP Flag', 'Status', 'Wip Type', 'Business Indicator',
                'Sales Type', 'Calculated New/Renewal', 'Payment Term',
                'Payment Term Description', 'Payment Term Type',
                'Status Change Date', 'Renewal Exp Complete Date',
                'Product Revenue Type', 'Product Line Level 1',
                'Product Line Level 2', 'Product Line Level 3',
                'Product Line Level 4', 'Saleable Product Name (Source)',
                'Agreement Number', 'Agreement Start Date',
                'Agreement End Date', 'Subscription Start Date',
                'Subscription End Date', 'Parent Agreement Number',
                'Currency(Entered)', 'Cancellation Reason'
            ],
            config.ECH_FILE: [
                'ecrid', 'name', 'city', 'Country ISO', 'Region', 'post_code',
                'Classification'
            ],
            config.INTERACTION_PREFIX: [
                'CONTACT_COUNTRY', 'CONTACT_REASON_LVL1_DESC',
                'CONTACT_REASON_LVL2_DESC', 'CONTACT_REASON_LVL3_DESC',
                'CONTACT_TYPE', 'CREATED_TO_CLOSED_DAYS',
                'CREATED_TO_INITIAL_RESPONSE_DAYS',
                'CUSTOMER_CLASSIFICATION_PRODUCT',
                'CUSTOMER_CLASSIFICATION_ROLE', 'CUSTOMER_CLASSIFICATION_TYPE',
                'ECR_ID', 'INCIDENT_AUTO_SOLVED', 'INCIDENT_CLOSED_DATETIME',
                'INCIDENT_CREATED_DATETIME', 'INCIDENT_ID',
                'INCIDENT_REOPENED', 'INCIDENT_SYSTEM', 'NUMBER_OF_RESPONSES',
                'OWNER_ID', 'OWNER_NAME', 'RESOLUTION_CODE_LVL1_DESC',
                'RESOLUTION_CODE_LVL2_DESC', 'RESOLUTION_CODE_LVL3_DESC',
                'SOURCE_LVL1_DESC', 'SOURCE_LVL2_DESC', 'STATUS'
            ],
            config.NPS_FILE: [
                'ECR_ID', 'DATE_OF_INTERVIEW', 'ORG_NAME', 'COUNTRY',
                'ORGANIZATION', 'PRODUCT_NAME_ROLLUP', 'PRODUCT_DETAIL',
                'JOB_ROLE', 'JOB_ROLE_GROUPED', 'COMPETITOR_SAT',
                'COMPETITOR_NAME', 'DEPARTMENT', 'INFLUENCE', 'CSAT',
                'CSAT_COMMENT', 'NPS_COMMENT', 'AT_RISK',
                'VALUE_FOR_MONEY_SCORE', 'SHARE_WITH_CUST_DETAILS'
            ],
            config.PRODUCT_ASSIGNMENT_FILE: [
                'TERRITORYNAME', 'OWNERNAME', 'OWNERID', 'BU',
                'ASSIGNTOTERRITORYNAME', 'ASSIGNTOTERRITORYOWNERNAME',
                'ASSIGNTOTERRITORYOWNERID', 'PRODUCT_LEVEL_1',
                'PRODUCT_LEVEL_2'
            ],
            config.USAGE_PREFIX: [
                'ACT_CLICK_DEPTH', 'ACT_DWELL_TIME_VISIT_MIN', 'ECR_ID',
                'LOY_DWELL_TIME_USER_MIN', 'POP_ACTIVE_USERS',
                'POP_PAGE_VIEWS', 'PROD_NAME', 'REPORT_AGG'
            ],
            config.HIERARCHY_PREFIX: [
                'CHILD_ECR', 'CHILD_NAME', 'CONSORTIUM', 'COUNTRY_CHILD',
                'COUNTRY_PARENT', 'HIERARCHY_TYPE', 'PARENT_ECR', 'PARENT_NAME'
            ]
        }

        # create a dictionary of columns that should be of type 'float'
        float_cols_dict = {
            config.JOURNAL_CONTRACTS_FILE: [
                'Bookigns - Committed Print(Rep)',
                'Bookings - Final Net Price - Agent Discount Amount(Rep)'
            ],
            config.OTHER_CONTRACTS_FILE: [
                'Bookigns - Committed Print(Rep)',
                'Bookings - Final Net Price - Agent Discount Amount(Rep)'
            ],
            config.CANCELLATIONS_FILE: [
                'Bookigns - Committed Print(Rep)',
                'Bookings - Final Net Price - Agent Discount Amount(Rep)'
            ]
        }

        print(f'Reading files from {source_dir}')
        print(f'New data store will be created at {hdf_store_loc}')
        # open the datastore and, if it exists, remove current content
        store = HDFStore(hdf_store_loc)
        print(
            f'Cleaning HDF store if it already exists. Store currently contains the following data: {store.keys()}'
        )
        for key in store.keys():
            store.remove(key)
        print(f'After cleaning, store currently contains {store.keys()}')

        # go through directory of files that you want to ingest
        for (dirpath, dirnames,
             filenames) in tqdm(os.walk(os.path.normpath(source_dir))):
            hdf_name = None
            for filename in filenames:
                # Ignore files that are marked as old
                ignore_this_file = False
                for p in ignore_file_pattern:
                    if fnmatch.fnmatch(filename, p): ignore_this_file = True

                if ignore_this_file: continue

                filedata = None

                # ingest .csvs
                if filename.endswith('.csv'):
                    # os.altsep is None on POSIX systems; os.path.join is the portable spelling
                    filedata = pd.read_csv(os.path.join(dirpath, filename),
                                           encoding="ISO-8859-1",
                                           low_memory=False)
                    hdf_name = str(filename.replace('.csv', '')) + '.pickle'
                    # sometimes csv column headers contain odd special characters; strip them
                    filedata.columns = [
                        w.replace('', '') for w in filedata.columns
                    ]

                elif filename.endswith('.csv.gz'):

                    headers = pd.read_excel(os.sep.join([dirpath, data_dict
                                                         ]))['Column Name']
                    filedata = pd.read_csv(os.sep.join([dirpath, filename]),
                                           compression='gzip',
                                           header=None,
                                           names=headers)
                    if 'usage'.upper() in dirpath.upper():
                        filename = filename.replace('data_', 'usage_')
                    elif 'interaction'.upper() in dirpath.upper():
                        filename = filename.replace('data_', 'interaction_')
                    elif 'hierarchy'.upper() in dirpath.upper():
                        filename = filename.replace('data_', 'hierarchy_')

                    hdf_name = str(filename.replace('.csv.gz', '')) + '.pickle'
                elif filename.endswith('.xlsx'):
                    filedata = pd.read_excel(os.sep.join([dirpath, filename]))

                    hdf_name = str(filename.replace('.xlsx', '')) + '.pickle'
                else:
                    print(
                        f'File {filename} is not read as it is not in csv/gz/xlsx format'
                    )
                    continue

                # data stored in an hdf datastore cannot have a name which starts with numbers, and it shouldn't contain spaces
                # replace these
                filename_stem = hdf_name.replace(".pickle", "")
                hdf_name = re.sub(r"^[0-9]+_", "", filename_stem)
                # replace space with underscore
                hdf_name = re.sub(r" ", "_", hdf_name)

                dict_lookup = filename_stem.split('_')[0]
                if "journals" in filename_stem:
                    dict_lookup = dict_lookup + '*journals'
                elif "other" in filename_stem:
                    dict_lookup = dict_lookup + '*other'
                elif config.ACCOUNT_ASSIGNMENT_PREFIX in filename_stem:
                    dict_lookup = config.ACCOUNT_ASSIGNMENT_PREFIX
                elif config.USAGE_PREFIX in filename_stem:
                    dict_lookup = config.USAGE_PREFIX
                elif config.INTERACTION_PREFIX in filename_stem:
                    dict_lookup = config.INTERACTION_PREFIX
                elif config.HIERARCHY_PREFIX in filename_stem:
                    dict_lookup = config.HIERARCHY_PREFIX
                else:
                    dict_lookup = filename_stem

                print(
                    f'FILESTEM : {filename_stem.split("_")[0]} to be matched for {filename_stem}'
                )

                try:
                    # set string cols to string datatype
                    if dict_lookup in str_cols_dict.keys():
                        filedata[str_cols_dict[dict_lookup]] = filedata[
                            str_cols_dict[dict_lookup]].astype(str)

                    # set float cols to float datatype
                    if dict_lookup in float_cols_dict.keys():
                        filedata[float_cols_dict[dict_lookup]] = filedata[
                            float_cols_dict[dict_lookup]].astype(float)
                except Exception:
                    print(
                        'Exception while typecasting columns, please check the columns match the dictionary declared above'
                    )

                if hdf_name is not None:
                    # save hdf file
                    #filedata.to_hdf(store, key=hdf_name)
                    store.put(key=hdf_name, value=filedata)
                del filedata

        # store should always be closed after use
        store.close()
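
Two details in this loader are worth reusing on their own: stripping a leading numeric prefix from the file stem and replacing spaces, so the result is a safe HDFStore key. A short sketch of just that sanitising step (the file names are invented):

import re

def to_hdf_key(filename_stem):
    # drop a leading "<digits>_" prefix and replace spaces with underscores
    key = re.sub(r"^[0-9]+_", "", filename_stem)
    return re.sub(r" ", "_", key)

print(to_hdf_key('2021_account assignment'))  # -> 'account_assignment'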
Code Example #12
File: liam2of.py  Project: antoineboiron/Til
def main(simulation, period=None, output=".h5"):
    temps = time.clock()    
    output_tab = path_til + "/output/to_run_leg.h5"
    name_convertion = {'person':'ind','declar':'foy','menage':'men', 'fam':'fam'}


    # work on all the tables first, then select each year
    # start with the individual table so the identifiers can be selected
    # step 1
    table = {}
    entities = simulation.entities
    for entity in entities:
        nom = entity.name
        if nom == 'person':
            ent = name_convertion[nom]
            # convert from PyTables to Pandas
            table[ent] = pd.DataFrame(entity.array.columns)
            # rename variables to make them OF ones
            table['ind'] = table['ind'].rename(columns={
                        'men': 'idmen', 'foy': 'idfoy', 'id': 'noi', 'statmarit': 'civilstate'})

    # get years
    years = np.unique(table['ind']['period'].values // 100)
    ent = 'ind'
    # create variables
    
# useless since agem is in simu    
#     table[ent]['agem'] = 12 * table[ent]['age'] 
    
    table[ent]['ageq'] = table[ent]['age']/5 - 4 
    table[ent]['ageq'] = table[ent]['ageq']*(table[ent]['ageq'] > 0) 
    table[ent]['ageq'] = 12 + (table[ent]['ageq']-12)*(table[ent]['ageq'] < 12) 
    # TODO: adjust for young widowers
    
    # create fam entity
    try:
        table[ent][['idfam','quifam']] = table[ent].loc[:,['idmen','quimen']]
    except:
        pdb.set_trace()
    
    # save information on qui == 0
    foy0 = table[ent].loc[table[ent]['quifoy']==0,['noi','idfoy','idmen','idfam','period']]
    men0 = table[ent].loc[table[ent]['quimen']==0,['noi','idfoy','idmen','idfam','period']]

#    # Work on the qui values when the simulation does not guarantee that nobody is left with qui==2
## useless because it is now done in the simulation, but perhaps better to redo here some day
## because it takes time in the simulation
#    time_qui = time.clock()
#    for ent in ('men','foy'): # 'fam' un jour...
#        print "Deal with qui for ", ent        
#        qui= 'qui'+ent
#        ident = 'id'+ent
#        trav = table['ind'].ix[table['ind'][qui]==2, [ident,qui,'period']]
#        for name, group in trav.groupby([ident,'period']):
#            to_add = range(len(group))
#            group[qui] = group[qui]+to_add
#            table['ind'].ix[group[qui].index, qui] = group[qui]
#        print "les qui pour ", ent," sont réglés"
#    time_qui = time.clock() - time_qui
#    print "le temps passé à s'occuper des qui a été",time_qui
    

    
    for entity in entities:
        nom = entity.name
        if nom in name_convertion:
            if nom != 'person': 
                pd.DataFrame(entity.array.columns)
                ent = name_convertion[nom]
                # convert from PyTables to Pandas
                table[ent] = pd.DataFrame(entity.array.columns)
                ident = 'id'+ent
                table[ent] = table[ent].rename(columns={'id': ident})
                table[ent] = merge(table[ent], eval(ent +'0'), how='left', left_on=[ident,'period'], right_on=[ident,'period'])
            # translate variables into OF ones for these entities
                
            if ent=='men':
                # nbinde is capped at 6 people, hence the value 5 in Python
                table[ent]['nbinde'] = (table[ent]['nb_persons']-1) * (table[ent]['nb_persons']-1 <=5) +5*(table[ent]['nb_persons']-1 >5)

    table['fam'] = men0 
    
    if period is not None:
        years=[period]
        print(years)
    
    # comment this out once the model is trusted, to save a little time
#    test = {}
#    for year in years: 
#        for nom in ('menage','declar'):
#            ent = name_convertion[nom] 
##            print ent, base, ident
#            test[ent] = pd.DataFrame(entity.array.columns).rename(columns={'id': ident})
#            test[ent] = test[ent].ix[test[ent]['period']==year,:]
#            
#            test0 = eval(ent +'0')[eval(ent +'0')['period']==year]
#            
#            tab = table[ent].ix[table[ent]['period']==year,['noi','id'+ent,'idfam']]
#            ind = table['ind'].ix[table['ind']['period']==year,['qui'+ent]] 
#            try:
#                list_ind =  ind[ind==0]
#            except:
#                pdb.set_trace()            
#            lidmen = test[ent][ident]
#            lidmenU = np.unique(lidmen)
#            diff1 = set(test0[ident]).symmetric_difference(lidmenU)
#            print year, ent, diff1
#            for k in diff1:           
#    
#                pd.set_printoptions(max_columns=30)
#                listind = table['ind'][table['ind'][ident]==k]
#                print listind
#                for indiv in np.unique(listind['noi']):
#                    print table['ind'].ix[table['ind']['noi']==indiv,['noi','period','sexe','idmen','quimen','idfoy','quifoy','conj','mere','pere']]
#                    pdb.set_trace()   
              
              

    #available_years = sorted([int(x[-4:]) for x in  store.keys()])              
              
    for year in years:    
        if output=='.h5':
            try: 
                os.remove(output_tab)
            except: 
                print("Attention, la table intermediaire n'a pas ete supprimee")
            goal = HDFStore(output_tab)             
            goal.remove('survey_'+str(year))
            for ent in ('ind','men','foy','fam'):
                tab = table[ent].loc[table[ent]['period'] // 100 == year]
                key = 'survey_'+str(year) + '/'+ent     
                goal.put(key, tab) 
            goal.close()
        else:
            for ent in ('ind','men','foy','fam'):
                table[ent] = table[ent].loc[table[ent]['period'] // 100 == year]
            return table       
Code Example #13
File: liam2of.py  Project: TaxIPP-Life/Til
def table_for_of(simulation, period=None, check_validity=False, save_tables=False):
    temps = time.clock()
    output_tab = os.path.join(path_til[0], "output", "to_run_leg.h5" )
    # work on all the tables first, then select each year
    # start with the individual table so the identifiers can be selected
    # step 1
    table = {}

    entities = simulation.entities
    entities_name = [e.name for e in simulation.entities]  # list, so .index() works under Python 3
    def _get_entity(name):
        position = entities_name.index(name)
        return simulation.entities[position]
        
    ind = _get_entity('person')
    table['ind'] = DataFrame(ind.array.columns)
    table['ind'] = table['ind'].rename(columns={'men': 'idmen', 'foy': 'idfoy', 'id': 'noi', 'statmarit': 'civilstate'})
    
    # create variables
    table['ind']['ageq'] = table['ind']['age']/5 - 4 
    table['ind']['ageq'] = table['ind']['ageq']*(table['ind']['ageq'] > 0) 
    table['ind']['ageq'] = 12 + (table['ind']['ageq']-12)*(table['ind']['ageq'] < 12) 
    # TODO: adjust for young widowers
    
    # create fam entity
    try:
        table['ind'][['idfam','quifam']] = table['ind'].loc[:,['idmen','quimen']]
    except:
        pdb.set_trace()

#    # Work on the qui values when the simulation does not guarantee that nobody is left with qui==2
## useless because it is now done in the simulation, but perhaps better to redo here some day
## because it takes time in the simulation
#    time_qui = time.clock()
#    for ent in ('men','foy'): # 'fam' un jour...
#        print "Deal with qui for ", ent        
#        qui= 'qui'+ent
#        ident = 'id'+ent
#        trav = table['ind'].ix[table['ind'][qui]==2, [ident,qui,'period']]
#        for name, group in trav.groupby([ident,'period']):
#            to_add = range(len(group)) 
#            group[qui] = group[qui]+to_add
#            table['ind'].ix[group[qui].index, qui] = group[qui]
#        print "les qui pour ", ent," sont réglés"
#    time_qui = time.clock() - time_qui
#    print "le temps passé à s'occuper des qui a été",time_qui
    ind = table['ind']
    for ent in ['men','foy']:
        entity = _get_entity(of_name_to_til[ent])

        table[ent] = DataFrame(entity.array.columns)
        id = 'id' + ent
        qui = 'qui' + ent
        table[ent] = table[ent].rename(columns={'id': id})
        # work on the qui values
        nb_qui = ind.loc[ind[qui]>1, ['noi',id,qui]].groupby(id, sort=True).size()
        if len(nb_qui)>0:
            new_qui = concatenated_ranges(nb_qui) + 2 
            table['ind'] = table['ind'].sort_values(id)  # note the sort
            col_qui = table['ind'][qui]
            col_qui[col_qui>1] = new_qui
            table['ind'][qui] = col_qui 
        
        
        # information on qui == 0
        qui0 = table['ind'].loc[table['ind']['qui' + ent]==0,['noi','idfoy','idmen','idfam','period']] 
        table[ent] = merge(table[ent], qui0, how='left', left_on=[id,'period'], right_on=[id,'period'])
    
        if ent=='men':
            # nbinde is capped at 6 people, hence the value 5 in Python
            table[ent]['nbinde'] = (table[ent]['nb_persons']-1) * (table[ent]['nb_persons']-1 <=5) +5*(table[ent]['nb_persons']-1 >5)
            table['fam'] = qui0
    
    # remove non-ordinary households
    cond = (table['ind']['idmen'] >= 10) & (table['ind']['idfoy'] >= 10)
    table['ind'] = table['ind'][cond]
    table['men'] = table['men'][table['men']['idmen']>=10]
    table['foy'] = table['foy'][table['foy']['idfoy']>=10]
    table['fam'] = table['fam'][table['fam']['idfam']>=10]
    # get years
    years = np.unique(table['ind']['period'].values // 100)
    if period is not None:
        years=[period]
        print(years)

    if check_validity:
        for year in years: 
            ind = table['ind'] 
            for ent in ['men','foy']: #fam
                id = 'id' + ent
                qui = 'qui' + ent
                tab = table[ent]
                try:
                    assert ind.groupby([id,qui]).size().max() == 1
                except:
                    print(ent)
                    pb = ind.groupby([id,qui]).size() > 1
                    print(ind.groupby([id,qui]).size()[pb])
                    pdb.set_trace()
                    print(ind[ind[id]==43][['noi',id,qui]])
                
                qui0 = ind[ind[qui]==0]
                try:  
                    assert qui0[id].isin(tab[id]).all()
                except:
                    cond = tab[id].isin(qui0[id])
                    print(tab[~cond])
                    pdb.set_trace()
                try:
                    assert tab[id].isin(qui0[id]).all()
                except:
                    cond = tab[id].isin(qui0[id])
                    print(tab[~cond])
                    pdb.set_trace()

    for year in years:    
        if save_tables:
            try: 
                os.remove(output_tab)
            except: 
                print("Attention, la table intermediaire n'a pas ete supprimee")
            goal = HDFStore(output_tab)             
            goal.remove('survey_'+str(year))
            for ent in ('ind','men','foy','fam'):
                tab = table[ent].loc[table[ent]['period'] // 100 == year]
                key = 'survey_'+str(year) + '/'+ent     
                goal.put(key, tab) 
            goal.close()
        else:
            for ent in ('ind','men','foy','fam'):
                table[ent] = table[ent].loc[table[ent]['period'] // 100 == year]
            return table