Example #1
def elemental_mode(self, mand_prob_l1, veh_segment, mand_prob_append, purpose, trips, seed):
    """
    This function samples

    :param self:
    :param mand_prob_l1:
    :param veh_segment:
    :param mand_prob_append:
    :param purpose:
    :param trips:
    :param seed:
    :return:
    """

    # null dataframe
    mand_level1 = pd.DataFrame()
    sampled_df = pd.DataFrame()
    mand_l1 = pd.DataFrame()
    collect_df = {}

    # generate the appropriate df for sampling by veh_type
    # and then attach the production, destination, and market segment
    mand_l1 = self.vehtype_prob(mand_prob_l1, veh_segment)  # get the columns of prob that will be sampled
    mand_level1 = common.concat_df(mand_prob_append, mand_l1, 1)  # concat function

    # Now prepare the level1 file for sampling and only keep relevant columns
    mand_level2, df_join = self.prob_df_longtermchoice(trips, purpose, mand_level1, veh_segment)
    mand_level2 = mand_level2.iloc[:, 0:52]

    # sample using Cheval
    sampled_df = pd.DataFrame(sample_from_weights(mand_level2, randomizer=seed,
                                                  astype='category', n_threads=1, n_draws=1)).reset_index()
    sampled_df.columns = ['Production Zone', 'Destination Zone', 'Market Segment', 'Mode']

    # create a flag to help select the records in the trips dataframe. Creating the flag allows us to select
    # exactly the same number of rows in the trips dataframe as in sampled_df.
    # Also sort the df to ensure that we don't end up concatenating the wrong o-ds
    sampled_df['flag'] = sampled_df['Production Zone'].astype(str) + sampled_df['Destination Zone'].astype(str) + \
                         sampled_df['Market Segment'].astype(str)
    sampled_df = sampled_df.sort_values(['Production Zone', 'Destination Zone', 'Market Segment'])
    list_un = sampled_df['flag'].unique().tolist()

    # select from the trips dataframe the records that correspond to the sampled df using the flag. Once again sort
    # to ensure proper concatenation
    df_join = df_join.loc[(df_join['flag'].isin(list_un))]
    df_join = df_join.sort_values(['taz_i', 'taz_j', 'market_seg'])

    # concatenate the data. Concatenation is needed because
    # the flag is not yet unique and a merge would result in a larger dataframe than what we started with
    collect_df[veh_segment] = common.concat_df(df_join, sampled_df, 1)

    print("Vehicle Type: %s" % (veh_segment), len(df_join), len(mand_level1), len(sampled_df), len(mand_l1),
          len(mand_prob_l1))

    # now make one dataframe across vehicle segments
    mand_mode = pd.concat(collect_df.values(), ignore_index=True)
    mand_mode['PrimaryMode'] = mand_mode['Mode'].map(lambda x: str(x)[10:])

    return mand_mode
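
A minimal, self-contained sketch of the flag-and-sort alignment used above. The toy dataframes and the use of pd.concat (standing in for the repo's common.concat_df helper) are assumptions for illustration only; the point is that building the same composite key on both frames, filtering to shared keys, and sorting both before a column-wise concatenation keeps each sampled mode next to its trip record.

import pandas as pd

# toy sampled modes keyed by production zone, destination zone, and market segment
sampled = pd.DataFrame({'Production Zone': [1, 1, 2],
                        'Destination Zone': [5, 6, 5],
                        'Market Segment': [0, 0, 1],
                        'Mode': ['Auto', 'Transit', 'Walk']})
# toy trip records keyed by taz_i, taz_j, and market_seg
trips = pd.DataFrame({'taz_i': [2, 1, 1],
                      'taz_j': [5, 6, 5],
                      'market_seg': [1, 0, 0]})

# build the same composite key on both frames
sampled['flag'] = (sampled['Production Zone'].astype(str) +
                   sampled['Destination Zone'].astype(str) +
                   sampled['Market Segment'].astype(str))
trips['flag'] = (trips['taz_i'].astype(str) +
                 trips['taz_j'].astype(str) +
                 trips['market_seg'].astype(str))

# keep only matching keys and sort both frames identically
sampled = sampled.sort_values(['Production Zone', 'Destination Zone', 'Market Segment'])
trips = trips.loc[trips['flag'].isin(sampled['flag'].unique())]
trips = trips.sort_values(['taz_i', 'taz_j', 'market_seg'])

# column-wise concatenation of the equally sorted frames (pd.concat stands in for common.concat_df)
aligned = pd.concat([trips.reset_index(drop=True), sampled.reset_index(drop=True)], axis=1)
print(aligned)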
Example #2
    def assign_vehtype(self, hh_file, vehtype_file, seed):
        """
        This function takes the household file and processed vehicle type file and samples a vehicle to attach to each
        household.
        :param hh_file: households file noted in the control_parameters file
        :param vehtype_file: vehicle type probability file noted in the control_parameters file
        :param seed: random seed used by Cheval's sampler
        :return: hh_file: the household dataframe with the sampled vehicle type attached
        """

        # join the vehicle probabilities to the households so that we can sample from them. It is easy enough
        # to attach the probabilities because we know the market segment of each household.

        hh_vehprob = pd.merge(hh_file,
                              vehtype_file,
                              left_on=['taz', 'market_seg'],
                              right_on=['taz', 'market_seg'],
                              how='left')

        # now unstack and get it ready for Cheval
        hh_vehprob = hh_vehprob.pivot(index='hhid',
                                      columns='vtype',
                                      values='value')

        # Sample a vehicle type using Cheval
        sample_df = pd.DataFrame(
            sample_from_weights(hh_vehprob,
                                randomizer=seed,
                                astype='category',
                                n_threads=3))
        sample_df.columns = ['hh_veh_type']

        hh_file = common.concat_df(hh_file, sample_df, 1)

        return hh_file
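
A toy sketch of the merge-and-pivot step that gets the household table into the wide shape Cheval expects: one row per household, one probability column per vehicle type. The data is made up, and the final draw uses numpy's Generator instead of Cheval's sample_from_weights purely to keep the snippet self-contained.

import numpy as np
import pandas as pd

hh_file = pd.DataFrame({'hhid': [1, 2], 'taz': [101, 102], 'market_seg': [0, 1]})
vehtype_file = pd.DataFrame({'taz': [101, 101, 102, 102],
                             'market_seg': [0, 0, 1, 1],
                             'vtype': ['auto', 'no_auto', 'auto', 'no_auto'],
                             'value': [0.7, 0.3, 0.4, 0.6]})

# attach the probabilities to each household via its taz and market segment
hh_vehprob = pd.merge(hh_file, vehtype_file, on=['taz', 'market_seg'], how='left')

# unstack so each vehicle type becomes a probability column, indexed by household
hh_vehprob = hh_vehprob.pivot(index='hhid', columns='vtype', values='value')

# row-wise weighted draw (stand-in for sample_from_weights)
rng = np.random.default_rng(12345)
choices = [rng.choice(list(hh_vehprob.columns), p=row.values) for _, row in hh_vehprob.iterrows()]
sample_df = pd.DataFrame({'hh_veh_type': choices}, index=hh_vehprob.index)
print(sample_df)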
Example #3
    def assign_vehtype(self, hh_file, vehtype_file, seed):

        """
        This function takes the household file and processed vehicle type file and samples a vehicle to attach to each
        household.
        :param hh_file: households file noted in the control_parameters file
        :param vehtype_file: vehicle type probability file noted in the control_parameters file
        :param seed: random seed used by Cheval's sampler
        :return: hh_file: the household dataframe with the sampled vehicle type attached
        """

        # join the vehicle probabilities to the households so that we can sample from them. It is easy enough
        # to attach the probabilities because we know the market segment of each household.
        hh_vehprob = pd.merge(hh_file, vehtype_file, left_on=['taz', 'market_seg'],
                              right_on=['taz', 'market_seg'], how='left')

        # now unstack and get it ready for Cheval
        hh_vehprob = hh_vehprob.pivot(index='hhid', columns='vtype', values='value')

        # Sample a vehicle type using Cheval
        sample_df = pd.DataFrame(sample_from_weights(hh_vehprob, randomizer=seed, astype='category', n_threads=3))
        sample_df.columns = ['hh_veh_type']

        hh_file = concat_df(hh_file, sample_df, 1)

        return hh_file
Example #4
    def identify_peak(self, trips_hhold_df):
        """


        :return:
        """

        # The trips_out file contains a peak hour factor column that decides whether a trip is sampled
        # in the peak or off-peak period. In order to discretely select the peak records and vice-versa,
        # a uniform random number generator is run and the values are attached to the trips_out file.
        # If the (1 - peak_factor) value in the record is greater than the random value, then
        # the record is in the off-peak period and vice-versa.

        np.random.seed(mprobe_valid.seed)
        random = pd.DataFrame(np.random.uniform(size=len(trips_hhold_df)))
        random.columns = ['rnum']

        # attach the random numbers and calculate peak_flag. A value of 1 in this flag
        # means that this is a peak period trip record.
        trips_hhold_df = common.concat_df(trips_hhold_df, random, 1)
        trips_hhold_df['peak_flag'] = np.where(
            (1 - trips_hhold_df['peak_factor']) > trips_hhold_df['rnum'], 0, 1)
        trips_hhold_df[['peak_flag']] = trips_hhold_df[['peak_flag']].astype(
            'int8')  # save some memory
        mprobe_valid.logger.info(
            "Return the trips and household dataframe combined with each record tagged as to whether "
            "it starts in the peak (1) or off-peak period (0)")
        return trips_hhold_df
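
A small sketch of the Monte Carlo tagging described in the comments above, with made-up peak factors: each record draws a uniform number and lands in the off-peak when (1 - peak_factor) exceeds the draw, so a record is tagged peak with probability peak_factor.

import numpy as np
import pandas as pd

np.random.seed(42)  # stand-in for mprobe_valid.seed
trips = pd.DataFrame({'peak_factor': [0.9, 0.1, 0.5, 0.5]})
trips['rnum'] = np.random.uniform(size=len(trips))

# 0 = off-peak when (1 - peak_factor) > rnum, 1 = peak otherwise
trips['peak_flag'] = np.where((1 - trips['peak_factor']) > trips['rnum'], 0, 1).astype('int8')
print(trips)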
Example #5
    def run(self, trips_hhold_df, peak_consistency):
        """
        This function creates a peak flag for every record in the trips_hhold dataframe. This is needed to ensure that
        MLOGIT produces the correct probabilities by time period and O-D pair. If the peak_consistency flag is
        set to 1 then the function ensures consistency in choosing the peak-off peak flag for the outbound and inbound
        trip of the mandatory tours. This option also increases the run times by around 35 minutes as every row needs to
        be evaluated.
        :param trips_hhold_df: the trips_hhold df that needs a peak flag
        :param peak_consistency: if set to 1, consistency between the outbound and inbound trips of the
        mandatory tour is maintained.
        :return: trips_hhold df with peak flag
        """

        # The trips_out file contains a peak hour factor column that decides whether a trip is sampled
        # in the peak or off-peak period. In order to discretely select the peak records and vice-versa,
        # a uniform random number generator is run and the values are attached to the trips_out file.
        # If the (1 - peak_factor) value in the record is greater than the random value, then
        # the record is in the off-peak period and vice-versa.

        np.random.seed(mprobe_valid.seed)
        random = pd.DataFrame(np.random.uniform(size=len(trips_hhold_df)))
        random.columns = ['rnum']

        # attach the random numbers and calculate peak_flag. A value of 1 in this flag
        # means that this is a peak period trip record.
        trips_hhold_df = common.concat_df(trips_hhold_df, random, 1)
        trips_hhold_df['peak_flag'] = np.where(trips_hhold_df['rnum'] <= trips_hhold_df['peak_factor'], 1, 0)
        trips_hhold_df[['peak_flag']] = trips_hhold_df[['peak_flag']].astype('int8')


        mprobe_valid.logger.info("Return the trips and household dataframe combined with each record that has a home end"
                                 "in it tagged as to whether it starts in the peak (1) or off-peak period (0). The rest"
                                 "of the records are populated with a dummy value of 10 as their time period will be "
                                 "determined by the destination choice model of the GGHMV4")

        # GGHMV4 carries out mode choice for the HBW, HBS, and HBU trips at the PA level. This essentially means that
        # the return trip of that tour must also lie in the same peak period that the home based trip was in.

        if peak_consistency == 1:
            mprobe_valid.logger.info("Peak consistency flag set to %s." % peak_consistency)

            # set default values
            loop = 0
            start_peak_flag = 0

            # convert the trips_hhold dataframe and run the peak consistency function. Once completed make the array
            # back to a dataframe and set column names
            trips_hhold_df_array = trips_hhold_df.values
            trips_hhold_df_array = self.peak_consistency(trips_hhold_df_array)

            trips_hhold_df_array = pd.DataFrame(trips_hhold_df_array)
            trips_hhold_df_array.columns = trips_hhold_df.columns

            # reset the trips_hhold_df dataframe
            trips_hhold_df = trips_hhold_df_array

        mprobe_valid.logger.info("Peak flag populated")
        return trips_hhold_df
Example #6
    def run_dest_solver(self, group, chaos_monkey):

        # run destination solver. But first translate the trips_hhold dataframe to a numpy array. This results in a
        # drop in run times. Unlike the peak solver which saw a drop from 30 mins to 20 seconds, the destination
        # solver sees around a 50% run time savings to around 2 hours.

        # some housekeeping before running the destination solver function. First, create the flag that will help
        # choose the appropriate non-mandatory matrix to sample from.
        # Bring in the binary files and save them to a dictionary

        trips_hhold = group

        # control_parameters.logger.info("Prepare the trips_hhold dataframe for the destination solver function")
        trips_hhold['dict_flag'] = trips_hhold['purpose'].astype(str) + '_' + trips_hhold['market_seg'].apply(str) + '_' + \
                              trips_hhold['peak_flag'].apply(str)


        # Second, there are many instances where the person only makes mandatory tours, in which case we don't need to
        # evaluate them. Thus, only keep records where taz_j is 0 to run the destination solver. Create a flag to
        # help identify the appropriate records
        # control_parameters.logger.info("Getting households and person trip records that have more than just mandatory trips, "
        #                          "thereby needing the destination solver.")
        tgr = trips_hhold.iloc[np.where(trips_hhold['taz_j'].values == 0)].\
            groupby(['hhid', 'pid']).\
            size().\
            reset_index(name="count")
        tgr['solver_flag'] = 1
        tgr.drop('count', axis=1, inplace=True)

        # transfer the flag information to the trips_hhold while holding it in a temp_df and slicing it to hold the
        # requisite records.
        trips_hhold = pd.merge(trips_hhold, tgr, how='left', on=['hhid', 'pid'])
        trips_hhold['solver_flag'].fillna(0, inplace=True)
        # create temp df to run through the destination solver
        temp_df = trips_hhold
        temp_df = temp_df.iloc[np.where(temp_df['solver_flag'].values == 1)]
        # control_parameters.logger.info(
        #     "A total of %s nonmandatory trips will be assigned a origin and/or destination" % temp_df.shape[0])

        # control_parameters.logger.info("Running the destination solver. Please be patient. There are too many records and machinations"
        #                          "that need to be completed before it all ends.")
        trips_hhold_array = temp_df.values
        trips_hhold_array = self.destination_solver(trips_hhold_array, chaos_monkey)
        trips_hhold_dest_df = pd.DataFrame(trips_hhold_array)
        trips_hhold_dest_df.columns = trips_hhold.columns

        # The records are now concatenated back to the original trips_hhold df, but as a replacement
        # control_parameters.logger.info("Concatenating the records back")
        trips_hhold = trips_hhold.iloc[np.where(trips_hhold['solver_flag'].values == 0)]
        trips_hhold = common.concat_df(trips_hhold, trips_hhold_dest_df, 0)

        return trips_hhold
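
A toy sketch of how the solver_flag is derived: any household/person whose trip list still contains a taz_j of 0 needs the destination solver, so those groups are tagged 1 and everything else defaults to 0. The data is hypothetical; the groupby-size-merge pattern mirrors the code above.

import numpy as np
import pandas as pd

trips_hhold = pd.DataFrame({'hhid': [1, 1, 2, 2],
                            'pid': [1, 1, 1, 1],
                            'taz_i': [10, 20, 30, 40],
                            'taz_j': [20, 0, 40, 30]})

# flag every household/person group that has at least one unresolved (taz_j == 0) trip
tgr = trips_hhold.iloc[np.where(trips_hhold['taz_j'].values == 0)].\
    groupby(['hhid', 'pid']).size().reset_index(name='count')
tgr['solver_flag'] = 1
tgr = tgr.drop('count', axis=1)

# transfer the flag back; groups with no zero destination get 0
trips_hhold = pd.merge(trips_hhold, tgr, how='left', on=['hhid', 'pid'])
trips_hhold['solver_flag'] = trips_hhold['solver_flag'].fillna(0)
print(trips_hhold)  # hhid 1 rows are flagged 1, hhid 2 rows are flagged 0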
Example #7
def egress_prob(self, mand_mode, mand_eg_prob):
    """

    :param mand_mode:
    :param mand_eg_prob:
    :return:
    """

    egg_df = mand_mode.loc[mand_mode['EgressZone'] > 0].copy()
    cols = ['Production Zone', 'Destination Zone']
    egg_df[cols] = egg_df[cols].astype(int)

    # groupby and get the number of draws for each unique O-D pair that has an egress zone
    egg_df_gr = egg_df.groupby(['Production Zone', 'Destination Zone', 'PrimaryMode']).size().\
        reset_index(name='counts')

    # get column names and melt the dataframe on the production and destination zones
    # and then add in columns for defining the primary and egress modes
    melt_df = pd.melt(mand_eg_prob, id_vars=['Production Zone', 'Destination Zone'])
    melt_df['PrimaryMode'] = melt_df['variable'].str[26:]
    melt_df['EgressMode'] = melt_df['variable'].str[21:25]
    melt_df.drop('variable', axis=1, inplace=True)

    # get rid of any non-uniqueness and get it ready for joining
    melt_df = melt_df.pivot_table(index=['Production Zone', 'Destination Zone', 'PrimaryMode'],
                                  columns='EgressMode', values='value').reset_index()

    # The melted df is now joined back to the group dataframe
    # so that the grouped df can be expanded by the counts and contains the egress probabilities as well.
    egg_df_gr1 = pd.merge(egg_df_gr, melt_df, on=['Production Zone', 'Destination Zone', 'PrimaryMode'], how='left')
    egg_df_gr1 = egg_df_gr1.loc[np.repeat(egg_df_gr1.index.values, egg_df_gr1['counts'])]

    # Now make the df back to a wide format, ready for sampling. Also, Bill does not explicitly compute bus
    # probabilities; they are computed by subtracting Uber and Walk from 1.
    egg_df_gr1.set_index(['Production Zone', 'Destination Zone', 'PrimaryMode'], inplace=True)
    egg_df_gr1.drop('counts', axis=1, inplace=True)
    egg_df_gr1['Bus'] = 1 - (egg_df_gr1['Uber'] + egg_df_gr1['Walk'])

    # sample egress mode
    sampled_df_eg = pd.DataFrame(sample_from_weights(egg_df_gr1, randomizer=self.seed,
                                                     astype='category', n_threads=3, n_draws=1)).reset_index()

    egg_df_gr1 = common.concat_df(egg_df_gr1, sampled_df_eg, 1)
    egg_df_gr1.rename(columns={egg_df_gr1.columns[-1]: "EgressMode"}, inplace=True)

    # assign egress mode
    cols = [0, 1, 2]
    egg_df_gr1.drop(egg_df_gr1.columns[cols], axis=1, inplace=True)

    # Like before, we need a flag to join the information back to the hbw_mode df. We also sort the dfs before concatenating
    egg_df_gr1['egressflag'] = egg_df_gr1['Production Zone'].astype(str) + \
                               egg_df_gr1['Destination Zone'].astype(str) + \
                               egg_df_gr1['PrimaryMode'].astype(str)
    egg_df_gr1 = egg_df_gr1.sort_values(['Production Zone', 'Destination Zone', 'egressflag'])

    # create unique list for selection
    list_un_eg = egg_df_gr1['egressflag'].unique().tolist()

    # get temp dataframe to do the assigning of the egress mode and this will then be later integrated with the
    # chunk being processed
    temp_df = mand_mode
    temp_df['egressflag'] = np.where(temp_df['EgressZone'] > 0,
                                     temp_df['Production Zone'].astype(str) + \
                                     temp_df['Destination Zone'].astype(str) + \
                                     temp_df['PrimaryMode'].astype(str), np.NAN)
    temp_df = temp_df.loc[(temp_df['egressflag'].isin(list_un_eg))].sort_values(
        ['Production Zone', 'Destination Zone', 'egressflag'])

    # concatenate the dfs
    temp_df = common.concat_df(temp_df, egg_df_gr1, 1)

    # remove the egress records from the hbw_mode chunk and replace them with the temp dfs. One will need to get rid
    # of duplicated columns as well
    mand_mode = mand_mode.loc[mand_mode['egressflag'].isnull()]
    mand_mode = common.concat_df(mand_mode, temp_df, 0)

    return mand_mode
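
A minimal sketch, with toy probabilities, of two steps from egress_prob: expanding each grouped O-D/primary-mode row by its draw count with np.repeat, and filling in the Bus probability as the residual 1 - (Uber + Walk), since it is not stored explicitly in the probability file.

import numpy as np
import pandas as pd

egg_df_gr1 = pd.DataFrame({'Production Zone': [1, 2],
                           'Destination Zone': [5, 6],
                           'PrimaryMode': ['Transit', 'Transit'],
                           'counts': [2, 3],
                           'Uber': [0.1, 0.2],
                           'Walk': [0.3, 0.5]})

# one row per draw that will be made for that O-D / primary-mode combination
egg_df_gr1 = egg_df_gr1.loc[np.repeat(egg_df_gr1.index.values, egg_df_gr1['counts'])]
egg_df_gr1 = egg_df_gr1.drop('counts', axis=1)

# Bus is whatever probability is left over
egg_df_gr1['Bus'] = 1 - (egg_df_gr1['Uber'] + egg_df_gr1['Walk'])
print(egg_df_gr1)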
Example #8
    def egress_prob(self, mand_mode, melt_df):
        """

        :param mand_mode:
        :param mand_eg_prob:
        :return:
        """
        egg_df = mand_mode[mand_mode['EgressZone'] > 0].copy()
        # egg_df = mand_mode.loc[mand_mode['EgressZone'] > 0]
        cols = ['Production Zone', 'Destination Zone']
        egg_df[cols] = egg_df[cols].astype(int)

        # groupby and get the number of draws for each unique O-D pair that has an egress zone
        egg_df_gr = egg_df.groupby(['Production Zone', 'Destination Zone', 'PrimaryMode']).size().\
            reset_index(name='counts')

        # The melted df is now joined back to the group dataframe
        # so that the grouped df can be expanded by the counts and contains the egress probabilities as well.
        egg_df_gr1 = pd.merge(
            egg_df_gr,
            melt_df,
            on=['Production Zone', 'Destination Zone', 'PrimaryMode'],
            how='left')
        egg_df_gr1 = egg_df_gr1.loc[np.repeat(egg_df_gr1.index.values,
                                              egg_df_gr1['counts'])]

        # Now make the df back to a wide format, ready for sampling. Also, Bill does not explicitly compute bus
        # probabilities; they are computed by subtracting Uber and Walk from 1.
        egg_df_gr1.set_index(
            ['Production Zone', 'Destination Zone', 'PrimaryMode'],
            inplace=True)
        egg_df_gr1.drop('counts', axis=1, inplace=True)
        egg_df_gr1['Bus'] = 1 - (egg_df_gr1['Uber'] + egg_df_gr1['Walk'])

        # sample egress mode
        sampled_df_eg = pd.DataFrame(
            sample_from_weights(egg_df_gr1,
                                randomizer=self.prng,
                                astype='category',
                                n_threads=3,
                                n_draws=1)).reset_index()

        egg_df_gr1 = common.concat_df(egg_df_gr1, sampled_df_eg, 1)
        egg_df_gr1.rename(columns={egg_df_gr1.columns[-1]: "EgressMode"},
                          inplace=True)

        # assign egress mode
        cols = [0, 1, 2]
        egg_df_gr1.drop(egg_df_gr1.columns[cols], axis=1, inplace=True)

        # Like before, we need a flag to join the information back to the hbw_mode df. We also sort the dfs before concatenating
        egg_df_gr1['egressflag'] = egg_df_gr1['Production Zone'].astype(str) + \
                                   egg_df_gr1['Destination Zone'].astype(str) + \
                                   egg_df_gr1['PrimaryMode'].astype(str)
        egg_df_gr1 = egg_df_gr1.sort_values(
            ['Production Zone', 'Destination Zone', 'egressflag'])

        # create unique list for selection
        list_un_eg = egg_df_gr1['egressflag'].unique().tolist()

        # get temp dataframe to do the assigning of the egress mode and this will then be later integrated with the
        # chunk being processed
        temp_df = mand_mode
        temp_df['egressflag'] = np.where(temp_df['EgressZone'] > 0,
                                         temp_df['Production Zone'].astype(str) + \
                                         temp_df['Destination Zone'].astype(str) + \
                                         temp_df['PrimaryMode'].astype(str), np.NAN)
        temp_df = temp_df.loc[(
            temp_df['egressflag'].isin(list_un_eg))].sort_values(
                ['Production Zone', 'Destination Zone', 'egressflag'])

        # concatenate the dfs
        temp_df = common.concat_df(temp_df, egg_df_gr1, 1)

        # remove the egress records from the hbw_mode chunk and replace them with the temp dfs. One will
        # need to get rid of duplicated columns as well
        mand_mode = mand_mode.loc[mand_mode['egressflag'].isnull()]
        mand_mode = common.concat_df(mand_mode, temp_df, 0)

        return mand_mode
Example #9
    def elemental_mode(self, mand_prob_l1, veh_segment, mand_prob_append,
                       purpose, trips):
        """
        This function samples

        :param self:
        :param mand_prob_l1:
        :param veh_segment:
        :param mand_prob_append:
        :param purpose:
        :param trips:
        :return:
        """

        # null dataframe
        mand_level1 = pd.DataFrame()
        sampled_df = pd.DataFrame()
        mand_l1 = pd.DataFrame()
        collect_df = {}

        # generate the appropriate df for sampling by veh_type
        # and then attach the production, destination, and market segment
        mand_l1 = self.vehtype_prob(
            mand_prob_l1, veh_segment
        )  # get the columns of prob by veh type that will be sampled
        mand_level1 = common.concat_df(mand_prob_append, mand_l1,
                                       1)  # concat function

        # Now prepare the file for sampling and only keep relevant columns

        mand_level2, df_join = self.prob_df_longtermchoice(
            trips, purpose, mand_level1, veh_segment)
        mand_level2 = mand_level2.iloc[:, 0:52]
        mand_level2 = mand_level2.loc[(mand_level2 != 0).any(
            axis=1)]  # get rid of rows that are zero all the way

        if len(mand_level2) > 0:
            control_parameters.logger.info(
                "Start sampling of the elemental mode for vehicle segment %s. This is initiated "
                "provided the binary probability file has records with "
                "non-zero probabilities. A total of %s records are in the df" %
                (veh_segment, len(mand_level2)))
            # sample using Cheval
            sampled_df = pd.DataFrame(
                sample_from_weights(mand_level2,
                                    randomizer=self.prng,
                                    astype='category',
                                    n_threads=1,
                                    n_draws=1)).reset_index()
            sampled_df.columns = [
                'Production Zone', 'Destination Zone', 'Market Segment', 'Mode'
            ]

            # create a flag to help select the records in the trips dataframe. Creating the flag allows us to select
            # exactly the same number of rows in the trips dataframe as in sampled_df.
            # Also sort the df to ensure that we don't end up concatenating the wrong o-ds
            sampled_df['flag'] = sampled_df['Production Zone'].astype(str) + sampled_df['Destination Zone'].astype(str) + \
                                 sampled_df['Market Segment'].astype(str)
            sampled_df = sampled_df.sort_values(
                ['Production Zone', 'Destination Zone', 'Market Segment'])
            list_un = sampled_df['flag'].unique().tolist()

            # select from the trips dataframe the records that correspond to the sampled df using the flag.
            # Once again sort to ensure proper concatenation
            df_join = df_join.loc[(df_join['flag'].isin(list_un))]
            df_join = df_join.sort_values(['taz_i', 'taz_j', 'market_seg'])

            # concatenate the data. Concatenation is needed because
            # the flag is not yet unique and a merge would result in a larger dataframe than what we started with
            collect_df[veh_segment] = common.concat_df(df_join, sampled_df, 1)

        if len(collect_df) > 0:
            control_parameters.logger.info(
                "Concatenate the dictionary of dataframes by vehicle segment")
            # now make one dataframe across vehicle segments
            mand_mode = pd.concat(collect_df.values(), ignore_index=True)
            mand_mode['PrimaryMode'] = mand_mode['Mode'].map(
                lambda x: str(x)[10:])
            mand_mode['PrimaryMode'] = mand_mode['PrimaryMode'].astype(
                'category')

            return mand_mode

        # return empty df
        mand_mode = pd.DataFrame()
        control_parameters.logger.info(
            "Returning an empty dataframe because there were no elemental probabilities in the "
            "i-j pairs for the vehicle segment %s " % veh_segment)
        return mand_mode
Example #10
def elemental_mode(self, mand_prob_l1, veh_segment, mand_prob_append, purpose,
                   trips, seed):
    """
    This function samples

    :param self:
    :param mand_prob_l1:
    :param veh_segment:
    :param mand_prob_append:
    :param purpose:
    :param trips:
    :param seed:
    :return:
    """

    # null dataframe
    mand_level1 = pd.DataFrame()
    sampled_df = pd.DataFrame()
    mand_l1 = pd.DataFrame()
    collect_df = {}

    # generate the appropriate df for sampling by veh_type
    # and then attach the production, destination, and market segment
    mand_l1 = self.vehtype_prob(
        mand_prob_l1,
        veh_segment)  # get the columns of prob that will be sampled
    mand_level1 = common.concat_df(mand_prob_append, mand_l1,
                                   1)  # concat function

    # Now prepare the level1 file for sampling and only keep relevant columns
    mand_level2, df_join = self.prob_df_longtermchoice(trips, purpose, mand_level1,
                                                       veh_segment)
    mand_level2 = mand_level2.iloc[:, 0:52]

    # sample using Cheval
    sampled_df = pd.DataFrame(
        sample_from_weights(mand_level2,
                            randomizer=seed,
                            astype='category',
                            n_threads=1,
                            n_draws=1)).reset_index()
    sampled_df.columns = [
        'Production Zone', 'Destination Zone', 'Market Segment', 'Mode'
    ]

    # create a flag to help select the records in the trips dataframe. Creating the flag allows us to select
    # exactly the same number of rows in the trips dataframe as in sampled_df.
    # Also sort the df to ensure that we don't end up concatenating the wrong o-ds
    sampled_df['flag'] = sampled_df['Production Zone'].astype(str) + sampled_df['Destination Zone'].astype(str) + \
                         sampled_df['Market Segment'].astype(str)
    sampled_df = sampled_df.sort_values(
        ['Production Zone', 'Destination Zone', 'Market Segment'])
    list_un = sampled_df['flag'].unique().tolist()

    # select from the trips dataframe the records that correspond to the sampled df using the flag. Once again sort
    # to ensure proper concatenation
    df_join = df_join.loc[(df_join['flag'].isin(list_un))]
    df_join = df_join.sort_values(['taz_i', 'taz_j', 'market_seg'])

    # concatenate the data. Concatenation is needed because
    # the flag is not yet unique and a merge would result in a larger dataframe than what we started with
    collect_df[veh_segment] = common.concat_df(df_join, sampled_df, 1)

    print("Vehicle Type: %s" % (veh_segment), len(df_join), len(mand_level1),
          len(sampled_df), len(mand_l1), len(mand_prob_l1))

    # now make one dataframe across vehicle segments
    mand_mode = pd.concat(collect_df.values(), ignore_index=True)
    mand_mode['PrimaryMode'] = mand_mode['Mode'].map(lambda x: str(x)[10:])

    return mand_mode
Example #11
    def elemental_mode(self, mand_prob_l1, veh_segment, mand_prob_append, purpose, trips):
        """
        This function samples

        :param self:
        :param mand_prob_l1:
        :param veh_segment:
        :param mand_prob_append:
        :param purpose:
        :param trips:
        :return:
        """

        # null dataframe
        mand_level1 = pd.DataFrame()
        sampled_df = pd.DataFrame()
        mand_l1 = pd.DataFrame()
        collect_df = {}

        # generate the appropriate df for sampling by veh_type
        # and then attach the production, destination, and market segment
        mand_l1 = self.vehtype_prob(mand_prob_l1, veh_segment)  # get the columns of prob by veh type that will be sampled
        mand_level1 = common.concat_df(mand_prob_append, mand_l1, 1)  # concat function

        # Now prepare the file for sampling and only keep relevant columns
        mand_level2, df_join = self.prob_df_longtermchoice(trips, purpose, mand_level1, veh_segment)
        mand_level2 = mand_level2.iloc[:, 0:52]
        mand_level2 = mand_level2.loc[(mand_level2 != 0).any(axis=1)]  # get rid of rows that are zero all the way

        if len(mand_level2) > 0:
            common.logger.info("Start sampling of the elemental mode for vehicle segment %s. This is initiated "
                               "provided the binary probability file has records with "
                               "non-zero probabilities." % veh_segment)
            # sample using Cheval
            sampled_df = pd.DataFrame(sample_from_weights(mand_level2, randomizer=self.seed,
                                                          astype='category', n_threads=1, n_draws=1)).reset_index()
            sampled_df.columns = ['Production Zone', 'Destination Zone', 'Market Segment', 'Mode']

            # create a flag to help select the records in the trips dataframe. Creating the flag allows us to select
            # exactly the same number of rows in the trips dataframe as in sampled_df.
            # Also sort the df to ensure that we don't end up concatenating the wrong o-ds
            sampled_df['flag'] = sampled_df['Production Zone'].astype(str) + sampled_df['Destination Zone'].astype(str) + \
                                 sampled_df['Market Segment'].astype(str)
            sampled_df = sampled_df.sort_values(['Production Zone', 'Destination Zone', 'Market Segment'])
            list_un = sampled_df['flag'].unique().tolist()

            # select from the trips dataframe the records that correspond to the sampled df using the flag.
            # Once again sort to ensure proper concatenation
            df_join = df_join.loc[(df_join['flag'].isin(list_un))]
            df_join = df_join.sort_values(['taz_i', 'taz_j', 'market_seg'])

            # concatenate the data. Concatenation is needed because
            # the flag is not yet unique and a merge would result in a larger dataframe than what we started with
            collect_df[veh_segment] = common.concat_df(df_join, sampled_df, 1)

        if len(collect_df) > 0:
            common.logger.info("Concatenate the dictionary of dataframes by vehicle segment")
            # now make one dataframe across vehicle segments
            mand_mode = pd.concat(collect_df.values(), ignore_index=True)
            mand_mode['PrimaryMode'] = mand_mode['Mode'].map(lambda x: str(x)[10:])

            return mand_mode

        # return empty df
        mand_mode = pd.DataFrame()
        common.logger.info("Returning an empty dataframe because there were no elemental probabilities in the "
                           "i-j pairs for the vehicle segment %s " % veh_segment)
        return mand_mode
Example #12
    def run(self, trips_hhold, nonmandatory_purposes):

        # run destination solver. But first translate the trips_hhold dataframe to a numpy array. This results in a
        # drop in run times. Unlike the peak solver which saw a drop from 30 mins to 20 seconds, the destination
        # solver sees around a 50% run time savings to around 2 hours.

        # some housekeeping before running the destination solver function. First, create the flag that will help
        # choose the appropriate non-mandatory matrix to sample from.

        mprobe_valid.logger.info("Prepare the trips_hhold dataframe for the destination solver function")
        nrows_trips_hhold = trips_hhold.shape[0]
        trips_hhold['dict_flag'] = trips_hhold['purpose'].astype(str) + '_' + trips_hhold['market_seg'].apply(str) + '_' + \
                              trips_hhold['peak_flag'].apply(str)


        # Second, there are many instances where the person only makes mandatory tours, in which case we don't need to
        # evaluate them. Thus, only keep records where taz_j is 0 to run the destination solver. Create a flag to
        # help identify the appropriate records
        mprobe_valid.logger.info("Getting households and person trip records that have more than just manadatory trips,"
                                 "thereby needing the destination solver.")
        tgr = trips_hhold.iloc[np.where(trips_hhold['taz_j'].values == 0)].\
            groupby(['hhid', 'pid']).\
            size().\
            reset_index(name="count")
        tgr['solver_flag'] = 1
        tgr.drop('count', axis=1, inplace=True)

        # transfer the flag information to the trips_hhold while holding it in a temp_df and slicing it to hold the
        # requisite records.
        trips_hhold = pd.merge(trips_hhold, tgr, how='left', on=['hhid', 'pid'])
        trips_hhold['solver_flag'].fillna(0, inplace=True)
        # create temp df to run through the destination solver
        temp_df = trips_hhold
        temp_df = temp_df.iloc[np.where(temp_df['solver_flag'].values == 1)]

        # The destination solver is run, but first provide a numpy array. Once run, the numpy array is converted back
        # to a dataframe.
        mprobe_valid.logger.info("Running the destination solver. Please be patient. There are too many records and machinations"
                                 "that need to be completed before it all ends.")
        trips_hhold_array = temp_df.values
        trips_hhold_array = self.destination_solver(trips_hhold_array)
        trips_hhold_dest_df = pd.DataFrame(trips_hhold_array)
        trips_hhold_dest_df.columns = trips_hhold.columns

        # The records are now concatenated back to the original trips_hhold df, but as a replacement
        mprobe_valid.logger.info("Concatenating the records back")
        trips_hhold = trips_hhold.iloc[np.where(trips_hhold['solver_flag'].values == 0)]
        trips_hhold = common.concat_df(trips_hhold, trips_hhold_dest_df, 0)
        trips_hhold.sort_values(['hhid', 'pid'], inplace=True, ascending=True)

        # check if the length of the final dataframe (after concatenating) is the same length as the original
        if not len(trips_hhold) == nrows_trips_hhold:
            mprobe_valid.logger.info("The number of rows after running the destination solver is different from that of"
                                     "the original dataframe. Something is wrong.")
            exit(0)
        else:
            mprobe_valid.logger.info("Destination solver finished successfully")


        # now batch out the necessary matrices
        mprobe_valid.logger.info("Start saving the matrices in the format desired by Mlogit")
        for purpose in nonmandatory_purposes:

            nonmand_only = trips_hhold.iloc[np.where(trips_hhold['purpose'].values == purpose)]

            # now loop over the peak periods
            for peak in range(0, 2):

                timeperiod_df = nonmand_only.loc[nonmand_only['peak_flag'] == peak]
                timeperiod_df = timeperiod_df.groupby(['taz_i', 'taz_j', 'purpose', 'market_seg']).size().reset_index(
                    name='freq')

                # now loop over the segments
                for segment in timeperiod_df['market_seg'].unique():
                    # create filename and then groupby
                    # only keep relevant cols and set a flag
                    # Merge the ggh zones and the trip list and convert to wide format

                    fname = purpose + "_" + str(segment)
                    df_hbw = timeperiod_df.loc[timeperiod_df['market_seg'] == segment]
                    df_hbw = df_hbw[['taz_i', 'taz_j']]
                    df_hbw['probflag'] = 1

                    # Make square dataframe for Fortran
                    df_hbw1 = pd.merge(self.ggh2, df_hbw, how="left", left_on=['ggh_zone_x', 'ggh_zone_y'],
                                       right_on=['taz_i', 'taz_j'])
                    df_hbw2 = df_hbw1.pivot_table(index='ggh_zone_x', columns='ggh_zone_y', values='probflag',
                                                  fill_value=0)

                    to_fortran(df_hbw2, os.path.join(mprobe_valid.dirListing_abm, fname + ' peak_flag ' + str(peak) + '.bin'),
                               n_columns=4000)
                    mprobe_valid.logger.info("All matrices saved.")
        return trips_hhold
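
A toy, three-zone sketch of how the square 0/1 matrix is assembled before being written out with to_fortran: cross every zone pair, left-join the observed O-D pairs, and pivot so unobserved pairs become 0. The cross join here is an assumption standing in for the repo's pre-built self.ggh2 table.

import pandas as pd

zones = pd.DataFrame({'ggh_zone': [1, 2, 3]})
ggh2 = pd.merge(zones, zones, how='cross', suffixes=('_x', '_y'))  # all zone pairs

df_hbw = pd.DataFrame({'taz_i': [1, 2, 3], 'taz_j': [2, 3, 1]})
df_hbw['probflag'] = 1

# left-join the observed pairs onto the full zone cross-product
df_hbw1 = pd.merge(ggh2, df_hbw, how='left',
                   left_on=['ggh_zone_x', 'ggh_zone_y'],
                   right_on=['taz_i', 'taz_j'])

# square dataframe with 1 where an O-D pair was observed and 0 elsewhere
df_hbw2 = df_hbw1.pivot_table(index='ggh_zone_x', columns='ggh_zone_y',
                              values='probflag', fill_value=0)
print(df_hbw2)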
Example #13
    def run(self, trips_hhold, nonmandatory_purposes):

        # run destination solver. But first translate the trips_hhold dataframe to a numpy array. This results in a
        # drop in run times. Unlike the peak solver which saw a drop from 30 mins to 20 seconds, the destination
        # solver sees around a 50% run time savings to around 2 hours.

        # some housekeeping before running the destination solver function. First, create the flag that will help
        # choose the appropriate non-mandatory matrix to sample from.

        mprobe_valid.logger.info(
            "Prepare the trips_hhold dataframe for the destination solver function"
        )
        nrows_trips_hhold = trips_hhold.shape[0]
        trips_hhold['dict_flag'] = trips_hhold['purpose'].astype(str) + '_' + trips_hhold['market_seg'].apply(str) + '_' + \
                              trips_hhold['peak_flag'].apply(str)

        # Second, there are many instances where the person only makes mandatory tours, in which case we don't need to
        # evaluate them. Thus, only keep records where taz_j is 0 to run the destination solver. Create a flag to
        # help identify the appropriate records
        mprobe_valid.logger.info(
            "Getting households and person trip records that have more than just mandatory trips, "
            "thereby needing the destination solver.")
        tgr = trips_hhold.iloc[np.where(trips_hhold['taz_j'].values == 0)].\
            groupby(['hhid', 'pid']).\
            size().\
            reset_index(name="count")
        tgr['solver_flag'] = 1
        tgr.drop('count', axis=1, inplace=True)

        # transfer the flag information to the trips_hhold while holding it in a temp_df and slicing it to hold the
        # requisite records.
        trips_hhold = pd.merge(trips_hhold,
                               tgr,
                               how='left',
                               on=['hhid', 'pid'])
        trips_hhold['solver_flag'].fillna(0, inplace=True)
        # create temp df to run through the destination solver
        temp_df = trips_hhold
        temp_df = temp_df.iloc[np.where(temp_df['solver_flag'].values == 1)]

        # The destination solver is run, but first provide a numpy array. Once run, the numpy array is converted back
        # to a dataframe.
        mprobe_valid.logger.info(
            "Running the destination solver. Please be patient. There are too many records and machinations "
            "that need to be completed before it all ends.")
        trips_hhold_array = temp_df.values
        trips_hhold_array = self.destination_solver(trips_hhold_array)
        trips_hhold_dest_df = pd.DataFrame(trips_hhold_array)
        trips_hhold_dest_df.columns = trips_hhold.columns

        # The records are now concatenated back to the original trips_hhold df, but as a replacement
        mprobe_valid.logger.info("Concatenating the records back")
        trips_hhold = trips_hhold.iloc[np.where(
            trips_hhold['solver_flag'].values == 0)]
        trips_hhold = common.concat_df(trips_hhold, trips_hhold_dest_df, 0)
        trips_hhold.sort_values(['hhid', 'pid'], inplace=True, ascending=True)

        # check if the length of the final dataframe (after concatenating) is the same length as the original
        if not len(trips_hhold) == nrows_trips_hhold:
            mprobe_valid.logger.info(
                "The number of rows after running the destination solver is different from that of "
                "the original dataframe. Something is wrong.")
            exit(0)
        else:
            mprobe_valid.logger.info(
                "Destination solver finished successfully")

        # now batch out the necessary matrices
        mprobe_valid.logger.info(
            "Start saving the matrices in the format desired by Mlogit")
        for purpose in nonmandatory_purposes:

            nonmand_only = trips_hhold.iloc[np.where(
                trips_hhold['purpose'].values == purpose)]

            # now loop over the peak periods
            for peak in range(0, 2):

                timeperiod_df = nonmand_only.loc[nonmand_only['peak_flag'] ==
                                                 peak]
                timeperiod_df = timeperiod_df.groupby(
                    ['taz_i', 'taz_j', 'purpose',
                     'market_seg']).size().reset_index(name='freq')

                # now loop over the segments
                for segment in timeperiod_df['market_seg'].unique():
                    # create filename and then groupby
                    # only keep relevant cols and set a flag
                    # Merge the ggh zones and the trip list and convert to wide format

                    fname = purpose + "_" + str(segment)
                    df_hbw = timeperiod_df.loc[timeperiod_df['market_seg'] ==
                                               segment]
                    df_hbw = df_hbw[['taz_i', 'taz_j']]
                    df_hbw['probflag'] = 1

                    # Make square dataframe for Fortran
                    df_hbw1 = pd.merge(self.ggh2,
                                       df_hbw,
                                       how="left",
                                       left_on=['ggh_zone_x', 'ggh_zone_y'],
                                       right_on=['taz_i', 'taz_j'])
                    df_hbw2 = df_hbw1.pivot_table(index='ggh_zone_x',
                                                  columns='ggh_zone_y',
                                                  values='probflag',
                                                  fill_value=0)

                    to_fortran(df_hbw2,
                               os.path.join(
                                   mprobe_valid.dirListing_abm,
                                   fname + ' peak_flag ' + str(peak) + '.bin'),
                               n_columns=4000)
                    mprobe_valid.logger.info("All matrices saved.")
        return trips_hhold

    def run(self, trips_hhold, nonmandatory_purposes, chaos_monkey):

        # run destination solver. But first translate the trips_hhold dataframe to a numpy array. This results in a
        # drop in run times. Unlike the peak solver which saw a drop from 30 mins to 20 seconds, the destination
        # solver sees around a 50% run time savings to around 2 hours.

        # some housekeeping before running the destination solver function. First, create the flag that will help
        # choose the appropriate non-mandatory matrix to sample from.

        control_parameters.logger.info(
            "Prepare the trips_hhold dataframe for the destination solver function"
        )
        nrows_trips_hhold = trips_hhold.shape[0]
        trips_hhold['dict_flag'] = trips_hhold['purpose'].astype(str) + '_' + trips_hhold['market_seg'].apply(str) + '_' + \
                              trips_hhold['peak_flag'].apply(str)

        # Second, there are many instances where the person only makes mandatory tours, in which case we don't need to
        # evaluate them. Thus, only keep records where taz_j is 0 to run the destination solver. Create a flag to
        # help identify the appropriate records
        control_parameters.logger.info(
            "Getting households and person trip records that have more than just mandatory trips, "
            "thereby needing the destination solver.")
        tgr = trips_hhold.iloc[np.where(trips_hhold['taz_j'].values == 0)].\
            groupby(['hhid', 'pid']).\
            size().\
            reset_index(name="count")
        tgr['solver_flag'] = 1
        tgr.drop('count', axis=1, inplace=True)

        # transfer the flag information to the trips_hhold while holding it in a temp_df and slicing it to hold the
        # requisite records.
        trips_hhold = pd.merge(trips_hhold,
                               tgr,
                               how='left',
                               on=['hhid', 'pid'])
        trips_hhold['solver_flag'].fillna(0, inplace=True)
        # create temp df to run through the destination solver
        temp_df = trips_hhold
        temp_df = temp_df.iloc[np.where(temp_df['solver_flag'].values == 1)]
        control_parameters.logger.info(
            "A total of %s nonmandatory trips will be assigned an origin and/or destination"
            % temp_df.shape[0])

        control_parameters.logger.info(
            "Running the destination solver. Please be patient. There are too many records and machinations "
            "that need to be completed before it all ends.")
        trips_hhold_array = temp_df.values
        trips_hhold_array = self.destination_solver(trips_hhold_array,
                                                    chaos_monkey)
        trips_hhold_dest_df = pd.DataFrame(trips_hhold_array)
        trips_hhold_dest_df.columns = trips_hhold.columns

        # The records are now concatenated back to the original trips_hhold df, but as a replacement
        control_parameters.logger.info("Concatenating the records back")
        trips_hhold = trips_hhold.iloc[np.where(
            trips_hhold['solver_flag'].values == 0)]
        trips_hhold = common.concat_df(trips_hhold, trips_hhold_dest_df, 0)
        trips_hhold.sort_values(['hhid', 'pid'], inplace=True, ascending=True)

        trips_hhold = trips_hhold.astype(
            dtype={
                "hhid": "int32",
                "pid": "int8",
                "tour_id": "int8",
                "subtour_id": "int8",
                "trip_id": "int8",
                "activity_i": "category",
                "activity_j": "category",
                "taz_i": "int16",
                "taz_j": "int16",
                "tour_direction": "category",
                "purpose": "category",
                "trip_direction": "category",
                "peak_factor": "float64",
                "taz": "int16",
                "hhinc": "int32",
                "dtype": "int8",
                "hhsize": "int8",
                "nveh": "int8",
                "auto_suff": "int8",
                "market_seg": "int8",
                "rnum": "float64",
                "peak_flag": "int8",
                "dict_flag": "category",
                "solver_flag": "float64"
            })

        # now batch out the necessary matrices
        control_parameters.logger.info(
            "Start saving the matrices in the format desired by Mlogit")
        for purpose in nonmandatory_purposes:

            nonmand_only = trips_hhold[trips_hhold['purpose'].values ==
                                       purpose].copy()
            # set the market segment to 0 as NHB has no market segment and Bill's prob file will have this as 1. We
            # will reset 1 to 0 during mode choice discretization
            if purpose == "NHB":
                nonmand_only['market_seg'] = 0

            # now loop over the peak periods
            for peak in range(0, 2):

                timeperiod_df = nonmand_only[nonmand_only['peak_flag'] ==
                                             peak].copy()
                timeperiod_df = timeperiod_df.groupby(['taz_i', 'taz_j', 'purpose', 'market_seg']).size().\
                    reset_index(name='freq')

                # now loop over the segments
                for segment in timeperiod_df['market_seg'].unique():
                    # create filename and then groupby
                    # only keep relevant cols and set a flag
                    # Merge the ggh zones and the trip list and convert to wide format

                    dataFrameDtype = common.set_dtype_defintions(
                        control_parameters.inputDirListing,
                        ev.EarlyValidFiles.getJSONFileList())
                    mtx_name = dataFrameDtype[ev.EarlyValidFiles.MATRIX_NAMES]

                    # the matrices have to be given a specific filename that corresponds to the control file for MLOGIT
                    fname = purpose + "_" + str(segment) + "_" + str(peak)
                    for key, value in mtx_name.items():
                        if fname == key:
                            fname_mtx = value
                            control_parameters.logger.info(
                                "The %s matrix is being saved" % fname_mtx)

                    df_hbw = timeperiod_df[timeperiod_df['market_seg'] ==
                                           segment].copy()
                    df_hbw = df_hbw[['taz_i', 'taz_j']]
                    df_hbw['probflag'] = 1

                    # Make square dataframe for Fortran
                    df_hbw1 = pd.merge(self.ggh2,
                                       df_hbw,
                                       how="left",
                                       left_on=['ggh_zone_x', 'ggh_zone_y'],
                                       right_on=['taz_i', 'taz_j'])
                    df_hbw2 = df_hbw1.pivot_table(index='ggh_zone_x',
                                                  columns='ggh_zone_y',
                                                  values='probflag',
                                                  fill_value=0)

                    to_fortran(
                        df_hbw2,
                        os.path.join(
                            control_parameters.dirListing_mlogitmatrices,
                            fname_mtx + '.bin'),
                        n_columns=4000)

        return trips_hhold