def test_co_only():
    y = numpy.random.random([20, 3])
    y[5, 0] = 1.5
    x_co = pandas.DataFrame(y, columns=['Aa', 'Bb', 'Cc'])
    x_co.index.name = 'caseid'
    with raises(ValueError):
        # missing altcodes
        DataFrames(co=x_co, av=True)
    with raises(ValueError):
        # trying to use ca
        DataFrames(ca=x_co, av=True)
    d = DataFrames(co=x_co, av=True, alt_codes=[1, 2, 3, 4, 5, 6])
    assert d.n_alts == 6
    assert d.n_cases == 20
    assert d.data_av.shape == (20, 6)
    assert all(d.data_av.dtypes == numpy.int8)
    assert (d.data_av == 1).all().all()
    d = DataFrames(
        co=x_co,
        av={1: True, 2: 'Aa < 1.1', 3: True, 4: True, 5: True, 6: True},
        alt_codes=[1, 2, 3, 4, 5, 6],
    )
    assert d.n_alts == 6
    assert d.n_cases == 20
    assert d.data_av.shape == (20, 6)
    assert all(d.data_av.dtypes == numpy.int8)
    assert d.data_av.sum().sum() == 119
    assert d.data_av.iloc[5].loc[2] == 0
def test_promotion_ce_to_ca():
    from larch.data_warehouse import example_file
    ca = pandas.read_csv(
        example_file('MTCwork.csv.gz'),
        index_col=('casenum', 'altnum'),
    )
    dfs = DataFrames(ca, ch="chose", crack=True)
    assert dfs.data_ce is not None
    assert dfs.data_ca is None
    assert dfs.data_ce.shape == (22033, 5)
    dfs.data_ce_as_ca("_avail_")
    assert dfs.data_ce is None
    assert dfs.data_ca is not None
    assert dfs.data_ca.shape == (30174, 6)
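# A minimal sketch (illustrative, not from the test suite) of the same
# idce-to-idca promotion on a hypothetical toy frame: a sparse
# (caseid, altid) MultiIndex loads as data_ce, and data_ce_as_ca()
# densifies it into data_ca, adding the named availability column.
# The expected shapes below are inferred from the pattern in the
# test above.
def demo_promotion_sketch():
    toy = pandas.DataFrame(
        {'cost': [1.0, 2.0, 3.0]},
        index=pandas.MultiIndex.from_tuples(
            [(1, 1), (1, 2), (2, 1)],  # case 2 is missing alternative 2
            names=['caseid', 'altid'],
        ),
    )
    dfs = DataFrames(ca=toy)
    assert dfs.data_ce is not None
    assert dfs.data_ca is None
    dfs.data_ce_as_ca("_avail_")
    # 2 cases x 2 alts = 4 rows; 'cost' plus the new '_avail_' column
    assert dfs.data_ca.shape == (4, 2)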
def test_service_idco():
    df = pandas.read_csv(example_file("MTCwork.csv.gz"))
    df.set_index(['casenum', 'altnum'], inplace=True)
    dfs = DataFrames(df, crack=True)
    check1 = dfs.make_idco('1')
    assert (check1 == 1).shape == (5029, 1)
    assert numpy.all(check1 == 1)
    check2 = dfs.make_idco('age')
    assert check2.shape == (5029, 1)
    assert numpy.all(check2.iloc[:5, 0] == [35, 40, 28, 34, 43])
    assert numpy.all(check2.iloc[-5:, 0] == [58, 33, 34, 35, 37])
    check3 = dfs.make_idco('age', '1')
    assert check3.shape == (5029, 2)
def nonmand_tour_freq_model(
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
):
    data = interaction_simulate_data(
        name="non_mandatory_tour_frequency",
        edb_directory=edb_directory,
    )

    settings = data.settings
    segment_names = [s["NAME"] for s in settings["SPEC_SEGMENTS"]]
    data.relabel_coef = link_same_value_coefficients(
        segment_names, data.coefficients, data.spec,
    )
    spec = data.spec
    coefficients = data.coefficients
    chooser_data = data.chooser_data
    alt_values = data.alt_values
    alt_def = data.alt_def

    m = {}
    for segment_name in segment_names:
        segment_model = m[segment_name] = Model()
        # One of the alternatives is coded as 0, so we need to explicitly
        # initialize the MNL nesting graph and set the root_id to a value
        # other than zero.
        segment_model.initialize_graph(alternative_codes=alt_def.index, root_id=9999)
        # Utility specifications
        segment_model.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=segment_name,
        )
        apply_coefficients(coefficients[segment_name], segment_model)
        segment_model.choice_co_code = "override_choice"

        # Attach data
        x_co = (
            chooser_data[segment_name]
            .set_index("person_id")
            .rename(columns={"TAZ": "HOMETAZ"})
        )
        x_ca = cv_to_ca(
            alt_values[segment_name].set_index(["person_id", "variable"])
        )
        d = DataFrames(
            co=x_co,
            ca=x_ca,
            av=~unavail(segment_model, x_ca),
        )
        m[segment_name].dataservice = d

    if return_data:
        return m, data
    return m
def mtc2():
    d = MTC()
    d1 = d.make_dataframes({
        'ca': ('ivtt', 'ovtt', 'totcost', 'chose', 'tottime'),
        'co': ('age', 'hhinc', 'hhsize', 'numveh==0'),
        'avail_ca': '_avail_',
        'choice_ca': 'chose',
    })
    df_co2 = pd.concat([d1.data_co, d1.data_co]).reset_index(drop=True)
    df_ca2 = (
        pd.concat([d1.data_ca.unstack(), d1.data_ca.unstack()])
        .reset_index(drop=True)
        .stack()
    )
    df_av2 = pd.concat([d1.data_av, d1.data_av]).reset_index(drop=True)
    df_chX = pd.DataFrame(
        np.zeros_like(d1.data_ch.values),
        index=d1.data_ch.index,
        columns=d1.data_ch.columns,
    )
    df_chX.iloc[:, 1] = 2.0
    df_ch2 = pd.concat([d1.data_ch, df_chX]).reset_index(drop=True)
    from larch import DataFrames, Model
    j1 = DataFrames(
        co=d1.data_co,
        ca=d1.data_ca,
        av=d1.data_av,
        ch=df_chX + d1.data_ch,
    )
    j2 = DataFrames(
        co=df_co2,
        ca=df_ca2,
        av=df_av2,
        ch=df_ch2,
    )
    j1.autoscale_weights()
    j2.autoscale_weights()
    return j1, j2
def cdap_dataframes(households, values):
    data = cdap_split_data(households, values)
    dfs = {}
    for hhsize in data.keys():
        alts = generate_alternatives(hhsize)
        dfs[hhsize] = DataFrames(
            co=data[hhsize],
            alt_names=alts.keys(),
            alt_codes=alts.values(),
            av=1,
            ch=data[hhsize].override_choice.map(alts),
        )
    return dfs
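# Note (assumption, for illustration only): generate_alternatives(hhsize) is
# taken here to return a mapping of alternative names to integer codes,
# e.g. something like {'M': 1, 'N': 2, 'H': 3} for a one-person household,
# so that .map(alts) above recodes the observed choice labels into
# alternative codes while the mapping's keys and values supply the
# alt_names and alt_codes arguments.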
def test_dfs_init_co():
    from larch.data_warehouse import example_file
    raw_data = pandas.read_csv(example_file('swissmetro.csv.gz'))
    keep = raw_data.eval("PURPOSE in (1,3) and CHOICE != 0")
    selected_data = raw_data[keep]

    d0 = DataFrames(selected_data, alt_codes=[1, 2, 3])
    assert d0.data_co.shape == (6768, 28)
    assert d0.data_ca is None
    assert d0.data_ce is None
    assert d0.data_ch is None
    assert d0.data_av is None

    d1 = DataFrames(co=selected_data, alt_codes=[1, 2, 3])
    assert d1.data_co.shape == (6768, 28)
    assert d1.data_ca is None
    assert d1.data_ce is None
    assert d1.data_ch is None
    assert d1.data_av is None

    with raises(ValueError):
        DataFrames(ca=selected_data, alt_codes=[1, 2, 3])

    d2 = DataFrames(co=selected_data, alt_codes=[1, 2, 3], ch='CHOICE')
    assert d2.data_co.shape == (6768, 28)
    assert d2.data_ca is None
    assert d2.data_ce is None
    assert d2.data_ch is not None
    assert d2.data_ch.shape == (6768, 3)
    assert all(d2.data_ch.sum() == [908, 4090, 1770])
    assert d2.data_av is None

    d2 = DataFrames(co=selected_data, alt_codes=[1, 2, 3], ch=selected_data.CHOICE)
    assert d2.data_co.shape == (6768, 28)
    assert d2.data_ca is None
    assert d2.data_ce is None
    assert d2.data_ch is not None
    assert d2.data_ch.shape == (6768, 3)
    assert all(d2.data_ch.sum() == [908, 4090, 1770])
    assert d2.data_av is None

    d2 = DataFrames(co=selected_data, alt_codes=[1, 2, 3], ch='CHOICE', wt='GROUP')
    assert d2.data_co.shape == (6768, 28)
    assert d2.data_ca is None
    assert d2.data_ce is None
    assert d2.data_ch is not None
    assert d2.data_ch.shape == (6768, 3)
    assert all(d2.data_ch.sum() == [908, 4090, 1770])
    assert d2.data_av is None
    assert d2.data_wt is not None
    assert d2.data_wt.shape == (6768, 1)

    d2 = DataFrames(co=selected_data, alt_codes=[1, 2, 3], ch='CHOICE', wt=selected_data.GROUP)
    assert d2.data_co.shape == (6768, 28)
    assert d2.data_ca is None
    assert d2.data_ce is None
    assert d2.data_ch is not None
    assert d2.data_ch.shape == (6768, 3)
    assert all(d2.data_ch.sum() == [908, 4090, 1770])
    assert d2.data_av is None
    assert d2.data_wt is not None
    assert d2.data_wt.shape == (6768, 1)
def test_dfs_init_ca():
    from larch.data_warehouse import example_file
    df = pandas.read_csv(
        example_file("MTCwork.csv.gz"),
        index_col=['casenum', 'altnum'],
    )

    d0 = DataFrames(ca=df, crack=True, ch='chose', wt_name='wgt')
    assert d0.data_wt is not None
    assert d0.data_wt.columns == 'wgt'
    assert d0.data_ch is not None
    assert d0.data_ch.shape == (5029, 6)
    assert d0.data_av is not None
    assert d0.data_av.shape == (5029, 6)

    d1 = DataFrames(ca=df, crack=True)
    assert d1.data_wt is None
    assert d1.data_ch is None
    assert d1.data_av is not None
    assert d1.data_av.shape == (5029, 6)

    d2 = DataFrames(df)
    assert d2.data_wt is None
    assert d2.data_ch is None
    assert d2.data_av is not None
    assert d2.data_av.shape == (5029, 6)
    assert d2.data_co is None
    assert d2.data_ca is None
    assert d2.data_ce is not None
    assert d2.data_ce.shape == (22033, 36)

    d3 = DataFrames(ca=df, crack=True, ch=df.chose, wt_name='wgt')
    assert d3.data_wt is not None
    assert d3.data_wt.columns == 'wgt'
    assert d3.data_ch is not None
    assert d3.data_ch.shape == (5029, 6)
    assert d3.data_av is not None
    assert d3.data_av.shape == (5029, 6)
    assert pandas.isna(d3.data_ch).sum().sum() == 0

    d4 = DataFrames(ca=df, crack=True, ch=df.chose, wt='wgt')
    assert d4.data_wt is not None
    assert d4.data_wt.columns == 'wgt'
    assert d4.data_ch is not None
    assert d4.data_ch.shape == (5029, 6)
    assert d4.data_av is not None
    assert d4.data_av.shape == (5029, 6)

    d5 = DataFrames(ca=df, crack=True, ch=df.chose, wt=df.wgt)
    assert d5.data_wt is not None
    assert d5.data_wt.columns == 'wgt'
    assert d5.data_ch is not None
    assert d5.data_ch.shape == (5029, 6)
    assert d5.data_av is not None
    assert d5.data_av.shape == (5029, 6)
    assert d5.data_co.shape == (5029, 31)
    assert d5.data_ca is None
    assert d5.data_ce is not None
    assert d5.data_ce.shape == (22033, 5)

    with raises(ValueError):
        bad = DataFrames(co=df)
def test_ce_initialization():
    cax = pandas.DataFrame({
        'caseid':     [1, 1, 1, 2, 2],
        'caseid_bad': ['x', 'x', 'x', 'y', 'y'],
        'altid_bad':  ['aa', 'bb', 'cc', 'aa', 'bb'],
        'altid_good': [1, 2, 3, 1, 2],
        'altid_str':  ['1', '2', '3', '1', '2'],
        'buggers':    [1.2, 3.4, 5.6, 7.8, 9.0],
        'baggers':    [22, 33, 44, 55, 66],
    })
    import pytest
    with pytest.raises(ValueError):
        d = DataFrames(cax.set_index(['caseid_bad', 'altid_bad']))
    d = DataFrames(cax.set_index(['caseid', 'altid_bad']))
    assert len(d.data_ce) == 5
    assert d.data_ca is None
    assert all(d.alternative_codes() == numpy.asarray([1, 2, 3]))
    assert all(d.alternative_names() == numpy.asarray(['aa', 'bb', 'cc']))
    d = DataFrames(cax.set_index(['caseid', 'altid_good']))
    assert len(d.data_ce) == 5
    assert d.data_ca is None
    assert all(d.alternative_codes() == [1, 2, 3])
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_str']))
    assert len(d.data_ce) == 5
    assert d.data_ca is None
    assert all(d.alternative_codes() == [1, 2, 3])
def test_ca_initialization():
    cax = pandas.DataFrame({
        'caseid':     [1, 1, 1, 2, 2, 2],
        'caseid_bad': ['x', 'x', 'x', 'y', 'y', 'y'],
        'altid_bad':  ['aa', 'bb', 'cc', 'aa', 'bb', 'cc'],
        'altid_good': [1, 2, 3, 1, 2, 3],
        'altid_str':  ['1', '2', '3', '1', '2', '3'],
        'buggers':    [1.2, 3.4, 5.6, 7.8, 9.0, 5.5],
        'baggers':    [22, 33, 44, 55, 66, 77],
    })
    import pytest
    with pytest.raises(ValueError):
        d = DataFrames(ca=cax.set_index(['caseid_bad', 'altid_bad']))
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_bad']))
    assert all(d.alternative_codes() == numpy.asarray([1, 2, 3]))
    assert all(d.alternative_names() == numpy.asarray(['aa', 'bb', 'cc']))
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_good']))
    assert all(d.alternative_codes() == [1, 2, 3])
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_str']))
    assert all(d.alternative_codes() == [1, 2, 3])
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_good']), av='baggers > 25')
    assert all(d.alternative_codes() == [1, 2, 3])
    av = d.data_av.to_numpy()
    assert av.shape == (2, 3)
    assert av.ravel() == approx([0, 1, 1, 1, 1, 1])
def location_choice_model(
    name="workplace_location",
    edb_directory="output/estimation_data_bundle/{name}/",
    coefficients_file="{name}_coefficients.csv",
    spec_file="{name}_SPEC.csv",
    size_spec_file="{name}_size_terms.csv",
    alt_values_file="{name}_alternatives_combined.csv",
    chooser_file="{name}_choosers_combined.csv",
    settings_file="{name}_model_settings.yaml",
    landuse_file="{name}_landuse.csv",
    return_data=False,
):
    model_selector = name.replace("_location", "")
    model_selector = model_selector.replace("_destination", "")
    model_selector = model_selector.replace("_subtour", "")
    model_selector = model_selector.replace("_tour", "")
    if model_selector == 'joint':
        model_selector = 'non_mandatory'
    edb_directory = edb_directory.format(name=name)

    def _read_csv(filename, **kwargs):
        filename = filename.format(name=name)
        return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)

    coefficients = _read_csv(coefficients_file, index_col="coefficient_name")
    spec = _read_csv(spec_file, comment="#")
    alt_values = _read_csv(alt_values_file)
    chooser_data = _read_csv(chooser_file)
    landuse = _read_csv(landuse_file, index_col="zone_id")
    master_size_spec = _read_csv(size_spec_file)

    # Remove temp rows from the spec; ASim uses them to calculate the other
    # values written to the EDB, but they are not actually part of the
    # utility function themselves.
    spec = spec.loc[~spec.Expression.isna()]
    spec = spec.loc[~spec.Expression.str.startswith("_")].copy()

    settings_file = settings_file.format(name=name)
    with open(os.path.join(edb_directory, settings_file), "r") as yf:
        settings = yaml.load(yf, Loader=yaml.SafeLoader)

    include_settings = settings.get("include_settings")
    if include_settings:
        include_settings = os.path.join(edb_directory, include_settings)
    if include_settings and os.path.exists(include_settings):
        with open(include_settings, "r") as yf:
            more_settings = yaml.load(yf, Loader=yaml.SafeLoader)
        settings.update(more_settings)

    CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME")
    SEGMENT_IDS = settings.get("SEGMENT_IDS")
    if SEGMENT_IDS is None:
        SEGMENTS = settings.get("SEGMENTS")
        if SEGMENTS is not None:
            SEGMENT_IDS = {i: i for i in SEGMENTS}

    SIZE_TERM_SELECTOR = settings.get('SIZE_TERM_SELECTOR', model_selector)

    # Filter the size spec for this location choice model only.
    size_spec = (
        master_size_spec
        .query(f"model_selector == '{SIZE_TERM_SELECTOR}'")
        .drop(columns="model_selector")
        .set_index("segment")
    )
    size_spec = size_spec.loc[:, size_spec.max() > 0]
    size_coef = size_coefficients_from_spec(size_spec)

    indexes_to_drop = [
        "util_size_variable",  # pre-computed size (will be re-estimated)
        "util_size_variable_atwork",  # pre-computed size (will be re-estimated)
        "util_utility_adjustment",  # shadow pricing (ignored in estimation)
        "@df['size_term'].apply(np.log1p)",  # pre-computed size (will be re-estimated)
    ]
    if 'Label' in spec.columns:
        indexes_to_drop = [i for i in indexes_to_drop if i in spec.Label.to_numpy()]
        label_column_name = 'Label'
    elif 'Expression' in spec.columns:
        indexes_to_drop = [i for i in indexes_to_drop if i in spec.Expression.to_numpy()]
        label_column_name = 'Expression'
    else:
        raise ValueError("cannot find Label or Expression in spec file")

    expression_labels = None
    if label_column_name == 'Expression':
        expression_labels = {
            expr: f"variable_label{n:04d}"
            for n, expr in enumerate(spec.Expression.to_numpy())
        }

    # Remove shadow pricing and pre-existing size expressions for re-estimation.
    spec = (
        spec.set_index(label_column_name)
        .drop(index=indexes_to_drop)
        .reset_index()
    )

    if label_column_name == 'Expression':
        spec.insert(0, "Label", spec['Expression'].map(expression_labels))
        alt_values['variable'] = alt_values['variable'].map(expression_labels)
        label_column_name = "Label"

    if name == 'trip_destination':
        CHOOSER_SEGMENT_COLUMN_NAME = 'primary_purpose'
        primary_purposes = spec.columns[3:]
        SEGMENT_IDS = {pp: pp for pp in primary_purposes}

    chooser_index_name = chooser_data.columns[0]
    x_co = chooser_data.set_index(chooser_index_name)
    x_ca = cv_to_ca(
        alt_values.set_index([chooser_index_name, alt_values.columns[1]])
    )

    if CHOOSER_SEGMENT_COLUMN_NAME is not None:
        # label segments with names
        SEGMENT_IDS_REVERSE = {v: k for k, v in SEGMENT_IDS.items()}
        x_co["_segment_label"] = x_co[CHOOSER_SEGMENT_COLUMN_NAME].apply(
            lambda x: SEGMENT_IDS_REVERSE[x]
        )
    else:
        x_co["_segment_label"] = size_spec.index[0]

    # Compute total size values by segment.
    for segment in size_spec.index:
        total_size_segment = pd.Series(0, index=landuse.index)
        x_co["total_size_" + segment] = 0
        for land_use_field in size_spec.loc[segment].index:
            total_size_segment += (
                landuse[land_use_field] * size_spec.loc[segment, land_use_field]
            )
        x_co["total_size_" + segment] = total_size_segment.loc[
            x_co["override_choice"]
        ].to_numpy()

    # For each chooser, collate the appropriate total size value.
    x_co["total_size_segment"] = 0
    for segment in size_spec.index:
        labels = "total_size_" + segment
        rows = x_co["_segment_label"] == segment
        x_co.loc[rows, "total_size_segment"] = x_co[labels][rows]

    # Remove choosers with an invalid observed choice
    # (i.e., where the total size value is zero).
    valid_observed_zone = x_co["total_size_segment"] > 0
    x_co = x_co[valid_observed_zone]
    x_ca = x_ca[x_ca.index.get_level_values(chooser_index_name).isin(x_co.index)]

    # Merge land use characteristics into the CA data.
    try:
        x_ca_1 = pd.merge(x_ca, landuse, on="zone_id", how="left")
    except KeyError:
        # Missing the zone_id variable?  Use the alternative ids instead,
        # which assumes no sampling of alternatives.
        x_ca_1 = pd.merge(
            x_ca,
            landuse,
            left_on=x_ca.index.get_level_values(1),
            right_index=True,
            how="left",
        )
    x_ca_1.index = x_ca.index

    # Availability of choice zones
    if "util_no_attractions" in x_ca_1:
        av = (
            x_ca_1["util_no_attractions"]
            .apply(lambda x: False if x == 1 else True)
            .astype(np.int8)
        )
    elif "@df['size_term']==0" in x_ca_1:
        av = (
            x_ca_1["@df['size_term']==0"]
            .apply(lambda x: False if x == 1 else True)
            .astype(np.int8)
        )
    else:
        av = 1

    d = DataFrames(co=x_co, ca=x_ca_1, av=av)

    m = Model(dataservice=d)
    if len(spec.columns) == 4 and all(
        spec.columns == ['Label', 'Description', 'Expression', 'coefficient']
    ):
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist",),
        )
    elif (
        len(spec.columns) == 4
        and all(spec.columns[:3] == ['Label', 'Description', 'Expression'])
        and len(SEGMENT_IDS) == 1
        and spec.columns[3] == list(SEGMENT_IDS.values())[0]
    ):
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist",),
        )
    else:
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col=label_column_name,
            p_col=SEGMENT_IDS,
            ignore_x=("local_dist",),
            segment_id=CHOOSER_SEGMENT_COLUMN_NAME,
        )

    if CHOOSER_SEGMENT_COLUMN_NAME is None:
        assert len(size_spec) == 1
        m.quantity_ca = sum(
            P(f"{i}_{q}") * X(q)
            for i in size_spec.index
            for q in size_spec.columns
            if size_spec.loc[i, q] != 0
        )
    else:
        m.quantity_ca = sum(
            P(f"{i}_{q}")
            * X(q)
            * X(f"{CHOOSER_SEGMENT_COLUMN_NAME}=={str_repr(SEGMENT_IDS[i])}")
            for i in size_spec.index
            for q in size_spec.columns
            if size_spec.loc[i, q] != 0
        )

    apply_coefficients(coefficients, m)
    apply_coefficients(size_coef, m, minimum=-6, maximum=6)

    m.choice_co_code = "override_choice"

    if return_data:
        return (
            m,
            Dict(
                edb_directory=Path(edb_directory),
                alt_values=alt_values,
                chooser_data=chooser_data,
                coefficients=coefficients,
                landuse=landuse,
                spec=spec,
                size_spec=size_spec,
                master_size_spec=master_size_spec,
                model_selector=model_selector,
                settings=settings,
            ),
        )
    return m
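# Hypothetical usage sketch (not part of the source): assuming an estimation
# data bundle for the workplace location model has been written to the
# default edb_directory, the returned larch Model can be estimated with the
# standard load_data / maximize_loglike workflow.
def _demo_location_choice_estimation():
    m = location_choice_model(name="workplace_location")
    m.load_data()  # assemble arrays from the attached dataservice
    return m.maximize_loglike()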
def schedule_choice_model(
    name,
    edb_directory="output/estimation_data_bundle/{name}/",
    coefficients_file="{name}_coefficients.csv",
    spec_file="{name}_SPEC.csv",
    alt_values_file="{name}_alternatives_combined.csv",
    chooser_file="{name}_choosers_combined.csv",
    settings_file="{name}_model_settings.yaml",
    return_data=False,
):
    model_selector = name.replace("_location", "")
    model_selector = model_selector.replace("_destination", "")
    model_selector = model_selector.replace("_subtour", "")
    model_selector = model_selector.replace("_tour", "")
    edb_directory = edb_directory.format(name=name)

    def _read_csv(filename, optional=False, **kwargs):
        filename = filename.format(name=name)
        try:
            return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)
        except FileNotFoundError:
            if optional:
                return None
            else:
                raise

    settings_file = settings_file.format(name=name)
    with open(os.path.join(edb_directory, settings_file), "r") as yf:
        settings = yaml.load(yf, Loader=yaml.SafeLoader)

    try:
        coefficients = _read_csv(coefficients_file, index_col="coefficient_name")
    except FileNotFoundError:
        # possibly a differently named file is given in the settings
        coefficients_file = settings.get('COEFFICIENTS', coefficients_file)
        coefficients = _read_csv(coefficients_file, index_col="coefficient_name")

    spec = _read_csv(spec_file, comment='#')
    alt_values = _read_csv(alt_values_file)
    chooser_data = _read_csv(chooser_file)

    # Remove temp rows from the spec; ASim uses them to calculate the other
    # values written to the EDB, but they are not actually part of the
    # utility function themselves.
    spec = spec.loc[~spec.Expression.str.startswith("_")].copy()

    include_settings = settings.get("include_settings")
    if include_settings:
        with open(os.path.join(edb_directory, include_settings), "r") as yf:
            more_settings = yaml.load(yf, Loader=yaml.SafeLoader)
        settings.update(more_settings)

    CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME")
    SEGMENT_IDS = settings.get("SEGMENT_IDS")
    if SEGMENT_IDS is None:
        SEGMENTS = settings.get("SEGMENTS")
        if SEGMENTS is not None:
            SEGMENT_IDS = {i: i for i in SEGMENTS}

    if 'Label' in spec.columns:
        label_column_name = 'Label'
    elif 'Expression' in spec.columns:
        label_column_name = 'Expression'
    else:
        raise ValueError("cannot find Label or Expression in spec file")

    m = Model()
    if len(spec.columns) == 4 and (
        [c.lower() for c in spec.columns]
        == ['label', 'description', 'expression', 'coefficient']
    ):
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist",),
        )
    elif (
        len(spec.columns) == 4
        and all(spec.columns[:3] == ['Label', 'Description', 'Expression'])
        and len(SEGMENT_IDS) == 1
        and spec.columns[3] == list(SEGMENT_IDS.values())[0]
    ):
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist",),
        )
    else:
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col=label_column_name,
            p_col=SEGMENT_IDS,
            ignore_x=("local_dist",),
            segment_id=CHOOSER_SEGMENT_COLUMN_NAME,
        )

    apply_coefficients(coefficients, m, minimum=-25, maximum=25)

    chooser_index_name = chooser_data.columns[0]
    x_co = chooser_data.set_index(chooser_index_name)
    alt_values.fillna(0, inplace=True)
    x_ca = cv_to_ca(
        alt_values.set_index([chooser_index_name, alt_values.columns[1]]),
        required_labels=spec[label_column_name],
    )

    # if CHOOSER_SEGMENT_COLUMN_NAME is not None:
    #     # label segments with names
    #     SEGMENT_IDS_REVERSE = {v: k for k, v in SEGMENT_IDS.items()}
    #     x_co["_segment_label"] = x_co[CHOOSER_SEGMENT_COLUMN_NAME].apply(
    #         lambda x: SEGMENT_IDS_REVERSE[x]
    #     )
    # else:
    #     x_co["_segment_label"] = size_spec.index[0]

    alt_codes = np.arange(len(x_ca.index.levels[1])) + 1
    x_ca.index = x_ca.index.set_levels(alt_codes, level=1)
    x_co["override_choice_plus1"] = x_co["override_choice"] + 1
    x_co["model_choice_plus1"] = x_co["model_choice"] + 1

    # Alternatives pinned unavailable in the spec are flagged by coefficients
    # constrained to large negative values; build a joint availability
    # expression from the data terms those coefficients multiply.
    unavail_coefs = coefficients.query("(constrain == 'T') & (value < -900)").index
    unavail_data = [i.data for i in m.utility_ca if i.param in unavail_coefs]
    if len(unavail_data):
        joint_unavail = "|".join(f"({i}>0)" for i in unavail_data)
        joint_avail = f"~({joint_unavail})"
    else:
        joint_avail = 1

    d = DataFrames(co=x_co, ca=x_ca, av=joint_avail)
    m.dataservice = d
    m.choice_co_code = "override_choice_plus1"
    # m.choice_co_code = "model_choice_plus1"

    if return_data:
        return (
            m,
            Dict(
                edb_directory=Path(edb_directory),
                alt_values=alt_values,
                chooser_data=chooser_data,
                coefficients=coefficients,
                spec=spec,
                model_selector=model_selector,
                joint_avail=joint_avail,
            ),
        )
    return m
def mode_choice_model(
    name,
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
    override_filenames=None,
):
    if override_filenames is None:
        override_filenames = {}
    data = simple_simulate_data(
        name=name,
        edb_directory=edb_directory,
        **override_filenames,
    )
    coefficients = data.coefficients
    coef_template = data.coef_template
    spec = data.spec
    chooser_data = data.chooser_data
    settings = data.settings

    chooser_data = clean_values(
        chooser_data,
        alt_names_to_codes=data.alt_names_to_codes,
        choice_code="override_choice_code",
    )

    tree = construct_nesting_tree(data.alt_names, settings["NESTS"])

    purposes = list(coef_template.columns)
    if "atwork" in name:
        purposes = ['atwork']
    elif 'atwork' in purposes:
        purposes.remove('atwork')

    # Set up purpose-specific models.
    m = {purpose: Model(graph=tree, title=purpose) for purpose in purposes}

    for alt_code, alt_name in tree.elemental_names().items():
        # Read in the base utility function for this alt_name.
        u = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=alt_name,
            ignore_x=("#",),
        )
        for purpose in purposes:
            # Modify the utility function based on the template for this purpose.
            u_purp = sum(
                (P(coef_template[purpose].get(i.param, i.param)) * i.data * i.scale)
                for i in u
            )
            m[purpose].utility_co[alt_code] = u_purp

    for model in m.values():
        explicit_value_parameters(model)
    apply_coefficients(coefficients, m)

    avail = construct_availability(
        m[purposes[0]], chooser_data, data.alt_codes_to_names,
    )

    d = DataFrames(
        co=chooser_data,
        av=avail,
        alt_codes=data.alt_codes,
        alt_names=data.alt_names,
    )

    if 'atwork' not in name:
        for purpose, model in m.items():
            model.dataservice = d.selector_co(f"tour_type=='{purpose}'")
            model.choice_co_code = "override_choice_code"
    else:
        for purpose, model in m.items():
            model.dataservice = d
            model.choice_co_code = "override_choice_code"

    from larch.model.model_group import ModelGroup
    mg = ModelGroup(m.values())

    if return_data:
        return (
            mg,
            Dict(
                edb_directory=Path(edb_directory),
                chooser_data=chooser_data,
                avail=avail,
                coefficients=coefficients,
                coef_template=coef_template,
                spec=spec,
                settings=settings,
            ),
        )
    return mg
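# Hypothetical usage sketch (not part of the source): mode_choice_model
# returns a larch ModelGroup of purpose-specific models with shared
# coefficients.  Assuming ModelGroup mirrors the single-Model estimation
# interface (load_data / maximize_loglike), joint estimation across all
# purposes would look like:
#
#     mg = mode_choice_model(name="tour_mode_choice")
#     mg.load_data()
#     mg.maximize_loglike()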
def simple_simulate_model(
    name,
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
    choices=None,
    construct_avail=False,
    values_index_col="household_id",
):
    data = simple_simulate_data(
        name=name,
        edb_directory=edb_directory,
        values_index_col=values_index_col,
    )
    coefficients = data.coefficients
    # coef_template = data.coef_template  # not used
    spec = data.spec
    chooser_data = data.chooser_data
    settings = data.settings
    alt_names = data.alt_names
    alt_codes = data.alt_codes

    from .general import clean_values
    chooser_data = clean_values(
        chooser_data,
        alt_names_to_codes=choices or data.alt_names_to_codes,
        choice_code="override_choice_code",
    )

    if settings.get('LOGIT_TYPE') == 'NL':
        tree = construct_nesting_tree(data.alt_names, settings["NESTS"])
        m = Model(graph=tree)
    else:
        m = Model(alts=data.alt_codes_to_names)

    m.utility_co = dict_of_linear_utility_from_spec(
        spec,
        "Label",
        dict(zip(alt_names, alt_codes)),
    )

    apply_coefficients(coefficients, m)

    if construct_avail:
        avail = construct_availability(m, chooser_data, data.alt_codes_to_names)
    else:
        avail = True

    d = DataFrames(
        co=chooser_data,
        av=avail,
        alt_codes=alt_codes,
        alt_names=alt_names,
    )

    m.dataservice = d
    m.choice_co_code = "override_choice_code"
    if return_data:
        return (
            m,
            Dict(
                edb_directory=data.edb_directory,
                chooser_data=chooser_data,
                coefficients=coefficients,
                spec=spec,
                alt_names=alt_names,
                alt_codes=alt_codes,
                settings=settings,
            ),
        )
    return m
def test_dfs_feathers():
    import tempfile
    m = example(1, legacy=True)
    with tempfile.TemporaryDirectory() as td:
        m.load_data()
        filename = os.path.join(td, 'dfs')
        m.dataframes.to_feathers(filename)
        d_co = m.dataframes.data_co.copy()
        d_ca = m.dataframes.data_ca.copy()
        d_ch = m.dataframes.data_ch.copy()
        d_av = m.dataframes.data_av.copy()
        m.dataframes.data_co.iloc[:] = 0.0
        m.dataframes.data_ca.iloc[:] = 0.0
        m.dataframes.data_ch.iloc[:] = 0.0
        m.dataframes.data_av.iloc[:] = 0.0
        assert all(m.dataframes.array_co().reshape(-1) == 0)
        assert all(m.dataframes.array_ca().reshape(-1) == 0)
        assert all(m.dataframes.array_ch().reshape(-1) == 0)
        assert all(m.dataframes.array_av().reshape(-1) == 0)
        m.dataframes.inject_feathers(filename)
        pandas.testing.assert_frame_equal(m.dataframes.data_co, d_co)
        pandas.testing.assert_frame_equal(m.dataframes.data_ca, d_ca)
        pandas.testing.assert_frame_equal(m.dataframes.data_ch, d_ch)
        pandas.testing.assert_frame_equal(m.dataframes.data_av, d_av)

        df = pandas.read_csv(example_file("MTCwork.csv.gz"))
        df.set_index(['casenum', 'altnum'], inplace=True)
        ds = DataFrames(df)
        filename2 = os.path.join(td, 'dfs1')
        ds.to_feathers(filename2)
        d_ce = ds.data_ce.copy()
        ds.data_ce.iloc[:] = 0.0
        assert all(ds.array_ce().reshape(-1) == 0)
        ds.inject_feathers(filename2)
        pandas.testing.assert_frame_equal(ds.data_ce, d_ce)

        filename3 = os.path.join(td, 'dfs2')
        ds.to_feathers(filename3)
        ds2 = DataFrames.from_feathers(filename3)
        pandas.testing.assert_index_equal(
            ds.alternative_codes(),
            ds2.alternative_codes(),
            check_names=False,
        )
        pandas.testing.assert_frame_equal(ds.data_ce, ds2.data_ce)
        pandas.testing.assert_frame_equal(ds.data_av, ds2.data_av)
        pandas.testing.assert_index_equal(ds.caseindex, ds2.caseindex)

        dfs2 = DataFrames.from_feathers(filename)
        pandas.testing.assert_index_equal(
            m.dataframes.alternative_codes(),
            dfs2.alternative_codes(),
            check_names=False,
        )
        pandas.testing.assert_frame_equal(m.dataframes.data_co, dfs2.data_co)
        pandas.testing.assert_frame_equal(m.dataframes.data_ca, dfs2.data_ca)
        pandas.testing.assert_frame_equal(m.dataframes.data_av, dfs2.data_av)
        pandas.testing.assert_index_equal(m.dataframes.caseindex, dfs2.caseindex)
def test_dfs_info():
    from larch.data_warehouse import example_file
    df = pandas.read_csv(example_file("MTCwork.csv.gz"))
    df.set_index(['casenum', 'altnum'], inplace=True)
    ds = DataFrames(df)

    s = io.StringIO()
    ds.info(out=s)
    assert s.getvalue() == (
        'larch.DataFrames: (not computation-ready)\n'
        ' n_cases: 5029\n'
        ' n_alts: 6\n'
        ' data_ce: 36 variables, 22033 rows\n'
        ' data_co: <not populated>\n'
        ' data_av: <populated>\n'
    )

    s = io.StringIO()
    ds.info(out=s, verbose=True)
    assert s.getvalue() == (
        'larch.DataFrames: (not computation-ready)\n n_cases: 5029\n n_alts: 6\n data_ce: 22033 rows\n'
        ' - chose (22033 non-null int64)\n - ivtt (22033 non-null float64)\n'
        ' - ovtt (22033 non-null float64)\n - tottime (22033 non-null float64)\n'
        ' - totcost (22033 non-null float64)\n - hhid (22033 non-null int64)\n'
        ' - perid (22033 non-null int64)\n - numalts (22033 non-null int64)\n'
        ' - dist (22033 non-null float64)\n - wkzone (22033 non-null int64)\n'
        ' - hmzone (22033 non-null int64)\n - rspopden (22033 non-null float64)\n'
        ' - rsempden (22033 non-null float64)\n - wkpopden (22033 non-null float64)\n'
        ' - wkempden (22033 non-null float64)\n - vehavdum (22033 non-null int64)\n'
        ' - femdum (22033 non-null int64)\n - age (22033 non-null int64)\n'
        ' - drlicdum (22033 non-null int64)\n - noncadum (22033 non-null int64)\n'
        ' - numveh (22033 non-null int64)\n - hhsize (22033 non-null int64)\n'
        ' - hhinc (22033 non-null float64)\n - famtype (22033 non-null int64)\n'
        ' - hhowndum (22033 non-null int64)\n - numemphh (22033 non-null int64)\n'
        ' - numadlt (22033 non-null int64)\n - nmlt5 (22033 non-null int64)\n'
        ' - nm5to11 (22033 non-null int64)\n - nm12to16 (22033 non-null int64)\n'
        ' - wkccbd (22033 non-null int64)\n - wknccbd (22033 non-null int64)\n'
        ' - corredis (22033 non-null int64)\n - vehbywrk (22033 non-null float64)\n'
        ' - vocc (22033 non-null int64)\n - wgt (22033 non-null int64)\n'
        ' data_co: <not populated>\n data_av: <populated>\n'
    )

    assert not ds.computational
    assert not ds.is_computational_ready()
    ds.computational = True
    assert ds.is_computational_ready()
    assert ds.computational

    s = io.StringIO()
    ds.info(out=s)
    assert s.getvalue() == (
        'larch.DataFrames:\n'
        ' n_cases: 5029\n'
        ' n_alts: 6\n'
        ' data_ce: 36 variables, 22033 rows\n'
        ' data_co: <not populated>\n'
        ' data_av: <populated>\n'
    )
def test_repeated_splitting():
    df = pandas.read_csv(example_file("MTCwork.csv.gz"))
    df.set_index(['casenum', 'altnum'], inplace=True)

    dfs = DataFrames(df, crack=False)
    d1, d2 = dfs.split([80, 20])
    assert d1.n_cases == 4024
    assert d2.n_cases == 1005
    d11, d12 = d1.split([50, 50])
    assert d11.n_cases == 2012
    assert d12.n_cases == 2012

    dfs = DataFrames(df, crack=False)
    d1, d2 = dfs.split([80, 20], method='shuffle')
    assert d1.n_cases == 4024
    assert d2.n_cases == 1005
    d11, d12 = d1.split([50, 50])
    assert d11.n_cases == 2012
    assert d12.n_cases == 2012

    dfs = DataFrames(df, crack=True)
    d1, d2 = dfs.split([80, 20])
    assert d1.n_cases == 4024
    assert d2.n_cases == 1005
    d11, d12 = d1.split([50, 50])
    assert d11.n_cases == 2012
    assert d12.n_cases == 2012

    dfs = DataFrames(df, crack=True)
    d1, d2 = dfs.split([80, 20], method='shuffle')
    assert d1.n_cases == 4024
    assert d2.n_cases == 1005
    d11, d12 = d1.split([50, 50])
    assert d11.n_cases == 2012
    assert d12.n_cases == 2012
def stop_frequency_model(
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
):
    data = stop_frequency_data(
        edb_directory=edb_directory,
        values_index_col="tour_id",
    )

    models = []
    for n in range(len(data.spec)):
        coefficients = data.coefficients
        # coef_template = data.coef_template  # not used
        spec = data.spec[n]
        chooser_data = data.chooser_data[n]
        settings = data.settings
        alt_names = data.alt_names[n]
        alt_codes = data.alt_codes[n]

        from .general import clean_values
        chooser_data = clean_values(
            chooser_data,
            alt_names_to_codes=data.alt_names_to_codes[n],
            choice_code="override_choice_code",
        )

        if settings.get('LOGIT_TYPE') == 'NL':
            tree = construct_nesting_tree(data.alt_names[n], settings["NESTS"])
            m = Model(graph=tree)
        else:
            m = Model()

        m.utility_co = dict_of_linear_utility_from_spec(
            spec,
            "Label",
            dict(zip(alt_names, alt_codes)),
        )

        apply_coefficients(coefficients, m)

        avail = True

        d = DataFrames(
            co=chooser_data,
            av=avail,
            alt_codes=alt_codes,
            alt_names=alt_names,
        )

        m.dataservice = d
        m.choice_co_code = "override_choice_code"
        models.append(m)

    from larch.model.model_group import ModelGroup
    models = ModelGroup(models)

    if return_data:
        return (models, data)
    return models
def auto_ownership_model(
    name="auto_ownership",
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
):
    data = simple_simulate_data(
        name=name,
        edb_directory=edb_directory,
        values_index_col="household_id",
    )
    coefficients = data.coefficients
    # coef_template = data.coef_template  # not used
    spec = data.spec
    chooser_data = data.chooser_data
    settings = data.settings

    altnames = list(spec.columns[3:])
    altcodes = range(len(altnames))

    chooser_data = remove_apostrophes(chooser_data)
    chooser_data.fillna(0, inplace=True)

    # Remove choosers with an invalid observed choice.
    chooser_data = chooser_data[chooser_data["override_choice"] >= 0]

    m = Model()
    # One of the alternatives is coded as 0, so we need to explicitly
    # initialize the MNL nesting graph and set the root_id to a value
    # other than zero.
    m.initialize_graph(alternative_codes=altcodes, root_id=99)
    m.utility_co = dict_of_linear_utility_from_spec(
        spec,
        "Label",
        dict(zip(altnames, altcodes)),
    )
    apply_coefficients(coefficients, m)

    d = DataFrames(
        co=chooser_data,
        av=True,
        alt_codes=altcodes,
        alt_names=altnames,
    )

    m.dataservice = d
    m.choice_co_code = "override_choice"
    if return_data:
        return (
            m,
            Dict(
                edb_directory=data.edb_directory,
                chooser_data=chooser_data,
                coefficients=coefficients,
                spec=spec,
                altnames=altnames,
                altcodes=altcodes,
            ),
        )
    return m