Exemple #1
0
def test_co_only():
    """DataFrames built from idco-only data require explicit alt_codes,
    reject idca usage, and support per-alternative availability expressions."""
    y = numpy.random.random([20, 3])
    y[5, 0] = 1.5
    x_co = pandas.DataFrame(
        y,
        columns=['Aa', 'Bb', 'Cc'],
    )
    x_co.index.name = 'caseid'

    with raises(ValueError):
        # missing altcodes
        DataFrames(
            co=x_co,
            av=True,
        )

    with raises(ValueError):
        # trying to use ca
        DataFrames(
            ca=x_co,
            av=True,
        )

    d = DataFrames(
        co=x_co,
        av=True,
        alt_codes=[1, 2, 3, 4, 5, 6],
    )
    assert d.n_alts == 6
    assert d.n_cases == 20
    assert d.data_av.shape == (20, 6)
    # BUG FIX: these two checks had no `assert`, so they could never fail.
    assert all(d.data_av.dtypes == numpy.int8)
    # `all(d.data_av == 1)` iterated column *labels*, not values; check all
    # cells explicitly instead.
    assert (d.data_av == 1).all().all()

    # availability given as a dict of per-alternative expressions; case 5 has
    # Aa == 1.5, so alternative 2 ('Aa < 1.1') is unavailable there.
    d = DataFrames(
        co=x_co,
        av={
            1: True,
            2: 'Aa < 1.1',
            3: True,
            4: True,
            5: True,
            6: True,
        },
        alt_codes=[1, 2, 3, 4, 5, 6],
    )
    assert d.n_alts == 6
    assert d.n_cases == 20
    assert d.data_av.shape == (20, 6)
    assert all(d.data_av.dtypes == numpy.int8)
    assert d.data_av.sum().sum() == 119
    assert d.data_av.iloc[5].loc[2] == 0
Exemple #2
0
def test_promotion_ce_to_ca():
    """Promoting sparse idce data to dense idca changes shape as expected."""
    from larch.data_warehouse import example_file

    source = pandas.read_csv(example_file('MTCwork.csv.gz'),
                             index_col=('casenum', 'altnum'))
    d = DataFrames(source, ch="chose", crack=True)
    # initially only the sparse (ce) representation exists
    assert d.data_ca is None
    assert d.data_ce is not None
    assert d.data_ce.shape == (22033, 5)
    # promote in place, generating an availability column
    d.data_ce_as_ca("_avail_")
    assert d.data_ca is not None
    assert d.data_ce is None
    assert d.data_ca.shape == (30174, 6)
Exemple #3
0
def test_service_idco():
    """make_idco extracts case-level columns (and constants) from ce data."""
    frame = pandas.read_csv(example_file("MTCwork.csv.gz"))
    frame.set_index(['casenum', 'altnum'], inplace=True)
    dfs = DataFrames(frame, crack=True)

    # a literal constant broadcasts to one value per case
    constant = dfs.make_idco('1')
    assert (constant == 1).shape == (5029, 1)
    assert numpy.all(constant == 1)

    # a named idco variable yields one row per case
    ages = dfs.make_idco('age')
    assert ages.shape == (5029, 1)
    assert numpy.all(ages.iloc[:5, 0] == [35, 40, 28, 34, 43])
    assert numpy.all(ages.iloc[-5:, 0] == [58, 33, 34, 35, 37])

    # multiple expressions produce multiple columns
    pair = dfs.make_idco('age', '1')
    assert pair.shape == (5029, 2)
def nonmand_tour_freq_model(
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
):
    """
    Build one larch Model per segment for non-mandatory tour frequency.

    Parameters
    ----------
    edb_directory : str
        Template path to the estimation data bundle; ``{name}`` is filled
        in by the data loader.
    return_data : bool
        If True, also return the loaded data bundle alongside the models.

    Returns
    -------
    dict[str, Model] or (dict[str, Model], data)
        Mapping of segment name to its Model; plus the data bundle when
        `return_data` is True.
    """
    data = interaction_simulate_data(
        name="non_mandatory_tour_frequency",
        edb_directory=edb_directory,
    )

    settings = data.settings
    segment_names = [s["NAME"] for s in settings["SPEC_SEGMENTS"]]
    # Link coefficients that carry identical values across segments so they
    # are treated as one shared parameter; the relabeling map is retained
    # on the data bundle.
    data.relabel_coef = link_same_value_coefficients(segment_names,
                                                     data.coefficients,
                                                     data.spec)
    spec = data.spec
    coefficients = data.coefficients
    chooser_data = data.chooser_data
    alt_values = data.alt_values
    alt_def = data.alt_def

    m = {}
    for segment_name in segment_names:
        segment_model = m[segment_name] = Model()
        # One of the alternatives is coded as 0, so
        # we need to explicitly initialize the MNL nesting graph
        # and set the root_id to a value other than zero.
        segment_model.initialize_graph(alternative_codes=alt_def.index,
                                       root_id=9999)

        # Utility specifications: one spec column per segment.
        segment_model.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=segment_name,
        )
        apply_coefficients(coefficients[segment_name], segment_model)
        segment_model.choice_co_code = "override_choice"

        # Attach Data: chooser (co) data indexed by person, alternative (ca)
        # data pivoted from the long variable format.
        x_co = (chooser_data[segment_name].set_index("person_id").rename(
            columns={"TAZ": "HOMETAZ"}))
        x_ca = cv_to_ca(alt_values[segment_name].set_index(
            ["person_id", "variable"]))
        d = DataFrames(
            co=x_co,
            ca=x_ca,
            # alternatives flagged unavailable by the model are masked out
            av=~unavail(segment_model, x_ca),
        )
        m[segment_name].dataservice = d

    if return_data:
        return m, data
    return m
Exemple #5
0
def mtc2():
    """Build two related weighted DataFrames from the MTC example data.

    Returns
    -------
    (DataFrames, DataFrames)
        ``j1`` holds the original cases with a combined choice array
        (original choices plus an extra 2.0 on the second alternative);
        ``j2`` holds every case duplicated, with the extra choice weight
        applied only to the duplicate copies.  Both have weights
        autoscaled before returning.
    """
    d = MTC()
    d1 = d.make_dataframes({
        'ca': (
            'ivtt',
            'ovtt',
            'totcost',
            'chose',
            'tottime',
        ),
        'co': ('age', 'hhinc', 'hhsize', 'numveh==0'),
        'avail_ca': '_avail_',
        'choice_ca': 'chose',
    })

    # Duplicate every case, stacked end-to-end with a fresh case index.
    df_co2 = pd.concat([d1.data_co, d1.data_co]).reset_index(drop=True)
    df_ca2 = pd.concat([d1.data_ca.unstack(),
                        d1.data_ca.unstack()]).reset_index(drop=True).stack()
    df_av2 = pd.concat([d1.data_av, d1.data_av]).reset_index(drop=True)

    # Alternate choice array: weight 2.0 on the second alternative only.
    df_chX = pd.DataFrame(
        np.zeros_like(d1.data_ch.values),
        index=d1.data_ch.index,
        columns=d1.data_ch.columns,
    )
    df_chX.iloc[:, 1] = 2.0

    df_ch2 = pd.concat([d1.data_ch, df_chX]).reset_index(drop=True)

    # FIX: `Model` was imported here but never used; import only DataFrames.
    from larch import DataFrames

    j1 = DataFrames(
        co=d1.data_co,
        ca=d1.data_ca,
        av=d1.data_av,
        ch=df_chX + d1.data_ch,
    )

    j2 = DataFrames(
        co=df_co2,
        ca=df_ca2,
        av=df_av2,
        ch=df_ch2,
    )

    j1.autoscale_weights()
    j2.autoscale_weights()
    return j1, j2
Exemple #6
0
def cdap_dataframes(households, values):
    """Build one DataFrames per household size for CDAP estimation.

    Splits the household data by household size, then pairs each split
    with its generated alternative set, mapping the observed choice names
    to alternative codes.  All alternatives are marked available.
    """
    split = cdap_split_data(households, values)
    frames = {}
    for size, table in split.items():
        alt_map = generate_alternatives(size)
        frames[size] = DataFrames(
            co=table,
            alt_names=alt_map.keys(),
            alt_codes=alt_map.values(),
            av=1,
            ch=table.override_choice.map(alt_map),
        )
    return frames
Exemple #7
0
def test_dfs_init_co():
    """DataFrames built from idco data: choice and weight can be given
    either by column name or as a Series, with identical results."""
    from larch.data_warehouse import example_file
    raw_data = pandas.read_csv(example_file('swissmetro.csv.gz'))
    selected_data = raw_data[raw_data.eval("PURPOSE in (1,3) and CHOICE != 0")]

    def _check_co_only(dfs):
        # common checks for a co-only DataFrames with 3 alternatives
        assert dfs.data_co.shape == (6768, 28)
        assert dfs.data_ca is None
        assert dfs.data_ce is None

    def _check_with_choice(dfs):
        # common checks when a choice column has been attached
        _check_co_only(dfs)
        assert dfs.data_ch is not None
        assert dfs.data_ch.shape == (6768, 3)
        assert all(dfs.data_ch.sum() == [908, 4090, 1770])
        assert dfs.data_av is None

    # positional co data
    d0 = DataFrames(selected_data, alt_codes=[1, 2, 3])
    _check_co_only(d0)
    assert d0.data_ch is None
    assert d0.data_av is None

    # keyword co data
    d1 = DataFrames(co=selected_data, alt_codes=[1, 2, 3])
    _check_co_only(d1)
    assert d1.data_ch is None
    assert d1.data_av is None

    # idco data cannot be passed as ca
    with raises(ValueError):
        DataFrames(ca=selected_data, alt_codes=[1, 2, 3])

    # choice by column name
    d2 = DataFrames(co=selected_data, alt_codes=[1, 2, 3], ch='CHOICE')
    _check_with_choice(d2)

    # choice as a Series
    d2 = DataFrames(co=selected_data,
                    alt_codes=[1, 2, 3],
                    ch=selected_data.CHOICE)
    _check_with_choice(d2)

    # weight by column name
    d2 = DataFrames(co=selected_data,
                    alt_codes=[1, 2, 3],
                    ch='CHOICE',
                    wt='GROUP')
    _check_with_choice(d2)
    assert d2.data_wt is not None
    assert d2.data_wt.shape == (6768, 1)

    # weight as a Series
    d2 = DataFrames(co=selected_data,
                    alt_codes=[1, 2, 3],
                    ch='CHOICE',
                    wt=selected_data.GROUP)
    _check_with_choice(d2)
    assert d2.data_wt is not None
    assert d2.data_wt.shape == (6768, 1)
Exemple #8
0
def test_dfs_init_ca():
    """DataFrames built from idca data: weight/choice can be given by name
    or Series; without `crack`, data stays in sparse (ce) format."""
    from larch.data_warehouse import example_file

    df = pandas.read_csv(example_file("MTCwork.csv.gz"),
                         index_col=['casenum', 'altnum'])

    # choice by column name, weight via wt_name
    d0 = DataFrames(ca=df, crack=True, ch='chose', wt_name='wgt')
    assert d0.data_wt is not None
    assert d0.data_wt.columns == 'wgt'
    assert d0.data_ch is not None
    assert d0.data_ch.shape == (5029, 6)
    assert d0.data_av is not None
    assert d0.data_av.shape == (5029, 6)

    # no choice or weight given
    d1 = DataFrames(ca=df, crack=True)
    assert d1.data_wt is None
    assert d1.data_ch is None
    assert d1.data_av is not None
    assert d1.data_av.shape == (5029, 6)

    # without crack, the data remains sparse (ce), not dense (ca)
    d2 = DataFrames(df)
    assert d2.data_wt is None
    assert d2.data_ch is None
    assert d2.data_av is not None
    assert d2.data_av.shape == (5029, 6)
    assert d2.data_co is None
    assert d2.data_ca is None
    assert d2.data_ce is not None
    assert d2.data_ce.shape == (22033, 36)

    # choice as a Series; resulting choice array has no missing values
    d3 = DataFrames(ca=df, crack=True, ch=df.chose, wt_name='wgt')
    assert d3.data_wt is not None
    assert d3.data_wt.columns == 'wgt'
    assert d3.data_ch is not None
    assert d3.data_ch.shape == (5029, 6)
    assert d3.data_av is not None
    assert d3.data_av.shape == (5029, 6)
    assert pandas.isna(d3.data_ch).sum().sum() == 0

    # weight via the `wt` keyword, by name
    d4 = DataFrames(ca=df, crack=True, ch=df.chose, wt='wgt')
    assert d4.data_wt is not None
    assert d4.data_wt.columns == 'wgt'
    assert d4.data_ch is not None
    assert d4.data_ch.shape == (5029, 6)
    assert d4.data_av is not None
    assert d4.data_av.shape == (5029, 6)

    # weight via the `wt` keyword, as a Series; crack splits out co columns
    d5 = DataFrames(ca=df, crack=True, ch=df.chose, wt=df.wgt)
    assert d5.data_wt is not None
    assert d5.data_wt.columns == 'wgt'
    assert d5.data_ch is not None
    assert d5.data_ch.shape == (5029, 6)
    assert d5.data_av is not None
    assert d5.data_av.shape == (5029, 6)
    assert d5.data_co.shape == (5029, 31)
    assert d5.data_ca is None
    assert d5.data_ce is not None
    assert d5.data_ce.shape == (22033, 5)

    # idca-format data cannot be given as co
    # (FIX: dropped the unused `bad =` assignment)
    with raises(ValueError):
        DataFrames(co=df)
Exemple #9
0
def test_ce_initialization():
    """idce construction infers alternative codes and names from the index;
    non-integer case ids are rejected."""
    import pytest

    cax = pandas.DataFrame({
        'caseid': [1, 1, 1, 2, 2],
        'caseid_bad': ['x', 'x', 'x', 'y', 'y'],
        'altid_bad': ['aa', 'bb', 'cc', 'aa', 'bb'],
        'altid_good': [1, 2, 3, 1, 2],
        'altid_str': ['1', '2', '3', '1', '2'],
        'buggers': [1.2, 3.4, 5.6, 7.8, 9.0],
        'baggers': [22, 33, 44, 55, 66],
    })

    # non-integer case ids are not allowed
    with pytest.raises(ValueError):
        DataFrames(cax.set_index(['caseid_bad', 'altid_bad']))

    # string alt ids are mapped to integer codes, keeping the names
    d = DataFrames(cax.set_index(['caseid', 'altid_bad']))
    assert len(d.data_ce) == 5
    assert d.data_ca is None
    assert all(d.alternative_codes() == numpy.asarray([1, 2, 3]))
    assert all(d.alternative_names() == numpy.asarray(['aa', 'bb', 'cc']))

    # integer alt ids are used directly
    d = DataFrames(cax.set_index(['caseid', 'altid_good']))
    assert len(d.data_ce) == 5
    assert d.data_ca is None
    assert all(d.alternative_codes() == [1, 2, 3])

    # numeric strings are converted to integer codes
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_str']))
    assert len(d.data_ce) == 5
    assert d.data_ca is None
    assert all(d.alternative_codes() == [1, 2, 3])
Exemple #10
0
def test_ca_initialization():
    """idca construction infers alt codes/names from the index; availability
    may be given as an expression evaluated against the ca data."""
    import pytest

    cax = pandas.DataFrame({
        'caseid': [1, 1, 1, 2, 2, 2],
        'caseid_bad': ['x', 'x', 'x', 'y', 'y', 'y'],
        'altid_bad': ['aa', 'bb', 'cc', 'aa', 'bb', 'cc'],
        'altid_good': [1, 2, 3, 1, 2, 3],
        'altid_str': ['1', '2', '3', '1', '2', '3'],
        'buggers': [1.2, 3.4, 5.6, 7.8, 9.0, 5.5],
        'baggers': [22, 33, 44, 55, 66, 77],
    })

    # non-integer case ids are not allowed
    with pytest.raises(ValueError):
        DataFrames(ca=cax.set_index(['caseid_bad', 'altid_bad']))

    # string alt ids are mapped to integer codes, keeping the names
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_bad']))
    assert all(d.alternative_codes() == numpy.asarray([1, 2, 3]))
    assert all(d.alternative_names() == numpy.asarray(['aa', 'bb', 'cc']))

    # integer alt ids are used directly
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_good']))
    assert all(d.alternative_codes() == [1, 2, 3])

    # numeric strings are converted
    d = DataFrames(ca=cax.set_index(['caseid', 'altid_str']))
    assert all(d.alternative_codes() == [1, 2, 3])

    # availability from an expression: only the first row fails baggers > 25
    d = DataFrames(
        ca=cax.set_index(['caseid', 'altid_good']),
        av='baggers > 25',
    )
    assert all(d.alternative_codes() == [1, 2, 3])
    availability = d.data_av.to_numpy()
    assert availability.shape == (2, 3)
    assert availability.ravel() == approx([0, 1, 1, 1, 1, 1])
def location_choice_model(
    name="workplace_location",
    edb_directory="output/estimation_data_bundle/{name}/",
    coefficients_file="{name}_coefficients.csv",
    spec_file="{name}_SPEC.csv",
    size_spec_file="{name}_size_terms.csv",
    alt_values_file="{name}_alternatives_combined.csv",
    chooser_file="{name}_choosers_combined.csv",
    settings_file="{name}_model_settings.yaml",
    landuse_file="{name}_landuse.csv",
    return_data=False,
):
    """
    Construct a larch location/destination choice Model from an
    estimation data bundle (EDB) written by ActivitySim.

    Parameters
    ----------
    name : str
        Model name; substituted for ``{name}`` in `edb_directory` and in
        every file-name template below.
    edb_directory : str
        Template path of the estimation data bundle directory.
    coefficients_file, spec_file, size_spec_file, alt_values_file,
    chooser_file, settings_file, landuse_file : str
        File-name templates for the EDB inputs.
    return_data : bool
        If True, also return a Dict of the loaded and derived data.

    Returns
    -------
    Model or (Model, Dict)
    """
    # The size-term selector defaults to the model name stripped of its
    # location/destination/tour suffixes.
    model_selector = name.replace("_location", "")
    model_selector = model_selector.replace("_destination", "")
    model_selector = model_selector.replace("_subtour", "")
    model_selector = model_selector.replace("_tour", "")
    if model_selector == 'joint':
        model_selector = 'non_mandatory'
    edb_directory = edb_directory.format(name=name)

    def _read_csv(filename, **kwargs):
        # Read a CSV from inside the EDB, filling the {name} template.
        filename = filename.format(name=name)
        return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)

    coefficients = _read_csv(
        coefficients_file,
        index_col="coefficient_name",
    )
    spec = _read_csv(spec_file, comment="#")
    alt_values = _read_csv(alt_values_file)
    chooser_data = _read_csv(chooser_file)
    landuse = _read_csv(landuse_file, index_col="zone_id")
    master_size_spec = _read_csv(size_spec_file)

    # remove temp rows from spec, ASim uses them to calculate the other values written
    # to the EDB, but they are not actually part of the utility function themselves.
    spec = spec.loc[~spec.Expression.isna()]
    spec = spec.loc[~spec.Expression.str.startswith("_")].copy()

    settings_file = settings_file.format(name=name)
    with open(os.path.join(edb_directory, settings_file), "r") as yf:
        settings = yaml.load(
            yf,
            Loader=yaml.SafeLoader,
        )

    # Merge in any secondary settings file referenced by the main one.
    include_settings = settings.get("include_settings")
    if include_settings:
        include_settings = os.path.join(edb_directory, include_settings)
    if include_settings and os.path.exists(include_settings):
        with open(include_settings, "r") as yf:
            more_settings = yaml.load(
                yf,
                Loader=yaml.SafeLoader,
            )
        settings.update(more_settings)

    CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME")
    SEGMENT_IDS = settings.get("SEGMENT_IDS")
    if SEGMENT_IDS is None:
        SEGMENTS = settings.get("SEGMENTS")
        if SEGMENTS is not None:
            # fall back to using each segment name as its own id
            SEGMENT_IDS = {i: i for i in SEGMENTS}

    SIZE_TERM_SELECTOR = settings.get('SIZE_TERM_SELECTOR', model_selector)

    # filter size spec for this location choice only, keeping only
    # land-use columns with at least one nonzero entry
    size_spec = (master_size_spec.query(
        f"model_selector == '{SIZE_TERM_SELECTOR}'").drop(
            columns="model_selector").set_index("segment"))
    size_spec = size_spec.loc[:, size_spec.max() > 0]

    size_coef = size_coefficients_from_spec(size_spec)

    indexes_to_drop = [
        "util_size_variable",  # pre-computed size (will be re-estimated)
        "util_size_variable_atwork",  # pre-computed size (will be re-estimated)
        "util_utility_adjustment",  # shadow pricing (ignored in estimation)
        "@df['size_term'].apply(np.log1p)",  # pre-computed size (will be re-estimated)
    ]
    # Labels may live in a 'Label' column or be the raw 'Expression' text.
    if 'Label' in spec.columns:
        indexes_to_drop = [
            i for i in indexes_to_drop if i in spec.Label.to_numpy()
        ]
        label_column_name = 'Label'
    elif 'Expression' in spec.columns:
        indexes_to_drop = [
            i for i in indexes_to_drop if i in spec.Expression.to_numpy()
        ]
        label_column_name = 'Expression'
    else:
        raise ValueError("cannot find Label or Expression in spec file")

    # When only expressions are given, generate synthetic labels so the
    # spec rows and the alternative values can be matched by label.
    expression_labels = None
    if label_column_name == 'Expression':
        expression_labels = {
            expr: f"variable_label{n:04d}"
            for n, expr in enumerate(spec.Expression.to_numpy())
        }

    # Remove shadow pricing and pre-existing size expression for re-estimation
    spec = (spec.set_index(label_column_name).drop(
        index=indexes_to_drop).reset_index())

    if label_column_name == 'Expression':
        spec.insert(0, "Label", spec['Expression'].map(expression_labels))
        alt_values['variable'] = alt_values['variable'].map(expression_labels)
        label_column_name = "Label"

    if name == 'trip_destination':
        # trip destination is segmented by primary purpose, with one spec
        # column per purpose starting at column 3
        CHOOSER_SEGMENT_COLUMN_NAME = 'primary_purpose'
        primary_purposes = spec.columns[3:]
        SEGMENT_IDS = {pp: pp for pp in primary_purposes}

    # Chooser (co) data indexed by the first column; alternative (ca) data
    # pivoted from long format on (chooser id, variable).
    chooser_index_name = chooser_data.columns[0]
    x_co = chooser_data.set_index(chooser_index_name)
    x_ca = cv_to_ca(
        alt_values.set_index([chooser_index_name, alt_values.columns[1]]))

    if CHOOSER_SEGMENT_COLUMN_NAME is not None:
        # label segments with names
        SEGMENT_IDS_REVERSE = {v: k for k, v in SEGMENT_IDS.items()}
        x_co["_segment_label"] = x_co[CHOOSER_SEGMENT_COLUMN_NAME].apply(
            lambda x: SEGMENT_IDS_REVERSE[x])
    else:
        x_co["_segment_label"] = size_spec.index[0]

    # compute total size values by segment
    for segment in size_spec.index:
        total_size_segment = pd.Series(0, index=landuse.index)
        x_co["total_size_" + segment] = 0
        for land_use_field in size_spec.loc[segment].index:
            total_size_segment += (landuse[land_use_field] *
                                   size_spec.loc[segment, land_use_field])
        x_co["total_size_" + segment] = total_size_segment.loc[
            x_co["override_choice"]].to_numpy()

    # for each chooser, collate the appropriate total size value
    x_co["total_size_segment"] = 0
    for segment in size_spec.index:
        labels = "total_size_" + segment
        rows = x_co["_segment_label"] == segment
        x_co.loc[rows, "total_size_segment"] = x_co[labels][rows]

    # Remove choosers with invalid observed choice (appropriate total size value = 0)
    valid_observed_zone = x_co["total_size_segment"] > 0
    x_co = x_co[valid_observed_zone]
    x_ca = x_ca[x_ca.index.get_level_values(chooser_index_name).isin(
        x_co.index)]

    # Merge land use characteristics into CA data
    try:
        x_ca_1 = pd.merge(x_ca, landuse, on="zone_id", how="left")
    except KeyError:
        # Missing the zone_id variable?
        # Use the alternative id's instead, which assumes no sampling of alternatives
        x_ca_1 = pd.merge(x_ca,
                          landuse,
                          left_on=x_ca.index.get_level_values(1),
                          right_index=True,
                          how="left")
    x_ca_1.index = x_ca.index

    # Availability of choice zones: a zone flagged as having no attractions
    # (value 1 in either indicator column) is unavailable.
    if "util_no_attractions" in x_ca_1:
        av = x_ca_1["util_no_attractions"].apply(
            lambda x: False if x == 1 else True).astype(np.int8)
    elif "@df['size_term']==0" in x_ca_1:
        av = x_ca_1["@df['size_term']==0"].apply(
            lambda x: False if x == 1 else True).astype(np.int8)
    else:
        av = 1

    d = DataFrames(co=x_co, ca=x_ca_1, av=av)

    m = Model(dataservice=d)
    # Pick the utility specification form based on the spec columns:
    # a single 'coefficient' column, a single-segment column, or one
    # column per segment.
    if len(spec.columns) == 4 and all(
            spec.columns ==
        ['Label', 'Description', 'Expression', 'coefficient']):
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist", ),
        )
    elif len(spec.columns) == 4 \
            and all(spec.columns[:3] == ['Label', 'Description', 'Expression']) \
            and len(SEGMENT_IDS) == 1 \
            and spec.columns[3] == list(SEGMENT_IDS.values())[0]:
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist", ),
        )
    else:
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col=label_column_name,
            p_col=SEGMENT_IDS,
            ignore_x=("local_dist", ),
            segment_id=CHOOSER_SEGMENT_COLUMN_NAME,
        )

    # Size (quantity) terms, one parameter per (segment, land-use field)
    # with a nonzero size coefficient; interacted with the segment
    # indicator when choosers are segmented.
    if CHOOSER_SEGMENT_COLUMN_NAME is None:
        assert len(size_spec) == 1
        m.quantity_ca = sum(
            P(f"{i}_{q}") * X(q) for i in size_spec.index
            for q in size_spec.columns if size_spec.loc[i, q] != 0)
    else:
        m.quantity_ca = sum(
            P(f"{i}_{q}") * X(q) *
            X(f"{CHOOSER_SEGMENT_COLUMN_NAME}=={str_repr(SEGMENT_IDS[i])}")
            for i in size_spec.index for q in size_spec.columns
            if size_spec.loc[i, q] != 0)

    apply_coefficients(coefficients, m)
    # size coefficients are constrained to the range [-6, 6]
    apply_coefficients(size_coef, m, minimum=-6, maximum=6)

    m.choice_co_code = "override_choice"

    if return_data:
        return (
            m,
            Dict(
                edb_directory=Path(edb_directory),
                alt_values=alt_values,
                chooser_data=chooser_data,
                coefficients=coefficients,
                landuse=landuse,
                spec=spec,
                size_spec=size_spec,
                master_size_spec=master_size_spec,
                model_selector=model_selector,
                settings=settings,
            ),
        )

    return m
Exemple #12
0
def schedule_choice_model(
    name,
    edb_directory="output/estimation_data_bundle/{name}/",
    coefficients_file="{name}_coefficients.csv",
    spec_file="{name}_SPEC.csv",
    alt_values_file="{name}_alternatives_combined.csv",
    chooser_file="{name}_choosers_combined.csv",
    settings_file="{name}_model_settings.yaml",
    return_data=False,
):
    """
    Construct a larch schedule choice Model from an estimation data
    bundle (EDB) written by ActivitySim.

    Parameters
    ----------
    name : str
        Model name; substituted for ``{name}`` in `edb_directory` and in
        the file-name templates below.
    edb_directory : str
        Template path of the estimation data bundle directory.
    coefficients_file, spec_file, alt_values_file, chooser_file,
    settings_file : str
        File-name templates for the EDB inputs.
    return_data : bool
        If True, also return a Dict of the loaded and derived data.

    Returns
    -------
    Model or (Model, Dict)
    """
    # model_selector is computed here for reporting in the returned data
    model_selector = name.replace("_location", "")
    model_selector = model_selector.replace("_destination", "")
    model_selector = model_selector.replace("_subtour", "")
    model_selector = model_selector.replace("_tour", "")
    edb_directory = edb_directory.format(name=name)

    def _read_csv(filename, optional=False, **kwargs):
        # Read a CSV from the EDB; if `optional`, a missing file yields None.
        filename = filename.format(name=name)
        try:
            return pd.read_csv(os.path.join(edb_directory, filename), **kwargs)
        except FileNotFoundError:
            if optional:
                return None
            else:
                raise

    settings_file = settings_file.format(name=name)
    with open(os.path.join(edb_directory, settings_file), "r") as yf:
        settings = yaml.load(
            yf,
            Loader=yaml.SafeLoader,
        )

    try:
        coefficients = _read_csv(
            coefficients_file,
            index_col="coefficient_name",
        )
    except FileNotFoundError:
        # possibly mis-named file is shown in settings
        coefficients_file = settings.get('COEFFICIENTS', coefficients_file)
        coefficients = _read_csv(
            coefficients_file,
            index_col="coefficient_name",
        )

    spec = _read_csv(spec_file, comment='#')
    alt_values = _read_csv(alt_values_file)
    chooser_data = _read_csv(chooser_file)

    # remove temp rows from spec, ASim uses them to calculate the other values written
    # to the EDB, but they are not actually part of the utility function themselves.
    spec = spec.loc[~spec.Expression.str.startswith("_")].copy()

    # Merge in any secondary settings file referenced by the main one.
    include_settings = settings.get("include_settings")
    if include_settings:
        with open(os.path.join(edb_directory, include_settings), "r") as yf:
            more_settings = yaml.load(
                yf,
                Loader=yaml.SafeLoader,
            )
        settings.update(more_settings)

    CHOOSER_SEGMENT_COLUMN_NAME = settings.get("CHOOSER_SEGMENT_COLUMN_NAME")
    SEGMENT_IDS = settings.get("SEGMENT_IDS")
    if SEGMENT_IDS is None:
        SEGMENTS = settings.get("SEGMENTS")
        if SEGMENTS is not None:
            # fall back to using each segment name as its own id
            SEGMENT_IDS = {i: i for i in SEGMENTS}

    if 'Label' in spec.columns:
        label_column_name = 'Label'
    elif 'Expression' in spec.columns:
        label_column_name = 'Expression'
    else:
        raise ValueError("cannot find Label or Expression in spec file")

    m = Model()
    # Pick the utility specification form based on the spec columns:
    # a single 'coefficient' column, a single-segment column, or one
    # column per segment.
    if len(spec.columns) == 4 and ([c.lower() for c in spec.columns] == [
            'label', 'description', 'expression', 'coefficient'
    ]):
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist", ),
        )
    elif len(spec.columns) == 4 \
            and all(spec.columns[:3] == ['Label', 'Description', 'Expression']) \
            and len(SEGMENT_IDS) == 1 \
            and spec.columns[3] == list(SEGMENT_IDS.values())[0]:
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=spec.columns[-1],
            ignore_x=("local_dist", ),
        )
    else:
        m.utility_ca = linear_utility_from_spec(
            spec,
            x_col=label_column_name,
            p_col=SEGMENT_IDS,
            ignore_x=("local_dist", ),
            segment_id=CHOOSER_SEGMENT_COLUMN_NAME,
        )

    apply_coefficients(coefficients, m, minimum=-25, maximum=25)

    # Chooser (co) data indexed by the first column; alternative (ca) data
    # pivoted from long format, requiring every labeled spec variable.
    chooser_index_name = chooser_data.columns[0]
    x_co = chooser_data.set_index(chooser_index_name)
    alt_values.fillna(0, inplace=True)
    x_ca = cv_to_ca(
        alt_values.set_index([chooser_index_name, alt_values.columns[1]]),
        required_labels=spec[label_column_name],
    )

    # if CHOOSER_SEGMENT_COLUMN_NAME is not None:
    #     # label segments with names
    #     SEGMENT_IDS_REVERSE = {v: k for k, v in SEGMENT_IDS.items()}
    #     x_co["_segment_label"] = x_co[CHOOSER_SEGMENT_COLUMN_NAME].apply(
    #         lambda x: SEGMENT_IDS_REVERSE[x]
    #     )
    # else:
    #     x_co["_segment_label"] = size_spec.index[0]

    # Re-code alternatives as 1..n and shift the zero-based choices to match.
    # NOTE(review): the positional `level` argument to set_levels is
    # deprecated in newer pandas; prefer set_levels(alt_codes, level=1).
    alt_codes = np.arange(len(x_ca.index.levels[1])) + 1
    x_ca.index = x_ca.index.set_levels(alt_codes, 1)
    x_co["override_choice_plus1"] = x_co["override_choice"] + 1
    x_co["model_choice_plus1"] = x_co["model_choice"] + 1

    # Alternatives whose utility carries a constrained, strongly negative
    # coefficient (value < -900) are treated as unavailable whenever the
    # corresponding data value is positive.
    unavail_coefs = coefficients.query(
        "(constrain == 'T') & (value < -900)").index
    unavail_data = [i.data for i in m.utility_ca if i.param in unavail_coefs]
    if len(unavail_data):
        joint_unavail = "|".join(f"({i}>0)" for i in unavail_data)
        joint_avail = f"~({joint_unavail})"
    else:
        joint_avail = 1

    d = DataFrames(co=x_co, ca=x_ca, av=joint_avail)
    m.dataservice = d
    m.choice_co_code = "override_choice_plus1"
    # m.choice_co_code = "model_choice_plus1"

    if return_data:
        return (
            m,
            Dict(
                edb_directory=Path(edb_directory),
                alt_values=alt_values,
                chooser_data=chooser_data,
                coefficients=coefficients,
                spec=spec,
                model_selector=model_selector,
                joint_avail=joint_avail,
            ),
        )

    return m
Exemple #13
0
def mode_choice_model(
    name,
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
    override_filenames=None,
):
    """
    Build purpose-specific mode choice models from an estimation data
    bundle, returned together as a ModelGroup.

    Parameters
    ----------
    name : str
        Model name; substituted into `edb_directory` by the data loader.
    edb_directory : str
        Template path of the estimation data bundle directory.
    return_data : bool
        If True, also return a Dict of the loaded data.
    override_filenames : dict, optional
        Extra keyword arguments forwarded to `simple_simulate_data` to
        override the default EDB file names.

    Returns
    -------
    ModelGroup or (ModelGroup, Dict)
    """
    # avoid a mutable default argument
    if override_filenames is None:
        override_filenames = {}
    data = simple_simulate_data(
        name=name,
        edb_directory=edb_directory,
        **override_filenames,
    )
    coefficients = data.coefficients
    coef_template = data.coef_template
    spec = data.spec
    chooser_data = data.chooser_data
    settings = data.settings

    # map observed choice names to codes
    chooser_data = clean_values(
        chooser_data,
        alt_names_to_codes=data.alt_names_to_codes,
        choice_code="override_choice_code",
    )

    # nesting structure comes from the settings file
    tree = construct_nesting_tree(data.alt_names, settings["NESTS"])

    # When the model name contains 'atwork', only that purpose is modeled;
    # otherwise 'atwork' is excluded from the purpose list.
    purposes = list(coef_template.columns)
    if "atwork" in name:
        purposes = ['atwork']
    elif 'atwork' in purposes:
        purposes.remove('atwork')

    # Setup purpose specific models
    m = {purpose: Model(graph=tree, title=purpose) for purpose in purposes}
    for alt_code, alt_name in tree.elemental_names().items():
        # Read in base utility function for this alt_name
        u = linear_utility_from_spec(
            spec,
            x_col="Label",
            p_col=alt_name,
            ignore_x=("#", ),
        )
        for purpose in purposes:
            # Modify utility function based on template for purpose:
            # swap each parameter for its purpose-specific counterpart,
            # keeping the original name when no template entry exists.
            u_purp = sum((P(coef_template[purpose].get(i.param, i.param)) *
                          i.data * i.scale) for i in u)
            m[purpose].utility_co[alt_code] = u_purp

    for model in m.values():
        explicit_value_parameters(model)
    apply_coefficients(coefficients, m)

    # availability is identical across purposes, so compute it once
    avail = construct_availability(m[purposes[0]], chooser_data,
                                   data.alt_codes_to_names)

    d = DataFrames(
        co=chooser_data,
        av=avail,
        alt_codes=data.alt_codes,
        alt_names=data.alt_names,
    )

    # Non-atwork purposes each see only their own subset of tours;
    # the atwork model uses the whole dataset.
    if 'atwork' not in name:
        for purpose, model in m.items():
            model.dataservice = d.selector_co(f"tour_type=='{purpose}'")
            model.choice_co_code = "override_choice_code"
    else:
        for purpose, model in m.items():
            model.dataservice = d
            model.choice_co_code = "override_choice_code"

    from larch.model.model_group import ModelGroup

    mg = ModelGroup(m.values())

    if return_data:
        return (
            mg,
            Dict(
                edb_directory=Path(edb_directory),
                chooser_data=chooser_data,
                avail=avail,
                coefficients=coefficients,
                coef_template=coef_template,
                spec=spec,
                settings=settings,
            ),
        )

    return mg
Exemple #14
0
def simple_simulate_model(
        name,
        edb_directory="output/estimation_data_bundle/{name}/",
        return_data=False,
        choices=None,
        construct_avail=False,
        values_index_col="household_id",
):
    """Build a single larch Model from a simple-simulate estimation data
    bundle, optionally constructing availability from the chooser data.

    Returns the Model, or ``(Model, Dict)`` when `return_data` is True.
    """
    data = simple_simulate_data(
        name=name,
        edb_directory=edb_directory,
        values_index_col=values_index_col,
    )
    coefficients = data.coefficients
    spec = data.spec
    settings = data.settings
    alt_names = data.alt_names
    alt_codes = data.alt_codes

    # map observed choice names to codes (caller may override the mapping)
    from .general import clean_values
    chooser_data = clean_values(
        data.chooser_data,
        alt_names_to_codes=choices or data.alt_names_to_codes,
        choice_code="override_choice_code",
    )

    # nested logit when requested by the settings, plain MNL otherwise
    if settings.get('LOGIT_TYPE') == 'NL':
        tree = construct_nesting_tree(data.alt_names, settings["NESTS"])
        model = Model(graph=tree)
    else:
        model = Model(alts=data.alt_codes_to_names)

    model.utility_co = dict_of_linear_utility_from_spec(
        spec, "Label", dict(zip(alt_names, alt_codes)),
    )
    apply_coefficients(coefficients, model)

    if construct_avail:
        avail = construct_availability(model, chooser_data,
                                       data.alt_codes_to_names)
    else:
        avail = True

    model.dataservice = DataFrames(
        co=chooser_data,
        av=avail,
        alt_codes=alt_codes,
        alt_names=alt_names,
    )
    model.choice_co_code = "override_choice_code"

    if return_data:
        return (
            model,
            Dict(
                edb_directory=data.edb_directory,
                chooser_data=chooser_data,
                coefficients=coefficients,
                spec=spec,
                alt_names=alt_names,
                alt_codes=alt_codes,
                settings=settings,
            ),
        )

    return model
Exemple #15
0
def test_dfs_feathers():
    """Round-trip larch DataFrames through feather files.

    Verifies that `inject_feathers` restores data that was zeroed out after
    `to_feathers`, and that `from_feathers` reconstructs an equivalent
    DataFrames object, for both an idco-style model bundle and an idce
    (sparse) dataset.
    """
    import tempfile
    m = example(1, legacy=True)
    with tempfile.TemporaryDirectory() as workdir:
        m.load_data()
        bundle = os.path.join(workdir, 'dfs')
        m.dataframes.to_feathers(bundle)

        # Snapshot all four frames, wipe them in place, confirm the wipe,
        # then restore from the feather bundle and compare to the snapshots.
        kinds = ('co', 'ca', 'ch', 'av')
        snapshots = {k: getattr(m.dataframes, 'data_' + k).copy() for k in kinds}
        for k in kinds:
            getattr(m.dataframes, 'data_' + k).iloc[:] = 0.0
        for k in kinds:
            assert all(getattr(m.dataframes, 'array_' + k)().reshape(-1) == 0)
        m.dataframes.inject_feathers(bundle)
        for k in kinds:
            pandas.testing.assert_frame_equal(
                getattr(m.dataframes, 'data_' + k), snapshots[k],
            )

        # Same round-trip for an idce-format dataset.
        raw = pandas.read_csv(example_file("MTCwork.csv.gz"))
        raw.set_index(['casenum', 'altnum'], inplace=True)
        ds = DataFrames(raw)
        bundle_ce = os.path.join(workdir, 'dfs1')
        ds.to_feathers(bundle_ce)
        ce_snapshot = ds.data_ce.copy()
        ds.data_ce.iloc[:] = 0.0
        assert all(ds.array_ce().reshape(-1) == 0)
        ds.inject_feathers(bundle_ce)
        pandas.testing.assert_frame_equal(ds.data_ce, ce_snapshot)

        # Reconstruct a fresh DataFrames from feathers and compare contents.
        bundle_ce2 = os.path.join(workdir, 'dfs2')
        ds.to_feathers(bundle_ce2)
        ds2 = DataFrames.from_feathers(bundle_ce2)
        pandas.testing.assert_index_equal(
            ds.alternative_codes(),
            ds2.alternative_codes(),
            check_names=False,
        )
        pandas.testing.assert_frame_equal(ds.data_ce, ds2.data_ce)
        pandas.testing.assert_frame_equal(ds.data_av, ds2.data_av)
        pandas.testing.assert_index_equal(ds.caseindex, ds2.caseindex)

        # And likewise reconstruct the original model bundle.
        dfs2 = DataFrames.from_feathers(bundle)
        pandas.testing.assert_index_equal(
            m.dataframes.alternative_codes(),
            dfs2.alternative_codes(),
            check_names=False,
        )
        for k in ('co', 'ca', 'av'):
            pandas.testing.assert_frame_equal(
                getattr(m.dataframes, 'data_' + k),
                getattr(dfs2, 'data_' + k),
            )
        pandas.testing.assert_index_equal(m.dataframes.caseindex, dfs2.caseindex)
Exemple #16
0
def test_dfs_info():
    """Check the text emitted by DataFrames.info in brief and verbose modes,
    before and after enabling computational readiness."""

    from larch.data_warehouse import example_file
    df = pandas.read_csv(example_file("MTCwork.csv.gz"))
    df.set_index(['casenum', 'altnum'], inplace=True)

    # idce-format construction: not computation-ready until made computational.
    ds = DataFrames(df)

    s = io.StringIO()

    ds.info(out=s)

    # Brief report: summary counts only, with the "(not computation-ready)" tag.
    assert s.getvalue() == ('larch.DataFrames:  (not computation-ready)\n'
                            '  n_cases: 5029\n'
                            '  n_alts: 6\n'
                            '  data_ce: 36 variables, 22033 rows\n'
                            '  data_co: <not populated>\n'
                            '  data_av: <populated>\n')

    s = io.StringIO()
    ds.info(out=s, verbose=True)

    # Verbose report additionally lists every data_ce column with its
    # non-null count and dtype.
    assert s.getvalue() == (
        'larch.DataFrames:  (not computation-ready)\n  n_cases: 5029\n  n_alts: 6\n  data_ce: 22033 rows\n'
        '    - chose    (22033 non-null int64)\n    - ivtt     (22033 non-null float64)\n'
        '    - ovtt     (22033 non-null float64)\n    - tottime  (22033 non-null float64)\n'
        '    - totcost  (22033 non-null float64)\n    - hhid     (22033 non-null int64)\n'
        '    - perid    (22033 non-null int64)\n    - numalts  (22033 non-null int64)\n'
        '    - dist     (22033 non-null float64)\n    - wkzone   (22033 non-null int64)\n'
        '    - hmzone   (22033 non-null int64)\n    - rspopden (22033 non-null float64)\n'
        '    - rsempden (22033 non-null float64)\n    - wkpopden (22033 non-null float64)\n'
        '    - wkempden (22033 non-null float64)\n    - vehavdum (22033 non-null int64)\n'
        '    - femdum   (22033 non-null int64)\n    - age      (22033 non-null int64)\n'
        '    - drlicdum (22033 non-null int64)\n    - noncadum (22033 non-null int64)\n'
        '    - numveh   (22033 non-null int64)\n    - hhsize   (22033 non-null int64)\n'
        '    - hhinc    (22033 non-null float64)\n    - famtype  (22033 non-null int64)\n'
        '    - hhowndum (22033 non-null int64)\n    - numemphh (22033 non-null int64)\n'
        '    - numadlt  (22033 non-null int64)\n    - nmlt5    (22033 non-null int64)\n'
        '    - nm5to11  (22033 non-null int64)\n    - nm12to16 (22033 non-null int64)\n'
        '    - wkccbd   (22033 non-null int64)\n    - wknccbd  (22033 non-null int64)\n'
        '    - corredis (22033 non-null int64)\n    - vehbywrk (22033 non-null float64)\n'
        '    - vocc     (22033 non-null int64)\n    - wgt      (22033 non-null int64)\n'
        '  data_co: <not populated>\n  data_av: <populated>\n')

    # Flipping `computational` on should drop the not-ready tag from the header.
    assert not ds.computational
    assert not ds.is_computational_ready()
    ds.computational = True
    assert ds.is_computational_ready()
    assert ds.computational
    s = io.StringIO()

    ds.info(out=s)

    assert s.getvalue() == ('larch.DataFrames:\n'
                            '  n_cases: 5029\n'
                            '  n_alts: 6\n'
                            '  data_ce: 36 variables, 22033 rows\n'
                            '  data_co: <not populated>\n'
                            '  data_av: <populated>\n')
Exemple #17
0
def test_repeated_splitting():
    """Splitting 80/20 then re-splitting the larger part 50/50 yields the
    same case counts for every combination of `crack` and split method."""
    df = pandas.read_csv(example_file("MTCwork.csv.gz"))
    df.set_index(['casenum', 'altnum'], inplace=True)

    for crack in (False, True):
        for method in (None, 'shuffle'):
            dfs = DataFrames(df, crack=crack)
            if method is None:
                big, small = dfs.split([80, 20])
            else:
                big, small = dfs.split([80, 20], method=method)
            assert big.n_cases == 4024
            assert small.n_cases == 1005
            half_a, half_b = big.split([50, 50])
            assert half_a.n_cases == 2012
            assert half_b.n_cases == 2012
Exemple #18
0
def stop_frequency_model(
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
):
    """Build a ModelGroup of stop-frequency models, one per spec segment.

    Parameters
    ----------
    edb_directory : str
        Template path of the estimation data bundle.
    return_data : bool
        If True, also return the loaded data bundle.

    Returns
    -------
    ModelGroup, or (ModelGroup, data) when `return_data` is True.
    """
    data = stop_frequency_data(
        edb_directory=edb_directory,
        values_index_col="tour_id",
    )

    from .general import clean_values
    from larch.model.model_group import ModelGroup

    submodels = []
    # One model per segment; each segment carries its own spec, chooser
    # data, and alternative definitions.
    for n in range(len(data.spec)):
        spec = data.spec[n]
        settings = data.settings
        alt_names = data.alt_names[n]
        alt_codes = data.alt_codes[n]

        chooser_data = clean_values(
            data.chooser_data[n],
            alt_names_to_codes=data.alt_names_to_codes[n],
            choice_code="override_choice_code",
        )

        # Nested-logit settings require an explicit nesting tree.
        if settings.get('LOGIT_TYPE') == 'NL':
            m = Model(
                graph=construct_nesting_tree(data.alt_names[n], settings["NESTS"])
            )
        else:
            m = Model()

        m.utility_co = dict_of_linear_utility_from_spec(
            spec,
            "Label",
            dict(zip(alt_names, alt_codes)),
        )
        apply_coefficients(data.coefficients, m)

        # All alternatives are available for stop-frequency choices.
        m.dataservice = DataFrames(
            co=chooser_data,
            av=True,
            alt_codes=alt_codes,
            alt_names=alt_names,
        )
        m.choice_co_code = "override_choice_code"
        submodels.append(m)

    group = ModelGroup(submodels)

    if return_data:
        return group, data

    return group
Exemple #19
0
def auto_ownership_model(
    name="auto_ownership",
    edb_directory="output/estimation_data_bundle/{name}/",
    return_data=False,
):
    """Build a larch Model for the auto-ownership component.

    Parameters
    ----------
    name : str
        Component name; substituted into `edb_directory`.
    edb_directory : str
        Template path of the estimation data bundle.
    return_data : bool
        If True, also return a Dict of the loaded inputs.

    Returns
    -------
    Model, or (Model, Dict) when `return_data` is True.
    """
    data = simple_simulate_data(
        name=name,
        edb_directory=edb_directory,
        values_index_col="household_id",
    )
    coefficients = data.coefficients
    spec = data.spec
    settings = data.settings  # loaded but not consumed by this model

    # Alternatives come from the spec's utility columns; codes are simply
    # their positions, starting at 0.
    altnames = list(spec.columns[3:])
    altcodes = range(len(altnames))

    chooser_data = remove_apostrophes(data.chooser_data)
    chooser_data.fillna(0, inplace=True)

    # Drop choosers whose observed choice is invalid (negative).
    valid = chooser_data["override_choice"] >= 0
    chooser_data = chooser_data[valid]

    m = Model()
    # One of the alternatives is coded as 0, so the MNL nesting graph must
    # be initialized explicitly with a nonzero root_id.
    m.initialize_graph(alternative_codes=altcodes, root_id=99)

    m.utility_co = dict_of_linear_utility_from_spec(
        spec,
        "Label",
        dict(zip(altnames, altcodes)),
    )
    apply_coefficients(coefficients, m)

    m.dataservice = DataFrames(
        co=chooser_data,
        av=True,
        alt_codes=altcodes,
        alt_names=altnames,
    )
    m.choice_co_code = "override_choice"

    if return_data:
        return m, Dict(
            edb_directory=data.edb_directory,
            chooser_data=chooser_data,
            coefficients=coefficients,
            spec=spec,
            altnames=altnames,
            altcodes=altcodes,
        )

    return m