Beispiel #1
0
 def test_user_agent(self):
     """Check the User-Agent header attached to a request by scrunch.connect.

     ``Session.send`` is patched out, so the prepared request is captured
     from the mock's recorded call instead of being sent.
     """
     from pycrunch.version import __version__ as pycrunch_version
     from requests.sessions import Session

     credentials = ('******', 'supersecret')
     api_url = 'https://test.crunch.io/api/'

     with mock.patch.object(Session, 'send') as fake_send:
         scrunch.connect(credentials[0], credentials[1], api_url)

     # call_args is an (args, kwargs) pair; the prepared request is the
     # first positional argument handed to Session.send.
     positional, _keywords = fake_send.call_args
     request = positional[0]
     expected_agent = 'scrunch/%s (pycrunch/%s)' % (
         scrunch.__version__, pycrunch_version)
     assert request.headers['user-agent'] == expected_agent
Beispiel #2
0
    def test_move_dataset(self):
        """Move datasets into a nested project as two different users.

        Three datasets are created (two through the module-level ``site``
        connection, one through a second user's connection), a project
        hierarchy A -> B is built, and the datasets are placed into B one
        at a time. After each placement B's index is refreshed and compared
        against the expected dataset URLs.

        NOTE(review): nothing created here is deleted inside this method --
        confirm that cleanup happens elsewhere.
        """
        # Second user and a separate connection authenticated as that user.
        fo = get_user(username2)
        fo_site = connect(fo.email, password2, HOST)

        # These two datasets are created by the default logged user
        _ds1 = site.datasets.create(
            shoji_entity_wrapper({'name': 'test_move_dataset1'})).refresh()
        _ds2 = site.datasets.create(
            shoji_entity_wrapper({'name': 'test_move_dataset2'})).refresh()

        # This dataset is created and owned by the other user
        _ds4 = fo_site.datasets.create(
            shoji_entity_wrapper({
                'name': 'test_move_dataset4',
                'owner': fo.url
            })).refresh()

        # Load the datasets by id; ds4 is fetched through the second
        # user's connection.
        ds1 = get_dataset(_ds1.body.id)
        ds2 = get_dataset(_ds2.body.id)
        ds4 = get_dataset(_ds4.body.id, connection=fo_site)

        # Give the second user edit access to ds2.
        ds2.add_user(fo, edit=True)

        # Create a hierarchy A -> B
        pa = new_project("test_move_dataset_A")
        pa.move_here([ds1])  # Put ds1 in A

        pb = pa.create_project("test_move_dataset_B")
        pa.add_user(fo, edit=True)

        # Move ds4 to B as the other user
        fo_pa = get_project(pa.name, fo_site)
        fo_pa.place(ds4, path="| %s" % pb.name)
        pb.resource.refresh()
        self.assertItemsEqual(
            pb.resource.index.keys(),
            # Only ds4 here
            [_ds4.self])

        # Move ds1 (created by the default user) into B, again acting as
        # the other user.
        fo_ds1 = get_dataset(_ds1.body.id, connection=fo_site)
        fo_pa.place(fo_ds1, path="| %s" % pb.name)

        pb.resource.refresh()
        self.assertItemsEqual(pb.resource.index.keys(), [_ds1.self, _ds4.self])
        # Finally move ds2 into B through the default user's project handle.
        pa.place(ds2, path="| %s" % pb.name)

        pb.resource.refresh()
        self.assertItemsEqual(pb.resource.index.keys(),
                              [_ds1.self, _ds2.self, _ds4.self])
        # ds2's own project link must now point at B.
        self.assertEqual(ds2.resource.project.self, pb.url)
Beispiel #3
0
from pycrunch.shoji import Catalog

from fixtures import NEWS_DATASET
from scrunch import connect
from scrunch import get_dataset
from scrunch.datasets import Variable
from scrunch.exceptions import InvalidPathError
from scrunch.folders import Folder


# Connection settings are read from the environment; a missing variable
# raises KeyError as soon as this module is imported.
HOST = os.environ['SCRUNCH_HOST']
username, password = os.environ['SCRUNCH_USER'], os.environ['SCRUNCH_PASS']

# Module-level connection, opened at import time.
site = connect(username, password, site_url=HOST)


def setup_folders(ds):
    sess = ds.session
    ds.settings.edit(variable_folders=True)
    ds.variables.create({
        'element': 'shoji:entity',
        'body': {'name': 'testvar1', 'type': 'numeric'}
    })
    ds.variables.create({
        'element': 'shoji:entity',
        'body': {'name': 'testvar2', 'type': 'numeric'}
    })
    ds.variables.create({
        'element': 'shoji:entity',
Beispiel #4
0
# coding: utf-8

from getpass import getpass
from scrunch import connect
from scrunch.datasets import get_dataset, create_dataset

# `raw_input` only exists on Python 2; fall back to `input` on Python 3 so
# this example can be run under either interpreter.
try:
    _prompt = raw_input
except NameError:
    _prompt = input

# Ask for the credentials interactively; getpass keeps the password off
# the terminal echo.
username = _prompt("Enter email: ")
password = getpass("Enter password for %s: " % username)

site = connect(username, password, site_url='https://alpha.crunch.io/api/')

new_ds = create_dataset(
    'Test dataset', {
        "catvar": {
            'name':
            'categorical variable',
            'type':
            'categorical',
            'categories': [{
                'id': 1,
                'name': 'yes',
                'numeric_value': 1,
                'missing': False
            }, {
                'id': 2,
                'name': 'no',
                'numeric_value': 2,
                'missing': False
            }, {
                'id': 3,
                'name': 'Not asked',
def main():
    """Run the integration checks end-to-end against a Crunch server.

    Connects with CRUNCH_USER / CRUNCH_PASSWORD / CRUNCH_URL, creates a
    dataset from DATASET_DOC, loads ROWS into it and then exercises, in
    order:

    0. metadata manipulation (missing rules, geodata view),
    1. exclusion filters,
    2. derived single-response variables ("transformations"),
    3. recodes (combine_categorical / combine_multiple_response).

    The steps share one dataset and each exclusion filter replaces the
    previous one, so the statement order matters. The dataset is deleted in
    the ``finally`` clause whether or not an assertion fails.

    Raises:
        AssertionError: when any of the checks fails.
    """
    assert not invalid_credentials()
    assert pandaslib, 'Pandas library not installed'

    # Login.
    site = connect(CRUNCH_USER, CRUNCH_PASSWORD, CRUNCH_URL)
    assert isinstance(site, pycrunch.shoji.Catalog)

    # Create the test dataset.
    dataset_resource = site.datasets.create(DATASET_DOC).refresh()
    assert isinstance(dataset_resource, pycrunch.shoji.Entity)
    dataset = StreamingDataset(dataset_resource)

    try:
        # Load initial data.
        pycrunch.importing.importer.append_rows(dataset.resource, ROWS)

        # refresh dataset instance, so size is updated
        dataset.resource.refresh()

        # Check the initial number of rows.
        df = pandaslib.dataframe(dataset.resource)
        assert len(df) == len(ROWS) - 1  # excluding the header
        assert dataset.size.rows == len(df)
        assert dataset.size.unfiltered_rows == len(df)

        # Also check number of columns
        columns = len(DATASET_DOC['body']['table']['metadata'])
        assert dataset.size.columns == columns

        # 0. Manipulate metadata

        # 0.1 Start by updating the missing rules for the `identity` variable
        identity_missing_rules = {"not asked": 9999, "skipped": 9998}

        assert dataset['identity'].missing_rules == {}
        dataset['identity'].set_missing_rules(identity_missing_rules)
        assert dataset['identity'].missing_rules == identity_missing_rules

        # 0.2 Try setting and unsetting the geodata view
        location = dataset['location']
        geodata = get_geodata('UK Regions')
        assert 'geodata' not in location.view

        # Set geodata using Entity object
        location.set_geodata_view(geodata, feature_key='EER13NM')
        assert 'geodata' in location.view

        location.unset_geodata_view()
        assert 'geodata' not in location.view

        # Set geodata using url
        location.set_geodata_view(geodata.self, feature_key='EER13NM')
        assert 'geodata' in location.view

        location.unset_geodata_view()
        assert 'geodata' not in location.view

        # Set geodata using name
        location.set_geodata_view('UK Regions', feature_key='EER13NM')
        assert 'geodata' in location.view

        location.unset_geodata_view()
        assert 'geodata' not in location.view

        # 1. Exclusion Filter Integration Tests

        # 1.1 Set a simple exclusion filter.

        dataset.exclude('identity > 5')
        df = pandaslib.dataframe(dataset.resource)
        assert len(df) == 5
        assert not any(r['identity'] > 5 for _, r in df.iterrows())

        # 1.2 More complex exclusion filters involving a categorical variable.

        expr = 'speak_spanish in [32766]'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and row[4] != 32766
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        expr = 'speak_spanish in (32766, 32767)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] not in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert not isnan(row['speak_spanish'])

        expr = 'not (speak_spanish in (1, 2) and operating_system == "Linux")'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (1, 2) and row[2] == 'Linux'
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['speak_spanish'] in \
                ('I speak Spanish primarily',
                    'I speak both Spanish and English equally')
            assert row['operating_system'] == 'Linux'

        # 1.3 Exclusion filters with `any`.

        expr = 'hobbies.any([32766])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 32766 not in row[5:9]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['hobbies']

        expr = 'not hobbies.any([32766])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 32766 in row[5:9]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} in row['hobbies']

        expr = 'hobbies.any([32766, 32767])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity'
            and 32766 not in row[5:9] and 32767 not in row[5:9]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['hobbies'] and \
                   {'?': 32767} not in row['hobbies']

        expr = 'music.any([32766])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 32766 not in row[9:14]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['music']

        expr = 'music.any([1])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 1 not in row[9:14]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert 1 not in row['music']

        expr = 'music.any([1, 2])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity'
            and 1 not in row[9:14] and 2 not in row[9:14]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert 1 not in row['music'] and 2 not in row['music']

        # 1.4 Exclusion filters with `all`.

        expr = 'hobbies.all([32767])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity'
            and row[5:9] != [32767, 32767, 32767, 32767]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] != [{
                '?': 32767
            }, {
                '?': 32767
            }, {
                '?': 32767
            }, {
                '?': 32767
            }]

        expr = 'not hobbies.all([32767])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity'
            and row[5:9] == [32767, 32767, 32767, 32767]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] == [{
                '?': 32767
            }, {
                '?': 32767
            }, {
                '?': 32767
            }, {
                '?': 32767
            }]

        expr = 'music.all([1])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[9:14] != [1, 1, 1, 1, 1]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['music'] != [1, 1, 1, 1, 1]

        expr = 'music.all([1]) or music.all([2])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and (
                row[9:14] != [1, 1, 1, 1, 1] and row[9:14] != [2, 2, 2, 2, 2])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['music'] != [1, 1, 1, 1, 1] and \
                row['music'] != [2, 2, 2, 2, 2]

        expr = 'not ( music.all([1]) or music.all([2]) )'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and (
                row[9:14] == [1, 1, 1, 1, 1] or row[9:14] == [2, 2, 2, 2, 2])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['music'] == [1, 1, 1, 1, 1] or \
                row['music'] == [2, 2, 2, 2, 2]

        # 1.5 Exclusion filters with `duplicates`.

        expr = 'ip_address.duplicates()'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        seen_ip_addresses = []
        for _, row in df.iterrows():
            assert row['ip_address'] not in seen_ip_addresses
            seen_ip_addresses.append(row['ip_address'])

        # 1.6 Exclusion filters with `valid` and `missing`.

        expr = 'valid(speak_spanish)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert isnan(row['speak_spanish'])

        expr = 'not valid(speak_spanish)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] not in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert not isnan(row['speak_spanish'])

        expr = 'missing(speak_spanish)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] not in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert not isnan(row['speak_spanish'])

        expr = 'missing(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and (
                row[5:9] != [32766, 32766, 32766, 32766]
                and row[5:9] != [32767, 32767, 32767, 32767])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] != [{'?': 32766}, {'?': 32766},
                                      {'?': 32766}, {'?': 32766}] \
                and row['hobbies'] != [{'?': 32767}, {'?': 32767},
                                       {'?': 32767}, {'?': 32767}]

        expr = 'not missing(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and (
                row[5:9] == [32766, 32766, 32766, 32766]
                or row[5:9] == [32767, 32767, 32767, 32767])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] == [{'?': 32766}, {'?': 32766},
                                      {'?': 32766}, {'?': 32766}] \
                or row['hobbies'] == [{'?': 32767}, {'?': 32767},
                                      {'?': 32767}, {'?': 32767}]

        expr = 'valid(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and (
                32766 in row[5:9] or 32767 in row[5:9])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} in row['hobbies'] or \
                   {'?': 32767} in row['hobbies']

        expr = 'not valid(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and (
                32766 not in row[5:9] and 32767 not in row[5:9])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['hobbies'] and \
                   {'?': 32767} not in row['hobbies']

        # 1.7 Exclusion filter that refers to a subvariable by alias.
        expr = 'hobbies_1 == 4'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS if row[0] != 'identity' and row[5] != 4
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'][0] != 4

        # 1.8 Complex exclusion filters (multiple rules)
        # NOTE: this step only checks that the expression is accepted; the
        # resulting rows are not verified.
        expr = (
            '(religion != 1 and (not valid(speak_spanish) or speak_spanish >= 1)) '
            'or (religion == 1 and speak_spanish == 2) '
            'or (religion == 3 and speak_spanish == 4)')
        dataset.exclude(expr)

        # 1.9 Exclusion filters using date variables.
        dt_str = '2014-12-30T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'registration_time < "%s"' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and isodate.parse_datetime(row[3]) >= dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        dt_str = '2015-01-01T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'registration_time >= "%s"' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and isodate.parse_datetime(row[3]) < dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        dt_str = '2014-05-10T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'registration_time == "%s"' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and isodate.parse_datetime(row[3]) != dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        dt_str = '2014-05-10T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'not(registration_time == "%s")' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and isodate.parse_datetime(row[3]) == dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        # 1.10 Clear the exclusion filter.
        dataset.exclude()
        df = pandaslib.dataframe(dataset.resource)
        assert len(df) == len(ROWS) - 1  # excluding the header

        # 2. Integration Tests for "Transformations".

        categories = [{
            'id':
            1,
            'name':
            'Nerds',
            'numeric_value':
            1,
            'missing':
            False,
            'case':
            'operating_system in ("Linux", "Solaris", "Minix", "FreeBSD", "NetBSD")',
        }, {
            'id': 2,
            'name': 'Normal Users',
            'numeric_value': 2,
            'missing': False,
            'case': 'operating_system == "Windows"',
        }, {
            'id': 3,
            'name': 'Hipsters',
            'numeric_value': 3,
            'missing': False,
            'case': 'operating_system == "MacOS"',
        }, {
            'id': 32767,
            'name': 'Unknown',
            'numeric_value': None,
            'missing': True,
            'case': 'missing(operating_system)'
        }]

        new_var = dataset.create_single_response(
            categories=categories,
            name='Operating System Users',
            alias='operating_system_users',
            description='Type of Operating System Users')

        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical'

        # Check the data on the new variable.
        df = pandaslib.dataframe(dataset.resource)
        assert 'operating_system_users' in df

        # Check the nerds.
        assert set(r['operating_system'] for _, r in df[
            df['operating_system_users'] == 'Nerds'].iterrows()) == {
                'Linux', 'Solaris', 'Minix', 'FreeBSD', 'NetBSD'
            }

        # Check the hipsters.
        assert set(r['operating_system']
                   for _, r in df[df['operating_system_users'] ==
                                  'Hipsters'].iterrows()) == {'MacOS'}

        # Check normal users.
        assert set(r['operating_system']
                   for _, r in df[df['operating_system_users'] ==
                                  'Normal Users'].iterrows()) == {'Windows'}

        # 3. Integration Tests for "Recodes".

        # 3.1 combine_categories.

        # On a 'categorical' variable.
        cat_map = {1: [2, 3], 2: [1, 4], 99: [32766, 32767]}

        cat_names = {1: 'Bilingual', 2: 'Not Bilingual', 99: 'Unknown'}

        new_var = dataset.combine_categorical('speak_spanish',
                                              map=cat_map,
                                              categories=cat_names,
                                              name='Bilingual Person',
                                              alias='bilingual',
                                              missing=[99])

        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical'

        df = pandaslib.dataframe(dataset.resource)
        assert 'bilingual' in df

        # Check the data in the recoded variable.
        bilingual_ids = set(row[0] for row in ROWS
                            if row[0] != 'identity' and row[4] in (2, 3))
        assert set(
            int(r['identity']) for _, r in df[
                df['bilingual'] == 'Bilingual'].iterrows()) == bilingual_ids

        non_bilingual_ids = set(row[0] for row in ROWS
                                if row[0] != 'identity' and row[4] in (1, 4))
        assert set(
            int(r['identity'])
            for _, r in df[df['bilingual'] ==
                           'Not Bilingual'].iterrows()) == non_bilingual_ids

        bilingual_null_ids = set(
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (32766, 32767))
        assert set(
            int(r['identity']) for _, r in df[
                df['bilingual'].isnull()].iterrows()) == bilingual_null_ids

        # On a 'categorical_array' variable.

        cat_map = {1: [1, 2], 2: [3, 4], 99: [32766, 32767]}

        cat_names = {
            1: 'Interested',
            2: 'Not interested',
            99: 'Unknown',
        }

        new_var = dataset.combine_categorical('hobbies',
                                              map=cat_map,
                                              categories=cat_names,
                                              name='Hobbies (recoded)',
                                              alias='hobbies_recoded',
                                              missing=[99])
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical_array'

        df = pandaslib.dataframe(dataset.resource)
        assert 'hobbies_recoded' in df

        # Check the data in the recoded variable.
        for _, row in df.iterrows():
            hobbies = row['hobbies']
            hobbies_rec = row['hobbies_recoded']
            assert len(hobbies) == len(hobbies_rec)

            for i, value in enumerate(hobbies):
                if value in ({'?': 32766}, {'?': 32767}):
                    assert hobbies_rec[i] == {'?': 99}
                elif value in (1, 2):
                    assert hobbies_rec[i] == 1
                elif value in (3, 4):
                    assert hobbies_rec[i] == 2

        # 3.2 combine_responses.

        cat_map = {1: [1, 2], 2: [97], 3: [98, 99]}

        cat_names = {
            1: 'music_recoded_1',
            2: 'music_recoded_2',
            3: 'music_recoded_3'
        }
        new_var = dataset.combine_multiple_response('music',
                                                    map=cat_map,
                                                    categories=cat_names,
                                                    name='Music (alt)',
                                                    alias='music_recoded')
        assert isinstance(new_var, Variable)
        assert new_var.type == 'multiple_response'

        df = pandaslib.dataframe(dataset.resource)
        assert 'music_recoded' in df

        # TODO: Test the data in the recoded variable. Unsure of its meaning.

    finally:
        # Always remove the dataset created above, even if a check failed.
        dataset.resource.delete()
Beispiel #6
0
This example demonstrates a more complex workflow that uses different recodes
and combines to create new variables.
"""

from examples import NEWS_DATASET, NEWS_DATASET_ROWS, mr_in

from getpass import getpass
from scrunch import connect
from scrunch.datasets import create_dataset

HOST = 'https://alpha.crunch.io'

# `raw_input` only exists on Python 2; fall back to `input` on Python 3 so
# this example can be run under either interpreter.
try:
    _prompt = raw_input
except NameError:
    _prompt = input

# Ask for the credentials interactively; getpass keeps the password off
# the terminal echo.
username = _prompt("Enter email: ")
password = getpass("Enter password for %s: " % username)

site = connect(username, password, site_url='%s/api/' % HOST)

# Create a dataset for usage
dataset = create_dataset("Recodes example", NEWS_DATASET)
print("Dataset %s created" % dataset.id)

# Add data rows
total = dataset.stream_rows(NEWS_DATASET_ROWS)
dataset.push_rows(total)

# Recode a new single response variable
agerange = dataset.create_categorical([{
    'id': 1,
    'name': 'Underage',
    'case': 'age < 18'
}, {