コード例 #1
0
def main():
    """Run the end-to-end checks against a Crunch server, in order.

    Everything happens on one dataset that is created at the start and
    deleted on the way out, whether or not a check fails:

    1. Exclusion filters: apply each expression and compare the number of
       rows in the exported dataframe with the expected count.
    2. "Transformations": derive a categorical variable from rules with
       ``create_categorical`` and inspect its values.
    3. "Recodes": ``combine_categories`` on a categorical and on a
       categorical-array variable, then ``combine_responses``.

    Every check is a bare ``assert``, so the first failing one stops the run.
    """
    assert not invalid_credentials()

    # Log in; the connection root is expected to be a shoji Catalog.
    site = pycrunch.connect(CRUNCH_USER, CRUNCH_PASSWORD, CRUNCH_URL)
    assert isinstance(site, pycrunch.shoji.Catalog)

    # Create the scratch dataset used by every step below.
    dataset = site.datasets.create(DATASET_DOC).refresh()
    assert isinstance(dataset, pycrunch.shoji.Entity)

    def _count_rows():
        # Number of rows in a fresh dataframe export of the dataset.
        return len(pandaslib.dataframe(dataset))

    def _identity_set(rows):
        # The 'identity' values of the given dataframe rows, as ints.
        return {int(row['identity']) for _, row in rows.iterrows()}

    try:
        # Load the initial data; ROWS includes a header row, hence the -1.
        pycrunch.importing.importer.append_rows(dataset, ROWS)
        assert _count_rows() == len(ROWS) - 1

        # 1. Exclusion Filter Integration Tests
        #
        # Each entry is (filter expression, rows expected in the dataframe
        # once that filter is set). The filters are applied one after the
        # other, each call replacing the previous expression.
        exclusion_cases = (
            # 1.1 A simple exclusion filter.
            ('identity > 5', 5),
            # 1.2 More complex filters involving a categorical variable.
            ('speak_spanish in [32766]', 10),
            ('speak_spanish in (32766, 32767)', 9),
            ('not (speak_spanish in (1, 2) and operating_system == "Linux")', 2),
            # 1.3 Filters with `has_any`.
            ('hobbies.has_any([32766])', 8),
            ('not hobbies.has_any([32766])', 4),
            ('hobbies.has_any([32766, 32767])', 7),
            ('music.has_any([32766])', 12),
            ('music.has_any([1])', 1),
            ('music.has_any([1, 2])', 0),
            # 1.4 Filters with `has_all`.
            ('hobbies.has_all([32767])', 11),
            ('not hobbies.has_all([32767])', 1),
            ('music.has_all([1])', 11),
            ('music.has_all([1]) or music.has_all([2])', 10),
            ('not ( music.has_all([1]) or music.has_all([2]) )', 2),
            # 1.5 Filters with `duplicates`.
            ('ip_address.duplicates()', 10),
            # 1.6 Filters with `valid` and `missing`.
            ('valid(speak_spanish)', 3),
            ('not valid(speak_spanish)', 9),
            ('missing(speak_spanish)', 9),
            ('missing(hobbies)', 11),
            ('not missing(hobbies)', 1),
            ('valid(hobbies)', 5),
            ('not valid(hobbies)', 7),
        )
        for expression, rows_expected in exclusion_cases:
            pycrunch.datasets.exclusion(dataset, expression)
            assert _count_rows() == rows_expected

        # 1.7 Clear the exclusion filter; all data rows must be back.
        pycrunch.datasets.exclusion(dataset)
        assert _count_rows() == len(ROWS) - 1

        # 2. Integration Tests for "Transformations".

        os_categories = [
            {'id': 1, 'name': 'Nerds', 'numeric_value': 1, 'missing': False},
            {'id': 2, 'name': 'Normal Users', 'numeric_value': 2, 'missing': False},
            {'id': 3, 'name': 'Hipsters', 'numeric_value': 3, 'missing': False},
            {'id': 32767, 'name': 'Unknown', 'numeric_value': None, 'missing': True}
        ]
        os_rules = [
            'operating_system in ("Linux", "Solaris", "Minix", "FreeBSD", "NetBSD")',
            'operating_system == "Windows"',
            'operating_system == "MacOS"',
            'missing(operating_system)'
        ]

        os_users = create_categorical(
            ds=dataset,
            categories=os_categories,
            rules=os_rules,
            name='Operating System Users',
            alias='operating_system_users',
            description='Type of Operating System Users'
        )
        assert isinstance(os_users, pycrunch.shoji.Entity)
        os_users.refresh()
        assert os_users.body.type == 'categorical'

        # The new variable must show up as a dataframe column.
        frame = pandaslib.dataframe(dataset)
        assert 'operating_system_users' in frame

        # (category label, expected row count, operating systems expected
        # among the rows carrying that label)
        expected_groups = (
            ('Nerds', 8, {'Linux', 'Solaris', 'Minix', 'FreeBSD', 'NetBSD'}),
            ('Hipsters', 1, {'MacOS'}),
            ('Normal Users', 3, {'Windows'}),
        )
        for label, size, systems in expected_groups:
            members = frame[frame['operating_system_users'] == label]
            assert len(members) == size
            seen = {row['operating_system'] for _, row in members.iterrows()}
            assert seen == systems

        # 3. Integration Tests for "Recodes".

        # 3.1 combine_categories.

        # On a 'categorical' variable.
        bilingual_map = {
            1: {
                'name': 'Bilingual',
                'missing': False,
                'combined_ids': [2, 3]
            },
            2: {
                'name': 'Not Bilingual',
                'missing': False,
                'combined_ids': [1, 4]
            },
            99: {
                'name': 'Unknown',
                'missing': True,
                'combined_ids': [32766, 32767]
            }
        }
        bilingual_var = combine_categories(
            dataset, 'speak_spanish', bilingual_map, 'Bilingual Person', 'bilingual'
        )
        assert isinstance(bilingual_var, pycrunch.shoji.Entity)
        bilingual_var.refresh()
        assert bilingual_var.body.type == 'categorical'

        frame = pandaslib.dataframe(dataset)
        assert 'bilingual' in frame

        # Check the data in the recoded variable, one group at a time.
        bilingual_rows = frame[frame['bilingual'] == 'Bilingual']
        assert len(bilingual_rows) == 5
        assert _identity_set(bilingual_rows) == {3, 4, 10, 11, 12}

        monolingual_rows = frame[frame['bilingual'] == 'Not Bilingual']
        assert len(monolingual_rows) == 4
        assert _identity_set(monolingual_rows) == {1, 2, 5, 6}

        # Rows in the missing category come back as nulls in the dataframe.
        unknown_rows = frame[frame['bilingual'].isnull()]
        assert len(unknown_rows) == 3
        assert _identity_set(unknown_rows) == {7, 8, 9}

        # On a 'categorical_array' variable.
        interest_map = {
            1: {
                'name': 'Interested',
                'missing': False,
                'combined_ids': [1, 2]
            },
            2: {
                'name': 'Not interested',
                'missing': False,
                'combined_ids': [3, 4]
            },
            99: {
                'name': 'Unknown',
                'missing': True,
                'combined_ids': [32766, 32767]
            }
        }
        hobbies_var = combine_categories(
            dataset, 'hobbies', interest_map, 'Hobbies (recoded)', 'hobbies_recoded'
        )
        assert isinstance(hobbies_var, pycrunch.shoji.Entity)
        hobbies_var.refresh()
        assert hobbies_var.body.type == 'categorical_array'

        frame = pandaslib.dataframe(dataset)
        assert 'hobbies_recoded' in frame

        # Compare the original and recoded arrays element by element.
        missing_markers = ({'?': 32766}, {'?': 32767})
        for _, record in frame[['hobbies', 'hobbies_recoded']].iterrows():
            source_values = record['hobbies']
            recoded_values = record['hobbies_recoded']
            assert len(source_values) == len(recoded_values)

            for position, value in enumerate(source_values):
                if value in missing_markers:
                    expected = {'?': 99}
                elif value in (1, 2):
                    expected = 1
                elif value in (3, 4):
                    expected = 2
                else:
                    # Values outside the mapped ids are left unchecked.
                    continue
                assert recoded_values[position] == expected

        # 3.2 combine_responses.

        response_map = {
            'music_recoded_1': ['music_1', 'music_2'],
            'music_recoded_2': ['music_97'],
            'music_recoded_3': ['music_98', 'music_99']
        }
        music_var = combine_responses(
            dataset, 'music', response_map, 'Music (alt)', 'music_recoded'
        )
        assert isinstance(music_var, pycrunch.shoji.Entity)
        music_var.refresh()
        assert music_var.body.type == 'multiple_response'

        frame = pandaslib.dataframe(dataset)
        assert 'music_recoded' in frame

        # TODO: Test the data in the recoded variable. Unsure of its meaning.

    finally:
        # Always remove the scratch dataset, even when a check failed.
        dataset.delete()
コード例 #2
0
    def test_create_categorical(self):
        """Check the document ``create_categorical`` sends to ``ds.variables.create``.

        The dataset is a ``MagicMock`` whose ``variables.by(...)`` returns a
        single mocked variable under the key ``'gender'``. The first
        positional argument of the first ``ds.variables.create`` call is
        compared with the expected ``shoji:entity`` document, whose body
        holds a ``case`` expression.

        NOTE(review): ``categories`` and ``rules`` are not defined in this
        method -- presumably module-level fixtures; confirm they match the
        expected document built below.
        """
        var_id = '0001'
        var_type = 'categorical'
        var_url = '%svariables/%s/' % (self.ds_url, var_id)

        def _lookup(*args):
            # Shared side effect for item access and ``.get`` on the mocked
            # variable: 'id' and 'type' resolve to the fixtures above, any
            # other key is echoed back unchanged.
            key = args[0]
            if key == 'id':
                return var_id
            return var_type if key == 'type' else key

        # Mocked variable entry.
        variable = mock.MagicMock()
        variable.entity.self = var_url
        variable.__getitem__.side_effect = _lookup
        variable.get.side_effect = _lookup

        # Mocked dataset exposing that variable.
        ds = mock.MagicMock()
        ds.self = self.ds_url
        ds.variables.by.return_value = {'gender': variable}

        # Only the recorded call matters here; the return value is not used.
        create_categorical(ds, categories, rules, 'name', 'alias', 'description')
        posted = ds.variables.create.call_args_list[0][0][0]

        def _category(name, category_id):
            # One entry of the expected categorical type definition.
            return {
                "name": name,
                "id": category_id,
                "numeric_value": None,
                "missing": False,
            }

        def _equals(value):
            # Expected "variable == value" clause of the case expression.
            return {
                "function": "==",
                "args": [
                    {"variable": "http://test.crunch.io/api/datasets/123/variables/0001/"},
                    {"value": value},
                ],
            }

        # First argument of the case expression: the column of category ids
        # together with the categorical type they belong to.
        case_column = {
            "column": [3, 1, 2],
            "type": {
                "value": {
                    "categories": [
                        _category("Hipsters", 3),
                        _category("Techies", 1),
                        _category("Yuppies", 2),
                    ],
                    "class": "categorical",
                },
            },
        }

        expected = {
            "body": {
                "name": "name",
                "alias": "alias",
                "description": "description",
                "expr": {
                    "function": "case",
                    "args": [case_column, _equals(1), _equals(2)],
                },
            },
            "element": "shoji:entity",
        }
        assert posted == expected