def test_user_agent(self):
    user = '******'
    pw = 'supersecret'
    site = 'https://test.crunch.io/api/'
    from pycrunch.version import __version__ as pycrunch_v
    from requests.sessions import Session
    with mock.patch.object(Session, 'send') as mock_send:
        scrunch.connect(user, pw, site)
    prep_req = mock_send.call_args[0][0]
    assert prep_req.headers['user-agent'] == \
        'scrunch/%s (pycrunch/%s)' % (scrunch.__version__, pycrunch_v)
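
# A minimal companion sketch (not part of the original suite) that builds the
# same user-agent string the test above asserts; it assumes only that
# scrunch.__version__ and pycrunch.version.__version__ are importable, as the
# test itself does.
def expected_user_agent():
    from pycrunch.version import __version__ as pycrunch_v
    import scrunch
    # Mirrors the asserted format: scrunch/<version> (pycrunch/<version>)
    return 'scrunch/%s (pycrunch/%s)' % (scrunch.__version__, pycrunch_v)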
def test_move_dataset(self):
    fo = get_user(username2)
    fo_site = connect(fo.email, password2, HOST)

    # These two datasets are created by the default logged-in user
    _ds1 = site.datasets.create(
        shoji_entity_wrapper({'name': 'test_move_dataset1'})).refresh()
    _ds2 = site.datasets.create(
        shoji_entity_wrapper({'name': 'test_move_dataset2'})).refresh()

    # This dataset is created and owned by the other user
    _ds4 = fo_site.datasets.create(
        shoji_entity_wrapper({
            'name': 'test_move_dataset4',
            'owner': fo.url
        })).refresh()

    ds1 = get_dataset(_ds1.body.id)
    ds2 = get_dataset(_ds2.body.id)
    ds4 = get_dataset(_ds4.body.id, connection=fo_site)
    ds2.add_user(fo, edit=True)

    # Create a hierarchy A -> B
    pa = new_project("test_move_dataset_A")
    pa.move_here([ds1])  # Put ds1 in A
    pb = pa.create_project("test_move_dataset_B")
    pa.add_user(fo, edit=True)

    # Move ds4 to B as the other user
    fo_pa = get_project(pa.name, fo_site)
    fo_pa.place(ds4, path="| %s" % pb.name)
    pb.resource.refresh()
    self.assertItemsEqual(
        pb.resource.index.keys(),  # Only ds4 here
        [_ds4.self])

    fo_ds1 = get_dataset(_ds1.body.id, connection=fo_site)
    fo_pa.place(fo_ds1, path="| %s" % pb.name)
    pb.resource.refresh()
    self.assertItemsEqual(pb.resource.index.keys(), [_ds1.self, _ds4.self])

    pa.place(ds2, path="| %s" % pb.name)
    pb.resource.refresh()
    self.assertItemsEqual(pb.resource.index.keys(),
                          [_ds1.self, _ds2.self, _ds4.self])
    self.assertEqual(ds2.resource.project.self, pb.url)
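
# A usage note (illustrative, not from the original suite): Project.place
# addresses its target folder with a "|"-separated path, where a leading "|"
# is the project root, as the calls above use. A hypothetical move of `ds`
# into a nested subproject would look like:
#
#     project.place(ds, path="| Subproject B")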
import os

from pycrunch.shoji import Catalog

from fixtures import NEWS_DATASET

from scrunch import connect
from scrunch import get_dataset
from scrunch.datasets import Variable
from scrunch.exceptions import InvalidPathError
from scrunch.folders import Folder

HOST = os.environ['SCRUNCH_HOST']
username = os.environ['SCRUNCH_USER']
password = os.environ['SCRUNCH_PASS']

site = connect(username, password, HOST)


def setup_folders(ds):
    sess = ds.session
    ds.settings.edit(variable_folders=True)
    ds.variables.create({
        'element': 'shoji:entity',
        'body': {'name': 'testvar1', 'type': 'numeric'}
    })
    ds.variables.create({
        'element': 'shoji:entity',
        'body': {'name': 'testvar2', 'type': 'numeric'}
    })
    ds.variables.create({
        'element': 'shoji:entity',
        'body': {'name': 'testvar3', 'type': 'numeric'}
    })
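
# A hypothetical usage sketch (the dataset name and creation call below are
# illustrative, not from the original file): setup_folders expects a dataset
# entity, and enables variable folders before seeding it with test variables:
#
#     _ds = site.datasets.create({
#         'element': 'shoji:entity',
#         'body': {'name': 'folders test'},
#     }).refresh()
#     setup_folders(_ds)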
# coding: utf-8

from getpass import getpass

from scrunch import connect
from scrunch.datasets import get_dataset, create_dataset

username = raw_input("Enter email: ")
password = getpass("Enter password for %s: " % username)

site = connect(username, password, site_url='https://alpha.crunch.io/api/')

new_ds = create_dataset(
    'Test dataset', {
        "catvar": {
            'name': 'categorical variable',
            'type': 'categorical',
            'categories': [{
                'id': 1,
                'name': 'yes',
                'numeric_value': 1,
                'missing': False
            }, {
                'id': 2,
                'name': 'no',
                'numeric_value': 2,
                'missing': False
            }, {
                'id': 3,
                'name': 'Not asked',
def main():
    assert not invalid_credentials()
    assert pandaslib, 'Pandas library not installed'

    # Login.
    site = connect(CRUNCH_USER, CRUNCH_PASSWORD, CRUNCH_URL)
    assert isinstance(site, pycrunch.shoji.Catalog)

    # Create the test dataset.
    dataset_resource = site.datasets.create(DATASET_DOC).refresh()
    assert isinstance(dataset_resource, pycrunch.shoji.Entity)
    dataset = StreamingDataset(dataset_resource)

    try:
        # Load initial data.
        pycrunch.importing.importer.append_rows(dataset.resource, ROWS)
        # Refresh the dataset instance, so size is updated.
        dataset.resource.refresh()

        # Check the initial number of rows.
        df = pandaslib.dataframe(dataset.resource)
        assert len(df) == len(ROWS) - 1  # excluding the header
        assert dataset.size.rows == len(df)
        assert dataset.size.unfiltered_rows == len(df)

        # Also check the number of columns.
        columns = len(DATASET_DOC['body']['table']['metadata'])
        assert dataset.size.columns == columns

        # 0. Manipulate metadata

        # 0.1 Start by updating the missing rules for the `identity` variable
        identity_missing_rules = {"not asked": 9999, "skipped": 9998}
        assert dataset['identity'].missing_rules == {}
        dataset['identity'].set_missing_rules(identity_missing_rules)
        assert dataset['identity'].missing_rules == identity_missing_rules

        # 0.2 Try setting and unsetting the geodata view
        location = dataset['location']
        geodata = get_geodata('UK Regions')
        assert 'geodata' not in location.view

        # Set geodata using Entity object
        location.set_geodata_view(geodata, feature_key='EER13NM')
        assert 'geodata' in location.view
        location.unset_geodata_view()
        assert 'geodata' not in location.view

        # Set geodata using url
        location.set_geodata_view(geodata.self, feature_key='EER13NM')
        assert 'geodata' in location.view
        location.unset_geodata_view()
        assert 'geodata' not in location.view

        # Set geodata using name
        location.set_geodata_view('UK Regions', feature_key='EER13NM')
        assert 'geodata' in location.view
        location.unset_geodata_view()
        assert 'geodata' not in location.view

        # 1. Exclusion Filter Integration Tests

        # 1.1 Set a simple exclusion filter.
        dataset.exclude('identity > 5')
        df = pandaslib.dataframe(dataset.resource)
        assert len(df) == 5
        assert not any(r['identity'] > 5 for _, r in df.iterrows())

        # 1.2 More complex exclusion filters involving a categorical variable.
        expr = 'speak_spanish in [32766]'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] != 32766
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        expr = 'speak_spanish in (32766, 32767)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] not in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert not isnan(row['speak_spanish'])

        expr = 'not (speak_spanish in (1, 2) and operating_system == "Linux")'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and row[4] in (1, 2) and row[2] == 'Linux'
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['speak_spanish'] in \
                ('I speak Spanish primarily',
                 'I speak both Spanish and English equally')
            assert row['operating_system'] == 'Linux'

        # 1.3 Exclusion filters with `any`.
        expr = 'hobbies.any([32766])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 32766 not in row[5:9]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['hobbies']

        expr = 'not hobbies.any([32766])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 32766 in row[5:9]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} in row['hobbies']

        expr = 'hobbies.any([32766, 32767])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and 32766 not in row[5:9] and 32767 not in row[5:9]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['hobbies'] and \
                {'?': 32767} not in row['hobbies']

        expr = 'music.any([32766])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 32766 not in row[9:14]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['music']

        expr = 'music.any([1])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and 1 not in row[9:14]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert 1 not in row['music']

        expr = 'music.any([1, 2])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and 1 not in row[9:14] and 2 not in row[9:14]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert 1 not in row['music'] and 2 not in row['music']

        # 1.4 Exclusion filters with `all`.
        expr = 'hobbies.all([32767])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and row[5:9] != [32767, 32767, 32767, 32767]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] != [{'?': 32767}, {'?': 32767},
                                      {'?': 32767}, {'?': 32767}]

        expr = 'not hobbies.all([32767])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and row[5:9] == [32767, 32767, 32767, 32767]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] == [{'?': 32767}, {'?': 32767},
                                      {'?': 32767}, {'?': 32767}]

        expr = 'music.all([1])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[9:14] != [1, 1, 1, 1, 1]
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['music'] != [1, 1, 1, 1, 1]

        expr = 'music.all([1]) or music.all([2])'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and (
                row[9:14] != [1, 1, 1, 1, 1] and
                row[9:14] != [2, 2, 2, 2, 2])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['music'] != [1, 1, 1, 1, 1] and \
                row['music'] != [2, 2, 2, 2, 2]

        expr = 'not ( music.all([1]) or music.all([2]) )'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and (
                row[9:14] == [1, 1, 1, 1, 1] or
                row[9:14] == [2, 2, 2, 2, 2])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['music'] == [1, 1, 1, 1, 1] or \
                row['music'] == [2, 2, 2, 2, 2]

        # 1.5 Exclusion filters with `duplicates`.
        expr = 'ip_address.duplicates()'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        seen_ip_addresses = []
        for _, row in df.iterrows():
            assert row['ip_address'] not in seen_ip_addresses
            seen_ip_addresses.append(row['ip_address'])

        # 1.6 Exclusion filters with `valid` and `missing`.
        expr = 'valid(speak_spanish)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert isnan(row['speak_spanish'])

        expr = 'not valid(speak_spanish)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] not in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert not isnan(row['speak_spanish'])

        expr = 'missing(speak_spanish)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] not in (32766, 32767)
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert not isnan(row['speak_spanish'])

        expr = 'missing(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and (
                row[5:9] != [32766, 32766, 32766, 32766] and
                row[5:9] != [32767, 32767, 32767, 32767])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] != [{'?': 32766}, {'?': 32766},
                                      {'?': 32766}, {'?': 32766}] \
                and row['hobbies'] != [{'?': 32767}, {'?': 32767},
                                       {'?': 32767}, {'?': 32767}]

        expr = 'not missing(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and (
                row[5:9] == [32766, 32766, 32766, 32766] or
                row[5:9] == [32767, 32767, 32767, 32767])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'] == [{'?': 32766}, {'?': 32766},
                                      {'?': 32766}, {'?': 32766}] \
                or row['hobbies'] == [{'?': 32767}, {'?': 32767},
                                      {'?': 32767}, {'?': 32767}]

        expr = 'valid(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and (
                32766 in row[5:9] or 32767 in row[5:9])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} in row['hobbies'] or \
                {'?': 32767} in row['hobbies']

        expr = 'not valid(hobbies)'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and (
                32766 not in row[5:9] and 32767 not in row[5:9])
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert {'?': 32766} not in row['hobbies'] and \
                {'?': 32767} not in row['hobbies']

        # 1.7 Exclusion filter that refers to a subvariable by alias.
        expr = 'hobbies_1 == 4'
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity' and row[5] != 4
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids
            assert row['hobbies'][0] != 4

        # 1.8 Complex exclusion filters (multiple rules)
        expr = (
            '(religion != 1 and '
            '(not valid(speak_spanish) or speak_spanish >= 1)) '
            'or (religion == 1 and speak_spanish == 2) '
            'or (religion == 3 and speak_spanish == 4)')
        dataset.exclude(expr)

        # 1.9 Exclusion filters using date variables.
        dt_str = '2014-12-30T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'registration_time < "%s"' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and isodate.parse_datetime(row[3]) >= dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        dt_str = '2015-01-01T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'registration_time >= "%s"' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and isodate.parse_datetime(row[3]) < dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        dt_str = '2014-05-10T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'registration_time == "%s"' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and isodate.parse_datetime(row[3]) != dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        dt_str = '2014-05-10T00:00:00+00:00'
        dt = isodate.parse_datetime(dt_str)
        expr = 'not(registration_time == "%s")' % dt_str
        dataset.exclude(expr)
        df = pandaslib.dataframe(dataset.resource)
        valid_ids = [
            row[0] for row in ROWS
            if row[0] != 'identity'
            and isodate.parse_datetime(row[3]) == dt
        ]
        assert len(df) == len(valid_ids)
        for _, row in df.iterrows():
            assert row['identity'] in valid_ids

        # 1.10 Clear the exclusion filter.
        dataset.exclude()
        df = pandaslib.dataframe(dataset.resource)
        assert len(df) == len(ROWS) - 1  # excluding the header

        # 2. Integration Tests for "Transformations".
        categories = [{
            'id': 1,
            'name': 'Nerds',
            'numeric_value': 1,
            'missing': False,
            'case': 'operating_system in ("Linux", "Solaris", "Minix", '
                    '"FreeBSD", "NetBSD")',
        }, {
            'id': 2,
            'name': 'Normal Users',
            'numeric_value': 2,
            'missing': False,
            'case': 'operating_system == "Windows"',
        }, {
            'id': 3,
            'name': 'Hipsters',
            'numeric_value': 3,
            'missing': False,
            'case': 'operating_system == "MacOS"',
        }, {
            'id': 32767,
            'name': 'Unknown',
            'numeric_value': None,
            'missing': True,
            'case': 'missing(operating_system)'
        }]

        new_var = dataset.create_single_response(
            categories=categories,
            name='Operating System Users',
            alias='operating_system_users',
            description='Type of Operating System Users')
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical'

        # Check the data on the new variable.
        df = pandaslib.dataframe(dataset.resource)
        assert 'operating_system_users' in df

        # Check the nerds.
        assert set(
            r['operating_system'] for _, r in
            df[df['operating_system_users'] == 'Nerds'].iterrows()
        ) == {'Linux', 'Solaris', 'Minix', 'FreeBSD', 'NetBSD'}

        # Check the hipsters.
        assert set(
            r['operating_system'] for _, r in
            df[df['operating_system_users'] == 'Hipsters'].iterrows()
        ) == {'MacOS'}

        # Check normal users.
        assert set(
            r['operating_system'] for _, r in
            df[df['operating_system_users'] == 'Normal Users'].iterrows()
        ) == {'Windows'}

        # 3. Integration Tests for "Recodes".

        # 3.1 combine_categories.

        # On a 'categorical' variable.
        cat_map = {1: [2, 3], 2: [1, 4], 99: [32766, 32767]}
        cat_names = {1: 'Bilingual', 2: 'Not Bilingual', 99: 'Unknown'}
        new_var = dataset.combine_categorical(
            'speak_spanish',
            map=cat_map,
            categories=cat_names,
            name='Bilingual Person',
            alias='bilingual',
            missing=[99])
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical'

        df = pandaslib.dataframe(dataset.resource)
        assert 'bilingual' in df

        # Check the data in the recoded variable.
        bilingual_ids = set(
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (2, 3))
        assert set(
            int(r['identity']) for _, r in
            df[df['bilingual'] == 'Bilingual'].iterrows()) == bilingual_ids

        non_bilingual_ids = set(
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (1, 4))
        assert set(
            int(r['identity']) for _, r in
            df[df['bilingual'] == 'Not Bilingual'].iterrows()
        ) == non_bilingual_ids

        bilingual_null_ids = set(
            row[0] for row in ROWS
            if row[0] != 'identity' and row[4] in (32766, 32767))
        assert set(
            int(r['identity']) for _, r in
            df[df['bilingual'].isnull()].iterrows()) == bilingual_null_ids

        # On a 'categorical_array' variable.
        cat_map = {1: [1, 2], 2: [3, 4], 99: [32766, 32767]}
        cat_names = {
            1: 'Interested',
            2: 'Not interested',
            99: 'Unknown',
        }
        new_var = dataset.combine_categorical(
            'hobbies',
            map=cat_map,
            categories=cat_names,
            name='Hobbies (recoded)',
            alias='hobbies_recoded',
            missing=[99])
        assert isinstance(new_var, Variable)
        assert new_var.type == 'categorical_array'

        df = pandaslib.dataframe(dataset.resource)
        assert 'hobbies_recoded' in df

        # Check the data in the recoded variable.
        for _, row in df.iterrows():
            hobbies = row['hobbies']
            hobbies_rec = row['hobbies_recoded']
            assert len(hobbies) == len(hobbies_rec)
            for i, value in enumerate(hobbies):
                if value in ({'?': 32766}, {'?': 32767}):
                    assert hobbies_rec[i] == {'?': 99}
                elif value in (1, 2):
                    assert hobbies_rec[i] == 1
                elif value in (3, 4):
                    assert hobbies_rec[i] == 2

        # 3.2 combine_responses.
        cat_map = {1: [1, 2], 2: [97], 3: [98, 99]}
        cat_names = {
            1: 'music_recoded_1',
            2: 'music_recoded_2',
            3: 'music_recoded_3'
        }
        new_var = dataset.combine_multiple_response(
            'music',
            map=cat_map,
            categories=cat_names,
            name='Music (alt)',
            alias='music_recoded')
        assert isinstance(new_var, Variable)
        assert new_var.type == 'multiple_response'

        df = pandaslib.dataframe(dataset.resource)
        assert 'music_recoded' in df

        # TODO: Test the data in the recoded variable. Unsure of its meaning.

    finally:
        dataset.resource.delete()
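
# Entry-point guard (an assumption; the original fragment stops at the end of
# main(), so this line is illustrative): run the integration checks when the
# script is executed directly.
if __name__ == '__main__':
    main()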
This example shows a more complex workflow, using different recodes and
combines to create new variables.
"""

from examples import NEWS_DATASET, NEWS_DATASET_ROWS, mr_in

from getpass import getpass

from scrunch import connect
from scrunch.datasets import create_dataset

HOST = 'https://alpha.crunch.io'

username = raw_input("Enter email: ")
password = getpass("Enter password for %s: " % username)

site = connect(username, password, site_url='%s/api/' % HOST)

# Create a dataset for usage
dataset = create_dataset("Recodes example", NEWS_DATASET)
print("Dataset %s created" % dataset.id)

# Add data rows
total = dataset.stream_rows(NEWS_DATASET_ROWS)
dataset.push_rows(total)

# Recode a new single response variable
agerange = dataset.create_categorical([{
    'id': 1,
    'name': 'Underage',
    'case': 'age < 18'
}, {