Ejemplo n.º 1
0
    def test_move_dataset(self):
        """
        Verify that datasets can be moved into a nested project both by
        the default logged-in user and by a second user (``fo``) who has
        been granted edit access.
        """
        # Second user and a separate API session authenticated as them.
        fo = get_user(username2)
        fo_site = connect(fo.email, password2, HOST)

        # These two datasets are created by the default logged user
        _ds1 = site.datasets.create(
            shoji_entity_wrapper({'name': 'test_move_dataset1'})).refresh()
        _ds2 = site.datasets.create(
            shoji_entity_wrapper({'name': 'test_move_dataset2'})).refresh()

        # This dataset is created and owned by the other user
        _ds4 = fo_site.datasets.create(
            shoji_entity_wrapper({
                'name': 'test_move_dataset4',
                'owner': fo.url
            })).refresh()

        # Wrap the raw shoji entities; ds4 is fetched through the other
        # user's session so operations on it run with their permissions.
        ds1 = get_dataset(_ds1.body.id)
        ds2 = get_dataset(_ds2.body.id)
        ds4 = get_dataset(_ds4.body.id, connection=fo_site)

        ds2.add_user(fo, edit=True)

        # Create a hierarchy A -> B
        pa = new_project("test_move_dataset_A")
        pa.move_here([ds1])  # Put ds1 in A

        pb = pa.create_project("test_move_dataset_B")
        pa.add_user(fo, edit=True)

        # Move ds4 to B as the other user
        fo_pa = get_project(pa.name, fo_site)
        fo_pa.place(ds4, path="| %s" % pb.name)
        pb.resource.refresh()
        self.assertItemsEqual(
            pb.resource.index.keys(),
            # Only ds4 here
            [_ds4.self])

        # The other user moves ds1 (owned by the default user) into B;
        # they can do this because they have edit rights on project A.
        fo_ds1 = get_dataset(_ds1.body.id, connection=fo_site)
        fo_pa.place(fo_ds1, path="| %s" % pb.name)

        pb.resource.refresh()
        self.assertItemsEqual(pb.resource.index.keys(), [_ds1.self, _ds4.self])
        # Default user moves ds2 in as well, via the parent project.
        pa.place(ds2, path="| %s" % pb.name)

        pb.resource.refresh()
        self.assertItemsEqual(pb.resource.index.keys(),
                              [_ds1.self, _ds2.self, _ds4.self])
        # Placing a dataset updates its project backreference to B.
        self.assertEqual(ds2.resource.project.self, pb.url)
Ejemplo n.º 2
0
    def move_to_categorical_array(self,
                                  name,
                                  alias,
                                  subvariables,
                                  description='',
                                  notes=''):
        """
        Create a new ``categorical_array`` variable whose subvariables are
        existing dataset variables. This is a dangerous operation: the
        listed variables stop being top-level variables and become
        subvariables of the newly created array.

        :param: name: Name of the new variable.
        :param: alias: Alias of the new variable
        :param: subvariables: A list of existing Dataset variables aliases
            to move into the new variable as subvariables .i.e;
                subvariables = ['var1_alias', 'var2_alias']
        :param: description: A description of the new variable
        :param: notes: Notes to attach to the new variable
        """
        # Resolve each alias to its variable URL before building the payload.
        subvariable_urls = [self[sub_alias].url for sub_alias in subvariables]
        body = {
            'name': name,
            'alias': alias,
            'description': description,
            'notes': notes,
            'type': 'categorical_array',
            'subvariables': subvariable_urls,
        }
        self.resource.variables.create(shoji_entity_wrapper(body))
        # Refresh the local variable catalog so the new array is visible.
        self._reload_variables()
        return self[alias]
Ejemplo n.º 3
0
 def push_rows(self, count=None):
     """
     Batch in any rows that have been recently streamed, forcing them to
     appear in the dataset instead of waiting for Crunch's automatic
     batcher process.

     :param: count: Value forwarded as the ``stream`` field of the batch
         payload (defaults to ``None``).
     """
     # Only create a batch when the stream actually has pending messages.
     pending = self.resource.stream.body.pending_messages
     if pending:
         payload = shoji_entity_wrapper({
             'stream': count,
             'type': 'ldjson',
         })
         self.resource.batches.create(payload)
Ejemplo n.º 4
0
    def append_dataset(self,
                       dataset,
                       filter=None,
                       variables=None,
                       autorollback=True,
                       delete_pk=True):
        """ Append another dataset's rows into self. If this operation
        fails, the append is rolled back. Dataset variables and
        subvariables are matched on their aliases and categories are
        matched by name.

        :param: dataset: Dataset instance to append from
        :param: filter: An expression to filter dataset rows. cannot be a Filter
            according to: http://docs.crunch.io/#get211
        :param: variables: A list of variable names to include from dataset
        """
        # Appending a dataset onto itself is never valid.
        if dataset.url == self.url:
            raise ValueError("Cannot append dataset to self")

        if variables and not isinstance(variables, list):
            raise AttributeError(
                "'variables' must be a list of variable names")

        if delete_pk:
            LOG.info(
                "Any pk's found will be deleted, to avoid these pass delete_pk=False"
            )
            # Primary keys on either side would interfere with the append.
            self.resource.pk.delete()
            dataset.resource.pk.delete()

        payload = shoji_entity_wrapper({'dataset': dataset.url})
        payload['autorollback'] = autorollback

        if variables:
            # Restrict the append to the selected variables only.
            variable_urls = [dataset[var_name].url for var_name in variables]
            payload['body']['where'] = {
                'function': 'select',
                'args': [
                    {'map': {url: {'variable': url} for url in variable_urls}}
                ],
            }

        if filter:
            # Parse the filter expression into a Crunch expression tree.
            payload['body']['filter'] = process_expr(
                parse_expr(filter), dataset.resource)

        return self.resource.batches.create(payload)
Ejemplo n.º 5
0
def create_dataset(name, variables, connection=None, **kwargs):
    """
    Create a new dataset from a name and a variable metadata mapping.

    :param: name: Name for the new dataset.
    :param: variables: Variable metadata for the dataset's crunch:table.
    :param: connection: Optional authenticated session; when omitted, the
        default connection is resolved (raises if none is available).
    :param: kwargs: Extra top-level fields merged into the dataset body.
    """
    if connection is None:
        connection = _get_connection()
        if not connection:
            raise AttributeError(
                "Authenticate first with scrunch.connect() or by providing "
                "config/environment variables")

    body = {
        'name': name,
        'table': {
            'element': 'crunch:table',
            'metadata': variables,
        },
    }
    # Caller-supplied fields may extend or override the defaults above.
    body.update(**kwargs)

    shoji_ds = connection.datasets.create(shoji_entity_wrapper(body)).refresh()
    return MutableDataset(shoji_ds)
Ejemplo n.º 6
0
 def move_to_multiple_response(self,
                               name,
                               alias,
                               subvariables,
                               description='',
                               notes=''):
     """
     This method is a replication of the method move_to_categorical_array,
     only this time we are creating a multiple_response variable.
     Note: the subvariables need to have at least 1 selected category.
     """
     # Resolve the subvariable aliases to URLs up front.
     subvariable_urls = [self[sub_alias].url for sub_alias in subvariables]
     body = {
         'name': name,
         'alias': alias,
         'description': description,
         'notes': notes,
         'type': 'multiple_response',
         'subvariables': subvariable_urls,
     }
     self.resource.variables.create(shoji_entity_wrapper(body))
     # Refresh the variable catalog so the new variable is addressable.
     self._reload_variables()
     return self[alias]
Ejemplo n.º 7
0
    def join(self,
             left_var,
             right_ds,
             right_var,
             columns=None,
             filter=None,
             timeout=30):
        """
        Joins a given variable. In crunch joins are left joins, where
        left is the dataset variable and right is other dataset variable.
        For more information see:
        http://docs.crunch.io/?http#merging-and-joining-datasets

        :param: columns: Specify a list of variables from right dataset
        to bring in the merge:
        http://docs.crunch.io/?http#joining-a-subset-of-variables

        :param: wait: Wait for the join progress to finish by polling
        or simply return a url to the progress resource

        :param: filter: Filters out rows based on the given expression,
        or on a given url for an existing filter. TODO: for the moment
        we only allow expressions
        """
        # Core join description: the right dataset plus the join keys.
        adapter = {
            'function':
            'adapt',
            'args': [{
                'dataset': right_ds.url
            }, {
                'variable': right_ds[right_var].url
            }, {
                'variable': self[left_var].url
            }]
        }

        # wrap the adapter method on a shoji and body entity
        payload = shoji_entity_wrapper(adapter)

        if columns and isinstance(columns, list):
            # Resolve each requested alias to its variable URL and build
            # the select map the API expects.
            by_alias = right_ds.resource.variables.by("alias")
            column_map = {}
            for column_alias in columns:
                var_url = by_alias[column_alias].entity_url
                column_map[var_url] = {"variable": var_url}
            # overwrite body to new format
            payload['body'] = {
                'frame': adapter,
                'function': 'select',
                'args': [{
                    'map': column_map
                }]
            }

        if filter:
            # in the case of a filter, convert it to crunch
            # and attach the filter to the payload
            payload['body']['filter'] = {
                'expression': process_expr(parse_expr(filter), right_ds)
            }

        progress = self.resource.variables.post(payload)
        # poll for progress to finish or return the url to progress
        tracker = DefaultProgressTracking(timeout)
        return wait_progress(r=progress,
                             session=self.resource.session,
                             progress_tracker=tracker,
                             entity=self)
Ejemplo n.º 8
0
def new_project(name):
    """Create a project named ``name + UNIQUE_PREFIX`` and wrap it."""
    doc = shoji_entity_wrapper({"name": name + UNIQUE_PREFIX})
    resource = site.projects.create(doc).refresh()
    return Project(resource)