Exemple #1
0
    def get_data_source(self, data_source_id):
        """Fetch the Limonero metadata describing ``data_source_id``.

        The Limonero URL and auth token are read from
        ``self.parameters['configuration']['juicer']['services']['limonero']``.

        :param data_source_id: identifier of the data source; it is converted
            to ``str`` before being handed to the service.
        :return: whatever ``limonero_service.get_data_source_info`` returned.
        :raises ValueError: if the metadata has no (or an empty) ``url``.
        """
        services = self.parameters['configuration']['juicer']['services']
        limonero = services['limonero']
        base_url = limonero['url']
        auth_token = str(limonero['auth_token'])

        info = limonero_service.get_data_source_info(
            base_url, auth_token, str(data_source_id))

        # A data source without a URL cannot be read, so reject it here.
        if info.get('url'):
            return info
        raise ValueError(
            gettext('Incorrect data source configuration (empty url)'))
def test_get_all_data_sources_success(mocked_get):
    """A 200 response is returned field by field by the service wrapper.

    An empty data source id is passed, and the request is expected to hit
    ``http://limonero/datasources/`` with the auth token header.
    """
    data_source_id = 700
    base_url = 'http://limonero/'
    auth_token = '00000'
    payload = {
        'id': data_source_id,
        'name': 'Data source for testing',
        'url': 'hdfs://test.com/testing.csv'
    }
    mocked_get.side_effect = fake_req(200, json.dumps(payload))()

    result = limonero_service.get_data_source_info(base_url, auth_token, '')

    # Every returned field must match the payload that was served.
    for field, value in result.items():
        assert value == payload[field]

    mocked_get.assert_called_with('http://limonero/datasources/',
                                  headers={'X-Auth-Token': '00000'})
Exemple #3
0
    def __init__(self, parameters, named_inputs, named_outputs):
        """Resolve the polygon data source URL, then initialise the base class.

        When ``parameters['type']`` is ``'polygon'`` or ``'geojson'``, the
        data source referenced by ``parameters['polygon']`` is looked up in
        Limonero and its URL is stored in ``parameters['polygon_url']``
        (the ``parameters`` dict is modified in place).

        :raises ValueError: if the looked-up metadata has no ``url``.
        """
        uses_polygon = parameters.get('type') in ('polygon', 'geojson')
        if uses_polygon:
            services = parameters['configuration']['juicer']['services']
            limonero = services['limonero']
            base_url = limonero['url']
            auth_token = str(limonero['auth_token'])

            info = limonero_service.get_data_source_info(
                base_url, auth_token, parameters.get('polygon'))
            if not info.get('url'):
                raise ValueError(
                    _('Incorrect data source configuration (empty url or '
                      'not GEOJSON)'))
            parameters['polygon_url'] = info['url']

        VisualizationMethodOperation.__init__(self, parameters, named_inputs,
                                              named_outputs)
Exemple #4
0
    def _set_data_source_parameters(self, parameters):
        """Load the data source metadata and derive the reading options.

        The metadata is taken from
        ``self.parameters['workflow']['data_source_cache']`` when present;
        otherwise it is fetched from Limonero and stored in that cache.

        Sets ``data_source_id``, ``metadata``, ``header``, ``null_values``,
        ``sep``, ``quote``, ``infer_schema`` and ``mode`` on ``self``.

        :param parameters: task parameters; must contain
            ``self.DATA_SOURCE_ID_PARAM``.
        :raises ValueError: if the metadata has no (or an empty) ``url``.
        """
        self.data_source_id = int(parameters[self.DATA_SOURCE_ID_PARAM])
        # Retrieve metadata from Limonero.
        limonero_config = self.parameters['configuration']['juicer'][
            'services']['limonero']
        url = limonero_config['url']
        token = str(limonero_config['auth_token'])

        # Is data source information cached?
        self.metadata = self.parameters.get('workflow', {}).get(
            'data_source_cache', {}).get(self.data_source_id)
        if self.metadata is None:
            self.metadata = limonero_service.get_data_source_info(
                url, token, self.data_source_id)
            # The lookup above tolerates a missing 'workflow' or
            # 'data_source_cache' entry, so the store must as well; indexing
            # directly would raise KeyError exactly when nothing is cached.
            cache = self.parameters.setdefault('workflow', {}).setdefault(
                'data_source_cache', {})
            cache[self.data_source_id] = self.metadata
        if not self.metadata.get('url'):
            raise ValueError(
                _('Incorrect data source configuration (empty url)'))

        # The header is enabled unless the parameter is missing or is one of
        # the listed "false" spellings.
        self.header = parameters.get(self.HEADER_PARAM,
                                     False) not in ('0', 0, 'false', False)
        # Comma-separated list; entries are stripped and blanks are dropped.
        self.null_values = [
            v.strip()
            for v in parameters.get(self.NULL_VALUES_PARAM, '').split(",")
            if v.strip()
        ]

        # Separator precedence: task parameter, then the metadata's
        # 'attribute_delimiter', then ',' (also used for any falsy value).
        self.sep = parameters.get(
            self.SEPARATOR_PARAM, self.metadata.get('attribute_delimiter',
                                                    ',')) or ','
        # TEXT data sources always use the '{new_line}' separator token.
        if self.metadata['format'] == 'TEXT':
            self.sep = '{new_line}'
        self.quote = parameters.get(self.QUOTE_PARAM,
                                    self.metadata.get('text_delimiter'))
        # A lone single quote is stored backslash-escaped.
        # NOTE(review): presumably because the value is later embedded in a
        # single-quoted string literal - confirm against the caller.
        if self.quote == '\'':
            self.quote = '\\\''
        # Translate separator tokens through the SEPARATORS mapping.
        if self.sep in self.SEPARATORS:
            self.sep = self.SEPARATORS[self.sep]
        self.infer_schema = parameters.get(self.INFER_SCHEMA_PARAM,
                                           self.INFER_FROM_LIMONERO)
        self.mode = parameters.get(self.MODE_PARAM, 'FAILFAST')
def test_get_data_source_info_failure(mocked_get):
    """A 201 response makes ``get_data_source_info`` raise ``ValueError``.

    The mocked ``get`` serves a valid payload with status 201; the service
    wrapper is expected to reject it.
    """
    data_source_id = 700
    text = {
        'id': data_source_id,
        'name': 'Data source for testing',
        'url': 'hdfs://test.com/testing.csv'
    }
    mocked_get.side_effect = fake_req(201, json.dumps(text))()
    url = 'http://limonero/datasources'
    token = '00000'
    # Only the call expected to raise belongs inside the context manager:
    # anything placed after it in the block is skipped once it raises.
    with pytest.raises(ValueError):
        limonero_service.get_data_source_info(url, token, data_source_id)

    # The request must still have been issued before the failure.
    # NOTE(review): this assertion used to sit inside the ``with`` block and
    # therefore never executed, so the expected URL has not been verified
    # before - confirm it matches what the service actually requests.
    # The former loop over the response was removed: no response exists when
    # the call raises.
    mocked_get.assert_called_with(
        'http://limonero/datasources/{}'.format(data_source_id),
        headers={'X-Auth-Token': '00000'})
Exemple #6
0
def perform_copy(config, vallum_ds_id, target_id, path):
    """Run a VALLUM data source query and copy its files to a LOCAL storage.

    The data source ``vallum_ds_id`` and the storage ``target_id`` are looked
    up in Limonero. The data source's ``command`` is posted to the URL of its
    VALLUM storage, and every file listed in the response is downloaded into
    ``<target storage path> + path``.

    :param config: configuration dict containing
        ``juicer.services.limonero`` with ``url`` and ``auth_token``.
    :param vallum_ds_id: id of the data source stored in VALLUM.
    :param target_id: id of the storage receiving the files.
    :param path: path appended to the target storage's URL path.
    :return: the number of files written, or a dict with ``status`` set to
        ``'ERROR'`` and a ``message`` when a storage has the wrong type.
    :raises ValueError: if the query or a file download does not answer
        with HTTP status 200.
    """
    # NOTE(security): every request below is sent with ``verify=False`` and
    # the matching urllib3 warning is silenced, so TLS certificates are not
    # validated. Kept as is; review before using against untrusted hosts.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    services_config = config.get('juicer').get('services')
    limonero_config = services_config.get('limonero')

    limonero_url = limonero_config.get('url')
    token = str(limonero_config.get('auth_token'))
    vallum_ds = limonero_service.get_data_source_info(limonero_url, token,
                                                      vallum_ds_id)
    vallum_storage = vallum_ds.get('storage', {})
    if vallum_storage.get('type') != 'VALLUM':
        return {'status': 'ERROR', 'message': 'Storage is not VALLUM'}
    target_storage = limonero_service.get_storage_info(limonero_url, token,
                                                       target_id)
    if target_storage.get('type') != 'LOCAL':
        return {
            'status': 'ERROR',
            'message': 'Target storage must be of type LOCAL'
        }

    # The storage URL carries host, credentials and (optionally) the
    # database name in its query string.
    parsed = urlparse(vallum_storage.get('url'))
    base_url = '{}://{}:{}'.format(parsed.scheme, parsed.hostname, parsed.port
                                   or 80)
    url = base_url + parsed.path
    qs = parse_qs(parsed.query)
    # parse_qs maps each name to a *list* of values, so the default must be a
    # list too; a plain 'samples' string would yield 's' after the [0].
    database = qs.get('db', ['samples'])[0]

    username = parsed.username
    password = parsed.password
    query = vallum_ds['command']
    mode = 'MN'
    thread = 1

    params = {
        "username": username,
        "password": password,
        "database": database,
        "mode": mode,
        "query": query,
        "thread": thread,
    }
    req = requests.post(url, params, verify=False)
    total = 0
    if req.status_code == 200:
        parsed_local = urlparse(target_storage.get('url'))
        target_dir = parsed_local.path + path  # '/vallum' + str(vallum_ds_id)
        obj = json.loads(req.text)
        for result in obj.get('result'):
            files = result.get('files')
            if files:
                # Only the path of each reported URI is used; files are
                # always fetched from the storage's own base URL.
                uri_files = [
                    base_url + urlparse(f.get('uri')).path for f in files
                ]
                if not os.path.exists(target_dir):
                    os.makedirs(target_dir)
                for vallum_file in uri_files:
                    file_req = requests.get(vallum_file, params, verify=False)
                    if file_req.status_code == 200:
                        final_filename = target_dir + '/' + \
                                         vallum_file.split('/')[-1]
                        print(final_filename)
                        total += 1
                        with open(final_filename, 'wb') as fout:
                            fout.write(file_req.content)
                    else:
                        # status_code is an int: format it instead of
                        # concatenating, which raised TypeError.
                        raise ValueError('HTTP Status {}'.format(
                            file_req.status_code))
        return total
    else:
        raise ValueError('HTTP Status {}'.format(req.status_code))
Exemple #7
0
    def _build_privacy_restrictions(self):
        """Collect the privacy settings of every data source read by the workflow.

        Nothing is done when the configuration lacks ``juicer.services`` or
        when the workflow platform is not ``spark``. Otherwise, for each
        ``data-reader`` task the data source is obtained (from
        ``self._query_data_sources`` when set, else from Limonero) and two
        entries are stored in ``self.workflow``:

        * ``data_source_cache``: data source id -> data source metadata;
        * ``privacy_restrictions``: data source id -> ``{'attributes': [...]}``
          with one privacy entry per attribute.

        Attributes sharing an ``attribute_privacy_group_id`` all receive the
        settings of the group's highest-ranked entry, ranked through
        ``privaaas.ANONYMIZATION_TECHNIQUES``.
        """
        has_services = ('juicer' in self.config and
                        'services' in self.config['juicer'])
        if not has_services:
            return
        limonero = self.config['juicer']['services']['limonero']
        if self.workflow['platform']['slug'] != 'spark':
            return

        # Gather the metadata of each data source used by a reader task.
        sources = []
        for task in self.workflow['tasks']:
            if task['operation'].get('slug') != 'data-reader':
                continue
            if self._query_data_sources:
                source = next(self._query_data_sources())
            else:
                source = limonero_service.get_data_source_info(
                    limonero['url'],
                    str(limonero['auth_token']),
                    task['forms']['data_source']['value'])
            sources.append(source)

        restrictions = {}
        cache = {}
        groups = collections.defaultdict(list)
        for source in sources:
            cache[source['id']] = source
            entries = []
            restrictions[source['id']] = {'attributes': entries}
            for attr in source['attributes']:
                # 'attribute_privacy' may be absent or None.
                privacy = attr.get('attribute_privacy', {}) or {}
                group_id = privacy.get('attribute_privacy_group_id')
                entry = {
                    'id': attr['id'],
                    'name': attr['name'],
                    'type': attr['type'],
                    'details': privacy.get('hierarchy'),
                    'privacy_type': privacy.get('privacy_type'),
                    'anonymization_technique':
                        privacy.get('anonymization_technique'),
                    'attribute_privacy_group_id': group_id,
                }
                entries.append(entry)
                if group_id:
                    groups[group_id].append(entry)

        def technique_rank(entry):
            technique = entry.get('anonymization_technique', 'NO_TECHNIQUE')
            return privaaas.ANONYMIZATION_TECHNIQUES[technique]

        for members in groups.values():
            ranked = sorted(members, key=technique_rank, reverse=True)
            strictest = ranked[0]
            # NOTE(review): update() copies every key of the highest-ranked
            # entry, including 'id', 'name' and 'type' - confirm that
            # overwriting those is intended.
            for member in members:
                member.update(strictest)

        self.workflow['data_source_cache'] = cache
        self.workflow['privacy_restrictions'] = restrictions