Example #1
def get_datapreview_recombinant(dataset_type, res_id):
    from ckanext.recombinant.plugins import get_table
    t = get_table(dataset_type)
    default_preview_args = {}
    if 'default_preview_sort' in t:
        default_preview_args['sort'] = t['default_preview_sort']

    lc = ckanapi.LocalCKAN(username=c.user)
    results = lc.action.datastore_search(
        resource_id=res_id, limit=0,
        **default_preview_args)

    lang = h.lang()
    field_label = {}
    for f in t['fields']:
        label = f['label'].split(' / ')
        label = label[0] if lang == 'en' else label[-1]
        field_label[f['datastore_id']] = label
    fields = [{
        'type': f['type'],
        'id': f['id'],
        'label': field_label.get(f['id'], f['id'])}
        for f in results['fields']]

    return h.snippet('package/wet_datatable.html',
        resource_id=res_id,
        ds_fields=fields)
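
The snippet above depends on the recombinant convention of storing bilingual field labels as a single 'English / French' string. A minimal standalone sketch of that label handling; the helper name and sample label are made up for illustration:

# Hypothetical helper showing the 'English / French' label split used above.
def label_for_lang(label, lang):
    parts = label.split(' / ')
    return parts[0] if lang == 'en' else parts[-1]

print(label_for_lang('Department / Ministere', 'fr'))  # prints 'Ministere'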
Example #2
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :type csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :type target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to
            at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
            t,
            dict((f['label'], f['datastore_id'])
                for f in get_table(t)['fields']))
        for t in dataset_types)
    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames

        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys()) and
                    len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols is not None, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()   # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
            if org_id not in records:
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records
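
A hedged usage sketch for the generator above; the file name and dataset type are placeholders. Because batches are capped at BATCH_SIZE, the same org id can appear in more than one yielded dict:

# Hypothetical consumer of csv_data_batch(); 'ati.csv' and 'ati' are placeholders.
for batch in csv_data_batch('ati.csv', 'ati'):
    for org_id, rows in batch.items():
        print('%s: %d records' % (org_id, len(rows)))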
Example #3
    def _build_templates(self):
        """
        Implement build-templates command
        """
        lc = LocalCKAN()
        output_files = {}
        next_row = {}
        output_counter = {}
        output_path = self.args[2:][-1]
        dataset_types = get_dataset_types(self.command_name)
        table = get_table(dataset_types[0])

        def close_write_file(org_id):
            book = output_files[org_id]
            if not book:
                return
            book.save(os.path.join(output_path,
                org_id + '-' + str(output_counter[org_id]) + '.xls'))
            output_files[org_id] = None

        def out_file(org_id):
            if org_id in output_files:
                next_row[org_id] += 1
                # need to start a new file?
                if next_row[org_id] > SPLIT_XLS_ROWS:
                    close_write_file(org_id)
                else:
                    return output_files[org_id], next_row[org_id]
            try:
                org = lc.action.organization_show(
                    id=org_id, include_data_batch=False)
            except NotFound:
                logging.error('org id %s not found', org_id)
                output_files[org_id] = None
                next_row[org_id] = 0
                return None, None
            book = xls_template(dataset_types[0], org)
            output_files[org_id] = book
            output_counter[org_id] = output_counter.get(org_id, 0) + 1
            next_row[org_id] = len(book.get_sheet(0).get_rows())
            return book, next_row[org_id]

        def add_row(book, row, d):
            sheet = book.get_sheet(0)
            for i, f in enumerate(table['fields']):
                sheet.write(row, i, d[f['datastore_id']])

        for f in self.args[1:-1]:
            for d in DictReader(open(f, 'rb')):
                book, row = out_file(d['organization'])
                if not book:
                    continue
                add_row(book, row, d)

        for org_id in output_files:
            close_write_file(org_id)
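
The out_file()/close_write_file() pair above rotates output workbooks per organization once SPLIT_XLS_ROWS rows have been written. A standalone sketch of the same rotation idea using plain counters instead of xlwt workbooks; the names and the threshold value are stand-ins:

# Stand-in rotation sketch: counts rows per org and bumps the file serial
# once the per-file limit is reached.
SPLIT_XLS_ROWS = 2
counts = {}    # org_id -> rows written to the current file
serial = {}    # org_id -> how many files have been started

def next_target(org_id):
    if counts.get(org_id, SPLIT_XLS_ROWS) >= SPLIT_XLS_ROWS:
        serial[org_id] = serial.get(org_id, 0) + 1
        counts[org_id] = 0
    counts[org_id] += 1
    return '%s-%d.xls' % (org_id, serial[org_id])

print([next_target('org1') for _ in range(5)])
# ['org1-1.xls', 'org1-1.xls', 'org1-2.xls', 'org1-2.xls', 'org1-3.xls']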
Example #4
    def _build_templates(self):
        lc = LocalCKAN()
        output_files = {}
        next_row = {}
        output_counter = {}
        output_path = self.args[2:][-1]
        table = get_table(DATASET_TYPE)

        def close_write_file(org_id):
            book = output_files[org_id]
            if not book:
                return
            book.save(os.path.join(output_path,
                org_id + '-' + str(output_counter[org_id]) + '.xls'))
            output_files[org_id] = None

        def out_file(org_id):
            if org_id in output_files:
                next_row[org_id] += 1
                # need to start a new file?
                if next_row[org_id] > SPLIT_XLS_ROWS:
                    close_write_file(org_id)
                else:
                    return output_files[org_id], next_row[org_id]
            try:
                org = lc.action.organization_show(id=org_id, include_datasets=False)
            except NotFound:
                print 'org id', org_id, 'not found'
                output_files[org_id] = None
                next_row[org_id] = 0
                return None, None
            book = xls_template(DATASET_TYPE, org)
            output_files[org_id] = book
            output_counter[org_id] = output_counter.get(org_id, 0) + 1
            next_row[org_id] = len(book.get_sheet(0).get_rows())
            return book, next_row[org_id]

        def add_row(book, row, d):
            sheet = book.get_sheet(0)
            for i, f in enumerate(table['fields']):
                sheet.write(row, i, d[f['datastore_id']])

        for f in self.args[1:-1]:
            for d in DictReader(open(f, 'rb')):
                book, row = out_file(d['organization'])
                if not book:
                    continue
                add_row(book, row, d)

        for org_id in output_files:
            close_write_file(org_id)
Example #5
    def _check_table_columns(self, res_id, dataset_type):
        """
        Return an error message if the columns in res_id don't match the
        columns that would be created for this dataset type, otherwise None.
        """
        lc = ckanapi.LocalCKAN()
        t = get_table(dataset_type)
        try:
            result = lc.action.datastore_search(resource_id=res_id, limit=0)
        except ckanapi.NotFound:
            return "table missing!"
        fields = result['fields'][1:]  # remove '_id'
        if len(fields) != len(t['fields']):
            return "wrong number of columns!"
        for df, tf in zip(fields, t['fields']):
            if df['id'] != tf['datastore_id']:
                return "columns don't match: %s" % ' '.join(
                    f['id'] for f in fields)
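
A brief hypothetical call site for the check above, assuming it is invoked from the same command class; the resource id and dataset type are placeholders:

# Hypothetical usage; '<resource-id>' and 'ati' are placeholders.
problem = self._check_table_columns('<resource-id>', 'ati')
if problem:
    print('datastore table needs rebuilding: ' + problem)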
Example #6
def _update_records(records, org_detail, conn, recombinant_type):
    """
    Update records on solr core

    :param records: record dicts
    :type records: sequence of record dicts

    :param org_detail: org structure as returned via local CKAN
    :type org_detail: dict with local CKAN org structure

    :param conn: solr connection
    :type conn: obj

    :param recombinant_type: recombinant dataset type being updated
    :type recombinant_type: str
    """
    table = get_table(recombinant_type)
    pk = table.get('datastore_primary_key', [])
    if not isinstance(pk, list):
        pk = [pk]

    org = org_detail['name']
    orghash = hashlib.md5(org).hexdigest()

    def unique_id(r):
        s = orghash
        if not pk:
            s = hashlib.md5(s + recombinant_type + "-%d" % r['_id']).hexdigest()
        for k in pk:
            s = hashlib.md5(s + r[k].encode('utf-8')).hexdigest()
        return s

    out = []

    for r in records:
        unique = unique_id(r)

        shortform = None
        shortform_fr = None
        for e in org_detail['extras']:
            if e['key'] == 'shortform':
                shortform = e['value']
            elif e['key'] == 'shortform_fr':
                shortform_fr = e['value']

        solrrec = {
            'id': unique,
            'org_name_code': org_detail['name'],
            'org_name_en': org_detail['title'].split(' | ', 1)[0],
            'org_name_fr': org_detail['title'].split(' | ', 1)[-1],
            }

        for f in table['fields']:
            key = f['datastore_id']
            value = r[key]

            facet_range = f.get('solr_float_range_facet')
            if facet_range:
                try:
                    float_value = float(value)
                except ValueError:
                    pass
                else:
                    for i, fac in enumerate(facet_range):
                        if 'less_than' not in fac or float_value < fac['less_than']:
                            solrrec[key + '_range'] = str(i)
                            solrrec[key + '_range_en'] = fac['label'].split(' | ')[0]
                            solrrec[key + '_range_fr'] = fac['label'].split(' | ')[-1]
                            break

            if f.get('datastore_type') == 'date':
                try:
                    value = date2zulu(value)
                    # CM: If this only applies to PD types this should be accurate
                    # CM: This should only apply if valid (as per date2zulu) else NULL
                    if f.get('extract_date_year'):
                        solrrec['date_year'] = value.split('-', 1)[0]
                    if f.get('extract_date_month'):
                        solrrec['date_month'] = value.split('-')[1]
                except ValueError:
                    pass
            solrrec[key] = value

            choices = f.get('choices')
            if not choices:
                if 'choices_source' not in f:
                    continue
                choices = f['choices'] = extract_choices(f['choices_source'])

            if key.endswith('_code'):
                key = key[:-5]
            solrrec[key + '_en'] = choices.get(value, '').split(' | ')[0]
            solrrec[key + '_fr'] = choices.get(value, '').split(' | ')[-1]
        out.append(solrrec)

    conn.add_many(out, _commit=True)
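
unique_id() above derives a stable Solr document id by chaining md5 over the org hash and each primary-key value. A standalone sketch of the same chaining with made-up org and key values (written to run on either Python 2 or 3, so the strings are encoded before hashing):

import hashlib

def chained_id(org, pk_values):
    # start from the org hash, then fold each primary-key value in
    s = hashlib.md5(org.encode('utf-8')).hexdigest()
    for v in pk_values:
        s = hashlib.md5((s + v).encode('utf-8')).hexdigest()
    return s

print(chained_id(u'tbs-sct', [u'2016', u'Q1']))  # stable 32-character hex id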
Example #7
    def upload(self, id):
        package_type = self._get_package_type(id)
        t = get_table(package_type)
        expected_sheet_name = t['xls_sheet_name']

        try:
            lc = ckanapi.LocalCKAN(username=c.user)
            package = lc.action.package_show(id=id)
            owner_org = package['organization']['name']

            if request.POST['xls_update'] == u'':
                msg = _('You must provide a valid file')
                raise ValidationError({'xls_update': [msg]})

            upload_data = read_xls(request.POST['xls_update'].file)
            sheet_name, org_name = None, None
            try:
                sheet_name, org_name = next(upload_data)
            except:
                # XXX bare except because this can fail in all sorts of ways
                if asbool(config.get('debug', False)):
                    # on debug we want the real error
                    raise
                raise ValidationError({'xls_update':
                    [_("The server encountered a problem processing the file "
                    "uploaded. Please try copying your data into the latest "
                    "version of the template and uploading again. If this "
                    "problem continues, send your Excel file to "
                    "[email protected] so we may investigate.")]})

            if expected_sheet_name != sheet_name:
                raise ValidationError({'xls_update':
                    [_('Invalid file for this data type. ' +
                    'Sheet must be labeled "{0}", ' +
                    'but you supplied a sheet labeled "{1}"').format(
                        expected_sheet_name, sheet_name)]})

            # is this the right sheet for this organization?
            if org_name != owner_org:
                msg = _(
                    'Invalid sheet for this organization. ' +
                    'Sheet must be labeled for {0}, ' +
                    'but you supplied a sheet for {1}').format(
                        owner_org, org_name)
                raise ValidationError({'xls_update': [msg]})

            resource_id = package['resources'][0]['id']

            records = get_records(upload_data, t['fields'])

            method = 'upsert' if t.get('datastore_primary_key') else 'insert'
            try:
                lc.action.datastore_upsert(
                    method=method,
                    resource_id=resource_id,
                    records=records)
            except NotAuthorized, na:
                msg = _(
                    'You do not have permission to upload to {0}').format(
                        owner_org)
                raise ValidationError({'xls_update': [msg]})

            h.flash_success(_(
                "Your file was successfully uploaded into the central system."
                ))

            redirect(h.url_for(controller='package', action='read', id=id))