def get_records(rows, fields):
    """
    Truncate/pad empty/missing records to expected row length, canonicalize
    cell content, and return resulting record list.

    Rows are adjusted in place: surplus trailing cells that are None or ''
    are popped, and non-empty rows shorter than ``fields`` are padded with
    None. An empty row is left empty and yields an empty record.

    :param rows: iterable producing rows (mutable lists) of cell content
    :type rows: generator
    :param fields: collection of fields specified in JSON schema; each field
        must provide 'datastore_id' and 'datastore_type'
    :type fields: list or tuple

    :return: canonicalized records, one dict per row keyed by datastore_id
    :rtype: list of dicts

    Raises BadExcelData, prefixed with the spreadsheet row number, when a
    cell cannot be canonicalized
    """
    records = []
    for n, row in enumerate(rows):
        # trailing cells might be empty: trim row to fit
        while (row and (len(row) > len(fields))
               and (row[-1] is None or row[-1] == '')):
            row.pop()
        while row and (len(row) < len(fields)):
            row.append(None)  # placeholder: canonicalize once only, below

        try:
            records.append(
                dict((f['datastore_id'], _canonicalize(v, f['datastore_type']))
                     for f, v in zip(fields, row)))
        except BadExcelData as e:
            # re-raise with the row number as the user sees it in the sheet
            raise BadExcelData('Row %d: ' % (n + HEADER_ROWS + 1) + e.message)
    # the original fell off the end here and implicitly returned None
    return records
def _canonicalize(dirty, dstore_tag):
    """
    Normalize a raw cell value read through xlrd into the form expected
    for the recombinant.json datastore type named by dstore_tag.

    :param dirty: raw cell content as read through xlrd
    :type dirty: object
    :param dstore_tag: datastore_type specifier in (JSON) schema for cell
    :type dstore_tag: str

    :return: the type's default for None/blank cells, otherwise a unicode
        rendering of the cell (0 when a numeric column's cell holds no digits)
    :rtype: unicode (or whatever the type's default / 0 is)

    Raises BadExcelData on formula cells
    """
    spec = datastore_type[dstore_tag]

    # missing cell: use the default for this type
    if dirty is None:
        return spec.default

    if isinstance(dirty, (float, int)):
        if spec.numeric:
            return unicode(dirty)  # FIXME ckan2.1 datastore?-- float(dirty)
        # JSON specifies text or money: the original content is a numeric
        # string. If xlrd presented it as a float with a spurious .0,
        # drop the fraction before rendering it as a string.
        whole = int(dirty)
        if whole == dirty:
            return unicode(whole)
        return unicode(dirty)

    # string content that trims to empty: use the default for this type
    if isinstance(dirty, basestring) and dirty.strip() == '':
        return spec.default

    if spec.numeric:
        # numeric column whose cell is not a float/int: drop a trailing
        # decimal part, then keep only the digits of what remains
        without_fraction = re.sub(r'\.[0-9 ]+$', '', unicode(dirty))
        digits = re.sub(r'[^0-9]', '', without_fraction)
        if digits:
            return unicode(digits)  # FIXME ckan2.1 datastore?-- float(dirty)
        return 0

    if spec.tag == 'money':
        # User has overridden Excel format string, probably adding currency
        # markers or digit group separators (e.g.,fr-CA uses 1$ (not $1)).
        # Drop any trailing decimal digits, keep the integer part, and
        # return it as a numeric string.
        without_fraction = re.sub(r'\.[0-9 ]+$', '', unicode(dirty))
        digits = re.sub(r'[^0-9]', '', without_fraction)
        return unicode(digits)

    if spec.tag == 'date' and isinstance(dirty, datetime):
        return u'%04d-%02d-%02d' % (dirty.year, dirty.month, dirty.day)

    text = unicode(dirty)
    if text.startswith('='):
        raise BadExcelData('Formulas are not supported')
    return text
# Example #3
# 0
def canonicalize(dirty, dstore_tag):
    """
    Canonicalize dirty input from xlrd to align with
    recombinant.json datastore type specified in dstore_tag.

    :param dirty: dirty cell content as read through xlrd
    :type dirty: object
    :param dstore_tag: datastore_type specifier in (JSON) schema for cell
    :type dstore_tag: str

    :return: Canonicalized cell input
    :rtype: float or unicode

    Raises BadExcelData on formula cells
    """
    dtype = datastore_type[dstore_tag]
    if dirty is None:
        return dtype.default
    elif isinstance(dirty, (float, int, long)):
        return unicode(dirty)

    elif isinstance(dirty, basestring) and not dirty.strip():
        # Content trims to empty: default
        return dtype.default
    elif not dtype.numeric:
        if dtype.tag == 'money':
            # User has overridden Excel format string, probably adding currency
            # markers or digit group separators (e.g.,fr-CA uses 1$ (not $1)).
            # Accept only "DDDDD.DD", discard other characters
            dollars, sep, cents = unicode(dirty).rpartition('.')
            return re.sub(ur'[^0-9]', '', dollars) + sep + re.sub(ur'[^0-9]', '', cents)
        elif dtype.tag == 'date' and isinstance(dirty, datetime):
            return u'%04d-%02d-%02d' % (dirty.year, dirty.month, dirty.day)

        if unicode(dirty).startswith('='):
            raise BadExcelData('Formulas are not supported')
        return unicode(dirty)

    # dirty is numeric: truncate trailing decimal digits, retain int part
    canon = re.sub(r'[^0-9]', '', unicode(dirty).split('.')[0])
    if not canon:
        return 0
    return unicode(canon) # FIXME ckan2.1 datastore?-- float(dirty)
    def upload(self, id):
        """
        Load the spreadsheet posted as 'xls_update' into dataset ``id``.

        On success a flash message is queued and the request is redirected
        to the package 'read' page. When BadExcelData is raised, the
        preview table for the dataset's first resource is returned with
        the error message attached.

        :param id: dataset id passed to package_show and the redirect URL
        """
        geno = get_geno(self._get_package_type(id))
        lc = ckanapi.LocalCKAN(username=c.user)
        dataset = lc.action.package_show(id=id)
        try:
            posted = request.POST['xls_update']
            # the field compares equal to '' when no file was submitted
            if posted == '':
                raise BadExcelData('You must provide a valid file')

            _process_upload_file(lc, dataset, posted.file, geno)

            success_text = _(
                "Your file was successfully uploaded into the central system.")
            h.flash_success(success_text)

            read_url = h.url_for(controller='package', action='read', id=id)
            redirect(read_url)
        except BadExcelData as e:
            org = lc.action.organization_show(id=dataset['owner_org'])
            first_resource = dataset['resources'][0]
            return self.preview_table(
                resource_name=first_resource['name'],
                owner_org=org['name'],
                errors=[e.message])
# Example #5
# 0
    def upload(self, id):
        """
        Load the spreadsheet posted as 'xls_update' into dataset ``id``.

        On success a flash message is queued and the request is redirected
        to the package 'read' page. When BadExcelData is raised, the edit
        template for the package type is rendered with the error message
        and c.pkg_dict set to the dataset.

        :param id: dataset id passed to package_show and the redirect URL
        """
        package_type = self._get_package_type(id)
        geno = get_geno(package_type)
        lc = ckanapi.LocalCKAN(username=c.user)
        dataset = lc.action.package_show(id=id)
        try:
            posted = request.POST['xls_update']
            # the field compares equal to '' when no file was submitted
            if posted == '':
                raise BadExcelData('You must provide a valid file')

            _process_upload_file(lc, dataset, posted.file, geno)

            success_text = _(
                "Your file was successfully uploaded into the central system.")
            h.flash_success(success_text)

            read_url = h.url_for(controller='package', action='read', id=id)
            redirect(read_url)
        except BadExcelData as e:
            template_vars = {'errors': [e.message], 'action': 'edit'}
            c.pkg_dict = dataset
            edit_template = self._edit_template(package_type)
            return render(edit_template, extra_vars=template_vars)
def _process_upload_file(lc, dataset, upload_file, geno):
    """
    Use lc.action.datastore_upsert to load data from upload_file

    raises BadExcelData on errors.
    """
    owner_org = dataset['organization']['name']

    expected_sheet_names = dict((resource['name'], resource['id'])
                                for resource in dataset['resources'])

    upload_data = read_excel(upload_file)
    while True:
        try:
            sheet_name, org_name, column_names, rows = next(upload_data)
        except StopIteration:
            return
        except:
            # XXX bare except because this can fail in all sorts of ways
            if asbool(config.get('debug', False)):
                # on debug we want the real error
                raise
            raise BadExcelData(
                _("The server encountered a problem processing the file "
                  "uploaded. Please try copying your data into the latest "
                  "version of the template and uploading again. If this "
                  "problem continues, send your Excel file to "
                  "[email protected] so we may investigate."))

        if sheet_name not in expected_sheet_names:
            raise BadExcelData(
                _('Invalid file for this data type. ' +
                  'Sheet must be labeled "{0}", ' +
                  'but you supplied a sheet labeled "{1}"').format(
                      '"/"'.join(sorted(expected_sheet_names)), sheet_name))

        if org_name != owner_org:
            raise BadExcelData(
                _('Invalid sheet for this organization. ' +
                  'Sheet must be labeled for {0}, ' +
                  'but you supplied a sheet for {1}').format(
                      owner_org, org_name))

        # custom styles or other errors cause columns to be read
        # that actually have no data. strip them here to avoid error below
        while column_names[-1] is None:
            column_names.pop()

        chromo = get_chromo(sheet_name)
        expected_columns = [f['datastore_id'] for f in chromo['fields']]
        if column_names != expected_columns:
            raise BadExcelData(
                _("This template is out of date. "
                  "Please try copying your data into the latest "
                  "version of the template and uploading again. If this "
                  "problem continues, send your Excel file to "
                  "[email protected] so we may investigate."))

        records = get_records(rows, chromo['fields'])
        method = 'upsert' if chromo.get('datastore_primary_key') else 'insert'
        try:
            lc.action.datastore_upsert(
                method=method,
                resource_id=expected_sheet_names[sheet_name],
                records=records,
            )
        except ValidationError as e:
            # because, where else would you put the error text?
            # XXX improve this in datastore, please
            pgerror = e.error_dict['info']['orig'][0].decode('utf-8')
            # remove some postgres-isms that won't help the user
            # when we render this as an error in the form
            pgerror = re.sub(ur'\nLINE \d+:', u'', pgerror)
            pgerror = re.sub(ur'\n *\^\n$', u'', pgerror)
            raise BadExcelData(
                _(u"Error while importing data: {0}").format(pgerror))