Example #1
    def upload(self, id):
        package_type = self._get_package_type(id)
        geno = get_geno(package_type)
        lc = ckanapi.LocalCKAN(username=c.user)
        dataset = lc.action.package_show(id=id)
        try:
            if request.POST['xls_update'] == '':
                raise BadExcelData('You must provide a valid file')

            _process_upload_file(
                lc,
                dataset,
                request.POST['xls_update'].file,
                geno)

            h.flash_success(_(
                "Your file was successfully uploaded into the central system."
                ))

            redirect(h.url_for(controller='package', action='read', id=id))
        except BadExcelData, e:
            org = lc.action.organization_show(id=dataset['owner_org'])
            return self.preview_table(
                resource_name=dataset['resources'][0]['name'],
                owner_org=org['name'],
                errors=[e.message])
Example #2
def recombinant_get_geno(dataset_type):
    """
    Get the dataset definition (geno) for the given dataset type
    """
    try:
        return get_geno(dataset_type)
    except RecombinantException:
        return
Example #3
    def schema_json(self, dataset_type):
        try:
            geno = get_geno(dataset_type)
        except RecombinantException:
            abort(404, _('Recombinant dataset_type not found'))

        schema = OrderedDict()
        for k in ['dataset_type', 'title', 'notes']:
            if k in geno:
                schema[k] = geno[k]

        schema['resources'] = []
        for chromo in geno['resources']:
            resource = OrderedDict()
            schema['resources'].append(resource)
            choice_fields = dict(
                (f['datastore_id'], f['choices'])
                for f in recombinant_choice_fields(
                    chromo['resource_name'],
                    all_languages=True))

            for k in ['title', 'resource_name']:
                if k in chromo:
                    resource[k] = chromo[k]

            resource['fields'] = []
            for field in chromo['fields']:
                if not field.get('visible_to_public', True):
                    continue
                fld = OrderedDict()
                resource['fields'].append(fld)
                fld['id'] = field['datastore_id']
                for k in ['label', 'description', 'obligation', 'format_type']:
                    if k in field:
                        fld[k] = field[k]

                if fld['id'] in choice_fields:
                    choices = OrderedDict()
                    fld['choices'] = choices
                    for ck, cv in choice_fields[fld['id']]:
                        choices[ck] = cv

            resource['primary_key'] = chromo['datastore_primary_key']

            if 'examples' in chromo:
                ex_record = chromo['examples']['record']
                example = OrderedDict()
                for field in chromo['fields']:
                    if field['datastore_id'] in ex_record:
                        example[field['datastore_id']] = ex_record[
                            field['datastore_id']]
                resource['example_record'] = example

        response.headers['Content-Type'] = 'application/json'
        response.headers['Content-Disposition'] = (
            'inline; filename="{0}.json"'.format(
                dataset_type))
        return json.dumps(schema, indent=2, ensure_ascii=False).encode('utf-8')
Example #4
    def preview_table(self, id, resource_id):
        lc = ckanapi.LocalCKAN(username=c.user)
        dataset = lc.action.package_show(id=id)
        try:
            get_geno(dataset['type'])
        except RecombinantException:
            abort(404, _('Recombinant dataset_type not found'))

        for r in dataset['resources']:
            if r['id'] == resource_id:
                break
        else:
            abort(404, _('Resource not found'))

        return render('recombinant/resource_edit.html', extra_vars={
            'dataset': dataset,
            'resource': r,
            })
Example #5
def rebuild(command_name, csv_files=None, solr_url=None, strict=True):
    """
    Implement rebuild command

    :param csv_files: sequence of paths to .csv files for input
    :type csv_files: sequence of str

    :return: Nothing
    :rtype: None
    """
    clear_index(command_name, solr_url, False)

    conn = solr_connection(command_name, solr_url)
    lc = LocalCKAN()
    if csv_files:
        for csv_file in csv_files:
            print csv_file + ':'
            prev_org = None
            unmatched = None
            firstpart, filename = os.path.split(csv_file)
            assert filename.endswith('.csv')
            resource_name = filename[:-4]

            chromo = get_chromo(resource_name)
            geno = get_geno(chromo['dataset_type'])

            for org_id, records in csv_data_batch(csv_file,
                                                  chromo,
                                                  strict=strict):
                records = [
                    dict((k, safe_for_solr(v)) for k, v in row_dict.items())
                    for row_dict in records
                ]
                if org_id != prev_org:
                    unmatched = None
                try:
                    org_detail = lc.action.organization_show(id=org_id)
                except NotFound:
                    continue
                print "    {0:s} {1}".format(org_id, len(records))
                unmatched = _update_records(records, org_detail, conn,
                                            resource_name, unmatched)
    else:
        for org in lc.action.organization_list():
            count = 0
            org_detail = lc.action.organization_show(id=org)
            unmatched = None
            for resource_name, records in data_batch(org_detail['id'], lc,
                                                     command_name):
                unmatched = _update_records(records, org_detail, conn,
                                            resource_name, unmatched)
                count += len(records)
            print org, count

    print "commit"
    conn.commit()
Example #6
    def data_dictionary(self, dataset_type):
        try:
            geno = get_geno(dataset_type)
        except RecombinantException:
            abort(404, _('Recombinant dataset_type not found'))

        book = excel_data_dictionary(geno)
        blob = StringIO()
        book.save(blob)
        response.headers['Content-Type'] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
        return blob.getvalue()
Example #7
    def _rebuild(self, csv_files=None, solr_url=None, strict=True):
        """
        Implement rebuild command

        :param csv_files: sequence of paths to .csv files for input
        :type csv_files: sequence of str

        :return: Nothing
        :rtype: None
        """
        self._clear_index(solr_url, False)

        conn = solr_connection('ati', solr_url)
        lc = LocalCKAN()
        if csv_files:
            for csv_file in csv_files:
                print csv_file + ':'
                firstpart, filename = os.path.split(csv_file)
                assert filename.endswith('.csv')
                resource_name = filename[:-4]

                chromo = get_chromo(resource_name)
                geno = get_geno(chromo['dataset_type'])
                assert geno.get('target_dataset') == TARGET_DATASET

                for org_id, records in csv_data_batch(csv_file,
                                                      chromo,
                                                      strict=strict):
                    records = [
                        dict((k, safe_for_solr(v))
                             for k, v in row_dict.items())
                        for row_dict in records
                    ]
                    try:
                        org_detail = lc.action.organization_show(id=org_id)
                    except NotFound:
                        continue
                    print "    {0:s} {1}".format(org_id, len(records))
                    _update_records(records, org_detail, conn)
        else:
            for org_id in lc.action.organization_list():
                count = 0
                org_detail = lc.action.organization_show(id=org_id)
                for resource_name, records in data_batch(
                        org_detail['id'], lc, TARGET_DATASET):
                    _update_records(records, org_detail, conn)
                    count += len(records)
                print org_id, count

        print "commit"
        conn.commit()
Example #8
def rebuild(command_name, csv_files=None, solr_url=None):
    """
    Implement rebuild command

    :param csv_files: sequence of paths to .csv files for input
    :type csv_files: sequence of str

    :return: Nothing
    :rtype: None
    """
    clear_index(command_name, solr_url, False)

    conn = solr_connection(command_name, solr_url)
    lc = LocalCKAN()
    if csv_files:
        for csv_file in csv_files:
            print csv_file + ':'
            prev_org = None
            unmatched = None
            firstpart, filename = os.path.split(csv_file)
            assert filename.endswith('.csv')
            resource_name = filename[:-4]

            chromo = get_chromo(resource_name)
            geno = get_geno(chromo['dataset_type'])

            for org_id, records in csv_data_batch(csv_file, chromo):
                records = [
                    dict((k, safe_for_solr(v)) for k, v in row_dict.items())
                    for row_dict in records
                ]
                if org_id != prev_org:
                    unmatched = None
                try:
                    org_detail = lc.action.organization_show(id=org_id)
                except NotFound:
                    continue
                print "    {0:s} {1}".format(org_id, len(records))
                unmatched = _update_records(
                    records, org_detail, conn, resource_name, unmatched)
    else:
        for org in lc.action.organization_list():
            count = 0
            org_detail = lc.action.organization_show(id=org)
            unmatched = None
            for resource_name, records in data_batch(org_detail['id'], lc, command_name):
                unmatched = _update_records(
                    records, org_detail, conn, resource_name, unmatched)
                count += len(records)
            print org, count

    print "commit"
    conn.commit()
Example #9
def excel_template(dataset_type, org):
    """
    return an openpyxl.Workbook object containing the sheet and header fields
    for passed dataset_type and org. Supports version 2 and 3 templates.
    """
    geno = get_geno(dataset_type)
    version = geno.get('template_version', 2)

    book = openpyxl.Workbook()
    sheet = book.active
    refs = []
    choice_ranges = []
    for rnum, chromo in enumerate(geno['resources'], 1):
        if version == 2:
            _populate_excel_sheet_v2(sheet, chromo, org, refs)
        elif version == 3:
            choice_ranges.append(_populate_excel_sheet(
                sheet, geno, chromo, org, refs, rnum))
            sheet.protection.enabled = True
            sheet.protection.formatRows = False
            sheet.protection.formatColumns = False
        sheet = book.create_sheet()

    if version == 2:
        _populate_reference_sheet_v2(sheet, chromo, refs)
    elif version == 3:
        _populate_reference_sheet(sheet, geno, refs)
    sheet.title = 'reference'
    sheet.protection.enabled = True

    if version == 2:
        return book

    for i, (chromo, cranges) in enumerate(
            zip(geno['resources'], choice_ranges), 1):
        sheet = book.create_sheet()
        _populate_excel_e_sheet(sheet, chromo, cranges)
        sheet.title = 'e{i}'.format(i=i)
        sheet.protection.enabled = True
        sheet.sheet_state = 'hidden'

        sheet = book.create_sheet()
        _populate_excel_r_sheet(sheet, chromo)
        sheet.title = 'r{i}'.format(i=i)
        sheet.protection.enabled = True
        sheet.sheet_state = 'hidden'
    return book
Example #10
    def _rebuild(self, csv_files=None, solr_url=None):
        """
        Implement rebuild command

        :param csv_files: sequence of paths to .csv files for input
        :type csv_files: sequence of str

        :return: Nothing
        :rtype: None
        """
        self._clear_index(solr_url, False)

        conn = solr_connection('ati', solr_url)
        lc = LocalCKAN()
        if csv_files:
            for csv_file in csv_files:
                print csv_file + ':'
                firstpart, filename = os.path.split(csv_file)
                assert filename.endswith('.csv')
                resource_name = filename[:-4]

                chromo = get_chromo(resource_name)
                geno = get_geno(chromo['dataset_type'])
                assert geno.get('target_dataset') == TARGET_DATASET

                for org_id, records in csv_data_batch(csv_file, chromo):
                    records = [
                        dict((k, safe_for_solr(v)) for k, v in row_dict.items())
                        for row_dict in records
                    ]
                    try:
                        org_detail = lc.action.organization_show(id=org_id)
                    except NotFound:
                        continue
                    print "    {0:s} {1}".format(org_id, len(records))
                    _update_records(records, org_detail, conn)
        else:
            for org_id in lc.action.organization_list():
                count = 0
                org_detail = lc.action.organization_show(id=org_id)
                for resource_name, records in data_batch(org_detail['id'], lc, TARGET_DATASET):
                    _update_records(records, org_detail, conn)
                    count += len(records)
                print org_id, count

        print "commit"
        conn.commit()
Example #11
def data_batch(org_id, lc, target_dataset):
    """
    Generator of dataset record batches for the organization with id org_id

    :param org_id: the id of the organization of interest
    :type org_id: str
    :param lc: local CKAN instance
    :type lc: obj
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :type target_dataset: str

    :return: generates batches of dataset dict records
    :rtype: generator of lists of dicts
    """
    dataset_types = get_dataset_types()
    for dataset_type in dataset_types:
        geno = get_geno(dataset_type)
        if geno.get('target_dataset') == target_dataset:
            break
    else:
        return

    result = lc.action.package_search(
        q="type:{0:s} owner_org:{1:s}".format(dataset_type, org_id),
        rows=2)['results']

    if not result:
        return
    if len(result) != 1:
        sys.stderr.write('1 record expected for %s %s, found %d\n' %
                         (dataset_type, org_id, len(result)))

    dataset = result[0]
    for resource in dataset['resources']:
        offset = 0
        while True:
            rval = lc.action.datastore_search(
                resource_id=resource['id'],
                limit=BATCH_SIZE,
                offset=offset)
            records = rval['records']
            if not records:
                break
            offset += len(records)
            yield records
Example #12
def _action_find_dataset(context, data_dict):
    '''
    common code for actions that need to check for a dataset based on
    the dataset type and organization name or id
    '''
    dataset_type = get_or_bust(data_dict, 'dataset_type')
    owner_org = get_or_bust(data_dict, 'owner_org')

    try:
        geno = get_geno(dataset_type)
    except RecombinantException:
        raise ValidationError(
            {'dataset_type': _("Recombinant dataset type not found")})

    lc = LocalCKAN(username=context['user'])
    result = lc.action.package_search(q="type:%s organization:%s" %
                                      (dataset_type, owner_org),
                                      rows=2)
    return lc, geno, result['results']
Example #13
def _action_find_dataset(context, data_dict):
    '''
    common code for actions that need to check for a dataset based on
    the dataset type and organization name or id
    '''
    dataset_type = get_or_bust(data_dict, 'dataset_type')
    owner_org = get_or_bust(data_dict, 'owner_org')

    try:
        geno = get_geno(dataset_type)
    except RecombinantException:
        raise ValidationError({'dataset_type':
            _("Recombinant dataset type not found")})

    lc = LocalCKAN(username=context['user'])
    result = lc.action.package_search(
        q="type:%s organization:%s" % (dataset_type, owner_org),
        rows=2)
    return lc, geno, result['results']
Example #14
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset record batches from a CSV file

    :param csv_path: file to parse

    :return: yields (org_id, records) tuples, each holding at most
             BATCH_SIZE (dict) records for a single organization
    :rtype: generator
    """
    records = []
    current_owner_org = None

    firstpart, filename = os.path.split(csv_path)
    assert filename.endswith(".csv")

    chromo = get_chromo(filename[:-4])
    geno = get_geno(chromo["dataset_type"])
    assert geno.get("target_dataset") == target_dataset

    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames

        expected = [f["datastore_id"] for f in chromo["fields"]]
        assert cols[:-2] == expected, "column mismatch:\n{0}\n{1}".format(cols[:-2], expected)

        for row_dict in csv_in:
            owner_org = row_dict.pop("owner_org")
            owner_org_title = row_dict.pop("owner_org_title")
            if owner_org != current_owner_org:
                if records:
                    yield (current_owner_org, records)
                records = []
                current_owner_org = owner_org

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records.append(row_dict)
            if len(records) >= BATCH_SIZE:
                yield (current_owner_org, records)
                records = []
    if records:
        yield (current_owner_org, records)
Example #15
def excel_template(dataset_type, org):
    """
    return an openpyxl.Workbook object containing the sheet and header fields
    for passed dataset_type and org.
    """
    geno = get_geno(dataset_type)

    book = openpyxl.Workbook()
    sheet = book.active
    refs = []
    for chromo in geno['resources']:
        _populate_excel_sheet(sheet, chromo, org, refs)
        sheet = book.create_sheet()

    ref = sheet
    ref.title = 'reference'
    ref.append([u'field', u'key', u'value'])
    for ref_line in refs:
        ref.append(ref_line)
    return book
Example #16
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    """
    records = []
    current_owner_org = None

    firstpart, filename = os.path.split(csv_path)
    assert filename.endswith('.csv')
    resource_name = filename[:-4]

    chromo = get_chromo(resource_name)
    geno = get_geno(chromo['dataset_type'])
    assert geno.get('target_dataset') == target_dataset

    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames

        expected = [f['datastore_id'] for f in chromo['fields']]
        assert cols[:-2] == expected, 'column mismatch:\n{0}\n{1}'.format(
            cols[:-2], expected)

        for row_dict in csv_in:
            owner_org = row_dict.pop('owner_org')
            owner_org_title = row_dict.pop('owner_org_title')
            if owner_org != current_owner_org:
                if records:
                    yield (resource_name, current_owner_org, records)
                records = []
                current_owner_org = owner_org

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records.append(row_dict)
            if len(records) >= BATCH_SIZE:
                yield (resource_name, current_owner_org, records)
                records = []
    if records:
        yield (resource_name, current_owner_org, records)
Example #17
    def _show(self, dataset_type, org_name):
        """
        Display some information about the status of recombinant datasets
        """
        orgs = [org_name] if org_name else self._get_orgs()
        types = [dataset_type] if dataset_type else get_dataset_types()

        for dtype in types:
            print u'{geno[title]} ({dtype})'.format(
                geno=get_geno(dtype), dtype=dtype).encode('utf-8')

            packages = self._get_packages(dtype, orgs)
            if dataset_type:
                for p in packages:
                    print p['owner_org']
                    if 'error' in p:
                        print '  *** {p[error]}'.format(p=p)
                    elif not p['metadata_correct']:
                        print '  ! metadata needs to be updated'
                    for r in p['resources']:
                        print ' - id:{r[id]} {r[name]}'.format(r=r),
                        if 'error' in r:
                            print '    *** {r[error]}'.format(r=r)
                        else:
                            print 'rows:{r[datastore_rows]}'.format(r=r)
                            if not r['datastore_correct']:
                                print '   ! datastore needs to be updated'
                            if not r['metadata_correct']:
                                print '   ! metadata needs to be updated'

            if len(packages) != len(orgs):
                print (' > %d orgs but %d records found' %
                       (len(orgs), len(packages)))
            else:
                print (' > %d datasets found' % (len(packages),))
            need_update = sum(1 for p in packages if not p['all_correct'])
            if need_update:
                print (' --> %d need to be updated' % need_update)
Example #18
    def _show(self, dataset_type, org_name):
        """
        Display some information about the status of recombinant datasets
        """
        orgs = [org_name] if org_name else self._get_orgs()
        types = [dataset_type] if dataset_type else get_dataset_types()

        for dtype in types:
            print u'{geno[title]} ({dtype})'.format(
                geno=get_geno(dtype), dtype=dtype).encode('utf-8')

            packages = self._get_packages(dtype, orgs)
            if dataset_type:
                for p in packages:
                    print p['owner_org']
                    if 'error' in p:
                        print '  *** {p[error]}'.format(p=p)
                    elif not p['metadata_correct']:
                        print '  ! metadata needs to be updated'
                    for r in p['resources']:
                        print ' - id:{r[id]} {r[name]}'.format(r=r),
                        if 'error' in r:
                            print '    *** {r[error]}'.format(r=r)
                        else:
                            print 'rows:{r[datastore_rows]}'.format(r=r)
                            if not r['datastore_correct']:
                                print '   ! datastore needs to be updated'
                            if not r['metadata_correct']:
                                print '   ! metadata needs to be updated'

            if len(packages) != len(orgs):
                print(' > %d orgs but %d records found' %
                      (len(orgs), len(packages)))
            else:
                print(' > %d datasets found' % (len(packages), ))
            need_update = sum(1 for p in packages if not p['all_correct'])
            if need_update:
                print(' --> %d need to be updated' % need_update)
Example #19
    def upload(self, id):
        package_type = self._get_package_type(id)
        geno = get_geno(package_type)
        lc = ckanapi.LocalCKAN(username=c.user)
        dataset = lc.action.package_show(id=id)
        try:
            if request.POST['xls_update'] == '':
                raise BadExcelData('You must provide a valid file')

            _process_upload_file(lc, dataset, request.POST['xls_update'].file,
                                 geno)

            h.flash_success(
                _("Your file was successfully uploaded into the central system."
                  ))

            redirect(h.url_for(controller='package', action='read', id=id))
        except BadExcelData, e:
            org = lc.action.organization_show(id=dataset['owner_org'])
            return self.preview_table(
                resource_name=dataset['resources'][0]['name'],
                owner_org=org['name'],
                errors=[e.message])
Example #20
    def upload(self, id):
        package_type = self._get_package_type(id)
        geno = get_geno(package_type)
        lc = ckanapi.LocalCKAN(username=c.user)
        dataset = lc.action.package_show(id=id)
        try:
            if request.POST['xls_update'] == '':
                raise BadExcelData('You must provide a valid file')

            _process_upload_file(
                lc,
                dataset,
                request.POST['xls_update'].file,
                geno)

            h.flash_success(_(
                "Your file was successfully uploaded into the central system."
                ))

            redirect(h.url_for(controller='package', action='read', id=id))
        except BadExcelData, e:
            x_vars = {'errors': [e.message], 'action': 'edit'}
            c.pkg_dict = dataset
            return render(self._edit_template(package_type), extra_vars=x_vars)
Example #21
    def _dataset_types(self, dataset_types):
        for t in self._expand_dataset_types():
            print t + ': ' + ' '.join(
                c['resource_name'] for c in get_geno(t)['resources'])
Example #22
    def _dataset_types(self, dataset_types):
        for t in self._expand_dataset_types():
            print t + ': ' + ' '.join(c['resource_name']
                                      for c in get_geno(t)['resources'])