def get_datapreview_recombinant(dataset_type, res_id):
    """
    Render the WET datatable preview snippet for a recombinant resource.
    """
    from ckanext.recombinant.plugins import get_table
    chromo = get_table(dataset_type)

    # pass the table's default sort on to datastore_search when one is set
    search_kwargs = {}
    if 'default_preview_sort' in chromo:
        search_kwargs['sort'] = chromo['default_preview_sort']

    local_ckan = ckanapi.LocalCKAN(username=c.user)
    result = local_ckan.action.datastore_search(
        resource_id=res_id, limit=0, **search_kwargs)

    # bilingual labels are stored as "English / French"; pick by UI language
    current_lang = h.lang()
    labels = {}
    for field in chromo['fields']:
        parts = field['label'].split(' / ')
        labels[field['datastore_id']] = (
            parts[0] if current_lang == 'en' else parts[-1])

    ds_fields = [
        {'type': field['type'],
         'id': field['id'],
         'label': labels.get(field['id'], field['id'])}
        for field in result['fields']]

    return h.snippet('package/wet_datatable.html',
        resource_id=res_id, ds_fields=ds_fields)
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :ptype csv_file: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to
            at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file
    # corresponds: map each type to {column label -> datastore_id}
    schema_tables = dict((
            t,
            dict((f['label'], f['datastore_id'])
                for f in get_table(t)['fields']))
        for t in dataset_types)

    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(
        os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames
        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys())
                    and len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    # BUG FIX: previous check was `assert schema_cols > 0`, which compared a
    # list (or None) to an int -- only accidentally correct under Python 2
    # ordering rules and a TypeError under Python 3. Test the match result
    # explicitly instead.
    assert schema_cols is not None, (
        '{0:s} does not match any dataset type {1}'.format(
            csv_path, dataset_types))

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()   # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            # 'Org' column is dropped: org_id alone keys the batch
            org = row_dict.pop('Org')
            if org_id not in records:
                # new org encountered: flush the previous org's batch first
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []
            row_dict = dict((k, safe_for_solr(v))
                for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records
def _build_templates(self):
    """
    Implement build-templates command

    Reads CSV files (all args between the command name and the final
    output-path arg), groups rows by organization and writes them into
    per-organization .xls template workbooks, splitting into numbered
    files every SPLIT_XLS_ROWS rows.
    """
    lc = LocalCKAN()
    output_files = {}      # org_id -> open workbook (None once closed/failed)
    next_row = {}          # org_id -> next sheet row index to write
    output_counter = {}    # org_id -> number of files emitted for this org
    output_path = self.args[2:][-1]
    dataset_types = get_dataset_types(self.command_name)
    table = get_table(dataset_types[0])

    def close_write_file(org_id):
        # save workbook as "<org>-<n>.xls"; no-op if already closed/failed
        book = output_files[org_id]
        if not book:
            return
        book.save(os.path.join(output_path,
            org_id + '-' + str(output_counter[org_id]) + '.xls'))
        output_files[org_id] = None

    def out_file(org_id):
        # return (workbook, row) to write to; opens a fresh workbook when
        # the org is new or the current file reached SPLIT_XLS_ROWS
        if org_id in output_files:
            next_row[org_id] += 1
            # need to start a new file?
            if next_row[org_id] > SPLIT_XLS_ROWS:
                close_write_file(org_id)
            else:
                return output_files[org_id], next_row[org_id]
        try:
            org = lc.action.organization_show(
                id=org_id,
                include_data_batch=False)
        except NotFound:
            # BUG FIX: logging.error('org id', org_id, 'not found') passed
            # extra positional args to a message with no format placeholders,
            # which triggers a formatting error inside the logging module.
            logging.error('org id %s not found', org_id)
            output_files[org_id] = None
            next_row[org_id] = 0
            return None, None
        book = xls_template(dataset_types[0], org)
        output_files[org_id] = book
        output_counter[org_id] = output_counter.get(org_id, 0) + 1
        # first data row follows the rows pre-filled by the template
        next_row[org_id] = len(book.get_sheet(0).get_rows())
        return book, next_row[org_id]

    def add_row(book, row, d):
        # write one CSV record into the first sheet, one cell per field
        sheet = book.get_sheet(0)
        for i, f in enumerate(table['fields']):
            sheet.write(row, i, d[f['datastore_id']])

    for f in self.args[1:-1]:
        for d in DictReader(open(f, 'rb')):
            book, row = out_file(d['organization'])
            if not book:
                continue
            add_row(book, row, d)
    # flush any workbooks still open
    for org_id in output_files:
        close_write_file(org_id)
def _build_templates(self):
    """
    Build per-organization .xls template workbooks from CSV input files.

    NOTE(review): near-duplicate of the other _build_templates in this
    file, but hard-coded to DATASET_TYPE and reporting missing orgs via
    a Python 2 print statement instead of logging.
    """
    lc = LocalCKAN()
    output_files = {}      # org_id -> open workbook (None once closed/failed)
    next_row = {}          # org_id -> next sheet row index to write
    output_counter = {}    # org_id -> number of files emitted for this org
    output_path = self.args[2:][-1]
    table = get_table(DATASET_TYPE)

    def close_write_file(org_id):
        # save workbook as "<org>-<n>.xls"; no-op if already closed/failed
        book = output_files[org_id]
        if not book:
            return
        book.save(os.path.join(output_path,
            org_id + '-' + str(output_counter[org_id]) + '.xls'))
        output_files[org_id] = None

    def out_file(org_id):
        # return (workbook, row) to write to; falls through to open a new
        # workbook when the org is new or the current file hit the cap
        if org_id in output_files:
            next_row[org_id] += 1
            # need to start a new file?
            if next_row[org_id] > SPLIT_XLS_ROWS:
                close_write_file(org_id)
            else:
                return output_files[org_id], next_row[org_id]
        try:
            org = lc.action.organization_show(id=org_id,
                include_datasets=False)
        except NotFound:
            print 'org id', org_id, 'not found'
            output_files[org_id] = None
            next_row[org_id] = 0
            return None, None
        book = xls_template(DATASET_TYPE, org)
        output_files[org_id] = book
        output_counter[org_id] = output_counter.get(org_id, 0) + 1
        # first data row follows the rows pre-filled by the template
        next_row[org_id] = len(book.get_sheet(0).get_rows())
        return book, next_row[org_id]

    def add_row(book, row, d):
        # write one CSV record into the first sheet, one cell per field
        sheet = book.get_sheet(0)
        for i, f in enumerate(table['fields']):
            sheet.write(row, i, d[f['datastore_id']])

    # all args between the command name and the output path are CSV inputs
    for f in self.args[1:-1]:
        for d in DictReader(open(f, 'rb')):
            book, row = out_file(d['organization'])
            if not book:
                continue
            add_row(book, row, d)
    # flush any workbooks still open
    for org_id in output_files:
        close_write_file(org_id)
def _check_table_columns(self, res_id, dataset_type):
    """
    Compare the datastore columns of res_id against the columns that
    would be created for dataset_type; return a short error message on
    any mismatch, or None when everything lines up.
    """
    expected = get_table(dataset_type)['fields']
    try:
        search = ckanapi.LocalCKAN().action.datastore_search(
            resource_id=res_id, rows=0)
    except ckanapi.NotFound:
        return "table missing!"
    # drop the leading internal '_id' column before comparing
    actual = search['fields'][1:]
    if len(actual) != len(expected):
        return "wrong number of columns!"
    if any(a['id'] != e['datastore_id']
            for a, e in zip(actual, expected)):
        return "columns don't match: %s" % ' '.join(
            f['id'] for f in actual)
def _update_records(records, org_detail, conn, recombinant_type):
    """
    Update records on solr core

    :param records: record dicts
    :ptype records: sequence of record dicts

    :param org_detail: org structure as returned via local CKAN
    :ptype org_detail: dict with local CKAN org structure

    :param conn: solr connection
    :ptype conn: obj

    :param recombinant_type: type being updated
    """
    table = get_table(recombinant_type)
    pk = table.get('datastore_primary_key', [])
    if not isinstance(pk, list):
        pk = [pk]

    org = org_detail['name']
    orghash = hashlib.md5(org).hexdigest()

    def unique_id(r):
        # stable solr doc id: hash of org + primary key values, or of the
        # datastore _id when the table declares no primary key
        s = orghash
        if not pk:
            s = hashlib.md5(s + recombinant_type + "-%d" % r['_id']).hexdigest()
        for k in pk:
            s = hashlib.md5(s + r[k].encode('utf-8')).hexdigest()
        return s

    # PERF: org extras don't vary per record -- hoisted out of the record
    # loop (previously rescanned for every record).
    # NOTE(review): shortform/shortform_fr are extracted but never written
    # into solrrec; confirm whether they were meant to be indexed.
    shortform = None
    shortform_fr = None
    for e in org_detail['extras']:
        if e['key'] == 'shortform':
            shortform = e['value']
        elif e['key'] == 'shortform_fr':
            shortform_fr = e['value']

    out = []
    for r in records:
        solrrec = {
            'id': unique_id(r),
            'org_name_code': org_detail['name'],
            'org_name_en': org_detail['title'].split(' | ', 1)[0],
            'org_name_fr': org_detail['title'].split(' | ', 1)[-1],
            }

        for f in table['fields']:
            key = f['datastore_id']
            value = r[key]

            facet_range = f.get('solr_float_range_facet')
            if facet_range:
                try:
                    float_value = float(value)
                except ValueError:
                    pass
                else:
                    # first bucket whose upper bound exceeds the value;
                    # a bucket with no 'less_than' catches everything left
                    for i, fac in enumerate(facet_range):
                        if 'less_than' not in fac or float_value < fac['less_than']:
                            solrrec[key + '_range'] = str(i)
                            solrrec[key + '_range_en'] = fac['label'].split(' | ')[0]
                            solrrec[key + '_range_fr'] = fac['label'].split(' | ')[-1]
                            break

            if f.get('datastore_type') == 'date':
                try:
                    value = date2zulu(value)
                    # CM: If this only applies to PD types this should be accurate
                    # CM: This should only apply if valid (as per date2zulu) else NULL
                    if f.get('extract_date_year'):
                        solrrec['date_year'] = value.split('-', 1)[0]
                    if f.get('extract_date_month'):
                        solrrec['date_month'] = value.split('-')[1]
                except ValueError:
                    pass
            solrrec[key] = value

            choices = f.get('choices')
            if not choices:
                if 'choices_source' not in f:
                    continue
                # cache the extracted choices on the field dict so later
                # records (and calls) skip the extraction
                choices = f['choices'] = extract_choices(f['choices_source'])

            if key.endswith('_code'):
                key = key[:-5]
            # single lookup instead of two identical choices.get() calls
            choice_label = choices.get(value, '')
            solrrec[key + '_en'] = choice_label.split(' | ')[0]
            solrrec[key + '_fr'] = choice_label.split(' | ')[-1]
        out.append(solrrec)
    conn.add_many(out, _commit=True)
def upload(self, id): package_type = self._get_package_type(id) t = get_table(package_type) expected_sheet_name = t['xls_sheet_name'] try: lc = ckanapi.LocalCKAN(username=c.user) package = lc.action.package_show(id=id) owner_org = package['organization']['name'] if request.POST['xls_update'] == u'': msg = _('You must provide a valid file') raise ValidationError({'xls_update': [msg]}) upload_data = read_xls(request.POST['xls_update'].file) sheet_name, org_name = None, None try: sheet_name, org_name = next(upload_data) except: # XXX bare except because this can fail in all sorts of ways if asbool(config.get('debug', False)): # on debug we want the real error raise raise ValidationError({'xls_update': [_("The server encountered a problem processing the file " "uploaded. Please try copying your data into the latest " "version of the template and uploading again. If this " "problem continues, send your Excel file to " "[email protected] so we may investigate.")]}) if expected_sheet_name != sheet_name: raise ValidationError({'xls_update': [_('Invalid file for this data type. ' + 'Sheet must be labeled "{0}", ' + 'but you supplied a sheet labeled "{1}"').format( expected_sheet_name, sheet_name)]}) # is this the right sheet for this organization? if org_name != owner_org: msg = _( 'Invalid sheet for this organization. ' + 'Sheet must be labeled for {0}, ' + 'but you supplied a sheet for {1}').format( owner_org, org_name) raise ValidationError({'xls_update': [msg]}) resource_id = package['resources'][0]['id'] records = get_records(upload_data, t['fields']) method = 'upsert' if t.get('datastore_primary_key') else 'insert' try: lc.action.datastore_upsert( method=method, resource_id=resource_id, records=records) except NotAuthorized, na: msg = _( 'You do not have permission to upload to {0}').format( owner_org) raise ValidationError({'xls_update': [msg]}) h.flash_success(_( "Your file was successfully uploaded into the central system." 
)) redirect(h.url_for(controller='package', action='read', id=id))