def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :ptype csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
            t,
            dict((f['label'], f['datastore_id'])
                for f in get_table(t)['fields']))
        for t in dataset_types)

    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)

    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames
        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys())
                    and len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()  # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
            if org_id not in records:
                if len(records.keys()):
                    # yield the previous organization's batch before starting a new one
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records
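
# Illustrative usage sketch, not part of the original module: drain the
# csv_data_batch generator one organization batch at a time. The helper name
# and the way the batches are summarized here are hypothetical examples.
def _summarize_csv_batches(csv_path, target_dataset):
    for org_batch in csv_data_batch(csv_path, target_dataset):
        # each yielded value maps at most one org id to a list of row dicts
        for org_id, rows in org_batch.items():
            print org_id, len(rows)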
def _build_templates(self):
    """
    Implement build-templates command
    """
    lc = LocalCKAN()
    output_files = {}
    next_row = {}
    output_counter = {}
    output_path = self.args[2:][-1]
    dataset_types = get_dataset_types(self.command_name)
    table = get_table(dataset_types[0])

    def close_write_file(org_id):
        book = output_files[org_id]
        if not book:
            return
        book.save(os.path.join(output_path,
            org_id + '-' + str(output_counter[org_id]) + '.xls'))
        output_files[org_id] = None

    def out_file(org_id):
        if org_id in output_files:
            next_row[org_id] += 1
            # need to start a new file?
            if next_row[org_id] > SPLIT_XLS_ROWS:
                close_write_file(org_id)
            else:
                return output_files[org_id], next_row[org_id]
        try:
            org = lc.action.organization_show(
                id=org_id,
                include_data_batch=False)
        except NotFound:
            logging.error('org id %s not found', org_id)
            output_files[org_id] = None
            next_row[org_id] = 0
            return None, None
        book = xls_template(dataset_types[0], org)
        output_files[org_id] = book
        output_counter[org_id] = output_counter.get(org_id, 0) + 1
        next_row[org_id] = len(book.get_sheet(0).get_rows())
        return book, next_row[org_id]

    def add_row(book, row, d):
        sheet = book.get_sheet(0)
        for i, f in enumerate(table['fields']):
            sheet.write(row, i, d[f['datastore_id']])

    for f in self.args[1:-1]:
        for d in DictReader(open(f, 'rb')):
            book, row = out_file(d['organization'])
            if not book:
                continue
            add_row(book, row, d)

    for org_id in output_files:
        close_write_file(org_id)
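
# Note inferred from the argument slicing in _build_templates above (not stated
# in the source): self.args appears to be laid out as
#   [<sub-command>, <input csv 1>, ..., <input csv N>, <output directory>]
# so self.args[1:-1] are the input CSV files and the final element is the
# directory where the per-organization .xls workbooks are written.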
def data_batch(org_id, lc, target_dataset):
    """
    Generator of dataset record batches for the organization with id org_id

    :param org_id: the id for the organization of interest
    :ptype org_id: str
    :param lc: local CKAN
    :ptype lc: obj
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return: generates batches of dataset dict records
    :rtype: batch of dataset dict records
    """
    dataset_types = get_dataset_types(target_dataset)

    for dataset_type in dataset_types:
        records = {}
        result = lc.action.package_search(
            q="type:{0:s} owner_org:{1:s}".format(dataset_type, org_id),
            rows=1000)['results']

        if len(result) == 0:
            yield records
        else:
            try:
                resource_id = result[0]['resources'][0]['id']
            except (IndexError, KeyError):
                continue
            offset = 0
            while True:
                rval = lc.action.datastore_search(
                    resource_id=resource_id,
                    limit=BATCH_SIZE,
                    offset=offset)
                records = rval['records']
                if not records:
                    break
                yield records
                offset += len(records)
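
# Illustrative usage sketch, not part of the original module: total up the
# datastore records held for one organization. LocalCKAN() mirrors its use in
# _build_templates above; the helper name is hypothetical.
def _count_org_records(org_id, target_dataset):
    lc = LocalCKAN()
    total = 0
    for batch in data_batch(org_id, lc, target_dataset):
        # each batch is either an empty dict (no matching package) or a list of records
        total += len(batch)
    return total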
def _dataset_types(self, target_datasets):
    if len(target_datasets) == 0:
        target_datasets = get_target_datasets()
    for target_ds in target_datasets:
        print target_ds + ': ' + ' '.join(get_dataset_types(target_ds))