Code example #1
File: dataset.py  Project: TkTech/ckanext-canada
def csv_data_batch(csv_path, target_dataset):
    """
    Generator of dataset records from csv file

    :param csv_path: file to parse
    :ptype csv_path: str
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return: a batch of records for at most one organization
    :rtype: dict mapping at most one org-id to
            at most BATCH_SIZE (dict) records
    """
    dataset_types = get_dataset_types(target_dataset)
    # Use JSON schema to discover the dataset type to which the file corresponds
    schema_tables = dict((
            t,
            dict((f['label'], f['datastore_id'])
                for f in get_table(t)['fields']))
        for t in dataset_types)
    records = {}
    schema_cols = None
    cols = None
    csv_path = os.path.abspath(os.path.expandvars(os.path.expanduser(csv_path)))
    if os.path.islink(csv_path):
        csv_path = os.readlink(csv_path)
    with open(csv_path) as f:
        csv_in = DictReader(f)
        cols = csv_in.unicode_fieldnames

        for k, v in schema_tables.iteritems():
            if (len(set(v.keys()).intersection(set(cols))) == len(v.keys()) and
                    len(cols) == len(v.keys()) + 2):
                # columns represent all schema data fields + 'Org id', 'Org'
                schema_cols = [v[col] if col in v else col for col in cols]
                break

    assert schema_cols, '{0:s} does not match any dataset type {1}'.format(
        csv_path, dataset_types)

    with open(csv_path) as f:
        # use new dict, each col named for its corresponding JSON datastore_id
        csv_in = DictReader(f, fieldnames=schema_cols)
        csv_in.next()   # skip header row: no new info
        for row_dict in csv_in:
            org_id = row_dict.pop('Org id')
            org = row_dict.pop('Org')
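            # new organization: flush the batch accumulated for the previous one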
            if org_id not in records:
                if len(records.keys()):
                    org_id_done = records.keys()[0]
                    yield {org_id_done: records.pop(org_id_done)}
                records[org_id] = []

            row_dict = dict((k, safe_for_solr(v)) for k, v in row_dict.items())
            records[org_id].append(row_dict)
            if len(records[org_id]) >= BATCH_SIZE:
                yield {org_id: records.pop(org_id)}
    yield records
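
A minimal sketch of how this generator might be consumed; the file path, dataset name and the loop below are assumptions for illustration, not project code:

# Hypothetical caller: each yielded batch maps one org id to its rows.
for batch in csv_data_batch('/path/to/ati.csv', 'ati'):
    for org_id, rows in batch.items():
        print('%s: %d records' % (org_id, len(rows)))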
Code example #2
File: pd.py  Project: TkTech/ckanext-canada
    def _build_templates(self):
        """
        Implement build-templates command
        """
        lc = LocalCKAN()
        output_files = {}
        next_row = {}
        output_counter = {}
        output_path = self.args[2:][-1]
        dataset_types = get_dataset_types(self.command_name)
        table = get_table(dataset_types[0])

        def close_write_file(org_id):
            book = output_files[org_id]
            if not book:
                return
            book.save(os.path.join(output_path,
                org_id + '-' + str(output_counter[org_id]) + '.xls'))
            output_files[org_id] = None

        def out_file(org_id):
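            # Return (workbook, next row index) for this org, opening a new
            # numbered workbook once SPLIT_XLS_ROWS rows have been written.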
            if org_id in output_files:
                next_row[org_id] += 1
                # need to start a new file?
                if next_row[org_id] > SPLIT_XLS_ROWS:
                    close_write_file(org_id)
                else:
                    return output_files[org_id], next_row[org_id]
            try:
                org = lc.action.organization_show(
                    id=org_id, include_data_batch=False)
            except NotFound:
                logging.error('org id %s not found', org_id)
                output_files[org_id] = None
                next_row[org_id] = 0
                return None, None
            book = xls_template(dataset_types[0], org)
            output_files[org_id] = book
            output_counter[org_id] = output_counter.get(org_id, 0) + 1
            next_row[org_id] = len(book.get_sheet(0).get_rows())
            return book, next_row[org_id]

        def add_row(book, row, d):
            sheet = book.get_sheet(0)
            for i, f in enumerate(table['fields']):
                sheet.write(row, i, d[f['datastore_id']])

        for f in self.args[1:-1]:
            for d in DictReader(open(f, 'rb')):
                book, row = out_file(d['organization'])
                if not book:
                    continue
                add_row(book, row, d)

        for org_id in output_files:
            close_write_file(org_id)
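
close_write_file saves each workbook under output_path as '<org id>-<file counter>.xls'. A minimal sketch of that naming scheme, with made-up values:

import os

output_path = '/tmp/templates'   # assumed output directory
org_id, counter = 'tbs-sct', 2   # hypothetical organization id and file counter
print(os.path.join(output_path, org_id + '-' + str(counter) + '.xls'))
# -> /tmp/templates/tbs-sct-2.xls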
Code example #3
def data_batch(org_id, lc, target_dataset):
    """
    Generator of dataset record batches for the organization with id org_id

    :param org_id: the id for the organization of interest
    :ptype org_id: str
    :param lc: local CKAN
    :ptype lc: obj
    :param target_dataset: name of target dataset (e.g., 'ati', 'pd', etc.)
    :ptype target_dataset: str

    :return: generates batches of dataset dict records
    :rtype: batch of dataset dict records
    """
    dataset_types = get_dataset_types(target_dataset)

    for dataset_type in dataset_types:
        records = {}
        result = lc.action.package_search(
            q="type:{0:s} owner_org:{1:s}".format(dataset_type, org_id),
            rows=1000)['results']
        if len(result) == 0:
            yield records
        else:
            try:
                resource_id = result[0]['resources'][0]['id']
            except (IndexError, KeyError):
                continue
            offset = 0
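            # page through the datastore resource BATCH_SIZE records at a time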
            while True:
                rval = lc.action.datastore_search(
                    resource_id=resource_id,
                    limit=BATCH_SIZE,
                    offset=offset)
                records = rval['records']
                if not records:
                    break
                yield records
                offset += len(records)
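
A minimal usage sketch, assuming LocalCKAN comes from ckanapi and using a made-up organization id:

from ckanapi import LocalCKAN

lc = LocalCKAN()
# Hypothetical values; each batch is a list of datastore records (dicts).
for batch in data_batch('tbs-sct', lc, 'ati'):
    for record in batch:
        pass  # e.g. index or transform the record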
Code example #4
    def _dataset_types(self, target_datasets):
        if len(target_datasets) == 0:
            target_datasets = get_target_datasets()
        for target_ds in target_datasets:
            print target_ds + ': ' + ' '.join(get_dataset_types(target_ds))