def mirror_ckan(source, target, api_key, dryrun, update):
    """Copy every dataset listed on the source CKAN to the target CKAN.

    Each dataset is fetched from ``source``, given a ``prov_alternateOf``
    extra that points back at its page on the source site, echoed to
    stdout, and then (unless ``dryrun``) created on or pushed to ``target``.

    Args:
        source: API base location of the CKAN to read from. Any '/api'
            substring is removed to build the dataset page URL.
        target: API base location of the CKAN to write to.
        api_key: API key handed to the target client.
        dryrun: When true, only print what was found; the target is never
            queried or modified and the fetched dicts keep their ids.
        update: When true, a dataset that already exists on the target is
            overwritten with the source's description; otherwise it is
            skipped with a NOTE line.

    Returns:
        None
    """
    sourceCKAN = ckanclient.CkanClient(base_location=source)
    targetCKAN = ckanclient.CkanClient(base_location=target, api_key=api_key)

    indent = '    '
    # Loop-invariant: root URL of the source site without the API suffix.
    source_site = source.replace('/api', '')

    for name in sourceCKAN.package_register_get():

        sourceCKAN.package_entity_get(name)  # Get the dataset description.
        dataset = sourceCKAN.last_message

        # Would like to assert two alternates (by id and by name), but the
        # extras model is limiting, so only the name-based one is recorded.
        alt_name = source_site + '/dataset/' + dataset['name']
        dataset.setdefault('extras', {})['prov_alternateOf'] = alt_name

        if not dryrun:
            # The source's id must not be sent along to the target.
            dataset.pop('id', None)

        print(name + ' ' + dataset['name'])
        if 'download_url' in dataset:
            print(indent + 'download_url: ' + dataset['download_url'])
        if 'url' in dataset:
            print(indent + 'url:          ' + dataset['url'])
        for resource in dataset['resources']:
            if not dryrun:
                resource.pop('id', None)
            if 'url' in resource:
                print(indent + 'resource:     ' + resource['url'])
                # A resource may carry no format at all; print it as empty.
                print(indent + 'format:       ' +
                      (resource.get('format') or ''))
                # Formats seen on healthdata.gov:
                #    CSV Text XLS XML Feed Query API Widget RDF

        if dryrun:
            continue

        # Only the existence probe sits inside the try, so a NotFound
        # raised by the update itself is not mistaken for "not listed".
        try:
            targetCKAN.package_entity_get(dataset['name'])
        except ckanclient.CkanApiNotFoundError:
            # Dataset is not listed on this CKAN
            print('INFO: adding ' + dataset['name'] + ' to ' + target)
            try:
                targetCKAN.package_register_post(dataset)  # POST
            except ckanclient.CkanApiConflictError:
                print('WARNING: ' + 'Conflict error when trying to POST ' +
                      dataset['name'])
        else:
            if update:
                # Update target's existing entry from source's
                targetCKAN.package_entity_put(dataset)
            else:
                print('NOTE: skipping ' + dataset['name'] + ' ' +
                      'b/c already listed at ' + target)
Exemple #2
0
def publish_to_ckan():
    """Create the dataset on CKAN, or update it when it already exists.

    The client is stored in the module-level ``ckan_client``. The dataset
    name is ``args.ckan_dataset_name_prefix + args.dataset_name``. After
    publishing, the dataset version is bumped unless ``args.increment`` is
    the string "none".

    Returns:
        None
    """
    global ckan_client

    ckan_client = ckanclient.CkanClient(base_location=args.ckan_api,
                                        api_key=args.ckan_api_key)

    remote_name = args.ckan_dataset_name_prefix + args.dataset_name
    existing = get_remote_dataset(remote_name)

    if existing is not None:
        # Already on CKAN: refresh it.
        update_dataset(existing)
    else:
        # Not on CKAN yet: create it under the prefixed name.
        create_dataset(remote_name)

    # Bumping the version also causes the last modified date to be updated.
    if args.increment != "none":
        update_dataset_version()
Exemple #3
0
def update_dataset_version():
    """Bump the version number of the configured dataset on CKAN.

    The dataset name is ``args.ckan_dataset_name_prefix + args.dataset_name``
    and the new version is whatever ``increment_version`` returns for the
    current version and ``args.increment``. A dataset that CKAN reports as
    missing is only logged.

    Returns:
        None
    """
    logger.info('Updating CKAN dataset version')

    client = ckanclient.CkanClient(base_location=args.ckan_api,
                                   api_key=args.ckan_api_key)
    target_name = args.ckan_dataset_name_prefix + args.dataset_name

    try:
        entity = client.package_entity_get(target_name)
        entity['version'] = increment_version(entity['version'],
                                              args.increment)
        # Push the modified description back to CKAN.
        client.package_entity_put(entity)
    except ckanclient.CkanApiNotFoundError:
        logger.info(" Dataset " + target_name + " not found on OpenColorado")
Exemple #4
0
def searcher(valid_ids, invalid_ids, apikey, server, times=1, count=50):
    """Run randomised OR-queries against a CKAN server and log the outcome.

    Each round draws between 5 and ``count`` ids from ``valid_ids`` and the
    same from ``invalid_ids``, joins them with ' OR ' and runs a single
    ``package_search``. The number of hits and the elapsed time are logged
    on success; a failed search is logged as an error and the loop goes on.

    Args:
        valid_ids: Population of ids to sample the "valid" part from.
        invalid_ids: Population of ids to sample the "invalid" part from.
        apikey: API key passed to the CKAN client.
        server: Base location of the CKAN API.
        times: Number of search rounds to run.
        count: Upper bound for each random sample size (lower bound is 5).

    Returns:
        None

    Raises:
        ValueError: From ``random`` when ``count`` is below 5 or a drawn
            sample size exceeds the size of its population.
    """
    import time
    import random
    import ckanclient

    for _ in range(times):
        valid = random.randint(5, count)
        invalid = random.randint(5, count)

        samples = random.sample(valid_ids, valid)
        samples.extend(random.sample(invalid_ids, invalid))

        # The timer deliberately includes client construction.
        started = time.time()
        ckan = ckanclient.CkanClient(base_location=server, api_key=apikey)
        opts = {'offset': 0, 'limit': 0}
        q = ' OR '.join(samples)

        try:
            search_results = ckan.package_search(q, opts)
            datasets = list(search_results['results'])
            log.info("%d items found from %d ids in %s" %
                     (len(datasets), valid + invalid, time.time() - started))
        except Exception:
            # Catch Exception rather than everything, so that Ctrl-C and
            # SystemExit still stop the run.
            log.error(
                "Search failed with %d valid and %d invalid items in query" % (
                    valid,
                    invalid,
                ))
Exemple #5
0
def main():
    """Export every package of a CKAN instance to a flat CSV file.

    Each package yields one row per resource (package fields plus
    ``resource_<prop>`` columns) or a single row when it has no resources.
    Extras are flattened into ``extras_<key>`` columns. The nested
    ``extras`` and ``resources`` structures themselves are not written.
    Columns are the sorted union of all row keys.

    Reads ``args.url`` and ``args.outfile`` from the module-level parser.

    Returns:
        None
    """
    args = parser.parse_args()
    client = ckanclient.CkanClient(args.url)
    rows = []
    for pkg_name in client.package_register_get():
        pkg = client.package_entity_get(pkg_name)

        # Flatten the extras dict into top-level 'extras_<key>' columns.
        for extra, value in pkg.pop('extras', {}).items():
            pkg['extras_' + extra] = value

        # Detach the resource list before copying the package, so the raw
        # list does not end up as a column of the per-resource rows; pop()
        # also tolerates packages that have no 'resources' key.
        resources = pkg.pop('resources', [])
        for resource in resources:
            rpkg = pkg.copy()
            for resprop, value in resource.items():
                rpkg['resource_' + resprop] = value
            rows.append(rpkg)
        if not resources:
            rows.append(pkg)
        print(pkg_name)

    # Sorted so the column order is the same on every run.
    headers = set()
    for row in rows:
        headers.update(row)
    headers = sorted(headers)

    # The context manager closes the file even if a row fails to serialise.
    with open(args.outfile, 'wb') as fh:
        writer = csv.DictWriter(fh, headers)
        writer.writerow(dict(zip(headers, headers)))  # header line
        for row in rows:
            row_ = {}
            for column, value in row.items():
                # The Python 2 csv module writes bytes, so encode text.
                if isinstance(value, unicode):
                    value = value.encode('utf-8')
                row_[column] = value
            writer.writerow(row_)
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL, api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = False
     self.start_key = ''
Exemple #7
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = True
     self.start_key = ''
     # Tags stored on the instance for this source.
     self.tags = [u'twitter', u'микроблоги']
Exemple #8
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = True
     self.start_key = ''
     # Tags stored on the instance for this source.
     self.tags = [u'youtube', u'архивы', u'видеоканалы']
Exemple #9
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = True
     self.start_key = ''
     # Tags stored on the instance for this source.
     self.tags = [u'ЕГЭ', u'экзамены', u'статистика']
# Resource fields to inspect, each with the lower-case substrings that mark
# the resource as a shapefile.
_SHAPEFILE_HINTS = (
    ('mimetype', ('shp',)),
    ('mimetype_inner', ('shp',)),
    ('format', ('shp', 'shapefile')),
    ('name', ('shp', 'shapefile')),
    ('description', ('shp', 'shapefile')),
)


def _is_shapefile_resource(resource):
    """Return True when any inspected field of *resource* mentions a shapefile."""
    for field, needles in _SHAPEFILE_HINTS:
        # .get(): a resource may lack optional fields such as mimetype_inner.
        value = resource.get(field)
        if value and any(needle in value.lower() for needle in needles):
            return True
    return False


def process_ckan_datasets():
    """Download and reproject every shapefile resource listed on the CKAN host.

    Walks all packages of the CKAN instance at the module-level
    ``ckan_host``, prints progress for each one, and hands every resource
    that looks like a shapefile to ``download_shapefile`` followed by
    ``reproject_shapefile``.

    Returns:
        None
    """
    # Initialize the CKAN client
    ckan_client = ckanclient.CkanClient(base_location=ckan_host)

    package_id_list = ckan_client.package_register_get()
    package_total = str(len(package_id_list))

    for index, package_id in enumerate(package_id_list):

        # Get the package details
        package = ckan_client.package_entity_get(package_id)

        # Get the package name (slug)
        package_name = package['name']

        print("------------------------------")
        print("Processing dataset " + str(index) + " of " + package_total +
              ": " + package_name)
        print("Created: " + package['metadata_created'] + ", modified: " +
              package['metadata_modified'])

        shapefile_found = False
        for resource in package['resources']:
            if not _is_shapefile_resource(resource):
                continue

            shapefile_found = True

            print("Shapefile found!  Attepting download...")

            # Download the shapefile from the resource URL, then reproject it.
            shapefile = download_shapefile(package_name, resource["url"])
            reproject_shapefile(package_name, shapefile)

        if not shapefile_found:
            print("No shapefile found.")
Exemple #11
0
def urispace_of_dataset(ckan_loc='http://datahub.io',
                        dataset_name='2000-us-census-rdf'):
    """Print the 'namespace' extra of a CKAN dataset, if it has one.

    Args:
        ckan_loc: Root URL of the CKAN site; '/api' is appended to it.
        dataset_name: Name of the dataset to look up.

    Returns:
        None
    """
    client = ckanclient.CkanClient(base_location=ckan_loc + '/api')
    description = client.package_entity_get(dataset_name)

    # e.g. u'extras': {u'namespace': u'http://www.rdfabout.com/rdf/usgov/geo/'
    if 'extras' not in description:
        return

    extras = description['extras']
    if 'namespace' in extras:
        print(extras['namespace'])
Exemple #12
0
def run(directory):
    url = 'http://iatiregistry.org/api'
    registry = ckanclient.CkanClient(base_location=url)
    for pkg_name in registry.package_register_get():
        pkg = registry.package_entity_get(pkg_name)
        for resource in pkg.get('resources', []):
            print resource.get('url')
            try:
                save_file(pkg_name, resource.get('url'), directory)
            except Exception, e:
                print "Failed:", e
Exemple #13
0
def index(request):
    out = []
    # Instantiate the CKAN client.
    #ckan = ckanclient.CkanClient(base_location='http://open.alberta.ca/api')
    ckan = ckanclient.CkanClient(base_location='https://datahub.io/api')
    #ckan = ckanclient.CkanClient(base_location='http://opendata.aragon.es/api')
    i = 0
    out = {}
    out['nodes'] = []
    out['links'] = []
    title = []
    org = []

    # Get the package list.
    package_list = ckan.package_register_get()
    for pack in package_list:
        if i < 15:
            ckan.package_entity_get(pack)
            package_entity = ckan.last_message
            if package_entity.has_key('organization'):
                #print True
                title.append(package_entity['title'])
                org.append(package_entity['organization']['title'])
            i += 1
        else:
            break

    for ti in title:
        nodos = ast.literal_eval('{"name":"' + ti.encode('utf-8') +
                                 '","group":"uno"}')
        out['nodes'].append(nodos)

    j = 0
    for res in org:
        k = 0
        for lov in org:
            if res == lov:
                links = ast.literal_eval('{"source":' + str(j) + ',"target":' +
                                         str(k) + ',"weight":1}')
                out['links'].append(links)
                print res, " ", j, ",", k
            k += 1
        j += 1

    os.getcwd()
    os.path.exists("profundidad_ckan")
    with open('profundidad_ckan' + '/static/profundidad_ckan/data.json',
              'w') as fs:
        json.dump(out, fs)

    return render(request, "index.html", {
        'out': out['nodes'],
        'tam': len(package_list)
    })
Exemple #14
0
 def __init__(self,
              base_location,
              api_key=None,
              is_remote=True,
              ckan_version=2.2):
     """Create both a ckanclient and a ckanapi handle for one CKAN site.

     Args:
         base_location: CKAN location handed to both client libraries.
         api_key: Optional API key handed to both client libraries.
         is_remote: Not used by this constructor.
         ckan_version: Not used by this constructor.
     """
     agent = 'CkanApiScript (+http://TBD)'
     self.ckan_target = ckanclient.CkanClient(base_location, api_key)
     self.ckanapi = ckanapi.RemoteCKAN(base_location,
                                       apikey=api_key,
                                       user_agent=agent)
     # TODO: support the local API
Exemple #15
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = True
     self.start_key = ''
     # Tags and fixed package fields stored on the instance for this source.
     self.tags = [
         u'политика', u'выборы', u'политические партии', u'финансы'
     ]
     self.package_keys = {'govbody': u'ЦИК России'}
Exemple #16
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = True
     self.start_key = ''
     # Tags and fixed package fields stored on the instance for this source.
     self.tags = [
         u'минюст россии',
         u'статистика',
     ]
     self.package_keys = {'govbody': u'Минюст России'}
Exemple #17
0
    def __init__(self, ckan_host, store, talisuser, talispassword):
        """Open the CKAN and Talis connections used by this object.

        Args:
            ckan_host: CKAN host name or URL. 'http://' is prepended when no
                http/https scheme is present, and '/api' is always appended.
            store: First argument passed to ``talis.TalisLogin.init``.
            talisuser: Second argument passed to ``talis.TalisLogin.init``.
            talispassword: Third argument passed to ``talis.TalisLogin.init``.
        """
        api_key = None  # the CKAN client is created without an API key

        # ckan connection
        # Accept https:// as well; testing only for 'http://' turned an
        # https host into 'http://https://...'.
        if not ckan_host.startswith(('http://', 'https://')):
            ckan_host = 'http://' + ckan_host
        ckan_host = ckan_host + '/api'
        self.ckan = ckanclient.CkanClient(base_location=ckan_host,
                                          api_key=api_key)

        # talis connection
        talis.TalisLogin.init(store, talisuser, talispassword)
        self._talis = talis.Talis()
Exemple #18
0
    def __init__(self):
        """Load the API key, build the CKAN client and fetch the package list.

        Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
        NOTE(review): the key is used exactly as read, including any trailing
        newline -- confirm the key file has none.
        """
        # Context manager so the key file is closed even if read() fails.
        with open(API_KEY_FILENAME) as key_file:
            self.apikey = key_file.read()
        self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                          api_key=self.apikey)
        self.package_list = self.ckan.package_register_get()
        self.started = True
        self.start_key = ''
        # Tags and feed type stored on the instance for this source.
        self.tags = [
            u'фас россии', 'RSS', u'новости', u'официально', u'госсайты'
        ]
        self.feedtype = 'RSS'
Exemple #19
0
def load_registry(url='http://iatiregistry.org/api'):
    import ckanclient
    transactions = []
    registry = ckanclient.CkanClient(base_location=url)
    for pkg_name in registry.package_register_get():
        pkg = registry.package_entity_get(pkg_name)
        for resource in pkg.get('resources', []):
            print resource.get('url')
            try:
                transactions.extend(
                    load_file(resource.get('url'),
                              {'registry_package': pkg_name}))
            except Exception, e:
                print "Failed:", e
Exemple #20
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``API_KEY_FILENAME`` and connects to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open(API_KEY_FILENAME) as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()
     self.started = True
     self.start_key = ''
     # Tags and fixed package fields stored on the instance for this source.
     self.tags = [
         u'ЕГЭ', u'экзамены', u'статистика', u'удмуртская республика'
     ]
     self.package_keys = {
         'region_code': u'18',
         'region': u'Удмуртская республика'
     }
    def command(self):
        """Validate datasets, either local ones or those of a remote CKAN.

        With no arguments, every local dataset is normalised and fed to a
        ``schema_checker.SchemaChecker``; the dataset count is then stored
        under the 'general' key of its redis client. With the arguments
        ``remote <endpoint>``, the remote datasets are fetched in pages of
        up to 1000, validated, and the rendered result is written out. Any
        other argument combination does nothing beyond the initial setup.
        """
        super(SchemaChecker, self)._load_config()
        context = self.create_context()

        data = {
            'field_paths': defaultdict(int),
            'broken_rules': defaultdict(dict),
            'datasets_per_portal': defaultdict(set),
            'invalid_datasets': 0,
            'valid_datasets': 0
        }

        argc = len(self.args)

        if argc == 0:
            # Local run: replace the context created above.
            context = {
                'model': model,
                'session': model.Session,
                'ignore_auth': True
            }

            validator = schema_checker.SchemaChecker()

            num_datasets = 0
            for position, dataset in enumerate(iterate_local_datasets(context)):
                print('Processing dataset %s' % position)
                normalize_action_dataset(dataset)
                validator.process_record(dataset)
                num_datasets = position + 1

            validator.redis_client.set('general',
                                       {'num_datasets': num_datasets})

        elif argc == 2 and self.args[0] == 'remote':
            ckan = ckanclient.CkanClient(base_location=self.args[1])

            rows = 1000
            total = self.get_dataset_count(ckan)
            steps = int(ceil(total / float(rows)))
            final_step = steps - 1

            for step in range(steps):
                if step == final_step:
                    # The last page only holds the remainder.
                    rows = total - (step * rows)

                page = self.get_datasets(ckan, rows, step)
                self.validate_datasets(page, data)

            self.write_validation_result(self.render_template(data))
Exemple #22
0
def iterate_remote_datasets(endpoint, max_rows=1000):
    """Yield every dataset record of a remote CKAN, fetched page by page.

    The total number of datasets is read from a one-row ``package_search``.
    The records are then requested in consecutive windows of at most
    ``max_rows`` and yielded one at a time.

    Args:
        endpoint: API base location of the remote CKAN.
        max_rows: Maximum number of records requested per page.

    Yields:
        Each entry of the ``results`` list of every page, in page order.
    """
    ckan = ckanclient.CkanClient(base_location=endpoint)

    print('Retrieve total number of datasets')
    total = ckan.action('package_search', rows=1)['count']

    # Step by absolute offset. Deriving the offset from the page size went
    # wrong on the last page, where the page size shrinks to the remainder.
    for start in range(0, total, max_rows):
        rows = min(max_rows, total - start)
        print('Retrieve datasets %s - %s' % (start + 1, start + rows))

        page = ckan.action('package_search', rows=rows, start=start)
        for record in page['results']:
            yield record
Exemple #23
0
def dump_ckan_to_pickle(keyfile):
    """Pickle the tags, packages and groups of a CKAN into one file.

    Three objects are dumped, in this order, to
    ``pickled_ckan_contents.pk1`` in the working directory: the tag list,
    a dict of package name -> package description, and a dict of group
    name -> group description.

    Args:
        keyfile: Passed to ``read_keys``. The first returned mapping must
            provide 'url' (to which 'api' is appended) and 'apikey'.

    Returns:
        An empty tuple.
    """
    # Connect. Only the CKAN credentials are needed here.
    ckankeys, _googlekeys = read_keys(keyfile)

    # The context manager closes the file even if a CKAN request fails.
    with open("pickled_ckan_contents.pk1", "wb") as fout:
        ckan = ckanclient.CkanClient(base_location=ckankeys['url'] + 'api',
                                     api_key=ckankeys['apikey'])

        # tag list; -1 forces pickle to use the highest protocol available
        tag_list = ckan.tag_register_get()
        pickle.dump(tag_list, fout, -1)

        # packages
        package_entities = {}
        package_list = ckan.package_register_get()
        print(package_list)
        for package_name in package_list:
            ckan.package_entity_get(package_name)
            package_entities[package_name] = ckan.last_message
        pickle.dump(package_entities, fout, -1)

        # groups
        groups = {}
        group_list = ckan.group_register_get()
        print(group_list)
        for group_name in group_list:
            groups[group_name] = ckan.group_entity_get(group_name)
        pickle.dump(groups, fout, -1)

    return ()
def update_dataset_version():
    """Bump the minor version of the catalog dataset on CKAN.

    The dataset name is ``ckan_dataset_prefix + args.catalog_dataset`` and
    the new version is whatever ``increment_minor_version`` returns for the
    current one. A dataset that CKAN reports as missing is only reported
    through ``info``.
    """
    client = ckanclient.CkanClient(base_location=ckan_api,
                                   api_key=ckan_api_key)
    target_name = ckan_dataset_prefix + args.catalog_dataset

    try:
        entity = client.package_entity_get(target_name)
        entity['version'] = increment_minor_version(entity['version'])
        # Push the modified description back to CKAN.
        client.package_entity_put(entity)
    except ckanclient.CkanApiNotFoundError:
        info(" Dataset " + target_name + " not found on OpenColorado")
import os, json

import ckanclient  # see https://github.com/okfn/ckanclient README
# Get latest download URL from http://pypi.python.org/pypi/ckanclient#downloads --\/
# sudo easy_install http://pypi.python.org/packages/source/c/ckanclient/ckanclient-0.10.tar.gz

# See also https://github.com/timrdf/DataFAQs/wiki/CKAN
#    section "Automatically publish dataset on CKAN"

# API base locations of the CKAN to read from and the CKAN to write to.
source = 'http://hub.healthdata.gov/api'
target = 'http://aquarius.tw.rpi.edu/projects/healthdata/api'

# NOTE(review): neither flag is read in the visible part of this script.
MIRROR = False  # Modify target CKAN with listings from source CKAN.
UPDATE = MIRROR and False  # If a dataset already exists in target, update it.

sourceCKAN = ckanclient.CkanClient(base_location=source)
# Raises KeyError when the environment variable is not set.
api_key = os.environ['X_CKAN_API_Key']  # api_key must be defined to POST/PUT.
targetCKAN = ckanclient.CkanClient(base_location=target, api_key=api_key)

indent = '    '
for name in sourceCKAN.package_register_get():

    # Only the single dataset named below is processed.
    if name == 'hospital-compare':

        sourceCKAN.package_entity_get(name)  # Get the dataset description.
        dataset = sourceCKAN.last_message

        # URLs of the dataset's page on the source site, by id and by name.
        # NOTE(review): altID is not used in the visible part of this script.
        altID = source.replace('/api', '') + '/dataset/' + dataset['id']
        altName = source.replace('/api', '') + '/dataset/' + dataset['name']
        dataset['extras']['prov_alternateOf'] = altName
        # Would like to assert two alternates, but their model is limiting.
Exemple #26
0
# Get latest download URL from http://pypi.python.org/pypi/ckanclient#downloads --\/
# sudo easy_install http://pypi.python.org/packages/source/c/ckanclient/ckanclient-0.10.tar.gz

# See also https://github.com/timrdf/DataFAQs/wiki/CKAN
#    section "Automatically publish dataset on CKAN"

#source = 'http://hub.healthdata.gov/api'
# API base location of the CKAN whose datasets are inspected.
target = 'http://healthdata.tw.rpi.edu/hub/api'

# NOTE(review): neither flag is read in the visible part of this script.
MIRROR = False  # Modify target CKAN with listings from source CKAN.
UPDATE = MIRROR and False  # If a dataset already exists in target, update it.

#sourceCKAN = ckanclient.CkanClient(base_location=target)
# Raises KeyError when the environment variable is not set.
api_key = os.environ['X_CKAN_API_Key']  # api_key must be defined to POST/PUT.
# NOTE(review): this writes the secret API key to stdout -- consider removing.
print api_key
ckan = ckanclient.CkanClient(base_location=target, api_key=api_key)

indent = '    '
for name in ckan.package_register_get():
    print name
    ckan.package_entity_get(name)  # Get the dataset description.
    dataset = ckan.last_message
    # Index the resources by name; a later duplicate name replaces an earlier one.
    resources = dict([(r['name'], r) for r in dataset['resources']])

    # Datasets with a "Data Dictionary" extra but no resource of that name.
    if "Data Dictionary" not in resources and "Data Dictionary" in dataset[
            'extras']:
        print "datadict"
        ddURL = dataset['extras']['Data Dictionary']
        # Text after the last '.' of the URL, treated as the format;
        # anything longer than five characters is discarded.
        ddFormat = ddURL.split(".")[-1]
        if len(ddFormat) > 5:
            ddFormat = None
Exemple #27
0
    s = dateutil.parser.parse(s)
    return s


def shorten(longURL):
    """Return TinyURL's response body for shortening *longURL*.

    Args:
        longURL: The URL to shorten (byte string or unicode).

    Returns:
        The raw body returned by the TinyURL ``api-create.php`` endpoint.
    """
    # Percent-encode the URL before embedding it as a query value.
    # Otherwise any '&', '#' or '+' inside it is read as part of the request
    # itself and the URL is truncated or altered.
    if isinstance(longURL, unicode):
        # urllib.quote cannot handle non-ASCII unicode; encode it first.
        longURL = longURL.encode('utf-8')
    encoded = urllib.quote(longURL, safe='')

    f = urllib.urlopen("http://tinyurl.com/api-create.php?url=%s" % encoded)
    try:
        return f.read()
    finally:
        f.close()


# NOTE(review): `now` is not used in the visible part of this script.
now = datetime.datetime.now()
ckan = ckanclient.CkanClient(base_location='http://dati.trentino.it/api')
package_list = ckan.package_register_get()
# Semicolon-separated CSV with every field quoted; the header row comes first.
with open('ckan_packages.csv', 'wb') as csvfile:
    csvoutput = csv.writer(csvfile, delimiter=';', quoting=csv.QUOTE_ALL)
    csvoutput.writerow([
        "name", "author", "maintainer", "url", "metadata_created",
        "metadata_modified", "dayaftercreation"
    ])
    for package in package_list:
        # The package description is read from the client's last_message.
        ckan.package_entity_get(package)
        package_entity = ckan.last_message
        # Announcement text containing the title and a shortened link
        # (one shorten() call, i.e. one HTTP request, per package).
        message = "Pubblicato oggi il dataset %s %s #opendatatrentino" % (
            package_entity['title'], shorten(package_entity['ckan_url']))
        maintainer = package_entity['maintainer']
        ckanurl = package_entity['ckan_url']
        #name = package_entity('name')
Exemple #28
0
def main():
    """Query a CKAN instance and dump matching IDs, facets and statistics.

    Behaviour is driven by the options returned from ``get_args``:

    * a request ending in ``list`` prints the list returned by CKAN and
      exits;
    * ``package_search`` pages through all records matching the optional
      community (group) and free-text pattern. It writes one line per
      record (the ID plus the requested facet values) to ``args.output``.
      When facets were requested, it also writes per-facet statistics to
      ``'stat_' + args.output``.

    The supported facet names are the keys of
    ``mapfiles/b2find_schema.json`` under the current working directory.

    Returns:
        None. The process exits through ``sys.exit`` for list requests and
        for requests that are neither a list nor ``package_search``.
    """
    pstat = {
        'status': {},
        'text': {},
        'short': {},
    }
    now = time.strftime("%Y-%m-%d %H:%M:%S")
    jid = os.getpid()
    ckanlistrequests = ['package_list', 'group_list', 'tag_list']

    ## Get options and arguments
    args = get_args(ckanlistrequests)

    # Output instance
    OUT = Output(pstat, now, jid, args)
    logger = OUT.setup_custom_logger('root', args.verbose)

    ## Settings for CKAN client and API
    ckanapi3 = 'http://' + args.ckan + '/api/3'
    if PY2:
        ckan = ckanclient.CkanClient(ckanapi3)
    else:
        auth = '12345'
        ckan = CKAN_CLIENT(args.ckan, auth)

    ckan_limit = 500000

    start = time.time()

    # List requests: print the returned names and stop.
    if args.request.endswith('list'):
        try:
            if args.request == 'community_list':
                action = 'group_list'
            else:
                action = args.request
            if PY2:
                answer = ckan.action(action, rows=ckan_limit)
            else:
                answer = ckan.action(action)
        except ckanclient.CkanApiError as e:
            print('\t\tError %s Supported list requests are %s.' %
                  (e, ckanlistrequests))
            sys.exit(1)
        print('\n\t%s' % '\n\t'.join(answer).encode('utf8'))
        sys.exit(0)

    # create CKAN search pattern :
    ckan_pattern = ''
    sand = ''
    pattern = ' '.join(args.pattern)

    if (args.community):
        ckan_pattern += "groups:%s" % args.community
        sand = " AND "
    if (args.pattern):
        ckan_pattern += sand + pattern

    print(' | - Search\n\t|- in\t%s\n\t|- for\t%s\n' %
          (args.ckan, ckan_pattern))

    # Only package_search is handled from here on. Any other request used
    # to crash with a NameError on the never-assigned `answer`.
    if args.request != 'package_search':
        print(' [ERROR] Unsupported request %s' % args.request)
        sys.exit(1)

    if PY2:
        answer = ckan.action('package_search',
                             q=ckan_pattern,
                             rows=ckan_limit)
    else:
        answer = ckan.action('package_search', {"q": ckan_pattern})
    for key in answer:
        logger.warning('answer has key %s' % key)
    if PY2:
        tcount = answer['count']
    else:
        tcount = answer['result']['count']
    print(' | - Results:\n\t|- %d records found in %d sec' %
          (tcount, time.time() - start))

    # Read in B2FIND metadata schema and fields
    schemafile = '%s/mapfiles/b2find_schema.json' % (os.getcwd())
    with open(schemafile, 'r') as f:
        b2findfields = json.loads(f.read(), object_pairs_hook=OrderedDict)

    if tcount > 0 and args.keys is not None:
        if len(args.keys) == 0:
            akeys = []
        elif args.keys[0] == 'B2FIND.*':
            # All schema fields, sorted by name. OrderedDict(sorted(keys))
            # raised here because the keys are plain strings, not pairs.
            akeys = sorted(b2findfields.keys())
        else:
            akeys = args.keys

        suppid = b2findfields.keys()

        fh = io.open(args.output, "w", encoding='utf8')
        record = {}

        count = {}
        count['id'] = 0
        statc = {}
        # Keep only supported keys. Build a new list, because removing
        # from akeys while iterating it skipped the following element.
        supported = []
        for outt in akeys:
            if outt not in suppid:
                print(' [WARNING] Not supported key %s is removed' % outt)
            else:
                supported.append(outt)
                count[outt] = 0
                statc[outt] = Counter()
        akeys = supported

        printfacets = ''
        if (len(akeys) > 0):
            printfacets = "and related facets %s " % ", ".join(akeys)

            print('\t|- IDs %sare written to %s ...' %
                  (printfacets, args.output))

        counter = 0
        cstart = 0
        oldperc = 0
        start2 = time.time()

        # Page through the results; the first page is already in `answer`.
        while (cstart < tcount):
            if (cstart > 0):
                if PY2:
                    answer = ckan.action('package_search',
                                         q=ckan_pattern,
                                         rows=ckan_limit,
                                         start=cstart)
                else:
                    answer = ckan.action('package_search', {
                        "q": ckan_pattern,
                        "rows": ckan_limit,
                        "start": cstart
                    })
            if PY2:
                if len(answer['results']) == 0:
                    break

            # loop over found records
            if PY2:
                results = answer['results']
            else:
                results = answer['result']['results']
            for ds in results:
                counter += 1
                logger.debug('    | %-4d | %-40s |' % (counter, ds['name']))
                perc = int(counter * 100 / tcount)
                bartags = perc / 5
                # Progress bar, refreshed at every new multiple of 10 %.
                if perc % 10 == 0 and perc != oldperc:
                    oldperc = perc
                    print('\r\t[%-20s] %5d (%3d%%) in %d sec' %
                          ('=' * int(bartags), counter, perc,
                           time.time() - start2))
                    sys.stdout.flush()

                record['id'] = '%s' % (ds['name'])
                outline = record['id']

                # loop over facets
                for facet in akeys:
                    ckanFacet = b2findfields[facet]["ckanName"]
                    if ckanFacet in ds:  ## CKAN default field
                        if facet == 'Group':
                            record[facet] = ds[ckanFacet][0]['display_name']
                        else:
                            record[facet] = ds[ckanFacet]
                    else:  ## CKAN extra field
                        efacet = [e for e in ds['extras'] if e['key'] == facet]
                        if efacet:
                            record[facet] = efacet[0]['value']
                        else:
                            record[facet] = 'N/A'
                    if record[facet] is None:
                        record[facet] = 'None'
                        statc[facet][record[facet]] += 1
                    else:
                        # Count every ';'-separated value (or list item).
                        if not isinstance(record[facet], list):
                            words = record[facet].split(';')
                        else:
                            words = record[facet]
                        for word in words:
                            if isinstance(word, dict): word = word['name']
                            statc[facet][word] += 1
                    if not (record[facet] == 'N/A' or record[facet]
                            == 'Not Stated') and len(record[facet]) > 0:
                        count[facet] += 1
                    outline += '\t | %-30s' % record[facet][:30]
                fh.write(outline + '\n')
            cstart += len(results)
            logger.warning('%d records done, %d in total' % (cstart, tcount))
        fh.close()

        if len(akeys) > 0:
            print('|- Statistics written to file %s' %
                  ('stat_' + args.output))

            statline = unicode("")
            for outt in akeys:
                statline += "| %-16s\n\t| %-15s | %-6d | %3d |\n" % (
                    outt, '-Total-', count[outt],
                    int(count[outt] * 100 / tcount))
                for word in statc[outt].most_common(10):
                    statline += '\t| %-15s | %-6d | %3d |\n' % (
                        word[0][:100], word[1], int(word[1] * 100 / tcount))

            # The context manager closes the file even if the write fails.
            with io.open('stat_' + args.output, "w",
                         encoding='utf8') as statfh:
                statfh.write(statline)
Exemple #29
0
Sara-Jayne Farmer
2013
'''

import ckanclient
import pickle

# Connect: read the API key. The context manager closes the key file even
# if the read fails.
with open("../key.txt", 'rb') as fin:
    key = fin.read().strip()

# Output file for the pickled CKAN contents. It is not closed in this part
# of the script -- NOTE(review): confirm it is closed further down.
fout = open("pickled_ckan_contents.pk1", "wb")

ckan = ckanclient.CkanClient(
    base_location='http://ec2-54-228-69-142.eu-west-1.compute.amazonaws.com/api',
    api_key=key)

# tag list; -1 forces pickle to use the highest protocol available
tag_list = ckan.tag_register_get()
pickle.dump(tag_list, fout, -1)

# packages: map each package name to its description
package_entities = {}
package_list = ckan.package_register_get()
print(package_list)
for package_name in package_list:
    ckan.package_entity_get(package_name)
    package_entities[package_name] = ckan.last_message
pickle.dump(package_entities, fout, -1)
Exemple #30
0
 def __init__(self):
     """Load the API key, build the CKAN client and fetch the package list.

     Reads the key from ``apikey.txt`` in the working directory and connects
     to ``API_URL``.
     NOTE(review): the key is used exactly as read, including any trailing
     newline -- confirm the key file has none.
     """
     # Context manager so the key file is closed even if read() fails.
     with open("apikey.txt") as key_file:
         self.apikey = key_file.read()
     self.ckan = ckanclient.CkanClient(base_location=API_URL,
                                       api_key=self.apikey)
     self.package_list = self.ckan.package_register_get()