def convert(self):

        table_set = CSVTableSet.from_fileobj(self.stream)
        row_set = table_set.tables.pop()
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                field = '_'.join(['column', str(noname_count)])
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id'] for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # create a fresh dict per row; reusing a single dict would leave
            # every entry in result pointing at the last row's values
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            result.append(data_row)
        return fields, result
Example #2
    def convert(self):

        table_set = CSVTableSet.from_fileobj(self.stream)
        row_set = table_set.tables.pop()
        offset, headers = headers_guess(row_set.sample)

        fields = []
        dup_columns = {}
        noname_count = 1
        for index, field in enumerate(headers):
            field_dict = {}
            if "" == field:
                field = '_'.join(['column', str(noname_count)])
                headers[index] = field
                noname_count += 1
            if headers.count(field) == 1:
                field_dict['id'] = field
            else:
                dup_columns[field] = dup_columns.get(field, 0) + 1
                field_dict['id'] = u'_'.join([field, str(dup_columns[field])])
            fields.append(field_dict)
        row_set.register_processor(headers_processor([x['id']
                                                      for x in fields]))
        row_set.register_processor(offset_processor(offset + 1))

        result = []
        for row in row_set:
            # create a fresh dict per row; reusing a single dict would leave
            # every entry in result pointing at the last row's values
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            result.append(data_row)
        return fields, result
Example #3
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g. mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if mimetype in ('application/x-zip-compressed', 'application/zip') \
                or (extension and extension.lower() in ('zip',)):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if mimetype in ('text/csv', 'text/comma-separated-values') or \
                (extension and extension.lower() in ('csv',)):
            return CSVTableSet.from_fileobj(fileobj)  # guess delimiter
        if mimetype in ('text/tsv', 'text/tab-separated-values') or \
                (extension and extension.lower() in ('tsv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                'application/xls') or (extension and extension.lower() in \
                    ('xls',)):
            return XLSTableSet.from_fileobj(fileobj)
        if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
                or (extension and extension.lower() in ('xlsx',)):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Example #4
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g. mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if mimetype in ('application/x-zip-compressed', 'application/zip') \
                or (extension and extension.lower() in ('zip',)):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if mimetype in ('text/csv', 'text/comma-separated-values') or \
                (extension and extension.lower() in ('csv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter=',')
        if mimetype in ('text/tsv', 'text/tab-separated-values') or \
                (extension and extension.lower() in ('tsv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                'application/xls', 'application/excel') or (extension and extension.lower() in \
                    ('xls',)):
            return XLSTableSet.from_fileobj(fileobj)
        if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
                or (extension and extension.lower() in ('xlsx',)):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Example #5
def load_data(config):
    if 'url' not in config:
        yield {config.get('field'): config.get('value')}
        return
    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)

    fh.close()
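
A short usage sketch for load_data above; the URL is a placeholder, and the config keys mirror the ones the function itself reads:

# Remote CSV: each row is yielded as a dict keyed by the guessed headers.
for row in load_data({'url': 'http://example.org/data.csv'}):
    print(row)

# Without a 'url' key, a single constant row is yielded instead.
for row in load_data({'field': 'source', 'value': 'manual'}):
    print(row)  # -> {'source': 'manual'}
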
Example #6
def load_data(config):
    if 'url' not in config:
        yield {
            config.get('field'): config.get('value')
            }
        return
    fh = urlopen(config.get('url'))
    table_set = CSVTableSet.from_fileobj(fh)
    row_set = table_set.tables[0]

    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    for row in row_set:
        row = [(c.column, c.value) for c in row]
        yield dict(row)

    fh.close()
Example #7
def parse(stream, guess_types=True, **kwargs):
    '''Parse CSV file and return row iterator plus metadata (fields etc).

    Additional CSV arguments as per
    http://docs.python.org/2/library/csv.html#csv-fmt-params

    :param delimiter:
    :param quotechar:
    :param window: the size of the sample used for analysis

    There is also support for:

    :param encoding: file encoding (will be guessed with chardet if not provided)


    You can process CSV as well as TSV files using this function. For TSV just
    pass::

        delimiter='\t'
    '''
    metadata = dict(**kwargs)
    delimiter = metadata.get('delimiter', None)
    quotechar = metadata.get('quotechar', None)
    window = metadata.get('window', None)
    encoding = metadata.get('encoding', None)
    table_set = CSVTableSet.from_fileobj(stream, delimiter=delimiter,
            quotechar=quotechar, encoding=encoding, window=window)
    row_set = table_set.tables.pop()
    offset, headers = headers_guess(row_set.sample)

    fields = []
    dup_columns = {}
    noname_count = 1
    if guess_types:
        guessable_types = [StringType, IntegerType, FloatType, DecimalType,
                           DateUtilType]
        row_types = type_guess(row_set.sample, guessable_types)
    for index, field in enumerate(headers):
        field_dict = {}
        if "" == field:
            field = '_'.join(['column', unicode(noname_count)])
            headers[index] = field
            noname_count += 1
        if headers.count(field) == 1:
            field_dict['id'] = field
        else:
            dup_columns[field] = dup_columns.get(field, 0) + 1
            field_dict['id'] = u'_'.join([field, unicode(dup_columns[field])])
        if guess_types:
            if isinstance(row_types[index], DateUtilType):
                field_dict['type'] = 'DateTime'
            else:
                field_dict['type'] = str(row_types[index])
        fields.append(field_dict)
    row_set.register_processor(headers_processor([x['id'] for x in fields]))
    row_set.register_processor(offset_processor(offset + 1))
    if guess_types:
        row_set.register_processor(types_processor(row_types))

    def row_iterator():
        for row in row_set:
            data_row = {}
            for index, cell in enumerate(row):
                data_row[cell.column] = cell.value
            yield data_row
    return row_iterator(), {'fields': fields}
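
A brief usage sketch of the parse helper above, assuming it is importable from the surrounding module; the file name and output comments are illustrative only:

with open('people.csv', 'rb') as stream:
    rows, metadata = parse(stream, guess_types=True)
    print(metadata['fields'])   # e.g. [{'id': 'name', 'type': 'String'}, ...]
    for row in rows:
        print(row)              # one dict per data row, keyed by column id
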
Example #8
def _datastorer_upload(context, resource):

    excel_types = ['xls', 'application/ms-excel', 'application/xls', 'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))


    ckan_url = context['site_url'].rstrip('/')
    
    webstore_request_url = '%s/api/data/%s/' % (ckan_url,
                                                resource['id']
                                                )

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data = "%s%s" % ("\n".join(data), "\n"),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )

    data = []
    for count,dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count+1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url+'_mapping')
            data[:] = []

    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url+'_mapping')


    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
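
The payload posted to the webstore's _bulk endpoint above is newline-delimited JSON, alternating an action line with a document line (the same shape as an Elasticsearch bulk request). A tiny sketch of what two rows become:

import json

rows = [{'name': 'alice'}, {'name': 'bob'}]
data = []
for count, dict_ in enumerate(rows):
    data.append(json.dumps({"index": {"_id": count + 1}}))
    data.append(json.dumps(dict_))
body = "%s%s" % ("\n".join(data), "\n")
print(body)
# {"index": {"_id": 1}}
# {"name": "alice"}
# {"index": {"_id": 2}}
# {"name": "bob"}
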
Example #9
def webstorer_upload(context, data):

    context = json.loads(context)
    resource = json.loads(data)
    
    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    # only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    
    for row in row_set.dicts():
        rows.append(row)


    webstore_url = context.get('webstore_url').rstrip('/')
    
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id']
                                         )
    #check if resource is already there.
    webstore_response = requests.get(webstore_request_url+'.json')
    check_response_and_retry(webstore_response, webstore_request_url+'.json')

    #should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url+'/data',
                             data = json.dumps(rows),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url+'.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s' %
                             (response.status_code, response.content)
                            )

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url+'/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Example #10
def _datastorer_upload(context, resource, logger):

    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    # only the first sheet in xls for the time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
                                  'Authorization': context['apikey']},
                         )
        check_response_and_retry(response, datastore_create_request_url, logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
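
The nested chunky helper above is what keeps memory bounded: it slices the row iterator into lists of at most n dicts so each datastore_create request stays small. A standalone sketch of the same pattern, in the same Python 2 style as the itertools.imap call above:

import itertools

def chunky(iterable, n):
    # Yield lists of at most n dicts, drawn lazily from the iterator.
    it = iter(iterable)
    while True:
        chunk = list(itertools.imap(dict, itertools.islice(it, n)))
        if not chunk:
            return
        yield chunk

rows = ({'id': i} for i in xrange(250))
for batch in chunky(rows, 100):
    print(len(batch))  # 100, 100, 50
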