Code Example #1
    def test_read_simple_xlsx(self):
        fh = horror_fobj('simple.xlsx')
        table_set = XLSTableSet(fh)
        assert_equal(1, len(table_set.tables))
        row_set = table_set.tables[0]
        first_row = list(row_set.sample)[0]
        third_row = list(row_set.sample)[2]

        assert_is_instance(first_row[0].value, stringy)
        assert_is_instance(first_row[1].value, stringy)
        assert_is_instance(first_row[2].value, stringy)

        assert_is_instance(third_row[0].value, datetime.datetime)
        assert_is_instance(third_row[1].value, float)
        assert_is_instance(third_row[2].value, stringy)

        assert_equal(first_row[0].value, 'date')
        assert_equal(first_row[1].value, 'temperature')
        assert_equal(first_row[2].value, 'place')

        assert_equal(third_row[0].value, datetime.datetime(2011, 1, 2, 0, 0))
        assert_equal(third_row[1].value, -1.0)
        assert_equal(third_row[2].value, 'Galway')

        for row in list(row_set):
            assert 3 == len(row), row
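
The pattern this test exercises works the same way outside a test harness. A minimal sketch, assuming messytables is installed; the file name 'simple.xlsx' is only illustrative:

from messytables import XLSTableSet

with open('simple.xlsx', 'rb') as fh:
    table_set = XLSTableSet(fh)
    row_set = table_set.tables[0]   # first sheet
    for row in row_set:             # each row is a list of Cell objects
        print([cell.value for cell in row])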
Code Example #2
 def test_read_type_know_simple(self):
     fh = horror_fobj('simple.xls')
     table_set = XLSTableSet(fh)
     row_set = table_set.tables[0]
     row = list(row_set.sample)[1]
     types = [c.type for c in row]
     assert_equal(types, [DateType(None), FloatType(), StringType()])
Code Example #3
 def test_bad_first_sheet(self):
     # First sheet appears to have no cells
     fh = horror_fobj('problematic_first_sheet.xls')
     table_set = XLSTableSet(fh)
     tables = table_set.tables
     assert_equal(0, len(list(tables[0].sample)))
     assert_equal(1000, len(list(tables[1].sample)))
Code Example #4
 def test_attempt_read_encrypted_no_password_xls(self):
     fh = horror_fobj('encrypted_no_password.xls')
     errmsg = "Can't read Excel file: XLRDError('Workbook is encrypted',)"
     try:
         XLSTableSet(fh)
     except ReadError as e:
         assert e.args[0] == errmsg
     else:
         assert False, "Did not raise Read Error"
Code Example #5
def any_tableset(fileobj, mimetype=None, extension=None):
    """Reads any supported table type according to a specified
    MIME type or file extension or automatically detecting the
    type.

    Best matching TableSet loaded with the fileobject is returned.
    Matching is done by looking at the type (e.g. mimetype='text/csv')
    or file extension (e.g. extension='tsv'), or otherwise autodetecting
    the file format by using the magic library which looks at the first few
    bytes of the file BUT is often wrong. Consult the source for recognized
    MIME types and file extensions.

    On error it raises messytables.ReadError
    """
    # Auto-detect if the caller has offered no clue. (Because the
    # auto-detection routine is pretty poor.)
    if mimetype is None and extension is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if (mimetype in ('application/x-zip-compressed', 'application/zip')
            or (extension and extension.lower() in ('zip',))):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet(fileobj)

    if (mimetype in ('text/csv', 'text/comma-separated-values') or
            (extension and extension.lower() in ('csv',))):
        return CSVTableSet(fileobj)  # guess delimiter
    if (mimetype in ('text/tsv', 'text/tab-separated-values') or
            (extension and extension.lower() in ('tsv',))):
        return CSVTableSet(fileobj, delimiter='\t')
    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or (extension and extension.lower() in
                                           ('xls',)):
        return XLSTableSet(fileobj)
    if (mimetype in (
        'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',)
            or (extension and extension.lower() in ('xlsx',))):
        return XLSXTableSet(fileobj)
    if (mimetype in ('text/html',)
            or (extension and extension.lower() in ('htm', 'html',))):
        return HTMLTableSet(fileobj)
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError('''Could not determine MIME type and
         unrecognized extension: {extension}'''.format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
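
A hedged usage sketch of the dispatcher above: passing an explicit extension sidesteps the magic-based sniffing the docstring warns about. The file name and extension here are illustrative.

from messytables import any_tableset, ReadError

try:
    with open('report.xls', 'rb') as fh:
        table_set = any_tableset(fh, extension='xls')
        for row in table_set.tables[0]:
            print([cell.value for cell in row])
except (ReadError, ValueError) as exc:
    # ReadError for unreadable files, ValueError for unrecognized types,
    # mirroring the error paths in the function above.
    print('could not parse:', exc)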
Code Example #6
 def test_http_xls(self):
     url = 'http://www.messytables.org/static/simple.xls'
     httpretty.register_uri(httpretty.GET,
                            url,
                            body=horror_fobj('simple.xls').read(),
                            content_type="application/ms-excel")
     fh = urlopen(url)
     table_set = XLSTableSet(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(7, len(data))
Code Example #7
 def test_read_head_offset_excel(self):
     fh = horror_fobj('simple.xls')
     table_set = XLSTableSet(fh)
     row_set = table_set.tables[0]
     offset, headers = headers_guess(row_set.sample)
     assert_equal(offset, 0)
     row_set.register_processor(offset_processor(offset + 1))
     data = list(row_set.sample)
     assert_equal(int(data[0][1].value), 1)
     data = list(row_set)
     assert_equal(int(data[0][1].value), 1)
Code Example #8
    def test_read_simple_xls(self):
        fh = horror_fobj('simple.xls')
        table_set = XLSTableSet(fh)
        assert_equal(1, len(table_set.tables))
        row_set = table_set.tables[0]
        row = list(row_set.sample)[0]
        assert_equal(row[0].value, 'date')
        assert_equal(row[1].value, 'temperature')
        assert_equal(row[2].value, 'place')

        for row in list(row_set):
            assert 3 == len(row), row
Code Example #9
 def test_attempt_read_encrypted_no_password_xls(self):
     fh = horror_fobj('encrypted_no_password.xls')
     errmsg = "Can't read Excel file: XLRDError('Workbook is encrypted',)"
     try:
         XLSTableSet(fh)
     except ReadError as e:
         # Python 3.7 changed the repr of exceptions built from an
         # args tuple (the trailing comma was dropped), so compare
         # only the prefix of the message rather than an exact match.
         assert e.args[0].startswith(errmsg[:-2])
     else:
         assert False, "Did not raise Read Error"
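
The errmsg[:-2] slice works because Python 3.7 dropped the trailing comma from the repr of an exception constructed with a single argument. A standalone illustration, using a plain Exception as a stand-in for XLRDError:

err = Exception('Workbook is encrypted')

# Python 3.6 and earlier: "Exception('Workbook is encrypted',)"
# Python 3.7 and later:   "Exception('Workbook is encrypted')"
print(repr(err))

# Comparing only the prefix passes on both sides of the change, which
# is exactly what errmsg[:-2] achieves in the test above.
expected = "Exception('Workbook is encrypted',)"
assert repr(err).startswith(expected[:-2])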
Code Example #10
 def test_http_xlsx(self):
     url = 'http://www.messytables.org/static/simple.xlsx'
     httpretty.register_uri(
         httpretty.GET,
         url,
         body=horror_fobj('simple.xlsx').read(),
         content_type=
         "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
     )
     fh = urlopen(url)
     table_set = XLSTableSet(fh)
     row_set = table_set.tables[0]
     data = list(row_set)
     assert_equal(7, len(data))
Code Example #11
File: any.py Project: perfectcode1/messytables
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g. mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if mimetype in ('application/x-zip-compressed', 'application/zip') \
                or (extension and extension.lower() in ('zip',)):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if mimetype in ('text/csv', 'text/comma-separated-values') or \
                (extension and extension.lower() in ('csv',)):
            return CSVTableSet.from_fileobj(fileobj)  # guess delimiter
        if mimetype in ('text/tsv', 'text/tab-separated-values') or \
                (extension and extension.lower() in ('tsv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                'application/xls') or (extension and extension.lower() in \
                    ('xls',)):
            return XLSTableSet.from_fileobj(fileobj)
        if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
                or (extension and extension.lower() in ('xlsx',)):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Code Example #12
File: any.py Project: JoshData/messytables
    def from_fileobj(cls, fileobj, mimetype=None, extension=None):
        """ Opens whatever sort of file is passed in, using the MIME
        type (e.g. mimetype='text/csv') or file extension (e.g.
        extension='tsv'), or otherwise autodetecting the file format.
        Consult the source for recognized MIME types and file
        extensions."""
        if mimetype is None:
            import magic
            # Since we need to peek the start of the stream, make sure we can
            # seek back later. If not, slurp in the contents into a StringIO.
            fileobj = messytables.seekable_stream(fileobj)
            header = fileobj.read(1024)
            mimetype = magic.from_buffer(header, mime=True)
            fileobj.seek(0)

        if mimetype in ('application/x-zip-compressed', 'application/zip') \
                or (extension and extension.lower() in ('zip',)):
            # Do this first because the extension applies to the content
            # type of the inner files, so don't check them before we check
            # for a ZIP file.
            return ZIPTableSet.from_fileobj(fileobj)

        if mimetype in ('text/csv', 'text/comma-separated-values') or \
                (extension and extension.lower() in ('csv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter=',')
        if mimetype in ('text/tsv', 'text/tab-separated-values') or \
                (extension and extension.lower() in ('tsv',)):
            return CSVTableSet.from_fileobj(fileobj, delimiter='\t')
        if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                'application/xls', 'application/excel') or (extension and extension.lower() in \
                    ('xls',)):
            return XLSTableSet.from_fileobj(fileobj)
        if mimetype in ('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',) \
                or (extension and extension.lower() in ('xlsx',)):
            return XLSXTableSet.from_fileobj(fileobj)

        if mimetype:
            raise ValueError("Unrecognized MIME type: " + mimetype)
        if extension:
            raise ValueError("Could not determine MIME type and "
             + "unrecognized extension: " + extension)
        raise ValueError("Could not determine MIME type and no extension given.")
Code Example #13
    def ku_openlearning(self, filename, source_id):
        CATEGORY_MAPPING = {
            'Assessment of learning': 2298,  # Assessment
            'Finance': 2235,
            'Public Service': 'Criminal Justice',
            'Health Science': 'Health Sciences',
            'Management': 2248,
            'Online Instruction': 'Hybrid and Online Course Development',
            'Early Childhood': ['Career Counseling and Services',
                                'Childhood and Adolescence'],
            'Law, Legal': 'Law',
            'Psychology': 'Psychology',
            'Customer Service': 2246,
            'Communications': 'Communications',
            'Professionalism': 'Personal Development'
        }

        source = Source.objects.get(pk=source_id)

        fh = open(filename, 'rb')
        table_set = XLSTableSet(fh)

        row_set = table_set.tables[0]
        offset, headers = headers_guess(row_set.sample)
        row_set.register_processor(headers_processor(headers))

        row_set.register_processor(offset_processor(offset + 1))
        for row in row_set:
            url = row[0].value
            title = row[1].value
            description = row[2].value
            # language = row[4].value
            # material_type = row[5].value
            license = row[6].value
            categories = row[7].value
            keywords = row[8].value
            # audience = row[9].value

            course, is_created = Course.objects.get_or_create(
                linkurl=url,
                provider=source.provider,
                source=source,
                defaults={
                    'title': title,
                    'description': description,
                    'tags': keywords,
                    'language': 'English',
                    'license': license,
                    'content_medium': 'text',
                    'creative_commons': 'Yes',
                    'creative_commons_commercial': 'No',
                    'creative_commons_derivatives': 'No'
                })

            merlot_cat = CATEGORY_MAPPING[categories]
            if type(merlot_cat) != list:
                merlot_cat = [
                    merlot_cat,
                ]

            for item in merlot_cat:
                try:
                    m = MerlotCategory.objects.get(merlot_id=item)
                    course.merlot_categories.add(m)
                except ValueError:
                    m = MerlotCategory.objects.get(name=item)
                    course.merlot_categories.add(m)
Code Example #14
def webstorer_upload(context, data):

    context = json.loads(context)
    resource = json.loads(data)
    
    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    rows = []
    
    for row in row_set.dicts():
        rows.append(row)


    webstore_url = context.get('webstore_url').rstrip('/')
    
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id']
                                         )
    #check if resource is already there.
    webstore_response = requests.get(webstore_request_url+'.json')
    check_response_and_retry(webstore_response, webstore_request_url+'.json')

    #should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url+'/data',
                             data = json.dumps(rows),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url+'.json')
    if response.status_code != 201:
        raise WebstorerError('Webstore bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url =  ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url+'/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Code Example #15
File: tasks.py Project: slmnhq/ckanext-datastorer
def _datastorer_upload(context, resource):

    excel_types = ['xls', 'application/ms-excel', 'application/xls', 'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))


    ckan_url = context['site_url'].rstrip('/')
    
    webstore_request_url = '%s/api/data/%s/' % (ckan_url,
                                                resource['id']
                                                )

    def send_request(data):
        return requests.post(webstore_request_url + '_bulk',
                             data = "%s%s" % ("\n".join(data), "\n"),
                             headers = {'Content-Type': 'application/json',
                                        'Authorization': context['apikey']},
                             )

    data = []
    for count,dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count+1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response, webstore_request_url+'_mapping')
            data[:] = []

    if data:
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url+'_mapping')


    ckan_request_url =  ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers = {'Content-Type': 'application/json',
                   'Authorization': context['apikey']},
        )

    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'%
                             (response.status_code, response.content)
                            )
Code Example #16
 def setUpClass(cls):
     cls.large_xlsx_table_set = XLSTableSet(  # TODO
         horror_fobj('large.xlsx'))
Code Example #17
File: tasks.py Project: domoritz/ckanext-datastorer
def _datastorer_upload(context, resource, logger):

    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)

    content_type = result['headers'].get('content-type', '')\
                                    .split(';', 1)[0]  # remove parameters

    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (ckan_url)

    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename) for name, typename in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(datastore_create_request_url,
                         data=json.dumps(request),
                         headers={'Content-Type': 'application/json',
                                  'Authorization': context['apikey']},
                         )
        check_response_and_retry(response, datastore_create_request_url, logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                             (response.status_code, response.content))
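
One caveat about the chunky() helper above: itertools.imap exists only on Python 2. A sketch of the same batching idea for Python 3; the names mirror the original and are otherwise illustrative:

import itertools

def chunky(iterable, n):
    """Yield lists of at most n items drawn from iterable."""
    it = iter(iterable)
    while True:
        # islice pulls up to n items; map(dict, ...) mirrors the original,
        # which materialised each row as a plain dict before sending it.
        chunk = list(map(dict, itertools.islice(it, n)))
        if not chunk:
            return
        yield chunk

# Usage: for batch in chunky(row_set.dicts(), 100): send_request(batch)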