def test_read_simple_xlsx(self):
    """simple.xlsx loads as a single table whose cells carry native types."""
    table_set = XLSTableSet(horror_fobj('simple.xlsx'))
    assert_equal(1, len(table_set.tables))
    row_set = table_set.tables[0]
    first_row = list(row_set.sample)[0]
    third_row = list(row_set.sample)[2]
    # The header row is all strings.
    for cell in first_row:
        assert_is_instance(cell.value, stringy)
    # A data row carries date / float / string values.
    for cell, expected_type in zip(third_row,
                                   (datetime.datetime, float, stringy)):
        assert_is_instance(cell.value, expected_type)
    # Exact header labels.
    for cell, expected in zip(first_row, ('date', 'temperature', 'place')):
        assert_equal(cell.value, expected)
    # Exact data values in the third sample row.
    assert_equal(third_row[0].value, datetime.datetime(2011, 1, 2, 0, 0))
    assert_equal(third_row[1].value, -1.0)
    assert_equal(third_row[2].value, 'Galway')
    # Every row in the table has exactly three cells.
    for row in list(row_set):
        assert 3 == len(row), row
def test_read_type_know_simple(self):
    """Cell types on the second sample row of simple.xls are as expected."""
    row_set = XLSTableSet(horror_fobj('simple.xls')).tables[0]
    second_row = list(row_set.sample)[1]
    observed = [cell.type for cell in second_row]
    assert_equal(observed, [DateType(None), FloatType(), StringType()])
def test_bad_first_sheet(self):
    # The first sheet of this workbook appears to contain no cells at
    # all; the data lives in the second sheet.
    table_set = XLSTableSet(horror_fobj('problematic_first_sheet.xls'))
    tables = table_set.tables
    first_sheet, second_sheet = tables[0], tables[1]
    assert_equal(0, len(list(first_sheet.sample)))
    assert_equal(1000, len(list(second_sheet.sample)))
def test_attempt_read_encrypted_no_password_xls(self):
    """Opening an encrypted workbook without a password raises ReadError.

    The expected message embeds the repr of an XLRDError; Python 3.7+
    dropped the trailing comma from single-element tuple reprs of
    exceptions, so an exact string comparison is brittle.  Compare only
    the stable prefix (everything before the final ",)"), matching the
    sibling test elsewhere in this file that already does so.
    """
    fh = horror_fobj('encrypted_no_password.xls')
    errmsg = "Can't read Excel file: XLRDError('Workbook is encrypted',)"
    try:
        XLSTableSet(fh)
    except ReadError as e:
        # Prefix match instead of equality: tolerant of the repr change.
        assert e.args[0].startswith(errmsg[:-2])
    else:
        assert False, "Did not raise Read Error"
def any_tableset(fileobj, mimetype=None, extension=None):
    """Load any supported table type from *fileobj*.

    The format is chosen from *mimetype* (e.g. 'text/csv') or
    *extension* (e.g. 'tsv'); when neither is given, the first few
    bytes of the stream are sniffed with the magic library — which is
    frequently wrong, so callers should supply a hint when they can.
    Returns the best-matching TableSet wrapped around the file object;
    raises messytables.ReadError on failure.
    """
    # Sniff only when the caller gave us no clue at all.
    if mimetype is None and extension is None:
        import magic
        # Peeking at the stream means we must be able to seek back;
        # wrap unseekable streams in an in-memory buffer first.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    lowered = extension.lower() if extension else None

    # ZIP is tested first: a 'csv'/'xls' extension describes the files
    # *inside* the archive, not the archive itself.
    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or lowered in ('zip',):
        return ZIPTableSet(fileobj)
    if mimetype in ('text/csv', 'text/comma-separated-values') \
            or lowered in ('csv',):
        return CSVTableSet(fileobj)  # guess delimiter
    if mimetype in ('text/tsv', 'text/tab-separated-values') \
            or lowered in ('tsv',):
        return CSVTableSet(fileobj, delimiter='\t')
    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or lowered in ('xls',):
        return XLSTableSet(fileobj)
    if mimetype in ('application/vnd.openxmlformats-officedocument'
                    '.spreadsheetml.sheet',) or lowered in ('xlsx',):
        return XLSXTableSet(fileobj)
    if mimetype in ('text/html',) or lowered in ('htm', 'html'):
        return HTMLTableSet(fileobj)

    # Nothing matched: report whichever hint we had.
    if mimetype:
        raise ValueError("Unrecognized MIME type: {mimetype}".format(
            mimetype=mimetype))
    if extension:
        raise ValueError('''Could not determine MIME type and unrecognized extension: {extension}'''.format(extension=extension))
    raise ValueError("Could not determine MIME type and no extension given.")
def test_http_xls(self):
    """An XLS fetched over HTTP parses into seven rows."""
    url = 'http://www.messytables.org/static/simple.xls'
    httpretty.register_uri(httpretty.GET, url,
                           body=horror_fobj('simple.xls').read(),
                           content_type="application/ms-excel")
    table_set = XLSTableSet(urlopen(url))
    rows = list(table_set.tables[0])
    assert_equal(7, len(rows))
def test_read_head_offset_excel(self):
    """headers_guess finds the header at row 0; the offset processor skips it."""
    table_set = XLSTableSet(horror_fobj('simple.xls'))
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    assert_equal(offset, 0)
    row_set.register_processor(offset_processor(offset + 1))
    # Both the sample view and the full iteration now start at the
    # first data row, whose second cell is 1.
    sampled = list(row_set.sample)
    assert_equal(int(sampled[0][1].value), 1)
    everything = list(row_set)
    assert_equal(int(everything[0][1].value), 1)
def test_read_simple_xls(self):
    """simple.xls loads as one table with the expected header row."""
    table_set = XLSTableSet(horror_fobj('simple.xls'))
    assert_equal(1, len(table_set.tables))
    row_set = table_set.tables[0]
    header = list(row_set.sample)[0]
    for cell, expected in zip(header, ('date', 'temperature', 'place')):
        assert_equal(cell.value, expected)
    # Every row has exactly three cells.
    for row in list(row_set):
        assert 3 == len(row), row
def test_attempt_read_encrypted_no_password_xls(self):
    """An encrypted workbook opened without a password raises ReadError."""
    fh = horror_fobj('encrypted_no_password.xls')
    errmsg = "Can't read Excel file: XLRDError('Workbook is encrypted',)"
    raised = False
    try:
        XLSTableSet(fh)
    except ReadError as e:
        raised = True
        # Hack: fix for difference in behaviour of Python 3.7+
        # to earlier versions; the error message matches exactly.
        # Leaving this note and the code in such a state to make
        # this clear.
        assert e.args[0].startswith(errmsg[:-2])
    assert raised, "Did not raise Read Error"
def test_http_xlsx(self):
    """An XLSX fetched over HTTP parses into seven rows."""
    url = 'http://www.messytables.org/static/simple.xlsx'
    xlsx_mime = ("application/vnd.openxmlformats-officedocument"
                 ".spreadsheetml.sheet")
    httpretty.register_uri(httpretty.GET, url,
                           body=horror_fobj('simple.xlsx').read(),
                           content_type=xlsx_mime)
    table_set = XLSTableSet(urlopen(url))
    rows = list(table_set.tables[0])
    assert_equal(7, len(rows))
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Open whatever sort of file is passed in.

    Uses the MIME type (e.g. mimetype='text/csv') or file extension
    (e.g. extension='tsv'), or otherwise autodetects the file format
    by sniffing the start of the stream. Consult the source for
    recognized MIME types and file extensions.
    """
    # PEP 8: compare against the None singleton with `is`, not `==`.
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or \
            (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj)  # guess delimiter

    if mimetype in ('text/tsv', 'text/tab-separated-values') or \
            (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls') or \
            (extension and extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument'
                    '.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    # Nothing matched: report whichever hint we had.
    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and " +
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
def from_fileobj(cls, fileobj, mimetype=None, extension=None):
    """Open whatever sort of file is passed in.

    Uses the MIME type (e.g. mimetype='text/csv') or file extension
    (e.g. extension='tsv'), or otherwise autodetects the file format
    by sniffing the start of the stream. Consult the source for
    recognized MIME types and file extensions.
    """
    # PEP 8: compare against the None singleton with `is`, not `==`.
    if mimetype is None:
        import magic
        # Since we need to peek the start of the stream, make sure we can
        # seek back later. If not, slurp in the contents into a StringIO.
        fileobj = messytables.seekable_stream(fileobj)
        header = fileobj.read(1024)
        mimetype = magic.from_buffer(header, mime=True)
        fileobj.seek(0)

    if mimetype in ('application/x-zip-compressed', 'application/zip') \
            or (extension and extension.lower() in ('zip',)):
        # Do this first because the extension applies to the content
        # type of the inner files, so don't check them before we check
        # for a ZIP file.
        return ZIPTableSet.from_fileobj(fileobj)

    if mimetype in ('text/csv', 'text/comma-separated-values') or \
            (extension and extension.lower() in ('csv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter=',')

    if mimetype in ('text/tsv', 'text/tab-separated-values') or \
            (extension and extension.lower() in ('tsv',)):
        return CSVTableSet.from_fileobj(fileobj, delimiter='\t')

    if mimetype in ('application/ms-excel', 'application/vnd.ms-excel',
                    'application/xls', 'application/excel') or \
            (extension and extension.lower() in ('xls',)):
        return XLSTableSet.from_fileobj(fileobj)

    if mimetype in ('application/vnd.openxmlformats-officedocument'
                    '.spreadsheetml.sheet',) \
            or (extension and extension.lower() in ('xlsx',)):
        return XLSXTableSet.from_fileobj(fileobj)

    # Nothing matched: report whichever hint we had.
    if mimetype:
        raise ValueError("Unrecognized MIME type: " + mimetype)
    if extension:
        raise ValueError("Could not determine MIME type and " +
                         "unrecognized extension: " + extension)
    raise ValueError("Could not determine MIME type and no extension given.")
def ku_openlearning(self, filename, source_id):
    """Import KU OpenLearning courses from the XLS file at *filename*.

    Each spreadsheet row becomes (or updates) a Course linked to the
    Source identified by *source_id*, and the row's category label is
    mapped onto one or more MerlotCategory records.
    """
    # Spreadsheet category label -> MERLOT category id (int) or
    # name (str), or a list of either.
    CATEGORY_MAPPING = {
        'Assessment of learning': 2298,  # Assessment
        'Finance': 2235,
        'Public Service': 'Criminal Justice',
        'Health Science': 'Health Sciences',
        'Management': 2248,
        'Online Instruction': 'Hybrid and Online Course Development',
        'Early Childhood': ['Career Counseling and Services',
                            'Childhood and Adolescence'],
        'Law, Legal': 'Law',
        'Psychology': 'Psychology',
        'Customer Service': 2246,
        'Communications': 'Communications',
        'Professionalism': 'Personal Development'
    }
    source = Source.objects.get(pk=source_id)

    fh = open(filename, 'rb')
    table_set = XLSTableSet(fh)
    row_set = table_set.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))

    for row in row_set:
        url = row[0].value
        title = row[1].value
        description = row[2].value
        # language = row[4].value
        # material_type = row[5].value
        # Renamed from `license` to avoid shadowing the builtin.
        course_license = row[6].value
        categories = row[7].value
        keywords = row[8].value
        # audience = row[9].value

        course, is_created = Course.objects.get_or_create(
            linkurl=url,
            provider=source.provider,
            source=source,
            defaults={
                'title': title,
                'description': description,
                'tags': keywords,
                'language': 'English',
                'license': course_license,
                'content_medium': 'text',
                'creative_commons': 'Yes',
                'creative_commons_commercial': 'No',
                'creative_commons_derivatives': 'No'
            })

        merlot_cat = CATEGORY_MAPPING[categories]
        # Normalize single mappings to a list (idiomatic isinstance
        # check instead of `type(...) != list`).
        if not isinstance(merlot_cat, list):
            merlot_cat = [merlot_cat]

        for item in merlot_cat:
            try:
                # Integer ids resolve directly; string labels raise
                # ValueError on the int field and fall back to a
                # lookup by name.
                m = MerlotCategory.objects.get(merlot_id=item)
                course.merlot_categories.add(m)
            except ValueError:
                m = MerlotCategory.objects.get(name=item)
                course.merlot_categories.add(m)
def webstorer_upload(context, data):
    """Download a CKAN resource, parse it (XLS or CSV), push its rows
    to the webstore, and record the webstore URL back on the resource.

    *context* and *data* arrive as JSON strings; raises WebstorerError
    when the webstore already holds the resource or responds badly.
    """
    context = json.loads(context)
    resource = json.loads(data)
    excel_types = ['xls', 'application/ms-excel', 'application/xls']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    # Materialize every row as a dict for one bulk POST.
    rows = list(row_set.dicts())

    webstore_url = context.get('webstore_url').rstrip('/')
    webstore_request_url = '%s/%s/%s' % (webstore_url,
                                         context['username'],
                                         resource['id']
                                         )
    #check if resource is already there.
    webstore_response = requests.get(webstore_request_url + '.json')
    check_response_and_retry(webstore_response,
                             webstore_request_url + '.json')

    #should be an empty list as no tables should be there.
    if json.loads(webstore_response.content):
        raise WebstorerError('Webstore already has this resource')

    response = requests.post(webstore_request_url + '/data',
                             data=json.dumps(rows),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']},
                             )
    check_response_and_retry(response, webstore_request_url + '.json')
    if response.status_code != 201:
        # Typo fixed in the error message ("Websore" -> "Webstore"),
        # matching the "Ckan bad response code" message below.
        raise WebstorerError('Webstore bad response code (%s). Response was %s'
                             % (response.status_code, response.content))

    ckan_url = context['site_url'].rstrip('/')
    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url + '/data',
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )
    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'
                             % (response.status_code, response.content))
def _datastorer_upload(context, resource):
    """Download a CKAN resource, parse it (XLS or CSV), and bulk-load
    its rows into the webstore's Elasticsearch-style ``_bulk`` endpoint
    in batches, then update the resource's webstore metadata in CKAN.
    """
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')
    f = open(result['saved_file'], 'rb')

    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        table_sets = CSVTableSet.from_fileobj(f)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    types = guess_types(list(row_set.dicts(sample=True)))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(types_processor(types))

    ckan_url = context['site_url'].rstrip('/')
    webstore_request_url = '%s/api/data/%s/' % (ckan_url,
                                                resource['id']
                                                )

    def send_request(data):
        # POST one newline-delimited bulk payload; returns the response.
        return requests.post(webstore_request_url + '_bulk',
                             data="%s%s" % ("\n".join(data), "\n"),
                             headers={'Content-Type': 'application/json',
                                      'Authorization': context['apikey']},
                             )

    data = []
    for count, dict_ in enumerate(row_set.dicts()):
        data.append(json.dumps({"index": {"_id": count + 1}}))
        data.append(json.dumps(dict_))
        if (count % 100) == 0:
            response = send_request(data)
            check_response_and_retry(response,
                                     webstore_request_url + '_mapping')
            data[:] = []

    if data:
        # Bug fix: the result used to be assigned to a misspelled name
        # ("respose"), so the retry check below inspected the response
        # of the *previous* batch instead of this final one.
        response = send_request(data)
        check_response_and_retry(response, webstore_request_url + '_mapping')

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': webstore_request_url,
        'webstore_last_updated': datetime.datetime.now().isoformat()
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']},
    )
    if response.status_code not in (201, 200):
        raise WebstorerError('Ckan bad response code (%s). Response was %s'
                             % (response.status_code, response.content))
def setUpClass(cls):
    # Load the large XLSX fixture once for the whole test class.
    # NOTE(review): the inline TODO predates this review; presumably
    # the large.xlsx fixture handling is still provisional — confirm.
    cls.large_xlsx_table_set = XLSTableSet(  # TODO
        horror_fobj('large.xlsx'))
def _datastorer_upload(context, resource, logger):
    """Download *resource*, guess its column types, and load its rows
    into the CKAN DataStore via the ``datastore_create`` API in chunks,
    then mark the resource's webstore metadata as active.
    """
    excel_types = ['xls', 'application/ms-excel', 'application/xls',
                   'application/vnd.ms-excel']
    tsv_types = ['tsv', 'text/tsv', 'text/tab-separated-values']

    result = download(context, resource, data_formats=DATA_FORMATS)
    content_type = result['headers'].get('content-type', '')\
        .split(';', 1)[0]  # remove parameters
    f = open(result['saved_file'], 'rb')

    # Choose a parser: Excel by content type or declared format,
    # otherwise CSV/TSV (TSV only switches the delimiter).
    if content_type in excel_types or resource['format'] in excel_types:
        table_sets = XLSTableSet.from_fileobj(f)
    else:
        is_tsv = (content_type in tsv_types or
                  resource['format'] in tsv_types)
        delimiter = '\t' if is_tsv else ','
        table_sets = CSVTableSet.from_fileobj(f, delimiter=delimiter)

    ##only first sheet in xls for time being
    row_set = table_sets.tables[0]
    offset, headers = headers_guess(row_set.sample)
    # Processor order matters: headers, then skip the header row,
    # then datetime conversion.
    row_set.register_processor(headers_processor(headers))
    row_set.register_processor(offset_processor(offset + 1))
    row_set.register_processor(datetime_procesor())

    logger.info('Header offset: {0}.'.format(offset))

    guessed_types = type_guess(
        row_set.sample,
        [
            messytables.types.StringType,
            messytables.types.IntegerType,
            messytables.types.FloatType,
            messytables.types.DecimalType,
            messytables.types.DateUtilType
        ],
        strict=True
    )
    logger.info('Guessed types: {0}'.format(guessed_types))
    row_set.register_processor(types_processor(guessed_types, strict=True))
    row_set.register_processor(stringify_processor())

    ckan_url = context['site_url'].rstrip('/')

    datastore_create_request_url = '%s/api/action/datastore_create' % (
        ckan_url)

    # Map messytables type classes to DataStore type names.
    guessed_type_names = [TYPE_MAPPING[type(gt)] for gt in guessed_types]

    def send_request(data):
        # One datastore_create call per chunk; fields are resent each
        # time alongside the records.
        request = {'resource_id': resource['id'],
                   'fields': [dict(id=name, type=typename)
                              for name, typename
                              in zip(headers, guessed_type_names)],
                   'records': data}
        response = requests.post(
            datastore_create_request_url,
            data=json.dumps(request),
            headers={'Content-Type': 'application/json',
                     'Authorization': context['apikey']},
        )
        check_response_and_retry(response, datastore_create_request_url,
                                 logger)

    logger.info('Creating: {0}.'.format(resource['id']))

    # generates chunks of data that can be loaded into ckan
    # n is the maximum size of a chunk
    # NOTE(review): itertools.imap is Python 2 only — this function
    # presumably targets Python 2; confirm before running on 3.x.
    def chunky(iterable, n):
        it = iter(iterable)
        while True:
            chunk = list(
                itertools.imap(
                    dict, itertools.islice(it, n)))
            if not chunk:
                return
            yield chunk

    count = 0
    for data in chunky(row_set.dicts(), 100):
        count += len(data)
        send_request(data)

    logger.info("There should be {n} entries in {res_id}.".format(
        n=count, res_id=resource['id']))

    ckan_request_url = ckan_url + '/api/action/resource_update'

    ckan_resource_data = {
        'id': resource["id"],
        'webstore_url': 'active',
        'webstore_last_updated': datetime.datetime.now().isoformat(),
        'url': resource['url']
    }

    response = requests.post(
        ckan_request_url,
        data=json.dumps(ckan_resource_data),
        headers={'Content-Type': 'application/json',
                 'Authorization': context['apikey']})

    if response.status_code not in (201, 200):
        raise DatastorerException('Ckan bad response code (%s). Response was %s' %
                                  (response.status_code, response.content))