def rowlengths(table):
    """
    Report on row lengths found in the table. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar', 'baz'],
        ...          ['A', 1, 2],
        ...          ['B', '2', '3.4'],
        ...          [u'B', u'3', u'7.8', True],
        ...          ['D', 'xyz', 9.0],
        ...          ['E', None],
        ...          ['F', 9]]
        >>> etl.rowlengths(table)
        +--------+-------+
        | length | count |
        +========+=======+
        |      3 |     3 |
        +--------+-------+
        |      2 |     2 |
        +--------+-------+
        |      4 |     1 |
        +--------+-------+

    Useful for finding potential problems in data files.

    """

    # tally the length of every data row, then report most common first
    tally = Counter(len(row) for row in data(table))
    return wrap([('length', 'count')] + tally.most_common())
def tojsonarrays(table, source=None, prefix=None, suffix=None,
                 output_header=False, *args, **kwargs):
    """
    Write a table in JSON format, with rows output as JSON arrays. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> etl.tojsonarrays(table1, 'example.file4.json')
        >>> # check what it did
        ... print(open('example.file4.json').read())
        [["a", 1], ["b", 2], ["c", 2]]

    Note that this is currently not streaming, all data is loaded into
    memory before being written to the file.

    """

    # include the header row only when explicitly requested
    rows = table if output_header else data(table)
    # not streaming: materialise everything before writing
    _writejson(source, list(rows), prefix, suffix, *args, **kwargs)
def counts(request, uuid):
    """Render value counts for the requested columns of a CSV download."""
    # resolve the download record for this UUID, or report 404
    try:
        csvdownload = CSVDownload.objects.get(uuid=uuid)
    except CSVDownload.DoesNotExist:
        return HttpResponseNotFound("Not found.")

    full_fname = os.path.join(
        settings.CSV_DIR, '{0}.csv'.format(csvdownload.uuid))
    people = fromcsv(full_fname)

    # parse the comma-separated column list, dropping blank entries
    raw_columns = request.GET.get('columns', '')
    columns = sorted(c for c in raw_columns.split(',') if c.strip())

    # every requested column must exist in the CSV header
    available = header(people)
    if any(column not in available for column in columns):
        return HttpResponseBadRequest('Bad request.')
    # nothing selected: bounce back to the listing page
    if not columns:
        return redirect(to=reverse('people_list', kwargs={'uuid': uuid}))

    # compute value counts and drop the relative-frequency column
    counts = cutout(valuecounts(people, *columns), 'frequency')
    return render(
        request, 'counts.html', {
            'csvdownload': csvdownload,
            'columns': header(people),
            'headers': header(counts),
            'counts': data(counts),
            'queryparams': {
                'columns': columns
            }
        })
def appendxlsx(tbl, filename, sheet=None, write_header=False):
    """
    Appends rows to an existing Excel .xlsx file.
    """

    import openpyxl
    source = read_source_from_arg(filename)
    with source.open('rb') as source2:
        wb = openpyxl.load_workbook(filename=source2, read_only=False)
    # pick the target worksheet: first sheet by default, by position
    # for an int, otherwise by name
    if sheet is None:
        ws = wb[wb.sheetnames[0]]
    elif isinstance(sheet, int):
        ws = wb[wb.sheetnames[sheet]]
    else:
        ws = wb[str(sheet)]
    if write_header:
        rows_iter = iter(tbl)
        hdr = next(rows_iter)
        # header values are appended as text
        flds = [text_type(f) for f in hdr]
        rows = itertools.chain([flds], rows_iter)
    else:
        rows = data(tbl)
    for row in rows:
        ws.append(row)
    target = write_source_from_arg(filename)
    with target.open('wb') as target2:
        wb.save(target2)
def toxlsx(tbl, filename, sheet=None, write_header=True, mode="replace"):
    """
    Write a table to a new Excel .xlsx file.

    N.B., the sheet name is case sensitive.

    The `mode` argument controls how the file and sheet are treated:

      - `replace`: This is the default. It either replaces or adds a
        named sheet, or if no sheet name is provided, all sheets
        (overwrites the entire file).

      - `overwrite`: Always overwrites the file. This produces a file
        with a single sheet.

      - `add`: Adds a new sheet. Raises `ValueError` if a named sheet
        already exists.

    The `sheet` argument can be omitted in all cases. The new sheet
    will then get a default name.

    If the file does not exist, it will be created, unless `replace`
    mode is used with a named sheet. In the latter case, the file must
    exist and be a valid .xlsx file.

    """

    wb = _load_or_create_workbook(filename, mode, sheet)
    ws = _insert_sheet_on_workbook(mode, sheet, wb)
    # write either the full table (header included) or data rows only
    for row in (tbl if write_header else data(tbl)):
        ws.append(row)
    target = write_source_from_arg(filename)
    with target.open('wb') as target2:
        wb.save(target2)
def _insert(table, h5table):
    """Append all data rows of `table` to an open HDF5 table."""
    # iterate data rows only; the header is not written
    for row in data(table):
        # relies on the input table's field order matching the HDF5
        # table's column order; field names need not match
        for idx, colname in enumerate(h5table.colnames):
            h5table.row[colname] = row[idx]
        h5table.row.append()
    h5table.flush()
def nrows(table):
    """
    Count the number of data rows in a table. E.g.::

        >>> import petl as etl
        >>> table = [['foo', 'bar'], ['a', 1], ['b', 2]]
        >>> etl.nrows(table)
        2

    """

    # count data rows without materialising them
    count = 0
    for _ in data(table):
        count += 1
    return count
def _writecsv(table, source, mode, write_header, encoding, errors, **csvargs):
    """Write rows to a binary source as CSV text."""
    # drop the header row unless the caller asked for it
    rows = table if write_header else data(table)
    with source.open(mode) as buf:
        # wrap the binary buffer for text-mode csv output
        csvfile = io.TextIOWrapper(buf, encoding=encoding, errors=errors,
                                   newline='')
        try:
            writer = csv.writer(csvfile, **csvargs)
            writer.writerows(rows)
            csvfile.flush()
        finally:
            # detach rather than close so the underlying buffer survives
            csvfile.detach()
def toxlsx(tbl, filename, sheet=None, write_header=True):
    """
    Write a table to a new Excel .xlsx file.
    """

    import openpyxl
    wb = openpyxl.Workbook(write_only=True)
    ws = wb.create_sheet(title=sheet)
    # emit either the whole table (header included) or data rows only
    row_source = tbl if write_header else data(tbl)
    for row in row_source:
        ws.append(row)
    wb.save(filename)
def _writecsv(table, source, mode, write_header, encoding, errors, **csvargs):
    """Write rows as CSV, choosing a writer based on the target encoding."""
    # drop the header row unless the caller wants it written
    rows = table if write_header else data(table)
    with source.open(mode) as buf:
        codec = getcodec(encoding)
        if codec.name == "ascii":
            # plain ascii can go straight through the stdlib writer
            writer = csv.writer(buf, **csvargs)
        else:
            # anything else needs the encoding-aware writer
            writer = UnicodeWriter(buf, encoding=encoding, errors=errors,
                                   **csvargs)
        for row in rows:
            writer.writerow(row)
def itercrossjoin(sources, prefix):
    """Yield the header then every row combination across all sources."""
    # build the output header, optionally tagging each source's fields
    # with a one-based source number
    outhdr = []
    for i, src in enumerate(sources):
        if prefix:
            # use one-based numbering
            outhdr.extend('%s_%s' % (i + 1, f) for f in header(src))
        else:
            outhdr.extend(header(src))
    yield tuple(outhdr)

    # cartesian product of all data rows, concatenated per combination
    rowsets = [data(src) for src in sources]
    for combination in itertools.product(*rowsets):
        merged = []
        for row in combination:
            merged.extend(row)
        yield tuple(merged)
def _writecsv(table, source, mode, write_header, encoding, errors, **csvargs):
    """Write rows as CSV, picking a writer suited to the encoding."""
    if write_header:
        rows = table
    else:
        # skip the header row
        rows = data(table)
    with source.open(mode) as buf:
        # the stdlib csv writer suffices for ascii output; any other
        # encoding goes through the unicode-aware writer
        if getcodec(encoding).name == 'ascii':
            writer = csv.writer(buf, **csvargs)
        else:
            writer = UnicodeWriter(buf, encoding=encoding, errors=errors,
                                   **csvargs)
        for row in rows:
            writer.writerow(row)
def appendxlsx(tbl, filename, sheet=None, write_header=False):
    """
    Appends rows to an existing Excel .xlsx file.
    """

    import openpyxl
    wb = openpyxl.load_workbook(filename=filename, read_only=False)
    # resolve the target worksheet name: first sheet by default,
    # positional for an int, otherwise treat as a sheet name
    if sheet is None:
        sheet_name = wb.sheetnames[0]
    elif isinstance(sheet, int):
        sheet_name = wb.sheetnames[sheet]
    else:
        sheet_name = str(sheet)
    ws = wb[sheet_name]
    # append either the whole table (header included) or data rows only
    for row in (tbl if write_header else data(tbl)):
        ws.append(row)
    wb.save(filename)
def people_list(request, uuid):
    """
    Render a sorted, paginated list of people from the CSV download
    identified by ``uuid``.

    Query parameters:

    - ``sortby``: column to sort on; must exist in the CSV header.
    - ``ordering``: ``asc`` or ``desc``.
    - ``count``: positive number of rows to display.

    Returns 404 if the download does not exist and 400 for any invalid
    query parameter.
    """
    try:
        csvdownload = CSVDownload.objects.get(uuid=uuid)
    except CSVDownload.DoesNotExist:
        return HttpResponseNotFound("Not found.")
    fname = '{0}.csv'.format(csvdownload.uuid)
    full_fname = os.path.join(settings.CSV_DIR, fname)
    people = fromcsv(full_fname)
    sortby = request.GET.get('sortby', 'name')
    ordering = request.GET.get('ordering', 'asc')
    count_str = request.GET.get('count', '10')
    if sortby not in header(people):
        return HttpResponseBadRequest('Bad request.')
    if ordering not in ('asc', 'desc'):
        return HttpResponseBadRequest('Bad request.')
    try:
        count = int(count_str)
    except ValueError:
        return HttpResponseBadRequest('Bad request.')
    if count < 1:
        return HttpResponseBadRequest('Bad request.')
    people = sort(people, sortby, reverse=ordering == 'desc')
    # Fix: has_more was previously computed as len(people) > count
    # *after* truncating with head(people, count), so it could never be
    # True. Probe one row past the page size before truncating instead.
    has_more = sum(1 for _ in data(head(people, count + 1))) > count
    people = head(people, count)
    return render(
        request, 'people_list.html', {
            'csvdownload': csvdownload,
            'headers': header(people),
            'people': data(people),
            'has_more': has_more,
            'queryparams': {
                'sortby': sortby,
                'ordering': ordering,
                'count': str(count + 10)
            }
        })
def tojsonarrays(table, source=None, prefix=None, suffix=None,
                 output_header=False, *args, **kwargs):
    """
    Write a table in JSON format, with rows output as JSON arrays. E.g.::

        >>> import petl as etl
        >>> table1 = [['foo', 'bar'],
        ...           ['a', 1],
        ...           ['b', 2],
        ...           ['c', 2]]
        >>> etl.tojsonarrays(table1, 'example.json')
        >>> # check what it did
        ... print(open('example.json').read())
        [["a", 1], ["b", 2], ["c", 2]]

    Note that this is currently not streaming, all data is loaded into
    memory before being written to the file.

    """

    if output_header:
        rows = table
    else:
        # exclude the header row
        rows = data(table)
    # not streaming: load everything into memory before writing
    obj = list(rows)
    _writejson(source, obj, prefix, suffix, *args, **kwargs)
def test_data():
    # data() should yield only the rows following the header
    table = (('foo', 'bar'), ('a', 1), ('b', 2))
    expect = (('a', 1), ('b', 2))
    ieq(expect, data(table))
def __iter__(self):
    # flatten the table: walk the data rows and emit each value in turn
    for current_row in data(self.table):
        for cell in current_row:
            yield cell