def _import_rowids_file(args):
    rows_csv_in, rowids_out, id_offset, id_stride, id_field = args
    assert os.path.isfile(rows_csv_in)
    with csv_reader(rows_csv_in) as reader:
        header = reader.next()
        if id_field is None:
            # Synthesize a rowid from the source filename and line number.
            basename = os.path.basename(rows_csv_in)
            get_rowid = lambda i, row: '{}:{}'.format(basename, i)
        else:
            # Read the rowid from the named id column.
            pos = header.index(id_field)
            get_rowid = lambda i, row: row[pos]
        with csv_writer(rowids_out) as writer:
            for i, row in enumerate(reader):
                writer.writerow((id_offset + id_stride * i, get_rowid(i, row)))
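
# Sketch of intended use (assumption, not part of the original source): the
# (id_offset, id_stride) arguments suggest this helper is meant to be mapped
# over csv shards, e.g. by a worker pool, so that integer rowids interleave
# across shards without colliding. The shard filenames and Pool usage below
# are hypothetical, for illustration only.
#
#     from multiprocessing import Pool
#     tasks = [
#         ('rows.0.csv.gz', 'rowids.0.csv.gz', 0, 2, None),
#         ('rows.1.csv.gz', 'rowids.1.csv.gz', 1, 2, None),
#     ]
#     Pool(2).map(_import_rowids_file, tasks)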
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError(
            'Invalid chunk_size {}, must be positive'.format(chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    # Count rows up front so the output can be split into evenly sized chunks.
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    chunk_count = (row_count + chunk_size - 1) / chunk_size
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        # Round-robin rows across the chunk files, decoding observed values
        # and leaving unobserved cells empty.
        for row, writer in izip(rows, cycle(writers)):
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
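
# Example call (sketch): the paths below are hypothetical and assume an
# encoding json and protobuf row stream produced by an earlier ingest step.
#
#     export_rows(
#         encoding_in='ingest/encoding.json.gz',
#         rows_in='ingest/rows.pbs.gz',
#         rows_csv_out='export/rows_csv',
#         chunk_size=500000)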