Example #1
def _import_rowids_file(args):
    # Single-tuple signature so the function can be mapped over many shards.
    rows_csv_in, rowids_out, id_offset, id_stride, id_field = args
    assert os.path.isfile(rows_csv_in)
    with csv_reader(rows_csv_in) as reader:
        header = reader.next()
        if id_field is None:
            # No id column was named: synthesize ids as "<basename>:<row index>".
            basename = os.path.basename(rows_csv_in)
            get_rowid = lambda i, row: '{}:{}'.format(basename, i)
        else:
            # Use the value of the named id column.
            pos = header.index(id_field)
            get_rowid = lambda i, row: row[pos]
        with csv_writer(rowids_out) as writer:
            for i, row in enumerate(reader):
                # Pair a strided integer id with the external row id.
                writer.writerow((id_offset + id_stride * i, get_rowid(i, row)))
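
The packed args tuple suggests this helper is designed to be mapped over many CSV shards at once. Below is a minimal sketch of how the argument tuples might be built and fed to it; the shard paths, output names, and the plain loop are illustrative assumptions, not part of the example above:

import glob

# Hypothetical shard layout; a real pipeline supplies its own paths.
shards = sorted(glob.glob('data/rows.*.csv.gz'))
stride = len(shards)  # interleave integer ids so shards never collide
tasks = [
    (shard, 'data/rowids.{}.csv'.format(i), i, stride, None)  # id_field=None
    for i, shard in enumerate(shards)
]

# A plain loop works; the tuple-per-call signature also suits a parallel map.
for task in tasks:
    _import_rowids_file(task)

With id_field=None, each output pairs a strided integer id with a synthetic '<basename>:<row index>' row id, so ids stay unique across shards.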
Example #2
def export_rows(encoding_in, rows_in, rows_csv_out, chunk_size=1000000):
    '''
    Export rows from gzipped-protobuf-stream to directory-of-gzipped-csv-files.
    '''
    rows_csv_out = os.path.abspath(rows_csv_out)
    if rows_csv_out == os.getcwd():
        raise LoomError('Cannot export_rows to working directory')
    for ext in ['.csv', '.gz', '.bz2']:
        if rows_csv_out.endswith(ext):
            raise LoomError(
                'Expected rows_csv_out to be a dirname, actual {}'.format(
                    rows_csv_out))
    if not (chunk_size > 0):
        raise LoomError('Invalid chunk_size {}, must be positive'.format(
            chunk_size))
    encoders = json_load(encoding_in)
    fields = [loom.schema.MODEL_TO_DATATYPE[e['model']] for e in encoders]
    decoders = [load_decoder(e) for e in encoders]
    header = ['_id'] + [e['name'] for e in encoders]
    if os.path.exists(rows_csv_out):
        shutil.rmtree(rows_csv_out)
    os.makedirs(rows_csv_out)
    # First pass counts rows; second pass decodes them.
    row_count = sum(1 for _ in protobuf_stream_load(rows_in))
    rows = loom.cFormat.row_stream_load(rows_in)
    chunk_count = (row_count + chunk_size - 1) / chunk_size  # ceiling division
    chunks = sorted(
        os.path.join(rows_csv_out, 'rows.{}.csv.gz'.format(i))
        for i in xrange(chunk_count)
    )
    with ExitStack() as stack:
        # Keep one gzipped csv writer open per chunk file.
        with_ = stack.enter_context
        writers = [with_(csv_writer(f)) for f in chunks]
        for writer in writers:
            writer.writerow(header)
        for row, writer in izip(rows, cycle(writers)):
            # Distribute rows round-robin across the chunk files.
            data = row.iter_data()
            schema = izip(data['observed'], fields, decoders)
            csv_row = [row.id]
            for observed, field, decode in schema:
                # Decode observed values; leave unobserved cells empty.
                csv_row.append(decode(data[field].next()) if observed else '')
            writer.writerow(csv_row)
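
For reference, a minimal call sketch; the paths below are hypothetical and only illustrate the constraints the function enforces (rows_csv_out must be a directory name rather than a .csv/.gz/.bz2 file, and chunk_size must be positive):

# Hypothetical input/output paths for illustration.
export_rows(
    encoding_in='ingest/encoding.json.gz',  # per-column encoder metadata
    rows_in='ingest/rows.pbs.gz',           # gzipped protobuf row stream
    rows_csv_out='export/rows_csv',         # a directory name, not *.csv/*.gz
    chunk_size=500000,                      # at most this many rows per chunk file
)

Note that any existing rows_csv_out directory is deleted and recreated, and rows are spread round-robin across ceil(row_count / chunk_size) gzipped csv files, each starting with the header row.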