Beispiel #1
0
def convert_to_sstables(input_files, column_family,
                        output_dir_name, keyspace, timestamp, buffer_size,
                        data_type):
    """Write tab-separated input lines into SSTables in ``output_dir_name``.

    Every input line must consist of exactly three tab-separated fields:
    row key, column key and value.  The value is converted with the
    coercer registered for ``data_type`` in the module-level ``COERCERS``
    mapping and added as a column of the current row.

    Args:
        input_files: File name(s) to read, in any form accepted by
            ``fileinput.FileInput``.
        column_family: Column family name handed to the writer.
        output_dir_name: Output directory path.  Created with
            ``File.mkdir`` (a single level only) when it does not exist.
        keyspace: Keyspace name handed to the writer.
        timestamp: Timestamp attached to every column written.
        buffer_size: Passed unchanged to ``SSTableSimpleUnsortedWriter``.
        data_type: Key into ``COERCERS`` selecting the value conversion.

    Raises:
        ValueError: If ``data_type`` is not a key of ``COERCERS``, or if a
            line does not split into exactly three fields.
    """
    import fileinput
    from java.io import File
    from org.apache.cassandra.io.sstable import SSTableSimpleUnsortedWriter
    from org.apache.cassandra.db.marshal import AsciiType

    try:
        coercer = COERCERS[data_type]
    except KeyError:
        raise ValueError("invalid data type")

    output_dir = File(output_dir_name)

    if not output_dir.exists():
        output_dir.mkdir()

    # A private FileInput instance (instead of the module-global
    # fileinput.input() state) can be closed reliably below, so a failure
    # half-way through a file does not leave the global state "active".
    # The constructor does not open any file yet.
    reader = fileinput.FileInput(input_files)

    writer = SSTableSimpleUnsortedWriter(output_dir,
                                         keyspace, column_family,
                                         AsciiType.instance, None,
                                         buffer_size)

    try:
        previous_rowkey = None
        for line in reader:
            rowkey, colkey, value = line.rstrip("\n").split("\t")

            # Only start a new row when the row key actually changes.
            if rowkey != previous_rowkey:
                writer.newRow(bytes(rowkey))
                previous_rowkey = rowkey

            coerced = coercer(value)
            writer.addColumn(bytes(colkey), coerced, timestamp)

            # Progress report; lineno() is cumulative over all input files.
            if reader.lineno() % 1000 == 0:
                # Single-argument parenthesised form prints the same text
                # under the Python 2 print statement.
                print("%d items processed (%s)" % (reader.lineno(),
                                                   reader.filename()))
    finally:
        # Close the input first; the nested finally guarantees the writer
        # is closed even if closing the input fails.
        try:
            reader.close()
        finally:
            writer.close()
Beispiel #2
0
def convert_to_sstables(input_files, column_family,
                        output_dir_name, keyspace, timestamp, buffer_size,
                        data_type, verbose=False):
    """Write tab-separated input lines into compressed SSTables.

    Every input line must consist of three or four tab-separated fields:
    row key, column key, value and, optionally, a TTL.  The value is
    converted with the coercer registered for ``data_type`` in the
    module-level ``COERCERS`` mapping.  Lines with a TTL are written as
    expiring columns, the others as regular columns.

    Args:
        input_files: File name(s) to read, in any form accepted by
            ``fileinput.FileInput``.
        column_family: Column family name handed to the writer.
        output_dir_name: Output directory path.  Created with
            ``File.mkdir`` (a single level only) when it does not exist.
        keyspace: Keyspace name handed to the writer.
        timestamp: Timestamp attached to every column written.
        buffer_size: Passed unchanged to ``SSTableSimpleUnsortedWriter``.
        data_type: Key into ``COERCERS`` selecting the value conversion.
        verbose: When true, print a progress line every 10000 input lines.

    Raises:
        ValueError: If ``data_type`` is not a key of ``COERCERS`` or a TTL
            field is not an integer.
        Exception: If a line has neither three nor four fields.
    """
    import fileinput
    import logging
    from java.io import File
    from org.apache.cassandra.io.sstable import SSTableSimpleUnsortedWriter
    from org.apache.cassandra.db.marshal import AsciiType
    from org.apache.cassandra.service import StorageService
    from org.apache.cassandra.io.compress import CompressionParameters

    partitioner = StorageService.getPartitioner()

    try:
        coercer = COERCERS[data_type]
    except KeyError:
        raise ValueError("invalid data type")

    output_dir = File(output_dir_name)

    if not output_dir.exists():
        output_dir.mkdir()

    compression_options = CompressionParameters.create({
        'sstable_compression': 'org.apache.cassandra.io.compress.SnappyCompressor',
        'chunk_length_kb': '64'
    })

    # A private FileInput instance (instead of the module-global
    # fileinput.input() state) can be closed reliably below, so a failure
    # half-way through a file does not leave the global state "active".
    # The constructor does not open any file yet.
    reader = fileinput.FileInput(input_files)

    writer = SSTableSimpleUnsortedWriter(output_dir,
                                         partitioner,
                                         keyspace,
                                         column_family,
                                         AsciiType.instance,
                                         None,
                                         buffer_size,
                                         compression_options)

    try:
        previous_rowkey = None
        for line in reader:
            ttl = None

            t_columns = line.rstrip("\n").split("\t")
            if len(t_columns) == 3:
                rowkey, colkey, value = t_columns
            elif len(t_columns) == 4:
                rowkey, colkey, value, ttl = t_columns
                ttl = int(ttl)
            else:
                raise Exception("unknown data format for %r" % (t_columns,))

            # Only start a new row when the row key actually changes.
            if rowkey != previous_rowkey:
                writer.newRow(bytes(rowkey))
                previous_rowkey = rowkey

            coerced = coercer(value)

            if ttl is None:
                writer.addColumn(bytes(colkey), coerced, timestamp)
            else:
                # see
                # https://svn.apache.org/repos/asf/cassandra/trunk/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java addExpiringColumn:expirationTimestampMS
                # for explanation
                # NOTE(review): presumably timestamp is in microseconds and
                # ttl in seconds, making both terms milliseconds - confirm
                # against the linked source.
                expirationTimestampMS = (timestamp / 1000) + (ttl * 1000)
                writer.addExpiringColumn(bytes(colkey), coerced,
                                         timestamp, ttl, expirationTimestampMS)

            # Progress report; lineno() is cumulative over all input files.
            if verbose and reader.lineno() % 10000 == 0:
                # Single-argument parenthesised form prints the same text
                # under the Python 2 print statement.
                print("%d items processed (%s)" % (reader.lineno(),
                                                   reader.filename()))
    except:
        # it's common that whatever causes us to fail also causes the finally
        # clause below to fail, which masks the original exception
        # NOTE(review): the bare except is kept on purpose so that nothing
        # escapes unlogged; under Jython, Java exceptions may not be caught
        # by "except Exception" - confirm before narrowing it.
        logging.exception("Failed")
        raise
    finally:
        # Close the input first; the nested finally guarantees the writer
        # is closed even if closing the input fails.
        try:
            reader.close()
        finally:
            writer.close()
Beispiel #3
0
def utf8(val):
    """Return *val* as a byte string, encoding text as UTF-8.

    Text (``unicode`` on Python 2, ``str`` on Python 3) is encoded
    explicitly as UTF-8 instead of relying on ``bytes()``, which on
    Python 2 uses the implicit ASCII codec and fails on non-ASCII text.
    Any other value is passed to ``bytes()`` unchanged, so byte strings
    come back as they are.
    """
    # type(u"") is the text type on both Python 2 and Python 3.
    if isinstance(val, type(u"")):
        return val.encode("utf-8")
    return bytes(val)
Beispiel #4
0
def convert_to_sstables(input_files,
                        column_family,
                        output_dir_name,
                        keyspace,
                        timestamp,
                        buffer_size,
                        data_type,
                        verbose=False):
    """Convert tab-separated input files into Snappy-compressed SSTables.

    Each input line holds three or four tab-separated fields: row key,
    column key, value and an optional TTL.  Values are converted by the
    coercer that the module-level ``COERCERS`` mapping associates with
    ``data_type``.  A line carrying a TTL becomes an expiring column; a
    line without one becomes a regular column.

    Args:
        input_files: File name(s) to read, in any form accepted by
            ``fileinput.FileInput``.
        column_family: Column family name handed to the writer.
        output_dir_name: Output directory path.  Created with
            ``File.mkdir`` (a single level only) when it does not exist.
        keyspace: Keyspace name handed to the writer.
        timestamp: Timestamp attached to every column written.
        buffer_size: Passed unchanged to ``SSTableSimpleUnsortedWriter``.
        data_type: Key into ``COERCERS`` selecting the value conversion.
        verbose: When true, print a progress line every 10000 input lines.

    Raises:
        ValueError: If ``data_type`` is not a key of ``COERCERS`` or a TTL
            field is not an integer.
        Exception: If a line has neither three nor four fields.
    """
    import fileinput
    import logging
    from java.io import File
    from org.apache.cassandra.io.sstable import SSTableSimpleUnsortedWriter
    from org.apache.cassandra.db.marshal import AsciiType
    from org.apache.cassandra.service import StorageService
    from org.apache.cassandra.io.compress import CompressionParameters

    partitioner = StorageService.getPartitioner()

    try:
        coercer = COERCERS[data_type]
    except KeyError:
        raise ValueError("invalid data type")

    output_dir = File(output_dir_name)

    if not output_dir.exists():
        output_dir.mkdir()

    compression_options = CompressionParameters.create({
        'sstable_compression':
        'org.apache.cassandra.io.compress.SnappyCompressor',
        'chunk_length_kb': '64'
    })

    # Use a FileInput object owned by this call rather than the shared
    # module-level fileinput.input() state: it can always be closed in the
    # finally clause, so an error in the middle of a file does not leave
    # the global state "active".  No file is opened by the constructor.
    reader = fileinput.FileInput(input_files)

    writer = SSTableSimpleUnsortedWriter(output_dir, partitioner, keyspace,
                                         column_family, AsciiType.instance,
                                         None, buffer_size,
                                         compression_options)

    try:
        previous_rowkey = None
        for line in reader:
            ttl = None

            t_columns = line.rstrip("\n").split("\t")
            if len(t_columns) == 3:
                rowkey, colkey, value = t_columns
            elif len(t_columns) == 4:
                rowkey, colkey, value, ttl = t_columns
                ttl = int(ttl)
            else:
                raise Exception("unknown data format for %r" % (t_columns, ))

            # Start a new row only when the row key differs from the one
            # seen on the previous line.
            if rowkey != previous_rowkey:
                writer.newRow(bytes(rowkey))
                previous_rowkey = rowkey

            coerced = coercer(value)

            if ttl is None:
                writer.addColumn(bytes(colkey), coerced, timestamp)
            else:
                # see
                # https://svn.apache.org/repos/asf/cassandra/trunk/src/java/org/apache/cassandra/io/sstable/AbstractSSTableSimpleWriter.java addExpiringColumn:expirationTimestampMS
                # for explanation
                # NOTE(review): presumably timestamp is in microseconds and
                # ttl in seconds, making both terms milliseconds - confirm
                # against the linked source.
                expirationTimestampMS = (timestamp / 1000) + (ttl * 1000)
                writer.addExpiringColumn(bytes(colkey), coerced, timestamp,
                                         ttl, expirationTimestampMS)

            # Progress report; lineno() counts lines across all input files.
            if verbose and reader.lineno() % 10000 == 0:
                # Single-argument parenthesised form prints the same text
                # under the Python 2 print statement.
                print("%d items processed (%s)" % (reader.lineno(),
                                                   reader.filename()))
    except:
        # it's common that whatever causes us to fail also causes the finally
        # clause below to fail, which masks the original exception
        # NOTE(review): the bare except is kept on purpose so that nothing
        # escapes unlogged; under Jython, Java exceptions may not be caught
        # by "except Exception" - confirm before narrowing it.
        logging.exception("Failed")
        raise
    finally:
        # Close the input first; the nested finally guarantees the writer
        # is closed even if closing the input fails.
        try:
            reader.close()
        finally:
            writer.close()