# Example no. 1
def insert_dataframe(subsite, node, sensor, method, stream, deployment,
                     binsize, dataframe):
    """
    Insert the provided dataframe into cassandra, calculating bin based on binsize

    Rows are grouped by bin number and each group is written with one
    concurrent batch of prepared-statement executions.

    :param subsite: value written to the ``subsite`` column of every row
    :param node: value written to the ``node`` column of every row
    :param sensor: value written to the ``sensor`` column of every row
    :param method: value written to the ``method`` column of every row
    :param stream: name of the table to insert into; also used to look up
        which dataframe columns are written
    :param deployment: integer written to the ``deployment`` column
    :param binsize: passed to ``get_bin_number`` together with each row's time
    :param dataframe: pandas DataFrame with a ``time`` column.  NOTE: it is
        modified in place -- ``bin`` and ``id`` columns are added/overwritten.
    :return: dict mapping each bin number to
        ``{'first': <min time>, 'last': <max time>, 'count': <rows>}``.  When
        some inserts in a bin fail, the values describe only the rows that
        were inserted successfully.
    """
    log.info('insert_dataframe(%s, %s, %s, %s, %s, %s, %s, <DATAFRAME>)',
             subsite, node, sensor, method, stream, deployment, binsize)

    # Only dataframe columns that the stream's query columns include are
    # written; anything else in the dataframe is ignored.
    metadata_cols = SessionManager.get_query_columns(stream)
    data_cols = [col for col in dataframe.columns if col in metadata_cols]
    fixed_cols = ['subsite', 'node', 'sensor', 'method', 'deployment']
    # NOTE(review): the fixed values are interpolated into the CQL text rather
    # than bound, so they must not contain quotes -- confirm callers only pass
    # trusted identifiers here.
    fixed_values = "'%s', '%s', '%s', '%s', %d" % (subsite, node, sensor,
                                                   method, deployment)
    variable_cols = ['bin', 'id'] + data_cols

    statement = "INSERT INTO %s (%s, %s) VALUES (%s, %s)" % (
        stream,
        ','.join(fixed_cols),
        ','.join(variable_cols),
        fixed_values,
        ','.join('?' for _ in variable_cols))
    ps = SessionManager.prepare(statement)

    # add bin number to dataframe
    dataframe['bin'] = [
        get_bin_number(t, binsize) for t in dataframe.time.values
    ]
    # add unique UUID to each row in dataframe
    dataframe['id'] = [uuid.uuid4() for _ in dataframe.time.values]

    def values_generator(df_group):
        """Yield one list of bind values per row, ordered as variable_cols."""
        for _, row in df_group.iterrows():
            vals = []
            for col in variable_cols:
                val = row[col]
                # numpy arrays are converted to plain lists before binding
                if isinstance(val, np.ndarray):
                    val = val.tolist()
                vals.append(val)
            yield vals

    inserted = {}
    for bin_number, group in dataframe.groupby('bin'):
        first = group.time.min()
        last = group.time.max()
        count = group.time.size
        log.info('Inserting into %s bin %d first: %.2f last: %.2f count: %d',
                 stream, bin_number, first, last, count)
        # raise_on_first_error=False makes failed rows come back as
        # (False, exception) entries instead of aborting the whole batch,
        # which is what the success mask below relies on.
        results = execute_concurrent_with_args(SessionManager.session(),
                                               ps,
                                               values_generator(group),
                                               raise_on_first_error=False)
        # Results are returned in the same order as the supplied parameters,
        # so the mask lines up positionally with the rows of `group`.
        success_mask = np.array([success for success, _ in results],
                                dtype=bool)
        if not success_mask.all():
            log.error('Unable to insert all records, failed records: %r',
                      group[~success_mask])
            # Report only what actually made it into the table.
            first = group.time[success_mask].min()
            last = group.time[success_mask].max()
            count = group.time[success_mask].size

        inserted[bin_number] = {'first': first, 'last': last, 'count': count}

    return inserted