Example #1
def get_dataframe(self, object_id) -> pd.DataFrame:
    # Get PlasmaBuffer from ObjectID
    [data] = self.client.get_buffers([object_id])
    buffer = BufferReader(data)
    reader = RecordBatchStreamReader(buffer)
    record_batch = reader.read_next_batch()
    return record_batch.to_pandas()
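
For context, a minimal sketch of the producer side this method assumes: writing a record batch into the Plasma store so it can be fetched back by ObjectID. It assumes an older pyarrow release that still ships pyarrow.plasma, a Plasma store already running at the made-up socket path /tmp/plasma, and the helper name put_record_batch is hypothetical.

import os

import pyarrow as pa
import pyarrow.plasma as plasma  # assumption: a pyarrow version that still bundles plasma
from pyarrow import RecordBatchStreamWriter


def put_record_batch(client, batch):
    # Measure the serialized size of the IPC stream with a MockOutputStream.
    mock = pa.MockOutputStream()
    with RecordBatchStreamWriter(mock, batch.schema) as writer:
        writer.write_batch(batch)
    data_size = mock.size()

    # Allocate a buffer of that size in the store, write the stream into it, then seal it.
    object_id = plasma.ObjectID(os.urandom(20))
    buf = client.create(object_id, data_size)
    with RecordBatchStreamWriter(pa.FixedSizeBufferWriter(buf), batch.schema) as writer:
        writer.write_batch(batch)
    client.seal(object_id)
    return object_id


# client = plasma.connect("/tmp/plasma")  # made-up socket path
# object_id = put_record_batch(
#     client, pa.RecordBatch.from_arrays([pa.array([1, 2, 3])], ["x"]))
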
Example #2
def readArrow(self, source):
    '''
        description:
            Read an Arrow IPC stream file from disk using Apache PyArrow
        input:
            source: file path
        return:
            pyarrow table
    '''
    reader = RecordBatchStreamReader(source)
    pa_table = reader.read_all()
    return pa_table
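
Since RecordBatchStreamReader expects the Arrow IPC stream format, a file read this way has to be written with RecordBatchStreamWriter. A small round-trip sketch (the table contents and file name are made up for illustration):

import pyarrow as pa
from pyarrow import RecordBatchStreamReader, RecordBatchStreamWriter

table = pa.table({"a": [1, 2, 3], "b": ["x", "y", "z"]})
path = "example.arrow"  # made-up file name

# Write the table in the IPC stream format.
with pa.OSFile(path, "wb") as sink:
    with RecordBatchStreamWriter(sink, table.schema) as writer:
        writer.write_table(table)

# Read it back, same call pattern as readArrow() above.
with pa.OSFile(path, "rb") as source:
    round_tripped = RecordBatchStreamReader(source).read_all()

assert round_tripped.equals(table)
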
Example #3
def read_arrow_to_DF(self, source):
    '''
        description:
            Read an Arrow IPC stream file from disk using Apache PyArrow
            and store it on the instance as a pandas dataframe
        input:
            source: file path (".arrow" extension is appended)
        return:
            status string; the dataframe itself is kept in self.pandas_df
    '''
    source = source+".arrow"
    self.pandas_df = RecordBatchStreamReader(source).read_all().to_pandas()
    self.back_up_dimension_pandas = self.pandas_df
    return "data read successfully"
Example #4
def _get_work_unit_results(
    query_id: str,
    token_work_unit: Tuple[str, int],
    client_lakeformation: boto3.client,
) -> Table:
    token, work_unit = token_work_unit
    messages: NativeFile = client_lakeformation.get_work_unit_results(
        QueryId=query_id, WorkUnitToken=token,
        WorkUnitId=work_unit)["ResultStream"]
    return RecordBatchStreamReader(messages.read()).read_all()
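
A hedged sketch of a caller that fans out over several work units and stitches the per-unit Arrow tables together; the list of (token, work unit id) pairs is assumed to come from the Lake Formation query-planning step, and read_all_work_units is a hypothetical name:

from typing import List, Tuple

import boto3
import pyarrow as pa


def read_all_work_units(query_id: str,
                        token_work_units: List[Tuple[str, int]]) -> pa.Table:
    client_lakeformation = boto3.client("lakeformation")
    tables = [
        _get_work_unit_results(query_id, token_work_unit, client_lakeformation)
        for token_work_unit in token_work_units
    ]
    # Concatenate the per-work-unit tables into a single pyarrow Table.
    return pa.concat_tables(tables)
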
Example #5
def test_iterate_over_int64_chunk():
    stream = BytesIO()
    field_foo = pyarrow.field("column_foo", pyarrow.int64(), True)
    field_bar = pyarrow.field("column_bar", pyarrow.int64(), True)
    schema = pyarrow.schema([field_foo, field_bar])
    column_meta = [("column_foo", "FIXED", None, 0, 0, 0, 0),
                   ("column_bar", "FIXED", None, 0, 0, 0, 0)]

    column_size = 2
    batch_row_count = 10
    batch_count = 10
    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            for k in range(batch_row_count):
                data = None if bool(random.getrandbits(1)) else random.randint(
                    -100, 100)
                column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data))

        expected_data.append(column_arrays)
        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    it = PyArrowChunkIterator()
    for rb in reader:
        it.add_record_batch(rb)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[int(count / 10)][0][count % 10]
            assert val[1] == expected_data[int(count / 10)][1][count % 10]
            count += 1
        except StopIteration:
            assert count == 100
            break
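
The PyArrowChunkIterator above is specific to the Snowflake connector; the write-then-read round trip itself can be checked with plain pyarrow. A minimal sketch (column names and random values mirror the test above; the function name is made up):

import random
from io import BytesIO

import pyarrow
from pyarrow import RecordBatch, RecordBatchStreamReader, RecordBatchStreamWriter


def roundtrip_int64_batches(batch_count=10, batch_row_count=10):
    stream = BytesIO()
    schema = pyarrow.schema([
        pyarrow.field("column_foo", pyarrow.int64(), True),
        pyarrow.field("column_bar", pyarrow.int64(), True),
    ])

    expected = []
    with RecordBatchStreamWriter(stream, schema) as writer:
        for _ in range(batch_count):
            columns = [[None if random.getrandbits(1) else random.randint(-100, 100)
                        for _ in range(batch_row_count)] for _ in range(2)]
            expected.append(columns)
            writer.write_batch(RecordBatch.from_arrays(
                [pyarrow.array(col, type=pyarrow.int64()) for col in columns],
                ["column_foo", "column_bar"]))

    # Seek back to the beginning and materialize everything into one table.
    stream.seek(0)
    table = RecordBatchStreamReader(stream).read_all()
    assert table.column("column_foo").to_pylist() == [v for b in expected for v in b[0]]
    assert table.column("column_bar").to_pylist() == [v for b in expected for v in b[1]]
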
Example #6
def iterate_over_test_chunk(pyarrow_type,
                            column_meta,
                            source_data_generator,
                            expected_data_transformer=None):
    stream = BytesIO()

    assert len(pyarrow_type) == len(column_meta)

    column_size = len(pyarrow_type)
    batch_row_count = 10
    batch_count = 9

    fields = []
    for i in range(column_size):
        fields.append(
            pyarrow.field("column_{}".format(i), pyarrow_type[i], True,
                          column_meta[i]))
    schema = pyarrow.schema(fields)

    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            while not_none_cnt == 0:
                column_data = []
                for k in range(batch_row_count):
                    data = None if bool(
                        random.getrandbits(1)) else source_data_generator()
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            column_arrays.append(column_data)
            py_arrays.append(pyarrow.array(column_data, type=pyarrow_type[j]))

        if expected_data_transformer:
            for col_idx in range(len(column_arrays)):
                column_arrays[col_idx] = [
                    expected_data_transformer(data)
                    if data is not None else None for data in column_arrays[col_idx]
                ]
        expected_data.append(column_arrays)

        column_names = ["column_{}".format(i) for i in range(column_size)]
        rb = RecordBatch.from_arrays(py_arrays, column_names)
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    context = ArrowConverterContext()
    it = PyArrowIterator(reader, context)
    it.init(ROW_UNIT)

    count = 0
    while True:
        try:
            val = next(it)
            for i in range(column_size):
                batch_index = int(count / batch_row_count)
                assert val[i] == expected_data[batch_index][i][
                    count - batch_row_count * batch_index]
            count += 1
        except StopIteration:
            assert count == (batch_count * batch_row_count)
            break
Example #7
class pandas_utils:
    pandas_df = None
    back_up_dimension_pandas = None
    dimensions_filters = {}
    group_by_backups = {}

    def __init__(self):
        self.pandas_df = None
        self.back_up_dimension_pandas = None
        self.dimensions_filters = {}
        self.dimensions_filters_response_format = {}
        self.group_by_backups = {}


    def hist_numpy(self, data, bins):
        '''
            description:
                Calculate a histogram on the CPU using numpy
            input:
                data: np array, bins: number of bins in the histogram
            Output:
                json -> {X:[__values_of_colName_with_max_64_bins__], Y:[__frequencies_per_bin__]}
        '''
        frequencies, bin_edges = np.histogram(np.array(data), bins=bins)
        dict_temp = {}

        # drop the left-most edge so X and Y have the same length
        dict_temp['X'] = list(bin_edges.astype(float))[1:]
        dict_temp['Y'] = list(frequencies.astype(float))

        return json.dumps(dict_temp)


    def groupby(self, data, column_name, groupby_agg, groupby_agg_key):
        '''
            description:
                Calculate a groupby on a given column of the pandas dataframe
            input:
                data: pandas dataframe,
                column_name: column name,
                groupby_agg: aggregation spec passed to DataFrame.agg,
                groupby_agg_key: suffix used to cache the result in group_by_backups
            Output:
                grouped dataframe (also cached in self.group_by_backups)
        '''

        try:
            group_appl = data.groupby(by=[column_name]).agg(groupby_agg)
            group_appl.reset_index(inplace=True)
            key = column_name+"_"+groupby_agg_key
            self.group_by_backups[key] = group_appl
        except Exception as e:
            return "Exception *** in pandas groupby():"+str(e)

        return group_appl



    def get_columns(self):
        '''
            description:
                Column names of the loaded dataframe
            input:
                None (uses self.pandas_df)
            Output:
                stringified list of column names
        '''
        try:
            response = str(list(self.pandas_df.columns))
        except Exception as e:
            response = "Exception *** in pandas get_columns():"+str(e)
        return response

    def read_arrow_to_DF(self, source):
        '''
            description:
                Read an Arrow IPC stream file from disk using Apache PyArrow
                and store it on the instance as a pandas dataframe
            input:
                source: file path (".arrow" extension is appended)
            return:
                status string; the dataframe itself is kept in self.pandas_df
        '''
        source = source+".arrow"
        self.pandas_df = RecordBatchStreamReader(source).read_all().to_pandas()
        self.back_up_dimension_pandas = self.pandas_df
        return "data read successfully"


    def read_data(self, load_type, file):
        '''
            description:
                Read a file from the uploads folder as per the load type
                (only the arrow path is implemented here, via read_arrow_to_DF)
            input:
                load_type: csv or arrow
                file: file name inside the uploads folder
            return:
                status string from read_arrow_to_DF
        '''
        # file is in the uploads/ folder, so prepend that to the path
        file = str("/usr/src/app/node_server/uploads/"+file)
        status = self.read_arrow_to_DF(file)
        return status

    def default(self, o):
        # json.dumps fallback: convert numpy scalar types to native Python numbers
        if isinstance(o, (np.int8, np.int16, np.int32, np.int64)):
            return int(o)
        elif isinstance(o, (np.float16, np.float32, np.float64)):
            return float(o)
        raise TypeError

    def parse_dict(self, data):
        '''
            description:
                get a json string representation of the dictionary, that can be sent to the socket-client
            input:
                data: dict produced by DataFrame.to_dict()
            return:
                json string
        '''
        try:
            temp_dict = {}
            for i in data:
                if i[1] == '':
                    temp_key = i[0]
                else:
                    temp_key = '_'.join(i[::-1])
                temp_dict[temp_key] = list(data[i].values())
            return json.dumps(temp_dict,default=self.default)
        except Exception as e:
            return "Exception *** in pandas parse_dict() (helper function):"+str(e)

    def get_size(self):
        '''
            description:
                get the number of rows of the dataframe
            input:
                None (uses self.pandas_df)
            return:
                row count as a string
        '''
        try:
            return str(len(self.pandas_df))
        except Exception as e:
            return "Exception *** in pandas get_size():"+str(e)

    def reset_filters(self, data, omit=None, include_dim=['all']):
        '''
            description:
                reset filters on the pandas dataframe by executing all filters in the dimensions_filters dictionary
            input:
                data: dataset
                omit: column name, the filters associated to which, are to be omitted
                include_dim: list of column_names, which are to be included along with dimensions_filters.keys(); ['all'] to include all columns

            Output:
                result dataframe after executing the filters using the dataframe.query() command
        '''
        try:
            temp_list = []
            for key in self.dimensions_filters.keys():
                if omit is not None and omit == key:
                    continue
                if len(self.dimensions_filters[key])>0:
                    temp_list.append(self.dimensions_filters[key])
            query = ' and '.join(temp_list)
            if len(query) > 0:
                # return data.query(query)
                if include_dim[0] == 'all':
                    return data.query(query)
                else:
                    column_list = list(set(list(self.dimensions_filters.keys())+include_dim))
                    try:
                        return_val = data.loc[:,column_list].query(query)
                    except Exception as e:
                        return 'Exception *** in pandas reset_filters():'+str(e)
                    return return_val
            else:
                return data
        except Exception as e:
            return "Exception *** in pandas reset_filters():"+str(e)


    def reset_all_filters(self):
        '''
            description:
                reset all filters on all dimensions for the dataset
            input:
                None
            Output:
                number_of_rows_left
        '''
        try:
            self.pandas_df = self.back_up_dimension_pandas
            self.dimensions_filters.clear()
            self.dimensions_filters_response_format.clear()
            return str(len(self.pandas_df))
        except Exception as e:
            return "Exception *** in pandas reset_all_filters():"+str(e)

    def dimension_load(self, dimension_name):
        '''
            description:
                load a dimension
            Get parameters:
                dimension_name (string)
            Response:
                status -> success: dimension loaded successfully/dimension already exists   // error: "dimension not initialized"
        '''
        try:
            if dimension_name not in self.dimensions_filters:
                self.dimensions_filters[dimension_name] = ''
                self.dimensions_filters_response_format[dimension_name] = []
                res = 'dimension loaded successfully'
            else:
                res = 'dimension already exists'
            return res
        except Exception as e:
            return "Exception *** in pandas dimension_load():"+str(e)


    def dimension_reset(self, dimension_name):
        '''
            description:
                reset all filters on a dimension for pandas df
            Get parameters:
                dimension_name (string)
            Response:
                number_of_rows
        '''
        try:
            self.pandas_df = self.back_up_dimension_pandas
            self.dimensions_filters[dimension_name] = ''
            self.dimensions_filters_response_format[dimension_name] = []
            self.pandas_df = self.reset_filters(self.pandas_df)
            return str(len(self.pandas_df))

        except Exception as e:
            return "Exception *** in pandas dimension_reset():"+str(e)

    def dimension_get_max_min(self, dimension_name):
        '''
            description:
                get_max_min for a dimension for pandas
            Get parameters:
                dimension_name (string)
            Response:
                max_min_tuple
        '''
        try:
            max_min_tuple = (float(self.pandas_df[dimension_name].max()), float(self.pandas_df[dimension_name].min()))
            return str(max_min_tuple)
        except Exception as e:
            return "Exception *** in pandas dimension_get_max_min():"+str(e)

    def dimension_hist(self, dimension_name, num_of_bins):
        '''
            description:
                get histogram for a dimension for pandas
            Get parameters:
                dimension_name (string)
                num_of_bins (integer)
            Response:
                string(json) -> "{X:[__values_of_colName_with_max_64_bins__], Y:[__frequencies_per_bin__]}"
        '''
        try:
            num_of_bins = int(num_of_bins)
            if len(self.dimensions_filters.keys()) == 0 or (dimension_name not in self.dimensions_filters) or (dimension_name in self.dimensions_filters and self.dimensions_filters[dimension_name] == ''):
                return str(self.hist_numpy(self.pandas_df[str(dimension_name)].values,num_of_bins))
            else:
                temp_df = self.reset_filters(self.back_up_dimension_pandas, omit=dimension_name)
                return str(self.hist_numpy(temp_df[str(dimension_name)].values,num_of_bins))
        except Exception as e:
            return "Exception *** in pandas dimension_hist():"+str(e)



    def dimension_filter_order(self, dimension_name, sort_order, num_rows, columns):
        '''
            description:
                get columns values by a filter_order(all, top(n), bottom(n)) sorted by dimension_name
            Get parameters:
                dimension_name (string)
                sort_order (string): top/bottom/all
                num_rows (integer): OPTIONAL -> if sort_order= top/bottom
                columns (string): comma separated column names
            Response:
                string(json) -> "{col_1:[__row_values__], col_2:[__row_values__],...}"
        '''
        try:
            columns = columns.split(',')
            if len(columns) == 0 or columns[0] == '':
                columns = list(self.pandas_df.columns)
            elif dimension_name not in columns:
                columns.append(dimension_name)
            if 'all' == sort_order:
                temp_df = self.pandas_df.loc[:,columns].to_dict()
            else:
                num_rows = int(num_rows)
                max_rows = max(len(self.pandas_df)-1,0)
                n_rows = min(num_rows, max_rows)
                if 'top' == sort_order:
                    temp_df = self.pandas_df.loc[:,columns].nlargest(n_rows,[dimension_name]).to_dict()
                elif 'bottom' == sort_order:
                    temp_df = self.pandas_df.loc[:,columns].nsmallest(n_rows,[dimension_name]).to_dict()

            return str(self.parse_dict(temp_df))

        except Exception as e:
            return "Exception *** in pandas dimension_filter_order():"+str(e)

    def dimension_filter(self, dimension_name, comparison_operation, value, pre_reset):
        '''
            description:
                cumulative filter dimension_name by comparison_operation and value
            Get parameters:
                dimension_name (string)
                comparison_operation (string)
                value (numeric value, passed as a string)
            Response:
                number_of_rows_left
        '''
        try:
            if pre_reset:
                #implementation of resetThenFilter function
                self.dimension_reset(dimension_name)

            query = dimension_name+comparison_operation+value
            if dimension_name in self.dimensions_filters:
                if len(self.dimensions_filters[dimension_name])>0:
                    self.dimensions_filters[dimension_name] += ' and '+ query
                else:
                    self.dimensions_filters[dimension_name] = query
                self.dimensions_filters_response_format[dimension_name] = [value,value]
            self.pandas_df = self.pandas_df.query(query)
            return str(len(self.pandas_df))

        except Exception as e:
            return "Exception *** in pandas dimension_filter():"+str(e)

    def dimension_filter_range(self, dimension_name, min_value, max_value, pre_reset):
        '''
            description:
                cumulative filter_range dimension_name between range [min_value,max_value]
            Get parameters:
                dimension_name (string)
                min_value (numeric value, passed as a string)
                max_value (numeric value, passed as a string)
            Response:
                number_of_rows_left
        '''
        try:
            if pre_reset:
                #implementation of resetThenFilter function
                self.dimension_reset(dimension_name)

            query = dimension_name+">="+min_value+" and "+dimension_name+"<="+max_value
            if dimension_name in self.dimensions_filters:
                if len(self.dimensions_filters[dimension_name])>0:
                    self.dimensions_filters[dimension_name] += ' and '+ query
                else:
                    self.dimensions_filters[dimension_name] = query
                self.dimensions_filters_response_format[dimension_name] = [min_value,max_value]
            self.pandas_df = self.pandas_df.query(query)
            return str(len(self.pandas_df))

        except Exception as e:
            return "Exception *** in pandas dimension_filter_range():"+str(e)

    def groupby_load(self, dimension_name, groupby_agg, groupby_agg_key):
        '''
            description:
                load groupby operation for dimension as per the given groupby_agg
            input:
                dimension_name <string>:
                groupby_agg <dictionary>:
                groupby_agg_key <string>:
            return:
                status: groupby initialized successfully
        '''
        try:
            key = dimension_name+"_"+groupby_agg_key
            self.group_by_backups[key] = True
            response = 'groupby initialized successfully'
            return response
        except Exception as e:
            return 'Exception *** in pandas groupby_load():'+str(e)

    def groupby_filter_order(self, dimension_name, groupby_agg, groupby_agg_key, sort_order, num_rows, sort_column):
        '''
            description:
                get groupby values by a filter_order(all, top(n), bottom(n)) for a groupby on a dimension
            Get parameters:
                dimension_name (string)
                groupby_agg (JSON stringified object)
                groupby_agg_key <string>:
                sort_order (string): top/bottom/all
                num_rows (integer): OPTIONAL -> if sort_order= top/bottom
                sort_column: column name by which the result should be sorted
            Response:
                all rows/error => "groupby not initialized"
        '''
        try:
            key = dimension_name+"_"+groupby_agg_key
            if key not in self.group_by_backups:
                res = "groupby not initialized"
            else:
                #removing the cumulative filters on the current dimension for the groupby
                temp_df = self.reset_filters(self.back_up_dimension_pandas, omit=dimension_name)
                self.groupby(temp_df,dimension_name,groupby_agg,groupby_agg_key)
                if 'all' == sort_order:
                    temp_df = self.group_by_backups[key].to_dict()
                else:
                    max_rows = max(len(self.group_by_backups[key])-1, 0)
                    n_rows = min(int(num_rows), max_rows)
                    try:
                        if 'top' == sort_order:
                            temp_df = self.group_by_backups[key].nlargest(n_rows,[sort_column]).to_dict()
                        elif 'bottom' == sort_order:
                            temp_df = self.group_by_backups[key].nsmallest(n_rows,[sort_column]).to_dict()
                    except Exception as e:
                        return 'Exception *** '+str(e)
                res = str(self.parse_dict(temp_df))
            return res

        except Exception as e:
            return 'Exception *** in pandas groupby_filter_order():'+str(e)
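
A hedged usage sketch of the class above, exercising only methods it defines; the file name taxi_data and the column fare_amount are made up, and the corresponding taxi_data.arrow is assumed to already sit in the uploads folder the class reads from:

pu = pandas_utils()

# Loads /usr/src/app/node_server/uploads/taxi_data.arrow (hypothetical file).
print(pu.read_data("arrow", "taxi_data"))                          # "data read successfully"
print(pu.get_columns())                                            # "['fare_amount', ...]"

# Register a dimension, apply a range filter, then histogram what is left.
pu.dimension_load("fare_amount")
print(pu.dimension_filter_range("fare_amount", "5", "50", False))  # rows left after the filter
print(pu.dimension_hist("fare_amount", 64))                        # {"X": [...], "Y": [...]}

# Drop all filters again.
print(pu.reset_all_filters())
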
Example #8
def test_iterate_over_decimal_chunk():
    # TODO: to add more test case to cover as much code as possible
    #       e.g. Decimal(19, 0) for Int64, Decimal(9, 0) for Int32, Decimal(4, 0) for Int16, Decimal(2, 0) for Int8

    def get_random_decimal(precision, scale):
        data = []
        for i in range(precision):
            data.append(str(random.randint(1,9)))

        if scale:
            data.insert(-scale, '.')
        return decimal.Decimal("".join(data))

    stream = BytesIO()
    column_meta = [
            { "logicalType" : "FIXED", "precision" : "10", "scale" : "3" },
            { "logicalType" : "FIXED", "precision" : "38", "scale" : "0" }
    ]
    field_foo = pyarrow.field("column_foo", pyarrow.decimal128(10, 3), True, column_meta[0])
    field_bar = pyarrow.field("column_bar", pyarrow.decimal128(38, 0), True, column_meta[1])
    schema = pyarrow.schema([field_foo, field_bar])

    column_size = 2
    batch_row_count = 10
    batch_count = 10
    expected_data = []
    writer = RecordBatchStreamWriter(stream, schema)

    for i in range(batch_count):
        column_arrays = []
        py_arrays = []
        for j in range(column_size):
            column_data = []
            not_none_cnt = 0
            while not_none_cnt == 0:
                column_data = []
                for k in range(batch_row_count):
                    data = None if bool(random.getrandbits(1)) else get_random_decimal(10 if j % 2 == 0 else 38, 3 if j % 2 == 0 else 0)
                    if data is not None:
                        not_none_cnt += 1
                    column_data.append(data)
            column_arrays.append(column_data if j % 2 == 0 else [int(data) if data is not None else None for data in column_data])
            py_arrays.append(pyarrow.array(column_data))

        expected_data.append(column_arrays)
        rb = RecordBatch.from_arrays(py_arrays, ["column_foo", "column_bar"])
        writer.write_batch(rb)

    writer.close()

    # seek stream to beginning so that we can read from stream
    stream.seek(0)
    reader = RecordBatchStreamReader(stream)
    context = ArrowConverterContext()
    it = PyArrowChunkIterator(reader, context)

    count = 0
    while True:
        try:
            val = next(it)
            assert val[0] == expected_data[int(count / 10)][0][count % 10]
            assert type(val[0]) == type(expected_data[int(count / 10)][0][count % 10])  # Decimal type or NoneType
            assert val[1] == expected_data[int(count / 10)][1][count % 10]
            assert type(val[1]) == type(expected_data[int(count / 10)][1][count % 10])  # Int type or NoneType
            count += 1
        except StopIteration:
            assert count == 100
            break
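
As a smaller, connector-independent check of the decimal handling, a sketch that round-trips decimal128 values through the stream format and confirms they come back as Python Decimal objects (the values and the function name are made up):

import decimal
from io import BytesIO

import pyarrow
from pyarrow import RecordBatch, RecordBatchStreamReader, RecordBatchStreamWriter


def roundtrip_decimal128():
    values = [decimal.Decimal("1234567.891"), None, decimal.Decimal("-0.005")]
    batch = RecordBatch.from_arrays(
        [pyarrow.array(values, type=pyarrow.decimal128(10, 3))], ["column_foo"])

    stream = BytesIO()
    with RecordBatchStreamWriter(stream, batch.schema) as writer:
        writer.write_batch(batch)

    stream.seek(0)
    result = RecordBatchStreamReader(stream).read_all().column("column_foo").to_pylist()
    # decimal128 values materialize back as decimal.Decimal; None stays None.
    assert result == values
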