Code Example #1
File: feature.py Project: niteshnicholas/Optimus
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list:
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    # The p argument must be numeric:
    if not is_(p, [float, int]):
        RaiseIt.type_error(p, [float, int])

    df = df.cols.cast(input_cols, "vector")

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
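A rough usage sketch, not part of the original snippet: it assumes a live SparkSession and that normalizer() above, together with the Optimus helpers it calls (is_, RaiseIt, df.cols.cast), is importable.

from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

# Two example rows, each holding a DenseVector to be normalized
df = spark.createDataFrame(
    [(Vectors.dense([1.0, 2.0, 2.0]),), (Vectors.dense([0.0, 3.0, 4.0]),)],
    ["features"],
)

# With p=2.0 each vector is rescaled to unit Euclidean norm; the result is written to a
# new "features_normalized" column by the underlying pyspark.ml Normalizer stage
result = normalizer(df, "features", p=2.0)
result.show(truncate=False)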
Code Example #2
File: columns.py Project: marcelomata/Optimus
    def count_na(columns):
        """
        Return the NaN and null count for a column
        :param columns: '*', a list of column names or a single column name.
        :return: Count of NaN/null values per column.
        """

        columns = parse_columns(self, columns)
        df = self
        expr = []

        for col_name in columns:
            # If the column type is Struct or Boolean, cast it to string; isnan/isNull cannot handle Struct/Boolean columns
            if is_(df.cols.schema_dtype(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")

            if is_(df.cols.schema_dtype(col_name), (float, int)):
                expr.append(F.count(F.when(F.isnan(col_name) | F.col(col_name).isNull(), col_name)).alias(col_name))

            elif is_(df.cols.schema_dtype(col_name), (NullType)):
                expr.append(F.count(col_name).alias(col_name))

            else:
                expr.append(F.count(F.when(F.col(col_name).isNull(), col_name)).alias(col_name))

        result = format_dict(df.select(*expr).to_json())
        return result
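A minimal usage sketch, assuming Optimus has attached the .cols accessor (with count_na) to Spark DataFrames, as the snippet implies.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("Alice", 30.0), ("Bob", None), (None, float("nan"))],
    ["name", "score"],
)

# "*" expands to every column; NaN and null values are both counted
print(df.cols.count_na("*"))  # e.g. {'name': 1, 'score': 2}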
Code Example #3
File: functions.py Project: schatzr/Optimus
def traverse(obj, path=None, callback=None):
    """
    Traverse a deep nested python structure
    :param obj: object to traverse
    :param path: Path (list of keys) accumulated during the recursion
    :param callback: Function used to transform a value
    :return: The transformed structure
    """
    if path is None:
        path = []

    if is_(obj, dict):
        value = {k: traverse(v, path + [k], callback) for k, v in obj.items()}

    elif is_(obj, list):
        value = [traverse(elem, path + [[]], callback) for elem in obj]

    elif is_(obj, tuple):
        value = tuple(traverse(elem, path + [[]], callback) for elem in obj)
    elif is_(obj, DenseVector):
        value = DenseVector(
            [traverse(elem, path + [[]], callback) for elem in obj])
    else:
        value = obj

    if callback is None:
        return value
    else:
        # If a callback is provided, call it to get the new value
        return callback(path, value)
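A small usage sketch, assuming traverse() above and the is_()/DenseVector helpers it uses are importable. The callback receives every container and leaf, so it should pass through values it does not want to change.

nested = {"a": [1, 2], "b": {"c": 3.0}}

def double_numbers(path, value):
    # Double numeric leaves; containers and other values pass through unchanged
    return value * 2 if isinstance(value, (int, float)) else value

print(traverse(nested, callback=double_numbers))
# {'a': [2, 4], 'b': {'c': 6.0}}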
Code Example #4
File: enricher.py Project: schatzr/Optimus
    def send(self, df):
        """
        Send the dataframe to the mongo collection
        :param df: Dataframe to be sent to the enricher
        :return:
        """

        if is_(df, pd.DataFrame):
            self.get_collection(self.collection_name).insert_many(df.to_dict("records"))
        elif is_(df, DataFrame):
            df.save.mongo(self.host, self.port, self.db_name, self.collection_name)
        else:
            raise Exception("df must be a Spark DataFrame or a pandas DataFrame")
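A hypothetical usage sketch; "enricher" stands for an already configured Enricher instance wired to a reachable MongoDB.

import pandas as pd

pdf = pd.DataFrame({"url": ["http://example.com/a", "http://example.com/b"]})

# A pandas DataFrame is written with insert_many(); a Spark DataFrame would instead go
# through df.save.mongo(...)
enricher.send(pdf)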
Code Example #5
File: columns.py Project: marcelomata/Optimus
    def append(col_name=None, value=None):
        """
        Append a column to a Dataframe
        :param col_name: Name of the new column
        :param value: A scalar, list, tuple or Column with the data for the new column
        :return: Spark DataFrame with the new column
        """

        def lit_array(_value):
            temp = []
            for v in _value:
                temp.append(F.lit(v))
            return F.array(temp)

        df = self

        if is_num_or_str(value):
            value = F.lit(value)
        elif is_list(value):
            value = lit_array(value)
        elif is_tuple(value):
            value = lit_array(list(value))

        if is_(value, F.Column):
            df = df.withColumn(col_name, value)

        return df
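A minimal usage sketch, assuming the Optimus .cols accessor exposes append() as shown above.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Alice",), ("Bob",)], ["name"])

# A scalar becomes a literal column repeated on every row
df.cols.append("score", 1).show()

# A list (or tuple) becomes an array column holding the same literal array on every row
df.cols.append("tags", ["a", "b"]).show()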
Code Example #6
File: columns.py Project: marcelomata/Optimus
    def nest(input_cols, output_col, shape="string", separator=""):
        """
        Concatenate multiple columns into one with the specified format
        :param input_cols: columns to be nested
        :param output_col: final column with the nested content
        :param separator: character used as the separator when concatenating
        :param shape: final data type, 'array', 'string' or 'vector'
        :return: Spark DataFrame
        """

        df = self

        if has_(input_cols, F.Column):
            # Transform non-Column values into literal columns
            columns = [F.lit(col) if not is_(col, F.Column) else col for col in input_cols]
        else:
            columns = parse_columns(self, input_cols)

        if shape is "vector":
            columns = parse_columns(self, input_cols, filter_by_column_dtypes=PYSPARK_NUMERIC_TYPES)

            vector_assembler = VectorAssembler(
                inputCols=columns,
                outputCol=output_col)
            df = vector_assembler.transform(df)

        elif shape is "array":
            df = apply_expr(output_col, F.array(*columns))

        elif shape is "string":
            df = apply_expr(output_col, F.concat_ws(separator, *columns))
        else:
            RaiseIt.value_error(shape, ["vector", "array", "string"])

        return df
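A minimal usage sketch, assuming the Optimus .cols accessor exposes nest() as shown above.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Bruce", "Wayne"), ("Clark", "Kent")], ["first", "last"])

# shape="string" concatenates the input columns with the separator into one output column
df.cols.nest(["first", "last"], output_col="full_name", shape="string", separator=" ").show()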
Code Example #7
File: columns.py Project: marcelomata/Optimus
    def apply_expr(columns, func=None, args=None, filter_col_by_dtypes=None, verbose=True):
        """
        Apply an expression to a column.
        :param columns: Columns in which the function is going to be applied
        :param func: function to be applied
        :type func: A plain expression or a function
        :param args: Argument passed to the function
        :param filter_col_by_dtypes: Only apply to columns of a specific data type: integer, float, string or bool
        :param verbose: Print additional information about the transformation
        :return: Dataframe
        """

        # Handle whether the func param is a plain expression or a function returning an expression
        def func_col_exp(col_name, attr):
            return func

        if is_(func, F.Column):
            _func = func_col_exp
        else:
            _func = func

        columns = parse_columns(self, columns, filter_by_column_dtypes=filter_col_by_dtypes, accepts_missing_cols=True)

        df = self
        for col_name in columns:
            df = df.withColumn(col_name, audf(col_name, _func, attrs=args, func_type="column_exp", verbose=verbose))
        return df
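A minimal usage sketch, assuming the Optimus .cols accessor exposes apply_expr() as shown above.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([(1,), (2,), (3,)], ["number"])

# A plain Column expression is applied to (and overwrites) the named column
df.cols.apply_expr("number", F.col("number") * 10).show()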
Code Example #8
File: columns.py Project: xuliangleon/Optimus
    def count_na(columns):
        """
        Return the NaN and null count for a column
        :param columns: '*', a list of column names or a single column name.
        :return: Count of NaN/null values per column.
        """

        columns = parse_columns(self, columns)

        df = self
        expr = []
        for col_name in columns:
            # If the column type is Struct or Boolean, cast it to string; isnan/isNull cannot handle Struct columns

            if is_(df.cols.schema_dtypes(col_name), (StructType, BooleanType)):
                df = df.cols.cast(col_name, "string")
            expr.append(
                F.count(
                    F.when(
                        F.isnan(col_name) | F.col(col_name).isNull(),
                        col_name)).alias(col_name))

        result = format_dict(collect_as_dict(df.select(*expr).collect()))

        return result
Code Example #9
    def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
        """
        Helper to create a Spark dataframe:
        :param cols: List of tuples with (name, data type, nullable flag) for each column
        :param rows: List of tuples with the same number and types of fields as cols
        :param infer_schema: Try to infer the schema data type.
        :param pdf: a pandas dataframe
        :return: Dataframe
        """
        if is_(pdf, pd.DataFrame):
            result = Spark.instance.spark.createDataFrame(pdf)
        else:

            specs = []
            # Process the rows
            if not is_list_of_tuples(rows):
                rows = [(i, ) for i in rows]

            # Process the columns
            for c, r in zip(cols, rows[0]):
                # Get columns name

                if is_one_element(c):
                    col_name = c

                    if infer_schema is True:
                        var_type = infer(r)
                    else:
                        var_type = StringType()
                    nullable = True

                elif is_tuple(c):

                    # Get columns data type
                    col_name = c[0]
                    var_type = get_spark_dtypes_object(c[1])

                    count = len(c)
                    if count == 2:
                        nullable = True
                    elif count == 3:
                        nullable = c[2]

                # If the tuple has no third element, nullable defaults to True so the column accepts nulls
                specs.append([col_name, var_type, nullable])

            struct_fields = list(map(lambda x: StructField(*x), specs))

            result = Spark.instance.spark.createDataFrame(
                rows, StructType(struct_fields))

        return result
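A hypothetical usage sketch: it assumes data_frame() above is importable, that the project's Spark.instance singleton already holds a live SparkSession, and that the 'str'/'int' type names are accepted by get_spark_dtypes_object (an assumption).

df = data_frame(
    cols=[("name", "str"), ("age", "int", True)],  # (name, dtype[, nullable])
    rows=[("Alice", 30), ("Bob", 25)],
)
df.printSchema()
df.show()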
Code Example #10
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes parameter p, which
    specifies the p-norm used for normalization. (p=2) by default.
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The input_cols argument must be a string or a list:

    assert isinstance(input_cols, (str, list)), \
        "Error: %s argument must be a string or a list." % "input_cols"

    if isinstance(input_cols, str):
        input_cols = [input_cols]

    assert isinstance(
        p, (float, int)), "Error: p argument must be a numeric value."

    # Convert ArrayType() column to DenseVector
    def arr_to_vec(arr_column):
        """
        :param arr_column: Column name
        :return: Returns DenseVector by converting an ArrayType() column
        """
        return DenseVector(arr_column)

    # User-Defined function
    # TODO: use apply() to use Pyarrow
    udf_arr_to_vec = F.udf(arr_to_vec, VectorUDT())

    # Check for columns which are not DenseVector types and convert them into DenseVector
    for col in input_cols:
        if not is_(df[col], DenseVector):
            df = df.withColumn(col, udf_arr_to_vec(df[col]))

    normal = [
        Normalizer(inputCol=column, outputCol=column + "_normalized", p=p)
        for column in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
Code Example #11
File: columns.py Project: marcelomata/Optimus
    def unnest(columns, mark=None, splits=None, index=None):
        """
        Split an array or string in different columns
        :param columns: Columns to be un-nested
        :param mark: Separator character used when the column is a string
        :param splits: Number of splits to un-nest. If not given, it is inferred from the first row, since the number of splits cannot be known beforehand
        :param index: If given, extract only that split position (for string columns)
        :return: Spark DataFrame
        """

        # If the number of splits was not defined, try to infer it from the first element
        infer_splits = None
        if splits is None:
            infer_splits = True

        columns = parse_columns(self, columns)

        df = self

        for col_name in columns:
            # if the col is array

            col_dtype = self.schema[col_name].dataType

            # Array
            if is_(col_dtype, ArrayType):

                expr = F.col(col_name)
                # Try to infer the array length using the first row
                if infer_splits is True:
                    splits = len(self.cols.cell(col_name))

                for i in builtins.range(splits):
                    df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

            # String
            elif is_(col_dtype, StringType):
                expr = F.split(F.col(col_name), mark)
                # Try to infer the array length using the first row
                if infer_splits is True:
                    splits = len(self.cols.cell(col_name).split(mark))

                if is_int(index):
                    r = builtins.range(index, index + 1)
                else:
                    r = builtins.range(0, splits)

                for i in r:
                    df = df.withColumn(col_name + "_" + str(i), expr.getItem(i))

            # Vector
            elif is_(col_dtype, VectorUDT):

                def _unnest(row):
                    _dict = row.asDict()

                    # Get the column we want to unnest
                    _list = _dict[col_name]

                    # Ensure the values are Python floats and not numpy floats
                    if index is None:
                        _list = [float(x) for x in _list]
                    else:
                        _list = [float(_list[index])]

                    return row + tuple(_list)

                df = df.rdd.map(_unnest).toDF(df.columns)

        return df
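A minimal usage sketch, assuming the Optimus .cols accessor exposes unnest() as shown above.

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("Bruce Wayne",), ("Clark Kent",)], ["name"])

# A string column is split on the mark and fanned out into name_0, name_1, ...
df.cols.unnest("name", mark=" ").show()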
Code Example #12
    def run(self,
            df,
            collection_name=None,
            func_request=None,
            func_response=None,
            return_type="json",
            calls=60,
            period=60,
            max_tries=8):
        """
        Read the url key from a mongo collection and make a request to a service
        :param df: Dataframe to be loaded into the enricher collection.
        :param collection_name: Custom collection to save the data.
        :param func_request: Function used to create a custom request
        :param func_response: Function used to create a custom response
        :param return_type: 'json' or 'text'
        :param calls: How many calls can be made per period
        :param period: The period of time over which the calls are limited
        :param max_tries: Maximum number of retries when the rate limit is hit
        :return:
        """

        # Add an id column so each row can be matched with its Mongo document
        if is_(df, DataFrame):
            df = df.create_id(COL_ID)

        # Load the dataframe data in the enricher
        self.send(df)

        if collection_name is None:
            collection_name = self.collection_name
        collection = self.get_collection(collection_name)

        # Get data that is not yet enriched
        cursor = collection.find({COL_RESULTS: {"$exists": False}})

        total_docs = cursor.count(True)

        if func_request is None:
            func_request = requests.get
        collection = self.get_collection(collection_name)

        @on_exception(expo, RateLimitException, max_tries=max_tries)
        @limits(calls=calls, period=period)
        def _func_request(v):
            return func_request(v)

        if total_docs > 0:
            for c in tqdm_notebook(cursor,
                                   total=total_docs,
                                   desc='Processing...'):

                # Send request to the API
                response = _func_request(c)

                mongo_id = c["_id"]

                if response.status_code == 200:
                    if return_type == "json":
                        response = json.loads(response.text)
                    elif return_type == "text":
                        response = response.text

                    # Process the result with an external function
                    if is_function(func_response):
                        response = func_response(response)

                    # Update the mongo id with the result
                    collection.find_and_modify(
                        query={"_id": mongo_id},
                        update={"$set": {
                            COL_RESULTS: response
                        }},
                        upsert=False,
                        full_response=True)
                else:
                    # The results key stays empty so the document can be filtered and retried in a future request
                    logging.info(response.status_code)

            # Append the data in enrichment to the dataframe

            logging.info("Appending collection info into the dataframe")
            # TODO: a more elegant way to handle pickling?
            # Copy plain values out of self so the pandas UDF below can be pickled
            host = self.host
            port = self.port
            db_name = self.db_name

            @pandas_udf('string', PandasUDFType.SCALAR)
            def func(value):
                # Import inside the UDF so the Mongo client is created on the worker instead of being pickled
                from pymongo import MongoClient
                _client = MongoClient(host, port)
                _db = _client[db_name]
                _collection = _db[collection_name]

                def func_serie(serie):
                    _cursor = _collection.find_one({COL_ID: serie},
                                                   projection={
                                                       "_id": 0,
                                                       COL_RESULTS: 1
                                                   })
                    return _cursor[COL_RESULTS]

                return value.apply(func_serie)

            df = df.withColumn(COL_RESULTS,
                               func(df[COL_ID])).cols.drop(COL_ID).run()

            # If the process is finished, flush the Mongo collection
            self.flush()
            return df
        else:
            print("No records available to process")
Code Example #13
    def unnest(columns, mark=None, n=None, index=None):
        """
        Split array or string in different columns
        :param columns: Columns to be un-nested
        :param mark: Separator character used when the column is a string
        :param n: Number of splits to un-nest
        :param index: If given, extract only that split position (for string columns)
        :return: Spark DataFrame
        """

        # If the number of splits was not defined, try to infer the length from the first element
        infer_n = None
        if n is None:
            infer_n = True

        columns = parse_columns(self, columns)

        df = self

        for col_name in columns:
            # if the col is array
            expr = None

            col_dtype = self.schema[col_name].dataType

            # Array
            if is_(col_dtype, ArrayType):

                expr = F.col(col_name)
                # Try to infer the array length using the first row
                if infer_n is True:
                    n = len(self.cols.cell(col_name))

                for i in builtins.range(n):
                    df = df.withColumn(col_name + "_" + str(i),
                                       expr.getItem(i))

            # String
            elif is_(col_dtype, StringType):
                expr = F.split(F.col(col_name), mark)
                # Try to infer the array length using the first row
                if infer_n is True:
                    n = len(self.cols.cell(col_name).split(mark))

                if is_int(index):
                    r = builtins.range(index, index + 1)
                else:
                    r = builtins.range(0, n)

                for i in r:
                    df = df.withColumn(col_name + "_" + str(i),
                                       expr.getItem(i))

            # Vector
            elif is_(col_dtype, VectorUDT):

                def extract(row):
                    return row + tuple(row.vector.toArray().tolist())

                df = df.rdd.map(extract).toDF(df.columns)

        return df