Example 1
def traverse(obj, path=None, callback=None):
    """
    Traverse a deeply nested Python structure
    :param obj: object to traverse
    :param path: list of keys accumulated while descending to the current value
    :param callback: function used to transform a value
    :return: the traversed structure, with the callback applied to each value when one is provided
    """
    if path is None:
        path = []

    if is_(obj, dict):
        value = {k: traverse(v, path + [k], callback) for k, v in obj.items()}

    elif is_(obj, list):
        value = [traverse(elem, path + [[]], callback) for elem in obj]

    elif is_(obj, tuple):
        value = tuple(traverse(elem, path + [[]], callback) for elem in obj)
    elif is_(obj, DenseVector):
        value = DenseVector(
            [traverse(elem, path + [[]], callback) for elem in obj])
    else:
        value = obj

    if callback is None:
        return value
    else:
        # if a callback is provided, call it to get the new value
        return callback(path, value)
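A minimal usage sketch (hypothetical data and callback; it assumes traverse and its is_ helper are importable from the surrounding codebase, and pyspark.ml.linalg.DenseVector for the vector branch). The callback receives the accumulated path and each rebuilt value, leaves included:

# Hypothetical usage: double every numeric leaf of a nested structure
data = {"a": [1, 2, (3, 4)], "b": {"c": 5}}

def double_numbers(path, value):
    # containers pass through unchanged; numeric leaves are doubled
    return value * 2 if isinstance(value, (int, float)) else value

print(traverse(data, callback=double_numbers))
# {'a': [2, 4, (6, 8)], 'b': {'c': 10}}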
Example 2
def normalizer(df, input_cols, p=2.0):
    """
    Transforms a dataset of Vector rows, normalizing each Vector to have unit norm. It takes a parameter p, which
    specifies the p-norm used for normalization (p=2.0 by default).
    :param df: Dataframe to be transformed
    :param input_cols: Columns to be normalized.
    :param p:  p-norm used for normalization.
    :return: Dataframe with normalized columns.
    """

    # The columns argument must be a string or a list
    if not is_(input_cols, [str, list]):
        RaiseIt.type_error(input_cols, [str, list])

    if is_str(input_cols):
        input_cols = [input_cols]

    if is_(input_cols, [float, int]):
        RaiseIt.type_error(input_cols, [float, int])

    df = df.cols.cast(input_cols, "vector")

    # TODO https://developer.ibm.com/code/2018/04/10/improve-performance-ml-pipelines-wide-dataframes-apache-spark-2-3/
    normal = [
        Normalizer(inputCol=col_name,
                   outputCol=name_col(col_name, "normalized"),
                   p=p) for col_name in list(set(input_cols))
    ]

    pipeline = Pipeline(stages=normal)

    df = pipeline.fit(df).transform(df)

    return df
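For reference, a minimal plain-pyspark sketch of the transformer that each pipeline stage above wraps; the column names are illustrative, and the wrapper itself derives the output name with name_col and casts the input column to a vector first:

from pyspark.sql import SparkSession
from pyspark.ml.feature import Normalizer
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

# One vector column, normalized with the L2 norm (p=2.0)
df = spark.createDataFrame([(Vectors.dense([1.0, 2.0, 2.0]),)], ["features"])
norm = Normalizer(inputCol="features", outputCol="features_normalized", p=2.0)
norm.transform(df).show(truncate=False)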
Example 3
    def load(self, df):
        """

        Load the dataframe to the mongo collection
        :param df: dataframe to be send to the enricher
        :return:
        """

        if is_(df, pd.DataFrame):
            self.get_collection(self.collection_name).insert_many(
                df.to_dict("records"))
        elif is_(df, DataFrame):
            df.save.mongo(self.host, self.port, self.db_name,
                          self.collection_name)
        else:
            raise Exception("df must be a Spark dataframe or a pandas dataframe")
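A minimal sketch of what the pandas branch does in isolation, assuming a local MongoDB instance reachable via pymongo; the database and collection names are illustrative:

import pandas as pd
from pymongo import MongoClient

pdf = pd.DataFrame({"name": ["alice", "bob"], "age": [30, 25]})

# Same conversion the pandas branch performs: one document per row
client = MongoClient("localhost", 27017)
client["example_db"]["example_collection"].insert_many(pdf.to_dict("records"))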
Example 4
    def data_frame(cols=None, rows=None, infer_schema=True, pdf=None):
        """
        Helper to create a Spark dataframe
        :param cols: list of tuples with name, data type and a flag to accept nulls
        :param rows: list of tuples with the same number and types of elements as cols
        :param infer_schema: Try to infer the schema data type.
        :param pdf: a pandas dataframe
        :return: Dataframe
        """
        if is_(pdf, pd.DataFrame):
            df = Spark.instance.spark.createDataFrame(pdf)
        else:

            specs = []
            # Process the rows
            if not is_list_of_tuples(rows):
                rows = [(i, ) for i in rows]

            # Process the columns
            for c, r in zip(cols, rows[0]):
                # Get the column name

                if is_one_element(c):
                    col_name = c

                    if infer_schema is True:
                        var_type = infer(r)
                    else:
                        var_type = StringType()
                    nullable = True

                elif is_tuple(c):

                    # Get the column name and data type
                    col_name = c[0]
                    var_type = parse_spark_class_dtypes(c[1])

                    count = len(c)
                    if count == 2:
                        nullable = True
                    elif count == 3:
                        nullable = c[2]

                # If the tuple has no third element, nullable is set to True so the column accepts nulls
                specs.append([col_name, var_type, nullable])

            struct_fields = list(map(lambda x: StructField(*x), specs))

            df = Spark.instance.spark.createDataFrame(
                rows, StructType(struct_fields))
            df = df.columns_meta(df.cols.names())
        return df
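A plain pyspark sketch of the schema this helper assembles from a (name, type, nullable) spec; the column names, types, and rows are illustrative:

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

spark = SparkSession.builder.getOrCreate()

# Roughly what cols=[("name", "str"), ("age", "int", True)] expands to
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df = spark.createDataFrame([("alice", 30), ("bob", 25)], schema)
df.printSchema()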