Example 1
    def decision_tree(df, columns, input_col, **kargs):
        """
        Runs a decision tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :param kargs: Keyword arguments passed to Spark's DecisionTreeClassifier.
        :return: Tuple of (DataFrame with the prediction run, fitted decision tree model).
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        assert isinstance(input_col, str), "Error, input column must be a string"

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = DecisionTreeClassifier(**kargs)

        df = df.cols.rename([(input_col + "_index", "label")])

        dt_model = model.fit(df)
        df_model = dt_model.transform(df)
        return df_model, dt_model
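A minimal usage sketch (hedged: the DataFrame and the column names below are illustrative, and extra keyword arguments such as maxDepth are forwarded to Spark's DecisionTreeClassifier):

    # Hypothetical call: predict "species" from the other selected columns.
    # Assumes df is a Spark DataFrame extended with Optimus .cols/.rows accessors.
    df_pred, dt_model = decision_tree(df,
                                      columns=["sepal_length", "sepal_width",
                                               "petal_length", "petal_width", "species"],
                                      input_col="species",
                                      maxDepth=5)
    df_pred.select("features", "label", "prediction").show(5)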
Example 2
    def _iqr(self, action):
        """
        Select or drop outliers
        :param action:
        :return:
        """
        df = self.df
        columns = self.columns

        if not is_dataframe(self.df):
            raise TypeError("Spark Dataframe expected")

        columns = parse_columns(self.df, columns)

        for col_name in columns:
            iqr = df.cols.iqr(col_name, more=True)
            lower_bound = iqr["q1"] - (iqr["iqr"] * 1.5)
            upper_bound = iqr["q3"] + (iqr["iqr"] * 1.5)

            if action is "drop":
                df = df.rows.drop((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))
            elif action is "select":
                df = df.rows.select((F.col(col_name) > upper_bound) | (F.col(col_name) < lower_bound))

        return df
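A hedged usage sketch; only the private method is shown above, so the OutlierDetector wrapper name and its constructor are assumptions:

    # Hypothetical wrapper holding self.df and self.columns.
    detector = OutlierDetector(df, columns="age")
    outliers_df = detector._iqr(action="select")  # keep only rows outside the IQR bounds
    clean_df = detector._iqr(action="drop")       # remove those rows instead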
Example 3
    def z_score(df, columns, threshold=None):
        """
        Delete outlier using z score
        :param df:
        :param columns:
        :param threshold:
        :return:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)

        for c in columns:
            # the z-score column name is always the prefix "z_col_" plus the column name
            z_col = "z_col_" + c

            df = df.cols.z_score(c) \
                .rows.drop(F.col(z_col) > threshold) \
                .cols.drop(z_col)

        return df
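A quick usage sketch, assuming df has a numeric "height" column:

    # Drop rows whose z-score on "height" exceeds 3.
    clean_df = z_score(df, columns="height", threshold=3)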
Example 4
    def _z_score(self, action):
        """
        Get outlier using z score

        :return:
        """
        df = self.df
        columns = self.columns
        threshold = self.threshold

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_numeric(threshold):
            raise TypeError("Numeric expected")

        columns = parse_columns(df, columns)

        for col_name in columns:
            # the z-score column name is always the prefix "z_col_" plus the column name
            z_col_name = _z_score_col_name(col_name)

            if action is "drop":
                df = df.cols.z_score(col_name,z_col_name) \
                    .rows.drop(F.col(z_col_name) > threshold) \
                    .cols.drop(z_col_name)

            elif action is "select":
                df = df.cols.z_score(col_name) \
                    .rows.select(F.col(z_col_name) > threshold) \
                    .cols.drop(z_col_name)

        return df
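The snippet relies on a _z_score_col_name helper that is not shown. A minimal sketch, assuming it simply prefixes the column name as the comment (and Example 3) describe:

    def _z_score_col_name(col_name):
        # Assumed implementation: the z-score column is the fixed prefix plus the column name.
        return "z_col_" + col_name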
Example 5
    def _mad(self, action):
        """

               :type action:
               :return:
               """

        df = self.df
        columns = self.columns
        threshold = self.threshold

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)
        for c in columns:
            mad_value = df.cols.mad(c, more=True)
            lower_bound = mad_value["median"] - threshold * mad_value["mad"]
            upper_bound = mad_value["median"] + threshold * mad_value["mad"]

            if action is "select":
                df = df.rows.select((F.col(c) > upper_bound)
                                    | (F.col(c) < lower_bound))
            elif action is "drop":
                df = df.rows.drop((F.col(c) > upper_bound)
                                  | (F.col(c) < lower_bound))
        return df
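A small worked example of the bounds, using illustrative numbers in place of the dict returned by df.cols.mad(c, more=True):

    mad_value = {"median": 50, "mad": 4}  # illustrative values
    threshold = 3
    lower_bound = mad_value["median"] - threshold * mad_value["mad"]  # 38
    upper_bound = mad_value["median"] + threshold * mad_value["mad"]  # 62
    # "drop" keeps rows inside [38, 62]; "select" keeps only the rows outside it.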
Example 6
    def gbt(df, columns, input_col, **kargs):
        """
        Runs a gradient boosting tree classifier for input DataFrame.
        :param df: Pyspark dataframe to analyze.
        :param columns: List of columns to select for prediction.
        :param input_col: Column to predict.
        :param kargs: Keyword arguments passed to Spark's GBTClassifier.
        :return: Tuple of (DataFrame with the prediction run, fitted gradient boosting model).
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        columns = parse_columns(df, columns)

        if not is_str(input_col):
            raise TypeError("Error, input column must be a string")

        data = df.select(columns)
        feats = data.columns
        feats.remove(input_col)

        df = string_to_index(df, input_cols=input_col)
        df = vector_assembler(df, input_cols=feats)

        model = GBTClassifier(**kargs)

        df = df.cols.rename([(input_col + "_index", "label")])

        gbt_model = model.fit(df)
        df_model = gbt_model.transform(df)
        return df_model, gbt_model
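Usage mirrors decision_tree; a sketch with assumed column names, forwarding maxIter to Spark's GBTClassifier (which supports binary labels only):

    # Hypothetical call: predict a binary "churned" column.
    df_pred, gbt_model = gbt(df,
                             columns=["age", "tenure", "churned"],
                             input_col="churned",
                             maxIter=20)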
Example 7
    def n_gram(df, input_col, n=2):
        """
        Converts an input column of strings inside a Spark DataFrame into an array of n-grams.
        :param df: Pyspark dataframe to analyze.
        :param input_col: Column to analyze.
        :param n: number of elements per n-gram (>= 1).
        :return: Tuple of (DataFrame with TF-IDF features over the n-grams, fitted pipeline model).
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        tokenizer = feature.Tokenizer().setInputCol(
            input_col) | feature.StopWordsRemover()
        count = feature.CountVectorizer()
        gram = feature.NGram(n=n) | feature.CountVectorizer()
        tf = tokenizer | (count, gram) | feature.VectorAssembler()
        tfidf = tf | feature.IDF().setOutputCol('features')

        tfidf_model = tfidf.fit(df)
        df_model = tfidf_model.transform(df)
        return df_model, tfidf_model
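A usage sketch, assuming df has a free-text "comment" column:

    # Hypothetical: build bigram TF-IDF features from the "comment" column.
    df_feats, ngram_model = n_gram(df, input_col="comment", n=2)
    df_feats.select("features").show(3, truncate=False)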
Example 8
    def logistic_regression_text(df, input_col):
        """
        Runs a logistic regression for input (text) DataFrame.
        :param df: Pyspark dataframe to analyze
        :param input_col: Column to predict
        :return: Tuple of (DataFrame with the prediction run, fitted logistic regression model).
        """

        if not is_dataframe(df):
            raise TypeError("Spark dataframe expected")

        pl = feature.Tokenizer().setInputCol(input_col) | feature.CountVectorizer()
        ml = pl | classification.LogisticRegression()
        ml_model = ml.fit(df)
        df_model = ml_model.transform(df)
        return df_model, ml_model
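A sketch with assumed columns; note the pipeline feeds Spark's LogisticRegression, which by default expects a numeric label column named "label":

    # Hypothetical: df has a "review" text column and a numeric "label" column.
    df_pred, lr_model = logistic_regression_text(df, input_col="review")
    df_pred.select("review", "label", "prediction").show(5)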
Example 9
    def iqr(df, columns):
        """
        Delete outliers using inter quartile range
        :param df:
        :param columns:
        :return:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        columns = parse_columns(df, columns)

        for column in columns:
            iqr = df.cols.iqr(column, more=True)
            lower_bound = iqr["q1"] - (iqr["iqr"] * 1.5)
            upper_bound = iqr["q3"] + (iqr["iqr"] * 1.5)

            df = df.rows.drop((F.col(column) > upper_bound)
                              | (F.col(column) < lower_bound))

        return df
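For reference, a sketch of the same filter in plain PySpark, using approxQuantile in place of the Optimus df.cols.iqr accessor (the "age" column is an assumption):

    from pyspark.sql import functions as F

    q1, q3 = df.approxQuantile("age", [0.25, 0.75], 0.0)
    iqr_value = q3 - q1
    # Keep only rows inside the 1.5 * IQR fences.
    df = df.filter((F.col("age") >= q1 - 1.5 * iqr_value) &
                   (F.col("age") <= q3 + 1.5 * iqr_value))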
Example 10
    def _modified_z_score(self, action):
        """

        :param action:
        :return:
        """

        df = self.df
        columns = self.columns
        threshold = self.threshold

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_numeric(threshold):
            raise TypeError("Numeric expected")

        columns = parse_columns(df, columns)

        for col_name in columns:
            median = df.cols.median(col_name)
            median_absolute_deviation = df.select(
                F.abs(F.col(col_name) -
                      median).alias(col_name)).cols.median(col_name)

            m_z_col_name = _m_z_score_col_name(col_name)

            df = df.withColumn(
                m_z_col_name,
                F.abs(0.6745 * (F.col(col_name) - median) /
                      median_absolute_deviation))
            if action is "select":
                df = df.rows.select(F.col(m_z_col_name) > threshold)
            elif action is "drop":
                df = df.rows.drop(F.col(m_z_col_name) > threshold)
        return df
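The 0.6745 constant is the usual scaling for the modified z-score: it is approximately the 0.75 quantile of the standard normal distribution, which makes the MAD comparable to a standard deviation for normal data. A NumPy sketch of the same computation on toy values:

    import numpy as np

    x = np.array([1.0, 2.0, 2.0, 3.0, 50.0])    # 50.0 is the outlier
    median = np.median(x)                       # 2.0
    mad = np.median(np.abs(x - median))         # 1.0
    m_z = np.abs(0.6745 * (x - median) / mad)   # modified z-scores
    print(m_z > 3.5)                            # flags only the 50.0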
Example 11
    def mad(df, columns, threshold=None):
        """
        Delete outlier using mad
        :param df:
        :param columns:
        :param threshold:
        :return:
        """

        if not is_dataframe(df):
            raise TypeError("Spark Dataframe expected")

        if not is_int(threshold):
            raise TypeError("Integer expected")

        columns = parse_columns(df, columns)
        for c in columns:
            mad_value = df.cols.mad(c, more=True)
            lower_bound = mad_value["median"] - threshold * mad_value["mad"]
            upper_bound = mad_value["median"] + threshold * mad_value["mad"]

            df = df.rows.drop((F.col(c) > upper_bound)
                              | (F.col(c) < lower_bound))
        return df
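A quick usage sketch with an assumed column and threshold:

    # Hypothetical: drop rows of "salary" more than 3 MADs from the median.
    clean_df = mad(df, columns="salary", threshold=3)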
Example 12
    def run(self,
            df,
            func_request=None,
            func_response=None,
            return_type="json",
            calls=60,
            period=60,
            max_tries=8):
        """
        Read a the url key from a mongo collection an make a request to a service
        :param df: Dataframe to me loaded to the enricher collection.
        :param func_request: help to create a custom request
        :param func_response: help to create a custom response
        :param calls: how many call can you make by period of time
        :param period: in which period ot time can the call be made in seconds
        :param max_tries: how many retries should we do
        :param return_type:
        :return:
        """

        if is_dataframe(df):
            df = df.create_id(COL_ID)

        # Load the dataframe data in the enricher
        self.load(df)

        collection_name = self.collection_name
        collection = self.get_collection(collection_name)

        # Get data that is not yet enriched
        cursor = collection.find({COL_RESULTS: {"$exists": False}})

        total_docs = cursor.count(True)

        if func_request is None:
            func_request = requests.get

        @on_exception(expo, RateLimitException, max_tries=max_tries)
        @limits(calls=calls, period=period)
        def _func_request(v):
            return func_request(v)

        if total_docs > 0:
            for c in tqdm_notebook(cursor,
                                   total=total_docs,
                                   desc='Processing...'):

                # Send request to the API
                response = _func_request(c)

                mongo_id = c["_id"]

                if response.status_code == 200:
                    if return_type == "json":
                        response = json.loads(response.text)
                    elif return_type == "text":
                        response = response.text

                    # Process the result with an external function
                    if is_function(func_response):
                        response = func_response(response)

                    # Update the mongo id with the result
                    collection.find_and_modify(
                        query={"_id": mongo_id},
                        update={"$set": {
                            COL_RESULTS: response
                        }},
                        upsert=False,
                        full_response=True)
                else:
                    # The results key stays absent so the document can be retried in a future request
                    logger.print(response.status_code)

            # Append the data in enrichment to the dataframe

            logger.print("Appending collection info into the dataframe")
            # TODO: a more elegant way to handle pickling?
            # Capture plain values so the UDF closure can be pickled without `self`.
            host = self.host
            port = self.port
            db_name = self.db_name

            @pandas_udf('string', PandasUDFType.SCALAR)
            def func(value):
                # Create the Mongo client inside the UDF so it is not pickled with the closure
                from pymongo import MongoClient
                _client = MongoClient(host, port)
                _db = _client[db_name]
                _collection = _db[collection_name]

                def func_serie(serie):
                    _cursor = _collection.find_one({COL_ID: serie},
                                                   projection={
                                                       "_id": 0,
                                                       COL_RESULTS: 1
                                                   })
                    return _cursor[COL_RESULTS]

                return value.apply(func_serie)

            df = df.withColumn(COL_RESULTS,
                               func(df[COL_ID])).cols.drop(COL_ID).run()

            # If the process is finished, flush the Mongo collection
            self.flush()
            return df
        else:
            print("No records available to process")