Example #1
0
    class Pipeline(BaseEstimator, TransformerMixin):
        """Two-stage transformer: feature generation followed by preprocessing.

        ``fit_transform`` lazily constructs a ``FeatureGenerator`` and a
        ``Preprocessor`` from the configuration given at construction time
        and fits both on the supplied frame.  ``transform`` reuses the
        fitted stages and raises ``NotFittedError`` when they do not exist
        yet.  ``fit`` is deliberately a no-op, so ``fit_transform`` must be
        called before ``transform``.
        """

        def __init__(self,
                     numeric,
                     id=None,
                     target=None,
                     categorical=None,
                     verbose=0):
            # Column configuration (``id`` shadows the builtin but is kept
            # for caller compatibility).
            self.numeric = numeric
            self.categorical = categorical
            self.id = id
            self.target = target
            self.verbose = verbose
            # Fitted state — populated by fit_transform.
            self.created_features = None
            self.feature_generator = None
            self.preprocessor = None

        def fit_transform(self, df, y=None, **fit_params):
            """Fit both stages on ``df`` and return the transformed output."""
            with Timer('pipelines.Pipeline.fit_transform:', self.verbose):
                # Both stages share the same column configuration.
                stage_config = dict(
                    id=self.id,
                    numeric=self.numeric,
                    categorical=self.categorical,
                    target=self.target,
                    verbose=self.verbose,
                )
                self.feature_generator = FeatureGenerator(**stage_config)
                features = self.feature_generator.fit_transform(df)
                self.preprocessor = Preprocessor(**stage_config)
                return self.preprocessor.fit_transform(features)

        def transform(self, df):
            """Apply the previously fitted stages to ``df``.

            Raises NotFittedError when fit_transform has not been called.
            """
            with Timer('pipelines.Pipeline.transform:', self.verbose):
                if self.feature_generator is None:
                    raise NotFittedError(
                        f'feature_generator = {self.feature_generator}')
                if self.preprocessor is None:
                    raise NotFittedError(f'preprocessor = {self.preprocessor}')
                return self.preprocessor.transform(
                    self.feature_generator.transform(df))

        def fit(self, x, y=None, **fit_params):
            """No-op; fitting happens in fit_transform.  Returns self."""
            return self

        def get_feature_names(self):
            """Return the feature names recorded during fitting (or None)."""
            return self.created_features
    ## Flatten list-valued columns: each list element becomes its own row.
    ## The first reset_index discards the duplicated index produced by
    ## explode; the second keeps the fresh positional index as an "index"
    ## column (presumably used later to map predictions back to rows —
    ## confirm against the downstream code).
    predict_df = predict_df.explode(data_columns)
    predict_df = predict_df.reset_index(drop=True)
    predict_df = predict_df.reset_index(drop=False)

    ## do the preprocessing
    ## Clean the German text using the options supplied in ``args``.
    print("Preprocess")
    preprocessor = Preprocessor(
        doLower=args["doLower"],
        doLemmatization=args["doLemmatization"],
        removeStopWords=args["removeStopWords"],
        doSpellingCorrection=args["doSpellingCorrection"],
        removeNewLine=args["removeNewLine"],
        removePunctuation=args["removePunctuation"],
        removeHtmlTags=args["removeHtmlTags"],
        minTextLength=args["minTextLength"])
    predict_df["processed"] = preprocessor.fit_transform(
        predict_df["text_german"])
    ## Drop rows whose processed text is missing — presumably texts the
    ## preprocessor filtered out (e.g. below minTextLength); confirm in
    ## Preprocessor.
    predict_df = predict_df.dropna(subset=["processed"], axis=0)

    print("Tokenize")
    ## Tokenize the cleaned strings; the fastText file is presumably used
    ## for subword/embedding lookup — verify against Tokenizer.
    tokenizer = Tokenizer(tokenizeStr=preperation_technique,
                          ngram=preperation_ngram,
                          fasttextFile=args["fasttext_file"],
                          doLower=args["doLower"])
    predict_df["processed"] = tokenizer.fit_transform(predict_df["processed"])

    ## for testing purposes
    #train_df = train_df.sample(100)
    #val_df = val_df.sample(20)
    #test_df = test_df.sample(20)

    ## apply the model
            ## NOTE(review): this chunk is cut mid-function — the ``if`` that
            ## pairs with the ``else:`` below is outside this view.
            ## NOTE(review): "vaidation_split" is a typo for
            ## "validation_split" in both messages; fix at the source.
            logging.error("vaidation_split needs to be given.")
            sys.exit("vaidation_split needs to be given.")

        ## get data and train columns
        ## The single non-target column holds the input text (assumes exactly
        ## one such column exists — TODO confirm).
        data_column = list(set(train_df.columns) - set(args["targets"]))[0]

        ## do the preprocessing
        print("Preprocess")
        ## NOTE(review): unlike the prediction path, this Preprocessor omits
        ## removeHtmlTags/minTextLength — confirm whether that is intended.
        preprocessor = Preprocessor(
            doLower=args["doLower"],
            doLemmatization=args["doLemmatization"],
            removeStopWords=args["removeStopWords"],
            doSpellingCorrection=args["doSpellingCorrection"],
            removeNewLine=args["removeNewLine"],
            removePunctuation=args["removePunctuation"])
        ## Fit on the training split only; validation/test reuse the fitted
        ## state.
        train_df[data_column] = preprocessor.fit_transform(
            train_df[data_column])
        val_df[data_column] = preprocessor.transform(val_df[data_column])
        test_df[data_column] = preprocessor.transform(test_df[data_column])

        ## save the preprocessed data
        ## Cache the cleaned splits so later runs can skip preprocessing.
        if not os.path.exists(os.path.join(args["data_path"], "temp")):
            os.makedirs(os.path.join(args["data_path"], "temp"))
        train_df.to_pickle(train_pre_path)
        val_df.to_pickle(val_pre_path)
        test_df.to_pickle(test_pre_path)
    else:
        ## Cached preprocessed splits already exist — load them instead.
        train_df = pd.read_pickle(train_pre_path)
        val_df = pd.read_pickle(val_pre_path)
        test_df = pd.read_pickle(test_pre_path)
        ## get data and train columns
        data_column = list(set(train_df.columns) - set(args["targets"]))[0]