Esempi in Python per DataPreprocessor.preprocess_data

Linguaggio di programmazione: Python

Spazio dei nomi/nome del pacchetto: data_preprocessor

Classe/tipologia: DataPreprocessor

Metodo/funzione: preprocess_data

Esempi su hotexamples.com: 3

DataPreprocessor.preprocess_data in Python: 3 esempi trovati. Questi sono i migliori esempi reali in Python per data_preprocessor.DataPreprocessor.preprocess_data, estratti da progetti open source. Li puoi valutare, per aiutarci a migliorare la qualità dei nostri esempi.

Metodi utilizzati di frequente

Mostra / Nascondi

DataPreprocessor(30)

process_data(6)

preprocess_data(3)

explore_factors(2)

explore_numeric_columns(2)

get_reprocessed_data(2)

restore_preprocessing_parameters(2)

get_data(2)

preprocess(1)

print_exploration(1)

process_and_save(1)

store_preprocessing_parameters(1)

split_predictors(1)

prepare_to_model(1)

remove_repeated_substrings(1)

scale_data(1)

sort_index(1)

split_dataset(1)

prepare_training_data(1)

merge(1)

one_hot_encode_labels(1)

normalize_product_groups_names(1)

add_degree_days(1)

get_mfcc(1)

get_df(1)

fill_missing_values(1)

feature_engineering(1)

display_dataset(1)

create_dummies(1)

convert_columns_to_numeric(1)

clean_data(1)

clean_columns(1)

add_total_cost(1)

add_time_features(1)

add_product_name(1)

summarize(1)

Esempio n. 1

Mostra file

class PredictionPipeline():
    """Periodically fetch data, make a prediction and publish the result.

    Each cycle pulls data through the DataPreprocessor, hands it to the
    PredictionMaker, applies the resulting Prediction and pushes both the
    prediction and the elapsed time to a Prometheus Pushgateway.
    """

    def __init__(self):
        """Load the configuration and create the collaborating objects.

        Reads ``interval`` and ``single_threshold`` from
        ``./config/predictionconfig.yml`` and the Pushgateway host from the
        ``PUSHGATEWAY_URL`` environment variable.
        """
        # load config file
        with open("./config/predictionconfig.yml", "r") as ymlfile:
            cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
        self.interval = cfg['interval']
        self.threshold = cfg['single_threshold']
        # init DataPreprocessor
        self.data_preprocessor = DataPreprocessor()
        # init PredictionMaker
        self.prediction_maker = PredictionMaker()
        self.registry = CollectorRegistry()
        self.pushgateway_url = os.getenv('PUSHGATEWAY_URL')
        # Register the timing gauge exactly once: registering the same metric
        # name on the same registry again on every loop iteration would fail.
        self.prediction_speed_gauge = Gauge(
            'prediction_making_speed',
            'Time in ms for making Prediction.',
            registry=self.registry)

    def run(self):
        """Run the prediction cycle forever, sleeping ``interval`` seconds
        between cycles.

        A cycle whose preprocessed DataFrame is empty skips prediction and
        goes straight back to sleep.
        """
        while True:
            start_millis = int(round(time.time() * 1000))
            print("Starting pipeline...")

            # get data
            df = self.data_preprocessor.get_data()
            df = self.data_preprocessor.preprocess_data(df)

            if not df.empty:

                # predict
                result = self.prediction_maker.make_prediction(df)
                end_millis = int(round(time.time() * 1000))
                prediction_millis = end_millis - start_millis
                prediction = Prediction(result)

                # apply changes to K8s Cluster
                prediction.apply(self.threshold)

                # push to prometheus gateway
                prediction.push_to_prometheus(self.registry,
                                              self.pushgateway_url)
                # Use the instance registry; the module has no bare
                # ``registry`` name.
                self.prediction_speed_gauge.set(prediction_millis)
                push_to_gateway('{}:9091'.format(self.pushgateway_url),
                                job='prediction-maker',
                                registry=self.registry)
                print("Prediction took {} ms.".format(prediction_millis))

            # sleep until next interval
            print("Going back to sleep for {} sec...".format(self.interval))
            time.sleep(self.interval)

Esempio n. 2

Mostra file

File: test_data_preprocessor.py Progetto: Apodini/pythia

class TestDataPreprocessor(unittest.TestCase):
    """ Unit tests exercising DataPreprocessor.preprocess_data
    """
    def setUp(self):
        """ Build a fresh DataPreprocessor for every test
        """
        self.data_preprocessor = DataPreprocessor()

    def test_preprocess_empty_data(self):
        """ An empty input frame must come back as an empty frame with the
        columns sessionid, nextcluster and starttime
        """
        input_columns = [
            'traceid', 'sessionid', 'servicessequence', 'starttime'
        ]
        empty_input = pd.DataFrame(columns=input_columns)
        actual = self.data_preprocessor.preprocess_data(empty_input)

        expected_columns = ['sessionid', 'nextcluster', 'starttime']
        expected = pd.DataFrame(columns=expected_columns)
        assert_frame_equal(actual, expected)

    def test_preprocess_data(self):
        """ A single-row input frame must produce the expected single-row
        result (column order, dtypes and index type are not compared)
        """
        input_columns = [
            'traceid', 'sessionid', 'servicessequence', 'starttime'
        ]
        input_rows = [
            ['1234', '1234', 'service-1,service-2,service-1', '1234'],
        ]
        actual = self.data_preprocessor.preprocess_data(
            pd.DataFrame(input_rows, columns=input_columns))

        expected_columns = [
            'index', 'traceid', 'sessionid', 'servicessequence',
            'starttime', 'currentclusternumber', 'clustersequence',
            'nextcluster'
        ]
        expected_rows = [
            [0.0, '1234', '1234', 'service-1,service-2', '1234', 0, 0, 0],
        ]
        expected = pd.DataFrame(expected_rows, columns=expected_columns)

        # Sort columns by name on both sides so only the content is compared.
        actual_sorted = actual.sort_index(axis=1)
        expected_sorted = expected.sort_index(axis=1)
        assert_frame_equal(actual_sorted,
                           expected_sorted,
                           check_dtype=False,
                           check_index_type=False)

Esempio n. 3

Mostra file

File: mechanism_creator.py Progetto: Apodini/pythia

class MechansimCreator():
    ''' Class responsible for creating the mechanism for the prediction
    '''

    def __init__(self, algorithm):
        ''' Select the classifier and init the data preprocessor

        algorithm is converted with str() and compared case-insensitively;
        supported values are 'decisiontree' and 'randomforest'.

        Raises ValueError for any other value, instead of leaving self.clf
        unset and failing later inside run().
        '''
        algorithm_name = str(algorithm).lower()
        if algorithm_name == 'decisiontree':
            self.clf = DecisionTreeClassifier()
        elif algorithm_name == 'randomforest':
            self.clf = RandomForestClassifier()
        else:
            raise ValueError(
                "Unsupported algorithm {!r}; expected 'decisiontree' or "
                "'randomforest'".format(algorithm))
        # Created after validation so a bad algorithm name fails before the
        # preprocessor is constructed.
        self.data_preprocessor = DataPreprocessor()

    def run(self):
        ''' Train the classifier on the preprocessed data, print its
        accuracy on the held-out split and store the mechanism
        '''
        df = self.data_preprocessor.get_data()
        df = self.data_preprocessor.preprocess_data(df)
        # nextcluster as label for ML algorithm
        y = df['nextcluster'].values.astype('int')
        # currentclusternumber is the only feature column used
        X = df[['currentclusternumber']].values

        X_train, X_test, y_train, y_test = train_test_split(X, y)

        # TODO: imbalanced classes?
        print(df.nextcluster.value_counts())

        # Classifier
        self.clf = self.clf.fit(X_train, y_train)
        y_pred = self.clf.predict(X_test)
        print("Accuracy of Classifier:", metrics.accuracy_score(y_test, y_pred))

        # save the model to db, stamped with the current time in milliseconds
        # NOTE(review): data_preprocessor.le is presumably the label encoder
        # fitted during preprocessing - confirm against DataPreprocessor.
        mechanism = Mechansim(self.clf, self.data_preprocessor.le, int(round(time.time() * 1000)))
        mechanism.save()