Example #1
0
File: base.py Project: Leo-VK/creme
 def _unit_test_params(cls):
     """Yield keyword-argument dicts used to instantiate the model in unit tests."""
     # First configuration: two hand-picked pipelines with different SGD learning rates.
     yield {
         "models": [
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(lr=rate)),
             )
             for rate in (1e-2, 1e-1)
         ],
         "metric": metrics.MAE(),
     }
     # Second configuration: a sweep over a wider range of learning rates.
     yield {
         "models": [
             compose.Pipeline(
                 preprocessing.StandardScaler(),
                 linear_model.LinearRegression(optimizer=optim.SGD(lr=rate)),
             )
             for rate in (1e-4, 1e-3, 1e-2, 1e-1)
         ],
         "metric": metrics.MAE(),
     }
Example #2
0
class RiverML:
    """Bundle the fraud-detection model with the metric used to score it."""

    # Standardise features before feeding them to the logistic regression.
    model = compose.Pipeline(
        preprocessing.StandardScaler(),
        linear_model.LogisticRegression(),
    )

    # ROCAUC scores the model online as it trains.
    metric = metrics.ROCAUC()
Example #3
0
 def __init__(self):
     """Set up the online model, its accuracy metric, and a sample counter."""
     # Standardise features, then fit a logistic regression with SGD (lr=0.1).
     self.model = compose.Pipeline(
         preprocessing.StandardScaler(),
         linear_model.LogisticRegression(optim.SGD(0.1)),
     )
     self.metric = metrics.Accuracy()
     self.count = 0
    def __init__(self,
                 my_id=1,
                 bootstrap_servers='',
                 list_of_partitions=None,
                 request_topic='',
                 inference_topic='',
                 group_id='my_grp'):
        """Constructor.

        :param my_id: numeric identifier of this worker instance
        :param bootstrap_servers: Kafka bootstrap servers (host:port,...)
        :param list_of_partitions: partition numbers of ``request_topic`` to consume
        :param request_topic: topic carrying inference requests
        :param inference_topic: topic where inference results are published
        :param group_id: Kafka consumer group id
        """
        # Fix: the original used a mutable default argument (list_of_partitions=[]),
        # which is shared across calls. Default to None and normalise here.
        if list_of_partitions is None:
            list_of_partitions = []

        # Anomaly-detection pipeline; scores are evaluated with ROCAUC.
        self.model = compose.Pipeline(
            preprocessing.MinMaxScaler(), anomaly.HalfSpaceTrees(
                seed=42))  # tree.HoeffdingTreeClassifier(max_depth=10)
        self.metric = metrics.ROCAUC()  # metrics.Accuracy() #
        self.my_id = my_id
        self.t = request_topic
        self.result_t = inference_topic
        self.my_grp_id = group_id
        self.result_t_p = 8  # partition used when producing results
        self.bootstrap_servers = bootstrap_servers

        # Map each requested partition number to a TopicPartition of the request topic.
        self.tls = [TopicPartition(self.t, p) for p in list_of_partitions]
        print(self.tls)

        # NOTE(review): credentials below are placeholders — inject real secrets
        # via configuration, never hard-code them.
        conf = {
            'bootstrap.servers': bootstrap_servers,
            'sasl.mechanism': 'PLAIN',
            'security.protocol': 'SASL_SSL',
            'ssl.ca.location': '/tmp/cacert.pem',
            'sasl.username': '******',
            'sasl.password': '******',
            'client.id': 'test-sw-1'
        }
        self.producer = Producer(conf)

        conf = {
            'bootstrap.servers': bootstrap_servers,
            'sasl.mechanism': 'PLAIN',
            'security.protocol': 'SASL_SSL',
            'sasl.username': '******',
            'sasl.password': '******',
            'ssl.ca.location': '/tmp/cacert.pem',
            'group.id': group_id,
            'auto.offset.reset': 'latest'
        }
        # Fix: dropped the redundant `consumer =` intermediate binding.
        self.consumer = Consumer(conf)
        self.consumer.assign(self.tls)
Example #5
0
 def _unit_test_params(cls):
     """Return keyword arguments used to build the model during unit tests."""
     # Two scaler + linear-regression pipelines differing only in learning rate.
     pipelines = [
         compose.Pipeline(
             preprocessing.StandardScaler(),
             linear_model.LinearRegression(optimizer=optim.SGD(lr=rate)),
         )
         for rate in (0.01, 0.1)
     ]
     return {"models": pipelines, "metric": metrics.MAE()}
Example #6
0
def test_list_of_funcs():
    """A bare list of functions should act as a transformer union in a pipeline."""

    def f(x):
        return {"f": 1}

    def g(x):
        return {"g": 2}

    def times_2(x):
        return {key: value * 2 for key, value in x.items()}

    expected = {"f": 2, "g": 4}

    # Explicit Pipeline construction.
    assert compose.Pipeline([f, g], times_2).transform_one(None) == expected
    # Equivalent construction via the | operator.
    piped = [f, g] | compose.FuncTransformer(times_2)
    assert piped.transform_one(None) == expected
Example #7
0
def test_no_learn_unsupervised_one(func):
    """Calling `func` with learn_unsupervised=False must not update the scaler."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    def scaler_counts():
        return dict(pipeline.steps["scale"].counts)

    samples = [({"a": v, "b": v}, v) for v in range(100)]

    for features, _ in samples:
        before = scaler_counts()
        func(pipeline, features, learn_unsupervised=True)
        after_learning = scaler_counts()
        func(pipeline, features, learn_unsupervised=False)
        after_frozen = scaler_counts()

        # The first call updated the scaler's statistics...
        assert before != after_learning
        # ...and the second, with learning disabled, did not.
        assert after_learning == after_frozen
Example #8
0
def test_predict_class_given_unseen_features():
    """A Naive Bayes pipeline must still predict when every token is unseen.

    Trains a bag-of-words -> MultinomialNB pipeline on two tiny documents,
    then checks the Laplace-smoothed likelihoods and the predicted label for
    text made entirely of out-of-vocabulary words.
    """
    model = compose.Pipeline(
        ("tokenize", feature_extraction.BagOfWords()),
        ("nb", naive_bayes.MultinomialNB(alpha=1)),
    )

    docs = [
        ("cloudy cold", 0),
        ("sunny warm", 1),
    ]

    for sentence, label in docs:
        model = model.learn_one(sentence, label)

    # Assert model parameters needed to calculate the likelihoods
    assert model["nb"].n_terms == 4
    assert model["nb"].class_totals[0] == 2
    assert model["nb"].class_totals[1] == 2

    # Given new, unseen text, predict the label
    text = "new word"
    tokens = model["tokenize"].transform_one(text)
    cp = model["nb"].p_feature_given_class

    # P(new|0)
    #   = (N_new_0 + 1) / N_0 + N_terms)
    #   = (0 + 1) / (model['nb'].class_totals[0] + model['nb'].n_terms)
    assert cp("new", 0) == (0 + 1) / (2 + 4)

    # Since class_totals[0] == class_totals[1], and both words in text are new/unseen,
    # expect the class-conditional probabilities to be the same
    assert cp("new", 0) == cp("word", 0)
    assert cp("new", 0) == cp("new", 1)
    assert cp("new", 0) == cp("word", 1)

    jll = model["nb"].joint_log_likelihood(tokens)

    # Expect JLLs to be equal
    assert jll[0] == jll[1]

    # P(0|new word)
    #   = P(new|0) * P(word|0) * P(0)
    assert jll[0] == math.log(cp("new", 0) * cp("word", 0) * (1 / 2))

    # JLLs for both labels are the same, but 0 was the first label to be added to model['nb'].class_counts
    assert model.predict_one(text) == 0
Example #9
0
def test_no_learn_unsupervised_score_one():
    """score_one with learn_unsupervised=False must leave the scaler untouched."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("anomaly", anomaly.HalfSpaceTrees()),
    )

    samples = [({"a": v, "b": v}, v) for v in range(100)]

    for features, _ in samples:
        before = dict(pipeline.steps["scale"].counts)
        pipeline.score_one(features, learn_unsupervised=True)
        after_learning = dict(pipeline.steps["scale"].counts)
        pipeline.score_one(features, learn_unsupervised=False)
        after_frozen = dict(pipeline.steps["scale"].counts)

        # Scoring with learning enabled updates the scaler's statistics...
        assert before != after_learning
        # ...while scoring with learning disabled does not.
        assert after_learning == after_frozen
Example #10
0
 def __init__(self, step, name):
     """Initialise the worker with its model, metric set, and reporting step."""
     self.name = name
     self.optimizer = SynchronousSGD(0.01, name, None)
     self.model = compose.Pipeline(
         preprocessing.StandardScaler(),
         linear_model.LogisticRegression(self.optimizer))
     # Track several metrics at once so callers can choose what to report.
     self.metrics = [
         metrics.Accuracy(),
         metrics.MAE(),
         metrics.RMSE(),
         metrics.Precision(),
         metrics.Recall(),
     ]
     self.count = 0
     # Default to a step of 50 when none is provided.
     self.step = 50 if step is None else int(step)
Example #11
0
def test_learn_one_warm_up_mode():
    """Unsupervised steps update during warm-up learn_one but not afterwards."""
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    samples = [({"a": v, "b": v}, bool(v % 2)) for v in range(100)]

    for features, target in samples:
        before = dict(pipeline["scale"].counts)
        with utils.warm_up_mode():
            pipeline.learn_one(features, target)
        during = dict(pipeline["scale"].counts)
        pipeline.learn_one(features, target)
        after = dict(pipeline["scale"].counts)

        # Warm-up learning updated the scaler's statistics...
        assert before != during
        # ...while a regular learn_one left them unchanged.
        assert during == after
Example #12
0
def test_learn_many_warm_up_mode():
    """Batch learning under warm_up_mode updates the scaler; a plain learn_many does not.

    Also slices each mini-batch out of the dataset once, instead of rebuilding
    the full feature and target lists on every iteration as the original did
    (which made each batch cost O(n) and the whole loop quadratic).
    """
    pipeline = compose.Pipeline(
        ("scale", preprocessing.StandardScaler()),
        ("log_reg", linear_model.LogisticRegression()),
    )

    dataset = [(dict(a=x, b=x), x) for x in range(100)]

    for i in range(0, len(dataset), 5):
        # Take the mini-batch first, then split features and targets.
        batch = dataset[i:i + 5]
        X = pd.DataFrame([x for x, _ in batch])
        y = pd.Series([bool(y % 2) for _, y in batch])

        counts_pre = dict(pipeline["scale"].counts)
        with utils.warm_up_mode():
            pipeline.learn_many(X, y)
        counts_post = dict(pipeline["scale"].counts)
        pipeline.learn_many(X, y)
        counts_no_learn = dict(pipeline["scale"].counts)

        # The scaler's statistics changed under warm-up...
        assert counts_pre != counts_post
        # ...but a regular learn_many left them unchanged.
        assert counts_post == counts_no_learn
Example #13
0
                    "optimizer": [
                        (optim.SGD, {"lr": [1, 2]}),
                        (
                            optim.Adam,
                            {
                                "beta_1": [0.1, 0.01, 0.001],
                                "lr": [0.1, 0.01, 0.001, 0.0001],
                            },
                        ),
                    ]
                }
            },
            2 + 3 * 4,
        ),
        (
            compose.Pipeline(("Scaler", None), linear_model.LinearRegression()),
            {
                "Scaler": [
                    preprocessing.MinMaxScaler(),
                    preprocessing.MaxAbsScaler(),
                    preprocessing.StandardScaler(),
                ],
                "LinearRegression": {"optimizer": {"lr": [1e-1, 1e-2, 1e-3]}},
            },
            3 * 3,
        ),
    ],
)
def test_expand_param_grid_count(model, param_grid, count):
    """The expanded grid must contain exactly `count` parameter combinations."""
    grid = utils.expand_param_grid(model, param_grid)
    assert len(grid) == count
Example #14
0
# AutoML search space: a scaler slot and a classifier slot, each holding
# interchangeable candidate estimators for hyper-parameter search.
AUTOML_CLASSIFICATION_PIPELINE = compose.Pipeline(
    (
        'Scaler',
        PipelineHelperTransformer([
            ('StandardScaler', preprocessing.StandardScaler()),
            ('MinMaxScaler', preprocessing.MinMaxScaler()),
            # NOTE(review): the label 'MinAbsScaler' wraps MaxAbsScaler — confirm
            # the key is intentional; param grids reference steps by this name.
            ('MinAbsScaler', preprocessing.MaxAbsScaler()),
            # todo create dummy
            # ('RobustScaler', preprocessing.RobustScaler()),
            # ('AdaptiveStandardScaler', preprocessing.AdaptiveStandardScaler()),
            # ('LDA', preprocessing.LDA()),
        ])),
    # ('FeatureExtractor', PipelineHelperTransformer([
    #    ('PolynomialExtender', feature_extraction.PolynomialExtender()),
    # ('RBF', feature_extraction.RBFSampler()),
    # ])),
    (
        'Classifier',
        PipelineHelperClassifier([
            ('HT', tree.HoeffdingTreeClassifier()),
            # ('FT', tree.ExtremelyFastDecisionTreeClassifier()),
            ('LR', linear_model.LogisticRegression()),
            # ('HAT', tree.HoeffdingAdaptiveTreeClassifier()),
            ('GNB', naive_bayes.GaussianNB()),
            # ('MNB', naive_bayes.MultinomialNB()),
            # ('PAC', linear_model.PAClassifier()),
            # ('ARF', ensemble.AdaptiveRandomForestClassifier()),
            ('KNN', neighbors.KNNClassifier()),
        ])))
Example #15
0
        "Tokyo Tokyo",
        "Macao Macao new",
        "new",
    ]


def yield_batch_unseen_data():
    """Yield each unseen-data sample wrapped in a pandas Series (batch API form)."""
    for sample in yield_unseen_data():
        yield pd.Series(sample)


@pytest.mark.parametrize(
    "inc_model, batch_model, bag, sk_model",
    [
        pytest.param(
            compose.Pipeline(
                ("tokenize", feature_extraction.BagOfWords(lowercase=False)),
                ("model", model(alpha=alpha)),
            ),
            compose.Pipeline(
                ("tokenize", feature_extraction.BagOfWords(lowercase=False)),
                ("model", model(alpha=alpha)),
            ),
            feature_extraction.BagOfWords(lowercase=False),
            sk_model(alpha=alpha),
            id=f"{model.__name__} - {alpha}",
        ) for model, sk_model in [
            (naive_bayes.MultinomialNB, sk_naive_bayes.MultinomialNB),
            (naive_bayes.BernoulliNB, sk_naive_bayes.BernoulliNB),
            (naive_bayes.ComplementNB, sk_naive_bayes.ComplementNB),
        ] for alpha in [alpha for alpha in range(1, 4)]
    ],
)
from river import tree
from river import compose
from river import preprocessing
from river import linear_model

from tqdm import tqdm

# NOTE(review): `scaler` and `log_reg` are created but never added to the
# pipeline below — confirm whether they were meant to be composed into `model`.
scaler = preprocessing.StandardScaler()
log_reg = linear_model.LinearRegression()

# Hoeffding tree: wait 100 samples between split attempts; split when the
# confidence bound reaches 1e-5.
hf_tree = tree.HoeffdingTreeClassifier(
    grace_period=100,
    split_confidence=1e-5,
)

# The pipeline ends up containing only the tree.
model = compose.Pipeline()
model |= hf_tree

# Online training: one learn_one call per training row.
for index, raw in tqdm(train_features.iterrows(),
                       total=train_features.shape[0]):
    model.learn_one(raw, train_target[index])

correct_cnt = 0

# Evaluate on the held-out rows by counting exact prediction matches.
for index, raw in tqdm(test_features.iterrows(), total=test_features.shape[0]):
    test_pred = model.predict_one(raw)
    if test_pred == test_target[index]:
        correct_cnt += 1

# NOTE(review): the printed value is a fraction (0..1) although the message
# says "%" — confirm whether a *100 was intended.
print("test accuracy: {} %".format(correct_cnt / test_features.shape[0]))
Example #17
0
                          'beta_1': [.1, .01, .001],
                          'lr': [.1, .01, .001, .0001]
                      })]
    }, 2 + 3 * 4),
    (preprocessing.StandardScaler() | linear_model.LinearRegression(), {
        'LinearRegression': {
            'optimizer': [(optim.SGD, {
                'lr': [1, 2]
            }),
                          (optim.Adam, {
                              'beta_1': [.1, .01, .001],
                              'lr': [.1, .01, .001, .0001]
                          })]
        }
    }, 2 + 3 * 4),
    (compose.Pipeline(('Scaler', None), linear_model.LinearRegression()), {
        'Scaler': [
            preprocessing.MinMaxScaler(),
            preprocessing.MaxAbsScaler(),
            preprocessing.StandardScaler()
        ],
        'LinearRegression': {
            'optimizer': {
                'lr': [1e-1, 1e-2, 1e-3]
            }
        }
    }, 3 * 3)
])
def test_expand_param_grid_count(model, param_grid, count):
    """Expanding the param grid should yield exactly `count` combinations."""
    expanded = utils.expand_param_grid(model, param_grid)
    assert len(expanded) == count
Example #18
0
                         task=datasets.base.REG,
                         n_features=1,
                         n_samples=1440)

    def __iter__(self):
        """Stream rows from the backing CSV, casting the target column to int."""
        cast = {'interval_qps': int}
        return stream.iter_csv(self.path, target='interval_qps', converters=cast)


def get_ordinal_date(x):
    """Return the elapsed seconds of `x` as the integer feature 'ordinal_date'."""
    seconds = int(x['secs_elapsed'])
    return {'ordinal_date': seconds}


# Pipeline: derive the ordinal-date feature, scale it, then fit a linear model.
model = compose.Pipeline(
    ('ordinal_date', compose.FuncTransformer(get_ordinal_date)),
    ('scale', preprocessing.MinMaxScaler()),
    ('lin_reg', linear_model.LinearRegression()))

# NOTE(review): mid-file imports — consider hoisting these to the top of the module.
from river import metrics
import matplotlib.pyplot as plt

# target_data = "../log_traces/Mixgraph/1000_0.0000073_45000/report.csv"
target_data = "../log_traces/StorageMaterial.NVMeSSD/12CPU/64MB/report.csv_1180"
import os
# Resolve the relative trace path against the current working directory.
target_data = os.path.abspath(target_data)


def evaluate_model(model):

    metric = metrics.Rolling(metrics.MAE(), 12)
Example #19
0
    dataset['similarity'] = similarity(dataset['title'], dataset['text'])

    return dataset


# Work on copies so the original tuples stay untouched.
train = train_tuple[:]

test = test_tuple[:]

#Passive Aggressive Classifier
# Two feature branches merged by a TransformerUnion: scaled numeric features
# (pipe1) and TF-IDF over the raw content (pipe2), feeding a PA classifier.
PA_model = compose.Pipeline(
    ('features',
     compose.TransformerUnion(
         ('pipe1',
          compose.Pipeline(('select_numeric_features',
                            compose.Select('length', 'punct%', 'similarity')),
                           ('scale', preprocessing.MinMaxScaler()))),
         ('pipe2',
          compose.Pipeline(
              ('select_text_features', compose.Select('content')),
              ('tfidf', feature_extraction.TFIDF(on='content')))))),
    ('modeling', linear_model.PAClassifier()))

metric = metrics.ROCAUC()
train1 = train[:]
PA_score1 = []
y_pred_l1 = []
y_l1 = []
# NOTE(review): this loop only collects predictions — it never calls learn_one
# or updates `metric`; the training/update part may have been truncated.
for x, y in train1:
    x = text_processing(x)
    y_pred = PA_model.predict_one(x)
    y_pred_l1.append(y_pred)
# Benchmark tracks paired with human-readable names.
tracks = [
    ('Random RBF', random_rbf_track),
    ('AGRAWAL', agrawal_track),
    ('Anomaly Sine', anomaly_sine_track),
    ('Concept Drift', concept_drift_track),
    ('Hyperplane', hyperplane_track),
    ('Mixed', mixed_track),
    ('SEA', sea_track),
    ('Sine', sine_track),
    ('STAGGER', stagger_track)
]

# Baseline: plain scaler followed by a Hoeffding tree.
estimator1 = compose.Pipeline(
    preprocessing.StandardScaler(),
    #feature_extraction.PolynomialExtender(),
    tree.HoeffdingTreeClassifier()
)

# Same model, but the scaler is wrapped in a PipelineHelperTransformer so it
# can be swapped out during hyper-parameter search.
estimator2 = compose.Pipeline(
    pipelinehelper.PipelineHelperTransformer([
        ('scaler', preprocessing.StandardScaler())
    ]),
    #feature_extraction.PolynomialExtender(),
    ('classifier', tree.HoeffdingTreeClassifier())
)
estimator3 = compose.Pipeline(
    ('scaler', preprocessing.StandardScaler()),
    pipelinehelper.PipelineHelperClassifier([
        ('classifier', tree.HoeffdingTreeClassifier())
    ])
Example #21
0
# First 5 000 samples are the training split; the remainder is held back.
dataset_tuple_a = dataset_tuple[:5000]

train = dataset_tuple_a[:]

dataset_tuple_b = dataset_tuple[5000:]

#len(dataset_tuple_b)

# Two feature branches merged by a TransformerUnion:
#   pipe1 — drop the raw-text columns and standardise the numeric remainder;
#   pipe2 — keep only the cleaned title and apply TF-IDF to it.
# NOTE(review): step key 'drop_non_featuress' is misspelled; renaming it would
# change the step's lookup key, so it is left as-is here.
Logistic_model = compose.Pipeline(
    ('features',
     compose.TransformerUnion(
         ('pipe1',
          compose.Pipeline(('drop_non_features',
                            compose.Discard('body', 'date', 'subject', 'text',
                                            'title', 'title_clean')),
                           ('scale', preprocessing.StandardScaler()))),
         ('pipe2',
          compose.Pipeline(
              ('drop_non_featuress',
               compose.Discard('body', 'body_len', 'body_num', 'date',
                               'punct%', 'subject', 'text', 'title',
                               'title_len', 'title_num')),
              ('tfidf', feature_extraction.TFIDF(on='title_clean')))))),
    ('modeling', linear_model.LogisticRegression()))

#metric = metrics.Accuracy()

#evaluate.progressive_val_score(dataset_tuple_a, model, metric)

#model.predict_proba_one(z)

#model.predict_one(z)
Example #22
0
from river import compose
from river import preprocessing
from river import linear_model
from river import metrics
from river import datasets
from river import optim

# Logistic regression on standardised features, trained online with SGD (lr=0.1).
optimizer = optim.SGD(0.1)
model = compose.Pipeline(preprocessing.StandardScaler(),
                         linear_model.LogisticRegression(optimizer))

metric = metrics.ROCAUC()
precision = metrics.Precision()

# Progressive validation: predict each sample first, then learn from it.
for x, y in datasets.Phishing():
    y_pred = model.predict_proba_one(x)
    model.learn_one(x, y)
    metric.update(y, y_pred)
    # NOTE(review): `y_pred` is the probability mapping from predict_proba_one;
    # Precision normally scores hard labels (predict_one) — confirm intended.
    precision.update(y, y_pred)

print(metric)
print(precision)
def get_ordinal_data(x):
    """Map the 'month' date of `x` to its proleptic Gregorian ordinal."""
    ordinal = x['month'].toordinal()
    return {'ordinal_data': ordinal}


def get_month(x):
    """One-hot encode the month of `x['month']`, keyed by English month name."""
    current = x['month'].month
    return {calendar.month_name[m]: m == current for m in range(1, 13)}


# To monthly trend by one-hot encoding the month name
model = compose.Pipeline(
    ('features',
     compose.TransformerUnion(
         ('ordinal_date', compose.FuncTransformer(get_ordinal_data)),
         ('month', compose.FuncTransformer(get_month)),
     )),
    # Bug fix: StandardScaler must be instantiated — the original passed the
    # class object itself instead of an instance, which breaks the pipeline.
    ('scale', preprocessing.StandardScaler()),
    ('lin_reg',
     linear_model.LinearRegression(intercept_lr=0, optimizer=optim.SGD(0.05))))

# Remove the 12-step (yearly) seasonality so the regressor fits the residual trend.
model = time_series.Detrender(regressor=model, window_size=12)

# Buffers for plotting actual vs. predicted values over time.
dates = []
dates_pred = []
y_trues = []
y_preds = []

# Frames collected for an animation of the fit.
images = []


def elevate_model(model):