def printing_test(arq="iris.arff"):
    print(Chain(Map(select(File(arq)))))
    exp = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Map(Report("<---------------------- fold"), enhance=False),
        Summ(function="mean", enhance=False),
        Reduce(),
        Report("mean ... S: $S", enhance=False),
    )
    print(exp)
    print(select(DT(), SVMC()))

    sel = select(DT(), SVMC())
    print(sel)
    print(Map(DT()))
    exp = ChainCS(
        File(arq),
        Partition(),
        Map(PCA(), select(SVMC(), DT(criterion="gini")),
            Metric(enhance=False)),
        Report("test"),
        Map(Report("<---------------------- fold")),
    )
    print(exp)
def test_cache(arq="iris.arff"):
    pipe = Workflow(Cache(File(arq), storage_alias="default_sqlite"),
                    Report("{history}"))
    train, test = pipe.dual_transform()

    print("Train..............\n", train.history ^ "name")
    print("Test..........\n", test.history ^ "name")
def ger_workflow(seed=0, arq="iris.arff"):
    np.random.seed(seed)

    workflow = Workflow(File(arq),
                        Partition(),
                        Map(PCA(), select(SVMC(), DT(criterion="gini")),
                            Metric(enhance=False)),
                        Summ(function="mean", enhance=False),
                        Reduce(),
                        Report("Mean S: $S", enhance=False),
                        seed=seed)

    return workflow
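# A minimal usage sketch for ger_workflow (not part of the original example): it assumes
# the same dual_transform()/history API exercised by the other tests in this file, and the
# test_ger_workflow name is ours.
def test_ger_workflow(arq="iris.arff"):
    workflow = ger_workflow(seed=0, arq=arq)
    train, test = workflow.dual_transform()
    print("Train..............\n", train.history ^ "name")
    print("Test..........\n", test.history ^ "name")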
def test_with_summ_reduce(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric()),
        Map(Report("<---------------------- stage")),
        Summ(),
        Reduce(),
        Report("mean ... S: $S"),
    )
    train, test = pipe.dual_transform()

    print("Train..............\n", train.history ^ "longname")
    print("Test..........\n", test.history ^ "longname")
def test_check_architecture2(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Summ(field="Y", function="mean", enhance=False),
        Report("mean ... S: $S", enhance=False),
    )

    # File is at the front of the pipeline.
    train_ = pipe.enhancer.transform(sd.NoData)
    test_ = pipe.model(sd.NoData).transform(sd.NoData)
    test_ = pipe.model(sd.NoData).transform((sd.NoData, sd.NoData))
    train_, test_ = pipe.dual_transform(sd.NoData, sd.NoData)
    train_, test_ = pipe.dual_transform(sd.NoData, (sd.NoData, sd.NoData))
def test_split_train_test(arq="iris.arff"):
    pipe = Cache(
        File(arq),
        TsSplit(),  # TsSplit should come before TrSplit to ensure the same original data is used as input for both.
        TrSplit(),
        PCA(),
        SVMC(),
        Metric(enhance=False),
        Report("metric ... R: $R", enhance=False),
        storage_alias="oka")
    train, test = pipe.dual_transform()

    print("Train..............\n", train)
    print("Test..........\n", test)
def test_partition(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Summ(function="mean", enhance=False),
        Reduce(),
        Report("mean ... S: $S", enhance=False),
        Report("$X"),
        Report("$y"),
    )
    train, test = pipe.dual_transform()

    print("Train..............\n", train)
    print("Test..........\n", test)
def random_search(arq="iris.arff"):
    np.random.seed(0)
    exp = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), select(SVMC(), DT(criterion="gini")), Metric()),
        # Map(Report("<---------------------- fold"), enhance=False),
        Summ(function="mean"),
        Reduce(),
        Report("Mean S: $S"),
    )

    expr = sample(exp, n=10)
    result = optimize(expr, n=5)
    result.disable_pretty_printing()
    print(result)
def test_check_architecture(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(partitions=2),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Summ(field="Y", function="mean", enhance=False),
    )

    # File is at the front of the pipeline.
    train_01 = pipe.enhancer.transform(sd.NoData)
    test_01 = pipe.model(sd.NoData).transform(sd.NoData)
    train_02, test_02 = pipe.dual_transform(sd.NoData, sd.NoData)

    # Collection uuid depends on data, which depends on consumption.
    for t, *_ in train_01, train_02, test_01, test_02:
        # print(111111111, t.y)
        pass

    assert train_01.uuid == train_02.uuid
    assert test_01.uuid == test_02.uuid
def test_sequence_of_classifiers(arq="abalone.arff"):
    pipe = Workflow(
        File(arq),
        Binarize(),
        Report('1 {X.shape} {history^name}'),
        PCA(n=5),
        SVMC(),
        Metric(),
        Report('2 {X.shape} {history^name}'),
        DT(),
        Metric(),
        Report('3 {X.shape} {history^name}'),
    )
    print('Enh')
    train = pipe.enhancer.transform(sd.NoData)
    print('Mod')
    test = pipe.model(sd.NoData).transform(
        sd.NoData)  # TODO: why doesn't Report appear in the test output?
    print()

    print("[test_sequence_of_classifiers] Train.........\n",
          train.history ^ "longname")
    print("[test_sequence_of_classifiers] Test..........\n",
          test.history ^ "longname")
Example #11
#exit()


print('Storing iris...')
try:
    PickleServer().store(read_arff('iris.arff'))
    print('ok!')
except DuplicateEntryException:
    print('Duplicate! Ignored.')

numpy.random.seed(50)
# import sklearn
# print('The scikit-learn version is {}.'.format(sklearn.__version__))
print('expr .................')
expr = Pipeline(
    OnlyApply(File('iris.arff')),
    Cache(
        evaluator(
            Wrap(
                shuffle(Std, MinMax),
                # shuffle(Std, select(UnderS, OverS), MinMax),
                ApplyUsing(select(DT, NB)),
            ),
            Metric(functions=['accuracy'])
        )
    )
)



# {history.last.config['function']}
from cururu.sql.mysql import MySQL
from pjdata.content.specialdata import NoData, UUIDData

# # Partitioning ##############################################
from pjml.tool.data.flow.file import File
from pjml.tool.data.modeling.supervised.classifier.dt import DT
from pjml.tool.stream.expand.partition import Partition

data = (File("iris.arff") * Partition("cv", 10)).enhancer.transform(NoData)
print(list(map(lambda d: d.X[0], data.stream)))  # To get all partitions: list(data.stream)
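# A hedged variant of the line above (assumptions: a stream is consumed only once, so the
# pipeline is rebuilt here, and each partition exposes .X as used elsewhere in these examples;
# data2/partitions are names of our own).
data2 = (File("iris.arff") * Partition("cv", 10)).enhancer.transform(NoData)
partitions = list(data2.stream)  # materialize every partition at once
print(len(partitions), [d.X.shape for d in partitions])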

import _pickle as p

storage = MySQL(db="paje:[email protected]/paje")
data = storage.fetch(UUIDData("ĹЇЖȡfĭϹƗͶэգ8Ƀű"))
data = DT().model(data).transform(data)
with open("/tmp/lixo", "wb") as f:
    p.dump(data.history, f)
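# Hedged round-trip check (not in the original snippet): load the pickled history back and
# print it, assuming the history object dumped above is plain picklable data.
with open("/tmp/lixo", "rb") as f:
    history = p.load(f)
print(history)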

exit()

#
#
# # ML 1 ========================================================================
# # # Store the dataset without depending on the pjml package.
# # from cururu.pickleserver import PickleServer
# #
# # try:
# #     PickleServer().store(read_arff('iris.arff'))
# # except DuplicateEntryException:
# #     pass
Example #13
from pjml.tool.data.modeling.supervised.classifier.svmc import SVMC
from pjml.tool.data.processing.feature.binarize import Binarize
from pjml.tool.data.processing.instance.sampler.over.random import OverS
from pjml.tool.meta.mfe import MFE

# ML 1 ========================================================================
# # Store the dataset without depending on the pjml package.
# from cururu.pickleserver import PickleServer
#
# try:
#     PickleServer().store(read_arff('iris.arff'))
# except DuplicateEntryException:
#     pass

pipe = Pipeline(
    Cache(File('bank.arff'), Binarize(), NB(), Metric(), Report('$X')))
print('aaaaaaaa')
m = pipe.apply()
print(m.data)
print('uuuuuuuuuuuuuuu')
d = m.use()
print(d)
exit()

#     # Source('messedup-dataset'),
#     Keep(evaluator(
#         Cache(
#             ApplyUsing(
#                 NB()
#             ),
#             Metric(function='accuracy')
#             engine='mysql',
#             db='paje:[email protected]/paje',
#             blocking=not True
#         ),
#         engine='dump', blocking=True
#     )

cache = partial(Cache, engine='sqlite', blocking=False)

# cache = partial(Cache, engine='amnesia', blocking=True)

# expr = Pipeline(File(arq), cache(ApplyUsing(NB())))
# p = expr
# p.apply()
expr = Pipeline(
    OnlyApply(File(arq), cache(Binarize())),
    cache(
        Partition(),
        Map(
            Wrap(
                select(SelectBest),  # slow??
                cache(ApplyUsing(select(DT, NB, hold(RF, n_estimators=40)))),
                OnlyApply(Metric(functions=['length'])),
                OnlyUse(Metric(functions=['accuracy', 'error'])),
                # AfterUse(Metric(function=['diversity']))
            ),
        ),
        # Report('HISTORY ... S: {history}'),
        Summ(function='mean_std'),
    ),
    Report('mean and std ... S: $S'),
)
def test_pca(arq="iris.arff"):
    cs = File(arq).cs
    pipe = Workflow(File(arq), Split(), PCA(), SVMC(), Metric())
    train, test = pipe.dual_transform()
    print("Train..............\n", train.history ^ "name")
    print("Test..........\n", test.history ^ "name")
def test_metric(arq="iris.arff"):
    pipe = Workflow(File(arq), Split(), SVMC(), Metric(enhance=False))
    train, test = pipe.dual_transform()
    print("Train..............\n", train)
    print("Test..........\n", test)
def test_split(arq="iris.arff"):
    pipe = Workflow(File(arq), Split(), SVMC())
    train, test = pipe.dual_transform()
    print("Train..............\n", str(train))
    print("Test..........\n", str(test))
def test_svmc(arq="iris.arff"):
    cs = File(arq).cs
    pipe = Workflow(File(arq), SVMC())
    train, test = pipe.dual_transform()
    print("Train..............\n", train)
    print("Test..........\n", test)
#
# s = cs.sample()
# print(s)
# exit()

cache = partial(Cache, storage_alias='default_sqlite')
# cache = partial(Cache, storage_alias='mysql')
# cache = partial(Cache, storage_alias='default_dump')
# cache = partial(Cache, storage_alias='amnesia')


# expr = Pipeline(File(arq), cache(ApplyUsing(NB())))
# p = expr
# p.apply()
expr = Workflow(
    OnlyApply(File(arq), cache(Binarize())),
    cache(
        Partition(),
        Map(
            Wrap(
                select(SelectBest),  # slow??
                cache(ApplyUsing(select(DT, NB, hold(RF, n_estimators=40)))),
                OnlyApply(Metric(functions=['length'])),
                OnlyUse(Metric(functions=['accuracy', 'error'])),
                # AfterUse(Metric(function=['diversity']))
            ),
        ),
        # Report('HISTORY ... S: {history}'),
        Summ(function='mean_std'),
    ),
    Report('mean and std ... S: $S'),
)
Example #20
from pjml.tool.data.processing.instance.sampler.under.random import UnderS
from pjml.tool.meta.wrap import Wrap

# print(SelectKB.cs)
# exit()
#
# cs = Pipeline(SelectKB)
# print(cs)
# exit()
#
# s = cs.sample()
# print(s)
# exit()

expr = Workflow(
    OnlyApply(File("abalone3.arff"), Binarize()),
    Partition(),
    Map(
        Wrap(
            select(SelectBest),
            ApplyUsing(select(DT, RF, NB)),
            OnlyApply(Metric(functions=['length'])),
            OnlyUse(Metric(functions=['accuracy', 'error'])),
            # AfterUse(Metric(function=['diversity']))
        ), ),
    Report('HISTORY ... S: {history}'),
    Summ(function='mean_std'),
    Report('mean and std ... S: $S'),
    OnlyApply(Copy(from_field="S", to_field="B")),
    OnlyApply(Report('copy S to B ... B: $B')),
    OnlyUse(
Example #21
from pjml.tool.data.evaluation.mconcat import MConcat
from pjml.tool.data.evaluation.metric import Metric
from pjml.tool.data.flow.applyusing import ApplyUsing
from pjml.tool.data.flow.file import File
from pjml.tool.data.flow.onlyoperation import OnlyApply, OnlyUse
from pjml.tool.data.flow.sink import Sink
from pjml.tool.data.manipulation.copy import Copy
from pjml.tool.data.modeling.supervised.classifier.rf import RF
from pjml.tool.data.processing.feature.binarize import Binarize
from pjml.tool.data.processing.feature.scaler.minmax import MinMax
from pjml.tool.data.processing.instance.sampler.over.random import OverS
from pjml.tool.data.processing.instance.sampler.under.random import UnderS
from pjml.tool.meta.wrap import Wrap

disable_global_pretty_printing()
d = File("abalone3.arff").apply().data

print('Building...')
# pipe = Pipeline(
#     OnlyApply(File("abalone3.arff")),
#     Cache(Binarize()),
#     Partition(),
#     Map(
#         Wrap(
#             MinMax(),
#             Cache(ApplyUsing(RF())),
#             OnlyApply(Metric(functions=['length'])),
#             OnlyUse(Metric(functions=['accuracy', 'error'])),
#             # AfterUse(Metric(function=['diversity']))
#         ),
#     ),