def printing_test(arq="iris.arff"):
    print(Chain(Map(select(File(arq)))))
    exp = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Map(Report("<---------------------- fold"), enhance=False),
        Summ(function="mean", enhance=False),
        Reduce(),
        Report("mean ... S: $S", enhance=False),
    )
    print(exp)

    print(select(DT(), SVMC()))
    sel = select(DT(), SVMC())
    print(sel)
    print(Map(DT()))

    exp = ChainCS(
        File(arq),
        Partition(),
        Map(PCA(), select(SVMC(), DT(criterion="gini")), Metric(enhance=False)),
        Report("test"),
        Map(Report("<---------------------- fold")),
    )
    print(exp)
def test_cache(arq="iris.arff"):
    pipe = Workflow(
        Cache(File(arq), storage_alias="default_sqlite"),
        Report("{history}"),
    )
    train, test = pipe.dual_transform()
    print("Train..............\n", train.history ^ "name")
    print("Test..........\n", test.history ^ "name")
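# Hedged note on the operator above: `history ^ "name"` is used across these
# tests as a projection of a Data's history onto one attribute of each step;
# "longname" (seen in other tests in this file) reads as the verbose variant.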
def ger_workflow(seed=0, arq="iris.arff"):
    np.random.seed(seed)
    workflow = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), select(SVMC(), DT(criterion="gini")), Metric(enhance=False)),
        Summ(function="mean", enhance=False),
        Reduce(),
        Report("Mean S: $S", enhance=False),
        seed=seed,
    )
    return workflow
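# Hedged usage sketch for ger_workflow, reusing only names from this file;
# the seed is forwarded to Workflow, so equal seeds should rebuild the same
# sampled pipeline:
# workflow_a = ger_workflow(seed=0)
# workflow_b = ger_workflow(seed=1)
# train, test = workflow_a.dual_transform()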
def test_with_summ_reduce(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric()),
        Map(Report("<---------------------- stage")),
        Summ(),
        Reduce(),
        Report("mean ... S: $S"),
    )
    train, test = pipe.dual_transform()
    print("Train..............\n", train.history ^ "longname")
    print("Test..........\n", test.history ^ "longname")
def test_check_architecture2(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Summ(field="Y", function="mean", enhance=False),
        Report("mean ... S: $S", enhance=False),
    )  # File is at the front of this pipeline.
    train_ = pipe.enhancer.transform(sd.NoData)
    test_ = pipe.model(sd.NoData).transform(sd.NoData)
    test_ = pipe.model(sd.NoData).transform((sd.NoData, sd.NoData))
    train_, test_ = pipe.dual_transform(sd.NoData, sd.NoData)
    train_, test_ = pipe.dual_transform(sd.NoData, (sd.NoData, sd.NoData))
def test_split_train_test(arq="iris.arff"):
    pipe = Cache(
        File(arq),
        TsSplit(),  # TsSplit should come before TrSplit to ensure the same original data is used as input for both.
        TrSplit(),
        PCA(),
        SVMC(),
        Metric(enhance=False),
        Report("metric ... R: $R", enhance=False),
        storage_alias="oka",
    )
    train, test = pipe.dual_transform()
    print("Train..............\n", train)
    print("Test..........\n", test)
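# Design note (hedged): putting TsSplit first means TrSplit still receives the
# untouched original data, so the train split is not drawn from a set that has
# already had the test portion removed.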
def test_partition(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Summ(function="mean", enhance=False),
        Reduce(),
        Report("mean ... S: $S", enhance=False),
        Report("$X"),
        Report("$y"),
    )
    train, test = pipe.dual_transform()
    print("Train..............\n", train)
    print("Test..........\n", test)
def random_search(arq="iris.arff"):
    np.random.seed(0)
    exp = Workflow(
        File(arq),
        Partition(),
        Map(PCA(), select(SVMC(), DT(criterion="gini")), Metric()),
        # Map(Report("<---------------------- fold"), enhance=False),
        Summ(function="mean"),
        Reduce(),
        Report("Mean S: $S"),
    )
    expr = sample(exp, n=10)
    result = optimize(expr, n=5)
    result.disable_pretty_printing()
    print(result)
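# Hedged reading of random_search above: sample(exp, n=10) draws ten concrete
# pipelines from the workflow's config space and optimize(expr, n=5) evaluates
# a budget of five of them, returning the best-scoring result object.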
def test_check_architecture(arq="iris.arff"):
    pipe = Workflow(
        File(arq),
        Partition(partitions=2),
        Map(PCA(), SVMC(), Metric(enhance=False)),
        Summ(field="Y", function="mean", enhance=False),
    )  # File is at the front of this pipeline.
    train_01 = pipe.enhancer.transform(sd.NoData)
    test_01 = pipe.model(sd.NoData).transform(sd.NoData)
    train_02, test_02 = pipe.dual_transform(sd.NoData, sd.NoData)

    # Collection uuid depends on data, which depends on consumption.
    for t, *_ in train_01, train_02, test_01, test_02:
        # print(111111111, t.y)
        pass

    assert train_01.uuid == train_02.uuid
    assert test_01.uuid == test_02.uuid
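# Hedged sketch of the invariant asserted above, reusing only calls from this
# file: after the streams are consumed, the single-path and dual-path runs
# should agree on uuids.
# def uuids_agree(pipe):
#     a = pipe.enhancer.transform(sd.NoData)
#     b, _ = pipe.dual_transform(sd.NoData, sd.NoData)
#     return a.uuid == b.uuid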
def test_sequence_of_classifiers(arq="abalone.arff"):
    pipe = Workflow(
        File(arq),
        Binarize(),
        Report('1 {X.shape} {history^name}'),
        PCA(n=5),
        SVMC(),
        Metric(),
        Report('2 {X.shape} {history^name}'),
        DT(),
        Metric(),
        Report('3 {X.shape} {history^name}'),
    )
    print('Enh')
    train = pipe.enhancer.transform(sd.NoData)
    print('Mod')
    test = pipe.model(sd.NoData).transform(sd.NoData)  # TODO: why doesn't the report show up in test?
    print()
    print("[test_sequence_of_classifiers] Train.........\n", train.history ^ "longname")
    print("[test_sequence_of_classifiers] Test..........\n", test.history ^ "longname")
# exit()
print('Storing iris...')
try:
    PickleServer().store(read_arff('iris.arff'))
    print('ok!')
except DuplicateEntryException:
    print('Duplicate! Ignored.')

numpy.random.seed(50)
# import sklearn
# print('The scikit-learn version is {}.'.format(sklearn.__version__))

print('expr .................')
expr = Pipeline(
    OnlyApply(File('iris.arff')),
    Cache(
        evaluator(
            Wrap(
                shuffle(Std, MinMax),
                # shuffle(Std, select(UnderS, OverS), MinMax),
                ApplyUsing(select(DT, NB)),
            ),
            Metric(functions=['accuracy'])
        )
    )
)
# {history.last.config['function']}
from cururu.sql.mysql import MySQL
from pjdata.content.specialdata import NoData, UUIDData

# Partitioning ##############################################
from pjml.tool.data.flow.file import File
from pjml.tool.data.modeling.supervised.classifier.dt import DT
from pjml.tool.stream.expand.partition import Partition

data = (File("iris.arff") * Partition("cv", 10)).enhancer.transform(NoData)
print(list(map(lambda d: d.X[0], data.stream)))
# To get all partitions: list(data.stream)

import _pickle as p

storage = MySQL(db="paje:[email protected]/paje")
data = storage.fetch(UUIDData("ĹЇЖȡfĭϹƗͶэգ8Ƀű"))
data = DT().model(data).transform(data)
with open("/tmp/lixo", "wb") as f:
    p.dump(data.history, f)
exit()

# ML 1 ========================================================================
# Store the dataset without depending on the pjml package.
# from cururu.pickleserver import PickleServer
#
# try:
#     PickleServer().store(read_arff('iris.arff'))
# except DuplicateEntryException:
#     pass
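# Hedged follow-up to the partitioning example above (before the exit()):
# data.stream is consumed as a one-shot iterator, so materialize it once if
# every partition is needed.
# partitions = list(data.stream)
# print(len(partitions), "partitions")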
from pjml.tool.data.modeling.supervised.classifier.svmc import SVMC
from pjml.tool.data.processing.feature.binarize import Binarize
from pjml.tool.data.processing.instance.sampler.over.random import OverS
from pjml.tool.meta.mfe import MFE

# ML 1 ========================================================================
# Store the dataset without depending on the pjml package.
# from cururu.pickleserver import PickleServer
#
# try:
#     PickleServer().store(read_arff('iris.arff'))
# except DuplicateEntryException:
#     pass

pipe = Pipeline(
    Cache(File('bank.arff'), Binarize(), NB(), Metric(), Report('$X'))
)

print('Applying...')
m = pipe.apply()
print(m.data)

print('Using...')
d = m.use()
print(d)
exit()

# Source('messedup-dataset'),
# Keep(evaluator(
#     Cache(
#         ApplyUsing(
#             NB()
#         ),
#         Metric(function='accuracy')
        engine='mysql',
        db='paje:[email protected]/paje',
        blocking=False,
    ),
    engine='dump',
    blocking=True,
)

cache = partial(Cache, engine='sqlite', blocking=False)
# cache = partial(Cache, engine='amnesia', blocking=True)

# expr = Pipeline(File(arq), cache(ApplyUsing(NB())))
# p = expr
# p.apply()

expr = Pipeline(
    OnlyApply(File(arq), cache(Binarize())),
    cache(
        Partition(),
        Map(
            Wrap(
                select(SelectBest),  # slow??
                cache(ApplyUsing(select(DT, NB, hold(RF, n_estimators=40)))),
                OnlyApply(Metric(functions=['length'])),
                OnlyUse(Metric(functions=['accuracy', 'error'])),
                # AfterUse(Metric(function=['diversity']))
            ),
        ),
        # Report('HISTORY ... S: {history}'),
        Summ(function='mean_std'),
    ),
    Report('mean and std ... S: $S'),
def test_pca(arq="iris.arff"):
    cs = File(arq).cs
    pipe = Workflow(File(arq), Split(), PCA(), SVMC(), Metric())
    train, test = pipe.dual_transform()
    print("Train..............\n", train.history ^ "name")
    print("Test..........\n", test.history ^ "name")
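# Hedged aside: `cs` above is the component's config space and goes unused in
# test_pca; commented code elsewhere in this repo suggests it can be sampled
# with cs.sample().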
def test_metric(arq="iris.arff"):
    pipe = Workflow(File(arq), Split(), SVMC(), Metric(enhance=False))
    train, test = pipe.dual_transform()
    print("Train..............\n", train)
    print("Test..........\n", test)
def test_split(arq="iris.arff"):
    pipe = Workflow(File(arq), Split(), SVMC())
    train, test = pipe.dual_transform()
    print("Train..............\n", str(train))
    print("Test..........\n", str(test))
def test_svmc(arq="iris.arff"):
    cs = File(arq).cs
    pipe = Workflow(File(arq), SVMC())
    train, test = pipe.dual_transform()
    print("Train..............\n", train)
    print("Test..........\n", test)
#
# s = cs.sample()
# print(s)
# exit()

cache = partial(Cache, storage_alias='default_sqlite')
# cache = partial(Cache, storage_alias='mysql')
# cache = partial(Cache, storage_alias='default_dump')
# cache = partial(Cache, storage_alias='amnesia')

# expr = Pipeline(File(arq), cache(ApplyUsing(NB())))
# p = expr
# p.apply()

expr = Workflow(
    OnlyApply(File(arq), cache(Binarize())),
    cache(
        Partition(),
        Map(
            Wrap(
                select(SelectBest),  # slow??
                cache(ApplyUsing(select(DT, NB, hold(RF, n_estimators=40)))),
                OnlyApply(Metric(functions=['length'])),
                OnlyUse(Metric(functions=['accuracy', 'error'])),
                # AfterUse(Metric(function=['diversity']))
            ),
        ),
        # Report('HISTORY ... S: {history}'),
        Summ(function='mean_std'),
    ),
    Report('mean and std ... S: $S'),
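# Note on the partial above (hedged only in the Cache signature, which matches
# its uses in this file): with functools.partial, cache(Binarize()) expands to
# Cache(Binarize(), storage_alias='default_sqlite'), so switching the alias in
# one place re-targets every cached sub-pipeline in the expression.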
    UnderS

from pjml.tool.meta.wrap import Wrap

# print(SelectKB.cs)
# exit()
#
# cs = Pipeline(SelectKB)
# print(cs)
# exit()
#
# s = cs.sample()
# print(s)
# exit()

expr = Workflow(
    OnlyApply(File("abalone3.arff"), Binarize()),
    Partition(),
    Map(
        Wrap(
            select(SelectBest),
            ApplyUsing(select(DT, RF, NB)),
            OnlyApply(Metric(functions=['length'])),
            OnlyUse(Metric(functions=['accuracy', 'error'])),
            # AfterUse(Metric(function=['diversity']))
        ),
    ),
    Report('HISTORY ... S: {history}'),
    Summ(function='mean_std'),
    Report('mean and std ... S: $S'),
    OnlyApply(Copy(from_field="S", to_field="B")),
    OnlyApply(Report('copy S to B ... B: $B')),
    OnlyUse(
from pjml.tool.data.evaluation.mconcat import MConcat
from pjml.tool.data.evaluation.metric import Metric
from pjml.tool.data.flow.applyusing import ApplyUsing
from pjml.tool.data.flow.file import File
from pjml.tool.data.flow.onlyoperation import OnlyApply, OnlyUse
from pjml.tool.data.flow.sink import Sink
from pjml.tool.data.manipulation.copy import Copy
from pjml.tool.data.modeling.supervised.classifier.rf import RF
from pjml.tool.data.processing.feature.binarize import Binarize
from pjml.tool.data.processing.feature.scaler.minmax import MinMax
from pjml.tool.data.processing.instance.sampler.over.random import OverS
from pjml.tool.data.processing.instance.sampler.under.random import UnderS
from pjml.tool.meta.wrap import Wrap

disable_global_pretty_printing()

d = File("abalone3.arff").apply().data
print('Building...')

# pipe = Pipeline(
#     OnlyApply(File("abalone3.arff")),
#     Cache(Binarize()),
#     Partition(),
#     Map(
#         Wrap(
#             MinMax(),
#             Cache(ApplyUsing(RF())),
#             OnlyApply(Metric(functions=['length'])),
#             OnlyUse(Metric(functions=['accuracy', 'error'])),
#             # AfterUse(Metric(function=['diversity']))
#         ),
#     ),