Ejemplo n.º 1
0
def run_automl(X, y, name, input=None, output=CategoricalVector()):
    """Run an ``AutoML`` search on ``(X, y)`` while reporting to three loggers.

    The search is reported to a Telegram logger, a console logger and a
    progress logger. Nothing is returned; the ``AutoML`` instance is local
    to this function.

    Args:
        X: Inputs, passed unchanged to ``AutoML.fit``.
        y: Targets, passed unchanged to ``AutoML.fit``.
        name: Name handed to the Telegram logger.
        input: Forwarded as ``AutoML(input=...)``.
        output: Forwarded as ``AutoML(output=...)``. The default instance is
            created once, when this function is defined.

    Raises:
        KeyError: If the ``TOKEN`` environment variable is not set.
    """
    # Loggers are instantiated in the order Telegram, console, progress.
    loggers = [
        TelegramLogger(
            token=os.environ["TOKEN"],
            channel="@autogoal_board",
            name=name,
        ),
        ConsoleLogger(),
        ProgressLogger(),
    ]

    # 2 * 60 * 60 is presumably two hours expressed in seconds -- TODO confirm
    # the unit expected by the search strategy.
    search_options = {"search_timeout": 2 * 60 * 60, "pop_size": 50}

    engine = AutoML(
        search_iterations=1000,
        metalearning_log=True,
        search_kwargs=search_options,
        errors="ignore",
        input=input,
        output=output,
        cross_validation_steps=1,
    )
    engine.fit(X, y, logger=loggers)
Ejemplo n.º 2
0
class CustomLogger(Logger):
    """Logger that appends search events to text files in the working directory."""

    def error(self, e: Exception, solution):
        """Append the failing solution and its exception to ``haha_errors.log``.

        Nothing is written unless both ``e`` and ``solution`` are truthy.
        """
        if not e or not solution:
            return

        with open("haha_errors.log", "a") as error_log:
            error_log.write(f"solution={repr(solution)}\nerror={repr(e)}\n\n")

    def update_best(self, new_best, new_fn, *args):
        """Append the new best solution and its fitness to ``haha.log``.

        Any extra positional arguments are accepted and ignored.
        """
        with open("haha.log", "a") as best_log:
            best_log.write(f"solution={repr(new_best)}\nfitness={new_fn}\n\n")


# Basic logging configuration: a progress logger, a console logger and an
# in-memory logger. The in-memory logger is also bound to `logger` on its own.

logger = MemoryLogger()
loggers = [ProgressLogger(), ConsoleLogger(), logger]

# Telegram reporting is optional. It is enabled only when a token was supplied,
# and the import lives inside the branch so it only runs in that case.
if args.token:
    from autogoal.contrib.telegram import TelegramLogger

    telegram = TelegramLogger(
        token=args.token,
        name="HAHA",  # plain literal: there was nothing to interpolate
        channel=args.channel,
    )
    loggers.append(telegram)

# Finally, loading the HAHA dataset, running the `AutoML` instance,
# and printing the results.

# NOTE(review): another snippet in this file unpacks `haha.load(...)` as
# (Xtrain, Xtest, ytrain, ytest); confirm the actual return order.
X_train, y_train, X_test, y_test = haha.load(max_examples=args.examples)
from autogoal.contrib.keras import KerasSequenceClassifier
from autogoal.contrib.torch import BertTokenizeEmbedding
from autogoal.datasets import haha
from autogoal.kb import CategoricalVector, List, Sentence, Tuple
from autogoal.ml import AutoML
from autogoal.search import ConsoleLogger, ProgressLogger

# AutoML instance taking a list of sentences as input and producing a
# categorical vector, with the two algorithm classes below passed as `registry`.
classifier = AutoML(
    input=List(Sentence()),
    output=CategoricalVector(),
    registry=[KerasSequenceClassifier, BertTokenizeEmbedding],
    # Alternative limits kept for reference:
    # search_kwargs=dict(memory_limit=4 * 1024 ** 3, evaluation_timeout=60),
    search_kwargs={"memory_limit": 0, "evaluation_timeout": 0},
)

# Load at most 10 examples.
# NOTE(review): another snippet in this file unpacks `haha.load(...)` as
# (X_train, y_train, X_test, y_test); confirm the actual return order.
Xtrain, Xtest, ytrain, ytest = haha.load(max_examples=10)

# Manual, step-by-step alternative (disabled):
# embedding = BertEmbedding()
# tokens = embedding.run(Xtrain)

# classifier = KerasSequenceClassifier().sample()
# classifier.run((tokens, ytrain))

classifier.fit(
    Xtrain,
    ytrain,
    logger=[ConsoleLogger(), ProgressLogger()],
)
import numpy as np

from autogoal.ml import AutoML
from autogoal.contrib.keras import KerasImageClassifier, KerasImagePreprocessor
from autogoal.datasets import cifar10
from autogoal.kb import CategoricalVector, Tensor4
from autogoal.search import ConsoleLogger, ProgressLogger

# AutoML instance over 4-dimensional tensor inputs with a categorical output.
automl = AutoML(
    input=Tensor4(),
    output=CategoricalVector(),
    registry=[KerasImageClassifier],
    # registry=[KerasImageClassifier, KerasImagePreprocessor],
    cross_validation_steps=1,
    search_kwargs={
        "pop_size": 20,
        # presumably 24 hours expressed in seconds -- TODO confirm the unit
        "search_timeout": 24 * 60 * 60,
        "evaluation_timeout": 0,
        "memory_limit": 0,
        "save": False,
    },
    search_iterations=1000,
    validation_split=1 / 6,
)

# The train and test partitions returned by `cifar10.load()` are merged back
# into a single dataset before fitting; `validation_split=1 / 6` is passed to
# `AutoML` above.
Xtrain, ytrain, Xtest, ytest = cifar10.load()
X = np.vstack((Xtrain, Xtest))
y = np.hstack((ytrain, ytest))

automl.fit(
    X,
    y,
    logger=[ConsoleLogger(), ProgressLogger()],
)
        print(solution)
    except ValueError:
        continue

# To evaluate how good a formula is, we simply feed the expression instance
# with a sequence of numbers from 1 to 9. If the expression requires more
# than 9 digits, it results in an error. The actual computation of the
# corresponding operations is performed in the `__call__` method of the expression classes.

def evaluate(expr):
    """Call ``expr`` on a stream of the integers 1 through 9 and return its result.

    Args:
        expr: A callable that receives the generator and pulls values from it.

    Returns:
        Whatever ``expr`` returns.

    Raises:
        ValueError: If ``expr`` requests a tenth value from the stream.
    """

    def digits():
        yield from range(1, 10)

        # Reached only when a value beyond 9 is requested.
        raise ValueError("Too many values asked")

    return expr(digits())

# Both search strategies are run with the same argument (1000) so their
# long-term performance can be compared. Each run gets its own logger instances.

search_rand = RandomSearch(grammar, evaluate, errors="ignore")
best_rand, best_fn_rand = search_rand.run(
    1000,
    logger=[ConsoleLogger(), ProgressLogger()],
)

search_pe = PESearch(grammar, evaluate, pop_size=10, errors="ignore")
best_pe, best_fn_pe = search_pe.run(
    1000,
    logger=[ConsoleLogger(), ProgressLogger()],
)

# Print the best solution and its fitness for each strategy.

print(best_rand, best_fn_rand)
print(best_pe, best_fn_pe)
from autogoal.contrib.keras import KerasClassifier
from autogoal.datasets import cars
from autogoal.kb import CategoricalVector, MatrixContinuousDense
from autogoal.ml import AutoML
from autogoal.search import ConsoleLogger, ProgressLogger

# AutoML instance over dense continuous matrices, with `KerasClassifier` as
# the only entry in `registry`. No `output` is passed here.
classifier = AutoML(
    input=MatrixContinuousDense(),
    registry=[KerasClassifier],
    search_kwargs={"memory_limit": 0, "evaluation_timeout": 0},
)

# `cars.load()` is unpacked into a single (X, y) pair.
X, y = cars.load()

classifier.fit(
    X,
    y,
    logger=[ConsoleLogger(), ProgressLogger()],
)
Ejemplo n.º 7
0
# with a sequence of numbers from 1 to 9. If the expression requires more
# than 9 digits, it results in an error. The actual computation of the
# corresponding operations is performed in the `__call__` method of the expression classes.


def evaluate(expr):
    """Feed ``expr`` a generator of the integers 1 to 9 and return its result.

    The generator raises ``ValueError`` when asked for a tenth value, and
    that exception propagates out of this function.
    """

    def number_source():
        yield from range(1, 10)

        # Only reached once all nine numbers have been consumed.
        raise ValueError("Too many values asked")

    numbers = number_source()
    return expr(numbers)


# Run each search strategy with the same argument (1000) to compare their
# long-term performance. A fresh pair of loggers is created for every run.

search_rand = RandomSearch(grammar, evaluate, errors="ignore")
best_rand, best_fn_rand = search_rand.run(
    1000,
    logger=[ConsoleLogger(), ProgressLogger()],
)

search_pe = PESearch(grammar, evaluate, pop_size=10, errors="ignore")
best_pe, best_fn_pe = search_pe.run(
    1000,
    logger=[ConsoleLogger(), ProgressLogger()],
)

# Show the best solution and fitness found by each strategy.

print(best_rand, best_fn_rand)
print(best_pe, best_fn_pe)
    try:
        with open(fR"{path}/binary_X",
                  'rb') as xfd, open(fR"{path}/binary_Y", 'rb') as yfd:
            X = pickle.load(xfd)
            y = pickle.load(yfd)
            return X, y
    except Exception as e:
        #TODO: implement corpus reading from directories
        print(e)
        pass


if __name__ == "__main__":
    g = generate_cfg(SklearnNLPClassifier)
    X, y = load_movie_reviews(100)

    # Alternative data source, currently disabled:
    # X, y = load_corpus("examples/Revolico")

    def fitness(pipeline):
        """Fit ``pipeline`` on ``(X, y)`` and return its score on that same data."""
        pipeline.fit(X, y)
        return pipeline.score(X, y)

    search = RandomSearch(
        g,
        fitness,
        random_state=0,
        errors="warn",
        evaluation_timeout=100,
    )
    result = search.run(50, logger=ProgressLogger())
# care of train/test splitting, fitting a pipeline in the training set and computing
# the accuracy on the test set.

fitness_fn = movie_reviews.make_fn(examples=100)

# ### Random search
#
# The `RandomSearch` strategy simply calls `grammar.sample()` a bunch of times
# and stores the best-performing pipeline. It has no intelligence whatsoever,
# but it serves as a good baseline implementation.
#
# We will run it for a total of `1000` fitness evaluations, or equivalently, a total
# of `1000` different random pipelines. To see what's actually going on we will use
# the wonderful `enlighten` library through our implementation `EnlightenLogger`.
#
# NOTE(review): the code below instantiates `ProgressLogger`, not
# `EnlightenLogger` -- confirm which name the text should use.

logger = ProgressLogger()

random_search = RandomSearch(grammar, fitness_fn, random_state=0)
best_rand, fn_rand = random_search.run(1000, logger=logger)

# !!! note
#     For reproducibility purposes we can pass a fixed random seed in `random_state`.
#
# ### Evolutionary Search
#
# Random search is fun, but to search with purpose, we need a more intelligent sampling
# strategy. The `PESearch` (short for Probabilistic Evolutionary Search, phew) does just that.
# It starts with a random sampling strategy, but as it evaluates more pipelines, it modifies
# a probabilistic sampling model so that pipelines similar to the best ones found are more
# commonly sampled.
#
Ejemplo n.º 10
0
from autogoal.contrib.sklearn import SklearnClassifier
from autogoal.grammar import generate_cfg
from autogoal.search import RandomSearch, ProgressLogger

from sklearn.datasets import make_classification

# Generate `g` from `SklearnClassifier` via `generate_cfg`, and create a
# dataset by calling scikit-learn's `make_classification()` with its defaults.
g = generate_cfg(SklearnClassifier)
X, y = make_classification()

# Print `g` for inspection.
print(g)


def fitness(pipeline):
    """Fit ``pipeline`` on the module-level ``(X, y)`` and return its score.

    The score is computed on the same data the pipeline was fitted on.
    """
    pipeline.fit(X, y)
    training_score = pipeline.score(X, y)
    return training_score


# Random search over `g`, scored by `fitness`, with a fixed `random_state`.
# `run` is called with 1000 and a progress logger; its return value is discarded.
search = RandomSearch(g, fitness, random_state=0, errors='warn')
search.run(1000, logger=ProgressLogger())
Ejemplo n.º 11
0
# care of train/test splitting, fitting a pipeline in the training set and computing
# the accuracy on the test set.

fitness_fn = movie_reviews.make_fn(examples=100)

# ### Random search
#
# The `RandomSearch` strategy simply calls `grammar.sample()` a bunch of times
# and stores the best-performing pipeline. It has no intelligence whatsoever,
# but it serves as a good baseline implementation.
#
# We will run it for a total of `1000` fitness evaluations, or equivalently, a total
# of `1000` different random pipelines. To see what's actually going on we will use
# the wonderful `enlighten` library through our implementation `EnlightenLogger`.
#
# NOTE(review): the code below instantiates `ProgressLogger`, not
# `EnlightenLogger` -- confirm which name the text should use.

logger = ProgressLogger(log_solutions=True)

random_search = RandomSearch(grammar, fitness_fn, random_state=0)
best_rand, fn_rand = random_search.run(1000, logger=logger)

# !!! note
#     For reproducibility purposes we can pass a fixed random seed in `random_state`.
#
# ### Evolutionary Search
#
# Random search is fun, but to search with purpose, we need a more intelligent sampling
# strategy. The `PESearch` (short for Probabilistic Evolutionary Search, phew) does just that.
# It starts with a random sampling strategy, but as it evaluates more pipelines, it modifies
# a probabilistic sampling model so that pipelines similar to the best ones found are more
# commonly sampled.
#