Example #1
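All seven examples exercise Flair's hyperparameter selection API and omit their shared preamble. A minimal sketch of the imports and fixtures they assume (module paths follow Flair's 0.4–0.8 releases and may differ in yours; `results_base_path` and `tasks_base_path` are pytest fixtures in the original test suite, and `glove_embedding` is a module-level fixture):

import os
import shutil

from hyperopt import hp
from torch.optim import SGD

import flair.datasets
from flair.embeddings import FlairEmbeddings, StackedEmbeddings, WordEmbeddings
from flair.hyperparameter import (
    Parameter,
    SearchSpace,
    SequenceTaggerParamSelector,
    TextClassifierParamSelector,
)
from flair.hyperparameter.param_selection import OptimizationValue
from flair.training_utils import EvaluationMetric

# older Flair releases only (Examples 4 and 5):
# from flair.data_fetcher import NLPTask, NLPTaskDataFetcher

# module-level embedding shared by Examples 1 and 2
glove_embedding = WordEmbeddings("glove")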
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )

    # define search space
    search_space = SearchSpace()

    # sequence tagger parameter
    search_space.add(
        Parameter.EMBEDDINGS,
        hp.choice,
        options=[StackedEmbeddings([glove_embedding])],
    )
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameter
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])

    # training parameter
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find best parameter settings
    optimizer = SequenceTaggerParamSelector(
        corpus, "ner", results_base_path, max_epochs=2
    )
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del optimizer, search_space
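Under the hood (in the Flair versions these tests target), `optimize()` hands the search space to hyperopt's `fmin` with TPE sampling: each of the `max_evals` evaluations trains a fresh model for up to `max_epochs`, and the per-run results, along with the best configuration found, are logged to `param_selection.txt` under the given base path.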
Example #2
def test_text_classifier_param_selector(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")

    search_space = SearchSpace()

    # document embeddings parameter
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[[glove_embedding]])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.REPROJECT_WORDS, hp.choice, options=[True, False])
    search_space.add(Parameter.REPROJECT_WORD_DIMENSION, hp.choice, options=[64, 128])
    search_space.add(Parameter.BIDIRECTIONAL, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.25, high=0.75)

    # training parameter
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0, high=1)
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 16, 32])
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])

    # find best parameter settings
    param_selector = TextClassifierParamSelector(
        corpus,
        False,  # multi_label
        results_base_path,
        document_embedding_type="lstm",
        max_epochs=2,
    )
    param_selector.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del param_selector, search_space
Example #3
def select_hyperparameters(params, corpus):

    search_space = SearchSpace()

    embeddings = create_embeddings(params)
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[embeddings])
    search_space.add(Parameter.HIDDEN_SIZE,
                     hp.choice,
                     options=[32, 64, 128, 256, 512])
    # search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[256])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.DROPOUT,
                     hp.choice,
                     options=[0.3, 0.4, 0.5, 0.6, 0.7])
    # search_space.add(Parameter.LEARNING_RATE, hp.loguniform, low=np.log(0.00001), high=np.log(1.0))
    # search_space.add(Parameter.OPTIMIZER, hp.choice, options=[Parameter.NESTEROV])
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[16])

    print("Downsampling the training set to %10 of the original...")
    corpus.downsample(percentage=0.1, only_downsample_train=True)

    param_selector = SequenceTaggerParamSelector(
        corpus=corpus,
        tag_type=params['tag_type'],
        base_path=os.path.join("hyperparameter_search",
                               params['model_output_dirpath']),
        max_epochs=5,
        training_runs=3,
        evaluation_metric=EvaluationMetric.MICRO_F1_SCORE,
        optimization_value=OptimizationValue.DEV_SCORE)

    param_selector.optimize(search_space, max_evals=10)

    print(
        "Now observe %s to decide on the best hyperparameters" %
        (os.path.join("hyperparameter_search", params['model_output_dirpath'],
                      "param_selection.txt")))
Example #4
def test_text_classifier_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus("imdb", base_path=tasks_base_path)
    glove_embedding = WordEmbeddings("en-glove")

    search_space = SearchSpace()

    # document embeddings parameter
    search_space.add(Parameter.EMBEDDINGS, hp.choice, options=[[glove_embedding]])
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])
    search_space.add(Parameter.REPROJECT_WORDS, hp.choice, options=[True, False])
    search_space.add(Parameter.REPROJECT_WORD_DIMENSION, hp.choice, options=[64, 128])
    search_space.add(Parameter.BIDIRECTIONAL, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.25, high=0.75)

    # training parameter
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0, high=1)
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 16, 32])
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])

    # find best parameter settings
    param_selector = TextClassifierParamSelector(
        corpus,
        False,  # multi_label
        results_base_path,
        document_embedding_type="lstm",
        max_epochs=2,
    )
    param_selector.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example #5
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION, base_path=tasks_base_path)

    search_space = SearchSpace()

    # sequence tagger parameter
    search_space.add(
        Parameter.EMBEDDINGS,
        hp.choice,
        options=[
            StackedEmbeddings([WordEmbeddings("glove")]),
            StackedEmbeddings(
                [
                    WordEmbeddings("glove"),
                    FlairEmbeddings("news-forward"),
                    FlairEmbeddings("news-backward"),
                ]
            ),
        ],
    )
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameter
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])

    # training parameter
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find best parameter settings
    optimizer = SequenceTaggerParamSelector(corpus, "ner", results_base_path, max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
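Note: Examples 4 and 5 come from an older Flair release in which corpora were loaded through `NLPTaskDataFetcher`; that loader was later deprecated in favor of the `flair.datasets` classes used in Examples 1, 2, 6 and 7.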
Example #6
def test_text_classifier_param_selector(results_base_path, tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "imdb")
    label_type = "sentiment"

    search_space = SearchSpace()

    # document embeddings parameter
    search_space.add(Parameter.TRANSFORMER_MODEL,
                     hp.choice,
                     options=["albert-base-v1"])
    search_space.add(Parameter.LAYERS, hp.choice, options=["-1", "-2"])

    # training parameter
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0, high=1)
    search_space.add(Parameter.MINI_BATCH_SIZE,
                     hp.choice,
                     options=[4, 8, 16, 32])
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])

    # find best parameter settings
    param_selector = TextClassifierParamSelector(
        corpus,
        label_type,
        False,  # multi_label
        results_base_path,
        max_epochs=2,
    )
    param_selector.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
    del param_selector, search_space
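Note: Example 6 targets a newer Flair API than Examples 2 and 4: `TextClassifierParamSelector` now takes the corpus's `label_type` before the `multi_label` flag, and the search space selects a transformer backbone via `Parameter.TRANSFORMER_MODEL` and `Parameter.LAYERS` rather than tuning RNN document-embedding parameters.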
Example #7
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = flair.datasets.ColumnCorpus(
        data_folder=tasks_base_path / "fashion", column_format={0: "text", 2: "ner"}
    )

    search_space = SearchSpace()

    # sequence tagger parameter
    search_space.add(
        Parameter.EMBEDDINGS,
        hp.choice,
        options=[
            StackedEmbeddings([WordEmbeddings("glove")]),
            StackedEmbeddings(
                [
                    WordEmbeddings("glove"),
                    FlairEmbeddings("news-forward-fast"),
                    FlairEmbeddings("news-backward-fast"),
                ]
            ),
        ],
    )
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE, hp.choice, options=[64, 128])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameter
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD])

    # training parameter
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find best parameter settings
    optimizer = SequenceTaggerParamSelector(corpus, "ner", results_base_path, max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)