Code example #1
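Trains a flair TextClassifier on the IMDB corpus using CharLMEmbeddings (with the embedding cache disabled) pooled by a document LSTM, then reloads the saved model from disk and runs predictions, including on an empty sentence. The code uses the pre-0.4 flair API.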
import shutil

# pre-0.4 flair API
from flair.data import Sentence
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher
from flair.embeddings import CharLMEmbeddings, DocumentLSTMEmbeddings, TokenEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer


def test_train_charlm__nocache_load_use_classifier():
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB)
    label_dict = corpus.make_label_dictionary()

    charlm_embedding: TokenEmbeddings = CharLMEmbeddings('news-forward-fast', use_cache=False)
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [charlm_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, False)
    trainer.train('./results', max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    # reload the trained model from disk (outside the prediction loop)
    loaded_model = TextClassifier.load_from_file('./results/final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree('./results')
Code example #2
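The same train/save/load/predict cycle as example #1, but with GloVe word embeddings, pytest-style path fixtures instead of hard-coded paths, and the trainer's test_mode enabled.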
import shutil

# pre-0.4 flair API
from flair.data import Sentence
from flair.data_fetcher import NLPTask, NLPTaskDataFetcher
from flair.embeddings import DocumentLSTMEmbeddings, WordEmbeddings
from flair.models.text_classification_model import TextClassifier
from flair.trainers.text_classification_trainer import TextClassifierTrainer


def test_train_load_use_classifier(results_base_path, tasks_base_path):

    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.IMDB,
                                           base_path=tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    trainer = TextClassifierTrainer(model, corpus, label_dict, test_mode=True)
    trainer.train(str(results_base_path), max_epochs=2)

    sentence = Sentence("Berlin is a really nice city.")

    for s in model.predict(sentence):
        for l in s.labels:
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load_from_file(results_base_path /
                                                 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Code example #3
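A training entry point for relation classification: word embeddings are combined with two RelativeOffsetEmbeddings (one per entity), pooled by a CNN document encoder, and fed to a single-label TextClassifier. Note that dataset_loader, RelativeOffsetEmbeddings, and DocumentCNNEmbeddings are project-specific rather than stock flair.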
import logging
from typing import List

# NOTE: TaggedCorpus, WordEmbeddings, CharacterEmbeddings, TokenEmbeddings, and
# TextClassifier come from flair; dataset_loader, RelativeOffsetEmbeddings, and
# DocumentCNNEmbeddings are project-specific helpers assumed to be importable
# from the surrounding package.


def train(data_dir: str, model_dir: str, dataset_format: str = 'macss', num_filters: int = 150,
          word_embeddings: str = 'de-fasttext', offset_embedding_dim: int = 50, learning_rate: float = .1,
          batch_size: int = 32, max_epochs: int = 50, dropout: float = .5, use_char_embeddings: bool = False,
          seed: int = 0, dev_size: float = .1, test_size: float = .2):

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(message)s',
        datefmt='%d-%b-%y %H:%M:%S')

    logging.info(f'Training config: {locals()}')

    if dataset_format not in ['macss', 'semeval']:
        raise ValueError(f"Dataset format '{dataset_format}' not supported.")

    corpus: TaggedCorpus = dataset_loader[dataset_format](data_dir, dev_size, seed)
    label_dictionary = corpus.make_label_dictionary()

    logging.info(f'Corpus: {corpus}')
    corpus.print_statistics()

    logging.info(f'Size of label dictionary: {len(label_dictionary)}')
    logging.info(f'Labels: {label_dictionary.get_items()}')

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings(word_embeddings),
        RelativeOffsetEmbeddings('offset_e1', max_len=200, embedding_dim=offset_embedding_dim),
        RelativeOffsetEmbeddings('offset_e2', max_len=200, embedding_dim=offset_embedding_dim),
    ]

    if use_char_embeddings:
        embedding_types.append(CharacterEmbeddings())

    document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(embedding_types,
                                                                       num_filters=num_filters,
                                                                       dropout=dropout)

    classifier: TextClassifier = TextClassifier(document_embeddings=document_embeddings,
                                                label_dictionary=label_dictionary,
                                                multi_label=False)

    trainer: TextClassifierTrainer = TextClassifierTrainer(classifier, corpus, label_dictionary)

    trainer.train(model_dir,
                  learning_rate=learning_rate,
                  mini_batch_size=batch_size,
                  max_epochs=max_epochs)
Code example #4
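A k-fold variant of example #3: it shuffles all_data.txt, writes per-fold train/dev/test splits to disk (each fold holds out 25% of the data, split 60/40 into test and dev), and trains one classifier per fold, adding ConceptEmbeddings alongside the offset embeddings.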
import logging
import math
import os
import random
from typing import List

# NOTE: TaggedCorpus, CharacterEmbeddings, TokenEmbeddings, and TextClassifier
# come from flair; dataset_loader, RelativeOffsetEmbeddings, ConceptEmbeddings,
# and DocumentCNNEmbeddings are project-specific helpers assumed to be
# importable from the surrounding package.


def train(data_dir: str,
          model_dir: str,
          dataset_format: str = 'macss_tdt',
          num_filters: int = 150,
          word_embeddings: str = 'de-fasttext',
          offset_embedding_dim: int = 100,
          learning_rate: float = .1,
          batch_size: int = 32,
          max_epochs: int = 1,
          dropout: float = .5,
          use_char_embeddings: bool = False,
          seed: int = 0,
          dev_size: float = .1,
          test_size: float = .2,
          concept_embedding_dim: int = 100):

    all_data = open('all_data.txt', encoding='utf8').read().split("\n")
    test_dev_percent = math.floor((len(all_data) * 25) / 100)
    k_folds = math.floor(len(all_data) / test_dev_percent)
    random.shuffle(all_data)
    config_name = '1_Some_Setting_Name'

    for i in range(k_folds):
        data_path = 'resources/' + config_name + '/' + str(i + 1)
        test_dev_set = all_data[(test_dev_percent * (i + 1)) -
                                test_dev_percent:test_dev_percent * (i + 1)]
        train = all_data[0:(test_dev_percent * (i + 1)) -
                         test_dev_percent] + all_data[test_dev_percent *
                                                      (i + 1):len(all_data)]
        random.shuffle(test_dev_set)
        test_perc = math.floor((len(test_dev_set) * 60) / 100)
        test = test_dev_set[0:test_perc]
        dev = test_dev_set[test_perc:len(test_dev_set)]
        os.makedirs(data_path, exist_ok=True)
        train_txt = open(data_path + '/train.txt', 'w+')
        test_txt = open(data_path + '/test.txt', 'w+')
        dev_txt = open(data_path + '/dev.txt', 'w+')
        os.system('cp -r ./Data/vocabulary/ ' + data_path)
        train_txt.write('\n'.join(train))
        test_txt.write('\n'.join(test))
        dev_txt.write('\n'.join(dev))

        train_txt.close()
        test_txt.close()
        dev_txt.close()

        #print("Train Directory: ", data_dir, dev_size, seed, "\n")

        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s - %(message)s',
                            datefmt='%d-%b-%y %H:%M:%S')

        if dataset_format not in ['macss_tdt']:
            raise ValueError(
                f"Dataset format '{dataset_format}' not supported.")

        corpus: TaggedCorpus = dataset_loader[dataset_format](data_path,
                                                              'train.txt',
                                                              'dev.txt',
                                                              'test.txt')
        label_dictionary = corpus.make_label_dictionary()  # rel-type

        # Comment out the embeddings that you don't need
        embedding_types: List[TokenEmbeddings] = [
            # mEx Fine-Tuned Word Embeddings
            #WordEmbeddings('../../Resources/mex-ft-wiki-de-finetuned-biomedical.gensim'),

            # Default German FastText Word Embeddings
            #WordEmbeddings('../../Resources/ft-wiki-de.gensim'),

            # Relative Offset Embeddings
            RelativeOffsetEmbeddings('offset_e1',
                                     max_len=200,
                                     embedding_dim=offset_embedding_dim),
            RelativeOffsetEmbeddings('offset_e2',
                                     max_len=200,
                                     embedding_dim=offset_embedding_dim),

            # Concept Embeddings
            ConceptEmbeddings('concept_1',
                              max_len=200,
                              embedding_dim=concept_embedding_dim),
            ConceptEmbeddings('concept_2',
                              max_len=200,
                              embedding_dim=concept_embedding_dim),
        ]

        if use_char_embeddings:
            embedding_types.append(CharacterEmbeddings())

        document_embeddings: DocumentCNNEmbeddings = DocumentCNNEmbeddings(
            embedding_types, num_filters=num_filters, dropout=dropout)

        classifier: TextClassifier = TextClassifier(
            document_embeddings=document_embeddings,
            label_dictionary=label_dictionary,
            multi_label=False)

        trainer: TextClassifierTrainer = TextClassifierTrainer(
            classifier, corpus, label_dictionary)

        trainer.train(data_path,
                      learning_rate=learning_rate,
                      mini_batch_size=batch_size,
                      max_epochs=max_epochs,
                      use_tensorboard=False,
                      embeddings_in_memory=False)