Example #1
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label('label', 'class_1')

    dev_sentence = Sentence("The sun is shining.", use_tokenizer=True).add_label('label', 'class_2')

    test_sentence = Sentence("Berlin is sunny.", use_tokenizer=True)
    test_sentence.add_label('label', 'class_1')
    test_sentence.add_label('label', 'class_2')

    class_to_count_dict = Corpus._count_sentence_labels(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 2 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence]
    )

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
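
# Added sketch (not part of the example above; assumes a recent Flair where
# FlairDatapointDataset and Corpus.obtain_statistics(label_type) exist): the
# private helpers used above have a public counterpart.
from flair.data import Corpus, Sentence
from flair.datasets import FlairDatapointDataset

sentences = [
    Sentence("I love Berlin.").add_label("label", "class_1"),
    Sentence("The sun is shining.").add_label("label", "class_2"),
]
split = FlairDatapointDataset(sentences)
stats_corpus = Corpus(train=split, dev=split, test=split)
print(stats_corpus.obtain_statistics("label"))  # JSON string with label and token counts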
Example #2
    def _train(self, corpus: Corpus, params: dict, base_path: Path,
               max_epochs: int, optimization_value: str):
        label_dict = corpus.make_label_dictionary()
        for sent in corpus.get_all_sentences():
            sent.clear_embeddings()
        model = self._set_up_model(params, label_dict)
        training_parameters = {
            key: value
            for key, value in params.items() if key in TRAINING_PARAMETERS
        }
        model_trainer_parameters = {
            key: value
            for key, value in params.items()
            if key in MODEL_TRAINER_PARAMETERS and key != 'model'
        }
        trainer: ModelTrainer = ModelTrainer(model, corpus,
                                             **model_trainer_parameters)
        results = trainer.train(base_path,
                                max_epochs=max_epochs,
                                param_selection_mode=True,
                                **training_parameters)

        if optimization_value == "score":
            result = results['test_score']
        else:
            result = results['dev_loss_history'][-1]

        return {'result': result, 'params': params}
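
    # Illustrative sketch (hypothetical caller, not from the original source):
    # _train expects a flat params dict whose keys overlap TRAINING_PARAMETERS
    # (forwarded to trainer.train) plus model parameters consumed by
    # _set_up_model, e.g.:
    #
    #     params = {"learning_rate": 0.1, "mini_batch_size": 32, "hidden_size": 128}
    #     outcome = selector._train(corpus, params, Path("hyperopt"),
    #                               max_epochs=5, optimization_value="score")
    #     print(outcome["result"])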
Example #3
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence("I love Berlin.",
                              labels=["class_1"],
                              use_tokenizer=segtok_tokenizer)
    dev_sentence = Sentence("The sun is shining.",
                            labels=["class_2"],
                            use_tokenizer=segtok_tokenizer)
    test_sentence = Sentence(
        "Berlin is sunny.",
        labels=["class_1", "class_2"],
        use_tokenizer=segtok_tokenizer,
    )

    class_to_count_dict = Corpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence])

    assert "class_1" in class_to_count_dict
    assert "class_2" in class_to_count_dict
    assert 2 == class_to_count_dict["class_1"]
    assert 2 == class_to_count_dict["class_2"]

    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence])

    assert 3 == len(tokens_in_sentences)
    assert 4 == tokens_in_sentences[0]
    assert 5 == tokens_in_sentences[1]
    assert 4 == tokens_in_sentences[2]
Example #4
def score_flair_tagger(
        splits,
        data: Union[List[Sentence], Dataset],
):
    from flair.trainers import ModelTrainer, trainer
    logger = trainer.log
    logger.setLevel(logging.WARNING)

    data_splits = {split_name: [data[i] for i in split]
                   for split_name, split in splits.items()}

    train_sentences, dev_sentences, test_sentences = (
        data_splits['train'], data_splits['dev'], data_splits['test'])

    corpus = Corpus(train=train_sentences,
                    dev=dev_sentences,
                    test=test_sentences,
                    name='scierc')
    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)
    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)
    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.RMSprop)
    # print(tagger)
    # pprint([p_name for p_name, p in tagger.named_parameters()])
    # use the process name, not the Process object's repr, in the path
    save_path = 'flair_sequence_tagging/scierc-ner-%s' % multiprocessing.current_process().name
    trainer.train(save_path, EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=19,
                  patience=3,
                  save_final_model=False
                  )
    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    def flair_tagger_predict_bio(sentences: List[Sentence]):
        train_data = [[(token.text, token.tags[tagger.tag_type].value) for token in datum] for datum in sentences]
        targets = [bilou2bio([tag for token, tag in datum]) for datum in train_data]

        pred_sentences = tagger.predict(sentences)
        pred_data = [bilou2bio([token.tags[tagger.tag_type].value for token in datum]) for datum in pred_sentences]

        return pred_data, targets

    return {
        'train': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.train),
        'test': calc_seqtag_f1_scores(flair_tagger_predict_bio, corpus.test)
    }
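
# Hypothetical usage sketch (the names below are assumptions, not from the
# source): `data` is a list of tagged Sentences and `splits` maps split names
# to index lists over it.
#
#     import numpy as np
#     idx = np.random.permutation(len(data))
#     n_train, n_dev = int(0.8 * len(idx)), int(0.1 * len(idx))
#     splits = {"train": idx[:n_train],
#               "dev": idx[n_train:n_train + n_dev],
#               "test": idx[n_train + n_dev:]}
#     scores = score_flair_tagger(splits, data)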
Example #5
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.", use_tokenizer=True)
    dev_sentence = Sentence("I'm a dev sentence.", use_tokenizer=True)
    test_sentence = Sentence('I will be only used for testing.',
                             use_tokenizer=True)
    corpus = Corpus([train_sentence], [dev_sentence], [test_sentence])
    all_sentences = corpus.get_all_sentences()
    assert (3 == len(all_sentences))
Example #6
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence('sentence 1', labels=['class_1'])
    sentence_2 = Sentence('sentence 2', labels=['class_2'])
    sentence_3 = Sentence('sentence 3', labels=['class_1'])
    corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])
    label_dict = corpus.make_label_dictionary()
    assert (2 == len(label_dict))
    assert ('<unk>' not in label_dict.get_items())
    assert ('class_1' in label_dict.get_items())
    assert ('class_2' in label_dict.get_items())
Example #7
def test_tagged_corpus_downsample():
    sentence = Sentence('I love Berlin.',
                        labels=[Label('class_1')],
                        use_tokenizer=True)
    corpus = Corpus([
        sentence, sentence, sentence, sentence, sentence, sentence, sentence,
        sentence, sentence, sentence
    ], [], [])
    assert (10 == len(corpus.train))
    corpus.downsample(percentage=0.3, only_downsample_train=True)
    assert (3 == len(corpus.train))
Example #8
def run_zero_shot(train_tweets, train_y, val_tweets, val_y):
    """
    Performs the training of the zero shot learning model

    @param train_tweets: the tweets that will be used for training
    @param train_y: the training labels
    @param val_tweets: the tweets that will be used for validation
    @param val_y: the validation labels
    @return: None
    """
    # 1. Load our pre-trained TARS model for English
    print("Zero shot")
    # download https://nlp.informatik.hu-berlin.de/resources/models/tars-base/tars-base.pt
    tars = TARSClassifier.load(
        os.path.join(os.path.dirname(__file__), "..", "..", "saved_models",
                     "tars-base.pt"))

    train_tweets["output"] = train_y.iloc[:]
    train = train_tweets.apply(create_sentences, axis=1).tolist()
    train = SentenceDataset(train)

    val_tweets["output"] = val_y.iloc[:]
    val = val_tweets.apply(create_sentences, axis=1).tolist()
    val = SentenceDataset(val)

    corpus = Corpus(train=train, test=val)

    tars.add_and_switch_to_new_task(
        "POSITIVE_NEGATIVE", label_dictionary=corpus.make_label_dictionary())

    trainer = ModelTrainer(tars, corpus)

    # 4. train model
    trainer.train(
        base_path='../../data/zero_shot',  # path to store the model artifacts
        learning_rate=0.02,  # use very small learning rate
        mini_batch_size=16,  # small mini-batch size since corpus is tiny
        max_epochs=10,  # terminate after 10 epochs
    )

    print("DONE TRAINING")
    # the trainer above stored its artifacts under '../../data/zero_shot'
    tars = TARSClassifier.load('../../data/zero_shot/final-model.pt')

    val_tweets["pred"] = val_tweets.apply(predict_few_shot,
                                          args=(tars, ),
                                          axis=1)
    val_tweets["pred"] = val_tweets["pred"].apply(lambda x: 1
                                                  if x == "positive" else -1)

    pred = pd.DataFrame(list(val_tweets["pred"]), columns=['Prediction'])
    pred.index += 1
    pred.insert(0, 'Id', pred.index)

    pred.to_csv("../../predictions/zero_shot_pred.csv", index=False)
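
    # Related sketch (an addition, assuming Flair >= 0.8 where TARSClassifier
    # offers predict_zero_shot): the pre-trained TARS model can also classify
    # without any fine-tuning, e.g.:
    #
    #     sentence = Sentence("The weather is great today!")
    #     tars.predict_zero_shot(sentence, ["positive", "negative"])
    #     print(sentence.labels)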
Example #9
    def fit(self, X, y):
        """ Build feature vectors and train FLAIR model.

            Parameters
            ----------
            X : list(list(str))
                list of sentences. Sentences are tokenized into list 
                of words.
            y : list(list(str))
                list of list of BIO tags.

            Returns
            -------
            self
        """
        log.info("Creating FLAIR corpus...")
        Xtrain, Xval, ytrain, yval = train_test_split(X, y, test_size=0.1)
        sents_train = self._convert_to_flair(Xtrain, ytrain)
        sents_val = self._convert_to_flair(Xval, yval)
        corpus_train = Corpus(sents_train, sents_val, [], name="train-corpus")

        tag_dict = corpus_train.make_tag_dictionary(tag_type="ner")

        if self.embeddings is None:
            embedding_types = [
                WordEmbeddings("glove"),
                CharacterEmbeddings()    
            ]
            self.embeddings = StackedEmbeddings(embeddings=embedding_types)

        log.info("Building FLAIR NER...")
        self.model_ = SequenceTagger(hidden_size=self.hidden_dim,
            embeddings=self.embeddings,
            tag_dictionary=tag_dict,
            tag_type="ner",
            use_crf=self.use_crf,
            use_rnn=self.use_rnn,
            rnn_layers=self.num_rnn_layers,
            dropout=self.dropout,
            word_dropout=self.word_dropout,
            locked_dropout=self.locked_dropout)

        log.info("Training FLAIR NER...")
        opt = torch.optim.SGD if self.optimizer == "sgd" else torch.optim.Adam
        trainer = ModelTrainer(self.model_, corpus_train, opt)
        trainer.train(base_path=self.basedir,
            learning_rate=self.learning_rate,
            mini_batch_size=self.batch_size,
            max_epochs=self.max_iter)

        return self
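
    # Hedged sketch of the matching predict side (assuming the same wrapper
    # object): after fit(), self.model_ is a trained flair SequenceTagger,
    # so tagging new text is the standard predict call.
    #
    #     sentence = Sentence("Barack Obama visited Berlin .")
    #     self.model_.predict(sentence)
    #     print(sentence.to_tagged_string())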
Example #10
    def train(self):
        from flair.data import Corpus
        from flair.datasets import SentenceDataset
        from flair.data import Sentence

        self.classes = utils.read_class_titles(settings.CAT_DEPTH)
        self.classes['NOCAT'] = 'NOCAT'

        train = SentenceDataset([
            Sentence(row['titlen']).add_label('law_topic',
                                              self.classes[row['cat1']])
            for i, row in self.df_train.iterrows()
        ])

        # make a corpus that reuses the training data as the dev split
        self.corpus = Corpus(train=train, dev=train)

        # 1. load base TARS
        tars = self._load_pretained_model()

        # 2. make the model aware of the desired set of labels from the new corpus
        tars.add_and_switch_to_new_task(
            "LAW_TOPIC", label_dictionary=self.corpus.make_label_dictionary())

        # 3. initialize the text classifier trainer with your corpus
        from flair.trainers import ModelTrainer
        trainer = ModelTrainer(tars, self.corpus)

        # 4. train model
        path = settings.WORKING_DIR
        trainer.train(
            base_path=path,  # path to store the model artifacts
            learning_rate=5e-2,  # 5ep, 0.2 bad; 5ep with 0.1 looks ok.
            mini_batch_size=settings.MINIBATCH,
            # mini_batch_chunk_size=4,  # optionally set this if the transformer is too much for your machine
            max_epochs=settings.EPOCHS,
            train_with_dev=False,
            save_final_model=False,
            param_selection_mode=True,  # True to avoid model saves
            shuffle=False,  # data already shuffled upstream
        )

        # from flair.models.text_classification_model import TARSClassifier
        # self.model = TARSClassifier.load(
        #     os.path.join(path, 'best-model.pt')
        # )

        self.model = tars
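
        # Follow-up sketch (added, hedged): with the "LAW_TOPIC" task active,
        # classifying a new title is flair's standard predict call, e.g.:
        #
        #     sentence = Sentence("an act for regulating railways")
        #     self.model.predict(sentence)
        #     print(sentence.labels)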
Example #11
def train_seqtagger(train_data: Dataset,
                    dev_data: Dataset,
                    test_data: Dataset):
    corpus = Corpus(
        train=train_data,
        dev=dev_data,
        test=test_data,
        name='scierc')

    pprint(Counter([tok.tags[TAG_TYPE].value for sent in corpus.train for tok in sent]))

    tag_dictionary = corpus.make_tag_dictionary(tag_type=TAG_TYPE)
    print(tag_dictionary.idx2item)

    embedding_types: List[TokenEmbeddings] = [WordEmbeddings('glove')]

    embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(hidden_size=64,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=TAG_TYPE,
                                            locked_dropout=0.01,
                                            dropout=0.01,
                                            use_crf=True)

    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus, optimizer=torch.optim.Adam)

    save_path = 'sequence_tagging/resources/taggers/scierc-ner'
    trainer.train(save_path, EvaluationMetric.MICRO_F1_SCORE,
                  learning_rate=0.01,
                  mini_batch_size=32,
                  max_epochs=20)

    # plotter = Plotter()
    # plotter.plot_training_curves('%s/loss.tsv' % save_path)
    # plotter.plot_weights('%s/weights.txt' % save_path)

    from sequence_tagging.evaluate_flair_tagger import evaluate_sequence_tagger
    pprint('train-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.train)['f1-macro'])
    pprint('dev-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.dev)['f1-macro'])
    pprint('test-f1-macro: %0.2f' % evaluate_sequence_tagger(tagger, corpus.test)['f1-macro'])
    return tagger
Example #12
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.",
                        use_tokenizer=True).add_label("label", "class_1")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ]),
        sample_missing_splits=False,
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3,
                      downsample_dev=False,
                      downsample_test=False)

    assert 3 == len(corpus.train)
Example #13
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.", use_tokenizer=True).add_label('label', 'class_1')

    corpus: Corpus = Corpus(
        [
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, downsample_dev=False, downsample_test=False)

    assert 3 == len(corpus.train)
Example #14
def test_tagged_corpus_get_tag_statistic():
    train_sentence = Sentence("Zalando Research is located in Berlin .")
    train_sentence[0].add_tag("ner", "B-ORG")
    train_sentence[1].add_tag("ner", "E-ORG")
    train_sentence[5].add_tag("ner", "S-LOC")

    dev_sentence = Sentence(
        "Facebook, Inc. is a company, and Google is one as well.",
        use_tokenizer=segtok_tokenizer,
    )
    dev_sentence[0].add_tag("ner", "B-ORG")
    dev_sentence[1].add_tag("ner", "I-ORG")
    dev_sentence[2].add_tag("ner", "E-ORG")
    dev_sentence[8].add_tag("ner", "S-ORG")

    test_sentence = Sentence("Nothing to do with companies.")

    tag_to_count_dict = Corpus._get_tag_to_count(
        [train_sentence, dev_sentence, test_sentence], "ner")

    assert 1 == tag_to_count_dict["S-ORG"]
    assert 1 == tag_to_count_dict["S-LOC"]
    assert 2 == tag_to_count_dict["B-ORG"]
    assert 2 == tag_to_count_dict["E-ORG"]
    assert 1 == tag_to_count_dict["I-ORG"]
Example #15
    def class_distribution(self,
                           multiclass: bool = False,
                           nr_classes: int = 10,
                           savefig_file=None,
                           **kwargs):
        class_count = Corpus._get_class_to_count(self.sentences)
        class_count = pd.DataFrame.from_dict(class_count,
                                             orient='index',
                                             columns=['count']).sort_values(
                                                 'count', ascending=False)
        html_table = class_count.to_html()

        # plot distribution
        class_count_top = class_count[:nr_classes].copy()
        if not multiclass:
            if nr_classes < len(class_count):
                class_count_top.loc['others'] = class_count[nr_classes:].sum()
            # pie plot class_count
            class_count_top.plot.pie(y='count', **kwargs)
            plt.legend(labels=class_count_top.index,
                       bbox_to_anchor=(1, 0, 0.1, 1),
                       loc='center right')
        else:
            class_count_top.plot.bar(y='count', **kwargs)
            plt.gca().yaxis.grid(True, linestyle='--')

        plt.tight_layout()
        if savefig_file:
            plt.savefig(self.path / savefig_file, dpi=600)
        plt.show()
Example #16
def test_tagged_corpus_make_vocab_dictionary():
    train_sentence = Sentence('used in training. training is cool.',
                              use_tokenizer=True)
    corpus = Corpus([train_sentence], [], [])
    vocab = corpus.make_vocab_dictionary(max_tokens=2, min_freq=-1)
    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())
    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=-1)
    assert (7 == len(vocab))
    vocab = corpus.make_vocab_dictionary(max_tokens=-1, min_freq=2)
    assert (3 == len(vocab))
    assert ('<unk>' in vocab.get_items())
    assert ('training' in vocab.get_items())
    assert ('.' in vocab.get_items())
Example #17
def build_and_train_conll03en_flair_sequence_tagger(corpus, tag_type, tag_dictionary):
    '''
    do not change!
    same configuration as described in
      file:  "flair/resources/docs/EXPERIMENTS.md"
      section: "CoNLL-03 Named Entity Recognition (English)"
    '''
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=[
            WordEmbeddings("glove"),
            PooledFlairEmbeddings("news-forward", pooling="min"),
            PooledFlairEmbeddings("news-backward", pooling="min"),
        ]
    )
    from flair.models import SequenceTagger
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=256,
        embeddings=embeddings,
        tag_dictionary=tag_dictionary,
        tag_type=tag_type,
    )

    from flair.trainers import ModelTrainer

    corpus = Corpus(train=corpus.train, dev=corpus.dev,test=[])
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # trainer.train("resources/taggers/example-ner", train_with_dev=True, max_epochs=150) # original
    trainer.train("flair_checkpoints", train_with_dev=False, max_epochs=40,save_final_model=False) # original

    return tagger
Example #18
def test_tagged_corpus_downsample():
    sentence = Sentence("I love Berlin.",
                        labels=[Label("class_1")],
                        use_tokenizer=segtok_tokenizer)

    corpus: Corpus = Corpus(
        [
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
            sentence,
        ],
        [],
        [],
    )

    assert 10 == len(corpus.train)

    corpus.downsample(percentage=0.3, only_downsample_train=True)

    assert 3 == len(corpus.train)
Example #19
    def build_train_sequence_tagger(corpus,
                                    tag_dictionary,
                                    params: Params,
                                    TAG_TYPE="ner"):
        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=[
            WordEmbeddings("glove"),
            FlairEmbeddings("news-forward"),
            FlairEmbeddings("news-backward"),
        ])
        from flair.models import SequenceTagger

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=TAG_TYPE,
        )

        from flair.trainers import ModelTrainer

        corpus = Corpus(train=corpus.train, dev=corpus.dev, test=[])
        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(
            "flair_checkpoints",
            train_with_dev=False,
            max_epochs=params.max_epochs,
            save_final_model=False,
        )

        return tagger
Example #20
def make_relations_tag_dictionary(corpus: Corpus,
                                  tag_type='dependency',
                                  special_tags=()) -> Dictionary:

    tag_dictionary: Dictionary = Dictionary(add_unk=False)
    # for tag in special_tags:
    #     tag_dictionary.add_item(tag)
    for sentence in corpus.get_all_sentences():
        for token in sentence.tokens:
            tag_dictionary.add_item(token.get_tag(tag_type).value)
    return tag_dictionary
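
# Hedged usage sketch (assumes `corpus` holds dependency-parsed sentences):
#
#     relation_dictionary = make_relations_tag_dictionary(corpus, tag_type='dependency')
#     print(relation_dictionary.idx2item)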
Example #21
def spelling_aug(corpus):
    aug = naw.SpellingAug()
    augmented_sentences = []

    # go through all train sentences and collect every augmented variant
    for sentence in corpus.train:
        augmented_texts = aug.augment(sentence.to_plain_string(), n=3)
        # note: labels from the source sentences are not carried over here
        augmented_sentences.extend(Sentence(text) for text in augmented_texts)

    corpus = Corpus(train=SentenceDataset(augmented_sentences),
                    dev=corpus.dev,
                    test=corpus.test)
    return corpus
Example #22
    def obtain_statistics(self,
                          tag_type: str = 'ner',
                          save_as_json: bool = True):
        stats_splits = self.corpus.obtain_statistics(tag_type)
        stats_complete = json.dumps(Corpus._obtain_statistics_for(
            self.sentences, 'complete', tag_type),
                                    indent=4)
        if save_as_json:
            (self.path / 'stats_splits.json').write_text(stats_splits)
            (self.path / 'stats_complete.json').write_text(stats_complete)

        return (stats_splits, stats_complete)
Example #23
def test_tagged_corpus_statistics_multi_label():
    train_sentence = Sentence('I love Berlin.',
                              labels=['class_1'],
                              use_tokenizer=True)
    dev_sentence = Sentence('The sun is shining.',
                            labels=['class_2'],
                            use_tokenizer=True)
    test_sentence = Sentence('Berlin is sunny.',
                             labels=['class_1', 'class_2'],
                             use_tokenizer=True)
    class_to_count_dict = Corpus._get_class_to_count(
        [train_sentence, dev_sentence, test_sentence])
    assert ('class_1' in class_to_count_dict)
    assert ('class_2' in class_to_count_dict)
    assert (2 == class_to_count_dict['class_1'])
    assert (2 == class_to_count_dict['class_2'])
    tokens_in_sentences = Corpus._get_tokens_per_sentence(
        [train_sentence, dev_sentence, test_sentence])
    assert (3 == len(tokens_in_sentences))
    assert (4 == tokens_in_sentences[0])
    assert (5 == tokens_in_sentences[1])
    assert (4 == tokens_in_sentences[2])
Example #24
def test_tagged_corpus_make_label_dictionary_string():
    sentence_1 = Sentence("sentence 1", labels=["class_1"])
    sentence_2 = Sentence("sentence 2", labels=["class_2"])
    sentence_3 = Sentence("sentence 3", labels=["class_1"])

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary()

    assert 2 == len(label_dict)
    assert "<unk>" not in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
Example #25
def train_dev_split(sentences, dev_ratio=0.25):
    dev_size = len(sentences) * dev_ratio

    train = []
    dev = []

    for count, idx in enumerate(np.random.permutation(len(sentences))):
        if count < dev_size:
            dev.append(sentences[idx])
        else:
            train.append(sentences[idx])

    return Corpus(train=train, dev=dev, test=[])
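
# Hedged usage sketch: `sentences` is assumed to be a list of flair Sentence
# objects; by default a quarter of them land in the dev split.
#
#     corpus = train_dev_split(sentences, dev_ratio=0.25)
#     print(len(corpus.train), len(corpus.dev))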
Example #26
def test_tagged_corpus_get_all_sentences():
    train_sentence = Sentence("I'm used in training.")
    dev_sentence = Sentence("I'm a dev sentence.")
    test_sentence = Sentence("I will be only used for testing.")

    corpus: Corpus = Corpus(
        FlairDatapointDataset([train_sentence]),
        FlairDatapointDataset([dev_sentence]),
        FlairDatapointDataset([test_sentence]),
    )

    all_sentences = corpus.get_all_sentences()

    assert 3 == len(all_sentences)
Example #27
    def create_corpus(self, train_path, val_path, test_path, chunk_len):
        """
        *** This method is only needed when training your own models.
        It is not accessible from rwtagger_script and not documented in detail. Use at your own risk. ;-)
        ***
        :param train_path: path to the training data
        :param val_path: path to the validation data
        :param test_path: path to the test data
        :param chunk_len: maximum chunk length (passed as maxlen)
        :return: a flair Corpus built from the three splits
        """
        train_list = self.create_sentlist_from_file_batchmax(train_path, maxlen=chunk_len)
        val_list = self.create_sentlist_from_file_batchmax(val_path, maxlen=chunk_len)
        test_list = self.create_sentlist_from_file_batchmax(test_path, maxlen=chunk_len)
        corpus: Corpus = Corpus(train_list, val_list, test_list)

        return corpus
Example #28
    def fit(self, corpus: Corpus, model_path: str):
        self.model = TARSClassifier(
            task_name="ChemicalUnderstanding",
            label_dictionary=corpus.make_label_dictionary(),
        )

        trainer = ModelTrainer(self.model, corpus)

        trainer.train(
            base_path=model_path,
            learning_rate=0.02,
            mini_batch_size=16,
            mini_batch_chunk_size=4,
            max_epochs=10,
        )
Example #29
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1").add_label("label", "class_1")

    sentence_2 = Sentence("sentence 2").add_label("label", "class_2")

    sentence_3 = Sentence("sentence 3").add_label("label", "class_1")

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary("label")

    assert 3 == len(label_dict)
    assert "<unk>" in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()
Example #30
def test_tagged_corpus_make_label_dictionary():
    sentence_1 = Sentence("sentence 1").add_label('label', 'class_1')

    sentence_2 = Sentence("sentence 2").add_label('label', 'class_2')

    sentence_3 = Sentence("sentence 3").add_label('label', 'class_1')

    corpus: Corpus = Corpus([sentence_1, sentence_2, sentence_3], [], [])

    label_dict = corpus.make_label_dictionary('label')

    assert 2 == len(label_dict)
    assert "<unk>" not in label_dict.get_items()
    assert "class_1" in label_dict.get_items()
    assert "class_2" in label_dict.get_items()