Example 1
def init_document_embeddings():
    text = 'I love Berlin. Berlin is a great place to live.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    charlm: TokenEmbeddings = CharLMEmbeddings('news-forward-fast')

    return sentence, glove, charlm
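
The two token-level embeddings returned here are usually stacked and applied to the sentence. A minimal usage sketch, assuming flair's StackedEmbeddings API of the same vintage as this snippet:

embeddings = StackedEmbeddings([glove, charlm])
embeddings.embed(sentence)
for token in sentence:
    print(token.text, len(token.get_embedding()))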
Example 2
def init(tasks_base_path) -> Tuple[Corpus, Dictionary, TextClassifier]:
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / 'ag_news')
    label_dict = corpus.make_label_dictionary()
    glove_embedding = WordEmbeddings('turian')
    document_embeddings = DocumentRNNEmbeddings([glove_embedding], 128, 1,
                                                False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return (corpus, label_dict, model)
Example 3
File: flair.py  Project: zgd716/gnes
 def post_init(self):
     from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings, FlairEmbeddings
     self._flair = DocumentPoolEmbeddings([
         WordEmbeddings('glove'),
         FlairEmbeddings('news-forward'),
         FlairEmbeddings('news-backward')
     ],
                                          pooling=self.pooling_strategy)
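
Once built, the pooled embedding is applied per document. A hedged sketch of how self._flair would then be used (Sentence comes from flair.data; embed() and .embedding are standard flair API):

from flair.data import Sentence

sentence = Sentence('a short document to encode')
self._flair.embed([sentence])
doc_vector = sentence.embedding  # single tensor for the whole document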
Example 4
def init_document_embeddings():
    text = "I love Berlin. Berlin is a great place to live."
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings("turian")
    charlm: TokenEmbeddings = FlairEmbeddings("news-forward-fast")

    return sentence, glove, charlm
Example 5
def init(tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()
    glove_embedding = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1,
                                                 False, 64, False, False)
    model = TextClassifier(document_embeddings, label_dict, False)
    return (corpus, label_dict, model)
Example 6
def test_sequence_tagger_param_selector(results_base_path, tasks_base_path):
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.FASHION,
                                            base_path=tasks_base_path)

    # define search space
    search_space = SearchSpace()

    # sequence tagger parameter
    search_space.add(Parameter.EMBEDDINGS,
                     hp.choice,
                     options=[
                         StackedEmbeddings([WordEmbeddings('glove')]),
                         StackedEmbeddings([
                             WordEmbeddings('glove'),
                             CharLMEmbeddings('news-forward'),
                             CharLMEmbeddings('news-backward')
                         ])
                     ])
    search_space.add(Parameter.USE_CRF, hp.choice, options=[True, False])
    search_space.add(Parameter.DROPOUT, hp.uniform, low=0.25, high=0.75)
    search_space.add(Parameter.WORD_DROPOUT, hp.uniform, low=0.0, high=0.25)
    search_space.add(Parameter.LOCKED_DROPOUT, hp.uniform, low=0.0, high=0.5)
    search_space.add(Parameter.HIDDEN_SIZE,
                     hp.choice,
                     options=[64, 128, 256, 512])
    search_space.add(Parameter.RNN_LAYERS, hp.choice, options=[1, 2])

    # model trainer parameter
    search_space.add(Parameter.OPTIMIZER, hp.choice, options=[SGD, AdamW])

    # training parameter
    search_space.add(Parameter.MINI_BATCH_SIZE, hp.choice, options=[4, 8, 32])
    search_space.add(Parameter.LEARNING_RATE, hp.uniform, low=0.01, high=1)
    search_space.add(Parameter.ANNEAL_FACTOR, hp.uniform, low=0.3, high=0.75)
    search_space.add(Parameter.PATIENCE, hp.choice, options=[3, 5])
    search_space.add(Parameter.WEIGHT_DECAY, hp.uniform, low=0.01, high=1)

    # find best parameter settings
    optimizer = SequenceTaggerParamSelector(corpus,
                                            'ner',
                                            results_base_path,
                                            max_epochs=2)
    optimizer.optimize(search_space, max_evals=2)

    # clean up results directory
    shutil.rmtree(results_base_path)
Example 7
def create_flair_embeddings(emb_name):
    emb_type, emb_subname = emb_name.split('+')
    if emb_type == 'elmo':
        return ELMoEmbeddings(emb_subname)
    elif emb_type == 'fasttext':
        return WordEmbeddings(emb_subname)
    elif emb_type == 'custom_elmo':
        return ELMoEmbeddings(options_file=Path(emb_subname) / 'options.json',
                              weight_file=Path(emb_subname) / 'model.hdf5')
    # fail loudly instead of silently returning None for unknown types
    raise ValueError(f'Unknown embedding type: {emb_type}')
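
Callers pass a single 'type+name' string that the function splits on '+'. An illustrative call (the 'fasttext+en' pair is an example id, not from the original source):

embedding = create_flair_embeddings('fasttext+en')  # -> WordEmbeddings('en')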
Example 8
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path /
                                                 "multi_class")
    label_dict = corpus.make_label_dictionary()

    word_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings = DocumentRNNEmbeddings(
        embeddings=[word_embedding],
        hidden_size=32,
        reproject_words=False,
        bidirectional=False,
    )

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(
        results_base_path,
        mini_batch_size=1,
        max_epochs=100,
        shuffle=False,
        checkpoint=False,
    )

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):

        assert "apple" in sentence.get_label_names()
        assert "tv" in sentence.get_label_names()

        for l in s.labels:
            print(l)
            assert l.value is not None
            assert 0.0 <= l.score <= 1.0
            assert type(l.score) is float

    loaded_model = TextClassifier.load(results_base_path / "final-model.pt")

    sentence = Sentence("I love Berlin")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example 9
    def _set_up_model(self, params: dict, label_dictionary):
        document_embedding = params['document_embeddings'].__name__
        if document_embedding == "DocumentRNNEmbeddings":
            embedding_params = {
                key: params[key]
                for key, value in params.items()
                if key in DOCUMENT_RNN_EMBEDDING_PARAMETERS
            }
            # normalize to a list so a bare string is not iterated character by character
            token_embeddings = params['embeddings']
            if not isinstance(token_embeddings, list):
                token_embeddings = [token_embeddings]
            embedding_params['embeddings'] = [
                WordEmbeddings(name) for name in token_embeddings
            ]
            document_embedding = DocumentRNNEmbeddings(**embedding_params)

        elif document_embedding == "DocumentPoolEmbeddings":
            embedding_params = {
                key: params[key]
                for key, value in params.items()
                if key in DOCUMENT_POOL_EMBEDDING_PARAMETERS
            }
            embedding_params['embeddings'] = [
                WordEmbeddings(TokenEmbedding)
                for TokenEmbedding in params['embeddings']
            ]
            document_embedding = DocumentPoolEmbeddings(**embedding_params)

        elif document_embedding == "TransformerDocumentEmbeddings":
            embedding_params = {
                key: params[key]
                for key, value in params.items()
                if key in DOCUMENT_TRANSFORMER_EMBEDDING_PARAMETERS
            }
            document_embedding = TransformerDocumentEmbeddings(
                **embedding_params)

        else:
            raise Exception("Please provide a flair document embedding class")

        text_classifier: TextClassifier = TextClassifier(
            label_dictionary=label_dictionary,
            multi_label=self.multi_label,
            document_embeddings=document_embedding,
        )
        return text_classifier
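
For orientation, a params dict feeding the DocumentRNNEmbeddings branch might look like the sketch below; the concrete keys besides 'document_embeddings' and 'embeddings' are assumptions, and only keys present in DOCUMENT_RNN_EMBEDDING_PARAMETERS survive the filtering:

params = {
    'document_embeddings': DocumentRNNEmbeddings,  # class, not instance; __name__ selects the branch
    'embeddings': ['glove'],                       # wrapped into WordEmbeddings objects
    'hidden_size': 128,                            # kept only if whitelisted (assumed)
    'learning_rate': 0.1,                          # trainer parameter, filtered out here
}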
Example 10
    def get_embeddings(self):
        embeddings = [
            PolyglotEmbeddings(self.args.lang),
            CharacterEmbeddings()
        ]
        if self.args.lang not in self.embeds_unsupported_langs:
            embeddings.append(WordEmbeddings(self.args.lang))

        return StackedEmbeddings(embeddings=embeddings)
Example 11
def main(args):
    args = parser.parse_args()

    # 1. get the corpus
    column_format = {0: 'word', 1: 'pos', 2: 'ner'}

    corpus: TaggedCorpus = NLPTaskDataFetcher.load_column_corpus(
        Path(args.data_file[0]), column_format, tag_to_biloes='ner')
    print(corpus)

    # 2. what tag do we want to predict?
    tag_type = 'ner'

    # 3. make the tag dictionary from the corpus
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary.idx2item)

    # 4. initialize embeddings
    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),

        # comment in this line to use character embeddings
        # CharacterEmbeddings(),

        # comment in these lines to use contextual string embeddings
        FlairEmbeddings('news-forward'),
        FlairEmbeddings('news-backward'),

        # comment in these lines to use Bert embeddings
        # BertEmbeddings(),

        # comment in these lines to use Elmo embeddings
        # ELMoEmbeddings(),
    ]

    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # 5. initialize sequence tagger
    from flair.models import SequenceTagger

    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # 6. initialize trainer
    from flair.trainers import ModelTrainer

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # 7. start training
    trainer.train('resources/taggers/glove',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=50)
Example 12
def get_embeddings(emb_name):
    emb_type, emb_name = emb_name.split('+')

    if emb_type == 'elmo':
        return lambda: ELMoEmbeddings(emb_name)  # pubmed
    elif emb_type == 'fasttext':
        return lambda: WordEmbeddings(emb_name)  # en
    else:
        raise ValueError('Wrong embedding type')
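
Returning a lambda instead of the embedding itself defers loading the (potentially large) model weights until the caller actually needs them. A usage sketch with an illustrative name pair:

make_embeddings = get_embeddings('fasttext+en')
embeddings = make_embeddings()  # weights are loaded only at this point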
Example 13
def test_train_load_use_classifier_multi_label(results_base_path,
                                               tasks_base_path):

    # corpus = NLPTaskDataFetcher.load_corpus('multi_class', base_path=tasks_base_path)
    corpus = NLPTaskDataFetcher.load_classification_corpus(
        data_folder=tasks_base_path / 'multi_class')
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings = DocumentLSTMEmbeddings(embeddings=[glove_embedding],
                                                 hidden_size=32,
                                                 reproject_words=False,
                                                 bidirectional=False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=True)

    trainer = ModelTrainer(model, corpus)
    trainer.train(results_base_path,
                  EvaluationMetric.MICRO_F1_SCORE,
                  max_epochs=100,
                  test_mode=True,
                  checkpoint=False)

    sentence = Sentence('apple tv')

    for s in model.predict(sentence):
        for l in s.labels:
            print(l)
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    sentence = Sentence("apple tv")

    for s in model.predict(sentence):

        assert ('apple' in sentence.get_label_names())
        assert ('tv' in sentence.get_label_names())

        for l in s.labels:
            print(l)
            assert (l.value is not None)
            assert (0.0 <= l.score <= 1.0)
            assert (type(l.score) is float)

    loaded_model = TextClassifier.load_from_file(results_base_path /
                                                 'final-model.pt')

    sentence = Sentence('I love Berlin')
    sentence_empty = Sentence('       ')

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # clean up results directory
    shutil.rmtree(results_base_path)
Example 14
    def de_lang(cls):
        """
        Factory method for German embeddings
        """
        embeddings = WordEmbeddings('de')  # German FastText embeddings
        # embeddings = WordEmbeddings('de-crawl')  # German FastText embeddings trained over crawls
        # embeddings = BertEmbeddings('bert-base-multilingual-cased')

        return cls(embeddings)
Example 15
def test_stacked_embeddings():
    text = 'I love Berlin.'
    sentence: Sentence = Sentence(text)

    glove: TokenEmbeddings = WordEmbeddings('en-glove')
    news: TokenEmbeddings = WordEmbeddings('en-news')
    charlm: TokenEmbeddings = CharLMEmbeddings('mix-backward')

    embeddings: StackedEmbeddings = StackedEmbeddings([glove, news, charlm])

    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert (len(token.get_embedding()) != 0)

        token.clear_embeddings()

        assert (len(token.get_embedding()) == 0)
Example 16
def init(tasks_base_path) -> Tuple[Corpus, TextRegressor, ModelTrainer]:
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.REGRESSION,
                                            tasks_base_path)
    glove_embedding = WordEmbeddings('glove')
    document_embeddings = DocumentRNNEmbeddings([glove_embedding], 128, 1,
                                                False, 64, False, False)
    model = TextRegressor(document_embeddings)
    trainer = ModelTrainer(model, corpus)
    return (corpus, model, trainer)
Example 17
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [
            TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)
Example 18
def init() -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    corpus = NLPTaskDataFetcher.fetch_data(NLPTask.AG_NEWS)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('en-glove')
    document_embeddings: DocumentLSTMEmbeddings = DocumentLSTMEmbeddings([glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
Example 19
def collect_features(embeddings):
    for embedding in embeddings:
        if embedding == "fasttext":
            yield WordEmbeddings("de")
        elif embedding == "bert":
            yield BertEmbeddings("bert-base-multilingual-cased", layers="-1")
        elif embedding == "flair-forward":
            yield FlairEmbeddings("german-forward")
        elif embedding == "flair-backward":
            yield FlairEmbeddings("german-backward")
Example 20
def get_doc_embeddings():
    # initialize the word embeddings
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward-fast')
    flair_embedding_backward = FlairEmbeddings('news-backward-fast')

    # initialize the document embeddings, mode = mean
    return DocumentPoolEmbeddings(
        [glove_embedding, flair_embedding_backward, flair_embedding_forward],
        fine_tune_mode='none')
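
A hedged usage sketch for the pooled document embedding above (Sentence is flair.data.Sentence; embed() and .embedding are standard flair API):

doc_embeddings = get_doc_embeddings()
sentence = Sentence('Berlin is a great place to live.')
doc_embeddings.embed(sentence)
vector = sentence.embedding  # one pooled tensor per sentence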
Example 21
def load_flair(mode='flair'):
    if mode == 'flair':
        stacked_embeddings = StackedEmbeddings([
            WordEmbeddings('glove'),
            PooledFlairEmbeddings('news-forward', pooling='min'),
            PooledFlairEmbeddings('news-backward', pooling='min')
        ])
    else:  # BERT: concatenating the last 4 layers gives the best results
        stacked_embeddings = BertEmbeddings('bert-base-uncased')
    return stacked_embeddings
Example 22
def trainFlairClassifier(df, columns, trainNameCsv, testNameCsv, devNameCsv,
                         classifierFileName):
    ids = df['id'].tolist()

    nSamples = len(ids)
    idx70 = int(nSamples * 0.7)  # 70% train
    idx90 = int(nSamples * 0.9)  # next 20% test, last 10% dev

    train_ids = ids[:idx70]
    test_ids = ids[idx70:idx90]
    dev_ids = ids[idx90:]

    with TemporaryDirectory() as temp_dir:
        # join with os.path.join so a path separator is not silently dropped
        # (requires `import os` at module level)
        trainCsv = os.path.join(temp_dir, trainNameCsv)
        testCsv = os.path.join(temp_dir, testNameCsv)
        devCsv = os.path.join(temp_dir, devNameCsv)

        df[df['id'].isin(train_ids)].to_csv(trainCsv,
                                            columns=columns,
                                            sep='\t',
                                            index=False,
                                            header=False)
        df[df['id'].isin(test_ids)].to_csv(testCsv,
                                           columns=columns,
                                           sep='\t',
                                           index=False,
                                           header=False)
        df[df['id'].isin(dev_ids)].to_csv(devCsv,
                                          columns=columns,
                                          sep='\t',
                                          index=False,
                                          header=False)

        corpus = NLPTaskDataFetcher.load_classification_corpus(
            temp_dir, train_file=trainCsv, test_file=testCsv, dev_file=devCsv)

        word_embeddings = [
            WordEmbeddings('glove'),
            FlairEmbeddings('news-forward-fast'),
            FlairEmbeddings('news-backward-fast')
        ]
        document_embeddings = DocumentLSTMEmbeddings(
            word_embeddings,
            hidden_size=512,
            reproject_words=True,
            reproject_words_dimension=256)
        classifier = TextClassifier(
            document_embeddings,
            label_dictionary=corpus.make_label_dictionary(),
            multi_label=False)
        trainer = ModelTrainer(classifier, corpus)

        trainer.train(temp_dir, max_epochs=50)

        classifier.save(classifierFileName)
Example 23
 def __init__(self, device_number='cuda:2', use_cuda=True):
     
     self.device_number = device_number
     
     if use_cuda:
         flair.device = torch.device(self.device_number) 
     
     self.stacked_embeddings = StackedEmbeddings([WordEmbeddings('glove'), 
                                     FlairEmbeddings('news-forward'), 
                                     FlairEmbeddings('news-backward'),
                                     ])
Example 24
    def __init__(self, hidden_dim: int, rnn_type: str, vocab_size: int,
                 tagset_size: int, task_type: str):
        super(TaskLearner, self).__init__()

        self.task_type = task_type
        self.rnn_type = rnn_type
        self.bidirectional = True
        self.num_layers = 2
        self.num_directions = 2 if self.bidirectional else 1

        # Word Embeddings (TODO: Implement pre-trained word embeddings)

        # self.word_embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim) # TODO: Implement padding_idx=self.pad_idx

        embedding_types: List[TokenEmbeddings] = [
            WordEmbeddings('glove'),
            PooledFlairEmbeddings('news-forward', pooling='min'),
            PooledFlairEmbeddings('news-backward', pooling='min')
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)
        self.embeddings = embeddings
        self.embedding_dim: int = self.embeddings.embedding_length

        if self.rnn_type == 'gru':
            rnn = nn.GRU
        elif self.rnn_type == 'lstm':
            rnn = nn.LSTM
        elif self.rnn_type == 'rnn':
            rnn = nn.RNN
        else:
            raise ValueError(f'Unsupported rnn_type: {self.rnn_type!r}')

        # Sequence tagger
        self.rnn = rnn(input_size=self.embedding_dim,
                       hidden_size=hidden_dim,
                       num_layers=self.num_layers,
                       dropout=0.0 if self.num_layers == 1 else 0.5,
                       bidirectional=self.bidirectional,
                       batch_first=True)

        if self.task_type == 'SEQ':
            # Linear layer that maps hidden state space from rnn to tag space
            self.hidden2tag = nn.Linear(in_features=hidden_dim *
                                        self.num_directions,
                                        out_features=tagset_size)

        if self.task_type == 'CLF':
            # COME BACK LATER...
            self.drop = nn.Dropout(p=0.5)
            self.hidden2tag = nn.Linear(in_features=hidden_dim *
                                        self.num_directions,
                                        out_features=1)
Example 25
def init(tasks_base_path) -> Tuple[TaggedCorpus, Dictionary, TextClassifier]:
    corpus = NLPTaskDataFetcher.load_corpus(NLPTask.AG_NEWS, tasks_base_path)
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings('turian')
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, False)

    return corpus, label_dict, model
Example 26
def embedding():
    # initialize the word embeddings
    glove_embedding = WordEmbeddings('glove')
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')

    # initialize the document embeddings, mode = mean
    document_embeddings = DocumentPoolEmbeddings(
        [glove_embedding, flair_embedding_backward, flair_embedding_forward])

    return document_embeddings
Example 27
def create_embeddings_flair(data: pd.DataFrame,
                            column: str = "text",
                            path: str = None,
                            embeddings_type: str = "transformer",
                            typs: str = "train"):
    assert column in data.columns.tolist(), \
        "[embeddings.py] -> [create_embeddings_flair] -> Input column not in dataframe columns"
    assert embeddings_type in ["transformer", "stacked"]

    from flair.embeddings import WordEmbeddings, FlairEmbeddings, StackedEmbeddings, DocumentPoolEmbeddings, TransformerDocumentEmbeddings
    from flair.data import Sentence

    fast_text_embedding = WordEmbeddings('de')
    flair_embedding_forward = FlairEmbeddings('de-forward')
    flair_embedding_backward = FlairEmbeddings('de-backward')

    stacked_embeddings = DocumentPoolEmbeddings([
        fast_text_embedding, flair_embedding_forward, flair_embedding_backward
    ])

    transformer_embedding = TransformerDocumentEmbeddings(
        'bert-base-german-cased', fine_tune=False)

    tic = time.time()

    embeddings = []

    for i, text in enumerate(data[column].values):
        print("sentence {}/{}".format(i, len(data)))
        sentence = Sentence(text)

        if embeddings_type == "stacked":
            stacked_embeddings.embed(sentence)
        elif embeddings_type == "tranformer":
            transformer_embedding.embed(sentence)

        embedding = sentence.embedding.detach().cpu().numpy()
        embeddings.append(embedding)

    embeddings = np.array(embeddings)

    columns = [
        "embedding_{}".format(feature)
        for feature in range(embeddings.shape[1])
    ]

    csv = pd.DataFrame(embeddings, columns=columns)
    csv.to_csv(path + embeddings_type + "_" + typs + ".csv", index=False)

    toc = time.time()

    print(
        "[create_embeddings_flair] -> [embeddings_type: {}, typs: {}] -> time {}'s"
        .format(embeddings_type, typs, toc - tic))
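
An illustrative call, under the assumption that data is a pandas DataFrame with a 'text' column; note that path is concatenated with plain '+', so it must end with a path separator:

df = pd.DataFrame({'text': ['erster Satz', 'zweiter Satz']})
create_embeddings_flair(df, column='text', path='out/',
                        embeddings_type='transformer', typs='train')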
Example 28
def init(tasks_base_path) -> Tuple[Corpus, Dictionary, TextClassifier]:
    # get training, test and dev data
    corpus = flair.datasets.ClassificationCorpus(tasks_base_path / "ag_news")
    label_dict = corpus.make_label_dictionary()

    glove_embedding: WordEmbeddings = WordEmbeddings("turian")
    document_embeddings: DocumentRNNEmbeddings = DocumentRNNEmbeddings(
        [glove_embedding], 128, 1, False, 64, False, False)

    model = TextClassifier(document_embeddings, label_dict, multi_label=False)

    return corpus, label_dict, model
Example 29
def load_and_apply_word_embeddings(emb_type: str):
    text = "I love Berlin."
    sentence: Sentence = Sentence(text)
    embeddings: TokenEmbeddings = WordEmbeddings(emb_type)
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) != 0

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0
Example 30
    def en_lang(cls):
        """
        Factory method for English embeddings
        """
        # embeddings = WordEmbeddings('en-glove')
        embeddings = WordEmbeddings('en-crawl')  # FastText embeddings trained over web crawls
        # embeddings = WordEmbeddings('en-news')
        # embeddings = FlairEmbeddings('news-forward')
        # embeddings = BertEmbeddings()

        return cls(embeddings)