Example #1
def _get_embedding_model(
    model_name_or_path: Union[str, HFModelResult, FlairModelResult]
) -> Union[FlairEmbeddings, WordEmbeddings, TransformerWordEmbeddings,
           Sentence]:
    "Load the proper `Embeddings` model from `model_name_or_path`"
    if isinstance(model_name_or_path, FlairModelResult):
        nm = model_name_or_path.name
        try:
            # drop the 'flairNLP/' namespace prefix before loading (str.strip would remove a character set, not the prefix)
            return WordEmbeddings(nm.removeprefix('flairNLP/'))
        except Exception:
            return FlairEmbeddings(nm.removeprefix('flairNLP/'))

    elif isinstance(model_name_or_path, HFModelResult):
        return TransformerWordEmbeddings(model_name_or_path.name)
    else:
        res = _flair_hub.search_model_by_name(model_name_or_path,
                                              user_uploaded=True)
        if len(res) < 1:
            # No models found
            res = _hf_hub.search_model_by_name(model_name_or_path,
                                               user_uploaded=True)
            if len(res) < 1:
                raise ValueError(
                    f'Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model'
                )
            else:
                return TransformerWordEmbeddings(
                    res[0].name
                )  # Returning the first should always be the non-fast option
        else:
            nm = res[0].name
            try:
                # drop the 'flairNLP/' namespace prefix before loading
                return WordEmbeddings(nm.removeprefix('flairNLP/'))
            except Exception:
                return FlairEmbeddings(nm.removeprefix('flairNLP/'))
Example #2
    def dump_bert_vecs(df, dump_dir):
        print("Getting BERT vectors...")
        embedding = TransformerWordEmbeddings('bert-base-uncased')
        word_counter = defaultdict(int)
        stop_words = set(stopwords.words('english'))
        stop_words.add("would")
        except_counter = 0

        for index, row in df.iterrows():
            if index % 100 == 0:
                print("Finished sentences: " + str(index) + " out of " +
                      str(len(df)))
            line = row["sentence"]
            sentences = sent_tokenize(line)
            for sentence_ind, sent in enumerate(sentences):
                flag = 0
                i = 0
                sentence = None
                while flag == 0:
                    sentence = Sentence(sent[:(len(sent) - i * 100)],
                                        use_tokenizer=True)
                    try:
                        embedding.embed(sentence)
                        flag = 1
                    except Exception as e:
                        except_counter += 1
                        print("Length of sentence: ", len(sent) - i * 100)
                        print("Exception Counter while getting BERT: ",
                              except_counter, sentence_ind, index, e)
                        i += 1
                if sentence is None or len(sentence) == 0:
                    print("Length of sentence is 0: ", index)
                for token_ind, token in enumerate(sentence):
                    word = token.text
                    word = word.translate(
                        str.maketrans('', '', string.punctuation))
                    if word in stop_words or "/" in word or len(word) == 0:
                        continue
                    word_dump_dir = dump_dir + word
                    os.makedirs(word_dump_dir, exist_ok=True)
                    fname = word_dump_dir + "/" + str(
                        word_counter[word]) + ".pkl"
                    word_counter[word] += 1
                    vec = token.embedding.cpu().numpy()
                    try:
                        with open(fname, "wb") as handler:
                            pickle.dump(vec, handler)
                    except Exception as e:
                        except_counter += 1
                        print("Exception Counter while dumping BERT: ",
                              except_counter, sentence_ind, index, word, e)
Example #3
    def dump_bert_vecs(df, dump_dir):
        print("Getting BERT vectors...")
        embedding = TransformerWordEmbeddings('roberta-base', layers='-1')
        word_counter = defaultdict(int)
        stop_words = set(stopwords.words('english'))
        stop_words.add("would")
        except_counter = 0
        key = set(word_cnt)  # word_cnt is expected to be defined in the enclosing scope; a set gives O(1) membership checks

        for index, row in df.iterrows():
            # record progress without leaking a file handle on every iteration
            with open("progress.txt", "w") as file1:
                file1.write(str(index))
            print(index)
            if index % 100 == 0:
                print("Finished sentences: " + str(index) + " out of " +
                      str(len(df)))
            line = row["news"]
            sentences = sent_tokenize(line)

            for sentence_ind, sent in enumerate(sentences):
                sentence = Sentence(sent, use_tokenizer=True)
                try:
                    embedding.embed(sentence)
                except Exception as e:
                    except_counter += 1
                    print("Exception Counter while getting BERT: ",
                          except_counter, sentence_ind, index, e)
                    continue
                for token_ind, token in enumerate(sentence):
                    word = token.text
                    word = word.translate(
                        str.maketrans('', '', string.punctuation))
                    if word in stop_words or "/" in word or len(word) == 0 or (
                            word not in key) or word_cnt[word] < 10:
                        #print("word")
                        continue
                    word_dump_dir = dump_dir + word
                    os.makedirs(word_dump_dir, exist_ok=True)
                    fname = word_dump_dir + "/" + str(
                        word_counter[word]) + ".pkl"
                    word_counter[word] += 1
                    vec = token.embedding.cpu().numpy()
                    try:
                        with open(fname, "wb") as handler:
                            pickle.dump(vec, handler)
                    except Exception as e:
                        except_counter += 1
                        print("Exception Counter while dumping BERT: ",
                              except_counter, sentence_ind, index, word, e)
Example #4
    def __init__(self, args, name, asp_word2idx, selected_idx=None, need_neg_senti=False):
        self.asp_word2idx = asp_word2idx
        self.need_neg_senti = need_neg_senti
        self.args = args
        self.embedding = TransformerWordEmbeddings('bert-base-uncased', layers='-1')
        if name == 'train':
            self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.train), 'train', selected_idx, filter_null=args.unsupervised)
        elif name == 'dev':
            self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.dev), 'dev', selected_idx)
        elif name == 'test':
            self.load_corpus_with_NULL_ITEM(os.path.join(args.data_dir, args.test), 'test', selected_idx)
        else:
            raise NotImplementedError

        self.len = len(self.corpus_y)
Example #5
def get_scibert_flair_embeddings():
    return [
        TransformerWordEmbeddings(model="allenai/scibert_scivocab_uncased",
                                  fine_tune=True),
        FlairEmbeddings("pubmed-forward"),
        FlairEmbeddings("pubmed-backward")
    ]
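A hedged sketch of how such a list is typically combined and applied (StackedEmbeddings and Sentence are standard flair classes; the sentence text is illustrative):

from flair.data import Sentence
from flair.embeddings import StackedEmbeddings

# Stack SciBERT with the two PubMed Flair language models and embed a sentence.
stacked = StackedEmbeddings(get_scibert_flair_embeddings())
sentence = Sentence("EGFR mutations predict response to gefitinib")
stacked.embed(sentence)
print(stacked.embedding_length)  # combined per-token dimensionality of the stack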
Example #6
    def create_embeddings(self) -> StackedEmbeddings:

        embedding_types: List[TokenEmbeddings] = []
        
        if self.config['use_word_embeddings']:
            embedding_types.append(W2vWordEmbeddings(self.config['word_embeddings_path']))

        if self.config['use_char_embeddings']:
            embedding_types.append(CharacterEmbeddings())


        if self.config['use_flair_embeddings']:
            embedding_types.append(FlairEmbeddings('es-clinical-forward'))
            embedding_types.append(FlairEmbeddings('es-clinical-backward'))
        
        if self.config['use_beto_embeddings']:
            embedding_types.append(
                TransformerWordEmbeddings(
                    'dccuchile/bert-base-spanish-wwm-cased',
                    layers = self.config['layers'], 
                    layer_mean = self.config['layer_mean'], 
                    subtoken_pooling = self.config['subtoken_pooling']))

        embeddings: StackedEmbeddings = StackedEmbeddings(embeddings = embedding_types)
        return embeddings
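The method above only reads a handful of keys from self.config; a hedged example of the dictionary it expects (key names come from the code, values and the word-embeddings path are illustrative):

# Illustrative config for create_embeddings(); only the keys referenced above are shown.
config = {
    'use_word_embeddings': False,
    'word_embeddings_path': 'embeddings/es_word2vec.vec',  # hypothetical path
    'use_char_embeddings': True,
    'use_flair_embeddings': True,
    'use_beto_embeddings': True,
    'layers': '-1',
    'layer_mean': True,
    'subtoken_pooling': 'first',
}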
Example #7
    def __init__(self, *embeddings: str):
        print("May need a couple moments to instantiate...")
        self.embedding_stack = []

        # Load correct Embeddings module
        for model_name_or_path in embeddings:
            if ("flair" in model_name_or_path
                    or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.embedding_stack.append(
                    FlairEmbeddings(model_name_or_path))
            else:
                try:
                    self.embedding_stack.append(
                        WordEmbeddings(model_name_or_path))
                except ValueError:
                    try:
                        self.embedding_stack.append(
                            TransformerWordEmbeddings(model_name_or_path))
                    except ValueError:
                        raise ValueError(
                            f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                        )

        assert len(self.embedding_stack) != 0
        self.stacked_embeddings = StackedEmbeddings(
            embeddings=self.embedding_stack)
Example #8
def train():
    generate_datasets()
    DATA_FOLDER = '../content/data'
    # MAX_TOKENS = 500
    columns = {0: 'text', 1: 'pos', 2: 'tag'}

    data_folder = DATA_FOLDER

    corpus: Corpus = ColumnCorpus(data_folder, columns,
                                  train_file='train-labelled.txt',
                                  test_file='dev-labelled.txt',
                                  in_memory=False)
    # corpus._train = [x for x in corpus.train if len(x) < MAX_TOKENS]
    # corpus._test = [x for x in corpus.test if len(x) < MAX_TOKENS]

    tag_type = 'tag'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    print(tag_dictionary)
    embeddings = TransformerWordEmbeddings('roberta-base', layers='-4', fine_tune=True)

    tagger: SequenceTagger = SequenceTagger(hidden_size=128,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            # dropout=0.3334816033039888,
                                            use_crf=True)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train('resources/taggers/task-TC',
                  learning_rate=0.2,
                  mini_batch_size=64,
                  max_epochs=100,
                  embeddings_storage_mode='gpu')
Example #9
    def build_embedding(self, lang, embedding_codes: List[str]) -> None:

        self.tic = time.time()
        self.embedding_name: str = "-".join(embedding_codes)
        self.lang = lang

        embedding_types: List[TokenEmbeddings] = []

        for code in embedding_codes:

            code = code.lower()
            assert code in [
                "bpe",
                "bert",
                "flair",
                "ft",
                "char",
                "ohe",
                "elmo",
            ], f"{code} - Invalid embedding code"

            if code == "ohe":
                embedding_types.append(OneHotEmbeddings(corpus=self.corpus))
            elif code == "ft":
                embedding_types.append(WordEmbeddings(self.lang))
            elif code == "bpe":
                embedding_types.append(BytePairEmbeddings(self.lang))
            elif code == "bert":
                embedding_types.append(
                    TransformerWordEmbeddings(
                        model=self.huggingface_ref[self.lang],
                        pooling_operation="first",
                        layers="-1",
                        fine_tune=False,
                    )
                )
            elif code == "char":
                embedding_types.append(CharacterEmbeddings())
            elif code == "flair":
                embedding_types.append(FlairEmbeddings(f"{self.lang}-forward"))
                embedding_types.append(FlairEmbeddings(f"{self.lang}-backward"))
            elif code == "elmo":
                embedding_types.append(
                    ELMoEmbeddings(model="large", embedding_mode="all")
                )

        self.embedding: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types
        )

        self.tagger: SequenceTagger = SequenceTagger(
            hidden_size=256,
            embeddings=self.embedding,
            tag_dictionary=self.tag_dictionary,
            tag_type=self.tag_type,
            use_crf=True,
        )

        self.trainer: ModelTrainer = ModelTrainer(self.tagger, self.corpus)
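A hedged usage sketch (the surrounding object is assumed to already hold self.corpus, self.tag_dictionary, self.tag_type and a self.huggingface_ref mapping; 'builder' and the values below are illustrative):

# Stack fastText, byte-pair and Flair embeddings for Spanish, then train the tagger.
builder.build_embedding(lang="es", embedding_codes=["ft", "bpe", "flair"])
builder.trainer.train("resources/taggers/es-ner", max_epochs=10)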
Example #10
    def train(self, training_dir=None):
        from flair.trainers import ModelTrainer

        if training_dir is None:
            training_dir = flair_splitter_dep_dir

        # define columns
        columns = {0: "text", 1: "ner"}

        # this is the folder in which train, test and dev files reside
        data_folder = flair_splitter_dep_dir + "data"

        # init a corpus using column format, data folder and the names of the train, dev and test files
        # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
        corpus: Corpus = ColumnCorpus(
            data_folder,
            columns,
            train_file="sent_train.txt",
            test_file="sent_test.txt",
            dev_file="sent_dev.txt",
            document_separator_token="-DOCSTART-",
        )

        print(corpus)

        tag_type = "ner"
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # initialize embeddings
        embedding_types = [
            # WordEmbeddings('glove'),
            # comment in this line to use character embeddings
            CharacterEmbeddings(),
            # comment in these lines to use flair embeddings
            #FlairEmbeddings("news-forward"),
            #FlairEmbeddings("news-backward"),
            # BertEmbeddings('distilbert-base-cased')
            TransformerWordEmbeddings('google/electra-base-discriminator')
        ]

        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=128,
            embeddings=embeddings,
            tag_dictionary=tag_dictionary,
            tag_type=tag_type,
            use_crf=True,
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(training_dir,
                      learning_rate=0.1,
                      mini_batch_size=16,
                      max_epochs=50)
        self.model = tagger
Example #11
class WeVectorizer:
    def __init__(self, op_relations, vectorizer='spacy'):
        if vectorizer == 'spacy':
            self.vectorizer = en_core_web_md.load()
        else:
            self.vectorizer = TransformerWordEmbeddings('roberta-base')
        # the flair path (vectorizer_data) and the spaCy path (_vectorizer_data) use different APIs
        self.vectors = (self._vectorizer_data(op_relations)
                        if vectorizer == 'spacy' else self.vectorizer_data(op_relations))

    def _vectorizer_data(self, relations):
        vecs = []
        for sent_id, per_cand, org_cand, sent_raw in tqdm(relations):
            sent = sent_raw.strip("().\n")
            org = org_cand['text']
            per = per_cand['text']
            sent_clean = sent.replace(org, "").replace(per, "")
            vecs.append(self.vec_sent(sent_clean, per, org))
        vecs = np.array(vecs)
        return vecs

    def vectorizer_data(self, relations):
        vecs = []
        for sent_id, per_cand, org_cand, sent_raw in tqdm(relations):
            sent = sent_raw.strip("().\n")
            sent = Sentence(sent)
            self.vectorizer.embed(sent)
            vecs.append(sent[0].embedding.cpu().detach().numpy())
        vecs = np.array(vecs)
        return vecs

    def vec_sent(self, sent, per_candidate, org_candidate):
        toks = [
            t for t in self.vectorizer(sent)
            if not any([t.is_space, t.is_punct, t.is_stop, t.is_currency])
            and t.has_vector
        ]
        sent_vecs = np.array([t.vector for t in toks]).mean(axis=0)
        per_vec = self.vectorize_ent(per_candidate)
        org_vec = self.vectorize_ent(org_candidate)
        res = np.concatenate([sent_vecs, per_vec, org_vec])
        return res

    def vectorize_ent(self, org_candidate):
        return np.array([t.vector
                         for t in self.vectorizer(org_candidate)]).mean(axis=0)
Example #12
def test_train_load_use_classifier(results_base_path, tasks_base_path):
    corpus = ColumnCorpus(
        data_folder=tasks_base_path / "conllu",
        train_file="train.conllup",
        dev_file="train.conllup",
        test_file="train.conllup",
        column_format={
            1: "text",
            2: "pos",
            3: "ner"
        },
    )

    relation_label_dict = corpus.make_label_dictionary(label_type="relation")

    embeddings = TransformerWordEmbeddings()

    model: RelationExtractor = RelationExtractor(
        embeddings=embeddings,
        label_dictionary=relation_label_dict,
        label_type="relation",
        entity_label_type="ner",
        train_on_gold_pairs_only=True,
    )

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(model, corpus)

    trainer.train(
        results_base_path,
        learning_rate=0.1,
        mini_batch_size=2,
        max_epochs=3,
        shuffle=False,
    )

    del trainer, model, relation_label_dict, corpus

    loaded_model: RelationExtractor = RelationExtractor.load(
        results_base_path / "final-model.pt")
    loaded_model.train_on_gold_pairs_only = False

    sentence = Sentence(
        ["Apple", "was", "founded", "by", "Steve", "Jobs", "."])
    for token, tag in zip(sentence.tokens,
                          ["B-ORG", "O", "O", "O", "B-PER", "I-PER", "O"]):
        token.set_label("ner", tag)

    loaded_model.predict(sentence)

    assert "founded_by" == sentence.get_labels("relation")[0].value

    # loaded_model.predict([sentence, sentence_empty])
    # loaded_model.predict([sentence_empty])

    del loaded_model
Example #13
def train_model(directory='Data', use_BERT=True):
    # define columns
    columns = {
        0: 'ID',
        1: 'text',
        2: 'empty_0',
        3: 'pos',
        4: 'empty_1',
        5: 'empty_2',
        6: 'empty_3',
        7: 'empty_4',
        8: 'empty_5',
        9: 'tox'
    }

    # this is the folder in which train, test and dev files reside
    data_folder = directory

    # init a corpus using column format, data folder and the names of the train, dev and test files
    corpus: Corpus = ColumnCorpus(data_folder,
                                  columns,
                                  train_file='converted_data_train.conll',
                                  test_file='converted_data_test.conll',
                                  dev_file='converted_data_dev.conll')

    # tag to predict
    tag_type = 'tox'
    tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

    # embeddings
    if use_BERT:
        bert_embeddings = [
            TransformerWordEmbeddings('bert-large-uncased', fine_tune=True)
        ]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=bert_embeddings)
    else:
        embedding_types = [WordEmbeddings('glove')]
        embeddings: StackedEmbeddings = StackedEmbeddings(
            embeddings=embedding_types)

    # initialize sequence tagger
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type,
                                            use_crf=True)

    # initialize trainer
    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    # start training
    trainer.train('resources/taggers/toxic_classifier_bert',
                  learning_rate=0.1,
                  mini_batch_size=32,
                  max_epochs=5)
Example #14
def test_transformer_word_embeddings_forward_language_ids():
    cos = torch.nn.CosineSimilarity(dim=0, eps=1e-10)

    sent_en = Sentence(["This", "is", "a", "sentence"], language_code="en")
    sent_de = Sentence(["Das", "ist", "ein", "Satz"], language_code="de")

    embeddings = TransformerWordEmbeddings("xlm-mlm-ende-1024",
                                           allow_long_sentences=False)

    embeddings.embed([sent_de, sent_en])
    expected_similarities = [
        0.7102344036102295, 0.7598986625671387, 0.7437312602996826,
        0.5584433674812317
    ]

    for (token_de, token_en, exp_sim) in zip(sent_de, sent_en,
                                             expected_similarities):
        sim = cos(token_de.embedding, token_en.embedding).item()
        assert abs(exp_sim - sim) < 1e-5
Example #15
def test_sequence_tagger_transformer_finetune(results_base_path,
                                              tasks_base_path):
    flair.set_seed(123)

    # load dataset
    corpus: Corpus = ColumnCorpus(
        data_folder=tasks_base_path / "trivial" / "trivial_bioes",
        column_format={
            0: "text",
            1: "ner"
        },
    )
    tag_dictionary = corpus.make_label_dictionary("ner")

    # tagger without CRF
    tagger: SequenceTagger = SequenceTagger(
        hidden_size=64,
        embeddings=TransformerWordEmbeddings("distilbert-base-uncased",
                                             fine_tune=True),
        tag_dictionary=tag_dictionary,
        tag_type="ner",
        use_crf=False,
        use_rnn=False,
        reproject_embeddings=False,
    )

    # train
    trainer = ModelTrainer(tagger, corpus)
    trainer.fine_tune(
        results_base_path,
        mini_batch_size=2,
        max_epochs=10,
        shuffle=True,
        learning_rate=0.5e-4,
    )

    loaded_model: SequenceTagger = SequenceTagger.load(results_base_path /
                                                       "final-model.pt")

    sentence = Sentence("this is New York")
    sentence_empty = Sentence("       ")

    loaded_model.predict(sentence)
    loaded_model.predict([sentence, sentence_empty])
    loaded_model.predict([sentence_empty])

    # check if loaded model can predict
    entities = [span.text for span in sentence.get_spans("ner")]
    assert "New York" in entities

    # check if loaded model successfully fit the training data
    result: Result = loaded_model.evaluate(corpus.test, gold_label_type="ner")
    assert result.classification_report["micro avg"]["f1-score"] == 1.0

    del loaded_model
Example #16
def trainNER(data_dir, model_dir):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default='bert-base-cased',
                        type=str,
                        required=True,
                        help="The pretrained model to produce embeddings")
    args = parser.parse_args()
    model = args.model

    # pdb.set_trace()
    try:
        corpus: Corpus = CONLL_03(base_path=data_dir + '/')
    except FileNotFoundError:
        columns = {0: 'text', 1: 'ner'}
        corpus: Corpus = ColumnCorpus(data_dir, columns)
    corpus.filter_empty_sentences()
    tag_type = 'ner'
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tag_dictionary = corpus.make_label_dictionary('ner')
    print(tag_dictionary.get_items())
    stats = corpus.obtain_statistics()
    print(stats)
    # ['<unk>', 'O', 'B-DEVICE', 'I-DEVICE', 'B-TREE', 'I-TREE', 'B-APPLICATION', 'I-APPLICATION', 'B-LOCATION', 'I-LOCATION', '<START>', '<STOP>']
    # pdb.set_trace()

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        TransformerWordEmbeddings(
            model=model,
            layers='0',  # dtype: str
            pooling_operation='first_last',
            use_scalar_mix=False,
            batch_size=16,
            fine_tune=False,
            allow_long_sentences=False)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # biLSTM + CRF
    # tagger: SequenceTagger = SequenceTagger(hidden_size=256,
    #                                         embeddings=embeddings,
    #                                         tag_dictionary=tag_dictionary,
    #                                         tag_type=tag_type)

    model_path = '/home/carolyn/Projects/mygit/Flair-NER/exprmt-20201120/conll_frac/10ptdata/models-5e-20201124/final-model.pt'
    tagger: SequenceTagger = SequenceTagger.load(model_path)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_dir, train_with_dev=False, max_epochs=10)  # 150
Example #17
def test_transformers_keep_tokenizer_when_saving(results_base_path):
    embeddings = TransformerWordEmbeddings(
        "sentence-transformers/paraphrase-albert-small-v2")
    results_base_path.mkdir(exist_ok=True, parents=True)
    initial_tagger_path = results_base_path / "initial_tokenizer.pk"
    reloaded_tagger_path = results_base_path / "reloaded_tokenizer.pk"

    initial_tagger = SequenceTagger(embeddings, Dictionary(), "ner")

    initial_tagger.save(initial_tagger_path)
    reloaded_tagger = SequenceTagger.load(initial_tagger_path)

    reloaded_tagger.save(reloaded_tagger_path)
Example #18
def main(directory, embeddings, strategy):
    # 1. find corpora in data directory
    corpora = {"train": None, "dev": None, "test": None}
    for labelset in corpora:
        for file in sorted(os.listdir(directory)):
            if infer_split(file) == labelset:
                corpora[labelset] = pd.read_csv(
                    os.path.join(directory, file),
                    sep="\t",
                    names=["text", "pos", "lemma", "label"],
                    engine="python",
                    error_bad_lines=False,
                    quoting=csv.QUOTE_NONE).fillna("")
                break

    if embeddings == "elmo":
        embedder = ELMoEmbeddings("original")
    elif embeddings == "flair":
        embedder = FlairEmbeddings("news-forward")
    elif embeddings == "bert":
        embedder = TransformerWordEmbeddings('bert-base-cased')

    embeddings_dir = os.path.join(directory, embeddings + "_embeddings")
    if not os.path.exists(embeddings_dir):
        os.makedirs(embeddings_dir, exist_ok=True)

    strategy = {"mean": np.mean, "max": np.max, "sum": np.sum}.get(strategy)

    for labelset, corpus in corpora.items():
        if corpus is None:
            print(f"empty corpus: {labelset}")
            continue
        voc = sorted(corpus["text"].unique())
        print(f"Unique tokens: {len(voc)}")

        with open(os.path.join(embeddings_dir, labelset + ".w2v"), "w") as f:
            for word in voc:
                sentence = Sentence(word)
                if len(sentence) == 0:
                    continue
                embedder.embed(sentence)
                token_embedding = strategy(
                    [token.embedding.cpu().numpy() for token in sentence],
                    axis=0)
                f.write(
                    word + " " +
                    " ".join([str(num)
                              for num in token_embedding.tolist()]) + '\n')
Example #19
 def __init__(self, config):
     """
     Load pretrained language model
     """
     super(LanguageModel, self).__init__()
     embeddings_stack = []
     transformers = config.get("language_model", "transformers")
     if transformers != "":
         transformers = transformers.split(";")
         for model in transformers:
             embeddings_stack.append(
                 TransformerWordEmbeddings(
                     model,
                     layers="-1",
                     pooling_operation='mean',
                     # use_scalar_mix=True,
                     fine_tune=True))
     word_embeddings = config.get("language_model", "word_embeddings")
     if word_embeddings != "":
         word_embeddings = word_embeddings.split(";")
         for model in word_embeddings:
             embeddings_stack.append(WordEmbeddings(model))
     flair_embeddings = config.get("language_model", "flair_embeddings")
     if flair_embeddings != "":
         flair_embeddings = flair_embeddings.split(";")
         for model in flair_embeddings:
             embeddings_stack.append(FlairEmbeddings(model, fine_tune=True))
     character_embeddings = config.get("language_model",
                                       "character_embeddigs")
     if character_embeddings.lower() == "yes":
         embeddings_stack.append(CharacterEmbeddings())
     bytepair_embeddings = config.get("language_model",
                                      "bytepair_embeddings")
     if bytepair_embeddings.lower() == "yes":
         embeddings_stack.append(BytePairEmbeddings())
     custom_embeddings = config.get("language_model", "custom_embeddings")
     if custom_embeddings != "":
         custom_embeddings = custom_embeddings.split(";")
         for path in custom_embeddings:
             embeddings_stack.append(WordEmbeddings(path))
     self.lm = StackedEmbeddings(embeddings_stack)
     self.embedding_dim = self.lm.embedding_length
     self.dropout = torch.nn.Dropout(
         float(config.get("language_model", "dropout")))
     self.classify = torch.nn.Linear(self.embedding_dim, 2)
     if config.get("language_model", "relu") == "yes":
         self.relu = torch.nn.ReLU()
Example #20
def trainNER(data_dir, model_dir):
    parser = argparse.ArgumentParser()
    parser.add_argument("--model",
                        default='bert-base-cased',
                        type=str,
                        required=True,
                        help="The pretrained model to produce embeddings")
    args = parser.parse_args()
    model = args.model
    columns = {0: 'text', 1: 'ner'}
    # pdb.set_trace()
    # print(data_dir + '/eng.train')
    corpus: Corpus = ColumnCorpus(data_dir, columns)
    corpus.filter_empty_sentences()
    tag_type = 'ner'
    # tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
    tag_dictionary = corpus.make_label_dictionary('ner')
    print(tag_dictionary.get_items())
    stats = corpus.obtain_statistics()
    print(stats)
    # ['<unk>', 'O', 'B-DEVICE', 'I-DEVICE', 'B-TREE', 'I-TREE', 'B-APPLICATION', 'I-APPLICATION', 'B-LOCATION', 'I-LOCATION', '<START>', '<STOP>']
    # pdb.set_trace()

    embedding_types: List[TokenEmbeddings] = [
        WordEmbeddings('glove'),
        TransformerWordEmbeddings(
            model=model,
            layers='0',  # dtype: str
            pooling_operation='first_last',
            use_scalar_mix=False,
            batch_size=16,
            fine_tune=False,
            allow_long_sentences=False)
    ]
    embeddings: StackedEmbeddings = StackedEmbeddings(
        embeddings=embedding_types)

    # biLSTM + CRF
    tagger: SequenceTagger = SequenceTagger(hidden_size=256,
                                            embeddings=embeddings,
                                            tag_dictionary=tag_dictionary,
                                            tag_type=tag_type)

    trainer: ModelTrainer = ModelTrainer(tagger, corpus)

    trainer.train(model_dir, train_with_dev=True, max_epochs=10)  # 150
Example #21
def _get_embedding_model(
    model_name_or_path: str
) -> Union[FlairEmbeddings, WordEmbeddings, TransformerWordEmbeddings,
           Sentence]:
    "Load the proper `Embeddings` model from `model_name_or_path`"
    if ("flair" in model_name_or_path
            or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
        return FlairEmbeddings(model_name_or_path)
    else:
        try:
            return WordEmbeddings(model_name_or_path)
        except ValueError:
            try:
                return TransformerWordEmbeddings(model_name_or_path)
            except ValueError:
                raise ValueError(
                    f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                )
Example #22
    def handle(self, *args, **options):
        file = options.get('file') or 'annotated_sentences'
        model_folder = options.get('model_folder') or 'model-var'
        columns = {0: 'text', 1: 'var'}
        data_folder = 'data/txt'

        corpus = ColumnCorpus(data_folder, columns,
                              train_file=f'{file}.txt')
        
        tag_type = 'var'

        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

        embedding_types = [
            WordEmbeddings('glove'),

            # comment in this line to use character embeddings
            # CharacterEmbeddings(),

            # comment in these lines to use flair embeddings
            # FlairEmbeddings('news-forward'),
            # FlairEmbeddings('news-backward'),
            TransformerWordEmbeddings('bert-base-uncased'),
        ]

        embeddings = StackedEmbeddings(embeddings=embedding_types)

        tagger = SequenceTagger(hidden_size=256,
                                embeddings=embeddings,
                                tag_dictionary=tag_dictionary,
                                tag_type=tag_type,
                                use_crf=True)

        trainer = ModelTrainer(tagger, corpus)

        trainer.train(f'data/models/taggers/{model_folder}',
                      learning_rate=0.1,
                      mini_batch_size=32,
                      max_epochs=150)

        
        self.stdout.write(self.style.SUCCESS('Successfully trained model on dataset file.'))
Example #23
def EmbeddingFactory(parameters, corpus):
    from flair.embeddings import FlairEmbeddings, StackedEmbeddings, \
        WordEmbeddings, OneHotEmbeddings, CharacterEmbeddings, TransformerWordEmbeddings

    stack = []
    for emb in parameters.embedding.split():
        if any((spec in emb) for spec in ("bert", "gpt", "xlnet")):
            stack.append(
                TransformerWordEmbeddings(model=pretrainedstr(
                    emb, parameters.language),
                                          fine_tune=parameters.tune_embedding))
        elif emb == "flair":
            stack += [
                FlairEmbeddings(f"{parameters.language}-forward",
                                fine_tune=parameters.tune_embedding),
                FlairEmbeddings(f"{parameters.language}-backward",
                                fine_tune=parameters.tune_embedding)
            ]
        elif emb == "pos":
            stack.append(
                OneHotEmbeddings(corpus,
                                 field="pos",
                                 embedding_length=parameters.pos_embedding_dim,
                                 min_freq=1))
        elif emb == "fasttext":
            stack.append(WordEmbeddings(parameters.language))
        elif emb == "word":
            stack.append(
                OneHotEmbeddings(
                    corpus,
                    field="text",
                    embedding_length=parameters.word_embedding_dim,
                    min_freq=parameters.word_minfreq))
        elif emb == "char":
            stack.append(
                CharacterEmbeddings(
                    char_embedding_dim=parameters.char_embedding_dim,
                    hidden_size_char=parameters.char_bilstm_dim))
        else:
            raise NotImplementedError()
    return StackedEmbeddings(stack)
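A hedged usage sketch of EmbeddingFactory (the corpus is assumed to be a flair Corpus built elsewhere; the parameter values below are illustrative):

from types import SimpleNamespace

params = SimpleNamespace(
    embedding="flair char",        # space-separated embedding codes
    language="en",
    tune_embedding=False,
    char_embedding_dim=25,
    char_bilstm_dim=25,
)
stacked = EmbeddingFactory(params, corpus)  # corpus: a flair Corpus built elsewhere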
Example #24
    def embed_text(
        self,
        text: Union[List[Sentence], Sentence, List[str], str],
        model_name_or_path: str = "bert-base-cased",
    ) -> List[Sentence]:
        """Produces embeddings for text

        * **text** - Text input, it can be a string or any of Flair's `Sentence` input formats
        * **model_name_or_path** - The hosted model name key or model path
        **return** - A list of Flair's `Sentence`s
        """
        # Convert into sentences
        if isinstance(text, str):
            sentences = Sentence(text)
        elif isinstance(text, list) and all(isinstance(t, str) for t in text):
            sentences = [Sentence(t) for t in text]
        else:
            sentences = text

        # Load correct Embeddings module
        if not self.models[model_name_or_path]:
            if ("flair" in model_name_or_path
                    or model_name_or_path in FLAIR_PRETRAINED_MODEL_NAMES):
                self.models[model_name_or_path] = FlairEmbeddings(
                    model_name_or_path)
            else:
                try:
                    self.models[model_name_or_path] = WordEmbeddings(
                        model_name_or_path)
                except ValueError:
                    try:
                        self.models[
                            model_name_or_path] = TransformerWordEmbeddings(
                                model_name_or_path)
                    except ValueError:
                        raise ValueError(
                            f"Embeddings not found for the model key: {model_name_or_path}, check documentation or custom model path to verify specified model"
                        )
                        return Sentence("")
        embedding = self.models[model_name_or_path]
        return embedding.embed(sentences)
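A hedged usage sketch, assuming the method lives on a wrapper object whose self.models cache returns a falsy value for unseen keys (e.g. a defaultdict); 'embedder' below stands for such an instance:

sentences = embedder.embed_text(["I love Berlin", "Flair is easy to use"],
                                model_name_or_path="bert-base-cased")
for sentence in sentences:
    print(sentence[0].text, sentence[0].embedding.shape)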
Example #25
def test_transformer_weird_sentences():

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           use_scalar_mix=True)

    sentence = Sentence("Hybrid mesons , qq ̄ states with an admixture")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("typical proportionalities of ∼ 1nmV − 1 [ 3,4 ] .")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768
Example #26
def load_model(bert=None, document=False, flair=False):
    """Load word embeddings model."""
    if bert == 'bio':
        # https://github.com/flairNLP/flair/issues/1085
        # also see readme for instructions
        bertpath = './bert/bert-base-biobert-cased'
    elif bert == 'sci':
        # https://github.com/flairNLP/flair/issues/744
        # https://github.com/flairNLP/flair/issues/1239
        bertpath = './bert/scibert_scivocab_uncased'
    else:
        bertpath = 'bert-base-uncased'

    if document and not flair:
        bert_embedding = TransformerDocumentEmbeddings(model=bertpath,
                                                       batch_size=4)
        return bert_embedding

    bert_embedding = TransformerWordEmbeddings(model=bertpath,
                                               pooling_operation='first',
                                               batch_size=4)

    if flair:
        flair_embedding_forward = FlairEmbeddings('en-forward')
        flair_embedding_backward = FlairEmbeddings('en-backward')
        embed_arr = [
            bert_embedding,
            flair_embedding_backward,
            flair_embedding_forward,
        ]
    else:
        embed_arr = [bert_embedding]

    if document:
        document_embeddings = DocumentPoolEmbeddings(
            embed_arr, fine_tune_mode='nonlinear')
    else:
        document_embeddings = StackedEmbeddings(embed_arr)

    return document_embeddings
Example #27
    def train(self, training_dir=None):
        from flair.trainers import ModelTrainer

        if training_dir is None:
            training_dir = script_dir + "flair" + os.sep

        # define columns
        columns = {0: "text", 1: "ner"}

        # this is the folder in which train, test and dev files reside
        data_folder = training_dir + "data"

        # init a corpus using column format, data folder and the names of the train, dev and test files
        # note that training data should be unescaped, i.e. tokens like "&", not "&amp;"
        corpus: Corpus = ColumnCorpus(
            data_folder,
            columns,
            train_file="sent_train.txt",
            test_file="sent_test.txt",
            dev_file="sent_dev.txt",
        )

        print(corpus)

        tag_type = "ner"
        tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)
        print(tag_dictionary)

        # initialize embeddings
        embeddings: TransformerWordEmbeddings = TransformerWordEmbeddings('onlplab/alephbert-base')

        tagger: SequenceTagger = SequenceTagger(
            hidden_size=128, embeddings=embeddings, tag_dictionary=tag_dictionary, tag_type=tag_type, use_crf=True,
        )

        trainer: ModelTrainer = ModelTrainer(tagger, corpus)

        trainer.train(training_dir, learning_rate=0.1, mini_batch_size=32, max_epochs=50)
        self.model = tagger
Example #28
def test_transformer_word_embeddings():

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='-1,-2,-3,-4',
                                           layer_mean=False)

    sentence: Sentence = Sentence("I love Berlin")
    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 3072

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=False)

    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 5376

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0
    del embeddings

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=True)

    embeddings.embed(sentence)

    for token in sentence.tokens:
        assert len(token.get_embedding()) == 768

        token.clear_embeddings()

        assert len(token.get_embedding()) == 0
    del embeddings
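These expected sizes follow from DistilBERT's hidden width of 768: concatenating four layers gives 4 × 768 = 3072; layers='all' concatenates the embedding layer plus the six transformer layers, i.e. 7 × 768 = 5376; and layer_mean=True averages the layers instead of concatenating them, bringing the per-token size back to 768.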
Example #29
def test_transformer_weird_sentences():

    embeddings = TransformerWordEmbeddings('distilbert-base-uncased',
                                           layers='all',
                                           layer_mean=True)

    sentence = Sentence("Hybrid mesons , qq Ì„ states with an admixture")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence(
        "typical proportionalities of ∼ 1nmV − 1 [ 3,4 ] .")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟 🤟  🤟 hüllo")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟hallo 🤟 🤟 🤟 🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟hallo 🤟 🤟 🤟 🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟")
    embeddings.embed(sentence)
    for token in sentence:
        assert len(token.get_embedding()) == 768

    sentence = Sentence("🤟")
    sentence_2 = Sentence("second sentence")
    embeddings.embed([sentence, sentence_2])
    for token in sentence:
        assert len(token.get_embedding()) == 768
    for token in sentence_2:
        assert len(token.get_embedding()) == 768
Example #30
from preprocessing.normalize import normalize
from utility.frequency_loader import load_frequencies, load_doc_frequencies
from utility.run_experiment import run_experiment
import os

if not os.path.exists(IMAGE_PATH):
    os.makedirs(IMAGE_PATH)

sick_all, sick_train, sick_test, sick_dev = download_and_load_sick_dataset()
print('Downloaded data')

frequency = load_frequencies("data/frequencies/frequencies.tsv")
doc_frequency = load_doc_frequencies("data/frequencies/doc_frequencies.tsv")
word2vec = load_word2vec(w2v_path)
elmo = ELMoEmbeddings('large')
bert = TransformerWordEmbeddings('bert-large-cased')
flair = StackedEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward')
])
elmo_bert = StackedEmbeddings([elmo, bert])

print("Loaded Resources")

benchmarks = [("AVG-W2V",
               ft.partial(run_avg_benchmark,
                          model=word2vec,
                          use_stoplist=False)),
              ("AVG-ELMO",
               ft.partial(run_context_avg_benchmark,