Example 1
# imports implied by the rest of the snippet
import sys

import syntok.segmenter as segmenter
from docx import Document
from torch.nn import CosineSimilarity
from flair.embeddings import DocumentPoolEmbeddings, WordEmbeddings

document = Document()  ## Create a python-docx document
cos = CosineSimilarity(dim=1, eps=1e-6)

sent_level = False
dynamic = True
graph = False
doc_embeddings = []
scores = []

stacked_embeddings = DocumentPoolEmbeddings([
    WordEmbeddings('en'),
    #WordEmbeddings('glove'),
    #WordEmbeddings('extvec'),#ELMoEmbeddings('original'),
    #BertEmbeddings('bert-base-cased'),
    #FlairEmbeddings('news-forward-fast'),
    #FlairEmbeddings('news-backward-fast'),
    #OpenAIGPTEmbeddings()
    #TransformerXLEmbeddings()
])  #, mode='max')


def set_card():
    print("Input the Card Text, press Ctrl-D to end text entry")
    card = sys.stdin.read()  #input("Input the Card Text: ")
    card_tag = input(
        "Input the card_tag, or a -1 to summarize in-terms of the card itself: "
    )
    card = str(card)
    if str(
            card_tag
Example 2
        "n_estimators": [50, 100, 200],
        "max_depth": [15, 25, 35]
    }),
    "Logistic Regression": (LogisticRegression(solver="saga",
                                               multi_class="multinomial"), {
                                                   "penalty": ["l2", "l1"]
                                               }),
    "Naive Bayes": (MultinomialNB(), {
        "alpha": [0.15, 0.25, 0.5, 0.65],
    }),
    "SVM": (SGDClassifier(loss='hinge', alpha=0.001, random_state=42), {
        "alpha": [0.0005, 0.001, 0.005, 0.01],
    }),
}

_EMBEDDER = DocumentPoolEmbeddings([WordEmbeddings("glove")], pooling="mean")

_INPUT_OPTIONS = {
    "TFIDF": (False, False),
    "GloveExtraFeatures": (True, True),
    "Glove": (True, False),
}


def read_data(set_):
    return np.load(_DATA_PATH / f"data_{set_}", allow_pickle=True)


def preprocess(X,
               lem=True,
               stem=True,
Example 3

import csv
import pickle
import torch

from flair.data import Sentence
from flair.embeddings import BertEmbeddings, DocumentPoolEmbeddings

print(torch.cuda.get_device_name(torch.cuda.current_device()))
print(torch.cuda.is_available())

src = open("./labeled_news.reversed.csv", 'r', newline='', encoding="utf-8")
src_reader = csv.reader(src, delimiter=",", quotechar="|")
dst = open("./4bert_vectors.pickle", "wb")

# init embedding
embedding = BertEmbeddings('bert-base-multilingual-cased')
document_embeddings = DocumentPoolEmbeddings([embedding])


def getBertVector(text):
    # create a sentence
    # print(text)
    sentence = Sentence(text)

    # embed words in sentence
    document_embeddings.embed(sentence)

    # print(sentence.get_embedding().detach().numpy())

    return sentence.get_embedding().detach().numpy()
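
The snippet above is cut off before the read/write loop; a minimal sketch of how these pieces might be tied together (assuming the text to embed sits in the first CSV column, which is not shown in the original):

vectors = []
for row in src_reader:
    if not row:
        continue
    vectors.append((row, getBertVector(row[0])))  # assumption: the text is in column 0

pickle.dump(vectors, dst)
dst.close()
src.close()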
Example 4
class FlairTextEncoder(BaseTorchEncoder):
    """
    Encode an array of strings of size `B` into an ndarray of size `B x D`

    The ndarray potentially is BatchSize x (Channel x Height x Width)

    Internally, :class:`FlairTextEncoder` wraps the DocumentPoolEmbeddings from Flair.

    :param embeddings: the name of the embeddings. Supported models include
        - ``word:[ID]``: the classic word embedding model, the ``[ID]`` are listed at
        https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
        - ``flair:[ID]``: the contextual embedding model, the ``[ID]`` are listed at
        https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        - ``pooledflair:[ID]``: the pooled version of the contextual embedding model,
        the ``[ID]`` are listed at
        https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        - ``byte-pair:[ID]``: the subword-level embedding model, the ``[ID]`` are listed at
        https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md
        - ``Example``: ('word:glove', 'flair:news-forward', 'flair:news-backward')

    :param pooling_strategy: the strategy to merge the word embeddings into the chunk embedding.
        Supported strategies include ``mean``, ``min``, ``max``.
    """

    def __init__(self,
                 embeddings: Union[Tuple[str], List[str]] = ('word:glove', ),
                 pooling_strategy: str = 'mean',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.embeddings = embeddings
        self.pooling_strategy = pooling_strategy
        self.max_length = -1  # reserved variable for future usages
        self._post_set_device = False

    def post_init(self):
        """
        Load model.

        Possible models are:
        - flair
        - pooledflair
        - word
        - byte-pair
        """
        import flair
        flair.device = self.device
        embeddings_list = []
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    from flair.embeddings import FlairEmbeddings
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    from flair.embeddings import PooledFlairEmbeddings
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    from flair.embeddings import WordEmbeddings
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    from flair.embeddings import BytePairEmbeddings
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error(f'embedding not found: {e}')
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            from flair.embeddings import DocumentPoolEmbeddings
            self.model = DocumentPoolEmbeddings(embeddings_list, pooling=self.pooling_strategy)
            self.logger.info(f'flair encoder initialized with embeddings: {self.embeddings}')
        else:
            self.logger.error('flair encoder initialization failed.')

    def encode(self, content: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        Encode ``Document`` content from an array of strings of size `B` into an ndarray of size `B x D`.

        :param content: a 1-dimensional array of strings of size `B`
        :return: an ndarray of size `B x D`
        """
        from flair.data import Sentence
        c_batch = [Sentence(row) for row in content]
        self.model.embed(c_batch)
        result = [self.tensor2array(c_text.embedding) for c_text in c_batch]
        return np.vstack(result)

    def tensor2array(self, tensor):
        if isinstance(tensor, np.ndarray):
            return tensor
        return tensor.cpu().numpy() if self.on_gpu else tensor.numpy()
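
A rough usage sketch for the encoder above. In a real Jina setup `post_init` is called by the framework and `BaseTorchEncoder` supplies `device`, `logger` and `on_gpu`; calling it by hand here is only for illustration:

import numpy as np

encoder = FlairTextEncoder(embeddings=('word:glove',), pooling_strategy='mean')
encoder.post_init()  # loads GloVe into a DocumentPoolEmbeddings model
vectors = encoder.encode(np.array(['hello world', 'flair pools word embeddings']))
print(vectors.shape)  # (2, 100) for the classic 100-dimensional GloVe vectors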
Example 5
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence
import torch.nn as nn

#flair_embedding_forward = FlairEmbeddings('news-forward')
#flair_embedding_backward = FlairEmbeddings('news-backward')

fasttext_embeddings_web = WordEmbeddings('en-crawl')
fasttext_embeddings_news = WordEmbeddings('en-news')

embedding = DocumentPoolEmbeddings(
    [fasttext_embeddings_news, fasttext_embeddings_web])

cos = nn.CosineSimilarity(dim=0)


def _get_embedding(text):
    sentence = Sentence(text)
    embedding.embed(sentence)
    vector = sentence.get_embedding()
    return vector


def _get_cosine_similarity(vec_1, vec_2):
    return round(cos(vec_1, vec_2).item(), 3)


def get_embedding_similarity(sentence_1, sentence_2):
    vec_1 = _get_embedding(sentence_1)
    vec_2 = _get_embedding(sentence_2)
    return _get_cosine_similarity(vec_1, vec_2)
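
For example, the helpers above can be exercised directly (the printed score is only indicative; it depends on the fastText vectors):

score = get_embedding_similarity('The weather is nice today.',
                                 'It is a sunny and pleasant day.')
print(score)  # semantically close sentences should score high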
Example 6

# imports implied by the snippet below
from flair.data import Sentence
from flair.embeddings import (DocumentPoolEmbeddings, TransformerDocumentEmbeddings,
                              TransformerWordEmbeddings)
from sentence_transformers import SentenceTransformer

def generate_embeddings(docs,
                        batch_size,
                        model_name='bert-base-cased',
                        pooling='mean',
                        offset=0):
    """
    Generator function for generating embeddings from strings using a flair model. Takes a list of sentences and
    returns a list tuple. The first element represents failure (0) or success (1 or 2) and
    the second element contains a list of embeddings as numpy arrays if successful, and the indices of the failed batch
    if unsuccessful.
    The first element is 1, if batch_size embeddings were created
    :param docs: a list of strings for which embeddings should be created
    :param batch_size: integer representing how many embeddings should be created at once
    :param model_name: the model for creating the embeddings. Defaults to document embeddings using BERT-Base
    :param pooling: the pooling strategy to generate Document Embeddings
    :param offset: the offset of the integers, for printing out the correct index
    :return: a tuple (success/failure, embeddings/failed_indices)
    """
    rest = len(docs) % batch_size
    model = False
    if pooling == 'mean':
        embedding = TransformerWordEmbeddings(model_name,
                                              layers='-1',
                                              allow_long_sentences=True)
        model = DocumentPoolEmbeddings([embedding], fine_tune_mode='none')
    elif pooling == 'CLS':
        model = TransformerDocumentEmbeddings(model_name)
    if model:
        for i in range(0, len(docs) - rest, batch_size):
            sentences = [
                Sentence(sentence) for sentence in docs[i:i + batch_size]
            ]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            sentences = [Sentence(sentence) for sentence in docs[-rest:]]
            try:
                model.embed(sentences)
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, [
                    sentence.get_embedding().detach().cpu().numpy()
                    for sentence in sentences
                ]
            except RuntimeError:
                yield 0, (len(docs) - rest, 0)
    elif pooling == 'SentenceBert':
        model = SentenceTransformer(model_name)
        for i in range(0, len(docs) - rest, batch_size):
            try:
                embeddings = model.encode(docs[i:i + batch_size])
                print(
                    f'successfully embedded sentences {offset + i} to {offset + i + batch_size-1}'
                )
                yield 1, embeddings
            except RuntimeError:
                print(
                    f'could not embed sentences with index {offset + i} '
                    f'to {offset + i + batch_size-1}\nstoring in failed index list'
                )
                yield 0, (offset + i, offset + i + batch_size - 1)
        if rest:
            try:
                embeddings = model.encode(docs[-rest:])
                print(
                    f'successfully embedded sentences from {len(docs) + offset - rest} to the end'
                )
                yield 1, embeddings
            except RuntimeError:
                yield 0, (len(docs) - rest, 0)
    else:
        raise Exception("No Valid model")
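
A small sketch of how the generator above might be consumed (the document list, batch size and the way failures are collected are assumptions, not part of the original):

docs = ['first example document', 'second example document', 'third example document']
all_embeddings, failed_ranges = [], []
for status, payload in generate_embeddings(docs, batch_size=2, pooling='mean'):
    if status:
        all_embeddings.extend(payload)
    else:
        failed_ranges.append(payload)  # (start_index, end_index) of the failed batch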
Example 7
from WorkforceSentimentMonitoring.data import get_data, merge, drop_wrong_language
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings
import numpy as np
import swifter
import pickle
import os
import pandas as pd

embedder = DocumentPoolEmbeddings([
    FlairEmbeddings('news-forward-fast'),
    FlairEmbeddings('news-backward-fast')
])


def embed(text, embedder):
    sentence = Sentence(text)
    embedder.embed(sentence)
    return sentence.get_embedding().detach().numpy()


if __name__ == '__main__':
    # submission, train, test = get_data()
    # df = merge(submission, train, test)
    # df = drop_wrong_language(df, "review")

    # path = os.path.split(os.path.abspath('__file__'))[0]
    # file = os.path.join(path, 'pickle_files/reviews_eng.p')
    # with open(file, 'wb') as f:
Example 8

import spacy
from textblob import TextBlob
import numpy as np
from multiprocessing import Process
import nltk
import nltk.data
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence, BertEmbeddings
from spacy.gold import biluo_tags_from_offsets
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, euclidean_distances
from sklearn.metrics.pairwise import cosine_similarity as cosinSim

nlp = spacy.load("en_core_web_sm")
embeddings = WordEmbeddings('glove')
EMBEDDING_DIM = 100
document_embeddings = DocumentPoolEmbeddings([embeddings], pooling='max')
tf = TfidfVectorizer(analyzer='word',
                     ngram_range=(1, 1),
                     min_df=1,
                     stop_words='english')


def get_token_offset(span, paragraph, answer_start, answer_end):
    p_doc = nlp(paragraph)
    paragraph_tokens = [token.text for token in p_doc]
    span_doc = nlp(span)
    span_tokens = [token.text for token in span_doc]
    [(start, end)
     ] = [(i, i + len(span_tokens)) for i in range(len(paragraph_tokens))
          if (paragraph_tokens[i] == span_tokens[0]
              and paragraph_tokens[i:i + len(span_tokens)] == span_tokens)]
Example 9

# imports implied by the snippet below
import numpy
from typing import List

from flair.data import Sentence
from flair.embeddings import DocumentPoolEmbeddings, TokenEmbeddings
from sklearn.base import BaseEstimator, TransformerMixin
from tqdm import trange

class FlairTransformer(BaseEstimator, TransformerMixin):
    """
    a general class for creating a machine learning step in the machine learning pipeline
    """
    def __init__(
        self,
        embeddings: List[TokenEmbeddings],
        fine_tune_mode="linear",
        pooling: str = "mean",
        batch_size=32,
    ):
        """
        constructor
        """
        super(FlairTransformer, self).__init__()
        self.embedder = DocumentPoolEmbeddings(embeddings=embeddings,
                                               fine_tune_mode=fine_tune_mode,
                                               pooling=pooling)
        self.batch_size = batch_size
        self.vector_cache = {}
        self.dataset_cache = {}

    def fit(self, X, y=None, **kwargs):
        """
        an abstract method that is used to fit the step and to learn by examples
        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: self: the class object - an instance of the transformer - Transformer
        """
        # No fitting needed, using pre-trained embeddings_baseline
        return self

    def transform(self, X, y=None, **kwargs):
        """
        an abstract method that is used to transform according to what happened in the fit method
        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: X: the transformed data - Dataframe
        """

        X = X['text']

        dataset_hash = hash(str(X) + str(self.embedder.__dict__))
        if dataset_hash in self.dataset_cache:
            return self.dataset_cache[dataset_hash]
        else:
            embeddings = []

            for first in trange(0, len(X), self.batch_size):
                subset = X[first:first + self.batch_size]
                sentences = []
                for element in subset:
                    sentence = Sentence(element)
                    # sentence.tokens = sentence.tokens[:200]
                    sentences.append(sentence)

                self.embedder.embed(sentences)
                for sentence in sentences:
                    key = sentence.to_original_text()
                    if key in self.vector_cache.keys():
                        vector = self.vector_cache[key]
                    else:
                        vector = sentence.get_embedding().cpu().detach().numpy()
                        self.vector_cache[key] = vector
                    embeddings.append(vector)

            embedding_dataset = numpy.vstack(embeddings)
            self.dataset_cache[dataset_hash] = embedding_dataset
            return embedding_dataset

    def fit_transform(self, X, y=None, **kwargs):
        """
        perform fit and transform over the data
        :param X: features - Dataframe
        :param y: target vector - Series
        :param kwargs: free parameters - dictionary
        :return: X: the transformed data - Dataframe
        """
        return self.transform(X, y)
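
A sketch of plugging the transformer above into an sklearn Pipeline; the 'text' column name is what `transform` expects, while the toy DataFrame and the downstream classifier are arbitrary choices for illustration:

import pandas as pd
from flair.embeddings import WordEmbeddings
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

pipe = Pipeline([
    ('embed', FlairTransformer(embeddings=[WordEmbeddings('glove')])),
    ('clf', LogisticRegression(max_iter=1000)),
])

df = pd.DataFrame({'text': ['a short positive example', 'a short negative example']})
pipe.fit(df, [1, 0])
print(pipe.predict(df))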
Example 10
    def train_model(self,
                    model_name="text_classification_model",
                    custom_word_embeddings=None,
                    rnn_type="GRU",
                    use_pool_embedding=False,
                    hidden_size=16,
                    reproject_words=True,
                    reproject_words_dimension=128,
                    learning_rate=1e-3,
                    batch_size=8,
                    anneal_factor=0.5,
                    patience=2,
                    max_epochs=30,
                    **kwargs):
        """
        Train flair model and save it in your data folder

        Parameters
        ----------
        model_name: str
            Name of your model
        custom_word_embeddings: list<embedding>
            Use custom flair embedding

        See more in flair documentation: https://github.com/zalandoresearch/flair/tree/master/resources/docs

        Return
        -------
        None
        """
        self.model_name = model_name
        corpus = CSVClassificationCorpus(self.data_folder,
                                         self.column_name_map,
                                         skip_header=True)
        label_dict = corpus.make_label_dictionary()

        # Word embedding selection
        if custom_word_embeddings is None:
            word_embeddings = [WordEmbeddings('fr')]
        else:
            word_embeddings = custom_word_embeddings

        # initialize document embedding by passing list of word embeddings and parameters
        if use_pool_embedding:
            document_embeddings = DocumentPoolEmbeddings(
                word_embeddings, pooling='max', fine_tune_mode='nonlinear')
        else:
            document_embeddings = DocumentRNNEmbeddings(
                word_embeddings,
                hidden_size=hidden_size,
                reproject_words=reproject_words,
                reproject_words_dimension=reproject_words_dimension,
                rnn_type=rnn_type)

        # create the text classifier and initialize trainer
        classifier = TextClassifier(document_embeddings,
                                    label_dictionary=label_dict)
        trainer = ModelTrainer(classifier, corpus, optimizer=Adam)

        # let's train !
        num_workers = cpu_count()
        trainer.train("{0}\\{1}".format(self.data_folder, self.model_name),
                      learning_rate=learning_rate,
                      num_workers=num_workers,
                      mini_batch_size=batch_size,
                      anneal_factor=anneal_factor,
                      patience=patience,
                      max_epochs=max_epochs,
                      **kwargs)
Example 11
"""
'''
prerequisites : flair (pip install flair)
translate the hindi text into english or viceversa

E_sentences - glossary file of english text book
H_sentences_eng - translated glossary file of hindi text book


'''

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence

document_embeddings = DocumentPoolEmbeddings([
    WordEmbeddings('glove'),
    FlairEmbeddings('news-forward'),
    FlairEmbeddings('news-backward'),
])

#opening english file and embedding each sentence
eng = open('<PATH> /E_sentences.txt', 'r')
#vec_eng=open('/home/dheeraj/Desktop/IIITH-Intern/Major-TH-Tool/glossary/E_vecs.txt','a')
line = eng.readline()
eng_vecs = []
while (line):
    sentence = Sentence(line)
    document_embeddings.embed(sentence)
    li = sentence.get_embedding()
    li = li.tolist()
    eng_vecs.append(li)
    line = eng.readline()
Example 12
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence
from flair.data import Sentence

# initialize the word embeddings
glove_embedding = WordEmbeddings('glove')
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])

# create an example sentence
sentence = Sentence('The grass is green . And the sky is blue .')

# embed the sentence with our document embedding
document_embeddings.embed(sentence)

# now check out the embedded sentence.
print(sentence.get_embedding())
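
# The pooled result is a single vector per sentence; its length equals the sum of the
# stacked embedding sizes (a quick sanity check; exact sizes depend on the models used):
print(sentence.get_embedding().shape)
print(document_embeddings.embedding_length)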

document_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward],
    mode='min')

from flair.embeddings import WordEmbeddings, DocumentRNNEmbeddings

glove_embedding = WordEmbeddings('glove')

document_lstm_embeddings = DocumentRNNEmbeddings([glove_embedding],
                                                 rnn_type='LSTM')
Example 13
import torch

# Some parameters
SCORE_THRESHOLD = .6        # [0., 1.] The minimum value for an acceptable question
WINDOW_LENGTH = 128         # Character window length
QG_BEAM_SIZE = 3            # Beam-size used on question generation decoder

# 
# tagger = SequenceTagger.load('ner-ontonotes')
ne = Vocabulary.from_vocab_file('vocabularies/biology.vocab').compile()
qg = QuestionGenerator('pretrained_models/qg_model.bin', beam_size=QG_BEAM_SIZE)
qa = pipeline('question-answering')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings([WordEmbeddings('glove'),
                                              FlairEmbeddings('news-backward'),
                                              FlairEmbeddings('news-forward')])

remove_punct = re.compile(r"[\(\)\'\":?¿!¡;]")


def answer_similarity(ans1, real):
    sent1 = Sentence(ans1)
    sent2 = Sentence(real)
    document_embeddings.embed(sent1)
    document_embeddings.embed(sent2)
    emb1 = sent1.get_embedding()
    emb2 = sent2.get_embedding()
    emb1 /= torch.sqrt((emb1**2).sum())
    emb2 /= torch.sqrt((emb2**2).sum())
Example 14
class Embeddings(BaseMatcher):
    """
    Embed words into vectors and use cosine similarity to find
    the best matches between two lists of strings

    Arguments:
        embedding_method: list of Flair embeddings to use
        min_similarity: The minimum similarity between strings, otherwise return 0 similarity
        cosine_method: The method/package for calculating the cosine similarity.
                        Options: "sparse", "sklearn", "knn".
                        Sparse is the fastest and most memory efficient but requires a
                        package that might be difficult to install.
                        Sklearn is a bit slower than sparse and requires significantly more memory as
                        the distance matrix is not sparse.
                        Knn uses 1-nearest neighbor to extract the most similar strings;
                        it is significantly slower than both methods but requires little memory.
        model_id: The name of the particular instance, used when comparing models

    Usage:

    ```python
    model = Embeddings(min_similarity=0.5)
    ```

    Or if you want a custom model to be used and it is a word embedding model,
    pass it in as a list:

    ```python
    embedding_model = WordEmbeddings('news')
    model = Embeddings([embedding_model], min_similarity=0.5)
    ```

    As you might have guessed, you can pass along multiple word embedding models and the
    results will be averaged:

    ```python
    fasttext_embedding = WordEmbeddings('news')
    glove_embedding = WordEmbeddings('glove')
    bert_embedding = TransformerWordEmbeddings('bert-base-multilingual-cased')
    model = Embeddings([glove_embedding,
                        fasttext_embedding,
                        bert_embedding ], min_similarity=0.5)
    ```
    """
    def __init__(self,
                 embedding_method: Union[List, None] = None,
                 min_similarity: float = 0.75,
                 cosine_method: str = "sparse",
                 model_id: str = None):
        super().__init__(model_id)
        self.type = "Embeddings"

        if not embedding_method:
            self.document_embeddings = DocumentPoolEmbeddings(
                [WordEmbeddings('news')])

        elif isinstance(embedding_method, list):
            self.document_embeddings = DocumentPoolEmbeddings(embedding_method)

        elif isinstance(embedding_method, TokenEmbeddings):
            self.document_embeddings = DocumentPoolEmbeddings(
                [embedding_method])

        else:
            self.document_embeddings = embedding_method

        self.min_similarity = min_similarity
        self.cosine_method = cosine_method

    def match(self,
              from_list: List[str],
              to_list: List[str],
              embeddings_from: np.ndarray = None,
              embeddings_to: np.ndarray = None) -> pd.DataFrame:
        """ Matches the two lists of strings to each other and returns the best mapping

        Arguments:
            from_list: The list from which you want mappings
            to_list: The list where you want to map to
            embeddings_from: Embeddings you created yourself from the `from_list`
            embeddings_to: Embeddings you created yourself from the `to_list`

        Returns:
            matches: The best matches between the lists of strings

        Usage:

        ```python
        model = Embeddings(min_similarity=0.5)
        matches = model.match(["string_one", "string_two"],
                              ["string_three", "string_four"])
        ```
        """
        if not isinstance(embeddings_from, np.ndarray):
            embeddings_from = self._embed(from_list)
        if not isinstance(embeddings_to, np.ndarray):
            embeddings_to = self._embed(to_list)

        matches = cosine_similarity(embeddings_from, embeddings_to, from_list,
                                    to_list, self.min_similarity,
                                    self.cosine_method)

        return matches

    def _embed(self, strings: List[str]) -> np.ndarray:
        """ Create embeddings from a list of strings """
        embeddings = []
        for name in strings:
            sentence = Sentence(name)
            self.document_embeddings.embed(sentence)
            embeddings.append(sentence.embedding.cpu().numpy())

        return np.array(normalize(embeddings), dtype="double")
Example 15
 def __create_models(self):
     models = []
     models_fit = []
     #for _params in self.model_params:
     _params = {}
     for k, v in self.params.items():
         if k.startswith('_'):
             continue
         _params[k] = v
     self.textModels = dict(
         mtc=TextModel(_params).fit(self.train),
         #charEmb=DocumentPoolEmbeddings([CharacterEmbeddings()]),
         #charLangEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),BytePairEmbeddings(self.lang)]),
         ##charMultiEmb=DocumentPoolEmbeddings([CharacterEmbeddings(),BytePairEmbeddings('multi')]),
         langEmb=DocumentPoolEmbeddings([BytePairEmbeddings(self.lang)]),
         charLangMultiEmb=DocumentPoolEmbeddings([
             CharacterEmbeddings(),
             BytePairEmbeddings(self.lang),
             BytePairEmbeddings('multi')
         ]),
         langMultiEmb=DocumentPoolEmbeddings(
             [BytePairEmbeddings(self.lang),
              BytePairEmbeddings('multi')]),
         bytePairEMB=DocumentPoolEmbeddings([BytePairEmbeddings('multi')]),
         #flairEmbF=DocumentPoolEmbeddings([FlairEmbeddings('multi-forward')]),
         #flairEmbB=DocumentPoolEmbeddings([FlairEmbeddings('multi-backward')]),
         #bertEMB=DocumentPoolEmbeddings([TransformerWordEmbeddings('bert-base-uncased', layers='-1')])
     )
     for km, tmodel in self.textModels.items():
         models.append({'name': km})
         models_fit.append({'name': km})
         if km == 'mtc':
             xt = tmodel.transform(self.train)
             xv = tmodel.transform(self.validation)
             X = tmodel.transform(self.data)
         else:
             sentences_train = [Sentence(txt) for txt in self.train]
             tmodel.embed(sentences_train)
             xt = np.array([
                 e.get_embedding().cpu().detach().numpy()
                 for e in sentences_train
             ])
             sentences_val = [Sentence(txt) for txt in self.validation]
             tmodel.embed(sentences_val)
             xv = np.array([
                 e.get_embedding().cpu().detach().numpy()
                 for e in sentences_val
             ])
             sentences = [Sentence(txt) for txt in self.data]
             tmodel.embed(sentences)
             X = np.array([
                 e.get_embedding().cpu().detach().numpy() for e in sentences
             ])
         models[-1]['xv'] = xv
         models[-1]['xt'] = xt
         models_fit[-1]['xt'] = X
         #max_iter=5000
         #if km=='mtc': max_iter=1000
         #if km=='langMulti': max_iter=5000
         #self.models[-1]['clf']=LinearSVC(max_iter=max_iter).fit(xt,self.yt)
         #yp=self.models[-1]['clf'].decision_function(xv)
         #scaler=Normalizer().fit(yp)
         #self.models[-1]['macroF1']=f1_score(self.yv,np.argmax(scaler.transform(yp),axis=1),average='weighted')
         #self.models[-1]['weightedF1']=f1_score(self.yv,np.argmax(scaler.transform(yp),axis=1),average='weighted')
         #self.models[-1]['score']=f1_score(self.yv,np.argmax(yp,axis=1),average='weighted')
         #self.models[-1]['probas']=scaler.transform(yp)
         ### Fit model with all available data
         #self.models_fit[-1]['clf']=LinearSVC(max_iter=max_iter).fit(X,self.y)
     print('Fitting Ensemble')
     #self.models  =  Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models)
     #self.models_fit = Parallel(n_jobs=5)(delayed(self._train_model)(md) for md in models_fit)
     self.models, self.models_fit = [], []
     for md, mdf in zip(models, models_fit):
         self.models.append(self._train_model(md))  # = [self._train_model(md) for md in models]
         self.models_fit.append(self._train_model(mdf))
Example 16
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import stopwords
from random import randrange
from scipy.sparse import coo_matrix
from selenium import webdriver
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from time import time
from tqdm import tqdm
import json
import pandas as pd
import string
import torch

# choosing fasttext because of its subword information
embedding = DocumentPoolEmbeddings([WordEmbeddings('en')])


def text_scraper(urls, file):
    """function for scraping the text of a webpage given url"""
    start = time()
    print('SCRAPING WEBPAGES...')
    # creating a new instance of google chrome
    driver = webdriver.Chrome('./chromedriver')
    pages = []

    for url in tqdm(urls):
        driver.get(url)
        # extracting the title, content and date
        title = driver.find_element_by_tag_name('h1').text
        content = driver.find_element_by_tag_name('body').text
Example 17

import torch
import torch.nn as nn
import numpy as np
import nltk
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable


from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, Sentence

# initialize the word embeddings
flair_embedding_forward = FlairEmbeddings('news-forward')
flair_embedding_backward = FlairEmbeddings('news-backward')

# initialize the document embeddings, mode = mean
document_embeddings = DocumentPoolEmbeddings([flair_embedding_backward,
                                              flair_embedding_forward])

# Hyper Parameters
BATCH_SIZE = 16


class ContrastiveLoss(torch.nn.Module):

    def __init__(self, margin=1.0):
        super(ContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, input1, input2, y):
        diff = input1 - input2
        dist_sq = torch.sum(torch.pow(diff, 2), 1)
        dist = torch.sqrt(dist_sq)
Example 18
# from methods import *
from database import Database
import os
import torch
import flair
import pickle
from flair.data import Sentence
from flair.embeddings import FlairEmbeddings, DocumentPoolEmbeddings

curr_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(curr_path, "data")

flair.device = torch.device('cuda:0')
flair.embedding_storage_mode = None
flair_emb = DocumentPoolEmbeddings([
        FlairEmbeddings('en-forward-fast'), 
        FlairEmbeddings('en-backward-fast')
    ],
    pooling='mean',
)
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

poss_sections = {
    '#introduction': ['intro', 'introduction', 'starting'],
    '#abstract': ['abstract', 'abstracts'],
    '#sota': ['background', 'backgrounds', 'state of the art', 'previous', 'related work'],
    '#method': ['method', 'methods', 'methodology', 'material', 'materials', 'development', 'description', 'model', 'procedures'],
    '#experiments_or_results': ['experiments', 'experiment', 'analysis', 'analytics', 'analisy', 'statistics', 'regression', 
        'analises', 'results', 'result', 'evaluation', 'measures', 'correlation', 'comparison', 'tests', 'test', 'lab', 'laboratory'],
    '#conclusions': ['conclusion', 'conclusions', 'discussion', 'discussions'],
}

for list_candidates in poss_sections.values():
Example 19
 def _embed_document(self, document_text: str,
                     doc_embeddings: DocumentPoolEmbeddings):
     sentence = Sentence(document_text)
     doc_embeddings.embed(sentence)
     return sentence.get_embedding().data.cpu().numpy()
Example 20
import senteval

# Set params for SentEval
# we use logistic regression (usepytorch: False) and kfold 10
# In this dictionary you can add extra information that your model needs for initialization
params_senteval = {
    'task_path': PATH_TO_DATA,
    'usepytorch': True,
    'kfold': params.folds
}

b = []
for i in params.model:
    b.append(BertEmbeddings(i))
    #f.append(eval(i))
bert_encoder = DocumentPoolEmbeddings(b)
params_senteval['bert'] = bert_encoder
print(params_senteval['bert'])

nhid = params.nhid
params_senteval['classifier'] = {
    'nhid': nhid,
    'optim': 'adam',
    'batch_size': 64,
    'tenacity': 5,
    'epoch_size': 4
}


def prepare(params, samples):
Example 21

def other_embeddings(embd):
    sess = tf.InteractiveSession()
    train_data_list = []
    test_data_list = []
    val_data_list = []
    if embd == 'glove':
        print('Starting Glove Embedding...')
        glove_embedding = WordEmbeddings('glove')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[glove_embedding])
    elif embd == 'xlnet':
        print('Starting XLNet Embedding...')
        xlnet_embedding = XLNetEmbeddings('xlnet-large-cased')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[xlnet_embedding])
    elif embd == 'fasttext':
        print('Starting Fasttext Embedding...')
        fasttext_embedding = WordEmbeddings('en')
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[fasttext_embedding])
    elif embd == 'elmo':
        print('Starting ELMo Embedding...')
        elmo_embedding = ELMoEmbeddings()
        document_embeddings = DocumentPoolEmbeddings(
            embeddings=[elmo_embedding])
    else:
        # init Flair embeddings
        flair_forward_embedding = FlairEmbeddings('multi-forward')
        flair_backward_embedding = FlairEmbeddings('multi-backward')
        glove_embedding = WordEmbeddings('glove')
        # now create the DocumentPoolEmbeddings object that combines all embeddings
        document_embeddings = DocumentPoolEmbeddings(embeddings=[
            glove_embedding, flair_forward_embedding, flair_backward_embedding
        ])
    print('Train embedding Started...')
    for text in final_train['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        train_data_list.append(emb)
    print('Embedded Train data!!')
    print('Test embedding Started...')
    for text in final_test['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        test_data_list.append(emb)
    print('Embedded Test data!!')
    for text in final_val['text'].tolist():
        text = Sentence(text)
        document_embeddings.embed(text)
        emb = text.get_embedding().detach().numpy()
        emb = tf.constant(emb).eval()
        val_data_list.append(emb)
    print('Embedded Val data!!')
    return train_data_list, test_data_list, val_data_list
Example 22
class FlairEmbeddingsClassifier(BaseEstimator):
    def __init__(
            self,
            word_embeddings: List[Embeddings] = (WordEmbeddings('de'),
                                                 WordEmbeddings('de-crawl')),
            pooling: str = 'mean',
            fine_tune_mode: str = 'nonlinear',
            distance_metric: str = 'cosine',
            n_jobs: int = 1,
            verbose: bool = False):

        self.word_embeddings = word_embeddings
        self.pooling = pooling
        self.fine_tune_mode = fine_tune_mode
        self.distance_metric = distance_metric
        self.n_jobs = n_jobs
        self.verbose = verbose

    def fit(self, X, y):

        tag_docs = self._create_tag_corpus(X, self._create_tag_docs(y))

        self.document_embedder_ = DocumentPoolEmbeddings(
            self.word_embeddings,
            pooling=self.pooling,
            fine_tune_mode=self.fine_tune_mode)

        if self.verbose:
            doc_iterator = tqdm(tag_docs, desc='Computing tag embeddings')
        else:
            doc_iterator = tag_docs

        self.tag_embeddings_ = []

        for doc in doc_iterator:
            doc_obj = Sentence(doc)
            self.document_embedder_.embed(doc_obj)
            self.tag_embeddings_.append(
                doc_obj.get_embedding().detach().numpy())

        self.tag_embeddings_ = np.array(self.tag_embeddings_)

        return self

    def predict(self, X: List[str], n_labels: int = 10) -> np.array:

        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError

        if self.verbose:
            X_iterator = tqdm(
                X, desc='Computing embeddings for prediction samples')
        else:
            X_iterator = X

        X_embeddings = []

        for doc in X_iterator:
            doc_obj = Sentence(doc)
            self.document_embedder_.embed(doc_obj)
            X_embeddings.append(doc_obj.get_embedding().detach().numpy())

        nn = NearestNeighbors(metric=self.distance_metric,
                              n_neighbors=n_labels,
                              n_jobs=self.n_jobs)
        nn.fit(self.tag_embeddings_)

        y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]),
                            dtype='int8')

        for sample_ind, text_embedding in enumerate(X_embeddings):
            nearest_neighbors = nn.kneighbors([text_embedding])[1][0]
            y_pred[sample_ind, nearest_neighbors] = 1

        return y_pred.tocsr()

    def decision_function(self, X: List[str], n_labels: int = 10):

        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError

        if self.verbose:
            X_iterator = tqdm(
                X, desc='Computing embeddings for prediction samples')
        else:
            X_iterator = X

        X_embeddings = []

        for doc in X_iterator:
            if doc:
                doc_obj = Sentence(doc)
            else:
                doc_obj = Sentence('Unknown')
                print('yeah')
            self.document_embedder_.embed(doc_obj)
            try:
                X_embeddings.append(doc_obj.get_embedding().detach().numpy())
            except RuntimeError as e:
                print(
                    'Could not compute embedding for sample, inserting zero vector'
                )
                # TODO give index of corrupted sample
                print(e)
                X_embeddings.append(
                    np.zeros((self.tag_embeddings_.shape[1], ),
                             dtype=self.tag_embeddings_.dtype))

        nn = NearestNeighbors(metric=self.distance_metric,
                              n_neighbors=n_labels,
                              n_jobs=self.n_jobs)
        nn.fit(self.tag_embeddings_)

        y_pred = lil_matrix((len(X), self.tag_embeddings_.shape[0]),
                            dtype='float')

        for sample_ind, sample_vec in enumerate(X_embeddings):
            distances, indices = nn.kneighbors([sample_vec])
            for distance, label_index in zip(distances, indices):
                y_pred[sample_ind, label_index] = distance

        return y_pred.tocsr()

    def log_decision_function(self, X: Iterable[str], n_labels: int = 10):
        if not hasattr(self, 'tag_embeddings_'):
            raise NotFittedError
        # TODO Uncomment this if sure that nothing will break
        distances = self.decision_function(X=X, n_labels=n_labels)
        log_distances = self._get_log_distances(distances)
        return log_distances

    def _get_log_distances(self,
                           y_distances: csr_matrix,
                           base=0.5) -> csr_matrix:
        """
        Returns the logarithmic version (base default: 0.5) of the distance matrix returned by TODO.
        This must be used in order to compute valid precision@k scores
        since small Distances should be ranked better than great ones.
        :param y_distances: sparse distance matrix (multilabel matrix with distances instead of binary indicators)
        :param base: base of the log function (must be smaller than one)
        :return: sparse matrix with the log values
        """

        log_y_distances = y_distances.tocoo()
        log_y_distances.data = np.log(log_y_distances.data) / np.log(base)
        return log_y_distances.tocsr()

    def _create_tag_corpus(self, X: np.array,
                           tag_doc_idx: np.array) -> List[str]:
        """
        Creates the corpus used to train the tag embeddings.
        Each text associated with one tag is concatenated to one big document.
        :param X: Iterable of the texts as string
        :param tag_doc_idx: Mapping of each label to their associated texts
        :return: list of shape (n_tags,) containing the texts
        """
        tag_corpus = list()
        if self.verbose:
            print('Creating Tag-Doc Corpus')
            iterator = tqdm(tag_doc_idx)
        else:
            iterator = tag_doc_idx
        for indices in iterator:
            tag_corpus.append(" ".join(X[indices]))
        return tag_corpus

    def _create_tag_docs(self, y: csr_matrix) -> np.ndarray:
        """
        Creates a mapping of each tags and their associated texts.
        :param y: sparse label matrix
        :return: array of shape (n_labels,) containing the indices of each text connected to a label
        """
        self.classes_ = y.shape[1]

        if self.verbose:
            print('Sorting tag and docs')
            iterator = tqdm(y.T)
        else:
            iterator = y.T

        tag_doc_idx = list()
        for tag_vec in iterator:
            pos_samples = tag_vec.nonzero()[1]  # get indices of pos samples
            tag_doc_idx.append(pos_samples)
        return np.asarray(tag_doc_idx)
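
A minimal usage sketch for the classifier above; the texts, the sparse binary label matrix and the choice of German word embeddings are made-up placeholders:

import numpy as np
from scipy.sparse import csr_matrix
from flair.embeddings import WordEmbeddings

X_train = np.array(['ein text über fußball', 'ein text über politik'])
y_train = csr_matrix(np.array([[1, 0], [0, 1]]))  # two samples, two tags

clf = FlairEmbeddingsClassifier(word_embeddings=[WordEmbeddings('de')], verbose=True)
clf.fit(X_train, y_train)
print(clf.predict(['noch ein text über fußball'], n_labels=1).toarray())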
Example 23
class HyperpartisanDatasetFlair(HyperpartisanDataset, FlairDataset):
    """
    Hyperpartisan News Dataset using flair-based embeddings.
    """
    def __init__(self,
                 articles: Sequence[NewsArticle],
                 max_seq_len: int = 200,
                 granularity: Union[str, Sequence[str]] = 'token',
                 use_title: bool = True,
                 max_sent_len: int = 100,
                 embeddings: Sequence[str] = ['word'],
                 avg_layers: Optional[int] = None,
                 use_cuda: bool = False):
        super().__init__(
            articles=articles,
            max_seq_len=max_seq_len,
            granularity=granularity,
            max_sent_len=max_sent_len,
            use_title=use_title,  ## HyperpartisanDataset.__init__ args
            embeddings=embeddings,
            use_cuda=use_cuda,  ## FlairDataset.__init__ args
        )

        self.embeddings = DocumentPoolEmbeddings(self.token_embeddings,
                                                 pooling='mean')
        self.avg_layers = avg_layers

        print('\nEmbeddings Model:')
        print(self.embeddings, end='\n\n')

        self.nlp = spacy.load('en_core_web_sm',
                              disable=['ner', 'parser', 'tagger'])
        self.nlp.add_pipe(self.nlp.create_pipe('sentencizer'))

    def _get_span_embedding(self,
                            text: str,
                            max_seq_len: Optional[int] = None) -> torch.Tensor:
        """
        Returns embeddings for the given sentence's text,
        in shape: (embeddings_dim,)
        """
        if len(text) < 2:
            print('Sentence is too short: "{}"'.format(text))
            return torch.zeros(self.embeddings.embedding_length,
                               dtype=torch.float32)

        s = Sentence(text)
        if max_seq_len is not None and len(
                s
        ) > max_seq_len and self.embeddings_type != 'elmo':  ## Don't crop ELMo sentences, just an experiment
            s.tokens = s.tokens[:max_seq_len]
        if self.embeddings_type == 'bert' or self.bert_tokenizer is not None:
            sent_len = self.crop_sentence_to_fit_bert(s)
            if sent_len == 0 or len(s) == 0:
                return torch.zeros(self.embeddings.embedding_length,
                                   dtype=torch.float32)

        self.embeddings.embed(s)
        return s.embedding

    def get_tokenwise(self, article: NewsArticle) -> torch.Tensor:
        return self._get_tokenwise_embeddings(
            (article.get_title() if self.use_title else "") +
            article.get_text(), self.max_seq_len)

    def get_documentwise(self, article: NewsArticle) -> torch.Tensor:
        """Returns document-wise embeddings"""
        text = (article.get_title()
                if self.use_title else "") + article.get_text()
        return self._get_span_embedding(text, self.max_seq_len)

    def get_sentencewise(self, article: NewsArticle):
        X = torch.zeros(self.max_seq_len,
                        self.embeddings.embedding_length,
                        dtype=torch.float32)

        # Title embedding
        if self.use_title:
            X[0] = self._get_span_embedding(article.get_title(),
                                            self.max_sent_len)

        # Sentence embeddings
        for i, s in enumerate(
                self.nlp(article.get_text()).sents,
                1 if self.use_title else 0):
            if i >= self.max_seq_len:
                break

            X[i] = self._get_span_embedding(s.text, self.max_sent_len)

        if self.avg_layers is not None:
            return self._avg_last_n_layers(X, self.avg_layers)
        return X

    def _avg_last_n_layers(self, X, last_n_layers):
        """
        Averages the last_n_layers from the given embedding representation,
         instead of the default concatenation.
        """
        final_emb_len = X.shape[-1] // last_n_layers
        assert X.shape[-1] % last_n_layers == 0

        X_new = torch.zeros(X.shape[0], final_emb_len, dtype=torch.float32)
        for i, emb in enumerate(X):
            for k in range(last_n_layers):
                X_new[i] += emb[k * final_emb_len:(k + 1) * final_emb_len]
            X_new[i] /= last_n_layers

        return X_new

    def get_tokenwise_grouped(self, article: NewsArticle) -> torch.Tensor:
        """Returns token-wise embeddings grouped by sentences"""
        X = torch.zeros(self.max_seq_len,
                        self.max_sent_len,
                        self.get_embeddings_dim(),
                        dtype=torch.float32)

        # Title embedding
        if self.use_title:
            X[0] = self._get_tokenwise_embeddings(article.get_title(),
                                                  self.max_sent_len)

        # Text embeddings
        for i, sent in enumerate(
                self.nlp(article.get_text()).sents,
                1 if self.use_title else 0):
            if i >= self.max_seq_len:
                break
            X[i] = self._get_tokenwise_embeddings(sent.text, self.max_sent_len)

        return X

    def get_embeddings_dim(self) -> int:
        return \
            self.embeddings.embedding_length if self.avg_layers is None else \
            self.embeddings.embedding_length // self.avg_layers
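
The layer averaging in `_avg_last_n_layers` above can also be written as a single reshape; an equivalent standalone sketch under the same assumption that the layers are concatenated along the last dimension:

import torch

def avg_last_n_layers(X: torch.Tensor, last_n_layers: int) -> torch.Tensor:
    # X has shape (seq_len, last_n_layers * emb_len)
    assert X.shape[-1] % last_n_layers == 0
    return X.view(X.shape[0], last_n_layers, -1).mean(dim=1)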
Example 24
from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings, Sentence
from flair.data_fetcher import NLPTaskDataFetcher
from flair.embeddings import BertEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer
from pathlib import Path
glove_embedding = WordEmbeddings('glove')
bert_embedding = BertEmbeddings('bert-base-uncased')
corpus = NLPTaskDataFetcher.load_classification_corpus(Path('./'),
                                                       test_file='test.csv',
                                                       dev_file='dev.csv',
                                                       train_file='train.csv')
document_embeddings = DocumentPoolEmbeddings([bert_embedding, glove_embedding])
classifier = TextClassifier(document_embeddings,
                            label_dictionary=corpus.make_label_dictionary(),
                            multi_label=True)
trainer = ModelTrainer(classifier, corpus)
trainer.train('./', max_epochs=10)
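
Once training finishes, the classifier saved by `trainer.train('./', ...)` can be reloaded for prediction; a short sketch (`final-model.pt` is the file name flair's trainer writes by default):

from flair.data import Sentence
from flair.models import TextClassifier

model = TextClassifier.load('./final-model.pt')
sentence = Sentence('An example sentence to classify')
model.predict(sentence)
print(sentence.labels)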
Example 25
import datetime
import spacy

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings, DocumentLSTMEmbeddings, Sentence

nlp = spacy.load('de')
glove_embedding = WordEmbeddings('de')
flair_embedding_forward = FlairEmbeddings('german-forward')
flair_embedding_backward = FlairEmbeddings('german-backward')

document_pooling_embeddings = DocumentPoolEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])

document_lstm_embeddings = DocumentLSTMEmbeddings(
    [glove_embedding, flair_embedding_backward, flair_embedding_forward])


def is_blacklisted(word):
    return word in [
        'polizei', 'polizist', 'beamter', 'nr.', 'berlin', 'uhr',
        'polizeimeldung', 'nicht', 'jahr', 'jährige', 'jährig', 'jähriger',
        'polizeiliche', 'polizeilich', '2015', '2016', '2014', '2017', '2018',
        'polizeibeamter', '-', 'u.a.', 'z.b.', 'der', 'die', 'das', 'dem',
        'den', 'diese', 'dieser', 'diesen', 'diesem', 'um', 'für', 'eine',
        'ein', 'einer', 'einen', 'einem', 'anderer', 'andere', 'anderen',
        'anders'
    ]


def is_empty(word):
    return word.strip() == ''
Example 26
class FlairTextEncoder(BaseTextTorchEncoder):
    """
    :class:`FlairTextEncoder` encodes data from an array of strings of size `B` into an ndarray of size `B x D`.
    Internally, :class:`FlairTextEncoder` wraps the DocumentPoolEmbeddings from Flair.
    """

    def __init__(self,
                 embeddings: Union[Tuple[str], List[str]] = ('word:glove', 'flair:news-forward', 'flair:news-backward'),
                 pooling_strategy: str = 'mean',
                 *args,
                 **kwargs):
        """

        :param embeddings: the name of the embeddings. Supported models include
        - ``word:[ID]``: the classic word embedding model, the ``[ID]`` are listed at https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/CLASSIC_WORD_EMBEDDINGS.md
        - ``flair:[ID]``: the contextual embedding model, the ``[ID]`` are listed at https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        - ``pooledflair:[ID]``: the pooled version of the contextual embedding model, the ``[ID]`` are listed at https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/FLAIR_EMBEDDINGS.md
        - ``byte-pair:[ID]``: the subword-level embedding model, the ``[ID]`` are listed at https://github.com/flairNLP/flair/blob/master/resources/docs/embeddings/BYTE_PAIR_EMBEDDINGS.md
        :param pooling_strategy: the strategy to merge the word embeddings into the chunk embedding. Supported
            strategies include ``mean``, ``min``, ``max``.
        """
        super().__init__(*args, **kwargs)
        self.embeddings = embeddings
        self.pooling_strategy = pooling_strategy
        self.max_length = -1  # reserved variable for future usages
        self._post_set_device = False

    def post_init(self):
        import flair
        flair.device = self.device
        from flair.embeddings import WordEmbeddings, FlairEmbeddings, BytePairEmbeddings, PooledFlairEmbeddings, \
            DocumentPoolEmbeddings
        embeddings_list = []
        for e in self.embeddings:
            model_name, model_id = e.split(':', maxsplit=1)
            emb = None
            try:
                if model_name == 'flair':
                    emb = FlairEmbeddings(model_id)
                elif model_name == 'pooledflair':
                    emb = PooledFlairEmbeddings(model_id)
                elif model_name == 'word':
                    emb = WordEmbeddings(model_id)
                elif model_name == 'byte-pair':
                    emb = BytePairEmbeddings(model_id)
            except ValueError:
                self.logger.error('embedding not found: {}'.format(e))
                continue
            if emb is not None:
                embeddings_list.append(emb)
        if embeddings_list:
            self.model = DocumentPoolEmbeddings(embeddings_list, pooling=self.pooling_strategy)
            self.logger.info('flair encoder initialized with embeddings: {}'.format(self.embeddings))
        else:
            self.logger.error('flair encoder initialization failed.')

    @batching
    @as_ndarray
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """

        :param data: a 1-dimensional array of strings of size `B`
        :return: an ndarray in size `B x D`
        """
        import torch
        from flair.embeddings import Sentence
        c_batch = [Sentence(row) for row in data]
        self.model.embed(c_batch)
        result = torch.stack([c_text.get_embedding() for c_text in c_batch]).detach()
        if self.on_gpu:
            result = result.cpu()
        return result.numpy()
Example 27
class FlairBackend(BaseEmbedder):
    """ Flair Embedding Model

    The Flair embedding model used for generating document and
    word embeddings.

    Arguments:
        embedding_model: A Flair embedding model

    Usage:

    ```python
    from bertopic.backend import FlairBackend
    from flair.embeddings import WordEmbeddings, DocumentPoolEmbeddings

    # Create a Flair Embedding model
    glove_embedding = WordEmbeddings('crawl')
    document_glove_embeddings = DocumentPoolEmbeddings([glove_embedding])

    # Pass the Flair model to create a new backend
    flair_embedder = FlairBackend(document_glove_embeddings)
    ```
    """
    def __init__(self, embedding_model: Union[TokenEmbeddings, DocumentEmbeddings]):
        super().__init__()

        # Flair word embeddings
        if isinstance(embedding_model, TokenEmbeddings):
            self.embedding_model = DocumentPoolEmbeddings([embedding_model])

        # Flair document embeddings + disable fine tune to prevent CUDA OOM
        # https://github.com/flairNLP/flair/issues/1719
        elif isinstance(embedding_model, DocumentEmbeddings):
            if "fine_tune" in embedding_model.__dict__:
                embedding_model.fine_tune = False
            self.embedding_model = embedding_model

        else:
            raise ValueError("Please select a correct Flair model by either using preparing a token or document "
                             "embedding model: \n"
                             "`from flair.embeddings import TransformerDocumentEmbeddings` \n"
                             "`roberta = TransformerDocumentEmbeddings('roberta-base')`")

    def embed(self,
              documents: List[str],
              verbose: bool = False) -> np.ndarray:
        """ Embed a list of n documents/words into an n-dimensional
        matrix of embeddings

        Arguments:
            documents: A list of documents or words to be embedded
            verbose: Controls the verbosity of the process

        Returns:
            Document/words embeddings with shape (n, m) with `n` documents/words
            that each have an embeddings size of `m`
        """
        embeddings = []
        for index, document in tqdm(enumerate(documents), disable=not verbose):
            try:
                sentence = Sentence(document) if document else Sentence("an empty document")
                self.embedding_model.embed(sentence)
            except RuntimeError:
                sentence = Sentence("an empty document")
                self.embedding_model.embed(sentence)
            embedding = sentence.embedding.detach().cpu().numpy()
            embeddings.append(embedding)
        embeddings = np.asarray(embeddings)
        return embeddings
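Continuing the docstring example above, a minimal call to `embed` might look like the following (it assumes flair can download the `crawl` word vectors referenced in the docstring):

```python
# Embed two short documents with the backend created in the docstring example.
docs = ["topic modeling with pooled word embeddings",
        "flair wraps several embedding families behind one interface"]
vectors = flair_embedder.embed(docs, verbose=True)
print(vectors.shape)  # (2, embedding_size)
```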
Esempio n. 28
0
class AlbertPre:

    def __init__(self, MAX_WORD_N=150, MAX_SENT_N=30, MAX_WORD_SENT_N=300, albert_model="albert-base-v2") -> None:
        super().__init__()
        albert = BertEmbeddings(bert_model_or_path=albert_model)
        self.albert_embedding = DocumentPoolEmbeddings([albert])
        self.MAX_WORD_N = MAX_WORD_N
        self.MAX_SENT_N = MAX_SENT_N
        self.MAX_WORD_SENT_N = MAX_WORD_SENT_N

        self.sentence_piecer = MySentencePiecer()

    def get_embedding(self, sentence):
        sent = Sentence(sentence)
        self.albert_embedding.embed(sent)
        return sent.get_embedding()

    @staticmethod
    def split_in_sentences(text):
        return split_single(text)

    @staticmethod
    def load_csv(name):
        return pd.read_csv(str("../data/" + name + ".tsv"), sep="\t")

    def load_data(self):
        train_df = self.load_csv("train")
        test_df = self.load_csv("test")
        val_df = self.load_csv("val")
        return train_df, val_df, test_df

    def embed_sentences(self, sentences):
        arr_embedding = np.zeros((self.MAX_SENT_N, 3072))
        for i, sentence in enumerate(sentences):
            if len(sentence) > 0 and i < self.MAX_SENT_N:
                x = self.get_embedding(sentence[:self.MAX_WORD_SENT_N])
                x = x.to('cpu').detach().numpy()
                arr_embedding[i] = x

        return arr_embedding

    def compute_and_save_df(self, ds, name):

        path = "../data/%s" % (name)
        if not os.path.exists(path):
            os.mkdir(path)
        len_ds = len(ds)
        article_np = np.memmap(str(path + "/articles.npy"), dtype=np.float32,
                               mode='w+', shape=(len_ds, 30, 3072))

        highlight_list = []
        n_highlight_list = []
        n_article_list = []

        for i, (article, highlight) in ds.iterrows():
            article_sent = self.split_in_sentences(article)
            n_article = len(article_sent)
            article_np[i] = self.embed_sentences(article_sent)

            highlight_ids = np.array(self.sentence_piecer.get_ids_from_vocab(highlight))[:self.MAX_WORD_N]

            highlight_list.append(highlight_ids)
            n_highlight_list.append(highlight_ids.shape[0])
            n_article_list.append(n_article)

            if (i % 1000) == 0:
                print("computed [%d/%d]" % (i, len_ds))

        np.save(str(path + "/n_highlights" + ".npy"), n_highlight_list)
        np.save(str(path + "/n_articles" + ".npy"), n_article_list)
        np.save(str(path + "/highlights" + ".npy"), highlight_list)

    @staticmethod
    def load_np_files(name):
        path = "../data/%s" % name

        # the article embeddings were written with np.memmap (raw float32), so read them
        # back the same way and restore the (n, MAX_SENT_N, 3072) shape
        article_np = np.memmap(str(path + "/articles.npy"), dtype=np.float32, mode='r').reshape(-1, 30, 3072)
        n_highlights = np.load(str(path + "/n_highlights" + ".npy"), allow_pickle=True)
        n_articles = np.load(str(path + "/n_articles" + ".npy"), allow_pickle=True)
        highlights = np.load(str(path + "/highlights" + ".npy"), allow_pickle=True)

        return article_np, n_articles, highlights, n_highlights
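A rough usage sketch for `AlbertPre`. The `../data/*.tsv` files, their column layout, and the `MySentencePiecer` helper are external to this snippet and assumed here.

```python
pre = AlbertPre(albert_model="albert-base-v2")
train_df, val_df, test_df = pre.load_data()

# Embed the sentences of the first training article and inspect the padded matrix;
# the first column is assumed to hold the article text, as in compute_and_save_df.
sentences = AlbertPre.split_in_sentences(train_df.iloc[0, 0])
matrix = pre.embed_sentences(sentences)
print(matrix.shape)  # (MAX_SENT_N, 3072)
```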
Esempio n. 29
0
def training_pipeline_bert(filepath=None,
                           num_words_to_print=10,
                           prefix=None,
                           min_topics=19,
                           max_topics=19,
                           step=2):

    logging.info(f'Started training_pipeline : {min_topics}-{max_topics}')
    start = datetime.datetime.now()

    if filepath is not None:
        filepath = data_dir_local / filepath
    else:
        logging.error("Please enter file name")
        exit()
    if max_topics is None:
        logging.error("Please enter a valid topic number to train model")
        exit()

    #logging.info(f'preprocessor.process_data_save: {filepath}')
    #preprocessor.process_data_save(filepath=filepath, as_text=as_text, as_pickle=as_pickle, verbose=verbose)
    #logging.info(f'phraser.raw_to_phrased_data_pipeline...')
    #phraser.raw_to_phrased_data_pipeline(to_load='text', verbose=True, overwrite_interim=True, prefix=None)
    col = cols[0]
    df = phraser.load_phrased_data_pipeline(to_load='text',
                                            verbose=True,
                                            overwrite_interim=True,
                                            prefix=None,
                                            training=True,
                                            col='resp_whytfa')

    #if prefix is None:
    #      prefix = ''
    # for topic modeling
    #trigram_docs_filepath = data_dir_processed / f'{prefix}{col}_transformed_docs_all.txt'
    #trigram_docs_filepath = f'/home/watsonrtdev/topic_modeling/input_data/topic_modeling/training/processed/{prefix}{col}_transformed_docs_all.txt'
    #trigram_docs_filepath = f'/home/watsonrtdev/topic_modeling/input_data/topic_modeling/training/processed/processed_dataframe.csv'

    #print(f'Loading input file {trigram_docs_filepath}')
    # turn to posix filepaths until gensim supports this
    #trigram_docs_filepath =  trigram_docs_filepath.as_posix()

    #trigram_docs = LineSentence(trigram_docs_filepath)
    #df = pd.read_csv(trigram_docs_filepath)
    #print(df.columns)

    #default it to min/max topics
    num_topics_range = range(min_topics, max_topics + 1, step)
    #if num_topics is not None:
    #    num_topics_range = range(num_topics, num_topics + 1, step)
    print('Num_topics_range={}'.format(num_topics_range))

    # Contextual string embeddings capture latent syntactic-semantic information that goes
    # beyond standard word embeddings. Key differences: (1) they are trained without any
    # explicit notion of words and thus fundamentally model words as sequences of characters,
    # and (2) they are contextualized by their surrounding text, so the same word has
    # different embeddings depending on its contextual use.
    # initialise embedding classes
    flair_embedding_forward = FlairEmbeddings('news-forward')
    flair_embedding_backward = FlairEmbeddings('news-backward')

    bert_embedding = BertEmbeddings('bert-base-uncased')

    # combine word embedding models
    document_embeddings = DocumentPoolEmbeddings(
        [bert_embedding, flair_embedding_backward, flair_embedding_forward])

    # set up empty tensor
    X = torch.empty(size=(len(df.index), 7168))  #.cuda()
    # fill tensor with embeddings

    #  for text in tqdm(df['resp_whytfa']):    #df['text_cl']):
    #from tqdm import tqdm - show smart progress meter
    i = 0
    for text in df['resp_whytfa']:
        sentence = Sentence(text)
        document_embeddings.embed(sentence)
        embedding = sentence.get_embedding()
        X[i] = embedding
        i += 1

        if (i > 100):  # note: only the first ~100 texts are embedded before breaking
            break

    print("before the PCA")

    #detach the tensor from the GPU and convert it to a NumPy array
    Y = X.cpu().detach().numpy()
    #del(X)
    #torch.cuda.empty_cache()

    # We want to cluster these vectors into topics using Agglomerative Clustering with Ward
    # linkage from scikit-learn. Bottom-up hierarchical clustering has O(n^2) memory
    # complexity, so Principal Component Analysis is applied first to speed up the process.
    # As a side note, several clustering algorithms were tested (K-means, BIRCH, DBSCAN,
    # Agglomerative with complete/average linkage), but Ward performed best in most cases.

    #reduce the dimensionality of our vectors to length 768
    pca = IncrementalPCA(copy=False, n_components=768, batch_size=1000)
    #pca = PCA(n_components=768)
    X_red = pca.fit_transform(Y)  # fit on the detached NumPy copy prepared above

    del (X)
    print("After the fit_transform")

    N_CLUSTERS = 5
    # WARD CLUSTER
    ward = AgglomerativeClustering(n_clusters=N_CLUSTERS,
                                   affinity='euclidean',
                                   linkage='ward')
    pred_ward = ward.fit_predict(X_red)
    print("After fit_predict")

    df['topic'] = pred_ward
    df.to_csv('bert_withtopic.csv')
    print("Write bert_withtopic.csv")

    #get topic composition
    topic_docs = []
    # group text into topic-documents
    for topic in range(N_CLUSTERS):
        topic_docs.append(' '.join(
            df[df['topic'] == topic]['text_cl'].values))
    # apply function
    df_tfidf = get_top_words(topic_docs, 10)
    print(f"Top words: df_tfidf")

    #How good are our topics?
    #We find the centroids of the vectors by averaging them across each topic:
    topic_centroids = []
    for topic in tqdm(range(N_CLUSTERS)):
        X_topic = X_red[df.index[df['topic'] == topic]]
        X_mean = np.mean(X_topic, axis=0)
        topic_centroids.append(X_mean)

    #calculate the euclidean distance of each Tweet vector to their respective topic centroid:
    topic_distances = []
    for row in tqdm(df.index):
        topic_centroid = topic_centroids[df.iloc[row]['topic']]
        X_row = X_red[row]
        topic_distance = euclidean(topic_centroid, X_row)
        topic_distances.append(topic_distance)

    df['topic_distance'] = topic_distances
    #visualise the distribution of distances to the topic centroid
    #The closer the distribution to the left of the graph, the more compact the topic is
    df.to_csv('bert_withtopic_distance.csv')
    print('Write bert_withtopic_distance.csv')

    #topic similarity - how similar the topics are to each other
    #We will construct a euclidean distance matrix between the topic centroids to find the distance between the topic averages
    df_dist_matrix = pd.DataFrame(distance_matrix(topic_centroids,
                                                  topic_centroids),
                                  index=range(N_CLUSTERS),
                                  columns=range(N_CLUSTERS))

    print(f"df_dist_matrix={df_dist_matrix}")
    with open('df_dist_matrix', 'w') as fout:
        fout.write(u'#' + '\t'.join(str(e)
                                    for e in df_dist_matrix.shape) + '\n')
        df_dist_matrix.values.tofile(fout, sep='\t')  # DataFrame has no .tofile(); write the underlying array
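`get_top_words` is called in the pipeline above but not defined in this snippet; a plausible stand-in ranks terms per topic-document with TF-IDF. The name, signature, and return shape below are assumptions, not the original implementation.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd


def get_top_words(topic_docs, n_words=10):
    """Return a DataFrame with the n_words highest-TF-IDF terms per topic-document."""
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf = vectorizer.fit_transform(topic_docs)
    terms = vectorizer.get_feature_names_out()
    rows = []
    for topic_idx in range(tfidf.shape[0]):
        scores = tfidf[topic_idx].toarray().ravel()
        top = scores.argsort()[::-1][:n_words]
        rows.append([terms[i] for i in top])
    return pd.DataFrame(rows, index=range(len(topic_docs)))
```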
class FormFieldSimilarityFinder:
  """
  The purpose of this class is to generate a vector for each form field based on three
  predefined label descriptions and store those vectors in an .npz file
  """
  def __init__(self):
    # Initialize Form fields and their description
    self.name_field = ['Name of a person','A word by which a person is known',"Identity to call a person"]
    self.age_field = ['Age of a person','Number which tells how old a person is','The length of time a person has lived']
    self.address_field = ['Home Address of a person','A place of residence','Place of stay']
    
    # Initialize a dictionary with form fields and their corresponding description lists
    self.form_fields = {'Name':self.name_field, 'Age':self.age_field, 'Address':self.address_field}
    
    # Load all the Pretrained-Models
    self.elmo_embedding = ELMoEmbeddings()
    self.flair_forward_embedding = FlairEmbeddings('multi-forward')
    self.flair_backward_embedding = FlairEmbeddings('multi-backward')
    self.bert_embedding = BertEmbeddings('bert-base-multilingual-uncased')
    
    # Stack all the embeddings using DocumentPoolEmbeddings
    self.stacked_embedding = DocumentPoolEmbeddings(embeddings=[self.elmo_embedding,
                                                       self.flair_forward_embedding,self.flair_backward_embedding,self.bert_embedding])
    # A threshold value, only above which the match is considered
    self.threshold_value = 0.70
    
  def construct_vector(self, original_sentence):
    """
    Given a sentence, construct and return a vector based on the stacked embeddings
    """
    
    sentence = Sentence(original_sentence)
    self.stacked_embedding.embed(sentence)
    sentence_embedding = sentence.get_embedding()
    sentence_embedding_array = sentence_embedding.detach().numpy()

    return sentence_embedding_array
   
  
  def construct_category_vector(self, category_definitions):
    """
    Given a set of category definitions, construct a vector for each using the stacked
    embedding and return the mean of all the vectors
    """
    
    category_vectors = []
    for each in category_definitions:
        sentence_embedding_array = self.construct_vector(each)
        category_vectors.append(sentence_embedding_array)
    # average the per-description vectors once, after the loop
    single_vector = np.mean(category_vectors, axis=0)
    return single_vector

  def store_category_vectors(self):
      """
      Build a vector for each category and store it in an .npz file
      """
      field_vector_dict = {}
      for field, description_list in self.form_fields.items():
        # Get a vector for each category using the stacked embedding
        field_vector = self.construct_category_vector(description_list)
        field_vector_dict[field] = field_vector
      
      np.savez("field_vector.npz",**field_vector_dict)
  
  @staticmethod
  def find_similarity(vector1, vector2, method = "cosine"):
    """
    Find Similarity between two vectors based on the given similarity measure
    """
    sim_score = 0
    if "cosine":
      sim_score = cosine_similarity(vector1, vector2)
    elif "manhattan":
      sim_score =  sum(abs(val1-val2) for val1,val2 in zip(vector1,vector2))

    return sim_score
        
  def find_matching_field(self, user_field):
    """
    Method to find the closest matching field for a given form field
    """
    field_vectors = np.load('field_vector.npz')
    
    user_field_vector = self.construct_vector(user_field)
    similarity_dict = {}
    for field, vector in field_vectors.items():
      similarity_dict[field] = self.find_similarity(vector.reshape(1,-1), user_field_vector.reshape(1,-1))
    
    similarity_dict = {key: value for key, value in similarity_dict.items() if value>self.threshold_value}
    
    if similarity_dict:
      max_pair = max(similarity_dict.items(), key=operator.itemgetter(1))
      confidence = float("{0:.2f}".format(max_pair[1][0][0]))*100
      print(f"Closest Match to the field is '{max_pair[0]}' with confidence: {confidence}%")
      return max_pair
    else:
      print("No Confident Match is found!!!")
      return None
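A short usage sketch for the class above. The ELMo, Flair, and BERT model downloads are large, so this is an illustration rather than a quick test.

```python
finder = FormFieldSimilarityFinder()
finder.store_category_vectors()            # writes field_vector.npz
finder.find_matching_field("Full name")    # prints the closest field and its confidence
```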