def __init__(self, verbose=1):
		# Load the laserembeddings models.
		# If they are missing, download them first.
		try:
			self.laser = Laser()
		except Exception:
			if verbose > 0:
				print("WARNING laserembeddings models missing, downloading ...")
			os.system("python -m laserembeddings download-models")
			self.laser = Laser()

		# load reviews csv
		# if it's missing, we're generating it
		# It is generated by "generate_csv_from_reviews.py", which builds it from the reviews in "sorted_data".
		if not os.path.isfile("labeled_reviews.csv"):
			if verbose > 0:
				print("WARNING csv missing, generating ...")
				start_timer = time.time()
			generate_csv_from_reviews.generate_csv_from_reviews("labeled_reviews.csv")
			if verbose > 0:
				print("time to generate:", round(time.time() - start_timer, 2), "s")

		# load stopwords
		# load the stopword list (one word per line)
		with open("sorted_data/stopwords", "r") as f:
			self.stopwords = f.read().split("\n")
		# drop the trailing empty entry left by the final newline
		self.stopwords.pop(-1)

		self.df_reviews = pd.read_csv("labeled_reviews.csv")

		# initialise model to False so we know it isn't loaded yet
		self.model = False

		if verbose > 0:
			print("SentimentAnalyse ready to use.")
 def __init__(self):
     if Singletons.__instance is not None:
         raise Exception("This singleton is already initialized; use Singletons.get_instance() to access it")
     else:
         logger.info("Initializing Laser embedder")
         self.laser_embedder = Laser()
         Singletons.__instance = self
Example #3
 def post_init(self):
     from laserembeddings import Laser
     self.model = Laser(
         bpe_codes=self._path_to_bpe_codes,
         bpe_vocab=self._path_to_bpe_vocab,
         encoder=self._path_to_encoder,
     )
     self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)
Example #4
  def getSentenceVector(doc, model_params: dict = None, encoder="distilbert", model_name='distilbert-base-nli-mean-tokens'):
    # avoid a mutable default argument
    model_params = model_params or {}
    # split the document into sentences with spaCy
    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
      sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart']:
      # Use encoder for mapping tokens to embeddings
      word_embedding_model = models.Transformer(model_name,
                  tokenizer_args=model_params.get('tokenizer_args', {}))
      # Apply mean pooling to get one fixed sized sentence vector
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                     pooling_mode_mean_tokens=True,
                                     pooling_mode_cls_token=False,
                                     pooling_mode_max_tokens=False)
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
      sentence_embeddings = model.encode(sentences)
    

    elif encoder == 'use':
      #!pip install embedding-as-service
      from embedding_as_service.text.encode import Encoder
      en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
      sentence_embeddings = en.encode(texts=sentences)


    elif encoder == 'infersent':
      import nltk
      nltk.download('punkt')
      from models import InferSent
      params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                      'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
      infersent = InferSent(params_model)
      W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
      infersent.set_w2v_path(W2V_PATH)
      infersent.build_vocab(sentences, tokenize=True)
      sentence_embeddings = infersent.encode(sentences, tokenize=True)


    elif encoder == 'sent2vec':
      import sent2vec
      model = sent2vec.Sent2vecModel()
      model.load_model('drive/My Drive/torontobooks_unigram.bin') 
      sentence_embeddings = model.embed_sentences(sentences)
   

    elif encoder == 'laser':
      from laserembeddings import Laser
      laser = Laser()  ## Also used for multilingual sentence embeddings
      sentence_embeddings = laser.embed_sentences(sentences, lang='en') 
  
  
    else:
      raise ValueError('Invalid or unavailable encoder: {}'.format(encoder))
  
    return list(zip(sentences, sentence_embeddings))
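A minimal usage sketch with the default distilbert encoder (the sentence-transformers model is downloaded on first use):

pairs = getSentenceVector("I love this movie. The plot was engaging.")
for sentence, vector in pairs:
    print(sentence, vector[:3])   # each sentence paired with its embedding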
Example #5
    def __init__(self, mode="train"):
        self.mode = mode

        self.src = None
        self.tgt = None
        self.scores = None

        self.df = None

        self.laser = Laser()
Example #6
def test_laser():
    with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder:
        laser = Laser(
            Laser.DEFAULT_BPE_CODES_FILE,
            None,
            f_encoder,
        )
        assert laser.embed_sentences(
            ['hello world!', 'i hope the tests are passing'],
            lang='en').shape == (2, 1024)
    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("This singleton is already initialized; use Singletons.get_instance() to access it")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            self.cached_lq_dims = {}
            self.cached_intro_dims = {}

            # logger.info("Initializing Roberta embedder")
            # self.robert_embedder = SentenceTransformer(constants.fetch_constant("robeta_path"))
            Singletons.__instance = self
Example #8
def transform_sentences(_sent_map):
    """
    Builds sentence embeddings using the LASER model.

    :param _df: Input data frame with column of sentences.
    :return: Torch matrix of embeddings, size 1024.
    """
    laser = Laser()
    sentences = list(_sent_map.keys())
    _sent_embs = laser.embed_sentences(sentences, lang='en')
    _sent_tensors = [torch.from_numpy(j) for j in _sent_embs]
    return torch.stack(_sent_tensors)
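A minimal usage sketch (hypothetical data): only the keys of the map are used; the values are ignored by transform_sentences.

sent_map = {"hello world": 0, "multilingual embeddings are useful": 1}
embeddings = transform_sentences(sent_map)
print(embeddings.shape)  # torch.Size([2, 1024])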
Example #9
 def post_init(self):
     """
     
     creates Laser object to be used to create the embedding during encode
     """
     try:
         self.laser = Laser(bpe_codes=self._path_to_bpe_codes,
                            bpe_vocab=self._path_to_bpe_vocab,
                            encoder=self._path_to_encoder)
     except Exception as exp:
         self.logger.error(
             f'Got the following exception while instantiating Laser model {exp}'
         )
Example #10
def laser_classifier(x_train, y_train, x_test, y_test):
    laser = Laser()

    # embed each text individually (shape (1, 1024)); flattened to (1024,) below
    train_vectors = [
        laser.embed_sentences([text], lang='ar') for text in x_train
    ]
    test_vectors = [
        laser.embed_sentences([text], lang='ar') for text in x_test
    ]
    train_vectors = [np.concatenate(x) for x in train_vectors]
    test_vectors = [np.concatenate(x) for x in test_vectors]

    classifier = SVC(random_state=0).fit(train_vectors, y_train)
    preds = classifier.predict(test_vectors)

    print(f'Accuracy score: {accuracy_score(preds, y_test).round(2)}')
class Singletons:
    __instance = None
    laser_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization ")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("This singleton is already initialized; use Singletons.get_instance() to access it")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        This method embeds all the sentences passed using Laser embedder
        :param all_sentences:
        :return: list of sentence embeddings
        """
        if self.laser_embedder is not None:
            sentence_embeddings = self.laser_embedder.embed_sentences(all_sentences, ["en"] * len(all_sentences))
            return sentence_embeddings
        else:
            logger.info("The embedder is not set; please restart the service")
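A minimal usage sketch: fetch the shared instance through the static accessor and embed a batch of English sentences.

embedder = Singletons.get_instance()
vectors = embedder.perform_embeddings(["hello world", "how are you?"])
print(vectors.shape)  # (2, 1024)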
    def __init__(self, method: str = 'muse',
                 path_to_model: str = './models/muse/'):

        assert method in self.__valid_methods, \
            f'Expected method aliases: {self.__valid_methods}'

        self.method = method

        if self.method == 'muse':
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'use':
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'laser':
            self.__vectorizer = Laser()
        else:
            self.__vectorizer = None
Example #13
def test_similarity(test_data):
    if not SIMILARITY_TEST:
        pytest.skip("SIMILARITY_TEST not set")

    if not test_data:
        raise FileNotFoundError(
            'laserembeddings-test-data.npz is missing, run "python -m laserembeddings download-test-data" to fix that 🔧'
        )

    report = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'report', 'comparison-with-LASER.md')

    laser = Laser()

    with open(report, 'w', encoding='utf-8') as f_report:

        f_report.write(
            '# Comparison of the embeddings computed with original LASER with the embeddings computed with this package\n'
        )
        f_report.write(
            '| |language|avg. cosine similarity|min. cosine similarity|\n')
        f_report.write(
            '|-|--------|----------------------|----------------------|\n')

        for lang in test_data['langs']:

            if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja', 'el'):
                # language not supported, ignoring
                continue

            sents = test_data[f'{lang}_sentences']
            orig_embeddings = test_data[f'{lang}_embeddings']
            embeddings = laser.embed_sentences(sents, lang)

            assert embeddings.shape == orig_embeddings.shape

            cosine_similarities = np.sum(
                orig_embeddings * embeddings,
                axis=1) / (np.linalg.norm(orig_embeddings, axis=1) *
                           np.linalg.norm(embeddings, axis=1))

            similarity_mean = np.mean(cosine_similarities)
            similarity_min = np.min(cosine_similarities)

            f_report.write(
                f'|{"✅" if similarity_min > 0.99999 else "⚠️" if similarity_mean > 0.99 else "❌"}|{lang}|{similarity_mean:.5f}|{similarity_min:.5f}|\n'
            )
Example #14
class LaserEncoder(BaseTextEncoder):
    def __init__(self,
                 path_to_bpe_codes: str = Laser.DEFAULT_BPE_CODES_FILE,
                 path_to_bpe_vocab: str = Laser.DEFAULT_BPE_VOCAB_FILE,
                 path_to_encoder: str = Laser.DEFAULT_ENCODER_FILE,
                 language: str = 'en',
                 *args,
                 **kwargs):
        """
        
        Encoder for language-agnostic sentence representations (Laser) from Facebook research (https://github.com/facebookresearch/LASER)
        
        :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language to be passed while creating the embedding. Defaults to en.
        """
        if not Path(path_to_bpe_codes).exists():
            self.logger.error(f'bpe code file {path_to_bpe_codes} not found')
        else:
            self._path_to_bpe_codes = path_to_bpe_codes

        if not Path(path_to_bpe_vocab).exists():
            self.logger.error(f'bpe vocab file {path_to_bpe_vocab} not found')
        else:
            self._path_to_bpe_vocab = path_to_bpe_vocab

        if not Path(path_to_encoder).exists():
            self.logger.error(f'encoder file {path_to_encoder} not found')
        else:
            self._path_to_encoder = path_to_encoder

        self.language = language
        super().__init__(*args, **kwargs)

    def post_init(self):
        """
        
        creates Laser object to be used to create the embedding during encode
        """
        try:
            self.laser = Laser(bpe_codes=self._path_to_bpe_codes,
                               bpe_vocab=self._path_to_bpe_vocab,
                               encoder=self._path_to_encoder)
        except Exception as exp:
            self.logger.error(
                f'Got the following exception while instantiating Laser model {exp}'
            )

    @batching
    @as_ndarray
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        
        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D` (D=1024)
        """
        output = self.laser.embed_sentences(sentences=data, lang=self.language)
        return output
Example #15
class LaserEncoder(BaseTorchEncoder):
    """
    Encode an array of strings in size `B` into an ndarray in size `B x D`
    (for LASER, D = 1024).

    :class:`LaserEncoder` is an encoder based on Facebook Research's LASER
    (Language-Agnostic SEntence Representations) to compute multilingual
    sentence embeddings: https://github.com/facebookresearch/LASER
    :param path_to_bpe_codes: path to bpe codes from Laser.
        Defaults to Laser.DEFAULT_BPE_CODES_FILE.
    :param path_to_bpe_vocab: path to bpe vocabs from Laser.
        Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
    :param path_to_encoder: path to the encoder from Laser.
        Defaults to Laser.DEFAULT_ENCODER_FILE.
    :param language: language of the text. Defaults to English ('en').
    :param args:  Additional positional arguments
    :param kwargs: Additional keyword arguments
    """
    def __init__(
        self,
        path_to_bpe_codes: str = None,
        path_to_bpe_vocab: str = None,
        path_to_encoder: str = None,
        language: str = 'en',
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser
        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        """Load LaserEncoder model"""
        from laserembeddings import Laser
        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
         Encode data into an ndarray in size `B x D`.

         B is the `Batch size` and `D` the dimension.

        :param data: a 1d array of string type in size `B`
        :param args:  Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: an ndarray in size `B x D`.
        """
        return self.model.embed_sentences(data, lang=self.language)
Example #16
class LASEREmbedder(Embedder):
    def __init__(self, tokenizer_language):
        super().__init__()
        self.laser = Laser()
        self.tokenizer_language = tokenizer_language

    def embed(self, sentence):
        return self.laser.embed_sentences(sentence, self.tokenizer_language)[0]
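A minimal usage sketch (assuming the Embedder base class needs no constructor arguments):

embedder = LASEREmbedder("en")   # the language is only used for tokenization
vector = embedder.embed("hello world")
print(vector.shape)  # (1024,)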
Example #17
def get_vectors(strings):
    languages = []

    for string in strings:
        languages.append(classify(string)[0])

    corpus = [string.lower() for string in strings]
    corpus = [" ".join(string.splitlines()) for string in corpus]
    corpus = [re.sub(r'\W+', ' ', string) for string in corpus]

    return Laser().embed_sentences(corpus, lang=languages)
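A minimal usage sketch (assumes classify from langid and re are imported, as the snippet requires); the language of each string is detected automatically.

vectors = get_vectors(["hello world", "bonjour tout le monde"])
print(vectors.shape)  # (2, 1024)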
Example #18
class AppConfig(AppConfig):
    name = "semantic_similarity"
    laser = Laser()
    model = hub.load(settings.USE_MODULE_URL)
    df = pd.read_csv(
        os.path.join(settings.ROOT_DIR, settings.SEMANTIC_SIMILARITY_DATA_FN))
    BASE_VECTORS_LOADED = np.load(os.path.join(settings.ROOT_DIR,
                                               settings.BASE_VECTORS_FN),
                                  allow_pickle=True)
    PROCESSED_DATA_LOADED = np.load(os.path.join(settings.ROOT_DIR,
                                                 settings.PROCESSED_DATA_FN),
                                    allow_pickle=True)
Example #19
class LaserVectorizer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.model = Laser(path_to_bpe_codes, path_to_bpe_vocab,
                           path_to_encoder)
        print('Applying Laser Transform')

    def fit(self, X, y=None):  # accept y for scikit-learn pipeline compatibility
        return self

    def transform(self, X):
        x_laser = self.model.embed_sentences(X, lang='en')
        return x_laser
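A minimal sketch of the vectorizer inside a scikit-learn pipeline (assumes the module-level path_to_bpe_codes, path_to_bpe_vocab and path_to_encoder variables are defined, as in the snippet above):

from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

pipe = make_pipeline(LaserVectorizer(), SVC())
pipe.fit(["good movie", "terrible movie"], [1, 0])
print(pipe.predict(["what a great film"]))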
Example #20
class LaserEncoder(BaseTorchEncoder):
    """
    :class:`LaserEncoder` is an encoder based on Facebook Research's LASER (Language-Agnostic SEntence Representations) to compute multilingual sentence embeddings.
    It encodes data from an 1d array of string in size `B` into an ndarray in size `B x D`.
    https://github.com/facebookresearch/LASER
    """

    def __init__(
            self,
            path_to_bpe_codes: str = None,
            path_to_bpe_vocab: str = None,
            path_to_encoder: str = None,
            language: str = 'en',
            *args,
            **kwargs,
    ):
        """
        :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language of the text. Defaults to en.
        :param args: additional positional arguments
        :param kwargs: additional keyword arguments
        """
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser
        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        from laserembeddings import Laser
        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        :param data: a 1d array of string type in size `B`
        :param args:
        :param kwargs:
        :return: an ndarray in size `B x D`
        """
        return self.model.embed_sentences(data, lang=self.language)
class Vectorizer(object):

    """
    Encoding/Vectorization of text wrapper for various models.

    @:param method: str, optional (default: 'muse');
        alias of the encoding/vectorization method to use
        - 'use' - Universal Sentence Encoder
            (https://tfhub.dev/google/universal-sentence-encoder/4)
        - 'muse' - Multilingual Universal Sentence Encoder
            (https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3)
        - 'laser' - Language-Agnostic SEntence Representations
            (https://github.com/facebookresearch/LASER)
    @:param path_to_model: str, optional (default: './models/muse/');
        path to models (not needed for LASER; in case of tf-hub models,
        the parameter may either contain a link or the path to a locally saved
        model)

    """

    __valid_methods = ['muse', 'laser', 'use']

    def __init__(self, method: str = 'muse',
                 path_to_model: str = './models/muse/'):

        assert method in self.__valid_methods, \
            f'Expected method aliases: {self.__valid_methods}'

        self.method = method

        if self.method == 'muse':
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'use':
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'laser':
            self.__vectorizer = Laser()
        else:
            self.__vectorizer = None

    def vectorize(self, docs: List[str], **kwargs) -> List[List[float]]:

        if self.method in {'muse', 'use'}:
            result = self.__vectorizer(docs).numpy().tolist()
        elif self.method == 'laser':
            result = self.__vectorizer.embed_sentences(docs, **kwargs).tolist()
        else:
            raise ValueError(f'Method {self.method} is not available')

        return result
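A minimal usage sketch for the LASER backend (no model path is needed); extra keyword arguments such as lang are forwarded to embed_sentences:

vectorizer = Vectorizer(method='laser')
embeddings = vectorizer.vectorize(['hello world', 'bonjour tout le monde'], lang='en')
print(len(embeddings), len(embeddings[0]))  # 2 1024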
Example #22
def encode_documents_laser(documents, params, tokenizer=None):
    max_input_length = params['max_length']
    laser = Laser()
    output = torch.zeros(size=(len(documents), params['max_sentences_per_doc'],
                               3, 1024),
                         dtype=torch.float)
    for doc_index, tokenized_document in tqdm(enumerate(documents)):
        lang_list = []

        for ele in tokenized_document:
            try:
                lang_list.append(detect(ele))
            except Exception:
                # fall back to English if language detection fails
                lang_list.append('en')

        embeddings = laser.embed_sentences(
            tokenized_document,
            lang=lang_list)  # lang is only used for tokenization
        for seq_index, embed in enumerate(embeddings):
            if (seq_index >= params['max_sentences_per_doc']):
                continue
            output[doc_index][seq_index][0] = torch.FloatTensor(embed)

    return output
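A minimal usage sketch (hypothetical parameters): documents are lists of already-tokenized sentences, and the output keeps at most max_sentences_per_doc embeddings per document.

docs = [["hello world", "how are you?"], ["bonjour tout le monde"]]
params = {'max_length': 128, 'max_sentences_per_doc': 32}
output = encode_documents_laser(docs, params)
print(output.shape)  # torch.Size([2, 32, 3, 1024])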
Example #23
def test_laser():
    with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder:
        laser = Laser(
            Laser.DEFAULT_BPE_CODES_FILE,
            None,
            f_encoder,
        )
        assert laser.embed_sentences(
            ['hello world!', 'i hope the tests are passing'],
            lang='en').shape == (2, 1024)
        assert laser.embed_sentences(['hello world!', "j'aime les pâtes"],
                                     lang=['en', 'fr']).shape == (2, 1024)
        assert laser.embed_sentences('hello world!',
                                     lang='en').shape == (1, 1024)

        with pytest.raises(ValueError):
            laser.embed_sentences(['hello world!', "j'aime les pâtes"],
                                  lang=['en'])
Example #24
def prep_laser(
        en_x: List[str], es_x: List[str], cm_x: List[str],
        test_x: List[str]) -> (np.ndarray, np.ndarray, np.ndarray, np.ndarray):
    """

    Args:
        en_x:
        es_x:
        cm_x:
        test_x:

    Returns:  en_x, es_x, cm_x, test_x

    """
    laser = Laser()
    en_x = get_laser_embeddings(en_x, "en", laser)
    es_x = get_laser_embeddings(es_x, "es", laser)
    cm_x = get_laser_embeddings(cm_x, "en", laser)
    test_x = get_laser_embeddings(test_x, "en", laser)
    return en_x, es_x, cm_x, test_x
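get_laser_embeddings is not shown in this example; a plausible stand-in (an assumption, not the original helper) simply wraps embed_sentences:

def get_laser_embeddings(texts: List[str], lang: str, laser: Laser) -> np.ndarray:
    # returns an (N, 1024) array of LASER sentence embeddings
    return laser.embed_sentences(texts, lang=lang)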
Example #25
def run_laser_sts_experiment(cleaning, batch_size=8, random_seed=777):

    df = concatenate("sts_data")

    list_1 = df['text_a'].tolist()
    list_2 = df['text_b'].tolist()

    list_1_embeddings = []
    list_2_embeddings = []

    laser = Laser()

    if cleaning:
        cleaned_list_1 = [clean_arabic(item) for item in list_1]
        cleaned_list_2 = [clean_arabic(item) for item in list_2]

        for x in tqdm(batch(cleaned_list_1, batch_size)):
            list_1_embeddings.extend(laser.embed_sentences(x, lang='ar'))

        print("Length of the list 1 embeddings {}".format(str(len(list_1_embeddings))))

        for x in tqdm(batch(cleaned_list_2, batch_size)):
            list_2_embeddings.extend(laser.embed_sentences(x, lang='ar'))

        print("Length of the list 2 embeddings {}".format(str(len(list_2_embeddings))))

    else:
        for x in tqdm(batch(list_1, batch_size)):
            list_1_embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list 1 embeddings {}".format(str(len(list_1_embeddings))))

        for x in tqdm(batch(list_2, batch_size)):
            list_2_embeddings.extend(laser.embed_sentences(x, lang='ar'))

        print("Length of the list 2 embeddings {}".format(str(len(list_2_embeddings))))

    predicted_similarities = []
    similarities = df['labels'].tolist()

    for embedding_1, embedding_2 in tqdm(zip(list_1_embeddings, list_2_embeddings)):
        cos_sim = dot(embedding_1, embedding_2) / (norm(embedding_1) * norm(embedding_2))
        predicted_similarities.append(cos_sim)

    print("Pearson Correlation - {}".format(str(pearsonr(similarities, predicted_similarities)[0])))
Example #26
def initialize_laser():
    # make sure the LASER model files are downloaded before instantiating
    os.system("python -m laserembeddings download-models")
    laser = Laser()
    return laser
Example #27
import string
import time
import smbclient
from environment import MODE
from whatlangid import WhatLangId
from sentence_transformers import SentenceTransformer

if MODE == 'local':
    from .local_constants import *
else:
    from .dev_constants import *

from .models import *

# Initialize Laser
laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)

# Initialize Labse embedding
labse_model = SentenceTransformer(labse_location)

# Initialize language detector
language_model = WhatLangId(custom_model=whatlangid_model)
lang_detect = language_model.predict_lang
# lang_detect = classify


class base:
    def __init__(self, mode=None, model=None):
        self.mode = mode
        self.model = model
from elasticsearch import Elasticsearch
from bert_serving.client import BertClient
import json
from laserembeddings import Laser
import sys

__author__ = "Bijin Benny"
__email__ = "*****@*****.**"
__license__ = "MIT"
__version__ = "1.0"

LASER = 'laser_vector'
BERT = 'bert_vector'

laser = Laser()

#Elasticsearch DB client
es = Elasticsearch(hosts="http://*****:*****@localhost:9200/")

#Client connection to local BERT server
bc = BertClient(ip='localhost', output_fmt='list')
"""
doVectorize() pulls entries from the database and maps the text sequences into the
vector space using either LASER or BERT, based on the input parameter.
BERT produces a 768-dimensional vector while LASER outputs a 1024-dimensional vector.
Argument : vector_type (String) --> bert_vector or laser_vector
"""


def doVectorize(vector_type):
    """
Example #29
class FeatureExtractor:
    def __init__(self, mode="train"):
        self.mode = mode

        self.src = None
        self.tgt = None
        self.scores = None

        self.df = None

        self.laser = Laser()

    def load_data(self):
        # Base df with three columns
        path = f"en-de/{self.mode}.ende"
        src = pd.read_csv(
            f"{path}.src",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )
        target = pd.read_csv(
            f"{path}.mt",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )

        df = src.rename(columns={0: "src"})

        if self.mode != "test":
            scores = pd.read_csv(
                f"{path}.scores",
                sep="\n",
                error_bad_lines=False,
                quoting=csv.QUOTE_NONE,
                header=None,
            )
            df["scores"] = scores
        else:
            df["scores"] = [0 for _ in range(len(target))
                            ]  # just placeholder, not used for test
        df["tgt"] = target
        setattr(self, "df", df)
        return df

    def laser_embeddings(self):
        """Extract laser embeddings and reshape appropriately."""
        src = self.laser.embed_sentences(self.df["src"].tolist(),
                                         lang="en")  # (N, 1024)
        tgt = self.laser.embed_sentences(self.df["tgt"].tolist(),
                                         lang="de")  # (N, 1024)
        res = np.zeros((src.shape[0], 2, 1024))  # (N, 2, 1024) ndarray
        res[:, 0, :] = src
        res[:, 1, :] = tgt

        # Scale the embeddings to [0, 1]; MinMaxScaler expects 2D input,
        # so flatten to (N, 2048), scale, and reshape back
        n = res.shape[0]
        res = MinMaxScaler().fit_transform(res.reshape(n, -1)).reshape(n, 2, 1024)

        return res

    def features(self):
        """Extract baseline features"""
        sp_en = spacy.load("en")
        sp_de = spacy.load("de")
        en_checker = language_check.LanguageTool("en-GB")
        ge_checker = language_check.LanguageTool("de-DE")

        ft = self.df.copy()
        # Sentences without punctuation
        ft[["src_p", "tgt_p"]] = ft[["src", "tgt"]].applymap(lambda x: x.lower(
        ).translate(str.maketrans("", "", string.punctuation)))
        # Number of tokens
        ft["src_len"] = ft["src_p"].apply(lambda x: len(x.split(" ")))
        ft["tgt_len"] = ft["tgt_p"].apply(lambda x: len(x.split(" ")))
        count = lambda l1, l2: sum([1 for x in l1 if x in l2])
        # Number of non alphanumeric characters
        ft["src_#punc"] = ft["src"].apply(
            lambda x: count(x, set(string.punctuation)))
        ft["tgt_#punc"] = ft["tgt"].apply(
            lambda x: count(x, set(string.punctuation)))
        # Sentiment analysis (TBD/TBE are presumably TextBlob analyzers for the
        # German target and English source; their imports are not shown here)
        ft["tgt_polar"] = ft["tgt"].apply(lambda x: TBD(x).sentiment.polarity)
        ft["src_polar"] = ft["src"].apply(lambda x: TBE(x).sentiment.polarity)
        ft["polar_ftf"] = (ft["tgt_polar"] - ft["src_polar"]).abs()
        # Spacy encoding
        ft["src_sp"] = ft["src"].apply(lambda x: sp_en(x))
        ft["tgt_sp"] = ft["tgt"].apply(lambda x: sp_de(x))
        # POS / entity differences between the spaCy parses
        ft["sp_pos_diff"] = [
            spacy_parser(x, y, "pos_")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["sp_ent_diff"] = [
            spacy_parser(x, y, "ents")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        # Proofread (grammar) errors
        ft["src_gram_err"] = ft["src"].apply(
            lambda x: len(en_checker.check(x)))
        ft["tgt_gram_err"] = ft["tgt"].apply(
            lambda x: len(ge_checker.check(x)))
        # Features of interest
        foi = [
            "src_len",
            "tgt_len",
            "src_#punc",
            "tgt_#punc",
            "tgt_polar",
            "src_polar",
            "src_gram_err",
            "tgt_gram_err",
            "sp_pos_diff",
            "sp_ent_diff",
        ]

        features = ft[foi].values
        normalized_features = MinMaxScaler().fit_transform(features)

        return normalized_features

    def run(self):
        """Run feature extraction pipeline."""
        print("Loading data")
        self.load_data()
        print("Extracting Laser Embeddings")
        laser_embeds = self.laser_embeddings()
        print(f"Laser features extracted, shape: {laser_embeds.shape}")
        print("Extracting NLP features")
        features = self.features()
        print(f"NLP features extracted, shape: {features.shape}")
        res = namedtuple("res", ["lsr", "feats", "scores"])(
            lsr=laser_embeds, feats=features, scores=self.df["scores"].values)
        return res
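A minimal usage sketch (assumes the en-de data files referenced in load_data exist):

extractor = FeatureExtractor(mode="train")
res = extractor.run()
print(res.lsr.shape, res.feats.shape, res.scores.shape)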
Example #30
from laserembeddings import Laser

laser = Laser()

# if all sentences are in the same language:

# embeddings = laser.embed_sentences(
#     ['let your neural network be polyglot',
#      'use multilingual embeddings!'],
#     lang='en')  # lang is only used for tokenization
#
# print ('')

from problem_util_yr.loadDict.read_json_tool import read_json
gene = read_json('./title_key_5w.json')
ll = []
ii = 0
allpkl = []
for d in gene:
    ii += 1
    ll.append(' '.join(d['title']))
    # embed in batches of 10 titles
    if len(ll) == 10:
        embeddings = laser.embed_sentences(ll, lang='en')
        allpkl.append([ll, embeddings])
        ll = []
    if ii > 10000:
        break
import pandas as pdd