Example #1
def test_laser():
    with open(Laser.DEFAULT_ENCODER_FILE, 'rb') as f_encoder:
        laser = Laser(
            Laser.DEFAULT_BPE_CODES_FILE,
            None,
            f_encoder,
        )
        assert laser.embed_sentences(
            ['hello world!', 'i hope the tests are passing'],
            lang='en').shape == (2, 1024)
        assert laser.embed_sentences(['hello world!', "j'aime les pâtes"],
                                     lang=['en', 'fr']).shape == (2, 1024)
        assert laser.embed_sentences('hello world!',
                                     lang='en').shape == (1, 1024)
Example #2
def laser_classifier(x_train, y_train, x_test, y_test):
    laser = Laser()

    train_vectors = [
        laser.embed_sentences([text], lang='ar') for text in x_train
    ]
    test_vectors = [
        laser.embed_sentences([text], lang='ar') for text in x_test
    ]
    train_vectors = [np.concatenate(x) for x in train_vectors]
    test_vectors = [np.concatenate(x) for x in test_vectors]

    classifier = SVC(random_state=0).fit(train_vectors, y_train)
    preds = classifier.predict(test_vectors)

    print(f'Accuracy score: {accuracy_score(preds, y_test).round(2)}')
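A minimal usage sketch for laser_classifier above; the imports mirror what the snippet assumes, and the Arabic sentences and labels here are made up for illustration.

import numpy as np
from laserembeddings import Laser
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# tiny, hypothetical Arabic sentiment sample
x_train = ['أحب هذا الكتاب', 'الخدمة سيئة جدا', 'تجربة رائعة', 'لن أشتري مرة أخرى']
y_train = [1, 0, 1, 0]
x_test = ['الفيلم رائع', 'خدمة بطيئة']
y_test = [1, 0]

laser_classifier(x_train, y_train, x_test, y_test)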
Example #3
class Singletons:
    __instance = None
    laser_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization ")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("The singleton is already initialized you are attempting to initialize it again get lost")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        This method embeds all the sentences passed using Laser embedder
        :param all_sentences:
        :return: list of sentence embeddings
        """
        if self.laser_embedder is not None:
            sentence_embeddings = self.laser_embedder.embed_sentences(all_sentences, ["en"] * len(all_sentences))
            return sentence_embeddings
        else:
            logger.info("the embedder is not set please restart the service")
Example #4
class LaserEncoder(BaseTorchEncoder):
    """
    Encode an array of string in size `B` into an ndarray in size `B x D`

    The ndarray potentially is BatchSize x (Channel x Height x Width)

    :class:`LaserEncoder` is a encoder based on Facebook Research's LASER
    (Language-Agnostic SEntence Representations) to compute multilingual
    sentence embeddings: https://github.com/facebookresearch/LASER
    :param path_to_bpe_codes: path to bpe codes from Laser.
        Defaults to Laser.DEFAULT_BPE_CODES_FILE.
    :param path_to_bpe_vocab: path to bpe vocabs from Laser.
        Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
    :param path_to_encoder: path to the encoder from Laser.
        Defaults to Laser.DEFAULT_ENCODER_FILE.
    :param language: language of the text. Defaults to english(en).
    :param args:  Additional positional arguments
    :param kwargs: Additional keyword arguments
    """
    def __init__(
        self,
        path_to_bpe_codes: str = None,
        path_to_bpe_vocab: str = None,
        path_to_encoder: str = None,
        language: str = 'en',
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser
        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        """Load LaserEncoder model"""
        from laserembeddings import Laser
        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        Encode data into an ndarray of size `B x D`.

        `B` is the batch size and `D` the embedding dimension.

        :param data: a 1d array of string type in size `B`
        :param args:  Additional positional arguments
        :param kwargs: Additional keyword arguments
        :return: an ndarray in size `B x D`.
        """
        return self.model.embed_sentences(data, lang=self.language)
Example #5
class LaserEncoder(BaseTextEncoder):
    def __init__(self,
                 path_to_bpe_codes: str = Laser.DEFAULT_BPE_CODES_FILE,
                 path_to_bpe_vocab: str = Laser.DEFAULT_BPE_VOCAB_FILE,
                 path_to_encoder: str = Laser.DEFAULT_ENCODER_FILE,
                 language: str = 'en',
                 *args,
                 **kwargs):
        """
        
        Encoder for language-agnostic sentence representations (Laser) from Facebook research (https://github.com/facebookresearch/LASER)
        
        :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language to be passed whie creating the embedding. Defaults to en.
        """
        if not Path(path_to_bpe_codes):
            self.logger.error(f'bpe code file {path_to_bpe_codes} not found')
        else:
            self._path_to_bpe_codes = path_to_bpe_codes

        if not Path(path_to_bpe_vocab):
            self.logger.error(f'bpe vocab file {path_to_bpe_vocab} not found')
        else:
            self._path_to_bpe_vocab = path_to_bpe_vocab

        if not Path(path_to_encoder):
            self._logger.error(f'encode file {path_to_encoder} not found')
        else:
            self._path_to_encoder = path_to_encoder

        self.language = language
        super().__init__(*args, **kwargs)

    def post_init(self):
        """
        
        creates Laser object to be used to create the embedding during encode
        """
        try:
            self.laser = Laser(bpe_codes=self._path_to_bpe_codes,
                               bpe_vocab=self._path_to_bpe_vocab,
                               encoder=self._path_to_encoder)
        except Exception as exp:
            self.logger.error(
                f'Got the following exception while instantiating Laser model {exp}'
            )

    @batching
    @as_ndarray
    def encode(self, data: 'np.ndarray', *args, **kwargs) -> 'np.ndarray':
        """
        
        :param data: a 1d array of string type in size `B`
        :return: an ndarray in size `B x D` (D=1024)
        """
        output = self.laser.embed_sentences(sentences=data, lang=self.language)
        return output
Example #6
class LASEREmbedder(Embedder):
    def __init__(self, tokenizer_language):
        super().__init__()
        self.laser = Laser()
        self.tokenizer_language = tokenizer_language

    def embed(self, sentence):
        return self.laser.embed_sentences(sentence, self.tokenizer_language)[0]
Example #7
def getSentenceVector(doc, model_params: dict = {}, encoder="distilbert", model_name='distilbert-base-nli-mean-tokens'):

    sp = spacy.load('en_core_web_sm')
    tokenized = sp(doc)
    sentences = []
    for token in tokenized.sents:
      sentences.append(token.text)

    if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart']:
      # Use encoder for mapping tokens to embeddings
      word_embedding_model = models.Transformer(model_name, 
                  tokenizer_args= model_params['tokenizer_args'] if 'tokenizer_args' in model_params else {})
      # Apply mean pooling to get one fixed sized sentence vector
      pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                     pooling_mode_mean_tokens=True,
                                     pooling_mode_cls_token=False,
                                     pooling_mode_max_tokens=False)
      model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
      sentence_embeddings = model.encode(sentences)
    

    elif encoder == 'use':
      #!pip install embedding-as-service
      from embedding_as_service.text.encode import Encoder
      en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
      sentence_embeddings = en.encode(texts=sentences)


    elif encoder == 'infersent':
      import nltk
      nltk.download('punkt')
      from models import InferSent
      params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                      'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
      infersent = InferSent(params_model)
      W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
      infersent.set_w2v_path(W2V_PATH)
      infersent.build_vocab(sentences, tokenize=True)
      sentence_embeddings = infersent.encode(sentences, tokenize=True)


    elif encoder == 'sent2vec':
      import sent2vec
      model = sent2vec.Sent2vecModel()
      model.load_model('drive/My Drive/torontobooks_unigram.bin') 
      sentence_embeddings = model.embed_sentences(sentences)
   

    elif encoder == 'laser':
      from laserembeddings import Laser
      laser = Laser()  ## Also used for multilingual sentence embeddings
      sentence_embeddings = laser.embed_sentences(sentences, lang='en') 
  
  
    else:
      raise ValueError('Invalid encoder {} or encoder Unavailable.'.format(encoder))  
  
    return list(zip(sentences, sentence_embeddings))
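A brief usage sketch for getSentenceVector above; the document text is invented, and the 'laser' branch assumes the laserembeddings models have already been downloaded (python -m laserembeddings download-models).

doc = "LASER maps sentences into a shared embedding space. It covers more than ninety languages."
pairs = getSentenceVector(doc, encoder='laser')
for sentence, vector in pairs:
    print(sentence, vector.shape)  # each vector is a 1024-dimensional NumPy array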
Example #8
def run_laser_sts_experiment(cleaning, batch_size=8, random_seed=777):

    df = concatenate("sts_data")

    list_1 = df['text_a'].tolist()
    list_2 = df['text_b'].tolist()

    list_1_embeddings = []
    list_2_embeddings = []

    laser = Laser()

    if cleaning:
        cleaned_list_1 = [clean_arabic(item) for item in list_1]
        cleaned_list_2 = [clean_arabic(item) for item in list_2]

        for x in tqdm(batch(cleaned_list_1, batch_size)):
            list_1_embeddings.extend(laser.embed_sentences(x, lang='ar'))

        print("Length of the list 1 embeddings {}".format(str(len(list_1_embeddings))))

        for x in tqdm(batch(cleaned_list_2, batch_size)):
            list_2_embeddings.extend(laser.embed_sentences(x, lang='ar'))

        print("Length of the list 2 embeddings {}".format(str(len(list_2_embeddings))))

    else:
        for x in tqdm(batch(list_1, batch_size)):
            list_1_embeddings.extend(laser.embed_sentences(x, lang='ar'))
        print("Length of the list 1 embeddings {}".format(str(len(list_1_embeddings))))

        for x in tqdm(batch(list_2, batch_size)):
            list_2_embeddings.extend(laser.embed_sentences(x, lang='ar'))

        print("Length of the list 2 embeddings {}".format(str(len(list_2_embeddings))))

    predicted_similarities = []
    similarities = df['labels'].tolist()

    for embedding_1, embedding_2 in tqdm(zip(list_1_embeddings, list_2_embeddings)):
        cos_sim = dot(embedding_1, embedding_2) / (norm(embedding_1) * norm(embedding_2))
        predicted_similarities.append(cos_sim)

    print("Pearson correlation - {}".format(str(pearsonr(similarities, predicted_similarities)[0])))
Example #9
class LaserVectorizer(TransformerMixin, BaseEstimator):
    def __init__(self):
        self.model = Laser(path_to_bpe_codes, path_to_bpe_vocab,
                           path_to_encoder)
        print('Applying Laser Transform')

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        x_laser = self.model.embed_sentences(X, lang='en')
        return x_laser
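One possible way to use the vectorizer above inside a scikit-learn pipeline, assuming the module-level path_to_* variables point at downloaded LASER files; the sample texts and labels are invented.

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

clf = Pipeline([
    ('laser', LaserVectorizer()),              # text -> 1024-dim LASER embeddings
    ('logreg', LogisticRegression(max_iter=1000)),
])
clf.fit(['great product', 'terrible service'], [1, 0])
print(clf.predict(['really enjoyed it']))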
Example #10
def transform_sentences(_sent_map):
    """
    Builds sentence embeddings using the LASER model.

    :param _df: Input data frame with column of sentences.
    :return: Torch matrix of embeddings, size 1024.
    """
    laser = Laser()
    sentences = list(_sent_map.keys())
    _sent_embs = laser.embed_sentences(sentences, lang='en')
    _sent_tensors = [torch.from_numpy(j) for j in _sent_embs]
    return torch.stack(_sent_tensors)
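A small usage sketch for transform_sentences above, assuming torch and Laser are imported as in the snippet; the sentence-to-label mapping is invented, and only its keys are embedded.

sent_map = {'the cat sat on the mat': 0, 'stocks fell sharply today': 1}
embeddings = transform_sentences(sent_map)
print(embeddings.shape)  # expected: torch.Size([2, 1024])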
Example #11
class LaserEncoder(BaseTorchEncoder):
    """
    :class:`LaserEncoder` is a encoder based on Facebook Research's LASER (Language-Agnostic SEntence Representations) to compute multilingual sentence embeddings.
    It encodes data from an 1d array of string in size `B` into an ndarray in size `B x D`.
    https://github.com/facebookresearch/LASER
    """

    def __init__(
            self,
            path_to_bpe_codes: str = None,
            path_to_bpe_vocab: str = None,
            path_to_encoder: str = None,
            language: str = 'en',
            *args,
            **kwargs,
    ):
        """
        :param path_to_bpe_codes: path to bpe codes from Laser. Defaults to Laser.DEFAULT_BPE_CODES_FILE.
        :param path_to_bpe_vocab: path to bpe vocabs from Laser. Defaults to Laser.DEFAULT_BPE_VOCAB_FILE.
        :param path_to_encoder: path to the encoder from Laser. Defaults to Laser.DEFAULT_ENCODER_FILE.
        :param language: language of the text. Defaults to en.
        :param args:
        :param kwargs:
        """
        super().__init__(*args, **kwargs)
        from laserembeddings import Laser
        self._path_to_bpe_codes = path_to_bpe_codes or Laser.DEFAULT_BPE_CODES_FILE
        self._path_to_bpe_vocab = path_to_bpe_vocab or Laser.DEFAULT_BPE_VOCAB_FILE
        self._path_to_encoder = path_to_encoder or Laser.DEFAULT_ENCODER_FILE
        self.language = language.lower()

    def post_init(self):
        from laserembeddings import Laser
        self.model = Laser(
            bpe_codes=self._path_to_bpe_codes,
            bpe_vocab=self._path_to_bpe_vocab,
            encoder=self._path_to_encoder,
        )
        self.to_device(self.model.bpeSentenceEmbedding.encoder.encoder)

    @batching
    @as_ndarray
    def encode(self, data: "np.ndarray", *args, **kwargs) -> "np.ndarray":
        """
        :param data: a 1d array of string type in size `B`
        :param args:
        :param kwargs:
        :return: an ndarray in size `B x D`
        """
        return self.model.embed_sentences(data, lang=self.language)
Example #12
class Vectorizer(object):

    """
    Encoding/Vectorization of text wrapper for various models.

    @:param method: str, optional (default: 'muse');
        alias of the encoding/vectorization method to use
        - 'use' - Universal Sentence Encoder
            (https://tfhub.dev/google/universal-sentence-encoder/4)
        - 'muse' - Multilingual Universal Sentence Encoder
            (https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3)
        - 'laser' - Language-Agnostic SEntence Representations
            (https://github.com/facebookresearch/LASER)
    @:param path_to_model: str, optional (default: './models/muse/');
        path to models (not needed for LASER; in case of tf-hub models,
        the parameter may either contain a link or the path to a locally saved
        model)

    """

    __valid_methods = ['muse', 'laser', 'use']

    def __init__(self, method: str = 'muse',
                 path_to_model: str = './models/muse/'):

        assert method in self.__valid_methods, \
            f'Expected method aliases: {self.__valid_methods}'

        self.method = method

        if self.method == 'muse':
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'use':
            self.__vectorizer = hub.load(path_to_model)
        elif self.method == 'laser':
            self.__vectorizer = Laser()
        else:
            self.__vectorizer = None

    def vectorize(self, docs: List[str], **kwargs) -> List[List[float]]:

        if self.method in {'muse', 'use'}:
            result = self.__vectorizer(docs).numpy().tolist()
        elif self.method == 'laser':
            result = self.__vectorizer.embed_sentences(docs, **kwargs).tolist()
        else:
            raise ValueError(f'Method {self.method} is not available')

        return result
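A minimal usage sketch for the Vectorizer wrapper above; the LASER branch forwards keyword arguments to Laser.embed_sentences, so lang is passed through, while 'muse'/'use' would additionally need a tf-hub model path.

vec = Vectorizer(method='laser')
vectors = vec.vectorize(['hello world', 'guten Morgen'], lang=['en', 'de'])
print(len(vectors), len(vectors[0]))  # 2 sentences, 1024 dimensions each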
Example #13
def test_similarity(test_data):
    if not SIMILARITY_TEST:
        pytest.skip("SIMILARITY_TEST not set")

    if not test_data:
        raise FileNotFoundError(
            'laserembeddings-test-data.npz is missing, run "python -m laserembeddings download-test-data" to fix that 🔧'
        )

    report = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          'report', 'comparison-with-LASER.md')

    laser = Laser()

    with open(report, 'w', encoding='utf-8') as f_report:

        f_report.write(
            '# Comparison of the embeddings computed with original LASER with the embeddings computed with this package\n'
        )
        f_report.write(
            '| |language|avg. cosine similarity|min. cosine similarity|\n')
        f_report.write(
            '|-|--------|----------------------|----------------------|\n')

        for lang in test_data['langs']:

            if lang in ('cmn', 'wuu', 'yue', 'zh', 'jpn', 'ja', 'el'):
                # language not supported, ignoring
                continue

            sents = test_data[f'{lang}_sentences']
            orig_embeddings = test_data[f'{lang}_embeddings']
            embeddings = laser.embed_sentences(sents, lang)

            assert embeddings.shape == orig_embeddings.shape

            cosine_similarities = np.sum(
                orig_embeddings * embeddings,
                axis=1) / (np.linalg.norm(orig_embeddings, axis=1) *
                           np.linalg.norm(embeddings, axis=1))

            similarity_mean = np.mean(cosine_similarities)
            similarity_min = np.min(cosine_similarities)

            f_report.write(
                f'|{"✅" if similarity_min > 0.99999 else "⚠️" if similarity_mean > 0.99 else "❌"}|{lang}|{similarity_mean:.5f}|{similarity_min:.5f}|\n'
            )
Example #14
def encode_documents_laser(documents, params, tokenizer=None):
    max_input_length = params['max_length']
    laser = Laser()
    output = torch.zeros(size=(len(documents), params['max_sentences_per_doc'],
                               3, 1024),
                         dtype=torch.float)
    for doc_index, tokenized_document in tqdm(enumerate(documents)):
        lang_list = []

        for ele in tokenized_document:
            try:
                lang_list.append(detect(ele))
            except Exception:
                # langdetect can fail on very short or empty strings; fall back to English
                lang_list.append('en')

        embeddings = laser.embed_sentences(
            tokenized_document,
            lang=lang_list)  # lang is only used for tokenization
        for seq_index, embed in enumerate(embeddings):
            if (seq_index >= params['max_sentences_per_doc']):
                continue
            output[doc_index][seq_index][0] = torch.FloatTensor(embed)

    return output
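A sketch of how encode_documents_laser above might be called; the params keys are inferred from the lookups in the body, documents is a list of pre-split documents (each a list of sentences), and the values here are illustrative.

params = {'max_length': 128, 'max_sentences_per_doc': 32}
documents = [
    ['hello world', "j'aime les pâtes"],
    ['stocks fell sharply today'],
]
encoded = encode_documents_laser(documents, params)
print(encoded.shape)  # expected: torch.Size([2, 32, 3, 1024])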
Example #15
from numpy import dot
from numpy.linalg import norm
from laserembeddings import Laser

def cos_sim(a,b):
    cos_sim = dot(a, b)/(norm(a)*norm(b))
    return cos_sim

path_to_bpe_codes = '/home/darth.vader/laser/93langs.fcodes'
path_to_bpe_vocab = '/home/darth.vader/laser/93langs.fvocab'
path_to_encoder = '/home/darth.vader/laser/bilstm.93langs.2018-12-26.pt'

laser = Laser(path_to_bpe_codes, path_to_bpe_vocab, path_to_encoder)


emb = laser.embed_sentences(
    ['how are you'],
    lang='en') 

te_emb = laser.embed_sentences(
    ['क्या हाल है'],
    lang='hi')  

sim = cos_sim(emb[0],te_emb[0])
print(sim)


import time
import numpy as np
import langid
from ilmulti.segment import SimpleSegmenter, Segmenter
from ilmulti.sentencepiece import SentencePieceTokenizer
Example #16
class FeatureExtractor:
    def __init__(self, mode="train"):
        self.mode = mode

        self.src = None
        self.tgt = None
        self.scores = None

        self.df = None

        self.laser = Laser()

    def load_data(self):
        # Base df with three columns
        path = f"en-de/{self.mode}.ende"
        src = pd.read_csv(
            f"{path}.src",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )
        target = pd.read_csv(
            f"{path}.mt",
            sep="\n",
            error_bad_lines=False,
            quoting=csv.QUOTE_NONE,
            header=None,
        )

        df = src.rename(columns={0: "src"})

        if self.mode != "test":
            scores = pd.read_csv(
                f"{path}.scores",
                sep="\n",
                error_bad_lines=False,
                quoting=csv.QUOTE_NONE,
                header=None,
            )
            df["scores"] = scores
        else:
            df["scores"] = [0 for _ in range(len(target))
                            ]  # just placeholder, not used for test
        df["tgt"] = target
        setattr(self, "df", df)
        return df

    def laser_embeddings(self):
        """Extract laser embeddings and reshape appropriately."""
        src = self.laser.embed_sentences(self.df["src"].tolist(),
                                         lang="en")  # (N, 1024)
        tgt = self.laser.embed_sentences(self.df["tgt"].tolist(),
                                         lang="de")  # (N, 1024)
        res = np.zeros((src.shape[0], 2, 1024))  # (N, 2, 1024) ndarray
        res[:, 0, :] = src
        res[:, 1, :] = tgt

        # MinMaxScaler only accepts 2D input, so flatten the (N, 2, 1024) array,
        # scale it, and restore the original shape
        n_samples = res.shape[0]
        res = MinMaxScaler().fit_transform(res.reshape(n_samples, -1)).reshape(n_samples, 2, 1024)

        return res

    def features(self):
        """Extract baseline features"""
        sp_en = spacy.load("en")
        sp_de = spacy.load("de")
        en_checker = language_check.LanguageTool("en-GB")
        ge_checker = language_check.LanguageTool("de-DE")

        ft = self.df.copy()
        # Sentences without punctuation
        ft[["src_p", "tgt_p"]] = ft[["src", "tgt"]].applymap(lambda x: x.lower(
        ).translate(str.maketrans("", "", string.punctuation)))
        # Number of tokens
        ft["src_len"] = ft["src_p"].apply(lambda x: len(x.split(" ")))
        ft["tgt_len"] = ft["tgt_p"].apply(lambda x: len(x.split(" ")))
        count = lambda l1, l2: sum([1 for x in l1 if x in l2])
        # Number of non alphanumeric characters
        ft["src_#punc"] = ft["src"].apply(
            lambda x: count(x, set(string.punctuation)))
        ft["tgt_#punc"] = ft["tgt"].apply(
            lambda x: count(x, set(string.punctuation)))
        # Sentiment analysis
        ft["tgt_polar"] = ft["tgt"].apply(lambda x: TBD(x).sentiment.polarity)
        ft["src_polar"] = ft["src"].apply(lambda x: TBE(x).sentiment.polarity)
        ft["polar_ftf"] = (ft["tgt_polar"] - ft["src_polar"]).abs()
        # Spacy encoding
        ft["src_sp"] = ft["src"].apply(lambda x: sp_en(x))
        ft["tgt_sp"] = ft["tgt"].apply(lambda x: sp_de(x))
        # Proofread errors
        ft["sp_pos_diff"] = [
            spacy_parser(x, y, "pos_")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["sp_ent_diff"] = [
            spacy_parser(x, y, "ents")
            for x, y in zip(ft["src_sp"], ft["tgt_sp"])
        ]
        ft["src_gram_err"] = ft["src"].apply(
            lambda x: len(en_checker.check(x)))
        ft["tgt_gram_err"] = ft["tgt"].apply(
            lambda x: len(ge_checker.check(x)))
        # Features of interest
        foi = [
            "src_len",
            "tgt_len",
            "src_#punc",
            "tgt_#punc",
            "tgt_polar",
            "src_polar",
            "src_gram_err",
            "tgt_gram_err",
            "sp_pos_diff",
            "sp_ent_diff",
        ]  # Features of interest

        features = ft[foi].values
        normalized_features = MinMaxScaler().fit_transform(features)

        return normalized_features

    def run(self):
        """Run feature extraction pipeline."""
        print("Loading data")
        self.load_data()
        print("Extracting Laser Embeddings")
        laser_embeds = self.laser_embeddings()
        print(f"Laser features extracted, shape: {laser_embeds.shape}")
        print("Extracting NLP features")
        features = self.features()
        print(f"NLP features extracted, shape: {features.shape}")
        res = namedtuple("res", ["lsr", "feats", "scores"])(
            lsr=laser_embeds, feats=features, scores=self.df["scores"].values)
        return res
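A compact usage sketch for the feature extraction pipeline above, assuming the en-de/*.src, *.mt and *.scores files as well as the spacy and language_check models are available locally.

extractor = FeatureExtractor(mode="train")
result = extractor.run()
print(result.lsr.shape, result.feats.shape, result.scores.shape)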
Example #17
class SentimentAnalyse(object):
	"""
	SentimentAnalyse generates a model to do sentiment analysis on sentences.
	laserembeddings is used for the embeddings.
	keras is used to build the model.

	methods:
		generate_model -> train a model from reviews and save it
		load_model -> load a trained model
		model_predict -> take a list of sentences and use the model to predict sentiment
	"""

	def __init__(self, verbose=1):
		# load laserembeddings models
		# if they're missing, we're downloading them
		try:
			self.laser = Laser()
		except Exception:
			if verbose > 0:
				print("WARNING laserembeddings models missing, downloading ...")
			os.system("python -m laserembeddings download-models")
			self.laser = Laser()

		# load reviews csv
		# if it's missing, we're generating it
		# it is generated with "generate_csv_from_reviews.py" file, who is based on reviews in "sorted_data"
		if not os.path.isfile("labeled_reviews.csv"):
			if verbose > 0:
				print("WARNING csv missing, generating ...")
				start_timer = time.time()
			generate_csv_from_reviews.generate_csv_from_reviews("labeled_reviews.csv")
			if verbose > 0:
				print("time to generate:", round(time.time() - start_timer, 2), "s")

		# load stopwords
		f = open("sorted_data/stopwords", "r")
		self.stopwords = f.read().split("\n")
		self.stopwords.pop(-1)

		self.df_reviews = pd.read_csv("labeled_reviews.csv")

		# initialise model as False so we know it isn't loaded yet
		self.model = False

		if verbose > 0:
			print("SentimentAnalyse ready to use.")


	def _train_model(self, model, X, Y, path_model_save="model.h5", verbose=2):
		"""
		params:
			model : keras model -> model to train
			X : list of embedded sentence -> input of model (X)
			Y : list of int, sentiments of X -> output of model (y hat)
			path_model_save : str -> path to save the model
			verbose : int -> show progress if verbose > 0

		return:
			model : keras model -> the trained model
		"""

		X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2)
			
		# compile model
		model.compile(
			loss      = tf.keras.losses.BinaryCrossentropy(from_logits=True),
			optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5),
			metrics   = [ tf.keras.metrics.BinaryAccuracy() ],
		)
		
		# train model
		model.fit(
			X_train, Y_train, 
			batch_size = 32, 
			epochs     = 1000, 
			validation_split = 0.2,
			callbacks = [
				tf.keras.callbacks.EarlyStopping(monitor='val_loss', mode='min', verbose=0, patience=10),
				tf.keras.callbacks.ModelCheckpoint(path_model_save,  monitor='binary_accuracy', mode='max', verbose=0, save_best_only=True)
			],
			verbose=verbose
		)

		# show accuracy
		if verbose > 0:
			print()
			print('Train Accuracy')
			model.evaluate(X_train, Y_train)
			print('Test Accuracy')
			model.evaluate(X_test, Y_test)

		return model


	def preprocess_text_list(self, text_list, lang="en"):
		"""
		return preprocessed text_list
		"""
		# for i in tqdm(range(len(text_list))):
		# 	# strip "\n"
		# 	text_list[i] = text_list[i].strip("\n")
		# 	# lowercase
		# 	text_list[i] = text_list[i].lower()
		# 	# stop words
		# 	text_list[i] = ' '.join([word for word in text_list[i] if not word in self.stopwords])
		print("embedding ...")
		# embedding sentences
		preprocess_text_list = self.laser.embed_sentences(text_list, lang=lang)
		return preprocess_text_list


	def generate_model(self, sample_size=2000, path_model_save="model.h5", verbose=2):
		"""
		params:
			sample_size : int -> number of sentences use to train model
			path_model_save : str -> path to save the model
			verbose : int -> show progress if verbose > 0
		return:
			model : keras model -> trained model
		"""

		# loading data for training
		# the smaller the sample_size is, the faster the model is generated
		df_train = self.df_reviews[self.df_reviews["sentiment"] == 1].head(int(sample_size/2))
		df_train = df_train.append(self.df_reviews[self.df_reviews["sentiment"] == 0].head(sample_size - int(sample_size/2)))
		df_train = df_train.reset_index(drop=True)

		# shuffle DataFrame
		df_train = df_train.sample(frac=1).reset_index(drop=True)

		if verbose > 0:
			print("Train data successfully loaded")
			display.display(df_train.head())
			print(df_train["sentiment"].value_counts())
			print(df_train["rating"].value_counts())
			print("Shape :", df_train.shape)
			print("preprocessing text ...")

		# we're embedding sentences with laserembedding
		# embedded sentence as input X
		# sentiment values as output Y
		X_train = self.preprocess_text_list(df_train["review_text"].values.tolist())
		Y_train = df_train["rating"]

		# min-max scale ratings
		Y_train = (Y_train - Y_train.min()) / (Y_train.max() - Y_train.min())

		if verbose > 0:
			print("preprocessing done")
			print("training model ...")

		# creating model
		# every sentence embedding has length 1024, so for each sentence we have 1024 inputs and 1 output
		model = tf.keras.Sequential([
			tf.keras.Input(shape=(1024,)),
			tf.keras.layers.Dense(512, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
				tf.keras.layers.BatchNormalization(),
				tf.keras.layers.Dropout(0.25),
			tf.keras.layers.Dense(128, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
				tf.keras.layers.BatchNormalization(),
				tf.keras.layers.Dropout(0.25),
			tf.keras.layers.Dense(32, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
				tf.keras.layers.BatchNormalization(),
				tf.keras.layers.Dropout(0.25),
			tf.keras.layers.Dense(8, activation=tf.keras.layers.LeakyReLU(alpha=0.1)),
				tf.keras.layers.BatchNormalization(),
				tf.keras.layers.Dropout(0.25),
			tf.keras.layers.Dense(1, activation=tf.keras.activations.sigmoid),
		])

		# train model
		self.model = self._train_model(model, X_train, Y_train, path_model_save=path_model_save, verbose=verbose)

		# save model
		self.model.save(path_model_save)
		return self.model
		

	def load_model(self, model_path):
		if not os.path.isfile(model_path):
			print(model_path, "is missing")
		else:
			self.model = tf.keras.models.load_model(model_path)
			print(model_path, "loaded")


	def model_predict(self, sentence_list):
		"""
		params:
			sentence : list(str) -> list of sentence
		return:
			if type(sentence_list) == list and self.model != False:
				return predictions
			else:
				return None
		"""
		if type(sentence_list) == list and self.model != False:
			return self.model.predict(self.preprocess_text_list(sentence_list))

		return None
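A short usage sketch for SentimentAnalyse above; it assumes labeled_reviews.csv, the stopword list and the LASER models are available locally, and the review texts passed to model_predict are invented.

analyser = SentimentAnalyse(verbose=1)
analyser.generate_model(sample_size=200, path_model_save="model.h5")
preds = analyser.model_predict(["I really liked this book", "worst purchase ever"])
print(preds)  # sigmoid outputs close to 1 for positive reviews and 0 for negative ones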
Example #18
class myVectorizer_Laser(object):
    def fit(self, X):
        self.laser = Laser()
        return self
    def transform(self, X):
        return normalize(self.laser.embed_sentences(X, lang='ru'), norm='l2')
Example #19
def get_laser_embeddings(x: List[str], lang: str, laser=None) -> np.ndarray:
    if laser is None:
        laser = Laser()
    return laser.embed_sentences(sentences=x, lang=lang)
Example #20
def test_ja():
    if SKIP_JA:
        pytest.skip("SKIP_JA is set")
    laser = Laser()
    assert laser.embed_sentences(['乾杯!'], lang='ja').shape == (1, 1024)
Example #21
#      'use multilingual embeddings!'],
#     lang='en')  # lang is only used for tokenization
#
# print ('')

from laserembeddings import Laser
from problem_util_yr.loadDict.read_json_tool import read_json

laser = Laser()
gene = read_json('./title_key_5w.json')
ll = []
ii = 0
allpkl = []
for d in gene:
    ii += 1
    if len(ll) < 10:
        ll.append(' '.join(d['title']))
    else:
        embeddings = laser.embed_sentences(ll, lang='en')

        allpkl.append([ll, embeddings])
        ll = []
    ###
    if ii > 10000: break
###
import pandas as pdd
pdd.to_pickle(allpkl, './allpkl.pkl')

# embeddings = laser.embed_sentences(
#     ['今 天 天 气 晴 朗',
#      '今 天 天 气 很 不 错',
#      '股 票 怎 么 跌 成 这 样'],
#     lang='en')  # lang is only used for tokenization
import numpy as np
import spacy
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, Normalizer
from sklearn.cluster import AgglomerativeClustering

nlp = spacy.load('pl_core_news_lg')

model = make_pipeline(
    FunctionTransformer(lambda x: np.stack([nlp(t).vector for t in x])),
    Normalizer(),
    AgglomerativeClustering(distance_threshold=0.5, n_clusters=None),
)
clusters = model.fit_predict(texts)
print(clusters)  # [2 0 2 0 1]

from laserembeddings import Laser
laser = Laser()

model = make_pipeline(
    FunctionTransformer(lambda x: laser.embed_sentences(x, lang='en')),
    Normalizer(),
    AgglomerativeClustering(distance_threshold=0.8, n_clusters=None),
)
clusters = model.fit_predict(texts)
print(clusters)  # [1 1 1 0 0]

#results for each model
from collections import defaultdict
cluster2words = defaultdict(list)
for text, cluster in zip(texts, clusters):
    for word in text.split():
        if word not in cluster2words[cluster]:
            cluster2words[cluster].append(word)

test = [wordlist for wordlist in cluster2words.values()]
Example #23
def test_zh():
    if SKIP_ZH:
        pytest.skip("SKIP_ZH is set")
    laser = Laser()
    assert laser.embed_sentences(['干杯!'], lang='zh').shape == (1, 1024)
Example #24
from laserembeddings import Laser

laser = Laser()

# if all sentences are in the same language:

# embeddings = laser.embed_sentences(
#     ['let your neural network be polyglot',
#      'use multilingual embeddings!'],
#     lang='en')  # lang is only used for tokenization
#
# print ('')

embeddings = laser.embed_sentences(
    ['今 天 天 气 晴 朗', '今 天 天 气 很 不 错', '股 票 怎 么 跌 成 这 样'],
    lang='en')  # lang is only used for tokenization

# embeddings = laser.embed_sentences(
#     ['今天天气晴朗',
#      '今天天气很不错',
#      '股票怎么跌成这样'],
#     lang='zh')  #使用jieba分词# lang is only used for tokenization
#
# print ('')

# embeddings is a N*1024 (N = number of sentences) NumPy array

import pandas as pdd
pdd.to_pickle(embeddings, 'emb.pkl')
Example #25
    def get_sentence_vec(self, sentences):

        laser = Laser()
        sentence_embeddings = laser.embed_sentences(sentences, lang='en')

        return sentence_embeddings
Example #26
def getDocumentEmbedding(doc, model_params: dict = {}, encoder = 'xlnet', model_name = 'xlnet-base-uncased'):
  #model = SentenceTransformer(model_name, model_params)
  #sentence_embedding = model.encode(doc)

  ## Word tokenizer
  from spacy.lang.en import English
  nlp = English()
  # Create a Tokenizer with the default settings for English including punctuation rules and exceptions
  tokenizer = nlp.Defaults.create_tokenizer(nlp)
  tokens = tokenizer("This is a sentence")
  if len(tokens) > getMaxLength(encoder):
    warnings.warn("The input sequence length exceeds the maximum limit.", Warning)



  if encoder in ['bert', 'xlnet', 'longformer', 'reformer', 'distilbert', 'roberta', 'bart', 'finbert']:
    # Use BERT for mapping tokens to embeddings
    word_embedding_model = models.Transformer(model_name)
    # Apply mean pooling to get one fixed sized sentence vector
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_cls_token=False,
                                   pooling_mode_max_tokens=False)
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])   
    sentence_embeddings = model.encode(doc)
    

  elif encoder == 'use':
    #!pip install embedding-as-service
    from embedding_as_service.text.encode import Encoder
    en = Encoder(embedding='use', model='use_dan', max_seq_length=256)
    sentence_embeddings = en.encode(texts=doc)


  elif encoder == 'infersent':
    import nltk
    nltk.download('punkt')
    from models import InferSent
    params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                    'pool_type': 'max', 'dpout_model': 0.0, 'version': 2}
    infersent = InferSent(params_model)
    W2V_PATH = 'drive/My Drive/wiki-news-300d-1M.vec'
    infersent.set_w2v_path(W2V_PATH)
    infersent.build_vocab([doc], tokenize=True)
    sentence_embeddings = infersent.encode([doc], tokenize=True)


  elif encoder == 'sent2vec':
    import sent2vec
    model = sent2vec.Sent2vecModel()
    model.load_model('drive/My Drive/torontobooks_unigram.bin') 
    sentence_embeddings = model.embed_sentences(doc)



  elif encoder == 'laser':
    from laserembeddings import Laser
    laser = Laser()  ## Also used for multilingual sentence embeddings
    sentence_embeddings = laser.embed_sentences(doc, lang='en')


  return sentence_embeddings
Example #27
df = pd.DataFrame({
    'review_text': data.review_text,
    'rating': data.rating,
})

# data cleaning
df['review_text'] = df['review_text'].apply(lambda x: preprocessing(x))
df['rating'] = df['rating'].apply(lambda x: preprocessing(x))

# shuffle the data
df = df.sample(frac=1).reset_index(drop=True)

# instance laser
laser = Laser()

embed = laser.embed_sentences(df['review_text'], lang='en')

# init train and test

# split data: first 1600 rows for training, the rest for testing
X_train = embed[:1600]
X_test = embed[1600:]
y_train = df['rating'][:1600]
y_test = df['rating'][1600:]

# Fitting a random forest classifier to the training data

text_classifier = RandomForestClassifier(n_estimators=50)

print("Fitting random forest to training data....")
class Singletons:
    __instance = None
    laser_embedder = cached_lq_dims = cached_intro_dims = None

    # robert_embedder = None

    @staticmethod
    def get_instance():
        """Static access method"""
        if Singletons.__instance is None:
            logger.info("Calling private constructor for embedder initialization ")
            Singletons()
        return Singletons.__instance

    def __init__(self):
        if Singletons.__instance is not None:
            raise Exception("The singleton is already initialized you are attempting to initialize it again get lost")
        else:
            logger.info("Initializing Laser embedder")
            self.laser_embedder = Laser()
            self.cached_lq_dims = {}
            self.cached_intro_dims = {}

            # logger.info("Initializing Roberta embedder")
            # self.robert_embedder = SentenceTransformer(constants.fetch_constant("robeta_path"))
            Singletons.__instance = self

    def perform_embeddings(self, all_sentences):
        """
        This method embeds all the sentences passed using Laser embedder
        :param all_sentences:
        :return: list of sentence embeddings
        """
        if self.laser_embedder is not None:
            sentence_embeddings = self.laser_embedder.embed_sentences(all_sentences, ["en"] * len(all_sentences))
            return sentence_embeddings
        else:
            logger.info("the embedder is not set please restart the service")

    # def perform_embeddings(self, all_sentences):
    #     """
    #     This method embeds all the sentences passed using Laser embedder
    #     :param all_sentences:
    #     :return: list of sentence embeddings
    #     """
    #     if self.robert_embedder is not None:
    #         sentence_embeddings = self.robert_embedder.encode(all_sentences)
    #         return sentence_embeddings
    #     else:
    #         logger.info("the embedder is not set please restart the service")

    def get_cached_lq_dims(self):
        """
        :return: the dictionary of cached facets
        """
        return self.cached_lq_dims

    def set_cached_lq_dims(self, facet_name, facet):
        """
        :return: the dictionary of cached facets
        """
        self.cached_lq_dims[facet_name] = facet

    def get_cached_intro_dims(self):
        """
        :return: the dictionary of cached facets
        """
        return self.cached_intro_dims

    def set_cached_intro_dims(self, facet_name, facet):
        """
        :return: the dictionary of cached facets
        """
        self.cached_intro_dims[facet_name] = facet