def _load_word_vec_lmdb(word,path=MAIN_PATH+'/out/wv_lmdb',word_wv_dim=200):
    """Look up the embedding vector for ``word`` in an LMDB database.

    :param word: token to look up.
    :param path: path of the LMDB embeddings database.
    :param word_wv_dim: embedding dimensionality used for the zero-vector
        fallback when the word is absent.
    :return: the stored vector, or ``np.zeros(word_wv_dim)`` if missing.
    """
    reader = LmdbEmbeddingsReader(path)
    try:
        return reader.get_word_vector(word)
    except MissingWordError:
        print('%s not in dict' % word)
        return np.zeros(word_wv_dim)
    def test_missing_word_error(self, folder_path):
        """ Ensure a MissingWordError exception is raised when a word
        that was never written is looked up in the database.

        :return void
        """
        writer = LmdbEmbeddingsWriter([
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10)),
        ])
        writer.write(folder_path)

        reader = LmdbEmbeddingsReader(folder_path)

        with pytest.raises(exceptions.MissingWordError):
            reader.get_word_vector('unknown')
# Ejemplo n.º 3 (scrape separator; original vote count: 0)
    def test_missing_word_error(self, folder_path):
        """ Ensure a MissingWordError exception is raised if the word
        does not exist in the database.

        :return void
        """
        # Use random vectors (not uninitialized np.ndarray memory) for
        # the fixture, matching the sibling tests in this suite.
        LmdbEmbeddingsWriter([
            ('the', np.random.rand(10)),
            ('is', np.random.rand(10))
        ]).write(folder_path)

        reader = LmdbEmbeddingsReader(folder_path)

        with pytest.raises(exceptions.MissingWordError):
            reader.get_word_vector('unknown')
    def test_reading_embeddings(self, folder_path):
        """ Ensure a vector written to the database is read back
        unchanged.

        :return void
        """
        expected = np.random.rand(10)
        writer = LmdbEmbeddingsWriter([('the', expected),
                                       ('is', np.random.rand(10))])
        writer.write(folder_path)

        stored = LmdbEmbeddingsReader(folder_path).get_word_vector('the')
        assert stored.tolist() == expected.tolist()
    def test_msgpack_serialization(self, folder_path):
        """ Ensure embeddings survive a msgpack serialize/unserialize
        round trip through the database.

        :return void
        """
        expected = np.random.rand(10)

        writer = LmdbEmbeddingsWriter(
            [('the', expected), ('is', np.random.rand(10))],
            serializer=MsgpackSerializer.serialize)
        writer.write(folder_path)

        reader = LmdbEmbeddingsReader(
            folder_path, unserializer=MsgpackSerializer.unserialize)
        assert reader.get_word_vector('the').tolist() == expected.tolist()
# Ejemplo n.º 6 (scrape separator; original vote count: 0)
from keras.models import model_from_json
from keras.models import load_model
from lmdb_embeddings.reader import LmdbEmbeddingsReader
from lmdb_embeddings.exceptions import MissingWordError
import pandas as pd
import numpy as np
import os
import tensorflow as tf
import json

# --- Module-level setup: runs once at import time ---

# Remember the working directory the process started in.
orig_cwd = os.getcwd()

# NOTE(review): `global` at module scope is a no-op; left as-is.
global graph
graph = tf.get_default_graph()

# Word-embedding lookup backed by an on-disk LMDB database.
embeddings = LmdbEmbeddingsReader('../data/lmdb_databases')
# Extra words removed during cleaning in addition to NLTK stopwords
# (see clean() below, which filters both lists out).
negative = [
    'not', 'neither', 'nor', 'but', 'however', 'although', 'nonetheless',
    'despite', 'except', 'even though', 'yet'
]
# NOTE(review): `stopwords` and `string` are not imported in this chunk —
# presumably nltk.corpus.stopwords and the stdlib string module; verify.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)

# Pre-trained Keras model; presumably outputs four emotion scores
# (filename suggests so) — TODO confirm output semantics.
regressor = load_model('four_emotions.h5')


def clean(doc):
    """Normalize free text before feeding it to the emotion model.

    Lowercases the document, removes NLTK stopwords plus the extra words
    in ``negative``, strips punctuation, then lemmatizes each remaining
    token.

    :param doc: raw input text (str).
    :return: cleaned, space-joined string.
    """
    lemma = WordNetLemmatizer()
    # Keep only tokens that are in neither removal list.
    stop_free = " ".join(
        [i for i in doc.lower().split() if i not in stop if i not in negative])
    # Strip punctuation characters from the joined text.
    punc_free = "".join([ch for ch in stop_free if ch not in exclude])
    # The original was truncated here and implicitly returned None;
    # completed to match the identical clean() later in this file.
    normalized = " ".join([lemma.lemmatize(word) for word in punc_free.split()])
    return normalized
# Ejemplo n.º 7 (scrape separator; original vote count: 0)
import copy
import json
import os

import numpy as np
import pandas as pd
import tensorflow as tf

#Note: Cwd is /opt/python/bundle/Xnum/app where Xnum is a digit
# Remember the starting working directory so it can be restored after
# opening the LMDB database. (str is immutable, so the original
# copy.deepcopy() here was a no-op and has been dropped.)
orig_cwd = os.getcwd()

# NOTE(review): `global` at module scope is a no-op; left as-is.
global graph
graph = tf.get_default_graph()

path = "/opt/data"
os.chdir(path)
try:
    # Open the LMDB embedding database relative to /opt/data.
    embeddings = LmdbEmbeddingsReader('lmdb_databases')
finally:
    # Restore the original cwd even if the reader fails to open.
    os.chdir(orig_cwd)

# Extra words removed during cleaning in addition to NLTK stopwords
# (see clean() below, which filters both lists out).
negative = ['not', 'neither', 'nor', 'but', 'however', 'although', 'nonetheless', 'despite', 'except',
                         'even though', 'yet']
# NOTE(review): `stopwords`, `string` and `load_model` are not imported
# in this chunk — presumably nltk/stdlib/keras; verify against full file.
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)

# Pre-trained Keras model; presumably outputs four emotion scores
# (filename suggests so) — TODO confirm output semantics.
regressor = load_model('four_emotions.h5')

def clean(doc):
    """Lowercase *doc*, drop stopwords and the extra ``negative`` words,
    strip punctuation, and lemmatize the remaining tokens.

    :param doc: raw input text (str).
    :return: cleaned, space-joined string.
    """
    lemmatizer = WordNetLemmatizer()
    kept = [
        token for token in doc.lower().split()
        if token not in stop and token not in negative
    ]
    without_punct = "".join(
        ch for ch in " ".join(kept) if ch not in exclude)
    return " ".join(
        lemmatizer.lemmatize(word) for word in without_punct.split())