Example #1
def sisters(path):
    sentence_embedding = sister.MeanEmbedding(lang="en")

    parsed_data = pd.read_csv(path, sep='|', index_col=0)
    # expand the Series of embedding arrays into a DataFrame (one column per dimension)
    return pd.DataFrame.from_records(
        parsed_data['line'].apply(sentence_embedding))
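A minimal usage sketch (the file name "transcripts.psv" is an assumption; the 300-dim output matches sister's default English FastText vectors):

embeddings = sisters("transcripts.psv")  # '|'-separated file with a 'line' column
print(embeddings.shape)                  # (n_rows, 300)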
Example #2
def convert_to_vector_test(data):
    embedder = sister.MeanEmbedding(lang="en")
    vectorized_data = []
    for sentences in data:
        new_sent = []
        for sentence in sentences:
            new_sent.append(embedder(sentence))
        vectorized_data.append(new_sent)
    return vectorized_data
Example #3
def convert_to_vector_representation2(data):
    embedder = sister.MeanEmbedding(lang="en")
    vectorized_data = []
    for sentences, y in data:
        new_sent = []
        for sentence in sentences:
            new_sent.append(embedder(sentence))
        vectorized_data.append((new_sent, y))
    return vectorized_data
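A usage sketch for this labeled variant (the toy data is an assumption):

data = [(["hello there", "how are you"], 1)]
vectorized = convert_to_vector_representation2(data)
# -> [([embedding, embedding], 1)], one vector per sentence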
Example #4
def embed():
    print("Embedding transcripts")
    data = fm.get_df("0_parsed")
    sentence_embedding = sister.MeanEmbedding(lang="en")
    embedded = data["parsed"]["line"].apply(sentence_embedding)
    d = {"embedded": pd.DataFrame.from_records(embedded, index=embedded.index)}
    embedded = data.join(pd.concat(d, axis=1))

    fm.write_df(embedded, "1_embedded_fasttext")
    return embedded
Example #5
def sister_embeddings(x, *args):
    import sister

    aggregating_strategy = args[0]
    embedding = None
    if aggregating_strategy == 'mean':
        embedding = sister.MeanEmbedding(lang="en")
    if embedding is None:
        raise KeyError("Insufficient vespine gas")
    return embedding(x)
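A usage sketch ('mean' is the only aggregating strategy the snippet supports; the input sentence is an assumption):

vec = sister_embeddings("hello world", 'mean')  # mean FastText embedding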
Example #6
def fasttext_embed(token):
    global embedder
    if embedder is None:  # lazy, one-time model load
        embedder = sister.MeanEmbedding(lang="en")
    if token == '':
        token = 'unk'  # map empty tokens to a placeholder
    if token in memoization:
        return memoization[token]
    memoization[token] = embedder(token)
    return memoization[token]
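The snippet relies on two module-level names that are not shown; a minimal sketch of that setup (the initialization values are assumptions):

import sister

embedder = None   # lazily initialized on the first call
memoization = {}  # token -> embedding cache

vec = fasttext_embed("hello")  # computed once, then served from the cache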
Example #7
def review_embedding(df):  # embed each movie review as a 100-dimensional vector
    review_list = df['리뷰'].tolist()  # '리뷰' is the Korean column name for "review"
    vector1_list = []
    sentence_embedding = sister.MeanEmbedding(lang="file",
                                              fasttextfile="cc.ko.100.bin")
    for review in review_list:
        vector1_list.append(sentence_embedding(review))

    vector_df = pd.DataFrame(vector1_list)  # one row per review
    return vector_df  # merge
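A usage sketch, assuming the 100-dimensional Korean FastText file cc.ko.100.bin is on disk and the frame has a '리뷰' (review) column:

df = pd.DataFrame({'리뷰': ["정말 재미있어요", "별로였어요"]})
vectors = review_embedding(df)  # shape (2, 100), one row per review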
Example #8
    def __init__(self):
        self.prediction_model = 'voiceCon_NET.hdf5'
        self.prediction_matches = 'prediction_matches.pickle'
        self.matcher = None

        if not os.path.exists(self.prediction_model):
            #if no trained model exists
            print('Trained model not found... Preparing to train new model')
            PrepForTrain.Prep()

        else:
            print('Trained model found')
        self.embedder = sister.MeanEmbedding(lang='en')
        self.NET = load_model(self.prediction_model)

        print('Looking for Prediction Matcher...')
        with open(self.prediction_matches, 'rb') as P:
            self.matcher = pickle.load(P)
        print('Prediction Matcher Found!!')
Example #9
def fasttext(instances, lang_code='en'):
    """
    take a list of strings and lang code and return the fasttext embeddings of sentences

    :param instances: a list of triples, [left context], target word, [right context])
    :param lang_code: str
    :return: a pair of 2-D array: features of left and right contexts
    """
    embedder = sister.MeanEmbedding(lang=lang_code)
    left_feats = []
    right_feats = []
    for left_context, _, right_context in instances:
        if left_context == []:
            left_feats.append(np.zeros(300))
        else:
            left_feats.append(embedder(' '.join(left_context)))
        if right_context == []:
            right_feats.append(np.zeros(300))
        else:
            right_feats.append(embedder(' '.join(right_context)))
    return left_feats, right_feats
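A usage sketch (the toy instances are assumptions; empty contexts map to 300-dim zero vectors, as in the snippet):

instances = [(["the", "quick"], "fox", ["jumps", "over"]),
             ([], "hello", ["world"])]
left_feats, right_feats = fasttext(instances)  # two lists of 300-dim vectors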
Example #10
        def fasttexttagger(df_new):
            embedder = sister.MeanEmbedding(lang="en")
            for index in df_new.index:
                sentence = df_new.at[index, "paraphrased_question"]
                vector = embedder(sentence)
                for dim, component in enumerate(vector):
                    df_new.at[index, str(dim)] = component

            numerical_cols = [
                '#', '$', "''", '(', ')', ',', '.', ':', 'CC', 'CD', 'DT',
                'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNP',
                'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB', 'RBR', 'RBS',
                'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN', 'VBP',
                'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``'
            ]
            for i in range(0, 300):
                numerical_cols.append(str(i))
            # scaler = StandardScaler()
            # scaler2 = StandardScaler()
            # scaler.fit(df_AllMain[numerical_cols])
            # df_new[numerical_cols] = scaler.fit_transform(df_new[numerical_cols])
            return df_new
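A usage sketch (the sample question is an assumption; the call adds embedding columns '0' through '299' to the frame):

df = pd.DataFrame({"paraphrased_question": ["What is the capital of France?"]})
df = fasttexttagger(df)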
Example #11
    def __init__(self, lang, tokenizer=None):
        if tokenizer is None:
            tokenizer = SimpleTokenizer()
        self.embedder = sister.MeanEmbedding(lang=lang, tokenizer=tokenizer)
Example #12
import pickle
import numpy as np
import pandas as pd
import sister
from sister.word_embedders import FasttextEmbedding
from sklearn.metrics.pairwise import cosine_similarity
import joblib

embedder = sister.MeanEmbedding(lang="es", word_embedder = FasttextEmbedding('es'))

def remove_symbol(s):
    # strip punctuation, special characters, and digits from the string
    for ch in ",.;:_+ª-<>!?()[]'012":
        s = s.replace(ch, "")
    return s
Example #13
    def __init__(self):
        self.CATEGORY_FILE = 'exCats.pickle'
        self.CATEGORIES = None
        self.data_categories = 'category'  #categorized data path used for training
        self.prepared_data = 'voiceConData.pickle'
        self.retrain = False
        self.trained_model = 'voiceCon_NET.hdf5'

        if not os.path.exists(self.trained_model):
            print(
                'No existing trained model found. Preparing to train new model'
            )
            self.retrain = True
            if not os.path.exists(self.CATEGORY_FILE):
                print('There are no existing categories of commands\n'
                      'Creating category file')
                with open(self.CATEGORY_FILE, 'wb') as Cfile:
                    pickle.dump([], Cfile)  # initialize with empty list
                print('New category file created')
            # after the file is created, check the category dir and load

            print('Checking for new categories...')
            with open(self.CATEGORY_FILE, 'rb') as Cfile:
                self.CATEGORIES = pickle.load(Cfile)

            new_cats = []
            # os.walk yields (dirpath, dirnames, filenames); the categories
            # are the .txt files inside the category directory
            for root, dirnames, filenames in os.walk(self.data_categories):
                print('Current categories found:', filenames)
                for category in filenames:
                    if category not in self.CATEGORIES:
                        new_cats.append(category)
                        print('found new category:', category)
                        self.CATEGORIES.append(category)
                        print('Adding new category')

            if len(new_cats) == 0:
                print('No new categories found')
            else:
                self.retrain = True
                print('Remembering new categories:', new_cats)

            with open(self.CATEGORY_FILE, 'wb') as Cfile:
                pickle.dump(self.CATEGORIES, Cfile)
            print('Current categories in memory:', self.CATEGORIES)

            #NB for now no ability to delete categories
            #i think i will leave it like this for a while

            # now load the data and preprocess
            self.data = {}
            for category in self.CATEGORIES:
                #print(category)
                label = os.path.splitext(category)[0]  # drop the '.txt' extension
                self.data[label] = pd.read_csv(self.data_categories + '/' +
                                               category)
            #print(self.data)

            if not os.path.exists(self.prepared_data):
                print('No prepared data for training found')
                print('Initializing Embedder...')
                self.embedder = sister.MeanEmbedding(lang='en')
                self.INIT()

            if self.retrain:
                print(
                    'Since new categories were added or new model required, Retraining model...'
                )
                print('Loading Prepared Data')

                output = self.normalize()
                trainingData = output[0]
                labels = output[1]
                trainer.Trainer(trainingData, labels, see_history=True)
Example #14
import numpy as np
import sister
from scipy.spatial.distance import cosine

embedder = sister.MeanEmbedding(lang='en')


class Intent:
    threshold = 0.7  # minimum cosine similarity to accept a match

    def intent_searcher(self, testing_phrases, training_phrases):
        # embed the query once, then take the best similarity over all
        # pre-embedded training phrases
        testing_phrase = embedder(testing_phrases)
        cosine_max = 0.0
        for training_phrase in training_phrases:
            # scipy's cosine expects 1-D vectors, so flatten the input
            training_phrase = np.asarray(training_phrase).reshape(-1)
            cosine_max = max(cosine_max, 1 - cosine(training_phrase, testing_phrase))
        if cosine_max >= self.threshold:
            return True, cosine_max
        return False, 0
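A usage sketch (the phrases are assumptions; training phrases must already be embedded, as the snippet expects):

training = [embedder(p) for p in ["turn on the lights", "play some music"]]
matched, score = Intent().intent_searcher("switch the lights on", training)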
Example #15
import sister
import pickle
import os
import glob
import json
import re
import random
import numpy as np

sentence_embedding = sister.MeanEmbedding(lang="en")


def welcome_message():
    print('           _______ _______        _____  _____ ')
    print('           |______ |_____| |        |   |     |')
    print('           ______| |     | |_____ __|__ |_____|')


def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    return np.exp(x) / np.sum(np.exp(x), axis=0)


class Salio():
    def __init__(self):
        self.last_sentence = "hi"
        self.learn = False

        try:
            self.load_pickle("save.p")
        except:
Example #16
    def setup_model(self):
        # FastText
        self.model = sister.MeanEmbedding(lang="en")
Example #17
'''
Kipp Freud
12/02/2020
'''

#------------------------------------------------------------------

import sister
import numpy as np

from util.message import message
import util.utilities as ut

#------------------------------------------------------------------

EMBEDDER = sister.MeanEmbedding(lang="en")

#------------------------------------------------------------------

# -----------------------------------------------------------------------------------------
# public functions
# -----------------------------------------------------------------------------------------


@ut.timeit
def embed(sent):
    """
    Will return a vector embedding of the given string sentence.
    """
    if not isinstance(sent, str):
        message.logError("Given sentence must be a string.", "nlp_util::embed")
    return EMBEDDER(sent)
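A usage sketch for the helper above (the sample sentence is an assumption):

vec = embed("hello world")  # 300-dim mean FastText embedding, timed by @ut.timeit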