Example #1
	def __init__(self, model_path=None):

		if model_path[-3:] == 'vec':  # a pre-trained word vector in word2vec text format
			ft = KeyedVectors.load_word2vec_format(model_path)
			self.model = SIF(ft, components=10)

		elif model_path[-6:] == 'pickle':  # an already trained sentence-vector model
			self.model = BaseSentence2VecModel.load(model_path)
Example #2
 def sentence2vec(self, parentmodel=None, save=True):
     # Map names to builder methods so only the requested word model is trained.
     model_builders = {
         "word2vec": self.word2vec,
         "glove": self.glove
     }
     model = model_builders.get(parentmodel, self.fasttext)(save=False)
     sentence_model = SIF(model)
     # SIF.train returns corpus statistics, so keep the model reference itself.
     sentence_model.train(self.data)
     logging.info("Training complete")
     if save:
         model_name = f"sentence2vec_{'cbow' if not self.cbow else 'skipgram'}_{self.dim}.vec"
         model_path = get_path(f'/models/sentence2vec/{model_name}')
         sentence_model.save(model_path)
     return sentence_model
Example #3
class SIF_embeddings:
	def __init__(self, model_path=None):

		if model_path[-3:] == 'vec':  # a pre-trained word vector in word2vec text format
			ft = KeyedVectors.load_word2vec_format(model_path)
			self.model = SIF(ft, components=10)

		elif model_path[-6:] == 'pickle':  # an already trained sentence-vector model
			self.model = BaseSentence2VecModel.load(model_path)

	def fit(self, data):
		inp = CSplitIndexedList(data, custom_split=split_func)
		self.model.train(inp)

	def __call__(self, transcript):
		return self.model.infer([(transcript.split(), 0)])
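A minimal usage sketch for the wrapper above (hypothetical .vec path and sentences; it assumes split_func and the KeyedVectors/SIF imports are defined elsewhere in the same module):

# Hypothetical usage of SIF_embeddings (not part of the original source)
embedder = SIF_embeddings('my_vectors.vec')          # word2vec-format vectors, hypothetical file
embedder.fit(["first example sentence", "another short sentence"])
vec = embedder("a new transcript to embed")          # SIF.infer returns a (1, dim) array
print(vec.shape)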
Example #4
 def _train(self):
     self.sens = IndexedList(self.concepts)
     print('training SIF...')
     self.se = SIF(self.w2v_model)
     self.se.train(self.sens)
Example #5
class MatchingModel():
    def __init__(self, all_concepts, bionlp_model, rewrite=False):
        self.corpus = []
        self.sens = []
        self.w2v_model = None
        self.se = None
        # self.file_path = file_path
        self.concepts = all_concepts
        # self.model_path = model_path
        self.w2v_model = bionlp_model
        self.rewrite_mode = rewrite

        self._read()
        self._train()

    def _read(self):
        # txt_or_csv = 'csv'
        # if txt_or_csv == 'txt':
        #     with open(self.file_path, 'r', encoding='utf-8') as file:
        #         for row in file:
        #             self.concepts.append(row.lstrip(' ').rstrip('\n'))
        # elif txt_or_csv == 'csv':
        #     with open(self.file_path, 'r', encoding='utf-8') as file:
        #         reader = csv.reader(file)
        #         next(reader)
        #         for row in reader:
        #             self.corpus.append(row[0].split())
        #             self.concepts.append(row[1])

        # self.concepts = [c.split() for c in list(set(self.concepts))]
        if self.rewrite_mode:
            # for procedure data set, the following 2 file paths should be replaced
            self.con_dic = dict(pd.read_csv('top200-con-dic-pro.csv'))
            self.dic = dict(pd.read_csv('top200-dic-pro.csv'))
            for key, value in self.con_dic.items():
                self.con_dic[key] = np.array(value)
            for key, value in self.dic.items():
                self.dic[key] = np.array(value)
            length = len(self.con_dic)
            X = np.ndarray(shape=(length, 200))
            self.words = []
            for key, i in zip(self.con_dic.keys(), range(length)):
                self.words.append(key)
                X[i] = self.con_dic[key]  # shape --> (length, 200)
            self.tree = KDTree(X)

        print('loading model...')
        # self.w2v_model = gensim.models.KeyedVectors.load_word2vec_format(self.model_path, binary=True)
        print('loading data...')

    def _train(self):
        self.sens = IndexedList(self.concepts)
        print('training SIF...')
        self.se = SIF(self.w2v_model)
        self.se.train(self.sens)

    def query(self, query_sen, topk=25):
        new_sen = query_sen

        if self.rewrite_mode:
            new_sen = []
            for w in query_sen:
                if w in self.dic and w not in self.con_dic:
                    q_emb = self.dic[w].reshape(1, 200)
                    dist, ind = self.tree.query(q_emb, k=1)
                    if dist[0][0] < 3.6:
                        # experiments shows that rewriting within a distance of 3.6 performs better than no-rewriting
                        index = ind.tolist()[0]
                        new_sen.append(self.words[index[0]])
                    else:
                        new_sen.append(w)
                else:
                    new_sen.append(w)

        # print(new_sen)
        cands = self.se.sv.similar_by_sentence(new_sen,
                                               model=self.se,
                                               topn=topk,
                                               indexable=self.sens.items)
        most_sim = [[x[0], x[2]] for x in cands]
        return most_sim
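A short usage sketch for MatchingModel (hypothetical concepts and word vectors; bionlp_vectors stands in for a gensim KeyedVectors model loaded elsewhere):

# Hypothetical usage of MatchingModel (not part of the original source)
concepts = [["myocardial", "infarction"], ["chest", "pain"], ["skin", "rash"]]
matcher = MatchingModel(concepts, bionlp_vectors, rewrite=False)
print(matcher.query(["heart", "attack"], topk=2))    # [[concept index, similarity], ...]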
Example #6
 def __init__(self, jokes_path, model_path):
     self.jokes = pd.read_csv(jokes_path)
     self.model = SIF.load(model_path)
     self.prev_jokes = []
Example #7
        or row['role'] == '<sub-heading>' or row['role'] == '<separator>' or row['role'] == '<new-case>':
            continue
        # text = ""
        # tp_text = tuple(text.join(row["text"]))
        # text = text.join(row["text"])
        text = row["text"].split()
        sentences.append(text)
        count += 1 
        if count == 20:
            break

# tp_sentences = tuple(sentences)

from fse.models import SIF
from fse import IndexedList
model = SIF(wvecs)
sents = IndexedList(sentences)
model.train(sents)

import numpy as np

# Collect one trained sentence vector per row and write them to CSV.
array = []
for i in range(len(model.sv)):
    array.append(model.sv[i])

np.savetxt("sent_embed.csv", array, delimiter=",")
Example #8
 def _init(self):
     self.model = SIF.load(self.args.model_path)
     self._dim = self.model.wv.vector_size
Example #9
from fse import IndexedList
from fse.models.average import FAST_VERSION, MAX_WORDS_IN_BATCH
from fse.models import SIF
from gensim.models import FastText
import logging
logging.basicConfig(
    format='%(asctime)s : %(threadName)s : %(levelname)s : %(message)s',
    level=logging.INFO)

w2v_model = "H:/Vietnamese word representations/Word_vector_data/VnNewsWord2Vec/VnNewsWord2Vec.bin"

lookup = FastText.load_fasttext_format(w2v_model, encoding='utf-8')

sentences = []
s = IndexedList(sentences)
print(len(s))

title_file = 'H:/Vietnamese word representations/News-titles-embedding/Data/tokenized_titles_cleaned'

with open(title_file, 'r', encoding='utf-8') as file:
    for line in file:
        sentences.append(line.split())

s = IndexedList(sentences)

model = SIF(lookup, workers=2)
model.train(s)

model.save('sent2vec')
Example #10
#!/usr/bin/env python3
import argparse
import logging
import pathlib
from fse import IndexedLineDocument
from fse.models import SIF

from lib import data, utils, model
import gensim.downloader as api

log = logging.getLogger("train_model")

EXPECTED_LINES = 66836199


if __name__ == "__main__":
    utils.setup_logging()
    parser = argparse.ArgumentParser()
    parser.add_argument("-d", "--data", default=data.DEFAULT_OUTPUT_PREFIX,
                        help="Prefix of input data to read, default=" + data.DEFAULT_OUTPUT_PREFIX)
    parser.add_argument("-o", "--output", default=model.DEFAULT_MODEL_FILE,
                        help="File name to save model, default=" + model.DEFAULT_MODEL_FILE)
    args = parser.parse_args()

    glove = api.load("glove-wiki-gigaword-100")
    input_path = pathlib.Path(args.data).with_suffix(".txt")
    sents = IndexedLineDocument(str(input_path))
    model = SIF(glove, workers=2)
    model.train(sents)
    model.save(args.output)
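A follow-up sketch (not part of the original script) showing how the saved model could be queried in a later session; the path is hypothetical and stands in for whatever was passed via --output:

# Hypothetical follow-up (separate session): load the saved model and query it
from fse.models import SIF
loaded = SIF.load("sif_wiki.model")                  # hypothetical saved path
hits = loaded.sv.similar_by_sentence("example query sentence".split(),
                                     model=loaded, topn=5)
print(hits)                                          # list of (sentence index, similarity) pairs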
Example #11
                     chunks=(2048, None),
                     dtype="f4")
else:
    z_embs = z[emb_path]

# encode & save
if "bert" in model_name:
    for i, batch in enumerate(tqdm(loader)):
        # encode
        embs = batch_encode(batch, tokenizer, model).cpu().numpy()
        # save
        start = i * args.batch_size
        end = start + embs.shape[0]
        z_embs[start:end] = embs[:]
elif "fse" in model_name:
    sent_model = SIF(model, workers=8, lang_freq="en")
    # train
    for i, batch in enumerate(loader):
        sentences = IndexedList([TextBlob(s).tokens for s in batch])
        sent_model.train(sentences)
    sent_model.save(fpath.parent / "fse.model")
    # infer
    for i, batch in enumerate(loader):
        sentences = IndexedList([TextBlob(s).tokens for s in batch])
        # encode
        embs = batch_encode(sentences, sent_model)
        # save
        start = i * args.batch_size
        end = start + embs.shape[0]
        z_embs[start:end] = embs[:]
Example #12
for fileitem in filelist:
    print("Reading " + fileitem + "...")
    filepath = os.path.join(dirpath, fileitem)
    with open(filepath + ".txt") as f:
        temps = list()
        for a in map(lambda x: x.split(), f.read().split("\n")):
            temps.extend(a)
        # Append the whole file's tokens as one "sentence", not just the last line.
        sentences.append(temps)

    print("Read " + fileitem)
wvmod = gensim.downloader.load("word2vec-google-news-300")

avg = Average(wvmod)
avg.train(IndexedList(sentences))
sif = SIF(wvmod)
sif.train(IndexedList(sentences))

simMat = [[0 for a in filelist] for b in filelist]
for a in range(len(filelist)):
    for b in range(len(filelist)):
        sim1 = avg.sv.similarity(a, b)
        sim2 = sif.sv.similarity(a, b)
        simMat[a][b] = sim2
        # simMat[a][b] = scaled_sim(sim1, sim2)

for i in range(len(filelist)):
    print('  '.join(["     "] +
                    [str(a).center(7, ' ') for a in range(len(filelist))]))
    print(str(i).center(4, " "), end="  ")
    for j in range(len(filelist)):
Example #13
    tokens = []
    for token in word_tokenize(sentence):
        if not_punc.match(token):
            tokens = tokens + prep_token(token)
    return tokens


sentences = CSplitIndexedList(sent_a, sent_b, custom_split=prep_sentence)

sentences[0]
models, results = {}, {}
word2vec = KeyedVectors.load("C:/Users/Kamil/Downloads/word2vec_300_3_polish.bin")


models[f"CBOW-W2V"] = Average(word2vec, lang_freq="pl")
models[f"SIF-W2V"] = SIF(word2vec, components=10)
models[f"uSIF-W2V"] = uSIF(word2vec, length=11)

from gensim.scripts.glove2word2vec import glove2word2vec  
glove = KeyedVectors.load_word2vec_format("C:/Users/Kamil/Downloads/glove_300_3_polish2.txt")
models[f"CBOW-Glove"] = Average(glove,  lang_freq="pl")
print(f"After memmap {sys.getsizeof(glove.vectors)}")
models[f"SIF-Glove"] = SIF(glove, components=15)
models[f"uSIF-Glove"] = uSIF(glove,length=11)

ft = FastTextKeyedVectors.load("D:/fasttext_300_3_polish.bin")
models[f"CBOW-FT"] = Average(ft, lang_freq="pl")
models[f"SIF-FT"] = SIF(ft, components=10)
models[f"uSIF-FT"] = uSIF(ft, length=11)