Example #1
    def load(cls, np2vec_model_file, binary=False, word_ngrams=0):
        """
        Load the np2vec model.

        Args:
            np2vec_model_file (str): path to the file containing the np2vec model to load
            binary (bool): whether the np2vec model to load is in binary format
            word_ngrams (int {1,0}): if 1, the np2vec model to load uses word vectors with
                subword (ngrams) information.

        Returns:
            the loaded np2vec model
        """
        if word_ngrams == 0:
            return KeyedVectors.load_word2vec_format(
                np2vec_model_file, binary=binary)
        elif word_ngrams == 1:
            return FastText.load(np2vec_model_file)
        else:
            logger.error('invalid value for \'word_ngrams\'')
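For reference, the two branches above boil down to these direct gensim calls (a sketch; the file names are placeholders, not part of the snippet):

from gensim.models import FastText, KeyedVectors

# word_ngrams == 0: the model was stored in word2vec format (text or binary).
kv = KeyedVectors.load_word2vec_format('np2vec_w2v_format.txt', binary=False)

# word_ngrams == 1: the model was stored as a full FastText model, keeping subword info.
ft = FastText.load('np2vec_fasttext.model')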
Example #2
def get_model():
    model = FastText.load(EMBEDDINGS_FASTTEXT_MODEL_FILE)
    return model
Example #3
def reload_movie_embedding(train_percent=MOVIEP.train_percent,
                           valid=False,
                           file_path=MOVIEP.movie_data_path,
                           seq_num=MOVIEP.seq_num,
                           embedding_type=EMBEDP.embedding_type,
                           veclen=EMBEDP.veclen,
                           window=EMBEDP.window):

    instance_data = read_pickle(file_path + 'movie_review_sequence_data.pkl',
                                'r')
    instance_result = read_pickle(
        file_path + 'movie_review_sequence_result.pkl', 'r')
    # word2index = read_pickle(file_path + 'new_word2index.pkl', 'r')
    feature_tensor = np.zeros((len(instance_data), seq_num, veclen))

    if embedding_type == 'embedding':
        model = Word2Vec.load(file_path + 'movie_review_word2vec_' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'embedding_skipgram':
        model = Word2Vec.load(file_path + 'movie_review_word2vec__skipgram' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'fasttext':
        model = FastText.load(file_path + 'movie_review_fasttext_' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'fasttext_skipgram':
        model = FastText.load(file_path + 'movie_review_fasttext__skipgram' +
                              str(veclen) + '_window' + str(window) + '.model')
    elif embedding_type == 'glove':
        model = glove2word2vec(
            file_path + 'movie_vectors_w' + str(window) + '_l' + str(veclen) +
            '.txt', file_path + 'glove' + str(veclen) + '_window' +
            str(window) + '.model')
    elif embedding_type == 'lda_sgns' or embedding_type == 'sg_add_sgns' or embedding_type == 'sg_cancat_sgns':
        model = get_sgns_embedding('MovieReview')

    for instance_iter, instance in enumerate(instance_data):
        start_index = seq_num - len(instance)
        for seq_iter, seq_data in enumerate(instance):
            word_vec = model[seq_data]
            feature_tensor[instance_iter][seq_iter + start_index] += word_vec
    result_matrix = np.array(instance_result).reshape(
        (len(instance_result), -1))
    train_size = int(feature_tensor.shape[0] * train_percent)
    train_x = feature_tensor[:train_size]
    train_y = result_matrix[:train_size]
    test_x = feature_tensor[train_size:]
    test_y = result_matrix[train_size:]
    if valid:
        new_train_size = int(train_size * train_percent)
        # carve the validation split out of the training portion before the
        # training slices are overwritten, otherwise test_x/test_y end up empty
        test_x = train_x[new_train_size:]
        test_y = train_y[new_train_size:]
        train_x = train_x[:new_train_size]
        train_y = train_y[:new_train_size]

    if embedding_type == 'sg_add_sgns' or embedding_type == 'sg_cancat_sgns':
        train_x_sg, train_y_sg, test_x_sg, test_y_sg = reload_movie_embedding(
            train_percent=train_percent,
            valid=valid,
            file_path=file_path,
            seq_num=seq_num,
            embedding_type="embedding_skipgram",
            veclen=veclen,
            window=window)
        if embedding_type == 'sg_add_sgns':
            train_x = train_x + train_x_sg
            test_x = test_x + test_x_sg
        if embedding_type == 'sg_cancat_sgns':
            train_x = np.concatenate((train_x, train_x_sg), axis=2)
            test_x = np.concatenate((test_x, test_x_sg), axis=2)

    return train_x, train_y, test_x, test_y
Example #4
    model_type = sys.argv[3]

    model_name = str(num_features) + "features_" + str(
        min_word_count) + "minwords_" + str(context) + "context_len2alldata"

    assert model_type in ["word2vec", "fasttext"]

    if model_type == "word2vec":
        # Load the trained Word2Vec model.
        model = Word2Vec.load(model_name)
        # Get wordvectors for all words in vocabulary.
        word_vectors = model.wv.vectors
        index2word = model.wv.index2word
    elif model_type == "fasttext":
        # Load the trained FastText model.
        model = FastText.load(model_name)
        # Get wordvectors for all words in vocabulary.
        word_vectors = model.wv.vectors
        index2word = model.wv.index2word

    all = pd.read_pickle('all.pkl')

    # Set number of clusters.
    num_clusters = int(sys.argv[2])
    idx, idx_proba = cluster_GMM(num_clusters, word_vectors)

    # Uncomment below lines for loading saved cluster assignments and probability of cluster assignments.
    # idx_name = "gmm_latestclusmodel_len2alldata.pkl"
    # idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
    # idx, idx_proba = read_GMM(idx_name, idx_proba_name)
Example #5
from numpy.linalg import norm
import json
from gensim.models import FastText
import logging
import sys
import torch
import encoder
from torch.autograd import Variable
from sklearn.metrics.pairwise import cosine_similarity

logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)
logging.root.level = logging.INFO
punctuation = '!"#$%&\'()*+,.:;<=>?@[\\]^`{|}~'
table = str.maketrans('', '', punctuation)
dictionary = {}
model = FastText.load('model/entity_fasttext_n100')
wv = model.wv
del model


def load_dictionary(dictionary_file):
    """
    Load the dictionary with article titles mapped
    to their respective abstracts containing annotated
    text.

    Argument
    --------
    dictionary_file: Input file
    """
    global dictionary
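    # The listing cuts the function body off here. A hedged sketch of the likely
    # remainder, assuming the file is a JSON mapping of titles to annotated abstracts:
    with open(dictionary_file, 'r', encoding='utf-8') as f:
        dictionary = json.load(f)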
Example #6
import os
from gensim.models import Word2Vec
from gensim.models import FastText

# EXP_HOME = "F:/MyWorks/Thesis Works/Crowdsource_Knowledge_Base/DeepGenQR/experiment"
EXP_HOME = "C:/My MSc/ThesisWorks/BigData_Code_Search/DeepGenQR/experiment"
model_file = EXP_HOME + '/pymodel/tomcat-fasttext-model'
model = FastText.load(model_file)
word_file = EXP_HOME + '/w2vec-data/words.txt'
vec_file = EXP_HOME + '/w2vec-data/tomcat-vector.txt'
vec_lines = list()
words = open(word_file, 'r')
for word in words:
    try:
        if model.wv.__contains__(word.strip()):
            vector = model.wv[word.strip()]
            line = word.strip() + " " + ' '.join(str(x) for x in vector)
            vec_lines.append(line)
    except IOError:
        print("Could not find " + word)
        pass

output_file = open(vec_file, 'w')
for content in vec_lines:
    output_file.write("%s\n" % content)
output_file.close()
Example #7
from tqdm import tqdm
import random

# Required data and models
val_data = json.load(open("data/val.json", "rb"))  # data we need to predict

with open("tag_name.list", "rb") as f:
    tag_list = pickle.load(f)  # tag list

with open("music_tag.dic", "rb") as f:
    music_tag = pickle.load(f)  # tags grouped by track

with open("tag_music_freq.dic", "rb") as f:
    tag_music_freq = pickle.load(f)

fasttext = FastText.load("FastText.model")

# List that will hold the predictions; meeting the 10-tag / 100-song quota is handled separately later, for now just fill it in
results = []

#def type1_presager(data, ...):


def type1_vote(frequency):
    for tag, freq in tags.items():
        if freq == frequency:
            answer_tags.append(tag)
        else:
            pass

Example #8
from flask import Flask, render_template
import pymysql
import pandas as pd
import pickle
import re
import string
import nltk
from CobaVectorizer import MeanEmbeddingVectorizer
import gensim
from gensim.models import FastText

app = Flask(__name__, static_folder='static',
            template_folder='templates')  #Initialize the flask App
model = pickle.load(
    open('model_rf_byu200_02TS_Normal.pkl',
         'rb'))  # replace with your own Random Forest model if desired
loc = "FastTextModels/saved_model_gensim200SG_BYU.bin"  # replace with your own pre-trained model
model_ft = FastText.load(loc)
connection = pymysql.connect(host='localhost',
                             user='******',
                             password='',
                             database='sentimen')
count = 0


@app.route('/')
def home():
    return render_template('index.html')


@app.route('/predict', methods=['POST'])
def predict():
    '''
Example #9
tweet_words = preprocess(bipolar_data['text'])
del df
# ===========================================================================
# getting fasttext vectors
#model = FastText(tweet_words, size=100, window=3, min_count=1)
model = FastText(size=100, window=3, min_count=1)
model.build_vocab(tweet_words)  # first vocabulary build, so no update=True here
model.train(tweet_words, total_examples=model.corpus_count, epochs=10)

# ===========================================================================
# save and load model
from gensim.test.utils import get_tmpfile

fname = get_tmpfile("fasttext.model")
model.save(fname)
model = FastText.load(fname)
# ===========================================================================
# calculate the document vector as average of all the words
index2word_set = set(model.wv.index2word)


def avg_feature_vector(words, model, num_features, word_set):
    '''
    Calculates the average feature vector over all words found in the vocabulary.
    '''
    feature_vec = np.zeros((num_features, ), dtype='float32')
    n_words = 0
    for word in words:
        if word in word_set:
            n_words += 1
            feature_vec = np.add(feature_vec, model.wv[word])
    if n_words > 0:
        feature_vec = np.divide(feature_vec, n_words)
    return feature_vec
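A quick usage sketch for the helper above (the tweet index and the 100-dimensional size are assumptions that match the training call earlier):

doc_vec = avg_feature_vector(tweet_words[0], model, num_features=100,
                             word_set=index2word_set)
print(doc_vec.shape)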
Example #10
            tokens = tokenizer.tokenize(text)
        # if row.sentiment.lower() == 'neutral':
        #     pred_selected_text = row.selected_text
        jaccards.append(jaccard(row.selected_text, pred_selected_text))
        pred_selected_texts.append(pred_selected_text)
        text_tokens.append(tokens)
    df['jaccard'] = jaccards
    df['pred_selected_text'] = pred_selected_texts
    df['text_tokens'] = text_tokens
    if pred_file is not None:
        df.to_csv(Path(f'{Config.pred_dir}/{pred_file}'), index=False)
    return float(np.mean(jaccards))


__roberta_tokenizer = ByteLevelBPETokenizer(
    vocab_file=str(Config.Roberta.vocab_file),
    merges_file=str(Config.Roberta.merges_file),
    add_prefix_space=True,
    lowercase=True)

__bert_tokenizer = BertWordPieceTokenizer(
    vocab_file=str(Config.Bert.vocab_file))

__xlnet_tokenizer = XLNetTokenizer(vocab_file=str(Config.XLNet.vocab_file),
                                   do_lower_case=False)

__albert_tokenizer = AlbertTokenizer(vocab_file=str(Config.Albert.vocab_file),
                                     do_lower_case=True)

__ft_embeddings = FastText.load(str(Config.ft_embeddings_path))
Example #11
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 29 21:46:40 2018

@author: tianyu
"""

import os
import numpy as np
import pandas as pd
#os.chdir('/home/tiw15008/cleanfiles/fasttextmodel/')


from gensim.models import FastText
model = FastText.load('fasttext0928')

sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = FastText(sentences, min_count=1)

wv = model.wv

W = np.memmap("fastembed.dat", dtype=np.double, mode="r", shape=(424107, 300))
f = open("fastembed_clean.vocab",encoding='utf-8')
vocab_list = map(lambda x: eval(x.strip()), f.readlines())
vocab_dict = {w: k for k, w in enumerate(vocab_list)}

data = pd.read_table("data_gename.txt",index_col=0, delim_whitespace=True)
gene = data.index.values.tolist()

common = [word for word in gene if word.lower() in vocab_dict] # words in the dict
Example #12
        print("=" * 10)
        print(summay_text)
        print("=" * 10)

        nn_word_list = ut.kakao_postagger_nn_finder(summay_text)

        print(nn_word_list)

        fasttext_data.append(nn_word_list)

    fastText_model = ut.fastText(fasttext_data)

    print('fastText_model similar list')
    # similar = model.most_similar(positive=['윤호', '하이킥'], topn=10)
    # [('순재', 0.9943705797195435), ('거침없이', 0.9900286197662354), ('그에게', 0.9879124164581299), ('중매역활', 0.9861310720443726), ('하거나', 0.9786599278450012), ('자이젠', 0.9707398414611816), ('민정은', 0.9691370725631714), ('프리실라', 0.9552605152130127), ('시온', 0.954103946685791), ('타바사', 0.9522706866264343)]
    # similar = model.most_similar(positive=['카파도키아', '아르메니아', '기원전'], topn=10)
    # [('에우메네스', 0.9945849180221558), ('페르디카스로부터', 0.9932612180709839), ('공격하', 0.9814687967300415), ('받아', 0.9809004068374634), ('알케타스', 0.9726078510284424), ('321년', 0.97102952003479), ('마족', 0.9679989814758301), ('영웅전', 0.9672538638114929), ('것이다', 0.9660188555717468), ('에린이', 0.9653569459915161)]
    similar = fastText_model.most_similar(positive=['삼성'],
                                          negative=['제로페이'],
                                          topn=10)

    print(similar)

    ut.plt_show(fastText_model, img_name='fasttext.png')
    """
    Just run `python w2v_visualizer.py word2vec.model visualize_result`
    """

    word2vec_model = FastText.load("./fastText.model")
    ut.visualize(word2vec_model, "./fastText_log")
Example #13
from os import path, removedirs, remove, mkdir
from gensim.models import Word2Vec, FastText
from time import time

directory = "/home/zack/Desktop/Hons Project/program/models/results/"
topn = 10
start = time()
words = []

w2v = Word2Vec.load(
    "/home/zack/Desktop/Hons Project/program/models/w2v/w2v_twitteronly.model")
ft = FastText.load(
    "/home/zack/Desktop/Hons Project/program/models/fasttext/ft_twitteronly.model"
)

print("models loaded successfully!")

if path.exists(directory + "w2v_results.txt"):
    remove(directory + "w2v_results.txt")

w2v_outfile = open(directory + "w2v_results.txt", "a+")

if path.exists(directory + "ft_results.txt"):
    remove(directory + "ft_results.txt")

ft_outfile = open(directory + "ft_results.txt", "a+")

with open("/home/zack/Desktop/Hons Project/program/models/testwords.txt",
          "r") as f:
    for line in f:
        words.append(line.strip())
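The snippet stops here; a hedged sketch of the comparison step it is building toward (the exact output format is an assumption):

for w in words:
    if w in w2v.wv:
        w2v_outfile.write(f"{w}: {w2v.wv.most_similar(w, topn=topn)}\n")
    if w in ft.wv:
        ft_outfile.write(f"{w}: {ft.wv.most_similar(w, topn=topn)}\n")

w2v_outfile.close()
ft_outfile.close()
print(f"done in {time() - start:.1f} seconds")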
Example #14
        print("appending line " + str(i))
        i += 1
    print('Data input complete.')
    # train the model on the data
    model = FastText(lines, size=dim, min_count=3, iter=5)
    model.save('testModel.model')  # save in gensim .model format
    model.wv.save_word2vec_format('testModelVec.vector',
                                  binary=False)  # save as plain-text word vectors


'''Train the model (skip if a trained model already exists)'''
#remove_char(write_file='clean_data.txt', read_file='data_train.txt')
#train('clean_data.txt', 200)
'''Load and test the model'''
# load the model
model = FastText.load('testModel.model')

word1 = '电影'
word2 = '电视剧'
word3 = '跑步'
# get the word vector
print('Word vector for ' + word1 + ':')
print(model.wv[word1])
# most similar words
print('Words most similar to ' + word1 + ':')
print(model.wv.most_similar(word1))
# compute similarity
print('Similarity between ' + word1 + ' and ' + word2 + ':')
print(model.wv.similarity(word1, word2))
print('Similarity between ' + word3 + ' and ' + word2 + ':')
print(model.wv.similarity(word3, word2))
Example #15
class sentence2vec(object):
    # fix Chinese glyph rendering in matplotlib
    plt.rcParams['font.sans-serif'] = ['SimHei']
    plt.rcParams['axes.unicode_minus'] = False

    REAL = np.float32

    # precomputed normalization constant Z
    Z = 0.

    app = Flask(__name__)
    path = os.path.join(app.static_folder, Config.WordsModelFile)

    if Config.ModelMethod == 'Word2Vec':
        model = Word2Vec.load(path)
    elif Config.ModelMethod == 'FastText':
        model = FastText.load(path)
    else:
        model = Word2Vec.load(path)

    # load / initialize the model
    @staticmethod
    def init():
        # precompute the normalization constant Z
        sentence2vec.Z = sentence2vec.normalization_constant_Z()
        pass

    @staticmethod
    def timefn(fn):
        """Decorator that measures how long summary generation takes."""
        @wraps(fn)
        def measure_time(*args, **kwargs):
            t1 = time.time()
            result = fn(*args, **kwargs)
            t2 = time.time()
            print("@timefn:" + fn.__name__ + "  summary generation time: " +
                  np.str('%.2f' % (np.float32(t2 - t1))) + " seconds")
            return result

        return measure_time

    @staticmethod
    def normalization_constant_Z():
        '''Compute the normalization constant Z (total word count).'''
        vlookup = sentence2vec.model.wv.vocab
        Z = 0
        for k in vlookup:
            Z += vlookup[k].count
        return Z

    @staticmethod
    def sif_embeddings(sentences, model, alpha=1e-3):
        """Compute SIF (smooth inverse frequency) sentence embeddings.

        Parameters
        ----------
        sentences : list
            The sentences or documents to embed.
        model : gensim model trained with word2vec or FastText
            A gensim model containing word vectors and a vocabulary.
        alpha : float, optional
            Weighting parameter; each word is weighted by alpha / (alpha + p(w)).

        Returns
        -------
        numpy.ndarray
            SIF sentence embedding matrix of shape len(sentences) * dimension.
        """

        vlookup = model.wv.vocab  # vocabulary lookup table
        vectors = model.wv  # access to the word vectors
        size = model.vector_size  # embedding dimensionality

        output = []

        # iterate over all sentences
        for s in sentences:
            count = 0
            v = np.zeros(size, dtype=sentence2vec.REAL)  # sentence vector
            # iterate over all words
            for w in s:
                # the word must be in the vocabulary
                if w in vlookup:
                    for i in range(size):
                        # smooth inverse frequency (SIF) weighting
                        v[i] += (alpha / (alpha +
                                          (vlookup[w].count / sentence2vec.Z))
                                 ) * vectors[w][i]
                    count += 1

            if count > 0:
                for i in range(size):
                    v[i] *= 1 / count
            output.append(v)
        return np.vstack(output).astype(sentence2vec.REAL)

    @staticmethod
    def cut(text):
        '''Tokenize text with jieba.'''
        return ' '.join(jieba.cut(text))

    @staticmethod
    def split_sentences(text):
        '''Split text into sentences.'''
        sents = []
        text = re.sub(r'\n+', '。', text)  # turn newlines into periods (title lines often lack one)
        text = re.sub('([。!?\?])([^’”])', r'\1\n\2', text)  # sentence-ending punctuation not followed by a quote
        text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # English ellipsis not followed by a quote
        text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # Chinese ellipsis not followed by a quote
        text = re.sub('([.。!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2',
                      text)  # sentence-ending punctuation plus closing quote, not followed by another quote
        text = text.replace(u'。。', u'。')  # remove redundant periods
        text = text.replace(u'?。', u'。')  #
        text = text.replace(u'!。', u'。')  # remove redundant periods
        text = text.replace(u'\n', u'').replace(u'\r', u'')  # remove leftover \r\n
        text = text.replace(u'\u3000', u'')
        text = text.replace(u'\\n', u'')
        text = text.replace(u'点击图片', u'')
        text = text.replace(u'进入下一页', u'')
        #sentences = re.split(r'。|!|?|】|;',text) # split into sentences
        sentences = re.split('。|!|\!|\.|?|\?', text)  # split into sentences
        #sentences = re.split(r'[。,?!:]',text) # split into sentences
        sentences = sentences[:-1]  # drop the empty string after the final delimiter
        for sent in sentences:
            len_sent = len(sent)
            if len_sent < 4:  # skip line breaks, single characters, etc.
                continue
            # sent = sent.decode('utf8')
            sent = sent.strip('  ')
            sent = sent.lstrip('【')
            sent = sent.lstrip('】')
            sents.append(sent)
        return sents

    @staticmethod
    def knn_smooth(arr):
        '''KNN smoothing of cosine distances.'''
        result = []
        if len(arr) > 3:
            result = []
            for i in range(len(arr)):
                a = 0
                # first sentence: average the cosine distances of the first and second sentences
                if i < 1:
                    a = ((arr[i] + arr[i + 1]) / 2)
                    result.append(a)
                # middle sentences: average the previous, current and next cosine distances
                elif i < len(arr) - 1:
                    a = ((arr[i] + arr[i - 1] + arr[i + 1]) / 3)
                    result.append(a)
                # last sentence: average the cosine distances of the last and previous sentences
                else:
                    a = ((arr[i] + arr[i - 1]) / 2)
                    result.append(a)
        else:
            result = arr
        return result

    @staticmethod
    def get_plot(x1, x2, top_n):
        plt.figure(figsize=(12, 8))
        plt.plot(x1[:top_n],
                 linestyle='-.',
                 marker='o',
                 color='r',
                 alpha=0.5,
                 label='before smoothing')
        plt.plot(x2[:top_n],
                 linestyle='-.',
                 marker='o',
                 color='g',
                 alpha=0.5,
                 label='after smoothing')
        plt.title('KNN smoothing of consecutive-sentence relevance')
        plt.xlabel('sentence index')
        plt.ylabel('cosine distance (smaller = more important sentence)')
        plt.grid(linestyle='-.', alpha=0.7)
        plt.legend()
        for i, j in zip(np.arange(len(x1[:top_n])), x1[:top_n]):
            plt.text(i, j + 0.002, '%.3f' % j, color='r', alpha=0.7)
        for i, j in zip(np.arange(len(x2[:top_n])), x2[:top_n]):
            plt.text(i, j + 0.002, '%.3f' % j, color='g', alpha=0.7)

    @staticmethod
    def get_sen_doc_cosine(text, title, top_n=10, plot=True):
        '''Cosine distance between each sentence vector and the document vector.'''
        # join the text if it comes in as a list
        if isinstance(text, list): text = ' '.join(text)
        # split the document into sentences
        split_sens = sentence2vec.split_sentences(text)
        # vectorize the document
        doc_vec = sentence2vec.sif_embeddings([text],
                                              sentence2vec.model,
                                              alpha=1e-3)
        # empty dict for the sentence/document cosine distances
        sen_doc_cosine = {}
        # for each sentence, compute its vector and store the cosine distance
        for sen in split_sens:
            sen_vec = sentence2vec.sif_embeddings([sen],
                                                  sentence2vec.model,
                                                  alpha=1e-3)
            # cosine distance between the sentence and document vectors
            sen_doc_cosine[sen] = cosine(sen_vec, doc_vec)
        # separate key and value lists for the sentence/document cosine dict
        sen_doc_cosine_keys, sen_doc_cosine_values = [], []
        # unpack the sentence/document cosine dict into the two lists
        for i, j in sen_doc_cosine.items():
            sen_doc_cosine_keys.append(i)
            sen_doc_cosine_values.append(j)
        # before smoothing: convert the cosine-distance list to an array
        knn_before_cosine_values = np.array(sen_doc_cosine_values)
        # apply knn_smooth to get the smoothed cosine distances
        knn_after_cosine_values = np.array(
            sentence2vec.knn_smooth(sen_doc_cosine_values))
        # dict for the KNN-smoothed cosine distances
        knn_cosine_score = {}
        # map each sentence to its smoothed cosine distance
        knn_cosine_score = dict(
            zip(sen_doc_cosine_keys, knn_after_cosine_values))
        # plot the cosine distances before and after smoothing
        if plot:
            sentence2vec.get_plot(knn_before_cosine_values,
                                  knn_after_cosine_values, top_n)
        # return the sentences sorted by smoothed cosine distance (ascending)
        return sorted(knn_cosine_score.items(),
                      key=lambda x: x[1],
                      reverse=False)

    # variant used when a title is provided
    @staticmethod
    def get_sen_doc_title_cosine(text, title, weight=0.5, top_n=10, plot=True):
        '''Cosine distances between sentence/document and sentence/title vectors.'''
        # join the text if it comes in as a list
        if isinstance(text, list): text = ' '.join(text)
        # split the document into sentences
        split_sens = sentence2vec.split_sentences(text)
        # vectorize the document and the title
        doc_vec = sentence2vec.sif_embeddings([text],
                                              sentence2vec.model,
                                              alpha=1e-3)
        title_vec = sentence2vec.sif_embeddings([title],
                                                sentence2vec.model,
                                                alpha=1e-3)
        # empty dicts for the sentence/document and sentence/title cosine distances
        sen_doc_cosine, sen_title_cosine = {}, {}
        # for each sentence, compute its vector and store both cosine distances
        for sen in split_sens:
            sen_vec = sentence2vec.sif_embeddings([sen],
                                                  sentence2vec.model,
                                                  alpha=1e-3)
            # cosine distance between the sentence and document vectors
            sen_doc_cosine[sen] = cosine(sen_vec, doc_vec)
            # cosine distance between the sentence and title vectors
            sen_title_cosine[sen] = cosine(sen_vec, title_vec)
        # separate key and value lists for the sentence/document cosine dict
        sen_doc_cosine_keys, sen_doc_cosine_values = [], []
        # unpack the sentence/document cosine dict into the two lists
        for i, j in sen_doc_cosine.items():
            sen_doc_cosine_keys.append(i)
            sen_doc_cosine_values.append(j)
        # separate key and value lists for the sentence/title cosine dict
        sen_title_cosine_keys, sen_title_cosine_values = [], []
        # unpack the sentence/title cosine dict into the two lists
        for i, j in sen_title_cosine.items():
            sen_title_cosine_keys.append(i)
            sen_title_cosine_values.append(j)
        # before smoothing: (sentence/document) * weight + (sentence/title) * (1 - weight)
        knn_before_cosine_values = np.array(
            sen_doc_cosine_values) * weight + np.array(
                sen_title_cosine_values) * (1 - weight)
        # apply knn_smooth to get the smoothed cosine distances
        knn_after_cosine_values = np.array(
            sentence2vec.knn_smooth(sen_doc_cosine_values)
        ) * weight + np.array(
            sentence2vec.knn_smooth(sen_title_cosine_values)) * (1 - weight)
        # dict for the KNN-smoothed cosine distances
        knn_cosine_score = {}
        # map each sentence to its smoothed cosine distance
        knn_cosine_score = dict(
            zip(sen_doc_cosine_keys, knn_after_cosine_values))
        # plot the cosine distances before and after smoothing
        if plot:
            sentence2vec.get_plot(knn_before_cosine_values,
                                  knn_after_cosine_values, top_n)
        # return the sentences sorted by smoothed cosine distance (ascending)
        return sorted(knn_cosine_score.items(),
                      key=lambda x: x[1],
                      reverse=False)

    @staticmethod
    def get_summarize(text, title, weight=0.5, top_n=10, plot=False):
        '''Generate a summary, keeping the top 10 sentences by default.'''
        # split into sentences
        split_sens = sentence2vec.split_sentences(text)
        # ranked (sentence, cosine distance to the document vector) pairs
        if title == '':
            ranking_sentences = sentence2vec.get_sen_doc_cosine(text,
                                                                title,
                                                                top_n=top_n,
                                                                plot=plot)
        else:
            ranking_sentences = sentence2vec.get_sen_doc_title_cosine(
                text, title, weight=weight, top_n=top_n, plot=plot)
        # set of selected sentences
        selected_sen = set()
        if len(split_sens) > top_n:
            # take the top_n ranked sentences
            for sen, _ in ranking_sentences[:top_n]:
                selected_sen.add(sen)
        else:
            for sen, _ in ranking_sentences:
                selected_sen.add(sen)
        # list holding the summary sentences
        summarize = []
        # stitch the selected sentences back together in their original order
        for sen in split_sens:
            if sen in selected_sen:
                summarize.append(sen + '。')
        summarize = ' '.join(summarize)
        return summarize
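A minimal usage sketch for the class above (it assumes Config.WordsModelFile points at a trained gensim model and that news.txt is a hypothetical input file):

if __name__ == '__main__':
    sentence2vec.init()  # precompute the normalization constant Z once
    news_text = open('news.txt', encoding='utf-8').read()
    print(sentence2vec.get_summarize(news_text, title='', top_n=5, plot=False))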
Example #16
    for company in range(len(com_list)):
        for model_use in model_list:
            for per in percentage:
                per = int(per)
                article = pd.read_excel(
                    f'../{fo}/All_File/Final_Clean_Article.xlsx')
                article_center = pd.read_excel(
                    f'../{fo}/All_File/{com_list[company]}_intro.xlsx')

                all_article = article['內容'].tolist()
                news_time = article['時間'].tolist()

                all_article_center = article_center['Com_intro'].tolist()

                if model_use == 'fastText_stock':
                    model = FastText.load(
                        '../Word_Embedding_model/{}.model'.format(model_use))
                else:
                    model = Word2Vec.load(
                        '../Word_Embedding_model/{}.model'.format(model_use))
                print('Using Model : ',
                      '../Word_Embedding_model/{}.model'.format(model_use))

                score = []  # for cos
                Article_vector = []  # for article vector
                Article_extract = []  #for target article
                Article_vector_extract = []  # for target article vector
                Article_time_extract = []  # for target article time

                for y in all_article_center:
                    y = y.split(' ')
                    tmp_storage_y = []
Example #17
def fasttext():
    path = 'models/fasttext/fasttext.bin'
    model = FastText.load(path)
    return 'FastText', model
Example #18
# Reference: https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne/code

font_name = matplotlib.font_manager.FontProperties(
    fname="C:/Windows/Fonts/.ttf"  # put the path to a Korean font here
).get_name()
matplotlib.rc('font', family=font_name)

modelPath = "/Users/lemon/Desktop/multi-class-text-classification-cnn-master_combine/trained_model_1526302044/fastText.vec"
# C:\Users\lemon\Desktop\multi-class-text-classification-cnn-master_combine\trained_model_1526302044
# C:\Users\lemon\Desktop\multi-class-text-classification-cnn-master_combine\trained_model_1526302044/word2Vec.vec
# C:/Users/lemon/Desktop/multi-class-text-classification-cnn-master_combine/trained_model_1526302044/word2Vec.vec
# /Users/lemon/Desktop/multi-class-text-classification-cnn-master_combine/trained_model_1526302044/word2Vec.vec

# model = g.Doc2Vec.load(modelPath)
model = FastText.load(modelPath)

vocab = list(model.wv.vocab)
X = model[vocab]
# X = model[model.wv.vocab]
tsne = TSNE(n_components=2)
# X_tsne = tsne.fit_transform(X[:1000,:])
X_tsne = tsne.fit_transform(X)
df = pd.DataFrame(X_tsne, index=vocab, columns=['x', 'y'])

fig = plt.figure()
ax = fig.add_subplot(1, 1, 1)

ax.scatter(df['x'], df['y'])

for word, pos in df.iterrows():
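    # Hedged completion: the listing cuts the loop body off here; annotating each
    # point with its word is the usual final step of this t-SNE plot.
    ax.annotate(word, pos)

plt.show()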
Example #19
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import numpy as np
from get_feature import *

# list of all feature vectors
total_feature_list = list()
# list of all labels
label_list = list()
with open("label.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()
    for line in lines:
        label = line.strip()
        label_list.append(label)

model = FastText.load("AImed.model")
with open("further corpus.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()
    for line in lines:
        sentence2list = line.strip().split()
        temp = list()
        initial_vector = np.zeros(20)
        # initial_pos_vector = np.zeros(33)
        # 实体e1的向量
        v1 = model.wv['entityone']
        for item in v1:
            temp.append(item)
        # sentence vector (sum of word vectors)
        for word in sentence2list:
            vector = model.wv[word]
            initial_vector += vector
Example #20
# characters
chars = set([w_i for w in words for w_i in w])
n_chars = len(chars)
print("Number of characters: ", n_chars)

tag2idx = {t: i + 1 for i, t in enumerate(tags)}
tag2idx["PAD"] = 0
# Vocabulary Key:tag_index -> Value:Label/Tag
idx2tag = {i: w for w, i in tag2idx.items()}

# Char Key:char -> Value:token_index
char2idx = {c: i + 2 for i, c in enumerate(chars)}
char2idx["UNK"] = 1
char2idx["PAD"] = 0

words_fast = FastText.load('model_fast30/model_fast.model')

#load pretrained word embedding
embedding_matrix = np.ones((len(word2idx), 100), dtype='float32')
embedding_matrix[0] = np.zeros(100, dtype='float32')
# with open('wiki-news-300d-1M.vec') as f:
for i in range(2, len(idx2word) - 2):
    embedding_matrix[i] = words_fast[idx2word[i]]
#         ordered_words_ft.append(s[0])
print('Found %s word vectors.' % len(embedding_matrix))

# for word, i in word2idx.items():
#     embedding_vector = embeddings_index.get(word)
#     if embedding_vector is not None:
#         # words not found in embedding index will be all-zeros.
#         embedding_matrix[i] = embedding_vector
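# Sketch (not part of the original snippet): the matrix built above would typically
# feed a non-trainable Keras Embedding layer; the import path is an assumption.
from keras.layers import Embedding

word_embedding = Embedding(input_dim=len(word2idx),
                           output_dim=100,
                           weights=[embedding_matrix],
                           trainable=False)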
Example #21
def load_fast_text(file):
    return FastText.load(file)
Example #22
                        default="cosinus")
    parser.add_argument("-s",
                        "--savePath",
                        help="Path where to save model to",
                        required=True)

    args = parser.parse_args()

    print("Load data..")
    #    data = readFile(args.filename, columns=args.filenameColumns, sep=args.filenameDelimiter)
    data = pd.read_parquet(args.filename,
                           columns=["id", "text", "user_name"],
                           engine="pyarrow")

    print("Load word embeddings..")
    model_ft = FastText.load(args.wordEmbedding)

    print("Preprocess data..")
    data["text_vec"] = data[args.dataColumnName].map(
        lambda tweet: tweet_vectorizer(preprocess_tweet(tweet), model_ft))
    data["prep"] = data[args.dataColumnName].map(
        lambda tweet: preprocess_tweet(tweet))

    Nclusters = [10, 20, 30]
    scores = []
    for N in Nclusters:
        res = kmeans(data,
                     N,
                     args.maxIterations,
                     distance=args.distance,
                     vectorColumn="text_vec")
Example #23
def trained_metric(exp_id=0,
                   n_jobs=1,
                   freqs=(80, 100),
                   window=3,
                   emb_model='ft'):
    train_docs, test_docs = split_wiki9_articles(exp_id)
    save_dir = FLAGS.save_dir

    model_name = 'wiki9_{}_{}.model'.format(emb_model, FLAGS.exp_id)
    model_path = os.path.join(save_dir, model_name)

    if emb_model == 'ft':
        model = FastText.load(model_path)
    elif emb_model == 'w2v':
        model = Word2Vec.load(model_path)
    elif emb_model == 'glove':
        model = load_glove_model(model_path)
    elif emb_model == 'tfw2v':
        model = load_tf_embedding(FLAGS.exp_id,
                                  save_dir=save_dir,
                                  epoch=FLAGS.epoch,
                                  noise_multiplier=FLAGS.noise_multiplier,
                                  l2_norm_clip=FLAGS.l2_norm_clip,
                                  microbatches=FLAGS.microbatches)
    else:
        raise ValueError('No such embedding model: {}'.format(emb_model))

    word_vectors = model.wv.vectors
    word_emb = tf.convert_to_tensor(word_vectors)
    metric_model = LinearMetricModel(word_vectors.shape[1])

    optimizer = tf.train.AdamOptimizer(5e-4)
    inputs_a = tf.placeholder(tf.int64, (None, ), name="inputs_a")
    inputs_b = tf.placeholder(tf.int64, (None, ), name="inputs_b")
    labels = tf.placeholder(tf.float32, (None, ), name="labels")

    embs_a = tf.nn.embedding_lookup(word_emb, inputs_a)
    embs_b = tf.nn.embedding_lookup(word_emb, inputs_b)

    logits = metric_model.forward(embs_a, embs_b)

    if FLAGS.metric == 'cosine':
        embs_a = tf.nn.l2_normalize(embs_a, axis=1)
        embs_b = tf.nn.l2_normalize(embs_b, axis=1)

    dot = tf.reduce_sum(tf.multiply(embs_a, embs_b), axis=1)

    # loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
    loss = tf.keras.losses.hinge(labels, logits)
    loss = tf.reduce_mean(loss)

    t_vars = tf.trainable_variables()
    grads_and_vars = optimizer.compute_gradients(loss, t_vars)
    train_ops = optimizer.apply_gradients(
        grads_and_vars, global_step=tf.train.get_or_create_global_step())

    vocab_size = len(model.wv.vocab)
    thresh = (int(vocab_size * freqs[0] / 100),
              int(vocab_size * freqs[1] / 100))

    print("Loading contexts for membership inference")

    if n_jobs > 1:
        member_job_ctxs = Parallel(n_jobs)(
            delayed(get_all_contexts)(ds, model, thresh, window)
            for ds in split_docs(train_docs, n_jobs))
        nonmember_job_ctxs = Parallel(n_jobs)(
            delayed(get_all_contexts)(ds, model, thresh, window)
            for ds in split_docs(test_docs, n_jobs))
        member_ctxs = [
            ctxs for job_ctxs in member_job_ctxs for ctxs in job_ctxs
        ]
        nonmember_ctxs = [
            ctxs for job_ctxs in nonmember_job_ctxs for ctxs in job_ctxs
        ]
    else:
        member_ctxs = get_all_contexts(train_docs, model, thresh, window)
        nonmember_ctxs = get_all_contexts(test_docs, model, thresh, window)

    print("Loaded {} member and {} nonmember".format(len(member_ctxs),
                                                     len(nonmember_ctxs)))

    membership_labels = np.concatenate(
        [np.ones(len(member_ctxs)),
         np.zeros(len(nonmember_ctxs))])

    train_ctxs, test_ctxs, train_labels, test_labels = train_test_split(
        member_ctxs + nonmember_ctxs,
        membership_labels,
        random_state=12345,
        train_size=FLAGS.train_size,
        stratify=membership_labels)

    def flatten_ctxs(ctxs, labels):
        flat_ctxs, flat_labels = [], []
        for doc_ctx, doc_label in zip(ctxs, labels):
            flat_ctxs += doc_ctx
            flat_labels.append(np.ones(len(doc_ctx)) * doc_label)
        return flat_ctxs, np.concatenate(flat_labels)

    train_ctxs, train_labels = flatten_ctxs(train_ctxs, train_labels)
    test_ctxs, test_labels = flatten_ctxs(test_ctxs, test_labels)

    train_y = []
    for ctxs, label in zip(train_ctxs, train_labels):
        train_y.append(np.ones(len(ctxs)) * label)

    train_y = np.concatenate(train_y).astype(np.float32)
    train_x = np.vstack(train_ctxs)

    def collect_scores(ctxs, labels, sess, baseline=False):
        stacked_ctxs = np.vstack(ctxs)
        stacked_scores = []
        for batch_idx in iterate_minibatches_indices(len(stacked_ctxs),
                                                     batch_size=1024,
                                                     shuffle=False):
            feed = {
                inputs_a: stacked_ctxs[batch_idx][:, 0],
                inputs_b: stacked_ctxs[batch_idx][:, 1]
            }
            scores = sess.run(dot if baseline else logits, feed_dict=feed)
            stacked_scores.append(scores)
        stacked_scores = np.concatenate(stacked_scores)

        member_metrics, nonmember_metrics = [], []
        start_idx = 0
        for ctx, label in zip(ctxs, labels):
            scores = stacked_scores[start_idx:start_idx + len(ctx)]
            start_idx += len(ctx)

            if label == 1:
                member_metrics.append(scores)
            else:
                nonmember_metrics.append(scores)
        return member_metrics, nonmember_metrics

    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
        sess.run(tf.global_variables_initializer())

        test_member_metrics, test_nonmember_metrics = collect_scores(
            test_ctxs, test_labels, sess, True)

        compute_adversarial_advantage(
            [np.mean(m) for m in test_member_metrics],
            [np.mean(m) for m in test_nonmember_metrics])

        print('Training attack model with {} data...'.format(len(train_x)))
        for epoch in range(30):
            iterations = 0
            train_loss = 0

            for batch_idx in iterate_minibatches_indices(len(train_y),
                                                         batch_size=512,
                                                         shuffle=True):
                feed = {
                    inputs_a: train_x[batch_idx][:, 0],
                    inputs_b: train_x[batch_idx][:, 1],
                    labels: train_y[batch_idx]
                }
                err, _ = sess.run([loss, train_ops], feed_dict=feed)
                train_loss += err
                iterations += 1

            print("Epoch: {}, Loss: {:.4f}".format(epoch,
                                                   train_loss / iterations))
            test_member_metrics, test_nonmember_metrics = collect_scores(
                test_ctxs, test_labels, sess)
            compute_adversarial_advantage(
                [np.mean(m) for m in test_member_metrics],
                [np.mean(m) for m in test_nonmember_metrics])
Example #24
def load_FastText_yelp(path):
    print('loading FastText yelp...')
    ft_model = FastText.load(path)
    return ft_model
Example #25
        y_embed = [f'{i}_y' for i in range(100)]
        embed = embed_df[x_embed].values + embed_df[y_embed].values

        embed = l2norm(embed)

    # save embedding vector
    with open(f'{args.savedir}/{pre_embedname}.pickle', 'wb') as f:
        pickle.dump(embed, f)

else:
    print('[{0:15s}] Evaluation'.format('STATE'))
    # configuration
    # - feature selection
    show_features = ['category', 'brand', 'nb_reviews', 'vol_price', 'product']
    # - load embed model
    model = FastText.load(f'{args.savedir}/{modelname}.bin')
    # - filtering class
    filtering = Filtering(show_features)

    # 1. load data
    data, products, info = load(reviewpath, productpath, infopath)

    # 2. preprocessing new sentence
    # test_text = GP.fit([args.search], args.wordpath, args.pospath)
    test_text = list(map(GP.stopword, [args.search]))
    test_text = GP.spacefix(test_text)
    print('[{0:15s}] result : {1:}'.format('PREPROCESSING', test_text))
    test_sent_vec = GP.sent2vec(test_text, model)
    test_sent_vec = l2norm(test_sent_vec)

    # 3. calculate similarity: cosine distance
Example #26
# In[1]:

import pickle
from keras.models import load_model
import keras

# In[2]:

from keras.preprocessing import sequence
from keras.preprocessing.sequence import pad_sequences
from nltk import sent_tokenize, word_tokenize
from keras import backend as k

# load the word vectors
from gensim.models import FastText
model = FastText.load('fasttext_model')
fasttext = model.wv

# In[3]:


def pred1(model, sentence):
    _dtype = k.floatx()
    sentence_token = []
    sentence_token += word_tokenize(sentence)

    sentence_vec = []
    sentence_vec.append([fasttext[v] for v in sentence_token])

    padd = sequence.pad_sequences(sentence_vec, maxlen=45, dtype=_dtype)
    intent = ans1(model.predict(padd)[0])
Example #27
TIME_ZONE = 'Asia/Seoul'  # 'UTC'

USE_I18N = True

USE_L10N = True

USE_TZ = True

# Static files (CSS, JavaScript, Images)
# https://docs.djangoproject.com/en/3.1/howto/static-files/

STATIC_URL = '/static/'

# MODEL = FastText.load(r'C:\Users\NA\Desktop\Workspace\GJAI_WarmingUpProject\AIJOA_Project\wiki.ko\wiki_ko_v3.model')
MODEL = FastText.load(
    r'C:\Users\HAN\Desktop\WarmingUpProject\AIJOA_Project\wiki.ko\wiki_ko_v3.model'
)

MENULIST = {
    '폴더버거 핫치킨': [
        '골드버거 치킨', '오늘도 봐봐 치킨', '오늘도 보고 와 치킨', '불도 먹었어 치킨', '골드버거 핫치킨',
        '골드버거 치킨', '월드 보고 아침에', '오늘도 보고 와 치킨', '폴더 버거 킹', '홀더 버거 치킨',
        '뭘 더 먹어 치킨', '너 먹어 치킨', '뭐 먹어 치킨'
    ],
    '폴더버거 비프': [
        '골드버그 비프', '올더 버거 비프', '폴더 버거 비프', '골드버그 비프 세트', '올더 버거 비프 세트',
        '어디서 먹어 핑크색', '물 더 먹어 비트 세트', '골드버그 비프 세트', '올 더 버거 비틀 세트', '홀더 버거 비프',
        '뭘 더 먹어 비프', '너 먹어 피프 세트', '뭐 먹어 비프'
    ],
    '리아미라클버거':
    ['리아미라클버거', '미아 미라클버거', '리아미라클버거 세트', '미라클버거 세트', '리아 미라클 버거 세트'],