Example #1
def get_pretrained_embedding(vocabulary_processor):
    """Build an embedding matrix initialized from a binary word2vec file.

    Assumes module-level `embedding_file`, `n_words` and `EMBEDDING_SIZE`;
    words missing from the word2vec vocabulary keep their random init.
    """
    print("Load word2vec file {}\n".format(embedding_file))
    initW = np.random.uniform(-0.25, 0.25, (n_words, EMBEDDING_SIZE))
    word_vectors = KeyedVectors.load_word2vec_format(embedding_file, binary=True)
    for word in word_vectors.vocab:
        idx = vocabulary_processor.vocabulary_.get(word)
        if idx != 0:
            initW[idx] = word_vectors[word]
    return initW
Example #2
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        files = kwargs.get('files', {})
        self.dataset_name = kwargs.get('dataset_name')
        # summary length restriction
        self.max_length = 100
        # Set this to 0 if every sentence length is allowed
        self.min_sentence_length = 0
        # initiate stopword lists & w2v models
        self.stopwords = {}
        self.w2v_models = {}
        model_path = 'models/word2vec/blendle/word2vec_blendle'
        self.w2v_models['blendle'] = W2VModel(
            'blendle',
            Word2Vec.load(files.get(model_path, model_path)).wv)
        model_path = 'models/word2vec/google/GoogleNews-vectors-negative300.bin'
        self.w2v_models['google'] = W2VModel(
            'google',
            KeyedVectors.load_word2vec_format(files.get(
                model_path, model_path),
                                              binary=True))
        stopword_path = 'models/stopwords.json'
        with open(files.get(stopword_path, stopword_path)) as fh:
            self.stopwords = json.load(fh)

        # Load model state for inference
        # Only the normally trained models (no pre- or co-training) are included.
        self.USE_CUDA = False
        google_rnn_parameters = torch.load(
            'rnn/saved_models/google/rnn_normal',
            map_location=(lambda storage, loc: storage))

        blendle_rnn_parameters = torch.load(
            'rnn/saved_models/blendle/rnn_normal',
            map_location=(lambda storage, loc: storage))

        # Initiate model in eval mode with trained parameters
        self.rnn_model = {}
        self.rnn_model['blendle_normal'] = RNNModel(self.USE_CUDA,
                                                    blendle_rnn_parameters)
        self.rnn_model['google_normal'] = RNNModel(self.USE_CUDA,
                                                   google_rnn_parameters)

        # load other models
        self.svd_path = 'bin/summarization/bigram_svd_{}.model'.format(
            self.dataset_name)
        self.svd = sk_joblib.load(files.get(self.svd_path, self.svd_path))
        self.bigram_path = 'bin/summarization/bigrams_{}.pkl'.format(
            self.dataset_name)
        self.final_bigrams = pickle.load(open(self.bigram_path, 'rb'))
Example #3
def embed():
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    if os.path.exists(vocab_path):
        vocab = pickle.load(open(vocab_path, "rb"))
    else:
        iw, vocab, _ = build_index()
    size = len(list(vocab.keys()))
    emb = np.zeros(shape=[size, emb_dim])
    for word, index in vocab.items():
        if index in [0, 1, 2]:
            continue
        emb[index] = model[word]
    np.save(open(emb_path, "wb"), emb)
    return vocab, emb
Example #4
def get_embedding():
    emb_path = "datasets/temp/embedding.np"
    if os.path.exists(emb_path):
        return np.load(open(emb_path, 'rb'))
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        iw, vocab, _ = get_vocab()
        size = len(list(vocab.keys()))
        emb = np.zeros(shape=[size, emb_dim])
        for word, index in vocab.items():
            if index in [0, 1] or word not in model.vocab:
                continue
            emb[index] = model[word]
        np.save(open(emb_path, "wb"), emb)
        return emb
Example #5
def embed():
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    vocab = model.vocab
    size = len(vocab) + 2
    embedding = np.zeros(shape=[size, emb_dim])
    w2idx = {'UNK': 0, "<END>": 1}
    index = 2
    # rows 0 and 1 stay zero vectors for 'UNK' and '<END>'
    for word in vocab.keys():
        embedding[index] = model[word]
        w2idx[word] = index
        index += 1
    np.save(open(emb_path, "wb"), embedding)
    pickle.dump(w2idx, open(vocab_path, 'wb'))
    return w2idx, embedding
Example #6
    def __init__(self,
                 clambda=200,
                 n_topics=10,
                 batchsize=4096,
                 power=0.75,
                 words_pretrained=True,
                 temperature=1,
                 max_length=1000,
                 min_count=0,
                 word2vec_path=None):

        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        # Number of topics to fit
        self.n_topics = n_topics  # int(os.getenv('n_topics', 10))
        self.batchsize = batchsize
        # Power for neg sampling
        self.power = power  # float(os.getenv('power', 0.75))
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained  # bool(int(os.getenv('pretrained', True)))
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count
        self.word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
Example #7
def train_step():
    config = Config()

    with tf.Graph().as_default(), tf.Session() as session:
        # initializer = tf.random_normal_initializer(0.0, 0.02, dtype=tf.float64)
        with tf.variable_scope("mymodel", reuse=None, dtype=tf.float64):
            model = LSTMRNN(config=config, is_training=True)

        with tf.variable_scope("mymodel", reuse=True, dtype=tf.float64):
            valid_model = LSTMRNN(config=config, is_training=False)
            test_model = LSTMRNN(config=config, is_training=False)

        # add checkpoint
        pre_checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints-pre"))
        pre_checkpoint_prefix = os.path.join(pre_checkpoint_dir, "model-pre")
        if not os.path.exists(pre_checkpoint_dir):
            os.makedirs(pre_checkpoint_dir)

        checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())
        saver.restore(session, FLAGS.checkpoint2train)
        # tf.global_variables_initializer().run()

        global_steps = 1984156
        index = np.random.randint(1, 4000)
        begin_time = int(time.time())
        print("loading the dataset...")

        pretrained_word_model = KeyedVectors.load_word2vec_format(
            './data/GoogleNews-vectors-negative300.bin.gz', binary=True)
        '''
        pre_train_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                                  datapath='./data/stsallrmf.mask2.p',
                                                  embed_dim=FLAGS.embedding_dim, alpha=FLAGS.alpha_weight)
        print "index: ", index

        print "begin pre-training"

        for i in range(FLAGS.pre_num_epoch):
            print "the %d epoch pre-training..." % (i + 1)

            lr = model.assign_new_lr(session, config.lr)

            print "current learning rate is %f" % lr
            # 11000+ samples
            train_data, valid_data = cut_data(pre_train_data, 0.005)
            global_steps = run_epoch(model, session, train_data, global_steps, valid_model, valid_data)

            if i % FLAGS.pre_check_point_every == 0:
                path = saver.save(session, pre_checkpoint_prefix, global_steps)
                pos = 'index: %d, position: %s' % (i, path)
                with open(FLAGS.pre_logs_dir, 'a') as f:
                    f.write(pos)
                    f.write('\n')
                print pos

        print "pre-train finish."
        '''
        print("begin training")
        #data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model, datapath='./data/semtrain.mask2.p',
        #                                embed_dim=FLAGS.embedding_dim, alpha=FLAGS.alpha_weight)
        test_data = dataHelperMask.load_data(FLAGS.max_len,
                                             pretrained_word_model,
                                             datapath='./data/semtest.mask2.p',
                                             embed_dim=FLAGS.embedding_dim,
                                             alpha=FLAGS.alpha_weight)

        # print "length of train set:", len(data[2])
        # print "example of train set: ", data[3][index]
        print("length of test set:", len(test_data[2]))
        print("example of test set:", test_data[3][index])

        x1, x2, y, m1, m2 = test_data
        fetches = [test_model.sent1, test_model.sent2]
        feed_dict = {}
        feed_dict[test_model.input_data_s1] = x1
        feed_dict[test_model.input_data_s2] = x2
        feed_dict[test_model.target] = y
        feed_dict[test_model.mask_s1] = m1
        feed_dict[test_model.mask_s2] = m2
        test_model.assign_new_batch_size(session, len(x1))
        sentence1, sentence2 = session.run(fetches, feed_dict)

        print('type of sent1: ', type(sentence1))
        print('len of sent1: ', len(sentence1))
        print('type of sent2: ', type(sentence2))
        print('len of sent2: ', len(sentence2))
        '''
        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))

            lr = model.assign_new_lr(session, config.lr)

            print "current learning rate is %f" % lr

            train_data, valid_data = cut_data(data, 0.01)

            global_steps = run_epoch(model, session, train_data, global_steps, valid_model, valid_data)

            if i % config.checkpoint_every == 0 and i != 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                test_cost, test_pearson_r, test_spearman_r = evaluate(test_model, session, test_data, istest=True)
                res = 'index: %d, cost: %f, pearson_r: %f, spearman_r: %f' % (
                    i, test_cost, test_pearson_r, test_spearman_r)
                print res
                with open(FLAGS.logs_dir, 'a') as f:
                    f.write(res)
                    f.write('\n')
                    f.write('model position: {}\n'.format(path))
                print("Saved results checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        '''
        print("program end!")
Example #8
def load_model(path, save=None):
    wv_from_text = KeyedVectors.load_word2vec_format(path, binary=False)
    if save:
        wv_from_text.save(save)
    return wv_from_text
Example #9
def raiseError(error):
    return error


if __name__ == '__main__':
    global model

    #----------- Parsing Arguments ---------------
    p = argparse.ArgumentParser()
    p.add_argument("--model", help="Path to the trained model")
    p.add_argument("--binary", help="Specifies the loaded model is binary")
    p.add_argument("--host", help="Host name (default: localhost)")
    p.add_argument("--port", help="Port (default: 5000)")
    p.add_argument("--path", help="Path (default: /word2vec)")
    args = p.parse_args()

    model_path = args.model if args.model else "./model.bin.gz"
    binary = True if args.binary else False
    host = args.host if args.host else "localhost"
    path = args.path if args.path else "/word2vec"
    port = int(args.port) if args.port else 5000
    if not args.model:
        print("Usage: word2vec-apy.py --model path/to/the/model [--host host --port 1234]")
    model = kv.load_word2vec_format(model_path, binary=binary)
    api.add_resource(N_Similarity, path + '/n_similarity')
    api.add_resource(Similarity, path + '/similarity')
    api.add_resource(MostSimilar, path + '/most_similar')
    api.add_resource(Model, path + '/model')
    api.add_resource(ModelWordSet, '/word2vec/model_word_set')
    app.run(host=host, port=port)
Example #10
def loadW2V(fileName):
	"""
	Loads the word 2 vec model
	"""
	return KeyedVectors.load_word2vec_format(fileName, binary=True)
Example #11
def loadW2V(fileName):
    """
    Loads the word2vec model
    """
    return KeyedVectors.load_word2vec_format(fileName, binary=True)
Example #12
import os
import numpy as np
import annoy
from nlpia.data.loaders import BIGDATA_PATH
from gensim.models.word2vec import KeyedVectors
path = os.path.join(BIGDATA_PATH, 'GoogleNews-vectors-negative300.bin.gz')
wv = KeyedVectors.load_word2vec_format(path, binary=True)
len(wv.vocab)
# 3000000


index = annoy.AnnoyIndex(f=len(wv[wv.index2word[0]]))
for i, word in enumerate(wv.index2word):
    if not i % 100000:
        print('{}: {}'.format(i, word))
    index.add_item(i, wv[word])
# 0: </s>
# 100000: distinctiveness
# ...
# 2600000: cedar_juniper
# 2700000: Wendy_Liberatore
# 2800000: Management_GDCM
# 2900000: BOARDED_UP


num_vectors = len(wv.vocab)
num_trees = int(np.log(num_vectors).round(0))
index.build(num_trees)
index.save('Word2vec_index.ann')
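A hedged sketch of loading the saved index back and querying it (assumes the same 300-dimensional GoogleNews vectors and the annoy API used above):

index2 = annoy.AnnoyIndex(f=300)
index2.load('Word2vec_index.ann')
ids = index2.get_nns_by_vector(wv['dog'], 10)  # 10 approximate nearest neighbours
print([wv.index2word[i] for i in ids])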
Example #13
def train_step():
    config = Config()
    eval_config = Config()
    eval_config.keep_prob = 1.0

    # gpu_config=tf.ConfigProto()
    # gpu_config.gpu_options.allow_growth=True
    with tf.Graph().as_default(), tf.Session() as session:
        # This initialization is poor and gives very bad results
        initializer = tf.random_normal_initializer(0.0, 0.2, dtype=tf.float32)
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            model = LSTMRNN(config=config, sess=session, is_training=True)

        with tf.variable_scope("model", reuse=True, initializer=initializer):
            valid_model = LSTMRNN(config=eval_config,
                                  sess=session,
                                  is_training=False)
            test_model = LSTMRNN(config=eval_config,
                                 sess=session,
                                 is_training=False)

        # add summary
        train_summary_dir = os.path.join(config.out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir,
                                                     session.graph)

        dev_summary_dir = os.path.join(eval_config.out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                   session.graph)

        # add checkpoint
        checkpoint_dir = os.path.abspath(
            os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables())

        tf.global_variables_initializer().run()

        global_steps = 1
        begin_time = int(time.time())

        print("loading the dataset...")
        pretrained_word_model = KeyedVectors.load_word2vec_format(
            './GoogleNews-vectors-negative300.bin.gz', binary=True)

        pre_train_data = data_helper.load_data(FLAGS.max_len,
                                               pretrained_word_model,
                                               datapath='./data/stsallrmf.p',
                                               embed_dim=FLAGS.emdedding_dim)
        data = data_helper.load_data(FLAGS.max_len,
                                     pretrained_word_model,
                                     datapath='./data/semtrain.p',
                                     embed_dim=FLAGS.emdedding_dim)
        test_data = data_helper.load_data(FLAGS.max_len,
                                          pretrained_word_model,
                                          datapath='./data/semtest.p',
                                          embed_dim=FLAGS.emdedding_dim)

        print("length of pre-train set:", len(pre_train_data[0]))
        print("length of train set:", len(data[0]))
        print("length of test set:", len(test_data[0]))
        print("begin pre-training")

        for i in range(70):
            print("the %d epoch pre-training..." % (i + 1))
            lr = model.assign_new_lr(session, config.lr)
            print("current learning rate is %f" % lr)

            # 11000+ data
            train_data, valid_data = cut_data(pre_train_data, 0.05)

            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     train_summary_writer, dev_summary_writer)

        path = saver.save(session, checkpoint_prefix, global_steps)
        print("pre-train finish.")
        print("Saved pre-train model checkpoint to {}\n".format(path))
        print("begin training")

        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))
            # lr_decay = config.lr_decay ** max(i - config.max_decay_epoch, 0.0)
            lr = model.assign_new_lr(session, config.lr)
            print('current learning rate is %f' % lr)

            train_data, valid_data = cut_data(data, 0.1)

            global_steps = run_epoch(model, session, train_data, global_steps,
                                     valid_model, valid_data,
                                     train_summary_writer, dev_summary_writer)

            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                print("Saved model checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))
        test_cost, test_pearson_r = evaluate(test_model, session, test_data)
        print("the test data cost is %f" % test_cost)
        print("the test data pearson_r is %f" % test_pearson_r)

        print("program end!")
Example #14
#### keras tensorflow
# ref to StyleTransfer.ipynb

#### Embedding
# ref to: MNIST/Embeddings.ipynb

As long as an Embedding layer exists, Keras records it automatically and it shows up directly in TensorBoard. I only need to generate the meta-data file manually; it can either be bound in the program or loaded inside TensorBoard. As for the meta-data file, generating it ahead of time is fine, and generating it from the program is fine too.
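A minimal sketch of that workflow (Keras 2.x assumed; `word_index` is a hypothetical dict mapping each word to its row in the Embedding layer):

from keras.callbacks import TensorBoard

# 1. write the meta-data file by hand: one label per line, in row order
with open('logs/metadata.tsv', 'w') as fh:
    for word, idx in sorted(word_index.items(), key=lambda kv: kv[1]):
        fh.write(word + '\n')

# 2. bind it in the program via the TensorBoard callback; the Embedding
#    layer itself is recorded automatically, only the labels are added here
tensorboard = TensorBoard(log_dir='logs', embeddings_freq=1,
                          embeddings_metadata='logs/metadata.tsv')
# model.fit(x_train, y_train, callbacks=[tensorboard])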

#### NLP, NLTK, ngrams
# ref to: Quora/Quora-neural-network.ipynb
# ref to: Quora/Quora-Feature-Enginnering.ipynb

#### word2vec
from gensim.models.word2vec import Word2Vec, KeyedVectors
word2vec = KeyedVectors.load_word2vec_format('/input/Kaggle/Word2Vec/GoogleNews-vectors-negative300.bin.gz', binary=True)
'apple' in word2vec.vocab # True
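# a couple of follow-up calls on the same loaded vectors (gensim 3.x KeyedVectors API)
word2vec['apple'].shape  # (300,) -- the GoogleNews vectors are 300-dimensional
word2vec.most_similar('apple', topn=3)  # nearest neighbours by cosine similarity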

#### Tokenizer-Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem.snowball import EnglishStemmer

# Custom analyzer that adds stemming
count_analyzer = CountVectorizer().build_analyzer()
stemmer = EnglishStemmer()

def stem_count_analyzer(doc):
    return (stemmer.stem(w) for w in count_analyzer(doc))

cv = CountVectorizer(analyzer=stem_count_analyzer, preprocessor=None, stop_words='english', max_features=128)
cv.fit(unique_questions)
Example #15
        XX = X - X.dot(pc.transpose()).dot(pc)
    return XX


if isTest:
    words_dict = {'I': np.array([1., 1., 1., 1.], dtype=float),
                  'am': np.array([2., 2., 2., 2.], dtype=float),
                  'You': np.array([0.9, 0.9, 0.9, 0.9], dtype=float),
                  'Today': np.array([10.0, 9.0, 8.0, 7.0], dtype=float),
                  'boy': np.array([4.5, 3., 5., 6.], dtype=float),
                  'girl': np.array([4.4, 3.1, 5.2, 6.1], dtype=float)
                  }
else:
    print('Loading word vectors...')
    # word2vec word-vector dictionary
    words_dict = KeyedVectors.load_word2vec_format('./others/GoogleNews-vectors-negative300.bin.gz', binary=True)

    print('Word vectors loaded!')

if isTest:
    embed_domins = 4
    p1 = ['I am a boy . ', 'You are a girl . ', 'I like playing basketball . ']
    p2 = ['Today is a nice day .', 'Something will happen today . ', 'Do you love me ? ']
    scores = [0.5, 0.4, 0.3]
    weights_dict = {'am': 0.5}
else:
    p1 = []
    p2 = []
    scores = []
    with open(single_data_path, 'r') as f:
        for line in f:
Example #16
def train_step():
    config = Config()

    with tf.Graph().as_default(), tf.Session() as session:
        initializer = tf.random_normal_initializer(0.0, 0.02, dtype=tf.float64)
        with tf.variable_scope("mymodel", reuse=None, initializer=initializer, dtype=tf.float64):
            model = LSTMRNN(config=config, is_training=True)

        with tf.variable_scope("mymodel", reuse=True, initializer=initializer, dtype=tf.float64):
            valid_model = LSTMRNN(config=config, is_training=False)
            test_model = LSTMRNN(config=config, is_training=False)

        # add checkpoint
        pre_checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints-pre"))
        pre_checkpoint_prefix = os.path.join(pre_checkpoint_dir, "model-pre")
        if not os.path.exists(pre_checkpoint_dir):
            os.makedirs(pre_checkpoint_dir)

        checkpoint_dir = os.path.abspath(os.path.join(config.out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)

        saver = tf.train.Saver(tf.global_variables())

        tf.global_variables_initializer().run()

        global_steps = 1

        begin_time = int(time.time())
        print("loading the dataset...")

        pretrained_word_model = KeyedVectors.load_word2vec_format('./data/GoogleNews-vectors-negative300.bin.gz',
                                                                  binary=True)
	
        pre_train_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model,
                                                  datapath='./data/stsallrmf.mask2.p',
                                                  embed_dim=FLAGS.embedding_dim, alpha=FLAGS.alpha_weight)

        print("length of pre-train set:", len(pre_train_data[2]))

        print("begin pre-training...")

        for i in range(FLAGS.pre_num_epoch):
            print("the %d epoch pre-training..." % (i + 1))

            lr = model.assign_new_lr(session, config.lr)

            print("current learning rate is %f" % lr)
            # 11000+ samples
            train_data, valid_data = cut_data(pre_train_data, 0.005)
            global_steps = run_epoch(model, session, train_data, global_steps, valid_model, valid_data)

            if i % config.pre_checkpoint_every == 0 and i != 0:
                path = saver.save(session, pre_checkpoint_prefix, global_steps)
                pos = 'index: %d, position: %s' % (i, path)
                with open(FLAGS.pre_logs_dir, 'a') as f:
                    f.write(pos)
                    f.write('\n')
                print(pos)

        print("pre-train finish.")

        print("begin training")
        data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model, datapath='./data/semtrain.mask2.p',
                                        embed_dim=FLAGS.embedding_dim, alpha=FLAGS.alpha_weight)
        test_data = dataHelperMask.load_data(FLAGS.max_len, pretrained_word_model, datapath='./data/semtest.mask2.p',
                                             embed_dim=FLAGS.embedding_dim, alpha=FLAGS.alpha_weight)

        print("length of train set:", len(data[2]))
        print("length of test set:", len(test_data[2]))

        for i in range(config.num_epoch):
            print("the %d epoch training..." % (i + 1))

            lr = model.assign_new_lr(session, config.lr)

            print("current learning rate is %f" % lr)

            train_data, valid_data = cut_data(data, 0.01)

            global_steps = run_epoch(model, session, train_data, global_steps, valid_model, valid_data)

            if i % config.checkpoint_every == 0:
                path = saver.save(session, checkpoint_prefix, global_steps)
                test_cost, test_pearson_r, test_spearman_r = evaluate(test_model, session, test_data, istest=True)
                res = 'index: %d, cost: %f, pearson_r: %f, spearman_r: %f' % (
                    i, test_cost, test_pearson_r, test_spearman_r)
                print(res)
                with open(FLAGS.logs_dir, 'a') as f:
                    f.write(res)
                    f.write('\n')
                    f.write('model position: {}\n'.format(path))
                print("Saved results checkpoint to {}\n".format(path))

        print("the train is finished")
        end_time = int(time.time())
        print("training takes %d seconds already\n" % (end_time - begin_time))

        print("program end!")
Example #17
    args = parser.parse_args()
    np.random.seed(args.seed)
    random.seed(args.seed)

    meta = Meta()
    if args.edev:
        edev = read(args.edev, lang='en')
    if args.hdev:
        hdev = read(args.hdev, lang='hi')
    if args.cdev:
        cdev = read(args.cdev, lang='dev')
    if not args.load_model:
        train_e = read(args.etrain, 'en')
        train_h = read(args.htrain, 'hi')
        ewvm = KeyedVectors.load_word2vec_format(args.eembd,
                                                 binary=args.bvec,
                                                 limit=args.elimit)
        hwvm = KeyedVectors.load_word2vec_format(args.hembd,
                                                 binary=args.bvec,
                                                 limit=args.hlimit)
        meta.w_dim_eng = ewvm.syn0.shape[1]
        meta.n_words_eng = ewvm.syn0.shape[0] + meta.add_words
        meta.w_dim_hin = hwvm.syn0.shape[1]
        meta.n_words_hin = hwvm.syn0.shape[0] + meta.add_words

        get_char_map(train_e + train_h)
        meta.ew2i = {}
        for w in ewvm.vocab:
            meta.ew2i[w] = ewvm.vocab[w].index + meta.add_words
        meta.hw2i = {}
        for w in hwvm.vocab:
Example #18
def load_model():
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    return model
Example #19
def train_word2vec():
    '''Train the word2vec term vectors.
    '''
    if not os.path.exists(text_path):
        build_text()
    model = Word2Vec(sentences=LineSentence(text_path), size=emb_dim, window=5, min_count=5, iter=10)
    model.wv.save_word2vec_format(model_path, binary=True)
    return model
    
def get_embedding():
    emb_path = "datasets/temp/embedding.np"
    if os.path.exists(emb_path):
        return np.load(open(emb_path, 'rb'))
    else:
        model = KeyedVectors.load_word2vec_format(model_path, binary=True)
        iw, vocab, _ = get_vocab()
        size = len(list(vocab.keys()))
        emb = np.zeros(shape=[size, emb_dim])
        for word, index in vocab.items():
            if index in [0, 1] or word not in model.vocab:
                continue
            emb[index] = model[word]
        np.save(open(emb_path, "wb"), emb)
        return emb
    
if __name__ == "__main__":
    train_word2vec()
    model = KeyedVectors.load_word2vec_format(model_path, binary=True)
    emb = get_embedding()
Example #20
        saver = tf.train.Saver(var_list=tf.trainable_variables(),
                               max_to_keep=FLAGS.num_checkpoints)

        # Write vocabulary
        # vocabulary_processor.save(os.path.join(FLAGS.checkpointDir, "vocab"))

        # Initialize all variables before assigning pretrained embeddings,
        # otherwise the initializer would overwrite the assigned values
        sess.run(tf.global_variables_initializer())
        sess.run(tf.local_variables_initializer())

        initW = None
        if FLAGS.embedding_type in \
                ['static', 'none_static', 'multiple_channels']:
            # initial matrix with random uniform
            initW = np.random.uniform(
                -0.25, 0.25,
                (len(vocabulary_processor.vocabulary_), FLAGS.embedding_dim))
            # load any vectors from the word2vec
            print("Load word2vec file {}\n".format(FLAGS.embedding_file))
            word_vectors = KeyedVectors.load_word2vec_format(
                FLAGS.embedding_file, binary=True)
            for word in word_vectors.vocab:
                idx = vocabulary_processor.vocabulary_.get(word)
                if idx != 0:
                    initW[idx] = word_vectors[word]
            sess.run(cnn.W.assign(initW))
            if FLAGS.embedding_type == 'multiple_channels':
                sess.run(cnn.W_static.assign(initW))

        def train_step(x_batch_train, y_batch_train):
            """
            A single training step