Example No. 1
import os

import numpy as np
import tensorflow as tf

# Word2VecDataset, Word2VecModel, get_files_from_dir and FLAGS are assumed to be
# provided by the surrounding project (a TensorFlow 1.x word2vec implementation).


def main(_):
    # Collect the training files from a directory, falling back to an explicit list.
    in_files = get_files_from_dir(FLAGS.in_dir)
    if in_files is None:
        in_files = FLAGS.filenames

    dataset = Word2VecDataset(arch=FLAGS.arch,
                              algm=FLAGS.algm,
                              epochs=FLAGS.epochs,
                              batch_size=FLAGS.batch_size,
                              max_vocab_size=FLAGS.max_vocab_size,
                              min_count=FLAGS.min_count,
                              sample=FLAGS.sample,
                              window_size=FLAGS.window_size)
    # Build the vocabulary and word-frequency tables from the input corpus.
    dataset.build_vocab(in_files)

    word2vec = Word2VecModel(arch=FLAGS.arch,
                             algm=FLAGS.algm,
                             embed_size=FLAGS.embed_size,
                             batch_size=FLAGS.batch_size,
                             negatives=FLAGS.negatives,
                             power=FLAGS.power,
                             alpha=FLAGS.alpha,
                             min_alpha=FLAGS.min_alpha,
                             add_bias=FLAGS.add_bias,
                             random_seed=0)
    # Build the training graph; returns the tensors to evaluate at each step.
    to_be_run_dict = word2vec.train(dataset, in_files)

    with tf.Session() as sess:
        # Initialize the input iterator, lookup tables and model variables.
        sess.run(dataset.iterator_initializer)
        sess.run(tf.tables_initializer())
        sess.run(tf.global_variables_initializer())

        average_loss = 0.
        step = 0
        # Run training steps until the input pipeline (all epochs) is exhausted.
        while True:
            try:
                result_dict = sess.run(to_be_run_dict)
            except tf.errors.OutOfRangeError:
                break

            average_loss += result_dict['loss'].mean()
            if step % FLAGS.log_per_steps == 0:
                if step > 0:
                    average_loss /= FLAGS.log_per_steps
                print('step:', step, 'average_loss:', average_loss,
                      'learning_rate:', result_dict['learning_rate'])
                average_loss = 0.

            step += 1

        # Fetch the learned input embeddings (one row per vocabulary word).
        syn0_final = sess.run(word2vec.syn0)

    # Save the embedding matrix and the vocabulary (one word per line, same order).
    np.save(os.path.join(FLAGS.out_dir, 'embed'), syn0_final)
    with open(os.path.join(FLAGS.out_dir, 'vocab.txt'), 'w',
              encoding="utf-8") as fid:
        for w in dataset.table_words:
            fid.write(w + '\n')

    print('Word embeddings saved to', os.path.join(FLAGS.out_dir, 'embed.npy'))
    print('Vocabulary saved to', os.path.join(FLAGS.out_dir, 'vocab.txt'))
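The two files written above can be loaded back with NumPy. A minimal sketch, assuming row i of embed.npy corresponds to line i of vocab.txt, and with a hypothetical output directory standing in for FLAGS.out_dir:

import os

import numpy as np

out_dir = '/tmp/word2vec_out'  # hypothetical; use the same directory as FLAGS.out_dir

# Load the embedding matrix and the vocabulary written by main().
embeddings = np.load(os.path.join(out_dir, 'embed.npy'))
with open(os.path.join(out_dir, 'vocab.txt'), encoding='utf-8') as fid:
    vocab = [line.rstrip('\n') for line in fid]

# Map each word to its embedding row.
word_to_vec = {w: embeddings[i] for i, w in enumerate(vocab)}
print(embeddings.shape, len(vocab))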
Example No. 2
from models.supersenses.embeddings import TOKENS_WORD2VEC, LEMMAS_WORD2VEC
from models.supersenses.lstm_mlp_supersenses_model import LstmMlpSupersensesModel
from word2vec import Word2VecModel

w2v = Word2VecModel.load_google_model()

def boknilev_record_to_lstm_model_sample_xs(record):
    """Convert one preprocessed boknilev record into a list of SampleX inputs, one per token."""
    return [LstmMlpSupersensesModel.SampleX(
                token=record['tokens'][ind],
                ind=ind,
                ud_xpos=record['preprocessing']['ud_xpos'][ind],
                ud_upos=None,
                ner=record['preprocessing']['ner'][ind],
                lemma=record['preprocessing']['lemma'][ind],
                ud_dep=record['preprocessing']['ud_dep'][ind],
                ud_head_ind=record['preprocessing']['ud_head_ind'][ind],
                is_part_of_mwe=False,
                gov_ind=(record['preprocessing']['govobj'][ind]['gov'] - 1
                         if record['preprocessing']['govobj'][ind]['gov'] else None),
                obj_ind=(record['preprocessing']['govobj'][ind]['obj'] - 1
                         if record['preprocessing']['govobj'][ind]['obj'] else None),
                govobj_config=record['preprocessing']['govobj'][ind]['config'],
                identified_for_pss=ind in [pp['ind'] for pp in record['pps']],
                lexcat=None,
                token_word2vec=(w2v.get(record['tokens'][ind])
                                if record['tokens'][ind] not in TOKENS_WORD2VEC else None),
                lemma_word2vec=(w2v.get(record['preprocessing']['lemma'][ind])
                                if record['preprocessing']['lemma'][ind] not in LEMMAS_WORD2VEC else None)
            ) for ind in range(len(record['tokens']))
    ]
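For reference, a hand-built toy record in the shape the converter expects. All field values below are illustrative only, and running it requires the Google News model already loaded into w2v above:

record = {
    'tokens': ['stay', 'at', 'home'],
    'preprocessing': {
        'ud_xpos': ['VB', 'IN', 'NN'],
        'ner': ['O', 'O', 'O'],
        'lemma': ['stay', 'at', 'home'],
        'ud_dep': ['root', 'case', 'obl'],
        'ud_head_ind': [None, 2, 0],
        # 'gov'/'obj' are treated as 1-based indices above (hence the "- 1"); falsy means absent.
        'govobj': [
            {'gov': None, 'obj': None, 'config': None},
            {'gov': 1, 'obj': 3, 'config': 'default'},
            {'gov': None, 'obj': None, 'config': None},
        ],
    },
    'pps': [{'ind': 1}],  # illustrative: the preposition at index 1 was identified for PSS tagging
}

sample_xs = boknilev_record_to_lstm_model_sample_xs(record)
print(len(sample_xs))  # one SampleX per token -> 3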
Example No. 3
from flask import Flask, jsonify
import logging

from word2vec import Word2VecModel

logging.basicConfig(filename='synonymvis.log', level=logging.DEBUG)
app = Flask(__name__)

# Flip app.debug to True to serve a lightweight mock instead of loading the real model.
app.debug = False
if app.debug:
    logging.info('Starting in debug mode...')
    from word2vec_mock import Word2VecModelMock
    app.w2v_model = Word2VecModelMock()
else:
    logging.info('Loading word2vec model...')
    app.w2v_model = Word2VecModel()


@app.route('/')
def root():
    return app.send_static_file('index.html')


@app.route('/api/most_similar/<word>')
def most_similar(word):
    """Returns a list of 20 most similar (word, vector) pairs."""
    return jsonify(results=app.w2v_model.most_similar(word))


@app.route('/api/get_vectors/<words>')
def get_vectors(words):
Example No. 4
from flask import Flask, request
from flask_restplus import Api, Resource
from util.utils import get_logger, is_number
from word2vec import Word2VecModel

logger = get_logger(__name__)

word2VecModel = Word2VecModel()

app = Flask(__name__)
api = Api(app,
          doc='/doc/',
          version='1.0',
          title='Content Insights with Deep Learning')

ns_word2vec = api.namespace('word2vec', 'Word2Vec')


@ns_word2vec.route('/most_similar')
class MostSimilarResource(Resource):
    @api.doc(params={
        'word': 'Word',
        'topn': 'Number of words to return (default: 10)'
    })
    @api.response(200, 'Success')
    def get(self):
        """Get most similar words"""
        result = {'error': False, 'message': '', 'similar': []}
        topn = request.values.get('topn', '')
        if not topn or not topn.strip():
            topn = '10'
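Once the app is running (Flask's development server defaults to port 5000; exact host and port depend on how it is launched), the endpoint can be exercised with a plain GET request, for example:

import requests

# Hypothetical local deployment; adjust host/port to your setup.
resp = requests.get('http://localhost:5000/word2vec/most_similar',
                    params={'word': 'king', 'topn': 5})
print(resp.json())  # {'error': ..., 'message': ..., 'similar': [...]}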
Example No. 5
import os

from word2vec import Word2VecModel  # import path assumed from the other examples above


def load_word2vec(path):
    """Load a saved Word2VecModel from disk, or return an empty model if the file is missing."""
    w2v = Word2VecModel({})
    if os.path.exists(path):
        with open(path, 'rb') as f:
            w2v = Word2VecModel.load(f)
    return w2v
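A minimal usage sketch; the path below is hypothetical, and an empty model is returned when the file does not exist:

w2v = load_word2vec('models/word2vec.bin')  # hypothetical path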
Example No. 6
import os
import time

import numpy as np
import tensorflow as tf

# Word2VecDataset, Word2VecModel and FLAGS are assumed to come from the
# surrounding project, as in Example No. 1.


def main(_):
    dataset = Word2VecDataset(arch=FLAGS.arch,
                              algm=FLAGS.algm,
                              epochs=FLAGS.epochs,
                              batch_size=FLAGS.batch_size,
                              max_vocab_size=FLAGS.max_vocab_size,
                              min_count=FLAGS.min_count,
                              sample=FLAGS.sample,
                              window_size=FLAGS.window_size,
                              fixed_window_size=FLAGS.fixed_window_size,
                              )
    # create one hot encoded vocabulary dataset
    dataset.build_vocab(FLAGS.filenames)

    word2vec = Word2VecModel(arch=FLAGS.arch,
                             algm=FLAGS.algm,
                             embed_size=FLAGS.embed_size,
                             batch_size=FLAGS.batch_size,
                             negatives=FLAGS.negatives,
                             power=FLAGS.power,
                             alpha=FLAGS.alpha,
                             min_alpha=FLAGS.min_alpha,
                             add_bias=FLAGS.add_bias,
                             random_seed=0)
    # define training operations
    to_be_run_dict = word2vec.train(dataset, FLAGS.filenames)

    print("Corpus size: ", dataset._corpus_size, " words")
    print("Vocabulary size: ", len(dataset.table_words), " words")
    print("Number of sentences: ", dataset.num_sentences)

    with tf.compat.v1.Session() as sess:
        # init variables
        sess.run(dataset.iterator_initializer)
        sess.run(tf.compat.v1.tables_initializer())
        sess.run(tf.compat.v1.global_variables_initializer())

        average_loss = 0.

        step = 0
        # each step processes one batch
        # steps <= (corpus_size / batch_size) * epochs                     if arch = cbow
        # steps <= (corpus_size / batch_size) * window_size * 2 * epochs   if arch = skip_gram
        print("Begin training")
        start_time = time.time()
        while True:
            try:
                # perform one training step
                result_dict = sess.run(to_be_run_dict)
            except tf.errors.OutOfRangeError:
                print("Training completed in ", step, " steps")
                print("Total time: %.3f" % (time.time() - start_time), " seconds")
                break
            average_loss += result_dict['loss'].mean()
            if step % FLAGS.log_per_steps == 0 and step > 0:
                average_loss /= FLAGS.log_per_steps
                print('step:', step, 'average_loss:', average_loss,
                      'learning_rate:', "%.8f" % result_dict['learning_rate'],
                      'progress:', "%.6f" % result_dict['progress'])
                average_loss = 0.
            step += 1

        syn0_final = sess.run(word2vec.syn0)

    print("Corpus size: ", dataset._corpus_size, " words")
    print("Vocabulary size: ", len(dataset.table_words), " words")
    print("Number of sentences: ", dataset.num_sentences)

    # create output folders
    if not os.path.exists(FLAGS.out_dir):
        os.makedirs(FLAGS.out_dir)

    np.save(os.path.join(FLAGS.out_dir, 'embed'), syn0_final)
    print("Embedding shape: ", syn0_final.shape)
    with open(os.path.join(FLAGS.out_dir, 'vocab.txt'), 'w', encoding="utf-8") as fid:
        for w in dataset.table_words:
            fid.write(w + '\n')
    print('Word embeddings saved to', os.path.join(FLAGS.out_dir, 'embed.npy'))
    print('Vocabulary saved to', os.path.join(FLAGS.out_dir, 'vocab.txt'))
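As in Example No. 1, the saved matrix can be reused directly. A short sketch computing cosine similarity between two vocabulary words, assuming row i of embed.npy matches line i of vocab.txt; the output directory and the two words are hypothetical:

import os

import numpy as np

out_dir = 'output'  # hypothetical; the directory passed as FLAGS.out_dir

embeddings = np.load(os.path.join(out_dir, 'embed.npy'))
with open(os.path.join(out_dir, 'vocab.txt'), encoding='utf-8') as fid:
    index = {w.rstrip('\n'): i for i, w in enumerate(fid)}

def cosine(u, v):
    # Cosine similarity between two embedding vectors.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

print(cosine(embeddings[index['king']], embeddings[index['queen']]))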