Example #1
def get_vocab(self):
    # Build one placeholder token per memory slot: 'v0', 'v1', ...
    mem_tokens = ['v{}'.format(i) for i in range(self.max_mem)]
    # The vocabulary covers the namespace's names, the special tokens,
    # and the memory-slot tokens.
    vocab = data_utils.Vocab(self.namespace.get_all_names() + SPECIAL_TKS +
                             mem_tokens)
    return vocab
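
A minimal self-contained sketch of how this method behaves; `Vocab`, `SPECIAL_TKS`, and the namespace object below are hypothetical stand-ins for the project's own definitions (here `Vocab` is assumed to accept a plain token list, as in the call above):

SPECIAL_TKS = ['<unk>', '<pad>']  # placeholder special tokens

class Vocab:
    """Stand-in for data_utils.Vocab: just records the token list."""
    def __init__(self, tokens):
        self.tokens = list(tokens)

class Namespace:
    """Stand-in namespace exposing the names the method concatenates."""
    def get_all_names(self):
        return ['add', 'subtract']

class Host:
    """Hypothetical host class providing the attributes get_vocab uses."""
    def __init__(self, max_mem):
        self.max_mem = max_mem
        self.namespace = Namespace()

    def get_vocab(self):
        mem_tokens = ['v{}'.format(i) for i in range(self.max_mem)]
        return Vocab(self.namespace.get_all_names() + SPECIAL_TKS + mem_tokens)

print(Host(max_mem=3).get_vocab().tokens)
# -> ['add', 'subtract', '<unk>', '<pad>', 'v0', 'v1', 'v2']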
Example #2
    # Fragment from a preprocessing routine; assumes os, pickle, data_utils,
    # and similarity are imported and read_data/task_trg_domains are defined.
    assert os.path.exists(args.feature_weights_file), \
        'Error: %s does not exist.' % args.feature_weights_file

    # read the data and pickle it or load it
    preproc_data_path = os.path.join(args.model_dir,
                                     'preproc_data_%s.pkl' % args.task)

    domain2data = read_data(args.data_path)
    print('Saving domain2data object to %s...' % preproc_data_path)
    with open(preproc_data_path, 'wb') as f:
        pickle.dump(domain2data, f)

    assert set(task_trg_domains) == set(domain2data.keys())

    # create the vocabulary or load it if it was already created
    vocab_path = os.path.join(args.model_dir, 'vocab.txt')
    vocab = data_utils.Vocab(args.max_vocab_size, vocab_path)
    # retrieve all available tokenised sentences
    tokenised_sentences = data_utils.get_all_docs(domain2data.items(),
                                                  unlabeled=False)[0]
    vocab.create(tokenised_sentences)
    del tokenised_sentences

    # load word vectors if we are using them
    word2vec = None
    if args.word2vec_path:
        vocab_word2vec_file = os.path.join(args.model_dir,
                                           'vocab_word2vec.txt')
        word2vec = similarity.load_word_vectors(args.word2vec_path,
                                                vocab_word2vec_file,
                                                vocab.word2id,
                                                vector_size=args.vector_size)
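
For context, a minimal sketch of the argparse namespace this fragment expects; every flag name below is inferred from the attribute accesses in the snippet, not taken from the original script:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--data_path', help='path to the raw data')
parser.add_argument('--model_dir', help='directory for pickles and vocab files')
parser.add_argument('--task', help='task identifier used in file names')
parser.add_argument('--max_vocab_size', type=int)
parser.add_argument('--feature_weights_file')
parser.add_argument('--word2vec_path', default=None,
                    help='optional path to pre-trained word vectors')
parser.add_argument('--vector_size', type=int, default=300)
args = parser.parse_args()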