# Script: build a Vocabulary and word-embedding matrix from the tiny SQuAD
# v1.1 split, then construct train/eval batch generators.
# NOTE(review): RNET and torch are imported but never used in the visible
# portion -- presumably model construction follows further down; confirm.
import logging
import torch
from pytorch_mrc.data.vocabulary import Vocabulary
from pytorch_mrc.dataset.squad import SquadReader, SquadEvaluator
from pytorch_mrc.model.rnet_hkust import RNET
from pytorch_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Hard-coded, machine-specific data locations.
data_folder = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/'
embedding_folder = '/home/len/yingzq/nlp/mrc_dataset/word_embeddings/'
tiny_file = data_folder + "tiny-v1.1.json"
embedding_file = embedding_folder + 'glove.6B.100d.txt'

# Read the tiny SQuAD split and build an evaluator over the same file.
# (fine_grained presumably selects a finer tokenization -- TODO confirm
# against SquadReader.)
reader = SquadReader(fine_grained=True)
tiny_data = reader.read(tiny_file)
evaluator = SquadEvaluator(tiny_file)

logging.info('building vocab and making embedding...')
vocab = Vocabulary()
# Words/chars below these count thresholds are dropped from the vocab.
vocab.build_vocab(tiny_data, min_word_count=3, min_char_count=10)
vocab.make_word_embedding(embedding_file)
word_embedding = vocab.get_word_embedding()
logging.info('word vocab size: {}, word embedding shape: {}'.format(
    len(vocab.get_word_vocab()), word_embedding.shape))

# Train generator shuffles each epoch; eval generator keeps input order.
train_batch_generator = BatchGenerator()
train_batch_generator.build(vocab, tiny_data, batch_size=32, shuffle=True)
eval_batch_generator = BatchGenerator()
eval_batch_generator.build(vocab, tiny_data, batch_size=32)
"""Train and evaluate a BiDAF model on SQuAD v1.1 using batch generators
that were serialized to disk in a previous preprocessing step."""
import sys
sys.path.append('../..')
import logging
import torch
from pytorch_mrc.dataset.squad import SquadReader, SquadEvaluator
from pytorch_mrc.model.bidaf import BiDAF
from pytorch_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Locations of the serialized batch generators and the raw dev split.
bg_folder = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/'
train_bg_file = bg_folder + "bg_train_32b_100d.pkl"
eval_bg_file = bg_folder + "bg_eval_32b_100d.pkl"
dev_file = "/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/dev-v1.1.json"

# Read the dev set and set up the official-style evaluator for it.
reader = SquadReader()
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Restore the pre-built train/eval generators from disk; the vocabulary
# (including its embedding matrix) is carried inside the train generator.
train_batch_generator = BatchGenerator()
eval_batch_generator = BatchGenerator()
train_batch_generator.load(train_bg_file)
eval_batch_generator.load(eval_bg_file)
vocab = train_batch_generator.get_vocab()

# Prefer the first GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

model = BiDAF(vocab, device,
              pretrained_word_embedding=vocab.get_word_embedding())
model.compile()
model.train_and_evaluate(train_batch_generator, eval_batch_generator,
                         evaluator, epochs=20, episodes=2)
vocab.get_word_pad_idx(), word_embedding[vocab.get_word_pad_idx()])) print('word unk token idx: {}, embedding is: \n{}'.format( vocab.get_word_unk_idx(), word_embedding[vocab.get_word_unk_idx()])) print('word `code` idx: {}, embedding is: \n{}'.format( vocab.get_word_idx('code'), word_embedding[vocab.get_word_idx('code')])) print('word `randomrandom` idx: {}, embedding is: \n{}'.format( vocab.get_word_idx('randomrandom'), word_embedding[vocab.get_word_idx('randomrandom')])) # define data path tiny_file = "/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/tiny-v1.1.json" embedding_file = '/home/len/yingzq/nlp/mrc_dataset/word_embeddings/glove.6B.100d.txt' vocab_save_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/vocab_data/vocab_tiny_100d.pkl' # where to save vocab data # read data reader = SquadReader() print('reading data from {} ...'.format(tiny_file)) tiny_data = reader.read(tiny_file) # build the vocabulary vocab = Vocabulary() print('building vocabulary...') vocab.build_vocab(tiny_data, min_word_count=3, min_char_count=10) print('making word embedding...') vocab.make_word_embedding(embedding_file) word_embedding = vocab.get_word_embedding() print_info(vocab, word_embedding) # save vocabulary print('***saveing vocabulary...***') vocab.save(vocab_save_file)
# Script: build an R-NET (Sogou variant) over SQuAD v1.1 from pre-built
# batch-generator pickles.
# NOTE(review): the visible portion ends right after model.compile();
# presumably a train/evaluate call follows out of view -- confirm.
import sys  # FIX: `sys` was used below without being imported in this block
sys.path.append('../..')
import logging
import torch
from pytorch_mrc.dataset.squad import SquadReader, SquadEvaluator
from pytorch_mrc.model.rnet_sogou import RNET
from pytorch_mrc.data.batch_generator import BatchGenerator

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Paths to the serialized 50-batch/300d uncased generators and the dev set.
bg_folder = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/'
train_bg_file = bg_folder + "bg_train_50b_300d_uncased.pkl"
eval_bg_file = bg_folder + "bg_eval_50b_300d_uncased.pkl"
dev_file = "/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/dev-v1.1.json"

# Read the dev set and build its evaluator.
reader = SquadReader(fine_grained=True)
eval_data = reader.read(dev_file)
evaluator = SquadEvaluator(dev_file)

# Load pre-built batch generators; the vocabulary (with embeddings)
# travels inside the train generator.
train_batch_generator = BatchGenerator()
eval_batch_generator = BatchGenerator()
train_batch_generator.load(train_bg_file)
eval_batch_generator.load(eval_bg_file)
vocab = train_batch_generator.get_vocab()

# Use the first GPU when available, otherwise CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = RNET(vocab, device,
             pretrained_word_embedding=vocab.get_word_embedding(),
             word_embedding_size=300)
model.compile('adam', 0.001)
# define data path train_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/train-v1.1.json' dev_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/dev-v1.1.json' embedding_file = '/home/len/yingzq/nlp/mrc_dataset/word_embeddings/glove.840B.300d.txt' # the path to save file vocab_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/vocab_data/vocab_{}d_{}.pkl'.format( EMB_DIM, 'cased' if DO_LOWERCASE else 'uncased') bg_train_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/bg_train_{}b_{}d_{}.pkl'.format( BATCH_SIZE, EMB_DIM, 'cased' if DO_LOWERCASE else 'uncased') bg_eval_file = '/home/len/yingzq/nlp/mrc_dataset/squad-v1.1/bg_data/bg_eval_{}b_{}d_{}.pkl'.format( BATCH_SIZE, EMB_DIM, 'cased' if DO_LOWERCASE else 'uncased') # read data reader = SquadReader(fine_grained=FINE_GRAINED) train_data = reader.read(train_file) eval_data = reader.read(dev_file) # build vocab and embedding vocab = Vocabulary(do_lowercase=DO_LOWERCASE) vocab.build_vocab(train_data + eval_data, min_word_count=3, min_char_count=10) vocab.make_word_embedding(embedding_file) vocab.save(vocab_file) logging.info("building train batch generator...") train_batch_generator = BatchGenerator() train_batch_generator.build(vocab, train_data, batch_size=BATCH_SIZE, shuffle=True)