def build_dev_vocab(questions, contexts,
                    tr_vocab_file='train_vocab.pkl',
                    wv_file='./glove/glove.840B.300d.txt',
                    wv_dim=300):
    """Build the dev-set vocabulary on top of the training vocabulary.

    Most of the vocabulary comes from the training vocab; a dev-set word is
    appended only if it is new AND present in the GloVe vocabulary (so an
    embedding vector exists for it).

    Args:
        questions: iterable of tokenized questions (lists of word strings).
        contexts: iterable of tokenized contexts (lists of word strings).
        tr_vocab_file: pickle file holding the training vocab (a list).
            Defaults to the original hard-coded 'train_vocab.pkl'.
        wv_file: path to the GloVe embedding text file.
        wv_dim: dimensionality of the GloVe vectors.

    Returns:
        list: the training vocab followed by the newly added dev-only words
        (order of the new words is arbitrary, as in the original set-based
        construction).
    """
    # TODO: Needs train vocab to exist on disk before this is called.
    # Use a context manager so the file handle is closed (the original
    # `pickle.load(open(...))` leaked it).
    with open(tr_vocab_file, 'rb') as f:
        tr_vocab = pickle.load(f)
    existing_vocab = set(tr_vocab)
    # load_glove_vocab returns a "set" of words only (no vectors attached).
    glove_vocab = load_glove_vocab(wv_file, wv_dim)
    # Set comprehension instead of set([...]) — same semantics, one pass.
    new_vocab = list({
        w
        for doc in questions + contexts
        for w in doc
        if w not in existing_vocab and w in glove_vocab
    })
    vocab = tr_vocab + new_vocab
    print('train vocab {0}, total vocab {1}'.format(len(tr_vocab), len(vocab)))
    return vocab
# --- Script-level setup for CoQA preprocessing (formatting reconstructed;
# --- the original source had these statements collapsed onto one line).
trn_file = 'CoQA/train.json'
dev_file = 'CoQA/dev.json'
wv_file = args.wv_file   # word-vector (GloVe) file path from CLI args
wv_dim = args.wv_dim     # word-vector dimensionality from CLI args
# NOTE(review): loads a Vietnamese spaCy model here, while the parallel setup
# below loads 'en' — confirm which one is intended for this dataset.
nlp = spacy.load('vi_spacy_model', disable=['parser'])
# Seed both RNGs for reproducible preprocessing.
random.seed(args.seed)
np.random.seed(args.seed)
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG,
                    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)
log.info('start data preparing... (using {} threads)'.format(args.threads))
glove_vocab = load_glove_vocab(wv_file, wv_dim)  # return a "set" of vocabulary
log.info('glove loaded.')
# ===============================================================
# =================== Work on training data =====================
# ===============================================================
def proc_train(ith, article):
    # Process one CoQA article: questions/answers are parallel lists over a
    # single shared story.  NOTE(review): this definition is truncated in the
    # visible source — the remainder of the body is not shown here.
    rows = []
    context = article['story']
    for j, (question, answers) in enumerate(zip(article['questions'], article['answers'])):
        gold_answer = answers['input_text']   # free-form (abstractive) answer
        span_answer = answers['span_text']    # extractive span from the story
# --- Second (near-duplicate) script-level setup block, targeting
# --- SQuAD-style data ('paragraphs'/'qas' schema).  Formatting reconstructed;
# --- the original source had these statements collapsed onto one line.
wv_file = args.wv_file   # word-vector (GloVe) file path from CLI args
wv_dim = args.wv_dim     # word-vector dimensionality from CLI args
nlp = spacy.load('en', disable=['parser'])
# Seed both RNGs for reproducible preprocessing.
random.seed(args.seed)
np.random.seed(args.seed)
logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG,
                    datefmt='%m/%d/%Y %I:%M:%S')
log = logging.getLogger(__name__)
log.info('start data preparing... (using {} threads)'.format(args.threads))
# Only obtains a set-type vocabulary; no corresponding vectors.
glove_vocab = load_glove_vocab(wv_file, wv_dim)
log.info('glove loaded.')
# ===============================================================
# =================== Work on training data =====================
# ===============================================================
def proc_train(ith, article):
    # Process one SQuAD-style article: each article holds paragraphs, each
    # paragraph a context plus its question/answer pairs.  NOTE(review): this
    # definition is truncated in the visible source — the remainder of the
    # body is not shown here.
    rows = []
    for paragraph in article['paragraphs']:
        context = paragraph['context']
        for qa in paragraph['qas']:
            question = qa['question']
            answers = qa['orig_answer']