def preprocess():
    pp = Preprocessor('data/internal/common/ners')
    db = qdb.QuestionDatabase(QB_QUESTION_DB)
    pages = set(db.page_by_count(min_count=MIN_APPEARANCES))
    print(len(pages))
    folds = ['train', 'test', 'devtest', 'dev']
    for fold in folds:
        allqs = db.query('from questions where page != "" and fold == ?', (fold,), text=True)
        print(fold, len(allqs))
        proc_fold = []
        for i, key in enumerate(allqs):
            q = allqs[key]
            if q.page in pages:
                qs = {}
                for index in q.text:
                    qs[index] = pp.preprocess_input(q.text[index])
                ans = q.page.strip().lower().replace(' ', '_')
                answer = pp.convert_to_indices(ans)
                proc_fold.append((qs, answer))
            if i % 5000 == 0:
                print('done with ', i)
        print(fold, len(proc_fold))
        with safe_open('output/deep/' + fold, 'wb') as f:
            pickle.dump(proc_fold, f, protocol=pickle.HIGHEST_PROTOCOL)
    with safe_open(DEEP_VOCAB_TARGET, 'wb') as f:
        pickle.dump((pp.vocab, pp.vdict), f, protocol=pickle.HIGHEST_PROTOCOL)

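# A minimal sketch (illustrative, not part of the pipeline) of reading back the
# per-fold pickles written by preprocess(): each entry pairs a
# {sentence_index: token_indices} dict with the indexed answer.
def _example_read_fold(fold='train'):
    import pickle
    with open('output/deep/' + fold, 'rb') as f:
        proc_fold = pickle.load(f)
    qs, answer = proc_fold[0]  # qs: sentence position -> token indices
    return qs, answer
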
def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    log.info("Training language model with pages that appear more than %i times" % MIN_APPEARANCES)
    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True, QB_WIKI_LOCATION,
                                     True, QB_QUESTION_DB,
                                     True, QB_SOURCE_LOCATION,
                                     max_pages, min_pages=MIN_APPEARANCES):
        num_docs += 1
        if num_docs % 500 == 0:
            log.info("{} {}".format(unidecode(title), num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))
        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, QB_WIKI_LOCATION,
                                         qb, QB_QUESTION_DB,
                                         source, QB_SOURCE_LOCATION,
                                         max_pages, min_pages=MIN_APPEARANCES):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" % (doc_num, unidecode(title), corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)
        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)

def write_answer_map(answer_map, amb_answer_map, unbound_answers,
                     answer_map_path, unbound_answer_path):
    with safe_open(answer_map_path, 'w') as f:
        json.dump({
            'answer_map': answer_map,
            'ambig_answer_map': amb_answer_map
        }, f)
    with safe_open(unbound_answer_path, 'w') as f:
        json.dump({'unbound_answers': list(sorted(unbound_answers))}, f)

def generate_domain_classifier_data(weight=150):
    """
    Reads all sentences from every wikipedia page corresponding to a known
    answer and splits them into two vowpal wabbit files, interleaving true
    quiz bowl questions randomly and with higher weight specified by the
    weight arg.
    """
    qb_data = QuizBowlDataset(guesser_train=True).training_data()
    real_questions = [('1', str(weight), ans, clean_question(sent))
                      for q, ans, _ in zip(*qb_data)
                      for sent in q]
    pages = set(a for _, _, a, _ in real_questions)
    cw = CachedWikipedia()

    # Split wikipedia questions into two sets
    wiki_questions = ([], [])
    use_second = False
    for page in pages:
        for sentence in sentences_from_page(cw[page]):
            q = clean_question(sentence)
            wiki_questions[use_second].append(('-1', '1', page, q))
            use_second = not use_second

    vw_line = '{} {} \'{}|text {}\n'
    for i, wiki_qs in enumerate(wiki_questions):
        # Create list of True/False and shuffle to define ordering of train data
        order = list(chain(
            repeat(False, len(real_questions)),
            repeat(True, len(wiki_qs))))
        random.shuffle(order)
        iters = (iter(real_questions), iter(wiki_qs))
        with safe_open(DOMAIN_TARGET_PREFIX + str(i), 'w') as f:
            for choice in order:
                f.write(vw_line.format(*next(iters[choice])))

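# A minimal sketch of the vowpal wabbit lines emitted above (page name and
# question text are illustrative): label, importance weight, a tag carrying
# the page, then the text namespace.
def _example_vw_lines():
    vw_line = '{} {} \'{}|text {}\n'
    # A real quiz bowl sentence gets label 1 and the boosted weight:
    positive = vw_line.format('1', '150', 'Albert_Einstein', 'this physicist developed ...')
    # A wikipedia sentence gets label -1 and weight 1:
    negative = vw_line.format('-1', '1', 'Albert_Einstein', 'einstein was born in ulm ...')
    return positive, negative
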
def ingestion_cli(start_idx):
    """
    Input format is for jason's HS project, but can be changed. The original
    code for answer mapping was designed to map everything over multiple
    passes, not yield a callable function to map an arbitrary answer line to
    a QB answer. Rather than implement this, a hacky way to achieve similar
    functionality for mapping a new dataset is to combine already mapped
    questions with new questions, have the code map answers for both at the
    same time, then only use the mappings from the new questions. There are
    some edge cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']

    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']

    new_questions = []
    idx = start_idx
    for q in raw_questions:
        new_questions.append({
            'qanta_id': idx,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        })
        idx += 1

    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)
    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        unmappable = yaml.load(f)
    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions, answer_map, amb_answer_map, unmappable, page_assigner
    )
    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)
    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)

def build(self, answers: Set[str], save=True):
    client = TagmeClient()
    wiki_lookup = Wikipedia()

    page_sentences = defaultdict(list)
    for ans in answers:
        if ans not in wiki_lookup:
            continue
        wiki_page = wiki_lookup[ans]
        if len(wiki_page.text) != 0:
            sentences = nltk.sent_tokenize(wiki_page.text)
            random.shuffle(sentences)
            clean_sentences, all_mentions = client.tag_mentions(sentences)
            for sent, mentions in zip(clean_sentences, all_mentions):
                page_mentions = {m.page for m in mentions}
                n_mentions = len(page_mentions)
                for page in page_mentions.intersection(answers):
                    raise NotImplementedError('Need to fix this to use extract_wiki_sentences')
                    # Unreachable until the above is addressed
                    stripped_sent = strip_title_references(page, sent)
                    page_sentences[page].append((n_mentions, stripped_sent))

    if save:
        with safe_open(self.location, 'wb') as f:
            pickle.dump(page_sentences, f)
    return page_sentences

def load_embeddings(vocab=None, root_directory='', expand_glove=True, mask_zero=False):
    if os.path.exists(we_tmp_target):
        logger.info('Loading word embeddings from tmp cache')
        with safe_open(we_tmp_target, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, we_target)):
        logger.info('Loading word embeddings from restored cache')
        with safe_open(os.path.join(root_directory, we_target), 'rb') as f:
            return pickle.load(f)
    else:
        if vocab is None:
            raise ValueError('To create fresh embeddings a vocab is needed')
        with safe_open(we_tmp_target, 'wb') as f:
            logger.info('Creating word embeddings and saving to cache')
            embed_and_lookup = create_embeddings(vocab, expand_glove=expand_glove, mask_zero=mask_zero)
            pickle.dump(embed_and_lookup, f)
            return embed_and_lookup

def run(self):
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_qanta_questions = json.load(f)['questions']

    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(unmapped_qanta_questions)
    with safe_open('data/external/answer_mapping/automatic_report.json', 'w') as f:
        json.dump(report, f)
    write_answer_map(answer_map, amb_answer_map, unbound_answers,
                     ANSWER_MAP_PATH, UNBOUND_ANSWER_PATH)

def _load_embeddings(vocab=None, root_directory=''):
    if os.path.exists(TF_DAN_WE_TMP):
        log.info('Loading word embeddings from tmp cache')
        with safe_open(TF_DAN_WE_TMP, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, TF_DAN_WE)):
        log.info('Loading word embeddings from restored cache')
        with safe_open(os.path.join(root_directory, TF_DAN_WE), 'rb') as f:
            return pickle.load(f)
    else:
        if vocab is None:
            raise ValueError('To create fresh embeddings a vocab is needed')
        with safe_open(TF_DAN_WE_TMP, 'wb') as f:
            log.info('Creating word embeddings and saving to cache')
            embed_and_lookup = _create_embeddings(vocab)
            pickle.dump(embed_and_lookup, f)
            return embed_and_lookup

def get_best_wiki_questions(frac_questions=1.0):
    """Writes out a pickle containing a list of pairs of (text, page)"""
    log.info('Filtering down to top {}% of wikipedia sentences'.format(frac_questions * 100))
    with ExitStack() as stack:
        file_pairs = [
            (stack.enter_context(open(DOMAIN_TARGET_PREFIX + str(i))),
             stack.enter_context(open(DOMAIN_PREDICTIONS_PREFIX + str(i))))
            for i in (0, 1)
        ]
        with safe_open(DOMAIN_OUTPUT.format('frac=' + str(frac_questions)), 'wb') as f:
            pickle.dump(_get_best(file_pairs, frac_questions), f)

def load_multi_embeddings(
        multi_vocab: Optional[MultiVocab] = None,
        root_directory='') -> Tuple[np.ndarray, MultiEmbeddingLookup]:
    if os.path.exists(PT_RNN_ENTITY_WE_TMP):
        log.info('Loading embeddings from tmp cache')
        with safe_open(PT_RNN_ENTITY_WE_TMP, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, PT_RNN_ENTITY_WE)):
        log.info('Loading embeddings from restored cache')
        with safe_open(os.path.join(root_directory, PT_RNN_ENTITY_WE), 'rb') as f:
            return pickle.load(f)
    else:
        if multi_vocab is None:
            raise ValueError('To create new embeddings a vocab is needed')
        with safe_open(PT_RNN_ENTITY_WE_TMP, 'wb') as f:
            log.info('Creating embeddings and saving to cache')
            word_embeddings, word_lookup = create_embeddings(
                multi_vocab.word, expand_glove=True, mask_zero=True)

            pos_lookup = {'MASK': 0, UNK: 1}
            for i, term in enumerate(multi_vocab.pos, start=2):
                pos_lookup[term] = i

            iob_lookup = {'MASK': 0, UNK: 1}
            for i, term in enumerate(multi_vocab.iob, start=2):
                iob_lookup[term] = i

            ent_type_lookup = {'MASK': 0, UNK: 1}
            for i, term in enumerate(multi_vocab.ent_type, start=2):
                ent_type_lookup[term] = i

            multi_embedding_lookup = MultiEmbeddingLookup(
                word_lookup, pos_lookup, iob_lookup, ent_type_lookup)
            combined = word_embeddings, multi_embedding_lookup
            pickle.dump(combined, f)
            return combined

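# A minimal sketch (hypothetical tag vocabulary) of the index layout built
# above: index 0 is reserved for padding ('MASK'), index 1 for
# out-of-vocabulary tokens (UNK), and real terms start at 2.
def _example_tag_lookup():
    pos_terms = ['NOUN', 'VERB', 'ADJ']  # hypothetical POS vocabulary
    pos_lookup = {'MASK': 0, 'UNK': 1}
    for i, term in enumerate(pos_terms, start=2):
        pos_lookup[term] = i
    return pos_lookup  # {'MASK': 0, 'UNK': 1, 'NOUN': 2, 'VERB': 3, 'ADJ': 4}
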
def evaluate(train_vector, test_vector):
    log.info('total training instances: {0}'.format(len(train_vector[0])))
    log.info('total testing instances: {0}'.format(len(test_vector[0])))
    classifier = OneVsRestClassifier(LogisticRegression(C=10), n_jobs=-1)
    classifier.fit(train_vector[0], train_vector[1])
    with safe_open(DEEP_DAN_CLASSIFIER_TARGET, 'wb') as f:
        pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)
    train_accuracy = classifier.score(X=train_vector[0], y=train_vector[1])
    test_accuracy = classifier.score(X=test_vector[0], y=test_vector[1])
    log.info('accuracy train: {0}'.format(train_accuracy))
    log.info('accuracy test: {0}'.format(test_accuracy))

def compute_question_stats(question_db_path: str):
    dataset = QuizBowlDataset(5, qb_question_db=question_db_path)
    train_dev_questions = dataset.questions_in_folds(('train', 'dev'))
    question_lengths = [len(q.flatten_text().split()) for q in train_dev_questions]
    mean = np.mean(question_lengths)
    std = np.std(question_lengths)
    stats = (mean, std)
    with safe_open(SENTENCE_STATS, 'wb') as f:
        pickle.dump(stats, f)

def compute_classifier_input(we_dimensions=300):
    # Load training data
    with open(DEEP_TRAIN_TARGET, 'rb') as f:
        train_qs = pickle.load(f)
    # Load dev data
    with open(DEEP_DEV_TARGET, 'rb') as f:
        val_qs = pickle.load(f)
    # Load trained DAN parameters
    with open(DEEP_DAN_PARAMS_TARGET, 'rb') as f:
        params = pickle.load(f)

    # Compute training, dev classifier vectors using DAN
    train_vector, test_vector = compute_vectors(train_qs, val_qs, params, we_dimensions)

    # Format training vector
    train_feats = []
    train_labels = []
    for e in train_vector:
        train_feats.append(e[0])
        train_labels.append(e[1])
    train_formatted = (train_feats, train_labels)

    # Format dev vector
    test_feats = []
    test_labels = []
    for e in test_vector:
        test_feats.append(e[0])
        test_labels.append(e[1])
    test_formatted = (test_feats, test_labels)

    # Save
    with safe_open(DEEP_DAN_TRAIN_OUTPUT, 'wb') as f:
        pickle.dump(train_formatted, f, protocol=pickle.HIGHEST_PROTOCOL)
    with safe_open(DEEP_DAN_DEV_OUTPUT, 'wb') as f:
        pickle.dump(test_formatted, f, protocol=pickle.HIGHEST_PROTOCOL)
    log.info('Classifier train/dev vectors computed using DAN')

def train_classifier(out, bgset, questions, class_type, limit=-1):
    all_questions = questions.questions_with_pages()
    c = Counter()
    train = []
    for page in all_questions:
        for qq in all_questions[page]:
            if qq.fold == 'train':
                label = getattr(qq, class_type, "").split(":")[0].lower()
                if not label:
                    continue
                c[label] += 1
                for ss, ww, tt in qq.partials():
                    feats = {}
                    total = ' '.join(tt).strip()
                    total = alphanum.sub(' ', unidecode(total.lower()))
                    total = total.split()
                    # add unigrams
                    for word in total:
                        feats[word] = 1.0
                    # add bigrams
                    currbg = set(ngrams(total, 2))
                    inter = currbg.intersection(bgset)
                    for elem in inter:
                        feats[elem] = 1.0
                    train.append((feats, label))
        if 0 < limit < len(train):
            break
    log.info('{}: {}'.format(class_type, c))
    log.info('{}: {}'.format(class_type, len(train)))
    log.info("{} out: training classifier".format(class_type))
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train)
    with safe_open(class_type, 'wb') as f:
        pickle.dump(classifier, f)
    log.info('{}: accuracy@1 train: {}'.format(
        class_type, nltk.classify.util.accuracy(classifier, train)))
    return classifier

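# A minimal sketch of the unigram/bigram feature extraction above, using
# nltk's ngrams with an illustrative bigram whitelist standing in for bgset:
def _example_features():
    from nltk.util import ngrams
    bgset = {('french', 'revolution')}           # hypothetical bigram set
    total = 'the french revolution began'.split()
    feats = {word: 1.0 for word in total}        # unigram features
    for bg in set(ngrams(total, 2)) & bgset:     # whitelisted bigram features
        feats[bg] = 1.0
    return feats  # {'the': 1.0, ..., ('french', 'revolution'): 1.0}
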
def save(self, directory: str) -> None:
    params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
    with safe_open(params_path, 'wb') as f:
        if (self.max_len is None
                or self.class_to_i is None
                or self.i_to_class is None
                or self.vocab is None
                or self.n_classes is None):
            raise ValueError('Attempting to save uninitialized model parameters')
        pickle.dump({
            'max_len': self.max_len,
            'class_to_i': self.class_to_i,
            'i_to_class': self.i_to_class,
            'vocab': self.vocab,
            'n_classes': self.n_classes
        }, f)
    model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
    shell('cp -r {} {}'.format(DEEP_DAN_MODEL_TMP_DIR, safe_path(model_path)))
    we_path = os.path.join(directory, TF_DAN_WE)
    shutil.copyfile(TF_DAN_WE_TMP, safe_path(we_path))

def create():
    vec_file = open('data/external/deep/glove.840B.300d.txt')
    all_vocab = {}
    log.info('loading vocab...')
    vocab, wmap = pickle.load(open('output/deep/vocab', 'rb'))
    for line in vec_file:
        split = line.split()
        word = " ".join(split[:-300])
        if word not in wmap:
            continue
        x = wmap[word]
        all_vocab[word] = array(split[-300:])
        all_vocab[word] = all_vocab[word].astype(float)
    log.info("wmap: {0} all_vocab: {1}".format(len(wmap), len(all_vocab)))

    d = len(all_vocab['the'])
    We = empty((d, len(wmap)))
    log.info('creating We for {0} words'.format(len(wmap)))
    unknown = []
    offset = len(wmap)
    log.info('offset = {0}'.format(offset))
    for word in wmap:
        try:
            We[:, wmap[word]] = all_vocab[word]
        except KeyError:
            unknown.append(word)
            log.info('unknown: {0}'.format(word))
            # initialize unknown words with unknown token
            We[:, wmap[word]] = all_vocab['unknown']
    log.info('unknown: {0}'.format(len(unknown)))
    log.info('We shape: {0}'.format(We.shape))
    log.info('dumping...')
    with safe_open('output/deep/We', 'wb') as f:
        pickle.dump(We, f, protocol=pickle.HIGHEST_PROTOCOL)

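# A minimal sketch (hypothetical 3-dimensional vectors) of the embedding
# matrix layout built above: We holds one column per vocabulary index, so a
# word's vector is read back as We[:, wmap[word]].
def _example_we_layout():
    import numpy as np
    wmap = {'the': 0, 'unknown': 1}                # hypothetical word -> index map
    vectors = {'the': [0.1, 0.2, 0.3], 'unknown': [0.0, 0.0, 0.0]}
    We = np.empty((3, len(wmap)))
    for word, idx in wmap.items():
        We[:, idx] = vectors[word]                 # column idx holds the vector
    return We[:, wmap['the']]                      # -> array([0.1, 0.2, 0.3])
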
def train_dan(batch_size=150, we_dimension=300, n_epochs=61, learning_rate=0.01, adagrad_reset=10):
    with open(DEEP_TRAIN_TARGET, 'rb') as f:
        train_qs = pickle.load(f)
    log.info('total questions: {0}'.format(len(train_qs)))
    total = 0
    for qs, ans in train_qs:
        total += len(qs)
    log.info('total sentences: {0}'.format(total))

    with open(DEEP_WE_TARGET, 'rb') as f:
        orig_We = pickle.load(f)
    len_voc = orig_We.shape[1]
    log.info('vocab length: {0} We shape: {1}'.format(len_voc, orig_We.shape))

    # generate params / We
    params = gen_util.init_params(we_dimension, deep=3)
    # add We matrix to params
    params += (orig_We,)
    r = gen_util.roll_params(params)
    dim = r.shape[0]
    log.info('parameter vector dimensionality: {0}'.format(dim))

    # minibatch adagrad training
    ag = Adagrad(r.shape, learning_rate)
    min_error = float('inf')
    log.info('step 1 of 2: training DAN (takes 2-3 hours)')

    for epoch in range(0, n_epochs):
        # create mini-batches
        np.random.shuffle(train_qs)
        batches = [train_qs[x:x + batch_size]
                   for x in range(0, len(train_qs), batch_size)]

        epoch_error = 0.0
        ep_t = time.time()
        for batch_ind, batch in enumerate(batches):
            now = time.time()
            err, grad = objective_and_grad(batch, r, we_dimension, len_voc)
            update = ag.rescale_update(grad)
            r -= update
            lstring = 'epoch: {0} batch_ind: {1} error, {2} time = {3}'.format(
                epoch, batch_ind, err, time.time() - now)
            log.info(lstring)
            epoch_error += err

        # done with epoch
        log.info(str(time.time() - ep_t))
        log.info('done with epoch {0} epoch error = {1} min error = {2}'.format(
            epoch, epoch_error, min_error))

        # save parameters if the current model is better than previous best model
        if epoch_error < min_error:
            min_error = epoch_error
            log.info('saving model...')
            params = gen_util.unroll_params(r, we_dimension, len_voc, deep=3)
            with safe_open(DEEP_DAN_PARAMS_TARGET, 'wb') as f:
                pickle.dump(params, f)

        # reset adagrad weights
        if epoch % adagrad_reset == 0 and epoch != 0:
            ag.reset_weights()

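# A minimal sketch of the Adagrad step assumed by the loop above (the real
# Adagrad class lives elsewhere in the repo; this only illustrates the
# rescale_update semantics, and eps is an assumed smoothing constant):
def _example_adagrad_step(h, grad, learning_rate=0.01, eps=1e-8):
    import numpy as np
    h = h + grad ** 2                                   # accumulate squared gradients
    update = learning_rate * grad / (np.sqrt(h) + eps)  # per-parameter scaled step
    return h, update                                    # caller then does: r -= update
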
def save_classifier(classifier, class_type):
    with safe_open(CLASSIFIER_PICKLE_PATH.format(class_type), 'wb') as f:
        pickle.dump(classifier, f)

def generate_guesser_slurm(slurm_config_file, task, output_dir):
    with open(slurm_config_file) as f:
        slurm_config = yaml.load(f)
    default_slurm_config = slurm_config['default']
    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError('ElasticSearchGuesser is not compatible with slurm')
        elif gs.guesser_class in slurm_config:
            guesser_slurm_config = slurm_config[gs.guesser_class]
        else:
            guesser_slurm_config = None
        partition = get_slurm_config_value('partition', default_slurm_config, guesser_slurm_config)
        qos = get_slurm_config_value('qos', default_slurm_config, guesser_slurm_config)
        mem_per_cpu = get_slurm_config_value('mem_per_cpu', default_slurm_config, guesser_slurm_config)
        gres = get_slurm_config_value('gres', default_slurm_config, guesser_slurm_config)
        max_time = get_slurm_config_value('max_time', default_slurm_config, guesser_slurm_config)
        cpus_per_task = get_slurm_config_value('cpus_per_task', default_slurm_config, guesser_slurm_config)
        account = get_slurm_config_value('account', default_slurm_config, guesser_slurm_config)
        if task == 'GuesserReport':
            folds = GUESSER_GENERATION_FOLDS
        else:
            folds = []
        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'partition': partition,
            'qos': qos,
            'mem_per_cpu': mem_per_cpu,
            'max_time': max_time,
            'gres': gres,
            'cpus_per_task': cpus_per_task,
            'account': account,
            'folds': folds
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    master_template = env.get_template('guesser-master-template.sh')
    master_script = master_template.render({
        'script_list': [
            path.join(output_dir, f'slurm-{i}.sh')
            for i in range(len(enabled_guessers))
        ] + [singleton_output],
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'gres': gres,
        'cpus_per_task': cpus_per_task,
        'account': account
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)

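# A minimal sketch of the per-guesser config fallback that
# get_slurm_config_value is assumed to implement above (the real helper lives
# elsewhere in the repo; this only illustrates the lookup order:
# guesser-specific value first, then the default section):
def _example_get_slurm_config_value(key, default_config, guesser_config):
    if guesser_config is not None and key in guesser_config:
        return guesser_config[key]
    return default_config[key]
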
def save(self, directory: str):
    with safe_open(os.path.join(directory, IS_HUMAN_MODEL_PICKLE), 'wb') as f:
        pickle.dump({'is_human_model': self.is_human_model}, f)

def save(self, directory: str) -> None:
    shutil.copyfile(CNN_MODEL_TMP_TARGET, os.path.join(directory, CNN_MODEL_TARGET))
    with safe_open(os.path.join(directory, CNN_PARAMS_TARGET), 'wb') as f:
        pickle.dump(self.dump_parameters(), f)

def write_bigrams(bigrams, output):
    with safe_open(output, 'wb') as f:
        pickle.dump(bigrams, f, pickle.HIGHEST_PROTOCOL)