Code Example #1
def preprocess():
    pp = Preprocessor('data/internal/common/ners')
    db = qdb.QuestionDatabase(QB_QUESTION_DB)

    pages = set(db.page_by_count(min_count=MIN_APPEARANCES))
    print(len(pages))
    folds = ['train', 'test', 'devtest', 'dev']
    for fold in folds:
        allqs = db.query('from questions where page != "" and fold == ?',
                         (fold, ),
                         text=True)
        print(fold, len(allqs))
        proc_fold = []
        for i, key in enumerate(allqs):
            q = allqs[key]
            if q.page in pages:
                qs = {}
                for index in q.text:
                    qs[index] = pp.preprocess_input(q.text[index])
                ans = q.page.strip().lower().replace(' ', '_')
                answer = pp.convert_to_indices(ans)
                proc_fold.append((qs, answer))
            if i % 5000 == 0:
                print('done with ', i)

        print(fold, len(proc_fold))
        with safe_open('output/deep/' + fold, 'wb') as f:
            pickle.dump(proc_fold, f, protocol=pickle.HIGHEST_PROTOCOL)

    with safe_open(DEEP_VOCAB_TARGET, 'wb') as f:
        pickle.dump((pp.vocab, pp.vdict), f, protocol=pickle.HIGHEST_PROTOCOL)
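
Note on safe_open: every example on this page writes through safe_open rather than the built-in open. The helper itself is not shown here; judging from how it is used, it wraps directory creation around open. A minimal sketch of such a wrapper (an assumption for illustration, not the qb project's actual implementation or signature):

import os

def safe_open(path, mode='r'):
    # Sketch only: make sure the parent directory exists, then defer to the built-in open.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    return open(path, mode)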
Code Example #2
File: lm_wrapper.py Project: cequencer/qb
def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    log.info("Training language model with pages that appear more than %i times" % MIN_APPEARANCES)

    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True, QB_WIKI_LOCATION,
                                     True, QB_QUESTION_DB,
                                     True, QB_SOURCE_LOCATION,
                                     max_pages,
                                     min_pages=MIN_APPEARANCES):
        num_docs += 1
        if num_docs % 500 == 0:
            log.info("{} {}".format(unidecode(title), num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))

        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)
                                     ]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki, QB_WIKI_LOCATION,
                                         qb, QB_QUESTION_DB,
                                         source, QB_SOURCE_LOCATION,
                                         max_pages,
                                         min_pages=MIN_APPEARANCES):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" % (doc_num, unidecode(title), corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)

        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)
Code Example #3
File: answer_mapping.py Project: Pinafore/qb
def write_answer_map(answer_map, amb_answer_map,
                     unbound_answers, answer_map_path, unbound_answer_path):
    with safe_open(answer_map_path, 'w') as f:
        json.dump({
            'answer_map': answer_map,
            'ambig_answer_map': amb_answer_map
        }, f)

    with safe_open(unbound_answer_path, 'w') as f:
        json.dump({'unbound_answers': list(sorted(unbound_answers))}, f)
Code Example #4
File: answer_mapping.py Project: NPSDC/qb
def write_answer_map(answer_map, amb_answer_map, unbound_answers,
                     answer_map_path, unbound_answer_path):
    with safe_open(answer_map_path, "w") as f:
        json.dump(
            {
                "answer_map": answer_map,
                "ambig_answer_map": amb_answer_map
            }, f)

    with safe_open(unbound_answer_path, "w") as f:
        json.dump({"unbound_answers": list(sorted(unbound_answers))}, f)
Code Example #5
File: answer_mapping.py Project: ymedhat95/qb
def write_answer_map(answer_map, amb_answer_map, unbound_answers,
                     answer_map_path, unbound_answer_path):
    with safe_open(answer_map_path, 'w') as f:
        json.dump(
            {
                'answer_map': answer_map,
                'ambig_answer_map': amb_answer_map
            }, f)

    with safe_open(unbound_answer_path, 'w') as f:
        json.dump({'unbound_answers': list(sorted(unbound_answers))}, f)
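
The three write_answer_map variants above all serialize the same structure, so reading it back is the mirror image of the json.dump calls. A hypothetical reader (read_answer_map is not part of the examples; the key names are taken from the dumps above):

import json

def read_answer_map(answer_map_path):
    # Load the mapping written by write_answer_map; keys mirror the json.dump call above.
    with open(answer_map_path) as f:
        data = json.load(f)
    return data['answer_map'], data['ambig_answer_map']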
Code Example #6
File: wiki_questions.py Project: nadesai/qb
def generate_domain_classifier_data(weight=150):
    """
    Reads all sentences from every Wikipedia page corresponding to a known answer and splits them
    into two vowpal wabbit files, interleaving true quiz bowl questions randomly and with higher
    weight specified by the weight arg.
    """
    qb_data = QuizBowlDataset(guesser_train=True).training_data()
    real_questions = [('1', str(weight), ans, clean_question(sent))
                      for q, ans, _ in zip(*qb_data) for sent in q]
    pages = set(a for _, _, a, _ in real_questions)

    cw = CachedWikipedia()

    # Split wikipedia questions into two sets
    wiki_questions = ([], [])
    use_second = False
    for page in pages:
        for sentence in sentences_from_page(cw[page]):
            q = clean_question(sentence)
            wiki_questions[use_second].append(('-1', '1', page, q))
            use_second = not use_second

    vw_line = '{} {} \'{}|text {}\n'
    for i, wiki_qs in enumerate(wiki_questions):
        # Create list of True/False and shuffle to define ordering of train data
        order = list(
            chain(repeat(False, len(real_questions)),
                  repeat(True, len(wiki_qs))))
        random.shuffle(order)
        iters = (iter(real_questions), iter(wiki_qs))
        with safe_open(DOMAIN_TARGET_PREFIX + str(i), 'w') as f:
            for choice in order:
                f.write(vw_line.format(*next(iters[choice])))
Code Example #7
File: command.py Project: ymedhat95/qb
def ingestion_cli(start_idx):
    """
    Input format is for Jason's HS project, but can be changed. The original code for answer
    mapping was designed to map everything over multiple passes, not yield a callable function to map
    an arbitrary answer line to a QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions with new questions, have
    the code map answers for both at the same time, then only use the mappings from the new questions.
    There are some edge cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)['questions']

    with open('data/external/high_school_project/quizdb-20190313164802.json') as f:
        raw_questions = json.load(f)['data']['tossups']

    new_questions = []
    idx = start_idx
    for q in raw_questions:
        new_questions.append({
            'qanta_id': idx,
            'text': q['text'],
            'answer': q['answer'],
            'page': None,
            'category': None,
            'subcategory': None,
            'tournament': q['tournament']['name'],
            'difficulty': q['tournament']['difficulty'],
            'year': int(q['tournament']['year']),
            'proto_id': None,
            'qdb_id': q['id'],
            'dataset': 'quizdb.org',
            'fold': 'guesstest'
        })
        idx += 1
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(questions)
    with safe_open('data/external/high_school_project/automatic_report.json', 'w') as f:
        json.dump(report, f)

    write_answer_map(
        answer_map, amb_answer_map, unbound_answers,
        'data/external/high_school_project/answer_map.json',
        'data/external/high_school_project/unbound_answers.json'
    )
    with open('data/internal/page_assignment/unmappable.yaml') as f:
        unmappable = yaml.load(f)

    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(
        new_questions,
        answer_map, amb_answer_map,
        unmappable, page_assigner
    )

    add_sentences_(new_questions)
    with open('data/external/high_school_project/qanta.acf-regionals-2018.json', 'w') as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open('data/external/high_school_project/mapping_report.json', 'w') as f:
        json.dump(mapping_report, f)
Code Example #8
    def build(self, answers: Set[str], save=True):
        client = TagmeClient()
        wiki_lookup = Wikipedia()

        page_sentences = defaultdict(list)
        for ans in answers:
            if ans not in wiki_lookup:
                continue
            wiki_page = wiki_lookup[ans]
            if len(wiki_page.text) != 0:
                sentences = nltk.sent_tokenize(wiki_page.text)
                random.shuffle(sentences)
                clean_sentences, all_mentions = client.tag_mentions(sentences)
                for sent, mentions in zip(clean_sentences, all_mentions):
                    page_mentions = {m.page for m in mentions}
                    n_mentions = len(page_mentions)
                    for page in page_mentions.intersection(answers):
                        raise NotImplementedError(
                            'Need to fix this to use extract_wiki_sentences')
                        stripped_sent = strip_title_references(page, sent)
                        page_sentences[page].append(
                            (n_mentions, stripped_sent))

        if save:
            with safe_open(self.location, 'wb') as f:
                pickle.dump(page_sentences, f)

        return page_sentences
Code Example #9
File: nn.py Project: Pinafore/qb
def load_embeddings(vocab=None, root_directory='', expand_glove=True, mask_zero=False):
    if os.path.exists(we_tmp_target):
        logger.info('Loading word embeddings from tmp cache')
        with safe_open(we_tmp_target, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, we_target)):
        logger.info('Loading word embeddings from restored cache')
        with safe_open(os.path.join(root_directory, we_target), 'rb') as f:
            return pickle.load(f)
    else:
        if vocab is None:
            raise ValueError('To create fresh embeddings a vocab is needed')
        with safe_open(we_tmp_target, 'wb') as f:
            logger.info('Creating word embeddings and saving to cache')
            embed_and_lookup = create_embeddings(vocab, expand_glove=expand_glove, mask_zero=mask_zero)
            pickle.dump(embed_and_lookup, f)
            return embed_and_lookup
Code Example #10
File: nn.py Project: amit2014/qb
def load_embeddings(vocab=None, root_directory='', expand_glove=True, mask_zero=False):
    if os.path.exists(we_tmp_target):
        logger.info('Loading word embeddings from tmp cache')
        with safe_open(we_tmp_target, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, we_target)):
        logger.info('Loading word embeddings from restored cache')
        with safe_open(os.path.join(root_directory, we_target), 'rb') as f:
            return pickle.load(f)
    else:
        if vocab is None:
            raise ValueError('To create fresh embeddings a vocab is needed')
        with safe_open(we_tmp_target, 'wb') as f:
            logger.info('Creating word embeddings and saving to cache')
            embed_and_lookup = create_embeddings(vocab, expand_glove=expand_glove, mask_zero=mask_zero)
            pickle.dump(embed_and_lookup, f)
            return embed_and_lookup
Code Example #11
File: pipeline.py Project: Pinafore/qb
    def run(self):
        with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
            unmapped_qanta_questions = json.load(f)['questions']

        answer_map, amb_answer_map, unbound_answers, report = create_answer_map(unmapped_qanta_questions)
        with safe_open('data/external/answer_mapping/automatic_report.json', 'w') as f:
            json.dump(report, f)
        write_answer_map(answer_map, amb_answer_map, unbound_answers, ANSWER_MAP_PATH, UNBOUND_ANSWER_PATH)
Code Example #12
File: dan_tf.py Project: xxlatgh/qb
def _load_embeddings(vocab=None, root_directory=''):
    if os.path.exists(TF_DAN_WE_TMP):
        log.info('Loading word embeddings from tmp cache')
        with safe_open(TF_DAN_WE_TMP, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, TF_DAN_WE)):
        log.info('Loading word embeddings from restored cache')
        with safe_open(os.path.join(root_directory, TF_DAN_WE), 'rb') as f:
            return pickle.load(f)
    else:
        if vocab is None:
            raise ValueError('To create fresh embeddings a vocab is needed')
        with safe_open(TF_DAN_WE_TMP, 'wb') as f:
            log.info('Creating word embeddings and saving to cache')
            embed_and_lookup = _create_embeddings(vocab)
            pickle.dump(embed_and_lookup, f)
            return embed_and_lookup
Code Example #13
File: pipeline.py Project: theJasonFan/qb
    def run(self):
        with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
            unmapped_qanta_questions = json.load(f)['questions']

        answer_map, amb_answer_map, unbound_answers, report = create_answer_map(
            unmapped_qanta_questions)
        with safe_open('data/external/answer_mapping/automatic_report.json',
                       'w') as f:
            json.dump(report, f)
        write_answer_map(answer_map, amb_answer_map, unbound_answers,
                         ANSWER_MAP_PATH, UNBOUND_ANSWER_PATH)
Code Example #14
File: wiki_questions.py Project: nadesai/qb
def get_best_wiki_questions(frac_questions=1.0):
    """Writes out a pickle containing a list of pairs of (text, page)"""
    log.info('Filtering down to top {}% of wikipedia sentences'.format(
        frac_questions * 100))
    with ExitStack() as stack:
        file_pairs = [
            (stack.enter_context(open(DOMAIN_TARGET_PREFIX + str(i))),
             stack.enter_context(open(DOMAIN_PREDICTIONS_PREFIX + str(i))))
            for i in (0, 1)
        ]
        with safe_open(DOMAIN_OUTPUT.format('frac=' + str(frac_questions)),
                       'wb') as f:
            pickle.dump(_get_best(file_pairs, frac_questions), f)
Code Example #15
File: rnn_entity.py Project: amit2014/qb
def load_multi_embeddings(
        multi_vocab: Optional[MultiVocab] = None,
        root_directory='') -> Tuple[np.ndarray, MultiEmbeddingLookup]:
    if os.path.exists(PT_RNN_ENTITY_WE_TMP):
        log.info('Loading embeddings from tmp cache')
        with safe_open(PT_RNN_ENTITY_WE_TMP, 'rb') as f:
            return pickle.load(f)
    elif os.path.exists(os.path.join(root_directory, PT_RNN_ENTITY_WE)):
        log.info('Loading embeddings from restored cache')
        with safe_open(os.path.join(root_directory, PT_RNN_ENTITY_WE),
                       'rb') as f:
            return pickle.load(f)
    else:
        if multi_vocab is None:
            raise ValueError('To create new embeddings a vocab is needed')
        with safe_open(PT_RNN_ENTITY_WE_TMP, 'wb') as f:
            log.info('Creating embeddings and saving to cache')
            word_embeddings, word_lookup = create_embeddings(multi_vocab.word,
                                                             expand_glove=True,
                                                             mask_zero=True)

            pos_lookup = {'MASK': 0, UNK: 1}
            for i, term in enumerate(multi_vocab.pos, start=2):
                pos_lookup[term] = i

            iob_lookup = {'MASK': 0, UNK: 1}
            for i, term in enumerate(multi_vocab.iob, start=2):
                iob_lookup[term] = i

            ent_type_lookup = {'MASK': 0, UNK: 1}
            for i, term in enumerate(multi_vocab.ent_type, start=2):
                ent_type_lookup[term] = i

            multi_embedding_lookup = MultiEmbeddingLookup(
                word_lookup, pos_lookup, iob_lookup, ent_type_lookup)
            combined = word_embeddings, multi_embedding_lookup
            pickle.dump(combined, f)
            return combined
Code Example #16
def evaluate(train_vector, test_vector):
    log.info('total training instances: {0}'.format(len(train_vector[0])))
    log.info('total testing instances: {0}'.format(len(test_vector[0])))

    classifier = OneVsRestClassifier(LogisticRegression(C=10), n_jobs=-1)
    classifier.fit(train_vector[0], train_vector[1])

    with safe_open(DEEP_DAN_CLASSIFIER_TARGET, 'wb') as f:
        pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

    train_accuracy = classifier.score(X=train_vector[0], y=train_vector[1])
    test_accuracy = classifier.score(X=test_vector[0], y=test_vector[1])
    log.info('accuracy train: {0}'.format(train_accuracy))
    log.info('accuracy test: {0}'.format(test_accuracy))
Code Example #17
File: stats.py Project: xxlatgh/qb
def compute_question_stats(question_db_path: str):
    dataset = QuizBowlDataset(5, qb_question_db=question_db_path)
    train_dev_questions = dataset.questions_in_folds(('train', 'dev'))
    question_lengths = [
        len(q.flatten_text().split()) for q in train_dev_questions
    ]

    mean = np.mean(question_lengths)
    std = np.std(question_lengths)

    stats = (mean, std)

    with safe_open(SENTENCE_STATS, 'wb') as f:
        pickle.dump(stats, f)
Code Example #18
File: learn_classifiers.py Project: cequencer/qb
def evaluate(train_vector, test_vector):
    log.info('total training instances: {0}'.format(len(train_vector[0])))
    log.info('total testing instances: {0}'.format(len(test_vector[0])))

    classifier = OneVsRestClassifier(LogisticRegression(C=10), n_jobs=-1)
    classifier.fit(train_vector[0], train_vector[1])

    with safe_open(DEEP_DAN_CLASSIFIER_TARGET, 'wb') as f:
        pickle.dump(classifier, f, protocol=pickle.HIGHEST_PROTOCOL)

    train_accuracy = classifier.score(X=train_vector[0], y=train_vector[1])
    test_accuracy = classifier.score(X=test_vector[0], y=test_vector[1])
    log.info('accuracy train: {0}'.format(train_accuracy))
    log.info('accuracy test: {0}'.format(test_accuracy))
Code Example #19
File: dan.py Project: cequencer/qb
def compute_classifier_input(we_dimensions=300):
    # Load training data
    with open(DEEP_TRAIN_TARGET, 'rb') as f:
        train_qs = pickle.load(f)
    # Load dev data
    with open(DEEP_DEV_TARGET, 'rb') as f:
        val_qs = pickle.load(f)
    # Load trained_DAN parameters
    with open(DEEP_DAN_PARAMS_TARGET, 'rb') as f:
        params = pickle.load(f)

    # Compute training, dev classifier vectors using DAN
    train_vector, test_vector = compute_vectors(train_qs, val_qs, params,
                                                we_dimensions)

    # Format training vector
    train_feats = []
    train_labels = []
    for e in train_vector:
        train_feats.append(e[0])
        train_labels.append(e[1])
    train_formatted = (train_feats, train_labels)

    # Format dev vector
    test_feats = []
    test_labels = []
    for e in test_vector:
        test_feats.append(e[0])
        test_labels.append(e[1])
    test_formatted = (test_feats, test_labels)

    # Save
    with safe_open(DEEP_DAN_TRAIN_OUTPUT, 'wb') as f:
        pickle.dump(train_formatted, f, protocol=pickle.HIGHEST_PROTOCOL)
    with safe_open(DEEP_DAN_DEV_OUTPUT, 'wb') as f:
        pickle.dump(test_formatted, f, protocol=pickle.HIGHEST_PROTOCOL)
    log.info('Classifier train/dev vectors computed using DAN')
Code Example #20
File: dan.py Project: cequencer/qb
def compute_classifier_input(we_dimensions=300):
    # Load training data
    with open(DEEP_TRAIN_TARGET, 'rb') as f:
        train_qs = pickle.load(f)
    # Load dev data
    with open(DEEP_DEV_TARGET, 'rb') as f:
        val_qs = pickle.load(f)
    # Load trained_DAN parameters
    with open(DEEP_DAN_PARAMS_TARGET, 'rb') as f:
        params = pickle.load(f)
    
    # Compute training, dev classifier vectors using DAN
    train_vector, test_vector = compute_vectors(train_qs, val_qs, params, we_dimensions)
    
    # Format training vector
    train_feats = []
    train_labels = []
    for e in train_vector:
        train_feats.append(e[0])
        train_labels.append(e[1])
    train_formatted = (train_feats, train_labels)
    
    # Format dev vector
    test_feats = []
    test_labels = []
    for e in test_vector:
        test_feats.append(e[0])
        test_labels.append(e[1])
    test_formatted = (test_feats, test_labels)
    
    # Save
    with safe_open(DEEP_DAN_TRAIN_OUTPUT, 'wb') as f:
        pickle.dump(train_formatted, f, protocol=pickle.HIGHEST_PROTOCOL)
    with safe_open(DEEP_DAN_DEV_OUTPUT, 'wb') as f:
        pickle.dump(test_formatted, f, protocol=pickle.HIGHEST_PROTOCOL)
    log.info('Classifier train/dev vectors computed using DAN')
Code Example #21
File: classifier.py Project: cequencer/qb
def train_classifier(out, bgset, questions, class_type, limit=-1):
    all_questions = questions.questions_with_pages()
    c = Counter()
    train = []
    for page in all_questions:
        for qq in all_questions[page]:
            if qq.fold == 'train':
                label = getattr(qq, class_type, "").split(":")[0].lower()
                if not label:
                    continue
                c[label] += 1

                for ss, ww, tt in qq.partials():
                    feats = {}
                    total = ' '.join(tt).strip()
                    total = alphanum.sub(' ', unidecode(total.lower()))
                    total = total.split()

                    # add unigrams
                    for word in total:
                        feats[word] = 1.0

                    # add bigrams
                    currbg = set(ngrams(total, 2))
                    inter = currbg.intersection(bgset)
                    for elem in inter:
                        feats[elem] = 1.0

                    train.append((feats, label))
            if 0 < limit < len(train):
                break

    log.info('{}: {}'.format(class_type, c))
    log.info('{}: {}'.format(class_type, len(train)))
    log.info("{} out: training classifier".format(class_type))
    classifier = SklearnClassifier(LogisticRegression(C=10))
    classifier.train(train)
    with safe_open(class_type, 'wb') as f:
        pickle.dump(classifier, f)
    log.info('{}: accuracy@1 train: {}'.format(
        class_type, nltk.classify.util.accuracy(classifier, train)))
    return classifier
Code Example #22
File: dan_tf.py Project: xxlatgh/qb
    def save(self, directory: str) -> None:
        params_path = os.path.join(directory, DEEP_DAN_PARAMS_TARGET)
        with safe_open(params_path, 'wb') as f:
            if (self.max_len is None
                    or self.class_to_i is None
                    or self.i_to_class is None
                    or self.vocab is None
                    or self.n_classes is None):
                raise ValueError('Attempting to save uninitialized model parameters')
            pickle.dump({
                'max_len': self.max_len,
                'class_to_i': self.class_to_i,
                'i_to_class': self.i_to_class,
                'vocab': self.vocab,
                'n_classes': self.n_classes
            }, f)
        model_path = os.path.join(directory, DEEP_DAN_MODEL_TARGET)
        shell('cp -r {} {}'.format(DEEP_DAN_MODEL_TMP_DIR, safe_path(model_path)))
        we_path = os.path.join(directory, TF_DAN_WE)
        shutil.copyfile(TF_DAN_WE_TMP, safe_path(we_path))
Code Example #23
def create():
    vec_file = open('data/external/deep/glove.840B.300d.txt')
    all_vocab = {}
    log.info('loading vocab...')
    vocab, wmap = pickle.load(open('output/deep/vocab', 'rb'))

    for line in vec_file:
        split = line.split()
        word = " ".join(split[:-300])
        if word not in wmap:
            continue
        x = wmap[word]
        all_vocab[word] = array(split[-300:])
        all_vocab[word] = all_vocab[word].astype(float)

    log.info("wmap: {0} all_vocab: {1}".format(len(wmap), len(all_vocab)))
    d = len(all_vocab['the'])

    We = empty((d, len(wmap)))

    log.info('creating We for {0} words'.format(len(wmap)))
    unknown = []

    offset = len(wmap)
    log.info('offset = {0}'.format(offset))

    for word in wmap:
        try:
            We[:, wmap[word]] = all_vocab[word]
        except KeyError:
            unknown.append(word)
            log.info('unknown: {0}'.format(word))
            # initialize unknown words with unknown token
            We[:, wmap[word]] = all_vocab['unknown']

    log.info('unknown: {0}'.format(len(unknown)))
    log.info('We shape: {0}'.format(We.shape))

    log.info('dumping...')
    with safe_open('output/deep/We', 'wb') as f:
        pickle.dump(We, f, protocol=pickle.HIGHEST_PROTOCOL)
Code Example #24
File: dan.py Project: cequencer/qb
def train_dan(batch_size=150,
              we_dimension=300,
              n_epochs=61,
              learning_rate=0.01,
              adagrad_reset=10):
    with open(DEEP_TRAIN_TARGET, 'rb') as f:
        train_qs = pickle.load(f)

    log.info('total questions: {0}'.format(len(train_qs)))
    total = 0
    for qs, ans in train_qs:
        total += len(qs)
    log.info('total sentences: {0}'.format(total))

    with open(DEEP_WE_TARGET, 'rb') as f:
        orig_We = pickle.load(f)

    len_voc = orig_We.shape[1]
    log.info('vocab length: {0} We shape: {1}'.format(len_voc, orig_We.shape))

    # generate params / We
    params = gen_util.init_params(we_dimension, deep=3)

    # add We matrix to params
    params += (orig_We, )
    r = gen_util.roll_params(params)

    dim = r.shape[0]
    log.info('parameter vector dimensionality: {0}'.format(dim))

    # minibatch adagrad training
    ag = Adagrad(r.shape, learning_rate)
    min_error = float('inf')

    log.info('step 1 of 2: training DAN (takes 2-3 hours)')
    for epoch in range(0, n_epochs):
        # create mini-batches
        np.random.shuffle(train_qs)
        batches = [
            train_qs[x:x + batch_size]
            for x in list(range(0, len(train_qs), batch_size))
        ]

        epoch_error = 0.0
        ep_t = time.time()

        for batch_ind, batch in enumerate(batches):
            now = time.time()
            err, grad = objective_and_grad(batch, r, we_dimension, len_voc)
            update = ag.rescale_update(grad)
            r -= update
            lstring = 'epoch: {0} batch_ind: {1} error, {2} time = {3}'.format(
                epoch, batch_ind, err,
                time.time() - now)
            log.info(lstring)
            epoch_error += err

        # done with epoch
        log.info(str(time.time() - ep_t))
        log.info(
            'done with epoch {0} epoch error = {1} min error = {2}'.format(
                epoch, epoch_error, min_error))

        # save parameters if the current model is better than previous best model
        if epoch_error < min_error:
            min_error = epoch_error
            log.info('saving model...')
            params = gen_util.unroll_params(r, we_dimension, len_voc, deep=3)
            with safe_open(DEEP_DAN_PARAMS_TARGET, 'wb') as f:
                pickle.dump(params, f)

        # reset adagrad weights
        if epoch % adagrad_reset == 0 and epoch != 0:
            ag.reset_weights()
Code Example #25
def save_classifier(classifier, class_type):
    with safe_open(CLASSIFIER_PICKLE_PATH.format(class_type), 'wb') as f:
        pickle.dump(classifier, f)
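
save_classifier above only writes the pickle; a matching loader is not shown in the example. A hypothetical sketch (load_classifier is not part of the examples, and the CLASSIFIER_PICKLE_PATH value below is an illustrative placeholder for whatever the project's config actually defines):

import pickle

CLASSIFIER_PICKLE_PATH = 'output/classifier/{}.pkl'  # placeholder path template

def load_classifier(class_type):
    # Counterpart to save_classifier above: read the pickled classifier back.
    with open(CLASSIFIER_PICKLE_PATH.format(class_type), 'rb') as f:
        return pickle.load(f)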
Code Example #26
def generate_guesser_slurm(slurm_config_file, task, output_dir):
    with open(slurm_config_file) as f:
        slurm_config = yaml.load(f)
        default_slurm_config = slurm_config['default']
    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError(
                'ElasticSearchGuesser is not compatible with slurm')
        elif gs.guesser_class in slurm_config:
            guesser_slurm_config = slurm_config[gs.guesser_class]
        else:
            guesser_slurm_config = None
        partition = get_slurm_config_value('partition', default_slurm_config,
                                           guesser_slurm_config)
        qos = get_slurm_config_value('qos', default_slurm_config,
                                     guesser_slurm_config)
        mem_per_cpu = get_slurm_config_value('mem_per_cpu',
                                             default_slurm_config,
                                             guesser_slurm_config)
        gres = get_slurm_config_value('gres', default_slurm_config,
                                      guesser_slurm_config)
        max_time = get_slurm_config_value('max_time', default_slurm_config,
                                          guesser_slurm_config)
        cpus_per_task = get_slurm_config_value('cpus_per_task',
                                               default_slurm_config,
                                               guesser_slurm_config)
        account = get_slurm_config_value('account', default_slurm_config,
                                         guesser_slurm_config)
        if task == 'GuesserReport':
            folds = GUESSER_GENERATION_FOLDS
        else:
            folds = []
        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'partition': partition,
            'qos': qos,
            'mem_per_cpu': mem_per_cpu,
            'max_time': max_time,
            'gres': gres,
            'cpus_per_task': cpus_per_task,
            'account': account,
            'folds': folds
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    master_template = env.get_template('guesser-master-template.sh')
    master_script = master_template.render({
        'script_list': [
            path.join(output_dir, f'slurm-{i}.sh')
            for i in range(len(enabled_guessers))
        ] + [singleton_output],
        'gres': gres,
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'cpus_per_task': cpus_per_task,
        'account': account
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)
Code Example #27
File: cli.py Project: nhatsmrt/qb
def generate_guesser_slurm(slurm_config_file, task, output_dir):
    with open(slurm_config_file) as f:
        slurm_config = yaml.load(f)
        default_slurm_config = slurm_config["default"]
    env = Environment(loader=PackageLoader("qanta", "slurm/templates"))
    template = env.get_template("guesser-luigi-template.sh")
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == "ElasticSearchGuesser":
            raise ValueError(
                "ElasticSearchGuesser is not compatible with slurm")
        elif gs.guesser_class in slurm_config:
            guesser_slurm_config = slurm_config[gs.guesser_class]
        else:
            guesser_slurm_config = None
        partition = get_slurm_config_value("partition", default_slurm_config,
                                           guesser_slurm_config)
        qos = get_slurm_config_value("qos", default_slurm_config,
                                     guesser_slurm_config)
        mem_per_cpu = get_slurm_config_value("mem_per_cpu",
                                             default_slurm_config,
                                             guesser_slurm_config)
        gres = get_slurm_config_value("gres", default_slurm_config,
                                      guesser_slurm_config)
        max_time = get_slurm_config_value("max_time", default_slurm_config,
                                          guesser_slurm_config)
        cpus_per_task = get_slurm_config_value("cpus_per_task",
                                               default_slurm_config,
                                               guesser_slurm_config)
        account = get_slurm_config_value("account", default_slurm_config,
                                         guesser_slurm_config)
        if task == "GuesserReport":
            folds = GUESSER_GENERATION_FOLDS
        else:
            folds = []
        script = template.render({
            "task": task,
            "guesser_module": gs.guesser_module,
            "guesser_class": gs.guesser_class,
            "dependency_module": gs.dependency_module,
            "dependency_class": gs.dependency_class,
            "config_num": gs.config_num,
            "partition": partition,
            "qos": qos,
            "mem_per_cpu": mem_per_cpu,
            "max_time": max_time,
            "gres": gres,
            "cpus_per_task": cpus_per_task,
            "account": account,
            "folds": folds,
        })
        slurm_file = path.join(output_dir, f"slurm-{i}.sh")
        with safe_open(slurm_file, "w") as f:
            f.write(script)

    singleton_path = "qanta/slurm/templates/guesser-singleton.sh"
    singleton_output = path.join(output_dir, "guesser-singleton.sh")
    shell(f"cp {singleton_path} {singleton_output}")

    master_template = env.get_template("guesser-master-template.sh")
    master_script = master_template.render({
        "script_list": [
            path.join(output_dir, f"slurm-{i}.sh")
            for i in range(len(enabled_guessers))
        ] + [singleton_output],
        "gres":
        gres,
        "partition":
        partition,
        "qos":
        qos,
        "mem_per_cpu":
        mem_per_cpu,
        "max_time":
        max_time,
        "gres":
        gres,
        "cpus_per_task":
        cpus_per_task,
        "account":
        account,
    })
    with safe_open(path.join(output_dir, "slurm-master.sh"), "w") as f:
        f.write(master_script)
Code Example #28
File: dan.py Project: cequencer/qb
def train_dan(batch_size=150, we_dimension=300, n_epochs=61, learning_rate=0.01, adagrad_reset=10):
    with open(DEEP_TRAIN_TARGET, 'rb') as f:
        train_qs = pickle.load(f)

    log.info('total questions: {0}'.format(len(train_qs)))
    total = 0
    for qs, ans in train_qs:
        total += len(qs)
    log.info('total sentences: {0}'.format(total))

    with open(DEEP_WE_TARGET, 'rb') as f:
        orig_We = pickle.load(f)

    len_voc = orig_We.shape[1]
    log.info('vocab length: {0} We shape: {1}'.format(len_voc, orig_We.shape))

    # generate params / We
    params = gen_util.init_params(we_dimension, deep=3)

    # add We matrix to params
    params += (orig_We, )
    r = gen_util.roll_params(params)

    dim = r.shape[0]
    log.info('parameter vector dimensionality: {0}'.format(dim))

    # minibatch adagrad training
    ag = Adagrad(r.shape, learning_rate)
    min_error = float('inf')

    log.info('step 1 of 2: training DAN (takes 2-3 hours)')
    for epoch in range(0, n_epochs):
        # create mini-batches
        np.random.shuffle(train_qs)
        batches = [train_qs[x: x + batch_size] for x in list(range(0, len(train_qs), batch_size))]

        epoch_error = 0.0
        ep_t = time.time()

        for batch_ind, batch in enumerate(batches):
            now = time.time()
            err, grad = objective_and_grad(batch, r, we_dimension, len_voc)
            update = ag.rescale_update(grad)
            r -= update
            lstring = 'epoch: {0} batch_ind: {1} error, {2} time = {3}'.format(
                epoch, batch_ind, err, time.time() - now)
            log.info(lstring)
            epoch_error += err

        # done with epoch
        log.info(str(time.time() - ep_t))
        log.info('done with epoch {0} epoch error = {1} min error = {2}'.format(
            epoch, epoch_error, min_error))

        # save parameters if the current model is better than previous best model
        if epoch_error < min_error:
            min_error = epoch_error
            log.info('saving model...')
            params = gen_util.unroll_params(r, we_dimension, len_voc, deep=3)
            with safe_open(DEEP_DAN_PARAMS_TARGET, 'wb') as f:
                pickle.dump(params, f)

        # reset adagrad weights
        if epoch % adagrad_reset == 0 and epoch != 0:
            ag.reset_weights()
Code Example #29
File: elasticsearch_wikidata.py Project: nadesai/qb
    def save(self, directory: str):
        with safe_open(os.path.join(directory, IS_HUMAN_MODEL_PICKLE), 'wb') as f:
            pickle.dump({'is_human_model': self.is_human_model}, f)
Code Example #30
File: cnn.py Project: amit2014/qb
    def save(self, directory: str) -> None:
        shutil.copyfile(CNN_MODEL_TMP_TARGET, os.path.join(directory, CNN_MODEL_TARGET))
        with safe_open(os.path.join(directory, CNN_PARAMS_TARGET), 'wb') as f:
            pickle.dump(self.dump_parameters(), f)
Code Example #31
File: command.py Project: NPSDC/qb
def ingestion_cli(start_idx):
    """
    Input format is for Jason's HS project, but can be changed. The original code for answer
    mapping was designed to map everything over multiple passes, not yield a callable function to map
    an arbitrary answer line to a QB answer. Rather than implement this, a hacky way to achieve similar
    functionality to map a new dataset is to combine already mapped questions with new questions, have
    the code map answers for both at the same time, then only use the mappings from the new questions.
    There are some edge cases, but this should in general work (hopefully).
    """
    with open(QANTA_PREPROCESSED_DATASET_PATH) as f:
        unmapped_questions = json.load(f)["questions"]

    with open("data/external/high_school_project/quizdb-20190313164802.json"
              ) as f:
        raw_questions = json.load(f)["data"]["tossups"]

    new_questions = []
    idx = start_idx
    for q in raw_questions:
        new_questions.append({
            "qanta_id": idx,
            "text": q["text"],
            "answer": q["answer"],
            "page": None,
            "category": None,
            "subcategory": None,
            "tournament": q["tournament"]["name"],
            "difficulty": q["tournament"]["difficulty"],
            "year": int(q["tournament"]["year"]),
            "proto_id": None,
            "qdb_id": q["id"],
            "dataset": "quizdb.org",
            "fold": "guesstest",
        })
        idx += 1
    questions = unmapped_questions + new_questions
    answer_map, amb_answer_map, unbound_answers, report = create_answer_map(
        questions)
    with safe_open("data/external/high_school_project/automatic_report.json",
                   "w") as f:
        json.dump(report, f)

    write_answer_map(
        answer_map,
        amb_answer_map,
        unbound_answers,
        "data/external/high_school_project/answer_map.json",
        "data/external/high_school_project/unbound_answers.json",
    )
    with open("data/internal/page_assignment/unmappable.yaml") as f:
        unmappable = yaml.load(f)

    page_assigner = PageAssigner()
    mapping_report = unmapped_to_mapped_questions(new_questions, answer_map,
                                                  amb_answer_map, unmappable,
                                                  page_assigner)

    add_sentences_(new_questions)
    with open(
            "data/external/high_school_project/qanta.acf-regionals-2018.json",
            "w") as f:
        json.dump(format_qanta_json(new_questions, DS_VERSION), f)

    with open("data/external/high_school_project/mapping_report.json",
              "w") as f:
        json.dump(mapping_report, f)
Code Example #32
File: cli.py Project: Pinafore/qb
def generate_guesser_slurm(slurm_config_file, task, output_dir):
    with open(slurm_config_file) as f:
        slurm_config = yaml.load(f)
        default_slurm_config = slurm_config['default']
    env = Environment(loader=PackageLoader('qanta', 'slurm/templates'))
    template = env.get_template('guesser-luigi-template.sh')
    enabled_guessers = list(AbstractGuesser.list_enabled_guessers())

    for i, gs in enumerate(enabled_guessers):
        if gs.guesser_class == 'ElasticSearchGuesser':
            raise ValueError('ElasticSearchGuesser is not compatible with slurm')
        elif gs.guesser_class in slurm_config:
            guesser_slurm_config = slurm_config[gs.guesser_class]
        else:
            guesser_slurm_config = None
        partition = get_slurm_config_value('partition', default_slurm_config, guesser_slurm_config)
        qos = get_slurm_config_value('qos', default_slurm_config, guesser_slurm_config)
        mem_per_cpu = get_slurm_config_value('mem_per_cpu', default_slurm_config, guesser_slurm_config)
        gres = get_slurm_config_value('gres', default_slurm_config, guesser_slurm_config)
        max_time = get_slurm_config_value('max_time', default_slurm_config, guesser_slurm_config)
        cpus_per_task = get_slurm_config_value('cpus_per_task', default_slurm_config, guesser_slurm_config)
        account = get_slurm_config_value('account', default_slurm_config, guesser_slurm_config)
        if task == 'GuesserReport':
            folds = GUESSER_GENERATION_FOLDS
        else:
            folds = []
        script = template.render({
            'task': task,
            'guesser_module': gs.guesser_module,
            'guesser_class': gs.guesser_class,
            'dependency_module': gs.dependency_module,
            'dependency_class': gs.dependency_class,
            'config_num': gs.config_num,
            'partition': partition,
            'qos': qos,
            'mem_per_cpu': mem_per_cpu,
            'max_time': max_time,
            'gres': gres,
            'cpus_per_task': cpus_per_task,
            'account': account,
            'folds': folds
        })
        slurm_file = path.join(output_dir, f'slurm-{i}.sh')
        with safe_open(slurm_file, 'w') as f:
            f.write(script)

    singleton_path = 'qanta/slurm/templates/guesser-singleton.sh'
    singleton_output = path.join(output_dir, 'guesser-singleton.sh')
    shell(f'cp {singleton_path} {singleton_output}')

    master_template = env.get_template('guesser-master-template.sh')
    master_script = master_template.render({
        'script_list': [
            path.join(output_dir, f'slurm-{i}.sh') for i in range(len(enabled_guessers))
        ] + [singleton_output],
        'gres': gres,
        'partition': partition,
        'qos': qos,
        'mem_per_cpu': mem_per_cpu,
        'max_time': max_time,
        'cpus_per_task': cpus_per_task,
        'account': account
    })
    with safe_open(path.join(output_dir, 'slurm-master.sh'), 'w') as f:
        f.write(master_script)
Code Example #33
File: classifier.py Project: cequencer/qb
def write_bigrams(bigrams, output):
    with safe_open(output, 'wb') as f:
        pickle.dump(bigrams, f, pickle.HIGHEST_PROTOCOL)
Code Example #34
File: lm_wrapper.py Project: xxlatgh/qb
def build_clm(lm_out=CLM_PATH, vocab_size=100000, global_lms=5, max_pages=-1):
    min_appearances = conf['clm']['min_appearances']
    log.info(
        "Training language model with pages that appear more than %i times" %
        min_appearances)

    lm = LanguageModelWriter(vocab_size, global_lms)
    num_docs = 0
    background = defaultdict(int)
    # Initialize language models
    for title, text in text_iterator(True,
                                     QB_WIKI_LOCATION,
                                     True,
                                     QB_QUESTION_DB,
                                     True,
                                     QB_SOURCE_LOCATION,
                                     max_pages,
                                     min_pages=min_appearances):
        num_docs += 1
        if num_docs % 500 == 0:
            log.info("{} {}".format(title, num_docs))
            log.info(str(list(lm.tokenize_without_censor(text[100:200]))))

        for tt in lm.tokenize_without_censor(text):
            background[tt] += 1

    # Create the vocabulary
    for ii in background:
        lm.train_seen(ii, background[ii])
    vocab = lm.finalize()
    log.info(str(vocab)[:80])
    log.info("Vocab size is {} from {} docs".format(len(vocab), num_docs))
    del background

    # Train the language model
    doc_num = 0
    for corpus, qb, wiki, source in [("wiki", False, True, False),
                                     ("qb", True, False, False),
                                     ("source", False, False, True)]:
        # Add training data
        start = time.time()
        for title, text in text_iterator(wiki,
                                         QB_WIKI_LOCATION,
                                         qb,
                                         QB_QUESTION_DB,
                                         source,
                                         QB_SOURCE_LOCATION,
                                         max_pages,
                                         min_pages=min_appearances):
            doc_num += 1
            if doc_num % 500 == 0 or time.time() - start > 10:
                log.info("Adding train doc %i, %s (%s)" %
                         (doc_num, title, corpus))
                start = time.time()
            lm.add_train(corpus, title, text)

    log.info("Done training")
    if lm_out:
        # Create the extractor object and write out the pickle
        with safe_open("%s.txt" % lm_out, 'w') as f:
            lm.write_vocab(f)

        for ii, cc in enumerate(lm.corpora()):
            with safe_open("%s/%i" % (lm_out, ii), 'w') as f:
                lm.write_corpus(cc, ii, f)