Example #1
    def build(documents: Dict[str, str]):
        Answer.init()
        cw = CachedWikipedia()
        bar = progressbar.ProgressBar()
        for page in bar(documents):
            answer = Answer(page=page, wiki_content=cw[page].content, qb_content=documents[page])
            answer.save()
Example #2
    def training_data(self) -> TrainingData:
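        # Build parallel lists of Wikipedia sentences and their answers, drawn from the first few
        # non-empty paragraphs of each answer's page, with mentions of the answer stripped out.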
        cw = CachedWikipedia()
        wiki_content = []
        wiki_answers = []
        for ans in self.answers:
            wiki_page = cw[ans]
            if len(wiki_page.content) != 0:
                # Take the first paragraph, skipping the initial title and empty line after
                paragraphs = wiki_page.content.split('\n')
                if len(paragraphs) > 2:
                    n_used = 0
                    for par in paragraphs[2:]:
                        if len(par) != 0:
                            n_used += 1
                            content = unidecode(par).lower()

                            # Strip references to the title in a reasonable way
                            ans_words = unidecode(ans).lower().split('_')
                            for w in ans_words:
                                content = content.replace(w, ' ')

                            # Fix up whitespaces
                            content = re.sub(r'\s+', ' ', content).strip()
                            for sent in nltk.sent_tokenize(content):
                                wiki_content.append([sent])
                                wiki_answers.append(ans)
                        if n_used == self.n_paragraphs:
                            break

        return wiki_content, wiki_answers, None
Example #3
def generate_domain_classifier_data(weight=150):
    """
    Reads all sentences from every Wikipedia page corresponding to a known answer and splits them into
    two vowpal wabbit files, interleaving true quiz bowl questions randomly and with the higher weight
    specified by the weight arg.
    """
    qb_data = QuizBowlDataset(guesser_train=True).training_data()
    real_questions = [('1', str(weight), ans, clean_question(sent))
                      for q, ans, _ in zip(*qb_data) for sent in q]
    pages = set(a for _, _, a, _ in real_questions)

    cw = CachedWikipedia()

    # Split wikipedia questions into two sets
    wiki_questions = ([], [])
    use_second = False
    for page in pages:
        for sentence in sentences_from_page(cw[page]):
            q = clean_question(sentence)
            wiki_questions[use_second].append(('-1', '1', page, q))
            use_second = not use_second

    vw_line = '{} {} \'{}|text {}\n'
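    # Each vowpal wabbit line holds: label, importance weight, 'tag (the page name), then the |text namespace with the question text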
    for i, wiki_qs in enumerate(wiki_questions):
        # Create list of True/False and shuffle to define ordering of train data
        order = list(
            chain(repeat(False, len(real_questions)),
                  repeat(True, len(wiki_qs))))
        random.shuffle(order)
        iters = (iter(real_questions), iter(wiki_qs))
        with safe_open(DOMAIN_TARGET_PREFIX + str(i), 'w') as f:
            for choice in order:
                f.write(vw_line.format(*next(iters[choice])))
Example #4
    def build_many_docs(pages,
                        documents,
                        use_wiki=True,
                        use_qb=True,
                        use_source=False,
                        rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info('Deleting index: {}'.format(INDEX_NAME))
            ElasticSearchIndex.delete()

        if ElasticSearchIndex.exists():
            log.info('Index {} exists'.format(INDEX_NAME))
        else:
            log.info('Index {} does not exist'.format(INDEX_NAME))
            Answer.init()
            log.info(
                'Indexing questions and corresponding pages as many docs...')
            if use_qb:
                log.info('Indexing questions...')
                bar = progressbar.ProgressBar()
                for page, doc in bar(documents):
                    Answer(page=page, qb_content=doc).save()

            if use_wiki:
                log.info('Indexing wikipedia...')
                cw = CachedWikipedia()
                bar = progressbar.ProgressBar()
                for page in bar(pages):
                    content = word_tokenize(cw[page].content)
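                    # Break the tokenized page into 200-token chunks and index each chunk as its own Answer document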
                    for i in range(0, len(content), 200):
                        chunked_content = content[i:i + 200]
                        if len(chunked_content) > 0:
                            Answer(
                                page=page,
                                wiki_content=' '.join(chunked_content)).save()
Example #5
def text_iterator(use_wiki, wiki_location,
                  use_qb, qb_location,
                  use_source, source_location,
                  limit=-1,
                  min_pages=0, country_list=COUNTRY_LIST_PATH):
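    # Yield (page, text) pairs that concatenate Wikipedia content, training question text,
    # and optional gzipped source text for each answer page.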
    if isinstance(qb_location, str):
        qdb = QuestionDatabase(qb_location)
    else:
        qdb = qb_location
    doc_num = 0

    cw = CachedWikipedia(wiki_location, data_path(country_list))
    pages = qdb.questions_with_pages()

    for p in sorted(pages, key=lambda k: len(pages[k]), reverse=True):
        # This filter must stay in sync with the page_by_count logic in qdb.py
        if len(pages[p]) < min_pages:
            continue

        if use_qb:
            train_questions = [x for x in pages[p] if x.fold == "train"]
            question_text = "\n".join(" ".join(x.raw_words()) for x in train_questions)
        else:
            question_text = ''

        if use_source:
            filename = '%s/%s' % (source_location, p)
            if os.path.isfile(filename):
                try:
                    with gzip.open(filename, 'rb') as f:
                        source_text = f.read()
                except zlib.error:
                    log.info("Error reading %s" % filename)
                    source_text = ''
            else:
                source_text = ''
        else:
            source_text = u''

        if use_wiki:
            wikipedia_text = cw[p].content
        else:
            wikipedia_text = u""

        total_text = wikipedia_text
        total_text += "\n"
        total_text += question_text
        total_text += "\n"
        total_text += str(source_text)

        yield p, total_text
        doc_num += 1

        if 0 < limit < doc_num:
            break
Example #6
    def __init__(self,
                 xml_location="data/external/wikifier/data/output",
                 wikipedia="data/external/wikipedia",
                 country_list=COUNTRY_LIST_PATH):
        super(WikiLinks, self).__init__()
        self.name = "wikilinks"
        self._location = xml_location
        self.links = defaultdict(dict)
        self._wiki = CachedWikipedia(wikipedia, country_list)
        self._cache = -1
        self._matches = None
Example #7
    def build(cls):
        ix = index.create_in(WHOOSH_WIKI_INDEX_PATH, cls.schema)
        writer = ix.writer()
        cw = CachedWikipedia(QB_WIKI_LOCATION, COUNTRY_LIST_PATH)
        qdb = QuestionDatabase(QB_QUESTION_DB)
        questions = qdb.questions_with_pages()
        pages = [page for page, qs in questions.items() if len(qs) < MAX_APPEARANCES]
        # The filtered list above is immediately replaced by the full set of non-test pages
        pages = list(qdb.get_all_pages(exclude_test=True))
        print("Building whoosh wiki index from {0} pages".format(len(pages)))
        bar = progressbar.ProgressBar()
        for p in bar(pages):
            writer.add_document(page=p, content=cw[p].content)
        writer.commit()
Example #8
def build_lm_data(path, output):
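    # Write one tokenized, lower-cased sentence per line from every cached Wikipedia page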
    cw = CachedWikipedia(path, "")
    o = open(output, 'w')

    count = 0
    for i in [x.split("/")[-1] for x in glob("%s/*" % path)]:
        count += 1
        if count % 1000 == 0:
            print("%i\t%s" % (count, unidecode(i)))
        page = cw[i]

        for ss in nltk.sent_tokenize(page.content):
            o.write("%s\n" % " ".join(kTOKENIZER(unidecode(ss.lower()))))
Example #9
    def build(cls,
              documents: Dict[str, str],
              index_path=WHOOSH_WIKI_INDEX_PATH):
        ix = index.create_in(safe_path(index_path), cls.schema)
        writer = ix.writer()
        cw = CachedWikipedia()
        print("Building whoosh wiki index from {0} pages".format(
            len(documents)))
        bar = progressbar.ProgressBar()
        for p in bar(documents):
            writer.add_document(page=p,
                                content=cw[p].content,
                                quiz_bowl=documents[p])
        writer.commit()
Example #10
def create_memory_index():
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()

    try:
        Index('mem').delete()
    except:
        pass
    Answer.init()
    all_wiki_pages = [cw[page] for page in answers]
    wiki_pages = [p for p in all_wiki_pages if p.content != '']
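    # Distribute the non-empty pages across Spark workers and index each one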
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)
Example #11
    def training_data(self):
        cw = CachedWikipedia(QB_WIKI_LOCATION)
        ds = QuizBowlDataset(2)
        train_data = ds.training_data()
        answer_classes = set(train_data[1])
        train_x = []
        train_y = []

        for page in answer_classes:
            sentences = list(wiki_sentences(cw[page].content))
            sampled_sentences = random.sample(
                sentences, min(len(sentences), self.max_sentences))
            training_examples = []
            for sentence in sampled_sentences:
                training_examples.append(sentence)
            train_x.append(training_examples)
            train_y.append(page)
        return train_x, train_y, None
Example #12
    def build(documents: Dict[str, str], is_human_map):
        try:
            Index('qb').delete()
        except elasticsearch.exceptions.NotFoundError:
            log.info(
                'Could not delete non-existent index, creating new index...')
        Answer.init()
        cw = CachedWikipedia()
        bar = progressbar.ProgressBar()
        for page in bar(documents):
            if page in is_human_map:
                is_human = is_human_map[page]
            else:
                is_human = False
            answer = Answer(page=page,
                            wiki_content=cw[page].content,
                            qb_content=documents[page],
                            is_human=is_human)
            answer.save()
Example #13
    def build_large_docs(documents: Dict[str, str],
                         use_wiki=True,
                         use_qb=True,
                         use_source=False,
                         rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info('Deleting index: {}'.format(INDEX_NAME))
            ElasticSearchIndex.delete()

        if ElasticSearchIndex.exists():
            log.info('Index {} exists'.format(INDEX_NAME))
        else:
            log.info('Index {} does not exist'.format(INDEX_NAME))
            Answer.init()
            cw = CachedWikipedia()
            source = Source()
            log.info(
                'Indexing questions and corresponding wikipedia pages as large docs...'
            )
            bar = progressbar.ProgressBar()
            for page in bar(documents):
                if use_wiki:
                    wiki_content = cw[page].content
                else:
                    wiki_content = ''

                if use_qb:
                    qb_content = documents[page]
                else:
                    qb_content = ''

                if use_source:
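                    # Use at most the first 50,000 characters of the source document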
                    source_content = source[page][:50000]
                else:
                    source_content = ''

                answer = Answer(page=page,
                                wiki_content=wiki_content,
                                qb_content=qb_content,
                                source_content=source_content)
                answer.save()
Example #14
    def build(documents: Dict[str, str], instance_of_map, rebuild_index=False):
        if rebuild_index or bool(int(os.getenv('QB_REBUILD_INDEX', 0))):
            log.info('Deleting index: {}'.format(INDEX_NAME))
            ElasticSearchIndex.delete()

        if ElasticSearchIndex.exists():
            log.info(
                'Index {} exists, skipping building index'.format(INDEX_NAME))
        else:
            log.info('Index {} does not exist, building index...'.format(
                INDEX_NAME))
            Answer.init()
            cw = CachedWikipedia()
            bar = progressbar.ProgressBar()
            for page in bar(documents):
                if page in instance_of_map:
                    instance_of = instance_of_map[page]
                else:
                    instance_of = NO_MATCH
                answer = Answer(page=page,
                                wiki_content=cw[page].content,
                                qb_content=documents[page],
                                instance_of=instance_of)
                answer.save()
Example #15
def init_wiki_cache(wiki_cache):
    CachedWikipedia.initialize_cache(wiki_cache)
Example #17
                    for ii in progress:
                        log.info("MAP %s: %s" %
                                 (ii, progress[ii].most_common(5)))
                    for ii in folds:
                        log.info("PB FOLD %s: %i" % (ii, folds[ii]))

    log.info("Added %i, skipped %i" % (last_id, num_skipped))

    if flags.guess:
        if not os.path.exists(flags.wiki_title):
            import urllib.request
            urllib.request.urlretrieve(
                "http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz",
                flags.wiki_title)

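        # Guess Wikipedia pages for still-unmapped answers by matching them against the downloaded title dump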
        tf = TitleFinder(flags.wiki_title, CachedWikipedia(), pa.known_pages(),
                         QuestionDatabase.normalize_answer)

        guesses = tf.best_guess(unmapped)
    else:
        guesses = dict((x, "") for x in unmapped)

    wiki_total = Counter()
    wiki_answers = defaultdict(set)
    for ii in guesses:
        page = guesses[ii]
        wiki_total[page] += unmapped[ii]
        wiki_answers[page].add(ii)

    for ii in [x for x in unmapped if not x in guesses]:
        wiki_answers[''].add(ii)
Example #18
                if last_id % 1000 == 0:
                    progress = pa.get_counts()
                    for ii in progress:
                        log.info("MAP %s: %s" % (ii, progress[ii].most_common(5)))
                    for ii in folds:
                        log.info("PB FOLD %s: %i" % (ii, folds[ii]))

    log.info("Added %i, skipped %i" % (last_id, num_skipped))

    if flags.guess:
        if not os.path.exists(flags.wiki_title):
            import urllib.request
            urllib.request.urlretrieve("http://dumps.wikimedia.org/enwiki/latest/enwiki-latest-all-titles-in-ns0.gz",
                                    flags.wiki_title)

        tf = TitleFinder(flags.wiki_title, CachedWikipedia(),
                        pa.known_pages(),
                        QuestionDatabase.normalize_answer)

        guesses = tf.best_guess(unmapped)
    else:
        guesses = dict((x, "") for x in unmapped)

    wiki_total = Counter()
    wiki_answers = defaultdict(set)
    for ii in guesses:
        page = guesses[ii]
        wiki_total[page] += unmapped[ii]
        wiki_answers[page].add(ii)

    for ii in [x for x in unmapped if not x in guesses]:
        wiki_answers[''].add(ii)
Example #19
                        type=str,
                        default='data/internal/page_assignment/ambiguous/')
    parser.add_argument('--unambiguous_path',
                        type=str,
                        default='data/internal/page_assignment/unambiguous/')
    flags = parser.parse_args()

    pa = PageAssigner(QuestionDatabase.normalize_answer)
    for ii in glob("%s/*" % flags.ambiguous_path):
        pa.load_ambiguous(ii)
    for ii in glob("%s/*" % flags.unambiguous_path):
        pa.load_unambiguous(ii)
    for ii in glob("%s/*" % flags.direct_path):
        pa.load_direct(ii)

    cw = CachedWikipedia()
    tf = TitleFinder("data/enwiki-latest-all-titles-in-ns0.gz",
                     cw,
                     pa.known_pages(),
                     normalize=QuestionDatabase.normalize_answer)

    for ii in [
            'die leiden des jungen werthers', '99 Luftballons',
            'saint nicholas of myra', 'édouard roche',
            'the mahdi or mohammad ahmed', 'the first vatican council',
            'antietam national battlefield', 'cia', 'samuel f b morse',
            'the passion according to st matthew or st matthew’s passion or matthäuspassion',
            'another world', 'rolling in the deep', 'tony gwynn', 'opal',
            'tylenol', 'queues', 'dachau', 'lipoproteins', 'haiku', 'japan',
            'zoroastrianism'
    ]: