Example 1
import os
import pickle

# search() and create_spark_context() are provided by the surrounding project.
def load_memories(text_list, n):
    if os.path.exists('/tmp/memories.pickle'):
        with open('/tmp/memories.pickle', 'rb') as f:
            memory_lookup = pickle.load(f)
    else:
        memory_lookup = {}
    memory_size = len(memory_lookup)

    if memory_size == 0:
        # If everything is missing, then parallelize for speed
        sc = create_spark_context()
        memories = sc.parallelize(text_list, 256).map(lambda t: search(t, n=n)).collect()
        for text, mem in zip(text_list, memories):
            memory_lookup[text] = mem
    else:
        # If only some things are missing, use the cache and query what is missing
        memories = []
        for text in text_list:
            if text in memory_lookup:
                memories.append(memory_lookup[text])
            else:
                mem = search(text, n=n)
                memories.append(mem)
                memory_lookup[text] = mem

    if memory_size != len(memory_lookup):
        with open('/tmp/memories.pickle', 'wb') as f:
            pickle.dump(memory_lookup, f)

    return memories
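
A minimal usage sketch for the helper above, assuming search() and create_spark_context() are importable from the surrounding project; the question texts and n=10 are purely illustrative:

# First call with an empty cache: the Spark path runs and the results are
# pickled to /tmp/memories.pickle.
texts = ['Which author wrote Don Quixote?', 'What is the capital of Peru?']
memories = load_memories(texts, n=10)

# A later call reuses the cached entries and only issues search() for texts
# that have not been seen before.
more_memories = load_memories(texts + ['Who painted Guernica?'], n=10)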
Example 2
    def guess(self, questions: List[QuestionText],
              max_n_guesses: Optional[int]):
        log.info('Predicting the instance_of attribute for guesses...')
        class_with_probability = self.test_instance_of(questions)

        n_cores = conf['guessers']['ESWikidata']['n_cores']
        sc = create_spark_context(configs=[
            ('spark.executor.cores', n_cores),
            ('spark.executor.memory', '20g'),
        ])

        def ir_search(query_class_and_prob):
            query, class_and_prob = query_class_and_prob
            p_class, prob = class_and_prob
            return es_index.search(
                query,
                p_class,
                prob,
                self.confidence_threshold,
                normalize_score_by_length=self.normalize_score_by_length,
            )[:max_n_guesses]

        spark_input = list(zip(questions, class_with_probability))
        log.info('Filtering when classification probability > {}'.format(
            self.confidence_threshold))

        return sc.parallelize(spark_input, 32 * n_cores).map(ir_search).collect()
Example 3
def create_wikipedia_cache(
        parsed_wiki_path='data/external/wikipedia/parsed-wiki',
        output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QuestionDatabase()
    questions = list(db.all_questions().values())
    train_questions = [
        q for q in questions
        if q.fold == 'guesstrain' or q.fold == 'buzzertrain'
    ]
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must already exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'],
            'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern).map(parse_page).filter(
        lambda p: p['title'] in b_answers.value).collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
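
A usage sketch for the cache builder above; it assumes the qanta data layout, a working Spark installation, and an existing parsed-wiki dump, and the page title used in the lookup is hypothetical:

# The function writes the JSON cache to output_path and also returns the
# lookup dictionary directly.
wiki_lookup = create_wikipedia_cache()

# Titles are keyed with underscores instead of spaces (see parse_page above).
page = wiki_lookup.get('Miguel_de_Cervantes')
if page is not None:
    print(page['url'], len(page['text']))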
Example 4
def create_wikipedia_cache(parsed_wiki_path='data/external/wikipedia/parsed-wiki', output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QantaDatabase()
    train_questions = db.train_questions
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must already exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, '*', '*')

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            'id': int(page['id']),
            'title': page['title'].replace(' ', '_'),
            'text': page['text'], 'url': page['url']
        }

    wiki_pages = sc.textFile(page_pattern).map(parse_page).filter(lambda p: p['title'] in b_answers.value).collect()
    wiki_lookup = {p['title']: p for p in wiki_pages}
    with open(output_path, 'w') as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
Example 5
def create_wikipedia_cache(
        parsed_wiki_path="data/external/wikipedia/parsed-wiki",
        output_path=WIKI_LOOKUP_PATH):
    from qanta.spark import create_spark_context

    sc = create_spark_context()
    db = QantaDatabase()
    train_questions = db.train_questions
    answers = {q.page for q in train_questions}
    b_answers = sc.broadcast(answers)
    # Paths used in Spark must be absolute, and the directory must already exist
    page_path = os.path.abspath(parsed_wiki_path)
    page_pattern = os.path.join(page_path, "*", "*")

    def parse_page(json_text):
        page = json.loads(json_text)
        return {
            "id": int(page["id"]),
            "title": page["title"].replace(" ", "_"),
            "text": page["text"],
            "url": page["url"],
        }

    wiki_pages = (sc.textFile(page_pattern).map(parse_page).filter(
        lambda p: p["title"] in b_answers.value).collect())
    wiki_lookup = {p["title"]: p for p in wiki_pages}
    with open(output_path, "w") as f:
        json.dump(wiki_lookup, f)

    return wiki_lookup
Example 6
def add_sentences_(questions, parallel=True):
    text_questions = [q['text'] for q in questions]
    if parallel:
        # Only create a Spark context when it will actually be used
        sc = create_spark_context()
        sentence_tokenizations = sc.parallelize(text_questions, 4000).map(nlp).collect()
    else:
        sentence_tokenizations = [nlp(q) for q in text_questions]
    for q, text, tokenization in zip(questions, text_questions, sentence_tokenizations):
        q['tokenizations'] = tokenization
        # The first sentence ends at the end-character offset (tuple position 1)
        # of the 0th sentence span
        q['first_sentence'] = text[:tokenization[0][1]]
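
The slicing on the last line assumes that nlp returns, for each question, a sequence of (start, end) character offsets with one pair per sentence. A tiny self-contained sketch of that contract, with a hard-coded tokenization standing in for the real nlp output:

text = 'This is the first sentence. This is the second.'
# Hypothetical sentence spans as (start character, end character) pairs.
tokenization = [(0, 27), (28, 47)]
first_sentence = text[:tokenization[0][1]]
print(first_sentence)  # This is the first sentence.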
Example 7
    def guess(self, questions: List[QuestionText],
              max_n_guesses: Optional[int]):
        n_cores = conf['guessers']['ElasticSearch']['n_cores']
        sc = create_spark_context(configs=[
            ('spark.executor.cores', n_cores),
            ('spark.executor.memory', '40g'),
        ])
        b_is_human_model = sc.broadcast(self.is_human_model)

        def ir_search(query):
            is_human_model = b_is_human_model.value
            is_human_probability = is_human_model.predict_proba([query])[0][1]
            return es_index.search(query, is_human_probability)[:max_n_guesses]

        return sc.parallelize(questions, 4 * n_cores).map(ir_search).collect()
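
The broadcast above ships is_human_model to every executor once instead of serializing it with each task closure. A generic sketch of the same pattern, using the project's create_spark_context helper and a throwaway lookup object:

sc = create_spark_context()
lookup = {'apple': 'fruit', 'carrot': 'vegetable'}  # any read-only object
b_lookup = sc.broadcast(lookup)

def classify(word):
    # Tasks read the broadcast value; the object is shipped to each executor
    # once rather than with every task.
    return b_lookup.value.get(word, 'unknown')

labels = sc.parallelize(['apple', 'carrot', 'quartz']).map(classify).collect()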
Example 8
def create_memory_index():
    dataset = QuizBowlDataset(guesser_train=True)
    training_data = dataset.training_data()
    answers = set(training_data[1])
    cw = CachedWikipedia()

    try:
        Index('mem').delete()
    except Exception:
        # The index may not exist yet; ignore the failure and rebuild it below
        pass
    Answer.init()
    all_wiki_pages = [cw[page] for page in answers]
    wiki_pages = [p for p in all_wiki_pages if p.content != '']
    sc = create_spark_context()
    sc.parallelize(wiki_pages, 1000).foreach(index_page)
Example 9
    def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]):
        def es_search(query):
            return self.index.search(
                query, max_n_guesses,
                normalize_score_by_length=self.normalize_score_by_length,
                wiki_boost=self.wiki_boost, qb_boost=self.qb_boost
            )

        if len(questions) > 1:
            sc = create_spark_context(configs=[
                ('spark.executor.cores', self.n_cores),
                ('spark.executor.memory', '20g'),
            ])
            return sc.parallelize(questions, 16 * self.n_cores).map(es_search).collect()
        elif len(questions) == 1:
            return [es_search(questions[0])]
        else:
            return []
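
A usage sketch of the branching above; guesser and many_question_texts are placeholders for objects created elsewhere, and the guess count is illustrative:

# A single question skips Spark entirely and queries the index directly.
single = guesser.guess(['Name the author of One Hundred Years of Solitude.'],
                       max_n_guesses=10)

# A batch is spread over 16 partitions per configured executor core before
# hitting the index.
batch = guesser.guess(many_question_texts, max_n_guesses=10)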