Example 1
def compute_question_player_counts(proto_log_path):
    from qanta.spark import create_spark_session

    spark = create_spark_session()
    # Each log record carries an `object` struct with the question id (qid)
    # and the answering user; count distinct users per question.
    df = spark.read.json(proto_log_path)
    df.createOrReplaceTempView('logs')
    question_player_counts = spark.sql("""
        SELECT object.qid, size(collect_set(object.user.id)) AS n_players
        FROM logs
        GROUP BY object.qid
    """).collect()
    return {r.qid: r.n_players for r in question_player_counts}
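
A minimal usage sketch, assuming the log is line-delimited JSON with records shaped like {"object": {"qid": ..., "user": {"id": ...}}} (inferred from the query above); the path is illustrative:

counts = compute_question_player_counts('data/protobowl_log.jsonl')  # illustrative path
# e.g. keep questions that at least 100 distinct players attempted
popular = {qid: n for qid, n in counts.items() if n >= 100}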
Example 2
def create_wikipedia_title_pickle(dump_path, output_path):
    import pickle

    from qanta.spark import create_spark_session

    spark = create_spark_session()
    wiki_df = spark.read.json(dump_path)
    # Collect the distinct page titles, then normalize them on the driver.
    raw_titles = wiki_df.select('title').distinct().collect()
    clean_titles = {normalize_wikipedia_title(r.title) for r in raw_titles}
    with open(output_path, 'wb') as f:
        pickle.dump(clean_titles, f)
    spark.stop()
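
Since the pickle holds a plain Python set, consumers can load it without Spark. A minimal sketch; the path is illustrative, and the exact title format depends on what normalize_wikipedia_title produces:

import pickle

with open('data/wikipedia_titles.pickle', 'rb') as f:  # illustrative path
    titles = pickle.load(f)
# Set membership is O(1), so this works as a fast title lookup.
print('Albert Einstein' in titles)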
Example 3
def create_wikipedia_title_pickle(dump_path, disambiguation_pages_path, output_path):
    import json
    import pickle

    from qanta.spark import create_spark_session

    # Page ids of disambiguation pages, used to exclude non-content pages.
    with open(disambiguation_pages_path) as f:
        disambiguation_pages = set(json.load(f))

    spark = create_spark_session()
    wiki_df = spark.read.json(dump_path)
    rows = wiki_df.select('title', 'id').distinct().collect()
    # Keep only content pages: drop any row whose page id is a disambiguation page.
    content_pages = [r for r in rows if int(r.id) not in disambiguation_pages]
    clean_titles = {normalize_wikipedia_title(r.title) for r in content_pages}

    with open(output_path, 'wb') as f:
        pickle.dump(clean_titles, f)
    spark.stop()
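
Collecting every distinct (title, id) row to the driver is fine for a single dump, but the disambiguation filter could equally run inside Spark before the collect. A sketch under the same assumptions (wiki_df and disambiguation_pages as above), not necessarily the codebase's approach:

from pyspark.sql import functions as F

# Drop disambiguation pages before anything leaves the executors.
content_titles = (
    wiki_df.select('title', 'id')
    .distinct()
    .where(~F.col('id').cast('long').isin(list(disambiguation_pages)))
    .select('title')
    .collect()
)

For a very large id set, a join against a broadcast DataFrame of ids would scale better than isin.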
Example 4
def create_wikipedia_cache(dump_path):
    import os

    from qanta.spark import create_spark_session

    spark = create_spark_session()
    db = QuestionDatabase()
    # Broadcast the answer set so each executor receives one read-only copy
    # instead of shipping it with every task.
    answers = set(db.all_answers().values())
    b_answers = spark.sparkContext.broadcast(answers)
    # Paths used by Spark executors must be absolute, and the directory must
    # already exist.
    page_path = os.path.abspath(safe_path(WIKI_PAGE_PATH))

    def create_page(row):
        title = normalize_wikipedia_title(row.title)
        filter_answers = b_answers.value
        # Only cache pages whose normalized title matches a known answer.
        if title in filter_answers:
            page = WikipediaPage(title, row.text, None, None, row.id, row.url)
            write_page(page, page_path=page_path)

    # foreach runs create_page on the executors; pages are written to disk
    # as a side effect rather than collected on the driver.
    spark.read.json(dump_path).rdd.foreach(create_page)
    spark.stop()
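
The broadcast gives every executor one read-only copy of the answer set instead of serializing it into each task. A self-contained sketch of the same pattern with hypothetical data, runnable outside this codebase:

from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[*]').getOrCreate()
# Read-only lookup set, shipped once per executor.
wanted = spark.sparkContext.broadcast({'alpha', 'gamma'})

def keep_wanted(row):
    # .value reads the executor-local copy of the broadcast set.
    if row.name in wanted.value:
        print(row.name)

spark.createDataFrame([('alpha',), ('beta',)], 'name string').rdd.foreach(keep_wanted)
spark.stop()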