Example #1
def read_process_data():
    papers = read_papers()
    topic_mix = read_topic_mix()
    topic_category_map = read_topic_category_map()
    arxiv_cat_lookup = read_arxiv_cat_lookup()

    return papers, topic_mix, topic_category_map, arxiv_cat_lookup
Example #2
def read_process_data():
    papers = read_papers()
    paper_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    paper_orgs["year"] = [x.year for x in paper_orgs["date"]]

    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)

    return papers, paper_orgs, topic_mix
Example #3
def read_process_data():
    papers = read_papers()
    papers_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)
    vectors = read_vectors().pivot_table(index="article_id",
                                         columns="dimension",
                                         values="value")

    return papers, papers_orgs, topic_mix, vectors
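A minimal sketch of what the pivot_table call above does, assuming read_vectors() returns a long-format table with article_id, dimension and value columns (the toy data below is invented for illustration):

import pandas as pd

# Toy long-format vectors: one row per (article, dimension) pair
long_vectors = pd.DataFrame({
    "article_id": ["a1", "a1", "a2", "a2"],
    "dimension": [0, 1, 0, 1],
    "value": [0.1, 0.9, 0.4, 0.6],
})

# Same reshape as in the example: one row per article, one column per dimension
wide = long_vectors.pivot_table(index="article_id",
                                columns="dimension",
                                values="value")
# dimension     0    1
# article_id
# a1          0.1  0.9
# a2          0.4  0.6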
Example #4
def read_process_data():
    papers = read_papers()
    topic_mix = (
        remove_zero_axis(  # We remove a couple of papers with zero in all topics
            read_topic_mix().set_index("article_id")))

    logging.info("Process dfs")
    papers["year"] = [x.year for x in papers["date"]]

    return papers, topic_mix
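The year column above is built with a list comprehension over date, which assumes the column holds datetime-like values; a minimal equivalent on invented toy data (the vectorised .dt.year accessor does the same job when the column has a datetime dtype):

import pandas as pd

papers = pd.DataFrame({"date": pd.to_datetime(["2019-03-01", "2020-07-15"])})

# Same as the list comprehension in the example
papers["year"] = [x.year for x in papers["date"]]

# Equivalent vectorised form
papers["year"] = papers["date"].dt.year  # [2019, 2020]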
Example #5
def read_process_data():
    """Reads and processes the data"""
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    papers = read_papers()
    topic_long = read_topic_long()
    topic_mix = read_topic_mix()
    cats = read_arxiv_categories()

    # Sets of article ids for each arXiv category
    cat_sets = cats.groupby(["category_id"])["article_id"].apply(lambda x: set(x))

    # Identify papers assigned to exactly one arXiv category
    one_cat_ps = cats.groupby("article_id")["category_id"].apply(lambda x: len(x))
    one_cat_ids = set(one_cat_ps.loc[one_cat_ps == 1].index)

    return papers, topic_mix, topic_long, cats, cat_sets, one_cat_ids, arxiv_cat_lookup
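A small sketch, on invented toy data, of the two groupby steps above: building a set of article ids per category and keeping only papers assigned to exactly one category:

import pandas as pd

# Toy category assignments: one row per (article, category) pair
cats = pd.DataFrame({
    "article_id": ["a1", "a1", "a2", "a3"],
    "category_id": ["cs.AI", "cs.LG", "cs.AI", "stat.ML"],
})

# Set of article ids for each category
cat_sets = cats.groupby("category_id")["article_id"].apply(lambda x: set(x))
# cs.AI -> {'a1', 'a2'}, cs.LG -> {'a1'}, stat.ML -> {'a3'}

# Papers with exactly one category
one_cat_ps = cats.groupby("article_id")["category_id"].apply(lambda x: len(x))
one_cat_ids = set(one_cat_ps.loc[one_cat_ps == 1].index)
# {'a2', 'a3'}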
Example #6
def read_process_data():
    papers = read_papers()
    paper_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)

    # topic_long = read_topic_long()
    topic_category_map = read_topic_category_map()
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    topic_list = topic_mix.columns

    return (
        papers,
        paper_orgs,
        topic_mix,
        topic_category_map,
        arxiv_cat_lookup,
        topic_list,
    )
Example #7
def load_process_data():
    """Loads AI paper data for analysis in section 1."""
    logging.info("Reading data")

    arxiv_cat_lookup = read_arxiv_cat_lookup()
    papers = read_papers()
    topic_long = read_topic_long()
    topic_mix = read_topic_mix()
    cats = read_arxiv_categories()

    logging.info("Reading tokenised abstracts")
    with open(f"{project_dir}/data/interim/arxiv_tokenised.json", "r") as infile:
        arxiv_tokenised = json.load(infile)

    logging.info("Reading AI labelling outputs")
    with open(f"{project_dir}/data/interim/find_ai_outputs.p", "rb") as infile:
        ai_indices, term_counts = pickle.load(infile)

    logging.info("Processing")
    papers["tokenised"] = papers["article_id"].map(arxiv_tokenised)

    # Create category sets to identify papers in different categories
    ai_cats = ["cs.AI", "cs.NE", "stat.ML", "cs.LG"]
    cat_sets = cats.groupby("category_id")["article_id"].apply(lambda x: set(x))

    # Create one hot encodings for AI categories
    ai_binary = pd.DataFrame(index=set(cats["article_id"]), columns=ai_cats)

    for c in ai_binary.columns:
        ai_binary[c] = [x in cat_sets[c] for x in ai_binary.index]

    # Create arxiv dataset
    papers.set_index("article_id", inplace=True)

    # We remove papers without abstracts and arXiv categories
    arx = pd.concat([ai_binary, papers], axis=1, sort=True).dropna(
        axis=0, subset=["abstract", "cs.AI"]
    )

    return arx, ai_indices, term_counts, arxiv_cat_lookup, cat_sets, cats, ai_cats
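A brief sketch of the one-hot step above, using invented article ids and category sets; each column flags whether a paper appears in the corresponding arXiv category set:

import pandas as pd

ai_cats = ["cs.AI", "cs.NE", "stat.ML", "cs.LG"]

# Toy category sets, standing in for the groupby output above
cat_sets = {
    "cs.AI": {"a1", "a2"},
    "cs.NE": set(),
    "stat.ML": {"a2"},
    "cs.LG": {"a1"},
}

ai_binary = pd.DataFrame(index=["a1", "a2", "a3"], columns=ai_cats)
for c in ai_binary.columns:
    ai_binary[c] = [x in cat_sets[c] for x in ai_binary.index]
# a1:  True  False  False   True
# a2:  True  False   True  False
# a3: False  False  False  False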
Example #8
webd = altair_visualisation_setup()

# ### Read data

# +
papers = (read_papers(
    keep_vars=['article_id', 'year', 'date', 'is_ai', 'citation_count']).query(
        "is_ai == True").reset_index(drop=True))

porgs = read_papers_orgs()

orgs = (paper_orgs_processing(
    porgs, papers).query("is_ai==True").reset_index(drop=True))

tm = read_topic_mix()
# -

# ### Create analytical table

# +
# AI papers with private orgs

ai_comp = pipe(orgs.query("org_type=='Company'")['article_id'], set)
ai_num = orgs.groupby('article_id').size()
# -

papers_an = (papers.loc[papers['article_id'].isin(set(
    orgs['article_id']))].query("is_ai == True").assign(
        is_comp=lambda x: x['article_id'].isin(ai_comp)).assign(
            num_auth=lambda x: x['article_id'].map(ai_num)).reset_index(