def read_process_data():
    """Reads papers, the topic mix, the topic-category map and the arXiv category lookup."""
    papers = read_papers()
    topic_mix = read_topic_mix()
    topic_category_map = read_topic_category_map()
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    return papers, topic_mix, topic_category_map, arxiv_cat_lookup
def read_process_data():
    """Reads papers, processed paper-organisation links and the topic mix."""
    papers = read_papers()
    paper_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    # Extract the publication year from the date
    paper_orgs["year"] = [x.year for x in paper_orgs["date"]]
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)
    return papers, paper_orgs, topic_mix
def read_process_data():
    """Reads papers, paper-organisation links, the topic mix and paper vectors."""
    papers = read_papers()
    papers_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)
    # Reshape vectors from long (article_id, dimension, value) to wide
    vectors = read_vectors().pivot_table(
        index="article_id", columns="dimension", values="value"
    )
    return papers, papers_orgs, topic_mix, vectors
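# A minimal sketch of the long-to-wide reshape used above, on hypothetical data;
# read_vectors() is assumed to return one row per (article_id, dimension, value).
import pandas as pd

long_vectors = pd.DataFrame(
    {
        "article_id": ["a1", "a1", "a2", "a2"],
        "dimension": [0, 1, 0, 1],
        "value": [0.1, 0.9, 0.4, 0.6],
    }
)
wide_vectors = long_vectors.pivot_table(
    index="article_id", columns="dimension", values="value"
)
# wide_vectors now has one row per article and one column per vector dimension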
def read_process_data():
    """Reads papers and a topic mix with all-zero papers removed."""
    papers = read_papers()
    # We remove a couple of papers with zero weight in all topics
    topic_mix = remove_zero_axis(read_topic_mix().set_index("article_id"))
    logging.info("Process dfs")
    papers["year"] = [x.year for x in papers["date"]]
    return papers, topic_mix
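# A minimal sketch of the remove_zero_axis helper called above; the repo's own
# implementation may differ. The assumption is that it drops rows and columns
# of the topic mix whose values sum to zero.
def remove_zero_axis(df):
    """Drops rows and columns where every value is zero."""
    return df.loc[df.sum(axis=1) > 0, df.sum(axis=0) > 0]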
def read_process_data(): """Reads and processes the data""" arxiv_cat_lookup = read_arxiv_cat_lookup() papers = read_papers() topic_long = read_topic_long() topic_mix = read_topic_mix() cats = read_arxiv_categories() # Paper cats cat_sets = cats.groupby(["category_id"])["article_id"].apply(lambda x: set(x)) # create a unique cat_sets one_cat_ps = cats.groupby("article_id")["category_id"].apply(lambda x: len(x)) one_cat_ids = set(one_cat_ps.loc[one_cat_ps == 1].index) return papers, topic_mix, topic_long, cats, cat_sets, one_cat_ids, arxiv_cat_lookup
def read_process_data():
    """Reads papers, paper-organisation links, the topic mix and category lookups."""
    papers = read_papers()
    paper_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    topic_mix = read_topic_mix()
    topic_mix.set_index("article_id", inplace=True)
    # topic_long = read_topic_long()
    topic_category_map = read_topic_category_map()
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    topic_list = topic_mix.columns
    return (
        papers,
        paper_orgs,
        topic_mix,
        topic_category_map,
        arxiv_cat_lookup,
        topic_list,
    )
def read_process_data():
    """Reads papers, paper-organisation links and the organisational diversity table."""
    papers = read_papers()
    papers_orgs = paper_orgs_processing(read_papers_orgs(), papers)
    # Date
    papers_orgs["year"] = [x.year for x in papers_orgs["date"]]
    # Org diversity df: build it first if it has not been cached yet
    org_div_path = f"{project_dir}/data/processed/org_diversity.csv"
    if not os.path.exists(org_div_path):
        logging.info("Making organisational diversity")
        make_org_diversity()
    else:
        logging.info("Reading organisational diversity")
    org_diversity = pd.read_csv(org_div_path)
    return papers, papers_orgs, org_diversity
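# Design note: caching the derived table means make_org_diversity() only runs on
# the first call; later calls read the CSV directly. Deleting
# data/processed/org_diversity.csv forces a rebuild.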
def load_process_data():
    """Loads AI paper data for analysis in section 1."""
    logging.info("Reading data")
    arxiv_cat_lookup = read_arxiv_cat_lookup()
    papers = read_papers()
    topic_long = read_topic_long()
    topic_mix = read_topic_mix()
    cats = read_arxiv_categories()

    logging.info("Reading tokenised abstracts")
    with open(f"{project_dir}/data/interim/arxiv_tokenised.json", "r") as infile:
        arxiv_tokenised = json.load(infile)

    logging.info("Reading AI labelling outputs")
    with open(f"{project_dir}/data/interim/find_ai_outputs.p", "rb") as infile:
        ai_indices, term_counts = pickle.load(infile)

    logging.info("Processing")
    papers["tokenised"] = papers["article_id"].map(arxiv_tokenised)

    # Create category sets to identify papers in different categories
    ai_cats = ["cs.AI", "cs.NE", "stat.ML", "cs.LG"]
    cat_sets = cats.groupby("category_id")["article_id"].apply(lambda x: set(x))

    # Create one-hot encodings for AI categories
    ai_binary = pd.DataFrame(index=set(cats["article_id"]), columns=ai_cats)
    for c in ai_binary.columns:
        ai_binary[c] = [x in cat_sets[c] for x in ai_binary.index]

    # Create arXiv dataset
    papers.set_index("article_id", inplace=True)

    # We remove papers without abstracts and arXiv categories
    arx = pd.concat([ai_binary, papers], axis=1, sort=True).dropna(
        axis=0, subset=["abstract", "cs.AI"]
    )

    return arx, ai_indices, term_counts, arxiv_cat_lookup, cat_sets, cats, ai_cats
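# Illustrative usage, assuming the same imports as the module above; the unpacking
# mirrors the function's return signature.
arx, ai_indices, term_counts, arxiv_cat_lookup, cat_sets, cats, ai_cats = (
    load_process_data()
)
# Share of papers tagged with at least one of the four AI categories
logging.info(arx[ai_cats].any(axis=1).mean())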
def read_process_data():
    """Reads papers and raw paper-organisation links."""
    papers = read_papers()
    paper_orgs = read_papers_orgs()
    return papers, paper_orgs
from narrowing_ai_research import project_dir
import statsmodels.api as sm
from statsmodels.api import add_constant
from sklearn.decomposition import PCA
import altair as alt
from narrowing_ai_research.utils.altair_utils import (
    altair_visualisation_setup,
    save_altair,
)

# -

webd = altair_visualisation_setup()

# ### Read data

# +
papers = (
    read_papers(keep_vars=['article_id', 'year', 'date', 'is_ai', 'citation_count'])
    .query("is_ai == True")
    .reset_index(drop=True)
)

porgs = read_papers_orgs()

orgs = (
    paper_orgs_processing(porgs, papers)
    .query("is_ai==True")
    .reset_index(drop=True)
)

tm = read_topic_mix()
# -

# ### Create analytical table

# +
# AI papers with private orgs