def arxiv_articles(chunk=None):
    '''arxiv_articles
    Get ArXiv article data, either every chunk lazily or one chunk directly.

    Parameters
    ----------
    chunk : int, optional
        Specify a particular chunk of data to download. There are 17 chunks
        numbered 0 through 16. When omitted (``None``), all chunks are
        iterated in order.

    Returns
    -------
    generator
        When ``chunk`` is None: a generator that loads and yields one chunk
        per iteration (documented as chunks of 100,000 articles).
    or
    DataFrame
        When ``chunk`` is given: the result of loading that single chunk.
    '''
    # The documented chunk numbering is 0..16 inclusive, i.e. 17 chunks.
    # The previous loop used range(16) and therefore never reached chunk 16.
    n_chunks = 17

    def _chunk_key(index):
        # Key of one chunk file; the index is zero-padded to two digits.
        return f'{folder}/arxiv_articles_{index:02}.pkl.bz2'

    if chunk is None:
        # Return a generator *object* instead of using ``yield`` in this
        # function body. A ``yield`` anywhere in the body would turn the
        # whole function into a generator function, and the single-chunk
        # ``return`` below would then hand back an empty generator rather
        # than the loaded chunk.
        return (load_df_pkl(bucket, _chunk_key(i)) for i in range(n_chunks))

    return load_df_pkl(bucket, _chunk_key(chunk))
def patents_10k():
    '''patents_10k
    Load the pre-selected sample of 10,000 patents from ONS.

    Returns
    -------
    The object produced by ``load_df_pkl`` for the sample file
    (presumably a DataFrame, as with the other loaders - confirm).
    '''
    sample_key = 'ons/ONS_y02_sample_10000.pkl.bz2'
    # This loader pins its own bucket name locally instead of reading a
    # module-level ``bucket``.
    sample_bucket = 'innovation-mapping-tutorials'
    return load_df_pkl(sample_bucket, sample_key)
def cordis_table(table):
    '''cordis_table
    Load a single table from the CORDIS database.

    Parameters
    ----------
    table : str
        Name of the table to load. Tables available include:
            - organisations
            - project_organisations
            - project_proposal_calls
            - project_topics
            - projects
            - proposal_calls
            - publications
            - reports
            - topics

    Returns
    -------
    DataFrame
        A dataframe containing the CORDIS table data.
    '''
    # The table name is interpolated straight into the file key.
    table_key = f'{folder}/cordis_{table}.pkl.bz2'
    return load_df_pkl(bucket, table_key)
def mag_table(table='fields_of_study'):
    '''mag_table
    Load a table from the Microsoft Academic Graph database.

    Parameters
    ----------
    table : str
        Name of the Microsoft Academic Graph table to load. Defaults to
        ``'fields_of_study'``. Tables available include:
            - fields_of_study

    Returns
    -------
    DataFrame
        A dataframe containing the Microsoft Academic Graph table data.
    '''
    # The table name is interpolated straight into the file key.
    table_key = f'{folder}/mag_{table}.pkl.bz2'
    return load_df_pkl(bucket, table_key)
def grid_table(table):
    '''grid_table
    Load a table from the GRID database.

    Parameters
    ----------
    table : str
        Name of the GRID table to load. Tables available include:
            - aliases
            - institutes

    Returns
    -------
    DataFrame
        A dataframe containing the GRID table data.
    '''
    # The table name is interpolated straight into the file key.
    table_key = f'{folder}/grid_{table}.pkl.bz2'
    return load_df_pkl(bucket, table_key)
def arxiv_table(table):
    '''arxiv_table
    Load one of the ArXiv tables.

    Parameters
    ----------
    table : str
        Name of the ArXiv table to load. Tables available include:
            - article_categories
            - article_corex_topics
            - article_fields_of_study
            - article_institutes
            - categories
            - corex_topics

    Returns
    -------
    DataFrame
        A dataframe containing the ArXiv table data.
    '''
    # The table name is interpolated straight into the file key.
    table_key = f'{folder}/arxiv_{table}.pkl.bz2'
    return load_df_pkl(bucket, table_key)
def gtr_link_table(table):
    '''gtr_link_table
    Load a link table from the Gateway to Research (GtR) database.

    Link tables link project ids to other entities within GtR.

    Parameters
    ----------
    table : str
        Name of the link table to load. Tables available include:
            - funds
            - organisations
            - organisations_locations
            - outcomes_artisticandcreativeproducts
            - outcomes_collaborations
            - outcomes_disseminations
            - outcomes_furtherfundings
            - outcomes_impactsummaries
            - outcomes_intellectualproperties
            - outcomes_keyfindings
            - outcomes_policyinfluences
            - outcomes_products
            - outcomes_publications
            - outcomes_researchdatabaseandmodels
            - outcomes_researchmaterials
            - outcomes_softwareandtechnicalproducts
            - outcomes_spinouts
            - participant
            - persons
            - topic

    Returns
    -------
    DataFrame
        A dataframe containing the GtR link table data.
    '''
    # Link tables sit under a ``link_tables/`` prefix and carry a ``_link``
    # suffix in the file name.
    link_key = f'{folder}/link_tables/gtr_{table}_link.pkl.bz2'
    return load_df_pkl(bucket, link_key)
def patents_10k():
    '''patents_10k
    Load the pre-selected sample of 10,000 patents from ONS.

    Returns
    -------
    The object produced by ``load_df_pkl`` for the sample file
    (presumably a DataFrame, as with the other loaders - confirm).
    '''
    # NOTE(review): a function with this same name is defined earlier in the
    # visible source with a hard-coded bucket name. If both definitions live
    # in one module, this later one replaces it and reads the module-level
    # ``bucket`` instead - confirm that is intended.
    sample_key = 'ons/ONS_y02_sample_10000.pkl.bz2'
    return load_df_pkl(bucket, sample_key)