def split_csvs(notebooks, repos, owners, local):
    """ Shuffle repos and split notebooks/repos/owners into 10 CSV partitions. """
    # Shuffle repositories.
    repos = repos.sample(frac=1).reset_index(drop=True)

    # Randomly assign repos and the notebooks/owners that go with them.
    partition_repos = np.array_split(repos, 10)
    for i in range(10):
        repos_i = partition_repos[i]
        if local:
            repos_i.to_csv('../csv/repos1b_{0}.csv'.format(i))
        else:
            df_to_s3(repos_i, 'csv/repos1b_{0}.csv'.format(i))

        notebooks_i = notebooks[
            notebooks.repo_id.isin(repos_i["repo_id"])
        ].reset_index(drop=True)
        if local:
            notebooks_i.to_csv('../csv/notebooks1b_{0}.csv'.format(i))
        else:
            df_to_s3(notebooks_i, 'csv/notebooks1b_{0}.csv'.format(i))

        owners_i = owners[owners.owner_id.isin(repos_i["owner_id"])]
        if local:
            owners_i.to_csv('../csv/owners1b_{0}.csv'.format(i))
        else:
            df_to_s3(owners_i, 'csv/owners1b_{0}.csv'.format(i))
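
# split_csvs relies on df_to_s3 and s3_to_df, which are assumed to be defined
# elsewhere in this module (they are only called here). A minimal sketch of
# what they might look like, assuming a boto3 resource named `s3` and the
# "notebook-research" bucket used later in this file:
#
#     import io
#
#     def df_to_s3(df, key):
#         # Serialize the DataFrame to CSV in memory and upload it.
#         body = df.to_csv(index=False).encode("utf-8")
#         s3.Object("notebook-research", key).put(Body=body)
#
#     def s3_to_df(key):
#         # Download a CSV object and parse it back into a DataFrame.
#         obj = s3.Object("notebook-research", key)
#         return pd.read_csv(io.BytesIO(obj.get()["Body"].read()))
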
def update_split_csvs(notebooks, repos, owners, local):
    """ Distribute newly found notebooks across the existing 10 partitions. """
    all_notebook_files = []
    all_notebooks = []
    for i in range(10):
        notebooks_i = s3_to_df('csv/notebooks1_{0}.csv'.format(i))
        all_notebook_files += list(notebooks_i.file)
        all_notebooks.append(notebooks_i)

    # Isolate new notebooks and shuffle.
    new_notebooks = notebooks[~notebooks.file.isin(all_notebook_files)]
    print('There are {0} new notebooks to distribute.'.format(len(new_notebooks)))
    new_notebooks = new_notebooks.sample(frac=1).reset_index(drop=True)

    # Split up and add to the existing CSVs.
    partition_new_notebooks = np.array_split(new_notebooks, 10)
    for i in range(10):
        new_notebooks_i = partition_new_notebooks[i]
        old_notebooks_i = all_notebooks[i]
        notebooks_i = pd.concat([old_notebooks_i, new_notebooks_i])
        if local:
            notebooks_i.to_csv('../csv/notebooks1b_{0}.csv'.format(i))
        else:
            df_to_s3(notebooks_i, 'csv/notebooks1b_{0}.csv'.format(i))

        repos_i = repos[
            repos.repo_id.isin(notebooks_i.repo_id)
        ].reset_index(drop=True)
        if local:
            repos_i.to_csv('../csv/repos1b_{0}.csv'.format(i))
        else:
            df_to_s3(repos_i, 'csv/repos1b_{0}.csv'.format(i))

        owners_i = owners[
            owners.owner_id.isin(notebooks_i.owner_id)
        ].reset_index(drop=True)
        if local:
            owners_i.to_csv('../csv/owners1b_{0}.csv'.format(i))
        else:
            df_to_s3(owners_i, 'csv/owners1b_{0}.csv'.format(i))
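
# A minimal usage sketch (assumption: the aggregate notebooks1/repos1/owners1
# CSVs produced by clean_metadata below already exist on S3):
#
#     notebooks = s3_to_df('csv/notebooks1.csv')
#     repos = s3_to_df('csv/repos1.csv')
#     owners = s3_to_df('csv/owners1.csv')
#     split_csvs(notebooks, repos, owners, local=False)         # first split
#     update_split_csvs(notebooks, repos, owners, local=False)  # later updates
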
def clean_metadata(num_needed, updating, local):
    """
    Extract information from metadata JSON files and save to CSVs.

    Equivalent to Adam's 1_nb_metadata_cleaning.ipynb.
    """
    try:
        if local:
            # Local CSVs are read from the same location they are saved to below.
            notebooks_done = pd.read_csv("{0}/notebooks1.csv".format(PATH))
            owners_done = pd.read_csv("{0}/owners1.csv".format(PATH))
            repos_done = pd.read_csv("{0}/repos1.csv".format(PATH))
        else:
            notebooks_done = s3_to_df("csv/notebooks1.csv")
            owners_done = s3_to_df("csv/owners1.csv")
            repos_done = s3_to_df("csv/repos1.csv")
        notebook_files_done = set(notebooks_done.file)
        owner_ids_done = set(owners_done.owner_id)
        repo_ids_done = set(repos_done.repo_id)
        print(
            'Metadata already processed for {0} notebooks, {1} owners, and {2} repos.'
            .format(len(notebook_files_done), len(owner_ids_done), len(repo_ids_done)))
    except Exception:
        # No prior results: start from empty records so the concats below still work.
        notebooks_done = pd.DataFrame()
        owners_done = pd.DataFrame()
        repos_done = pd.DataFrame()
        notebook_files_done = []
        owner_ids_done = []
        repo_ids_done = []
        print("Metadata not processed for any files.")

    # Get all query files.
    if local:
        nb_search_files = os.listdir(JSON_PATH)
    else:
        nb_search_files = list_s3_dir('json/')

    # Sort query files by size, then by page number.
    nb_search_files = sorted(
        nb_search_files,
        key=lambda x: (int(x.split("_")[2].split("..")[0]),
                       int(x.split("_")[3][1:].split(".")[0])))
    debug_print("We have {0} query files.".format(len(nb_search_files)))

    notebooks = {}
    repos = {}
    owners = {}
    for j, json_file_name in enumerate(nb_search_files):
        # Keep track of progress (every COUNT_TRIGGER // 100 files).
        if (j + 1) % (COUNT_TRIGGER // 100) == 0 or j + 1 == len(nb_search_files):
            debug_print("{0} / {1} data files processed".format(
                j + 1, len(nb_search_files)))

        # Parse the file name to get size and query page.
        file_components = json_file_name.replace(".json", "").split("_")
        filesize = file_components[2]
        query_page = int(file_components[3][1:])

        if local:
            with open(JSON_PATH + json_file_name, "r") as json_file:
                file_dict = json.load(json_file)
        else:
            obj = s3.Object("notebook-research", "json/{0}".format(json_file_name))
            file_dict = json.loads(obj.get()["Body"].read().decode("UTF-8"))

        # Report missing data.
        if "incomplete_results" in file_dict:
            if file_dict["incomplete_results"]:
                msg = "{0} has incomplete results".format(json_file_name)
                write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        days_since = file_dict["days_since"]

        if "items" in file_dict:
            if len(file_dict["items"]) == 0:
                msg = "{0} has 0 items".format(json_file_name)
                write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)
            else:
                # Save data for each item.
                for i in range(len(file_dict["items"])):
                    item = file_dict["items"][i]
                    item_repo = item["repository"]
                    repo_id = item_repo["id"]
                    owner_id = item_repo["owner"]["id"]

                    # Don't save forked notebooks.
                    if item_repo["fork"]:
                        continue

                    # The full path is unique for each file.
                    name = "{0}/{1}/{2}".format(
                        item_repo["owner"]["login"],
                        item_repo["name"],
                        item["path"]).replace("/", "..")

                    if name not in notebook_files_done:
                        notebook = {
                            "file": name,
                            "html_url": item["html_url"],
                            "name": item["name"],
                            "path": item["path"],
                            "repo_id": repo_id,
                            "owner_id": owner_id,
                            "filesize": filesize,
                            "query_page": query_page,
                            "days_since": days_since
                        }
                        notebooks[name] = notebook

                    if repo_id not in repos and repo_id not in repo_ids_done:
                        repo = {
                            "repo_name": item_repo["name"],
                            "owner_id": owner_id,
                            "repo_description": item_repo["description"],
                            "repo_fork": item_repo["fork"],
                            "repo_html_url": item_repo["html_url"],
                            "repo_private": item_repo["private"],
                        }
                        repos[repo_id] = repo

                    if owner_id not in owners and owner_id not in owner_ids_done:
                        owner = {
                            "owner_html_url": item_repo["owner"]["html_url"],
                            "owner_login": item_repo["owner"]["login"],
                        }
                        owners[owner_id] = owner

                    # If updating, we don't always need the full page.
                    if updating and len(notebooks) == num_needed:
                        break
        else:
            msg = "{0} has no items object".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        if updating and len(notebooks) == num_needed:
            break

    # Display status.
    debug_print(("\nAfter processing all query files, "
                 "we have {0} new notebooks.").format(len(notebooks)))
    debug_print("Written by {0} owners.".format(len(owners)))
    debug_print("Held in {0} repositories.".format(len(repos)))

    # Translate dictionaries to DataFrames and save to CSV.
    # Ordered by days_since; for duplicates, keep the most recent row
    # (keep="last", the one found more days since 1-1-19).
    notebooks_df = pd.DataFrame(notebooks).transpose()\
        .sort_values(by=["days_since", "file"])\
        .drop_duplicates(subset=["file"], keep="last")
    owners_df = pd.DataFrame(owners).transpose().reset_index().rename(
        columns={"index": "owner_id"}, index=str)
    repos_df = pd.DataFrame(repos).transpose().reset_index().rename(
        columns={"index": "repo_id"}, index=str)

    if local:
        pd.concat([notebooks_df, notebooks_done]).to_csv(
            "{0}/notebooks1.csv".format(PATH), index=False)
        pd.concat([owners_df, owners_done]).to_csv(
            "{0}/owners1.csv".format(PATH), index=False)
        pd.concat([repos_df, repos_done]).to_csv(
            "{0}/repos1.csv".format(PATH), index=False)
    else:
        df_to_s3(pd.concat([notebooks_df, notebooks_done]), "csv/notebooks1.csv")
        df_to_s3(pd.concat([owners_df, owners_done]), "csv/owners1.csv")
        df_to_s3(pd.concat([repos_df, repos_done]), "csv/repos1.csv")
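
# clean_metadata's sort key and file_components parsing assume query file
# names of the form:
#
#     <name>_<name>_<minsize>..<maxsize>_p<page>.json
#
# For example, a hypothetical file "github_nb_140..145_p3.json" would give
# filesize = "140..145" and query_page = 3 (the actual naming comes from
# query_git.py and is not assumed here).
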
def main():
    # Parse command line arguments.
    parser = argparse.ArgumentParser()
    parser.add_argument("--local", action="store_const",
                        dest="local", const=True, default=False,
                        help="Stores results locally instead of using S3.")
    parser.add_argument("--worker", metavar="N", type=int,
                        help=("GITHUB_TOKEN assigned to these sizes (workers "
                              "sorted in alphabetical order: {0}).").format(
                                  list(TOKENS.keys())))
    args = parser.parse_args()
    local = args.local
    worker = args.worker

    # If running in parallel, mark CSV files with the worker number.
    global EXTENSION
    EXTENSION = '_{0}'.format(worker) if worker is not None else ''
    print('EXTENSION', EXTENSION)

    start = datetime.datetime.now()

    # List of saved CSV files.
    if local:
        current_csvs = set(os.listdir(PATH))
    else:
        current_csvs = list_s3_dir(S3_PATH)

    # Open basic data from the JSON files (files created in query_git.py).
    if set([
            "notebooks1{0}.csv".format(EXTENSION),
            "repos1{0}.csv".format(EXTENSION),
            "owners1{0}.csv".format(EXTENSION)
    ]).issubset(current_csvs):
        notebooks1 = get_df("notebooks1{0}.csv".format(EXTENSION), local)
        owners1 = get_df("owners1{0}.csv".format(EXTENSION), local)
        repos1 = get_df("repos1{0}.csv".format(EXTENSION), local)
    else:
        debug_print("Notebooks1, Owners1, and Repos1 were not found.")
        sys.exit(0)
    debug_print("Notebooks1, Owners1, and Repos1 were found and opened." + BREAK)

    ### Add information for repositories and owners. ##################
    save = False
    if not set([
            "owners2{0}.csv".format(EXTENSION),
            "repos2{0}.csv".format(EXTENSION)
    ]).issubset(current_csvs):
        owners2, repos2 = update_owners_repos(owners1, repos1, local)
        save = True
    else:
        try:
            owners2_old = get_df("owners2{0}.csv".format(EXTENSION), local)
            repos2_old = get_df("repos2{0}.csv".format(EXTENSION), local)
            debug_print(
                "Found and opened data for {0} owners and {1} repos.".format(
                    len(owners2_old), len(repos2_old)))
        except Exception:
            owners2_old = []
            repos2_old = []

        if len(owners2_old) > 0 and len(repos2_old) > 0:
            owners1_new = owners1[~owners1.owner_id.isin(owners2_old.owner_id)]
            repos1_new = repos1[~repos1.repo_id.isin(repos2_old.repo_id)]
        else:
            owners1_new = owners1
            repos1_new = repos1
        debug_print("Collecting data for {0} owners and {1} repos.".format(
            len(owners1_new), len(repos1_new)))

        if len(owners1_new) > 0 and len(repos1_new) > 0:
            owners2_new, repos2_new = update_owners_repos(
                owners1_new, repos1_new, local)
            if len(owners2_new) > 0 and len(repos2_new) > 0:
                owners2 = pd.concat([owners2_old, owners2_new
                                     ]).drop_duplicates(subset='owner_id')
                repos2 = pd.concat([repos2_old, repos2_new
                                    ]).drop_duplicates(subset='repo_id')
                save = True  # Persist the newly merged data below.
            else:
                owners2 = owners2_old
                repos2 = repos2_old
        else:
            owners2 = owners2_old
            repos2 = repos2_old

    # Save.
    if save:
        debug_print("Saving combined data for {0} owners and {1} repos".format(
            len(owners2), len(repos2)))
        if local:
            owners2.to_csv("{0}/owners2{1}.csv".format(PATH, EXTENSION), index=False)
            repos2.to_csv("{0}/repos2{1}.csv".format(PATH, EXTENSION), index=False)
        else:
            df_to_s3(owners2, "{0}/owners2{1}.csv".format(S3_PATH, EXTENSION))
            df_to_s3(repos2, "{0}/repos2{1}.csv".format(S3_PATH, EXTENSION))
    debug_print("Owners2 and Repos2 were created and saved.\n" + BREAK)

    ## Add data on cells within each notebook. #######################
    if not set(["notebooks2{0}.csv".format(EXTENSION)]).issubset(current_csvs):
        print("Notebooks2 not found, creating from scratch.")
        get_all_nb_cells(notebooks1, local, 0)
    else:
        # Get existing data.
        try:
            notebooks2_old = get_df("notebooks2{0}.csv".format(EXTENSION), local)
            debug_print(
                "Found and opened notebook data for {0} notebooks.".format(
                    len(notebooks2_old)))
        except Exception as e:
            notebooks2_old = []
            print("Notebooks2 could not be opened, creating from scratch.")
            print(type(e), e)

        # Isolate rows of notebooks1 corresponding to new notebooks.
        if len(notebooks2_old) > 0:
            notebooks1_new = notebooks1[~notebooks1.file.isin(notebooks2_old.file)]
        else:
            notebooks1_new = notebooks1
        debug_print("Collecting data for {0} notebooks.".format(
            len(notebooks1_new)))

        # If there are new notebooks, add cell data.
        if len(notebooks1_new) > 0:
            get_all_nb_cells(notebooks1_new, local, len(notebooks2_old))
        del notebooks2_old

    # Check time and report status.
    end = datetime.datetime.now()
    debug_print("TOTAL TIME: {0}".format(end - start))
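
# get_df is assumed to be a small helper defined elsewhere in this module
# that loads a CSV either from the local PATH directory or from S3,
# depending on the `local` flag. A minimal sketch under that assumption:
#
#     def get_df(file_name, local):
#         if local:
#             return pd.read_csv("{0}/{1}".format(PATH, file_name))
#         return s3_to_df("{0}/{1}".format(S3_PATH, file_name))
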
def get_all_nb_cells(notebooks, local, done):
    """ Get cell and notebook data for each notebook. """
    new_nb_info = {}
    all_cells_info = {}
    missing = []

    for count, row in notebooks.iterrows():
        file_name = row["file"]
        data = None

        # Track progress and periodically flush results to CSV.
        if count % COUNT_TRIGGER == 0 or count == len(notebooks) - 1:
            print("{0} / {1} notebooks processed for cell data".format(
                count, len(notebooks) + done))

            # Save data and reset. (In chunks to avoid MemoryError.)
            if count > 0:
                # Transform data to DataFrames.
                notebooks_temp = pd.DataFrame(new_nb_info).transpose()
                cells_temp = pd.DataFrame(
                    all_cells_info).transpose().reset_index(drop=True)

                # Save data to CSV, using an integer chunk index in the name.
                chunk = count // COUNT_TRIGGER
                try:
                    if local:
                        notebooks_temp.to_csv(
                            "{0}/notebooks2_{1}_{2}.csv".format(
                                PATH, EXTENSION, chunk),
                            index=False)
                        cells_temp.to_csv(
                            "{0}/cells1_{1}_{2}.csv".format(
                                PATH, EXTENSION, chunk),
                            index=False)
                    else:
                        df_to_s3(
                            notebooks_temp,
                            "{0}/notebooks2_{1}_{2}.csv".format(
                                S3_PATH, EXTENSION, chunk))
                        df_to_s3(
                            cells_temp,
                            "{0}/cells1_{1}_{2}.csv".format(
                                S3_PATH, EXTENSION, chunk))
                except MemoryError:
                    # Split the notebooks into 4 pieces and the cells into 8,
                    # then try saving each piece again.
                    n_parts = [
                        notebooks_temp.iloc[k * len(notebooks_temp) // 4:
                                            (k + 1) * len(notebooks_temp) // 4]
                        for k in range(4)
                    ]
                    c_parts = [
                        cells_temp.iloc[k * len(cells_temp) // 8:
                                        (k + 1) * len(cells_temp) // 8]
                        for k in range(8)
                    ]
                    for k, n_k in enumerate(n_parts, start=1):
                        if local:
                            n_k.to_csv(
                                "{0}/notebooks2_{1}_{2}_{3}.csv".format(
                                    PATH, EXTENSION, chunk, k),
                                index=False)
                        else:
                            df_to_s3(
                                n_k,
                                "{0}/notebooks2_{1}_{2}_{3}.csv".format(
                                    S3_PATH, EXTENSION, chunk, k))
                    for k, c_k in enumerate(c_parts, start=1):
                        if local:
                            c_k.to_csv(
                                "{0}/cells1_{1}_{2}_{3}.csv".format(
                                    PATH, EXTENSION, chunk, k),
                                index=False)
                        else:
                            df_to_s3(
                                c_k,
                                "{0}/cells1_{1}_{2}_{3}.csv".format(
                                    S3_PATH, EXTENSION, chunk, k))

                # Empty current dictionaries.
                new_nb_info = {}
                all_cells_info = {}
                print("CSVs saved")

        # Initialize row of data.
        nb_info = {
            "file": file_name,
            "google_collab": False,
            "nbformat": "",
            "nbformat_minor": "",
            "num_cells": 0,
            "kernel_lang": "",
            "kernel_name": "",
            "lang_name": "",
            "lang_version": ""
        }

        # Open the notebook as JSON.
        try:
            obj = s3.Object("notebook-research",
                            "notebooks/{0}".format(file_name))
            data = json.loads(obj.get()["Body"].read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            msg = "Notebook {0} did not open.".format(file_name)
            write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
            missing.append(file_name)

            # Add a row with missing values.
            if file_name not in new_nb_info:
                new_nb_info[file_name] = nb_info
            continue

        # If data was able to load as JSON, extract information.
        if data and isinstance(data, dict):
            keys = data.keys()

            # Get top level nbformat metadata.
            if "nbformat" in keys:
                nb_info["nbformat"] = data["nbformat"]
            if "nbformat_minor" in keys:
                nb_info["nbformat_minor"] = data["nbformat_minor"]

            # Get info from the metadata dictionary.
            if ("metadata" in keys and data["metadata"] is not None
                    and isinstance(data["metadata"], dict)):
                metadata_keys = data["metadata"].keys()

                # Access kernel data.
                if ("kernelspec" in metadata_keys
                        and data["metadata"]["kernelspec"] is not None
                        and isinstance(data["metadata"]["kernelspec"], dict)):
                    kernel_keys = data["metadata"]["kernelspec"].keys()

                    # If a Google Colab notebook, only Python 2.7 or 3.6 are possible.
                    if "colab" in metadata_keys:
                        nb_info["google_collab"] = True
                        if ("name" in kernel_keys
                                and "display_name" in kernel_keys):
                            nb_info["kernel_lang"] = data["metadata"][
                                "kernelspec"]["name"]
                            nb_info["kernel_name"] = data["metadata"][
                                "kernelspec"]["display_name"]
                            if nb_info["kernel_lang"] == "python3":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "3.6"
                            elif nb_info["kernel_lang"] == "python2":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "2.7"

                    # Not Google Colab: access kernel language and display name.
                    else:
                        if "language" in kernel_keys:
                            nb_info["kernel_lang"] = data["metadata"][
                                "kernelspec"]["language"]
                        if "display_name" in kernel_keys:
                            nb_info["kernel_name"] = data["metadata"][
                                "kernelspec"]["display_name"]

                # Access language info.
                if ("language_info" in metadata_keys
                        and "colab" not in metadata_keys):
                    lang_keys = data["metadata"]["language_info"].keys()
                    if "name" in lang_keys:
                        nb_info["lang_name"] = data["metadata"][
                            "language_info"]["name"]
                    if "version" in lang_keys:
                        nb_info["lang_version"] = data["metadata"][
                            "language_info"]["version"]
                elif "language" in metadata_keys:
                    nb_info["lang_name"] = data["metadata"]["language"]

            # Get information about individual cells.
            cells_info = {}
            if "cells" in keys:
                nb_info["num_cells"] = len(data["cells"])
                cell_id = 0
                for cell in data["cells"]:
                    cell_info, nb_language = get_single_cell(
                        cell_id, file_name, cell, nb_info["lang_name"])
                    if nb_info["lang_name"] == "":
                        nb_info["lang_name"] = nb_language.lower()
                    if (file_name, cell_id) not in cells_info:
                        cells_info[(file_name, cell_id)] = cell_info
                    cell_id += 1
            elif "worksheets" in keys:
                # Older notebook formats store cells inside worksheets.
                cell_id = 0
                for w in data["worksheets"]:
                    for cell in w["cells"]:
                        cell_info, nb_language = get_single_cell(
                            cell_id, file_name, cell, nb_info["lang_name"])
                        if nb_info["lang_name"] == "":
                            nb_info["lang_name"] = nb_language.lower()
                        if (file_name, cell_id) not in cells_info:
                            cells_info[(file_name, cell_id)] = cell_info
                        cell_id += 1

            all_cells_info.update(cells_info)

        if file_name not in new_nb_info:
            new_nb_info[file_name] = nb_info

    debug_print("{0} notebooks are missing cell data.".format(len(missing)))
    return new_nb_info, all_cells_info
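
# get_single_cell is assumed to be defined elsewhere in this module; based on
# how it is called above, a minimal sketch of its contract (the real
# implementation inspects the full cell contents):
#
#     def get_single_cell(cell_id, file_name, cell, nb_language):
#         # Returns (cell_info, nb_language): a dict describing one cell,
#         # plus the notebook language as inferred from the cell.
#         # The dict keys shown here are a hypothetical minimal shape.
#         cell_info = {
#             "file": file_name,
#             "cell_id": cell_id,
#             "cell_type": cell.get("cell_type", ""),
#         }
#         return cell_info, nb_language
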