def download_nbs(notebooks, local, current_files):
    """ Download notebooks from GitHub.

    Equivalent to Adam's 2_nb_download.ipynb.
    """
    debug_print("Downloading notebooks\n")
    already_done = 0
    checkpoints = 0
    new = 0
    count = 0
    for _, row in notebooks.sort_values(by="days_since").iterrows():
        date_string = datetime.datetime.now().strftime(r"%Y-%m-%d %H:%M:%S")

        # Keep track of the download progress.
        if count % COUNT_TRIGGER == 0 or count == len(notebooks):
            debug_print("{0} / {1} notebooks downloaded.".format(
                count, len(notebooks)))
        count += 1

        # Don't download files we already have.
        if row["file"] in current_files:
            already_done += 1
            continue

        # Don't download files in .ipynb_checkpoints.
        if ".ipynb_checkpoints" in row["html_url"]:
            checkpoints += 1
            continue

        try:
            # Access the raw content page and download the file.
            raw_url = row["html_url"].replace(
                "github.com", "raw.githubusercontent.com").replace("/blob", "")
            r = requests.get(raw_url)

            # Save the file.
            if local:
                filename = "../data/notebooks/{0}".format(row["file"])
                with open(filename, "w") as nb_file:
                    nb_file.write(r.text)
            else:
                obj = s3.Object("notebook-research",
                                "notebooks/{0}".format(row["file"]))
                obj.put(Body=bytes(r.text.encode("UTF-8")))
            new += 1

            msg = "{0}: downloaded {1}".format(date_string, row["file"])
            write_to_log("../logs/nb_log.txt", msg)
        except Exception:
            # Report missed files.
            msg = "{0}: had trouble downloading {1}".format(
                date_string, row["file"])
            write_to_log("../logs/nb_log.txt", msg)
            debug_print(msg)

    debug_print(
        "{0} were already done. {1} were in ipynb checkpoints. "
        "{2} new notebooks were downloaded.".format(already_done, checkpoints, new))
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--update", action="store_const",
        dest="updating", const=True, default=False,
        help=("Search notebooks that have been added "
              "or updated since the last search, along with new notebooks."))
    parser.add_argument(
        "--local", action="store_const",
        dest="local", const=True, default=False,
        help="Saves output locally instead of in S3.")
    parser.add_argument("--needed", metavar="num_needed", type=int)
    args = parser.parse_args()
    updating = args.updating
    local = args.local
    num_needed = args.needed

    # If --needed was not given, sum the counts saved by the query workers.
    if num_needed is None:
        num_needed = 0
        for i in range(NUM_WORKERS):
            try:
                with open("num_needed_{0}.save".format(i), "r") as f:
                    num_needed += int(f.readlines()[0])
            except Exception:
                print("parallelize_query.py was not completed.")
                print("Please complete the query and try again.")
                sys.exit(0)

    clean_metadata(num_needed, updating, local)

    debug_print("Notebooks1, Owners1, and Repos1 were created and saved.")
def clean_metadata(num_needed, updating, local):
    """ Extract information from metadata JSON files and save to CSVs.

    Equivalent to Adam's 1_nb_metadata_cleaning.ipynb.
    """
    # Open metadata that has already been processed, if any.
    try:
        if local:
            notebooks_done = pd.read_csv("{0}/notebooks1.csv".format(PATH))
            owners_done = pd.read_csv("{0}/owners1.csv".format(PATH))
            repos_done = pd.read_csv("{0}/repos1.csv".format(PATH))
        else:
            notebooks_done = s3_to_df("csv/notebooks1.csv")
            owners_done = s3_to_df("csv/owners1.csv")
            repos_done = s3_to_df("csv/repos1.csv")
        notebook_files_done = set(notebooks_done.file)
        owner_ids_done = set(owners_done.owner_id)
        repo_ids_done = set(repos_done.repo_id)
        print("Metadata already processed for {0} notebooks, {1} owners, "
              "and {2} repos.".format(len(notebook_files_done),
                                      len(owner_ids_done),
                                      len(repo_ids_done)))
    except Exception:
        notebooks_done = pd.DataFrame()
        owners_done = pd.DataFrame()
        repos_done = pd.DataFrame()
        notebook_files_done = set()
        owner_ids_done = set()
        repo_ids_done = set()
        print("Metadata not processed for any files.")

    # Get all query files.
    if local:
        nb_search_files = os.listdir(JSON_PATH)
    else:
        nb_search_files = list_s3_dir("json/")

    # Sort query files by size, then by page number.
    nb_search_files = sorted(
        nb_search_files,
        key=lambda x: (
            int(x.split("_")[2].split("..")[0]),
            int(x.split("_")[3][1:].split(".")[0])
        )
    )
    debug_print("We have {0} query files.".format(len(nb_search_files)))

    notebooks = {}
    repos = {}
    owners = {}
    for j, json_file_name in enumerate(nb_search_files):
        # Keep track of progress.
        if (j + 1) % (COUNT_TRIGGER // 100) == 0 or j + 1 == len(nb_search_files):
            debug_print("{0} / {1} data files processed".format(
                j + 1, len(nb_search_files)))

        # Parse the file name to get the size and query page.
        file_components = json_file_name.replace(".json", "").split("_")
        filesize = file_components[2]
        query_page = int(file_components[3][1:])

        if local:
            with open(JSON_PATH + json_file_name, "r") as json_file:
                file_dict = json.load(json_file)
        else:
            obj = s3.Object("notebook-research",
                            "json/{0}".format(json_file_name))
            file_dict = json.loads(obj.get()["Body"].read().decode("UTF-8"))

        # Report incomplete results.
        if "incomplete_results" in file_dict:
            if file_dict["incomplete_results"]:
                msg = "{0} has incomplete results".format(json_file_name)
                write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        days_since = file_dict["days_since"]

        if "items" in file_dict:
            if len(file_dict["items"]) == 0:
                msg = "{0} has 0 items".format(json_file_name)
                write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)
            else:
                # Save data for each item.
                for i in range(len(file_dict["items"])):
                    item = file_dict["items"][i]
                    item_repo = item["repository"]
                    repo_id = item_repo["id"]
                    owner_id = item_repo["owner"]["id"]

                    # Don't save forked notebooks.
                    if item_repo["fork"]:
                        continue

                    # The full path is unique for each file.
                    name = "{0}/{1}/{2}".format(
                        item_repo["owner"]["login"],
                        item_repo["name"],
                        item["path"]
                    ).replace("/", "..")

                    if name not in notebook_files_done:
                        notebook = {
                            "file": name,
                            "html_url": item["html_url"],
                            "name": item["name"],
                            "path": item["path"],
                            "repo_id": repo_id,
                            "owner_id": owner_id,
                            "filesize": filesize,
                            "query_page": query_page,
                            "days_since": days_since
                        }
                        notebooks[name] = notebook

                    if repo_id not in repos and repo_id not in repo_ids_done:
                        repo = {
                            "repo_name": item_repo["name"],
                            "owner_id": owner_id,
                            "repo_description": item_repo["description"],
                            "repo_fork": item_repo["fork"],
                            "repo_html_url": item_repo["html_url"],
                            "repo_private": item_repo["private"],
                        }
                        repos[repo_id] = repo

                    if owner_id not in owners and owner_id not in owner_ids_done:
                        owner = {
                            "owner_html_url": item_repo["owner"]["html_url"],
                            "owner_login": item_repo["owner"]["login"],
                        }
                        owners[owner_id] = owner

                    # If updating, we don't always need the full page.
                    if updating and len(notebooks) == num_needed:
                        break
        else:
            msg = "{0} has no items object".format(json_file_name)
            write_to_log("../logs/nb_metadata_cleaning_log.txt", msg)

        if updating and len(notebooks) == num_needed:
            break

    # Display status.
    debug_print(("\nAfter processing all query files, "
                 "we have {0} new notebooks.").format(len(notebooks)))
    debug_print("Written by {0} owners.".format(len(owners)))
    debug_print("Held in {0} repositories.".format(len(repos)))

    # Translate dictionaries to DataFrames and save to CSV.
    # Ordered by days_since; for duplicates keep the most recent
    # (i.e. keep last, which was found more days since 1-1-19).
    notebooks_df = pd.DataFrame(notebooks).transpose()\
        .sort_values(by=["days_since", "file"])\
        .drop_duplicates(subset=["file"], keep="last")
    owners_df = pd.DataFrame(owners).transpose().reset_index().rename(
        columns={"index": "owner_id"}, index=str)
    repos_df = pd.DataFrame(repos).transpose().reset_index().rename(
        columns={"index": "repo_id"}, index=str)

    if local:
        pd.concat([notebooks_df, notebooks_done]).to_csv(
            "{0}/notebooks1.csv".format(PATH), index=False)
        pd.concat([owners_df, owners_done]).to_csv(
            "{0}/owners1.csv".format(PATH), index=False)
        pd.concat([repos_df, repos_done]).to_csv(
            "{0}/repos1.csv".format(PATH), index=False)
    else:
        df_to_s3(pd.concat([notebooks_df, notebooks_done]), "csv/notebooks1.csv")
        df_to_s3(pd.concat([owners_df, owners_done]), "csv/owners1.csv")
        df_to_s3(pd.concat([repos_df, repos_done]), "csv/repos1.csv")
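# NOTE: The helpers s3_to_df, df_to_s3, and list_s3_dir used throughout these
# scripts are defined elsewhere in the repository. The following is only a
# minimal sketch of what they might look like, inferred from how they are
# called here (a boto3 resource bound to `s3` and the "notebook-research"
# bucket); the actual implementations may differ.
import boto3
import pandas as pd
from io import BytesIO, StringIO

s3 = boto3.resource("s3")
bucket = s3.Bucket("notebook-research")

def s3_to_df(key):
    """ Read a CSV object stored in S3 into a DataFrame. """
    obj = s3.Object("notebook-research", key)
    return pd.read_csv(BytesIO(obj.get()["Body"].read()))

def df_to_s3(df, key):
    """ Write a DataFrame to S3 as a CSV object. """
    buffer = StringIO()
    df.to_csv(buffer, index=False)
    s3.Object("notebook-research", key).put(
        Body=buffer.getvalue().encode("UTF-8"))

def list_s3_dir(prefix):
    """ List base file names stored under an S3 prefix. """
    return set(
        obj.key.split("/")[-1]
        for obj in bucket.objects.filter(Prefix=prefix)
    )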
def main(): # Parse command line arguments. parser = argparse.ArgumentParser() parser.add_argument( "-w", "--worker", metavar="N", type=int, help=("GITHUB_TOKEN assigned to these files " + "(will use partition N stored in download_partitions.pickle).")) parser.add_argument("-r", "--repos", action="store_const", dest="only_repos", const=True, default=False, help="Download repos only.") parser.add_argument("-n", "--notebooks", action="store_const", dest="only_nbs", const=True, default=False, help="Download notebooks only.") parser.add_argument("--local", action="store_const", dest="local", const=True, default=False, help="Save data locally instead of in S3.") args = parser.parse_args() worker = args.worker only_repos = args.only_repos only_nbs = args.only_nbs local = args.local # If both flags are specified, ignore. if only_repos and only_nbs: raise Exception( "Cannot use both --repos and --notebooks flags. Use --help flag for more information." ) # If a worker was specified, get partition data and correct header. if worker != None: print("Worker {0}".format(worker)) try: if local: with open("download_partitions.pickle", "rb") as f: partitions_download = pickle.load(f) partition = partitions_download[worker] else: obj = s3.Object( "notebook-research", "download_partitions_{0}.pickle".format(worker)) partition = pickle.load(BytesIO(obj.get()["Body"].read())) except Exception: print(("Download Partitions data were not found {0}. ".format( "locally" if local else "in s3") + "Please run parallelize_download.py and try again.")) sys.exit(0) notebooks1 = partition["notebooks"] obj = s3.Object("notebook-research", "csv/owners1.csv") owners = pd.read_csv(BytesIO(obj.get()["Body"].read())) obj = s3.Object("notebook-research", "csv/repos1.csv") repos = pd.read_csv(BytesIO(obj.get()["Body"].read())) owners1 = notebooks1[["owner_id" ]].merge(owners[['owner_id', 'owner_login']], on="owner_id", how='left').drop_duplicates() repos1 = notebooks1[["repo_id", 'owner_id']].merge(repos[['repo_id', 'repo_name']], on='repo_id', how='left').drop_duplicates() header = HEADERS[partition["id"]] debug_print("Partition data for downloads found and opened. " + "Notebooks1, Owners1, and Repos1 were found and opened." + BREAK) # If a worker was not specified, get all data and use first header. else: try: if local: notebooks1 = pd.read_csv("{0}/notebooks1.csv".format(PATH)) owners1 = pd.read_csv("{0}/owners1.csv".format(PATH)) repos1 = pd.read_csv("{0}/repos1.csv".format(PATH)) else: notebooks1 = s3_to_df("csv/notebooks1.csv") owners1 = s3_to_df("csv/owners1.csv") repos1 = s3_to_df("csv/repos1.csv") except Exception: print("The files 'notebooks1.csv','repos1.csv', and " + "'owners1.csv' were not found. Please run query_git.py " + "and try again.") sys.exit(0) header = HEADERS[0] # Check time and display status. print("{0} notebooks, {1} repos, {2} owners".format( len(notebooks1), len(repos1), len(owners1))) check1 = datetime.datetime.now() write_to_log("../logs/timing.txt", "download CHECKPOINT 1: {0}".format(check1)) # Download full notebooks from github. 
if not only_repos: if local: current_files = set(os.listdir("../data/notebooks")) else: obj = s3.Object("notebook-research", "current_notebooks.pickle") current_files = pickle.load(BytesIO(obj.get()["Body"].read())) num_done = len(current_files) debug_print( "{0} notebooks have already been downloaded.".format(num_done)) download_nbs(notebooks1, local, current_files) check2 = datetime.datetime.now() write_to_log("../logs/timing.txt", "CHECKPOINT 2: {0}".format(check2)) debug_print("\nNotebooks have been downloaded. Time: {0}{1}".format( check2 - check1, BREAK)) # Download data from github. if not only_nbs: download_repo_data(repos1, owners1, header, local) check3 = datetime.datetime.now() write_to_log("../logs/timing.txt", "CHECKPOINT 3: {0}".format(check3)) debug_print("\nRepos have been downloaded. " + "Time: {0}{1}".format(check3 - check2, BREAK))
def download_repo_data(repos, owners, header, local):
    """ Download repository metadata files from GitHub. """
    if len(repos) == 0 or len(owners) == 0:
        return

    data_frame = repos.merge(owners, on="owner_id")

    # List files already downloaded.
    current_repos = os.listdir("../data/repos") if local else list_s3_dir("repos/")
    debug_print("There are currently {0} repo metadata files saved.".format(
        len(current_repos)))

    num_recorded_since = 0
    num_tried_since = 0
    hit_url = ""
    for i, row in data_frame.iterrows():
        num_tried_since += 1

        # Keep track of the download progress.
        if i % COUNT_TRIGGER == 0 or i == len(data_frame):
            debug_print("{0} / {1} repos downloaded.".format(i, len(data_frame)))

        # Download repository metadata.
        repo_recorded = False
        if "repo_{0}.json".format(row["repo_id"]) not in current_repos:
            wait_time = 0
            while not repo_recorded:
                time.sleep(wait_time)
                date_string = datetime.datetime.now().strftime(
                    r"%Y-%m-%d %H:%M:%S")
                url = "https://api.github.com/repos/{0}/{1}".format(
                    row["owner_login"], row["repo_name"])
                try:
                    # Query the API.
                    r = requests.get(url, headers=header)
                    j = r.json()
                    h = r.headers

                    # Handle rate limiting.
                    if h["Status"] == "403 Forbidden":
                        debug_print(
                            "{0}: Hit rate limit. Retry at {1}. "
                            "{2} tried and {3} saved since last hit.".format(
                                h["Date"],
                                time.ctime(int(h["X-RateLimit-Reset"])),
                                num_tried_since, num_recorded_since))

                        # If the same URL hits the limit twice in a row, skip it.
                        if hit_url == url:
                            print("Same repo hit the rate limit again, skipping.")
                            repo_recorded = True
                            continue

                        wait_time = int(h["X-RateLimit-Reset"]) - time.time() + 1
                        num_tried_since = 0
                        num_recorded_since = 0
                        hit_url = url
                        continue

                    if "message" in j and (j["message"] == "Not Found"
                                           or j["message"] == "Bad credentials"):
                        print(url, "Message:", j["message"])
                        raise Exception

                    # Save the JSON file.
                    if local:
                        filename = "../data/repos/repo_{0}.json".format(
                            row["repo_id"])
                        with open(filename, "w") as repo_file:
                            json.dump(j, repo_file)
                    else:
                        obj = s3.Object(
                            "notebook-research",
                            "repos/repo_{0}.json".format(row["repo_id"]))
                        obj.put(Body=bytes(json.dumps(j).encode("UTF-8")))

                    # Report status.
                    msg = "{0}: downloaded repo {1}".format(
                        date_string, row["repo_id"])
                    write_to_log("../logs/repo_metadata_query_log.txt", msg)
                    repo_recorded = True
                    wait_time = 0
                    num_recorded_since += 1

                except Exception as e:
                    # Report missed files.
                    msg = "{0}: had trouble downloading repo {1}".format(
                        date_string, row["repo_id"])
                    write_to_log("../logs/repo_metadata_query_log.txt", msg)
                    debug_print(msg)
                    debug_print(e)
                    repo_recorded = True
def main(): # Parse command line arguments. parser = argparse.ArgumentParser() parser.add_argument( "min", type=int, help="Minimum size to search." ) parser.add_argument( "max", type=int, help="Maximum size to search." ) parser.add_argument( "--update", action="store_const", dest="updating", const=True, default=False, help=( "Search notebooks that have been added " + "or updated since last search, along with new " + "notebooks" ) ) parser.add_argument( "--local", action="store_const", dest="local", const=True, default=False, help="Saves output locally instead of in S3." ) parser.add_argument( "--worker", metavar="N", type=int, help=( "GITHUB_TOKEN assigned to these sizes (workers " + "sorted in alphabetical order: {0}).".format( list(TOKENS.keys()) ) ) ) args = parser.parse_args() MIN = args.min MAX = args.max updating = args.updating worker = args.worker local = args.local # If updating, look at saved_urls to determine a duplicate. # New versions of notebooks will overwrite earlier downloads. saved_urls = [] current_csvs = os.listdir(PATH) if local else list_s3_dir('csv') if updating and "notebooks1.csv" in current_csvs: if local: notebooks1 = pd.read_csv("{0}/notebooks1.csv".format(PATH)) else: notebooks1 = s3_to_df('csv/notebooks1.csv') saved_urls = list(notebooks1.html_url) # Set worker. if worker != None: header = HEADERS[worker] else: header = HEADERS[0] # Log and display status. write_to_log( "../logs/timing.txt", "Testing on the size range {0} to {1}".format(MIN, MAX) ) start = datetime.datetime.now() write_to_log("../logs/timing.txt", "START: {0}".format(start)) debug_print( BREAK + "Downloading and formatting data for all Jupyter " + "Notebooks publicly stored on github." + BREAK ) # List notebooks already downloaded. current_notebooks = set(notebooks1.file) if updating else [] # Get json query files for given size range. num_needed = get_json(MIN, MAX, saved_urls, header, updating, local, current_notebooks) if worker != None: with open('num_needed_{0}.save'.format(worker),'w') as f: f.write(str(num_needed)) else: command = 'nohup python3 -u process.py --needed {0}'.format(num_needed) if updating: command += ' --updating' if local: command += ' --local' os.system(command + ' > process.log &') # Check time, log, and display status. check1 = datetime.datetime.now() write_to_log("../logs/timing.txt", "CHECKPOINT 1: {0}".format(check1)) debug_print( "\nJson query files have been downloaded. " + "Time: {0}{1}".format(check1 - start, BREAK) ) # Check time, log, and display status. check2 = datetime.datetime.now() write_to_log("../logs/timing.txt","CHECKPOINT 2: {0}".format(check2)) debug_print("All together, {0}".format(check2 - start))
def save_page(url, size, header, query_status, saved_urls, current_notebooks):
    """ Save a results page to a JSON file. """
    # Set initial rate limiting management variables.
    limit_status = {
        "reset_time": time.time(),
        "limited": False,
        "wait_time": 0,
        "remaining_queries": 30
    }

    # Query the GitHub API.
    try:
        r = requests.get(url, headers=header)
        j = r.json()
        h = r.headers
    except requests.exceptions.Timeout:
        debug_print("Request timeout.")
        r = None
        limit_status["limited"] = True
        limit_status["wait_time"] = 60
        return r, limit_status, query_status

    # Handle a 403 error if we have hit the query rate limit.
    if "Status" not in h or h["Status"] == "403 Forbidden":
        try:
            debug_print("{0}: Hit rate limit. Retry after {1} seconds".format(
                h["Date"], h["Retry-After"]))
            # Set to limited and update the wait time.
            limit_status["limited"] = True
            limit_status["wait_time"] = int(h["Retry-After"])
        except Exception:
            # Default the wait time to 1 minute.
            limit_status["limited"] = True
            limit_status["wait_time"] = 60
        return r, limit_status, query_status

    # Update rate limiting management variables.
    date = r.headers["Date"]
    query_status["num_results"] = int(j["total_count"])
    limit_status["remaining_queries"] = h["X-RateLimit-Remaining"]
    limit_status["reset_time"] = int(h["X-RateLimit-Reset"])

    # Write progress to the log and display status.
    log_string = "{0}: {1} bytes {2} results".format(
        date, size, query_status["num_results"])
    write_to_log("../logs/nb_metadata_query_log.txt", log_string)
    debug_print(log_string)

    # Check whether the query result is acceptable.
    if (
        query_status["num_results"] <= QUERY_CUTOFF
        or query_status["page"] > 1
        or size.split("..")[0] == size.split("..")[1]
    ):
        # Add days since 1-1-19.
        diff = datetime.datetime.now() - datetime.datetime(2019, 1, 1)
        j["days_since"] = (
            diff.days
            + (diff.seconds + diff.microseconds / (10**6)) / (60 * 60 * 24)
        )

        # Save this page.
        filename = "github_notebooks_{0}_p{1}.json".format(
            size, query_status["page"])
        if query_status["updating"]:
            filename = "github_notebooks_{0}_p{1}_{2}.json".format(
                size, query_status["page"], datetime.datetime.now())

        if query_status["local"]:
            with open(JSON_PATH + filename, "w") as json_file:
                json.dump(j, json_file)
        else:
            obj = s3.Object("notebook-research", "json/" + filename)
            obj.put(Body=bytes(json.dumps(j).encode("UTF-8")))

        # Display status.
        debug_print("Saved {0} bytes, p{1}".format(size, query_status["page"]))

        for item in j["items"]:
            # If updating, we are done once this html_url has already been downloaded.
            if query_status["updating"] and "file" in item:
                html_url = item["html_url"].replace("#", "%23")
                file_name = item["file"]

                # If it is the same version of an existing notebook, we are done.
                if html_url in saved_urls:
                    debug_print("This notebook has already been "
                                "downloaded! Stop looking here.")
                    query_status["another_page"] = False
                    query_status["done"] = True
                    break

                # If it is a new version of an existing notebook, delete the old one.
                elif file_name in current_notebooks:
                    if query_status["local"]:
                        os.remove("../data/notebooks/{0}".format(file_name))
                    else:
                        s3.Object(
                            "notebook-research",
                            "notebooks/{0}".format(file_name)
                        ).delete()

            # If we've retrieved num_results notebooks, we're done.
            path = item["repository"]["full_name"] + "/" + item["path"]
            query_status["all_items"].append(path)
            if len(query_status["all_items"]) == query_status["num_results"]:
                query_status["another_page"] = False
                query_status["done"] = True
                break

            query_status["num_needed"] += 1

        # Write progress to the log and display it.
        log_string = "{0}: {1} bytes p{2} {3} items".format(
            date, size, query_status["page"], len(j["items"]))
        write_to_log("../logs/nb_metadata_query_log.txt", log_string)
        debug_print(log_string)

        # If there are fewer than 100 items on the page, it is the last page;
        # there are at most 10 pages.
        if len(j["items"]) < 100 or query_status["page"] == 10:
            query_status["done"] = True

    return r, limit_status, query_status
def edit_size(current_files, minimum, maximum):
    """ Update minimum and maximum to an unqueried range.

    Returns the minimum and maximum for the first range that has yet
    to be queried.

    Example: searching 0 - 100 when 23 - 45 have already been queried
    returns 0 - 22.
    """
    # Find sizes already queried, sorted by minimum size.
    sizes_done = {}
    current_files = sorted(current_files, key=get_min)
    for filename in current_files:
        start = get_min(filename)
        if ".." in filename:
            end = get_max(filename)
        else:
            end = start
        sizes_done[start] = end

    while len(sizes_done) > 0:
        minimum_done_start = min(sizes_done.keys())
        minimum_done_end = sizes_done[minimum_done_start]
        if (
            minimum >= minimum_done_start   # Minimum at or above the smallest queried start.
            and maximum > minimum_done_end  # Maximum above the smallest queried end.
        ):
            # If minimum is at or below the smallest queried end, then
            # minimum..minimum_done_end has already been queried, so raise minimum.
            if minimum <= minimum_done_end:
                minimum = minimum_done_end + 1
                debug_print("Size {0}..{1} already queried".format(
                    minimum_done_start, minimum_done_end))

            # Remove the smallest queried range and continue to the next smallest.
            sizes_done.pop(minimum_done_start)
            if len(sizes_done) == 0:
                break
            else:
                continue
        break

    # Minimum is settled; decrease maximum if necessary.
    if len(sizes_done) > 0:
        minimum_done_start = min(sizes_done.keys())
        minimum_done_end = sizes_done[minimum_done_start]
        if maximum >= minimum_done_start:
            maximum = minimum_done_start - 1
            debug_print("Size {0}..{1} already queried".format(
                minimum_done_start, minimum_done_end))

    return minimum, maximum
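# NOTE: get_min and get_max are helpers defined elsewhere in the repository.
# Judging from the query file names written by save_page
# ("github_notebooks_<min>..<max>_p<page>.json") and the sort key used in
# clean_metadata, they presumably parse the byte range out of a file name.
# A minimal sketch under that assumption:

def get_min(filename):
    # "github_notebooks_620..630_p1.json" -> 620
    return int(filename.split("_")[2].split("..")[0])

def get_max(filename):
    # "github_notebooks_620..630_p1.json" -> 630
    return int(filename.split("_")[2].split("..")[1])

# With these, the docstring example above works out as:
#   edit_size(["github_notebooks_23..45_p1.json"], 0, 100)  ->  (0, 22)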
def query(url, size, header, saved_urls, updating, local, current_notebooks):
    """ Query GitHub for notebooks of a given size and return the query status. """
    # Set initial rate limiting management variables.
    limit_status = {
        "reset_time": time.time(),
        "limited": False,
        "wait_time": 0,
        "remaining_queries": 30
    }

    # Set initial query status variables.
    query_status = {
        "done": False,
        "page": 1,
        "another_page": False,
        "updating": updating,
        "local": local,
        "num_results": 0,
        "num_needed": 0,
        "all_items": []
    }

    while not query_status["done"]:
        # Handle rate limiting status.
        limit_status = check_limit(limit_status)

        # Save this page of results.
        r, limit_status, query_status = save_page(
            url, size, header, query_status, saved_urls, current_notebooks)
        if r is None:
            continue

        # If there are too many results, return; this is handled in get_json.
        if (
            query_status["num_results"] > QUERY_CUTOFF       # Too many results.
            and size.split("..")[0] != size.split("..")[1]   # Can still narrow the range (min != max).
        ):
            query_status["done"] = True
            return query_status

        # Handle rate limiting status.
        if limit_status["limited"] and limit_status["wait_time"] != 0:
            continue

        # Move to the next page of results.
        if "next" in r.links:
            next_url = r.links["next"]["url"]
            query_status["another_page"] = True
            while (
                query_status["another_page"]
                and len(query_status["all_items"]) != query_status["num_results"]
            ):
                query_status["page"] += 1
                debug_print("{0} to find, {1} found, {2} unique".format(
                    query_status["num_results"],
                    len(query_status["all_items"]),
                    len(set(query_status["all_items"]))))

                # Handle rate limiting status.
                limit_status = check_limit(limit_status)

                # Save this page of results.
                r, limit_status, query_status = save_page(
                    next_url, size, header, query_status,
                    saved_urls, current_notebooks)
                if r is None:
                    continue

                # Handle rate limiting status.
                if limit_status["limited"] and limit_status["wait_time"] != 0:
                    query_status["page"] -= 1
                    continue

                if "next" in r.links:
                    # Move on to the next page of results.
                    next_url = r.links["next"]["url"]
                else:
                    # Completed the last page of results.
                    query_status["another_page"] = False

        query_status["done"] = True

    # Report if there are too many results within a single size (e.g. 1200..1200).
    if (
        query_status["num_results"] > QUERY_CUTOFF
        and size.split("..")[0] == size.split("..")[1]
    ):
        msg = "TOO MANY RESULTS: {0} bytes, {1} results".format(
            size.split("..")[0], query_status["num_results"])
        write_to_log("../logs/nb_metadata_query_log.txt", msg)
        debug_print(msg)

    return query_status
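# NOTE: check_limit is defined elsewhere in the repository. Based only on the
# limit_status dictionary it receives (reset_time, limited, wait_time,
# remaining_queries), a plausible sketch is: sleep when a 403 told us to wait,
# or when the remaining request budget is exhausted, then clear the flag.
# The real implementation may differ.
import time

def check_limit(limit_status):
    if limit_status["limited"] and limit_status["wait_time"] > 0:
        # A 403 response asked us to back off for wait_time seconds.
        time.sleep(limit_status["wait_time"])
        limit_status["limited"] = False
        limit_status["wait_time"] = 0
    elif int(limit_status["remaining_queries"]) <= 1:
        # Out of requests in this window; wait until the limit resets.
        wait = int(limit_status["reset_time"]) - time.time() + 1
        if wait > 0:
            time.sleep(wait)
    return limit_status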
def get_json(minimum, maximum, saved_urls, header, updating, local,
             current_notebooks):
    """ Download JSON search results for all Jupyter notebooks on GitHub
    within the given size range (minimum - maximum).

    Equivalent to Adam's 0_nb_metadata_download.
    """
    debug_print("Downloading query results by size.")

    current_files = (
        set(os.listdir(JSON_PATH)) if local else list_s3_dir("json/")
    )

    num_needed = 0
    done = False
    stored_max = maximum
    while not done:
        # Narrow the size range based on what has already been queried.
        if not updating:
            minimum, maximum = edit_size(current_files, minimum, maximum)
            if minimum > maximum:
                done = True
                continue
        size = str(minimum) + ".." + str(maximum)

        # At this point, we have a range ready to be queried.
        debug_print("Querying {0} byte notebooks.".format(size))

        # Query the size range.
        url = URL + size
        query_status = query(
            url, size, header, saved_urls, updating, local, current_notebooks)

        # Check the number of search results.
        # We can only access 1000 results due to the query limit.
        if (
            query_status["num_results"] > QUERY_CUTOFF  # Results over the query limit.
            and maximum != minimum                      # Can still narrow the range.
            and query_status["page"] == 1               # On the first page of the query.
        ):
            # Cut the query range in half.
            if maximum - (maximum - minimum) // 2 != maximum:
                maximum = maximum - (maximum - minimum) // 2
            else:
                maximum = minimum
            debug_print("Too many results, trying a narrower range: "
                        + "{0} - {1}".format(minimum, maximum))
            continue
        else:
            debug_print("{0} / {1} results found".format(
                len(query_status["all_items"]), query_status["num_results"]))
            debug_print("{0} are unique.".format(
                len(set(query_status["all_items"]))))

            # Accumulate the number of new notebooks found in this range.
            num_needed += query_status["num_needed"]

            # Move on to the next search within the original query range.
            minimum = maximum + 1
            maximum = stored_max

    return num_needed
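# Worked example of the narrowing arithmetic in get_json, assuming
# QUERY_CUTOFF is 1000 (the GitHub search API exposes at most 1000 results
# per query):
#
#   0..1000 bytes -> too many results -> maximum = 1000 - (1000 - 0) // 2 = 500
#   0..500  bytes -> too many results -> maximum = 500 - (500 - 0) // 2   = 250
#   0..250  bytes -> under the cutoff -> pages saved; next range is 251..1000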
def main(): # Parse command line arguments. parser = argparse.ArgumentParser() parser.add_argument("--local", action="store_const", dest="local", const=True, default=False, help="Stores results locally instead of using S3.") parser.add_argument("--worker", metavar="N", type=int, help=("GITHUB_TOKEN assigned to these sizes (workers " "sorted in alphabetical order: {0}).").format( list(TOKENS.keys()))) args = parser.parse_args() local = args.local worker = args.worker # If running in parallel, mark csv files with the worker number. global EXTENSION EXTENSION = '_{0}'.format(worker) if worker != None else '' print('EXTENSION', EXTENSION) start = datetime.datetime.now() # List of saved CSV files. if local: current_csvs = set(os.listdir(PATH)) else: current_csvs = list_s3_dir(S3_PATH) # Open basic data from json files (files created in query_git.py). if set([ "notebooks1{0}.csv".format(EXTENSION), "repos1{0}.csv".format(EXTENSION), "owners1{0}.csv".format(EXTENSION) ]).issubset(current_csvs): notebooks1 = get_df("notebooks1{0}.csv".format(EXTENSION), local) owners1 = get_df("owners1{0}.csv".format(EXTENSION), local) repos1 = get_df("repos1{0}.csv".format(EXTENSION), local) else: debug_print("Notebooks1, Owners1, and Repos1 were not found.") sys.exit(0) debug_print("Notebooks1, Owners1, and Repos1 were found and opened." + BREAK) ### Add information for repositories and owners. ################## save = False if not set([ "owners2{0}.csv".format(EXTENSION), "repos2{0}.csv".format(EXTENSION) ]).issubset(current_csvs): owners2, repos2 = update_owners_repos(owners1, repos1, local) save = True else: try: owners2_old = get_df("owners2{0}.csv".format(EXTENSION), local) repos2_old = get_df("repos2{0}.csv".format(EXTENSION), local) debug_print( "Found and opened data for {0} owners and {1} repos.".format( len(owners2_old), len(repos2_old))) except: owners2_old = [] repos2_old = [] if len(owners2_old) > 0 and len(repos2_old) > 0: owners1_new = owners1[~owners1.owner_id.isin(owners2_old.owner_id)] repos1_new = repos1[~repos1.repo_id.isin(repos2_old.repo_id)] else: owners1_new = owners1 repos1_new = repos1 debug_print("Collecting data for {0} owners and {1} repos.".format( len(owners1_new), len(repos1_new))) if len(owners1_new) > 0 and len(repos1_new) > 0: owners2_new, repos2_new = update_owners_repos( owners1_new, repos1_new, local) if len(owners2_new) > 0 and len(repos2_new) > 0: owners2 = pd.concat([owners2_old, owners2_new ]).drop_duplicates(subset='owner_id') repos2 = pd.concat([repos2_old, repos2_new ]).drop_duplicates(subset='repo_id') else: owners2 = owners2_old repos2 = repos2_old else: owners2 = owners2_old repos2 = repos2_old ## Save if save: debug_print("Saving combined data for {0} owners and {1} repos".format( len(owners2), len(repos2))) if local: owners2.to_csv("{0}/owners2{1}.csv".format(PATH, EXTENSION), index=False) repos2.to_csv("{0}/repos2{1}.csv".format(PATH, EXTENSION), index=False) else: df_to_s3(owners2, "{0}/owners2{1}.csv".format(S3_PATH, EXTENSION)) df_to_s3(repos2, "{0}/repos2{1}.csv".format(S3_PATH, EXTENSION)) debug_print("Owners2 and Repos2 were created and saved.\n" + BREAK) ## Add data on cells within each notebook. ####################### if not set(["notebooks2{0}.csv".format(EXTENSION)]).issubset(current_csvs): print("Notebooks2 not found, creating from scratch.") get_all_nb_cells(notebooks1, local, 0) else: # Get existing data. 
try: notebooks2_old = get_df("notebooks2{0}.csv".format(EXTENSION), local) debug_print( "Found and opened notebook data for {0} notebooks.".format( len(notebooks2_old))) except Exception as e: notebooks2_old = [] print("Notebooks2 could not be opened, creating from scratch.") print(type(e), e) # Isolate rows of notebooks1 corresponding to new notebooks if len(notebooks2_old) > 0: notebooks1_new = notebooks1[~notebooks1.file.isin(notebooks2_old. file)] else: notebooks1_new = notebooks1 debug_print("Collecting data for {0} notebooks.".format( len(notebooks1_new))) # If there are new notebooks, add cell data if len(notebooks1_new) > 0: get_all_nb_cells(notebooks1_new, local, len(notebooks2_old)) del notebooks2_old # Check time and report status. end = datetime.datetime.now() debug_print("TOTAL TIME: {0}".format(end - start))
def get_all_nb_cells(notebooks, local, done):
    """ Get cell and notebook data for each notebook. """
    new_nb_info = {}
    all_cells_info = {}
    missing = []
    for count, row in notebooks.iterrows():
        file_name = row["file"]
        data = None

        # Track progress.
        if count % COUNT_TRIGGER == 0 or count == len(notebooks) - 1:
            print("{0} / {1} notebooks processed for cell data".format(
                count, len(notebooks) + done))

            # Save data and reset, in chunks, to avoid MemoryError.
            if count > 0:
                # Transform data to DataFrames.
                notebooks_temp = pd.DataFrame(new_nb_info).transpose()
                cells_temp = pd.DataFrame(
                    all_cells_info).transpose().reset_index(drop=True)

                # Save data to CSV.
                try:
                    if local:
                        notebooks_temp.to_csv(
                            "{0}/notebooks2_{1}_{2}.csv".format(
                                PATH, EXTENSION, count // COUNT_TRIGGER),
                            index=False)
                        cells_temp.to_csv(
                            "{0}/cells1_{1}_{2}.csv".format(
                                PATH, EXTENSION, count // COUNT_TRIGGER),
                            index=False)
                    else:
                        df_to_s3(notebooks_temp,
                                 "{0}/notebooks2_{1}_{2}.csv".format(
                                     S3_PATH, EXTENSION, count // COUNT_TRIGGER))
                        df_to_s3(cells_temp,
                                 "{0}/cells1_{1}_{2}.csv".format(
                                     S3_PATH, EXTENSION, count // COUNT_TRIGGER))
                except MemoryError:
                    # Split the notebook data into quarters and the cell data
                    # into eighths, then try saving again.
                    nb_chunks = [
                        notebooks_temp.iloc[
                            i * len(notebooks_temp) // 4:
                            (i + 1) * len(notebooks_temp) // 4]
                        for i in range(4)
                    ]
                    cell_chunks = [
                        cells_temp.iloc[
                            i * len(cells_temp) // 8:
                            (i + 1) * len(cells_temp) // 8]
                        for i in range(8)
                    ]
                    for part, chunk in enumerate(nb_chunks, start=1):
                        name = "notebooks2_{0}_{1}_{2}.csv".format(
                            EXTENSION, count // COUNT_TRIGGER, part)
                        if local:
                            chunk.to_csv("{0}/{1}".format(PATH, name), index=False)
                        else:
                            df_to_s3(chunk, "{0}/{1}".format(S3_PATH, name))
                    for part, chunk in enumerate(cell_chunks, start=1):
                        name = "cells1_{0}_{1}_{2}.csv".format(
                            EXTENSION, count // COUNT_TRIGGER, part)
                        if local:
                            chunk.to_csv("{0}/{1}".format(PATH, name), index=False)
                        else:
                            df_to_s3(chunk, "{0}/{1}".format(S3_PATH, name))

                # Empty the current dictionaries.
                new_nb_info = {}
                all_cells_info = {}
                print("CSVs saved")

        # Initialize this notebook's row of data.
        nb_info = {
            "file": file_name,
            "google_collab": False,
            "nbformat": "",
            "nbformat_minor": "",
            "num_cells": 0,
            "kernel_lang": "",
            "kernel_name": "",
            "lang_name": "",
            "lang_version": ""
        }

        # Open the notebook as JSON.
        try:
            obj = s3.Object("notebook-research",
                            "notebooks/{0}".format(file_name))
            data = json.loads(obj.get()["Body"].read().decode("UTF-8"))
        except Exception:
            # Report missed files.
            msg = "Notebook {0} did not open.".format(file_name)
            write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
            missing.append(file_name)

            # Add a row with missing values.
            if file_name not in new_nb_info:
                new_nb_info[file_name] = nb_info
            continue

        # If the data loaded as JSON, extract information.
        if data and isinstance(data, dict):
            keys = data.keys()

            # Get top-level notebook format metadata.
            if "nbformat" in keys:
                nb_info["nbformat"] = data["nbformat"]
            if "nbformat_minor" in keys:
                nb_info["nbformat_minor"] = data["nbformat_minor"]

            # Get info from the metadata dictionary.
            if (
                "metadata" in keys
                and data["metadata"] is not None
                and isinstance(data["metadata"], dict)
            ):
                metadata_keys = data["metadata"].keys()

                # Access kernel data.
                if (
                    "kernelspec" in metadata_keys
                    and data["metadata"]["kernelspec"] is not None
                    and isinstance(data["metadata"]["kernelspec"], dict)
                ):
                    kernel_keys = data["metadata"]["kernelspec"].keys()

                    # For Google Colab notebooks, only Python 2.7 or 3.6 are possible.
                    if "colab" in metadata_keys:
                        nb_info["google_collab"] = True
                        if "name" in kernel_keys and "display_name" in kernel_keys:
                            nb_info["kernel_lang"] = data["metadata"][
                                "kernelspec"]["name"]
                            nb_info["kernel_name"] = data["metadata"][
                                "kernelspec"]["display_name"]
                            if nb_info["kernel_lang"] == "python3":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "3.6"
                            elif nb_info["kernel_lang"] == "python2":
                                nb_info["lang_name"] = "python"
                                nb_info["lang_version"] = "2.7"

                    # Not Google Colab; access the kernel language and display name.
                    else:
                        if "language" in kernel_keys:
                            nb_info["kernel_lang"] = data["metadata"][
                                "kernelspec"]["language"]
                        if "display_name" in kernel_keys:
                            nb_info["kernel_name"] = data["metadata"][
                                "kernelspec"]["display_name"]

                # Access language info.
                if ("language_info" in metadata_keys
                        and "colab" not in metadata_keys):
                    lang_keys = data["metadata"]["language_info"].keys()
                    if "name" in lang_keys:
                        nb_info["lang_name"] = data["metadata"][
                            "language_info"]["name"]
                    if "version" in lang_keys:
                        nb_info["lang_version"] = data["metadata"][
                            "language_info"]["version"]
                elif "language" in metadata_keys:
                    nb_info["lang_name"] = data["metadata"]["language"]

            # Get information about individual cells.
            cells_info = {}
            if "cells" in keys:
                nb_info["num_cells"] = len(data["cells"])
                cell_id = 0
                for cell in data["cells"]:
                    cell_info, nb_language = get_single_cell(
                        cell_id, file_name, cell, nb_info["lang_name"])
                    if nb_info["lang_name"] == "":
                        nb_info["lang_name"] = nb_language.lower()
                    if (file_name, cell_id) not in cells_info:
                        cells_info[(file_name, cell_id)] = cell_info
                    cell_id += 1
            elif "worksheets" in keys:
                cell_id = 0
                for w in data["worksheets"]:
                    for cell in w["cells"]:
                        cell_info, nb_language = get_single_cell(
                            cell_id, file_name, cell, nb_info["lang_name"])
                        if nb_info["lang_name"] == "":
                            nb_info["lang_name"] = nb_language.lower()
                        if (file_name, cell_id) not in cells_info:
                            cells_info[(file_name, cell_id)] = cell_info
                        cell_id += 1

            all_cells_info.update(cells_info)

        if file_name not in new_nb_info:
            new_nb_info[file_name] = nb_info

    debug_print("{0} notebooks are missing cell data.".format(len(missing)))

    return new_nb_info, all_cells_info
def update_owners_repos(owners, repos, local):
    """ Add information on owners and repos. """
    new_repo_info = {}
    new_owner_info = {}
    repo_ids = list(repos.repo_id)
    missing = 0
    forked = 0
    moved = 0
    for i, repo_id in enumerate(repo_ids):
        repo_json = None

        # Keep track of progress.
        if i % COUNT_TRIGGER == 0:
            debug_print("{0} / {1} repo data files processed".format(
                i, len(repo_ids)))

        try:
            obj = s3.Object("notebook-research",
                            "repos/repo_{0}.json".format(repo_id))
            repo_json = json.loads(obj.get()["Body"].read().decode("UTF-8"))
        except Exception:
            missing += 1
            # Report missed files.
            msg = "Repo {0} metadata did not process.".format(repo_id)
            write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
            continue

        if repo_json is not None:
            if "message" in repo_json and (
                repo_json["message"] == "Not Found"
                or repo_json["message"] == "Bad credentials"
            ):
                # Report missed files.
                missing += 1
                msg = "Repo {0} metadata file did not download well.".format(
                    repo_id)

                # Move the bad file.
                s3.Object(
                    "notebook-research",
                    "repos_bad/repo_{0}.json".format(repo_id)
                ).copy_from(
                    CopySource="notebook-research/repos/repo_{0}.json".format(
                        repo_id))
                s3.Object(
                    "notebook-research",
                    "repos/repo_{0}.json".format(repo_id)
                ).delete()
                moved += 1

                write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
                continue

            if "owner" in repo_json:
                owner_id = repo_json["owner"]["id"]
            else:
                # Report missed files.
                msg = "Repo {0} metadata file not complete.".format(repo_id)
                write_to_log("../logs/repo_metadata_cleaning_log.txt", msg)
                continue

            if not repo_json["fork"]:
                # Add repo info.
                repo_info = {
                    "repo_id": repo_id,
                    "language": repo_json["language"],
                    "forks_count": repo_json["forks_count"],
                    "stargazers_count": repo_json["stargazers_count"],
                    "watchers_count": repo_json["watchers_count"],
                    "subscribers_count": repo_json["subscribers_count"],
                    "size": repo_json["size"],
                    "open_issues_count": repo_json["open_issues_count"],
                    "has_issues": repo_json["has_issues"],
                    "has_wiki": repo_json["has_wiki"],
                    "has_pages": repo_json["has_pages"],
                    "has_downloads": repo_json["has_downloads"],
                    "pushed_at": repo_json["pushed_at"],
                    "created_at": repo_json["created_at"],
                    "updated_at": repo_json["updated_at"]
                }
                if repo_id not in new_repo_info:
                    new_repo_info[repo_id] = repo_info

                # Add owner info.
                owner_info = {
                    "owner_id": owner_id,
                    "type": repo_json["owner"]["type"]
                }
                if owner_id not in new_owner_info:
                    new_owner_info[owner_id] = owner_info
            else:
                forked += 1
        else:
            missing += 1

    # Display status.
    debug_print("We have {0} new repos.".format(len(new_repo_info)))
    debug_print("Couldn't process {0} files.".format(missing))
    debug_print("{0} new repos were forked.".format(forked))
    debug_print("{0} files had to be moved.".format(moved))

    # Translate dictionaries to DataFrames.
    if len(new_owner_info) > 0 and len(new_repo_info) > 0:
        updated_owners = owners.merge(
            pd.DataFrame(new_owner_info).transpose().reset_index(drop=True),
            on="owner_id")
        updated_repos = repos.merge(
            pd.DataFrame(new_repo_info).transpose().reset_index(drop=True),
            on="repo_id")
    else:
        updated_owners = []
        updated_repos = []

    return updated_owners, updated_repos
def parallelize_download(notebooks, repos, owners, local):
    """ Split notebooks (and their repos/owners) into NUM_WORKERS download partitions. """
    # Open existing partitions if they are present.
    try:
        if local:
            with open("download_partitions.pickle", "rb") as f:
                partitions = pickle.load(f)
        else:
            partitions = []
            for i in range(NUM_WORKERS):
                obj = s3.Object("notebook-research",
                                "download_partitions_{0}.pickle".format(i))
                partitions.append(
                    pickle.load(BytesIO(obj.get()["Body"].read())))
        print("Partitions opened")

        # List notebooks that have already been partitioned.
        notebooks_partitioned = []
        for partition in partitions:
            notebooks_partitioned += list(partition["notebooks"]["file"])
        debug_print("{0} notebooks have already been partitioned.".format(
            len(notebooks_partitioned)))

        # Isolate notebooks not yet partitioned.
        notebooks_new = notebooks[~notebooks.file.isin(notebooks_partitioned)]
        if len(notebooks_new) == 0:
            print("All notebooks have already been partitioned.")
            return True

    except Exception as e:
        print(e)
        # All notebooks are new.
        notebooks_new = notebooks
        partitions = []
        for i in range(NUM_WORKERS):
            partitions.append({
                "id": i,
                "notebooks": [],
                "repos": [],
                "owners": []
            })

    # Shuffle the new notebooks.
    notebooks_new = notebooks_new.sample(frac=1).reset_index(drop=True)

    # Randomly assign notebooks and the repos/owners that go with them.
    partition_notebooks = np.array_split(notebooks_new, NUM_WORKERS)
    for i in range(NUM_WORKERS):
        p = partitions[i]

        # Add new notebooks, repos, and owners to the partition.
        if len(p["notebooks"]) > 0:
            p["notebooks"] = pd.concat([
                p["notebooks"],          # Existing notebooks.
                partition_notebooks[i]   # New notebooks.
            ])
        else:
            p["notebooks"] = partition_notebooks[i]

        if len(p["repos"]) > 0:
            p["repos"] = pd.concat([
                p["repos"],
                repos[repos.repo_id.isin(
                    partition_notebooks[i]["repo_id"])].reset_index(drop=True)
            ])
        else:
            p["repos"] = repos[repos.repo_id.isin(
                partition_notebooks[i]["repo_id"])].reset_index(drop=True)

        if len(p["owners"]) > 0:
            p["owners"] = pd.concat([
                p["owners"],
                owners[owners.owner_id.isin(
                    partition_notebooks[i]["owner_id"])].reset_index(drop=True)
            ])
        else:
            p["owners"] = owners[owners.owner_id.isin(
                partition_notebooks[i]["owner_id"])].reset_index(drop=True)

        print("done with", i)

    # Save partition data.
    print("saving...")
    if local:
        with open("download_partitions.pickle", "wb") as f:
            pickle.dump(partitions, f)
    else:
        for i in range(len(partitions)):
            obj = s3.Object("notebook-research",
                            "download_partitions_{0}.pickle".format(i))
            obj.put(Body=bytes(pickle.dumps(partitions[i])))
    print("...saved")

    return False
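# NOTE: A typical sequence for using the partitioning above, as suggested by
# the --worker flags in the other scripts (the exact driver invocation is not
# shown in this file set, so the script names below are assumptions):
#
#   notebooks1 = s3_to_df("csv/notebooks1.csv")
#   repos1 = s3_to_df("csv/repos1.csv")
#   owners1 = s3_to_df("csv/owners1.csv")
#   parallelize_download(notebooks1, repos1, owners1, local=False)
#   # ...then each worker i runs the download script with: --worker i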