def sleep(github):
    """Block until the GitHub API quota allows further calls.

    Checks both the core and the search rate limits; if either is
    exhausted, delegates to Common_Utilities.go_to_sleep for the
    appropriate reset window.
    """
    limits = github.get_rate_limit()
    core_remaining = limits.core.remaining
    search_remaining = limits.search.remaining
    if core_remaining == 0:
        # Core quota resets hourly.
        Common_Utilities.go_to_sleep(
            "API hour limit exceeded,Go to sleep for ", 3600)
    if search_remaining == 0:
        # Search quota resets every minute; 61s leaves a safety margin.
        Common_Utilities.go_to_sleep(
            "API minute limit exceeded,Go to sleep for ", 61)
def get_top_repos():
    """Collect the names of the top GitHub repos pushed within the configured time span.

    Reads settings from the ini config, truncates the output file, builds the
    GitHub search query, and delegates the paginated star-ordered search to
    query_repo().
    """
    print("Retrieving list of top repos... \n")
    config_dict = Common_Utilities.read_config_file()  # read all ini data
    # Only consider repos that have pushed in the last TIME_SPAN days.
    time_span = int(config_dict["TIME_SPAN"])
    push_date = date.today() - datetime.timedelta(days=time_span)
    quick_sleep = int(
        config_dict["QUICK_SLEEP"])  # regular sleep after each iteration
    error_sleep = int(
        config_dict["ERROR_SLEEP"]
    )  # sleep after a serious issue is detected from GitHub, should be around 10min, ie 600 sec
    max_size = int(
        config_dict["MAXSIZE"]
    )  # maximum number of repos to look for (top X); configured in config file
    github = Github(config_dict["TOKEN"])  # pass the connection token
    # This is the output file that repo names are appended to; truncate it
    # up-front with a context manager (the old open()/close() pair leaked the
    # handle if open() succeeded but close() was never reached).
    output_file_name = "scripts/Popularity/Top_Repo.txt"
    with open(output_file_name, "w"):
        pass
    query = "pushed:>" + str(
        push_date) + " " + config_dict["REPO_SEARCH_QUERY"]
    print(query)
    query_repo(output_file_name, query, github, quick_sleep, error_sleep,
               max_size)
    print("\nFinally ..... Execution is over \n")
def getLastDiscussedDates():
    """Record, per library, the creation dates of its newest StackOverflow questions.

    For each library's SO tag, fetches the most recent questions tagged with
    both the library tag and 'java', formats up to 10 creation dates as a
    semicolon-separated string, and persists the result via saveData().
    Stores None for a tag when no questions are available.
    """
    config_dict = Common_Utilities.read_config_file()  # read all config data
    user_api_key = config_dict["SO_TOKEN"]
    so = stackexchange.Site(stackexchange.StackOverflow,
                            app_key=user_api_key,
                            impose_throttling=True)
    so.be_inclusive()
    data = loadLastDiscussedSOData()
    for library in Library.objects.all():
        tag = library.so_tag
        questions = so.questions(sort='creation',
                                 order='DESC',
                                 tagged=[tag, 'java'])
        # Build the date list with join() instead of repeated string
        # concatenation; also use `is None` rather than `== None`.
        parts = []
        if questions is not None:
            for i in range(min(10, len(questions))):
                parts.append(questions[i].creation_date.strftime(
                    "%m/%d/%Y, %H:%M:%S") + " UTC")
        data[tag] = ';'.join(parts) if parts else None
    saveData(data)
def get_issues():
    """Fetch and classify issue data from both JIRA and GitHub.

    Builds the performance and security classifiers once and hands them to
    the per-tracker collectors so both sources share the same models.
    """
    perf_clf = PerformanceClassifier()
    sec_clf = SecurityClassifier()
    config_dict = Common_Utilities.read_config_file()  # read all config data
    lib_data_json = read_json_file(config_dict["LIBRARY_LIST"])
    print("Getting JIRA issue data")
    getIssueDataJIRA(lib_data_json, perf_clf, sec_clf)
    print("Getting GitHub issue data")
    getIssueData(config_dict["TOKEN"], perf_clf, sec_clf)
def addlibraries():
    """Sync the Domain/Library tables with the JSON library list from the config.

    Creates or updates a Library row (and its Domain) for each JSON entry,
    then deletes any DB libraries that no longer appear in the file so the
    DB mirrors the config exactly.
    """
    config_dict = Common_Utilities.read_config_file()
    libraries = read_json_file(config_dict["LIBRARY_LIST"])
    # (Removed unused locals: lines, repositories, entrymonth, entryyear,
    # and a `date` variable that shadowed the imported datetime.date.)
    for entry in libraries:
        domain_name = entry['Domain']
        library_name = entry['LibraryName']
        # Reuse the Domain row if present, otherwise create it; first()
        # avoids the separate exists()/get() round trips.
        domain = Domain.objects.filter(name=domain_name).first()
        if domain is None:
            domain = Domain()
            domain.name = domain_name
            domain.save()
        # Create new lib if it doesn't exist. Otherwise, update entry.
        library = Library.objects.filter(name=library_name).first()
        if library is None:
            library = Library()
        library.name = library_name
        library.so_tag = entry['SOtags']
        library.domain = domain
        library.package = entry['Package']
        library.github_repo = entry['FullRepoName']
        library.github_url = entry['GitHubURL']
        library.jira_url = entry['JIRAURL']
        library.maven_url = entry['MavenURL']
        library.save()
    # Remove any libraries in the DB that are not in the config file
    # (i.e., sync file and DB lib list).
    libraries_to_consider = {entry['LibraryName'] for entry in libraries}
    for library in Library.objects.all():
        if library.name not in libraries_to_consider:
            print("Library ", library.name,
                  " no longer exists in config file. Removing its data")
            library.delete()
def get_popularity():
    """Compute the dependent count for each configured library and write totals to file."""
    print("Getting popularity")
    config_dict = Common_Utilities.read_config_file()  # read all config data
    library_dict = read_libraries(
        config_dict["LIBRARY_LIST"])  # read all libraries to search against
    # This is the output file that libraries with their total counts go to;
    # truncate it with a context manager so the handle is always released.
    output_file_name = config_dict["POPULARITY_OUTPUT_FILE"]
    with open(output_file_name, "w"):
        pass
    # Only the repo values are used here, so iterate values() instead of
    # items() with an unused key.
    for repo in library_dict.values():
        print("for lib", repo)
        num_dependents = get_num_dependents(repo)
        send_totals_to_file(output_file_name, repo, num_dependents)
def getLicenses():
    """Look up and persist the GitHub license name for every library.

    Saves the data dict after each library so progress survives a crash.
    Records the string 'None' for repos whose license cannot be fetched.
    """
    config_dict = Common_Utilities.read_config_file()  # read all ini data
    data = loadLicenseData()
    github = Github(config_dict["TOKEN"])
    for library in Library.objects.all():
        print("Getting license for ", library.name)
        try:
            repository = github.get_repo(library.github_repo)
            data[library.github_repo] = repository.get_license().license.name
            saveData(data)
        except UnknownObjectException:
            # Repo or license not found on GitHub: log it, record 'None',
            # persist, and move on to the next library.
            print("ERROR: could not get license for lib", library.name)
            traceback.print_exc()
            data[library.github_repo] = 'None'
            saveData(data)
def save_chart_in_db(pygal_chart, domain, metric_name):
    """Store the rendered pygal chart as a blob for the (domain, metric) pair.

    Creates the Chart row if it does not exist, otherwise updates it in
    place. Logs and returns early when the metric name is unknown.
    """
    # Bug fix: Metric.objects.get() raises DoesNotExist when the metric is
    # missing, so the original `if not metric` guard was dead code. Use
    # filter().first() so the guard can actually trigger, and report the
    # looked-up name (the old message printed the None object).
    metric = Metric.objects.filter(name=metric_name).first()
    if not metric:
        print("No metric object found for metric: ", metric_name)
        return
    # Create new chart if it doesn't exist. Otherwise, update entry.
    # first() replaces the original's redundant second filter().get() query.
    chart = Chart.objects.filter(domain=domain).filter(metric=metric).first()
    if chart is None:
        chart = Chart()
        chart.domain = domain
        chart.metric = metric
    chart.chart = Common_Utilities.chart_to_blob(pygal_chart)
    chart.save()
def run():
    """Top-level pipeline: collect all metrics, fill the DB, and stage output files."""
    addlibraries()
    get_popularity()
    get_release_freq()
    getLicenses()
    getLastModificationDates()
    getLastDiscussedDates()
    get_issues()
    # Must be called after releases are fetched, so after the release
    # frequency metric.
    get_breaking_changes()
    config_dict = Common_Utilities.read_config_file()
    output_path = config_dict["OUTPUT_PATH"]
    shutil.copy2('scripts/Popularity/popularity_results.txt', output_path)
    # Stage every intermediate .pkl next to the popularity results.
    for pattern in (r'scripts/License/*.pkl',
                    r'scripts/LastModificationDate/*.pkl',
                    r'scripts/LastDiscussedOnStackOverflow/*.pkl'):
        for file in glob.glob(pattern):
            shutil.copy2(file, output_path)
    filldb()
    try:
        os.mkdir(output_path + "charts")
    except FileExistsError:
        # Narrowed from a bare except: only an existing directory is
        # expected here; permission/path errors should surface.
        print("Charts directory already exists")
    for file in glob.glob(output_path + r'*_chart.pkl'):
        shutil.copy2(file, output_path + "charts")
    # Clean up the staged intermediates now that the charts are copied.
    for file in glob.glob(output_path + r'*.pkl'):
        os.remove(file)
    os.remove(output_path + "popularity_results.txt")
def getLastModificationDates():
    """Persist, per library, the author dates of its 10 most recent commits.

    Dates are formatted as a semicolon-separated UTC string keyed by the
    library's GitHub repo name; an empty string means no commits were seen.
    """
    config_dict = Common_Utilities.read_config_file()  # read all config data
    data = loadLastModificationDateData()
    github = Github(config_dict["TOKEN"])
    for library in Library.objects.all():
        repo = github.get_repo(library.github_repo)
        stamps = []
        # get_commits() yields newest-first; keep only the first 10.
        for commit in repo.get_commits():
            if len(stamps) == 10:
                break
            stamps.append(commit.commit.author.date.strftime(
                "%m/%d/%Y, %H:%M:%S") + " UTC")
        data[library.github_repo] = ';'.join(stamps)
    saveData(data)
def get_release_freq():
    """Fetch release dates for all libraries using the configured GitHub token."""
    # Thin wrapper: all the real work happens in getReleaseDates().
    token = Common_Utilities.read_config_file()["TOKEN"]
    getReleaseDates(token)
def query_repo(output_file_name, base_query, github, quick_sleep, error_sleep,
               max_size):
    """Collect up to max_size repo full names matching base_query, ordered by stars.

    GitHub search only exposes the first ~1000 results per query, so the
    outer loop repeatedly narrows the star range: after each pass the query
    is rebuilt as "stars:100..<lowest star count seen>" to page deeper into
    the ranking. Collected names are written to output_file_name; on any
    top-level failure, whatever was collected so far is still written out.

    NOTE(review): error_sleep is accepted but never used in this body —
    confirm whether it was meant for the abuse-detection path.
    """
    repo_set = set()
    try:
        #check github for rate limit
        rate_limit = github.get_rate_limit()
        rate = rate_limit.search
        print("The rate limit is %d" % rate.limit)
        if rate.remaining == 0:
            print('You have 0/%d API calls remianing. Reset time: %d' %
                  (rate.limit, rate.reset))
            Common_Utilities.go_to_sleep(
                "Reached API limit per minute, Going to sleep for ",
                quick_sleep)
        else:
            print('You have %d/%d API calls remaining' %
                  (rate.remaining, rate.limit))
        print('Base query: %s' % base_query)
        # First pass: everything above 100 stars, sorted descending.
        curr_query = base_query + " stars:>100"
        while len(repo_set) < max_size:
            print("Collected ", len(repo_set), " repos so far")
            print(curr_query)
            result = github.search_repositories(curr_query,
                                                sort='stars',
                                                order='desc')
            cnt = 0
            pgno = 1
            # 300 is how many repo's the script reads at a time (it was kept
            # at 300 as reading more than that may result in a crash of the
            # Github object
            while cnt <= 300:
                try:
                    for repo in result.get_page(pgno):
                        repo_set.add(repo.full_name)
                        cnt = cnt + 1
                        # Remember the last (lowest) star count on this page;
                        # it seeds the narrowed query for the next pass.
                        stars = repo.stargazers_count
                        if len(repo_set) == max_size:
                            break
                # NOTE(review): bare except — presumably meant for the
                # per-minute search rate limit; it retries the same page
                # after sleeping, but also swallows any other error.
                except:
                    Common_Utilities.go_to_sleep(
                        "API limit exceeded, Going to sleep for ",
                        quick_sleep)
                    continue
                if len(repo_set) == max_size:
                    break
                pgno = pgno + 1
            # Narrow the star window to go past the search result cap.
            # NOTE(review): if the first page yielded nothing, `stars` is
            # unbound here — confirm upstream guarantees a non-empty result.
            curr_query = base_query + " stars:100.." + str(stars)
        output_to_file(output_file_name, repo_set)
    # error detection, just in case
    except Exception as e:
        # Best-effort: flush whatever was collected before the failure.
        output_to_file(output_file_name, repo_set)
        print(
            "Error: abuse detection mechanism detected.. outputting what we have..."
        )
        traceback.print_exc()