def patch_issues(save_to_external_file):
    with open(repos_filtered_heuristic_metrics, 'r') as f:
        repos = json.load(f)
    enriched_result = list()
    counter = 1
    for p in repos:
        print("Patching issues for repo number " + str(counter) + " --- " + p['id'])
        counter += 1
        if(p['source'] == "github"):
            mined_issue_data = get_issues_github(p)
            p['num_issues'] = mined_issue_data[0]
            p['open_issues'] = mined_issue_data[1]
            p['closed_issues'] = mined_issue_data[2]
        else:
            p['num_issues'] = "NA"
            p['open_issues'] = "NA"
            p['closed_issues'] = "NA"
        enriched_result.append(p)
    if(save_to_external_file):
        c.save(repos_filtered_heuristic_metrics, enriched_result)
    else:
        csv.register_dialect('tab_separated_csv', delimiter='\t', quoting=csv.QUOTE_ALL, skipinitialspace=True)
        to_save = list()
        for p in enriched_result:
            to_save.append([p['id'], p['num_issues'], p['open_issues'], p['closed_issues']])
        with open("./repos_mining_data/otherData/issues_patch.csv", 'w') as f:
            writer = csv.writer(f, dialect='tab_separated_csv')
            for row in to_save:
                writer.writerow(row)
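# --- Illustrative sketch, not the project's actual helper -------------------
# patch_issues() above assumes that get_issues_github(p) returns a
# (total, open, closed) tuple of issue counts for a GitHub repository.
# A hypothetical minimal version using the public GitHub search API could look
# like the function below; the real helper may use authentication, pagination,
# or a different endpoint.
import requests

def get_issues_github_sketch(repo):
    base_url = "https://api.github.com/search/issues?q=repo:" + repo['id'] + "+type:issue"
    open_issues = requests.get(base_url + "+state:open").json().get('total_count', 0)
    closed_issues = requests.get(base_url + "+state:closed").json().get('total_count', 0)
    return (open_issues + closed_issues, open_issues, closed_issues)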
def synchronize_games(self):
    if config.cloudfolder is None:
        folderpicker = SyncfolderPicker()
        if not folderpicker.exec():
            return
        config.cloudfolder = folderpicker.get_syncfolder()
        config.save()
        config.load()
    remembered_resolution_strategy = None
    model = self.list_found_games.model
    for row in range(0, model.rowCount()):
        item = model.item(row)
        game = item.game
        if item.checkState() == Qt.Checked:
            if savesync.has_conflicts(game):
                resolution_strategy = remembered_resolution_strategy
                if resolution_strategy is None:
                    conflict_resolution_dialog = ConflictResolutionDialog(game.name)
                    if conflict_resolution_dialog.exec():
                        result = conflict_resolution_dialog.get_dialog_result()
                        resolution_strategy = result[0]
                        if result[1]:
                            remembered_resolution_strategy = resolution_strategy
                if resolution_strategy == ConflictResolutionDialog.ResolutionMethod.OVERWRITE_LOCAL:
                    savesync.remove_local_savegame(game)
                    savesync.move_save_to_cloud(game)
                elif resolution_strategy == ConflictResolutionDialog.ResolutionMethod.OVERWRITE_CLOUD:
                    savesync.remove_cloud_savegame(game)
                    savesync.move_save_to_cloud(game)
            else:
                savesync.move_save_to_cloud(game)
    self.refresh_games()
def analyze_pair(rosmap_file_path, gh_file_path, merged_file_path):
    with open(rosmap_file_path, 'r') as rosmap:
        with open(gh_file_path, 'r') as gh:
            # we load the data
            rosmap_data = get_all_github_repos_data(json.load(rosmap))
            gh_reader = csv.DictReader(gh, delimiter='\t')
            rosmap_urls = list()
            for p in rosmap_data:
                rosmap_urls.append(get_rosmap_repo_substring(p['url']))
            results = rosmap_urls
            for line in gh_reader:
                try:
                    current_url = get_github_repo_substring(line['url'])
                    if(not current_url in rosmap_urls):
                        results.append(current_url)
                except (AttributeError, TypeError, IndexError):
                    print("Error for: " + str(line))
    c.save(merged_file_path, results)
    print(rosmap_file_path + " + " + gh_file_path + " = " + str(len(results)) + ". Saved in: " + merged_file_path)
def patch_languages(save_to_external_file):
    github_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_github_no_simul.json', 'r'))
    bitbucket_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_bitbucket_no_simul.json', 'r'))
    with open(repos_filtered_heuristic_metrics, 'r') as f:
        repos = json.load(f)
    enriched_result = list()
    counter = 1
    for p in repos:
        print("Patching languages for repo number " + str(counter) + " --- " + p['id'])
        counter += 1
        if(p['source'] == "github"):
            p['language'] = get_repo_data(p['id'], github_no_simul)
        if(p['source'] == "bitbucket"):
            p['language'] = get_repo_data(p['id'], bitbucket_no_simul)
        if(p['source'] == "gitlab"):
            p['language'] = "NA"
        enriched_result.append(p)
    if(save_to_external_file):
        c.save('./repos_mining_data/otherData/repos_filtered_launch_file_metrics_languages.json', enriched_result)
    else:
        csv.register_dialect('tab_separated_csv', delimiter='\t', quoting=csv.QUOTE_ALL, skipinitialspace=True)
        to_save = list()
        for p in enriched_result:
            to_save.append([p['id'], p['language']])
        with open("./repos_mining_data/otherData/languages_patch.csv", 'w') as f:
            writer = csv.writer(f, dialect='tab_separated_csv')
            for row in to_save:
                writer.writerow(row)
def start_detecting():
    with open(cloned_repos, 'r') as f:
        repos_list = json.load(f)
    counter = 1
    detection_result = list()
    for p in repos_list:
        print("Detecting files for repo number " + str(counter) + " --- " + p['id'])
        counter += 1
        p['xml_launch_files'] = detect_xml_launch_files(p)
        p['py_launch_files'] = detect_py_launch_files(p)
        detection_result.append(p)
    c.save(detection_result_path, detection_result)
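# --- Illustrative sketch, not the project's actual helper -------------------
# start_detecting() above assumes that detect_xml_launch_files(p) returns one
# record per *.launch file found in the clone, with the number of <node> and
# <include> elements it declares (the fields later used by the filtering
# heuristics). A hypothetical minimal version:
import os
import xml.etree.ElementTree as ET

def detect_xml_launch_files_sketch(repo):
    launch_files = list()
    for root, _, files in os.walk(repo['absolute_clone_path']):
        for name in files:
            if name.endswith('.launch'):
                path = os.path.join(root, name)
                try:
                    tree = ET.parse(path)
                except ET.ParseError:
                    continue
                launch_files.append({'path': path,
                                     'num_nodes': len(tree.findall('.//node')),
                                     'num_includes': len(tree.findall('.//include'))})
    return launch_files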
def crawl_data(app):
    try:
        # Tells the caller whether we just downloaded new data
        is_new_data_available = False

        # Download the Google Play metadata
        app_metadata = get_gp_metadata(app)
        app_latest_version = app_metadata['version']
        app_suffix_path = app['id'] + c.SEPARATOR + app_latest_version

        # Save the metadata if it is new
        metadata_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'metadata.json'
        if (not os.path.exists(metadata_path)):
            is_new_data_available = True
            c.save(metadata_path, app_metadata)

        # Save the reviews
        reviews_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'reviews.json'
        app_reviews = get_reviews(app)
        c.save(reviews_path, app_reviews)

        # Download the APK if it is new
        apk_path = c.APKS_PATH + app_suffix_path + '.apk'
        if not os.path.exists(apk_path):
            if not download_apk(app['id'], apk_path):
                print('Error while downloading the following app, we skip it: ' + app['id'])
                return False
            elif not apk_downloader.verify_apk(app['id'], apk_path, app_suffix_path):
                print('The downloaded APK is not well formed, we skip it: ' + apk_path)
                return False

        app['latest_crawled_version'] = app_latest_version
        app['latest_crawl'] = int(time.time())

        # Let's inform the user about whether new data has been crawled
        if is_new_data_available:
            print('Crawled new data for: ' + app['id'] + ' - version: ' + app_latest_version)
        else:
            print('Already up to date: ' + app['id'] + ' - version: ' + app_latest_version)
        return is_new_data_available
    except Exception:
        print('It seems like we had some problems in fetching new data for: ' + app['id'] + '. So, we skip it in the analysis.')
        return False
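# --- Illustrative sketch, not the project's actual helper -------------------
# crawl_data() above assumes that get_gp_metadata(app) returns the Google Play
# metadata of the app, including its 'version' field. A hypothetical version
# based on the google-play-scraper package (the real helper may rely on a
# different scraping library or return additional fields):
from google_play_scraper import app as gp_app

def get_gp_metadata_sketch(app):
    return gp_app(app['id'], lang=app['store_lang'], country=app['store_country'])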
def get_repos_list_to_clone(file_path):
    gitlab_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_gitlab_no_simul.json', 'r'))
    bitbucket_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_bitbucket_no_simul.json', 'r'))
    github_no_simul = json.load(open('./repos_mining_data/intermediateResults/6_github_no_simul.json', 'r'))
    repos = list()
    for p in github_no_simul:
        repos.append({'id': p['full_name'], 'description': p['description'], 'web_url': p['html_url'],
                      'clone_url': p['clone_url'], 'default_branch': p['default_branch'], 'source': 'github'})
    for p in gitlab_no_simul:
        repos.append({'id': p['path_with_namespace'], 'description': p['description'], 'web_url': p['web_url'],
                      'clone_url': p['http_url_to_repo'], 'default_branch': p['default_branch'], 'source': 'gitlab'})
    for p in bitbucket_no_simul:
        repos.append({'id': p['full_name'], 'description': p['description'], 'web_url': p['links']['html']['href'],
                      'clone_url': p['links']['clone'][0]['href'], 'default_branch': p['mainbranch']['name'], 'source': 'bitbucket'})
    c.save(file_path, repos)
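# Regardless of the hosting platform, each record produced by
# get_repos_list_to_clone() follows the same normalized schema
# (illustrative values only):
# {
#     "id": "some-org/some-repo",
#     "description": "A short description of the repository",
#     "web_url": "https://github.com/some-org/some-repo",
#     "clone_url": "https://github.com/some-org/some-repo.git",
#     "default_branch": "master",
#     "source": "github"
# }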
def start_cloning():
    with open(repos_to_clone, 'r') as f:
        repos_list = json.load(f)
    counter = 1
    cloned_repos = list()
    for p in repos_list:
        print("Cloning repo number " + str(counter) + " --- " + p['id'])
        counter += 1
        absolute_path_to_clone = get_clone_path(p, True)
        local_path_to_clone = get_clone_path(p, False)
        clone_repo(p, absolute_path_to_clone)
        p['absolute_clone_path'] = absolute_path_to_clone
        p['local_clone_path'] = local_path_to_clone
        cloned_repos.append(p)
    c.save(cloned_repos_json, cloned_repos)
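# --- Illustrative sketch, not the project's actual helper -------------------
# start_cloning() above assumes that clone_repo(p, path) clones p['clone_url']
# into the given absolute path, on the repository's default branch.
# A hypothetical version shelling out to the git CLI:
import subprocess

def clone_repo_sketch(repo, absolute_path_to_clone):
    subprocess.run(['git', 'clone', '--branch', repo['default_branch'],
                    repo['clone_url'], absolute_path_to_clone], check=True)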
def init_ui(self):
    layout = QGridLayout(self)

    label_found_games = QLabel("Found games", self)
    layout.addWidget(label_found_games, 0, 0)
    label_synchronized_games = QLabel("Synchronized games", self)
    layout.addWidget(label_synchronized_games, 0, 1)

    self.list_found_games = GameList(self)
    layout.addWidget(self.list_found_games, 1, 0)
    self.list_synchronized_games = GameList(self)
    layout.addWidget(self.list_synchronized_games, 1, 1)

    button_synchronize = QPushButton("Synchronize selected", self)
    button_synchronize.clicked.connect(self.synchronize_games)
    layout.addWidget(button_synchronize, 2, 0)
    button_unsynchronize = QPushButton("Unsynchronize selected", self)
    button_unsynchronize.clicked.connect(self.unsynchronize_games)
    layout.addWidget(button_unsynchronize, 2, 1)

    button_group = QWidget(self)
    button_group_layout = QVBoxLayout(button_group)
    button_update = QPushButton("Update supported games list", self)
    button_update.clicked.connect(self.update_games_list)
    button_group_layout.addWidget(button_update)
    button_change_syncfolder = QPushButton("Change sync folder", button_group)
    button_change_syncfolder.clicked.connect(self.change_sync_folder)
    button_group_layout.addWidget(button_change_syncfolder)
    layout.addWidget(button_group, 1, 3, Qt.AlignTop)

    if not config.exists():
        folderpicker = SyncfolderPicker()
        if folderpicker.exec():
            config.cloudfolder = folderpicker.get_syncfolder()
            config.save()
    if config.exists():
        config.load()
        self.refresh_games()

    self.center()
    self.show()
def collect_data():
    apps = json.load(open(c.APPS_PATH, 'r'))
    for a in apps:
        # Crawl data from the Google Play store
        crawled_new_data = crawler.crawl_data(a)
        # if the app has a new release we did not analyze before...
        if crawled_new_data:
            # Launch the Androguard and Androwarn analyses
            androguard_androwarn_analyzer.analyze(a)
            # Analyze the servers pointed to by the URLs we found in the Androguard string analysis
            servers_analyzer.analyze(a)
    # Finally, if everything went well, save the updated apps.json file with the new timestamps and versions
    c.save(c.APPS_PATH, apps)
def patch_contributors(save_to_external_file):
    with open(repos_filtered_heuristic_metrics, 'r') as f:
        repos = json.load(f)
    enriched_result = list()
    for p in repos:
        p['num_contributors'] = get_contributors_locally(p['local_clone_path'])
        enriched_result.append(p)
    if(save_to_external_file):
        c.save(repos_filtered_heuristic_metrics, enriched_result)
    else:
        csv.register_dialect('tab_separated_csv', delimiter='\t', quoting=csv.QUOTE_ALL, skipinitialspace=True)
        to_save = list()
        for p in enriched_result:
            to_save.append([p['id'], p['num_contributors']])
        with open("./repos_mining_data/otherData/locally_identified_contributors.csv", 'w') as f:
            writer = csv.writer(f, dialect='tab_separated_csv')
            for row in to_save:
                writer.writerow(row)
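# --- Illustrative sketch, not the project's actual helper -------------------
# patch_contributors() above assumes that get_contributors_locally(path)
# counts the distinct commit authors in the local clone. A hypothetical
# version based on `git log`:
import subprocess

def get_contributors_locally_sketch(local_clone_path):
    output = subprocess.run(['git', '-C', local_clone_path, 'log', '--format=%ae'],
                            capture_output=True, text=True, check=True).stdout
    return len({line for line in output.splitlines() if line.strip()})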
def update_apps_lists(root_path, countries):
    for e in countries:
        country = e['code']
        lang = e['lang']
        data_path = root_path + '/data_' + country
        if not os.path.exists(data_path):
            print('The country with code "' + country + '" is new, I am setting up its folder and apps.json file now...')
            os.mkdir(data_path)
            os.mkdir(data_path + '/apks')
            os.mkdir(data_path + '/data')
            os.mkdir(data_path + '/reports')
            c.save(data_path + '/apps.json', [])
        c.setPaths(data_path)
        url = 'https://play.google.com/store/search?q=covid&gl=' + country
        r = requests.get(url)
        search_page = BeautifulSoup(r.text, 'html.parser')
        # We look for all the links referring to the apps listed by the search
        apps = search_page.find_all("a", href=re.compile(r'^\/store\/apps\/details\?id=*'))
        app_ids = list()
        # We collect all app ids
        for a in apps:
            app_ids.append(a['href'].replace('/store/apps/details?id=', ''))
        # Remove duplicate ids
        app_ids = list(dict.fromkeys(app_ids))
        # Now we iterate over all apps and add the new ones to the apps.json file
        analysed_apps = json.load(open(c.APPS_PATH, 'r'))
        for a in app_ids:
            if is_new(a, analysed_apps):
                analysed_apps.append({'id': a, 'store_country': country, 'store_lang': lang})
        c.save(c.APPS_PATH, analysed_apps)
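# The `countries` argument of update_apps_lists() is expected to be a list of
# dictionaries with a store country code and a language code, e.g.
# (illustrative values):
# countries = [{'code': 'us', 'lang': 'en'}, {'code': 'it', 'lang': 'it'}]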
def collect_data(input_path, sonarqube):
    print("Sonarqube = " + str(sonarqube))
    apps = json.load(open(c.APPS_PATH, 'r'))
    for a in apps:
        # Crawl data from the Google Play store
        crawled_new_data = crawler.crawl_data(a)
        # if the app has a new release we did not analyze before...
        if crawled_new_data:
            # Launch the Androguard and Androwarn analyses
            androguard_androwarn_analyzer.analyze(a)
            # Analyze the servers pointed to by the URLs we found in the Androguard string analysis
            servers_analyzer.analyze(a)
    # The SonarQube analysis can be requested via the -S flag when invoking the program
    if sonarqube:
        SonarQube.sq_analyze(input_path)
    # Finally, if everything went well, save the updated apps.json file with the new timestamps and versions
    c.save(c.APPS_PATH, apps)
def analyze(app):
    result = {}

    # We open the APK
    apk_path = c.get_apk_path(app)

    # Here we check if the APK is actually there, otherwise we skip the analysis
    if (not os.path.exists(apk_path)):
        return
    a, d, dx = AnalyzeAPK(apk_path)

    # Get all the permissions requested by the app
    requested_permissions = a.get_permissions()

    # Get all the Android activities of the app
    activities = a.get_activities()

    # Get all String constants in the app presumably containing a URL
    urls = list()
    for u in dx.find_strings("http[s]?://."):
        urls.append(u.get_value())

    # We pack together all the partial results
    result['permissions'] = requested_permissions
    result['activities'] = activities
    result['urls'] = urls

    # We save the result into a JSON file
    app_suffix_path = app['id'] + c.SEPARATOR + app['latest_crawled_version']
    result_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'androguard.json'
    c.save(result_path, result)

    # Now we also run the Androwarn analysis (with no Play Store lookup)
    data = perform_analysis(apk_path, a, d, dx, False)

    # We generate the JSON report with the following parameters
    # Verbosity level: 3 (advanced)
    # Report type: json
    # Output path: same pattern as all the other JSON files produced so far
    androwarn_report_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'androwarn.json'
    generate_report(app['id'], data, 3, 'json', androwarn_report_path)
def change_sync_folder(self):
    folderpicker = SyncfolderPicker()
    if not folderpicker.exec():
        return
    new_cloudfolder = folderpicker.get_syncfolder()
    if new_cloudfolder == config.cloudfolder:
        return
    games = savesync.detect_games()
    games_in_old_folder = [game for game in games if os.path.isdir(os.path.join(config.cloudfolder, game.id))]
    if len(games_in_old_folder) > 0:
        move_dialog = MoveSyncfolderDialog()
        if not move_dialog.exec():
            return
        move = move_dialog.move
        for game in games_in_old_folder:
            if move:
                savesync.move_game_to_other_cloud(game, new_cloudfolder)
    config.cloudfolder = new_cloudfolder
    config.save()
    config.load()
    self.refresh_games()
def analyze(app):
    print('Analyzing the servers mentioned by: ' + app['id'])
    result = list()
    urls = get_candidate_urls(app)
    for url in urls:
        # Here is where we do the real Whois query
        try:
            domain_info = whois.query(url, force=1, slow_down=2)
            # We transform the domain object into a plain dictionary, otherwise we cannot save it into the JSON file
            item = domain_info.__dict__
            result.append(item)
        except Exception:
            print('Error performing the whois lookup for this server, it will be ignored: ' + url)
    # We save the result into a JSON file
    app_suffix_path = app['id'] + c.SEPARATOR + app['latest_crawled_version']
    result_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'servers.json'
    c.save(result_path, result)
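# --- Illustrative sketch, not the project's actual helper -------------------
# analyze() above assumes that get_candidate_urls(app) returns the distinct
# host names extracted from the URLs found by the Androguard string analysis,
# so that each of them can be queried via whois. A hypothetical version
# reading the androguard.json file produced earlier (the real helper may also
# strip subdomains before the lookup):
import json
from urllib.parse import urlparse

def get_candidate_urls_sketch(app):
    app_suffix_path = app['id'] + c.SEPARATOR + app['latest_crawled_version']
    androguard_path = c.DATA_PATH + app_suffix_path + c.SEPARATOR + 'androguard.json'
    with open(androguard_path, 'r') as f:
        urls = json.load(f).get('urls', [])
    hosts = {urlparse(u).netloc for u in urls if urlparse(u).netloc}
    return sorted(hosts)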
def apply_filtering_heuristics():
    with open(detection_result_path, 'r') as f:
        repos = json.load(f)
    with_launch_file = 0
    final_filtered = 0
    collected_xml_launch_files = list()
    collected_py_launch_files = list()
    filtered_repos = list()
    for p in repos:
        # Check 1: the repo must contain at least one launch file
        if ((len(p['xml_launch_files']) > 0) or (len(p['py_launch_files']) > 0)):
            with_launch_file += 1
            total_nodes = 0
            total_includes = 0
            for el in p['xml_launch_files']:
                total_nodes += el['num_nodes']
                total_includes += el['num_includes']
            for el in p['py_launch_files']:
                total_nodes += el['num_nodes']
                total_includes += el['num_includes']
            collected_xml_launch_files.append(len(p['xml_launch_files']))
            collected_py_launch_files.append(len(p['py_launch_files']))
            # Check 2: the repo must declare at least 2 nodes or at least 1 include statement
            if (total_nodes >= 2 or total_includes >= 1):
                final_filtered += 1
                filtered_repos.append(p)
    c.save(filtered_heuristic, filtered_repos)
    print("Total number of XML launch files: " + str(sum(collected_xml_launch_files)))
    print("Details: " + str(collected_xml_launch_files))
    print("Total number of Python launch files: " + str(sum(collected_py_launch_files)))
    print("Details: " + str(collected_py_launch_files))
    print("Repos with either an XML or a Python launch file: " + str(with_launch_file))
    print("Repos with at least 2 nodes or at least 1 include statement: " + str(final_filtered))
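# Example of what the heuristic above counts (illustrative content, not taken
# from any mined repository): the launch file below contributes num_nodes = 2
# and num_includes = 1, so a repository containing only this file would pass
# the "total_nodes >= 2 or total_includes >= 1" check.
#
# <launch>
#   <include file="$(find my_pkg)/launch/base.launch" />
#   <node pkg="my_pkg" type="planner_node" name="planner" />
#   <node pkg="my_pkg" type="controller_node" name="controller" />
# </launch>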
def collect_metrics_counts():
    with open(filtered_heuristic, 'r') as f:
        repos_list = json.load(f)
    counter = 1
    enriched_result = list()
    for p in repos_list:
        print("Collecting metrics for repo number " + str(counter) + " --- " + p['id'])
        counter += 1
        metrics = count_metrics(p)
        p['num_issues'] = metrics[0]
        p['num_pull_requests'] = metrics[1]
        p['num_commits'] = metrics[2]
        p['num_branches'] = metrics[3]
        p['num_releases'] = metrics[4]
        p['num_contributors'] = metrics[5]
        # here we double-check whether there are repos with fewer than NUM_COMMITS commits
        if(p['num_commits'] != "NA"):
            if(int(p['num_commits']) >= c.NUM_COMMITS):
                enriched_result.append(p)
            else:
                print("Discarded this repo because it has fewer than NUM_COMMITS commits: " + p['id'])
        else:
            enriched_result.append(p)
    c.save(repos_filtered_heuristic_metrics, enriched_result)
def start_analysis():
    with open('./otherData/rosmap_output.json', 'r') as outputfile:
        # we load the data
        data = json.load(outputfile)

    # in bitbucket_repos we will have the JSON representation of all the data we mined from the search API of bitbucket.org
    if(not os.path.isfile('./repos_mining_data/intermediateResults/0_all_bitbucket.json')):
        bitbucket_repos = get_all_bitbucket_repos_data(data)
        c.save('./repos_mining_data/intermediateResults/0_all_bitbucket.json', bitbucket_repos)
    else:
        bitbucket_repos = json.load(open('./repos_mining_data/intermediateResults/0_all_bitbucket.json', 'r'))

    # in gitlab_repos we will have the JSON representation of all the data we mined from the search API of gitlab.com
    # Notice that out of the 46 initial GitLab repos, 16 are not hosted on gitlab.com; we analyzed those repos manually, which led to no included results
    if(not os.path.isfile('./repos_mining_data/intermediateResults/0_all_gitlab.json')):
        gitlab_repos = get_all_gitlab_repos_data(data)
        c.save('./repos_mining_data/intermediateResults/0_all_gitlab.json', gitlab_repos)
    else:
        gitlab_repos = json.load(open('./repos_mining_data/intermediateResults/0_all_gitlab.json', 'r'))

    # in github_rosmap_repos we will have the JSON representation of all the data we mined from the ROSMAP search
    if(not os.path.isfile('./repos_mining_data/intermediateResults/0_rosmap_github.json')):
        github_rosmap_repos = get_all_github_repos_data(data)
        c.save('./repos_mining_data/intermediateResults/0_rosmap_github.json', github_rosmap_repos)
        # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
        commits_rosmap = to_dictionary(get_last_github_commits_api(github_rosmap_repos))
        c.save('./repos_mining_data/intermediateResults/0_rosmap_github_commits.json', commits_rosmap)
    else:
        # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
        github_rosmap_repos = to_dictionary(json.load(open('./repos_mining_data/intermediateResults/0_rosmap_github.json', 'r')))
        commits_rosmap = json.load(open('./repos_mining_data/intermediateResults/0_rosmap_github_commits.json', 'r'))

    # in github_ghtorrent_repos we will have the JSON representation of all the data we mined from the search API of the GitHub platform
    # The starting point here is the data coming from the GHTorrent query as of the filtering step "Filter repositories with #commits < 100"
    if(not os.path.isfile('./repos_mining_data/intermediateResults/2_ghtorrent_github.json')):
        # we load the data from the output of the GHTorrent query
        with open("./ghtorrentIntermediateResults/2_github_num_commits.txt", 'r') as gh:
            gh_reader = csv.DictReader(gh, delimiter='\t')
            data_ghtorrent = list()
            for line in gh_reader:
                try:
                    data_ghtorrent.append(line)
                except (AttributeError, TypeError, IndexError):
                    print("Error for: " + str(line))
        github_ghtorrent_repos = get_ghtorrent_github_repos_data(data_ghtorrent)
        c.save('./repos_mining_data/intermediateResults/2_ghtorrent_github.json', github_ghtorrent_repos)
        # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
        commits_ghtorrent = to_dictionary(get_last_github_commits_api(github_ghtorrent_repos))
        c.save('./repos_mining_data/intermediateResults/2_ghtorrent_github_commits.json', commits_ghtorrent)
    else:
        # here we transform into a dictionary for eliminating duplicates and for easing the rest of the filtering
        github_ghtorrent_repos = to_dictionary(json.load(open('./repos_mining_data/intermediateResults/2_ghtorrent_github.json', 'r')))
        commits_ghtorrent = json.load(open('./repos_mining_data/intermediateResults/2_ghtorrent_github_commits.json', 'r'))

    start_bitbucket_analysis(bitbucket_repos, data)
    start_gitlab_analysis(gitlab_repos)
    start_github_analysis(github_rosmap_repos, commits_rosmap, github_ghtorrent_repos, commits_ghtorrent, True)
def start_bitbucket_analysis(repos, rosmap_data):
    filtered_repos = list()
    print("0 - BitBucket Initial search: " + str(len(repos)))

    # Filter fork repositories
    for p in repos:
        if(not ('parent' in p)):
            filtered_repos.append(p)
    print("1 - BitBucket Filter fork repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/1_bitbucket_forks.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter repositories with #commits < NUM_COMMITS
    if(not os.path.isfile('./repos_mining_data/intermediateResults/2_bitbucket_commits.json')):
        for p in repos:
            if(get_bitbucket_commits(p['links']['clone'][0]['href']) >= c.NUM_COMMITS):
                filtered_repos.append(p)
        c.save('./repos_mining_data/intermediateResults/2_bitbucket_commits.json', filtered_repos)
        repos = filtered_repos
        print("2 - BitBucket Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    else:
        repos = json.load(open('./repos_mining_data/intermediateResults/2_bitbucket_commits.json', 'r'))
        print("2 - BitBucket Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    filtered_repos = list()

    # Filter repositories with at least NUM_STARS stars
    for p in repos:
        stars = get_rosmap_project(p['links']['html']['href'], rosmap_data)['stars']
        if(stars >= NUM_STARS):
            filtered_repos.append(p)
    print("3 - BitBucket Filter repositories with at least " + str(NUM_STARS) + " stars: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/3_bitbucket_stars.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter demo repositories (keywords: "demo", "course", "thesis", "exame")
    for p in repos:
        if(p['description'] is not None):
            if(not (("demo" in p['description'].lower()) or ("course" in p['description'].lower()) or ("thesis" in p['description'].lower()) or ("exame" in p['description'].lower()))):
                if(not (("demo" in p['full_name'].lower()) or ("course" in p['full_name'].lower()) or ("thesis" in p['full_name'].lower()) or ("exame" in p['full_name'].lower()))):
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("4 - BitBucket Filter DEMO repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/4_bitbucket_no_demo.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter tools repositories (keywords: "tool", "util", "helper", "library", "plugin", "plug-in")
    for p in repos:
        if(p['description'] is not None):
            if(not (("tool" in p['description'].lower()) or ("util" in p['description'].lower()) or ("helper" in p['description'].lower()) or ("library" in p['description'].lower()) or ("plugin" in p['description'].lower()) or ("plug-in" in p['description'].lower()))):
                if(not (("tool" in p['full_name'].lower()) or ("util" in p['full_name'].lower()) or ("helper" in p['full_name'].lower()) or ("library" in p['full_name'].lower()) or ("plugin" in p['full_name'].lower()) or ("plug-in" in p['full_name'].lower()))):
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("5 - BitBucket Filter TOOLS repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/5_bitbucket_no_tools.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter simulation-oriented repositories (keywords: "simulat", "gazebo")
    for p in repos:
        if(p['description'] is not None):
            if(not (("simulat" in p['description'].lower()) or ("gazebo" in p['description'].lower()))):
                if(not (("simulat" in p['full_name'].lower()) or ("gazebo" in p['full_name'].lower()))):
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("6 - BitBucket Filter SIMULATION repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/6_bitbucket_no_simul.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    print("BitBucket analysis done.")
def start_gitlab_analysis(repos):
    filtered_repos = list()
    print("0 - GitLab Initial search: " + str(len(repos)))

    # Filter fork repositories
    for p in repos:
        if((not ('fork' in p['name_with_namespace'])) and (not ('fork' in p['description']))):
            filtered_repos.append(p)
    print("1 - GitLab Filter fork repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/1_gitlab_forks.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter repositories with #commits < NUM_COMMITS
    if(not os.path.isfile('./repos_mining_data/intermediateResults/2_gitlab_commits.json')):
        for p in repos:
            commits = 0
            response = form_request(p['web_url'])
            try:
                if response.status == 200:
                    data = response.data
                    commits = int(re.findall(r"[\d]+</strong> Commits</a>", str(data))[0].split("<")[0])
                else:
                    print("error: " + p['web_url'])
            except Exception:
                commits = 0
            if(commits >= c.NUM_COMMITS):
                filtered_repos.append(p)
        c.save('./repos_mining_data/intermediateResults/2_gitlab_commits.json', filtered_repos)
        repos = filtered_repos
        print("2 - GitLab Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    else:
        repos = json.load(open('./repos_mining_data/intermediateResults/2_gitlab_commits.json', 'r'))
        print("2 - GitLab Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(repos)))
    filtered_repos = list()

    # Filter repositories with at least NUM_STARS stars
    for p in repos:
        if(p['star_count'] >= NUM_STARS):
            filtered_repos.append(p)
    print("3 - GitLab Filter repositories with at least " + str(NUM_STARS) + " stars: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/3_gitlab_stars.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter demo repositories (keywords: "demo", "course", "thesis", "exame")
    for p in repos:
        if(p['description'] is not None):
            if(not (("demo" in p['description'].lower()) or ("course" in p['description'].lower()) or ("thesis" in p['description'].lower()) or ("exame" in p['description'].lower()))):
                if(not (("demo" in p['path_with_namespace'].lower()) or ("course" in p['path_with_namespace'].lower()) or ("thesis" in p['path_with_namespace'].lower()) or ("exame" in p['path_with_namespace'].lower()))):
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("4 - GitLab Filter DEMO repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/4_gitlab_no_demo.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter tools repositories (keywords: "tool", "util", "helper", "library", "plugin", "plug-in")
    for p in repos:
        if(p['description'] is not None):
            if(not (("tool" in p['description'].lower()) or ("util" in p['description'].lower()) or ("helper" in p['description'].lower()) or ("library" in p['description'].lower()) or ("plugin" in p['description'].lower()) or ("plug-in" in p['description'].lower()))):
                if(not (("tool" in p['path_with_namespace'].lower()) or ("util" in p['path_with_namespace'].lower()) or ("helper" in p['path_with_namespace'].lower()) or ("library" in p['path_with_namespace'].lower()) or ("plugin" in p['path_with_namespace'].lower()) or ("plug-in" in p['path_with_namespace'].lower()))):
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("5 - GitLab Filter TOOLS repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/5_gitlab_no_tools.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter simulation-oriented repositories (keywords: "simulat", "gazebo")
    for p in repos:
        if(p['description'] is not None):
            if(not (("simulat" in p['description'].lower()) or ("gazebo" in p['description'].lower()))):
                if(not (("simulat" in p['path_with_namespace'].lower()) or ("gazebo" in p['path_with_namespace'].lower()))):
                    filtered_repos.append(p)
        else:
            filtered_repos.append(p)
    print("6 - GitLab Filter SIMULATION repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/6_gitlab_no_simul.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    print("GitLab analysis done.")
def start_github_analysis(rosmap_repos, rosmap_commits, ghtorrent_repos, ghtorrent_commits, jump_commits):
    filtered_repos = list()
    print("0 - GitHub ROSMAP Initial search: " + str(len(rosmap_repos)))

    # Filter ROSMAP fork repositories
    for key, p in rosmap_repos.items():
        if(p['fork'] == False):
            filtered_repos.append(p)
    print("1 - GitHub ROSMAP Filter fork repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/1_github_rosmap_no_forks.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter ROSMAP repositories with #commits < NUM_COMMITS
    # the check below is used just for saving time in case the number of commits has already been fetched from the web,
    # i.e., 2_github_rosmap_commits.json already exists and is up to date
    if(not jump_commits):
        for p in repos:
            if(count_commits(p, rosmap_commits) >= c.NUM_COMMITS):
                filtered_repos.append(p)
        print("2 - GitHub ROSMAP Filter repositories with at least " + str(c.NUM_COMMITS) + " commits: " + str(len(filtered_repos)))
        c.save('./repos_mining_data/intermediateResults/2_github_rosmap_commits.json', filtered_repos)
    else:
        with open('./repos_mining_data/intermediateResults/2_github_rosmap_commits.json', 'r') as outputfile:
            filtered_repos = json.load(outputfile)
    repos = filtered_repos
    filtered_repos = list()

    # MERGE the rosmap and ghtorrent repos
    repos = union_dictionaries(to_dictionary(repos), ghtorrent_repos)
    repos = repos.values()
    commits = union_dictionaries(to_dictionary_commits(rosmap_commits), to_dictionary_commits(ghtorrent_commits))
    commits = commits.values()
    print("MERGE - Merged lists of GitHub repos coming from the rosmap and the ghtorrent searches: " + str(len(repos)))

    # Filter repositories with at least NUM_STARS stars
    for p in repos:
        if(p['stargazers_count'] >= NUM_STARS):
            filtered_repos.append(p)
    print("3 - GitHub Filter repositories with at least " + str(NUM_STARS) + " stars: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/3_github_stars.json', filtered_repos)
    repos = filtered_repos
    filtered_repos = list()

    # Filter demo repositories (keywords: "demo", "tutorial", "course", "thesis", "exame")
    discarded = list()
    for p in repos:
        if(p['description'] is not None):
            if(not (("demo" in p['description'].lower()) or ("tutorial" in p['description'].lower()) or ("course" in p['description'].lower()) or ("thesis" in p['description'].lower()) or ("exame" in p['description'].lower()))):
                if(not (("demo" in p['full_name'].lower()) or ("tutorial" in p['full_name'].lower()) or ("course" in p['full_name'].lower()) or ("thesis" in p['full_name'].lower()) or ("exame" in p['full_name'].lower()))):
                    filtered_repos.append(p)
                else:
                    discarded.append(p)
            else:
                discarded.append(p)
        else:
            filtered_repos.append(p)
    print("4 - GitHub Filter DEMO repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/4_github_no_demo.json', filtered_repos)
    c.save('./repos_mining_data/intermediateResults/4_github_no_demo_discarded.json', discarded)
    repos = filtered_repos
    filtered_repos = list()
    discarded = list()

    # Filter tools repositories (keywords: "tool", "util", "helper", "library", "plugin", "plug-in")
    for p in repos:
        if(p['description'] is not None):
            if(not (("tool" in p['description'].lower()) or ("util" in p['description'].lower()) or ("helper" in p['description'].lower()) or ("library" in p['description'].lower()) or ("plugin" in p['description'].lower()) or ("plug-in" in p['description'].lower()))):
                if(not (("tool" in p['full_name'].lower()) or ("util" in p['full_name'].lower()) or ("helper" in p['full_name'].lower()) or ("library" in p['full_name'].lower()) or ("plugin" in p['full_name'].lower()) or ("plug-in" in p['full_name'].lower()))):
                    filtered_repos.append(p)
                else:
                    discarded.append(p)
            else:
                discarded.append(p)
        else:
            filtered_repos.append(p)
    print("5 - GitHub Filter TOOLS repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/5_github_no_tools.json', filtered_repos)
    c.save('./repos_mining_data/intermediateResults/5_github_no_tools_discarded.json', discarded)
    discarded = list()
    repos = filtered_repos
    filtered_repos = list()

    # Filter simulation-oriented repositories (keywords: "simulat", "gazebo")
    for p in repos:
        if(p['description'] is not None):
            if(not (("simulat" in p['description'].lower()) or ("gazebo" in p['description'].lower()))):
                if(not (("simulat" in p['full_name'].lower()) or ("gazebo" in p['full_name'].lower()))):
                    filtered_repos.append(p)
                else:
                    discarded.append(p)
            else:
                discarded.append(p)
        else:
            filtered_repos.append(p)
    print("6 - GitHub Filter SIMULATION repositories: " + str(len(filtered_repos)))
    c.save('./repos_mining_data/intermediateResults/6_github_no_simul.json', filtered_repos)
    c.save('./repos_mining_data/intermediateResults/6_github_no_simul_discarded.json', discarded)
    discarded = list()
    repos = filtered_repos
    filtered_repos = list()

    print("GitHub analysis done.")
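# --- Possible refactoring sketch, not part of the original pipeline ---------
# The demo/tools/simulation filters in the three functions above repeat the
# same keyword checks on the description and on the repository name. A
# hypothetical helper factoring out that pattern (repositories without a
# description are kept, as in the original code):
def contains_any(text, keywords):
    text = text.lower()
    return any(keyword in text for keyword in keywords)

def is_excluded(repo, name_field, keywords):
    if repo['description'] is None:
        return False
    return contains_any(repo['description'], keywords) or contains_any(repo[name_field], keywords)

# Example usage for the GitHub demo filter:
# demo_keywords = ["demo", "tutorial", "course", "thesis", "exame"]
# filtered_repos = [p for p in repos if not is_excluded(p, 'full_name', demo_keywords)]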