def callback_func(data):
    """Record one analysed commit and refresh the progress display.

    Appends ``data`` to the module-level ``results`` list, advances the
    module-level ``prog`` counter by one and reports it against the
    module-level ``total``.
    """
    global results, prog
    results.append(data)
    prog = prog + 1
    progress(prog, total, 'Analyzing commits')
def get_libraries(self):
    """Collect the libraries touched by each commit of the selected authors.

    The repository at ``self.basedir`` is copied to a temporary location and
    commits are checked out there, so the original working tree is never
    modified.  The temporary copy is removed before this method returns,
    whether it finishes normally or an exception propagates.

    Returns:
        dict: maps a commit hash to ``{language: [items]}``, where the items
        are whatever the language plugin's ``extract_libraries`` yielded for
        the files changed in that commit.  Commits for which nothing was
        collected are left out.
    """
    res = {}
    commits = _filter_commits_by_authors(self.commit_list, self.authors)

    # Before we do anything, copy the repo to a temporary location so that
    # we don't mess with the original repo
    tmp_repo_path = _get_temp_repo_path()
    print(
        "[%s] Copying the repository to a temporary location, this can take a while..."
        % datetime.now().strftime("%d/%m/%Y %H:%M:%S"))
    shutil.copytree(self.basedir, tmp_repo_path, symlinks=True)
    print("[%s] Finished copying the repository" %
          datetime.now().strftime("%d/%m/%Y %H:%M:%S"))

    # From here on the temporary copy exists on disk; the finally clause
    # guarantees it is removed even if git or a language plugin raises.
    try:
        # Initialise the tmp directory as a repo and hard reset, just in case
        repo = git.Repo(tmp_repo_path)
        repo.git.clean('-fd')
        repo.git.checkout('master')
        repo.git.reset('--hard')

        total = len(commits)
        for prog, commit in enumerate(commits, start=1):
            libs_in_commit = {}
            files = [
                os.path.join(tmp_repo_path, x.file_name)
                for x in commit.changed_files
            ]
            for lang, extensions in supported_languages.items():
                # Keep only the changed files whose extension belongs to
                # the current language
                lang_files = [
                    path for path in files
                    if pathlib.Path(path).suffix[1:].lower() in extensions
                ]
                if not lang_files:
                    continue

                # Files of a supported language were modified: check out
                # the commit in the temporary copy so they can be read
                repo.git.checkout(commit.hash)

                # Load the language plugin responsible for extracting the
                # libraries used by those files
                parser = load_language(lang)
                # Only parse libraries if we support the current language
                if parser:
                    libs_in_commit.setdefault(lang, []).extend(
                        parser.extract_libraries(lang_files))

            progress(prog, total, 'Analyzing libraries')
            if libs_in_commit:
                res[commit.hash] = libs_in_commit
    finally:
        shutil.rmtree(tmp_repo_path)

    return res
def callback_func(self, data):
    """Store one analysed commit after sanitising its file-name keys.

    Every key of ``data["stats"]`` is passed through ``sanitize_filename``;
    a key that changes is re-inserted under its sanitised form and the old
    entry is dropped.  ``data`` is then appended to ``self.results`` and the
    progress display is advanced by one step.
    """
    stats = data["stats"]
    # Filenames might contain weird characters. Iterate over a snapshot of
    # the items because the dict is modified inside the loop, which would
    # otherwise raise a RuntimeError.
    for original_key, value in list(stats.items()):
        sanitized_key = sanitize_filename(original_key)
        if sanitized_key == original_key:
            continue
        stats[sanitized_key] = value
        stats.pop(original_key, None)

    self.results.append(data)
    self.prog = self.prog + 1
    progress(self.prog, self.total, 'Analyzing commits')
def get_libraries(self):
    """Collect the libraries touched by each commit of the selected authors.

    Unless ``self.headless`` is set, the ``.git`` directory of
    ``self.basedir`` is copied to a temporary location and commits are
    checked out there; that copy is cleaned up before returning or before
    re-raising an error.  In headless mode the work happens directly in
    ``self.basedir`` and no cleanup is performed here.

    When ``self.skip`` is truthy, a commit is only analysed if its estimated
    changed-file size is below ``self.commit_size_limit`` (MB) and
    ``_should_we_check_out`` accepts its file list; files that are not
    smaller than ``self.file_size_limit`` MB are ignored.

    A commit that cannot be checked out is logged and skipped; it still
    counts towards the reported progress.

    Returns:
        dict: maps a commit hash to ``{language: [libraries]}`` built from
        the mapping returned by each language plugin's
        ``extract_libraries``.  Commits without any library are left out.

    Raises:
        Any exception (or ``KeyboardInterrupt``) raised while processing
        the commits, after the temporary copy has been cleaned up.
    """
    res = {}
    commits = _filter_commits_by_author_emails(self.commit_list,
                                               self.author_emails)
    if not commits:
        _log_info("No commmits found for the authored by selected users")
        return res

    # If we are in headless mode, we don't copy the repo to temp location
    if self.headless:
        tmp_repo_path = self.basedir
        repo = git.Repo(tmp_repo_path)
    else:
        # Before we do anything, copy the repo to a temporary location so
        # that we don't mess with the original repo
        tmp_repo_path = _get_temp_repo_path()
        _log_info(
            "Copying the repository to a temporary location, this can take a while..."
        )
        try:
            shutil.copytree("%s/.git" % self.basedir,
                            "%s/.git" % tmp_repo_path,
                            symlinks=True)
        except shutil.Error as e:
            # Best effort: individual copy failures are only logged
            module_logger.debug("Shutil error messages: {}.".format(
                str(e)))
        _log_info("Finished copying the repository to", tmp_repo_path)

        # Initialise the tmp directory as a repo and hard reset, just in case
        repo = git.Repo(tmp_repo_path)
        repo.git.clean('-fd')
        try:
            repo.git.checkout('master')
        except git.exc.GitCommandError as err:
            _log_info("Cannot checkout master on repository: ", err)
        try:
            repo.git.reset('--hard')
        except git.exc.GitCommandError as err:
            _log_info("Cannot reset repository: ", err)

    prog = 0
    total = len(commits)

    if not self.skip:
        _log_info(
            "Skipping is set to False. All commits and files will be evaluated. This may take time."
        )
    else:
        _log_info(
            "Commit size limit is {} MB and file size limit is {} MB.".
            format(self.commit_size_limit, self.file_size_limit))

    # The per-file filter does not depend on the commit or the language,
    # so build it once.  Files may have been deleted by the checkout, and
    # when skipping is enabled overly large files are ignored to speed up
    # the process.
    if self.skip:
        def filter_func(path):
            return (os.path.isfile(path) and os.stat(path).st_size <
                    self.file_size_limit * (1024**2))
    else:
        filter_func = os.path.isfile

    try:
        for commit in commits:
            start = time.time()
            module_logger.debug("Current commit hash is {}.".format(
                commit.hash))
            libs_in_commit = {}
            files = [
                os.path.join(tmp_repo_path, x.file_name)
                for x in commit.changed_files
            ]

            # If skipping is enabled we may skip certain commits: estimate
            # the summed size of the changed files and skip the commit
            # (don't check out) when it reaches the limit or when there
            # are no changed files we recognize.
            est_size = _estimate_changed_file_size(files)
            module_logger.debug(
                "Changed file list is {} MBs.".format(est_size))
            module_logger.debug("Skip is set to {}.".format(self.skip))
            should_analyze = not self.skip or (
                (est_size < self.commit_size_limit)
                and _should_we_check_out(files))
            if not should_analyze:
                module_logger.debug("Skipping commit.")
                prog += 1
                progress(prog, total, 'Analyzing libraries')
                continue

            module_logger.debug("Checking out and analyzing commit.")
            co_start = time.time()
            try:
                repo.git.checkout(commit.hash, force=True)
            except Exception as err:
                # Still best effort, but no longer silent: record why the
                # commit was dropped and keep the progress counter in sync
                # so the display can reach ``total``.
                module_logger.debug(
                    "Cannot check out commit {}: {}".format(
                        commit.hash, err))
                prog += 1
                progress(prog, total, 'Analyzing libraries')
                continue
            co_end = time.time()
            module_logger.debug(
                "Checking out took {0:.6f} seconds.".format(co_end -
                                                            co_start))

            for lang_root, extensions in supported_languages.items():
                # Keep only the changed files whose extension belongs to
                # the current language
                lang_files = [
                    path for path in files
                    if pathlib.Path(path).suffix[1:].lower() in extensions
                ]
                if not lang_files:
                    continue

                module_logger.debug(
                    "Current language is {}, and extensions are{}".
                    format(lang_root, extensions))
                lang_files_filtered = list(filter(filter_func, lang_files))
                total_size = sum(
                    os.stat(f).st_size for f in lang_files_filtered)
                module_logger.debug(
                    "The number of files in lang_files_filtered"
                    " is {0}, the total size is {1:.2f} MB".format(
                        len(lang_files_filtered), total_size / (1024**2)))

                # Load the language plugin responsible for extracting the
                # libraries used by those files
                parser = load_language(lang_root)
                # Only parse libraries if we support the current language
                if parser:
                    mapped_libs = parser.extract_libraries(
                        lang_files_filtered).items()
                    for lang, libraries in mapped_libs:
                        if not libraries:
                            continue
                        libs_in_commit.setdefault(lang,
                                                  []).extend(libraries)

            prog += 1
            end = time.time()
            module_logger.debug(
                "Time spent processing commit {0} was {1:.4f} seconds.".
                format(commit.hash, end - start))
            progress(prog, total, 'Analyzing libraries')
            if libs_in_commit:
                res[commit.hash] = libs_in_commit
    except (Exception, KeyboardInterrupt):
        # Make sure to clean up the tmp folder before dying.  In headless
        # mode it is not necessary to clean up, the repo will be deleted
        # later.
        if not self.headless:
            _cleanup(tmp_repo_path)
        raise

    if not self.headless:
        _cleanup(tmp_repo_path)
    return res
def get_libraries(self):
    """Collect the libraries touched by each commit of the selected authors.

    The repository at ``self.basedir`` is copied to a temporary location and
    commits are force-checked-out there, so the original working tree is
    left alone.  The temporary copy is cleaned up both after a successful
    run and before an error raised during commit processing is re-raised.

    Returns:
        dict: maps a commit hash to ``{language: [items]}``, where the items
        are whatever the language plugin's ``extract_libraries`` yielded for
        the files changed in that commit.  Commits for which nothing was
        collected are left out; an empty dict is returned when no commit
        matches the selected author emails.
    """
    res = {}
    commits = _filter_commits_by_author_emails(self.commit_list,
                                               self.author_emails)
    if not commits:
        _log_info("No commmits found for the authored by selected users")
        return res

    # Work on a copy so the original repository is never modified
    tmp_repo_path = _get_temp_repo_path()
    _log_info(
        "Copying the repository to a temporary location, this can take a while..."
    )
    shutil.copytree(self.basedir, tmp_repo_path, symlinks=True)
    _log_info("Finished copying the repository to", tmp_repo_path)

    # Open the copy as a repo and bring it to a clean state, just in case
    repo = git.Repo(tmp_repo_path)
    repo.git.clean('-fd')
    try:
        repo.git.checkout('master')
    except git.exc.GitCommandError as err:
        _log_info("Cannot checkout master on repository: ", err)
    repo.git.reset('--hard')

    total = len(commits)
    try:
        for prog, commit in enumerate(commits, start=1):
            libs_in_commit = {}
            files = [
                os.path.join(tmp_repo_path, changed.file_name)
                for changed in commit.changed_files
            ]
            for lang, extensions in supported_languages.items():
                # Narrow the changed files down to this language's extensions
                lang_files = [
                    path for path in files
                    if pathlib.Path(path).suffix[1:].lower() in extensions
                ]
                if not lang_files:
                    continue

                # Files of a supported language were modified: check out
                # the commit in the temporary copy so they can be read
                repo.git.checkout(commit.hash, force=True)

                # The language plugin does the actual library extraction;
                # languages without a plugin are ignored
                parser = load_language(lang)
                if parser:
                    libs_in_commit.setdefault(lang, []).extend(
                        parser.extract_libraries(lang_files))

            progress(prog, total, 'Analyzing libraries')
            if libs_in_commit:
                res[commit.hash] = libs_in_commit
    except (Exception, KeyboardInterrupt) as err:
        # make sure to clean up the tmp folder before dying
        _cleanup(tmp_repo_path)
        raise err
    else:
        _cleanup(tmp_repo_path)

    return res
def callback_func(self, data):
    """Store one analysed commit and refresh the progress display.

    Appends ``data`` to ``self.results``, advances ``self.prog`` by one and
    reports it against ``self.total``.
    """
    self.results.append(data)
    self.prog = self.prog + 1
    progress(self.prog, self.total, 'Analyzing commits')