def _identify_gh_repo(homepage):
    """Returns code repository dict filled with homepage, if homepage is GH repo (None otherwise) """
    # Treat the homepage as a git repository only when it parses as GitHub.
    repo_found = parse_gh_repo(homepage)
    return {'url': homepage, 'type': 'git'} if repo_found else None
def get_files_github_url(self, github_url):
    """Clone the repository from GitHub and retrieve manifest files from it.

    :param github_url: URL of the GitHub repository to read manifests from
    :return: list of dicts with 'filename', 'content' and 'filepath' keys
    :raises HTTPError: (500) when the repository cannot be cloned or read
    """
    manifest_data = []
    repo_suffix = parse_gh_repo(github_url)
    try:
        self.del_temp_files()
        repo_url = urljoin(self.PREFIX_URL, repo_suffix)
        check_valid_repo = get(repo_url)
        if check_valid_repo.status_code == 200:
            # BUG FIX: urljoin() accepts only two URL arguments; the previous
            # three-argument call passed '.git' as the boolean
            # 'allow_fragments' flag, so the clone URL never got the '.git'
            # suffix.  Append it to the repo path instead.
            repo_clone_url = urljoin(self.PREFIX_GIT_URL, repo_suffix + '.git')
            Git.clone(repo_clone_url, self.CLONED_DIR)
            for file_obj in self.get_manifest_files():
                filename = file_obj.get('filename')
                filepath = file_obj.get('filepath')
                with open(filepath, 'rb') as m_file:
                    file_content = m_file.read().decode('utf-8')
                manifest_data.append({
                    "filename": filename,
                    "content": file_content,
                    # store the path relative to the clone directory
                    "filepath": filepath.replace(self.CLONED_DIR, '')
                })
    except Exception as exc:
        # chain the original exception so the root cause is not lost
        raise HTTPError(500, "Error in reading repo from github.") from exc
    finally:
        # always clean up the cloned repository, on success and on failure
        self.del_temp_files()
    return manifest_data
def _get_repo_name(self, url):
    """Retrieve GitHub repo from a preceding Mercator scan."""
    repo_name = parse_gh_repo(url)
    if repo_name:
        # remember the canonical repository URL for later use
        self._repo_url = 'https://github.com/' + repo_name
    else:
        self.log.debug('Could not parse Github repo URL %s', url)
    return repo_name
def _get_repo_name(self, url):
    """Get GitHub repo URL."""
    repo_name = parse_gh_repo(url)
    if repo_name:
        # cache the canonical repository URL on the instance
        self._repo_url = 'https://github.com/' + repo_name
    else:
        logger.debug('Could not parse Github repo URL %s', url)
    return repo_name
def isGhRepo(node_args, key):
    """Predicate if the repository is on GitHub.

    :param node_args: (possibly nested) mapping holding the repository URL
    :param key: key, or list of keys, used to drill down into node_args
    :return: True when the value found under key parses as a GitHub repo
    """
    try:
        # normalize a single key into a one-element path and walk it
        path = key if isinstance(key, list) else [key]
        val = reduce(lambda m, k: m[k], path, node_args)
        # idiomatic truth test instead of if/else returning True/False
        return bool(parse_gh_repo(val))
    except Exception:
        # best-effort: a missing key or unusable value means "not GitHub"
        return False
def isGhRepo(node_args, key):
    """Return True when the value under key in node_args is a GitHub repo.

    :param node_args: (possibly nested) mapping holding the repository URL
    :param key: key, or list of keys, used to drill down into node_args
    """
    try:
        # drill into nested dicts along the key path
        keys = key if isinstance(key, list) else [key]
        val = reduce(lambda m, k: m[k], keys, node_args)
        # idiomatic truth test instead of if/else returning True/False
        return bool(parse_gh_repo(val))
    except Exception:
        # best-effort: any lookup/parse failure means "not GitHub"
        return False
def _handle_java(self, data):
    """Handle data from pom.xml.

    :param data: Mercator output containing a parsed 'pom.xml' entry
    :return: normalized dict, or None when no pom.xml data is present
    """
    # we expect pom.xml to be there, since it's always downloaded to top level by InitTask
    pom = data.get('pom.xml')
    if pom is None:
        return None
    key_map = (('name', ), ('version', ), ('description', ),
               ('url', 'homepage'),
               ('licenses', 'declared_licenses'))
    # handle licenses
    transformed = self.transform_keys(pom, key_map)
    if transformed['name'] is None:
        transformed['name'] = "{}:{}".format(pom.get('groupId'),
                                             pom.get('artifactId'))
    # dependencies with scope 'compile' and 'runtime' are needed at runtime;
    # dependencies with scope 'provided' are not necessarily runtime dependencies,
    # but they are commonly used for example in web applications
    deps = pom.get('dependencies', {})
    # BUG FIX: copy the 'compile' dict before calling update() -- the
    # previous code mutated the caller's pom data in place
    dependencies_dict = dict(deps.get('compile', {}))
    dependencies_dict.update(deps.get('runtime', {}))
    dependencies_dict.update(deps.get('provided', {}))
    # dependencies with scope 'test' are only needed for testing;
    dev_dependencies_dict = deps.get('test', {})
    # strip any trailing ':' from the key, then join key and version
    transformed['dependencies'] = [k.rstrip(':') + ' ' + v
                                   for k, v in dependencies_dict.items()]
    transformed['devel_dependencies'] = [k.rstrip(':') + ' ' + v
                                         for k, v in dev_dependencies_dict.items()]
    # handle code_repository
    if 'scm_url' in pom:
        # TODO: there's no way we can tell 100 % what the type is, but we could
        # try to handle at least some cases, e.g. github will always be git etc
        repo_type = 'git' if parse_gh_repo(pom['scm_url']) else 'unknown'
        transformed['code_repository'] = {'url': pom['scm_url'],
                                          'type': repo_type}
    return transformed
def normalize(self):
    """Normalize output from Mercator for pom.xml (Maven).

    :return: normalized data dict ({} when there is no raw data)
    """
    if not self._raw_data:
        return {}
    if self._data['name'] is None:
        self._data['name'] = "{}:{}".format(
            self._raw_data.get('groupId'), self._raw_data.get('artifactId'))
    # dependencies with scope 'compile' and 'runtime' are needed at runtime;
    # dependencies with scope 'provided' are not necessarily runtime dependencies,
    # but they are commonly used for example in web applications
    raw_deps = self._raw_data.get('dependencies', {})
    # BUG FIX: copy the 'compile' dict before calling update() -- the
    # previous code mutated self._raw_data in place
    dependencies_dict = dict(raw_deps.get('compile', {}))
    dependencies_dict.update(raw_deps.get('runtime', {}))
    dependencies_dict.update(raw_deps.get('provided', {}))
    # dependencies with scope 'test' are only needed for testing;
    dev_dependencies_dict = raw_deps.get('test', {})
    # strip any trailing ':' from the key, then join key and version
    self._data['dependencies'] = [
        k.rstrip(':') + ' ' + v for k, v in dependencies_dict.items()
    ]
    self._data['devel_dependencies'] = [
        k.rstrip(':') + ' ' + v for k, v in dev_dependencies_dict.items()
    ]
    # handle code_repository
    if 'scm_url' in self._raw_data:
        # TODO: there's no way we can tell 100 % what the type is, but we could
        # try to handle at least some cases, e.g. github will always be git etc
        repo_type = 'git' if parse_gh_repo(
            self._raw_data['scm_url']) else 'unknown'
        self._data['code_repository'] = {
            'url': self._raw_data['scm_url'],
            'type': repo_type
        }
    return self._data
def _get_github_readme(self, url):
    """Try to fetch a README for the given GitHub repo, per known type/extension."""
    repo_tuple = parse_gh_repo(url)
    if not repo_tuple:
        return None
    project, repo = repo_tuple.split('/')
    for readme_type, extensions in self.README_TYPES.items():
        for ext in extensions:
            # an empty extension means "no suffix at all"
            suffix = '.' + ext if ext else ext
            readme_url = self._GITHUB_README_PATH.format(project=project,
                                                         repo=repo,
                                                         extension=suffix)
            response = requests.get(readme_url)
            if response.status_code == 200:
                self.log.debug('README%s found for type "%s" at "%s"',
                               suffix, readme_type, readme_url)
                return {'type': readme_type, 'content': response.text}
            self.log.debug('No README%s found for type "%s" at "%s"',
                           suffix, readme_type, readme_url)
def _identify_gh_repo(homepage):
    """Return code repository dict filled with homepage."""
    # guard clause: not a GitHub repo -> nothing to report
    if not parse_gh_repo(homepage):
        return None
    return {'url': homepage, 'type': 'git'}
def get_manifest_details(self, github_url):
    """Retrieve manifest files from cloned repository.

    :param github_url: URL of the GitHub repository to inspect
    :return: list of dicts with 'filename', 'download_url' and 'filepath'
             keys, or None when the URL cannot be parsed or a GitHub API
             call fails
    """
    manifest_data = []
    supported_manifests = {
        'requirements.txt': True,
        'pom.xml': True,
        'package.json': True
    }
    repo_tuple = parse_gh_repo(github_url)
    if repo_tuple:
        project, repo = repo_tuple.split('/')
    else:
        return None
    last_commit_url = 'https://api.github.com/repos/{project}/{repo}/git/refs/heads/' \
                      'master'.format(project=project, repo=repo)
    trees_url = 'https://api.github.com/repos/{project}/{repo}/git/trees/{sha}?recursive=1'
    # BUG FIX: the template previously ended in a literal placeholder text,
    # so the 'filename' keyword passed to format() below was silently
    # ignored and every download_url was broken.
    raw_content_path = 'https://raw.githubusercontent.com/{project}/{repo}/master/{filename}'
    # Fetch the latest commit of the repo
    try:
        resp = requests.get(last_commit_url)
    except exceptions.RequestException as e:
        print(e)
        return None
    last_commit = ''
    if resp.status_code == 200:
        try:
            last_commit = resp.json()['object']['sha']
        except KeyError as e:
            print(e)
            return None
    # Fetch the contents tree using the last commit sha
    try:
        resp = requests.get(trees_url.format(project=project, repo=repo,
                                             sha=last_commit))
    except exceptions.RequestException as e:
        print(e)
        return None
    if resp.status_code == 200:
        try:
            tree = resp.json()['tree']
        except KeyError as e:
            print(e)
            return None
        for t in tree:
            try:
                # only collect files whose basename is a supported manifest
                if supported_manifests[os.path.basename(t['path'])]:
                    manifest_data.append({
                        'filename': os.path.basename(t['path']),
                        'download_url': raw_content_path.format(
                            project=project, repo=repo, filename=t['path']),
                        'filepath': os.path.dirname(t['path'])
                    })
            except KeyError as e:
                # unsupported file name (or missing 'path') -- skip it
                print(e)
                continue
    print(manifest_data)
    return manifest_data
def test_parse_gh_repo_nok(self, url):
    """Test parse_gh_repo()."""
    parsed = parse_gh_repo(url)
    assert parsed is None
def test_parse_gh_repo_ok(self, url):
    """Test parse_gh_repo()."""
    parsed = parse_gh_repo(url)
    assert parsed == 'foo/bar'
def test_parse_gh_repo_nok(self, url):
    """Check that parse_gh_repo() rejects a non-GitHub URL."""
    result = parse_gh_repo(url)
    assert result is None
def test_parse_gh_repo_ok(self, url):
    """Check that parse_gh_repo() extracts the 'owner/repo' suffix."""
    result = parse_gh_repo(url)
    assert result == 'foo/bar'