def _CreateTestRepo(root_dir: pathlib.Path, owner: str, name: str) -> None:
  """Create an empty repo for testing indexers.

  Args:
    root_dir: The directory to create the repo clone dir and metafile in.
    owner: The repo owner name.
    name: The repo name.
  """
  repo_name = f'{owner}_{name}'
  repo_root = root_dir / repo_name
  # A fake clone: just the .git marker directory and an empty source tree.
  (repo_root / '.git').mkdir(parents=True)
  (repo_root / 'src').mkdir(parents=True)
  # Indexers discover repos through the <owner>_<name>.pbtxt metafile.
  meta = scrape_repos_pb2.GitHubRepoMetadata(owner=owner, name=name)
  pbutil.ToFile(meta, root_dir / f'{repo_name}.pbtxt')
def CloneFromMetafile(metafile: pathlib.Path) -> None:
  """Clone the repository described by a metafile proto.

  Clones into a '<owner>_<name>' directory next to the metafile. If a clone
  already exists (a .git directory is present), this is a no-op. A clone with
  submodules is attempted first; if that fails on a submodule error, a plain
  clone is retried. On failure, partial clones are deleted.

  Args:
    metafile: Path to a GitHubRepoMetadata proto file.
  """
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  # BUG FIX: the original `not meta.owner and meta.name` parsed as
  # `(not meta.owner) and meta.name`, so a metafile missing only the name
  # field slipped through. Require both fields.
  if not (meta.owner and meta.name):
    logging.error('Metafile missing owner and name fields %s', metafile)
    return
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  logging.debug('%s', meta)
  if (clone_dir / '.git').is_dir():
    return  # Already cloned.
  # Remove anything left over from a previous attempt.
  subprocess.check_call(['rm', '-rf', str(clone_dir)])
  # Bound the clone time using timeout(1).
  cmd = ['timeout', f'{FLAGS.repository_clone_timeout_minutes}m',
         '/usr/bin/git', 'clone', meta.clone_from_url, str(clone_dir)]
  logging.debug('$ %s', ' '.join(cmd))
  # Try to checkout the repository and submodules.
  p = subprocess.Popen(cmd + ['--recursive'], stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE, universal_newlines=True)
  _, stderr = p.communicate()
  if p.returncode and 'submodule' in stderr:
    # Remove anything left over from a previous attempt.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
    # Try again, but this time without cloning submodules.
    p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                         universal_newlines=True)
    _, stderr = p.communicate()
  if p.returncode:
    # Give up.
    logging.warning('\nClone failed %s:\n%s', meta.clone_from_url, stderr)
    # Remove anything left over.
    subprocess.check_call(['rm', '-rf', str(clone_dir)])
def test_ImportFromLanguage_Java_repo( test_db: contentfiles.ContentFiles, tempdir: pathlib.Path ): """An end-to-end test of a Java importer.""" (tempdir / "Owner_Name" / ".git").mkdir(parents=True) (tempdir / "Owner_Name" / "src").mkdir(parents=True) # A repo will only be imported if there is a repo meta file. pbutil.ToFile( scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name"), tempdir / "Owner_Name.pbtxt", ) # Create some files in our test repo. with open(tempdir / "Owner_Name" / "src" / "A.java", "w") as f: f.write( """ public class A { public static void helloWorld() { System.out.println("Hello, world!"); } } """ ) with open(tempdir / "Owner_Name" / "src" / "B.java", "w") as f: f.write( """ public class B { private static int foo() {return 5;} } """ ) with open(tempdir / "Owner_Name" / "README.txt", "w") as f: f.write("Hello, world!") language = scrape_repos_pb2.LanguageToClone( language="foolang", query=[], destination_directory=str(tempdir), importer=[ scrape_repos_pb2.ContentFilesImporterConfig( source_code_pattern=".*\\.java", preprocessor=[ "datasets.github.scrape_repos.preprocessors." "extractors:JavaMethods" ], ), ], ) importer.ImportFromLanguage(test_db, language) with test_db.Session() as session: query = session.query(contentfiles.ContentFile) assert query.count() == 2 assert set([cf.text for cf in query]) == { ( "public static void helloWorld(){\n" ' System.out.println("Hello, world!");\n}\n' ), "private static int foo(){\n return 5;\n}\n", }
def ShouldImportRepo(session: orm.session.Session,
                     metafile: pathlib.Path) -> bool:
  """Determine if the repository described by a metafile should be imported.

  A repository should be imported iff:
    * The metafile is a valid GitHubRepoMetadata proto.
    * The clone directory specified in the metafile appears to be a github
      repo.
    * The repo does not exist in the contentfiles database.
  """
  # Guard: the metafile must exist and parse as a GitHubRepoMetadata proto.
  if not metafile.is_file():
    return False
  if not pbutil.ProtoIsReadable(metafile,
                                scrape_repos_pb2.GitHubRepoMetadata()):
    return False
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  # Guard: the clone directory must look like a git checkout.
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  if not (clone_dir / '.git').is_dir():
    return False
  # Import only repos we have not already ingested.
  return not contentfiles.GitHubRepository.IsInDatabase(session, meta)
def GetRepositoryMetadata(
    repo: Repository.Repository) -> scrape_repos_pb2.GitHubRepoMetadata:
  """Get metadata about a GitHub repository.

  Args:
    repo: A Repository instance.

  Returns:
    A GitHubRepoMetadata instance.
  """
  # BUG FIX: the return annotation previously instantiated the proto
  # (`GitHubRepoMetadata()`), constructing a throwaway message at function
  # definition time; annotations should name the type, not an instance.
  meta = scrape_repos_pb2.GitHubRepoMetadata()
  meta.scraped_utc_epoch_ms = labdate.MillisecondsTimestamp(
      labdate.GetUtcMillisecondsNow())
  meta.owner = repo.owner.login
  meta.name = repo.name
  meta.num_watchers = repo.watchers_count
  meta.num_forks = repo.forks_count
  meta.num_stars = repo.stargazers_count
  meta.clone_from_url = repo.clone_url
  return meta
def _CreateTestRepo(
    root_dir: pathlib.Path, owner: str, name: str
) -> github_repo.GitHubRepo:
    """Create an empty repo for testing indexers.

    Args:
        root_dir: The directory to create the repo under.
        owner: The repo owner name.
        name: The repo name.

    Returns:
        A GitHubRepo wrapping the newly-written metafile.
    """
    # Fake clone layout: '<owner>_<name>/.git' marks a checkout, 'src' holds
    # (initially zero) source files.
    clone_dir = root_dir / f"{owner}_{name}"
    for subdir in (".git", "src"):
        (clone_dir / subdir).mkdir(parents=True)
    # The metafile lives alongside the clone directory.
    metafile = root_dir / f"{owner}_{name}.pbtxt"
    pbutil.ToFile(
        scrape_repos_pb2.GitHubRepoMetadata(owner=owner, name=name), metafile
    )
    return github_repo.GitHubRepo(metafile)
def test_ImportFromLanguage_Java_repo(tempdir: pathlib.Path): """An end-to-end test of a Java importer.""" (tempdir / 'src').mkdir() (tempdir / 'src' / 'Owner_Name' / '.git').mkdir(parents=True) (tempdir / 'src' / 'Owner_Name' / 'src').mkdir(parents=True) # A repo will only be imported if there is a repo meta file. pbutil.ToFile(scrape_repos_pb2.GitHubRepoMetadata( owner='Owner', name='Name'), tempdir / 'src' / 'Owner_Name.pbtxt') # Create some files in our test repo. with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'A.java', 'w') as f: f.write(""" public class A { public static void helloWorld() { System.out.println("Hello, world!"); } } """) with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'B.java', 'w') as f: f.write(""" public class B { private static int foo() {return 5;} } """) with open(tempdir / 'src' / 'Owner_Name' / 'README.txt', 'w') as f: f.write('Hello, world!') language = scrape_repos_pb2.LanguageToClone( language='foolang', query=[], destination_directory=str(tempdir / 'src'), importer=[ scrape_repos_pb2.ContentFilesImporterConfig( source_code_pattern='.*\\.java', preprocessor=["datasets.github.scrape_repos.preprocessors." "extractors:JavaMethods"]), ] ) indexer.ImportFromLanguage(language, multiprocessing.Pool(1)) test_repo = github_repo.GitHubRepo(tempdir / 'src' / 'Owner_Name.pbtxt') assert (test_repo.index_dir / 'DONE.txt').is_file() assert len(list(test_repo.index_dir.iterdir())) == 3 contentfiles = list(test_repo.ContentFiles()) assert len(contentfiles) == 2 assert set([cf.text for cf in contentfiles]) == { ('public static void helloWorld(){\n' ' System.out.println("Hello, world!");\n}\n'), 'private static int foo(){\n return 5;\n}\n', }
def ImportRepo(session: orm.session.Session,
               language: scrape_repos_pb2.LanguageToClone,
               metafile: pathlib.Path,
               pool: multiprocessing.Pool) -> None:
  """Import contentfiles from repository.

  Args:
    session: A database session to import to.
    language: The language specification for the repo.
    metafile: The repo metafile.
    pool: A multiprocessing pool.
  """
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  # The clone lives in a '<owner>_<name>' directory beside the metafile.
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  repo = contentfiles.GitHubRepository.GetOrAdd(session, meta)
  repo.language = language.language
  for importer in language.importer:
    if not importer.source_code_pattern:
      # NOTE: this returns out of the function, abandoning any remaining
      # importers, not just skipping the misconfigured one.
      logging.error('No source_code_pattern specified! Stopping now.')
      return
    # Prefix the user's regex with the clone directory path so it matches
    # absolute paths produced by find(1). A leading '^' anchor is stripped
    # since the prefix now provides the anchor.
    pat = importer.source_code_pattern
    pat = f'{clone_dir}/{pat[1:]}' if pat[
        0] == '^' else f'{clone_dir}/{pat}'
    cmd = ['find',
           str(clone_dir), '-type', 'f', '-regex', pat, '-not', '-path',
           '*/.git/*']
    logging.debug('$ %s', ' '.join(cmd))
    paths = subprocess.check_output(
        cmd, universal_newlines=True).rstrip().split('\n')
    # find(1) with no matches yields a single empty string after split().
    if len(paths) == 1 and not paths[0]:
      logging.debug('No files to import from %s', clone_dir)
      return
    logging.info("Importing %s '%s' files from %s ...",
                 humanize.intcomma(len(paths)),
                 importer.source_code_pattern, clone_dir)
    all_files_relpaths = public.GetAllFilesRelativePaths(clone_dir)
    # One ImportWorker job per matched file; jobs are farmed out to the pool
    # and their outputs accumulated into the database session.
    jobs = [
        scrape_repos_pb2.ImportWorker(
            clone_from_url=meta.clone_from_url,
            clone_dir=str(clone_dir),
            abspath=p,
            all_files_relpaths=all_files_relpaths,
            preprocessors=importer.preprocessor,
        ) for p in paths
    ]
    bar = progressbar.ProgressBar(max_value=len(jobs))
    for outputs in bar(pool.imap_unordered(ImportWorker, jobs)):
      for output in outputs:
        session.add(output)
def ProcessRepo(self, repo: Repository.Repository) -> None:
    """Make metafile for a single repo."""
    meta_path = self.GetRepoMetaPath(repo)
    # Skip repos that already have a readable metafile on disk.
    if pbutil.ProtoIsReadable(
        meta_path, scrape_repos_pb2.GitHubRepoMetadata()
    ):
        return
    meta = GetRepositoryMetadata(repo)
    app.Log(2, "%s", meta)
    # Ignore URLs in the blacklist.
    if meta.clone_from_url.lower() in self.language.clone_from_url_blacklist:
        return
    pbutil.ToFile(meta, meta_path)
def MakeRepositoryMetas(self,
                        repos: typing.List[Repository.Repository]) -> None:
  """Make meta files for a list of repositories.

  Args:
    repos: A list of GitHub Repository instances.
  """
  logging.debug('Scraping %s repositories', humanize.intcomma(len(repos)))
  for repo in repos:
    self.i += 1
    # The metafile sits next to the would-be clone dir: '<owner>_<name>.pbtxt'.
    clone_dir = self.destination_directory / '_'.join(
        [repo.owner.login, repo.name])
    meta_path = pathlib.Path(f'{clone_dir}.pbtxt')
    # Skip repos that have already been scraped to a readable metafile.
    if pbutil.ProtoIsReadable(meta_path,
                              scrape_repos_pb2.GitHubRepoMetadata()):
      continue
    meta = GetRepositoryMetadata(repo)
    logging.debug('%s', meta)
    pbutil.ToFile(meta, meta_path)
def __init__(self, metafile: pathlib.Path):
  """Instantiate a github repo.

  Args:
    metafile: The path to the github meta file proto.

  Raises:
    ValueError: In case the metafile cannot be read.
  """
  self.metafile: pathlib.Path = metafile
  try:
    self.meta: scrape_repos_pb2.GitHubRepoMetadata = pbutil.FromFile(
        metafile, scrape_repos_pb2.GitHubRepoMetadata())
  except pbutil.DecodeError as e:
    # IMPROVED: chain the original decode error (`from e`) so the traceback
    # preserves the root cause instead of reporting it only as text.
    raise ValueError(f"Failed to read metafile '{self.metafile}' {e}") from e
  # Derived paths: the clone dir '<owner>_<name>' beside the metafile, and
  # the index dir in a parallel '<parent>.index/<name>' tree.
  self.name: str = f'{self.meta.owner}_{self.meta.name}'
  self.clone_dir: pathlib.Path = metafile.parent / self.name
  self.index_dir = (
      pathlib.Path(str(metafile.parent) + '.index') / self.name)
def CloneFromMetafile(metafile: pathlib.Path) -> None:
    """Clone the repository described by a metafile proto.

    Clones into the directory named by GetCloneDir(). If a .git directory
    already exists this is a no-op. A shallow recursive clone is attempted
    first; on failure a non-recursive clone is retried; on total failure the
    partial clone is removed.

    Args:
        metafile: Path to a GitHubRepoMetadata proto file.
    """
    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    clone_dir = GetCloneDir(metafile)
    if not clone_dir:
        app.Error("Failed to determine clone directory")
        # BUG FIX: previously fell through with clone_dir == None and crashed
        # on `clone_dir / ".git"` below. Bail out instead.
        return
    app.Log(2, "%s", meta)
    if (clone_dir / ".git").is_dir():
        return
    # Remove anything left over from a previous attempt.
    subprocess.check_call(["rm", "-rf", str(clone_dir)])
    # Try to checkout the repository and submodules.
    try:
        git_clone.GitClone(
            meta.clone_from_url,
            clone_dir,
            shallow=True,
            recursive=True,
            timeout=FLAGS.repository_clone_timeout_minutes * 60,
        )
    except git_clone.RepoCloneFailed:
        # Remove anything left over from a previous attempt.
        subprocess.check_call(["rm", "-rf", str(clone_dir)])
        # Try again, but this time without cloning submodules.
        try:
            git_clone.GitClone(
                meta.clone_from_url,
                clone_dir,
                shallow=True,
                recursive=False,
                timeout=FLAGS.repository_clone_timeout_minutes * 60,
            )
        except git_clone.RepoCloneFailed:
            # Give up.
            # BUG FIX: the format string had two %s placeholders but only one
            # argument, which raises at logging time.
            app.Warning("\nClone failed %s", meta.clone_from_url)
            # Remove anything left over.
            subprocess.check_call(["rm", "-rf", str(clone_dir)])
def IsRepoMetaFile(f: str):
  """Determine if a path is a GitHubRepoMetadata message."""
  # Must be an existing regular file that parses as the metadata proto.
  if not fs.isfile(f):
    return False
  return pbutil.ProtoIsReadable(f, scrape_repos_pb2.GitHubRepoMetadata())
def GetCloneDir(metafile: pathlib.Path) -> Optional[pathlib.Path]:
    """Determine the clone directory for a repo metafile.

    Args:
        metafile: Path to a GitHubRepoMetadata proto file.

    Returns:
        The '<owner>_<name>' directory beside the metafile, or None if the
        metafile is missing the owner or name field.
    """
    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    # BUG FIX: the original `not meta.owner and meta.name` parsed as
    # `(not meta.owner) and meta.name`, so a metafile missing only the name
    # field was not rejected. Require both fields.
    if not (meta.owner and meta.name):
        app.Error("Metafile missing owner and name fields %s", metafile)
        return None
    return metafile.parent / f"{meta.owner}_{meta.name}"