def ImportRepo(session: orm.session.Session,
               language: scrape_repos_pb2.LanguageToClone,
               metafile: pathlib.Path, pool: multiprocessing.Pool) -> None:
  """Import contentfiles from repository.

  Args:
    session: A database session to import to.
    language: The language specification for the repo.
    metafile: The repo metafile.
    pool: A multiprocessing pool.
  """
  meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
  clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
  repo = contentfiles.GitHubRepository.GetOrAdd(session, meta)
  repo.language = language.language

  for importer in language.importer:
    if not importer.source_code_pattern:
      logging.error('No source_code_pattern specified! Stopping now.')
      return

    pat = importer.source_code_pattern
    pat = f'{clone_dir}/{pat[1:]}' if pat[0] == '^' else f'{clone_dir}/{pat}'
    cmd = [
        'find', str(clone_dir), '-type', 'f', '-regex', pat, '-not', '-path',
        '*/.git/*'
    ]
    logging.debug('$ %s', ' '.join(cmd))
    paths = subprocess.check_output(
        cmd, universal_newlines=True).rstrip().split('\n')
    if len(paths) == 1 and not paths[0]:
      logging.debug('No files to import from %s', clone_dir)
      return
    logging.info("Importing %s '%s' files from %s ...",
                 humanize.intcomma(len(paths)), importer.source_code_pattern,
                 clone_dir)
    all_files_relpaths = public.GetAllFilesRelativePaths(clone_dir)
    jobs = [
        scrape_repos_pb2.ImportWorker(
            clone_from_url=meta.clone_from_url,
            clone_dir=str(clone_dir),
            abspath=p,
            all_files_relpaths=all_files_relpaths,
            preprocessors=importer.preprocessor,
        ) for p in paths
    ]
    bar = progressbar.ProgressBar(max_value=len(jobs))
    for outputs in bar(pool.imap_unordered(ImportWorker, jobs)):
      for output in outputs:
        session.add(output)
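# A minimal, hypothetical driver for ImportRepo above. The names `db` and
# `clone_list_dir` are illustrative, not part of this module: `db` is assumed
# to expose a Session() context manager over the contentfiles database, and
# `clone_list_dir` is assumed to hold one GitHubRepoMetadata `.pbtxt` metafile
# per cloned repo. This is a sketch, not the project's actual entry point.
def ImportFromLanguage(db, language: scrape_repos_pb2.LanguageToClone,
                       clone_list_dir: pathlib.Path,
                       pool: multiprocessing.Pool) -> None:
  """Sketch: import contentfiles for every cloned repo of one language."""
  for metafile in sorted(clone_list_dir.glob('*.pbtxt')):
    # One session per repo so that a failed import does not affect
    # previously imported repositories.
    with db.Session(commit=True) as session:
      ImportRepo(session, language, metafile, pool)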
def _IndexPattern(self, indexer: scrape_repos_pb2.ContentFilesImporterConfig,
                  pool: multiprocessing.Pool,
                  i: IndexProgress) -> 'GitHubRepo':
  """Index the repo."""
  pattern = indexer.source_code_pattern
  pattern = (f'{self.clone_dir}/{pattern[1:]}'
             if pattern[0] == '^' else f'{self.clone_dir}/{pattern}')
  cmd = [
      'find', str(self.clone_dir), '-type', 'f', '-regex', pattern, '-not',
      '-path', '*/.git/*'
  ]
  logging.debug('$ %s', ' '.join(cmd))
  paths = subprocess.check_output(
      cmd, universal_newlines=True).rstrip().split('\n')
  if len(paths) == 1 and not paths[0]:
    logging.debug('No files to import from %s', self.clone_dir)
    return self
  if i:
    logging.info("[%s / %s] Importing %s files from %s ...", i.i, i.n,
                 humanize.intcomma(len(paths)), self.name)
  else:
    logging.info("Importing %s files from %s ...",
                 humanize.intcomma(len(paths)), self.name)
  all_files_relpaths = public.GetAllFilesRelativePaths(self.clone_dir)
  jobs = (scrape_repos_pb2.ImportWorker(
      clone_from_url=self.meta.clone_from_url,
      clone_dir=str(self.clone_dir),
      abspath=p,
      all_files_relpaths=all_files_relpaths,
      preprocessors=indexer.preprocessor,
      index_dir=str(self.index_dir),
  ) for p in paths)
  progress_bar = progressbar.ProgressBar(max_value=len(paths))
  for _ in progress_bar(pool.imap_unordered(IndexContentFiles, jobs)):
    pass
  return self
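# IndexProgress is consumed above (fields `i.i` and `i.n`) but is not defined
# in this excerpt. A minimal sketch compatible with that usage could be:
import typing


class IndexProgress(typing.NamedTuple):
  """Sketch of a progress counter passed to _IndexPattern (assumed fields)."""
  i: int  # Index of the repo currently being indexed.
  n: int  # Total number of repos to index.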
"stdlib.h", "stdnoreturn.h", "string.h", "tgmath.h", "threads.h", "time.h", "uchar.h", "wchar.h", "wctype.h", } # The set of headers in the C++ standard library. _UNAME = "mac" if sys.platform == "darwin" else "linux" CXX_HEADERS = set( public.GetAllFilesRelativePaths( bazelutil.DataPath(f"libcxx_{_UNAME}/include/c++/v1"), follow_symlinks=True) + public.GetAllFilesRelativePaths( bazelutil.DataPath(f"libcxx_{_UNAME}/lib/clang/6.0.0/include"), follow_symlinks=True, )) @public.dataset_preprocessor def CxxHeaders( import_root: pathlib.Path, file_relpath: str, text: str, all_file_relpaths: typing.List[str], ) -> typing.List[str]: """Inline C++ includes.
def _IndexPattern(
    self,
    indexer: scrape_repos_pb2.ContentFilesImporterConfig,
    pool: multiprocessing.Pool,
    i: IndexProgress,
) -> "GitHubRepo":
  """Index the repo."""
  pattern = indexer.source_code_pattern
  pattern = (
      f"{self.clone_dir}/{pattern[1:]}"
      if pattern[0] == "^"
      else f"{self.clone_dir}/{pattern}"
  )
  cmd = [
      "find",
      str(self.clone_dir),
      "-type",
      "f",
      "-regex",
      pattern,
      "-not",
      "-path",
      "*/.git/*",
  ]
  app.Log(2, "$ %s", " ".join(cmd))
  paths = (
      subprocess.check_output(cmd, universal_newlines=True).rstrip().split("\n")
  )
  if len(paths) == 1 and not paths[0]:
    app.Log(2, "No files to import from %s", self.clone_dir)
    return self
  if i:
    app.Log(
        1,
        "[%s / %s] Importing %s files from %s ...",
        i.i,
        i.n,
        humanize.Commas(len(paths)),
        self.name,
    )
  else:
    app.Log(
        1,
        "Importing %s files from %s ...",
        humanize.Commas(len(paths)),
        self.name,
    )
  all_files_relpaths = public.GetAllFilesRelativePaths(self.clone_dir)
  jobs = (
      scrape_repos_pb2.ImportWorker(
          clone_from_url=self.meta.clone_from_url,
          clone_dir=str(self.clone_dir),
          abspath=p,
          all_files_relpaths=all_files_relpaths,
          preprocessors=indexer.preprocessor,
          index_dir=str(self.index_dir),
      )
      for p in paths
  )
  progress_bar = progressbar.ProgressBar(max_value=len(paths))
  for _ in progress_bar(pool.imap_unordered(IndexContentFiles, jobs)):
    pass
  return self
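# IndexContentFiles is the worker dispatched by _IndexPattern above but is not
# included in this excerpt. The sketch below only mirrors its call shape (one
# ImportWorker proto in, output written beneath job.index_dir); judging from
# the proto fields, the real worker would also run the configured
# preprocessors on each file before writing.
import hashlib


def IndexContentFiles(job: scrape_repos_pb2.ImportWorker) -> None:
  """Sketch of a per-file indexing worker; assumed behavior, not the real one."""
  path = pathlib.Path(job.abspath)
  try:
    text = path.read_text(errors="ignore")
  except OSError:
    return
  # Key the output by content checksum so repeated runs are idempotent.
  out = (pathlib.Path(job.index_dir) /
         hashlib.sha256(text.encode("utf-8")).hexdigest())
  out.write_text(text)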
def test_GetAllFilesRelativePaths_relpath(tempdir: pathlib.Path):
  """Test that relative paths are returned."""
  (tempdir / 'a').touch()
  assert public.GetAllFilesRelativePaths(tempdir) == ['a']
def test_GetAllFilesRelativePaths_empty_dir(tempdir: pathlib.Path):
  """Test that an empty directory returns an empty list."""
  assert public.GetAllFilesRelativePaths(tempdir) == []
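# GetAllFilesRelativePaths itself is not shown in this excerpt. A minimal
# sketch consistent with the two tests above and with the follow_symlinks
# keyword used by the CXX_HEADERS definition might be:
import os


def GetAllFilesRelativePaths(root: pathlib.Path,
                             follow_symlinks: bool = False
                            ) -> typing.List[str]:
  """Sketch: return the relative path of every file beneath a directory."""
  paths = []
  for dirpath, _, filenames in os.walk(root, followlinks=follow_symlinks):
    for filename in filenames:
      abspath = pathlib.Path(dirpath) / filename
      paths.append(str(abspath.relative_to(root)))
  return paths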