Esempio n. 1
0
def ImportRepo(session: orm.session.Session,
               language: scrape_repos_pb2.LanguageToClone,
               metafile: pathlib.Path, pool: multiprocessing.Pool) -> None:
    """Import contentfiles from a cloned repository into the database.

    Args:
      session: A database session to import to.
      language: The language specification for the repo.
      metafile: The repo metafile.
      pool: A multiprocessing pool.
    """
    meta = pbutil.FromFile(metafile, scrape_repos_pb2.GitHubRepoMetadata())
    clone_dir = metafile.parent / f'{meta.owner}_{meta.name}'
    repo = contentfiles.GitHubRepository.GetOrAdd(session, meta)
    repo.language = language.language

    for importer in language.importer:
        raw_pattern = importer.source_code_pattern
        if not raw_pattern:
            logging.error('No source_code_pattern specified! Stopping now.')
            return

        # Anchor the configured regex at the clone directory, stripping a
        # leading '^' anchor if present.
        if raw_pattern[0] == '^':
            pattern = f'{clone_dir}/{raw_pattern[1:]}'
        else:
            pattern = f'{clone_dir}/{raw_pattern}'
        find_cmd = [
            'find', str(clone_dir), '-type', 'f', '-regex', pattern,
            '-not', '-path', '*/.git/*'
        ]
        logging.debug('$ %s', ' '.join(find_cmd))
        find_output = subprocess.check_output(find_cmd, universal_newlines=True)
        paths = find_output.rstrip().split('\n')
        # 'find' with no matches yields a single empty string after split.
        if paths == ['']:
            logging.debug('No files to import from %s', clone_dir)
            return
        logging.info("Importing %s '%s' files from %s ...",
                     humanize.intcomma(len(paths)),
                     importer.source_code_pattern, clone_dir)
        all_files_relpaths = public.GetAllFilesRelativePaths(clone_dir)
        jobs = []
        for abspath in paths:
            jobs.append(
                scrape_repos_pb2.ImportWorker(
                    clone_from_url=meta.clone_from_url,
                    clone_dir=str(clone_dir),
                    abspath=abspath,
                    all_files_relpaths=all_files_relpaths,
                    preprocessors=importer.preprocessor,
                ))
        bar = progressbar.ProgressBar(max_value=len(jobs))
        for outputs in bar(pool.imap_unordered(ImportWorker, jobs)):
            for output in outputs:
                session.add(output)
Esempio n. 2
0
 def _IndexPattern(self,
                   indexer: scrape_repos_pb2.ContentFilesImporterConfig,
                   pool: multiprocessing.Pool,
                   i: IndexProgress) -> 'GitHubRepo':
     """Index files in this repo which match the indexer's pattern.

     Args:
       indexer: The importer config whose source_code_pattern selects the
         files to index.
       pool: A multiprocessing pool used to index files in parallel.
       i: Progress counter used only for logging; may be falsy.

     Returns:
       Self, to permit call chaining.
     """
     pattern = indexer.source_code_pattern
     # Anchor the configured regex at the clone directory, stripping a
     # leading '^' anchor if present.
     pattern = (f'{self.clone_dir}/{pattern[1:]}'
                if pattern[0] == '^' else f'{self.clone_dir}/{pattern}')
     cmd = [
         'find',
         str(self.clone_dir), '-type', 'f', '-regex', pattern, '-not',
         '-path', '*/.git/*'
     ]
     logging.debug('$ %s', ' '.join(cmd))
     paths = subprocess.check_output(
         cmd, universal_newlines=True).rstrip().split('\n')
     # 'find' with no matches produces a single empty string after split.
     if len(paths) == 1 and not paths[0]:
         logging.debug('No files to import from %s', self.clone_dir)
         return self
     if i:
         logging.info("[%s / %s] Importing %s files from %s ...", i.i, i.n,
                      humanize.intcomma(len(paths)), self.name)
     else:
         logging.info("Importing %s files from %s ...",
                      humanize.intcomma(len(paths)), self.name)
     all_files_relpaths = public.GetAllFilesRelativePaths(self.clone_dir)
     # A lazy generator: worker protos are built on demand as the pool
     # consumes them.
     jobs = (scrape_repos_pb2.ImportWorker(
         clone_from_url=self.meta.clone_from_url,
         clone_dir=str(self.clone_dir),
         abspath=p,
         all_files_relpaths=all_files_relpaths,
         preprocessors=indexer.preprocessor,
         index_dir=str(self.index_dir),
     ) for p in paths)
     progress_bar = progressbar.ProgressBar(max_value=len(paths))
     for _ in progress_bar(pool.imap_unordered(IndexContentFiles, jobs)):
         pass
     # BUG FIX: the successful path previously fell off the end and returned
     # None despite the 'GitHubRepo' annotation; the early-exit path above
     # already returns self, so mirror it here.
     return self
Esempio n. 3
0
    "stdlib.h",
    "stdnoreturn.h",
    "string.h",
    "tgmath.h",
    "threads.h",
    "time.h",
    "uchar.h",
    "wchar.h",
    "wctype.h",
}

# The set of headers in the C++ standard library.
_UNAME = "mac" if sys.platform == "darwin" else "linux"
# Data paths of the bundled libcxx standard headers and clang's builtin
# headers for the current platform.
_LIBCXX_HEADER_DIRS = (
    f"libcxx_{_UNAME}/include/c++/v1",
    f"libcxx_{_UNAME}/lib/clang/6.0.0/include",
)
CXX_HEADERS = set()
for _header_dir in _LIBCXX_HEADER_DIRS:
    CXX_HEADERS.update(
        public.GetAllFilesRelativePaths(
            bazelutil.DataPath(_header_dir), follow_symlinks=True))


@public.dataset_preprocessor
def CxxHeaders(
    import_root: pathlib.Path,
    file_relpath: str,
    text: str,
    all_file_relpaths: typing.List[str],
) -> typing.List[str]:
    """Inline C++ includes.
Esempio n. 4
0
 def _IndexPattern(
   self,
   indexer: scrape_repos_pb2.ContentFilesImporterConfig,
   pool: multiprocessing.Pool,
   i: IndexProgress,
 ) -> "GitHubRepo":
   """Index files in this repo which match the indexer's pattern.

   Args:
     indexer: The importer config whose source_code_pattern selects the
       files to index.
     pool: A multiprocessing pool used to index files in parallel.
     i: Progress counter used only for logging; may be falsy.

   Returns:
     Self, to permit call chaining.
   """
   pattern = indexer.source_code_pattern
   # Anchor the configured regex at the clone directory, stripping a leading
   # '^' anchor if present.
   pattern = (
     f"{self.clone_dir}/{pattern[1:]}"
     if pattern[0] == "^"
     else f"{self.clone_dir}/{pattern}"
   )
   cmd = [
     "find",
     str(self.clone_dir),
     "-type",
     "f",
     "-regex",
     pattern,
     "-not",
     "-path",
     "*/.git/*",
   ]
   app.Log(2, "$ %s", " ".join(cmd))
   paths = (
     subprocess.check_output(cmd, universal_newlines=True).rstrip().split("\n")
   )
   # 'find' with no matches produces a single empty string after split.
   if len(paths) == 1 and not paths[0]:
     app.Log(2, "No files to import from %s", self.clone_dir)
     return self
   if i:
     app.Log(
       1,
       "[%s / %s] Importing %s files from %s ...",
       i.i,
       i.n,
       humanize.Commas(len(paths)),
       self.name,
     )
   else:
     app.Log(
       1,
       "Importing %s files from %s ...",
       humanize.Commas(len(paths)),
       self.name,
     )
   all_files_relpaths = public.GetAllFilesRelativePaths(self.clone_dir)
   # A lazy generator: worker protos are built on demand as the pool
   # consumes them.
   jobs = (
     scrape_repos_pb2.ImportWorker(
       clone_from_url=self.meta.clone_from_url,
       clone_dir=str(self.clone_dir),
       abspath=p,
       all_files_relpaths=all_files_relpaths,
       preprocessors=indexer.preprocessor,
       index_dir=str(self.index_dir),
     )
     for p in paths
   )
   progress_bar = progressbar.ProgressBar(max_value=len(paths))
   for _ in progress_bar(pool.imap_unordered(IndexContentFiles, jobs)):
     pass
   # BUG FIX: the successful path previously fell off the end and returned
   # None despite the "GitHubRepo" annotation; the early-exit path above
   # already returns self, so mirror it here.
   return self
Esempio n. 5
0
def test_GetAllFilesRelativePaths_relpath(tempdir: pathlib.Path):
    """A file in the root directory is reported by its bare relative path."""
    file_to_find = tempdir / 'a'
    file_to_find.touch()
    relpaths = public.GetAllFilesRelativePaths(tempdir)
    assert relpaths == ['a']
Esempio n. 6
0
def test_GetAllFilesRelativePaths_empty_dir(tempdir: pathlib.Path):
    """A directory containing no files yields an empty list."""
    relpaths = public.GetAllFilesRelativePaths(tempdir)
    assert relpaths == []