コード例 #1
0
def IndexContentFiles(job: scrape_repos_pb2.ImportWorker) -> None:
  """Preprocess one file of a cloned repo and write the results to the index.

  The file's path relative to the clone directory is passed through
  preprocessors.Preprocess(). Every text it produces is stored as a
  ContentFile message in job.index_dir, in a file named after the hex SHA-256
  of the UTF-8 encoded text with a '.pbtxt' suffix.

  A UnicodeDecodeError raised while preprocessing or indexing is logged as a
  warning and ends the job without raising.

  Args:
    job: The import job. Reads abspath, clone_dir, all_files_relpaths,
      preprocessors, clone_from_url and index_dir.
  """
  # Strip the clone directory prefix plus the separator that follows it.
  prefix_length = len(str(job.clone_dir)) + 1
  relpath = job.abspath[prefix_length:]
  try:
    artifacts = preprocessors.Preprocess(
      pathlib.Path(job.clone_dir),
      relpath,
      job.all_files_relpaths,
      job.preprocessors,
    )
    for artifact_index, artifact in enumerate(artifacts):
      digest = hashlib.sha256(artifact.encode("utf-8")).digest()
      contentfile = scrape_repos_pb2.ContentFile(
        clone_from_url=job.clone_from_url,
        relpath=relpath,
        artifact_index=artifact_index,
        sha256=digest,
        charcount=len(artifact),
        # One more line than there are newline characters.
        linecount=artifact.count("\n") + 1,
        text=artifact,
      )
      filename = binascii.hexlify(contentfile.sha256).decode("utf-8") + ".pbtxt"
      pbutil.ToFile(contentfile, pathlib.Path(job.index_dir) / filename)
  except UnicodeDecodeError:
    app.Warning("Failed to decode %s", relpath)
コード例 #2
0
ファイル: github_repo.py プロジェクト: 50417/phd
 def ContentFiles(self) -> typing.Iterable[scrape_repos_pb2.ContentFile]:
   """Return the content files stored in this repo's index.

   Returns:
     An empty list when IsIndexed() is false. Otherwise a generator that
     lazily loads a ContentFile message from every entry of index_dir,
     skipping the entry named 'DONE.txt'.
   """
   if not self.IsIndexed():
     return []
   index_entries = self.index_dir.iterdir()
   return (
     pbutil.FromFile(entry, scrape_repos_pb2.ContentFile())
     for entry in index_entries
     if entry.name != 'DONE.txt'
   )
コード例 #3
0
ファイル: contentfiles.py プロジェクト: tehranixyz/ProGraML
  def ToProto(self) -> scrape_repos_pb2.ContentFile:
    """Build a protocol buffer representation of this object.

    Returns:
      The result of calling SetProto() on a newly created, empty ContentFile
      message.
    """
    return self.SetProto(scrape_repos_pb2.ContentFile())
コード例 #4
0
ファイル: export_corpus.py プロジェクト: SpringRi/phd
def ExportIndex(index_path: pathlib.Path, export_path: pathlib.Path) -> None:
  """Export the text of every content file in an index to a directory.

  Walks index_path recursively, loads each '.pbtxt' file as a ContentFile
  message and writes its text to '<export_path>/<hex sha256>.txt'. Output
  files that already exist are left untouched. Files that fail to decode are
  logged as warnings and skipped.

  Args:
    index_path: Root of the index directory tree to read.
    export_path: Directory to write the '.txt' files into. It is not created
      by this function.
  """
  for subdir, _, filenames in os.walk(index_path):
    for filename in filenames:
      if not filename.endswith('.pbtxt'):
        continue
      proto_path = pathlib.Path(subdir) / filename
      # A fresh message per file, so fields parsed from an earlier file cannot
      # carry over into this one.
      contentfile = scrape_repos_pb2.ContentFile()
      try:
        pbutil.FromFile(proto_path, contentfile)
      except pbutil.DecodeError:
        # Best-effort export: report the bad file instead of hiding it.
        logging.warning('Failed to decode %s', proto_path)
        continue
      sha256 = binascii.hexlify(contentfile.sha256).decode('utf-8')
      out_path = export_path / (sha256 + '.txt')
      if not out_path.is_file():
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding.
        with open(out_path, 'w', encoding='utf-8') as f:
          f.write(contentfile.text)
        logging.debug(out_path)