def IndexContentFiles(job: scrape_repos_pb2.ImportWorker) -> None:
  """Index content files.

  Preprocesses the file described by the job and writes one ContentFile
  proto per preprocessed artifact into the job's index directory, keyed by
  the artifact's SHA-256 hex digest.

  Args:
    job: An ImportWorker message describing the file to index.
  """
  # Path of the file relative to the clone root (strip the clone dir prefix
  # and the joining path separator).
  relpath = job.abspath[len(str(job.clone_dir)) + 1:]
  try:
    texts = preprocessors.Preprocess(
        pathlib.Path(job.clone_dir),
        relpath,
        job.all_files_relpaths,
        job.preprocessors,
    )
    for artifact_index, text in enumerate(texts):
      checksum = hashlib.sha256(text.encode("utf-8"))
      contentfile = scrape_repos_pb2.ContentFile(
          clone_from_url=job.clone_from_url,
          relpath=relpath,
          artifact_index=artifact_index,
          sha256=checksum.digest(),
          charcount=len(text),
          linecount=len(text.split("\n")),
          text=text,
      )
      # One file per artifact, named by the hex digest of its contents.
      hexname = binascii.hexlify(contentfile.sha256).decode("utf-8")
      pbutil.ToFile(
          contentfile, pathlib.Path(job.index_dir) / (hexname + ".pbtxt"))
  except UnicodeDecodeError:
    # Best-effort: skip files that cannot be decoded as text.
    app.Warning("Failed to decode %s", relpath)
def ContentFiles(self) -> typing.Iterable[scrape_repos_pb2.ContentFile]:
  """Return an iterator over all contentfiles in the repo.

  Yields nothing (an empty iterable) when the repo has not been indexed.
  The DONE.txt sentinel file in the index directory is skipped.
  """
  if not self.IsIndexed():
    return []
  return (
      pbutil.FromFile(path, scrape_repos_pb2.ContentFile())
      for path in self.index_dir.iterdir()
      if path.name != 'DONE.txt'
  )
def ToProto(self) -> scrape_repos_pb2.ContentFile:
  """Create protocol buffer representation.

  Returns:
    A ContentFile message.
  """
  # Delegate field population to SetProto, which returns the message.
  return self.SetProto(scrape_repos_pb2.ContentFile())
def ExportIndex(index_path: pathlib.Path, export_path: pathlib.Path) -> None:
  """Export the contents of an index directory to a directory.

  Walks index_path recursively, and for every '.pbtxt' file that decodes as
  a ContentFile proto, writes its text to '<sha256 hex>.txt' under
  export_path. Existing output files are left untouched; undecodable files
  are skipped.

  Args:
    index_path: Root directory of the index to read.
    export_path: Directory to write the exported '.txt' files into.
  """
  for subdir, _, filenames in os.walk(index_path):
    for filename in filenames:
      if not filename.endswith('.pbtxt'):
        continue
      # Use a fresh message per file: text-format parsing merges into a
      # non-empty message, so reusing one instance could carry stale field
      # values over from a previously parsed file.
      contentfile = scrape_repos_pb2.ContentFile()
      try:
        pbutil.FromFile(pathlib.Path(os.path.join(subdir, filename)),
                        contentfile)
      except pbutil.DecodeError:
        # Best-effort: silently skip files that are not valid ContentFile
        # protos.
        continue
      sha256 = binascii.hexlify(contentfile.sha256).decode('utf-8')
      out_path = export_path / (sha256 + '.txt')
      if not out_path.is_file():
        with open(out_path, 'w') as f:
          f.write(contentfile.text)
        logging.debug(out_path)