def main(argv) -> None:
  """Main entry point.

  Gathers the repo meta files found in the destination directory of every
  language in the --clone_list file, shuffles them, and passes them to an
  AsyncWorker, refreshing a progress bar until the worker is no longer alive.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, or if --clone_list does not
      point to a file.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  meta_files = []
  for language in clone_list.language:
    directory = pathlib.Path(language.destination_directory)
    if directory.is_dir():
      # Path.iterdir() already yields paths that include `directory`. Joining
      # them onto `directory` a second time duplicates the prefix whenever
      # the destination directory is relative, so use the entries as-is.
      meta_files.extend(
          path for path in directory.iterdir() if IsRepoMetaFile(path))
  random.shuffle(meta_files)

  worker = AsyncWorker(meta_files)
  logging.info('Cloning %s repos from GitHub ...',
               humanize.intcomma(worker.max))
  bar = progressbar.ProgressBar(max_value=worker.max, redirect_stderr=True)
  worker.start()
  # Poll the worker twice a second so the bar tracks its progress counter.
  while worker.is_alive():
    bar.update(worker.i)
    worker.join(.5)
  # One last refresh so the bar shows the final count.
  bar.update(worker.i)
def main(argv):
  """Main entry point.

  For every language in the --clone_list file whose '<destination>.index'
  directory exists, creates a per-language subdirectory under --export_path
  and calls ExportIndex() on it.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, --clone_list is not a file,
      or --export_path is unset.
  """
  extra_args = argv[1:]
  if extra_args:
    raise app.UsageError("Unknown arguments '{}'".format(', '.join(
        extra_args)))

  clone_list_path = pathlib.Path(FLAGS.clone_list or '')
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  if not FLAGS.export_path:
    raise app.UsageError('--export_path not set.')
  export_root = pathlib.Path(FLAGS.export_path)
  export_root.mkdir(parents=True, exist_ok=True)

  # Alternative (disabled): export from the contentfiles database.
  # for language in clone_list.language:
  #   d = pathlib.Path(language.destination_directory)
  #   d = d.parent / (str(d.name) + '.db')
  #   db = contentfiles.ContentFiles(d)
  #   with db.Session() as session:
  #     (export_path / language.language).mkdir(exist_ok=True)
  #     ExportDatabase(session, export_path / language.language)

  # Export from the index directory.
  for language in clone_list.language:
    index_dir = pathlib.Path(language.destination_directory + '.index')
    if not index_dir.is_dir():
      # Nothing has been indexed for this language; skip it.
      continue
    language_export_dir = export_root / language.language
    language_export_dir.mkdir(exist_ok=True)
    ExportIndex(index_dir, language_export_dir)
def main(argv):
  """Main entry point.

  Validates the preprocessors named in the --clone_list file, then calls
  ImportFromLanguage() for every language whose destination directory exists,
  sharing one multiprocessing pool of FLAGS.processes workers.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, or if --clone_list does not
      point to a file.
  """
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments '{}'".format(', '.join(
        argv[1:])))

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  # Error early if the config contains invalid preprocessors. Only the lookup
  # matters here, so use a plain loop rather than building a throwaway list.
  for language in clone_list.language:
    for importer in language.importer:
      for preprocessor in importer.preprocessor:
        preprocessors.GetPreprocessorFunction(preprocessor)

  pool = multiprocessing.Pool(FLAGS.processes)
  try:
    for language in clone_list.language:
      d = pathlib.Path(language.destination_directory)
      # The database path is the destination directory name plus '.db',
      # placed in the same parent directory.
      d = d.parent / (str(d.name) + '.db')
      db = contentfiles.ContentFiles(d)
      if pathlib.Path(language.destination_directory).is_dir():
        ImportFromLanguage(db, language, pool)
  except BaseException:
    # On failure, stop the workers immediately rather than draining the queue.
    pool.terminate()
    raise
  else:
    # On success, let the workers finish any outstanding work.
    pool.close()
  finally:
    # Pools hold OS resources that must be released explicitly; leaving a
    # pool unjoined can hang the process at interpreter finalization.
    pool.join()
def main(argv):
  """Main entry point.

  For every language in the --clone_list file, opens a session on the
  language's '<destination>.db' sqlite database and calls ExportDatabase() to
  write into a per-language subdirectory of --export_path.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, --clone_list is not a file,
      or --export_path is unset.
  """
  extra_args = argv[1:]
  if extra_args:
    raise app.UsageError("Unknown arguments '{}'".format(", ".join(extra_args)))

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError("--clone_list is not a file.")
  clone_list = pbutil.FromFile(
      clone_list_path, scrape_repos_pb2.LanguageCloneList()
  )

  if not FLAGS.export_path:
    raise app.UsageError("--export_path not set.")
  export_root = pathlib.Path(FLAGS.export_path)
  export_root.mkdir(parents=True, exist_ok=True)

  # Export from the contentfiles database.
  for language in clone_list.language:
    destination = pathlib.Path(language.destination_directory)
    # The database path is the destination directory name plus '.db', placed
    # in the same parent directory.
    db_path = destination.parent / (destination.name + ".db")
    db = contentfiles.ContentFiles(f"sqlite:///{db_path}")
    language_export_dir = export_root / language.language
    with db.Session() as session:
      language_export_dir.mkdir(exist_ok=True)
      ExportDatabase(session, language_export_dir)
def main(argv) -> None:
  """Main entry point.

  Obtains a GitHub connection, runs a QueryScraper for every query of every
  language in the --clone_list file, then logs the number of repo metas
  reported by GetNumberOfRepoMetas() for each language.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, or if --clone_list does not
      point to a file.
  """
  if argv[1:]:
    raise app.UsageError("Too many command-line arguments.")

  # The connection is set up before the clone list is validated.
  access_token_paths = ["~/.github/access_tokens/scraper.txt"]
  connection = github_api.GetDefaultGithubConnectionOrDie(
      extra_access_token_paths=access_token_paths
  )

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError("--clone_list is not a file.")
  clone_list = pbutil.FromFile(
      clone_list_path, scrape_repos_pb2.LanguageCloneList()
  )

  for language in clone_list.language:
    query_count = humanize.Commas(len(language.query))
    app.Log(
        1,
        "Scraping %s repos using %s queries ...",
        language.language,
        query_count,
    )
    for query in language.query:
      scraper = QueryScraper(language, query, connection)
      RunQuery(scraper)

  # Summarise how many repo metas exist per language.
  app.Log(1, "Finished scraping. Indexed repository counts:")
  for language in clone_list.language:
    repo_count = humanize.Commas(GetNumberOfRepoMetas(language))
    app.Log(1, " %s: %s", language.language, repo_count)
def main(argv) -> None:
  """Main entry point.

  Runs a QueryScraper for every query of every language in the --clone_list
  file, then logs the number of repo metas reported by GetNumberOfRepoMetas()
  for each language.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, or if --clone_list does not
      point to a file.
  """
  if argv[1:]:
    raise app.UsageError('Too many command-line arguments.')

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  for language in clone_list.language:
    query_count = humanize.intcomma(len(language.query))
    logging.info('Scraping %s repos using %s queries ...',
                 language.language, query_count)
    for query in language.query:
      scraper = QueryScraper(language, query)
      RunQuery(scraper)

  # Summarise how many repo metas exist per language.
  logging.info('Finished scraping. Indexed repository counts:')
  for language in clone_list.language:
    repo_count = humanize.intcomma(GetNumberOfRepoMetas(language))
    logging.info(' %s: %s', language.language, repo_count)
def main(argv):
  """Main entry point.

  Validates the preprocessors named in the --clone_list file, then calls
  ImportFromLanguage() for every language, sharing one multiprocessing pool
  of FLAGS.indexer_processes workers.

  Args:
    argv: Command-line arguments. Anything beyond the first element is
      rejected.

  Raises:
    app.UsageError: If extra arguments are given, or if --clone_list does not
      point to a file.
  """
  if len(argv) > 1:
    raise app.UsageError("Unknown arguments '{}'".format(", ".join(
        argv[1:])))

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError("--clone_list is not a file.")
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  # Error early if the config contains invalid preprocessors. Only the lookup
  # matters here, so use a plain loop rather than building a throwaway list.
  for language in clone_list.language:
    for importer in language.importer:
      for preprocessor in importer.preprocessor:
        preprocessors.GetPreprocessorFunction(preprocessor)

  pool = multiprocessing.Pool(FLAGS.indexer_processes)
  try:
    for language in clone_list.language:
      ImportFromLanguage(language, pool)
  except BaseException:
    # On failure, stop the workers immediately rather than draining the queue.
    pool.terminate()
    raise
  else:
    # On success, let the workers finish any outstanding work.
    pool.close()
  finally:
    # Pools hold OS resources that must be released explicitly; leaving a
    # pool unjoined can hang the process at interpreter finalization.
    pool.join()