コード例 #1
0
def main(argv) -> None:
  """Process the repo meta files listed under every configured language.

  Loads the LanguageCloneList proto named by --clone_list, collects every
  entry of each language's destination directory that IsRepoMetaFile()
  accepts, shuffles them, and passes them to an AsyncWorker. A progress bar
  is refreshed from the worker's counter until the worker finishes.

  Args:
    argv: Command-line arguments remaining after flag parsing. Anything
      beyond the program name is rejected.

  Raises:
    app.UsageError: If extra arguments were supplied, or if --clone_list
      does not point at an existing file.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')
  clone_list = pbutil.FromFile(clone_list_path,
                               scrape_repos_pb2.LanguageCloneList())

  meta_files = []
  for language in clone_list.language:
    directory = pathlib.Path(language.destination_directory)
    if not directory.is_dir():
      # Languages whose destination directory is missing contribute nothing.
      continue
    # iterdir() already yields paths that include `directory`. Joining them
    # onto `directory` a second time would produce "dir/dir/name" whenever
    # the destination directory is relative, so use the entries as-is.
    meta_files.extend(f for f in directory.iterdir() if IsRepoMetaFile(f))
  random.shuffle(meta_files)

  worker = AsyncWorker(meta_files)
  logging.info('Cloning %s repos from GitHub ...',
               humanize.intcomma(worker.max))
  bar = progressbar.ProgressBar(max_value=worker.max, redirect_stderr=True)
  worker.start()
  # Wake up every 0.5 (join timeout) to refresh the bar while the worker runs.
  while worker.is_alive():
    bar.update(worker.i)
    worker.join(.5)
  # One last refresh so the bar reflects the worker's final counter value.
  bar.update(worker.i)
コード例 #2
0
def main(argv):
    """Export every language's index directory beneath --export_path.

    For each language in the LanguageCloneList read from --clone_list, the
    directory named "<destination_directory>.index" is passed to
    ExportIndex() together with a per-language sub-directory of
    --export_path. Languages without an index directory are skipped.

    Args:
        argv: Command-line arguments remaining after flag parsing.

    Raises:
        app.UsageError: On unexpected arguments, when --clone_list is not a
            file, or when --export_path is unset.
    """
    unknown_args = argv[1:]
    if unknown_args:
        message = "Unknown arguments '{}'".format(', '.join(unknown_args))
        raise app.UsageError(message)

    clone_list_path = pathlib.Path(FLAGS.clone_list or '')
    if not clone_list_path.is_file():
        raise app.UsageError('--clone_list is not a file.')
    empty_proto = scrape_repos_pb2.LanguageCloneList()
    clone_list = pbutil.FromFile(clone_list_path, empty_proto)

    if not FLAGS.export_path:
        raise app.UsageError('--export_path not set.')
    export_path = pathlib.Path(FLAGS.export_path)
    export_path.mkdir(parents=True, exist_ok=True)

    # Only the index-directory export is active here. An export from the
    # contentfiles database (the sibling "<destination_directory>.db" opened
    # via contentfiles.ContentFiles and written with ExportDatabase) is
    # deliberately not performed.
    for language in clone_list.language:
        index_dir = pathlib.Path(language.destination_directory + '.index')
        if not index_dir.is_dir():
            continue
        language_export_dir = export_path / language.language
        language_export_dir.mkdir(exist_ok=True)
        ExportIndex(index_dir, language_export_dir)
コード例 #3
0
def main(argv):
    """Import each language's cloned directory into its contentfiles database.

    Loads the LanguageCloneList proto named by --clone_list, validates every
    configured preprocessor up front, then calls ImportFromLanguage() for
    each language whose destination directory exists, sharing one
    multiprocessing pool of FLAGS.processes workers.

    Args:
        argv: Command-line arguments remaining after flag parsing.

    Raises:
        app.UsageError: On unexpected arguments, or when --clone_list is not
            a file.
    """
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments '{}'".format(', '.join(
            argv[1:])))

    clone_list_path = pathlib.Path(FLAGS.clone_list or "")
    if not clone_list_path.is_file():
        raise app.UsageError('--clone_list is not a file.')
    clone_list = pbutil.FromFile(clone_list_path,
                                 scrape_repos_pb2.LanguageCloneList())

    # Error early if the config contains invalid preprocessors. The lookups
    # are performed only for their side effect (raising on a bad name), so a
    # plain loop is used rather than building throwaway lists.
    for language in clone_list.language:
        for importer in language.importer:
            for preprocessor in importer.preprocessor:
                preprocessors.GetPreprocessorFunction(preprocessor)

    pool = multiprocessing.Pool(FLAGS.processes)
    try:
        for language in clone_list.language:
            clone_dir = pathlib.Path(language.destination_directory)
            db_path = clone_dir.parent / (str(clone_dir.name) + '.db')
            # Note: the database object is constructed even when the clone
            # directory turns out to be missing.
            db = contentfiles.ContentFiles(db_path)
            if clone_dir.is_dir():
                ImportFromLanguage(db, language, pool)
    except BaseException:
        # Something went wrong: stop the workers immediately instead of
        # waiting for queued work that is no longer wanted.
        pool.terminate()
        raise
    else:
        # Normal completion: accept no new work and let the workers drain.
        pool.close()
    finally:
        # join() is only valid after close() or terminate(); both paths above
        # guarantee that, so the worker processes are always reaped.
        pool.join()
コード例 #4
0
def main(argv):
  """Export every language's contentfiles database beneath --export_path.

  For each language in the LanguageCloneList read from --clone_list, the
  SQLite database "<destination_directory>.db" (a sibling of the destination
  directory) is opened and handed to ExportDatabase() together with a
  per-language sub-directory of --export_path.

  Args:
    argv: Command-line arguments remaining after flag parsing.

  Raises:
    app.UsageError: On unexpected arguments, when --clone_list is not a file,
      or when --export_path is unset.
  """
  unknown_args = argv[1:]
  if unknown_args:
    message = "Unknown arguments '{}'".format(", ".join(unknown_args))
    raise app.UsageError(message)

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError("--clone_list is not a file.")
  empty_proto = scrape_repos_pb2.LanguageCloneList()
  clone_list = pbutil.FromFile(clone_list_path, empty_proto)

  if not FLAGS.export_path:
    raise app.UsageError("--export_path not set.")
  export_path = pathlib.Path(FLAGS.export_path)
  export_path.mkdir(parents=True, exist_ok=True)

  # To export from contentfiles database.
  for language in clone_list.language:
    clone_dir = pathlib.Path(language.destination_directory)
    db_file = clone_dir.parent / (str(clone_dir.name) + ".db")
    db = contentfiles.ContentFiles(f"sqlite:///{db_file}")
    language_export_dir = export_path / language.language
    with db.Session() as session:
      # The output directory is created only once the session is open.
      language_export_dir.mkdir(exist_ok=True)
      ExportDatabase(session, language_export_dir)
コード例 #5
0
def main(argv) -> None:
  """Run every configured query, then log per-language repo-meta counts.

  A GitHub connection is obtained first, then the LanguageCloneList proto
  named by --clone_list is loaded. Each query of each language is wrapped in
  a QueryScraper and passed to RunQuery(). Once all queries have run, the
  value of GetNumberOfRepoMetas() is logged for every language.

  Args:
    argv: Command-line arguments remaining after flag parsing. Anything
      beyond the program name is rejected.

  Raises:
    app.UsageError: If extra arguments were supplied, or if --clone_list
      does not point at an existing file.
  """
  if argv[1:]:
    raise app.UsageError("Too many command-line arguments.")

  # The connection is established before --clone_list is validated.
  token_paths = ["~/.github/access_tokens/scraper.txt"]
  connection = github_api.GetDefaultGithubConnectionOrDie(
    extra_access_token_paths=token_paths
  )

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError("--clone_list is not a file.")

  empty_proto = scrape_repos_pb2.LanguageCloneList()
  clone_list = pbutil.FromFile(clone_list_path, empty_proto)

  # First pass: run all queries for all languages.
  for language in clone_list.language:
    query_count = humanize.Commas(len(language.query))
    app.Log(
      1, "Scraping %s repos using %s queries ...", language.language, query_count
    )
    for query in language.query:
      scraper = QueryScraper(language, query, connection)
      RunQuery(scraper)

  # Second pass: report counts only after every language has been scraped.
  app.Log(1, "Finished scraping. Indexed repository counts:")
  for language in clone_list.language:
    repo_count = humanize.Commas(GetNumberOfRepoMetas(language))
    app.Log(1, "  %s: %s", language.language, repo_count)
コード例 #6
0
ファイル: scraper.py プロジェクト: BeauJoh/phd
def main(argv) -> None:
  """Run every configured query, then log per-language repo-meta counts.

  Loads the LanguageCloneList proto named by --clone_list, wraps each query
  of each language in a QueryScraper and passes it to RunQuery(). Once all
  queries have run, the value of GetNumberOfRepoMetas() is logged for every
  language.

  Args:
    argv: Command-line arguments remaining after flag parsing. Anything
      beyond the program name is rejected.

  Raises:
    app.UsageError: If extra arguments were supplied, or if --clone_list
      does not point at an existing file.
  """
  if argv[1:]:
    raise app.UsageError('Too many command-line arguments.')

  clone_list_path = pathlib.Path(FLAGS.clone_list or "")
  if not clone_list_path.is_file():
    raise app.UsageError('--clone_list is not a file.')

  empty_proto = scrape_repos_pb2.LanguageCloneList()
  clone_list = pbutil.FromFile(clone_list_path, empty_proto)

  # First pass: run all queries for all languages.
  for language in clone_list.language:
    query_count = humanize.intcomma(len(language.query))
    logging.info('Scraping %s repos using %s queries ...',
                 language.language, query_count)
    for query in language.query:
      scraper = QueryScraper(language, query)
      RunQuery(scraper)

  # Second pass: report counts only after every language has been scraped.
  logging.info('Finished scraping. Indexed repository counts:')
  for language in clone_list.language:
    repo_count = humanize.intcomma(GetNumberOfRepoMetas(language))
    logging.info('  %s: %s', language.language, repo_count)
コード例 #7
0
def main(argv):
    """Run ImportFromLanguage() for every language in the clone list.

    Loads the LanguageCloneList proto named by --clone_list, validates every
    configured preprocessor up front, then calls ImportFromLanguage() for
    each language, sharing one multiprocessing pool of
    FLAGS.indexer_processes workers.

    Args:
        argv: Command-line arguments remaining after flag parsing.

    Raises:
        app.UsageError: On unexpected arguments, or when --clone_list is not
            a file.
    """
    if len(argv) > 1:
        raise app.UsageError("Unknown arguments '{}'".format(", ".join(
            argv[1:])))

    clone_list_path = pathlib.Path(FLAGS.clone_list or "")
    if not clone_list_path.is_file():
        raise app.UsageError("--clone_list is not a file.")
    clone_list = pbutil.FromFile(clone_list_path,
                                 scrape_repos_pb2.LanguageCloneList())

    # Error early if the config contains invalid preprocessors. The lookups
    # are performed only for their side effect (raising on a bad name), so a
    # plain loop is used rather than building throwaway lists.
    for language in clone_list.language:
        for importer in language.importer:
            for preprocessor in importer.preprocessor:
                preprocessors.GetPreprocessorFunction(preprocessor)

    pool = multiprocessing.Pool(FLAGS.indexer_processes)
    try:
        for language in clone_list.language:
            ImportFromLanguage(language, pool)
    except BaseException:
        # Something went wrong: stop the workers immediately instead of
        # waiting for queued work that is no longer wanted.
        pool.terminate()
        raise
    else:
        # Normal completion: accept no new work and let the workers drain.
        pool.close()
    finally:
        # join() is only valid after close() or terminate(); both paths above
        # guarantee that, so the worker processes are always reaped.
        pool.join()