Example #1
def ImportFromLanguage(db: contentfiles.ContentFiles,
                       language: scrape_repos_pb2.LanguageToClone,
                       pool: multiprocessing.Pool) -> None:
    """Import contentfiles from a language specification.

  Args:
    db: The database to import to.
    language: The language to import.
    pool: A multiprocessing pool.

  Raises:
    ValueError: If importer field not set.
  """
    if not language.importer:
        raise ValueError('LanguageToClone.importer field not set')

    with db.Session() as session:
        repos_to_import = [
            pathlib.Path(language.destination_directory / f)
            for f in pathlib.Path(language.destination_directory).iterdir()
            if ShouldImportRepo(
                session, pathlib.Path(language.destination_directory / f))
        ]
    random.shuffle(repos_to_import)
    logging.info('Importing %s %s repos ...',
                 humanize.intcomma(len(repos_to_import)),
                 language.language.capitalize())
    for metafile in repos_to_import:
        with db.Session(commit=True) as session:
            ImportRepo(session, language, metafile, pool)
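
A minimal driver sketch for the function above, reusing the proto fields shown in Example #5. The database URL, pool size, and import paths are assumptions, as is the ContentFiles constructor taking a database URL:

import multiprocessing

from datasets.github.scrape_repos import contentfiles
from datasets.github.scrape_repos.proto import scrape_repos_pb2

# Hypothetical database and corpus locations.
db = contentfiles.ContentFiles("sqlite:////tmp/contentfiles.db")
language = scrape_repos_pb2.LanguageToClone(
  language="java",
  destination_directory="/tmp/java_repos",
  importer=[
    scrape_repos_pb2.ContentFilesImporterConfig(
      source_code_pattern=".*\\.java",
    ),
  ],
)
with multiprocessing.Pool() as pool:
  ImportFromLanguage(db, language, pool)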
Example #2
def MaskOnMinStarCount(db: contentfiles.ContentFiles,
                       min_star_count: int) -> None:
    """Mask by the minimum repo star count.

  Args:
    db: The database to modify.
    min_star_count: The minimum number of stars for a repo to be active.
  """
    with db.Session(commit=not FLAGS.dry_run) as session:
        active_repo_count = (session.query(
            contentfiles.GitHubRepository).filter(
                contentfiles.GitHubRepository.active).count())

        repos_to_mark_inactive = (session.query(
            contentfiles.GitHubRepository).filter(
                contentfiles.GitHubRepository.active == True).filter(
                    contentfiles.GitHubRepository.num_stars < min_star_count))
        repos_to_mark_inactive_count = repos_to_mark_inactive.count()

        app.Log(
            1,
            "Marking %s of %s active repos inactive (%.2f %%)",
            humanize.Commas(repos_to_mark_inactive_count),
            humanize.Commas(active_repo_count),
            (repos_to_mark_inactive_count / active_repo_count) * 100,
        )
        repos_to_mark_inactive.update({"active": False})
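
The commit=not FLAGS.dry_run guard means a dry run opens the session but never commits, so the update above is rolled back on exit. A sketch of how such a flag might be declared, assuming plain absl flags (the surrounding application may define it through its own wrapper):

from absl import flags

FLAGS = flags.FLAGS
flags.DEFINE_boolean(
  "dry_run", False,
  "Report what would be changed without committing to the database.")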
Example #3
def MaskOnMaxRepoCount(db: contentfiles.ContentFiles,
                       max_repo_count: int) -> None:
    """Mask by the maximum number of repos.

  Args:
    db: The database to modify.
    max_repo_count: The maximum number of active repos.
  """
    with db.Session(commit=not FLAGS.dry_run) as session:
        active_repos = session.query(
            contentfiles.GitHubRepository.clone_from_url).filter(
                contentfiles.GitHubRepository.active == True)
        active_repos_count = active_repos.count()

        repos_to_mark_inactive_count = max(0,
                                           active_repos_count - max_repo_count)

        repos_to_mark_inactive = active_repos.order_by(
            db.Random()).limit(repos_to_mark_inactive_count)

        app.Log(
            1,
            "Marking %s of %s active repos inactive (%.2f %%)",
            humanize.Commas(repos_to_mark_inactive_count),
            humanize.Commas(active_repos_count),
            (repos_to_mark_inactive_count / active_repos_count) * 100,
        )
        # Can't call Query.update() or Query.delete() when limit() has been called,
        # hence the subquery.
        clone_from_urls = {r[0] for r in repos_to_mark_inactive}
        session.query(contentfiles.GitHubRepository).filter(
            contentfiles.GitHubRepository.clone_from_url.in_(
                clone_from_urls)).update({"active": False},
                                         synchronize_session="fetch")
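
A note on the two-step update above: SQLAlchemy's Query.update() and Query.delete() refuse to run once limit() has been applied, so the limited query is materialized into a set of clone URLs and the update is issued against a fresh query filtered with in_(). The synchronize_session="fetch" argument tells SQLAlchemy to fetch the affected rows first, keeping any objects already in the session consistent with the bulk update.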
Example #4
def test_Exporter(
  db: contentfiles.ContentFiles, empty_db: contentfiles.ContentFiles
):
  """Test that exporter behaves as expected."""
  exporter = export_java_corpus.Exporter(db, empty_db, static_only=True)
  exporter.start()
  exporter.join()

  with empty_db.Session() as s:
    assert s.query(contentfiles.GitHubRepository).count() == 1
    assert s.query(contentfiles.ContentFile).count() == 1

    repo = s.query(contentfiles.GitHubRepository).first()
    assert repo.clone_from_url == "abc"

    contentfile = s.query(contentfiles.ContentFile).first()
    assert contentfile.sha256 != "000"
    assert contentfile.relpath == "foo"
    assert (
      contentfile.text
      == """\
public static void main(String[] args){
  System.out.println("Hello, world");
}
"""
    )
    assert contentfile.charcount == len(
      """\
public static void main(String[] args){
  System.out.println("Hello, world");
}
"""
    )
    assert contentfile.linecount == 4
Example #5
def test_ImportFromLanguage_Java_repo(
  test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
  """An end-to-end test of a Java importer."""
  (tempdir / "Owner_Name" / ".git").mkdir(parents=True)
  (tempdir / "Owner_Name" / "src").mkdir(parents=True)

  # A repo will only be imported if there is a repo meta file.
  pbutil.ToFile(
    scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name"),
    tempdir / "Owner_Name.pbtxt",
  )

  # Create some files in our test repo.
  with open(tempdir / "Owner_Name" / "src" / "A.java", "w") as f:
    f.write(
      """
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
"""
    )
  with open(tempdir / "Owner_Name" / "src" / "B.java", "w") as f:
    f.write(
      """
public class B {
  private static int foo() {return 5;}
}
"""
    )
  with open(tempdir / "Owner_Name" / "README.txt", "w") as f:
    f.write("Hello, world!")

  language = scrape_repos_pb2.LanguageToClone(
    language="foolang",
    query=[],
    destination_directory=str(tempdir),
    importer=[
      scrape_repos_pb2.ContentFilesImporterConfig(
        source_code_pattern=".*\\.java",
        preprocessor=[
          "datasets.github.scrape_repos.preprocessors." "extractors:JavaMethods"
        ],
      ),
    ],
  )
  importer.ImportFromLanguage(test_db, language)
  with test_db.Session() as session:
    query = session.query(contentfiles.ContentFile)
    assert query.count() == 2
    assert set([cf.text for cf in query]) == {
      (
        "public static void helloWorld(){\n"
        '  System.out.println("Hello, world!");\n}\n'
      ),
      "private static int foo(){\n  return 5;\n}\n",
    }
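
A note on what the assertions above encode: the JavaMethods preprocessor rewrites each matching .java file into one contentfile per extracted method, so the two source files (one method each) produce exactly two contentfiles, and README.txt is ignored because it does not match source_code_pattern.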
Example #6
def ProcessRepo(
  input_db: contentfiles.ContentFiles,
  output_db: contentfiles.ContentFiles,
  clone_from_url: str,
  static_only: bool,
):
  """Preprocess all content files from a single scraped repo."""
  with input_db.Session(commit=True) as input_session:
    with output_db.Session(commit=True) as output_session:
      with tempfile.TemporaryDirectory(prefix="phd_") as d:
        DoProcessRepo(
          input_session,
          output_session,
          clone_from_url,
          pathlib.Path(d),
          static_only,
        )
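
A sketch of driving ProcessRepo over a set of scraped repos. The database URLs are assumptions; the clone URL is the test repository used elsewhere in these examples:

input_db = contentfiles.ContentFiles("sqlite:////tmp/scraped.db")
output_db = contentfiles.ContentFiles("sqlite:////tmp/exported.db")

for clone_from_url in [
  "https://github.com/ChrisCummins/empty_repository_for_testing.git",
]:
  ProcessRepo(input_db, output_db, clone_from_url, static_only=True)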
Example #7
def test_Exporter_overloaded_method_extraction(
  db: contentfiles.ContentFiles, empty_db: contentfiles.ContentFiles
):
  """Test that exporter behaves as expected."""
  exporter = export_java_corpus.Exporter(db, empty_db, static_only=True)

  with db.Session(commit=True) as s:
    s.add(
      contentfiles.ContentFile(
        clone_from_url="abc",
        relpath="a/file.txt",
        artifact_index=0,
        sha256="000",
        charcount=200,
        linecount=10,
        text="""
public class HelloWorld {
  private static int foo(int a) {
    return 5;
  }

  private static int foo(float a) {
    return 5;
  }

  private static int foo(double a) {
    return 5;
  }
}
""",
      )
    )

  exporter.start()
  exporter.join()

  with empty_db.Session() as s:
    query = s.query(contentfiles.ContentFile).filter(
      contentfiles.ContentFile.relpath == "a/file.txt"
    )
    assert query.count() == 3
    for cf in query:
      assert "private static int foo(" in cf.text

    indices = {cf.artifact_index for cf in query}
    assert indices == {0, 1, 2}
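
This test pins down the contract for overloaded methods: each method extracted from a single source file is stored as its own contentfile row, sharing clone_from_url and relpath, with artifact_index 0, 1, 2, ... disambiguating the artifacts taken from one file.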
Example #8
def ProcessBatch(
  input_db: contentfiles.ContentFiles,
  pp_db: preprocessed.PreprocessedContentFile,
  outdir: pathlib.Path,
  ids: typing.List[int],
):
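  """Process a batch of preprocessed contentfiles, identified by ID."""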
  with pp_db.Session(commit=True) as pp_session:
    with input_db.Session() as input_session:
      to_preprocess = pp_session.query(
        preprocessed.PreprocessedContentFile
      ).filter(preprocessed.PreprocessedContentFile.id.in_(ids))
      ProcessList(input_session, to_preprocess, outdir)
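
ProcessBatch takes a bounded list of IDs, so a caller would typically chunk the full ID range. A minimal chunking sketch; input_db and pp_db are assumed to be constructed already, and the ID range, chunk size, and output path are illustrative:

all_ids = list(range(1, 10001))
batch_size = 512
for start in range(0, len(all_ids), batch_size):
  ProcessBatch(input_db, pp_db, pathlib.Path("/tmp/out"),
               all_ids[start:start + batch_size])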
Example #9
def test_PipelinedScraper_contentfiles_database_ignores_duplicates(
  language: scrape_repos_pb2.LanguageCloneList,
  query: scrape_repos_pb2.GitHubRepositoryQuery,
  connection: MockGitHubConnection,
  db: contentfiles.ContentFiles,
):
  """Test database contents."""
  scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
  scraper.start()
  scraper.join()
  with db.Session() as session:
    original_contentfile_count = session.query(contentfiles.ContentFile).count()
    assert original_contentfile_count

  # Run the scraper again.
  scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
  scraper.start()
  scraper.join()
  with db.Session() as session:
    assert (
      session.query(contentfiles.ContentFile).count()
      == original_contentfile_count
    )
Example #10
def test_PipelinedScraper_contentfiles_database_repo_contents(
  language: scrape_repos_pb2.LanguageCloneList,
  query: scrape_repos_pb2.GitHubRepositoryQuery,
  connection: MockGitHubConnection,
  db: contentfiles.ContentFiles,
):
  """Test database contents."""
  # This test will fail if the contents of GitHub repository
  # https://github.com/ChrisCummins/empty_repository_for_testing change.
  scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
  scraper.start()
  scraper.join()
  with db.Session() as session:
    assert session.query(contentfiles.GitHubRepository).count() == 1
    repo = session.query(contentfiles.GitHubRepository).first()

    assert repo.clone_from_url == (
      "https://github.com/ChrisCummins/empty_repository_for_testing.git"
    )
Example #11
def ResetExported(db: contentfiles.ContentFiles) -> None:
    """Restore exported status to database.

  Args:
    db: The database to modify.
  """
    with db.Session(commit=not FLAGS.dry_run) as session:
        exported_repos = session.query(contentfiles.GitHubRepository).filter(
            contentfiles.GitHubRepository.exported == True)
        exported_repos_count = exported_repos.count()

        repos_count = session.query(contentfiles.GitHubRepository).count()

        app.Log(
            1,
            "Marking %s of %s repos as not exported (%.2f %%)",
            humanize.Commas(exported_repos_count),
            humanize.Commas(repos_count),
            (exported_repos_count / repos_count) * 100,
        )
        exported_repos.update({"exported": False})
Example #12
def MaskOnMaxRepoFileCount(db: contentfiles.ContentFiles,
                           max_repo_file_count: int) -> None:
    """Mask by the maximum repo file count.

  Args:
    db: The database to modify.
    max_repo_file_count: The maxmium number of contentfiles in a repo for it to
        be active.
  """
    with db.Session(commit=not FLAGS.dry_run) as session:
        active_repo_count = (session.query(
            contentfiles.GitHubRepository).filter(
                contentfiles.GitHubRepository.active).count())

        repos_to_mark_inactive = (session.query(
            contentfiles.ContentFile.clone_from_url,
            sql.func.count(contentfiles.ContentFile.clone_from_url),
        ).join(contentfiles.GitHubRepository).filter(
            contentfiles.GitHubRepository.active == True).group_by(
                contentfiles.ContentFile.clone_from_url).having(
                    sql.func.count(contentfiles.ContentFile.clone_from_url) >
                    max_repo_file_count))
        repos_to_mark_inactive_count = repos_to_mark_inactive.count()

        app.Log(
            1,
            "Marking %s of %s active repos inactive (%.2f %%)",
            humanize.Commas(repos_to_mark_inactive_count),
            humanize.Commas(active_repo_count),
            (repos_to_mark_inactive_count / active_repo_count) * 100,
        )

        # Can't call Query.update() or Query.delete() when limit() has been called,
        # hence the subquery.
        clone_from_urls = {r.clone_from_url for r in repos_to_mark_inactive}
        session.query(contentfiles.GitHubRepository).filter(
            contentfiles.GitHubRepository.clone_from_url.in_(
                clone_from_urls)).update({"active": False},
                                         synchronize_session="fetch")
Example #13
def Reset(db: contentfiles.ContentFiles) -> None:
    """Restore active status to database.

  Args:
    db: The database to modify.
  """
    with db.Session(commit=not FLAGS.dry_run) as session:
        inactive_repos = session.query(contentfiles.GitHubRepository).filter(
            contentfiles.GitHubRepository.active == False)
        inactive_repos_count = inactive_repos.count()

        repos_count = session.query(contentfiles.GitHubRepository).count()

        app.Log(
            1,
            "Restoring active status to %s of %s repos (%.2f %%)",
            humanize.Commas(inactive_repos_count),
            humanize.Commas(repos_count),
            (inactive_repos_count / repos_count) * 100,
        )
        inactive_repos.update({"active": True})

        inactive_cf = session.query(contentfiles.ContentFile).filter(
            contentfiles.ContentFile.active == False)
        inactive_cf_count = inactive_cf.count()

        cf_count = session.query(contentfiles.ContentFile).count()

        app.Log(
            1,
            "Restoring active status to %s of %s content files (%.2f %%)",
            humanize.Commas(inactive_cf_count),
            humanize.Commas(cf_count),
            (inactive_cf_count / cf_count) * 100,
        )
        inactive_cf.update({"active": True})
Example #14
def test_PipelinedScraper_contentfiles_database_contents(
  language: scrape_repos_pb2.LanguageCloneList,
  query: scrape_repos_pb2.GitHubRepositoryQuery,
  connection: MockGitHubConnection,
  db: contentfiles.ContentFiles,
):
  """Test database contents."""
  # This test will fail if the contents of GitHub repository
  # https://github.com/ChrisCummins/empty_repository_for_testing change.
  scraper = pipelined_scraper.PipelinedScraper(language, query, connection, db)
  scraper.start()
  scraper.join()
  with db.Session() as session:
    assert session.query(contentfiles.ContentFile).count() == 1
    contentfile = session.query(contentfiles.ContentFile).first()

    assert contentfile.clone_from_url == (
      "https://github.com/ChrisCummins/empty_repository_for_testing.git"
    )
    assert contentfile.relpath == "HelloWorld.java"
    assert contentfile.artifact_index == 0
    assert contentfile.text == HELLO_WORLD_TEXT
    assert contentfile.charcount == len(HELLO_WORLD_TEXT)
    assert contentfile.linecount == len(HELLO_WORLD_TEXT.split("\n"))
Example #15
def PopulateBytecodeTable(
  cf: contentfiles.ContentFiles,
  language: str,
  db: bytecode_database.Database,
  pool: typing.Optional[multiprocessing.Pool] = None,
):
  """Populate a bytecode database from a contentfiles database."""
  # Only one process at a time can run this method.
  mutex = lockfile.AutoLockFile(granularity="function")

  # We use the database URL as the name of the source.
  source_name = cf.url

  # Read source files from the contentfiles database, process them into
  # bytecodes and, if successful, write them into the database. Files are
  # processed in order of their numeric ID in the contentfiles database, so
  # that an interrupted job can resume from where it left off.
  with db.Session() as s:
    # Get the ID of the last-processed bytecode file to resume from.
    resume_from = int(
      (
        s.query(bytecode_database.LlvmBytecode.relpath)
        .filter(bytecode_database.LlvmBytecode.source_name == cf.url)
        .filter(bytecode_database.LlvmBytecode.language == language)
        # Note the cast to integer: relpath is a string column; sorting it in
        # its native type would compare lexicographically (e.g. '9' > '10').
        .order_by(
          sql.cast(bytecode_database.LlvmBytecode.relpath, sql.Integer).desc()
        )
        .limit(1)
        .first()
        or (0,)
      )[0]
    )

  with mutex, cf.Session() as cf_s, sqlutil.BufferedDatabaseWriter(
    db, max_buffer_length=10
  ) as writer:
    # Get the ID of the last contentfile to process.
    n = (
      cf_s.query(contentfiles.ContentFile.id)
      .join(contentfiles.GitHubRepository)
      .filter(contentfiles.GitHubRepository.language == language)
      .order_by(contentfiles.ContentFile.id.desc())
      .limit(1)
      .one_or_none()
      or (0,)
    )[0]
    app.Log(
      1,
      "Starting at row %s / %s",
      humanize.Commas(resume_from),
      humanize.Commas(n),
    )

    # A query to return the <id,text> tuples of files to process.
    q = (
      cf_s.query(contentfiles.ContentFile.id, contentfiles.ContentFile.text)
      .filter(contentfiles.ContentFile.id > resume_from)
      .join(contentfiles.GitHubRepository)
      .filter(contentfiles.GitHubRepository.language == language)
      .order_by(contentfiles.ContentFile.id)
    )

    row_batches = sqlutil.OffsetLimitBatchedQuery(
      q, batch_size=FLAGS.batch_size
    )

    for i, batch in zip(range(resume_from, n + 1), row_batches):
      app.Log(
        1,
        "Processing batch of %d contentfiles -> bytecodes, %s / %s (%.1f%%)",
        FLAGS.batch_size,
        humanize.Commas(i),
        humanize.Commas(n),
        (i / n) * 100,
      )
      protos = GetBytecodesFromContentFiles(source_name, language, batch.rows)
      writer.AddMany(
        [
          bytecode_database.LlvmBytecode(
            **bytecode_database.LlvmBytecode.FromProto(proto)
          )
          for proto in protos
        ]
      )
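
For reference, the batching helper used above can be approximated with plain OFFSET/LIMIT pagination over a SQLAlchemy query. This is a sketch, not labm8's implementation; the batch shape (an object exposing a rows attribute) is inferred from its use above:

import typing


class Batch(typing.NamedTuple):
  rows: typing.List[typing.Any]


def OffsetLimitBatchedQuerySketch(query, batch_size: int):
  """Yield successive Batch objects from a SQLAlchemy query."""
  offset = 0
  while True:
    rows = query.offset(offset).limit(batch_size).all()
    if not rows:
      return
    yield Batch(rows=rows)
    offset += batch_size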