Example #1
0
def test_ImportFromLanguage_Java_repo(
  test_db: contentfiles.ContentFiles, tempdir: pathlib.Path
):
  """Run the Java importer end-to-end on a small fake repository.

  A cloned-repo layout is created under `tempdir`, the importer is run with
  the JavaMethods preprocessor, and the database is expected to contain
  exactly the two methods from the Java sources.
  """
  repo_dir = tempdir / "Owner_Name"
  src_dir = repo_dir / "src"
  (repo_dir / ".git").mkdir(parents=True)
  src_dir.mkdir(parents=True)

  # Without a repo meta file the repo is not imported.
  repo_meta = scrape_repos_pb2.GitHubRepoMetadata(owner="Owner", name="Name")
  pbutil.ToFile(repo_meta, tempdir / "Owner_Name.pbtxt")

  # Populate the test repo: two Java sources and one non-Java file.
  (src_dir / "A.java").write_text(
    """
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
"""
  )
  (src_dir / "B.java").write_text(
    """
public class B {
  private static int foo() {return 5;}
}
"""
  )
  (repo_dir / "README.txt").write_text("Hello, world!")

  java_importer = scrape_repos_pb2.ContentFilesImporterConfig(
    source_code_pattern=".*\\.java",
    preprocessor=[
      "datasets.github.scrape_repos.preprocessors.extractors:JavaMethods"
    ],
  )
  language = scrape_repos_pb2.LanguageToClone(
    language="foolang",
    query=[],
    destination_directory=str(tempdir),
    importer=[java_importer],
  )

  # The methods as they are expected to appear in the database.
  expected_methods = {
    'public static void helloWorld(){\n  System.out.println("Hello, world!");\n}\n',
    "private static int foo(){\n  return 5;\n}\n",
  }

  importer.ImportFromLanguage(test_db, language)
  with test_db.Session() as session:
    stored = session.query(contentfiles.ContentFile)
    assert stored.count() == 2
    assert {cf.text for cf in stored} == expected_methods
Example #2
0
def test_ImportFromLanguage_Java_repo(tempdir: pathlib.Path):
  """An end-to-end test of a Java importer.

  Creates a fake cloned repo under `tempdir / 'src'`, indexes it with the
  JavaMethods preprocessor, and checks that the index holds exactly the two
  methods from the Java sources.
  """
  (tempdir / 'src').mkdir()
  (tempdir / 'src' / 'Owner_Name' / '.git').mkdir(parents=True)
  (tempdir / 'src' / 'Owner_Name' / 'src').mkdir(parents=True)

  # A repo will only be imported if there is a repo meta file.
  pbutil.ToFile(scrape_repos_pb2.GitHubRepoMetadata(
      owner='Owner',
      name='Name'),
      tempdir / 'src' / 'Owner_Name.pbtxt')

  # Create some files in our test repo.
  with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'A.java', 'w') as f:
    f.write("""
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
""")
  with open(tempdir / 'src' / 'Owner_Name' / 'src' / 'B.java', 'w') as f:
    f.write("""
public class B {
  private static int foo() {return 5;}
}
""")
  with open(tempdir / 'src' / 'Owner_Name' / 'README.txt', 'w') as f:
    f.write('Hello, world!')

  language = scrape_repos_pb2.LanguageToClone(
      language='foolang',
      query=[],
      destination_directory=str(tempdir / 'src'),
      importer=[
        scrape_repos_pb2.ContentFilesImporterConfig(
            source_code_pattern='.*\\.java',
            preprocessor=["datasets.github.scrape_repos.preprocessors."
                          "extractors:JavaMethods"]),
      ]
  )
  # Manage the pool with a context manager so that its worker process is
  # shut down when indexing returns, rather than leaked for the rest of the
  # test session.
  with multiprocessing.Pool(1) as pool:
    indexer.ImportFromLanguage(language, pool)

  test_repo = github_repo.GitHubRepo(tempdir / 'src' / 'Owner_Name.pbtxt')
  assert (test_repo.index_dir / 'DONE.txt').is_file()
  # DONE.txt plus two more entries (presumably one per extracted method).
  assert len(list(test_repo.index_dir.iterdir())) == 3
  indexed_files = list(test_repo.ContentFiles())
  assert len(indexed_files) == 2
  assert {cf.text for cf in indexed_files} == {
    ('public static void helloWorld(){\n'
     '  System.out.println("Hello, world!");\n}\n'),
    'private static int foo(){\n  return 5;\n}\n',
  }
Example #3
0
def test_GitHubRepo_Index_not_cloned(test_repo: github_repo.GitHubRepo):
    """Indexing a repo which is not cloned does nothing.

    The clone directory is removed before indexing, and the repo is expected
    to report itself as not indexed both before and after the Index() call.
    """
    fs.rm(test_repo.clone_dir)
    assert not test_repo.IsIndexed()
    # Manage the pool with a context manager so that its worker process is
    # shut down after the call instead of being leaked.
    with multiprocessing.Pool(1) as pool:
        test_repo.Index([
            scrape_repos_pb2.ContentFilesImporterConfig(
                source_code_pattern='.*\\.java',
                preprocessor=[
                    "datasets.github.scrape_repos.preprocessors."
                    "extractors:JavaMethods"
                ]),
        ], pool)
    assert not test_repo.IsIndexed()
Example #4
0
def test_GitHubRepo_Index_index_dir_paths(tempdir: pathlib.Path):
    """Test that index directories are produced in the correct location.

    For a repo created under `tempdir / 'java'`, indexing is expected to
    create `tempdir / 'java.index'` containing a `Foo_Bar` directory.
    """
    repo = _CreateTestRepo(tempdir / 'java', 'Foo', 'Bar')
    # Manage the pool with a context manager so that its worker process is
    # shut down after indexing instead of being leaked.
    with multiprocessing.Pool(1) as pool:
        repo.Index([
            scrape_repos_pb2.ContentFilesImporterConfig(
                source_code_pattern='.*\\.java',
                preprocessor=[
                    "datasets.github.scrape_repos.preprocessors."
                    "extractors:JavaMethods"
                ]),
        ], pool)
    assert (tempdir / 'java.index').is_dir()
    assert (tempdir / 'java.index' / 'Foo_Bar').is_dir()
Example #5
0
def language(
  tempdir: pathlib.Path, query: scrape_repos_pb2.GitHubRepositoryQuery
) -> scrape_repos_pb2.LanguageToClone:
  """Build a Java LanguageToClone config that clones into `tempdir`.

  Args:
    tempdir: Directory used as the config's destination_directory.
    query: The single repository query to include in the config.

  Returns:
    A LanguageToClone message for language "java" with one importer whose
    source code pattern matches Java files.
  """
  # The return annotation names the message type that is actually built
  # here (LanguageToClone), not the LanguageCloneList container.
  return scrape_repos_pb2.LanguageToClone(
    language="java",
    query=[query],
    destination_directory=str(tempdir),
    importer=[
      scrape_repos_pb2.ContentFilesImporterConfig(
        source_code_pattern=".*\\.java"
      )
    ],
  )
Example #6
0
def test_GitHubRepo_Index_Java_repo(test_repo: github_repo.GitHubRepo):
    """An end-to-end test of a Java indexer.

    Writes two Java sources and a README into the repo's clone directory,
    indexes the repo with the JavaMethods preprocessor, and checks that the
    index holds exactly the two methods from the Java sources.
    """
    (test_repo.clone_dir / "src").mkdir(exist_ok=True)
    with open(test_repo.clone_dir / "src" / "A.java", "w") as f:
        f.write("""
public class A {
  public static void helloWorld() {
    System.out.println("Hello, world!");
  }
}
""")
    with open(test_repo.clone_dir / "src" / "B.java", "w") as f:
        f.write("""
public class B {
  private static int foo() {return 5;}
}
""")
    with open(test_repo.clone_dir / "README.txt", "w") as f:
        f.write("Hello, world!")

    # Nothing is indexed before Index() is called.
    assert not test_repo.index_dir.is_dir()
    assert not list(test_repo.ContentFiles())

    # Manage the pool with a context manager so that its worker process is
    # shut down after indexing instead of being leaked.
    with multiprocessing.Pool(1) as pool:
        test_repo.Index(
            [
                scrape_repos_pb2.ContentFilesImporterConfig(
                    source_code_pattern=".*\\.java",
                    preprocessor=[
                        "datasets.github.scrape_repos.preprocessors."
                        "extractors:JavaMethods"
                    ],
                ),
            ],
            pool,
        )
    assert test_repo.index_dir.is_dir()

    assert (test_repo.index_dir / "DONE.txt").is_file()
    # DONE.txt plus two more entries (presumably one per extracted method).
    assert len(list(test_repo.index_dir.iterdir())) == 3
    indexed_files = list(test_repo.ContentFiles())
    assert len(indexed_files) == 2

    assert {cf.text for cf in indexed_files} == {
        ("public static void helloWorld(){\n"
         '  System.out.println("Hello, world!");\n}\n'),
        "private static int foo(){\n  return 5;\n}\n",
    }
Example #7
0
def GetLanguageToClone(
  query_prefix: str, destination_dir: str
) -> scrape_repos_pb2.LanguageToClone:
  """Build a Java LanguageToClone config with a randomly chosen sort order.

  Args:
    query_prefix: Text placed at the start of the GitHub search query string.
    destination_dir: Value used for the config's destination_directory.

  Returns:
    A LanguageToClone message with one query, one importer for Java source
    files, and BLACKLIST_GITHUB_REPOS as the clone URL blacklist.
  """
  # See: https://help.github.com/en/articles/sorting-search-results
  sort_order = random.choice(["stars", "forks", "updated"])
  search_query = scrape_repos_pb2.GitHubRepositoryQuery(
    string=f"{query_prefix} language:java sort:{sort_order} fork:false"
  )
  java_importer = scrape_repos_pb2.ContentFilesImporterConfig(
    source_code_pattern=".*\\.java"
  )
  return scrape_repos_pb2.LanguageToClone(
    language="java",
    query=[search_query],
    destination_directory=destination_dir,
    importer=[java_importer],
    clone_from_url_blacklist=BLACKLIST_GITHUB_REPOS,
  )