Example #1
0
def test_PreprocessContentfiles():
  """A single badly-formatted method is normalized by the preprocessor."""
  # Deliberately messy input: irregular whitespace and multiple statements
  # per line, which the preprocessor is expected to clean up.
  input_cf = contentfiles.ContentFile(
    text="""
private static int Foobar(int foo) {
          int bar = 10    + 1; foo += bar;
                foo *= 2;
          return foo + 10;
        }
"""
  )
  results = preprocess_java_corpus.PreprocessContentfiles([input_cf])

  # Expect exactly one output: renamed identifiers, canonical formatting.
  expected_text = """\
private static int fn_A(int a){
  int b=10 + 1;
  a+=b;
  a*=2;
  return a + 10;
}
"""
  assert len(results) == 1
  assert results[0].text == expected_text
  assert results[0].preprocessing_succeeded
Example #2
0
def test_PreprocessContentfiles_method_depends_on_java_util():
  """A method referencing java.util.ArrayList preprocesses successfully."""
  input_cf = contentfiles.ContentFile(
    text="""
private static int Foobar(int a, ArrayList<Integer> _) {
  int b=10 + 1;
  a+=b;
  a*=2;
  return a + 10;
}
"""
  )
  results = preprocess_java_corpus.PreprocessContentfiles([input_cf])

  # The ArrayList parameter must survive preprocessing with a rewritten name.
  expected_text = """\
private static int fn_A(int a,ArrayList<Integer> b){
  int c=10 + 1;
  a+=c;
  a*=2;
  return a + 10;
}
"""
  assert len(results) == 1
  assert results[0].text == expected_text
  assert results[0].preprocessing_succeeded
Example #3
0
def test_Exporter_overloaded_method_extraction(
  db: contentfiles.ContentFiles, empty_db: contentfiles.ContentFiles
):
  """Each overload of a method is exported as a separate artifact."""
  exporter = export_java_corpus.Exporter(db, empty_db, static_only=True)

  # One class with three overloads of foo(); the exporter should split it
  # into three method-level content files sharing a relpath.
  source_text = """
public class HelloWorld {
  private static int foo(int a) {
    return 5;
  }

  private static int foo(float a) {
    return 5;
  }

  private static int foo(double a) {
    return 5;
  }
}
"""
  with db.Session(commit=True) as session:
    session.add(
      contentfiles.ContentFile(
        clone_from_url="abc",
        relpath="a/file.txt",
        artifact_index=0,
        sha256="000",
        charcount=200,
        linecount=10,
        text=source_text,
      )
    )

  exporter.start()
  exporter.join()

  with empty_db.Session() as session:
    exported = session.query(contentfiles.ContentFile).filter(
      contentfiles.ContentFile.relpath == "a/file.txt"
    )
    assert exported.count() == 3
    assert all("private static int foo(" in cf.text for cf in exported)
    # Artifact indices must be distinct and contiguous from zero.
    assert {cf.artifact_index for cf in exported} == {0, 1, 2}
Example #4
0
def ImportWorker(
    job: scrape_repos_pb2.ImportWorker
) -> typing.List[contentfiles.ContentFile]:
  """Run the preprocessors on one scraped file and wrap results as ContentFiles.

  Returns an empty list if the file's text cannot be decoded.
  """
  # Path of the file relative to the clone directory (strip dir + separator).
  relpath = job.abspath[len(str(job.clone_dir)) + 1:]
  try:
    texts = preprocessors.Preprocess(
        pathlib.Path(job.clone_dir), relpath, job.all_files_relpaths,
        job.preprocessors)
  except UnicodeDecodeError:
    logging.warning('Failed to decode %s', relpath)
    return []
  # Each preprocessor output becomes its own artifact, indexed in order.
  return [
      contentfiles.ContentFile(
          clone_from_url=job.clone_from_url,
          relpath=relpath,
          artifact_index=i,
          sha256=hashlib.sha256(text.encode('utf-8')).digest(),
          charcount=len(text),
          linecount=len(text.split('\n')),
          text=text)
      for i, text in enumerate(texts)
  ]
Example #5
0
def db(tempdir: pathlib.Path) -> contentfiles.ContentFiles:
  """Create a ContentFiles database seeded with one repo and one source file."""
  database = contentfiles.ContentFiles(f"sqlite:///{tempdir}/a")

  seed_repo = contentfiles.GitHubRepository(
    owner="foo",
    name="bar",
    clone_from_url="abc",
    num_stars=0,
    num_forks=0,
    num_watchers=0,
    active=1,
    exported=0,
    date_scraped=datetime.datetime.utcnow(),
    language="java",
  )
  seed_file = contentfiles.ContentFile(
    clone_from_url="abc",
    relpath="foo",
    artifact_index=0,
    sha256="000",
    charcount=100,
    linecount=4,
    active=1,
    text="""
import java.util.ArrayList;

public class HelloWorld {
  private int foo(ArrayList<Integer> x) {
    return 5;
  }

  public static void main(String[] args) {
    System.out.println("Hello, world");
  }
}
""",
  )

  with database.Session(commit=True) as session:
    session.add(seed_repo)
    session.add(seed_file)
  return database
Example #6
0
def DoProcessRepo(
  input_session: sqlutil.Session,
  output_session: sqlutil.Session,
  clone_from_url: str,
  # NOTE(review): parameter name is misspelled ("working_dir"); renaming would
  # break keyword-argument callers, so it is left as-is.
  workding_dir: pathlib.Path,
  # NOTE(review): static_only is not referenced anywhere in this body —
  # TODO confirm whether it is consumed elsewhere or is dead.
  static_only: bool,
) -> None:
  """Preprocess all content files from a single scraped repo.

  Selects the repo's content files that meet the minimum line/char thresholds,
  writes their raw text under `workding_dir`, copies the repo row to the
  output database, extracts individual methods from each file and stores each
  method as a new ContentFile row in `output_session`, then flags the repo as
  exported.
  """
  # All (relpath, text) pairs belonging to this repo.
  candidate_contentfiles = input_session.query(
    contentfiles.ContentFile.relpath, contentfiles.ContentFile.text
  ).filter(contentfiles.ContentFile.clone_from_url == clone_from_url)
  # Keep only files large enough to be worth exporting, per flag thresholds.
  contentfiles_to_export = (
    candidate_contentfiles.filter(
      contentfiles.ContentFile.linecount >= FLAGS.min_line_count
    )
    .filter(contentfiles.ContentFile.charcount >= FLAGS.min_char_count)
    .all()
  )
  app.Log(
    2,
    "Exporting %s of %s content files from %s",
    humanize.Commas(len(contentfiles_to_export)),
    humanize.Commas(candidate_contentfiles.count()),
    clone_from_url,
  )

  # Create the directory tree first.
  for relpath, method_text in contentfiles_to_export:
    path = workding_dir / relpath
    path.parent.mkdir(parents=True, exist_ok=True)
    # overwrite_existing=False: a path written once is left untouched.
    fs.Write(path, method_text.encode("utf-8"), overwrite_existing=False)

  # Copy repo to output.
  repo = input_session.query(contentfiles.GitHubRepository).filter(
    contentfiles.GitHubRepository.clone_from_url == clone_from_url
  )
  ImportQueryResults(repo, output_session)

  # Run the preprocessors.
  # methods_lists[i] holds the methods extracted from contentfiles_to_export[i].
  methods_lists = extractors.BatchedMethodExtractor(
    [text for _, text in contentfiles_to_export]
  )

  # Per-relpath counter used to assign a unique artifact_index to each method
  # extracted from the same source file.
  relpath_counters = collections.defaultdict(int)

  for (relpath, text), methods in zip(contentfiles_to_export, methods_lists):
    # Attempt to extract all imports for this content file.
    # NOTE(2019-06-28): Disabled import inlining to simplify the synthesis
    # pipeline. We may wish to revisit this at a later date.
    # imports = GetJavaImports(text)

    for i, original_method_text in enumerate(methods):
      # Insert "//import ..." comments before each method so that we know which
      # packages must be imported.
      # NOTE(2019-06-28): Disabled import inlining to simplify the synthesis
      # pipeline. We may wish to revisit this at a later date.
      # method_text = InsertImportCommentHeader(original_method_text, imports)
      method_text = original_method_text

      # Stored text is forced to ASCII (non-ASCII chars dropped); note that the
      # char/line counts below are computed from the ORIGINAL text, so they may
      # differ slightly from the stored ASCII text.
      encoded_text = method_text.encode("ascii", "ignore")
      sha256 = hashlib.sha256(encoded_text).hexdigest()
      method_text = encoded_text.decode("ascii")
      # Add new contentfile.
      output_session.add(
        contentfiles.ContentFile(
          clone_from_url=clone_from_url,
          relpath=relpath,
          artifact_index=relpath_counters[relpath],
          sha256=sha256,
          charcount=len(original_method_text),
          linecount=len(original_method_text.split("\n")),
          text=method_text,
        )
      )
      relpath_counters[relpath] += 1

  # Mark repo as exported.
  repo.update({"exported": True})