def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--data', type=str, help="Path of the data.", required=True)
    parser.add_argument('-o', '--output', type=str, help="Path to save the output data.",
                        required=False)
    args = parser.parse_args()
    spark = SparkSession.builder \
        .master("local[*]").appName("Examples") \
        .getOrCreate()
    engine = Engine(spark, args.data, "siva")
    # The raw count includes each repository twice, hence the integer division.
    print("%d repositories successfully loaded" % (engine.repositories.count() // 2))
    # Collect the serialized UASTs of every Python blob on the HEAD references.
    binary_uasts = engine.repositories.references.head_ref.commits.tree_entries.blobs \
        .classify_languages().where('lang = "Python"') \
        .extract_uasts().select('path', 'uast') \
        .rdd.filter(lambda r: len(r['uast']) > 0).collect()
    uasts = [Node.FromString(b_uast["uast"][0]) for b_uast in binary_uasts]
    del binary_uasts
    rules_count, nodes_count = process_uasts(uasts)
    print_statistics(rules_count, nodes_count)
    cluster_nodes(nodes_count)
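# A minimal traversal sketch for the parsed UASTs above, assuming the bblfsh
# `Node` protobuf with its `children` field; the helper name `count_nodes` is
# hypothetical and only illustrates one per-tree quantity that a routine like
# process_uasts could compute.
def count_nodes(node):
    # A node contributes itself plus all of its descendants, recursively.
    return 1 + sum(count_nodes(child) for child in node.children)

# e.g. sizes = [count_nodes(uast) for uast in uasts]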
def create_engine(session_name, repositories,
                  repository_format=EngineDefault.REPOSITORY_FORMAT,
                  bblfsh=EngineDefault.BBLFSH,
                  engine=EngineDefault.VERSION,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.JAR_PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  dep_zip=SparkDefault.DEP_ZIP,
                  memory=SparkDefault.MEMORY):
    config += (get_bblfsh_dependency(bblfsh),)
    packages += (get_engine_package(engine),)
    session = create_spark(session_name, spark=spark, spark_local_dir=spark_local_dir,
                           config=config, packages=packages,
                           spark_log_level=spark_log_level, dep_zip=dep_zip,
                           memory=memory)
    logging.getLogger("engine").info("Initializing engine on %s", repositories)
    return Engine(session, repositories, repository_format)
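# A minimal usage sketch, assuming the EngineDefault/SparkDefault values above
# resolve to a local Spark setup; the session name and siva path are placeholders.
engine = create_engine("example-session", "/path/to/siva-files")
print("%d repositories loaded" % engine.repositories.count())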
def create_engine(session_name, repositories, repository_format="siva",
                  bblfsh=None, engine=None,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  memory=SparkDefault.MEMORY,
                  dep_zip=False):
    if not bblfsh:
        bblfsh = "localhost"
    if not engine:
        engine = get_engine_version()
    config = assemble_spark_config(config=config, memory=memory)
    add_engine_dependencies(engine=engine, config=config, packages=packages)
    add_bblfsh_dependencies(bblfsh=bblfsh, config=config)
    session = create_spark(session_name, spark=spark, spark_local_dir=spark_local_dir,
                           config=config, packages=packages,
                           spark_log_level=spark_log_level, dep_zip=dep_zip)
    log = logging.getLogger("engine")
    log.info("Initializing on %s", repositories)
    engine = Engine(session, repositories, repository_format)
    return engine
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master('local[*]').getOrCreate()
    engine = Engine(session, repos_path)
    engine.repositories.references.master_ref.commits.show()
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master('local[*]').getOrCreate()
    engine = Engine(session, repos_path)
    refs = engine.repositories.filter('id = "github.com/xiyou-linuxer/faq-xiyoulinux"') \
        .references.select('name').collect()
    refs = [r['name'] for r in refs]
    print("REFERENCES:")
    for r in refs:
        print(r)
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master('local[*]').getOrCreate()
    engine = Engine(session, repos_path)
    engine.repositories.references \
        .filter('name = "refs/heads/develop"') \
        .commits.first_reference_commit.tree_entries.blobs \
        .classify_languages() \
        .filter('lang = "Ruby"') \
        .extract_uasts() \
        .show()
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master('local[*]').getOrCreate()
    engine = Engine(session, repos_path, "siva")
    rows = engine.repositories.select('id').collect()
    repos = [r['id'] for r in rows]
    print("REPOS:")
    for r in repos:
        print(r)
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master('local[*]').getOrCreate()
    engine = Engine(session, repos_path, "siva")
    rows = engine.repositories.references.head_ref.commits.first_reference_commit \
        .tree_entries.select('path').collect()
    files = [r['path'] for r in rows]
    print("FILES:")
    for f in files:
        print(f)
def create_engine(session_name, repositories, repository_format="siva",
                  bblfsh=None, engine=None,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  memory=SparkDefault.MEMORY,
                  dep_zip=False):
    if not bblfsh:
        bblfsh = "localhost"
    if not engine:
        try:
            engine = get_distribution("sourced-engine").version
        except DistributionNotFound:
            log = logging.getLogger("engine_version")
            engine = requests.get("https://api.github.com/repos/src-d/engine/releases/latest") \
                .json()["tag_name"].replace("v", "")
            log.warning("Engine not found, queried GitHub to get the latest release tag (%s)",
                        engine)
    config = assemble_spark_config(config=config, memory=memory)
    add_engine_dependencies(engine=engine, config=config, packages=packages)
    add_bblfsh_dependencies(bblfsh=bblfsh, config=config)
    session = create_spark(session_name, spark=spark, spark_local_dir=spark_local_dir,
                           config=config, packages=packages,
                           spark_log_level=spark_log_level, dep_zip=dep_zip)
    log = logging.getLogger("engine")
    log.info("Initializing on %s", repositories)
    engine = Engine(session, repositories, repository_format)
    return engine
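# A hedged usage note: passing `engine=` explicitly pins the version and skips
# both the pkg_resources lookup and the GitHub fallback above ("0.7.0" is a
# placeholder version):
#
#   engine = create_engine("example-session", "/path/to/siva-files", engine="0.7.0")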
class EngineTestCase(BaseTestCase):
    def setUp(self):
        BaseTestCase.setUp(self)
        file_path = path.dirname(path.realpath(__file__))
        repos_path = path.join(file_path, '..', '..', 'src', 'test', 'resources', 'siva-files')
        repos_format = 'siva'
        self.engine = Engine(self.session, repos_path, repos_format)

    def test_repositories(self):
        df = self.engine.repositories
        ids = [row.id for row in df.sort(df.id).select(df.id).collect()]
        self.assertEqual(ids, REPOSITORIES)

    def test_references(self):
        df = self.engine.repositories.references
        refs = df.select(df.name).distinct().collect()
        self.assertEqual(len(refs), 44)

    def test_references_head(self):
        df = self.engine.repositories.references.head_ref
        hashes = [r.hash for r in df.distinct().sort(df.hash).collect()]
        self.assertEqual(hashes, ['202ceb4d3efd2294544583a7d4dc92899aa0181f',
                                  '2060ee6252a64337c404a4fb44baf374c0bc7f7a',
                                  'dbfab055c70379219cbcf422f05316fdf4e1aed3',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d'])

    def test_references_master(self):
        df = self.engine.repositories.references.master_ref
        hashes = [r.hash for r in df.distinct().sort(df.hash).collect()]
        self.assertEqual(hashes, ['202ceb4d3efd2294544583a7d4dc92899aa0181f',
                                  '2060ee6252a64337c404a4fb44baf374c0bc7f7a',
                                  'dbfab055c70379219cbcf422f05316fdf4e1aed3',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d'])

    def test_references_ref(self):
        df = self.engine.repositories.references.ref('refs/heads/develop')
        self.assertEqual(len(df.collect()), 2)

    def test_all_commits(self):
        df = self.engine.repositories.references.all_reference_commits
        repo_commits = df.groupBy(df.repository_id) \
            .count() \
            .collect()
        self.assertEqual(len(repo_commits), len(REPOSITORIES))
        for repo in repo_commits:
            self.assertEqual(repo['count'], REPOSITORY_COMMITS[repo.repository_id])

    def test_commits(self):
        df = self.engine.repositories.references.filter("name not like 'refs/tags/%'")
        repo_refs = df.groupBy(df.repository_id).count().collect()
        repos = {}
        for repo in repo_refs:
            repos[repo["repository_id"]] = repo["count"]
        df = self.engine.repositories.references.commits
        repo_commits = df.groupBy(df.repository_id) \
            .count() \
            .collect()
        self.assertEqual(len(repo_commits), len(REPOSITORIES))
        for repo in repo_commits:
            self.assertEqual(repo['count'], repos[repo["repository_id"]])

    def test_tree_entries(self):
        df = self.engine.repositories.references.all_reference_commits.tree_entries
        self.assertEqual(df.count(), 304362)
        entry = df.sort(df.blob).limit(1).first()
        self.assertEqual(entry.blob, '0020a823b6e5b06c9adb7def76ccd7ed098a06b8')
        self.assertEqual(entry.path, 'spec/database_spec.rb')

    def test_blobs(self):
        df = self.engine.repositories.references.all_reference_commits \
            .tree_entries.blobs.drop("repository_id", "reference_name").distinct()
        self.assertEqual(df.count(), 91944)
        file = df.sort(df.blob_id).limit(1).first()
        self.assertEqual(file.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(file.path, 'spec/database_spec.rb')

    def test_classify_languages(self):
        df = self.engine.repositories.references.all_reference_commits.tree_entries.blobs
        row = df.sort(df.blob_id).limit(1).classify_languages().first()
        self.assertEqual(row.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(row.path, 'spec/database_spec.rb')
        self.assertEqual(row.lang, "Ruby")

    def test_extract_uasts(self):
        df = self.engine.repositories.references.all_reference_commits.tree_entries.blobs
        row = df.sort(df.blob_id).limit(1).classify_languages() \
            .extract_uasts().first()
        self.assertEqual(row.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(row.path, 'spec/database_spec.rb')
        self.assertEqual(row.lang, "Ruby")
        self.assertTrue(len(row.uast) > 0)
        df = self.engine.repositories.references.all_reference_commits.tree_entries.blobs
        row = df.sort(df.blob_id).limit(1).extract_uasts().first()
        self.assertEqual(row.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(row.path, 'spec/database_spec.rb')
        self.assertTrue(len(row.uast) > 0)

    def test_engine_blobs(self):
        rows = self.engine.repositories.references.head_ref \
            .all_reference_commits.sort('hash').limit(10).collect()
        repos = []
        hashes = []
        for row in rows:
            repos.append(row['repository_id'])
            hashes.append(row['hash'])
        df = self.engine.blobs(repos, ["refs/heads/HEAD"], hashes) \
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(df.count(), 655)

    def test_engine_blobs_repository(self):
        blobs = self.engine.blobs(repository_ids=['github.com/xiyou-linuxer/faq-xiyoulinux']) \
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(blobs.count(), 2421)

    def test_engine_blobs_reference(self):
        blobs = self.engine.blobs(reference_names=['refs/heads/develop']) \
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(blobs.count(), 425)

    def test_engine_blobs_hash(self):
        blobs = self.engine.blobs(commit_hashes=['fff7062de8474d10a67d417ccea87ba6f58ca81d']) \
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(blobs.count(), 2)

    def test_uast_query(self):
        df = self.session.createDataFrame(PYTHON_FILES, FILE_COLUMNS)
        repos = self.engine.repositories
        df = BlobsDataFrame(df._jdf, repos._session, repos._implicits)
        rows = df.extract_uasts() \
            .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]').collect()
        self.assertEqual(len(rows), 1)
        idents = []
        for row in rows:
            for node in row["result"]:
                node = parse_uast_node(node)
                idents.append(node.token)
        self.assertCountEqual(idents, ["contents", "read", "f", "open", "f"])

    def test_uast_query_cols(self):
        df = self.session.createDataFrame(PYTHON_FILES, FILE_COLUMNS)
        repos = self.engine.repositories
        df = BlobsDataFrame(df._jdf, repos._session, repos._implicits)
        rows = df.extract_uasts() \
            .query_uast('//*[@roleIdentifier]') \
            .query_uast('/*[not(@roleIncomplete)]', 'result', 'result2') \
            .collect()
        self.assertEqual(len(rows), 1)
        idents = []
        for row in rows:
            for node in row["result2"]:
                node = parse_uast_node(node)
                idents.append(node.token)
        self.assertCountEqual(idents, ["contents", "read", "f", "open", "f"])

    def test_extract_tokens(self):
        df = self.session.createDataFrame(PYTHON_FILES, FILE_COLUMNS)
        repos = self.engine.repositories
        df = BlobsDataFrame(df._jdf, repos._session, repos._implicits)
        row = df.extract_uasts() \
            .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]') \
            .extract_tokens().first()
        self.assertCountEqual(row["tokens"], ["contents", "read", "f", "open", "f"])

    def test_metadata(self):
        tmpdir = tempfile.mkdtemp()
        self.engine.save_metadata(tmpdir)
        db_path = path.join(tmpdir, 'engine_metadata.db')
        self.assertTrue(path.exists(db_path))
        engine = self.engine.from_metadata(tmpdir)
        expected = self.engine.repositories.count()
        obtained = engine.repositories.count()
        self.assertEqual(obtained, expected)
        shutil.rmtree(tmpdir)
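# The suite above can be run with the standard unittest runner, assuming it
# lives in a module named test_engine (the module name is a placeholder):
#
#   python -m unittest test_engine.EngineTestCase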
# import the source{d} engine
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# start a new session
spark = SparkSession.builder \
    .master("local[*]").appName("Examples") \
    .getOrCreate()

engine = Engine(spark, "/repositories")

# get identifiers of all Python files
idents = engine.repositories.filter("is_fork = false") \
    .references \
    .head_ref.commits.first_reference_commit \
    .files \
    .classify_languages() \
    .extract_uasts() \
    .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]') \
    .filter("is_binary = false") \
    .filter("lang = 'Python'") \
    .select("file_hash", "result").distinct()

# get and show the tokens from the identifiers
tokens = idents.extract_tokens()
tokens.limit(10).show()
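# A short follow-up sketch, assuming extract_tokens() yields an array column
# named "tokens" (as the tests above suggest): explode it to rank the most
# frequent identifiers across the corpus.
from pyspark.sql.functions import explode, desc

token_counts = tokens.select(explode("tokens").alias("token")) \
    .groupBy("token") \
    .count() \
    .orderBy(desc("count"))
token_counts.show(10)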