Example No. 1
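# Test setup: point a source{d} Engine at the bundled siva-files test resources.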
def setUp(self):
    BaseTestCase.setUp(self)
    file_path = path.dirname(path.realpath(__file__))
    repos_path = path.join(file_path, '..', '..', 'src', 'test',
                           'resources', 'siva-files')
    repos_format = 'siva'
    self.engine = Engine(self.session, repos_path, repos_format)
Example No. 2
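# Imports assumed by this standalone script (the bblfsh client is assumed to
# provide the protobuf Node type used below; process_uasts, print_statistics
# and cluster_nodes are helpers defined elsewhere in the original script):
import argparse

from bblfsh import Node
from pyspark.sql import SparkSession
from sourced.engine import Engine
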
def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('-d',
                        '--data',
                        type=str,
                        help="Path of the data.",
                        required=True)
    parser.add_argument('-o',
                        '--output',
                        type=str,
                        help="Path output to save the data.",
                        required=False)

    args = parser.parse_args()
    data = args.data


    spark = SparkSession.builder \
        .master("local[*]").appName("Examples") \
        .getOrCreate()

    # Load the repositories passed on the command line in siva format
    engine = Engine(spark, data, "siva")
    print("%d repositories successfully loaded" %
          (engine.repositories.count() / 2))

    binary_uasts = engine.repositories.references.head_ref.commits.tree_entries.blobs \
        .classify_languages().where('lang = "Python"') \
        .extract_uasts().select('path', 'uast').rdd.filter(lambda r: len(r['uast']) > 0).collect()

    uasts = []

    for b_uast in binary_uasts:
        uasts.append(Node.FromString(b_uast["uast"][0]))

    del binary_uasts

    rules_count, nodes_count = process_uasts(uasts)

    print_statistics(rules_count, nodes_count)

    cluster_nodes(nodes_count)
Example No. 3
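# Create a Spark session preconfigured with the engine and Babelfish dependencies,
# then wrap it in an Engine pointed at the given repositories.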
def create_engine(session_name,
                  repositories,
                  repository_format=EngineDefault.REPOSITORY_FORMAT,
                  bblfsh=EngineDefault.BBLFSH,
                  engine=EngineDefault.VERSION,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.JAR_PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  dep_zip=SparkDefault.DEP_ZIP,
                  memory=SparkDefault.MEMORY):

    config += (get_bblfsh_dependency(bblfsh), )
    packages += (get_engine_package(engine), )
    session = create_spark(session_name,
                           spark=spark,
                           spark_local_dir=spark_local_dir,
                           config=config,
                           packages=packages,
                           spark_log_level=spark_log_level,
                           dep_zip=dep_zip,
                           memory=memory)
    logging.getLogger("engine").info("Initializing engine on %s", repositories)
    return Engine(session, repositories, repository_format)
Example No. 4
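# Variant of create_engine: resolves defaults for bblfsh and the engine version,
# assembles the Spark config, and returns an Engine over the repositories.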
def create_engine(session_name,
                  repositories,
                  repository_format="siva",
                  bblfsh=None,
                  engine=None,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  memory=SparkDefault.MEMORY,
                  dep_zip=False):
    if not bblfsh:
        bblfsh = "localhost"
    if not engine:
        engine = get_engine_version()
    config = assemble_spark_config(config=config, memory=memory)
    add_engine_dependencies(engine=engine, config=config, packages=packages)
    add_bblfsh_dependencies(bblfsh=bblfsh, config=config)
    session = create_spark(session_name,
                           spark=spark,
                           spark_local_dir=spark_local_dir,
                           config=config,
                           packages=packages,
                           spark_log_level=spark_log_level,
                           dep_zip=dep_zip)
    log = logging.getLogger("engine")
    log.info("Initializing on %s", repositories)
    engine = Engine(session, repositories, repository_format)
    return engine
Example No. 5
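# Load the test siva repositories and show the commits reachable from each master reference.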
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test',
                              'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master(
        'local[*]').getOrCreate()
    engine = Engine(session, repos_path)
    engine.repositories.references.master_ref.commits.show()
Example No. 6
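# List the reference names of a single repository selected by its id.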
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test', 'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master('local[*]').getOrCreate()
    engine = Engine(session, repos_path)
    refs = engine.repositories.filter('id = "github.com/xiyou-linuxer/faq-xiyoulinux"')\
        .references.select('name').collect()

    refs = [r['name'] for r in refs]

    print("REFERENCES:")
    for r in refs:
        print(r)
Example No. 7
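# Extract and show UASTs for the Ruby blobs reachable from the develop references.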
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test',
                              'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master(
        'local[*]').getOrCreate()
    engine = Engine(session, repos_path)
    engine.repositories.references\
        .filter('name = "refs/heads/develop"')\
        .commits.first_reference_commit.tree_entries.blobs\
        .classify_languages()\
        .filter('lang = "Ruby"')\
        .extract_uasts()\
        .show()
Example No. 8
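# Print the ids of all repositories loaded from the siva test files.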
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test',
                              'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master(
        'local[*]').getOrCreate()
    engine = Engine(session, repos_path, "siva")
    rows = engine.repositories.select('id').collect()

    repos = [r['id'] for r in rows]

    print("REPOS:")
    for r in repos:
        print(r)
Example No. 9
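# Print the tree entry paths of the first commit reachable from each HEAD reference.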
def main():
    file_path = os.path.dirname(os.path.realpath(__file__))
    repos_path = os.path.join(file_path, '..', '..', '..', 'src', 'test',
                              'resources', 'siva-files')
    session = SparkSession.builder.appName("test").master(
        'local[*]').getOrCreate()
    engine = Engine(session, repos_path, "siva")
    rows = engine.repositories.references.head_ref.commits.first_reference_commit\
        .tree_entries.select('path').collect()

    files = [r['path'] for r in rows]

    print("FILES:")
    for f in files:
        print(f)
Example No. 10
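# Like the previous create_engine, but when the sourced-engine package is not installed
# it queries the GitHub releases API for the latest engine version (assumes pkg_resources'
# get_distribution/DistributionNotFound, requests and logging are imported).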
def create_engine(session_name,
                  repositories,
                  repository_format="siva",
                  bblfsh=None,
                  engine=None,
                  config=SparkDefault.CONFIG,
                  packages=SparkDefault.PACKAGES,
                  spark=SparkDefault.MASTER_ADDRESS,
                  spark_local_dir=SparkDefault.LOCAL_DIR,
                  spark_log_level=SparkDefault.LOG_LEVEL,
                  memory=SparkDefault.MEMORY,
                  dep_zip=False):
    if not bblfsh:
        bblfsh = "localhost"
    if not engine:
        try:
            engine = get_distribution("sourced-engine").version
        except DistributionNotFound:
            log = logging.getLogger("engine_version")
            engine = requests.get("https://api.github.com/repos/src-d/engine/releases/latest") \
                .json()["tag_name"].replace("v", "")
            log.warning(
                "Engine not found, queried GitHub to get the latest release tag (%s)",
                engine)
    config = assemble_spark_config(config=config, memory=memory)
    add_engine_dependencies(engine=engine, config=config, packages=packages)
    add_bblfsh_dependencies(bblfsh=bblfsh, config=config)
    session = create_spark(session_name,
                           spark=spark,
                           spark_local_dir=spark_local_dir,
                           config=config,
                           packages=packages,
                           spark_log_level=spark_log_level,
                           dep_zip=dep_zip)
    log = logging.getLogger("engine")
    log.info("Initializing on %s", repositories)
    engine = Engine(session, repositories, repository_format)
    return engine
Example No. 11
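# Full test suite for the Engine API; REPOSITORIES, REPOSITORY_COMMITS, PYTHON_FILES,
# FILE_COLUMNS and helpers such as BlobsDataFrame and parse_uast_node are defined or
# imported elsewhere in the test module.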
class EngineTestCase(BaseTestCase):

    def setUp(self):
        BaseTestCase.setUp(self)
        file_path = path.dirname(path.realpath(__file__))
        repos_path = path.join(file_path, '..', '..', 'src', 'test', 'resources', 'siva-files')
        repos_format = 'siva'
        self.engine = Engine(self.session, repos_path, repos_format)


    def test_repositories(self):
        df = self.engine.repositories
        ids = [row.id for row in df.sort(df.id).select(df.id).collect()]
        self.assertEqual(ids, REPOSITORIES)


    def test_references(self):
        df = self.engine.repositories.references
        refs = df.select(df.name).distinct().collect()
        self.assertEqual(len(refs), 44)


    def test_references_head(self):
        df = self.engine.repositories.references.head_ref
        hashes = [r.hash for r in df.distinct().sort(df.hash).collect()]
        self.assertEqual(hashes, ['202ceb4d3efd2294544583a7d4dc92899aa0181f',
                                  '2060ee6252a64337c404a4fb44baf374c0bc7f7a',
                                  'dbfab055c70379219cbcf422f05316fdf4e1aed3',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d'])


    def test_references_master(self):
        df = self.engine.repositories.references.master_ref
        hashes = [r.hash for r in df.distinct().sort(df.hash).collect()]
        self.assertEqual(hashes, ['202ceb4d3efd2294544583a7d4dc92899aa0181f',
                                  '2060ee6252a64337c404a4fb44baf374c0bc7f7a',
                                  'dbfab055c70379219cbcf422f05316fdf4e1aed3',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d',
                                  'fff7062de8474d10a67d417ccea87ba6f58ca81d'])


    def test_references_ref(self):
        df = self.engine.repositories.references.ref('refs/heads/develop')
        self.assertEqual(len(df.collect()), 2)


    def test_all_commits(self):
        df = self.engine.repositories.references.all_reference_commits
        repo_commits = df.groupBy(df.repository_id)\
            .count()\
            .collect()

        self.assertEqual(len(repo_commits), len(REPOSITORIES))
        for repo in repo_commits:
            self.assertEqual(repo['count'],
                             REPOSITORY_COMMITS[repo.repository_id])


    def test_commits(self):
        df = self.engine.repositories.references.filter("name not like 'refs/tags/%'")
        repo_refs = df.groupBy(df.repository_id).count().collect()
        repos = {}
        for repo in repo_refs:
            repos[repo["repository_id"]] = repo["count"]

        df = self.engine.repositories.references.commits
        repo_commits = df.groupBy(df.repository_id) \
            .count() \
            .collect()

        self.assertEqual(len(repo_commits), len(REPOSITORIES))
        for repo in repo_commits:
            self.assertEqual(repo['count'], repos[repo["repository_id"]])


    def test_tree_entries(self):
        df = self.engine.repositories.references.all_reference_commits.tree_entries
        self.assertEqual(df.count(), 304362)
        entry = df.sort(df.blob).limit(1).first()
        self.assertEqual(entry.blob, '0020a823b6e5b06c9adb7def76ccd7ed098a06b8')
        self.assertEqual(entry.path, 'spec/database_spec.rb')


    def test_blobs(self):
        df = self.engine.repositories.references.all_reference_commits\
            .tree_entries.blobs.drop("repository_id", "reference_name").distinct()
        self.assertEqual(df.count(), 91944)
        file = df.sort(df.blob_id).limit(1).first()
        self.assertEqual(file.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(file.path, 'spec/database_spec.rb')


    def test_classify_languages(self):
        df = self.engine.repositories.references.all_reference_commits.tree_entries.blobs
        row = df.sort(df.blob_id).limit(1).classify_languages().first()
        self.assertEqual(row.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(row.path, 'spec/database_spec.rb')
        self.assertEqual(row.lang, "Ruby")


    def test_extract_uasts(self):
        df = self.engine.repositories.references.all_reference_commits.tree_entries.blobs
        row = df.sort(df.blob_id).limit(1).classify_languages()\
            .extract_uasts().first()
        self.assertEqual(row.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(row.path, 'spec/database_spec.rb')
        self.assertEqual(row.lang, "Ruby")
        self.assertTrue(len(row.uast) > 0)

        df = self.engine.repositories.references.all_reference_commits.tree_entries.blobs
        row = df.sort(df.blob_id).limit(1).extract_uasts().first()
        self.assertEqual(row.blob_id, "0020a823b6e5b06c9adb7def76ccd7ed098a06b8")
        self.assertEqual(row.path, 'spec/database_spec.rb')
        self.assertTrue(len(row.uast) > 0)


    def test_engine_blobs(self):
        rows = self.engine.repositories.references.head_ref.all_reference_commits.sort('hash').limit(10).collect()
        repos = []
        hashes = []
        for row in rows:
            repos.append(row['repository_id'])
            hashes.append(row['hash'])

        df = self.engine.blobs(repos, ["refs/heads/HEAD"], hashes)\
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(df.count(), 655)


    def test_engine_blobs_repository(self):
        blobs = self.engine.blobs(repository_ids=['github.com/xiyou-linuxer/faq-xiyoulinux'])\
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(blobs.count(), 2421)


    def test_engine_blobs_reference(self):
        blobs = self.engine.blobs(reference_names=['refs/heads/develop'])\
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(blobs.count(), 425)


    def test_engine_blobs_hash(self):
        blobs = self.engine.blobs(commit_hashes=['fff7062de8474d10a67d417ccea87ba6f58ca81d'])\
            .drop("repository_id", "reference_name").distinct()
        self.assertEqual(blobs.count(), 2)


    def test_uast_query(self):
        df = self.session.createDataFrame(PYTHON_FILES, FILE_COLUMNS)
        repos = self.engine.repositories
        df = BlobsDataFrame(df._jdf, repos._session, repos._implicits)
        rows = df.extract_uasts().query_uast('//*[@roleIdentifier and not(@roleIncomplete)]').collect()
        self.assertEqual(len(rows), 1)

        idents = []
        for row in rows:
            for node in row["result"]:
                node = parse_uast_node(node)
                idents.append(node.token)

        self.assertCountEqual(idents, ["contents", "read", "f", "open", "f"])


    def test_uast_query_cols(self):
        df = self.session.createDataFrame(PYTHON_FILES, FILE_COLUMNS)
        repos = self.engine.repositories
        df = BlobsDataFrame(df._jdf, repos._session, repos._implicits)
        rows = df.extract_uasts()\
            .query_uast('//*[@roleIdentifier]')\
            .query_uast('/*[not(@roleIncomplete)]', 'result', 'result2')\
            .collect()
        self.assertEqual(len(rows), 1)

        idents = []
        for row in rows:
            for node in row["result2"]:
                node = parse_uast_node(node)
                idents.append(node.token)

        self.assertCountEqual(idents, ["contents", "read", "f", "open", "f"])


    def test_extract_tokens(self):
        df = self.session.createDataFrame(PYTHON_FILES, FILE_COLUMNS)
        repos = self.engine.repositories
        df = BlobsDataFrame(df._jdf, repos._session, repos._implicits)
        row = df.extract_uasts().query_uast('//*[@roleIdentifier and not(@roleIncomplete)]')\
            .extract_tokens().first()

        self.assertCountEqual(row["tokens"], ["contents", "read", "f", "open", "f"])

    def test_metadata(self):
        tmpdir = tempfile.mkdtemp()

        self.engine.save_metadata(tmpdir)
        db_path = path.join(tmpdir, 'engine_metadata.db')
        self.assertTrue(path.exists(db_path))

        engine = self.engine.from_metadata(tmpdir)
        expected = self.engine.repositories.count()
        obtained = engine.repositories.count()
        self.assertEqual(obtained, expected)

        shutil.rmtree(tmpdir)
Example No. 12
# import the source{d} engine
from sourced.engine import Engine
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

# start a new session
spark = SparkSession.builder \
        .master("local[*]").appName("Examples") \
        .getOrCreate()

engine = Engine(spark, "/repositories")

# get identifiers of all Python files
idents = engine.repositories.filter("is_fork = false") \
         .references \
         .head_ref.commits.first_reference_commit \
         .files \
         .classify_languages() \
         .extract_uasts() \
         .query_uast('//*[@roleIdentifier and not(@roleIncomplete)]') \
         .filter("is_binary = false") \
         .filter("lang = 'Python'") \
         .select("file_hash", "result").distinct()

# get and show the tokens from the identifiers
tokens = idents.extract_tokens()
tokens.limit(10).show()