Example #1
0
 def test_head_files(self):
     """Check that the first HeadFiles row carries the expected field names."""
     first_row = HeadFiles()(self.engine.repositories).first()
     columns = set(first_row.asDict())
     for expected in ("commit_hash", "path", "content", "reference_name"):
         self.assertIn(expected, columns)
Example #2
0
 def test_uast_deserializer(self):
     """Check deserialize_uast on a raw HeadFiles row and on a documentized row.

     The row taken straight from the HeadFiles output is expected to yield
     no UASTs; the row that went through UastExtractor and
     UastRow2Document.documentize is expected to yield at least one.
     """
     df = HeadFiles()(self.engine.repositories)
     df_uast = UastExtractor()(df)
     r2d = UastRow2Document()
     row_uast = r2d.documentize(df_uast.first())
     uasts_empty = list(UastDeserializer().deserialize_uast(df.first()))
     uasts = list(UastDeserializer().deserialize_uast(row_uast))
     # Dedicated assertions report the actual lengths when they fail,
     # unlike assertTrue on a pre-computed boolean.
     self.assertEqual(len(uasts_empty), 0)
     self.assertGreater(len(uasts), 0)
Example #3
0
    def test_language_selector(self):
        """Check LanguageSelector with blacklist=True and blacklist=False.

        The languages remaining after selection are compared as sets because
        the rows come from distinct().collect(), whose order is not guaranteed.
        """
        language_selector = LanguageSelector(languages=["XML", "YAML"], blacklist=True)
        df = language_selector(HeadFiles()(self.engine.repositories).classify_languages())
        langs = {x.lang for x in df.select("lang").distinct().collect()}
        self.assertEqual(langs, {"Markdown", "Gradle", "Text", "INI",
                                 "Batchfile", "Python", "Java", "Shell"})

        language_selector = LanguageSelector(languages=["Python", "Java"], blacklist=False)
        df = language_selector(HeadFiles()(self.engine.repositories).classify_languages())
        langs = {x.lang for x in df.select("lang").distinct().collect()}
        self.assertEqual(langs, {"Python", "Java"})
    def test_parquet(self):
        """Exercise LanguageSelector inside Ignition pipelines and on a parquet DataFrame."""
        two_langs = ["Python", "Java"]
        one_lang = ["Java"]

        engine = create_engine("test", SIVA_DIR)

        def collect_langs(selector):
            # Run HeadFiles -> LanguageExtractor -> selector -> Collector and
            # return the set of "lang" values found in the collected rows.
            rows = (
                Ignition(engine)
                .link(HeadFiles())
                .link(LanguageExtractor())
                .link(selector)
                .link(Collector())
                .execute()
            )
            return {row.lang for row in rows}

        self.assertEqual(collect_langs(LanguageSelector(two_langs)), set(two_langs))
        self.assertEqual(collect_langs(LanguageSelector(one_lang)), set(one_lang))

        # With blacklist=True none of the listed languages may remain.
        remaining = collect_langs(LanguageSelector(one_lang, blacklist=True))
        self.assertEqual(set(), remaining & set(one_lang))

        # An empty language list leaves no rows at all.
        self.assertEqual(set(), collect_langs(LanguageSelector([])))

        loader = create_parquet_loader("test_parquet", repositories=PARQUET_DIR)
        parquet_df = loader.execute()
        # The loaded DataFrame is used here before a "lang" column is added;
        # applying the selector to it is expected to raise AttributeError.
        with self.assertRaises(AttributeError):
            LanguageSelector(two_langs)(parquet_df)

        tagged_df = parquet_df.withColumn("lang", lit("BestLang"))
        unmatched_rows = LanguageSelector(two_langs)(tagged_df).collect()
        self.assertEqual(0, len(unmatched_rows))

        self.assertEqual(
            tagged_df.collect(),
            LanguageSelector(["BestLang"])(tagged_df).collect())
Example #5
0
def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages=False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    """
    Run the identifier bag / TF-IDF pipeline over a repository and return the result as a dict.

    :param repository: Passed to create_engine as "repositories".
    :param repository_format: Passed to create_engine as "repository_format".
    :param docfreq_threshold: Passed to IdentifiersBagExtractor.
    :param docfreq: Iterated as (key, value) pairs to build the token index; its `docs` \
                    attribute is passed to TFIDF.
    :param languages: When not None, LanguageExtractor and LanguageSelector are added to \
                      the pipeline with these languages.
    :param blacklist_languages: Forwarded as the `blacklist` argument of LanguageSelector.
    :param engine_kwargs: Optional extra arguments for create_engine; they override the \
                          two default entries on key collision.
    :return: Mapping from each collected row's token, with its first two characters \
             dropped, to that row's value.
    """
    logger = logging.getLogger("repo2bow")
    # Index keys carry an "i." prefix; the matching two characters are cut
    # off again from the tokens in the returned dict.
    token_index = {"i." + name: int(freq) for name, freq in docfreq}

    session_name = "repo2bow-%s" % uuid4()
    engine_args = dict(repositories=repository, repository_format=repository_format)
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)

    pipeline = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        pipeline = pipeline >> LanguageExtractor()
        pipeline = pipeline >> LanguageSelector(
            languages=languages, blacklist=blacklist_languages)

    pipeline = pipeline >> UastExtractor() >> Moder("repo")
    pipeline = pipeline >> UastDeserializer() >> UastRow2Document()
    pipeline = pipeline >> Uast2BagFeatures(IdentifiersBagExtractor(docfreq_threshold))
    pipeline = pipeline >> BagFeatures2TermFreq()
    pipeline = pipeline >> TFIDF(token_index, docfreq.docs, engine.session.sparkContext)
    pipeline = pipeline >> Collector()
    bag = pipeline.execute()

    logger.info("extracted %d identifiers", len(bag))
    return {row.token[2:]: row.value for row in bag}
Example #6
0
 def test_uast_extractor(self):
     """Check that UastExtractor output over HeadFiles has a column named uast."""
     head_files = HeadFiles()(self.engine.repositories)
     extracted = UastExtractor()(head_files)
     self.assertIn("uast", extracted.columns)