Example #1
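A unit test showing that UastDeserializer returns an empty list for a row without a "uast" column and a non-empty list for a documentized row that has one.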
def test_uast_deserializer(self):
    # Extract UASTs from the HEAD files of the test repositories.
    df = HeadFiles()(self.engine.repositories)
    df_uast = UastExtractor()(df)
    r2d = UastRow2Document()
    row_uast = r2d.documentize(df_uast.first())
    # A row without a "uast" column deserializes to an empty list.
    uasts_empty = list(UastDeserializer().deserialize_uast(df.first()))
    uasts = list(UastDeserializer().deserialize_uast(row_uast))
    self.assertTrue(len(uasts_empty) == 0)
    self.assertTrue(len(uasts) > 0)
Example #2
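Converts a single repository into a TF-IDF-weighted bag of identifiers; language filtering is optional and can work as a whitelist or a blacklist.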
def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages: bool = False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    log = logging.getLogger("repo2bow")
    # Prefix tokens with the "i." identifier namespace and keep their document frequencies.
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
            LanguageExtractor() >> \
            LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
    # Deserialize the UASTs, extract identifier bags and weight them with TF-IDF.
    bag = (file_source
           >> UastExtractor()
           >> Moder("repo")
           >> UastDeserializer()
           >> UastRow2Document()
           >> Uast2BagFeatures(IdentifiersBagExtractor(docfreq_threshold))
           >> BagFeatures2TermFreq()
           >> TFIDF(token_index, docfreq.docs, engine.session.sparkContext)
           >> Collector()
           ).execute()
    log.info("extracted %d identifiers", len(bag))
    return {r.token[2:]: r.value for r in bag}
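A minimal usage sketch follows; the import path, model file name, repository location and "standard" format value are assumptions, not values taken from the example:

from sourced.ml.models import DocumentFrequencies  # assumed import path

docfreq = DocumentFrequencies().load("docfreq.asdf")  # hypothetical pretrained model
bow = repo2bow("file:///data/my-repo",  # must match the ^file://.* filter above
               repository_format="standard",
               docfreq_threshold=5,
               docfreq=docfreq,
               languages=["Python", "Java"])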
Example #3
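An early variant of the preprocessing entry point: it extracts UASTs for a single language, keeps the selected fields and writes them out as Parquet batches.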
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    # Refuse to run if the output location already exists.
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), args.repositories, args)
    # Pick files at the requested commit depth, extract UASTs, select fields, save as Parquet.
    pipeline = Engine(engine, explain=args.explain).link(
        DzhigurdaFiles(args.dzhigurda))
    uasts = pipeline.link(UastExtractor(languages=[args.language]))
    fields = uasts.link(FieldsSelector(fields=args.fields))
    saver = fields.link(ParquetSaver(save_loc=args.batches))
    saver.explode()
Example #4
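A newer variant of the same entry point built on the Ignition pipeline API; it supports several languages and dumps the pipeline graph at the end.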
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []

    engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
    ignition = Ignition(engine, explain=args.explain)
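    # Select files at the requested commit depth, extract UASTs, keep the chosen fields, save as Parquet.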
    ignition \
        .link(DzhigurdaFiles(args.dzhigurda)) \
        .link(UastExtractor(languages=args.languages)) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.output)) \
        .execute()
    pipeline_graph(args, log, ignition)
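A hypothetical invocation with a hand-built argparse.Namespace; the field names come from the code above, but the values, the "repositories" key (consumed by create_engine through args.__dict__) and the "graph" key (presumably read by pipeline_graph) are assumptions:

from argparse import Namespace

args = Namespace(repositories="/data/siva",  # assumed engine argument
                 output="/tmp/uasts.parquet",  # must not exist yet
                 config=[], explain=False, graph=None,
                 dzhigurda=0, languages=["Python"],
                 fields=["repository_id", "path", "uast"])
preprocess_source(args)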
Example #5
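The complete source-to-bags command: it extracts features from UASTs, computes document frequencies, converts repositories into weighted bags and saves the bags, batches and metadata.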
def source2bags(args):
    log = logging.getLogger("bags")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    try:
        cassandra_utils.configure(args)
        engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
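        # Instantiate the requested feature extractors with their command-line options.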
        extractors = [
            __extractors__[s](args.min_docfreq,
                              **__extractors__[s].get_kwargs_fromcmdline(args))
            for s in args.feature
        ]
        pipeline = Engine(engine, explain=args.explain).link(
            DzhigurdaFiles(args.dzhigurda))
        uasts = pipeline.link(UastExtractor(languages=[args.language]))
        if args.persist is not None:
            uasts = uasts.link(Cacher(args.persist))
        uasts.link(MetadataSaver(args.keyspace, args.tables["meta"]))
        uasts = uasts.link(UastDeserializer())
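        # First pass: compute quantization levels and document frequencies.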
        uasts.link(Repo2Quant(extractors, args.nb_partitions))
        uasts.link(Repo2DocFreq(extractors))
        pipeline.explode()
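        # Second pass: weight the feature bags and persist them together with their batches.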
        bags = uasts.link(Repo2WeightedSet(extractors))
        if args.persist is not None:
            bags = bags.link(Cacher(args.persist))
        batcher = bags.link(BagsBatcher(extractors))
        batcher.link(BagsBatchSaver(args.batches, batcher))
        bags.link(BagsSaver(args.keyspace, args.tables["bags"]))
        bags.explode()
        log.info("Writing %s", args.docfreq)
        batcher.model.save(args.docfreq)
        if args.graph:
            log.info("Dumping the graph to %s", args.graph)
            with open(args.graph, "w") as f:
                pipeline.graph(stream=f)
    finally:
        if args.pause:
            input("Press Enter to exit...")
Example #6
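A smoke test checking that UastExtractor adds a "uast" column to the files DataFrame.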
def test_uast_extractor(self):
    df = HeadFiles()(self.engine.repositories)
    df_uast = UastExtractor()(df)
    self.assertIn("uast", df_uast.columns)