def test_uast_deserializer(self):
    df = HeadFiles()(self.engine.repositories)
    df_uast = UastExtractor()(df)
    r2d = UastRow2Document()
    row_uast = r2d.documentize(df_uast.first())
    # A plain HEAD file row has no extracted UAST, so deserialization yields nothing.
    uasts_empty = list(UastDeserializer().deserialize_uast(df.first()))
    # A documentized row that went through UastExtractor yields at least one tree.
    uasts = list(UastDeserializer().deserialize_uast(row_uast))
    self.assertEqual(len(uasts_empty), 0)
    self.assertGreater(len(uasts), 0)

def repo2bow(repository: str, repository_format: str, docfreq_threshold: int,
             docfreq: DocumentFrequencies, languages: List[str] = None,
             blacklist_languages: bool = False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    log = logging.getLogger("repo2bow")
    # Identifier tokens are namespaced with the "i." prefix in the docfreq model.
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> LanguageExtractor() >> LanguageSelector(
            languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
    bag = (file_source
           >> UastExtractor()
           >> Moder("repo")
           >> UastDeserializer()
           >> UastRow2Document()
           >> Uast2BagFeatures(IdentifiersBagExtractor(docfreq_threshold))
           >> BagFeatures2TermFreq()
           >> TFIDF(token_index, docfreq.docs, engine.session.sparkContext)
           >> Collector()
           ).execute()
    log.info("extracted %d identifiers", len(bag))
    # Strip the "i." namespace prefix before returning the TF-IDF weights.
    return {r.token[2:]: r.value for r in bag}
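
# --- Hedged usage sketch (not part of the original module). ------------------
# Shows one plausible way to call repo2bow. The repository URL, the
# "docfreq.asdf" model file, and the import path of DocumentFrequencies are
# assumptions for illustration, not verified against a particular release.
def _repo2bow_example():
    from sourced.ml.models import DocumentFrequencies  # assumed import path
    docfreq = DocumentFrequencies().load("docfreq.asdf")  # pre-trained model, assumed to exist
    bow = repo2bow(repository="file:///data/my-repo",  # hypothetical local clone
                   repository_format="standard",
                   docfreq_threshold=5,
                   docfreq=docfreq,
                   languages=["Python", "Java"])
    # bow maps identifier tokens (without the "i." prefix) to TF-IDF weights.
    for token, weight in sorted(bow.items(), key=lambda kv: -kv[1])[:10]:
        print("%-40s %.4f" % (token, weight))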

def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), args.repositories, args)
    pipeline = Engine(engine, explain=args.explain).link(
        DzhigurdaFiles(args.dzhigurda))
    uasts = pipeline.link(UastExtractor(languages=[args.language]))
    fields = uasts.link(FieldsSelector(fields=args.fields))
    saver = fields.link(ParquetSaver(save_loc=args.batches))
    saver.explode()

def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
    ignition = Ignition(engine, explain=args.explain)
    ignition \
        .link(DzhigurdaFiles(args.dzhigurda)) \
        .link(UastExtractor(languages=args.languages)) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.output)) \
        .execute()
    pipeline_graph(args, log, ignition)
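
# --- Hedged usage sketch (not part of the original module). ------------------
# preprocess_source consumes an argparse-style namespace. The attribute names
# below mirror the ones the second variant reads; the concrete values (paths,
# languages, fields) are made up for illustration.
def _preprocess_source_example():
    from argparse import Namespace
    args = Namespace(
        repositories="/data/siva",          # consumed by create_engine via args.__dict__
        output="/tmp/uasts-parquet",        # must not exist yet
        config=[],
        explain=False,
        dzhigurda=0,                        # assumed to mean "HEAD commit only"
        languages=["Python"],
        fields=["repository_id", "path", "uast"],  # hypothetical column subset
        graph=None)                         # read by pipeline_graph
    return preprocess_source(args)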

def source2bags(args):
    log = logging.getLogger("bags")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    try:
        cassandra_utils.configure(args)
        engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
        extractors = [
            __extractors__[s](args.min_docfreq,
                              **__extractors__[s].get_kwargs_fromcmdline(args))
            for s in args.feature]
        pipeline = Engine(engine, explain=args.explain).link(
            DzhigurdaFiles(args.dzhigurda))
        uasts = pipeline.link(UastExtractor(languages=[args.language]))
        if args.persist is not None:
            uasts = uasts.link(Cacher(args.persist))
        uasts.link(MetadataSaver(args.keyspace, args.tables["meta"]))
        uasts = uasts.link(UastDeserializer())
        # First pass: compute quantization levels and document frequencies.
        uasts.link(Repo2Quant(extractors, args.nb_partitions))
        uasts.link(Repo2DocFreq(extractors))
        pipeline.explode()
        # Second pass: build the weighted bags, batch them and save everything.
        bags = uasts.link(Repo2WeightedSet(extractors))
        if args.persist is not None:
            bags = bags.link(Cacher(args.persist))
        batcher = bags.link(BagsBatcher(extractors))
        batcher.link(BagsBatchSaver(args.batches, batcher))
        bags.link(BagsSaver(args.keyspace, args.tables["bags"]))
        bags.explode()
        log.info("Writing %s", args.docfreq)
        batcher.model.save(args.docfreq)
        if args.graph:
            log.info("Dumping the graph to %s", args.graph)
            with open(args.graph, "w") as f:
                pipeline.graph(stream=f)
    finally:
        if args.pause:
            input("Press Enter to exit...")
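
# --- Hedged usage sketch (not part of the original module). ------------------
# source2bags also consumes an argparse-style namespace. Attribute names come
# straight from the function body; every value is illustrative, and
# cassandra_utils.configure may require extra connection attributes (e.g. the
# Cassandra host) that the real CLI parser would define.
def _source2bags_example():
    from argparse import Namespace
    args = Namespace(
        repositories="/data/siva",
        batches="/tmp/bags-batches",        # must not exist yet
        config=[],
        explain=False,
        dzhigurda=0,
        language="Python",
        feature=["id"],                     # "id" assumed to be a registered extractor key
        min_docfreq=5,
        nb_partitions=10,
        persist=None,
        keyspace="sourced",                 # hypothetical Cassandra keyspace
        tables={"meta": "meta", "bags": "bags"},
        docfreq="/tmp/docfreq.asdf",        # where the batcher model is saved
        graph=None,
        pause=False)
    return source2bags(args)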

def test_uast_extractor(self):
    df = HeadFiles()(self.engine.repositories)
    df_uast = UastExtractor()(df)
    self.assertIn("uast", df_uast.columns)