def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages=False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    """Compute the TF-IDF weighted bag of identifiers for a single repository."""
    log = logging.getLogger("repo2bow")
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
            LanguageExtractor() >> \
            LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
    bag = (file_source >>
           UastExtractor() >>
           Moder("repo") >>
           UastDeserializer() >>
           UastRow2Document() >>
           Uast2BagFeatures(IdentifiersBagExtractor(docfreq_threshold)) >>
           BagFeatures2TermFreq() >>
           TFIDF(token_index, docfreq.docs, engine.session.sparkContext) >>
           Collector()).execute()
    log.info("extracted %d identifiers", len(bag))
    return {r.token[2:]: r.value for r in bag}
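# Hypothetical invocation sketch of repo2bow above: "df" is assumed to be a
# DocumentFrequencies model loaded beforehand; the path, format, threshold and
# languages are placeholders, not values prescribed by the project.
bow = repo2bow(
    repository="file:///data/repos",
    repository_format="siva",
    docfreq_threshold=5,
    docfreq=df,
    languages=["Python", "Java"],
)
for token, weight in sorted(bow.items(), key=lambda kv: kv[1], reverse=True)[:10]:
    print(token, weight)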
def create_uast_source(args, session_name, select=HeadFiles, language_selector=None,
                       extract_uast=True):
    """Create the pipeline root and the transformer that yields UASTs, either from
    parquet files or from an engine session."""
    if args.parquet:
        parquet_loader_args = filter_kwargs(args.__dict__, create_parquet_loader)
        start_point = create_parquet_loader(session_name, **parquet_loader_args)
        root = start_point
        if extract_uast and "uast" not in [col.name for col in start_point.execute().schema]:
            raise ValueError("The parquet files do not contain UASTs.")
    else:
        engine_args = filter_kwargs(args.__dict__, create_engine)
        root = create_engine(session_name, **engine_args)
        if language_selector is None:
            language_selector = LanguageSelector(languages=args.languages)
        start_point = Ignition(root, explain=args.explain) \
            .link(select()) \
            .link(language_selector)
        if extract_uast:
            start_point = start_point.link(UastExtractor())
    return root, start_point
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), args.repositories, args)
    pipeline = Engine(engine, explain=args.explain).link(DzhigurdaFiles(args.dzhigurda))
    uasts = pipeline.link(UastExtractor(languages=[args.language]))
    fields = uasts.link(FieldsSelector(fields=args.fields))
    saver = fields.link(ParquetSaver(save_loc=args.batches))
    saver.explode()
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
    ignition = Ignition(engine, explain=args.explain)
    ignition \
        .link(DzhigurdaFiles(args.dzhigurda)) \
        .link(UastExtractor(languages=args.languages)) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.output)) \
        .execute()
    pipeline_graph(args, log, ignition)
def create_file_source(args: argparse.Namespace, session_name: str):
    """Create the pipeline root and the transformer that yields files, optionally
    filtered by language."""
    if args.parquet:
        parquet_loader_args = filter_kwargs(args.__dict__, create_parquet_loader)
        root = create_parquet_loader(session_name, **parquet_loader_args)
        file_source = root.link(
            LanguageSelector.maybe(languages=args.languages, blacklist=args.blacklist))
    else:
        engine_args = filter_kwargs(args.__dict__, create_engine)
        root = Ignition(create_engine(session_name, **engine_args), explain=args.explain)
        file_source = root.link(DzhigurdaFiles(args.dzhigurda))
        if args.languages is not None:
            file_source = file_source \
                .link(LanguageExtractor()) \
                .link(LanguageSelector(languages=args.languages, blacklist=args.blacklist))
    return root, file_source
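# Hypothetical call sketch for create_file_source above; every Namespace field
# is a placeholder mirroring the attributes the function (and create_engine,
# via filter_kwargs) reads.
args = argparse.Namespace(parquet=False, repositories="/data/repos",
                          repository_format="siva", languages=["Python"],
                          blacklist=False, dzhigurda=0, explain=False)
root, file_source = create_file_source(args, "file-source-%s" % uuid4())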
def source2bags(args):
    """Convert repositories into weighted bags of features and save the bags, the
    batches and the document frequency model."""
    log = logging.getLogger("bags")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    try:
        cassandra_utils.configure(args)
        engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
        extractors = [__extractors__[s](
            args.min_docfreq, **__extractors__[s].get_kwargs_fromcmdline(args))
            for s in args.feature]
        pipeline = Engine(engine, explain=args.explain).link(DzhigurdaFiles(args.dzhigurda))
        uasts = pipeline.link(UastExtractor(languages=[args.language]))
        if args.persist is not None:
            uasts = uasts.link(Cacher(args.persist))
        uasts.link(MetadataSaver(args.keyspace, args.tables["meta"]))
        uasts = uasts.link(UastDeserializer())
        uasts.link(Repo2Quant(extractors, args.nb_partitions))
        uasts.link(Repo2DocFreq(extractors))
        pipeline.explode()
        bags = uasts.link(Repo2WeightedSet(extractors))
        if args.persist is not None:
            bags = bags.link(Cacher(args.persist))
        batcher = bags.link(BagsBatcher(extractors))
        batcher.link(BagsBatchSaver(args.batches, batcher))
        bags.link(BagsSaver(args.keyspace, args.tables["bags"]))
        bags.explode()
        log.info("Writing %s", args.docfreq)
        batcher.model.save(args.docfreq)
        if args.graph:
            log.info("Dumping the graph to %s", args.graph)
            with open(args.graph, "w") as f:
                pipeline.graph(stream=f)
    finally:
        if args.pause:
            input("Press Enter to exit...")
def setUpClass(cls):
    cls.engine = create_engine("test_with_engine", SIVA_DIR, "siva")
    cls.spark = cls.engine.session
    cls.data = ParquetLoader(session=cls.spark, paths=PARQUET_DIR).execute().rdd.coalesce(1)
def setUpClass(cls):
    cls.engine = create_engine("test_with_engine", SIVA_DIR, "siva")
    cls.spark = cls.engine.session
def warmup(args):
    create_engine("warmup", "/tmp", **args.__dict__)
def warmup(args):
    engine_args = filter_kwargs(args.__dict__, create_engine)
    create_engine("warmup", "/tmp", **engine_args)
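# filter_kwargs is used above (and in create_uast_source / create_file_source)
# to forward only the command-line options that create_engine actually accepts.
# Below is a plausible reimplementation for illustration only; the project's own
# helper may differ.
import inspect

def filter_kwargs(kwargs: dict, func) -> dict:
    # Keep only the entries whose keys match the parameters of func.
    accepted = inspect.signature(func).parameters
    return {key: value for key, value in kwargs.items() if key in accepted}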