def repo2bow(repository: str, repository_format: str, docfreq_threshold: int,
             docfreq: DocumentFrequencies, languages: List[str] = None,
             blacklist_languages=False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    """
    Build the weighted bag of identifiers for one repository.

    :param repository: Value passed to create_engine() as "repositories".
    :param repository_format: Value passed to create_engine() as "repository_format".
    :param docfreq_threshold: Forwarded to IdentifiersBagExtractor.
    :param docfreq: Document frequencies model. It is iterated as (key, value) pairs and
                    its `docs` attribute is handed to TFIDF.
    :param languages: When not None, LanguageExtractor and LanguageSelector are inserted
                      into the pipeline with this list.
    :param blacklist_languages: Forwarded to LanguageSelector as `blacklist`.
    :param engine_kwargs: Optional extra keyword arguments for create_engine(). They are
                          applied on top of the two entries built from `repository` and
                          `repository_format`.
    :return: Dictionary keyed by the collected tokens with their first two characters
             dropped, mapped to the collected values.
    """
    log = logging.getLogger("repo2bow")
    # Keys receive the two-character "i." prefix here; two characters are stripped again
    # from every token when the result is built.
    token_index = {"i." + token: int(freq) for token, freq in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)

    root = Ignition(engine)
    root = root >> RepositoriesFilter(r"^file://.*")
    root = root >> HeadFiles()

    file_source = root
    if languages is not None:
        file_source = root >> LanguageExtractor()
        file_source = file_source >> LanguageSelector(
            languages=languages, blacklist=blacklist_languages)

    # The stages are attached one by one in the same order as a single ">>" chain.
    pipeline = file_source >> UastExtractor()
    pipeline = pipeline >> Moder("repo")
    pipeline = pipeline >> UastDeserializer()
    pipeline = pipeline >> UastRow2Document()
    pipeline = pipeline >> Uast2BagFeatures(IdentifiersBagExtractor(docfreq_threshold))
    pipeline = pipeline >> BagFeatures2TermFreq()
    pipeline = pipeline >> TFIDF(token_index, docfreq.docs, engine.session.sparkContext)
    pipeline = pipeline >> Collector()
    bag = pipeline.execute()

    log.info("extracted %d identifiers", len(bag))
    return {row.token[2:]: row.value for row in bag}
def repos2coocc(args):
    """
    Run the identifier co-occurrence pipeline: count the documents, obtain the ordered
    document frequencies via create_or_load_ordered_df() and feed the UASTs into
    CooccConstructor followed by CooccModelSaver.

    :param args: Parsed arguments. This function reads `min_docfreq`, `split`, \
                 `partitions`, `shuffle`, `persist` and `output`; the whole object is also \
                 passed to create_uast_source(), create_or_load_ordered_df() and \
                 pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(
        docfreq_threshold=args.min_docfreq, split_stem=args.split)
    root, start_point = create_uast_source(args, "repos2coocc-%s" % uuid4())

    documents = start_point.link(UastRow2Document())
    documents = documents.link(Repartitioner.maybe(args.partitions, args.shuffle))
    documents = documents.link(Cacher.maybe(args.persist))

    log.info("Extracting UASTs...")
    ndocs = documents.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)

    uasts = documents.link(UastDeserializer())
    bag_features = uasts.link(Uast2BagFeatures(id_extractor))
    df_model = create_or_load_ordered_df(args, ndocs, bag_features)

    token2index = root.session.sparkContext.broadcast(df_model.order)
    coocc_builder = CooccConstructor(
        token2index=token2index,
        token_parser=id_extractor.id2bag.token_parser,
        namespace=id_extractor.NAMESPACE)
    uasts.link(coocc_builder).link(CooccModelSaver(args.output, df_model)).execute()
    pipeline_graph(args, log, root)
def repos2df(args):
    """
    Run the document frequency pipeline: count the documents, write the quantization
    levels if any were produced, then write an OrderedDocumentFrequencies model.

    :param args: Parsed arguments. This function reads `persist`, `quant` and \
                 `docfreq_out`; the whole object is also passed to \
                 create_extractors_from_args(), create_uast_source() and pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    root, start_point = create_uast_source(args, "repos2df-%s" % uuid4())

    documents = start_point.link(UastRow2Document())
    documents = documents.link(Cacher.maybe(args.persist))

    log.info("Extracting UASTs...")
    ndocs = documents.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)

    uasts = documents.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uasts.link(quant).execute()
    # The quantization model is written only when the quantizer reports non-empty levels.
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        levels_model = QuantizationLevels().construct(quant.levels)
        levels_model.save(args.quant)

    bag_features = uasts.link(Uast2BagFeatures(extractors))
    df = bag_features.link(BagFeatures2DocFreq()).execute()

    log.info("Writing docfreq model to %s", args.docfreq_out)
    docfreq_model = OrderedDocumentFrequencies().construct(ndocs, df)
    docfreq_model.save(args.docfreq_out)
    pipeline_graph(args, log, root)
def test_uast_deserializer(self):
    """
    Check UastDeserializer.deserialize_uast() on two rows: the first row of the HeadFiles
    output taken before UastExtractor (must yield nothing) and the documentized first row
    taken after UastExtractor (must yield at least one item).
    """
    df = HeadFiles()(self.engine.repositories)
    df_uast = UastExtractor()(df)
    r2d = UastRow2Document()
    row_uast = r2d.documentize(df_uast.first())
    uasts_empty = list(UastDeserializer().deserialize_uast(df.first()))
    uasts = list(UastDeserializer().deserialize_uast(row_uast))
    # assertEqual/assertGreater report the actual lengths on failure, whereas
    # assertTrue(len(...) == 0) would only say "False is not true".
    self.assertEqual(len(uasts_empty), 0)
    self.assertGreater(len(uasts), 0)
def test_error(self):
    """
    create_or_load_ordered_df() must raise ValueError when `docfreq_in` is None and either
    the bag features pipeline or the number of documents is missing.
    """
    # Case 1: the number of documents is given but there is no pipeline.
    with self.assertRaises(ValueError):
        create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), 10, None)

    # Case 2: a pipeline is given but the number of documents is missing.
    # The fixture is built outside of assertRaises so that a ValueError raised while
    # creating the Spark session or linking the transformers cannot satisfy the assertion.
    session = create_spark("test_df_util")
    uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
        .link(Moder("file")) \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(IdentifiersBagExtractor()))
    with self.assertRaises(ValueError):
        create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), None, uast_extractor)
def repos2roles_and_ids(args):
    """
    Run the pipeline which extracts features with RolesAndIdsExtractor and saves them as
    CSV rows with the fields "identifier" and "role".

    :param args: Parsed arguments. This function reads `split` and `output`; the whole \
                 object is also passed to create_uast_source() and pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("repos2roles_and_ids")
    root, start_point = create_uast_source(args, "repos2roles_and_ids-%s" % uuid4())

    pipeline = start_point.link(UastRow2Document())
    pipeline = pipeline.link(UastDeserializer())
    pipeline = pipeline.link(Uast2BagFeatures([RolesAndIdsExtractor(args.split)]))
    # x[0][0] becomes the identifier and x[1] becomes the role of the output row.
    pipeline = pipeline.link(Rower(lambda x: {"identifier": x[0][0], "role": x[1]}))
    pipeline = pipeline.link(CsvSaver(args.output))
    pipeline.execute()
    pipeline_graph(args, log, root)
def test_create(self):
    """
    create_or_load_ordered_df() with `docfreq_in=None` must return a model whose `docs`
    equals the counted number of documents and must create the file at `docfreq_out`.
    """
    spark = create_spark("test_df_util")
    documents = ParquetLoader(spark, paths.PARQUET_DIR).link(UastRow2Document())
    ndocs = documents.link(Counter()).execute()
    bag_features = documents.link(UastDeserializer())
    bag_features = bag_features.link(Uast2BagFeatures([IdentifiersBagExtractor()]))
    with tempfile.TemporaryDirectory() as tmpdir:
        docfreq_path = os.path.join(tmpdir, "df.asdf")
        args = argparse.Namespace(
            docfreq_in=None,
            docfreq_out=docfreq_path,
            min_docfreq=1,
            vocabulary_size=1000,
        )
        df_model = create_or_load_ordered_df(args, ndocs, bag_features)
        self.assertEqual(df_model.docs, ndocs)
        # Must be checked before the temporary directory is removed.
        self.assertTrue(os.path.exists(docfreq_path))
def repos2roles_and_ids(args):
    """
    Run the pipeline which extracts features with RoleIdsExtractor through Uast2Features
    and saves them as CSV rows with the fields "identifier" and "role".

    :param args: Parsed arguments. This function reads `output`; the whole object is also \
                 passed to create_uast_source() and pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    extractor = RoleIdsExtractor()
    root, start_point = create_uast_source(args, session_name)

    pipeline = start_point.link(UastRow2Document())
    pipeline = pipeline.link(UastDeserializer())
    pipeline = pipeline.link(Uast2Features(extractor))
    # The "roleids" item is indexed as a pair: [0] is the identifier, [1] is the role.
    pipeline = pipeline.link(
        Rower(lambda x: {"identifier": x["roleids"][0], "role": x["roleids"][1]}))
    pipeline = pipeline.link(CsvSaver(args.output))
    pipeline.execute()
    pipeline_graph(args, log, root)
def repos2bow_entry_template(args, select=HeadFiles, cache_hook=None, save_hook=None):
    """
    Run the bag-of-words pipeline: index the documents, optionally write the quantization
    levels, write the pruned ordered document frequencies and finally write the bags
    through BOWWriter.

    :param args: Parsed arguments. This function reads `mode`, `partitions`, `shuffle`, \
                 `persist`, `quant`, `docfreq`, `min_docfreq`, `vocabulary_size`, `bow` \
                 and `batch`; the whole object is also passed to \
                 create_extractors_from_args(), create_uast_source() and pipeline_graph().
    :param select: Forwarded to create_uast_source() as `select`.
    :param cache_hook: Optional callable invoked without arguments. Its result is linked \
                       right after the cacher and that branch is executed immediately.
    :param save_hook: Optional callable invoked without arguments. Its result is linked \
                      after a repartitioning step and before BOWWriter.
    :return: None.
    """
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name, select=select)
    uast_extractor = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    if cache_hook is not None:
        uast_extractor.link(cache_hook()).execute()
    # We link UastRow2Document after Cacher here because cache_hook() may want to have all
    # possible Row items.
    uast_extractor = uast_extractor.link(UastRow2Document())
    log.info("Extracting UASTs and indexing documents...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    uast_extractor.link(document_indexer).execute()
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    # The quantization model is written only when the quantizer reports non-empty levels.
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    uast_extractor = uast_extractor \
        .link(Uast2BagFeatures(extractors))
    log.info("Calculating the document frequencies...")
    df = uast_extractor.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq to %s", args.docfreq)
    # The model is bound to the value returned by save(), which ends the chain.
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    bags_writer = uast_extractor \
        .link(BagFeatures2TermFreq()) \
        .link(TFIDF(df_model)) \
        .link(document_indexer) \
        .link(Indexer(Uast2BagFeatures.Columns.token, df_model.order))
    if save_hook is not None:
        # args.partitions is handed unmodified to Repartitioner.maybe() above, so a None
        # value must not be multiplied here: None * 10 raises TypeError.
        # NOTE(review): assumes Repartitioner.maybe() accepts None - confirm.
        save_partitions = args.partitions * 10 if args.partitions is not None else None
        bags_writer = bags_writer \
            .link(Repartitioner.maybe(save_partitions, args.shuffle)) \
            .link(save_hook())
    bags_writer.link(BOWWriter(document_indexer, df_model, args.bow, args.batch)) \
        .execute()
    pipeline_graph(args, log, root)
def test_create(self):
    """
    create_or_apply_quant() must set the levels on the extractor and store them so that
    the "children" levels loaded back from the file equal the extractor's levels.
    """
    spark = create_spark("test_quant_util")
    children_extractor = ChildrenBagExtractor()
    with tempfile.NamedTemporaryFile(mode="r+b", suffix="-quant.asdf") as quant_file:
        quant_path = quant_file.name
        uasts = ParquetLoader(spark, paths.PARQUET_DIR).link(Moder("file"))
        uasts = uasts.link(UastRow2Document())
        uasts = uasts.link(UastDeserializer())
        create_or_apply_quant(quant_path, [children_extractor], uasts)
        self.assertIsNotNone(children_extractor.levels)
        self.assertTrue(os.path.exists(quant_path))
        stored_levels = QuantizationLevels().load(source=quant_path)._levels["children"]
        for key in stored_levels:
            self.assertListEqual(
                list(stored_levels[key]), list(children_extractor.levels[key]))
def repos2id_distance(args):
    """
    Run the pipeline which extracts features with IdentifierDistance and saves them as CSV
    rows with the fields "identifier1", "identifier2" and "distance".

    :param args: Parsed arguments. This function reads `split`, `type`, `max_distance` \
                 and `output`; the whole object is also passed to create_uast_source() \
                 and pipeline_graph().
    :return: None.
    """
    # The logger and the session are named after this function; they used to carry the
    # copy-pasted "repos2roles_and_ids" name.
    log = logging.getLogger("repos2id_distance")
    extractor = IdentifierDistance(args.split, args.type, args.max_distance)
    session_name = "repos2id_distance-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(Rower(lambda x: {"identifier1": x[0][0][0],
                               "identifier2": x[0][0][1],
                               "distance": x[1]})) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
def code2vec(args):
    """
    Run the pipeline which extracts features with UastPathsBagExtractor and collects them.
    The collected result is not used yet, see the TODO items below.

    :param args: Parsed arguments. This function reads `max_length` and `max_width`; the \
                 whole object is also passed to create_uast_source() and pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("code2vec")
    session_name = "code2vec-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    # The pipeline is still executed, but its result is no longer bound to a local name
    # which nothing read.
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures([UastPathsBagExtractor(args.max_length, args.max_width)])) \
        .link(Collector()) \
        .execute()

    # TODO: Add rest of data pipeline: extract distinct paths and terminal nodes for
    #       embedding mapping
    # TODO: Add transformer to write bags and vocabs to a model
    # TODO: Add ML pipeline
    pipeline_graph(args, log, root)
def repos2id_sequence(args):
    """
    Run the pipeline which extracts features with IdSequenceExtractor and saves them as CSV
    rows. Each row has the field "identifiers" and, unless `args.skip_docname` is true,
    also the field "document".

    :param args: Parsed arguments. This function reads `split`, `skip_docname` and \
                 `output`; the whole object is also passed to create_uast_source() and \
                 pipeline_graph().
    :return: None.
    """
    # The logger and the session are named after this function; they used to carry the
    # copy-pasted "repos2id_distance" and "repos2roles_and_ids" names.
    log = logging.getLogger("repos2id_sequence")
    extractor = IdSequenceExtractor(args.split)
    session_name = "repos2id_sequence-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    if args.skip_docname:
        mapper = Rower(lambda x: {"identifiers": x[0][0]})
    else:
        mapper = Rower(lambda x: {"document": x[0][1], "identifiers": x[0][0]})

    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(mapper) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
def repos2bow_index_template(args):
    """
    Run the indexing pipeline: index the documents and save the index to
    `args.cached_index_path`, then call create_or_apply_quant() if `args.quant` is set and
    create_or_load_ordered_df() if `args.docfreq_out` is set.

    :param args: Parsed arguments. This function reads `mode`, `partitions`, `shuffle`, \
                 `persist`, `cached_index_path`, `quant` and `docfreq_out`; the whole \
                 object is also passed to create_extractors_from_args(), \
                 create_uast_source(), create_or_load_ordered_df() and pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("repos2bow_index")
    extractors = create_extractors_from_args(args)
    root, start_point = create_uast_source(
        args, "repos2bow_index_features-%s" % uuid4())

    documents = start_point.link(Moder(args.mode))
    documents = documents.link(Repartitioner.maybe(args.partitions, args.shuffle))
    documents = documents.link(UastRow2Document())
    documents = documents.link(Cacher.maybe(args.persist))

    log.info("Extracting UASTs and indexing documents ...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    documents.link(document_indexer).execute()
    document_indexer.save_index(args.cached_index_path)
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)

    uasts = documents.link(UastDeserializer())
    if args.quant:
        create_or_apply_quant(args.quant, extractors, uasts)
    if args.docfreq_out:
        bag_features = uasts.link(Uast2BagFeatures(*extractors))
        create_or_load_ordered_df(args, ndocs, bag_features)
    pipeline_graph(args, log, root)
def repos2coocc_entry(args):
    """
    Run the identifier co-occurrence pipeline: count the documents, write the pruned
    ordered document frequencies to `args.docfreq` and feed the UASTs into
    CooccConstructor followed by CooccModelSaver.

    :param args: Parsed arguments. This function reads `min_docfreq`, `split`, \
                 `partitions`, `shuffle`, `persist`, `docfreq`, `vocabulary_size` and \
                 `output`; the whole object is also passed to create_uast_source() and \
                 pipeline_graph().
    :return: None.
    """
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(
        docfreq_threshold=args.min_docfreq, split_stem=args.split)
    root, start_point = create_uast_source(args, "repos2coocc-%s" % uuid4())

    documents = start_point.link(UastRow2Document())
    documents = documents.link(Repartitioner.maybe(args.partitions, args.shuffle))
    documents = documents.link(Cacher.maybe(args.persist))

    log.info("Extracting UASTs...")
    ndocs = documents.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)

    uasts = documents.link(UastDeserializer())
    bag_features = uasts.link(Uast2BagFeatures([id_extractor]))
    df = bag_features.link(BagFeatures2DocFreq()).execute()

    log.info("Writing document frequency model to %s...", args.docfreq)
    # Each step rebinds the name to the value returned by the previous call, exactly like
    # a single method chain would; the final model is whatever save() returns.
    df_model = OrderedDocumentFrequencies().construct(ndocs, df)
    df_model = df_model.prune(args.min_docfreq)
    df_model = df_model.greatest(args.vocabulary_size)
    df_model = df_model.save(args.docfreq)

    token2index = root.session.sparkContext.broadcast(df_model.order)
    coocc_builder = CooccConstructor(
        token2index=token2index,
        token_parser=id_extractor.id2bag.token_parser,
        namespace=id_extractor.NAMESPACE)
    uasts.link(coocc_builder).link(CooccModelSaver(args.output, df_model)).execute()
    pipeline_graph(args, log, root)
def repos2bow_template(args, cache_hook: Transformer = None, save_hook: Transformer = None):
    """
    Run the bag-of-words pipeline in `args.num_iterations` separate jobs. Each job selects
    one partition of the cached UASTs, collects its distinct documents and tokens, and
    writes its own BOW file named after `args.bow` with the 1-based job number appended.

    :param args: Parsed arguments. This function reads `cached_index_path`, `quant`, \
                 `mode`, `num_iterations`, `partitions`, `shuffle`, `persist`, `bow` and \
                 `batch`; the whole object is also passed to \
                 create_extractors_from_args(), create_uast_source(), \
                 create_or_load_ordered_df() and pipeline_graph().
    :param cache_hook: Optional callable invoked without arguments in every job. Its \
                       result is linked to the selected partition and that branch is \
                       executed immediately.
    :param save_hook: Optional callable invoked without arguments in every job. Its \
                      result is linked after a repartitioning step and before BOWWriter.
    :return: 1 if applying the quantization or loading the document frequencies raises \
             ValueError, otherwise None.
    """
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    log.info("Loading the document index from %s ...", args.cached_index_path)
    docfreq = DocumentFrequencies().load(source=args.cached_index_path)
    document_index = {key: int(val) for (key, val) in docfreq}

    # Both helpers are called without a pipeline (None), so nothing is computed here from
    # the UASTs.
    try:
        if args.quant is not None:
            create_or_apply_quant(args.quant, extractors, None)
        df_model = create_or_load_ordered_df(args, None, None)
    except ValueError:
        return 1

    ec = EngineConstants.Columns

    # The partitioning key is built from the row columns and depends on the mode.
    if args.mode == Moder.Options.repo:
        def keymap(r):
            return r[ec.RepositoryId]
    else:
        def keymap(r):
            return r[ec.RepositoryId] + UastRow2Document.REPO_PATH_SEP + \
                r[ec.Path] + UastRow2Document.PATH_BLOB_SEP + r[ec.BlobId]

    log.info("Caching UASTs to disk after partitioning by document ...")
    start_point = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.num_iterations, keymap=keymap)) \
        .link(Cacher.maybe("DISK_ONLY"))
    for num_part in range(args.num_iterations):
        log.info("Running job %s of %s", num_part + 1, args.num_iterations)
        selected_part = start_point \
            .link(PartitionSelector(num_part)) \
            .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
            .link(Cacher.maybe(args.persist))
        if cache_hook is not None:
            selected_part.link(cache_hook()).execute()
        uast_extractor = selected_part \
            .link(UastRow2Document()) \
            .link(Cacher.maybe(args.persist))
        log.info("Collecting distinct documents ...")
        documents = uast_extractor \
            .link(FieldsSelector([Uast2BagFeatures.Columns.document])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        selected_part.unpersist()
        documents = {row.document for row in documents}
        # Keep only the index entries of the documents which belong to this job.
        reduced_doc_index = {
            key: document_index[key] for key in document_index if key in documents}
        document_indexer = Indexer(Uast2BagFeatures.Columns.document, reduced_doc_index)
        log.info("Processing %s distinct documents", len(documents))
        bags = uast_extractor \
            .link(UastDeserializer()) \
            .link(Uast2BagFeatures(*extractors)) \
            .link(BagFeatures2TermFreq()) \
            .link(Cacher.maybe(args.persist))
        log.info("Extracting UASTs and collecting distinct tokens ...")
        tokens = bags \
            .link(FieldsSelector([Uast2BagFeatures.Columns.token])) \
            .link(Distinct()) \
            .link(Collector()) \
            .execute()
        uast_extractor.unpersist()
        tokens = {row.token for row in tokens}
        # Keep only the frequencies and the order of the tokens which occur in this job.
        reduced_token_freq = {key: df_model[key] for key in df_model.df if key in tokens}
        reduced_token_index = {key: df_model.order[key] for key in df_model.df if key in tokens}
        log.info("Processing %s distinct tokens", len(reduced_token_freq))
        log.info("Indexing by document and token ...")
        bags_writer = bags \
            .link(TFIDF(reduced_token_freq, df_model.docs, root.session.sparkContext)) \
            .link(document_indexer) \
            .link(Indexer(Uast2BagFeatures.Columns.token, reduced_token_index))
        if save_hook is not None:
            bags_writer = bags_writer \
                .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
                .link(save_hook())
        # Strip only a trailing ".asdf". Splitting on the first occurrence would truncate
        # a path which contains ".asdf" elsewhere, e.g. in a directory name.
        bow_base = args.bow[:-len(".asdf")] if args.bow.endswith(".asdf") else args.bow
        bow = bow_base + "_" + str(num_part + 1) + ".asdf"
        bags_writer \
            .link(Repartitioner.maybe(
                args.partitions, keymap=lambda x: x[Uast2BagFeatures.Columns.document])) \
            .link(BOWWriter(document_indexer, df_model, bow, args.batch)) \
            .execute()
        bags.unpersist()
    pipeline_graph(args, log, root)
def test_documentize(self):
    """
    documentize() must replace repository_id, path and blob_id with a single `document`
    field ("1//2@3" for the values "1", "2", "3") and keep `uast` untouched.
    """
    converter = UastRow2Document()
    source_row = Row(repository_id="1", path="2", blob_id="3", uast="4")
    actual = converter.documentize(source_row)
    expected = Row(document="1//2@3", uast="4")
    self.assertEqual(actual, expected)