def repos2coocc(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    # Convert UAST rows to documents, optionally repartition and cache them.
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    # Build or load the ordered document frequency model for identifiers.
    df_model = create_or_load_ordered_df(
        args, ndocs, uast_extractor.link(Uast2BagFeatures(id_extractor)))
    # Broadcast the token -> index mapping and assemble the co-occurrence matrix.
    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
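
# A minimal invocation sketch for repos2coocc (illustrative only, not part of
# the pipeline). The fields below are the ones read directly above, plus the
# docfreq_in/docfreq_out/vocabulary_size fields that create_or_load_ordered_df
# expects (see test_create below); the repository/engine options consumed by
# create_uast_source are not visible in this snippet and are omitted.
def _example_repos2coocc_call():
    import argparse
    args = argparse.Namespace(
        min_docfreq=5,            # forwarded as docfreq_threshold
        split=True,               # forwarded as split_stem
        partitions=None,          # Repartitioner.maybe: no repartitioning
        shuffle=False,
        persist=None,             # Cacher.maybe: no caching
        docfreq_in=None,          # build a fresh document frequency model
        docfreq_out="docfreq.asdf",
        vocabulary_size=1000,
        output="coocc.asdf",      # co-occurrence model output path
        # ... plus the input/Spark options expected by create_uast_source
    )
    repos2coocc(args)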

def repos2df(args):
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    session_name = "repos2df-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    df = uast_extractor \
        .link(Uast2BagFeatures(extractors)) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing docfreq model to %s", args.docfreq_out)
    OrderedDocumentFrequencies().construct(ndocs, df).save(args.docfreq_out)
    pipeline_graph(args, log, root)
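
# A comparable sketch for repos2df (illustrative only): persist, quant and
# docfreq_out are the only fields read directly above; the extractor and input
# options handled by create_extractors_from_args and create_uast_source are
# not visible here and are omitted.
def _example_repos2df_call():
    import argparse
    args = argparse.Namespace(
        persist=None,                # no caching between pipeline stages
        quant="quant.asdf",          # QuantizationLevels output path
        docfreq_out="docfreq.asdf",  # OrderedDocumentFrequencies output path
        # ... plus the extractor/input options used by
        # create_extractors_from_args and create_uast_source
    )
    repos2df(args)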

def test_create(self):
    session = create_spark("test_df_util")
    uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
        .link(UastRow2Document())
    ndocs = uast_extractor.link(Counter()).execute()
    uast_extractor = uast_extractor.link(UastDeserializer()) \
        .link(Uast2BagFeatures([IdentifiersBagExtractor()]))
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = os.path.join(tmpdir, "df.asdf")
        args = argparse.Namespace(docfreq_in=None, docfreq_out=tmp_path,
                                  min_docfreq=1, vocabulary_size=1000)
        df_model = create_or_load_ordered_df(args, ndocs, uast_extractor)
        self.assertEqual(df_model.docs, ndocs)
        self.assertTrue(os.path.exists(tmp_path))

def repos2coocc_entry(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    df = uast_extractor \
        .link(Uast2BagFeatures([id_extractor])) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing document frequency model to %s...", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
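
# Unlike repos2coocc above, repos2coocc_entry builds, prunes and saves the
# OrderedDocumentFrequencies model inline instead of delegating to
# create_or_load_ordered_df, so it takes a single args.docfreq path. A
# hypothetical invocation under the same assumptions as the earlier sketches:
def _example_repos2coocc_entry_call():
    import argparse
    args = argparse.Namespace(
        min_docfreq=5,
        split=True,
        partitions=None,
        shuffle=False,
        persist=None,
        docfreq="docfreq.asdf",   # document frequency model output path
        vocabulary_size=1000,
        output="coocc.asdf",      # co-occurrence model output path
        # ... plus the input/Spark options expected by create_uast_source
    )
    repos2coocc_entry(args)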