def repos2df(args):
    """
    Extract feature bags from repositories and write the document frequency model.

    Builds the UAST extraction pipeline, counts the documents, records the
    quantization levels (when any extractor produced them) and finally saves an
    OrderedDocumentFrequencies model.

    :param args: Parsed command line arguments with the pipeline configuration.
    """
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    session_name = "repos2df-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    pipeline = (start_point
                .link(UastRow2Document())
                .link(Cacher.maybe(args.persist)))
    log.info("Extracting UASTs...")
    doc_count = pipeline.link(Counter()).execute()
    log.info("Number of documents: %d", doc_count)
    pipeline = pipeline.link(UastDeserializer())
    quantizer = Uast2Quant(extractors)
    pipeline.link(quantizer).execute()
    if quantizer.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quantizer.levels).save(args.quant)
    doc_freq = (pipeline
                .link(Uast2BagFeatures(extractors))
                .link(BagFeatures2DocFreq())
                .execute())
    log.info("Writing docfreq model to %s", args.docfreq_out)
    OrderedDocumentFrequencies().construct(doc_count, doc_freq).save(args.docfreq_out)
    pipeline_graph(args, log, root)
def test_apply(self):
    """Applying an existing quantization model must fill the extractor's levels."""
    child_extractor = ChildrenBagExtractor()
    create_or_apply_quant(paths.QUANTLEVELS, [child_extractor])
    self.assertIsNotNone(child_extractor.levels)
    stored = QuantizationLevels().load(source=paths.QUANTLEVELS)._levels["children"]
    for name in stored:
        # The levels pulled from the model must match what the extractor received.
        self.assertListEqual(list(stored[name]), list(child_extractor.levels[name]))
def test_write(self):
    """QuantizationLevels must round-trip through save()/load() on an in-memory buffer."""
    expected = {
        "xxx": {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]},
        "yyy": {"q": [3, 2, 1], "w": [6, 5, 4], "e": [9, 8, 7]},
    }
    source = {scheme: {key: numpy.array(vals) for key, vals in mapping.items()}
              for scheme, mapping in expected.items()}
    buffer = BytesIO()
    QuantizationLevels().construct(source).save(output=buffer, series="quant")
    buffer.seek(0)
    loaded = QuantizationLevels().load(buffer).levels
    self.assertEqual(len(loaded), 2)
    self.assertEqual(len(loaded["xxx"]), 3)
    self.assertEqual(len(loaded["yyy"]), 3)
    for scheme, mapping in expected.items():
        for key, vals in mapping.items():
            # Element-wise equality against the original arrays.
            self.assertTrue((loaded[scheme][key] == numpy.array(vals)).all())
def create_or_apply_quant(model_path: str, extractors: List[BagsExtractor],
                          extracted_uasts=None):
    """
    Load the quantization levels model from ``model_path`` and apply it to the
    supported extractors, or create and save a new model from ``extracted_uasts``
    when the file does not exist (or fails to load).

    :param model_path: Path to the on-disk QuantizationLevels model.
    :param extractors: Feature extractors to apply the quantization to.
    :param extracted_uasts: Pipeline node emitting deserialized UASTs; required \
        when the model has to be created from scratch.
    :raise ValueError: If no usable model exists and ``extracted_uasts`` is None.
    """
    log = logging.getLogger("create_or_apply_quant")
    if os.path.exists(model_path):
        log.info("Loading the quantization levels from %s and applying quantization to supported"
                 " extractors...", model_path)
        try:
            QuantizationLevels().load(source=model_path).apply_quantization(extractors)
        except (ValueError, ImportError):
            # The model is unreadable or incompatible: warn (instead of silently
            # swallowing the error) and fall through to regenerate it below.
            log.warning("Failed to load the quantization levels model from %s, "
                        "will recreate it.", model_path, exc_info=True)
        else:
            return
    if extracted_uasts is None:
        log.error("[IN] only mode, please supply a quantization levels model")
        raise ValueError("quantization levels model is missing or unloadable at %s and no "
                         "extracted UASTs were supplied to create it" % model_path)
    quant = Uast2Quant(extractors)
    extracted_uasts.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", model_path)
        QuantizationLevels().construct(quant.levels) \
            .save(output=model_path, series="quant")
def repos2bow_entry_template(args, select=HeadFiles, cache_hook=None, save_hook=None):
    """
    Convert repositories into bag-of-words models: index documents, compute
    quantization levels and document frequencies, then write TF-IDF weighted
    bags through BOWWriter.

    :param args: Parsed command line arguments with the pipeline configuration.
    :param select: File selector passed to the UAST source (defaults to HeadFiles).
    :param cache_hook: Optional factory for a pipeline node executed right after
        caching, before UastRow2Document (so it still sees full Row items).
    :param save_hook: Optional factory for a pipeline node inserted before BOWWriter.
    """
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    session_name = "repos2bow-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name, select=select)
    uast_extractor = start_point.link(Moder(args.mode)) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    if cache_hook is not None:
        uast_extractor.link(cache_hook()).execute()
    # We link UastRow2Document after Cacher here because cache_hook() may want to have
    # all possible Row items.
    uast_extractor = uast_extractor.link(UastRow2Document())
    log.info("Extracting UASTs and indexing documents...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    uast_extractor.link(document_indexer).execute()
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        # Only extractors which support quantization populate quant.levels.
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    uast_extractor = uast_extractor \
        .link(Uast2BagFeatures(extractors))
    log.info("Calculating the document frequencies...")
    df = uast_extractor.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq to %s", args.docfreq)
    # NOTE(review): save() presumably returns the model itself (fluent API), since
    # df_model is reused by TFIDF and the token Indexer below — confirm in the model base.
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)
    bags_writer = uast_extractor \
        .link(BagFeatures2TermFreq()) \
        .link(TFIDF(df_model)) \
        .link(document_indexer) \
        .link(Indexer(Uast2BagFeatures.Columns.token, df_model.order))
    if save_hook is not None:
        bags_writer = bags_writer \
            .link(Repartitioner.maybe(args.partitions * 10, args.shuffle)) \
            .link(save_hook())
    bags_writer.link(BOWWriter(document_indexer, df_model, args.bow, args.batch)) \
        .execute()
    pipeline_graph(args, log, root)
def test_create(self):
    """create_or_apply_quant must build and persist a model when none exists."""
    session = create_spark("test_quant_util")
    child_extractor = ChildrenBagExtractor()
    # Grab a unique file name; the temp file itself is deleted when the
    # context exits, so the model path does not exist yet.
    with tempfile.NamedTemporaryFile(mode="r+b", suffix="-quant.asdf") as tmp:
        path = tmp.name
    uasts = ParquetLoader(session, paths.PARQUET_DIR) \
        .link(Moder("file")) \
        .link(UastRow2Document()) \
        .link(UastDeserializer())
    create_or_apply_quant(path, [child_extractor], uasts)
    self.assertIsNotNone(child_extractor.levels)
    self.assertTrue(os.path.exists(path))
    stored = QuantizationLevels().load(source=path)._levels["children"]
    for name in stored:
        self.assertListEqual(list(stored[name]), list(child_extractor.levels[name]))
class QuantizationLevelsTests(unittest.TestCase):
    """Tests for the QuantizationLevels model: loading, introspection and round-trip."""

    def setUp(self):
        """Load the reference quantization levels model before each test."""
        self.model = QuantizationLevels().load(source=paths.QUANTLEVELS)

    def test_levels(self):
        """The reference model exposes one 'children' scheme with 259 entries."""
        lvl = self.model.levels
        self.assertIsInstance(lvl, dict)
        self.assertEqual(len(lvl), 1)
        self.assertIsInstance(lvl["children"], dict)
        self.assertEqual(len(lvl["children"]), 259)

    def test_len(self):
        """len() of the model counts its schemes."""
        self.assertEqual(len(self.model), 1)

    def test_write(self):
        """The model must round-trip through save()/load() on an in-memory buffer."""
        expected = {
            "xxx": {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]},
            "yyy": {"q": [3, 2, 1], "w": [6, 5, 4], "e": [9, 8, 7]},
        }
        source = {scheme: {key: numpy.array(vals) for key, vals in mapping.items()}
                  for scheme, mapping in expected.items()}
        buffer = BytesIO()
        QuantizationLevels().construct(source).save(output=buffer, series="quant")
        buffer.seek(0)
        loaded = QuantizationLevels().load(buffer).levels
        self.assertEqual(len(loaded), 2)
        self.assertEqual(len(loaded["xxx"]), 3)
        self.assertEqual(len(loaded["yyy"]), 3)
        for scheme, mapping in expected.items():
            for key, vals in mapping.items():
                self.assertTrue((loaded[scheme][key] == numpy.array(vals)).all())

    def test_dump(self):
        """dump() summarizes the schemes and their sizes."""
        self.assertEqual(self.model.dump(), "Schemes: [('children', '259@10')]")
def setUp(self):
    """Load the reference quantization levels model before each test."""
    model = QuantizationLevels()
    self.model = model.load(source=paths.QUANTLEVELS)