Example #1
0
def repos2df(args):
    """
    Compute the quantization levels and the document frequencies over the UAST pipeline.

    The same pipeline is executed three times: to count the documents, to collect the
    quantization levels (written to `args.quant` when there are any) and to calculate the
    document frequencies, which are written to `args.docfreq_out`.

    :param args: parsed arguments; `persist`, `quant` and `docfreq_out` are read here, the \
                 whole object is also forwarded to the helper functions.
    :return: None
    """
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    root, start_point = create_uast_source(args, "repos2df-%s" % uuid4())

    documents = (start_point
                 .link(UastRow2Document())
                 .link(Cacher.maybe(args.persist)))
    log.info("Extracting UASTs...")
    counting = documents.link(Counter())
    ndocs = counting.execute()
    log.info("Number of documents: %d", ndocs)

    uasts = documents.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uasts.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        levels_model = QuantizationLevels().construct(quant.levels)
        levels_model.save(args.quant)

    bag_features = uasts.link(Uast2BagFeatures(extractors))
    df = bag_features.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq model to %s", args.docfreq_out)
    docfreq_model = OrderedDocumentFrequencies().construct(ndocs, df)
    docfreq_model.save(args.docfreq_out)
    pipeline_graph(args, log, root)
Example #2
0
 def test_apply(self):
     """
     Call create_or_apply_quant with `paths.QUANTLEVELS` and a single extractor.

     The extractor must have levels afterwards, and they must equal, key by key, the
     "children" levels stored in the model at the same path.
     """
     extractor = ChildrenBagExtractor()
     create_or_apply_quant(paths.QUANTLEVELS, [extractor])
     self.assertIsNotNone(extractor.levels)
     model = QuantizationLevels().load(source=paths.QUANTLEVELS)
     stored = model._levels["children"]
     for name in stored:
         from_model = list(stored[name])
         from_extractor = list(extractor.levels[name])
         self.assertListEqual(from_model, from_extractor)
Example #3
0
 def test_write(self):
     """
     Save a two-scheme model into an in-memory buffer and load it back.

     The loaded levels must contain both schemes with three entries each, and every array
     must equal the one which was saved.
     """
     expected = {
         "xxx": {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]},
         "yyy": {"q": [3, 2, 1], "w": [6, 5, 4], "e": [9, 8, 7]},
     }
     source = {scheme: {name: numpy.array(values) for name, values in table.items()}
               for scheme, table in expected.items()}
     buffer = BytesIO()
     QuantizationLevels().construct(source).save(output=buffer, series="quant")
     buffer.seek(0)  # rewind so that load() reads from the beginning
     loaded = QuantizationLevels().load(buffer).levels
     self.assertEqual(len(loaded), len(expected))
     for scheme, table in expected.items():
         self.assertEqual(len(loaded[scheme]), len(table))
     for scheme, table in expected.items():
         for name, values in table.items():
             self.assertTrue((loaded[scheme][name] == numpy.array(values)).all())
Example #4
0
def create_or_apply_quant(model_path: str, extractors: List[BagsExtractor], extracted_uasts=None):
    """
    Apply the quantization levels stored at `model_path` or build and store them there.

    If `model_path` exists, the model is loaded and applied to `extractors`. If the file
    does not exist, or loading/applying fails with ValueError or ImportError, the levels are
    collected by linking a Uast2Quant to `extracted_uasts` and, when there are any, written
    to `model_path`.

    :param model_path: path to the quantization levels model.
    :param extractors: extractors which are handed over to the quantization.
    :param extracted_uasts: pipeline object which supports `.link(...).execute()`. None \
                            means that the levels can only be loaded, not created.
    :raises ValueError: if the levels could not be loaded and `extracted_uasts` is None.
    :return: None
    """
    log = logging.getLogger("create_or_apply_quant")
    if os.path.exists(model_path):
        log.info("Loading the quantization levels from %s and applying quantization to supported"
                 " extractors...", model_path)
        try:
            QuantizationLevels().load(source=model_path).apply_quantization(extractors)
        except (ValueError, ImportError) as e:
            # Best effort: an unusable model is not fatal because the levels may be rebuilt
            # below, but the reason must not vanish silently.
            log.warning("Failed to load the quantization levels from %s: %r", model_path, e)
        else:
            return
    if extracted_uasts is None:
        message = "[IN] only mode, please supply a quantization levels model"
        log.error(message)
        raise ValueError(message)
    quant = Uast2Quant(extractors)
    extracted_uasts.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", model_path)
        QuantizationLevels().construct(quant.levels) \
            .save(output=model_path, series="quant")
Example #5
0
def repos2bow_entry_template(args, select=HeadFiles, cache_hook=None, save_hook=None):
    """
    Run the repos2bow pipeline: index documents, quantize, calculate docfreq and write bags.

    The pipeline is executed several times: optionally for `cache_hook`, then to index the
    documents, to collect the quantization levels (written to `args.quant` when there are
    any), to calculate the document frequencies (written to `args.docfreq`) and finally to
    write the bags with BOWWriter.

    :param args: parsed arguments; `mode`, `partitions`, `shuffle`, `persist`, `quant`, \
                 `docfreq`, `min_docfreq`, `vocabulary_size`, `bow` and `batch` are read here.
    :param select: forwarded to create_uast_source() as `select`.
    :param cache_hook: optional callable; its result is linked after the Cacher and executed.
    :param save_hook: optional callable; its result is linked right before BOWWriter.
    :return: None
    """
    log = logging.getLogger("repos2bow")
    extractors = create_extractors_from_args(args)
    root, start_point = create_uast_source(args, "repos2bow-%s" % uuid4(), select=select)
    rows = (start_point
            .link(Moder(args.mode))
            .link(Repartitioner.maybe(args.partitions, args.shuffle))
            .link(Cacher.maybe(args.persist)))
    if cache_hook is not None:
        rows.link(cache_hook()).execute()
    # UastRow2Document is linked after Cacher on purpose: cache_hook() may want to have all
    # the possible Row items.
    documents = rows.link(UastRow2Document())

    log.info("Extracting UASTs and indexing documents...")
    document_indexer = Indexer(Uast2BagFeatures.Columns.document)
    documents.link(document_indexer).execute()
    ndocs = len(document_indexer)
    log.info("Number of documents: %d", ndocs)

    uasts = documents.link(UastDeserializer())
    quant = Uast2Quant(extractors)
    uasts.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        levels_model = QuantizationLevels().construct(quant.levels)
        levels_model.save(args.quant)

    bag_features = uasts.link(Uast2BagFeatures(extractors))
    log.info("Calculating the document frequencies...")
    df = bag_features.link(BagFeatures2DocFreq()).execute()
    log.info("Writing docfreq to %s", args.docfreq)
    df_model = OrderedDocumentFrequencies().construct(ndocs, df)
    df_model = df_model.prune(args.min_docfreq).greatest(args.vocabulary_size)
    # The value returned by save() is what the rest of the pipeline works with.
    df_model = df_model.save(args.docfreq)

    bags_writer = (bag_features
                   .link(BagFeatures2TermFreq())
                   .link(TFIDF(df_model))
                   .link(document_indexer)
                   .link(Indexer(Uast2BagFeatures.Columns.token, df_model.order)))
    if save_hook is not None:
        bags_writer = (bags_writer
                       .link(Repartitioner.maybe(args.partitions * 10, args.shuffle))
                       .link(save_hook()))
    writer = BOWWriter(document_indexer, df_model, args.bow, args.batch)
    bags_writer.link(writer).execute()
    pipeline_graph(args, log, root)
Example #6
0
 def test_create(self):
     """
     Call create_or_apply_quant with a temporary file path and a UAST pipeline.

     The extractor must have levels afterwards, the file must exist, and the "children"
     levels loaded from it must equal, key by key, the levels of the extractor.
     """
     session = create_spark("test_quant_util")
     extractor = ChildrenBagExtractor()
     with tempfile.NamedTemporaryFile(mode="r+b", suffix="-quant.asdf") as tmp:
         path = tmp.name
         loader = ParquetLoader(session, paths.PARQUET_DIR)
         uast_extractor = (loader
                           .link(Moder("file"))
                           .link(UastRow2Document())
                           .link(UastDeserializer()))
         create_or_apply_quant(path, [extractor], uast_extractor)
         self.assertIsNotNone(extractor.levels)
         self.assertTrue(os.path.exists(path))
         model = QuantizationLevels().load(source=path)
         stored = model._levels["children"]
         for name in stored:
             from_model = list(stored[name])
             from_extractor = list(extractor.levels[name])
             self.assertListEqual(from_model, from_extractor)
Example #7
0
class QuantizationLevelsTests(unittest.TestCase):
    """Tests of QuantizationLevels which use the model stored at `paths.QUANTLEVELS`."""

    def setUp(self):
        """Load the model from `paths.QUANTLEVELS` before each test."""
        loader = QuantizationLevels()
        self.model = loader.load(source=paths.QUANTLEVELS)

    def test_levels(self):
        """The loaded model has the single "children" scheme with 259 entries."""
        schemes = self.model.levels
        self.assertIsInstance(schemes, dict)
        self.assertEqual(len(schemes), 1)
        children = schemes["children"]
        self.assertIsInstance(children, dict)
        self.assertEqual(len(schemes["children"]), 259)

    def test_len(self):
        """len() of the loaded model is 1."""
        size = len(self.model)
        self.assertEqual(size, 1)

    def test_write(self):
        """
        Save a two-scheme model into an in-memory buffer and load it back.

        The loaded levels must contain both schemes with three entries each, and every
        array must equal the one which was saved.
        """
        expected = {
            "xxx": {"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]},
            "yyy": {"q": [3, 2, 1], "w": [6, 5, 4], "e": [9, 8, 7]},
        }
        source = {scheme: {name: numpy.array(values) for name, values in table.items()}
                  for scheme, table in expected.items()}
        buffer = BytesIO()
        QuantizationLevels().construct(source).save(output=buffer, series="quant")
        buffer.seek(0)  # rewind so that load() reads from the beginning
        loaded = QuantizationLevels().load(buffer).levels
        self.assertEqual(len(loaded), len(expected))
        for scheme, table in expected.items():
            self.assertEqual(len(loaded[scheme]), len(table))
        for scheme, table in expected.items():
            for name, values in table.items():
                self.assertTrue((loaded[scheme][name] == numpy.array(values)).all())

    def test_dump(self):
        """dump() of the loaded model describes the single "children" scheme."""
        description = self.model.dump()
        self.assertEqual(description, "Schemes: [('children', '259@10')]")
Example #8
0
 def setUp(self):
     """Load the model from `paths.QUANTLEVELS` before each test."""
     loader = QuantizationLevels()
     self.model = loader.load(source=paths.QUANTLEVELS)