Example #1
def repos2coocc(args):
    log = logging.getLogger("repos2coocc")
    # Bag-of-identifiers extractor shared by the docfreq and co-occurrence stages.
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    # Turn every UAST row into a document, optionally repartition and cache.
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

    # Load an existing ordered document frequency model or build one from the pipeline.
    df_model = create_or_load_ordered_df(
        args, ndocs, uast_extractor.link(Uast2BagFeatures(id_extractor)))

    # Broadcast the token order, then build and save the co-occurrence matrix.
    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
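
The args attributes read above suggest how repos2coocc could be driven programmatically. A minimal invocation sketch, assuming these field names (docfreq_in, docfreq_out and vocabulary_size are inferred from the test in Example #3 below) and noting that create_uast_source almost certainly reads further input/engine options not visible in this snippet:

import argparse

# Hypothetical namespace: field names come from what the snippet reads;
# the values and any extra fields needed by create_uast_source are assumptions.
args = argparse.Namespace(
    min_docfreq=1,               # docfreq_threshold for IdentifiersBagExtractor
    split=False,                 # split_stem flag for IdentifiersBagExtractor
    partitions=200,              # Repartitioner.maybe
    shuffle=False,               # Repartitioner.maybe
    persist=None,                # Cacher.maybe storage level
    docfreq_in=None,             # create_or_load_ordered_df: existing model to load
    docfreq_out="docfreq.asdf",  # ...or where to save a freshly built one
    vocabulary_size=10000,       # cap on the ordered vocabulary
    output="coocc.asdf",         # CooccModelSaver target path
)
repos2coocc(args)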
Example #2
def repos2df(args):
    log = logging.getLogger("repos2df")
    extractors = create_extractors_from_args(args)
    session_name = "repos2df-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())
    # Collect quantization levels for the extractors that support them.
    quant = Uast2Quant(extractors)
    uast_extractor.link(quant).execute()
    if quant.levels:
        log.info("Writing quantization levels to %s", args.quant)
        QuantizationLevels().construct(quant.levels).save(args.quant)
    # Compute per-token document frequencies and save the ordered model.
    df = uast_extractor \
        .link(Uast2BagFeatures(extractors)) \
        .link(BagFeatures2DocFreq()) \
        .execute()
    log.info("Writing docfreq model to %s", args.docfreq_out)
    OrderedDocumentFrequencies().construct(ndocs, df).save(args.docfreq_out)
    pipeline_graph(args, log, root)
Example #3
def test_create(self):
    session = create_spark("test_df_util")
    uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
        .link(UastRow2Document())
    ndocs = uast_extractor.link(Counter()).execute()
    uast_extractor = uast_extractor.link(UastDeserializer()) \
        .link(Uast2BagFeatures([IdentifiersBagExtractor()]))
    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = os.path.join(tmpdir, "df.asdf")
        args = argparse.Namespace(docfreq_in=None, docfreq_out=tmp_path, min_docfreq=1,
                                  vocabulary_size=1000)
        # With docfreq_in=None the model is built from the pipeline and saved to docfreq_out.
        df_model = create_or_load_ordered_df(args, ndocs, uast_extractor)
        self.assertEqual(df_model.docs, ndocs)
        self.assertTrue(os.path.exists(tmp_path))
Example #4
def repos2coocc_entry(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

    # Compute per-token document frequencies from the bags of identifiers.
    df = uast_extractor \
        .link(Uast2BagFeatures([id_extractor])) \
        .link(BagFeatures2DocFreq()) \
        .execute()

    # Build the ordered model inline: drop tokens below min_docfreq and keep
    # only the vocabulary_size most frequent ones before saving.
    log.info("Writing document frequency model to %s...", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)

    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
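
Example #4 follows the same pipeline as Example #1 but builds the document frequency model inline with construct/prune/greatest instead of calling create_or_load_ordered_df, so it reads a single args.docfreq path. A corresponding sketch under the same assumptions about field names and about what create_uast_source expects:

import argparse

# Hypothetical namespace for repos2coocc_entry; names mirror the attributes
# this snippet reads, everything else is an assumption.
args = argparse.Namespace(
    min_docfreq=1,
    split=False,
    partitions=200,
    shuffle=False,
    persist=None,
    docfreq="docfreq.asdf",   # where the pruned OrderedDocumentFrequencies is saved
    vocabulary_size=10000,
    output="coocc.asdf",
)
repos2coocc_entry(args)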