Example 1
    def Identifiers(self, request, context):
        """Extract identifiers weighted set"""

        extractor = IdentifiersBagExtractor(
            docfreq_threshold=request.docfreqThreshold,
            split_stem=request.splitStem,
            weight=request.weight or 1)

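        # Presumably a gRPC servicer method: extract the bag from the request
        # UAST and wrap it in a response message via the service helper.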
        return self._create_response(extractor.extract(request.uast))
Example 2
def repos2coocc(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

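    # Turn UAST rows into documents; optionally repartition and cache the dataset.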
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

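    # Build or load an ordered document-frequency model over the identifier bags.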
    df_model = create_or_load_ordered_df(
        args, ndocs, uast_extractor.link(Uast2BagFeatures(id_extractor)))

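    # Broadcast the token order and assemble the identifier co-occurrence matrix.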
    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
Example 3
def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages=False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    log = logging.getLogger("repo2bow")
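    # Prefix tokens with "i." so they match the identifier extractor's namespaced output.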
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
                      LanguageExtractor() >> \
                      LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
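    # Extract UASTs per repository, compute term frequencies and weight them with TF-IDF.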
    bag = (file_source >> UastExtractor() >> Moder("repo") >>
           UastDeserializer() >> UastRow2Document() >> Uast2BagFeatures(
               IdentifiersBagExtractor(docfreq_threshold)) >>
           BagFeatures2TermFreq() >> TFIDF(
               token_index, docfreq.docs,
               engine.session.sparkContext) >> Collector()).execute()
    log.info("extracted %d identifiers", len(bag))
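    # Strip the two-character "i." namespace prefix from every returned token.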
    return {r.token[2:]: r.value for r in bag}
Example 4
    def test_error(self):
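        # Without a saved model (docfreq_in=None), create_or_load_ordered_df needs both
        # a document count and a feature pipeline; missing either raises ValueError.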
        with self.assertRaises(ValueError):
            create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), 10, None)

        with self.assertRaises(ValueError):
            session = create_spark("test_df_util")
            uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
                .link(Moder("file")) \
                .link(UastRow2Document()) \
                .link(UastDeserializer()) \
                .link(Uast2BagFeatures(IdentifiersBagExtractor()))
            create_or_load_ordered_df(argparse.Namespace(docfreq_in=None), None, uast_extractor)
Example 5
    def test_create(self):
        session = create_spark("test_df_util")
        uast_extractor = ParquetLoader(session, paths.PARQUET_DIR) \
            .link(UastRow2Document())
        ndocs = uast_extractor.link(Counter()).execute()
        uast_extractor = uast_extractor.link(UastDeserializer()) \
            .link(Uast2BagFeatures([IdentifiersBagExtractor()]))
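        # Build the DF model from scratch and verify it is persisted to disk.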
        with tempfile.TemporaryDirectory() as tmpdir:
            tmp_path = os.path.join(tmpdir, "df.asdf")
            args = argparse.Namespace(docfreq_in=None, docfreq_out=tmp_path, min_docfreq=1,
                                      vocabulary_size=1000)
            df_model = create_or_load_ordered_df(args, ndocs, uast_extractor)
            self.assertEqual(df_model.docs, ndocs)
            self.assertTrue(os.path.exists(tmp_path))
Example 6
def repos2coocc_entry(args):
    log = logging.getLogger("repos2coocc")
    id_extractor = IdentifiersBagExtractor(docfreq_threshold=args.min_docfreq,
                                           split_stem=args.split)
    session_name = "repos2coocc-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

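    # Prepare the UAST documents, with optional repartitioning and persistence.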
    uast_extractor = start_point \
        .link(UastRow2Document()) \
        .link(Repartitioner.maybe(args.partitions, args.shuffle)) \
        .link(Cacher.maybe(args.persist))
    log.info("Extracting UASTs...")
    ndocs = uast_extractor.link(Counter()).execute()
    log.info("Number of documents: %d", ndocs)
    uast_extractor = uast_extractor.link(UastDeserializer())

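    # Count in how many documents each identifier occurs.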
    df = uast_extractor \
        .link(Uast2BagFeatures([id_extractor])) \
        .link(BagFeatures2DocFreq()) \
        .execute()

    log.info("Writing document frequency model to %s...", args.docfreq)
    df_model = OrderedDocumentFrequencies() \
        .construct(ndocs, df) \
        .prune(args.min_docfreq) \
        .greatest(args.vocabulary_size) \
        .save(args.docfreq)

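    # With the token index broadcast to the workers, compute and save co-occurrences.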
    token2index = root.session.sparkContext.broadcast(df_model.order)
    uast_extractor \
        .link(CooccConstructor(token2index=token2index,
                               token_parser=id_extractor.id2bag.token_parser,
                               namespace=id_extractor.NAMESPACE)) \
        .link(CooccModelSaver(args.output, df_model)) \
        .execute()
    pipeline_graph(args, log, root)
Example 7
def _identifiers_extractor(uast, options):
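    # Build a one-off extractor from the request options and return the bag as a list.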
    return list(
        IdentifiersBagExtractor(docfreq_threshold=options.docfreqThreshold,
                                split_stem=options.splitStem,
                                weight=options.weight or 1).extract(uast))