Example #1
def repo2bow(repository: str,
             repository_format: str,
             docfreq_threshold: int,
             docfreq: DocumentFrequencies,
             languages: List[str] = None,
             blacklist_languages=False,
             engine_kwargs: Dict[str, Any] = None) -> Dict[str, float]:
    log = logging.getLogger("repo2bow")
    # Prefix each vocabulary token with "i." to match the keys produced by
    # IdentifiersBagExtractor.
    token_index = {"i." + key: int(val) for (key, val) in docfreq}
    session_name = "repo2bow-%s" % uuid4()
    engine_args = {
        "repositories": repository,
        "repository_format": repository_format,
    }
    if engine_kwargs is not None:
        engine_args.update(engine_kwargs)
    engine = create_engine(session_name, **engine_args)
    # Pipeline root: read the repositories, keep only local (file://) ones,
    # and take the files at HEAD.
    root = Ignition(engine) >> RepositoriesFilter(r"^file://.*") >> HeadFiles()
    if languages is not None:
        file_source = root >> \
                      LanguageExtractor() >> \
                      LanguageSelector(languages=languages, blacklist=blacklist_languages)
    else:
        file_source = root
    # Extract and deserialize UASTs, turn them into bags of identifiers,
    # compute term frequencies and TF-IDF weights, and collect to the driver.
    bag = (file_source >> UastExtractor() >> Moder("repo") >>
           UastDeserializer() >> UastRow2Document() >> Uast2BagFeatures(
               IdentifiersBagExtractor(docfreq_threshold)) >>
           BagFeatures2TermFreq() >> TFIDF(
               token_index, docfreq.docs,
               engine.session.sparkContext) >> Collector()).execute()
    log.info("extracted %d identifiers", len(bag))
    # Strip the "i." prefix before returning token -> TF-IDF weight.
    return {r.token[2:]: r.value for r in bag}
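
A minimal usage sketch for repo2bow follows; the paths, the "standard"
repository format, and the sourced.ml.models import path are assumptions
about the surrounding project, not taken from the example itself.

from sourced.ml.models import DocumentFrequencies

df = DocumentFrequencies().load("docfreq.asdf")  # hypothetical model path
bow = repo2bow("/data/repos", "standard", docfreq_threshold=5,
               docfreq=df, languages=["Python"])
# bow maps bare identifier tokens to their TF-IDF weights.
for token, weight in sorted(bow.items(), key=lambda kv: -kv[1])[:10]:
    print(token, weight)
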
Example #2
def create_uast_source(args,
                       session_name,
                       select=HeadFiles,
                       language_selector=None,
                       extract_uast=True):
    if args.parquet:
        parquet_loader_args = filter_kwargs(args.__dict__,
                                            create_parquet_loader)
        start_point = create_parquet_loader(session_name,
                                            **parquet_loader_args)
        root = start_point
        # Parquet input must already carry a "uast" column when UASTs are
        # requested, since there is no engine to extract them from.
        if extract_uast and "uast" not in [
                col.name for col in start_point.execute().schema
        ]:
            raise ValueError("The parquet files do not contain UASTs.")
    else:
        engine_args = filter_kwargs(args.__dict__, create_engine)
        root = create_engine(session_name, **engine_args)
        if language_selector is None:
            language_selector = LanguageSelector(languages=args.languages)
        start_point = Ignition(root, explain=args.explain) \
            .link(select()) \
            .link(language_selector)
        if extract_uast:
            start_point = start_point.link(UastExtractor())
    return root, start_point
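
A usage sketch with a hand-built namespace; attribute names besides parquet,
languages, and explain are assumptions about which keys filter_kwargs will
pass through to create_engine.

import argparse
from uuid import uuid4

args = argparse.Namespace(
    parquet=False,               # take the engine branch
    repositories="/data/repos",  # assumed to match a create_engine parameter
    repository_format="siva",    # likewise an assumed create_engine parameter
    languages=["Java"],
    explain=False)
root, start_point = create_uast_source(args, "uast-source-%s" % uuid4())
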
Example #3
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), args.repositories, args)
    pipeline = Engine(engine, explain=args.explain).link(
        DzhigurdaFiles(args.dzhigurda))
    uasts = pipeline.link(UastExtractor(languages=[args.language]))
    fields = uasts.link(FieldsSelector(fields=args.fields))
    saver = fields.link(ParquetSaver(save_loc=args.batches))

    saver.explode()
Example #4
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []

    engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
    ignition = Ignition(engine, explain=args.explain)
    ignition \
        .link(DzhigurdaFiles(args.dzhigurda)) \
        .link(UastExtractor(languages=args.languages)) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.output)) \
        .execute()
    pipeline_graph(args, log, ignition)
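
Like the other CLI entry points here, this version of preprocess_source
consumes an argparse namespace; a sketch of the attributes it touches (all
values, and the field names especially, are hypothetical; passing
**args.__dict__ to create_engine assumes it tolerates the extra keys, as the
example implies):

import argparse

args = argparse.Namespace(
    output="/tmp/uasts.parquet",  # must not exist yet
    config=None,
    explain=False,
    dzhigurda=0,                  # commit selection depth for DzhigurdaFiles
    languages=["Python"],
    fields=["repository_id", "path", "uast"],  # hypothetical column names
    graph=None,                   # read by pipeline_graph
    repositories="/data/repos",
    repository_format="siva")
preprocess_source(args)
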
Example #5
File: basic.py Project: y1026/ml
def create_file_source(args: argparse.Namespace, session_name: str):
    if args.parquet:
        parquet_loader_args = filter_kwargs(args.__dict__,
                                            create_parquet_loader)
        root = create_parquet_loader(session_name, **parquet_loader_args)
        file_source = root.link(
            LanguageSelector.maybe(languages=args.languages,
                                   blacklist=args.blacklist))
    else:
        engine_args = filter_kwargs(args.__dict__, create_engine)
        root = Ignition(create_engine(session_name, **engine_args),
                        explain=args.explain)
        file_source = root.link(DzhigurdaFiles(args.dzhigurda))
        if args.languages is not None:
            file_source = file_source \
                .link(LanguageExtractor()) \
                .link(LanguageSelector(languages=args.languages, blacklist=args.blacklist))

    return root, file_source
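
A sketch of driving create_file_source through its engine branch; the
repository attributes are assumptions about create_engine's parameters.

import argparse

args = argparse.Namespace(
    parquet=False,
    repositories="/data/repos",   # assumed create_engine parameter
    repository_format="siva",     # assumed create_engine parameter
    explain=False,
    dzhigurda=0,
    languages=["Python"],
    blacklist=False)              # keep, rather than exclude, the languages
root, file_source = create_file_source(args, "file-source-demo")
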
Example #6
def source2bags(args):
    log = logging.getLogger("bags")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    try:
        cassandra_utils.configure(args)
        engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
        # Instantiate the requested feature extractors with the document
        # frequency threshold and their extractor-specific CLI options.
        extractors = [
            __extractors__[s](args.min_docfreq,
                              **__extractors__[s].get_kwargs_fromcmdline(args))
            for s in args.feature
        ]
        pipeline = Engine(engine, explain=args.explain).link(
            DzhigurdaFiles(args.dzhigurda))
        uasts = pipeline.link(UastExtractor(languages=[args.language]))
        if args.persist is not None:
            uasts = uasts.link(Cacher(args.persist))
        uasts.link(MetadataSaver(args.keyspace, args.tables["meta"]))
        uasts = uasts.link(UastDeserializer())
        uasts.link(Repo2Quant(extractors, args.nb_partitions))
        uasts.link(Repo2DocFreq(extractors))
        # Run the pipeline once to compute quantization and document frequencies.
        pipeline.explode()
        bags = uasts.link(Repo2WeightedSet(extractors))
        if args.persist is not None:
            bags = bags.link(Cacher(args.persist))
        batcher = bags.link(BagsBatcher(extractors))
        batcher.link(BagsBatchSaver(args.batches, batcher))
        bags.link(BagsSaver(args.keyspace, args.tables["bags"]))
        bags.explode()
        log.info("Writing %s", args.docfreq)
        batcher.model.save(args.docfreq)
        if args.graph:
            log.info("Dumping the graph to %s", args.graph)
            with open(args.graph, "w") as f:
                pipeline.graph(stream=f)
    finally:
        if args.pause:
            input("Press Enter to exit...")
Example #7
@classmethod
def setUpClass(cls):
    cls.engine = create_engine("test_with_engine", SIVA_DIR, "siva")
    cls.spark = cls.engine.session
    cls.data = ParquetLoader(session=cls.spark,
                             paths=PARQUET_DIR).execute().rdd.coalesce(1)
Example #8
@classmethod
def setUpClass(cls):
    cls.engine = create_engine("test_with_engine", SIVA_DIR, "siva")
    cls.spark = cls.engine.session
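
Both fixtures above belong inside a unittest.TestCase; a minimal sketch of
the surrounding class, assuming SIVA_DIR is a test-data constant defined in
the test module:

import unittest

class EngineTests(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.engine = create_engine("test_with_engine", SIVA_DIR, "siva")
        cls.spark = cls.engine.session

    def test_session_exists(self):
        self.assertIsNotNone(self.spark)
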
Example #9
def warmup(args):
    create_engine("warmup", "/tmp", **args.__dict__)
Example #10
def warmup(args):
    engine_args = filter_kwargs(args.__dict__, create_engine)
    create_engine("warmup", "/tmp", **engine_args)