Example #1
    def test_parquet_saver(self):
        # create a temporary working directory; it is removed in the finally block
        dirname = tempfile.mkdtemp()
        try:
            # load and save data
            rows = [("Alice", 1)]
            df = self.spark.createDataFrame(rows, ["name", "age"])
            ParquetSaver(dirname + "/", explain=True)(df.rdd)
            ParquetSaver(dirname + "2/")(df.rdd)

            # read saved data and check it
            data = ParquetLoader(session=self.spark, paths=dirname).execute()
            self.assertEqual(data.count(), 1)
        finally:
            shutil.rmtree(dirname)
            shutil.rmtree(dirname + "2", ignore_errors=True)  # second output written above
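
For reference, the same save-and-load round trip can be sketched with the plain PySpark DataFrame API alone, independent of the ParquetSaver/ParquetLoader transformers used above (a minimal standalone sketch, assuming a local Spark installation):

import tempfile

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").appName("parquet-roundtrip").getOrCreate()
with tempfile.TemporaryDirectory() as tmpdir:
    path = tmpdir + "/people"
    df = spark.createDataFrame([("Alice", 1)], ["name", "age"])
    df.write.parquet(path)               # write Parquet files under path
    restored = spark.read.parquet(path)  # read them back as a DataFrame
    assert restored.count() == 1
spark.stop()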
Example #2
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.batches):
        log.critical("%s must not exist", args.batches)
        return 1
    if not args.config:
        args.config = []
    engine = create_engine("source2bags-%s" % uuid4(), args.repositories, args)
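    # Assemble the pipeline stage by stage; each link() call returns the
    # transformer it attaches, so the chain ends at the ParquetSaver.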
    pipeline = Engine(engine, explain=args.explain).link(
        DzhigurdaFiles(args.dzhigurda))
    uasts = pipeline.link(UastExtractor(languages=[args.language]))
    fields = uasts.link(FieldsSelector(fields=args.fields))
    saver = fields.link(ParquetSaver(save_loc=args.batches))

    saver.explode()
Example #3
def preprocess_source(args):
    log = logging.getLogger("preprocess_source")
    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []

    engine = create_engine("source2bags-%s" % uuid4(), **args.__dict__)
    ignition = Ignition(engine, explain=args.explain)
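    # Chain the transformers: take the files yielded by DzhigurdaFiles, extract
    # UASTs for the requested languages, keep only the requested fields, and
    # write the result as Parquet under args.output.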
    ignition \
        .link(DzhigurdaFiles(args.dzhigurda)) \
        .link(UastExtractor(languages=args.languages)) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.output)) \
        .execute()
    pipeline_graph(args, log, ignition)
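
preprocess_source only consumes an argparse-style namespace, so the attributes it reads (args.output, args.config, args.explain, args.dzhigurda, args.languages, args.fields, plus whatever create_engine pulls from args.__dict__) must be wired up by a CLI. A hypothetical minimal parser, with flag names chosen here for illustration rather than taken from the project's real command line, could look like this:

import argparse

# Hypothetical CLI wiring for preprocess_source(); the flag names below are
# illustrative and only mirror the attributes the function reads.
parser = argparse.ArgumentParser(description="Extract UASTs and save them as Parquet")
parser.add_argument("-r", "--repositories", required=True, help="Input repositories path")
parser.add_argument("-o", "--output", required=True, help="Output Parquet directory (must not exist)")
parser.add_argument("--config", nargs="*", default=[], help="Extra engine configuration")
parser.add_argument("--explain", action="store_true", help="Print Spark execution plans")
parser.add_argument("--dzhigurda", type=int, default=0, help="How far back in the commit history to go")
parser.add_argument("--languages", nargs="+", default=["Python"], help="Languages to parse")
parser.add_argument("--fields", nargs="+", default=["blob_id", "uast"], help="Columns to keep")
args = parser.parse_args()
# preprocess_source(args)  # would run the pipeline from Example #3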
Example #4
def preprocess_repos(args):
    log = logging.getLogger("preprocess_repos")
    session_name = "preprocess_repos-%s" % uuid4()

    if os.path.exists(args.output):
        log.critical("%s must not exist", args.output)
        return 1
    if not args.config:
        args.config = []
    root, start_point = create_uast_source(args, session_name)

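    # Run the pipeline: apply the chosen Moder mode, keep only the requested
    # fields, and write the result as Parquet under args.output.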
    start_point \
        .link(Moder(args.mode)) \
        .link(FieldsSelector(fields=args.fields)) \
        .link(ParquetSaver(save_loc=args.output)) \
        .execute()
    pipeline_graph(args, log, root)