Example 1
0
def repos2ids(args):
    """Extract identifiers from repository file contents and save them to CSV.

    Builds a file-based source, splits content into identifiers, converts
    them to a dataset and writes the result to ``args.output``.
    """
    log = logging.getLogger("repos2ids")
    session_name = "repos2ids-%s" % uuid4()

    root, start_point = create_file_source(args, session_name)
    # Assemble the pipeline stage by stage, then run it.
    pipeline = start_point.link(Repartitioner(args.partitions, args.shuffle))
    pipeline = pipeline.link(ContentToIdentifiers(args.split))
    pipeline = pipeline.link(IdentifiersToDataset(args.idfreq))
    pipeline = pipeline.link(CsvSaver(args.output))
    pipeline.execute()
    pipeline_graph(args, log, root)
Example 2
0
def repos2roles_and_ids(args):
    """Extract (identifier, role) pairs from UASTs and save them to CSV."""
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    # Each bag-features record is ((identifier, doc), role); flatten it
    # into a two-column row for the CSV writer.
    to_row = Rower(lambda x: dict(identifier=x[0][0], role=x[1]))
    pipeline = start_point.link(UastRow2Document())
    pipeline = pipeline.link(UastDeserializer())
    pipeline = pipeline.link(Uast2BagFeatures([RolesAndIdsExtractor(args.split)]))
    pipeline = pipeline.link(to_row)
    pipeline = pipeline.link(CsvSaver(args.output))
    pipeline.execute()
    pipeline_graph(args, log, root)
Example 3
0
def repos2roles_and_ids(args):
    """Extract identifier/role pairs via RoleIdsExtractor and save them to CSV."""
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    extractor = RoleIdsExtractor()
    root, start_point = create_uast_source(args, session_name)

    # The extractor emits rows keyed by "roleids": (identifier, role).
    to_row = Rower(lambda x: {"identifier": x["roleids"][0], "role": x["roleids"][1]})
    pipeline = start_point.link(UastRow2Document())
    pipeline = pipeline.link(UastDeserializer())
    pipeline = pipeline.link(Uast2Features(extractor))
    pipeline = pipeline.link(to_row)
    pipeline = pipeline.link(CsvSaver(args.output))
    pipeline.execute()
    pipeline_graph(args, log, root)
Example 4
0
def repos2ids(args):
    """Extract identifiers from repositories (all languages except "null")
    and save the resulting dataset to CSV."""
    log = logging.getLogger("repos2ids")
    session_name = "repos2ids-%s" % uuid4()
    # Blacklist "null" so only rows with a detected language pass through.
    language_selector = LanguageSelector(languages=["null"], blacklist=True)
    root, start_point = create_uast_source(
        args, session_name, language_selector=language_selector, extract_uast=False)

    pipeline = start_point.link(Repartitioner(args.partitions, args.shuffle))
    pipeline = pipeline.link(ContentToIdentifiers(args.split))
    pipeline = pipeline.link(IdentifiersToDataset(args.idfreq))
    pipeline = pipeline.link(CsvSaver(args.output))
    pipeline.execute()
    pipeline_graph(args, log, root)
Example 5
0
def repos2id_distance(args):
    """Compute pairwise identifier distances from UASTs and save them to CSV.

    Each extracted record is ``(((identifier1, identifier2), doc), distance)``;
    it is flattened into a three-column CSV row.
    """
    # Fix: logger and session were copy-pasted from repos2roles_and_ids;
    # name them after this command so logs and sessions are attributable.
    log = logging.getLogger("repos2id_distance")
    extractor = IdentifierDistance(args.split, args.type, args.max_distance)
    session_name = "repos2id_distance-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(Rower(lambda x: {"identifier1": x[0][0][0],
                               "identifier2": x[0][0][1],
                               "distance": x[1]})) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
Example 6
0
def repos2id_sequence(args):
    """Extract per-document identifier sequences from UASTs and save them to CSV.

    When ``args.skip_docname`` is set, only the identifier sequence column is
    emitted; otherwise the document name is included as well.
    """
    # Fix: logger said "repos2id_distance" and the session said
    # "repos2roles_and_ids" — both copy-paste leftovers from sibling commands.
    log = logging.getLogger("repos2id_sequence")
    extractor = IdSequenceExtractor(args.split)
    session_name = "repos2id_sequence-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    if not args.skip_docname:
        mapper = Rower(lambda x: {"document": x[0][1], "identifiers": x[0][0]})
    else:
        mapper = Rower(lambda x: {"identifiers": x[0][0]})
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(mapper) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
Example 7
0
    def test_csv_saver(self):
        """CsvSaver round-trip: save a one-row DataFrame as CSV and read it back.

        Fixes the original, which exited the ``TemporaryDirectory`` context
        immediately (the directory was deleted before use and the CSV output
        was never cleaned up), left ``data`` unbound if no CSV file was found,
        and shadowed the loop variable ``f`` with the file handle.
        """
        with tempfile.TemporaryDirectory() as tmpdir:
            # Spark-style savers require a non-existent output directory,
            # so write to a fresh subdirectory of the temp dir.
            dirname = os.path.join(tmpdir, "output")

            # load and save data
            rows = [("Alice", 1)]
            df = self.spark.createDataFrame(rows, ["name", "age"])
            CsvSaver(dirname)(df.rdd)

            # read saved data and check it
            data = []
            for root, _, files in os.walk(dirname):
                for name in files:
                    filename = os.path.join(root, name)
                    if filename.endswith(".csv"):
                        with open(filename) as fh:
                            reader = csv.reader(fh)
                            next(reader)  # skip the header row
                            data.extend(reader)

            self.assertEqual(len(data), 1)
            self.assertEqual(data[0][0], rows[0][0])
            self.assertEqual(int(data[0][1]), rows[0][1])