def repos2ids(args):
    log = logging.getLogger("repos2ids")
    session_name = "repos2ids-%s" % uuid4()
    root, start_point = create_file_source(args, session_name)
    start_point \
        .link(Repartitioner(args.partitions, args.shuffle)) \
        .link(ContentToIdentifiers(args.split)) \
        .link(IdentifiersToDataset(args.idfreq)) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
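# A minimal sketch of calling repos2ids programmatically rather than via the
# CLI, assuming an argparse.Namespace that carries the attributes referenced
# above. The concrete values and any extra fields that create_file_source or
# pipeline_graph may read (input location, Spark settings, etc.) are
# assumptions, not the real CLI definition:
def _example_repos2ids_call():  # hypothetical helper
    import argparse
    args = argparse.Namespace(
        partitions=200,    # Repartitioner: target number of partitions
        shuffle=True,      # Repartitioner: shuffle rows while repartitioning
        split=True,        # ContentToIdentifiers: split identifiers into subtokens
        idfreq=False,      # IdentifiersToDataset: include frequency columns or not
        output="ids.csv",  # CsvSaver: destination path
    )
    repos2ids(args)  # the same shape fits the create_uast_source variant below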
def repos2roles_and_ids(args):
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures([RolesAndIdsExtractor(args.split)])) \
        .link(Rower(lambda x: dict(identifier=x[0][0], role=x[1]))) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
def repos2roles_and_ids(args):
    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    extractor = RoleIdsExtractor()
    root, start_point = create_uast_source(args, session_name)
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2Features(extractor)) \
        .link(Rower(lambda x: {"identifier": x["roleids"][0],
                               "role": x["roleids"][1]})) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
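# The Rower stages in these pipelines turn arbitrary pipeline elements into
# flat dicts that CsvSaver can persist as rows. A minimal sketch of that
# idea, assuming a Spark-style transformer that maps over an RDD;
# _RowerSketch is illustrative only, not the sourced-ml implementation:
class _RowerSketch:
    def __init__(self, dicter):
        self.dicter = dicter  # maps one element to a dict of column values

    def __call__(self, rdd):
        from pyspark.sql import Row
        # Each element becomes a Row whose columns are the dict's keys,
        # e.g. x -> {"identifier": ..., "role": ...} in the pipeline above.
        return rdd.map(lambda x: Row(**self.dicter(x)))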
def repos2ids(args):
    log = logging.getLogger("repos2ids")
    session_name = "repos2ids-%s" % uuid4()
    # Blacklisting "null" keeps only rows whose language was detected.
    language_selector = LanguageSelector(languages=["null"], blacklist=True)
    root, start_point = create_uast_source(args, session_name,
                                           language_selector=language_selector,
                                           extract_uast=False)
    start_point \
        .link(Repartitioner(args.partitions, args.shuffle)) \
        .link(ContentToIdentifiers(args.split)) \
        .link(IdentifiersToDataset(args.idfreq)) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
def repos2id_distance(args):
    log = logging.getLogger("repos2id_distance")
    extractor = IdentifierDistance(args.split, args.type, args.max_distance)
    session_name = "repos2id_distance-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(Rower(lambda x: {"identifier1": x[0][0][0],
                               "identifier2": x[0][0][1],
                               "distance": x[1]})) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
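# The element shape the Rower lambda above assumes: x[0][0] is the
# (identifier1, identifier2) pair and x[1] is their distance, i.e. a
# ((key, document), value) layout. A hand-made element of that assumed
# shape, for illustration only:
def _example_distance_row():  # hypothetical helper
    x = ((("foo", "bar"), "doc.py"), 3)  # assumed ((pair, document), distance)
    row = {"identifier1": x[0][0][0], "identifier2": x[0][0][1], "distance": x[1]}
    assert row == {"identifier1": "foo", "identifier2": "bar", "distance": 3}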
def repos2id_sequence(args):
    log = logging.getLogger("repos2id_sequence")
    extractor = IdSequenceExtractor(args.split)
    session_name = "repos2id_sequence-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    if not args.skip_docname:
        mapper = Rower(lambda x: {"document": x[0][1],
                                  "identifiers": x[0][0]})
    else:
        mapper = Rower(lambda x: {"identifiers": x[0][0]})
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(mapper) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
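# The same (key, document) convention applies here: x[0][0] carries the
# extracted identifier sequence and x[0][1] the document name, which is why
# skipping the document name just drops the second component. Illustration
# with a hand-made element; the exact tuple layout is an assumption:
def _example_sequence_rows():  # hypothetical helper
    x = (("foo bar baz", "doc.py"), 1)
    with_doc = {"document": x[0][1], "identifiers": x[0][0]}
    without_doc = {"identifiers": x[0][0]}
    assert with_doc == {"document": "doc.py", "identifiers": "foo bar baz"}
    assert without_doc == {"identifiers": "foo bar baz"}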
def test_csv_saver(self):
    with tempfile.TemporaryDirectory() as tmpdir:
        # create and save a one-row DataFrame
        rows = [("Alice", 1)]
        df = self.spark.createDataFrame(rows, ["name", "age"])
        CsvSaver(tmpdir)(df.rdd)
        # read the saved data back and check it; Spark writes one or more
        # part files under the target directory, hence the walk
        for root, _, files in os.walk(tmpdir):
            for name in files:
                filename = os.path.join(root, name)
                if filename.endswith(".csv"):
                    with open(filename) as fh:
                        reader = csv.reader(fh)
                        next(reader)  # skip the header
                        data = list(reader)
                    self.assertEqual(len(data), 1)
                    self.assertEqual(data[0][0], rows[0][0])
                    self.assertEqual(int(data[0][1]), rows[0][1])
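# A small helper distilled from the read loop above: Spark emits CSV output
# as a directory of part-*.csv files rather than a single file, so reading
# it back means walking the tree. _read_spark_csv_dir is an illustrative
# sketch, not part of the test suite:
def _read_spark_csv_dir(dirname):
    import csv
    import os
    rows = []
    for root, _, files in os.walk(dirname):
        for name in files:
            if name.endswith(".csv"):
                with open(os.path.join(root, name)) as fh:
                    reader = csv.reader(fh)
                    next(reader, None)  # skip the per-file header
                    rows.extend(reader)
    return rows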