def repos2id_sequence(args):
    """
    Extract identifier sequences from UASTs and save them as CSV.

    Builds a pipeline on top of the source returned by
    `create_uast_source`, runs it, and then hands the pipeline root to
    `pipeline_graph`.

    :param args: Parsed command line arguments. This function reads \
                 `args.split` (forwarded to `IdSequenceExtractor`), \
                 `args.skip_docname` (when true, the "document" column is \
                 omitted from the output rows) and `args.output` (forwarded \
                 to `CsvSaver`). The whole object is also passed to \
                 `create_uast_source` and `pipeline_graph`.
    :return: None
    """
    # The logger and the session name used to carry the names of sibling
    # entry points ("repos2id_distance" / "repos2roles_and_ids"), which
    # misattributed log records and sessions; they now match this function.
    log = logging.getLogger("repos2id_sequence")
    extractor = IdSequenceExtractor(args.split)
    session_name = "repos2id_sequence-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    # Each incoming row is indexed as x[0][0] -> identifiers and
    # x[0][1] -> document (presumably ((identifiers, document), value);
    # confirm against Uast2BagFeatures output).
    if args.skip_docname:
        mapper = Rower(lambda x: {"identifiers": x[0][0]})
    else:
        mapper = Rower(lambda x: {"document": x[0][1], "identifiers": x[0][0]})
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(mapper) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)
def repos2roles_and_ids(args):
    """
    Extract (identifier, role) rows from UASTs and save them as CSV.

    :param args: Parsed command line arguments. This function reads \
                 `args.split` (forwarded to `RolesAndIdsExtractor`) and \
                 `args.output` (forwarded to `CsvSaver`). The whole object \
                 is also passed to `create_uast_source` and `pipeline_graph`.
    :return: None
    """
    def _as_row(item):
        # item[0][0] becomes the "identifier" column, item[1] the "role".
        return {"identifier": item[0][0], "role": item[1]}

    log = logging.getLogger("repos2roles_and_ids")
    session_name = "repos2roles_and_ids-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)

    # Link the stages one at a time, in the same order as they execute.
    stage = start_point.link(UastRow2Document())
    stage = stage.link(UastDeserializer())
    stage = stage.link(Uast2BagFeatures([RolesAndIdsExtractor(args.split)]))
    stage = stage.link(Rower(_as_row))
    stage = stage.link(CsvSaver(args.output))
    stage.execute()

    pipeline_graph(args, log, root)
def repos2id_distance(args):
    """
    Extract distances between pairs of identifiers from UASTs and save \
    them as CSV.

    Builds a pipeline on top of the source returned by
    `create_uast_source`, runs it, and then hands the pipeline root to
    `pipeline_graph`.

    :param args: Parsed command line arguments. This function reads \
                 `args.split`, `args.type` and `args.max_distance` (all \
                 forwarded to `IdentifierDistance`) and `args.output` \
                 (forwarded to `CsvSaver`). The whole object is also passed \
                 to `create_uast_source` and `pipeline_graph`.
    :return: None
    """
    # The logger and the session name used to be copied from
    # repos2roles_and_ids, which misattributed log records and sessions;
    # they now match this function.
    log = logging.getLogger("repos2id_distance")
    extractor = IdentifierDistance(args.split, args.type, args.max_distance)
    session_name = "repos2id_distance-%s" % uuid4()
    root, start_point = create_uast_source(args, session_name)
    start_point \
        .link(UastRow2Document()) \
        .link(UastDeserializer()) \
        .link(Uast2BagFeatures(extractor)) \
        .link(Rower(lambda x: {
            # x[0][0] is indexed as a pair of identifiers; x[1] is written
            # out as the distance between them.
            "identifier1": x[0][0][0],
            "identifier2": x[0][0][1],
            "distance": x[1],
        })) \
        .link(CsvSaver(args.output)) \
        .execute()
    pipeline_graph(args, log, root)