def to_prov(repo):
    """Assemble the combined provenance document, record it and dump it.

    The documents returned by ``to_prov(None, None)`` of ``storeT``,
    ``storeBus``, ``combine_t_bus``, ``geoagg`` and ``pagerank`` are merged,
    in that order, into a single ``prov.model.ProvDocument``. The serialized
    document is handed to ``repo.record``, a copy re-indented with four
    spaces is written to ``plan.json`` in the current working directory, and
    the output of ``doc.get_provn()`` is printed.

    Args:
        repo: Object exposing ``record(serialized_document)``.
    """
    doc = prov.model.ProvDocument()
    for step in (storeT, storeBus, combine_t_bus, geoagg, pagerank):
        # NOTE(review): the two ``None`` arguments presumably stand in for
        # the start/end times of a run that did not happen - confirm.
        doc.update(step.to_prov(None, None))

    # Serialize once and reuse the string: the original serialized the same
    # document twice, once for the repository and once for the file dump.
    serialized = doc.serialize()
    repo.record(serialized)  # Record the provenance document.

    # An explicit encoding keeps the dump independent of the platform locale.
    with open('plan.json', 'w', encoding='utf-8') as plan:
        plan.write(json.dumps(json.loads(serialized), indent=4))
    print(doc.get_provn())
def to_prov(repo):
    """Merge every step's provenance, record the result and save a plan file.

    Starts from an empty ``prov.model.ProvDocument`` and updates it with the
    value returned by ``to_prov(None, None)`` for ``storeT``, ``storeBus``,
    ``combine_t_bus``, ``geoagg`` and ``pagerank`` (in this order). The
    serialized document is then passed to ``repo.record``, pretty-printed
    (four-space indent) into ``plan.json`` in the working directory, and the
    text from ``doc.get_provn()`` is printed to standard output.

    Args:
        repo: Object providing a ``record`` method that accepts the
            serialized document.
    """
    doc = prov.model.ProvDocument()
    for step in (storeT, storeBus, combine_t_bus, geoagg, pagerank):
        # NOTE(review): ``None, None`` presumably means "no start/end time
        # available" for these steps - confirm against each to_prov.
        doc.update(step.to_prov(None, None))

    # The original called doc.serialize() twice for the same document; one
    # call is enough, and both consumers now see the identical string.
    serialized = doc.serialize()
    repo.record(serialized)  # Record the provenance document.

    # Explicit encoding so the written file does not depend on the locale.
    with open('plan.json', 'w', encoding='utf-8') as plan:
        plan.write(json.dumps(json.loads(serialized), indent=4))
    print(doc.get_provn())
Beispiel #3
0
def run_job_with_params(repo, job_params, doc):
    """Run geoagg and then pagerank for one parameter set, logging provenance.

    Before either step executes, ``drop_derived_collections(repo)`` is called
    and ``job_params`` is stored under ``'nikolaj.params'`` via
    ``store_json``. The first and last items of each step's ``run()`` result
    are forwarded to that step's ``to_prov`` together with the matching
    entry of ``job_params``; the returned value is merged into ``doc``.

    Args:
        repo: Repository handle forwarded to the helper functions.
        job_params: Indexable collection; item 0 is passed to
            ``geoagg.to_prov`` and item 1 to ``pagerank.to_prov``.
        doc: Provenance document, updated through ``doc.update``.
    """
    drop_derived_collections(repo)
    store_json(repo, 'nikolaj.params', job_params)

    # Step 1: geographic aggregation.
    geoagg_began, _, geoagg_ended = geoagg.run()
    geoagg_record = geoagg.to_prov(geoagg_began, geoagg_ended, job_params[0])
    doc.update(geoagg_record)

    # Step 2: pagerank.
    pagerank_began, _, pagerank_ended = pagerank.run()
    pagerank_record = pagerank.to_prov(pagerank_began, pagerank_ended, job_params[1])
    doc.update(pagerank_record)


# Script entry point: runs the two store steps while collecting their
# provenance into one document, then defines the `t_only_params` job
# parameters.
if __name__ == "__main__":
    # NOTE(review): the two arguments are presumably credentials or
    # user/database names - confirm against get_auth_repo.
    repo = get_auth_repo('nikolaj', 'nikolaj')
    # Provenance document that accumulates what each step's to_prov() returns.
    doc = prov.model.ProvDocument()

    # Each run() yields a 3-tuple; only its first and last items are used and
    # forwarded to the same step's to_prov(). The middle item is ignored.
    startTime, _, endTime = storeT.run()
    doc.update(storeT.to_prov(startTime, endTime))
    startTime, _, endTime = storeBus.run()
    doc.update(storeBus.to_prov(startTime, endTime))

    # Two-entry parameter list: entry 0 carries the "geoagg_params" settings
    # and entry 1 the "pagerank_params" settings. The first entry's
    # "output_col_name" is the second entry's "input_col_name".
    # NOTE(review): presumably meant to be passed to run_job_with_params,
    # which indexes job_params[0] and job_params[1] - no call is visible here.
    t_only_params = [{
        "id": "geoagg_params",
        "maxDistance": 0,
        "output_col_name": "nikolaj.stops_with_neighs_t_only",
        "input_cols": ["nikolaj.raw_t_stops"],
        "routeUnion": ["$routes", "$geo_neigh_routes"],
        "neighUnion": ["$neighs", "$geo_neighs"]
    }, {
        "id": "pagerank_params",
        "input_col_name": "nikolaj.stops_with_neighs_t_only",
        "output_col_name": "nikolaj.pagerank_result_t_only"
    }]
    # 'nikolaj.params' is the name run_job_with_params stores job parameters
    # under. NOTE(review): dropPerm presumably removes that stored
    # collection - confirm its semantics.
    repo.dropPerm('nikolaj.params')

def run_job_with_params(repo, job_params, doc):
    """Execute the geoagg and pagerank steps and collect their provenance.

    Calls ``drop_derived_collections(repo)``, stores ``job_params`` under
    ``'nikolaj.params'`` with ``store_json``, then runs ``geoagg`` followed
    by ``pagerank``. After each run, the step's ``to_prov`` receives the
    first and last items of the ``run()`` result plus the step's entry from
    ``job_params``, and the value it returns is merged into ``doc``.

    Args:
        repo: Repository handle passed on to the helper functions.
        job_params: Indexable collection; index 0 belongs to geoagg and
            index 1 to pagerank.
        doc: Provenance document, updated through ``doc.update``.
    """
    def _run_and_record(step, position):
        # Run the step first, then look up its parameters, matching the
        # original statement order.
        began, _, ended = step.run()
        doc.update(step.to_prov(began, ended, job_params[position]))

    drop_derived_collections(repo)
    store_json(repo, 'nikolaj.params', job_params)
    _run_and_record(geoagg, 0)
    _run_and_record(pagerank, 1)

if __name__ == "__main__":
    repo = get_auth_repo('nikolaj', 'nikolaj')
    doc = prov.model.ProvDocument()
    
    startTime, _, endTime = storeT.run()
    doc.update(storeT.to_prov(startTime, endTime))
    startTime, _, endTime = storeBus.run()
    doc.update(storeBus.to_prov(startTime, endTime))
    
    t_only_params = [
        { "id" : "geoagg_params", "maxDistance" : 0, "output_col_name": "nikolaj.stops_with_neighs_t_only", "input_cols": [ "nikolaj.raw_t_stops" ], "routeUnion" : [ "$routes", "$geo_neigh_routes" ], "neighUnion" : [ "$neighs", "$geo_neighs" ] },
        { "id" : "pagerank_params", "input_col_name": "nikolaj.stops_with_neighs_t_only", "output_col_name" : "nikolaj.pagerank_result_t_only" }
    ]

    t_500walk_params = [
        { "id" : "geoagg_params", "maxDistance" : 500, "output_col_name": "nikolaj.stops_with_neighs_t_500walk", "input_cols": [ "nikolaj.raw_t_stops" ], "routeUnion" : [ "$routes", "$geo_neigh_routes" ], "neighUnion" : [ "$neighs", "$geo_neighs" ] },
        { "id" : "pagerank_params", "input_col_name": "nikolaj.stops_with_neighs_t_500walk", "output_col_name" : "nikolaj.pagerank_result_t_500walk" }
    ]

    t_500walk_bus_params = [
        { "id" : "geoagg_params", "maxDistance" : 500, "output_col_name": "nikolaj.stops_with_neighs_t_500walk_bus", "input_cols": [ "nikolaj.raw_t_stops", "nikolaj.raw_bus_stops" ], "routeUnion" : [ "$routes", "$geo_neigh_routes" ], "neighUnion" : [ "$neighs", "$geo_neighs" ] },