# Apply JSON-LD contexts to the state, country and city alternate-name RDDs
stateRDD = workflow.apply_context(stateRDD1, state_context)
stateRDD = stateRDD.persist(StorageLevel.MEMORY_AND_DISK)
stateRDD.setName("stateRDD")

countryRDD = workflow.apply_context(countryRDD1, country_context)
countryRDD = countryRDD.persist(StorageLevel.MEMORY_AND_DISK)
countryRDD.setName("countryRDD")

cityAlternateRDD = workflow.apply_context(city_alternate_names_rdd, city_context)
cityAlternateRDD = cityAlternateRDD.persist(StorageLevel.MEMORY_AND_DISK)
cityAlternateRDD.setName("cityAlternateRDD")
fileUtil.save_file(cityAlternateRDD, outputFilename + "_cityalternate", "text", "json")

# Reduce the city RDD with its alternate names so records sharing a URI collapse
city_reduced_rdd = workflow.reduce_rdds(10, cityRDD, cityAlternateRDD)
city_reduced_rdd = city_reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK)
city_reduced_rdd.setName("city_reduced_rdd")

# fileUtil.save_file(countryRDD, outputFilename + "_Country", "text", "json")
# fileUtil.save_file(city_reduced_rdd, outputFilename + "_City", "text", "json")
# fileUtil.save_file(stateRDD, outputFilename + "_State", "text", "json")

# Dereference the country, then the state, entities inside each city record
mergeRDD1 = EntityMerger.merge_rdds(city_reduced_rdd, "address.addressCountry", countryRDD, 10)
# fileUtil.save_file(mergeRDD1, outputFilename+"_State_Country", "text", "json")
mergeRDD2 = EntityMerger.merge_rdds(mergeRDD1, "address.addressRegion", stateRDD, 10)

#3. Save the output
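# The two EntityMerger.merge_rdds calls above dereference the URI stored at a
# dotted field path ("address.addressCountry", "address.addressRegion") and
# replace it with the full entity record. Below is a minimal sketch of that
# join, assuming (uri, json_dict) pair RDDs; the helper names and record
# shapes are illustrative, not EntityMerger's actual implementation.

def get_path(record, path):
    # Walk a dotted path like "address.addressCountry" down the JSON record.
    node = record
    for key in path.split("."):
        node = node[key]
    return node

def set_path(record, path, value):
    # Replace the value at the dotted path (a URI string) with `value`.
    keys = path.split(".")
    node = record
    for key in keys[:-1]:
        node = node[key]
    node[keys[-1]] = value
    return record

def merge_rdds_sketch(parent_rdd, path, entity_rdd, num_partitions):
    # Re-key each parent record by the URI it references at `path`, join
    # against the entity RDD keyed by URI, then substitute the joined
    # entity back into the referencing field.
    keyed = parent_rdd.map(lambda kv: (get_path(kv[1], path), kv))
    joined = keyed.join(entity_rdd, num_partitions)
    return joined.values().map(
        lambda pair: (pair[0][0], set_path(pair[0][1], path, pair[1])))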
print "offer model done" # Apply the Seller model seller_rdd = workflow.run_karma( escorts_rdd, github_base + '/datasets/ht/CDRv2/seller/ht-seller-model.ttl', "http://dig.isi.edu/ht/data/", "http://schema.dig.isi.edu/ontology/PersonOrOrganization1", context_url, numFramerPartitions) seller_rdd.persist(StorageLevel.MEMORY_AND_DISK) rdd_list.append(seller_rdd) print "seller model done" # After applying all the karma models on the datasets, we not reduce them so that we can # join on same uris and remove duplicates reduced_rdd = workflow.reduce_rdds(numFramerPartitions, *rdd_list) reduced_rdd = reduced_rdd.persist(StorageLevel.MEMORY_AND_DISK) reduced_rdd.setName("reduced_rdd") #6. Apply the Framer and contexts # We define types to be all Classes in the model that have a uri on which we would # like the framer to do the joins types = [{ "name": "AdultService", "uri": "http://schema.dig.isi.edu/ontology/AdultService" }, { "name": "EmailAddress", "uri": "http://schema.dig.isi.edu/ontology/EmailAddress" }, { "name": "GeoCoordinates",
workflow = Workflow(sc)
fileUtil = FileUtil(sc)

# Read input
inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

#1. Apply the first Karma model
outputRDD1 = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions)

#2. Apply the second Karma model
outputRDD2 = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model2.ttl",
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/WebPage1",
    "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
    num_partitions=numPartitions)

#3. Combine the data and apply the Karma JSON reducer
reducedRDD = workflow.reduce_rdds(numPartitions, outputRDD1, outputRDD2)

#4. Save the output
fileUtil.save_file(reducedRDD, outputFilename, "text", "json")
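# A script like the one above is typically launched through spark-submit with
# the workflow's Python dependencies shipped alongside it. A hypothetical
# invocation (the script name, zip name, and argument order are placeholders
# for whatever your deployment uses):
#
#   spark-submit --master "local[*]" --py-files python-lib.zip \
#       sampleWorkflow.py input.json text output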