def testTokenizer(sc, input_dir, output_dir, config, limit=None,
                  input_file_format="sequence", input_data_type="json",
                  output_file_format="sequence", output_data_type="json",
                  **kwargs):
    print(limit)
    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=input_file_format,
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    # TOKENIZE
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized, output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type,
                    **outOptions)
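# A minimal sketch of invoking testTokenizer above. The app name, the
# directories, and the config path are illustrative placeholders, not
# from the original source; per Tokenizer.__init__ below, config is
# expected to be a JSON config filename.
from pyspark import SparkContext

sc = SparkContext(appName="tokenizer-test")
testTokenizer(sc, "data/input-seq", "data/tokens-out",
              "tokenizer-config.json",
              input_file_format="sequence", input_data_type="json")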
def testLSH(sc, inputFilename, outputFilename, configFilename, **kwargs):
    '''
    kwargs is a dictionary of inputs; a sample input would look like
    options = {"file_format": "sequence",
               "data_type": "json",
               "numHashes": 100,
               "numItemsInBand": 10,
               "computeSimilarity": True,
               "threshold": 0.8,
               "base": "saam-city.json",
               "topk": 3,
               "candidatesName": "candidates"}
    '''
    futil = FileUtil(sc)

    # Tokenize
    rdd_input = futil.load_file(inputFilename,
                                file_format=kwargs['file_format'],
                                data_type="json")
    rdd_input.setName('rdd_input')
    tokenizer = Tokenizer(configFilename, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_input)

    outOptions = {}
    # You can save the tokens file here by using:
    # futil.save_file(rdd_tokenized, outputFilename, file_format='sequence',
    #                 data_type='json', **outOptions)

    # Hashing
    hasher = Hasher(**kwargs)
    rdd_minHashes = hasher.perform(rdd_tokenized)
    rdd_minHashes.setName('rdd_minhashes')
    # futil.save_file(rdd_minHashes, outputFilename, file_format='sequence',
    #                 data_type='json', **outOptions)

    # Clustering
    clusterer = Clusterer(**kwargs)
    rdd_clusters = clusterer.perform(rdd_minHashes)
    # futil.save_file(rdd_clusters, outputFilename, file_format='text',
    #                 data_type='json', **outOptions)

    # Union-find
    unionFind = UnionFind(**kwargs)
    rdd_unionfind = unionFind.perform(rdd_clusters)

    # SAVE DATA
    futil.save_file(rdd_unionfind, outputFilename,
                    file_format='text', data_type='json', **outOptions)
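# Sketch: wiring the sample options from the docstring above into
# testLSH. The input/output paths, config filename, and app name are
# placeholders.
from pyspark import SparkContext

sc = SparkContext(appName="lsh-test")
options = {"file_format": "sequence",
           "data_type": "json",
           "numHashes": 100,
           "numItemsInBand": 10,
           "computeSimilarity": True,
           "threshold": 0.8,
           "base": "saam-city.json",
           "topk": 3,
           "candidatesName": "candidates"}
testLSH(sc, "data/input-seq", "data/lsh-clusters",
        "tokenizer-config.json", **options)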
def load_jsonlines(sc, input, file_format='sequence', data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input, file_format=file_format,
                          data_type=data_type, separator=separator)
    return rdd
def testSignature(sc, inputFileName, outputFileName, file_format):
    """
    :param sc: the SparkContext needed for Spark
    :param inputFileName: input dir to which cluster_id needs to be added
    :param outputFileName: output dir
    :param file_format: text/sequence
    """
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format,
                          data_type="json")
    addClusterId = AddClusterId()
    rdd = addClusterId.perform(rdd)
    fUtil.save_file(rdd, outputFileName, file_format='text', data_type='json')
def testSignature(sc, inputFileName, outputFileName, file_format):
    config = {"runlimit": 5, "field": "title_signature"}
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format,
                          data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)
    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
def save_jsonlines(sc, rdd, output_dir, file_format='sequence',
                   data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd, output_dir, file_format=file_format,
                    data_type=data_type, separator=separator)
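# Round-trip sketch for the two helpers above: load a JSON-lines
# sequence file, apply a per-document transform, and save the result.
# The paths and the pass-through transform are placeholders.
from pyspark import SparkContext

sc = SparkContext(appName="jsonlines-roundtrip")
rdd = load_jsonlines(sc, "data/input-seq", file_format="sequence")
rdd = rdd.mapValues(lambda doc: doc)  # substitute a real transformation
save_jsonlines(sc, rdd, "data/output-seq")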
def preprocess(sc, inputDir, file_format, outputDir):
    """
    Reads the offer file and creates the vertex and edge RDDs required
    for GraphX: the vertex RDD holds (node uri, type) and the edge RDD
    holds (node a, node b, edge type).
    :param inputDir:
    :param file_format:
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir, file_format=file_format,
                                  data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))
    fileUtil.save_file(vertexrdd, outputDir + 'vertex',
                       file_format='text', data_type='json')
    fileUtil.save_file(edgerdd, outputDir + 'edges',
                       file_format='text', data_type='json')
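# Hypothetical stand-ins for the nodes_mapper/edges_mapper generators
# that preprocess() above assumes. Only the output shapes come from the
# docstring ((node uri, type) and (node a, node b, edge type)); the
# field names and the 'sells' edge type are invented for illustration.
def nodes_mapper(doc):
    # emit one (uri, type) record per node in the offer document
    yield {"uri": doc.get("uri"), "type": "Offer"}

def edges_mapper(doc):
    # emit one (node a, node b, edge type) record per relation
    if "seller_uri" in doc:
        yield {"a": doc.get("uri"), "b": doc["seller_uri"], "type": "sells"}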
parser.add_option("-r", "--separator", dest="separator", type="string", help="field separator", default="\t") (c_options, args) = parser.parse_args() print "Got options:", c_options java_import(sc._jvm, "edu.isi.karma") inputFilename = args[0] input_country = args[1] outputFilename = args[2] city_alternate_name_input = args[3] city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json" state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json" country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json" fileUtil = FileUtil(sc) workflow = Workflow(sc) # 1. Read the input inputRDD = workflow.batch_read_csv(inputFilename) input_country_rdd = workflow.batch_read_csv(input_country) input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input) print input_alternate_city_rdd.first() inputRDD_partitioned = inputRDD.partitionBy(10) #2. Apply the karma Model cityRDD1 = workflow.run_karma(inputRDD_partitioned, "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl", "http://dig.isi.edu/geonames", "http://schema.org/City1",
from pyspark import SparkContext, StorageLevel
from pyspark.sql import HiveContext
from digSparkUtil.fileUtil import FileUtil
from py4j.java_gateway import java_import
from argparse import ArgumentParser
from effectWorkflow import EffectWorkflow

if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = HiveContext(sc)
    java_import(sc._jvm, "edu.isi.karma")

    workflow = EffectWorkflow(sc, sqlContext)
    fileUtil = FileUtil(sc)

    parser = ArgumentParser()
    parser.add_argument("-i", "--inputTable", help="Input Table", required=True)
    parser.add_argument("-o", "--output", help="Output Folder", required=True)
    parser.add_argument("-n", "--partitions", help="Number of partitions",
                        required=False, default=20)
    parser.add_argument("-t", "--outputtype",
                        help="Output file type - text or sequence",
                        required=False, default="sequence")
dest="threshold", default=0.0, help="similarity threshold") parser.add_option("-e", "--base", dest="base", type="string", help="base file", default="") parser.add_option("-o", "--outputformat", dest="outputformat", type="string", help="output file format: text/sequence", default="text") parser.add_option("-y", "--outputtype", dest="outputtype", type="string", help="output type: csv/json", default="json") parser.add_option("-k", "--topk", dest="topk", type="int", help="top n matches", default=3) parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int", help="number of partitions", default=10) parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string", help="name for json element for matching candidates", default="candidates") (c_options, args) = parser.parse_args() print "Got options:", c_options inputFilename = args[0] outputFilename = args[1] print "Save to:", outputFilename kwargs = as_dict(c_options) clusterer = Clusterer(**kwargs) fileUtil = FileUtil(sc) rdd = fileUtil.load_file(inputFilename,file_format='text') cluster_rdd = clusterer.compute_clusters(rdd) fileUtil.save_file(cluster_rdd,outputFilename,file_format='text')
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName, file_format="text",
                             data_type="json")
    fileUtil.save_file(rdd, outputFileName)
def __init__(self, config_filename, **p_options):
    self.options = as_dict(p_options)
    self.config = FileUtil.get_json_config(config_filename)
    print('In tokenizer')
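# Sketch of constructing the tokenizer above, matching the
# Tokenizer(configFilename, **kwargs) calls in the workflows earlier in
# this section. The config path and options are placeholders.
tokenizer = Tokenizer("tokenizer-config.json",
                      file_format="sequence", data_type="json")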
from pyspark import SparkContext, SparkConf, StorageLevel
from optparse import OptionParser
from digSparkUtil.fileUtil import FileUtil

if __name__ == '__main__':
    parser = OptionParser()
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    filename = args[0]
    file_format = args[1]
    out_filename = args[2]
    out_format = args[3]
    uris = args[4].split(",")

    print "Filename", filename, file_format
    print "Output:", out_filename, out_format
    print "Filter:", args[4]

    sc = SparkContext(appName="DIG-FILTER")
    conf = SparkConf()
    fileUtil = FileUtil(sc)

    input_rdd = fileUtil.load_file(filename, file_format, "json")
    output_rdd = input_rdd.filter(lambda x: x[0] in uris).coalesce(1)
    fileUtil.save_file(output_rdd, out_filename, out_format, "json")
    try:
        f_dict["schema:name"] = x['label']
    except KeyError:
        pass
    return f_dict

sc = SparkContext(appName="TEST")
java_import(sc._jvm, "edu.isi.karma")

inputFilename = argv[1]
outputFilename = argv[2]

fileUtil = FileUtil(sc)
workflow = Workflow(sc)
contextUrl = "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json"

# 1. Read the input
# test big file
inputRDD = workflow.batch_read_csv(inputFilename).partitionBy(1)
# test small file
# inputRDD = workflow.batch_read_csv(inputFilename)

# 2. Apply the Karma model
outputRDD = workflow.run_karma(
    inputRDD,
    "https://raw.githubusercontent.com/american-art/autry/master/AutryMakers/AutryMakers-model.ttl",
outputFilename = argv[4]
loadelasticsearch = argv[5] == "True"
es_server = argv[6]
es_port = argv[7]
es_index = argv[8]

# After applying Karma, reduce the number of partitions
numFramerPartitions = max(10, numPartitions / 10)

github_base = 'https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0'
context_url = github_base + '/karma/karma-context.json'

workflow = Workflow(sc)
fileUtil = FileUtil(sc)
rdd_list = list()

# Read the input data
escorts_rdd = inputRDD = fileUtil.load_file(
    inputFilename, inputType, "json").partitionBy(numPartitions)
escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

# Apply the main model
main_rdd = workflow.run_karma(
    escorts_rdd,
    github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
    "http://dig.isi.edu/ht/data/",
    "http://schema.org/Offer1",
    context_url,
    numFramerPartitions
)

# Partition the output data by numFramerPartitions; the rest of the
# workflow works with the same number of partitions
        pass
    return f_dict

if __name__ == "__main__":
    sc = SparkContext(appName="TEST")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1000
    numFramerPartitions = max(10, numPartitions / 10)

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)
    contextUrl = "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json"

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    # 2. Apply the Karma model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/american-art/npg/master/NPGConstituents/NPGConstituents-model.ttl",
        "http://americanartcollaborative.org/npg/",
        "http://www.cidoc-crm.org/cidoc-crm/E39_Actor1",
        "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={"karma.input.delimiter": ","})
if middlename != "": name_string += " " + middlename f_dict["schema:name"] = name_string except KeyError: pass return f_dict sc = SparkContext(appName="TEST") java_import(sc._jvm, "edu.isi.karma") inputFilename = argv[1] outputFilename = argv[2] fileUtil = FileUtil(sc) workflow = Workflow(sc) contextUrl = "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json" #1. Read the input #test big file # inputRDD = workflow.batch_read_csv(inputFilename).partitionBy(1000) #test small file inputRDD = workflow.batch_read_csv(inputFilename) #2. Apply the karma Model outputRDD = workflow.run_karma( inputRDD, "https://raw.githubusercontent.com/american-art/npg/master/NPGConstituents/NPGConstituents-model.ttl",
parser.add_argument("-r", "--branch", help="Branch to pull models and frames from", required=False, default="master") parser.add_argument( "-s", "--since", help="Get data since a timestamp - format: %Y-%m-%dT%H:%M:%S%Z", default="", required=False) args = parser.parse_args() print("Got arguments:", args) fileUtil = FileUtil(sc) hdfs_client = Client(args.hdfsManager) #Config().get_client('dev') #sc._jsc.hadoopConfiguration() workflow = EffectWorkflow(sc, sqlContext, hdfs_client) inputTable = args.inputTable.strip() outputFilename = args.output.strip() outputFileType = args.outputtype.strip() hiveQuery = args.query.strip() isIncremental = args.incremental since = args.since.strip() if since == "initial": since = "" if len(since) > 0: timestamp = DateUtil.unix_timestamp(since,
    for object in list(tuple[1]):
        if 'seller_uri' in object:
            yield key + "\t" + object['seller_uri']
        if 'phones' in object:
            for phone in object['phones']:
                yield key + "\t" + phone
        if 'emails' in object:
            for email in object['emails']:
                yield key + "\t" + email

if __name__ == '__main__':
    inputFileName = sys.argv[1]
    file_type = sys.argv[2]
    outputFileName = sys.argv[3]

    sc = SparkContext(appName="Generate Data for Max Cliques")
    fileUtil = FileUtil(sc)

    input_rdd = fileUtil.load_file(inputFileName, file_format=file_type)
    intermediate_rdd = input_rdd.flatMap(lambda x: helper(x))
    intermediate_rdd = intermediate_rdd.groupByKey()
    intermediate_rdd = intermediate_rdd.flatMap(lambda x: wrapper(x))
    intermediate_rdd = intermediate_rdd.coalesce(1)
    intermediate_rdd.saveAsTextFile(outputFileName)
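# For reference, the tab-separated pairs the generator above would emit
# for a hypothetical record of shape (key, [objects]); the values are
# invented for illustration:
#
#   record = ("ad:123", [{"seller_uri": "seller:9",
#                         "phones": ["555-0100"],
#                         "emails": ["a@b.com"]}])
#
# emits:
#   ad:123\tseller:9
#   ad:123\t555-0100
#   ad:123\ta@b.com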
help="field separator", default="\t") (c_options, args) = parser.parse_args() print "Got options:", c_options java_import(sc._jvm, "edu.isi.karma") inputFilename = args[0] input_country = args[1] outputFilename = args[2] city_alternate_name_input = args[3] city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json" state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json" country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json" fileUtil = FileUtil(sc) workflow = Workflow(sc) # 1. Read the input inputRDD = workflow.batch_read_csv(inputFilename) input_country_rdd = workflow.batch_read_csv(input_country) input_alternate_city_rdd = workflow.batch_read_csv( city_alternate_name_input) print input_alternate_city_rdd.first() inputRDD_partitioned = inputRDD.partitionBy(10) #2. Apply the karma Model cityRDD1 = workflow.run_karma( inputRDD_partitioned, "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
    --archives ../karma.zip \
    --py-files ../lib/python-lib.zip \
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaContextWorkflow.py \
    ../sample-data/sample-unicode-jsonld.json text \
    https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
    ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
    --py-files /home/hadoop/effect-workflows/lib/python-lib.zip \
    --archives /home/hadoop/effect-workflows/karma.zip \
    /home/hadoop/effect-workflows/effectWorkflow.py \
    cdr hdfs://ip-172-31-19-102/user/effect/data/cdr-framed sequence 10
'''

context_url = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json"
base_uri = "http://effect.isi.edu/data/"

if __name__ == "__main__":
    sc = SparkContext()
    conf = SparkConf()
    java_import(sc._jvm, "edu.isi.karma")

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    parser = ArgumentParser()
    parser.add_argument("-i", "--input", help="input folder", required=True)
    parser.add_argument("-o", "--output", help="output folder", required=True)
    parser.add_argument("-n", "--partitions", help="Number of partitions",
                        required=False, default=20)
    parser.add_argument("-t", "--host", help="ES hostname",
                        default="localhost", required=False)
    parser.add_argument("-p",
    --archives ../karma.zip \
    --py-files ../lib/python-lib.zip \
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaWorkflowCSV.py ../sample-data/sample-unicode.txt ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaCSV")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    # 2. Apply the Karma model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaWorkflow.py ../sample-data/part-00002-seq sequence json 1 ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karma")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                                  inputDataType).partitionBy(numPartitions)

    # 2. Apply the Karma model
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,