def testTokenizer(sc, input_dir, output_dir, config, limit=None,
                  input_file_format="sequence", input_data_type="json",
                  output_file_format="sequence", output_data_type="json",
                  **kwargs):
    print(limit)
    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=input_file_format,
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    # TOKENIZE
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    futil.save_file(rdd_tokenized, output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type)
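# A minimal invocation sketch (not from the original): assumes a local
# SparkContext and a Tokenizer-style config dict; the config contents and
# paths below are illustrative placeholders, not the real options.
from pyspark import SparkContext

sc = SparkContext(appName="testTokenizer")
config = {"fields": ["title", "description"]}  # hypothetical tokenizer options
testTokenizer(sc, "data/input", "data/tokenized", config,
              input_file_format="sequence", output_file_format="text")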
def preprocess(sc, inputDir, file_format, outputDir):
    """
    Read the offer file and create the vertex and edge RDDs required by GraphX:
    the vertex RDD holds (node uri, type) pairs and the edge RDD holds
    (node a, node b, edge type) triples.
    :param inputDir:
    :param file_format:
    :param outputDir:
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir, file_format=file_format, data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))
    fileUtil.save_file(vertexrdd, outputDir + 'vertex', file_format='text', data_type='json')
    fileUtil.save_file(edgerdd, outputDir + 'edges', file_format='text', data_type='json')
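# nodes_mapper and edges_mapper are referenced above but not shown. A hedged
# sketch of what they might look like, inferred only from the docstring
# ("node uri and type" / "node a, node b, edge type"); every field name here
# is an assumption.
def nodes_mapper(doc):
    # emit (uri, type) pairs for the node described by the JSON document
    uri = doc.get("uri")
    if uri:
        yield (uri, doc.get("a", "Offer"))

def edges_mapper(doc):
    # emit (source uri, target uri, edge label) triples for nested objects
    source = doc.get("uri")
    for field, value in doc.items():
        if isinstance(value, dict) and "uri" in value:
            yield (source, value["uri"], field)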
def load_jsonlines(sc, input, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input, file_format=file_format,
                          data_type=data_type, separator=separator)
    return rdd
def save_jsonlines(sc, rdd, output_dir, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd, output_dir, file_format=file_format,
                    data_type=data_type, separator=separator)
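# The two wrappers above pair naturally into a load/transform/save round trip.
# A usage sketch, assuming a live SparkContext `sc`, that load_file yields
# (key, parsed-json) pairs, and illustrative paths:
rdd = load_jsonlines(sc, "data/ads", file_format="sequence")
titled = rdd.filter(lambda kv: "title" in kv[1])  # keep docs that have a title
save_jsonlines(sc, titled, "data/ads-titled", file_format="text")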
def testSignature(sc, inputFileName, outputFileName, file_format):
    config = {"runlimit": 5,
              "field": "title_signature"}
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format, data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)
    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
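# Driver-style invocation sketch (not from the original); assumes a live
# SparkContext and placeholder file names.
sc = SparkContext(appName="testSignature")
testSignature(sc, "data/offers-input", "data/offers-signed", "sequence")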
'''
spark-submit \
    --archives ../karma.zip \
    --py-files ../lib/python-lib.zip \
    --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
    karmaWorkflowCSV.py ../sample-data/sample-unicode.txt ../sample-data/output
'''

from sys import argv
from pyspark import SparkContext
from py4j.java_gateway import java_import
# FileUtil and Workflow imports are assumed to follow the dig helper packages
# used by the other snippets

if __name__ == "__main__":
    sc = SparkContext(appName="karmaCSV")
    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    # 2. Apply the Karma model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={
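        # the snippet is truncated inside additional_settings; a hedged sketch
        # of how such a workflow typically finishes, reusing the FileUtil API
        # from the snippets above (assumed continuation, not the original code):
        #     })
        # fileUtil.save_file(outputRDD, outputFilename,
        #                    file_format="sequence", data_type="json")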
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName, file_format="text", data_type="json")
    fileUtil.save_file(rdd, outputFileName)
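# Example invocation (not from the original); assumes a live SparkContext and
# illustrative paths. save_file is called with its defaults here, so the
# output format is whatever FileUtil defaults to.
sc = SparkContext(appName="convert")
convert(sc, "data/docs-text", "data/docs-converted")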