Example #1
def testTokenizer(sc,
                  input_dir,
                  output_dir,
                  config,
                  limit=None,
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=kwargs.get("file_format"),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    #tokOptions = {"file_format": input_file_format,
    #             "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized,
                    output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type,
                    **outOptions)
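
A minimal call sketch for the function above, assuming an existing SparkContext sc; the directory paths and config filename are placeholders, not part of the original example:

# hypothetical usage: paths and config filename are assumptions
testTokenizer(sc,
              "hdfs:///data/tokenizer/input",
              "hdfs:///data/tokenizer/output",
              "tokenizer_config.json",
              file_format="sequence")  # read back inside via kwargs.get("file_format")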
Example #2
def testTokenizer(sc, input_dir, output_dir, config,
                  limit=None, 
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir, file_format=kwargs.get("file_format"),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    #tokOptions = {"file_format": input_file_format,
    #             "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized, output_dir, file_format=output_file_format, 
                    data_type=output_data_type, 
                    **outOptions)
Example #3
def testLSH(sc, inputFilename, outputFilename, configFilename,
            **kwargs):

    '''
    kwargs is a dictionary of inputs; a sample input would look like:
    options = {
               "file_format": "sequence",
               "data_type": "json",
               "numHashes": 100,
               "numItemsInBand": 10,
               "computeSimilarity": True,
               "threshold": 0.8,
               "base": "saam-city.json",
               "topk": 3,
               "candidatesName": "candidates"
               }
    '''
    futil = FileUtil(sc)


    #Tokenize
    ######################
    rdd_input = futil.load_file(inputFilename, file_format=kwargs['file_format'], data_type="json")
    rdd_input.setName('rdd_input')

    tokenizer = Tokenizer(configFilename, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_input)

    futil = FileUtil(sc)
    outOptions = {}

    #you can save the tokens file here by using
    #futil.save_file(rdd_tokenized,outputFilename,file_format='sequence',data_type='json',**outOptions)

    #Hashing
    #######################

    hasher = Hasher(**kwargs)
    rdd_minHashes = hasher.perform(rdd_tokenized)
    rdd_minHashes.setName('rdd_minhashes')
    #futil.save_file(rdd_minHashes,outputFilename,file_format='sequence',data_type='json',**outOptions)


    #clustering
    #########################
    clusterer = Clusterer(**kwargs)
    rdd_clusters = clusterer.perform(rdd_minHashes)
    #futil.save_file(rdd_clusters,outputFilename,file_format='text',data_type='json',**outOptions)

    #unionfind
    #########################
    unionFind = UnionFind(**kwargs)
    rdd_unionfind = unionFind.perform(rdd_clusters)

    # SAVE DATA
    futil.save_file(rdd_unionfind,outputFilename,file_format='text',data_type='json',**outOptions)
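
A call sketch built from the options documented in the docstring above; sc, the paths, and the config filename are placeholders:

# hypothetical usage: sc, paths, and config filename are assumptions
options = {"file_format": "sequence", "data_type": "json",
           "numHashes": 100, "numItemsInBand": 10,
           "computeSimilarity": True, "threshold": 0.8,
           "base": "saam-city.json", "topk": 3,
           "candidatesName": "candidates"}
testLSH(sc, "hdfs:///data/lsh/input", "hdfs:///data/lsh/output",
        "lsh_config.json", **options)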
Example #4
def load_jsonlines(sc,
                   input,
                   file_format='sequence',
                   data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input,
                          file_format=file_format,
                          data_type=data_type,
                          separator=separator)
    return rdd
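
Typical usage of this loader, with placeholder paths:

# hypothetical usage: sc and paths are assumptions
rdd = load_jsonlines(sc, "hdfs:///data/docs-seq")  # sequence file of JSON lines
rdd_text = load_jsonlines(sc, "hdfs:///data/docs.jsonl", file_format="text")  # tab-separated text file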
Example #5
def testSignature(sc,inputFileName,outputFileName,file_format):
    """
    :param sc: SparkContext needed for Spark
    :param inputFileName: input dir to which cluster_id needs to be added
    :param outputFileName: output dir
    :param file_format: text/sequence
    """
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,file_format=file_format,data_type="json")
    addClusterId = AddClusterId()
    rdd = addClusterId.perform(rdd)
    fUtil.save_file(rdd,outputFileName,file_format='text',data_type='json')
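
A call sketch matching the docstring above; sc and the directories are placeholders:

# hypothetical usage: directories are assumptions
testSignature(sc, "hdfs:///data/clusters/in", "hdfs:///data/clusters/out", "sequence")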
Example #6
def testSignature(sc, inputFileName, outputFileName, file_format):

    config = {"runlimit": 5, "field": "title_signature"}

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,
                          file_format=file_format,
                          data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
Example #7
def save_jsonlines(sc,
                   rdd,
                   output_dir,
                   file_format='sequence',
                   data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd,
                    output_dir,
                    file_format=file_format,
                    data_type=data_type,
                    separator=separator)
Example #8
def testSignature(sc,inputFileName,outputFileName,file_format):

    config = {
    "runlimit" : 5,
    "field" : "title_signature"
    }

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,file_format=file_format,data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd,outputFileName,file_format="text",data_type="json")
Example #9
def preprocess(sc, inputDir, file_format, outputDir):
    """
    Reads the offer file and creates the vertex RDD and edge RDD required for GraphX.
    The vertex RDD holds (node URI, type); the edge RDD holds (node A, node B, edge type).
    :param sc: SparkContext
    :param inputDir: input directory of offers
    :param file_format: text/sequence
    :param outputDir: output directory for the vertex and edge files
    :return: None
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir,
                                  file_format=file_format,
                                  data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    #rdd = vertexrdd.foreach(lambda (x,y):f(x,y))

    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))
    fileUtil.save_file(vertexrdd,
                       outputDir + 'vertex',
                       file_format='text',
                       data_type='json')
    fileUtil.save_file(edgerdd,
                       outputDir + 'edges',
                       file_format='text',
                       data_type='json')
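
A call sketch, assuming nodes_mapper and edges_mapper are defined in the same module; sc and the paths are placeholders:

# hypothetical usage: paths are assumptions
preprocess(sc, "hdfs:///data/offers", "sequence", "hdfs:///data/graph/")
# writes hdfs:///data/graph/vertex and hdfs:///data/graph/edges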
Example #10
def preprocess(sc,inputDir,file_format,outputDir):
    """
    Reads the offer file and creates the vertex RDD and edge RDD required for GraphX.
    The vertex RDD holds (node URI, type); the edge RDD holds (node A, node B, edge type).
    :param sc: SparkContext
    :param inputDir: input directory of offers
    :param file_format: text/sequence
    :param outputDir: output directory for the vertex and edge files
    :return: None
    """
    fileUtil = FileUtil(sc)
    inputrdd=fileUtil.load_file(inputDir,file_format=file_format,data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x:nodes_mapper(x))
    #rdd = vertexrdd.foreach(lambda (x,y):f(x,y))

    edgerdd = inputrdd.flatMapValues(lambda x : edges_mapper(x))
    fileUtil.save_file(vertexrdd,outputDir+'vertex',file_format='text',data_type='json')
    fileUtil.save_file(edgerdd,outputDir+'edges',file_format='text',data_type='json')
Example #11
    parser.add_option("-r", "--separator", dest="separator", type="string",
                      help="field separator", default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")
    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]
    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd =  workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(city_alternate_name_input)
    print input_alternate_city_rdd.first()

    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(inputRDD_partitioned,
                                   "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
                                   "http://dig.isi.edu/geonames",
                                   "http://schema.org/City1",
Example #12
from pyspark import SparkContext, StorageLevel
from pyspark.sql import HiveContext
from digSparkUtil.fileUtil import FileUtil
from py4j.java_gateway import java_import
from argparse import ArgumentParser
from effectWorkflow import EffectWorkflow

if __name__ == "__main__":
    sc = SparkContext()
    sqlContext = HiveContext(sc)

    java_import(sc._jvm, "edu.isi.karma")
    workflow = EffectWorkflow(sc, sqlContext)
    fileUtil = FileUtil(sc)

    parser = ArgumentParser()
    parser.add_argument("-i",
                        "--inputTable",
                        help="Input Table",
                        required=True)
    parser.add_argument("-o", "--output", help="Output Folder", required=True)
    parser.add_argument("-n",
                        "--partitions",
                        help="Number of partitions",
                        required=False,
                        default=20)
    parser.add_argument("-t",
                        "--outputtype",
                        help="Output file type - text or sequence",
                        required=False,
                        default="sequence")
Example #13
                      dest="threshold", default=0.0, help="similarity threshold")
    parser.add_option("-e", "--base", dest="base", type="string",
                      help="base file", default="")
    parser.add_option("-o", "--outputformat", dest="outputformat", type="string",
                      help="output file format: text/sequence", default="text")
    parser.add_option("-y", "--outputtype", dest="outputtype", type="string",
                      help="output type: csv/json", default="json")
    parser.add_option("-k", "--topk", dest="topk", type="int",
                      help="top n matches", default=3)
    parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string",
                        help="name for json element for matching candidates", default="candidates")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename = args[0]
    outputFilename = args[1]
    print "Save to:", outputFilename

    kwargs = as_dict(c_options)
    clusterer = Clusterer(**kwargs)
    fileUtil = FileUtil(sc)

    rdd = fileUtil.load_file(inputFilename,file_format='text')

    cluster_rdd = clusterer.compute_clusters(rdd)

    fileUtil.save_file(cluster_rdd,outputFilename,file_format='text')

Example #14
def save_jsonlines(sc, rdd, output_dir, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd, output_dir, file_format=file_format, data_type=data_type, separator=separator)
Example #15
def convert(sc,inputFileName,outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName,file_format="text",data_type="json")
    fileUtil.save_file(rdd,outputFileName)
Example #16
def __init__(self, config_filename, **p_options):
    self.options = as_dict(p_options)
    self.config = FileUtil.get_json_config(config_filename)
    print('In tokenizer')
Example #17
from pyspark import SparkContext, SparkConf, StorageLevel
from optparse import OptionParser
from digSparkUtil.fileUtil import FileUtil

if __name__ == '__main__':
    parser = OptionParser()

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    filename = args[0]
    file_format = args[1]
    out_filename = args[2]
    out_format = args[3]
    uris = args[4].split(",")

    print "Filename", filename, file_format
    print "Output:", out_filename, out_format
    print "Filter:", args[4]

    sc = SparkContext(appName="DIG-FILTER")
    conf = SparkConf()

    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_file(filename, file_format, "json")
    output_rdd = input_rdd.filter(lambda x: x[0] in uris).coalesce(1)
    fileUtil.save_file(output_rdd, out_filename, out_format, "json")
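
This script expects five positional arguments: input path, input format, output path, output format, and a comma-separated list of URIs to keep. A hypothetical submission, mirroring the spark-submit commands shown in later examples (the script name and paths are placeholders):

spark-submit --py-files ../lib/python-lib.zip filter_by_uri.py \
    hdfs:///data/input-seq sequence hdfs:///data/filtered text \
    "uri1,uri2"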
Example #18
        try:
            f_dict["schema:name"] = x['label']
        except KeyError:
            pass
        return f_dict
        

    sc = SparkContext(appName="TEST")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]


    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)
    contextUrl = "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json"

    #1. Read the input

    #test big file
    inputRDD = workflow.batch_read_csv(inputFilename).partitionBy(1)

    #test small file
    # inputRDD = workflow.batch_read_csv(inputFilename)


    #2. Apply the karma Model
    outputRDD = workflow.run_karma(inputRDD,
                                   "https://raw.githubusercontent.com/american-art/autry/master/AutryMakers/AutryMakers-model.ttl",
Example #19
    outputFilename = argv[4]
    loadelasticsearch = argv[5] == "True"

    es_server = argv[6]
    es_port = argv[7]
    es_index = argv[8]

    #After applying karma, we would like to reduce the number of partitions
    numFramerPartitions = max(10, numPartitions / 10)

    github_base = 'https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0'
    context_url = github_base + '/karma/karma-context.json'

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    rdd_list = list()

    #Read the input data
    escorts_rdd = inputRDD = fileUtil.load_file(
        inputFilename, inputType, "json").partitionBy(numPartitions)
    escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

    # Apply the main model
    main_rdd = workflow.run_karma(
        escorts_rdd, github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
        "http://dig.isi.edu/ht/data/", "http://schema.org/Offer1", context_url,
        numFramerPartitions
    )  #Partition the output data by numFramerPartitions and the rest
    #of the workflow works with the same number of partitions
Example #20
        pass

    return f_dict

if __name__ == "__main__":

    sc = SparkContext(appName="TEST")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1000
    numFramerPartitions = max(10, numPartitions / 10)

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)
    contextUrl = "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json"

    #1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    #2. Apply the karma Model
    outputRDD = workflow.run_karma(inputRDD,
                                   "https://raw.githubusercontent.com/american-art/npg/master/NPGConstituents/NPGConstituents-model.ttl",
                                   "http://americanartcollaborative.org/npg/",
                                   "http://www.cidoc-crm.org/cidoc-crm/E39_Actor1",
                                   "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json",
                                   num_partitions=numPartitions,
                                   data_type="csv",
                                   additional_settings={"karma.input.delimiter":","})
Example #21
            if middlename != "":
                name_string += " " + middlename
            f_dict["schema:name"] = name_string
        except KeyError:
            pass

        return f_dict

    sc = SparkContext(appName="TEST")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)
    contextUrl = "https://raw.githubusercontent.com/american-art/aac-alignment/master/karma-context.json"

    #1. Read the input

    #test big file
    # inputRDD = workflow.batch_read_csv(inputFilename).partitionBy(1000)

    #test small file
    inputRDD = workflow.batch_read_csv(inputFilename)

    #2. Apply the karma Model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/american-art/npg/master/NPGConstituents/NPGConstituents-model.ttl",
def load_jsonlines(sc, input, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input, file_format=file_format, data_type=data_type, separator=separator)
    return rdd
Example #23
    parser.add_argument("-r",
                        "--branch",
                        help="Branch to pull models and frames from",
                        required=False,
                        default="master")
    parser.add_argument(
        "-s",
        "--since",
        help="Get data since a timestamp - format: %Y-%m-%dT%H:%M:%S%Z",
        default="",
        required=False)

    args = parser.parse_args()
    print("Got arguments:", args)

    fileUtil = FileUtil(sc)
    hdfs_client = Client(args.hdfsManager)  #Config().get_client('dev')
    #sc._jsc.hadoopConfiguration()
    workflow = EffectWorkflow(sc, sqlContext, hdfs_client)

    inputTable = args.inputTable.strip()
    outputFilename = args.output.strip()
    outputFileType = args.outputtype.strip()
    hiveQuery = args.query.strip()
    isIncremental = args.incremental

    since = args.since.strip()
    if since == "initial":
        since = ""
    if len(since) > 0:
        timestamp = DateUtil.unix_timestamp(since,
Example #24
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName,
                             file_format="text",
                             data_type="json")
    fileUtil.save_file(rdd, outputFileName)
Example #25
def __init__(self, config_filename, **p_options):
    self.options = as_dict(p_options)
    self.config = FileUtil.get_json_config(config_filename)
    print('In tokenizer')
Example #26
    for object in list(tuple[1]):
        if 'seller_uri' in object:
            yield key + "\t" + object['seller_uri']
        if 'phones' in object:
            for phone in object['phones']:
                yield key + "\t" + phone
        if 'emails' in object:
            for email in object['emails']:
                yield key + "\t" + email



if __name__ == '__main__':
    inputFileName = sys.argv[1]
    file_type = sys.argv[2]
    outputFileName = sys.argv[3]

    sc = SparkContext(appName="Generate Data for Max Cliques")
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_file(inputFileName,file_format=file_type)
    # input_rdd = input_rdd.sample(False, 0.5, seed=1234)
    # input_rdd.foreach(lambda x : f1(x))
    intermediate_rdd = input_rdd.flatMap(lambda x : helper(x))
    #fileUtil.save_file(intermediate_rdd,"intermediate_1",file_format="text")
    intermediate_rdd = intermediate_rdd.groupByKey()

    #output_rdd.foreach(lambda x : f(x))
    intermediate_rdd = intermediate_rdd.flatMap(lambda x : wrapper(x))

    intermediate_rdd = intermediate_rdd.coalesce(1)
    intermediate_rdd.saveAsTextFile(outputFileName)
Example #27
                      help="field separator",
                      default="\t")

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    java_import(sc._jvm, "edu.isi.karma")
    inputFilename = args[0]
    input_country = args[1]
    outputFilename = args[2]
    city_alternate_name_input = args[3]
    city_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_context.json"
    state_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/state_context.json"
    country_context = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/country_context.json"

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # 1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)
    input_country_rdd = workflow.batch_read_csv(input_country)
    input_alternate_city_rdd = workflow.batch_read_csv(
        city_alternate_name_input)
    print input_alternate_city_rdd.first()

    inputRDD_partitioned = inputRDD.partitionBy(10)

    #2. Apply the karma Model
    cityRDD1 = workflow.run_karma(
        inputRDD_partitioned,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/datasets/geonames/allCountries/city_model.ttl",
Example #28
      --archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaContextWorkflow.py \
      ../sample-data/sample-unicode-jsonld.json text \
      https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
      ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
Example #29
    --py-files /home/hadoop/effect-workflows/lib/python-lib.zip \
    --archives /home/hadoop/effect-workflows/karma.zip \
    /home/hadoop/effect-workflows/effectWorkflow.py \
    cdr hdfs://ip-172-31-19-102/user/effect/data/cdr-framed sequence 10
'''

context_url = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0/karma/karma-context.json"
base_uri = "http://effect.isi.edu/data/"

if __name__ == "__main__":
    sc = SparkContext()
    conf = SparkConf()

    java_import(sc._jvm, "edu.isi.karma")
    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    parser = ArgumentParser()
    parser.add_argument("-i", "--input", help="input folder", required=True)
    parser.add_argument("-o", "--output", help="input folder", required=True)
    parser.add_argument("-n",
                        "--partitions",
                        help="Number of partitions",
                        required=False,
                        default=20)
    parser.add_argument("-t",
                        "--host",
                        help="ES hostname",
                        default="localhost",
                        required=False)
    parser.add_argument("-p",
Example #30
      --archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaWorkflowCSV.py ../sample-data/sample-unicode.txt ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaCSV")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    #2. Apply the karma Model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={
Example #31
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaWorkflow.py ../sample-data/part-00002-seq sequence json 1 ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karma")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                                  inputDataType).partitionBy(numPartitions)

    #2. Apply the karma Model
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,