Example #1
def preprocess(sc, inputDir, file_format, outputDir):
    """
    this method just reads the offer file and creates vertexrdd and edgerdd required for graphx
    vertexrdd will be node uri and type
    edgesrdd will be node a,node b,edge type
    :param inputDir:
    :param file_format:
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir,
                                  file_format=file_format,
                                  data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    #rdd = vertexrdd.foreach(lambda (x,y):f(x,y))

    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))
    fileUtil.save_file(vertexrdd,
                       outputDir + 'vertex',
                       file_format='text',
                       data_type='json')
    fileUtil.save_file(edgerdd,
                       outputDir + 'edges',
                       file_format='text',
                       data_type='json')
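
The nodes_mapper and edges_mapper helpers referenced above are not shown in this example. A minimal sketch of what they might look like, assuming each JSON value carries its URI under 'uri', its type under 'a', and related URIs under 'mainEntity' (all of these field names are assumptions, not taken from the source):

def nodes_mapper(doc):
    # yield (node URI, node type) pairs for one JSON value;
    # 'uri' and 'a' are assumed field names
    yield (doc.get('uri'), doc.get('a'))

def edges_mapper(doc):
    # yield (node A, node B, edge type) triples for one JSON value;
    # 'mainEntity' is an assumed field holding related URIs
    for related in doc.get('mainEntity', []):
        yield (doc.get('uri'), related, 'mainEntity')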
Example #2
def testTokenizer(sc,
                  input_dir,
                  output_dir,
                  config,
                  limit=None,
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=kwargs.get("file_format"),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    #tokOptions = {"file_format": input_file_format,
    #             "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized,
                    output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type,
                    **outOptions)
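
A hypothetical driver for the function above; the directories, the config file name, and the appName are placeholders, and it assumes Tokenizer tolerates the extra keyword passed through kwargs:

from pyspark import SparkContext

sc = SparkContext(appName="testTokenizer")
# file_format is read back inside testTokenizer via kwargs.get("file_format")
testTokenizer(sc,
              "input_dir/",
              "output_dir/",
              "tokenizer_config.json",
              file_format="sequence")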
Example #3
def testTokenizer(sc, input_dir, output_dir, config,
                  limit=None, 
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir, file_format=kwargs.get("file_format"),
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    ## TOKENIZE
    #tokOptions = {"file_format": input_file_format,
    #              "data_type": input_data_type}
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized, output_dir, file_format=output_file_format, 
                    data_type=output_data_type, 
                    **outOptions)
Example #4
def testLSH(sc, inputFilename, outputFilename, configFilename, **kwargs):

    '''
    kwargs is a dictionary of options; a sample input would look like:
    options = {
               "file_format": "sequence",
               "data_type": "json",
               "numHashes": 100,
               "numItemsInBand": 10,
               "computeSimilarity": True,
               "threshold": 0.8,
               "base": "saam-city.json",
               "topk": 3,
               "candidatesName": "candidates"
               }
    '''
    futil = FileUtil(sc)


    #Tokenize
    ######################
    rdd_input = futil.load_file(inputFilename, file_format=kwargs['file_format'], data_type="json")
    rdd_input.setName('rdd_input')

    tokenizer = Tokenizer(configFilename, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_input)

    outOptions = {}

    #you can save the tokens file here by using
    #futil.save_file(rdd_tokenized,outputFilename,file_format='sequence',data_type='json',**outOptions)

    #Hashing
    #######################

    hasher = Hasher(**kwargs)
    rdd_minHashes = hasher.perform(rdd_tokenized)
    rdd_minHashes.setName('rdd_minhashes')
    #futil.save_file(rdd_minHashes,outputFilename,file_format='sequence',data_type='json',**outOptions)


    #clustering
    #########################
    clusterer = Clusterer(**kwargs)
    rdd_clusters = clusterer.perform(rdd_minHashes)
    #futil.save_file(rdd_clusters,outputFilename,file_format='text',data_type='json',**outOptions)

    #unionfind
    #########################
    unionFind = UnionFind(**kwargs)
    rdd_unionfind = unionFind.perform(rdd_clusters)

    # SAVE DATA
    futil.save_file(rdd_unionfind, outputFilename, file_format='text', data_type='json', **outOptions)
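
A hypothetical driver for testLSH, reusing the options dictionary from the docstring; the file names and appName are placeholders:

from pyspark import SparkContext

sc = SparkContext(appName="testLSH")
options = {
    "file_format": "sequence",
    "data_type": "json",
    "numHashes": 100,
    "numItemsInBand": 10,
    "computeSimilarity": True,
    "threshold": 0.8,
    "base": "saam-city.json",
    "topk": 3,
    "candidatesName": "candidates"
}
# input, output, and config file names are placeholders
testLSH(sc, "input_tokens.seq", "lsh_output", "lsh_config.json", **options)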
Example #5
def load_jsonlines(sc,
                   input,
                   file_format='sequence',
                   data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input,
                          file_format=file_format,
                          data_type=data_type,
                          separator=separator)
    return rdd
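
For example (hypothetical call; the input path is a placeholder):

rdd = load_jsonlines(sc, "hdfs:///data/offers.seq")
print(rdd.count())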
Example #6
def testSignature(sc, inputFileName, outputFileName, file_format):

    config = {"runlimit": 5, "field": "title_signature"}

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,
                          file_format=file_format,
                          data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
Example #7
def testSignature(sc, inputFileName, outputFileName, file_format):
    """
    :param sc: SparkContext required by Spark
    :param inputFileName: input directory to which the cluster id is added
    :param outputFileName: output directory
    :param file_format: text/sequence
    """
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format, data_type="json")
    addClusterId = AddClusterId()
    rdd = addClusterId.perform(rdd)
    fUtil.save_file(rdd, outputFileName, file_format='text', data_type='json')
Example #8
def testSignature(sc, inputFileName, outputFileName, file_format):

    config = {
        "runlimit": 5,
        "field": "title_signature"
    }

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName, file_format=file_format, data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
Example #9
def preprocess(sc, inputDir, file_format, outputDir):
    """
    Read the offer file and create the vertex and edge RDDs required by GraphX.
    The vertex RDD holds (node URI, node type); the edge RDD holds (node A, node B, edge type).
    :param inputDir: input directory
    :param file_format: text/sequence
    :param outputDir: output directory
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir, file_format=file_format, data_type='json')
    vertexrdd = inputrdd.flatMapValues(lambda x: nodes_mapper(x))
    #rdd = vertexrdd.foreach(lambda (x,y):f(x,y))

    edgerdd = inputrdd.flatMapValues(lambda x: edges_mapper(x))
    fileUtil.save_file(vertexrdd, outputDir + 'vertex', file_format='text', data_type='json')
    fileUtil.save_file(edgerdd, outputDir + 'edges', file_format='text', data_type='json')


def helper(tuple):
    # hypothetical signature: assumes each record is a (key, values) pair
    key = tuple[0]
    for object in list(tuple[1]):
        if 'seller_uri' in object:
            yield key + "\t" + object['seller_uri']
        if 'phones' in object:
            for phone in object['phones']:
                yield key + "\t" + phone
        if 'emails' in object:
            for email in object['emails']:
                yield key + "\t" + email



if __name__ == '__main__':
    inputFileName = sys.argv[1]
    file_type = sys.argv[2]
    outputFileName = sys.argv[3]

    sc = SparkContext(appName="Generate Data for Max Cliques")
    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_file(inputFileName, file_format=file_type)
    # input_rdd = input_rdd.sample(False, 0.5, seed=1234)
    # input_rdd.foreach(lambda x: f1(x))
    intermediate_rdd = input_rdd.flatMap(lambda x: helper(x))
    #fileUtil.save_file(intermediate_rdd, "intermediate_1", file_format="text")
    intermediate_rdd = intermediate_rdd.groupByKey()

    #output_rdd.foreach(lambda x: f(x))
    intermediate_rdd = intermediate_rdd.flatMap(lambda x: wrapper(x))

    intermediate_rdd = intermediate_rdd.coalesce(1)
    intermediate_rdd.saveAsTextFile(outputFileName)
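
The script above expects three positional arguments: input file, file format, and output directory. A hypothetical submission, following the pattern shown in Example #16 (the script and zip names are placeholders):

    spark-submit \
      --py-files ../lib/python-lib.zip \
      generateMaxCliqueData.py \
      ../sample-data/offers.seq sequence \
      ../sample-data/clique-output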
def load_jsonlines(sc, input, file_format='sequence', data_type='json', separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input, file_format=file_format, data_type=data_type, separator=separator)
    return rdd
Example #12
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName, file_format="text", data_type="json")
    fileUtil.save_file(rdd, outputFileName)
Example #13
from pyspark import SparkContext, SparkConf, StorageLevel
from optparse import OptionParser
from digSparkUtil.fileUtil import FileUtil

if __name__ == '__main__':
    parser = OptionParser()

    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    filename = args[0]
    file_format = args[1]
    out_filename = args[2]
    out_format = args[3]
    uris = args[4].split(",")

    print "Filename", filename, file_format
    print "Output:", out_filename, out_format
    print "Filter:", args[4]

    sc = SparkContext(appName="DIG-FILTER")
    conf = SparkConf()

    fileUtil = FileUtil(sc)
    input_rdd = fileUtil.load_file(filename, file_format, "json")
    output_rdd = input_rdd.filter(lambda x: x[0] in uris).coalesce(1)
    fileUtil.save_file(output_rdd, out_filename, out_format, "json")
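
The script takes five positional arguments: input file, input format, output file, output format, and a comma-separated list of URIs to keep. A hypothetical invocation with placeholder paths and URIs:

    spark-submit digFilter.py \
      input.seq sequence \
      filtered_output text \
      http://dig.isi.edu/ht/data/offer/1,http://dig.isi.edu/ht/data/offer/2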
Example #14
    es_port = argv[7]
    es_index = argv[8]

    #After applying karma, we would like to reduce the number of partitions
    numFramerPartitions = max(10, numPartitions // 10)

    github_base = 'https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/3.0'
    context_url = github_base + '/karma/karma-context.json'

    workflow = Workflow(sc)
    fileUtil = FileUtil(sc)

    rdd_list = list()

    #Read the input data
    escorts_rdd = inputRDD = fileUtil.load_file(
        inputFilename, inputType, "json").partitionBy(numPartitions)
    escorts_rdd.persist(StorageLevel.MEMORY_AND_DISK)

    # Apply the main model
    main_rdd = workflow.run_karma(
        escorts_rdd, github_base + '/datasets/ht/CDRv2/main/ht-main-model.ttl',
        "http://dig.isi.edu/ht/data/", "http://schema.org/Offer1", context_url,
        numFramerPartitions
    )  #Partition the output data by numFramerPartitions and the rest
    #of the workflow works with the same number of partitions
    main_rdd.persist(StorageLevel.MEMORY_AND_DISK)
    rdd_list.append(main_rdd)
    print "main model done"

    # Apply the AdultService Model
    adultservice_rdd = workflow.run_karma(
Example #15
                      dest="threshold", default=0.0, help="similarity threshold")
    parser.add_option("-e", "--base", dest="base", type="string",
                      help="base file", default="")
    parser.add_option("-o", "--outputformat", dest="outputformat", type="string",
                      help="output file format: text/sequence", default="text")
    parser.add_option("-y", "--outputtype", dest="outputtype", type="string",
                      help="output type: csv/json", default="json")
    parser.add_option("-k", "--topk", dest="topk", type="int",
                      help="top n matches", default=3)
    parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string",
                        help="name for json element for matching candidates", default="candidates")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename = args[0]
    outputFilename = args[1]
    print "Save to:", outputFilename

    kwargs = as_dict(c_options)
    clusterer = Clusterer(**kwargs)
    fileUtil = FileUtil(sc)

    rdd = fileUtil.load_file(inputFilename, file_format='text')

    cluster_rdd = clusterer.compute_clusters(rdd)

    fileUtil.save_file(cluster_rdd, outputFilename, file_format='text')
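
Only part of the option parser is shown; a hypothetical invocation using the visible options, with placeholder script and file names:

    spark-submit clusterer_script.py \
      --topk 3 --numPartitions 10 --outputformat text \
      input_tokens output_clusters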

Example #16
      --archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaContextWorkflow.py \
      ../sample-data/sample-unicode-jsonld.json text \
      https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json \
      ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaContextWorkflow")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputType = argv[2]
    contextUrl = argv[3]
    outputFilename = argv[4]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    # Read input
    inputRDD = fileUtil.load_file(inputFilename, inputType, "json")

    # Apply the context
    outputRDD = workflow.apply_context(inputRDD, contextUrl)

    # Save the output
    fileUtil.save_file(outputRDD, outputFilename, "text", "json")
Example #17
def convert(sc, inputFileName, outputFileName):
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName,
                             file_format="text",
                             data_type="json")
    fileUtil.save_file(rdd, outputFileName)
Example #18
if __name__ == "__main__":
    sc = SparkContext(appName="karma")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    inputFileType = argv[2]
    inputDataType = argv[3]
    numPartitions = int(argv[4])
    outputFilename = argv[5]

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = fileUtil.load_file(inputFilename, inputFileType,
                                  inputDataType).partitionBy(numPartitions)

    #2. Apply the karma Model
    contextUrl = "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/karma/context.json"
    outputRDD_karma = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/usc-isi-i2/dig-alignment/development/versions/2.0/datasets/ht/CDR/ht-model.ttl",
        "http://dig.isi.edu/ht/data/",
        "http://schema.org/WebPage1",
        contextUrl,
        num_partitions=numPartitions,
        data_type=inputDataType)

    #3. Apply the context
    outputRDD = workflow.apply_context(outputRDD_karma, contextUrl)