# Assumed imports for these examples (FileUtil and Tokenizer come from the
# DIG Spark utilities; the exact import lines are not part of this excerpt):
# from digSparkUtil.fileUtil import FileUtil
# from digTokenizer.tokenizer import Tokenizer
def testTokenizer(sc,
                  input_dir,
                  output_dir,
                  config,
                  limit=None,
                  input_file_format="sequence",
                  input_data_type="json",
                  output_file_format="sequence",
                  output_data_type="json",
                  **kwargs):

    print(limit)  # debug: echo the optional record limit (not used below)

    futil = FileUtil(sc)

    # LOAD DATA
    rdd_ingest = futil.load_file(input_dir,
                                 file_format=input_file_format,
                                 data_type=input_data_type)
    rdd_ingest.setName('rdd_ingest_input')

    # TOKENIZE
    tokenizer = Tokenizer(config, **kwargs)
    rdd_tokenized = tokenizer.perform(rdd_ingest)

    # SAVE DATA
    outOptions = {}
    futil.save_file(rdd_tokenized,
                    output_dir,
                    file_format=output_file_format,
                    data_type=output_data_type,
                    **outOptions)
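
# Hypothetical driver for testTokenizer; the SparkContext setup, paths, and
# config file name below are placeholders, not part of the original example.
from pyspark import SparkContext

sc = SparkContext(appName="tokenizerTest")
testTokenizer(sc,
              "hdfs:///data/input",        # input_dir (placeholder)
              "hdfs:///data/tokenized",    # output_dir (placeholder)
              "tokenizer-config.json",     # config (placeholder)
              input_file_format="sequence",
              output_file_format="sequence")
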
def preprocess(sc, inputDir, file_format, outputDir):
    """
    this method just reads the offer file and creates vertexrdd and edgerdd required for graphx
    vertexrdd will be node uri and type
    edgesrdd will be node a,node b,edge type
    :param inputDir:
    :param file_format:
    :return:
    """
    fileUtil = FileUtil(sc)
    inputrdd = fileUtil.load_file(inputDir,
                                  file_format=file_format,
                                  data_type='json')
    vertexrdd = inputrdd.flatMapValues(nodes_mapper)
    edgerdd = inputrdd.flatMapValues(edges_mapper)
    fileUtil.save_file(vertexrdd,
                       outputDir + 'vertex',
                       file_format='text',
                       data_type='json')
    fileUtil.save_file(edgerdd,
                       outputDir + 'edges',
                       file_format='text',
                       data_type='json')
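
# nodes_mapper and edges_mapper are not defined in this excerpt. A minimal
# sketch consistent with the docstring above, using hypothetical field names:
def nodes_mapper(doc):
    # yield (node uri, node type) pairs for the offer and its nested objects
    yield (doc["uri"], doc.get("@type", "Offer"))          # hypothetical fields
    for child in doc.get("availableAtOrFrom", []):         # hypothetical nesting
        yield (child["uri"], child.get("@type", "Place"))

def edges_mapper(doc):
    # yield (node a, node b, edge type) triples linking the offer to children
    for child in doc.get("availableAtOrFrom", []):
        yield (doc["uri"], child["uri"], "availableAtOrFrom")
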
def load_jsonlines(sc,
                   input_path,
                   file_format='sequence',
                   data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(input_path,
                          file_format=file_format,
                          data_type=data_type,
                          separator=separator)
    return rdd
def save_jsonlines(sc,
                   rdd,
                   output_dir,
                   file_format='sequence',
                   data_type='json',
                   separator='\t'):
    fUtil = FileUtil(sc)
    fUtil.save_file(rdd,
                    output_dir,
                    file_format=file_format,
                    data_type=data_type,
                    separator=separator)
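
# Example round trip with the two helpers above; the paths are placeholders,
# and this assumes load_file yields (key, parsed JSON dict) pairs for
# data_type='json'.
from pyspark import SparkContext

sc = SparkContext(appName="jsonlinesRoundTrip")
docs = load_jsonlines(sc, "hdfs:///data/docs-seq")
titled = docs.filter(lambda kv: "title" in kv[1])   # keep docs that have a title
save_jsonlines(sc, titled, "hdfs:///data/docs-with-title")
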
Example #5
def testSignature(sc, inputFileName, outputFileName, file_format):

    config = {"runlimit": 5, "field": "title_signature"}

    fUtil = FileUtil(sc)
    rdd = fUtil.load_file(inputFileName,
                          file_format=file_format,
                          data_type="json")
    signature = ComputeSignature(**config)
    rdd = signature.perform(rdd)

    fUtil.save_file(rdd, outputFileName, file_format="text", data_type="json")
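
# Hypothetical invocation of testSignature; file names are placeholders.
from pyspark import SparkContext

sc = SparkContext(appName="signatureTest")
testSignature(sc,
              "hdfs:///data/docs-seq",      # placeholder input
              "hdfs:///data/docs-signed",   # placeholder output
              "sequence")
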
Example #6
      --archives ../karma.zip \
      --py-files ../lib/python-lib.zip \
      --driver-class-path ~/github/Web-Karma/karma-spark/target/karma-spark-0.0.1-SNAPSHOT-shaded.jar \
      karmaWorkflowCSV.py ../sample-data/sample-unicode.txt ../sample-data/output
'''

if __name__ == "__main__":
    sc = SparkContext(appName="karmaCSV")

    java_import(sc._jvm, "edu.isi.karma")

    inputFilename = argv[1]
    outputFilename = argv[2]
    numPartitions = 1

    fileUtil = FileUtil(sc)
    workflow = Workflow(sc)

    #1. Read the input
    inputRDD = workflow.batch_read_csv(inputFilename)

    #2. Apply the karma Model
    outputRDD = workflow.run_karma(
        inputRDD,
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-model1.ttl",
        "http://dig.isi.edu/data",
        "http://schema.org/WebPage1",
        "https://raw.githubusercontent.com/dkapoor/test/master/sample-unicode-context.json",
        num_partitions=numPartitions,
        data_type="csv",
        additional_settings={})  # the settings dict is cut off in the original
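
# In the karma-spark sample workflows, the positional arguments to run_karma
# above are the R2RML model URL, the base URI for generated triples, the root
# node type, and the JSON-LD context URL.
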
Example #7
def convert(sc, inputFileName, outputFileName):
    # read a JSON-lines text file and re-save it with FileUtil's default options
    fileUtil = FileUtil(sc)
    rdd = fileUtil.load_file(inputFileName,
                             file_format="text",
                             data_type="json")
    fileUtil.save_file(rdd, outputFileName)
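
# Hypothetical driver for convert; argument values come from the command line
# and are placeholders, not part of the original example.
from sys import argv
from pyspark import SparkContext

sc = SparkContext(appName="convertTest")
convert(sc, argv[1], argv[2])   # input JSON-lines file, output path
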