def main(hdfs_uri):
    """Divolte Spark Example.

    This example processes published Divolte log files at a given location.

    It displays:

     1. The total number of events in the log files.
     2. An arbitrary event.
     3. The ID of the session with the most events, along with the first 10
        events in that session.

    This is equivalent to the Scala example.
    """
    sc = SparkContext()

    # Hadoop files are always read as an RDD of key/value pairs. Avro files contain only keys, however,
    # so we immediately map out the values.
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter').map(lambda (k,v): k)

    # We are going to process the RDD several times, so cache the original
    # set in cluster memory so it doesn't have to be loaded each time.
    events_rdd.cache()

    # Calculate the total number of events.
    total_event_count = events_rdd.count()

    # Get the first event in our dataset (which isn't ordered yet).
    an_event = events_rdd.take(1)

    # Find the session with the most events.
    (longest_session_id, longest_session_count) = events_rdd \
        .map(lambda event: (event['sessionId'], 1)) \
        .reduceByKey(lambda x,y: x + y) \
        .reduce(lambda x,y: max(x, y, key=lambda (e, c): c))

    # For the session with the most events, find the first 10 events.
    first_events = events_rdd \
        .filter(lambda event: event['sessionId'] == longest_session_id) \
        .map(lambda event: (event['location'], event['timestamp'])) \
        .takeOrdered(10, lambda event: event[1])

    # Simple function for rendering timestamps.
    def timestamp_to_string(ts):
        return datetime.fromtimestamp(ts / 1000.0).strftime('%Y-%m-%d %H:%M:%S')

    # Print the results we accumulated, with some whitespace at the
    # front to separate this from the logging.
    print "\n\n"
    print "Number of events in data: %d" % total_event_count
    print "An event:\n%s" % json.dumps(an_event, indent=2)
    print "Session with id '%s' has the most events: %d" % (longest_session_id, longest_session_count)
    print "First 10 events:"
    print "\n".join(["  %s: %s" % (timestamp_to_string(ts), location) for (location, ts) in first_events])
def main():
    sc = SparkContext("yarn", "Simple Titles")

    xml_posts = sc.newAPIHadoopFile("/user/cloudera/stackexchange/Posts.xml",
                                    'com.databricks.spark.xml.XmlInputFormat',
                                    'org.apache.hadoop.io.Text',
                                    'org.apache.hadoop.io.Text',
                                    conf={
                                        'xmlinput.start': '<row',
                                        'xmlinput.end': '/>'
                                    })
    each_post = xml_posts.map(lambda x: x[1])
    post_fields = each_post.map(processXmlFields).filter(
        lambda x: x is not None)
    post_fields.saveAsTextFile(
        '/user/cloudera/stackexchange/simple_titles_txt')
Example #3
def main(hdfs_uri):
    """Divolte Spark Example.

    This example processes published Divolte log files at a given location.

    It displays:

     1. The total number of events in the log files.
     2. An arbitrary event.
     3. The ID of the session with the most events, along with the first 10
        events in that session.

    This is equivalent to the Scala example.
    """
    sc = SparkContext()
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter').map(lambda (k,v): k)

    
    events_rdd.cache()

    total_event_count = events_rdd.count()

    # Get the first event in our dataset (which isn't ordered yet).
    an_event = events_rdd.take(1)

    # Count the number of events per event type.
    distinct_events = events_rdd \
        .map(lambda event: (event['eventType'], 1)) \
        .reduceByKey(add) \
        .collect()
    # Simple function for rendering timestamps.
    def timestamp_to_string(ts):
        return datetime.fromtimestamp(ts / 1000.0).strftime('%Y-%m-%d %H:%M:%S')

    # Write the per-event-type counts to the output file as JSON.
    distinct_events = dict(distinct_events)
    with open(output_file, "w") as h:
        h.write(json.dumps(distinct_events, indent=2))
Example #4
def main(hdfs_uri):

    sc = SparkContext()
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter'
    ).map(lambda (k, v): k)

    events_rdd.cache()
    data = dict()
    total_event_count = events_rdd.count()
    data[VISITS] = total_event_count
    # Get the first event in our dataset (which isn't ordered yet).
    #an_event = events_rdd.take(1)

    # Count the number of events per event type.
    distinct_events = events_rdd.map(
        lambda event: (event[EVENTYPE], 1)).reduceByKey(add).collect()

    # Store the per-event-type counts.
    data[DISTINCT_EVENTS] = dict(distinct_events)

    unique_visits = events_rdd.map(
        lambda event: event[REMOTE_HOST]).distinct().count()
    data[UNIQUE_VISITS] = unique_visits

    session_rdd = events_rdd.map(lambda event:
                                 (event[REMOTE_HOST], event[TIMESTAMP]))

    avg = session_rdd.groupBy(lambda e: e[0]).map(
        lambda e: averageTime(e[0], e[1])).collect()
    data[AVERAGE_VISIT] = dict(avg)

    with open(output_file, "w") as h:
        h.write(json.dumps(data, indent=2))
    avroRdd.map(lambda x: (x, None)).saveAsNewAPIHadoopFile(
        fileAvroOut,
        "org.apache.avro.mapreduce.AvroKeyOutputFormat",
        "org.apache.avro.mapred.AvroKey",
        "org.apache.hadoop.io.NullWritable",
        keyConverter="irt.pythonconverters.Scheme1ToAvroKeyConverter",
        conf=conf)

    # ------------------------------------
    # -- read data from avro            --
    # ------------------------------------

    avroRdd2 = sc.newAPIHadoopFile(
        fileAvroOut,
        "org.apache.avro.mapreduce.AvroKeyInputFormat",
        "org.apache.avro.mapred.AvroKey",
        "org.apache.hadoop.io.NullWritable",
        keyConverter="irt.pythonconverters.AvroWrapperToJavaConverter",
        conf=conf)

    crudeData = avroRdd2.collect()

    output = crudeData[0][0]

    for k in ['raw1', 'raw2', 'raw3']:
        output[k] = convBytesToObjectPickle(output[k])

    print 80 * '#'
    print "input Record"
    print 80 * '#'
    pprint(record)
        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar \
        /path/to/examples/avro_inputformat.py <data_file> [reader_schema_file]
        Assumes you have Avro data stored in <data_file>. Reader schema can be optionally specified
        in [reader_schema_file].
        """, file=sys.stderr)
        exit(-1)

    path = sys.argv[1]
    sc = SparkContext(appName="AvroKeyInputFormat")

    conf = None
    if len(sys.argv) == 3:
        schema_rdd = sc.textFile(sys.argv[2], 1).collect()
        conf = {"avro.schema.input.key": reduce(lambda x, y: x + y, schema_rdd)}


    avro_rdd = sc.newAPIHadoopFile(
        path,
        "org.apache.avro.mapreduce.AvroKeyInputFormat",
        "org.apache.avro.mapred.AvroKey",
        "org.apache.hadoop.io.NullWritable",
        keyConverter="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter",
        conf=conf)

    output = avro_rdd.map(lambda x: x[0]).collect()
    for k in output:
        print(k)

    sc.stop()
Example #7
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the Spark context; it's the main object that allows
    # communication with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if  not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
    schema = ''.join(avsc.split()) # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey="org.apache.avro.mapred.AvroKey"
    awrite="org.apache.hadoop.io.NullWritable"
    aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if  isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if  spec_file:
        if  os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if  script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if  not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        mro = obj.MapReduce(spec)
        # example of collecting records from mapper and
        # passing all of them to reducer function
        records = avro_rdd.map(mro.mapper).collect()
        out = mro.reducer(records)

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if  verbose:
        logger.info("Elapsed time %s" % htime(time.time()-time0))
    return out
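# A minimal sketch (an assumption, not taken from the original sources) of the
# kind of user script that run() imports via import_(script): a module that
# exposes a MapReduce class with mapper/reducer methods. The 'jobid' field and
# the summary returned by the reducer are illustrative only.
class MapReduce(object):
    def __init__(self, spec=None):
        self.spec = spec or {}

    def mapper(self, record):
        # each element of avro_rdd is a (record-dict, None) pair; keep one field
        rec = record[0]
        return rec.get('jobid')

    def reducer(self, records):
        # records is the list collected from the mappers on the driver
        return {'njobs': len([r for r in records if r is not None])}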
from pyspark import SparkConf, SparkContext

conf = SparkConf()

conf.setAppName("spark_app_wordcount_extend")

sc = SparkContext(conf=conf)

pairs = sc.newAPIHadoopFile(
    "/user/yurun/spark/textfile/",
    "org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text")

words = pairs.map(lambda pair: pair[1]).flatMap(lambda line: line.split("\t"))

pairs = words.map(lambda word: (word, 1))

counts = pairs.reduceByKey(lambda a, b: a + b)

results = counts.collect()

for result in results:
    print result

sc.stop()
Example #9
        vec = x['vec'].copy()
        for word in vec:
            key = user_family + ':' + word
            if key in row:
                vec[word] = vec[word] + int(row[key])

        count = 1
        if count_loc in row:
            count = str(count + int(row[count_loc]))
        else:
            count = str(count)

        #write user vector +count
        temp = {}
        for word in vec:
            temp[user_family + ':' + word] = str(vec[word])
        vec = temp
        vec[count_loc] = count
        vec[cf1 + ':' + article_pref + id] = 'true'

        table.put(user_pref + username, vec)


## call and run
file_rdds=sc.newAPIHadoopFile(files, "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                    "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text",
                    conf={"textinputformat.record.delimiter": "</page>"})\
.map(split_str)\
.map(parse_xml)\
.foreach(write_hbase)
Example #10
# Initialize the Spark configuration
conf = SparkConf()
conf.setAppName("Simple App")
conf.set("spark.executor.memory","1g")
sc = SparkContext(conf = conf)
logFile = "./log.log"  # Should be some file on your system
logData = sc.textFile(logFile).cache()

#########################################################
"""                                                  TOP 100                                           """
kFileNum = 200
total_words = None
for file_id in range(1,kFileNum):
    YOUR_FILE = "wet_data/CC-MAIN-20150728002301-%05d-ip-10-236-191-2.ec2.internal.warc.wet"%file_id
    YOUR_DELIMITER = "WARC/1.0"
    text_file= sc.newAPIHadoopFile(YOUR_FILE,"org.apache.hadoop.mapreduce.lib.input.TextInputFormat", "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text", conf = {"textinputformat.record.delimiter":YOUR_DELIMITER}).map(lambda l:l[1])
    # Open the file; the result of this statement is an RDD
    def split_and_remove_no_meaning_word(line):
    	candidate_words = line.split()
    	# Strip non-alphabetic characters
    	candidate_words = map(lambda word: filter(str.isalpha, str(word.encode('utf8')) ), candidate_words)
    	no_meanning_words = ['was','over','her','them','news','they','what','like','now','use','how','see','add','help','when','who','there','here','back','also','most','over','make','years','had','into','have','may','any','other','more','has','one','which','out','their','some','than','its','off','only','his','just','get','been','were','would','our','ago','not','the','and','with','for','your','you','the','from','are','that','all','will','this','can','but','about','warcdate','warctype','contentlength','warcrecordid','contenttype','warcblockdigest','warctargeturi','warcrefersto']
    	words = filter(lambda word: len(word) > 2 and word not in no_meanning_words, candidate_words)
    	return words

    text_file1 = text_file.map(lambda line:line.lower()).flatMap(split_and_remove_no_meaning_word)
    words = text_file1.map(lambda word: (word,1)).reduceByKey(lambda a,b: a+b)
    if total_words is None:
        total_words = words
    else:
        total_words = total_words.union(words)
Example #11
def main(hdfs_uri):
    """Divolte Spark Example.

    This example processes published Divolte log files at a given location.

    It displays:

     1. The total number of events in the log files.
     2. An arbitrary event.
     3. The ID of the session with the most events, along with the first 10
        events in that session.

    This is equivalent to the Scala example.
    """
    sc = SparkContext()

    # Hadoop files are always read as an RDD of key/value pairs. Avro files contain only keys, however,
    # so we immediately map out the values.
    events_rdd = sc.newAPIHadoopFile(
        hdfs_uri,
        'org.apache.avro.mapreduce.AvroKeyInputFormat',
        'org.apache.avro.mapred.AvroKey',
        'org.apache.hadoop.io.NullWritable',
        keyConverter='io.divolte.spark.pyspark.avro.AvroWrapperToJavaConverter'
    ).map(lambda (k, v): k)

    # We are going to process the RDD several times, so cache the original
    # set in cluster memory so it doesn't have to be loaded each time.
    events_rdd.cache()

    # Calculate the total number of events.
    total_event_count = events_rdd.count()

    # Get the first event in our dataset (which isn't ordered yet).
    an_event = events_rdd.take(1)

    # Find the session with the most events.
    (longest_session_id, longest_session_count) = events_rdd \
        .map(lambda event: (event['sessionId'], 1)) \
        .reduceByKey(lambda x,y: x + y) \
        .reduce(lambda x,y: max(x, y, key=lambda (e, c): c))

    # For the session with the most events, find the first 10 events.
    first_events = events_rdd \
        .filter(lambda event: event['sessionId'] == longest_session_id) \
        .map(lambda event: (event['location'], event['timestamp'])) \
        .takeOrdered(10, lambda event: event[1])

    # Simple function for rendering timestamps.
    def timestamp_to_string(ts):
        return datetime.fromtimestamp(ts /
                                      1000.0).strftime('%Y-%m-%d %H:%M:%S')

    # Print the results we accumulated, with some whitespace at the
    # front to separate this from the logging.
    print "\n\n"
    print "Number of events in data: %d" % total_event_count
    print "An event:\n%s" % json.dumps(an_event, indent=2)
    print "Session with id '%s' has the most events: %d" % (
        longest_session_id, longest_session_count)
    print "First 10 events:"
    print "\n".join([
        "  %s: %s" % (timestamp_to_string(ts), location)
        for (location, ts) in first_events
    ])
Example #12
from IOCommon import IOCommon

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >> sys.stderr, "Usage: terasort <HDFS_INPUT> <HDFS_OUTPUT>"
        exit(-1)
    sc = SparkContext(appName="PythonTeraSort")
    reducer = int(IOCommon.getProperty("hibench.default.shuffle.parallelism"))

    version_api = IOCommon.getProperty("hibench.hadoop.version")
    # load
    if version_api == "hadoop1":
        lines = sc.textFile(sys.argv[1], 1).map(lambda x: (x[:10], x[10:]))
    elif version_api == "hadoop2":
        lines = sc.newAPIHadoopFile(
            sys.argv[1], "org.apache.hadoop.examples.terasort.TeraInputFormat",
            "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")

    # sort
    sortedCount = lines.sortByKey(lambda x: x, numPartitions=reducer)

    # save
    if version_api == "hadoop1":
        lines = sortedCount.map(lambda x: x[0] + x[1])
        sortedCount.saveAsTextFile(sys.argv[2])
    elif version_api == "hadoop2":
        sortedCount.saveAsNewAPIHadoopFile(
            sys.argv[2],
            "org.apache.hadoop.examples.terasort.TeraOutputFormat",
            "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
Example #13
    labeled_doc = model.label(doc_raw_text)
    sentence_index = 0
    for sentence in document.sentences:
        labeled_sentence = labeled_doc[sentence_index]
        sentence_index += 1

        doc_output += "\n" + sentence.sent_id + "\n"
        doc_output += sentence.text + "\n"
        word_index = 0
        for word in sentence.words_conll:
            dictionary = labeled_sentence[word_index]
            doc_output += word + "\t" + dictionary["label"] + "\n"
            word_index += 1
    return doc_output


conf = SparkConf().setAppName('rnn2argument')
sc = SparkContext(conf=conf)
print("available nodes: ", sc.defaultParallelism)

text = sc.newAPIHadoopFile(
    "conll.txt",
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': '# newdoc'})

text.map(lambda x: parse_doc(x))\
    .map(lambda x: label_doc(x)) \
    .saveAsTextFile("output")
    # A blank line is used as the delimiter.
    # Note that the first record may have an extra line "total number:#" as its first line.
    hadoop_conf = {"textinputformat.record.delimiter": "\n\n"} 

    is_ec2 = True 
    # If the submission is on EC2, you need to set access key and secret access key
    if is_ec2:
        hadoop_conf["fs.s3n.awsAccessKeyId"] = "AKIaYOURaOWNaKEYaJPQ"
        hadoop_conf["fs.s3n.awsSecretAccessKey"] = "v5vBmazYOURaOWNaSECRETaKEYaT8yX4jXC+mGLl"
        master_add = "ec2-54-213-21-124.us-west-2.compute.amazonaws.com"

    # Read the file with the function newAPIHadoopFile. The RDD object has elements like this: <lineNumber, textOfTweet>. 
    # With the function textFile in the Word Count example, the hadoopConf can not be passed in. 
    lines = sc.newAPIHadoopFile(filepath, "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                                "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text",
                                conf=hadoop_conf)

    # print out records to understand the logic of RDD.
    # NOTE: you can not print an RDD object directly. Take a small sample from it 
    #print(lines.take(5))

    # A tweet has text "No Post Title" is considered as a bad record
    bad_msg = "W\tNo Post Title"

    # In the class, MapReduce is introduced in a simple form. In Spark, map and reduce have more variants. Key-value pair <K, V> can be key 
    # <K> only.  This function maps a record  to <0> or <1>
    flag = lines.map(lambda x: 0 if -1 == string.find(x[1], bad_msg) else 1) 

    # print mapped keys
    #print(flag.take(5)) 
Example #15
        exit(-1)
    conf = SparkConf().set("spark.default.parallelism", "3")

    sc = SparkContext(appName="SequenceFile", conf=conf)
    path = sys.argv[1]
    out = sys.argv[2]

    # Read a sequence file
    #lines = sc.sequenceFile(path, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable")

    # Hadoop old API (mapred)
    #lines = sc.hadoopFile(path, "org.apache.hadoop.mapred.SequenceFileInputFormat",
    #                            "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable")

    # Hadoop new API (mapreduce)
    lines = sc.newAPIHadoopFile(path, "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
                                "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable")

    # In the new Hadoop API: getSplits(JobContext context)
    # In the old Hadoop API: getSplits(JobConf job, int numSplits)
    # textFile uses the old API (same as hadoopFile) and prints 2 partitions;
    # newAPIHadoopFile prints 1 partition
    print "sequence partitions: %s" % lines.getNumPartitions()

    results = lines.mapValues(lambda x: long(x))
    for i in results.take(10):
        print i[0] , i[1]
    print results.count()

    # saveAsSequenceFile(path, compressionCodecClass=None)
    lines.saveAsSequenceFile(out, "org.apache.hadoop.io.compress.GzipCodec")
Example #16
anchorDics = [{} for _ in range(num_dics)]  # separate dicts, not aliases of a single one
for tid, tarID in enumerate(tarIDs):
    dicID = int(tid/maxAnchor)   
    anchorDics[dicID][tarID] = anchorDic[tarID]
 
anchorDicBC = sc.broadcast(anchorDics)
 
time3 = time.time()
print('load dic2', time3-time2)

# rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_anchored_doc/part-[0-2]*',
# rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_anchored_doc/part-[3-5]*',
# rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_anchored_doc/part-*',
# rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_12_anchored_doc/part-[0-2]*',
rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_12_anchored_doc/part-[3-4]*',
# rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_12_anchored_doc/part-*',
                           'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
                           'org.apache.hadoop.io.Text', 'org.apache.hadoop.io.Text')

# reuse = 0
def process(webpage):
    if webpage:
        record = tryJson(webpage)
    else:
#         return 'None1'
        return None
    if not record:
#         return 'None2'
        return None
    if not record[1]:
        return None
    try:
Example #17
def run(schema_file,
        data_path,
        script=None,
        spec_file=None,
        verbose=None,
        yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the Spark context; it's the main object that allows
    # communication with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from rdd list
    schema = ''.join(avsc.split())  # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([
            ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
            for f in data_path
        ])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path,
                                        aformat,
                                        akey,
                                        awrite,
                                        aconv,
                                        conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if spec_file:
        if os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        # we have a nested use case when one MR return WMArchive spec
        # we'll loop in that case until we get non-spec output
        while True:
            mro = obj.MapReduce(spec)
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.map(mro.mapper).collect()
            out = mro.reducer(records)
            logger.info('OUTPUT %s %s' % (out, type(out)))
            if is_spec(out):
                spec = out
            else:
                break

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))
    return out
from operator import add
import sys
from pyspark import SparkContext

#Giving a Name and using the local Spark Master
sc = SparkContext(appName="LZO Wordcount")


if __name__ == "__main__":
	if len(sys.argv) != 2:
		print >> sys.stderr, """
		Usage: wordcount_lzo_file.py <data_file>
		Run with example jar:
		spark-submit --driver-class-path /home/spark/lib/hadoop-lzo.jar /path/to/examples/wordcount_lzo_file.py <data_file>
		"""
		exit(-1)

	path = sys.argv[1]
	print path
	conf = None

	#Reading a file in HDFS(use absolute path)
	csv = sc.newAPIHadoopFile(path,"com.hadoop.mapreduce.LzoTextInputFormat","org.apache.hadoop.io.LongWritable","org.apache.hadoop.io.Text").count()

	print csv
    
    #for k in output:
    #    print k
Example #19
    ## Initialize Spark Context
    conf = SparkConf().setAppName("XMLPARSER").setMaster("local")
    sc = SparkContext(conf=conf)

    ## File to read
    postsFile = sys.argv[1] if len(sys.argv) > 1 else 'Posts.xml'

    ## Directory to output the processed RDD
    output = sys.argv[2] if len(sys.argv) > 2 else 'results'

    ## Provide start and end tags for xml entrees
    xmlConf = {'xmlinput.start': '<row', 'xmlinput.end': '/>'}

    ## Read file using hadoop using the spark-xml input format
    records = sc.newAPIHadoopFile(postsFile,
                                  'com.databricks.spark.xml.XmlInputFormat',
                                  'org.apache.hadoop.io.Text',
                                  'org.apache.hadoop.io.Text',
                                  conf=xmlConf)

    ## Remove autogenerated ids
    normalizedRecords = records.map(lambda x: x[1])

    ## Extract the required fields
    rdd = normalizedRecords.map(processRows)

    ## Save the generated RDD into HDFS
    rdd.saveAsPickleFile(output)

    sc.stop()
Example #20
'''

Hadoop input/output formats
Spark can interact with any file format supported by Hadoop
- It supports both the "old" and "new" APIs
- It also gives access to other (non-file) storage back ends, e.g. HBase or MongoDB, via saveAsHadoopDataSet and/or saveAsNewAPIHadoopDataSet
'''
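
%pyspark
# A sketch (an assumption, modeled on the converter classes shipped with the
# Spark Python examples) of writing a key/value RDD to HBase through
# saveAsNewAPIHadoopDataset; the table name, ZooKeeper quorum and column
# layout below are placeholders.
hbase_conf = {
    "hbase.zookeeper.quorum": "localhost",
    "hbase.mapred.outputtable": "test_table",
    "mapreduce.outputformat.class": "org.apache.hadoop.hbase.mapreduce.TableOutputFormat",
    "mapreduce.job.output.key.class": "org.apache.hadoop.hbase.io.ImmutableBytesWritable",
    "mapreduce.job.output.value.class": "org.apache.hadoop.io.Writable"}
sc.parallelize([("row1", ["row1", "cf", "col1", "value1"])]) \
  .saveAsNewAPIHadoopDataset(
      conf=hbase_conf,
      keyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
      valueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter")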

%pyspark
# Save the key/value RDD as a Hadoop text file (TextOutputFormat)
rdd.saveAsNewAPIHadoopFile("file:///tmp/hadoopfileoutdir", 
                            "org.apache.hadoop.mapreduce.lib.output.TextOutputFormat",
                            "org.apache.hadoop.io.Text",
                            "org.apache.hadoop.io.IntWritable")

%sh
echo 'Output directory'
ls -l /tmp/hadoopfileoutdir
cat /tmp/hadoopfileoutdir/part-r-00001

%pyspark
# Read it back as a Hadoop key-value file (KeyValueTextInputFormat)
rdd3 = sc.newAPIHadoopFile("file:///tmp/hadoopfileoutdir", 
                          "org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat",
                          "org.apache.hadoop.io.Text",
                          "org.apache.hadoop.io.IntWritable")
                          
print("Contenido del RDD {0}".format(rdd3.collect()))

Example #21
from pyspark.sql import SQLContext


def tryJson(data):
    try:
        return json.loads(data)
    except:
        return None


sparkConf = SparkConf().setAppName("PySpark clueWeb09 doc collect")
sc = SparkContext(conf=sparkConf)

# rdd1 = sc.newAPIHadoopFile('hdfs:///user/qile1864/anchor_frags/frags_9_all_hadoop_t3/all_part-00000.4.preLSH',
rdd1 = sc.newAPIHadoopFile(
    'hdfs:///user/qile1864/anchor_frags/frags_12_all_hadoop_t9/all_part-00000.4.preLSH',
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.Text', 'org.apache.hadoop.io.Text')

mergeSim = 0.9
hash_len = 128
hash_ngram = 3


def process(anchors):
    if not anchors:
        return None
#     try:
    record = tryJson(anchors)
    if not record:
        return None
Example #22
def run(schema_file, data_path, script=None, spec_file=None, verbose=None, rout=None, yarn=None):
    """
    Main function to run pyspark job. It requires a schema file, an HDFS directory
    with data and optional script with mapper/reducer functions.
    """
    if  script:
        script = get_script(script)
    if  verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
        print("### script: %s" % script)
        print("### spec  : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext

    # define the Spark context; it's the main object that allows
    # communication with Spark
    ctx = SparkContext(appName="AvroKeyInputFormat", pyFiles=[script])
    logger = SparkLogger(ctx)
    if  not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd) # merge all entries from rdd list
    schema = ''.join(avsc.split()) # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat="org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey="org.apache.avro.mapred.AvroKey"
    awrite="org.apache.hadoop.io.NullWritable"
    aconv="org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if  isinstance(data_path, list):
        avro_rdd = ctx.union([ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf) for f in data_path])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path, aformat, akey, awrite, aconv, conf=conf)

    # process data, here the map will read record from avro file
    # if we need a whole record we'll use lambda x: x[0], e.g.
    # output = avro_rdd.map(lambda x: x[0]).collect()
    #
    # if we need a particular key, e.g. jobid, we'll extract it
    # within lambda function, e.g. lambda x: x[0]['jobid'], e.g.
    # output = avro_rdd.map(lambda x: x[0]['jobid']).collect()
    #
    # in more general way we write mapper/reducer functions which will be
    # executed by Spark via collect call
    spec = None
    if  spec_file:
        if  os.path.isfile(spec_file):
            spec = json.load(open(spec_file))
        else:
            spec = json.loads(spec_file)
    if  verbose:
        spec['verbose'] = 1
        print("### spec %s" % json.dumps(spec))
    if  rout:
        spec['output'] = rout
    if  script:
        obj = import_(script)
        logger.info("Use user-based script %s" % obj)
        if  not hasattr(obj, 'MapReduce'):
            logger.error('Unable to find MapReduce class in %s, %s' \
                    % (script, obj))
            ctx.stop()
            return
        # we have a nested use case when one MR return WMArchive spec
        # we'll loop in that case until we get non-spec output
        count = 0
        while True:
            mro = obj.MapReduce(spec)
            mname = mro.__dict__.get('name', '').split('.')[0]
            print("### Load %s" % mname)
            if  mname.lower().endswith('counter'):
                out = avro_rdd.filter(mro.mapper).count()
                if  rout:
                    with open(rout, 'w') as ostream:
                        ostream.write(out)
                break
            # example of collecting records from mapper and
            # passing all of them to reducer function
            records = avro_rdd.filter(mro.mapper).collect()
            out = mro.reducer(records)
            if  verbose:
                print("### Loop count %s" % count)
            if  count > 3:
                print("### WARNING, loop counter exceed its limit")
                break
            if  is_spec(out):
                spec = out
            else:
                break
            count += 1

        # the map(f).reduce(f) example but it does not collect
        # intermediate records
        # out = avro_rdd.map(obj.mapper).reduce(obj.reducer).collect()
    else:
        records = avro_rdd.map(basic_mapper).collect()
        out = basic_reducer(records)
    ctx.stop()
    if  verbose:
        logger.info("Elapsed time %s" % htime(time.time()-time0))
    return out
Example #23
if __name__ == "__main__":
    if len(sys.argv) != 3:
        print >>sys.stderr, "Usage: terasort <HDFS_INPUT> <HDFS_OUTPUT>"
        exit(-1)
    sc = SparkContext(appName="PythonTeraSort")
    reducer = int(IOCommon.getProperty("hibench.default.shuffle.parallelism"))

    version_api = IOCommon.getProperty("hibench.hadoop.version")
    # load
    if version_api == "hadoop1":
        lines = sc.textFile(sys.argv[1], 1).map(lambda x: (x[:10], x[10:]))
    elif version_api == "hadoop2":
        lines = sc.newAPIHadoopFile(
            sys.argv[1],
            "org.apache.hadoop.examples.terasort.TeraInputFormat",
            "org.apache.hadoop.io.Text",
            "org.apache.hadoop.io.Text",
        )

    # sort
    sortedCount = lines.sortByKey(lambda x: x, numPartitions=reducer)

    # save
    if version_api == "hadoop1":
        lines = sortedCount.map(lambda x: x[0] + x[1])
        sortedCount.saveAsTextFile(sys.argv[2])
    elif version_api == "hadoop2":
        sortedCount.saveAsNewAPIHadoopFile(
            sys.argv[2],
            "org.apache.hadoop.examples.terasort.TeraOutputFormat",
            "org.apache.hadoop.io.Text",
def launch_spark_job():
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext
    from pyspark.sql.functions import concat, col, lit

    readFile = sys.argv[1]
    k = int(sys.argv[2])
    num_partitions = int(sys.argv[3])
    conf = SparkConf().setAppName("reads Loader" + str(num_partitions))
    sc = SparkContext(conf=conf)
    sc.addPyFile("utils.py")
    sc.setCheckpointDir(
        "hdfs://doop-mng1.haifa.ibm.com:8020/projects/Store_Analytics/SparkCheckPoints"
    )
    import utils
    # from utils import map_read_to_anchors_list, convert_anchors_list_to_seq_edges
    readLines = (
        sc.newAPIHadoopFile(
            readFile,
            'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
            'org.apache.hadoop.io.LongWritable',
            'org.apache.hadoop.io.Text',
            conf={'textinputformat.record.delimiter': '@'})
        .map(lambda delim_lines_tup: delim_lines_tup[1]
             )  # keeps just the lines and not the @ delimiter
        .filter(lambda x: x.startswith("SRR")
                )  # gets rid of entries due to '@' appearing in the wrong line
        .map(lambda x: x.split("\n")[:2]
             )  # splits the lines, keeps only the first two
        .filter(lambda x: len(x) == 2)  # get rid of any cut-off records
        .repartition(num_partitions)
        # .cache()
    )

    print("----------------------there are %i reads" % (readLines.count()))

    # get new RDD including lists of kmers (with no Ns), (k+1)mers
    kmers = (readLines.map(lambda entry: entry[1]).flatMap(
        lambda read: getKmerToNextCharCounts(read, k)))

    print("----------------------there are %i kmers instances" %
          (kmers.count()))

    kmers_with_exts = (kmers.reduceByKey(func=lambda x, y: x + y))

    print("----------------------there are %i distinct kmers" %
          (kmers_with_exts.count()))

    junctions = kmers_with_exts.filter(lambda kmer_tup: my_filter(kmer_tup))

    print("----------------------there are %i junctions" % junctions.count())

    # for i in junctions.take(10):
    #     if sum(i[1])>1:
    #         print i

    generate_juncs = build_partial_junctions_set()
    junctions_set_rdd = (junctions.mapPartitions(generate_juncs).reduceByKey(
        merge_sets).collect())

    juncs_broadcast = sc.broadcast(junctions_set_rdd[0][1])
    print("----------------------there are %i junctions in broadcast" %
          len(juncs_broadcast.value))

    # build edge set rdd, filter out edges including a junction at some end

    def read_line_map_function(read_line):
        return utils.map_read_to_anchors_list(read_line[1], k - 10, 10,
                                              juncs_broadcast.value)

    edges_rdd = (readLines.map(
        lambda read_line: read_line_map_function(read_line)).flatMap(
            lambda anchors: utils.convert_anchors_list_to_seq_edges(anchors),
            preservesPartitioning=True).filter(
                lambda (a, b, c): a not in juncs_broadcast.value and b not in
                juncs_broadcast.value))

    print("----------------------there are %i total edges" % edges_rdd.count())

    # create SQLContext to be able to create dataFrame from rdd
    sqc = SQLContext(sc)
    edges_df = sqc.createDataFrame(edges_rdd, ["src", "dst", "overlap"])
    vertices_df = edges_df.select(
        concat(col("src"), lit(" "), col("dst")).alias('id')).dropDuplicates()
    g = GraphFrame(vertices_df, edges_df)

    # vertices_df.agg(*[count(c).alias(c) for c in vertices_df.columns]).show()

    print("----------------------there are %i total vertices" %
          vertices_df.count())

    # get connected components of remaining graph

    result = g.connectedComponents()
    result.select("id", "component").orderBy("component").show()
Example #25
from elasticsearch import search
from sparql import query_abstract
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

sc = SparkContext("yarn", "wdps1811")

KEYNAME = "WARC-TREC-ID"
INFILE = sys.argv[1]
OUTFILE = sys.argv[2]
ELASTICSEARCH = sys.argv[3]
SPARQL = sys.argv[4]

rdd = sc.newAPIHadoopFile(
    INFILE,
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": "WARC/1.0"})


def find_key(payload):
    key = None
    for line in payload.splitlines():
        if line.startswith(KEYNAME):
            key = line.split(":")[1]
            return key
    return ""


vectorizer = TfidfVectorizer()
Example #26
    config["stage1File"] = filepath
    with open("config_{0}.json".format(index), "w") as outfile:
        json.dump(config, outfile, indent=4)
    subprocess.call(["./mdm", "config_{0}.json".format(index)])
    subprocess.call([
        "mv", "Final_MDM_{0}.root".format(index),
        "/hdfs/user/hjayatissa/geant_mdm_csi/stage2"
    ])
    subprocess.call([
        "/usr/local/hadoop/bin/hdfs", "dfs", "-chown", "hjayatissa",
        "/user/hjayatissa/geant_mdm_csi/stage2/Final_MDM_{0}.root".format(
            index)
    ])


if __name__ == "__main__":
    sconf = SparkConf().setAppName("mdm-CsI-2")
    sconf.set("spark.executor.memory", "13g")
    sconf.set("spark.python.worker.reuse", "false")
    sc = SparkContext(conf=sconf)

    sc.addFile("mdm")
    sc.addFile("config/config_isobutane_22Ne_6Li_geant_oxford.json")
    sc.addFile("run_oxf.mac")

    file_name = "hdfs://gr-gmaster.tamu.edu:9000//user/hjayatissa/geant_mdm_csi/stage1/MDM_*.root"
    lines = sc.newAPIHadoopFile(file_name, "edu.tamu.hadoop.RootInputFormat",
                                "org.apache.hadoop.io.IntWritable",
                                "org.apache.hadoop.io.Text")
    lines.foreach(lambda x: stage2(x))
with codecs.open(dicFile, 'r', 'utf-8') as f:
    lines = [line.strip() for line in f]

for line in lines:
    url = line.split('\t')[0]
    docDic[url] = len(docDic)

# print(docDic)

docDicBC = sc.broadcast(docDic)

# CW 09
rdd1 = sc.newAPIHadoopFile(
    'hdfs:///corpora/corpora-thirdparty/corpus-clueweb/09-mapfile/data-r-*',
    # CW 12
    # rdd1 = sc.newAPIHadoopFile('hdfs:///corpora/corpora-thirdparty/corpus-clueweb/12-mapfile/data-r-*',
    # rdd1 = sc.newAPIHadoopFile('hdfs:///corpora/corpora-thirdparty/corpus-clueweb/12-mapfile/data-r-00000',
    'org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat',
    'org.apache.hadoop.io.Text',
    'org.apache.hadoop.io.Text')
maxHtml = 50000


def checkDoc(data):
    if not data:
        #         print('not data')
        #         return 'not data'
        return
    try:
        #     record = json.loads(data)
        record = tryJson(data)
        #         print(str(record)[:100])
    conf = SparkConf().set("spark.default.parallelism", "3")

    sc = SparkContext(appName="SequenceFile", conf=conf)
    path = sys.argv[1]
    out = sys.argv[2]

    # Read a sequence file
    #lines = sc.sequenceFile(path, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable")

    # Hadoop old API (mapred)
    #lines = sc.hadoopFile(path, "org.apache.hadoop.mapred.SequenceFileInputFormat",
    #                            "org.apache.hadoop.io.Text", "org.apache.hadoop.io.LongWritable")

    # Hadoop new API (mapreduce)
    lines = sc.newAPIHadoopFile(
        path, "org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
        "org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable")

    # In the new Hadoop API: getSplits(JobContext context)
    # In the old Hadoop API: getSplits(JobConf job, int numSplits)
    # textFile uses the old API (same as hadoopFile) and prints 2 partitions;
    # newAPIHadoopFile prints 1 partition
    print "sequence partitions: %s" % lines.getNumPartitions()

    results = lines.mapValues(lambda x: long(x))
    for i in results.take(10):
        print i[0], i[1]
    print results.count()

    # saveAsSequenceFile(path, compressionCodecClass=None)
    lines.saveAsSequenceFile(out, "org.apache.hadoop.io.compress.GzipCodec")
from operator import add
import sys
from pyspark import SparkContext

#Giving a Name and using the local Spark Master
sc = SparkContext(appName="LZO Wordcount")

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print >> sys.stderr, """
		Usage: wordcount_lzo_file.py <data_file>
		Run with example jar:
		spark-submit --driver-class-path /home/spark/lib/hadoop-lzo.jar /path/to/examples/wordcount_lzo_file.py <data_file>
		"""
        exit(-1)

    path = sys.argv[1]
    print path
    conf = None

    #Reading a file in HDFS(use absolute path)
    csv = sc.newAPIHadoopFile(path, "com.hadoop.mapreduce.LzoTextInputFormat",
                              "org.apache.hadoop.io.LongWritable",
                              "org.apache.hadoop.io.Text").count()

    print csv

#for k in output:
#    print k
Example #30
text = "org.apache.hadoop.io.LongWritable"

dep_seq = sc.sequenceFile("file:///home/cloudera/departments/part-00001", text,
                          text)

dep = sc.sequenceFile("/user/cloudera/intelli",
                      "org.apache.hadoop.io.LongWritable", "departments")

# $ SPARK_CLASSPATH=/path/to/elasticsearch-hadoop.jar ./bin/pyspark
# conf = {"es.resource" : "index/type"}   # assume Elasticsearch is running on localhost defaults
# rdd = sc.newAPIHadoopRDD("org.elasticsearch.hadoop.mr.EsInputFormat",\
#     "org.apache.hadoop.io.NullWritable", "org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=conf)

rdd = sc.newAPIHadoopFile(
    path="/user/cloudera/intelli/part-m-00000",
    inputFormatClass=
    "org.apache.hadoop.mapreduce.lib.input.SequenceFileAsBinaryInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.io.Text")

## The above solution does not work.
# So the ultimate inference is: if the sqoop import does not produce Writable types in the output sequence file,
# one has to build a converter class and pass it via the valueConverter="valueConverterClass" attribute of
# sc.sequenceFile or newAPIHadoopFile. Building such a converter class is not easy.
# Writable Type	        Python Type
# Text	                unicode str
# IntWritable	        int
# FloatWritable	        float
# DoubleWritable	    float
# BooleanWritable	    bool
# BytesWritable	        bytearray
# NullWritable	        None
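# A sketch of how such a converter would be wired in from the Python side; the
# converter class named below is hypothetical and would have to be implemented
# in Java/Scala and placed on the driver/executor classpath.
rdd_converted = sc.newAPIHadoopFile(
    path="/user/cloudera/intelli/part-m-00000",
    inputFormatClass="org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat",
    keyClass="org.apache.hadoop.io.LongWritable",
    valueClass="org.apache.hadoop.io.Text",
    valueConverter="com.example.converters.MyWritableToStringConverter")  # hypothetical class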
Example #31
source_path = '/' + filename

dest_path = '/filter1_' + filename

key_set = [
    '101', '103', '195', '196', '198', '199', '200', '202', '203', '204',
    '205', '210', '295', '296', '298', '299', '304', '321', '350', '380',
    '395', '398', '399', '499', '527', '540', '541', '542', '543', '558',
    '564', '565', '578', '595', '596', '598', '599', '699', '700', '799',
    '899', '996', '998', '999'
]

sc = SparkContext()

source = sc.newAPIHadoopFile(
    source_path,
    'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
    'org.apache.hadoop.io.LongWritable',
    'org.apache.hadoop.io.Text',
    conf={'textinputformat.record.delimiter': '{1:'})

type_matched = source.map(lambda m: filter_message(m[1])).filter(lambda x: x)

type_report = type_matched.map(lambda m: generate_IO_report(m)).reduceByKey(
    add).map(lambda x: toCSVLine(x)).coalesce(1).saveAsTextFile(dest_path)

# receiving_bic_matched=type_matched.filter_receiving_bic(lambda x:filter_receiving_bic(x,args.receiving_bic_type))

# sending_bic_matched = receiving_bic_matched.filter_sending_bic(lambda x: filter_sending_bic(x, args.sending_bic_type))
Example #32
"""
if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("""
        Usage: parquet_inputformat.py <data_file>

        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar \\
                /path/to/examples/parquet_inputformat.py <data_file>
        Assumes you have Parquet data stored in <data_file>.
        """,
              file=sys.stderr)
        exit(-1)

    path = sys.argv[1]
    sc = SparkContext(appName="ParquetInputFormat")

    parquet_rdd = sc.newAPIHadoopFile(
        path,
        'org.apache.parquet.avro.AvroParquetInputFormat',
        'java.lang.Void',
        'org.apache.avro.generic.IndexedRecord',
        valueConverter=
        'org.apache.spark.examples.pythonconverters.IndexedRecordToJavaConverter'
    )
    output = parquet_rdd.map(lambda x: x[1]).collect()
    for k in output:
        print(k)

    sc.stop()
Example #33
    key, name, entity_id = record
    yield key + '\t' + name + '\t' + entity_id

if __name__ == "__main__":
    try:
        _, DOMAIN_ES, DOMAIN_KB, INPUT, OUTPUT = sys.argv
    except Exception:
        print('Usage: DOMAIN_ES, DOMAIN_TRIDENT')
        sys.exit(0)
    SPACY = spacy.load("en_core_web_sm")
    # Spark setup with conf from command line
    sc = SparkContext()
    # split WARC
    config = {"textinputformat.record.delimiter": "WARC/1.0"}

    # Read the Warc file to rdd
    rdd = sc.newAPIHadoopFile(INPUT,
                               "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
                               "org.apache.hadoop.io.LongWritable",
                               "org.apache.hadoop.io.Text", conf=config)

    # Process the warc files, result is an rdd with each element "key + '\t' + name + '\t' + FreebaseID"
    rdd = rdd.flatMap(record_to_html)
    rdd = rdd.flatMap(html_to_text)
    rdd = rdd.flatMap(named_entity_recognition)
    rdd = rdd.flatMap(generate_candidates)
    rdd = rdd.flatMap(output)

    #print(rdd.take(10))
    result = rdd.saveAsTextFile(OUTPUT)
Example #34
        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar \
        /path/to/examples/avro_inputformat.py <data_file> [reader_schema_file]
        Assumes you have Avro data stored in <data_file>. Reader schema can be optionally specified
        in [reader_schema_file].
        """
        exit(-1)

    path = sys.argv[1]
    sc = SparkContext(appName="AvroKeyInputFormat")

    conf = None
    if len(sys.argv) == 3:
        schema_rdd = sc.textFile(sys.argv[2], 1).collect()
        conf = {
            "avro.schema.input.key": reduce(lambda x, y: x + y, schema_rdd)
        }

    avro_rdd = sc.newAPIHadoopFile(
        path,
        "org.apache.avro.mapreduce.AvroKeyInputFormat",
        "org.apache.avro.mapred.AvroKey",
        "org.apache.hadoop.io.NullWritable",
        keyConverter=
        "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter",
        conf=conf)
    output = avro_rdd.map(lambda x: x[0]).collect()
    for k in output:
        print k
Example #35
        sc.addPyFile(dir_path + "/element.py")
        sc.addPyFile(dir_path + "/node.py")
        sc.addPyFile(dir_path + "/quadtree.py")
        sc.addPyFile(dir_path + "/query.py")
        sc.addPyFile(dir_path + "/util.py")
        sc.addPyFile(dir_path + "/voxel.py")
        sc.addPyFile(dir_path + "/executionengine.py")

        # sc.addFile(path.dirname(path.abspath(__file__)), True)
        query = Query.defineQuery(float(argv[3]))

        query_broadcast = sc.broadcast(query)

        text_rdd = sc.newAPIHadoopFile(
            argv[1],
            "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
            "org.apache.hadoop.io.LongWritable",
            "org.apache.hadoop.io.Text",
            conf={"textinputformat.record.delimiter": "\n\n"})

        tree_rdd = text_rdd.mapPartitions(build_tree)

        candidates_rdd = tree_rdd.flatMap(produce_candidates_color)

        relations_rdd = candidates_rdd.map(prepare_matching_pairs)

        filter_rdd = relations_rdd.map(filter_candidates)

        result = relations_rdd.collect()
        sc.stop()

        save_result(result, argv[2])
from pyspark import SparkContext, SparkConf, SQLContext

conf = SparkConf().setAppName("FileFormatReader")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

# conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

ff = sc.newAPIHadoopFile(
    "/sparkdemo/hdfs/sample.avro",
    "org.apache.avro.mapreduce.AvroKeyInputFormat",
    "org.apache.avro.mapred.AvroKey",
    "org.apache.hadoop.io.NullWritable",
    keyConverter=
    "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter")

# dataFrame = sqlContext.createDataFrame(ff)
# dataFrame.write.format("com.databricks.spark.avro").save("/sparkdemo/hdfs/")
print ff.collect()
Example #37
# * List the 5 most common elements for each order (word, bigram, trigram...). For each element, list the sequence of words and the number of occurrences.
# 
# Basically, you need to change all punctuation to a space and define as a word anything that sits between whitespace or at the beginning or end of a sentence and does not consist of whitespace (strings consisting only of whitespace should not be counted as words). The important thing here is to be simple, not to be 100% correct in terms of parsing English. Evaluation will be primarily based on identifying the 5 most frequent n-grams in the correct order for all values of n. Some slack will be allowed in the n-gram frequencies to allow flexibility in text processing.
# 
# This text is short enough to process on a single core using standard Python. However, you are required to solve it using RDDs for the whole process. At the very end you can use `.take(5)` to bring the results to the central node for printing.

# The code for reading the file and splitting it into sentences is shown below:

# In[1]:

#path = '../Data/Moby-Dick.txt'
path = '/data/Moby-Dick.txt'

textRDD = sc.newAPIHadoopFile(path,
                              'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
                              'org.apache.hadoop.io.LongWritable',
                              'org.apache.hadoop.io.Text',
                               conf={'textinputformat.record.delimiter': "\r\n\r\n"}) \
            .map(lambda x: x[1])

sentences=textRDD.flatMap(lambda x: x.split(". "))


# Note: For running the file on cluster, change the file path to `'/data/Moby-Dick.txt'`

# Let `freq_ngramRDD` be the final result RDD containing the n-grams sorted by their frequency in descending order. Use the following function to print your final output:

# In[2]:

def printOutput(n,freq_ngramRDD):
    top=freq_ngramRDD.take(5)
    print '\n============ %d most frequent %d-grams'%(5,n)
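
# A sketch (an assumption about one way to build freq_ngramRDD; the regex-based
# tokenization below is illustrative, not the graded solution) of counting the
# n-grams in each sentence and sorting them by frequency in descending order:
import re

def make_ngrams(sentence, n):
    # replace punctuation with spaces, lower-case, and split on whitespace
    words = re.sub(r'[^a-z0-9\s]', ' ', sentence.lower()).split()
    return [tuple(words[i:i + n]) for i in range(len(words) - n + 1)]

n = 2
freq_ngramRDD = sentences.flatMap(lambda s: make_ngrams(s, n)) \
                         .map(lambda gram: (gram, 1)) \
                         .reduceByKey(lambda a, b: a + b) \
                         .map(lambda (gram, count): (count, gram)) \
                         .sortByKey(False)

printOutput(n, freq_ngramRDD)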
Example #38
def runActionsHistoryQuery(schema_file, data_path, verbose=None, yarn=None):
    """
    Function to run a pyspark job to find the logs associated with the actionshistory.json file.
    To run, use a spec file with only a timerange, e.g. "spec":{ "timerange":[20180706,20180706]}}
    and then execute: myspark --spec=large_query.spec --logfail
    """

    if verbose:
        print("### schema: %s" % schema_file)
        print("### path  : %s" % data_path)
        print("### spec  : %s" % spec_file)
    time0 = time.time()
    # pyspark modules
    from pyspark import SparkContext
    from pyspark import SQLContext, StorageLevel

    # define spark context, it's main object which allow
    # to communicate with spark
    ctx = SparkContext(appName="AvroKeyInputFormat")
    logger = SparkLogger(ctx)
    if not verbose:
        logger.set_level('ERROR')
    if yarn:
        logger.info("YARN client mode enabled")

    # load FWJR schema
    rdd = ctx.textFile(schema_file, 1).collect()

    # define input avro schema, the rdd is a list of lines (sc.textFile similar to readlines)
    avsc = reduce(lambda x, y: x + y, rdd)  # merge all entries from rdd list
    schema = ''.join(avsc.split())  # remove spaces in avsc map
    conf = {"avro.schema.input.key": schema}

    # define newAPIHadoopFile parameters, java classes
    aformat = "org.apache.avro.mapreduce.AvroKeyInputFormat"
    akey = "org.apache.avro.mapred.AvroKey"
    awrite = "org.apache.hadoop.io.NullWritable"
    aconv = "org.apache.spark.examples.pythonconverters.AvroWrapperToJavaConverter"

    # load data from HDFS
    if isinstance(data_path, list):
        avro_rdd = ctx.union([
            ctx.newAPIHadoopFile(f, aformat, akey, awrite, aconv, conf=conf)
            for f in data_path
        ])
    else:
        avro_rdd = ctx.newAPIHadoopFile(data_path,
                                        aformat,
                                        akey,
                                        awrite,
                                        aconv,
                                        conf=conf)

    # load failing task
    rdd_failing_tasks = ctx.textFile(
        "hdfs:///cms/users/llayer/failing_tasks.csv")

    #
    # first step: filter the data for failing workflows and join with the actionshistory workflows
    #

    # filter the tasks - keep only failing tasks that have a log file
    def getFailing(row):
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobstate', '') != 'jobfailed':
            return False
        if rec.get('LFNArray', []) == []:
            return False
        return True

    # create key-value structure for join
    def avro_rdd_KV(row):
        rec = row[0]
        task = rec["task"]
        return (task, rec)

    # filter + join task names
    fail_workflows = avro_rdd.filter(getFailing) \
                             .map(avro_rdd_KV) \
                             .join(rdd_failing_tasks.map(lambda x: (x, x)))

    #
    # second step: filter the data for logcollect tasks and join with the previous result
    #

    # keep only logcollect jobs
    def filterLogCollect(row):
        rec = row[0]
        meta = rec.get('meta_data', {})
        if meta.get('jobtype', '').lower() != 'logcollect':
            return False
        return True

    # create KV structure using log archive as key and return important information
    def log_KV(row):
        rec = row[1][0]
        task = rec["task"]
        lfn_array = rec.get('LFNArray', [])
        meta = rec.get('meta_data', {})
        jobstate = meta.get('jobstate', '')
        steps = rec.get('steps', [])
        status = []
        site = []
        for step in steps:
            status.append(step.get('status', ''))
            site.append(step.get('site', ''))
        out_dict = {
            'task': task,
            'jobstate': jobstate,
            'status': status,
            'site': site
        }
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # create KV structure for log collect jobs using log archives as keys
    def logcoll_KV(row):
        rec = row[0]
        task = rec["task"]
        lfn_array = rec.get('LFNArray', [])
        logCollect = ""
        for lfn in lfn_array:
            if 'logcollect' in lfn.lower():
                logCollect = lfn
        meta = rec.get('meta_data', {})
        jobstate = meta.get('jobstate', '')
        out_dict = {
            'logcollect_task': task,
            'logcollect_jobstate': jobstate,
            'logcollect_lfn': logCollect
        }
        return [(lfn, out_dict) for lfn in lfn_array if 'logArch' in lfn]

    # log collect tasks
    logColl = avro_rdd.filter(lambda x: filterLogCollect(x)).flatMap(
        lambda x: logcoll_KV(x))

    # join the frames with log archives as keys
    result = fail_workflows.flatMap(lambda x: log_KV(x)).join(logColl)

    # write back the records to hdfs
    result.saveAsTextFile("hdfs:///cms/users/llayer/logs10.csv")

    ctx.stop()
    if verbose:
        logger.info("Elapsed time %s" % htime(time.time() - time0))

    return 0
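
# A hypothetical direct invocation of the query above; the schema file and the
# HDFS input path below are illustrative placeholders, not values taken from
# the original workflow.
if __name__ == '__main__':
    runActionsHistoryQuery('fwjr_prod.avsc',
                           'hdfs:///cms/wmarchive/avro/2018/07/06',
                           verbose=True)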
Beispiel #39
0
#
# * Convert all text to lower case and remove all punctuation (in the end, the text should contain only letters, numbers and spaces). A cleaning sketch is shown after the note below.
# * Count the occurrence of each word and of each 2-, 3-, 4- and 5-gram.
# * List the 5 most common elements for each order (word, bigram, trigram, ...). For each element, list the sequence of words and the number of occurrences.
#
# Basically, you need to change all punctuation to a space and define as a word anything that sits between whitespace (or at the beginning or end of a sentence) and does not consist of whitespace; strings consisting only of whitespace should not be counted as words. The important thing here is to be simple, not to be 100% correct in terms of parsing English. Evaluation will be based primarily on identifying the 5 most frequent n-grams, in the correct order, for all values of n. Some slack is allowed in the n-gram frequency values to permit flexibility in text processing.
#
# This text is short enough to process on a single core using standard Python. However, you are required to solve it using RDDs for the whole process. At the very end you can use `.take(5)` to bring the results to the driver for printing.

# The code for reading the file and splitting it into sentences is shown below:

# In[1]:

textRDD = sc.newAPIHadoopFile('/data/Moby-Dick.txt',
                              'org.apache.hadoop.mapreduce.lib.input.TextInputFormat',
                              'org.apache.hadoop.io.LongWritable',
                              'org.apache.hadoop.io.Text',
                               conf={'textinputformat.record.delimiter': "\r\n\r\n"}) \
            .map(lambda x: x[1])

sentences = textRDD.flatMap(lambda x: x.split(". "))

# #### Note:
# By default, the record delimiter in Spark's text input is "\n", so each value in textRDD would describe a single line of the file rather than a sentence, and sentences could be split across multiple lines. For this input file, a better approach is to delimit by paragraph, so that each value in the RDD is one paragraph (instead of one line); the paragraphs can then be split into sentences.
#
# This is done by setting the `textinputformat.record.delimiter` key to `"\r\n\r\n"` in the `conf` dictionary passed to `newAPIHadoopFile`.
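
# A minimal sketch of the cleaning step described above (illustrative names, not
# a reference solution): lowercase each sentence, replace punctuation with
# spaces, and tokenize so that only letters, numbers and spaces remain.

import re

def clean_and_tokenize(sentence):
    # map anything that is not a letter, a digit or whitespace to a space
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', sentence.lower())
    # split() without an argument drops empty strings, so whitespace-only
    # fragments never become words
    return cleaned.split()

wordsRDD = sentences.map(clean_and_tokenize)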

# Let `freq_ngramRDD` be the final result RDD containing the n-grams sorted by their frequency in descending order. Use the following function to print your final output:

# In[2]:

Beispiel #40
0
        if i[1] in d:
            d[i[1]].append(int(i[2]))
        else:
            d[i[1]] = [int(i[2])]
    v = []
    for i in word['reviews']:
        v.append((i[1], d[i[1]]))
    return v


conf = SparkConf().setMaster("local").setAppName("WordCount")
sc = SparkContext(conf=conf)
pp = pprint.PrettyPrinter(indent=1)
text_file = sc.newAPIHadoopFile(
    'amazon-meta.txt',
    "org.apache.hadoop.mapreduce.lib.input.TextInputFormat",
    "org.apache.hadoop.io.LongWritable",
    "org.apache.hadoop.io.Text",
    conf={"textinputformat.record.delimiter": '\r\n\r\n'})  # '\r\n\r\n'
amazon = text_file.map(mapWord)
#pp.pprint(amazon.take(5))
'''
a = amazon
    .filter(lambda x: "0312982178" == x['ASIN']) \  # returns a boolean (keeps only the matching product)
    .map(produtoBestWorst) \  # applied to every record, transforms the value
    .take(1)[0]  # take only the first element of the vector; otherwise it fails, because it would try to take the whole vector
# print(vet)
# best = vet[0:5]
# rate = vet[-5:]
# pp.pprint(best)
# pp.pprint(rate)
pp.pprint(a[0:5])
'''

from __future__ import print_function

import sys

from pyspark import SparkContext

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(
            """
        Usage: parquet_inputformat.py <data_file>

        Run with example jar:
        ./bin/spark-submit --driver-class-path /path/to/example/jar \\
                /path/to/examples/parquet_inputformat.py <data_file>
        Assumes you have Parquet data stored in <data_file>.
        """,
            file=sys.stderr,
        )
        exit(-1)

    path = sys.argv[1]
    sc = SparkContext(appName="ParquetInputFormat")

    parquet_rdd = sc.newAPIHadoopFile(
        path,
        "parquet.avro.AvroParquetInputFormat",
        "java.lang.Void",
        "org.apache.avro.generic.IndexedRecord",
        valueConverter="org.apache.spark.examples.pythonconverters.IndexedRecordToJavaConverter",
    )
    output = parquet_rdd.map(lambda x: x[1]).collect()
    for k in output:
        print(k)

    sc.stop()
from pyspark import SparkConf, SparkContext

conf = SparkConf()

conf.setAppName("spark_app_wordcount_extend")

sc = SparkContext(conf=conf)

pairs = sc.newAPIHadoopFile(
    "/user/yurun/spark/textfile/",
    "org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat",
    "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text")

words = pairs.map(lambda pair: pair[1]).flatMap(lambda line: line.split("\t"))

pairs = words.map(lambda word: (word, 1))

counts = pairs.reduceByKey(lambda a, b: a + b)

results = counts.collect()

for result in results:
    print result

sc.stop()