Example #1
def recom(matrix_file_name, user_file_name, output="re.out"):
    sc = SparkContext("local[8]", "Recommendation")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    matrix = sc.sequenceFile(matrix_file_name)
    user = sc.sequenceFile(user_file_name)

    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    user_tuples = user.flatMap(flat_user) \
                 .map(map_user) \
                 .sortByKey(keyfunc=lambda k: int(k))

    keys = user_tuples.keys().collect()

    matrix_tuples = matrix.flatMap(flat_matrix) \
                          .map(map_matrix) \
                          .filter(lambda x: x[0] in keys)
    global mt 
    mt = matrix_tuples.collectAsMap()

    recm = user_tuples.flatMap(flat_recom) \
                      .reduceByKey(reduce_recom) \
                      .filter(lambda x: x[0] not in keys) \
                      .sortBy(lambda (key, value): int(value))
 
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    recm.coalesce(1).saveAsTextFile(output)
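As a quick illustration of the flatMap / map / reduceByKey pattern described in the comments above, here is a minimal, self-contained sketch on an in-memory RDD (it assumes only a SparkContext named sc; the data is made up):

# Toy word count: flatMap splits lines into words, map emits (word, 1) pairs,
# and reduceByKey sums the counts for each word.
words = sc.parallelize(["a b a", "b c"]).flatMap(lambda line: line.split())
counts = words.map(lambda w: (w, 1)).reduceByKey(lambda x, y: x + y)
print(counts.collect())  # e.g. [('a', 2), ('c', 1), ('b', 2)] -- order is not guaranteed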
Example #2
def do_process(args):
    sc = SparkContext(appName="task")

    wikipedia_rdd = sc.sequenceFile(args.wikipedia)
    wikidata_rdd = sc.sequenceFile(args.wikidata)

    result = wikidata_rdd\
        .join(wikipedia_rdd)\
        .map(lambda x: map_by_qid(x, args.lang))\
        .groupByKey()\
        .mapValues(list)\
        .flatMap(map_filter_unique)

    result.repartition(1).saveAsTextFile(args.output)
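For reference, a toy sketch of the join / groupByKey / mapValues(list) pattern used above, on made-up data (only a SparkContext named sc is assumed; map_by_qid and map_filter_unique are not reproduced here):

# join keeps only keys present in both RDDs and pairs up their values;
# groupByKey + mapValues(list) then collects all joined values per key.
wikidata = sc.parallelize([("Q1", "d1"), ("Q2", "d2")])
wikipedia = sc.parallelize([("Q1", "p1"), ("Q1", "p2")])
joined = wikidata.join(wikipedia)              # [("Q1", ("d1", "p1")), ("Q1", ("d1", "p2"))]
grouped = joined.groupByKey().mapValues(list)  # [("Q1", [("d1", "p1"), ("d1", "p2")])]
print(grouped.collect())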
Example #3
def user_artist_matrix(file_name, output="user_artist_matrix.out"):
    sc = SparkContext("local[8]", "UserArtistMatrix")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)

    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    ua_matrix = file.flatMap(ua_flat_doc) \
                 .map(ua_map) \
                 .reduceByKey(ua_reduce) \
                 .sortByKey(keyfunc=lambda k: int(k))

    ua_matrix = ua_matrix.flatMap(ua_flat_vec)

    global avg_matrix
    avg_matrix = ua_matrix.reduceByKey(ua_reduce_vec) \
                         .map(ua_map_avg)
    
    avg_matrix = avg_matrix.collectAsMap()

    co_matrix = ua_matrix.map(ua_map_cmp) \
                         .reduceByKey(ua_reduce_cmp) \
                         .map(ua_map_cmp_final)

    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    co_matrix.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    parser.add_argument('-o','--output', help="UTF-8 output file on cluster.", required=False)
    parser.add_argument('-p','--printToLog', help="Print results to log.", required=False, action='store_true')
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagCounts = data.values().flatMap(getTokens).countByValue()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, 'wb', 'utf-8') as f:
            for k in sorted(tagCounts):
                f.write(k + " " + str(tagCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    if args.printToLog:
        for k in sorted(tagCounts):
            print json.dumps(k), tagCounts[k]
    print "========================================"
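The getTokens function used above is not shown; the following is only a hypothetical sketch of how such a callback might update the two accumulators while emitting tokens (the JSON layout and the "tokens" field name are assumptions):

import json

def getTokens(value):
    """Hypothetical helper: parse one JSON record and emit its tokens,
    counting parseable and unparseable records via the global accumulators."""
    global goodJsonRecords, badJsonRecords
    try:
        record = json.loads(value)
        goodJsonRecords += 1
        return record.get("tokens", [])   # assumed field name
    except ValueError:
        badJsonRecords += 1
        return []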
Example #5
def user_artist_matrix(file_name, output="user_artist_matrix.out"):
    sc = SparkContext("local[8]", "UserArtistMatrix")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)
    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    ua_matrix = file.flatMap(ua_flat_doc) \
                 .map(ua_map) \
                 .reduceByKey(ua_reduce) \
                 .sortByKey(keyfunc=lambda k: int(k))

    ua_matrix = ua_matrix.flatMap(ua_flat_vec)

    global avg_matrix
    avg_matrix = ua_matrix.reduceByKey(ua_reduce_vec) \
                         .map(ua_map_avg)

    avg_matrix = avg_matrix.collectAsMap()

    co_matrix = ua_matrix.map(ua_map_cmp) \
                         .reduceByKey(ua_reduce_cmp) \
                         .map(ua_map_cmp_final)
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    co_matrix.coalesce(1).saveAsTextFile(output)
Example #6
def main(argv):
    inputSequenceDir = ""
    outputSequenceDir = ""
    try:
        opts, args = getopt.getopt(argv, "i:o:")
    except getopt.GetoptError:
        sys.exit(2)
    for (opt, arg) in opts:
        if opt == '-i':
            inputSequenceDir = arg
        elif opt == '-o':
            outputSequenceDir = arg
    sc = SparkContext(appName="Fix XML App")
    datarawRDD = sc.sequenceFile(inputSequenceDir)
    cleanedRDD = datarawRDD.map(lambda x: trim(x))

    outputFormatClassName = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
    conf1 = {
        "mapreduce.output.fileoutputformat.compress": "true",
        "mapreduce.output.fileoutputformat.compress.codec":
        "org.apache.hadoop.io.compress.DefaultCodec",
        "mapreduce.output.fileoutputformat.compress.type": "RECORD"
    }
    cleanedRDD.saveAsNewAPIHadoopFile(outputSequenceDir, outputFormatClassName,
                                      "org.apache.hadoop.io.Text",
                                      "org.apache.hadoop.io.Text", None, None,
                                      conf1)

    print "OK Bye Bye"
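If the written output needs to be verified, the compressed SequenceFile can be read back the same way the input was read; a minimal sketch, assuming the job above has already run:

# Illustrative only: re-open the SequenceFile written above and peek at a few records.
checkRDD = sc.sequenceFile(outputSequenceDir)
print(checkRDD.take(3))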
Example #7
def wordcount(file_name, output="spark-wc-out-wordcount"):
    """ Reads in a sequence file FILE_NAME to be manipulated """

    # We'll be using this to call Spark built-in functions!
    sc = SparkContext("local[8]", "WordCount")

    # This gets us an RDD object, on which we can apply the Spark
    # built-in functions! In particular, file is an RDD object
    # which contains all the documents specified by the sequence file
    # passed into this function.

    # What do I mean by "contains all the documents"? If you
    # import this file from the interpreter (pyspark), run
    # this line of code, and then call file.take(n), you will
    # get the text of the first n documents specified by the sequence file input.
    file = sc.sequenceFile(file_name)
    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items. IMPORTANT: It then applies that function to every one of its elements,
      and condenses them into a single RDD. In our case, that means it will apply
      our flat_map function to every DOCUMENT in file, and then combine the results
      into a SINGLE RDD.
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.coalesce(1).saveAsTextFile(output)
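The helpers flat_map, map and reduce are defined elsewhere; the following is only a hypothetical sketch of what they might look like for a plain word count over the (key, document) pairs returned by sequenceFile (note that the last two names shadow the Python built-ins map and reduce):

def flat_map(document):
    """Hypothetical: split one (key, text) pair into its words."""
    key, text = document
    return text.split()

def map(word):
    """Hypothetical: emit a (word, 1) pair for each word."""
    return (word, 1)

def reduce(a, b):
    """Hypothetical: sum two partial counts for the same word."""
    return a + b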
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords, noPublisherRecords, noPublisherNameRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    noPublisherRecords = sc.accumulator(0)
    noPublisherNameRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    keyCounts = data.values().flatMap(getKeys).countByValue()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "noPublisherRecords = %d" % noPublisherRecords.value
    print "noPublisherNameRecords = %d" % noPublisherNameRecords.value
    for k in sorted(keyCounts):
        print k, keyCounts[k]
    print "========================================"

    sc.stop()
Example #9
def wordCount(file_name, output="spark-wc-out-nonarticleCount"):
    sc = SparkContext("local[8]",
                      "WordCount",
                      conf=SparkConf().set("spark.hadoop.validateOutputSpecs",
                                           "false"))
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)
    """
    - Explanation:
    -
    - `flatMap` takes in a function that will take one input and outputs 0 or
    -   more items. All returned results are combined into a single list of
    -   items that future functions are run on. We use this function to
    -   transform our document into a list of words.
    -
    - `map` takes in a function that takes in one item, performs an action on it,
    -   and returns the result. When called on a list, it applies the function to
    -   each item in the list. We use this function to transform our words into
    -   `(key, value)` pairs, with the key being the word and the value being
    -   the number of times it occurs.
    -
    - `reduceByKey` groups a list of `(key, value)` pairs by keys and runs a
    -   function on each key which takes two values and returns a single value
    -   (i.e. "reducing" the two inputs into one). It is called
    -   iteratively on each key until only a single value remains for that key.
    -   We use this function to sum the number of times a word occurs.
    """
    counts = file.flatMap(splitDocument) \
                 .map(toPairs) \
                 .reduceByKey(sumCounts) \
                 .sortByKey()
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.coalesce(1).saveAsTextFile(output)
Example #10
def index(file_name, output="spark-wc-out-index"):
    sc = SparkContext("local[8]", "Index")
    file = sc.sequenceFile(file_name)

    indices = file.flatMap(flat_map) \
                  .reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    global goodJsonRecords, badJsonRecords
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagTokenCounts = data.values().flatMap(getTokens).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "========================================"

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken in tagTokenCounts.keys():
        (tag, tokenValue) = tagToken.split(":", 1)
        count = tagTokenCounts[tagToken]
        if tag not in tagTokenLists:
            tagTokenLists[tag] = []
        tagTokenLists[tag].append(Token(tokenValue, count))

    # Process each tag separately:
    for tag in tagTokenLists.keys():
        tokenList = tagTokenLists[tag]

        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tokenList, key=attrgetter("value"))
        sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True)

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with cumulative counts, fraction of
        # total (cumulative distribution function), and index
        # (enumerate the tokens per tag, starting with 1).
        print "========================================"
        tokenIndex = 0
        for token in sortedTokenList:
            tokenIndex += 1
            fractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:10d} {4:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                 token.count, token.cumulativeCount, fractionOfTotal))
        print "========================================"
Example #12
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    sc = SparkContext("local[8]", "DocWordCount")
    file = sc.sequenceFile(file_name)

    counts = file.flatMap(flatMapFunc) \
                 .distinct() \
                 .map(mapFunc) \
                 .reduceByKey(reduceFunc) \
                 .sortByKey()

    counts.coalesce(1).saveAsTextFile(output)
Example #13
def docwordcount(file_name, output="wc-out-docwordcount"):
    sc = SparkContext("local[8]", "DocWordCount")
    file = sc.sequenceFile(file_name)
    """ Your code here. """
    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)

    counts.coalesce(1).saveAsTextFile(output)
Example #14
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    sc = SparkContext("local[8]", "DocWordCount")
    file = sc.sequenceFile(file_name)

    counts = file.flatMap(flatMapFunc) \
                 .map(mapFunc) \
                 .reduceByKey(reduceFunc)

    counts.coalesce(1).saveAsTextFile(output)
Example #15
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    sc = SparkContext("local[8]", "DocWordCount")
    file = sc.sequenceFile(file_name)

    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)

    counts.coalesce(1).saveAsTextFile(output)
Example #16
def index(file_name, output="spark-wc-out-index"):
    sc = SparkContext("local[8]", "Index")
    file = sc.sequenceFile(file_name)

    indices = file.flatMap(flat_map) \
                  .map(map) \
                  .reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)
def createIndices(file_name, output="spark-wc-out-createIndices"):
    sc = SparkContext("local[8]",
                      "CreateIndices",
                      conf=SparkConf().set("spark.hadoop.validateOutputSpecs",
                                           "false"))
    file = sc.sequenceFile(file_name)

    indices = file.flatMap(flatMapFunc) \
                 .reduceByKey(reduceFunc) \
                 .sortByKey()

    indices.coalesce(1).saveAsTextFile(output)
Example #18
def index(file_name, output="spark-wc-out-index"):
    sc = SparkContext("local[8]", "Index")
    file = sc.sequenceFile(file_name)

    # Same message as last exercise: Feel free to modify
    # this structure so it suits your functions better (and
    # so that it satisfies the requirements).

    indices = file.flatMap(flat_map) \
                  .map(map) \
                  .reduceByKey(reduce)

    indices.coalesce(1).saveAsTextFile(output)
def mostPopular(file_name, output="spark-wc-out-mostPopular"):
    sc = SparkContext("local[8]",
                      "WordCount",
                      conf=SparkConf().set("spark.hadoop.validateOutputSpecs",
                                           "false"))
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)

    counts = file.flatMap(splitDocument) \
                 .map(toPairs) \
                 .reduceByKey(sumCounts)
    # TODO: add appropriate extra transformations here
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.coalesce(1).saveAsTextFile(output)
def main(argv=None):
    """this is called if run from command line"""

    parser = argparse.ArgumentParser()
    parser.add_argument("-e", "--excludeTags", help="Comma-separated list of tags to exclude.", required=False)
    parser.add_argument("--includeTags", help="Comma-separated list of tags to include.", required=False)
    parser.add_argument("-i", "--input", help="Seq or tuple input file.", required=True)
    parser.add_argument("--inputTuples", help="The input file is in tuple format.", required=False, action="store_true")
    parser.add_argument("-o", "--output", help="UTF-8 output file on cluster.", required=False)
    parser.add_argument("-p", "--printToLog", help="Print results to log.", required=False, action="store_true")
    args = parser.parse_args()

    if args.excludeTags and args.includeTags:
        print "Pick either --excludeTags or --includeTags, not both."
        return 1

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    excludedTagCount = sc.accumulator(0)
    includedTagCount = sc.accumulator(0)
    tokenCount = sc.accumulator(0)

    if args.inputTuples:
        data = sc.textFile(args.input).map(lambda x: eval(x))
    else:
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagPhraseCounts = data.values().flatMap(getPhrasesMaker(args.includeTags, args.excludeTags)).countByValue()
    sc.stop()

    # So far, this code isn't useful.  The output file is written by the
    # master node into an isolated folder, and I don't know of a way to
    # retrieve it.
    if args.output != None:
        with codecs.open(args.output, "wb", "utf-8") as f:
            for k in sorted(tagPhraseCounts):
                f.write(k + " " + str(tagPhraseCounts[k]) + "\n")

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "excludedTagCount = %d" % excludedTagCount.value
    print "includedTagCount = %d" % includedTagCount.value
    print "tokenCount = %d" % tokenCount.value
    if args.printToLog:
        for k in sorted(tagPhraseCounts):
            print json.dumps(k), tagPhraseCounts[k]
    print "========================================"
Example #21
def wordcount(file_name, output="spark-wc-out-wordcount"):
    sc = SparkContext("local[8]", "WordCount")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)
    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.coalesce(1).saveAsTextFile(output)
Example #22
def perWordDocumentCount(file_name, output="spark-wc-out-perWordDocumentCount"):
    sc = SparkContext("local[8]", "PerWordDocumentCount", conf=SparkConf().set("spark.hadoop.validateOutputSpecs", "false"))
    file = sc.sequenceFile(file_name)

    """
    You will need to add, remove, and/or modify function calls here.
    The function `distinct()` may be helpful...
    Be sure that your output ends up in alphabetical order.
    """
    counts = file.flatMap(flatMapFunc) \
                 .distinct() \
                 .map(mapFunc) \
                 .reduceByKey(reduceFunc) \
                 .sortByKey()

    counts.coalesce(1).saveAsTextFile(output)
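A toy illustration of why distinct() matters for a per-word document count: it collapses repeated (word, document) pairs so each document contributes at most once per word (only a SparkContext named sc is assumed; the data is made up):

pairs = sc.parallelize([("spark", "doc1"), ("spark", "doc1"),
                        ("spark", "doc2"), ("rdd", "doc1")])
doc_counts = pairs.distinct() \
                  .map(lambda wd: (wd[0], 1)) \
                  .reduceByKey(lambda x, y: x + y) \
                  .sortByKey()
print(doc_counts.collect())  # [('rdd', 1), ('spark', 2)]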
def docwordcount(file_name, output="spark-wc-out-docwordcount"):
    # These two lines of code are identical from wordcount.py
    sc = SparkContext("local[8]", "DocWordCount")
    file = sc.sequenceFile(file_name)

    # This is the given framework for the function. We urge
    # you not to change it TOO much.
    # You CAN complete the exercise by MOSTLY modifying the
    # functions above, but you ARE free to change this.
    # In particular, you'll probably want to add a
    # transformation at the very end to sort stuff...
    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)

    counts.coalesce(1).saveAsTextFile(output)
Example #24
def main(argv=None):
    """this is called if run from command line"""

    parser = argparse.ArgumentParser()
    parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    recordCount = data.count()

    print "========================================"
    print recordCount
    print "========================================"

    sc.stop()
Example #25
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-i','--input', help="Required Seq input file on cluster.", required=True)
    args = parser.parse_args()

    sc = SparkContext()
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    dataWithGoodJson = data.filter(goodJsonFilter)
    recordCount = dataWithGoodJson.count()

    print "========================================"
    print recordCount
    print "========================================"

    sc.stop()
def artist_user_matrix(file_name, output="artist_user_matrix.out"):
    sc = SparkContext("local[8]", "UserArtistMatrix")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)
    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    counts = file.flatMap(flat_Map) \
                 .map(map) \
                 .reduceByKey(reduce) \
                 .sortByKey(keyfunc=lambda k: int(k))
    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.map(lambda x: x[0] + ' ' + x[1]).coalesce(1).saveAsTextFile(output)
Example #27
def wordcount(file_name, output="spark-wc-out-wordcount"):
    sc = SparkContext("local[8]", "WordCount")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)

    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    counts = file.flatMap(flat_map) \
                 .map(map) \
                 .reduceByKey(reduce)

    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.coalesce(1).saveAsTextFile(output)
def artist_user_matrix(file_name, output="artist_user_matrix.out"):
    sc = SparkContext("local[8]", "UserArtistMatrix")
    """ Reads in a sequence file FILE_NAME to be manipulated """
    file = sc.sequenceFile(file_name)

    """
    - flatMap takes in a function that will take one input and outputs 0 or more
      items
    - map takes in a function that will take one input and outputs a single item
    - reduceByKey takes in a function, groups the dataset by keys and aggregates
      the values of each key
    """
    counts = file.flatMap(flat_Map) \
                 .map(map) \
                 .reduceByKey(reduce) \
                 .sortByKey(keyfunc=lambda k: int(k))

    """ Takes the dataset stored in counts and writes everything out to OUTPUT """
    counts.map(lambda x: x[0] + ' ' + x[1]).coalesce(1).saveAsTextFile(output)
Example #29
def main(argv=None):
    """this is called if run from command line"""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-c", "--count", help="Optionally report a count of records extracted.", required=False, action="store_true"
    )
    parser.add_argument("-i", "--input", help="Required Seq input file on cluster.", required=True)
    parser.add_argument("-k", "--key", help="Required extraction key.", required=True)
    parser.add_argument(
        "-s", "--sample", type=int, default=0, help="Optionally print a sample of results.", required=False
    )
    args = parser.parse_args()

    extractionKey = args.key

    def extractValues(value):
        try:
            d = json.loads(value)
            if extractionKey in d:
                return iter([d[extractionKey]])
            else:
                return iter([])
        except:
            return iter([])

    sc = SparkContext()
    data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    extractedValuePairs = data.flatMapValues(extractValues)

    if args.count:
        recordCount = extractedValuePairs.count()
        print "========================================"
        print recordCount
        print "========================================"

    if args.sample > 0:
        sampleSet = extractedValuePairs.take(args.sample)
        print "========================================"
        for record in sampleSet:
            print record
        print "========================================"
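A toy sketch of the flatMapValues behaviour relied on above: a callback returning an empty iterable drops the record, while a one-element iterable keeps it with the value replaced (only a SparkContext named sc is assumed; the data is made up):

import json

def extract_name(value):
    d = json.loads(value)
    return [d["name"]] if "name" in d else []   # empty list drops the record

raw = sc.parallelize([("k1", '{"name": "alice"}'), ("k2", '{"age": 3}')])
print(raw.flatMapValues(extract_name).collect())  # -> [('k1', u'alice')]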
Example #30
def main(argv):
    inputSequenceDir = ""
    outputSequenceDir = ""
    try:
        opts, args = getopt.getopt(argv,"i:o:")
    except getopt.GetoptError:
        sys.exit(2)
    for (opt,arg) in opts :
        if opt == '-i':
            inputSequenceDir = arg
        elif opt == '-o' :
            outputSequenceDir = arg
    sc = SparkContext(appName="Fix XML App")
    datarawRDD = sc.sequenceFile(inputSequenceDir)
    cleanedRDD = datarawRDD.map(lambda x : trim(x))
    
    outputFormatClassName = "org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat"
    conf1 = {
        "mapreduce.output.fileoutputformat.compress": "true",
        "mapreduce.output.fileoutputformat.compress.codec":
        "org.apache.hadoop.io.compress.DefaultCodec",
        "mapreduce.output.fileoutputformat.compress.type": "RECORD"
    }
    cleanedRDD.saveAsNewAPIHadoopFile(outputSequenceDir, outputFormatClassName,
                                      "org.apache.hadoop.io.Text",
                                      "org.apache.hadoop.io.Text", None, None,
                                      conf1)

    print "OK Bye Bye"
Example #31
def labelData(input):
	values = input[0]
	genre = input[1]
	if genre in values:
		label = 1
		values.remove(genre)
	else:
		label = 0	
	values = [x if x < genre else x-1 for x in values] #shift the attributes by one index
	ones = []
	ones = [1] * len(values)
	return LabeledPoint(label, SparseVector(column_num-1, values, ones))


#set hdfs path
data = sc.sequenceFile("hdfs://nameservice1/user/geap/warehouse/camus/etl/rat/hourly/2015/06/01/00/*")
data = sc.sequenceFile("hdfs://localhost:9000/test/*")

parsedData = data.filter(filterPoint).map(parsePoint).reduceByKey(lambda x, y : x + y).map(lambda (k, v) : list(set(v)))
parsedData.cache()

#Calculate total number of columns in the dataset
column_num = parsedData.flatMap(lambda _ : _ ).distinct().count()
column_id = parsedData.flatMap(lambda _ : _ ).distinct().collect()
column_id.sort()

#choose a genre to test, default is 100th column as target variable
genre = 1

sortedData = parsedData.map(sortPoint)
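For reference, a small illustration of the SparseVector and LabeledPoint constructors used in labelData above (it assumes pyspark.mllib is available; the numbers are made up):

from pyspark.mllib.linalg import SparseVector
from pyspark.mllib.regression import LabeledPoint

# SparseVector(size, indices, values): a length-5 vector with 1.0 at positions 0 and 3.
sv = SparseVector(5, [0, 3], [1, 1])
lp = LabeledPoint(1, sv)   # label 1 with that sparse feature vector
print(lp)                  # roughly: (1.0,(5,[0,3],[1.0,1.0]))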
#!/usr/bin/env python

def clean_geonames(item):
    if "geonames_address" in item:
        addresses = item["geonames_address"]
        result = []
        for addr in addresses:
            geo = {}
            geo["lat"] = addr["geo"]["lat"]
            geo["lon"] = addr["geo"]["lon"]
            addr["geo"] = geo
            if "hasAlternateName" in addr:
                del addr["hasAlternateName"]
            result.append(addr)
        item["geonames_address"] = result
    return item

if __name__ == "__main__":
    from pyspark import SparkContext
    import json
    import sys

    sc = SparkContext(appName="LSH")
    inputFilename = sys.argv[1]
    outputFilename = sys.argv[2]

    rdd = sc.sequenceFile(inputFilename)
    json_rdd = rdd.mapValues(lambda x: json.loads(x))
    revised_rdd = json_rdd.mapValues(lambda x: clean_geonames(x))
    revised_rdd.mapValues(lambda x: json.dumps(x)).saveAsSequenceFile(outputFilename)
Example #33
sqlContext.setConf('spark.sql.parquet.compression.codec','snappy')
orders = sqlContext.read.load('/user/saurinchauhan/anilagrawal/cloudera/problem5/parquet-snappy-compress','parquet')

# save the data to hdfs using no compression as parquet file at /user/cloudera/problem5/parquet-no-compress
sqlContext.setConf('spark.sql.parquet.compression.codec','uncompressed')
orders.write.save('/user/saurinchauhan/anilagrawal/cloudera/problem5/parquet-no-compress','parquet')

# save the data to hdfs using snappy compression as avro file at /user/cloudera/problem5/avro-snappy
sqlContext.setConf('spark.sql.avro.compression.codec','snappy')
orders.write.save('/user/saurinchauhan/anilagrawal/cloudera/problem5/avro-snappy','com.databricks.spark.avro')

orders = sqlContext.read.load('/user/saurinchauhan/anilagrawal/cloudera/problem5/avro-snappy','com.databricks.spark.avro')

# save the data to hdfs using no compression as json file at /user/cloudera/problem5/json-no-compress
orders.toJSON().saveAsTextFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/json-no-compress')

# save the data to hdfs using gzip compression as json file at /user/cloudera/problem5/json-gzip
orders.toJSON().saveAsTextFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/json-gzip','org.apache.hadoop.io.compress.GzipCodec')

orders = sqlContext.read.load('/user/saurinchauhan/anilagrawal/cloudera/problem5/json-gzip','json')

# save the data to as comma separated text using gzip compression at   /user/cloudera/problem5/csv-gzip
orders.rdd.map(lambda line: (str(line[0])+","+str(line[1])+","+str(line[2])+","+line[3])).saveAsTextFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/csv-gzip','org.apache.hadoop.io.compress.GzipCodec')

orders = sc.sequenceFile('/user/saurinchauhan/anilagrawal/cloudera/problem5/sequence','org.apache.hadoop.io.Text','org.apache.hadoop.io.Text')

ordersDF = orders.map(lambda line: tuple(line[1].split(','))).toDF()

sqlContext.setConf('spark.sql.parquet.compression.codec','uncompressed')
ordersDF.write.save('/user/saurinchauhan/anilagrawal/cloudera/problem5/orc','orc')
Example #34
    if exp:
        status = exp.groupdict()["status"]
        request = exp.groupdict()["request"]
        if request:
            requestFields = request.split()
            if (len(requestFields) > 1):
                # converted bytearray to string
                return (str(requestFields[1]), str(status))


if __name__ == "__main__":

    sc = SparkContext(appName="SparkHdfsLogAggregator")
    sc.setLogLevel("ERROR")

    logs = sc.sequenceFile('/user/maria_dev/logs/19-07-22/2020/00/')

    lines = logs.map(lambda x: x[1])
    urls_status = lines.map(extractURLRequestAndStatus)

    # Count how many times each (URL, status) pair occurs
    urlStatusMapper = urls_status.map(lambda x: (x, 1))
    urlStatusReducer = urlStatusMapper.reduceByKey(lambda x, y: x + y)

    '''
    Sort and print the results in
    descending order of count
    '''
    sortedResults = urlStatusReducer.sortBy(lambda x: -x[1])
    print sortedResults.collect()
Example #35
d_out = to_hdfs_url(args.output)
min_df = int(args.min_document_frequency)

# remove any previous output (is there a way to do it from spark?)
#system("hdfs dfs -rm -r %s" % d_out)

# import spark-related stuff
from pyspark import SparkContext
from pyspark.mllib.feature import HashingTF, IDF

# init the spark context
if "sc" not in globals():
    sc = SparkContext( appName="TF-IDF")

# Load documents (one per line).
documents = sc.sequenceFile(docs_dir)

#keep only the content
documents = documents.map(lambda (fname, content): content.split(" "))

hashingTF = HashingTF()
tf = hashingTF.transform(documents)


# IDF
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

#save
tfidf.saveAsTextFile(d_out)
Example #36
        #create hashes and reduce by key
        dict = document_terms.flatMap(lambda terms: [(t, self.indexOf(t)) for t in terms]).reduceByKey(lambda a, b: a)
        return dict

def filter_and_split(text):
    delims = u"\r\n\t.,;:'\"()?!$#-0123456789/*%<>@[]+`~_=&^ "
    translate_table = dict((ord(char), u" ") for char in delims)
    return text.lower().strip().translate(translate_table).split(" ")


# init the spark context
if "sc" not in globals():
    sc = SparkContext( appName="TF-IDF")

# Load documents (one per line).
documents = sc.sequenceFile(docs_dir).map(lambda (fname, content): filter_and_split(content))
documents.cache()

# # keep only the content (replace, lower, split, etc)
# documents = documents.

hashingTF = myHashingTF()


# create the tf vectors
tf = hashingTF.transform(documents)
# create the idf vectors
idf = IDF().fit(tf)
tfidf = idf.transform(tf)
#save
tfidf.saveAsTextFile(d_out)
Example #37
    #outfile = '/user/ychan/data/out/karma/part-00000'
    #infile_type = 'sequence'

    infile = '/user/ychan/data/blog/blog.json'
    outfile = '/user/ychan/data/out/blog/blog.json.extractions'
    infile_type = 'text'

    #infile = '/user/ychan/data/twitter/tweet.json'
    #outfile = '/user/ychan/data/out/twitter/tweet.json.extractions'
    #infile_type = 'text'

    if infile_type == 'text':
        rdd = sc.textFile(infile).map(lambda x: json.loads(x)).map(
            lambda x: (x["url"], x))
    else:
        rdd = sc.sequenceFile(infile).mapValues(lambda x: json.loads(x))

    print('rdd count %d' % (rdd.count()))

    start_time = time.time()
    feature_rdd = rdd.mapValues(lambda x: decoder.line_to_predictions(
        ner_fea, Decoder(params), x, attribute_name))
    #for fv in feature_rdd.take(3):
    #    print(fv)
    #end_time = time.time()
    #print("****************** Elapsed time to transform RDD was %g seconds" % (end_time - start_time))

    #start_time = time.time()
    feature_rdd.mapValues(lambda x: json.dumps(x)).saveAsSequenceFile(outfile)
    end_time = time.time()
    print(
                    agg_feature_name = agg_feature["name"]
                else:
                    agg_feature_name = agg_feature[0]["name"]
                fc[agg_feature_name] = agg_feature
    return cluster

if __name__ == "__main__":
    """
        Usage: featureReducer.py [input] [output] [reducer:feature_name]...
    """
    sc = SparkContext(appName="DigFeatureReducer")

    inputFilename = sys.argv[1]
    outputFilename = sys.argv[2]

    data = sc.sequenceFile(inputFilename, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    json_values = data.mapValues(lambda row: json.loads(row))

    child_name = sys.argv[3]
    aggregations = sys.argv[4:]
    # result = json_values
    # for aggregation in aggregations:
    #     result = rdd_aggregate(result, aggregation)

    if len(aggregations) > 0:
        result = json_values.mapValues(lambda cluster: aggregate_features(cluster, child_name, aggregations))

    result.mapValues(lambda cluster: threadUtil.get_sorted_cluster(cluster)).saveAsSequenceFile(outputFilename)


Example #39
#!/usr/bin/env python

from pyspark import SparkContext
import sys



if __name__ == "__main__":
    def extractNumLines(line):
        global lines
        lines += 1
        # print lines, ":", line, "\n\n"
        return line

    sc = SparkContext(appName="CountKeys")
    file = sc.sequenceFile( sys.argv[1])
    rdd = file.reduceByKey(lambda x, y:  x)

    lines = sc.accumulator( 0)
    num_lines = rdd.map(extractNumLines)
    num_lines.collect()
    print "Num lines: %d" % lines.value
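A hedged alternative sketch for the same distinct-key count, without the accumulator and the collect() round trip (same input assumptions as above):

# Illustrative only: count distinct keys directly; same value as the count printed above.
num_keys = sc.sequenceFile(sys.argv[1]).keys().distinct().count()
print("Distinct keys: %d" % num_keys)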
Example #40
def main(args):
    path_prefix = "hdfs:///amplab/sequence"

    conf = None
    sc = None
    sql = None
    for size in args.sizes:
        timings = {}

        # uservisits
        # rankings
        # crawl

        visitors_table_name = "visitors_%s" % size
        rankings_table_name = "rankings_%s" % size
        crawl_table_name    = "crawl_%s"    % size

        if conf is None: conf = SparkConf()
        if sc is None: sc = SparkContext(conf=conf)
        if sql is None: sql = SQLContext(sc)

        tic()
        sql.createDataFrame(
            sc.sequenceFile(path.join(path_prefix, size, "uservisits"))
                .map(lambda x: tuple(x[1].split(",")))
                .map(lambda x: x[:4] + parse_agent(x[4]) + x[5:])
                .map(lambda x: Row(source_ip       =      (x[ 0]),
                                   url             =      (x[ 1]),
                                   date            = date (x[ 2]),
                                   revenue         = float(x[ 3]),
                                   os_name         =      (x[ 4]),
                                   os_version      =      (x[ 5]),
                                   browser_name    =      (x[ 6]),
                                   browser_version =      (x[ 7]),
                                   country         =      (x[ 8]),
                                   language        =      (x[ 9]),
                                   search          =      (x[10]),
                                   duration        = int  (x[11]),))
        ).registerTempTable(visitors_table_name)
        timings["open-and-register"] = toc()

        tic()
        os_results = sql.sql("""
            SELECT   os_name       AS os,
                     COUNT(FALSE)  AS total_visitors,
                     SUM(revenue)  AS total_revenue,
                     AVG(revenue)  AS average_revenue,
                     SUM(duration) AS total_duration,
                     AVG(duration) AS average_duration

            FROM     {}

            GROUP BY os_name
        """.format(visitors_table_name)).toPandas()
        timings["q-stats-by-os"] = toc()
        os_results.index = os_results.pop("os")

        tic()
        browser_results = sql.sql("""
            SELECT   browser_name  AS browser,
                     COUNT(FALSE)  AS total_visitors,
                     SUM(revenue)  AS total_revenue,
                     AVG(revenue)  AS average_revenue,
                     SUM(duration) AS total_duration,
                     AVG(duration) AS average_duration

            FROM     {}

            GROUP BY browser_name
        """.format(visitors_table_name)).toPandas()
        timings["q-stats-by-browser"] = toc()
        browser_results.index = browser_results.pop("browser")

        top_dir = path.join("results", size, "spark", str(args.nodes))
        mkdir_p(top_dir)
        with open(path.join(top_dir, "timings"), "w") as f:
            for entry in timings.items():
                f.write("%s, %.18e\n" % entry)
            f.flush()

        browser_results.to_pickle(path.join(top_dir, "browser"))
        os_results.to_pickle(path.join(top_dir, "os"))

    return 0
Example #41
    inputPath = sys.argv[1]
    year_to_search = sys.argv[2]
    outputFile = sys.argv[3]

    # start SparkContext
    conf = SparkConf().setAppName('popular_4gram')
    sc = SparkContext(conf=conf)

    log4jLogger = sc._jvm.org.apache.log4j
    LOGGER = log4jLogger.LogManager.getLogger(__name__)
    LOGGER.info('***** PYSPARK SCRIPT LOGGER INITIALIZED')

    # read input
    LOGGER.info('***** READING LZO HADOOP FILE')
    # LZO indexed by row i.e. <1:ngram data, 2: ngram data, 3: ngram data>
    files = sc.sequenceFile(inputPath, "org.apache.hadoop.io.LongWritable", "org.apache.hadoop.io.Text")
    lzoRDD = files.map(lambda x: x[1])

    # # map to 3-tuples of (ngram, year, count)
    LOGGER.info('***** SPLITTING LZO INPUT')
    allEntries = lzoRDD.map(lambda x: re.split(r'\t+',x))

    # LOGGER.info('***** GENERATING 3-TUPLES')
    # # 4gram: x[0]:ngram - x[1]:year - x[2]:occurrences
    # formattedEntries = allEntries.map(lambda x: (x[0], x[1], x[2]))
    # # formattedEntries - "word word word word", 1905, 54

    LOGGER.info('***** FILTERING ENTRIES TO INPUTTED YEAR')
    filteredEntries = allEntries.filter(lambda x: x[1] == year_to_search)

    LOGGER.info('***** SORT BY OCCURRENCES')
    if ("-ner" in processarguments):
        doNer = True
        outNer = True
    else:
        doNer = False
        outNer = False
    
# #    startTime = time.clock()
            
    # Initialize Spark
    conf = SparkConf()
    spark = SparkContext(appName="estnltk_seqfile_analyser", conf=conf)

    logger = logging.getLogger('pyspark') # logging does not work, pickling err
    # Open input files
    input_files = spark.sequenceFile(sys.argv[1])    
    

    # Perform all processes in one map
    keytextrdd = input_files.map(lambda keyval : processSequencePair(keyval[0], keyval[1]))
    keytextrdd.coalesce(1).saveAsTextFile(sys.argv[2])
    '''

    # Clean the input files (html -> only text content). Justext returns paragraphs.
    if (isPlaintextInput == True):
        keytextpairs = input_files.map(lambda line : (line[0], estnltk.Text(line[1])))
    else:
        keytextpairs = input_files.map(lambda line : (line[0], parseHtmlToText(line[1])))

# #    emptytextpairs = keytextpairs.filter(lambda keytext : len(keytext[1].words)==0)
    keytextpairs = keytextpairs.filter(lambda keytext : len(keytext[1].words) > 0)
Example #43
 #!/usr/bin/python
 # -*- coding: utf-8 -*- 

from pyspark import SparkContext

sc = SparkContext("local", "Simple App")

data = sc.sequenceFile("programming_ranking/*",
	"org.apache.hadoop.io.Text",
	"org.apache.hadoop.io.DoubleWritable")

print (data.take(3))
    arr = {}
    for w in res:
        if len(w) >= minlen:
            #w = w.decode('utf-8-sig').encode('utf-8').lower()
            w = w.decode('utf-8-sig').lower()
            arr[w] = 1 if not arr.has_key(w) else arr[w] + 1
    return arr.items()


conf = SparkConf()
conf.setAppName('Spanish')

sc = SparkContext(conf=conf)

f = sc.sequenceFile('/project/public/collections-as-data',
                    'org.apache.hadoop.io.Text',
                    'org.apache.hadoop.io.BytesWritable').cache()
scanned = f.filter(
    lambda (n, t): n.split('/')[-1].startswith('chc') and n.endswith('txt'))

count = f.count()
txtcount = scanned.count()

top10 = scanned.flatMap(lambda (n, t): split_to_words(t, 4)).reduceByKey(
    lambda a, b: a + b).sortBy(lambda x: -x[1]).take(10)

print 'found ', count, 'files, ', txtcount, ' of them are scanned files'

for t in top10:
    print t[0].encode('utf-8'), t[1]
def main(argv=None):
    '''this is called if run from command line'''

    parser = argparse.ArgumentParser()
    parser.add_argument('-e','--excludeTags', help="Comma-separated list of tags to exclude.", required=False)
    parser.add_argument(     '--includeTags', help="Comma-separated list of tags to include.", required=False)
    parser.add_argument('-i','--input', help="Seq or tuple input data file.", required=True)
    parser.add_argument(     '--inputTuples', help="The input file is in tuple format.", required=False, action='store_true')
    args = parser.parse_args()

    if args.excludeTags and args.includeTags:
        print "Pick either --excludeTags or --includeTags, not both."
        return 1

    sc = SparkContext()

    global goodJsonRecords, badJsonRecords, excludedTagCount, includedTagCount, tokenCount
    goodJsonRecords = sc.accumulator(0)
    badJsonRecords = sc.accumulator(0)
    excludedTagCount = sc.accumulator(0)
    includedTagCount = sc.accumulator(0)
    tokenCount = sc.accumulator(0)

    if args.inputTuples:
        data = sc.textFile(args.input).map(lambda x: eval(x))
    else:
        data = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text", "org.apache.hadoop.io.Text")
    tagTokenCounts = data.values().flatMap(getTokensMaker(args.includeTags, args.excludeTags)).countByValue()
    sc.stop()

    print "========================================"
    print "goodJsonRecords = %d" % goodJsonRecords.value
    print "badJsonRecords = %d" % badJsonRecords.value
    print "excludedTagCount = %d" % excludedTagCount.value
    print "includedTagCount = %d" % includedTagCount.value
    print "tokenCount = %d" % tokenCount.value
    print "========================================"

    # Restructure the data, grouping by tag (token type indicator):
    tagTokenLists = {}
    for tagToken in tagTokenCounts.keys():
        (tag, tokenValue) = tagToken.split(":", 1)
        count = tagTokenCounts[tagToken]
        if tag not in tagTokenLists:
            tagTokenLists[tag] = []
        tagTokenLists[tag].append(Token(tokenValue, count))

    # Process each tag separately:
    for tag in tagTokenLists.keys():
        tokenList = tagTokenLists[tag]

        # Sort the tokens by descending count and ascending token value:
        sortedTokenList = sorted(tokenList, key=attrgetter("value"))
        sortedTokenList = sorted(sortedTokenList, key=attrgetter("count"), reverse=True)

        # Calculate the cumulative token count for each token in sorted order:
        totalTokens = 0
        for token in sortedTokenList:
            totalTokens += token.count
            token.cumulativeCount = totalTokens

        # We'll use the final total later, but we need it as a float to ensure
        # floating point division is used:
        floatTotalTokens = float(totalTokens)

        # Print the sorted tokens with counts, fraction of total,
        # cumulative counts, cumulative distribution function, and
        # index (enumerate the tokens per tag, starting with 1).
        print "========================================"
        tokenIndex = 0
        for token in sortedTokenList:
            tokenIndex += 1
            fractionOfTotal = token.count / floatTotalTokens
            cumulativeFractionOfTotal = token.cumulativeCount / floatTotalTokens
            print("{0:8d} {1:50} {2:10d} {3:.5f} {4:10d} {5:.5f}".format(tokenIndex, json.dumps(tag + ": " + token.value),
                                                                         token.count, fractionOfTotal,
                                                                         token.cumulativeCount, cumulativeFractionOfTotal))
        print "========================================"
Example #46
    parser = OptionParser()

    (c_options, args) = parser.parse_args()
    input_path = args[0]
    index = args[1]
    doc = args[2]

    sc = SparkContext(appName="DIG-LOAD_TO_ES")
    conf = SparkConf()

    es_write_conf = {
        "es.nodes": "10.1.94.103",
        "es.port": "9201",
        "es.nodes.discover": "false",
        'es.nodes.wan.only': "true",
        "es.resource": index + '/' + doc,  # use domain as `doc_type`
        "es.http.timeout": "30s",
        "es.http.retries": "20",
        "es.batch.write.retry.count": "20",  # maximum number of retries set
        "es.batch.write.retry.wait":
        "300s",  # on failure, time to wait prior to retrying
        "es.batch.size.entries": "200000",  # number of docs per batch
        "es.mapping.id": "cdr_id",  # use `doc_id` as Elasticsearch `_id`
        "es.input.json": "true"
    }

    es_man = ES(sc, conf, es_write_conf=es_write_conf)
    input_rdd = sc.sequenceFile(input_path)  # .partitionBy(1000)
    print input_rdd.first()
    es_man.rdd2es(input_rdd)
Example #47
def main(argv=None):
    '''this is called if run from command line'''
    parser = argparse.ArgumentParser()
    parser.add_argument('--cache', help="Optionally cache the RDD in memory.", required=False, action='store_true')
    parser.add_argument('--coalesceInput', type=int, default=0, help="Reduce the number of partitions on input.", required=False)
    parser.add_argument('--coalesceOutput', type=int, default=0, help="Reduce the number of partitions on output.", required=False)
    parser.add_argument('--count', help="Count the records before writing output.", required=False, action='store_true')
    parser.add_argument('-d','--debug', help="Give debugging feedback.", required=False, action='store_true')
    parser.add_argument('--download', help="Ask Spark to download the feature list and model files to the clients.", required=False, action='store_true')
    parser.add_argument('-e','--embedKey', help="Embed the key in the output.", required=False)
    parser.add_argument('-f','--featlist', help="Input file with features to be extracted, one feature entry per line.", required=True)
    parser.add_argument(     '--fusePhrases', '--fusedPhrases', help="Join each result phrase", required=False, action='store_true')
    parser.add_argument('-k','--keyed', help="The input lines are keyed.", required=False, action='store_true')
    parser.add_argument('--hybridJaccardConfig', help="Configuration file for hybrid Jaccard processing.", required=False)
    parser.add_argument('-i','--input', help="Input file with Web scraping sentences in keyed JSON Lines format.", required=True)
    parser.add_argument('--inputPairs', help="Test the paired input data processing path.", required=False, action='store_true')
    parser.add_argument('--inputSeq', help="Read input from a Hadoop SEQ data file.", required=False, action='store_true')
    parser.add_argument('--inputTuples', help="The input pairs are encoded as tuples", required=False, action='store_true')
    parser.add_argument('-j','--justTokens', help="The input JSON line data is just tokens.", required=False, action='store_true')
    parser.add_argument('-m','--model', help="Input model file.", required=True)
    parser.add_argument('-o','--output', help="Output file of phrases in keyed JSON Lines format.", required=True)
    parser.add_argument('--outputCompressionClass', help="Compression class for text files.", required=False)
    parser.add_argument('--outputPairs', help="Test the paired output data processing path.", required=False, action='store_true')
    parser.add_argument('--outputSeq', help="Write output to a Hadoop SEQ data file.", required=False, action='store_true')
    parser.add_argument('--outputTuples', help="The output pairs are encoded as tuples", required=False, action='store_true')
    parser.add_argument('--pairs', help="Test the paired data processing path.", required=False, action='store_true')
    parser.add_argument('-p', '--partitions', help="Number of partitions.", required=False, type=int, default=1)
    parser.add_argument('-s','--statistics', help="Report use statistics.", required=False, action='store_true')
    parser.add_argument('-t','--tags', help="Restrict the set of tags and optionally rename them: tagName,tagName:newTagName,...", required=False)
    parser.add_argument('-v','--verbose', help="Report progress.", required=False, action='store_true')
    parser.add_argument('-x','--extract', help="Name the field with text or tokens.", required=False)
    args = parser.parse_args()

    if args.verbose:
        print "========================================"
        print "Starting applyCrfSparkTest."
        print "========================================"

    # Open a Spark context:
    if args.verbose:
        print "========================================"
        print "Creating SparkContext."
        print "========================================"
        # TODO: Use time.monotonic() in python >= 3.3
        startTime = time.time() # Start timing here.

    sc = SparkContext()

    if args.verbose:
        print "========================================"
        print "SparkContext created. Application ID: "
        print sc.applicationId
        # TODO: use time.monotonic() in Python >= 3.3
        duration = time.time() - startTime
        print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
        print "========================================"

    #  Set up a CRF tagger object:
    tagger = applyCrfSpark.ApplyCrfSpark(args.featlist, args.model, args.hybridJaccardConfig,
                                         inputPairs=args.inputPairs or args.pairs or args.inputSeq,
                                         inputTuples=args.inputTuples,
                                         inputKeyed=args.keyed, inputJustTokens=args.justTokens,
                                         extractFrom=args.extract, tagMap=args.tags,
                                         fusePhrases=args.fusePhrases, embedKey=args.embedKey,
                                         outputPairs=args.outputPairs or args.pairs or args.outputSeq,
                                         outputTuples=args.outputTuples,
                                         debug=args.debug, sumStatistics=args.statistics)
    if args.verbose:
        print "========================================"
        print "CRF++ tagger created."
        # TODO: use time.monotonic() in Python >= 3.3
        duration = time.time() - startTime
        print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
        print "========================================"

    if args.statistics:
        # Convert statistics to Spark accumulators:
        tagger.initializeSparkStatistics(sc)

    if args.download:
        # Ask Spark to download the feature list and model files from the
        # driver to the clients.
        tagger.requestSparkDownload(sc)

    minPartitions = args.partitions
    if minPartitions == 0:
        minPartitions = None

    # We'll accept three types of input files: a Sequence file, a text file
    # with tab-separated key and JSON Lines data, or a text file of JSON Lines
    # data (with the output field embedded as an entry in the top-level
    # dictionary).
    if args.inputSeq:
        # This is the primary input path.
        if args.verbose:
            print "========================================"
            print "Opening the input sequence file:"
            print args.input
            print "========================================"
        inputRDD = sc.sequenceFile(args.input, "org.apache.hadoop.io.Text",  "org.apache.hadoop.io.Text",
                                   minSplits=minPartitions)
    else:
        if args.verbose:
            print "========================================"
            print "Opening the input text file:"
            print args.input
            print "========================================"
        inputRDD = sc.textFile(args.input, minPartitions)
        if args.inputPairs or args.pairs:
            if args.verbose:
                print "========================================"
                print "Converting the text lines into input pairs by splitting on tab."
                print "========================================"
            inputRDD = inputRDD.map(lambda s: s.split('\t', 1))

    if args.verbose:
        print "========================================"
        print "inputRDD is ready to read from the input file."
        # TODO: use time.monotonic() in Python >= 3.3
        duration = time.time() - startTime
        print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
        print "========================================"

    # Which is better? coalescing before processing or after processing?
    if args.coalesceInput > 0:
        numPartitions = inputRDD.getNumPartitions()
        if args.coalesceInput < numPartitions:
            if args.verbose:
                print "========================================"
                print "Coalescing partitions on input %d ==> %d" % (numPartitions, args.coalesceInput)
                print "========================================"
            inputRDD = inputRDD.coalesce(args.coalesceInput)
            if args.verbose:
                print "========================================"
                # TODO: use time.monotonic() in Python >= 3.3
                duration = time.time() - startTime
                print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
                print "========================================"

    if args.cache:
        print "========================================"
        print "Caching the input data."
        inputRDD.cache()
        # TODO: use time.monotonic() in Python >= 3.3
        duration = time.time() - startTime
        print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
        print "========================================"

    if args.count:
        print "========================================"
        print "Counting records..."
        localRecordCount = inputRDD.count()
        print "Record count: %d" % localRecordCount
        # TODO: use time.monotonic() in Python >= 3.3
        duration = time.time() - startTime
        print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
        print "========================================"

    # Perform the main RDD processing.
    if args.verbose:
        print "========================================"
        print "Requesting CRF++ tagging"
        print "========================================"
    resultsRDD = tagger.perform(inputRDD)

    # Which is better? coalescing before processing or after processing?
    if args.coalesceOutput > 0:
        numPartitions = resultsRDD.getNumPartitions()
        if args.coalesceOutput < numPartitions:
            if args.verbose:
                print "========================================"
                print "Coalescing partitions on output %d ==> %d" % (numPartitions, args.coalesceOutput)
                print "========================================"
            resultsRDD = resultsRDD.coalesce(args.coalesceOutput)
            if args.verbose:
                print "========================================"
                # TODO: use time.monotonic() in Python >= 3.3
                duration = time.time() - startTime
                print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
                print "========================================"

    # The output will be either a Sequence file or a text file.  If it's a
    # text file, it may be a tab-separated pair file or plain JSON Lines
    # data.  In either case, the main RDD processing takes care of all
    # necessary formatting -- or rather, it will take care of it, because
    # nothing is actually executed until the save action below runs.
    if args.outputSeq:
        if args.verbose:
            print "========================================"
            print "Transforming data and saving the result as a Hadoop SEQ file."
            print args.output
            print "========================================"
        resultsRDD.saveAsNewAPIHadoopFile(args.output,
                                          outputFormatClass="org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat",
                                          keyClass="org.apache.hadoop.io.Text",
                                          valueClass="org.apache.hadoop.io.Text")
    else:
        if args.verbose:
            print "========================================"
            print "Transforming data and saving the result as a text file."
            print args.output
            print "========================================"
        # Paired results will be converted automatically.
        resultsRDD.saveAsTextFile(args.output,
                                  compressionCodecClass=args.outputCompressionClass)

    if args.statistics:
        print "========================================"
        tagger.showStatistics()
        print "========================================"

    if args.verbose:
        print "========================================"
        print "Ending applyCrfSparkTest."
        # TODO: use time.monotonic() in Python >= 3.3
        duration = time.time() - startTime
        print "Elapsed time: %s" % str(datetime.timedelta(seconds=duration))
        print "========================================"
    sc.stop()
    # hiveQuery = "select * from CDR where source_name='asu-twitter'"
    numPartitions = int(args.partitions)

    numFramerPartitions = numPartitions / 2
    numHivePartitions = numPartitions
    if since == "":
        numHivePartitions = numPartitions * 20

    hdfsRelativeFilname = outputFilename
    if hdfsRelativeFilname.startswith("hdfs://"):
        idx = hdfsRelativeFilname.find("/", 8)
        if idx != -1:
            hdfsRelativeFilname = hdfsRelativeFilname[idx:]

    if not args.karma:
        reduced_rdd_start = sc.sequenceFile(
            outputFilename + "/reduced_rdd").mapValues(lambda x: json.loads(x))
        reduced_rdd  = workflow.reduce_rdds_with_settings({"karma.provenance.properties": "source,publisher,dateRecorded:date,observedDate:date"},
                                                  numPartitions, reduced_rdd_start)\
                                .persist(StorageLevel.MEMORY_AND_DISK)
    else:
        if args.incremental is True:
            if len(since) > 0:
                reduced_rdd_done = hdfs_data_done(
                    hdfs_client, hdfsRelativeFilname + "/reduced_rdd/" + since)
            else:
                reduced_rdd_done = hdfs_data_done(
                    hdfs_client, hdfsRelativeFilname + "/reduced_rdd/initial")
        else:
            reduced_rdd_done = hdfs_data_done(
                hdfs_client, hdfsRelativeFilname + "/reduced_rdd")
Beispiel #49
0

#  Write CSV
import csv
import StringIO

def writeRecords(records):
	"""Write out some CSV records."""
	output = StringIO.StringIO()
	writer = csv.DictWriter(output, fieldnames=["name", "favouriteAnimal"])
	for record in records:
		writer.writerow(record)
	return [output.getvalue()]

pandasLovers.mapPartitions(writeRecords).saveAsTextFile(outputFile)
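# writeRecords turns each partition into a single CSV string, so the saved
# text file contains one multi-line CSV chunk per partition.  "pandasLovers"
# and "outputFile" are assumed to be defined elsewhere: an RDD of dicts with
# "name" and "favouriteAnimal" keys, and an output path.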


#  Read a SequenceFile
data = sc.sequenceFile(inFile,
	"org.apache.hadoop.io.Text", "org.apache.hadoop.io.IntWritable")


# Create a HiveContext and query data with it
from pyspark.sql import HiveContext

hiveCtx = HiveContext(sc)
rows = hiveCtx.sql("SELECT name, age FROM usrs")
firstRow = rows.first()
print firstRow.name

# Read JSON data with Spark SQL
tweets = hiveCtx.jsonFile("tweets.json")
tweets.registerTempTable("tweets")
results = hiveCtx.sql("SELECT usr.name, text FROM tweets")
Beispiel #50
0
#!/usr/bin/env python

if __name__ == "__main__":
    from pyspark import SparkContext
    import json
    import sys

    sc = SparkContext(appName="sample")
    inputFilename = sys.argv[1]
    outputFilename = sys.argv[2]

    rdd = sc.sequenceFile(inputFilename).mapValues(lambda x: json.loads(x))
    rdd2 = rdd.map(lambda (x, y): json.dumps(y))
    rdd2.saveAsTextFile(outputFilename)
from pyspark import SparkContext
import re
from stemming.porter2 import stem
import numpy as np
import hadoopy

#input_path="hdfs://localhost:9000/alice.txt"
input_hdfs_path="hdfs://localhost:9000/user/user/simplewikiFromHbase"
output_hdfs_path='hdfs://localhost:9000/user/user/indexwikiFromSpark'

words_stop = [line.rstrip('\n') for line in open('../stop_words.txt')]
words_stop.append('')

sc=SparkContext()

lines = sc.sequenceFile(input_hdfs_path).map(lambda (x,y):(x[5:].decode('utf-8'),y[5:].decode('utf-8')))

splitText = lines.map(lambda (url,text):(url,[stem(word.group().lower()) for word in re.finditer(r"\w+",text,re.UNICODE) if word.group().lower() not in words_stop]))

tf = splitText.map(lambda (url,splittedText):(url,{word:1.0*splittedText.count(word)/len(splittedText) for word in splittedText}))

tfWordAsKey = tf.flatMap(lambda (url,tf):[(word,[(url,tf[word])]) for word in tf]).reduceByKey(lambda a,b:a+b)
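# The next step applies idf weighting: each term frequency is multiplied by
# log10(N / df), where df = len(tfList) is the number of pages containing the
# word and N (27474 here) is presumably the total number of pages indexed.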

tfidf = tfWordAsKey.map(lambda (word,tfList):(word,[(url,tf*np.log10(27474.0/len(tfList))) for (url,tf) in tfList]))

NwordsMax = 200000
def read_rdd(rdd):
    for key,data in rdd.takeSample(True,NwordsMax):
        yield key,data

if hadoopy.exists(output_hdfs_path):
Beispiel #52
0
from pyspark import SparkContext, SparkConf
import shutil

conf = SparkConf().setAppName('sequenceFiles').setMaster('local').set("spark.ui.port", "4050")
sc = SparkContext(conf=conf)

rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a" * x))

shutil.rmtree("sequence_file", ignore_errors=True)  # clear any previous output so saveAsSequenceFile does not fail

rdd.saveAsSequenceFile("sequence_file")
print(sorted(sc.sequenceFile("sequence_file").collect()))
    parser.add_option("-k", "--topk", dest="topk", type="int",
                      help="top n matches", default=3)
    parser.add_option("-x", "--numPartitions", dest="numPartitions", type="int",
                      help="number of partitions", default=10)
    parser.add_option("-z", "--candidatesName", dest="candidates_name", type="string",
                        help="name for json element for matching candidates", default="candidates")
    (c_options, args) = parser.parse_args()
    print "Got options:", c_options

    inputFilename = args[0]
    outputFilename = args[1]
    print "Save to:", outputFilename

    clusterer = Clusterer(c_options.numPartitions,
                          c_options.computeSimilarity, c_options.threshold)
    rdd = sc.sequenceFile(inputFilename).mapValues(lambda x: json.loads(x))
    if len(c_options.base) > 0:
        base = sc.sequenceFile(c_options.base).mapValues(lambda x: json.loads(x))
        result = clusterer.compute_clusters_with_base(rdd, base)
    else:
        if c_options.computeIdenticalClusters is True:
            (key_clusterids, result) = clusterer.compute_identical_clusters(rdd)
        else:
            result = clusterer.compute_clusters(rdd)

    if c_options.outputtype == "json":
        result = clusterer.output_json(result, c_options.topk, c_options.candidates_name)
    else:
        result = clusterer.output_csv(result, c_options.topk, c_options.separator)

    if c_options.outputformat == "text":
# initialize Spark
conf = SparkConf()
sc = SparkContext(conf=conf)

# COMMAND ----------

# read the RDD for each point that was saved from Task A
if DATABRICKS:
    rdds = [f.name[:-1] for f in dbutils.fs.ls(RDD_DIR)]
else:
    rdds = os.listdir(RDD_DIR)
rdds.remove("docf")  # we don't load docf here
#rdds = ["f11", "f12", "f13", "f31", "f32", "f33"]  # test data for lightweight purpose
for rdd_name in rdds:
    rdd_new = sc.sequenceFile(RDD_DIR + rdd_name)
    rdd_new.persist()
    datapoints.append(DataPoint(name=rdd_name, rdd=rdd_new))

# read the docf RDD. It contains all the words, i.e. all the dimensions; the values do not matter
docf = sc.sequenceFile(RDD_DIR + "docf")
"""
initialize the K-cluster
to initialize, I choose random point from the input as the centroid of the cluster
both clusters from cosine similarity and euclidean distance are generated here, so the clusters
from two distance functions can be computed simultaneously
"""
for i in range(0, K_CLUSTERS):
    for j in range(0, 2):
        rnd = random.randint(1, len(rdds) - 1)
        rdd_new = sc.sequenceFile(RDD_DIR + "f" + str(rnd))
Beispiel #55
0
from pyspark import SparkContext, SparkConf
"""
SequenceFiles are a popular Hadoop format composed of flat files with key/value pairs. SequenceFiles have sync markers 
that allow Spark to seek to a point in the file and then resynchronize with the record boundaries. This allows Spark 
to efficiently read SequenceFiles in parallel from multiple nodes.
"""

sparkconf = SparkConf().setAppName('Sequence Read').setMaster('local')
sc = SparkContext(conf=sparkconf)

links_file = sc.sequenceFile("hdfs://172.19.0.2/pagerank/seq/links",
                             keyClass="org.apache.hadoop.io.Text",
                             valueClass="org.apache.hadoop.io.Text")
urls_file = sc.sequenceFile("hdfs://172.19.0.2/pagerank/seq/urls",
                            keyClass="org.apache.hadoop.io.Text",
                            valueClass="org.apache.hadoop.io.Text")
links_rdd = links_file.values()
urls_rdd = urls_file.values().persist()

res_rdd = urls_rdd.join(links_rdd)
print res_rdd.take(5)
"""
Datatype and its corresponding hadoop writable types

Int             IntWritable or VIntWritable2
Long            LongWritable or VLongWritable2
Float           FloatWritable
Double          DoubleWritable
Boolean         BooleanWritable
Array[Byte]     BytesWritable
String          Text
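
# A small sketch of how the mapping above plays out in PySpark (the path and
# values are illustrative, not taken from the snippet): saving an RDD of
# (str, int) pairs produces Text keys and IntWritable values, which can then
# be read back with the matching classes.
pairs = sc.parallelize([("spark", 1), ("hadoop", 2)])
pairs.saveAsSequenceFile("seq/example")
print sc.sequenceFile("seq/example",
                      keyClass="org.apache.hadoop.io.Text",
                      valueClass="org.apache.hadoop.io.IntWritable").collect()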
from pyspark import SparkContext
from pyspark.sql import HiveContext, Row
from pyspark.sql.functions import col
from pyspark.sql.types import StringType, IntegerType, LongType
from pyspark.sql.window import Window


def writeMYSQL(df, table):
    return df.write.jdbc(url="jdbc:mysql://localhost:3306/rdbms",
                         table=table,
                         mode="overwrite",
                         properties={"user": "******"})


# Initializing Spark
sc = SparkContext()
sc.setLogLevel("WARN")

rdd = sc.sequenceFile("hdfs:///flume/events/*/*/*").map(
    lambda x: Row(*x[1].split(",")))
sqlContext = HiveContext(sc)

df = sqlContext.createDataFrame(rdd, [
    "purchaseDate2", "productName", "productPrice", "productCategory",
    "clientIPAddress"
]).cache()

#sparkTopCategories
topCategories = df.groupBy("productCategory")\
    .count()\
    .select(col("productCategory"), col("count").alias("cnt"))\
    .orderBy(col("cnt").desc())\
    .limit(10)
topCategories.show()
writeMYSQL(topCategories, "sparkTopCategories")
    return None


if __name__ == "__main__":
    sc = SparkContext(appName="DIG-NAME-EXTRACTION")

    parser = OptionParser()

    (c_options, args) = parser.parse_args()
    input_path = args[0]
    names_file = args[1]
    output_path = args[2]

    input_rdd = (
        sc.sequenceFile(input_path)
        .mapValues(lambda x: json.loads(x))
        .mapValues(generate_input)
        .filter(lambda x: x[1] is not None)
    )

    print json.dumps(input_rdd.first()[1])
    t = trie.CharTrie()
    names = json.load(codecs.open(names_file, "r", "utf-8"))
    for name in names:
        t[name] = name
    T = sc.broadcast(t)

    results = input_rdd.mapValues(lambda x: name_extractor(x, T))
    print results.first()
    results.mapValues(lambda x: json.dumps(x)).saveAsSequenceFile(output_path)
Beispiel #58
0
    return item[0], math.log(N / item[1], 10)


def freq(item):
    return item[0], math.log(1 + item[1], 10)


def relevancy(item):
    return item[0], item[1][0] * item[1][1]


if __name__ == '__main__':
    scope = 0
    N = 0
    sc = SparkContext(appName='project')
    rdd = sc.sequenceFile("s3://megadados-alunos/web-brasil")

    # Variable scope -> 0 = Words together, 1 = Hyundai alone, 2 = Honda alone
    scope = 0
    docs_together = rdd.flatMap(document_counter).reduceByKey(count_words)
    words_together = rdd.flatMap(word_counter).reduceByKey(count_words)

    scope = 1
    docs_hyundai = rdd.flatMap(document_counter).reduceByKey(count_words)
    words_hyundai = rdd.flatMap(word_counter).reduceByKey(count_words)

    scope = 2
    docs_honda = rdd.flatMap(document_counter).reduceByKey(count_words)
    words_honda = rdd.flatMap(word_counter).reduceByKey(count_words)

    N = docs_together.count()
Beispiel #59
0
# network-mounted shared file system.

# $ PYSPARK_DRIVER_PYTHON=ipython ./bin/pyspark --master local
###############################################################################

# parallelize collections #####################################
disData = sc.parallelize([1, 2, 3, 4])
dis_kv = sc.parallelize([('a', 1), ('b', 1)])

# text file, either local path, or hdfs://, s3n://, etc URI
disFile = sc.textFile("README.md")

# SequenceFile
rdd = sc.parallelize(range(1, 4)).map(lambda x: (x, "a"*x))
rdd.saveAsSequenceFile("seq_file")
sorted(sc.sequenceFile("seq_file").collect())


###############################################################################
#                                RDD operations
#     RDDs support two types of operations: transformations (which create a
# new dataset) and actions (which return a value to the driver program).
#     All transformations in Spark are lazy; they are only computed when an
# action needs their results.
#     By default, each transformed RDD may be recomputed each time you run an
# action on it.  However, you may also persist an RDD in memory or on disk,
# or replicate it across multiple nodes.
###############################################################################
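
# A minimal sketch (assuming the shell's sc, as above) of lazy transformations,
# actions, and persisting a reused RDD:
nums = sc.parallelize([1, 2, 3, 4])
squares = nums.map(lambda x: x * x)        # transformation: nothing runs yet
squares.persist()                          # keep the partitions after the first action
print squares.count()                      # action: triggers the computation (4)
print squares.reduce(lambda a, b: a + b)   # action: reuses the persisted partitions (30)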

lines = sc.textFile("README.md")