Example #1
def driver(sc, inputFilename, outputDirectory, 
           crfExecutable, crfScript, 
           featureListFilename, crfModelFilename, 
           eyeColorRef, eyeColorConfig, hairRef, hairConfig, 
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False         # set True to save the intermediate RDDs below for debugging
    partitions = None    # debug override: ignore any partitions argument passed in

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)
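    # A hedged sketch of the matcher interface assumed throughout: each HybridJaccard
    # instance maps a raw extracted span to a controlled-vocabulary entry, roughly
    #   smEyeColor.findBestMatch("bright blue eyes")   # -> e.g. "blue" (return value assumed)
    # (findBestMatch is the method handed to alignToControlledVocab explicitly in Example #3.)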

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except OSError:
            # nothing to remove if the directory does not already exist
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    origSize = rdd_sequence_file_input.count()
#     if limit:
#         rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    rdd_json = rdd_sequence_file_input.mapValues(lambda x: json.loads(x))
    rdd_json.setName('rdd_json')
    # rdd_json.persist()

    # all below should also be done for title
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body.setName('rdd_body')
    # rdd_body.persist()
    if dump:
        rdd_body.saveAsTextFile(ff("body"))
        
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))
    rdd_body_tokens.setName('rdd_body_tokens')
    # rdd_body_tokens.persist()
    if dump:
        rdd_body_tokens.saveAsTextFile(ff("body_tokens"))

    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_features.setName('rdd_features')
    # rdd_features.persist()
    if dump:
        rdd_features.saveAsTextFile(ff("features"))
    
    # rdd_pipeinput = rdd_features.mapValues(lambda x: base64.b64encode(vectorToString(x)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))
    rdd_pipeinput.setName('rdd_pipeinput')
    # rdd_pipeinput.persist()
    if dump:
        rdd_pipeinput.values().saveAsTextFile(ff("pi"))
    # This caused a cannot concatenate string + None error
    # rdd_pipeinput.saveAsTextFile(outputDirectory + "-pipeinput")

    # DON'T USE SparkFiles.get to fetch the crf_test or model
    # This only works with local Spark (--master local[*])
    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd
    rdd_pipeinput.saveAsTextFile(ff("before"))
    exit(0)

    rdd_crf_b64 = rdd_pipeinput.values().pipe(cmd)
    rdd_crf_b64.setName('rdd_crf_b64')
    # rdd_crf_b64.persist()
    if dump:
        rdd_crf_b64.saveAsTextFile(ff("po"))

    # Go directly from base64 output to a reconstructed tuple format mapping URI to vector of vectors, 
    # with empty string suffix indicating blank line
    # This is key for avoiding the groupBy step
    rdd_restore = rdd_crf_b64.map(lambda x: restore(x))
    rdd_restore.setName('rdd_restore')
    # rdd_restore.persist()
    if dump:
        rdd_restore.saveAsTextFile(ff("restore"))

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    rdd_harvested = rdd_restore.mapValues(lambda x: computeSpans(x, indexed=True)).filter(lambda p: p[1])
    rdd_harvested.setName('rdd_harvested')
    # rdd_harvested.persist()
    if dump:
        rdd_harvested.saveAsTextFile(ff("harvested"))

    # This has the effect of generating 0, 1, 2, ... lines according to the number of spans
    rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))
    rdd_controlled.setName('rdd_controlled')
    # rdd_controlled.persist()
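    # Concretely: a value holding N spans, e.g. (uri, [span0, span1, span2]), becomes N
    # separate records (uri, span0), (uri, span1), (uri, span2); keys whose span list is
    # empty emit no lines at all (the filter above already dropped those).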

    # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned.setName('rdd_aligned')
    # rdd_aligned.persist()
    if dump:
        rdd_aligned.saveAsTextFile(ff("aligned"))

    rdd_aligned_json = rdd_aligned.mapValues(lambda x: json.dumps(x))
    rdd_aligned_json.setName('rdd_aligned_json')
    # rdd_aligned_json.persist()
    if dump:
        rdd_aligned_json.saveAsTextFile(ff("aligned_json"))

    rdd_final = rdd_aligned_json
    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        # print len(rdd_final.collect())
        if outputFormat == "sequence":
            rdd_final.saveAsSequenceFile(outputDirectory)
        elif outputFormat == "text":
            rdd_final.saveAsTextFile(outputDirectory)
        else:
            raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)
Example #2
    rdd_body = rdd_json.mapValues(lambda x: extract_body(x))
    rdd_body_tokens = rdd_body.mapValues(lambda x: textTokens(x))

    # TBD
    # rdd_title = rdd_json.mapValues(lambda x: extract_title(x))
    # rdd_title_tokens = rdd_title.mapValues(lambda x: textTokens(x))
    # all below should also be done for title

    # not a pair RDD?
    rdd_features = rdd_body_tokens.map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
    rdd_pipeinput = rdd_features.mapValues(lambda x: vectorToString(x))

    cmd = SparkFiles.get("crf_test") + " -m " + SparkFiles.get(crfModelFilename)
    rdd_crf = rdd_pipeinput.values().pipe(cmd)
    # not a pair RDD
    # but we have the URI in the -3 position
    # and the index in the -2 position
    rdd_withuri = rdd_crf.map(lambda x: reconstructTuple(x))
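    # A hedged sketch of what reconstructTuple is assumed to do with one CRF++ output line
    # (tab-separated columns, URI at -3, token index at -2, predicted label last):
    #   def reconstructTuple(line):
    #       cols = line.split('\t')
    #       return (cols[-3], [cols[-2]] + cols[:-3] + [cols[-1]])   # (uri, [index, token..., label])
    # which is what lets the groupByKey/sort below reassemble and order each document's rows.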

    rdd_grouped = rdd_withuri.groupByKey()
    rdd_flat = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
    rdd_harvested = rdd_flat.mapValues(lambda x: computeSpans(x, indexed=True))

    # This has the effect of generating 0, 1, 2, ... lines according to the number of spans
    rdd_controlled = rdd_harvested.flatMapValues(lambda x: list(x))
    # map any eyeColor spans using smEye, hairType spans using smHair
    rdd_aligned = rdd_controlled.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEye, "hairType": smHair}))

    rdd_final = rdd_aligned
    rdd_final.saveAsTextFile(outputDirectory)
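The mapValues step after groupByKey above (identical in Example #3) sorts each document's rows
numerically by their leading index column and then strips that column. A plain-Python
illustration, with invented tokens and labels:

# Each row: (index, token, predicted_label) -- the values are made up for illustration.
rows = [("2", "eyes", "O"), ("0", "blue", "B-eyeColor"), ("1", "green", "I-eyeColor")]
ordered = [r[1:] for r in sorted(rows, key=lambda r: int(r[0]))]
print ordered   # [('blue', 'B-eyeColor'), ('green', 'I-eyeColor'), ('eyes', 'O')]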
Example #3
def crfalign(sc, inputFilename, outputDirectory, 
            limit=LIMIT, location='hdfs', outputFormat="text", partitions=None, deleteFirst=True):

    crfConfigDir = os.path.join(os.path.dirname(__file__), "data/config")
    def cpath(n):
        return os.path.join(crfConfigDir, n)

    smEyeColor = HybridJaccard(ref_path=cpath("eyeColor_reference_wiki.txt"),
                               config_path=cpath("eyeColor_config.txt"))
    smHairColor = HybridJaccard(ref_path=cpath("hairColor_reference_wiki.txt"),
                                config_path=cpath("hairColor_config.txt"))
    print smEyeColor, smHairColor

    if location == "hdfs":
        if deleteFirst:
            namenode = "memex-nn1"
            port = 8020
            client = Client(namenode, port, use_trash=True)
            try:
                for deleted in client.delete([outputDirectory], recurse=True):
                    print deleted
            except FileNotFoundException as e:
                pass
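            # client.delete returns a generator, so iterating (and printing) above is what
            # actually forces the recursive deletion of outputDirectory.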

    # hypothesis1: data fetched this way prompts the lzo compression error
    # hypothesis2: but it doesn't matter, error is just a warning
    if partitions:
        print inputFilename
        rdd_crfl = sc.textFile(inputFilename, minPartitions=partitions)
        if limit:
            # keep only the first `limit` lines, then restore the requested partition count
            rdd_crfl = sc.parallelize(rdd_crfl.take(limit))
            rdd_crfl = rdd_crfl.repartition(partitions)
    else:
        rdd_crfl = sc.textFile(inputFilename)
    rdd_crfl.setName('rdd_crfl')
    # rdd_crfl.persist()
    print "beginning: %s partitions" % rdd_crfl.getNumPartitions()

    # "value-only" RDD, not a pair RDD
    # but we have the URI in the -3 position
    # and the index in the -2 position
    rdd_withuri = rdd_crfl.map(lambda x: reconstructTuple(x))

    # Note: groupByKey returns iterable, not data; so no point in printing
    rdd_grouped = rdd_withuri.groupByKey()
    # sort the vectors by index (within key groups)
    rdd_sorted = rdd_grouped.mapValues(lambda x: [l[1:] for l in sorted(x, key=lambda r: int(r[0]))])
    # find all contiguous spans of marked-up tokens
    # returns 0 or more dicts per URI key
    rdd_spans = rdd_sorted.mapValues(lambda x: computeSpans(x, indexed=True))
    # flatten to (URI, single dict) on each line
    rdd_flat = rdd_spans.flatMapValues(lambda x: list(x))
    # rdd_flat = rdd_flat.coalesce(rdd_flat.getNumPartitions() / 3)
    # # map any eyeColor spans using smEyeColor, hairType spans using smHairColor
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
    rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor.findBestMatch, "hairType": smHairColor.findBestMatch}))
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": fakeFindBestMatch, "hairType": fakeFindBestMatch}))
    # rdd_aligned = rdd_flat.mapValues(lambda x: alignToControlledVocab(x, {}))
    # rdd_aligned = rdd_spans

    # rdd_final = rdd_crfl
    rdd_final = rdd_aligned
    print outputFormat
    if outputFormat == "sequence":
        rdd_final.saveAsSequenceFile(outputDirectory)
    elif outputFormat == "text":
        print "saving to %s" % outputDirectory
        rdd_final.saveAsTextFile(outputDirectory)
    else:
        raise RuntimeError("Unrecognized output format: %s" % outputFormat)
Example #4
def driver(sc, inputFilename, outputDirectory, 
           crfExecutable, crfScript, 
           featureListFilename, crfModelFilename, 
           eyeColorRef, eyeColorConfig, hairRef, hairConfig, 
           limit=limit, location='hdfs', outputFormat="text", partitions=None):
    dump = False        # unused in this variant (no intermediate dumps below)
    partitions = 8      # debug override: force 8 partitions regardless of the argument

    # Program to compute CRF++
    c = crf_features.CrfFeatures(featureListFilename)
    # Add files to be downloaded with this Spark job on every node.
    sc.addFile(crfExecutable)
    sc.addFile(crfScript)
    sc.addFile(crfModelFilename)

    # Map to reference sets
    smEyeColor = HybridJaccard(ref_path=eyeColorRef, config_path=eyeColorConfig)
    smHairColor = HybridJaccard(ref_path=hairRef, config_path=hairConfig)

    if location == "hdfs":
        print "We want to do hdfs dfs -rm -r %s" % outputDirectory
    elif location == "local":
        try:
            shutil.rmtree(outputDirectory)
            print "rmtree %s" % outputDirectory
        except OSError:
            # nothing to remove if the directory does not already exist
            pass
    else:
        raise RuntimeError("No such location: %s" % location)

    rdd_sequence_file_input = sc.sequenceFile(inputFilename)
    rdd_sequence_file_input.setName('rdd_sequence_file_input')
    # rdd_sequence_file_input.persist()
    
    origSize = rdd_sequence_file_input.count()
#     if limit:
#         rdd = sc.parallelize(rdd_sequence_file_input.take(limit))
    if partitions:
        rdd_sequence_file_input = rdd_sequence_file_input.repartition(partitions)
    print "### input %s: %d ads (orig %s, limit was %s), %d partitions" % (inputFilename, rdd_sequence_file_input.count(), origSize, limit, rdd_sequence_file_input.getNumPartitions())

    if location == 'hdfs':
        cmd = "%s %s" % (os.path.basename(crfScript), os.path.basename(crfModelFilename))
    elif location == 'local':
        cmd = "%s %s" % (SparkFiles.get(os.path.basename(crfScript)), SparkFiles.get(os.path.basename(crfModelFilename)))
    print "### %s" % cmd

    # ### WE NO LONGER HAVE TO GROUPBY
    # ### BUT WE MUST TREAT EACH LINE INDIVIDUALLY NOW
    # rdd_withuri = sc.parallelize(rdd_withuri.take(10))

    # One chained pipeline: parse JSON, extract body text, tokenize, build the CRF++ feature
    # matrix, base64-encode for pipe(), run CRF++, restore per-URI rows, compute spans, drop
    # empty results, fan spans out one per line, align to the controlled vocabularies, and
    # serialize back to JSON.
    rdd_final = (rdd_sequence_file_input
                 .mapValues(lambda x: json.loads(x))
                 .mapValues(lambda x: extract_body(x))
                 .mapValues(lambda x: textTokens(x))
                 .map(lambda x: (x[0], c.computeFeatMatrix(x[1], False, addLabels=[x[0]], addIndex=True)))
                 .mapValues(lambda x: base64.b64encode(vectorToString(x)))
                 .values()
                 .pipe(cmd)
                 .map(lambda x: restore(x))
                 .mapValues(lambda x: computeSpans(x, indexed=True))
                 .filter(lambda p: p[1])
                 .flatMapValues(lambda x: list(x))
                 .mapValues(lambda x: alignToControlledVocab(x, {"eyeColor": smEyeColor, "hairType": smHairColor}))
                 .mapValues(lambda x: json.dumps(x)))

    empty = rdd_final.isEmpty()
    if not empty:
        l = "unknown>1"
        print "### writing %s output (%s records) to %s" % (outputFormat, l, outputDirectory)
        print len(rdd_final.collect())
#         if outputFormat == "sequence":
#             rdd_final.saveAsSequenceFile(outputDirectory)
#         elif outputFormat == "text":
#             rdd_final.saveAsTextFile(outputDirectory)
#         else:
#             raise RuntimeError("Unrecognized output format: %s" % outputFormat)
    else:
        print "### No records: no output into %s" % (outputDirectory)