Example #1
# Assumed context: these examples come from the `elizabeth` package, so
# `hash_to_url` and `_opcodes` are module-level names from that package.
import elizabeth
from elizabeth import hash_to_url, _opcodes
from pyspark.ml.feature import RegexTokenizer

def load_joint_tokens(manifest, base='gs'):
    spark = elizabeth.session()
    ctx = spark.sparkContext

    # Special base paths
    if base == 'https': base = 'https://storage.googleapis.com/uga-dsp/project2/data'
    if base == 'gs': base = 'gs://uga-dsp/project2/data'

    # Read the manifest as an iterator over (id, url).
    # We use Spark to build the iterator to support hdfs etc.
    manifest = str(manifest)  # cast to str to support pathlib.Path etc.
    manifest = ctx.textFile(manifest)  # RDD[hash]
    # RDD[(bytes_url, asm_url)]
    manifest = manifest.map(lambda x: (hash_to_url(x, base=base, kind='bytes'),
                                       hash_to_url(x, base=base, kind='asm')))
    manifest = manifest.zipWithIndex()  # RDD[(url, url), id]
    manifest = manifest.map(lambda x: (x[1], x[0][0], x[0][1]))  # RDD[id, url, url]
    manifest = manifest.toLocalIterator()  # (id, url, url)

    # Load the two files for each document in the manifest.
    # The `id=id` default argument freezes the current loop value in each
    # lambda; Python closures are otherwise late-binding over the loop variable.
    data = [(ctx.wholeTextFiles(byte_url).map(lambda x, id=id: (id, x[1])),
             ctx.wholeTextFiles(asm_url).map(lambda x, id=id: (id, x[1])))
            for id, byte_url, asm_url in manifest]  # [(RDD[id, text], RDD[id, text])]

    # Join bytes and asm text on id: [RDD[id, bytes_text + "\n\n" + asm_text]]
    data = [rdd1.join(rdd2).map(lambda x: (x[0], x[1][0] + "\n\n" + x[1][1])) for rdd1, rdd2 in data]
    data = ctx.union(data)  # RDD[id, text]
    data = data.toDF(['id', 'text'])  # DF[id, text]

    opcodes = '|'.join(_opcodes)
    tokenizer = RegexTokenizer(inputCol='text', outputCol='features', gaps=False)
    tokenizer.setPattern(r'(\b[0-9A-F]{2} [0-9A-F]{2}\b)|'  # gets (mostly) bigrams of bytes
                         r'(\.?\w+:(?=[0-9A-F]{8}\s))|'     # gets segment title
                         r'(\b(' + opcodes + r')\b)')       # gets opcodes
    data = tokenizer.transform(data)
    data = data.drop('text')

    return data
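
A minimal usage sketch, assuming a Spark-enabled environment and a manifest file that lists one document hash per line; the manifest path below is hypothetical:

# Hypothetical manifest path; adjust for your environment.
tokens = load_joint_tokens('gs://uga-dsp/project2/files/X_small_train.txt')
tokens.printSchema()  # id: bigint, features: array<string>
tokens.show(5)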
Example #2
# Same assumed imports as Example #1 (elizabeth, hash_to_url, _opcodes, RegexTokenizer).
def load_data(manifest, base='gs', kind='bytes'):
    '''Load data from a manifest file into a DataFrame.

    A manifest file gives the hash identifying each document on separate lines.

    The returned DataFrame has columns `id`, `url`, and `text` where `id`
    is a document identifier, `url` is the path to the document, and `text`
    is the contents.

    Note that the document ID is _not_ the same as the hash. The ID is
    guaranteed to uniquely identify one document and the order of the IDs is
    guaranteed to match the order given in the manifest file.

    Args:
        manifest (path):
            Path or URL of the manifest file.
        base (str):
            The base of the URL or path to the data. The special strings 'gs'
            and 'https' expand to the URLs used by the Data Science Practicum
            at UGA over the Google Storage and HTTPS protocols respectively.
        kind (str):
            The kind of file to use, one of 'bytes' or 'asm'.
            - 'bytes' loads hex strings for the bytes in the binary files.
            - 'asm' loads segment titles and the opcodes from the asm files.

    Returns:
        DataFrame[id: bigint, url: string, text: string]
    '''
    spark = elizabeth.session()
    ctx = spark.sparkContext

    # Special base paths
    if base == 'https': base = 'https://storage.googleapis.com/uga-dsp/project2/data'
    if base == 'gs': base = 'gs://uga-dsp/project2/data'

    # Normalize kind: anything other than 'bytes' falls back to 'asm'.
    if kind != 'bytes':
        kind = 'asm'

    # Read the manifest as an iterator over (id, url).
    # We use Spark to build the iterator to support hdfs etc.
    manifest = str(manifest)  # cast to str to support pathlib.Path etc.
    manifest = ctx.textFile(manifest)                           # RDD[hash]
    # hash_to_url is curried here: called without a hash, it returns a
    # hash -> url mapper usable with RDD.map.
    manifest = manifest.map(hash_to_url(base=base, kind=kind))  # RDD[url]
    manifest = manifest.zipWithIndex()                          # RDD[url, id]
    manifest = manifest.map(lambda x: (x[1], x[0]))             # RDD[id, url]
    manifest = manifest.toLocalIterator()                       # (id, url)

    # Load each file named in the manifest and tag it with its document id.
    prepend = lambda *args: lambda x: (*args, *x)  # prepend(id): (url, text) -> (id, url, text)
    data = ((id, ctx.wholeTextFiles(url)) for id, url in manifest)  # (id, RDD[url, text])
    data = [rdd.map(prepend(id)) for id, rdd in data]               # [RDD[id, url, text]]
    data = ctx.union(data)                                          # RDD[id, url, text]
    data = data.toDF(['id', 'url', 'text'])                         # DF[id, url, text]

    # Tokenize: DF[id, url, text] -> DF[id, url, features] ('text' dropped below)
    tokenizer = RegexTokenizer(inputCol='text', outputCol='features', gaps=False)
    opcodes = '|'.join(_opcodes)
    if kind == 'bytes': tokenizer.setPattern('(?<= )[0-9A-F]{2}')
    elif kind == 'asm': tokenizer.setPattern(r'(\.?\w+:(?=[0-9A-F]{8}\s))|(\b(' + opcodes + r')\b)')
    data = tokenizer.transform(data)
    data = data.drop('text')

    return data
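
Both examples call a hash_to_url helper that is not shown. Below is a minimal sketch consistent with both call sites (direct in Example #1, curried with no hash argument in Example #2); the project's actual implementation may differ:

def hash_to_url(hash=None, base='gs', kind='bytes'):
    '''Map a document hash to the URL of its data file.

    Called without a hash, this returns a unary hash -> url mapper,
    which is how load_data uses it with RDD.map.
    '''
    if hash is None:
        # Curried form: freeze base and kind, defer the hash.
        return lambda h: hash_to_url(h, base=base, kind=kind)
    return '{}/{}.{}'.format(base, hash, kind)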