Python SparkContext.binaryFiles Examples, pyspark.SparkContext.binaryFiles Python Examples

Example #1

0

Show file

def generate_parquet(feature_path, mask_path, output_path):
    """Generate a parquet dataset pairing every image with its mask.

    The written dataset has two columns:
        - features: np_array representing image
        - masks: np_array representing mask (pixel values divided by 255
          and cast to uint8)

    Images and masks are paired through the position index that
    ``zipWithIndex`` assigns to each file; the temporary ``id`` column is
    dropped after the join.

    Arguments:
        feature_path {str} -- path to all images
        mask_path {str} -- path to masks of images
        output_path {str} -- parquet path
    """

    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SparkSession

    rowgroup_size_mb = 256
    spark_conf = SparkConf().setAppName('Image preprocess')
    sc = SparkContext(conf=spark_conf)
    session = SparkSession(sc)

    # Load images and convert them to a dataframe.
    # zipWithIndex turns every raw file content into a (content, id) pair;
    # without it the lambdas below would index into the raw bytes instead of
    # into a pair, and there would be no id to join on.
    # NOTE(review): pairing by position assumes both directories are listed
    # in the same order -- confirm for the storage backend in use.
    images_rdd = sc.binaryFiles(feature_path).values().zipWithIndex()
    image_flat_numpy_rdd = images_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: {'features': pair_np_array_id[0], 'id': pair_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(FeatureSchema, x))

    image_df = session.createDataFrame(image_flat_numpy_rdd,
                                       FeatureSchema.as_spark_schema())

    # Load masks and convert them to a dataframe
    mask_rdd = sc.binaryFiles(mask_path).values().zipWithIndex()

    # Convert mask rgb value to 0 for not building and 1 for building
    mask_flat_numpy_rdd = mask_rdd \
        .map(lambda pair_raw_image_id: (raw_image_to_numpy_array(pair_raw_image_id[0]), pair_raw_image_id[1])) \
        .map(lambda pair_np_array_id: ((pair_np_array_id[0] / 255).astype(np.uint8), pair_np_array_id[1])) \
        .map(lambda pair_std_np_array_id: {'masks': pair_std_np_array_id[0], 'id': pair_std_np_array_id[1]}) \
        .map(lambda x: dict_to_spark_row(MaskSchema, x))

    mask_df = session.createDataFrame(mask_flat_numpy_rdd,
                                      MaskSchema.as_spark_schema())
    mask_df.show(5, False)

    # Concat image_df and mask_df row by row
    train_df = image_df.join(mask_df, "id", "inner").drop('id')

    with materialize_dataset(session, output_path, TrainSchema,
                             rowgroup_size_mb):
        train_df.write \
                 .mode('overwrite') \
                 .parquet(output_path)

Example #2

0

Show file

def generate_parquet(feature_path, mask_path, output_path):
    """Write a parquet dataset that joins image arrays with mask arrays.

    Columns produced before the join:
        - features: np_array representing image
        - masks: np_array representing mask (divided by 255, cast to uint8)

    Arguments:
        feature_path {[type]} -- path to all images
        mask_path {[type]} -- path to masks of images
        output_path {[type]} -- parquet path
    """

    from pyspark import SparkConf, SparkContext
    from pyspark.sql import Row, SparkSession
    from pyspark.sql.types import _infer_schema
    from pyspark.sql.functions import monotonically_increasing_id

    def to_feature_row(image_np_array):
        # One image array -> one Spark row following FeatureSchema.
        return dict_to_spark_row(FeatureSchema, {'features': image_np_array})

    def to_mask_row(image_np_array):
        # Scale the mask by 1/255 and store it as uint8 before building the row.
        scaled = (image_np_array / 255).astype(np.uint8)
        return dict_to_spark_row(MaskSchema, {'masks': scaled})

    rowgroup_size_mb = 256
    sc = SparkContext(conf=SparkConf().setAppName('Image preprocess'))
    session = SparkSession(sc)

    # Images: raw file content -> numpy array -> row, plus a generated row id.
    feature_rows = sc.binaryFiles(feature_path).values() \
        .map(raw_image_to_numpy_array) \
        .map(to_feature_row)
    image_df = session.createDataFrame(feature_rows, FeatureSchema.as_spark_schema())
    image_df = image_df.withColumn("id", monotonically_increasing_id())

    # Masks: same pipeline with the extra scaling step.
    mask_rows = sc.binaryFiles(mask_path).values() \
        .map(raw_image_to_numpy_array) \
        .map(to_mask_row)
    mask_df = session.createDataFrame(mask_rows, MaskSchema.as_spark_schema())
    mask_df = mask_df.withColumn("id", monotonically_increasing_id())

    # Concat image_df and mask_df row by row.
    # NOTE(review): ids from monotonically_increasing_id depend on how each
    # dataframe is partitioned; confirm both sides really produce matching ids.
    train_df = image_df.join(mask_df, "id", "outer").drop("id")

    with materialize_dataset(session, output_path, TrainSchema,
                             rowgroup_size_mb):
        writer = train_df.write.mode('overwrite')
        writer.parquet(output_path)

Example #3

0

Show file

File: vocloud_preprocess.py Project: palicand/vocloud_spark_import

def main(argv):
    """Run the preprocessing job described by the JSON config in *argv*.

    Configures logging from ``logging.ini`` next to this script, loads the
    input files and the labeled set into RDDs, hands both to
    ``prep.preprocess`` and saves the resampled result as text.
    """
    script_dir = os.path.dirname(os.path.realpath(__file__))
    logging.config.fileConfig(os.path.join(script_dir, "logging.ini"))

    parsed_args = parse_args(argv)
    sc = SparkContext(conf=SparkConf())
    with open(parsed_args.config) as in_config:
        preprocess_conf = json.load(in_config)

    # Partition count used for every load and for the repartition below.
    load_partitions = preprocess_conf.get('partitions', 4000)

    if preprocess_conf.get("binary_input", True):
        files = sc.binaryFiles(preprocess_conf["input"], load_partitions)
    else:
        files = sc.wholeTextFiles(preprocess_conf["input"], load_partitions)
    files = files.repartition(load_partitions)

    labeled_conf = preprocess_conf["labeled"]
    metadata = parse_metadata(labeled_conf["metadata"])
    parsed_lines = sc.textFile(labeled_conf["file"], load_partitions) \
        .map(lambda line: parse_labeled_line(line, metadata, True))
    # Keep only entries whose first row is not labeled 4.
    kept = parsed_lines.filter(lambda entry: entry.iloc[0]["label"] != 4)
    labeled = kept.map(transform_labels)

    default_cut = {"low": 6300, "high": 6700}
    header, resampled = prep.preprocess(
        sc,
        files,
        labeled,
        label=preprocess_conf.get('label', True),
        cut=preprocess_conf.get("cut", default_cut),
        pca=preprocess_conf.get("pca"),
        # The preprocessing step falls back to 100 partitions, not 4000.
        partitions=preprocess_conf.get('partitions', 100))

    csv_lines = resampled.map(
        lambda entry: entry.to_csv(None, header=None).rstrip("\n"))
    csv_lines.saveAsTextFile(preprocess_conf["output"])

Example #4

0

Show file

File: binarize_spark.py Project: gkiar/sim

def _str_to_bool(value):
    """Parse a command-line boolean such as ``True`` or ``False``."""
    lowered = value.strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('expected a boolean, got %r' % (value,))


def main():
    """Binarize every image under ``folder_path`` ``num`` times with Spark.

    Command-line arguments:
        threshold    binarization threshold (int)
        folder_path  folder path containing all of the splits
        output_path  output folder path
        num          number of binarization operations (2, 4, 6 or 8)
        -m/--in_memory  ``True`` (default) chains all operations on one RDD
                        and saves only the last result; ``False`` saves each
                        intermediate result and reloads it from disk.
    """

    conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(description='Binarize images')
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path',
                        type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')
    parser.add_argument('num',
                        type=int,
                        choices=[2, 4, 6, 8],
                        help='number of binarization operations')
    # type=bool would turn every non-empty string (including "False") into
    # True, so the flag is parsed explicitly.
    parser.add_argument('-m',
                        '--in_memory',
                        type=_str_to_bool,
                        default=True,
                        help='in memory computation')

    args = parser.parse_args()

    nibRDD = sc.binaryFiles(args.folder_path)\
        .map(lambda x: get_data(x))

    client = Config().get_client('dev')

    if args.in_memory:
        print("Performing in-memory computations")

        # num - 1 chained binarizations, then a final one that also saves.
        for _ in range(args.num - 1):
            nibRDD = nibRDD.map(lambda x: binarize(x, args.threshold))
        nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, args.output_path, client)).collect()

    else:
        print("Writing intermediary results to disk and loading from disk")

        intermediate_path = args.output_path + "1"
        nibRDD.map(lambda x: binarize_and_save(
            x, args.threshold, intermediate_path, client)).collect()

        # Each further pass reloads the previous output and overwrites it.
        for _ in range(args.num - 1):
            sc.binaryFiles(intermediate_path)\
                .map(lambda x: get_data(x))\
                .map(lambda x: binarize_and_save(
                    x, args.threshold, intermediate_path, client)).collect()

Example #5

0

Show file

def main():
    """Build a photo mosaic of the reference image and write it to disk.

    The reference image is split into tiles on Spark, every tile is matched
    to a small image through the stored k-means model, and the selected
    images are drawn onto one canvas saved as ``mosaicImageYeah.jpg``.
    """
    sc = SparkContext(appName="tileMapper")
    print("I do all the input output jazz")

    ###########################################################################
    big_image = sc.binaryFiles("Reference/108103_sm.jpg")
    # NOTE(review): extract_opencv_tiles is *called* here, so it must return
    # the function that flatMap applies -- confirm this is intended.
    tile_avgs = big_image.flatMap(extract_opencv_tiles())
    # Flatten every tile record into a single list of values.
    tileMap = tile_avgs.map(
        lambda l: [item for sublist in l for item in sublist])
    # Collect once; a second collect() would rerun the whole Spark job.
    tileList = tileMap.collect()
    print("Tile Map", tileMap)
    print("Tile Map", tileList)
    print("Tile List", tileList)
    print("Tile LIst", type(tileList))
    ############################################################################

    clusterIndex = getIndex()
    kmModel = KMeansModel.load(sc, "myModelPath")
    readyToCombine = []
    currentRow = None
    noOfRow = 0
    noOfCol = 0
    firstTile = tileList[0]
    tileSize = firstTile[1]
    # Randomly get small images using kmeans match
    for tile in tileList:
        if tile[0] == currentRow:
            noOfCol = noOfCol + 1
        else:
            # tile[0] changed: a new row of tiles starts here.
            currentRow = tile[0]
            noOfCol = 1
            noOfRow = noOfRow + 1
        smallImg = findSmallImage(kmModel, [tile[4], tile[5], tile[6]],
                                  tileSize, clusterIndex)
        readyToCombine.append(smallImg)

    # Put small images into the big image canvas
    canvas = np.zeros((noOfRow * tileSize, noOfCol * tileSize, 3), np.uint8)

    # Print Image
    print("No. of Col", noOfCol)
    print("No. of Row", noOfRow)
    mosaicImage = printImage(readyToCombine, canvas, noOfCol, noOfRow,
                             tileSize)

    print("Finished processing of image")
    cv2.imwrite('mosaicImageYeah.jpg', mosaicImage)

Example #6

0

Show file

def main(sujet):
    """Show the three best-scoring contributors for the page titled *sujet*.

    Contributor counts are gathered from the page itself, from the pages
    linked through ``liensql.page_title == sujet`` and from the pages linked
    through ``liensql.page_id`` equal to the page id, then ranked by count.

    Raises:
        ValueError: if the history data holds no page titled *sujet*.
    """
    sc = SparkContext()
    spark = SparkSession.builder.getOrCreate()

    # Load files
    rdd = sc.binaryFiles(parm_histo)

    # Parse avro files
    nodes = rdd.flatMap(lambda args: fastavro.reader(BytesIO(args[1])))

    # Convert to a resilient distributed dataset (RDD) of rows
    rows = nodes.map(lambda node: Row(**node))

    # Convert to a Spark dataframe
    historique = spark.createDataFrame(rows, samplingRatio=1)

    # Cache data to avoid re-computing everything
    historique.persist()

    liensql = spark.read.format("avro").load(parm_sql)

    # Collect the contributors of the requested page
    sel_historique = historique.filter(historique.title == sujet)
    # first() runs a Spark job and returns None on an empty result, so call
    # it once and fail with a clear message.
    page = sel_historique.first()
    if page is None:
        raise ValueError("no page titled %r in the history data" % (sujet,))
    id_historique = page.id
    dt = sel_historique.select(explode(sel_historique.contributors)).groupBy("col").count()

    # Previous history links
    liensql_from = liensql.filter(liensql.page_title == sujet).join(historique, (historique.id == liensql.page_id))
    df = liensql_from.select(explode(liensql_from.contributors)).groupBy("col").count()
    dtf = df.unionAll(dt).orderBy('count', ascending=False)

    # Following history links
    liensql_to = liensql.filter(liensql.page_id == id_historique).join(historique, (historique.title == liensql.page_title))
    dt = liensql_to.select(explode(liensql_to.contributors)).groupBy("col").count()
    dc = dtf.unionAll(dt).orderBy('count', ascending=False)

    dc.createTempView("datasql")
    spark.sql("SELECT col as contributeur, count as score FROM datasql limit 3").show()
    # Report the subject that was actually processed rather than sys.argv[3].
    print("Les meilleurs contributeurs pour le sujet wikipédia : " + str(sujet))

Example #7

0

Show file

def main(args):
    """Convert zipped transaction CSV files into one parquet file per day.

    ``args.environment`` selects the input/output locations: ``"local"`` or
    ``"cloud"``.

    Raises:
        ValueError: if ``args.environment`` is neither ``"local"`` nor
            ``"cloud"``.
    """
    if args.environment == "local":
        input_path = "datasets/test-folder.small/*.zip"
        output_path = "./"
    elif args.environment == "cloud":
        input_path = "s3n://dtpm-transactions/test-folder.small/*.zip"
        output_path = "s3n://dtpm-transactions/parquet/"
    else:
        # Without this branch the paths stay unbound and the job dies later
        # with an UnboundLocalError.
        raise ValueError(
            "unknown environment %r, expected 'local' or 'cloud'"
            % (args.environment,))

    conf = SparkConf().setMaster("local[4]").setAppName("transport")
    sc = SparkContext(conf=conf)

    # Kept although unused below.
    # NOTE(review): rdd.toDF() presumably needs an SQLContext to exist -- confirm.
    sqlContext = SQLContext(sc)

    rdd = sc.binaryFiles(input_path).flatMap(
        lambda a: extract_files(a[0], a[1]))

    # Decode bytes and convert it in a list of strings
    rdd = rdd.mapValues(
        lambda content: BytesIO(content).read().decode('cp1252').split('\n'))

    # Drop header and last (and empty) row
    rdd = rdd.mapValues(lambda table: table[1:-1])

    # Change type of columns
    rdd = rdd.flatMap(lambda a: prepare_csv(a[0], a[1]))

    header = [
        'FILE_NAME', 'FECHAHORATRX', 'CODIGOENTIDAD', 'NOMBREENTIDAD',
        'CODIGOSITIO', 'NOMBRESITIO', 'NROTARJETA'
    ]
    header = [name.lower() for name in header]

    df = rdd.toDF(header)

    days = [
        row.file_name for row in df.select('file_name').distinct().collect()
    ]

    for directory in days:
        # NOTE(review): this creates a local directory even when output_path
        # points to S3 -- confirm whether that is needed in cloud mode.
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Every column except the leading file_name goes into the day's file.
        df_day = df.select(df.columns[1:]).where(df.file_name == directory)
        df_day.write.parquet(output_path + directory + "/data.parquet",
                             compression="gzip")

Example #8

0

Show file

File: vocloud_preprocess.py Project: palicand/vocloud_spark_import

def main(argv):
    """Preprocess the configured input files and save the result as text.

    The JSON config named by the parsed arguments supplies the input
    location, the labeled set, the cut range, the PCA settings and the
    output location.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    logging.config.fileConfig(os.path.join(here, "logging.ini"))
    parsed_args = parse_args(argv)
    spark_conf = SparkConf()
    sc = SparkContext(conf=spark_conf)
    with open(parsed_args.config) as in_config:
        preprocess_conf = json.load(in_config)

    def setting(key, fallback):
        # Optional config entry with its fallback value.
        return preprocess_conf.get(key, fallback)

    # Binary input is the default; text input is read as whole files.
    if setting("binary_input", True):
        reader = sc.binaryFiles
    else:
        reader = sc.wholeTextFiles
    files = reader(preprocess_conf["input"], setting('partitions', 4000))
    files = files.repartition(setting('partitions', 4000))

    metadata = parse_metadata(preprocess_conf["labeled"]["metadata"])

    def parse_line(line):
        return parse_labeled_line(line, metadata, True)

    def is_kept(entry):
        # Entries whose first row carries label 4 are dropped.
        return entry.iloc[0]["label"] != 4

    labeled = sc.textFile(preprocess_conf["labeled"]["file"], setting('partitions', 4000)) \
        .map(parse_line) \
        .filter(is_kept) \
        .map(transform_labels)

    header, resampled = prep.preprocess(
        sc, files, labeled,
        label=setting('label', True),
        cut=setting("cut", {"low": 6300, "high": 6700}),
        pca=setting("pca", None),
        partitions=setting('partitions', 100))

    def to_csv_text(entry):
        return entry.to_csv(None, header=None).rstrip("\n")

    resampled.map(to_csv_text).saveAsTextFile(preprocess_conf["output"])

Example #9

0

Show file

class SparkUtils:
    """Small wrapper around a SparkContext and its SQLContext."""

    def __init__(self, master, app_name):
        """Create the contexts; outside of ``dev`` also ship the helper modules.

        Reads the ``pfe_env`` environment variable. *master* is only used
        when it equals ``"dev"``.
        """
        if os.environ["pfe_env"] == "dev":
            self.sc = SparkContext(master=master, appName=app_name)
        else:
            self.sc = SparkContext(appName=app_name)
            # Files registered with the context via addFile, in order.
            shipped_files = (
                '/FileProcessor.py',
                '/FileIndexProducer.py',
                '/FileIndexRepository.py',
                '/FileUrlProcessor.py',
                '/LdaTopicsDescriptionProducer.py',
                '/LdaTopicsDescriptionRepository.py',
                '/Parser.py',
                '/SparkProcessor.py',
                '/SparkUtils.py',
                '/TextMostCommonWordsExtractor.py',
                '/TextPreProcessor.py',
                '/TextSummarizer.py',
                '/thumbnail_temp.py',
                '/ThumbnailGenerator.py',
                '/NotificationConstants.py',
                '/RabbitMqConstants.py',
            )
            for shipped_file in shipped_files:
                self.sc.addFile(shipped_file)
        self.sql_context = SQLContext(self.sc)

    def read_files(self, path):
        """Return ``sc.binaryFiles(path)``: an RDD of (url, b"content") pairs."""
        binary_rdd = self.sc.binaryFiles(path)
        return binary_rdd

    def rdd_to_df(self, rdd, schema):
        """Build a dataframe from *rdd* using *schema*."""
        return self.sql_context.createDataFrame(rdd, schema)

    def join_df(self, df0, df1, join_col, df0_selected_cols, df1_selected_cols):
        """Join *df0* and *df1* on *join_col* and keep the selected columns.

        Both frames are aliased (``df0`` / ``df1``) so that equally named
        columns can be addressed unambiguously.
        """
        left_cols = ["df0." + name for name in df0_selected_cols]
        right_cols = ["df1." + name for name in df1_selected_cols]
        left = df0.alias('df0')
        right = df1.alias('df1')
        same_key = col("df0." + join_col) == col("df1." + join_col)
        joined = left.join(right, same_key)
        return joined.select(left_cols + right_cols)

Example #10

0

Show file

File: task.py Project: yuwtennis/apache-spark

def run():
    """Read every file under ``DATA_SOURCE`` and pass it to ``upload_img_to_s3``.

    The AWS credentials and the data source are read from the environment
    variables ``AWS_ACCESS_KEY_ID``, ``AWS_SECRET_ACCESS_KEY`` and
    ``DATA_SOURCE``.
    """
    aws_access_key_id = os.getenv('AWS_ACCESS_KEY_ID')
    aws_secret_access_key = os.getenv('AWS_SECRET_ACCESS_KEY')
    ds = os.getenv('DATA_SOURCE')

    sc = SparkContext(conf=SparkConf())

    # https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html
    # Authenticating with S3
    hadoop_conf = sc._jsc.hadoopConfiguration()
    hadoop_conf.set('fs.s3a.access.key', aws_access_key_id)
    hadoop_conf.set('fs.s3a.secret.key', aws_secret_access_key)

    # The credentials are bound as keyword arguments of the per-record call.
    upload = partial(
        upload_img_to_s3,
        aws_access_key_id=aws_access_key_id,
        aws_secret_access_key=aws_secret_access_key,
    )
    sc.binaryFiles(ds).foreach(upload)

Example #11

0

Show file

File: binarize_fsl.py Project: gkiar/sim

def main():
    """Binarize every image under ``folder_path`` and copy the results to HDFS.

    Command-line arguments: ``threshold`` (int), ``folder_path`` and
    ``output_path``.
    """
    spark_conf = SparkConf().setAppName("binarize nifti")
    sc = SparkContext(conf=spark_conf)
    sc.setLogLevel('ERROR')

    parser = argparse.ArgumentParser(
        description='Binarize images using FSL installed in a Docker container'
    )
    parser.add_argument('threshold', type=int, help="binarization threshold")
    parser.add_argument('folder_path', type=str,
                        help='folder path containing all of the splits')
    parser.add_argument('output_path', type=str, help='output folder path')
    args = parser.parse_args()

    print(args.folder_path)
    client = Config().get_client('dev')

    # load -> binarize -> copy; collect() forces the pipeline to run.
    loaded = sc.binaryFiles(args.folder_path).map(lambda item: get_data(item))
    binarized = loaded.map(lambda item: binarize(item, args.threshold))
    copied = binarized.map(
        lambda item: copy_to_hdfs(item, args.output_path, client))
    nibRDD = copied.collect()

Example #12

0

Show file

from io import BytesIO
from time import time
import os
import random
import string

from PIL import Image, ImageFile
from pyspark import SparkContext
import pyspark
import numpy as np
import pydoop.hdfs as hdfs
import cv2

sc = SparkContext("spark://discus-p2irc-master:7077", "imageSharpening")
# Local alternative: sc = SparkContext("local", "sharpenedImages")

processing_start_time = time()

images_rdd = sc.binaryFiles(
    'hdfs://discus-p2irc-master:54310/user/hduser/landsat_images', 100)
# Local alternative: sc.binaryFiles('file:///sparkdata/p2irc-images', 264)


def images_to_bytes(rawdata):
    """Decode one ``(path, content)`` record into ``(path, RGB image)``.

    ``ImageFile.LOAD_TRUNCATED_IMAGES`` is switched on inside the function
    so the setting is applied wherever the function is executed.
    """
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    return (rawdata[0], Image.open(BytesIO(rawdata[1])).convert('RGB'))


# Map the full (path, content) pairs: images_to_bytes indexes both elements,
# so dropping the path with .values() would make it index into raw bytes.
images_bytes = images_rdd.map(images_to_bytes)
images_bytes.persist(pyspark.StorageLevel.MEMORY_AND_DISK)

# NOTE(review): no action is run on the RDD above, so this presumably times
# only the construction of the lazy pipeline -- confirm what should be timed.
processing_end_time = time() - processing_start_time

Example #13

0

Show file

File: 03-spark-analysis.py Project: romaingd/Dev

import os
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row

from io import BytesIO
import json
import fastavro

def _avro_records(path_and_content):
    """Iterate over the avro records held in one (filename, content) pair."""
    return fastavro.reader(BytesIO(path_and_content[1]))


def _as_row(node):
    """Turn one avro record (a mapping) into a Spark Row."""
    return Row(**node)


sc = SparkContext()
spark = SparkSession.builder.getOrCreate()

# Load files as (filename, content) pairs.
# If it takes too long to process all files, you may want to reduce the number
# of processed files. E.g:
# rdd = sc.binaryFiles('hdfs://localhost:9000/data/paris/master/full/2.250182*.avro')
rdd = sc.binaryFiles('hdfs://localhost:9000/data/paris/master/full/*.avro')

# Parse avro files
nodes = rdd.flatMap(_avro_records)

# Convert to a resilient distributed dataset (RDD) of rows
rows = nodes.map(_as_row)

# Convert to a Spark dataframe and cache it to avoid re-computing everything
df = spark.createDataFrame(rows)
df.persist()

node_count = df.count()
print("There are %d nodes in the dataset" % node_count)

Example #14

0

Show file

File: convert_image_to_array.py Project: wykbill03/Yelp_Photo_Classification

# SparkContext is instantiated below, so it has to be imported as well.
from pyspark import SparkConf, SparkContext
from StringIO import StringIO
from PIL import Image
import numpy as np
import os, tempfile
import datetime
sc = SparkContext()

# AWS S3 credentials:

AWS_KEY = ""
AWS_SECRET = ""
sc._jsc.hadoopConfiguration().set("fs.s3n.awsAccessKeyId", AWS_KEY)
sc._jsc.hadoopConfiguration().set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

directory = 's3n://amlyelp/subset/trainnew/'

# (path, raw file content) pairs
images = sc.binaryFiles(directory)


def image_to_array(rawdata):
    """Decode the raw content of one image file into a numpy array."""
    return np.asarray(Image.open(StringIO(rawdata)))


# (path, image array)
image_array = images.map(lambda x: (x[0], image_to_array(x[1])))

# (path, flattened image array)
image_array_flatten = image_array.map(lambda x: (x[0], x[1].flatten()))

# (file name without directory and '.jpg', space separated integer values)
image_array_flatten = image_array_flatten.map(
    lambda x: (x[0].split('/')[-1].replace('.jpg', ''),
               " ".join(np.char.mod('%d', x[1])))) \
    .repartition(120).cache()

image_array_flatten.saveAsTextFile("s3n://amlyelp/subset/train_image_array/")

Example #15

0

Show file

File: imageStitching.py Project: hsabiu/thesis-scripts

        #images.append(Image("/home/hduser/dev-materials/", "IMG_%04d_1.png" % i))
        images.append(Image("/home/hduser/dev-materials/", "IMG_%04d_1.tif" % (i + 50)))

    #result = stitch_two(images[0], images[1])
    #fileName = "stitched_two.png"
    #result[0].save(path="/home/hduser/dev-materials/", filename=fileName)

    result, images = stitch_multiple(images)

    output_image_name = "stitched_img_.png"
    result.save(path="/home/hduser/dev-materials/", filename=output_image_name)
"""

reading_start_time = time()
images_rdd = sc.binaryFiles(
    'hdfs://discus-p2irc-master:54310/user/hduser/registration_images_tif',
    100)

images_bytes = images_rdd.map(read_images) \
    .map(lambda rawdata: (rawdata[0][78:79], [rawdata[1]])) \
    .reduceByKey(lambda first_image, second_image: first_image + second_image)

images_bytes.persist(pyspark.StorageLevel.MEMORY_AND_DISK_SER)
reading_end_time = time() - reading_start_time

processing_start_time = time()
images_bytes.foreach(stitch_multiple)
processing_end_time = time() - processing_start_time
"""
    .groupByKey() \
    .mapValues(list)

Example #16

0

Show file

File: foreach.py Project: cpatrick/NEX

import re
import os
import sys
import numpy as np

# Sample type used to decode the HGT content: big-endian 16-bit integers.
srtm_dtype = np.dtype('>i2')
# Raw string so that \d reaches the regex engine instead of being treated as
# an (invalid) string escape.
filename_regex = re.compile(r'([NSEW]\d+[NSEW]\d+).*')

# The data directory, needs to be available to all node in the cluster
data_files = '/media/bitbucket/srtm/version2_1/SRTM3/North_America'

# Build up the context, using the master URL
sc = SparkContext('spark://ulex:7077', 'srtm')

# Now load all the zip files into a RDD
data = sc.binaryFiles(data_files)

# The two accumulators are used to collect values across the cluster
num_samples_acc = sc.accumulator(0)
sum_acc = sc.accumulator(0)

# Function to array
def read_array(data):
    """Decode raw HGT content into a vertically flipped 1201 x 1201 array.

    *data* is interpreted as ``srtm_dtype`` samples.
    """
    # np.fromstring is deprecated for binary input; np.frombuffer is its
    # replacement. The buffer view is read-only, so the flipped result is
    # copied to hand back a writable array as before.
    samples = np.frombuffer(data, dtype=srtm_dtype).reshape(1201, 1201)
    hgt_2darray = np.flipud(samples).copy()

    return hgt_2darray

# Function to process a HGT file
def process_file(file):
    (name, content) = file

Example #17

0

Show file

File: parquet.py Project: vikashsahu4/Datapipeline-Kafka-S3

# json, BytesIO and fastavro are used below and therefore imported here.
import json
from io import BytesIO

import fastavro
from pyspark import SparkContext
from pyspark.sql import SQLContext

sc = SparkContext()
sqlContext = SQLContext(sc)

# Reader schema applied to every avro record.
with open('data_schema.json') as f:
    schema = json.load(f)

print(type(schema))
print(schema)

# binaryFiles yields (path, content) pairs; every file content is parsed
# into its avro records.
rdd = sc.binaryFiles("/home/vsahu/project/spark/sample.avro").flatMap(
    lambda args: fastavro.reader(BytesIO(args[1]), reader_schema=schema))
print(rdd.collect())

df = rdd.toDF()
#df = sqlContext.createDataFrame(rdd, schema)

df.write.parquet("sample1.parquet")

Example #18

0

Show file

def readimage(path):
    """Return the whole content of the file at *path* as a ``bytearray``."""
    with open(path, mode="rb") as stream:
        raw_content = stream.read()
    return bytearray(raw_content)

execution_path = os.getcwd()
directory = 'hdfscontentFromFlume'
#directory = '/flume/eventdata'
IMAGE_SIZE = (10,7.5)

with tf.device('/gpu:0'):
    with detection_graph.as_default():
        with tf.Session(graph=detection_graph) as sess:
            for filename in filelst:
                print('Fetching image byte stream '+filename)
                try:
                    byteFileAsRdd = sc.binaryFiles('hdfs://localhost:9000'+filename).take(1)
                    img_str = bytearray(byteFileAsRdd[0][1])
                    #img_str = readimagehdfs(filename)
                    arr = np.asarray(img_str, dtype=np.uint8)
                    image = cv2.imdecode( arr, -1)

                    #img_str = readimage(directory+'/'+filename)
                    #arr = np.asarray(img_str, dtype=np.uint8)
                    #image = cv2.imdecode( arr, -1)

                    if (type(image) is np.ndarray):
                        # Fetched image now detect objects
                        image_np = image
                        # Expand dimensions since the model expects images to have shape: [1, None, None, 3]
                        image_np_expanded = np.expand_dims(image_np, axis=0)
                        image_tensor = detection_graph.get_tensor_by_name('image_tensor:0')

Example #19

0

Show file

# This is the path of the directory where the images will be stored after face is detected.
# After the face is detected in the image, we will draw a rectangle around the face in the image & store that image in the below directory.
rect_img_dir = './face_detected/'

# Haar Cascade Classifier (from OpenCV library)
# This classifier will be used to detect front faces in the images.
# Give below the path of the classifier.
distCascade = "./haarcascade_frontalface_default.xml"

# This adds the cascade file to the different nodes in the Spark cluster.
# This is necessary if you run this Spark code on a multi-node Spark cluster.
sc.addFile(distCascade)

# Converting the images into RDD
images_RDD = sc.binaryFiles(img_dir)

# For more details about this function. You can do help(sc.binaryFiles)

# If you have large number of images to process (like a million) then the Spark will by default make a lot of partitions.
# To repartition your image data into less number of partitions, you can run below command & change the number of partitions to what you want.
#images_RDD = images_RDD.repartition(20000)


# Face Detection function
def face_detect(an_img_rdd_element):
    x = an_img_rdd_element[0]
    img = an_img_rdd_element[1]
    img_fname = x.split("/")[-1]
    file_bytes = np.asarray(bytearray(img), dtype=np.uint8)
    im = cv2.imdecode(file_bytes, 1)

Example #20

0

Show file

    imgs2 = pca_reduced_features.filter(lambda x: x[0] in z2)
    eucledian_distance2 = ziplist2.cartesian(imgs2)
    eucledian_distance2 = eucledian_distance2.map(lambda x: (calcdistance(
        x[0][1], x[1][1]), (x[0][0], x[1][0]))).sortByKey(ascending=True)
    print(eucledian_distance2.collect())
    print("\n\n")


if __name__ == "__main__":

    conf = SparkConf()
    conf.setAppName("Assignment 2")
    sc = SparkContext(conf=conf)

    #Reading the files
    rdd = sc.binaryFiles("hdfs:/data/large_sample")

    #Filtering the images
    rdd = rdd.map(lambda x: ((x[0].split("/")), x[1])).map(lambda x:
                                                           (x[0][-1], x[1]))

    #getting the arrays corresponding to each image
    rdd = rdd.map(lambda x: (x[0], getOrthoTif(x[1])))
    #print(rdd.collect())

    #breaking the image into 25 evenly sized subimages
    rdd = rdd.map(lambda x: (x[0], breakimage(x[1], 500)))
    #print(rdd.collect())

    #creating a final rdd for (imagename, array)
    finalrdd = rdd.flatMap(

Example #21

0

Show file

conf.set("spark.executor.memory", '17g')
# conf.set('spark.driver.memory','8g')
# conf.set('spark.memory.offHeap.enabled','true')
# conf.set('spark.memory.offHeap.size','15g')
# conf.set('spark.cores.max','5')
# conf.set('spark.driver.maxResultSize','8g')
# conf.set('spark.sql.shuffle.partitions','1000')
# conf.set('spark.storage.memoryFraction','0.1')
conf.set('spark.yarn.executor.memoryOverhead', '3g')
sc = SparkContext(conf=conf)

# <h2>Create RDD of Forest Gain Images</h2>

# In[310]:

rdd = sc.binaryFiles("hdfs://proj-working-m:8020/user/saif/gain/")

# <h2> Map function for Gain images </h2>
# <p>Counts the total forest gain in a particular image. Each image covers 10x10 degrees of the world map.<br>
#    Images have the value 0 for no forest and 1 for forest gain. These outputs constitute a part of our final results
# to report the total forest gain in all of South America from 2000 to 2012.</p>
# <h3> Pipeline used:</h3>
# <ul>
#     <li>Spark</li>
#     <li>Hadoop(HDFS)</li>
# </ul>

# In[311]:


def mapper_func(x):

Example #22

0

Show file

File: reproject_to_s3.py Project: lossyrob/dec2015-workshop

if __name__ == '__main__':
    # Reproject every file found in src_tif_dir through process_tif and
    # count the results.
    from pyspark import SparkContext, SparkConf

    parser = argparse.ArgumentParser()

    parser.add_argument('src_tif_dir', help='Directory with files to reproject')
    parser.add_argument('dst_dir', help='Directory to write reproject files')
    parser.add_argument('--data-name', help='Optional identifer to prefix files with', default='')
    parser.add_argument('--dst-crs', help='CRS to reproject files to', default='EPSG:3857')
    parser.add_argument('--extension', help='Only consider files ending in this extension', default='')
    parser.add_argument('--region', help='Region for the S3 client to use', default='')

    args = parser.parse_args()

    spark_conf = SparkConf().setAppName('Rainfall-Reprojection')
    sc = SparkContext(conf=spark_conf)

    # (path, content) pairs
    raw_tifs = sc.binaryFiles(args.src_tif_dir)

    # Tuple parameters in lambdas (``lambda (path, _): ...``) are Python 2
    # only syntax; indexing the pair works on Python 2 and Python 3.
    if args.extension:
        raw_tifs = raw_tifs.filter(
            lambda path_and_bytes: path_and_bytes[0].endswith(args.extension))

    reprojected_tifs = raw_tifs.map(
        lambda path_and_bytes: process_tif(
            path_and_bytes[0], path_and_bytes[1],
            args.data_name, args.dst_crs, args.dst_dir, args.region
        )
    )

    # count() is the action that triggers the reprojection.
    num_reprojected = reprojected_tifs.count()

Example #23

0

Show file

# Choice of number of blocks being Blocks * Blocks
Blocks = 8

# Threshold of the edge map
T = 50

# Size of the filter and number to be extended
filterSize = 3
numExt = (filterSize - 1) // 2


def _decode_image(file_params):
    """(fileName, binary) -> (fileName, img decoded by OpenCV with flag 1)."""
    pixels = np.asarray(bytearray(file_params[1]), dtype=np.uint8)
    return (file_params[0], cv2.imdecode(pixels, 1))


def _to_gray(file_params):
    """((i, j), (fileName, img)) -> ((i, j), (fileName, gray scale img))."""
    position = (file_params[0][0], file_params[0][1])
    gray = cv2.cvtColor(file_params[1][1], cv2.COLOR_BGR2GRAY)
    return (position, (file_params[1][0], gray))


# getting an instance of spark context
sc = sc()

# Obtaining rdd through of hdfs
hdfsDirectory = 'hdfs://localhost:9000/SampleImages/'
rdd = sc.binaryFiles(hdfsDirectory + '*')

# Decoding the images -- file_params (fileName, binary)
rdd = rdd.map(_decode_image)

# file_params (fileName, img) -> file_params (i, (fileName, img))
rdd = rdd.flatMap(extendVertical)

# file_params (i, (fileName, img)) -> file_params ((i,j),(fileName, img))
rdd = rdd.flatMap(extendHorizontal)

# Transforming the images to a gray color scale
rdd = rdd.map(_to_gray)

Example #24

0

Show file

File: cde_spark.py Project: SharpLu/Sympathy-for-data-benchmark


# (field0, series.y)
def eval_flow_cde(x):
    """Run ``eval_flow_spark`` on ``x`` using the value held by ``bc_output_dir``."""
    output_dir = bc_output_dir.value
    return eval_flow_spark(x, output_dir)

def plotImage(x):
    """Plot the distribution subsets for one record and print how long it took.

    Arguments:
        x: an indexable record; only ``x[1]`` is used and handed to
           ``distribution_plot_subsets_spark`` together with
           ``bc_output_dir.value``.
    """
    t0 = time.time()
    distribution_plot_subsets_spark(x[1], bc_output_dir.value)
    # Elapsed time is "now - start". The previous "start - now" always
    # reported a negative duration.
    elapsed = time.time() - t0
    # Single-argument print() emits the same text on Python 2 and Python 3.
    print("plotImage used:   %s" % elapsed)
    # print("-----"+x[0],x[1])

# hdfs
startTime = time.time()
t0 = time.time()
# Persist the raw records (memory first, spilling to disk).
hdfsFile = sc.binaryFiles(input_dir).persist(
    StorageLevel.MEMORY_AND_DISK
)
# Author's timing note: Total Time Used: 8.73000001907
adaf_objs = (
    hdfsFile
    .map(read_dat_hdfs)
    .map(ExtractVIN)
    .map(process_dat_adaf)
    .map(sort_adaf)
    .map(vehical_config)
    .filter(do_filter)
    .map(subsetMetaData)
)
# Author's timing note: Total Time Used: 25.1680002213

# Disabled timing experiment for the pipeline up to vehical_config:
# rdd_vehical_config = hdfsFile.map(read_dat_hdfs) \
#     .map(ExtractVIN) \
#     .map(process_dat_adaf).map(sort_adaf) \
#     .map(vehical_config)
# rdd_vehical_config.count()
# print("rdd_vehical_config Used Time: {}".format(time.time() - t0))

Example #25

0

Show file

    OUTPUT_FILE_TYPE = ".png"
    # Registered images go directly under the output root ...
    OUTPUT_FILE_PATH = output_root_path
    # ... and their processed versions under its "processed" sub-directory.
    OUTPUT_PROCESSED_PATH = output_root_path + "/processed/"

    # Create the Spark context for this job
    sc = SparkContext(appName=job_name)

    reading_start_time = time()

    # Local file system alternative:
    #images_rdd = sc.binaryFiles('file:///sparkdata/registration_images')

    # Read from the configured input path (HDFS)
    images_rdd = sc.binaryFiles(input_path)

    # Offset just past the "IMG_" marker, taken from the first file's path
    first_path = images_rdd.first()[0]
    index = first_path.find("IMG_") + 4

    def _group_record(rawdata):
        # (path, data) -> (group id, (name after "IMG_", data)); the group id
        # is the part of the path between "IMG_" and the last underscore.
        path = rawdata[0]
        group_id = path[index:path.rfind('_')]
        return (group_id, (path[index:], rawdata[1]))

    # Records sharing a group id are merged by concatenating their tuples.
    images_group_rdd = (
        images_rdd.map(read_images)
        .map(_group_record)
        .reduceByKey(lambda left, right: left + right)
    )

    reading_end_time = time() - reading_start_time

    processing_start_time = time()

    images_group_rdd.foreach(register_group)

    processing_end_time = time() - processing_start_time

Example #26

0

Show file

File: simple_map.py Project: cchriste/dataflow

    # Build the input RDD `A`, either by generating synthetic blocks or by
    # reading binary files from --src, and log how long that step took.
    # t0 is the per-stage timer (reset below); tbegin keeps the overall start.
    t0=tbegin=time.time()

    if gen_num_blocks>0 and gen_block_size>0:
        # One RDD element per block index; the partition count scales with
        # the node count and the --nparts multiplier.
        # NOTE(review): the factor 12 is unexplained here -- presumably cores
        # per node; confirm.
        rdd=sc.parallelize(range(gen_num_blocks),args.nodes*12*args.nparts)
        # 1E6 is a float literal, so gen_block_count is a float; "%d" below
        # truncates it for display only.
        gen_block_count=gen_block_size*1E6/24  # 24 bytes per vector
        print("generating %d blocks of %d vectors each..."%(gen_num_blocks,gen_block_count))
        # Record the generation parameters in the run log.
        outfile.write("generating data...\n")
        outfile.write("partition_multiplier: "+str(args.nparts)+"\n")
        outfile.write("gen_num_blocks: "+str(gen_num_blocks)+"\n")
        outfile.write("gen_block_size: "+str(gen_block_size)+"\n")
        outfile.write("total_data_size: "+str(gen_num_blocks*gen_block_size)+"\n")
        A=rdd.map(lambda x:generate(x,gen_block_count))
    elif args.src:
        # Read existing data: each file is handed to parseVectors.
        outfile.write("reading data...\n")
        outfile.write(args.src+"\n")
        rdd = sc.binaryFiles(args.src)
        A = rdd.map(parseVectors)
    else:
        # Neither source was configured: stop Spark and exit with status -1.
        print("either --src or --generate must be specified")
        sc.stop();
        from sys import exit
        exit(-1)

    # NOTE(review): no action has run on `A` at this point, so the time logged
    # below presumably excludes the actual read/generate work (see the
    # disabled foreach on the next line).
    #rdd.foreach(noop)  #useful to force pipeline to execute for debugging
    tmark=time.time()
    outfile.write("read/parse or generate partitions: %0.6f\n"%(tmark-t0))
    outfile.write("numPartitions(%d,%s): %d\n"%(A.id(),A.name(),A.getNumPartitions()))
    # Restart the stage timer for the next step.
    t0=tmark

    # apply simple operation (V'=V+V0)
    # V0: a constant three-component float64 offset.
    shift=np.array([25.25,-12.125,6.333],dtype=np.float64)

Example #27

0

Show file

File: reproject_to_hdfs.py Project: lossyrob/dec2015-workshop

        '--partitions', default=250, type=int,
        help=('Number of partitions to coalesce geotiffs to else '
              'each geotiff will end up in its own partition'))
    parser.add_argument(
        '--sampling-method', default="nearest",
        choices=SAMPLING_METHODS.keys(),
        help=('Sampling method to use during reprojection')
    )
    parser.add_argument(
        '--no-data-value', default=None,
        help='Value to represent no data if not set in original geotiff'
    )

    args = parser.parse_args()

    spark_conf = SparkConf().setAppName('Azavea-Data-Hub-Reprojection')
    sc = SparkContext(conf=spark_conf)

    # Resolve the CLI choice to a resampling value, defaulting to nearest.
    sampling_method = SAMPLING_METHODS.get(args.sampling_method, RESAMPLING.nearest)

    # Coalesce to the requested partition count (see the --partitions help text).
    raw_tifs = sc.binaryFiles(args.src_tif_dir).coalesce(args.partitions)

    # Each record is a (remote path, file bytes) pair. It is indexed rather
    # than unpacked in the lambda signature because tuple parameters
    # ("lambda (a, b): ...") are a SyntaxError on Python 3 (PEP 3113).
    # Indexing behaves the same on Python 2.
    reprojected_tifs = raw_tifs.map(
        lambda path_and_bytes: reproject_tif(
            path_and_bytes[0], path_and_bytes[1], args.dst_crs,
            sampling_method, args.no_data_value
        )
    )

    reprojected_tifs.saveAsSequenceFile(args.rdd_dst)

Example #28

0

Show file

    with client.write(outDir + '/TRANSFORM_' + image[0].split("/")[-1], overwrite=True) as writer:
        writer.write(buf.getvalue())
    buf.close()


def imageToArray(rawdata):
    """Decode raw image bytes into a uint8 numpy array."""
    return np.asarray(Image.open(BytesIO(rawdata))).astype(np.uint8)


sc = SparkContext(appName="color")
sqlContext = SQLContext(sc)

# Command line: <inputDir> <outputDir> <numPartitions>
inputDir = argv[1]
outputDir = argv[2]
numPartitions = int(argv[3])

# All scalar settings come from the first row of the metadata parquet file.
df = sqlContext.read.parquet(inputDir + '/satMetadata.parquet')
first = df.first()
satHeight = first[2]

# Columns 0 and 1 are scaled by the satellite height to give x and y.
x = [value * satHeight for value in first[0]]
xmin = min(x)
xmax = max(x)
y = [value * satHeight for value in first[1]]
ymin = min(y)
ymax = max(y)

satLongitude = first[3]
satSweep = first[4]

# Column 5 is a seconds offset from 2000-01-01 12:00.
date = first[5]
add_seconds = date
displayDate = datetime(2000, 1, 1, 12) + timedelta(seconds=add_seconds)

# (path, bytes) -> (path, uint8 array)
images = sc.binaryFiles(inputDir + '/*.png', numPartitions)
imageArrays = images.mapValues(imageToArray)

# Run kinit once per partition before the jobs that write output.
imageArrays.foreachPartition(
    lambda _partition: check_call(["kinit", "-kt", "brad.keytab", "*****@*****.**"])
)

# Both passes are forced with collect().
imageArrays.map(
    lambda image: addMap(outputDir, image, satLongitude, xmin, xmax, ymin, ymax, displayDate)
).collect()
imageArrays.map(
    lambda image: transform(outputDir, image, x, y, displayDate)
).collect()

Example #29

0

Show file

if __name__ == "__main__":
    # Usage: <script> <input_path> <output_path> <job_name>

    application_start_time = time()

    input_path = sys.argv[1]
    output_path = sys.argv[2]
    job_name = sys.argv[3]

    # Remove any previous output. The exit status is deliberately ignored,
    # so a missing directory is not an error.
    subprocess.call(["hadoop", "fs", "-rm", "-r", output_path])

    sc = SparkContext(appName=job_name)

    build_start_time = time()

    # (path, bytes) -> (path, descriptors); drop images without descriptors.
    # The previous predicate, `x[1].all() != None`, compared a boolean with
    # None, so it never rejected anything, and it raised AttributeError when
    # the descriptors were None. Presumably images_to_descriptors returns None
    # when nothing is detected -- confirm against that helper.
    images_rdd = sc.binaryFiles(input_path) \
        .map(images_to_descriptors) \
        .filter(lambda x: x[1] is not None) \
        .map(lambda x: (x[0], x[1]))

    # Flatten the per-image descriptor collections into one RDD of vectors.
    features = images_rdd.flatMap(lambda x: x[1])

    # Cluster the descriptors into 3 groups.
    model = KMeans.train(features,
                         3,
                         maxIterations=5,
                         initializationMode="random")
    clusterCenters = model.clusterCenters

    build_end_time = time() - build_start_time

    processing_start_time = time()

    # Pair every image record with the full list of cluster centers.
    data_to_cluster = images_rdd.map(lambda x: [x, clusterCenters])

Example #30

0

Show file

                    "flower_area_bounds" : flower_area_bounds,
                    "flower_area_mask" : flower_area_mask}

    # Save/Overwrite dictionary
    #io.imsave(path + plot_mask_name, dict_to_save)
    np.save(path + plot_mask_name, dict_to_save)

# Driver script: read the plot images from HDFS, convert them with
# images_to_bytes and run computePlotMask on every record.

#setImagesFilepaths('/sparkdata/tmp-dir/2016-07-05_1207/')

# Local 4-thread alternative to the cluster master below:
#sc = SparkContext("local[4]", "images_plot_mask")

sc = SparkContext("spark://discus-p2irc-master:7077", "images_plot_mask")

# Same read with a minimum of 12 partitions instead of 600:
#images_read = (sc.binaryFiles('hdfs://discus-p2irc-master:54310/user/hduser/plot_images/2016-07-05_1207', 12))

images_read = (sc.binaryFiles('hdfs://discus-p2irc-master:54310/user/hduser/plot_images/2016-07-05_1207', 600))

images_bytes = (images_read.map(images_to_bytes))

# Keep the converted records (serialized, memory then disk) for reuse.
images_bytes.persist(pyspark.StorageLevel.MEMORY_AND_DISK_SER)

# NOTE(review): RDD.foreach returns None, so images_mask_computed is always
# None; the call is useful only for computePlotMask's side effects.
images_mask_computed = images_bytes.foreach(computePlotMask)

#images_histogram_computed = images_bytes.foreach(computeHistograms)

# NOTE(review): plot_mask is not assigned in this section -- presumably a
# module-level name; verify it holds what is expected on the driver.
print plot_mask
print "=========================="
print "images plot mask completed"
print "=========================="

Example #31

0

Show file

File: a2_islam.py Project: ibipul/map-reduce-projects

    :rtype dict{}:
    """
    sim_map = defaultdict(list)
    for x in svd_collect:
        sim_map[x[0]] = x[1]
    return dict(sim_map)

############
##
## PY Spark Code Section
##
############
# Reference point for timing the run
start_time = time.time()

## Load the input files
rdd = sc.binaryFiles(_LOCAL_FILES_REGEX)
#rdd = sc.binaryFiles('hdfs:/data/large_sample')
#rdd = sc.binaryFiles('hdfs:/data/small_sample')

## Decode every file -> RDD: [ (filename, tiffMatrix)...]
rdd2 = rdd.map(lambda file_record: getTiffAsMatrix(file_record))

## Cut each matrix into 500x500x4 tiles
## -> RDD: [ (img-0, 500x500x4),...(img-n, 500x500x4)]
rdd3 = rdd2.flatMap(lambda named_matrix: tiffmatrixSplit(named_matrix))

## Step 1.E: collect the tiles selected by display1e
data_for_print1e = rdd3.filter(lambda tile: display1e(tile)).collect()

## Reduce each tile to per-pixel intensity
## -> RDD: [(img-0,500x500), (img-1,500x500) ...]
rdd4 = rdd3.map(lambda tile: tilePixelIntensityConverter(tile))

## Persist the intensity tiles at this stage
rdd4.persist()

## Call down-scale of resolution on each sub image, default factor=10
## Gives RDD[ (img-0,50x50),(img-1,50x50)...]

Example #32

0

Show file

# sc=SparkContext.getOrCreate()

# Settings for the large sample
dataDir = r'hdfs:/data/large_sample/'
noOfBuckets = 135
noOfBands = 4
noOfPartitions = 46

# Settings for the small sample:
# dataDir =r'hdfs:/data/small_sample/'
# noOfBuckets = 50
# noOfBands = 8
# noOfPartitions = 5
# dataDir =r'C:\Users\SSDN-Dinesh\Desktop\SBU\BDA\Assignment2\a2_small_sample'

# key: file path, value: binary content of the file
data = sc.binaryFiles(dataDir)

# Disabled debugging of the file names:
# fileName = data.map(lambda x:(x[0].split('/')[-1],x[1]))
# outName = 'fileName'
# out1 = fileName.collect()
# broadCastFileNames = sc.broadcast(fileNames)

# key: file name (last path component), value: image array from getOrthoTif
fullImgs = data.map(
    lambda path_and_bytes: (
        path_and_bytes[0].rsplit('/', 1)[-1],
        getOrthoTif(path_and_bytes[1]),
    )
)
# out1  = fullImgs.collect()
# outName = 'ImageShapeRDD'

# Disabled debugging of the image shapes:
# # imgShape = fullImgs.map(lambda x:(x[0],x[1].shape))
# # out = imgShape.collect()
# # outName = 'imgShape'
#

Example #33

0

Show file

departmentsWithEmployeesSeq1 = [departmentWithEmployees1, departmentWithEmployees2]
df1 = sql_context.createDataFrame(departmentsWithEmployeesSeq1)

display(df1)
'''

# Scratch experiment: read a few files with binaryFiles, decode the first one
# with OpenCV and print the intermediate values.

#dado = Row("imagem", "label")
#rdd = sc.parallelize(l)
#images = rdd.map(lambda x: Row(id=x[0]))
#dataset = sql_context.createDataFrame([dado])

path = "/Users/leopoldolusquino/Documents/Doutorado/Tese/originais/"
# NOTE(review): transformador is not used anywhere in this section.
transformador = binary_input_transformer.BinaryInputTransformer()
numPartitions = 10

# take(10) returns a plain Python list, so from here on `rdd` is a list of
# up to ten (path, content) pairs, not an RDD.
rdd = sc.binaryFiles(
    path, minPartitions=numPartitions).repartition(numPartitions).take(10)

# Raw bytes of the first file as a uint8 vector, the input format for imdecode.
file_bytes = np.asarray(bytearray(rdd[0][1]), dtype=np.uint8)

print(file_bytes)

# Decode the in-memory buffer with flag 1.
image = cv2.imdecode(file_bytes, 1)

print(image)

#print(rdd.count())

# NOTE(review): this statement cannot run as written. binaryFiles returns an
# RDD, which has no select() method, and `"label".rdd` is an attribute access
# on a str. It looks like DataFrame code (select / input_file_name) applied to
# an RDD -- the intended behaviour needs to be confirmed before fixing.
rdd = sc.binaryFiles(path, minPartitions=numPartitions).select(
    input_file_name(), "label".rdd)

#print(image)

Example #34

0

Show file

    tmp_path = docfile
    os.remove(docfile)
    return res

#for i in os.listdir(tmp_path):
#	if os.path.isdir(os.path.join(tmp_path, i)):
#		shutil.rmtree(os.path.join(tmp_path, i))

# Input glob. The first assignment is immediately overridden by the second.
path = 'file:///home/ubuntu/chenq/docx_evaluate_score/data/all_docx/input/*.docx'
path = 'file:///dev/shm/test_docx/input/*.docx'
# Other locations that were tried:
#path='file:///home/ubuntu/chenq/test_docx/input/*.docx'
#path='file:////dev/shm/input/*.docx'
#path='file:////dev/shm/input2/input/*.docx'
#path='/user/ubuntu/docx/input/*.docx'
#path='har:////user/ubuntu/ainput/docx.har/input/*.docx'
rdd = sc.binaryFiles(path)
#rdd.cache()
#rdd.count()
#rdd=rdd.repartition(32)

# (file name, content) -> (file name, result of read(content, file name))
doc = rdd.map(
    lambda name_and_content: (
        name_and_content[0],
        read(name_and_content[1], name_and_content[0]),
    )
)

#doc.foreach(print)
# Cache before counting so the parquet write below reuses the parsed records.
nread = doc.cache().count()

df = doc.toDF()
output_uri = 'file:///user/ubuntu/all_docx_10349_single_node_on_desktop5_localfile_shm_24cores_tika.parquet'
df.write.parquet(output_uri)

print(nread)

Example #35

0

Show file

File: binaryFiles.py Project: zephyrGit/Pyspark

from pyspark import SparkContext, SparkConf
import numpy as np

conf = SparkConf()
# setMaster() writes the "spark.master" property that Spark actually reads.
# The previous conf.set('master', ...) stored an unrelated key named "master",
# so the cluster URL was never applied.
conf.setMaster('spark://hadoop-maste:7077')
context = SparkContext(conf=conf)

try:
    # Each record is a (file path, file content) pair.
    rdd = context.binaryFiles('/datas/pics/')
    print('applicationId:', context.applicationId)
    result = rdd.collect()
    for data in result:
        # Show the path and only the first 10 bytes of the content.
        print(data[0], data[1][:10])
finally:
    # Release the context even if reading or printing fails.
    context.stop()

Example #36

0

Show file

import sys
import findspark
if ("-localhost" in sys.argv):
    findspark.init("/u/cs451/packages/spark")
import pyspark
from pyspark import SparkContext

if __name__ == "__main__":
    sc = SparkContext(appName="ConvertToSequenceFile")

    # Positional arguments: input directory, output path, partition count.
    input_dir_path = sys.argv[1]
    output_path = sys.argv[2]
    num_partitions = sys.argv[3]

    # Recursively delete output_path if it already exists, using the JVM-side
    # Hadoop FileSystem API.
    hadoop_fs = sc._jvm.org.apache.hadoop.fs
    fs = hadoop_fs.FileSystem.get(sc._jsc.hadoopConfiguration())
    stale_output = hadoop_fs.Path(output_path)
    fs.delete(stale_output, True)

    # Read every file as a binary record and write it out as a sequence file.
    binary_files = sc.binaryFiles(input_dir_path, int(num_partitions))
    binary_files.saveAsSequenceFile(output_path)

Example #37

0

Show file

File: image_k-means.py Project: wykbill03/Yelp_Photo_Classification

import boto
import datetime

def image_to_array(rawdata):
    """Decode one image's raw content into a numpy array."""
    return np.asarray(Image.open(StringIO(rawdata)))


sc = SparkContext()

# AWS S3 credentials (left blank here):
AWS_KEY = ""
AWS_SECRET = ""
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3n.awsAccessKeyId", AWS_KEY)
hadoop_conf.set("fs.s3n.awsSecretAccessKey", AWS_SECRET)

directory = 's3n://amlyelp/subset/trainnew/'

# (path, content) pairs for every file under the directory
images = sc.binaryFiles(directory)

# (path, content) -> (path, image array) -> (path, flattened array)
image_array = images.map(lambda record: (record[0], image_to_array(record[1])))
image_array_flatten = image_array.map(
    lambda record: (record[0], record[1].flatten())
).cache()
del image_array, images

# Only the flattened vectors are used for training.
train = image_array_flatten.values().repartition(200).cache()

clusters = KMeans.train(train, 50, maxIterations=50)

# Save under a timestamped path (spaces in the timestamp become underscores).
model_path = ('s3n://amlyelp/subset/model/kmeans/50_iters_'
              + str(datetime.datetime.now()).replace(' ', '_') + '/')
clusters.save(sc, model_path)