def spark_intersect(mutation_table_name, regions_table_name, DB_CONF, output_format, regions=None,
                    jdbc_jar='postgresql-42.2.12.jar', groupby=None, useSQL=False, minCount=-1,
                    tumorType=None, filter=None):
    # Benchmark notes kept from the original (SQL vs. custom partition sweep):
    # MINE8: 388 [1h], 1507 (25min), 1018 [bin=20], (1h, no bins), 1101 (5 bins), 994 [100], 952 [200],
    # 916 (ctcf), 941 [41]; 590 ETS1; 3h13 geco, 4h37 genomic

    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")
    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"

    print("USING " + str(numPartitions) + " PARTITIONS (-1: AUTO).")

    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"
    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: " + cores)
    print("Using memory: " + memory)
    print("Using partitions: " + str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    if tumorType:
        print("Tumor Type id: " + str(tumorType))
    if filter:
        print("Filter count: " + str(len(filter)))
    print("#############################")

    spark = SparkSession.builder \
        .master("local[" + cores + "]") \
        .appName("spark_intersect") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password': DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]

    # Load mutations either from PostgreSQL (via JDBC) or from a CSV dump on the local file system.
    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)
        ])
        mutations = spark.read.format("csv").option("header", "true").schema(customSchema) \
            .load(fs_db_dir + "/" + mutation_table_name)

    def filter1(x):
        return int(x["tumor_type_id"]) == int(tumorType)

    def filter2(x):
        return int(x["donor_id"]) in filter

    # Optional filtering by tumor type and/or donor list. Both predicates are plain Python
    # functions, so the DataFrame is switched to its RDD representation first.
    if tumorType or filter is not None:
        mutations = mutations.rdd
        if tumorType:
            mutations = mutations.filter(filter1)
        if filter is not None:
            mutations = mutations.filter(filter2)
        if mutations.isEmpty():
            return []
        mutations = mutations.toDF()

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    if sparkDebug:
        print("############ mutations ==> ", mutations.count())
        print("############ regions   ==> ", regions_df.count())

    if useSQL:
        # Plain Spark SQL strategy: join mutations with regions on position containment,
        # grouped by tumor type and trinucleotide.
        mutations.createOrReplaceTempView("mutations")
        regions_df.createOrReplaceTempView("regions")
        sql_res = spark.sql(
            "SELECT m.tumor_type_id, m.trinucleotide_id_r, count(*) "
            "FROM mutations AS m, regions AS r "
            "WHERE m.chrom = r.chrom AND m.position >= r.pos_start AND m.position <= r.pos_stop "
            "GROUP BY m.tumor_type_id, m.trinucleotide_id_r")
        res = sql_res.rdd.map(lambda r: [r["tumor_type_id"], r["trinucleotide_id_r"], r["count(1)"]]).collect()
        print("Spark execution took %s seconds ---" % (time.time() - start_time))
    else:
        # Custom strategy: collect and sort the (small) regions table, broadcast it, then
        # sweep each partition of mutations against the broadcast list.
        regions = regions_df.collect()
        regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))
        print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))

        regions_broadcast = sc.broadcast(regions)
        print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))

        def partitionWork(p):
            localMutations = list(p)
            matched = []

            if sparkDebug:
                print("====> PROCESSING PARTITION AFTER (S) %s" % (time.time() - start_time))

            if localMutations:
                import copy
                localRegions = copy.deepcopy(regions_broadcast.value)

                if localRegions:
                    sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                    sorted_regions = localRegions

                    cur_reg_idx = 0
                    cur_mut_idx = 0

                    # Sweep both sorted lists, advancing whichever cursor lags behind.
                    while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):
                        cur_reg = sorted_regions[cur_reg_idx]
                        cur_mut = sorted_mutations[cur_mut_idx]

                        if cur_mut["position"] < cur_reg["pos_start"]:
                            cur_mut_idx += 1
                        elif cur_mut["position"] <= cur_reg["pos_stop"]:
                            if cur_reg["chrom"] == cur_mut["chrom"]:
                                matched.append(cur_mut)
                            else:
                                # Look ahead: later regions starting before this mutation may
                                # contain it on the right chromosome.
                                next_region_index = cur_reg_idx + 1
                                while next_region_index < len(sorted_regions) and \
                                        sorted_regions[next_region_index]["pos_start"] <= cur_mut["position"]:
                                    if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                            sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                        matched.append(cur_mut)
                                    next_region_index = next_region_index + 1
                            cur_mut_idx += 1
                        else:
                            cur_reg_idx += 1

            return matched

        if sparkDebug:
            print("#### NUM PARTITIONS: ", mutations.rdd.getNumPartitions())

        res = mutations.rdd.mapPartitions(partitionWork)

        if sparkDebug:
            print("############ results ==> ", res.count())

        # Grouping
        # todo: handle the case in which res is empty (toDF() would fail)
        if groupby:
            if minCount == -1:
                res = res.toDF().groupBy(groupby).count().rdd.map(output_format)
            else:
                res_df = res.toDF().groupBy(groupby).count()
                res = res_df.filter(res_df["count"] > minCount).rdd.map(output_format)

            if sparkDebug:
                print("############ results after grouping ==> ", res.count())

        res = res.collect()

    sc.stop()
    print("Spark execution took %s seconds ---" % (time.time() - start_time))
    return res
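# --- Illustrative usage (sketch only) ----------------------------------------
# A minimal sketch of how spark_intersect might be called. The table names,
# credentials and the output_format lambda below are hypothetical placeholders;
# only the DB_CONF keys and keyword parameters are the ones the function reads.
def example_spark_intersect_usage():
    """Sketch only: the values below are illustrative, not project configuration."""
    db_conf = {
        "postgres_user": "mutviz",        # hypothetical credentials
        "postgres_pw": "secret",
        "postgres_host": "localhost",
        "postgres_port": "5432",          # kept as a string: it is concatenated into the JDBC URL
        "postgres_db": "mutviz",
    }
    # Count matched mutations per chromosome; each grouped Row carries the groupby
    # column(s) plus a "count" column, which output_format turns into a plain list.
    return spark_intersect(
        "mutations",                      # hypothetical mutation table name
        "regions_ctcf",                   # hypothetical regions table name
        db_conf,
        output_format=lambda r: [r["chrom"], r["count"]],
        groupby=["chrom"],
    )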
def spark_intersect2(mutation_table_name, regions_table_name, DB_CONF, jdbc_jar='postgresql-42.2.12.jar'):
    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")
    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"

    print("USING " + str(numPartitions) + " PARTITIONS (-1: AUTO).")

    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"
    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: " + cores)
    print("Using memory: " + memory)
    print("Using partitions: " + str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    print("#############################")

    spark = SparkSession.builder \
        .master("local[" + cores + "]") \
        .appName("spark_intersect2") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password': DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]

    # Load mutations either from PostgreSQL (via JDBC) or from a CSV dump on the local file system.
    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)
        ])
        mutations = spark.read.format("csv").option("header", "true").schema(customSchema) \
            .load(fs_db_dir + "/" + mutation_table_name)

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    # Collect, sort and broadcast the regions; each partition of mutations is then swept
    # against the broadcast list (same strategy as the non-SQL branch of spark_intersect).
    regions = regions_df.collect()
    regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))
    print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))

    regions_broadcast = sc.broadcast(regions)
    print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))

    def partitionWork(p):
        localMutations = list(p)
        matched = []

        if sparkDebug:
            print("====> PROCESSING PARTITION AFTER (S) %s" % (time.time() - start_time))

        if localMutations:
            import copy
            localRegions = copy.deepcopy(regions_broadcast.value)

            if localRegions:
                sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                sorted_regions = localRegions

                cur_reg_idx = 0
                cur_mut_idx = 0

                while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):
                    cur_reg = sorted_regions[cur_reg_idx]
                    cur_mut = sorted_mutations[cur_mut_idx]

                    if cur_mut["position"] < cur_reg["pos_start"]:
                        cur_mut_idx += 1
                    elif cur_mut["position"] <= cur_reg["pos_stop"]:
                        if cur_reg["chrom"] == cur_mut["chrom"]:
                            matched.append(cur_mut)
                        else:
                            # Look ahead: later regions starting before this mutation may
                            # contain it on the right chromosome.
                            next_region_index = cur_reg_idx + 1
                            while next_region_index < len(sorted_regions) and \
                                    sorted_regions[next_region_index]["pos_start"] <= cur_mut["position"]:
                                if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                        sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                    matched.append(cur_mut)
                                next_region_index = next_region_index + 1
                        cur_mut_idx += 1
                    else:
                        cur_reg_idx += 1

        return matched

    res = mutations.rdd.mapPartitions(partitionWork).toDF().toPandas()

    print("Spark execution took %s seconds ---" % (time.time() - start_time))
    return res
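# --- Environment configuration sketch -----------------------------------------
# Both spark_intersect and spark_intersect2 read their tuning from the MUTVIZ_*
# environment variables used above. The helper below shows one way to set them;
# the concrete values are illustrative assumptions, not settings shipped with this code.
def configure_mutviz_spark_env():
    """Set the MUTVIZ_* variables consumed above (illustrative values only)."""
    os.environ.setdefault("MUTVIZ_SPARK_HOME", "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ.setdefault("MUTVIZ_CORES", "8")                # used as local[8]; "*" means all cores
    os.environ.setdefault("MUTVIZ_DRIVER_MEMORY", "50g")
    os.environ.setdefault("MUTVIZ_NUM_PARTITIONS", "-1")      # -1: let Spark choose
    os.environ.setdefault("MUTVIZ_SPARK_DEBUG", "false")
    os.environ.setdefault("MUTVIZ_FS_DB_FOLDER", "disabled")  # "disabled": load mutations via JDBC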
        }, sort_keys=False)
    except BaseException as e:
        return None


producer = None
sc = None

try:
    data_file = "airports.dat"
    data_file_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
    download_file(data_file_url, data_file)

    kafka_url = "172.18.0.100:9092"
    kafka_topic = "airports"

    sc = SparkContext("local", "example-of-processing-data")
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    airports = DataFrameReader(sqlContext).csv(data_file)

    producer = KafkaProducer(bootstrap_servers=kafka_url)

    # for index in range(1, airports.count()):
    #     print(to_json(airports.take(index)[0]))

    # Send every airport row to the Kafka topic as a JSON message.
    for each in airports.collect():
        # logging.debug(as_json(each))
        producer.send(kafka_topic, as_json(each))
finally:
    # Guard against failures that occur before producer/sc are created.
    if producer is not None:
        producer.close()
    if sc is not None:
        sc.stop()
    shutil.os.remove(data_file)
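# --- Helper sketches (assumptions) ---------------------------------------------
# The script above calls download_file() and as_json(), whose definitions are not
# shown in this fragment. The stand-ins below are plausible sketches for illustration
# only (they would need to be defined before the try-block above); they are not the
# original implementations.
import json
import urllib.request


def download_file(url, destination):
    # Assumed behaviour: fetch the remote file and store it at the given local path.
    urllib.request.urlretrieve(url, destination)


def as_json(row):
    # Assumed behaviour: serialize a Spark Row to a JSON byte string so that
    # KafkaProducer.send() accepts it; mirrors the sort_keys=False / BaseException
    # handling visible in the truncated fragment above.
    try:
        return json.dumps(row.asDict(), sort_keys=False).encode("utf-8")
    except BaseException:
        return None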