def spark_intersect(mutation_table_name, regions_table_name, DB_CONF, output_format, regions=None,
                    jdbc_jar='postgresql-42.2.12.jar', groupby=None, useSQL=False, minCount=-1,
                    tumorType=None, filter=None):
    # Benchmark notes kept from the original (SQL vs. custom partition sweep):
    # MINE8: 388 [1h], 1507 (25min), 1018 [bin=20], (1h, no bins), 1101 (5 bins), 994 [100], 952 [200],
    # 916 (ctcf), 941 [41]; 590 ETS1; 3h13 geco, 4h37 genomic

    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")
    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"

    print("USING " + str(numPartitions) + " PARTITIONS (-1: AUTO).")

    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"
    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: " + cores)
    print("Using memory: " + memory)
    print("Using partitions: " + str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    if tumorType:
        print("Tumor Type id: " + str(tumorType))
    if filter:
        print("Filter count: " + str(len(filter)))
    print("#############################")

    spark = SparkSession.builder \
        .master("local[" + cores + "]") \
        .appName("spark_intersect") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password': DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]

    # Load mutations either from PostgreSQL (via JDBC) or from a CSV dump on the local file system.
    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)
        ])
        mutations = spark.read.format("csv").option("header", "true").schema(customSchema) \
            .load(fs_db_dir + "/" + mutation_table_name)

    def filter1(x):
        return int(x["tumor_type_id"]) == int(tumorType)

    def filter2(x):
        return int(x["donor_id"]) in filter

    # Optional filtering by tumor type and/or donor list. Both predicates are plain Python
    # functions, so the DataFrame is switched to its RDD representation first.
    if tumorType or filter is not None:
        mutations = mutations.rdd
        if tumorType:
            mutations = mutations.filter(filter1)
        if filter is not None:
            mutations = mutations.filter(filter2)
        if mutations.isEmpty():
            return []
        mutations = mutations.toDF()

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    if sparkDebug:
        print("############ mutations ==> ", mutations.count())
        print("############ regions   ==> ", regions_df.count())

    if useSQL:
        # Plain Spark SQL strategy: join mutations with regions on position containment,
        # grouped by tumor type and trinucleotide.
        mutations.createOrReplaceTempView("mutations")
        regions_df.createOrReplaceTempView("regions")
        sql_res = spark.sql(
            "SELECT m.tumor_type_id, m.trinucleotide_id_r, count(*) "
            "FROM mutations AS m, regions AS r "
            "WHERE m.chrom = r.chrom AND m.position >= r.pos_start AND m.position <= r.pos_stop "
            "GROUP BY m.tumor_type_id, m.trinucleotide_id_r")
        res = sql_res.rdd.map(lambda r: [r["tumor_type_id"], r["trinucleotide_id_r"], r["count(1)"]]).collect()
        print("Spark execution took %s seconds ---" % (time.time() - start_time))
    else:
        # Custom strategy: collect and sort the (small) regions table, broadcast it, then
        # sweep each partition of mutations against the broadcast list.
        regions = regions_df.collect()
        regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))
        print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))

        regions_broadcast = sc.broadcast(regions)
        print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))

        def partitionWork(p):
            localMutations = list(p)
            matched = []

            if sparkDebug:
                print("====> PROCESSING PARTITION AFTER (S) %s" % (time.time() - start_time))

            if localMutations:
                import copy
                localRegions = copy.deepcopy(regions_broadcast.value)

                if localRegions:
                    sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                    sorted_regions = localRegions

                    cur_reg_idx = 0
                    cur_mut_idx = 0

                    # Sweep both sorted lists, advancing whichever cursor lags behind.
                    while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):
                        cur_reg = sorted_regions[cur_reg_idx]
                        cur_mut = sorted_mutations[cur_mut_idx]

                        if cur_mut["position"] < cur_reg["pos_start"]:
                            cur_mut_idx += 1
                        elif cur_mut["position"] <= cur_reg["pos_stop"]:
                            if cur_reg["chrom"] == cur_mut["chrom"]:
                                matched.append(cur_mut)
                            else:
                                # Look ahead: later regions starting before this mutation may
                                # contain it on the right chromosome.
                                next_region_index = cur_reg_idx + 1
                                while next_region_index < len(sorted_regions) and \
                                        sorted_regions[next_region_index]["pos_start"] <= cur_mut["position"]:
                                    if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                            sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                        matched.append(cur_mut)
                                    next_region_index = next_region_index + 1
                            cur_mut_idx += 1
                        else:
                            cur_reg_idx += 1

            return matched

        if sparkDebug:
            print("#### NUM PARTITIONS: ", mutations.rdd.getNumPartitions())

        res = mutations.rdd.mapPartitions(partitionWork)

        if sparkDebug:
            print("############ results ==> ", res.count())

        # Grouping
        # todo: handle the case in which res is empty (toDF() would fail)
        if groupby:
            if minCount == -1:
                res = res.toDF().groupBy(groupby).count().rdd.map(output_format)
            else:
                res_df = res.toDF().groupBy(groupby).count()
                res = res_df.filter(res_df["count"] > minCount).rdd.map(output_format)

            if sparkDebug:
                print("############ results after grouping ==> ", res.count())

        res = res.collect()

    sc.stop()
    print("Spark execution took %s seconds ---" % (time.time() - start_time))
    return res
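# --- Illustrative usage (sketch only) ----------------------------------------
# A minimal sketch of how spark_intersect might be called. The table names,
# credentials and the output_format lambda below are hypothetical placeholders;
# only the DB_CONF keys and keyword parameters are the ones the function reads.
def example_spark_intersect_usage():
    """Sketch only: the values below are illustrative, not project configuration."""
    db_conf = {
        "postgres_user": "mutviz",        # hypothetical credentials
        "postgres_pw": "secret",
        "postgres_host": "localhost",
        "postgres_port": "5432",          # kept as a string: it is concatenated into the JDBC URL
        "postgres_db": "mutviz",
    }
    # Count matched mutations per chromosome; each grouped Row carries the groupby
    # column(s) plus a "count" column, which output_format turns into a plain list.
    return spark_intersect(
        "mutations",                      # hypothetical mutation table name
        "regions_ctcf",                   # hypothetical regions table name
        db_conf,
        output_format=lambda r: [r["chrom"], r["count"]],
        groupby=["chrom"],
    )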
def spark_intersect2(mutation_table_name, regions_table_name, DB_CONF, jdbc_jar='postgresql-42.2.12.jar'):
    fs_db_dir = os.getenv('MUTVIZ_FS_DB_FOLDER', "disabled")
    numPartitions = int(os.getenv('MUTVIZ_NUM_PARTITIONS', -1))
    memory = os.getenv('MUTVIZ_DRIVER_MEMORY', "50g")
    sparkDebug = os.getenv('MUTVIZ_SPARK_DEBUG', "false") == "true"

    print("USING " + str(numPartitions) + " PARTITIONS (-1: AUTO).")

    start_time = time.time()

    os.environ["SPARK_HOME"] = os.getenv('MUTVIZ_SPARK_HOME', "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ["PYSPARK_PYTHON"] = sys.executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

    driver_class = "org.postgresql.Driver"
    cores = os.getenv('MUTVIZ_CORES', "*")

    print("#### SPARK CONFIGURATION ####")
    print("SPARK HOME: " + os.getenv('SPARK_HOME'))
    print("Using cores: " + cores)
    print("Using memory: " + memory)
    print("Using partitions: " + str(numPartitions))
    print("Debug enabled: " + str(sparkDebug))
    print("#############################")

    spark = SparkSession.builder \
        .master("local[" + cores + "]") \
        .appName("spark_intersect2") \
        .config("spark.jars", jdbc_jar) \
        .config("spark.driver.memory", memory) \
        .config("spark.driver.cores", cores) \
        .getOrCreate()

    sql_ctx = SQLContext(spark.sparkContext)
    sc = spark.sparkContext

    properties = {'user': DB_CONF["postgres_user"], 'password': DB_CONF["postgres_pw"], 'driver': driver_class}
    url = 'postgresql://' + DB_CONF["postgres_host"] + ':' + DB_CONF["postgres_port"] + '/' + DB_CONF["postgres_db"]

    # Load mutations either from PostgreSQL (via JDBC) or from a CSV dump on the local file system.
    if fs_db_dir == 'disabled':
        mutations = DataFrameReader(sql_ctx).jdbc(
            url='jdbc:%s' % url, table=mutation_table_name, properties=properties
        )
    else:
        customSchema = StructType([
            StructField("donor_id", IntegerType(), False),
            StructField("tumor_type_id", IntegerType(), False),
            StructField("chrom", IntegerType(), False),
            StructField("position", IntegerType(), False),
            StructField("mutation_code_id", IntegerType(), False),
            StructField("trinucleotide_id_r", IntegerType(), False)
        ])
        mutations = spark.read.format("csv").option("header", "true").schema(customSchema) \
            .load(fs_db_dir + "/" + mutation_table_name)

    regions_df = DataFrameReader(sql_ctx).jdbc(
        url='jdbc:%s' % url, table=regions_table_name, properties=properties
    )

    # Collect, sort and broadcast the regions; each partition of mutations is then swept
    # against the broadcast list (same strategy as the non-SQL branch of spark_intersect).
    regions = regions_df.collect()
    regions = sorted(regions, key=itemgetter('pos_start', 'pos_stop'))
    print("====> REGIONS SORTED AFTER (S) %s" % (time.time() - start_time))

    regions_broadcast = sc.broadcast(regions)
    print("====> REGIONS BROADCAST AFTER (S) %s" % (time.time() - start_time))

    def partitionWork(p):
        localMutations = list(p)
        matched = []

        if sparkDebug:
            print("====> PROCESSING PARTITION AFTER (S) %s" % (time.time() - start_time))

        if localMutations:
            import copy
            localRegions = copy.deepcopy(regions_broadcast.value)

            if localRegions:
                sorted_mutations = sorted(localMutations, key=itemgetter('position'))
                sorted_regions = localRegions

                cur_reg_idx = 0
                cur_mut_idx = 0

                while cur_mut_idx < len(sorted_mutations) and cur_reg_idx < len(sorted_regions):
                    cur_reg = sorted_regions[cur_reg_idx]
                    cur_mut = sorted_mutations[cur_mut_idx]

                    if cur_mut["position"] < cur_reg["pos_start"]:
                        cur_mut_idx += 1
                    elif cur_mut["position"] <= cur_reg["pos_stop"]:
                        if cur_reg["chrom"] == cur_mut["chrom"]:
                            matched.append(cur_mut)
                        else:
                            # Look ahead: later regions starting before this mutation may
                            # contain it on the right chromosome.
                            next_region_index = cur_reg_idx + 1
                            while next_region_index < len(sorted_regions) and \
                                    sorted_regions[next_region_index]["pos_start"] <= cur_mut["position"]:
                                if sorted_regions[next_region_index]["chrom"] == cur_mut["chrom"] and \
                                        sorted_regions[next_region_index]["pos_stop"] >= cur_mut["position"]:
                                    matched.append(cur_mut)
                                next_region_index = next_region_index + 1
                        cur_mut_idx += 1
                    else:
                        cur_reg_idx += 1

        return matched

    res = mutations.rdd.mapPartitions(partitionWork).toDF().toPandas()

    print("Spark execution took %s seconds ---" % (time.time() - start_time))
    return res
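# --- Environment configuration sketch -----------------------------------------
# Both spark_intersect and spark_intersect2 read their tuning from the MUTVIZ_*
# environment variables used above. The helper below shows one way to set them;
# the concrete values are illustrative assumptions, not settings shipped with this code.
def configure_mutviz_spark_env():
    """Set the MUTVIZ_* variables consumed above (illustrative values only)."""
    os.environ.setdefault("MUTVIZ_SPARK_HOME", "/var/lib/spark-2.4.5-bin-hadoop2.7")
    os.environ.setdefault("MUTVIZ_CORES", "8")                # used as local[8]; "*" means all cores
    os.environ.setdefault("MUTVIZ_DRIVER_MEMORY", "50g")
    os.environ.setdefault("MUTVIZ_NUM_PARTITIONS", "-1")      # -1: let Spark choose
    os.environ.setdefault("MUTVIZ_SPARK_DEBUG", "false")
    os.environ.setdefault("MUTVIZ_FS_DB_FOLDER", "disabled")  # "disabled": load mutations via JDBC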
        }, sort_keys=False)
    except BaseException as e:
        return None


producer = None
sc = None

try:
    data_file = "airports.dat"
    data_file_url = "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat"
    download_file(data_file_url, data_file)

    kafka_url = "172.18.0.100:9092"
    kafka_topic = "airports"

    sc = SparkContext("local", "example-of-processing-data")
    sc.setLogLevel("ERROR")
    sqlContext = SQLContext(sc)

    airports = DataFrameReader(sqlContext).csv(data_file)

    producer = KafkaProducer(bootstrap_servers=kafka_url)

    # for index in range(1, airports.count()):
    #     print(to_json(airports.take(index)[0]))

    # Send every airport row to the Kafka topic as a JSON message.
    for each in airports.collect():
        # logging.debug(as_json(each))
        producer.send(kafka_topic, as_json(each))
finally:
    # Guard against failures that occur before producer/sc are created.
    if producer is not None:
        producer.close()
    if sc is not None:
        sc.stop()
    shutil.os.remove(data_file)
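# --- Helper sketches (assumptions) ---------------------------------------------
# The script above calls download_file() and as_json(), whose definitions are not
# shown in this fragment. The stand-ins below are plausible sketches for illustration
# only (they would need to be defined before the try-block above); they are not the
# original implementations.
import json
import urllib.request


def download_file(url, destination):
    # Assumed behaviour: fetch the remote file and store it at the given local path.
    urllib.request.urlretrieve(url, destination)


def as_json(row):
    # Assumed behaviour: serialize a Spark Row to a JSON byte string so that
    # KafkaProducer.send() accepts it; mirrors the sort_keys=False / BaseException
    # handling visible in the truncated fragment above.
    try:
        return json.dumps(row.asDict(), sort_keys=False).encode("utf-8")
    except BaseException:
        return None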