Code Example #1
File: io.py Project: robertjklein/fishing
def variants_from_tped (tped):
  """
  Given a tped file in a data frame, extracts the variant information to match the VCF format above with applicable columns (CHR, ID, POS), along with MAP and
  an array of available alleles.
  Uses pandas UDFs to convert the splitdata array to get the non-'0' alleles present
  """

  # Define the UDFs locally here.  Since I'm working on arrays, the best way I found to do this is to use the pandas.Series.apply function
  def pandas_get_alleles_present (x):
    return(list(frozenset(x[4:]) - frozenset(['0'])))
  tped_get_alleles_present = f.pandas_udf(lambda x: x.apply(pandas_get_alleles_present), ArrayType(StringType()))

  # Compute missingness directly from the allele array
  def pandas_get_missing (x):
    count = 0
    for i in range(4, len(x), 2):
      if (x[i] == '0') | (x[i+1] == '0'):
        count += 1
    return(count)
  tped_get_missing = f.pandas_udf(lambda x: x.apply(pandas_get_missing), IntegerType())

  # Split the tped file
  splitdata = tped.select("filename",f.split(tped.data,"[\t ]+").alias("split_data"),tped.lineid.alias("VAR_IDX"))

  # Pull out the first four columns with appropriate casts, and get frac_missing and alleles_present from the UDFs above
  with_alleles = splitdata.select("filename","VAR_IDX", \
    f.element_at(splitdata.split_data,1).alias("CHR"), \
    f.element_at(splitdata.split_data,2).alias("ID"), \
    f.element_at(splitdata.split_data,3).cast(FloatType()).alias("MAP"), \
    f.element_at(splitdata.split_data,4).cast(IntegerType()).alias("POS"), \
    ((f.size(splitdata.split_data) - 4) / 2).alias("n_samples"), \
    tped_get_alleles_present("split_data").alias("alleles_present"), \
    tped_get_missing("split_data").cast(FloatType()).alias("missingcnt"))

  return(with_alleles)
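For reference, a minimal Spark-free sketch of what the two nested helpers above compute on one split TPED row (the sample row is made up; the assumed layout is CHR, ID, MAP, POS followed by allele pairs):

row = ["1", "rs123", "0.0", "12345", "A", "G", "0", "0", "A", "A"]

# Non-'0' alleles seen across all genotype columns (set order is arbitrary)
alleles_present = list(frozenset(row[4:]) - frozenset(["0"]))  # e.g. ['A', 'G']

# Number of samples with at least one missing ('0') allele
missing = 0
for i in range(4, len(row), 2):
    if row[i] == "0" or row[i + 1] == "0":
        missing += 1

print(alleles_present, missing)  # -> ['A', 'G'] (in some order), 1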
Code Example #2
def main(input_dir,output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])

    headlines_df = spark.read.json(input_dir,encoding='utf-8',schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'],1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'],2)
    ).cache()

    for year_int in range(2008,2020):
        print('Plotting for '+str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year',functions.year(split_sentiment_df['created_utc_iso']))

        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select( functions.array_join(headlines_grouped['titles_group'],' ').alias('joined') )
        string_to_plot = headlines_joined.collect()[0]['joined'] #only one row remaining of concatenated headlines

        wordcloud = WordCloud(background_color='white', stopwords=stopwords, width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/'+str(year_int)+'_words.png')
Code Example #3
File: functions.py Project: projectglow/glow
def _get_base_cols(row: StructExpression) -> List[Column]:
    assert check_argument_types()

    contig_name_col = fx.col("`locus.contig`").alias("contigName")

    start_col = (fx.col("`locus.position`") - 1).cast("long").alias("start")

    end_col = start_col + fx.length(fx.element_at("alleles", 1))
    has_info = 'info' in row and isinstance(row.info.dtype, tstruct)
    if has_info and 'END' in row.info and row.info.END.dtype == tint:
        end_col = fx.coalesce(fx.col("`info.END`"), end_col)
    end_col = end_col.cast("long").alias("end")

    names_elems = []
    if 'varid' in row and row.varid.dtype == tstr:
        names_elems.append("varid")
    if 'rsid' in row and row.rsid.dtype == tstr:
        names_elems.append("rsid")
    names_col = fx.expr(
        f"nullif(filter(array({','.join(names_elems)}), n -> isnotnull(n)), array())").alias("names")

    reference_allele_col = fx.element_at("alleles", 1).alias("referenceAllele")

    alternate_alleles_col = fx.expr("slice(alleles, 2, size(alleles) - 1)").alias("alternateAlleles")

    base_cols = [
        contig_name_col, start_col, end_col, names_col, reference_allele_col, alternate_alleles_col
    ]
    assert check_return_type(base_cols)
    return base_cols
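A small worked example (made-up values) of the coordinate logic above: the input locus.position is 1-based, the output start/end are 0-based and half-open, and end falls back to start plus the reference allele length when no valid INFO/END is present:

locus_position = 100             # 1-based position
alleles = ["AT", "A"]            # reference allele first, then alternates
start = locus_position - 1       # 99
end = start + len(alleles[0])    # 101, unless INFO/END overrides it
print(start, end)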
Code Example #4
def test_array_element_at(data_gen):
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: unary_op_df(spark, data_gen).select(
            element_at(col('a'), 1), element_at(col('a'), -1)),
        conf={
            'spark.sql.ansi.enabled': False,
            'spark.sql.legacy.allowNegativeScaleOfDecimal': True
        })
Code Example #5
File: __init__.py Project: willb/spark-event-vis
def driver_accumulator_updates(df):
    return df.where(
        df.Event ==
        'org.apache.spark.sql.execution.ui.SparkListenerDriverAccumUpdates'
    ).select("executionId",
             F.explode("accumUpdates").alias("updates")).select(
                 "executionId",
                 F.element_at(F.col("updates"), 1).alias("accumulator"),
                 F.element_at(F.col("updates"), 2).alias("value"))
Code Example #6
def process_adverse_events(adverse_events: str) -> DataFrame:
    """
    Loads and processes the adverse events input TSV.

    Ex. input record:
        biologicalSystem | gastrointestinal
        effect           | activation_general
        efoId            | EFO_0009836
        ensemblId        | ENSG00000133019
        pmid             | 23197038
        ref              | Bowes et al. (2012)
        symptom          | bronchoconstriction
        target           | CHRM3
        uberonCode       | UBERON_0005409
        url              | null

    Ex. output record:
        id         | ENSG00000133019
        event      | bronchoconstriction
        datasource | Bowes et al. (2012)
        eventId    | EFO_0009836
        literature | 23197038
        url        | null
        biosample  | {gastrointestinal, UBERON_0005409, null, null, null}
        effects    | [{activation, general}]
    """

    ae_df = (spark.read.csv(adverse_events, sep='\t', header=True).select(
        F.col('ensemblId').alias('id'),
        F.col('symptom').alias('event'),
        F.col('efoId').alias('eventId'),
        F.col('ref').alias('datasource'),
        F.col('pmid').alias('literature'),
        'url',
        F.struct(
            F.col('biologicalSystem').alias('tissueLabel'),
            F.col('uberonCode').alias('tissueId'),
            F.lit(None).alias('cellLabel'),
            F.lit(None).alias('cellFormat'),
            F.lit(None).alias('cellId'),
        ).alias('biosample'),
        F.split(F.col('effect'), '_').alias('effects'),
    ).withColumn(
        'effects',
        F.struct(
            F.element_at(F.col('effects'), 1).alias('direction'),
            F.element_at(F.col('effects'), 2).alias('dosing')),
    ))

    # Multiple dosing effects need to be grouped in the same record.
    effects_df = ae_df.groupBy('id', 'event', 'datasource').agg(
        F.collect_set(F.col("effects")).alias("effects"))
    ae_df = ae_df.drop("effects").join(effects_df,
                                       on=["id", "event", "datasource"],
                                       how="left")

    return ae_df
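A self-contained sketch (made-up record) of the effect-parsing step above: a value such as 'activation_general' is split on '_' and the two parts become the direction/dosing struct:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("activation_general",)], ["effect"])
(demo
 .withColumn("effects", F.split(F.col("effect"), "_"))
 .withColumn("effects", F.struct(
     F.element_at(F.col("effects"), 1).alias("direction"),
     F.element_at(F.col("effects"), 2).alias("dosing")))
 .show(truncate=False))
# effects -> {activation, general}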
Code Example #7
def select_type2(df, *type2):
    """Select fields for log format; 155138

    :param df: Input DataFrame
    :param type2: A list of shopping_sites_id
    :return: Output DataFrame
    """
    stage_df = (
        df.filter(df.logtype.isin('view')
                  & df.info.siteseq.isin(*type2)).select(
                      'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
                      json_tuple(df.custom, 'og:url', 'og:title').alias(
                          'productCode', 'productName')).withColumnRenamed(
                              'info.siteseq', 'siteseq'))
    stage_df = stage_df.withColumn('productCode',
                                   split(stage_df['productCode'], '/'))

    return (df.filter(
        df.logtype.isin('login', 'purchase', 'cart')
        & df.info.siteseq.isin(*type2)).select(
            'maid', 'info.siteseq', 'userid', 'timestamp', 'logtype',
            json_tuple(df.custom, 'productCode', 'productName').alias(
                'productCode', 'productName')).withColumnRenamed(
                    'info.siteseq', 'siteseq').unionAll((stage_df.select(
                        'maid', 'siteseq', 'userid', 'timestamp', 'logtype',
                        element_at(stage_df.productCode,
                                   -1).alias('productCode'), 'productName'))))
Code Example #8
def test_array_element_at_ansi_fail(data_gen):
    assert_gpu_and_cpu_error(
        lambda spark: unary_op_df(spark, data_gen).select(
            element_at(col('a'), 100)).collect(),
        conf={
            'spark.sql.ansi.enabled': True,
            'spark.sql.legacy.allowNegativeScaleOfDecimal': True
        },
        error_message='java.lang.ArrayIndexOutOfBoundsException')
Code Example #9
File: io.py Project: robertjklein/fishing
def gts_from_impute (infile):
  # Get the main data and put a unique index on each variant
  maindata = infile.filter(infile.data[0:1] != "#")
  splitdata = maindata.select("filename",f.split(maindata.data,"[\t ]+").alias("split_data"),maindata.lineid.alias("VAR_IDX"))

  gtdata1 = splitdata.select("filename", "VAR_IDX", f.posexplode(splitdata.split_data)).toDF("filename","VAR_IDX","COLUMN_IDX","GTPROB").filter("COLUMN_IDX > 4")
  # Now, get subject ID and which GT
  gtdata2 = gtdata1.select("filename", "VAR_IDX", "GTPROB", "COLUMN_IDX", f.floor((gtdata1.COLUMN_IDX - 5) / 3).alias("SAMPLE_IDX"), ((gtdata1.COLUMN_IDX - 5) % 3).cast(StringType()).alias("GT_IDX"))
  gtdata3 = rkutil.withColumnsRenamed(gtdata2.groupBy("filename","VAR_IDX","SAMPLE_IDX").pivot("GT_IDX",["0","1","2"]).agg(f.collect_list("GTPROB")), ["0","1","2"],["c0","c1","c2"])
  gtdata4 = gtdata3.select("filename","VAR_IDX","SAMPLE_IDX", f.element_at(gtdata3.c0, 1).cast(FloatType()).alias("P11"), f.element_at(gtdata3.c1, 1).cast(FloatType()).alias("P12"), f.element_at(gtdata3.c2, 1).cast(FloatType()).alias("P22"))
  return(gtdata4)
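A worked example of the column-index arithmetic above, assuming the usual IMPUTE layout (five leading metadata columns, then three genotype probabilities per sample; posexplode positions are 0-based):

for column_idx in range(5, 11):
    sample_idx = (column_idx - 5) // 3   # same as floor((COLUMN_IDX - 5) / 3)
    gt_idx = (column_idx - 5) % 3
    print(column_idx, sample_idx, gt_idx)
# Columns 5-7 map to sample 0 (GT_IDX 0, 1, 2); columns 8-10 map to sample 1, and so on.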
Code Example #10
File: array_test.py Project: sameerz/spark-rapids
def test_array_element_at_zero_index_fail(index, ansi_enabled):
    message = "SQL array indices start at 1"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(
            element_at(col('a'), index)).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr(
            'element_at(a, b)').collect()
    assert_gpu_and_cpu_error(
        test_func,
        conf={'spark.sql.ansi.enabled':ansi_enabled},
        error_message=message)
Code Example #11
def top10_trends(data):
    result = {"videos": []}
    videos = data.groupBy(["video_id", "title", "description"]) \
        .agg(collect_list(array('trending_date', "views", "likes", "dislikes")) \
             .alias("trending_days"), countDistinct("trending_date"))

    videos = videos.orderBy("count(trending_date)", ascending=False)
    videos = videos.withColumn("latest_views", element_at("trending_days", -1)[1]) \
        .withColumn("latest_likes", element_at("trending_days", -1)[2]) \
        .withColumn("latest_dislikes", element_at("trending_days", -1)[3])

    for row in videos.rdd.collect():
        result["videos"].append({
            "id": row["video_id"],
            "title": row["title"],
            "description": row["description"],
            "latest_views": row["latest_views"],
            "trending_days": row["trending_days"]
        })

    result["videos"] = result["videos"][:10]
    return result
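One indexing detail worth noting in the snippet above: element_at is 1-based (with -1 meaning the last element), while Column[...] / getItem is 0-based, so [1] picks 'views' out of each [trending_date, views, likes, dislikes] entry. A minimal illustration with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, element_at

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([([["d1", "10", "3", "1"], ["d2", "20", "5", "2"]],)],
                             ["trending_days"])
demo.select(element_at(col("trending_days"), -1)[1].alias("latest_views")).show()
# latest_views -> '20', the views of the most recent trending entry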
Code Example #12
def joinDataSet():

    spark = SparkSession.builder.appName('csv_parse').getOrCreate()

    #Load xml
    xml_df = spark.read.format('com.databricks.spark.xml') \
        .option("rootTag", "feed") \
        .option("rowTag", "doc") \
        .load(xml_s3_path) \
        .withColumn("title", f.ltrim(f.split(f.col("title"), ":").getItem(1))) \
        .withColumn("shortUrl", f.split(f.col("url"), "/"))

    selectedData = xml_df.select(
        "title", "url",
        f.element_at(f.col('shortUrl'), -1).alias('shortUrl'), "abstract")
    selectedData.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path_2,
                                                          header='false')
    selectedData.createOrReplaceTempView("wiki_pages")

    #Load csv
    json_schema = ArrayType(
        StructType([
            StructField('name', StringType(), nullable=False),
            StructField('id', IntegerType(), nullable=False)
        ]))

    df = spark.read.option("header", True) \
        .option("quote", "\"") \
        .option("escape", "\"") \
        .option("multiLine", True) \
        .csv(csv_s3_path) \
        .withColumn("sanitizedTitle", f.regexp_replace(f.col("title"), "\\s+", "_")) \
        .withColumn("year", f.split(f.col("release_date"), "-").getItem(0)) \
        .withColumn("companiesList", f.from_json(f.col("production_companies"), json_schema)) \
        .withColumn("companiesList", f.concat_ws("-", f.col("companiesList.name")))

    csvSelectedData = df.select("title", "sanitizedTitle")
    csvSelectedData.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path_3,
                                                          header='false')

    df.createOrReplaceTempView("movies_metadata")

    # Join datasets
    q = spark.sql(join_sql_query)

    # Write output to s3
    q.repartition(1).write.option(
        "sep", "\t").format('csv').mode("overwrite").save(csv_output_path,
                                                          header='false')
Code Example #13
File: array_test.py Project: sameerz/spark-rapids
def test_array_element_at_ansi_fail_invalid_index(index):
    message = "ArrayIndexOutOfBoundsException" if is_before_spark_330() else "SparkArrayIndexOutOfBoundsException"
    if isinstance(index, int):
        test_func = lambda spark: unary_op_df(spark, ArrayGen(int_gen)).select(
            element_at(col('a'), index)).collect()
    else:
        test_func = lambda spark: two_col_df(spark, ArrayGen(int_gen), index).selectExpr(
            'element_at(a, b)').collect()
    # For 3.3.0+ strictIndexOperator should not affect element_at
    test_conf=copy_and_update(ansi_enabled_conf, {'spark.sql.ansi.strictIndexOperator': 'false'})
    assert_gpu_and_cpu_error(
        test_func,
        conf=test_conf,
        error_message=message)
Code Example #14
def simple_additive_gt(data):
    df = data.select("*",
                     element_at(split(data.RAWGT, "\s+"), 1).alias("a1"),
                     element_at(split(data.RAWGT, "\s+"), 2).alias("a2"))
    df = df.select("*", (df.a1 == df.REF).alias("a1R"),
                   (df.a2 == df.REF).alias("a2R"),
                   (df.a1 == df.ALT).alias("a1A"),
                   (df.a2 == df.ALT).alias("a2A"))

    # Now, filter out those for which a1 isn't REF or ALT or a2 isn't REF or ALT
    df = df.drop("a1", "a2").withColumn("a1", df.a1A | df.a1R).withColumn(
        "a2", df.a2A | df.a2R)

    df.filter((df.a1 == False) | (df.a2 == False)).select(
        "VAR_IDX", "RAWGT", "REF", "ALT").distinct().show()

    # And make the additive
    df = df.filter(df.a1 == True).filter(
        df.a2 == True).filter("RAWGT != '0 0'").withColumn(
            "GT_ADD",
            df.a1A.cast(IntegerType()) + df.a2A.cast(IntegerType())).drop(
                "a1A", "a2A", "a1R", "a2R", "a1", "a2", "REF", "ALT")
    return (df)
Code Example #15
def assoc_fn(df: DataFrame, group_by_cols):
    gbc = [col(x) for x in group_by_cols]
    h_fn = partial(harmonic_fn,
                   partition_cols=group_by_cols,
                   over_col="evs_score",
                   output_col=harmonic_col)
    assoc_df = (df.withColumn(
        "evs_score", array_min(array(col("evidence_score") / 10.0, lit(1.0)))
    ).transform(h_fn).groupBy(*gbc).agg(
        countDistinct(col("pmid")).alias("f"),
        mean(col("evidence_score")).alias("mean"),
        stddev(col("evidence_score")).alias("std"),
        max(col("evidence_score")).alias("max"),
        min(col("evidence_score")).alias("min"),
        expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").
        alias("q"),
        count(col("pmid")).alias("N"),
        first(col(harmonic_col)).alias(harmonic_col)).withColumn(
            "median", element_at(col("q"), 2)).withColumn(
                "q1", element_at(col("q"),
                                 1)).withColumn("q3", element_at(col("q"),
                                                                 3)).drop("q"))

    return assoc_df
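A small, self-contained sketch (made-up scores) of the quartile pattern used above: approx_percentile returns an array, and element_at (1-based) pulls out q1, the median and q3:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, element_at, expr

spark = SparkSession.builder.getOrCreate()
scores = spark.createDataFrame([(float(x),) for x in range(1, 101)], ["evidence_score"])
(scores
 .agg(expr("approx_percentile(evidence_score, array(0.25, 0.5, 0.75))").alias("q"))
 .select(element_at(col("q"), 1).alias("q1"),
         element_at(col("q"), 2).alias("median"),
         element_at(col("q"), 3).alias("q3"))
 .show())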
Code Example #16
File: io.py Project: robertjklein/fishing
def variants_from_vcf (vcf):
  """
  Given a VCF file in a data frame, extract the first 9 variant columns and give them unique identifiers.  Include genotype columns as an array parsed out
  with a pandas udf
  """

  # Get the main data and put a unique index on each variant
  maindata = vcf.filter(vcf.data.startswith('#') == False)
  splitdata = maindata.select("filename",f.split(f.substring_index('data',"[\t ]+",9),"[\t ]+").alias("split_data"),maindata.lineid.alias("VAR_IDX"))
    
  # Now pull out the columns one at a time, casting non-strings to appropriate type.  Split out INFO and FORMAT here
  variant = splitdata.select("filename","VAR_IDX",\
    f.element_at(splitdata.split_data,1).alias("CHR"),\
    f.element_at(splitdata.split_data,2).cast(IntegerType()).alias("POS"),\
    f.element_at(splitdata.split_data,3).alias("ID"),\
    f.element_at(splitdata.split_data,4).alias("REF"),\
    f.element_at(splitdata.split_data,5).alias("ALT"),\
    f.element_at(splitdata.split_data,6).cast(FloatType()).alias("QUAL"),\
    f.element_at(splitdata.split_data,7).alias("FILTER"),\
    f.split(f.element_at(splitdata.split_data,8), ";").alias("INFO"),\
    f.split(f.element_at(splitdata.split_data,9), ":").alias("FORMAT"))
  return(variant)
Code Example #17
File: io.py Project: robertjklein/fishing
def variants_from_impute (infile):
  
  # First map the filename to CHR
  chrs = dict()
  for filenamerow in infile.select("filename").distinct().collect():
    s = filenamerow.filename
    i = s.find("chr")
    if (i>0):
      start = i+3
      stop = (i+3)+s[start:].find(".")
      cur_chr = s[start:stop]
      if cur_chr == 'X':
        cur_chr = '23'
    else:
      cur_chr = 'ND'
    chrs[s] = cur_chr
  
      
  
  # Get the main data and put a unique index on each variant.  Add in the CHR here.
  maindata = infile.filter(infile.data[0:1] != "#")
  splitdata = maindata.select(maindata.filename, \
                              maindata.lineid.alias("VAR_IDX"), \
                              maindata.data, \
                              f.split(maindata.data,"[\t ]+").alias("split_data"), \
                              maindata.filename.alias("CHR")).replace(chrs, subset="CHR")
                              
  # Now pull out the first five columns one at a time, casting non-strings to appropriate type.  
  variant = splitdata.select("filename","VAR_IDX","data","CHR",\
    f.element_at(splitdata.split_data,1).alias("COL1ID"),\
    f.element_at(splitdata.split_data,2).alias("RAWID"),\
    f.element_at(splitdata.split_data,3).cast(IntegerType()).alias("POS"),\
    f.element_at(splitdata.split_data,4).alias("ALLELE1"),\
    f.element_at(splitdata.split_data,5).alias("ALLELE2"))

  # Next, get the rsID, if present in the RAWID column
  variant2 = variant.select("*", f.split(variant.RAWID, ":").alias("split_id"))
  variant3 = variant2.select("*", f.element_at(variant2.split_id, 1).alias("EXTRACTID"))
  
  return(variant3)
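A worked example (made-up filename) of the chromosome parsing above: the characters after 'chr' and before the next '.' become the CHR value, with 'X' mapped to '23' and 'ND' used when no 'chr' tag is found:

s = "study1.chr10.impute.gen"
i = s.find("chr")                   # 7
start = i + 3                       # 10
stop = start + s[start:].find(".")  # 12
print(s[start:stop])                # '10'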
Code Example #18
def main(desc_file, evid_file, cell_file, out_file):
    sparkConf = (SparkConf().set('spark.driver.memory', '15g').set(
        'spark.executor.memory',
        '15g').set('spark.driver.maxResultSize',
                   '0').set('spark.debug.maxToStringFields', '2000').set(
                       'spark.sql.execution.arrow.maxRecordsPerBatch',
                       '500000'))
    spark = (SparkSession.builder.config(
        conf=sparkConf).master('local[*]').getOrCreate())

    # Log parameters:
    logging.info(f'Evidence file: {evid_file}')
    logging.info(f'Description file: {desc_file}')
    logging.info(f'Cell type annotation: {cell_file}')
    logging.info(f'Output file: {out_file}')

    # Read files:
    evidence_df = (spark.read.csv(evid_file, sep='\t',
                                  header=True).drop('pmid', 'gene_set_name',
                                                    'disease_name'))
    cell_lines_df = spark.read.csv(cell_file, sep='\t', header=True)
    description_df = spark.read.csv(desc_file, sep='\t', header=True)

    # Logging dataframe stats:
    logging.info(f'Number of evidence: {evidence_df.count()}')
    logging.info(f'Number of descriptions: {description_df.count()}')
    logging.info(f'Number of cell/tissue annotation: {cell_lines_df.count()}')

    # Tissues and cancer types are annotated together in the same column (tissue_or_cancer_type)
    # To disambiguate one from another, the column is combined with the cell lines
    # First on the tissue level:
    tissue_desc = (description_df.withColumnRenamed(
        'tissue_or_cancer_type', 'tissue').join(cell_lines_df,
                                                on='tissue',
                                                how='inner'))

    # And then on the disease level:
    cell_desc = (description_df.withColumnRenamed('tissue_or_cancer_type',
                                                  'diseaseFromSource').join(
                                                      cell_lines_df,
                                                      on='diseaseFromSource',
                                                      how='inner'))

    merged_annotation = (
        # Concatenating the above generated dataframes:
        cell_desc.union(tissue_desc)

        # Aggregating by disease and method:
        .groupBy('diseaseFromSource', 'efo_id', 'method')

        # The cell annotation is aggregated in a list of struct:
        .agg(
            collect_set(
                struct(col('name'), col('id'), col('tissue'),
                       col('tissueId'))).alias('diseaseCellLines')
        ).drop('method'))

    # Joining merged annotation with evidence:
    pooled_evidence_df = (
        evidence_df.select(
            col('target_id').alias('targetFromSourceId'),
            col('disease_id').alias('efo_id'),
            col('score').alias('resourceScore').cast(FloatType()),
        )

        # Some of the target identifier are not Ensembl Gene id - replace them:
        .replace(to_replace=CRISPR_SYMBOL_MAPPING,
                 subset=['targetFromSourceId'])

        # Merging with descriptions:
        .join(merged_annotation, on='efo_id', how='outer')

        # From EFO uri, generate EFO id:
        .withColumn(
            'diseaseFromSourceMappedId',
            element_at(split(col('efo_id'), '/'),
                       -1).alias('diseaseFromSourceMappedId')).drop('efo_id')

        # Adding constants:
        .withColumn('datasourceId', lit('crispr')).withColumn(
            'datatypeId', lit('affected_pathway')).persist())

    logging.info(
        f'Saving {pooled_evidence_df.count()} CRISPR evidence in JSON format, to: {out_file}'
    )

    write_evidence_strings(pooled_evidence_df, out_file)
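The diseaseFromSourceMappedId step above turns an EFO URI into its short identifier by splitting on '/' and taking the last element; a minimal sketch with a made-up row:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, element_at, split

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("http://www.ebi.ac.uk/efo/EFO_0000311",)], ["efo_id"])
demo.select(element_at(split(col("efo_id"), "/"), -1).alias("diseaseFromSourceMappedId")) \
    .show(truncate=False)
# diseaseFromSourceMappedId -> EFO_0000311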
Code Example #19
def debug_augmentation(df):
    return (df.select("customerID").distinct().select(
        "customerID",
        F.substring("customerID", 0, 10).alias("originalID"),
        F.element_at(F.split("customerID", "-", -1), 3).alias("suffix"),
    ))
Code Example #20
key_data = spark.sql("SELECT `key` FROM logging_demp.key_table")

## Create the parent/child pairs we need to build our structure
def zip_pairs(value):
  lead_list = value.copy()
  lead_list.pop()
  lead_list.insert(0,None)
  result = [item for item in zip(lead_list,value)]
  
  return result

pairZip = udf(zip_pairs, ArrayType(ArrayType(StringType())) )

df2 = key_data.select("key").withColumn("key_split", split(col("key"), "/")) \
        .withColumn("depth", size(col("key_split"))) \
        .withColumn("file", element_at(col("key_split"), -1) ) \
        .withColumn("pairs", pairZip(col("key_split")))

## Schema to create.
## Prefixes all refer to a file.
## A file is the last thing in the prefix (-1 index in the Python list once we split).
## All other bits are parents.
## Two types:

## "Folder" / File / "table", which is a group of files.
## Tables can have partitions too, which may be important.
## Note: Folder doesn't matter for perf testing but is used for permission models and end-user exploration.

## Folder has a parent attribute.
## File does too, but a File can change depending on repacking and drop table / append / repartition writes.
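A plain-Python illustration (made-up key) of what zip_pairs produces for one split key, i.e. the parent/child structure described in the comments above:

value = ["bronze", "sales", "2021", "part-0001.parquet"]  # "bronze/sales/2021/part-0001.parquet" split on "/"
lead_list = value.copy()
lead_list.pop()
lead_list.insert(0, None)
print(list(zip(lead_list, value)))
# [(None, 'bronze'), ('bronze', 'sales'), ('sales', '2021'), ('2021', 'part-0001.parquet')]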
Code Example #21
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity',
                          types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir,
                                   encoding='utf-8',
                                   schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity',
        functions.element_at(headlines_df['polarity_subjectivity'],
                             1)).withColumn(
                                 'subjectivity',
                                 functions.element_at(
                                     headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size,
                               minCount=0,
                               inputCol='words',
                               outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words',
                          outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequecy', minDocFreq=5)
    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector',
        size=headline_vector_size)  #need this for streaming
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequecy',
        size=word_freq_vector_size)  #need this for streaming
    feature_assembler = VectorAssembler(inputCols=[
        'headline_vector', 'score', 'num_comments', 'subjectivity',
        'word_frequecy'
    ],
                                        outputCol='features')
    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf, headline_vector_size_hint,
        word_freq_vector_size_hint, feature_assembler, dt_classifier
    ])
    sentiment_model = pipeline.fit(training_set)

    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
Code Example #22
def get_department(the_col: Union[str, f.Column]) -> f.Column:
    _the_col = the_col if isinstance(the_col, f.Column) else f.col(the_col)
    return f.element_at(f.split(_the_col, '_'), 1)
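Hypothetical usage of get_department, assuming column values shaped like '<department>_<rest>' (the sample codes below are made up):

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("sales_emea_012",), ("hr_apac_007",)], ["employee_code"])
demo.select(get_department("employee_code").alias("department")).show()
# department -> 'sales', 'hr'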
Code Example #23
    def process_biomarkers(
        self,
        biomarkers_df: DataFrame,
        source_df: DataFrame,
        disease_df: DataFrame,
        drugs_df: DataFrame
    ) -> DataFrame:
        """The diverse steps to prepare and enrich the input table"""

        biomarkers_enriched = (
            biomarkers_df
            .select(
                'Biomarker', 'IndividualMutation',
                array_distinct(split(col('Alteration'), ';')).alias('alterations'),
                array_distinct(split(col('Gene'), ';')).alias('gene'),
                split(col('AlterationType'), ';').alias('alteration_types'),
                array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
                array_distinct(split(col('Drug'), ';|,')).alias('drug'),
                'DrugFullName', 'Association', 'gDNA',
                array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
                array_distinct(split(col('Source'), ';')).alias('source')
            )
            .withColumn('confidence', explode(col('confidence')))
            .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
            .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
            .withColumn('drug', explode(col('drug')))
            .withColumn('drug', translate(col('drug'), '[]', ''))
            .withColumn('gene', explode(col('gene')))
            .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
            .withColumn('gene', upper(col('gene')))
            # At this stage alterations and alteration_types are both arrays
            # Disambiguation when the biomarker consists of multiple alterations is needed
            # This is solved by:
            # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
            # 2. tmp is exploded - tmp consists of the alteration/type tuple
            # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
            .withColumn(
                'tmp',
                self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
            .withColumn('tmp', explode(col('tmp')))
            .withColumn('alteration_type', element_at(col('tmp'), 2))
            .withColumn(
                'alteration',
                when(
                    ~col('IndividualMutation').isNull(),
                    col('IndividualMutation')
                )
                .otherwise(element_at(col('tmp'), 1))
            )
            .drop('tmp')
            # Clean special cases on the alteration string
            .withColumn(
                'alteration',
                when(
                    col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                    col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
                )
                .when(
                    # Cleans strings like 'ARAF:.'
                    col('alteration').contains(':.'),
                    translate(col('alteration'), ':.', '')
                )
                .when(
                    # Fusion genes are described with '__'
                    # biomarker is a cleaner representation when there's one alteration
                    (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                    col('Biomarker')
                )
                .otherwise(col('alteration'))
            )
            # Split source into literature and urls
            # literature contains PMIDs
            # urls are enriched from the source table if not a CT
            .withColumn('source', explode(col('source')))
            .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
            .join(source_df, on='source', how='left')
            .withColumn(
                'literature',
                when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
            )
            .withColumn(
                'urls',
                when(
                    col('source').startswith('NCT'),
                    struct(
                        lit('Clinical Trials').alias('niceName'),
                        concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                    )
                )
                .when(
                    (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                    struct(col('niceName'), col('url'))
                )
            )
            # The previous conditional clause creates a struct regardless of
            # whether any condition is met. The empty struct is replaced with null
            .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
            # Enrich data
            .withColumn('functionalConsequenceId', col('alteration_type'))
            .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
            .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
            .join(disease_df, on='tumor_type', how='left')
            .withColumn('drug', upper(col('drug')))
            .withColumn(
                # drug class is coalesced when the precise name of the medicine is not provided
                'drug',
                when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
            .join(drugs_df, on='drug', how='left')
            .withColumn('drug', initcap(col('drug')))
            # Translate variantId
            .withColumn(
                'variantId',
                when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
            )
            # Assign a GO ID when a gene expression data is reported
            .withColumn(
                'geneExpressionId',
                when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                    'GO_0010628'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                    'GO_0010629'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                    'GO_0010467'
                )
            )
            # Create variant struct
            .withColumn(
                'variant',
                when(
                    col('alteration_type') != 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('variantId').alias('id'),
                        col('functionalConsequenceId')
                    )
                )
            )
            # Create geneExpression struct
            .withColumn(
                'geneExpression',
                when(
                    col('alteration_type') == 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('geneExpressionId').alias('id'))
                )
            )
        )

        pre_evidence = (
            biomarkers_enriched
            .withColumn('datasourceId', lit('cancer_biomarkers'))
            .withColumn('datatypeId', lit('affected_pathway'))
            .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
            .withColumnRenamed('drug', 'drugFromSource')
            # diseaseFromSourceMappedId, drugId populated above
            .withColumnRenamed('Association', 'drugResponse')
            # confidence, literature and urls populated above
            .withColumnRenamed('gene', 'targetFromSourceId')
            .withColumnRenamed('Biomarker', 'biomarkerName')
            # variant, geneExpression populated above
            .drop(
                'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation', 'geneExpressionId',
                'gDNA', 'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url')
        )

        # Group evidence
        self.evidence = (
            pre_evidence
            .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                     'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                     'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
            .agg(
                collect_set('literature').alias('literature'),
                collect_set('urls').alias('urls'),
                collect_set('variant').alias('variant'),
                collect_set('geneExpression').alias('geneExpression'),
            )
            # Replace empty lists with null values
            .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
            .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
            .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
            .withColumn(
                'geneExpression',
                when(size(col('geneExpression')) == 0, lit(None))
                .otherwise(col('geneExpression')))
            # Collect variant info into biomarkers struct
            .withColumn(
                'biomarkers',
                struct(
                    'variant',
                    'geneExpression'
                ))
            .drop('variant', 'geneExpression')
            .distinct()
        )

        return self.evidence
Code Example #24
def main(spark):
    path = '../../../../data/census/'
    filename = "PEP_2017_PEPANNRES.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    # Ingestion of the census data
    census_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .option("encoding", "cp1252") \
        .load(absolute_file_path)

    census_df = census_df.drop("GEO.id") \
        .drop("rescen42010") \
        .drop("resbase42010") \
        .drop("respop72010") \
        .drop("respop72011") \
        .drop("respop72012") \
        .drop("respop72013") \
        .drop("respop72014") \
        .drop("respop72015") \
        .drop("respop72016") \
        .withColumnRenamed("respop72017", "pop2017") \
        .withColumnRenamed("GEO.id2", "countyId") \
        .withColumnRenamed("GEO.display-label", "county")

    logging.warning("Census data")
    census_df.sample(0.1).show(3, False)
    census_df.printSchema()

    path = '../../../../data/dapip/'
    filename = "InstitutionCampus.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    # Higher education institution (and yes, there is an Arkansas College
    # of Barbering and Hair Design)
    higher_ed_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load(absolute_file_path)

    higher_ed_df = higher_ed_df \
        .filter("LocationType = 'Institution'") \
        .withColumn("addressElements", F.split(F.col("Address"), " "))

    higher_ed_df = higher_ed_df.withColumn("addressElementCount",
                                           F.size(F.col("addressElements")))

    higher_ed_df = higher_ed_df.withColumn(
        "zip9",
        F.element_at(F.col("addressElements"), F.col("addressElementCount")))

    higher_ed_df = higher_ed_df.withColumn("splitZipCode",
                                           F.split(F.col("zip9"), "-"))

    higher_ed_df = higher_ed_df \
        .withColumn("zip", F.col("splitZipCode")[0]) \
        .withColumnRenamed("LocationName", "location") \
        .drop("DapipId") \
        .drop("OpeId") \
        .drop("ParentName") \
        .drop("ParentDapipId") \
        .drop("LocationType") \
        .drop("Address") \
        .drop("GeneralPhone") \
        .drop("AdminName") \
        .drop("AdminPhone") \
        .drop("AdminEmail") \
        .drop("Fax") \
        .drop("UpdateDate") \
        .drop("zip9") \
        .drop("addressElements") \
        .drop("addressElementCount") \
        .drop("splitZipCode") \
        .alias("highered")

    logging.warning("Higher education institutions (DAPIP)")
    higher_ed_df.sample(0.1).show(3, False)
    higher_ed_df.printSchema()

    path = '../../../../data/hud/'
    filename = "COUNTY_ZIP_092018.csv"
    absolute_file_path = get_absolute_file_path(path, filename)

    # Zip to county
    county_zip_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("inferSchema", "true") \
        .load(absolute_file_path)

    county_zip_df = county_zip_df \
        .drop("res_ratio") \
        .drop("bus_ratio") \
        .drop("oth_ratio") \
        .drop("tot_ratio") \
        .alias("hud")

    logging.warning("Counties / ZIP Codes (HUD)")
    county_zip_df.sample(0.1) \
        .show(3, False)

    county_zip_df.printSchema()

    # Institutions per county id
    instit_per_county_df = higher_ed_df.join(
        county_zip_df, higher_ed_df["zip"] == county_zip_df["zip"], "inner")

    logging.warning("Higher education institutions left-joined with HUD")
    instit_per_county_df.filter(higher_ed_df["zip"] == 27517) \
        .show(20, False)

    instit_per_county_df.printSchema()

    # Institutions per county name
    instit_per_county_df = instit_per_county_df.join(
        census_df, instit_per_county_df["county"] == census_df["countyId"],
        "left")

    logging.warning("Higher education institutions and county id with census")
    instit_per_county_df.filter(higher_ed_df["zip"] == 27517) \
        .show(20, False)

    instit_per_county_df.filter(higher_ed_df["zip"] == 2138) \
        .show(20, False)

    # Final clean up
    instit_per_county_df = instit_per_county_df.drop("highered.zip") \
        .drop("hud.county") \
        .drop("countyId") \
        .distinct()

    logging.warning("Final list")
    instit_per_county_df.show(200, False)

    logging.warning("The combined list has {} elements.".format(
        instit_per_county_df.count()))
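The zip9 column above is the last token of the split address, selected via the computed element count; since element_at also accepts negative indices, an equivalent one-liner (made-up address) is:

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("123 Main St Chapel Hill NC 27517-1234",)], ["Address"])
demo.select(F.element_at(F.split("Address", " "), -1).alias("zip9")).show(truncate=False)
# zip9 -> 27517-1234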
Code Example #25
    chains.persist()

    #%%
    chains = chains.join(data, chains.next == data.tweet_id, 'inner')\
        .select(
            'sender',
            'tweets',
            data.text.alias('response'),
            data.author_id,
            data.response_tweet_id.alias('next'),
        )
    #%%
    chains = chains.withColumn(
        'self_response',
        functions.element_at('sender', -1) == functions.col('author_id'))
    #%%
    chains.persist()
    #Add to samples
    samples = samples.unionAll(
        chains.filter(~chains['self_response']).select('sender', 'tweets',
                                                       'response',
                                                       'author_id'))
    # Remove finished chains
    samples = samples.checkpoint()

    #%%
    chains = chains.filter(chains.next != 'none')

    chains.persist()
Code Example #26
def main(input_dir,output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType())),
        types.StructField('score',types.LongType())
    ])
    headlines_df = spark.read.json(input_dir,encoding='utf-8',schema=df_schema).repartition(80).cache()

    agg_scores = headlines_df.groupBy(headlines_df['created_utc_iso']).agg(
        functions.collect_set(headlines_df['score']).alias('scores_per_day')
    )

    top_scores_df = agg_scores.withColumn('sorted_scores',top_scores(agg_scores['scores_per_day']))

    #getting top headlines with the highest scores
    top_headlines = headlines_df.join(functions.broadcast(top_scores_df),on=['created_utc_iso']).where(
        arr_contains(top_scores_df['sorted_scores'],headlines_df['score'])
    ).select(
        headlines_df['title_clean'],
        headlines_df['created_utc_iso'],
        headlines_df['polarity_subjectivity'],
        headlines_df['score']
    ).withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'],1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'],2)
    )

    agg_sentiment_by_day = top_headlines.groupBy(top_headlines['created_utc_iso']).agg(
        functions.avg(top_headlines['polarity']).alias('avg_sentiment')
    ).cache()

    assembler = VectorAssembler(inputCols=['avg_sentiment'], outputCol='features')
    scaler = MinMaxScaler(inputCol='features',outputCol='normalized_avg_vector')

    pipeline = Pipeline(stages=[assembler, scaler])
    scaler_model = pipeline.fit(agg_sentiment_by_day)

    scaled_avg = scaler_model.transform(agg_sentiment_by_day)
    scaled_avg = scaled_avg.withColumn('normalized_avg',first_element(scaled_avg['normalized_avg_vector']))

    #save scaled_avg to file; need to coalesce aggregates into 1 file
    #because this will be read by pandas later on, which doesn't support multi-file input
    #scaled_avg.select(
        #scaled_avg['created_utc_iso'].alias('date'),
        #scaled_avg['normalized_avg'].alias('avg_sentiment_top_news')
    #).coalesce(1).write.csv(output_dir, mode='overwrite', compression='gzip')


    #this will always be each day of the year from 2008 to 2019
    #all data is aggregated into 365 days * 11 years = around 4000 records
    aggregate_pandas = scaled_avg.select(
        scaled_avg['created_utc_iso'].alias('date'),
        scaled_avg['normalized_avg'].alias('sentiment')
    ).toPandas()

    aggregate_pandas = aggregate_pandas.set_index('date');
    plt.plot(aggregate_pandas['sentiment'],marker='.',alpha=0.5,linestyle='None')
    plt.savefig(output_dir+ '/sentiment_series.png')
Code Example #27
def compile_array_index(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    index = op.index.op().value + 1
    return F.element_at(src_column, index)
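The + 1 above bridges two conventions: the ibis expression uses 0-based indexing, while Spark's element_at is 1-based. A quick standalone illustration with made-up data:

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([(["a", "b", "c"],)], ["arr"])
demo.select(F.element_at("arr", 1).alias("first")).show()
# first -> 'a': Spark index 1 corresponds to ibis index 0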
Code Example #28
def task_3(data_io, product_data):
    # -----------------------------Column names--------------------------------
    # Inputs:
    asin_column = 'asin'
    price_column = 'price'

    attribute = 'also_viewed'
    related_column = 'related'
    # Outputs:
    meanPriceAlsoViewed_column = 'meanPriceAlsoViewed'
    countAlsoViewed_column = 'countAlsoViewed'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    # countAlsoViewed
    added_also_viewed = product_data.select(
        asin_column, price_column, F.element_at(related_column, attribute))
    #                                     .withColumnRenamed(
    #                                         'element_at(related, also_viewed)',
    #                                          attribute)

    added_count = added_also_viewed.withColumn(countAlsoViewed_column,
                                               F.size(F.col('element_at(related, also_viewed)')))\
    .replace(-1, None)

    # countAlsoViewed res
    exploded_df = added_count.select(asin_column,
                                     F.explode('element_at(related, also_viewed)'))\
                             .withColumnRenamed(
                                        'col',
                                         'to_join')

    joined_df = exploded_df.join(added_count.withColumnRenamed(
        'asin', 'to_join'),
                                 on='to_join',
                                 how='inner')

    # meanPriceAlsoViewed
    out_df = joined_df.groupby(asin_column).agg({price_column: 'mean'})\
        .withColumnRenamed('avg(price)', meanPriceAlsoViewed_column)

    out_df = added_count.join(out_df, on=asin_column, how='left')

    count_total = out_df.count()

    # meanPriceAlsoViewed res
    mean_meanPriceAlsoViewed = out_df.select(F.avg(
        out_df.meanPriceAlsoViewed)).head()[0]
    variance_meanPriceAlsoViewed = out_df.select(
        F.variance(out_df.meanPriceAlsoViewed)).head()[0]
    numNulls_meanPriceAlsoViewed = out_df.filter(
        out_df.meanPriceAlsoViewed.isNull()).count()

    mean_countAlsoViewed = out_df.select(F.avg(
        out_df.countAlsoViewed)).head()[0]
    variance_countAlsoViewed = out_df.select(F.variance(
        out_df.countAlsoViewed)).head()[0]
    numNulls_countAlsoViewed = out_df.filter(
        out_df.countAlsoViewed.isNull()).count()

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'mean_meanPriceAlsoViewed': None,
        'variance_meanPriceAlsoViewed': None,
        'numNulls_meanPriceAlsoViewed': None,
        'mean_countAlsoViewed': None,
        'variance_countAlsoViewed': None,
        'numNulls_countAlsoViewed': None
    }
    # Modify res:

    res['count_total'] = int(count_total)

    res['mean_meanPriceAlsoViewed'] = float(mean_meanPriceAlsoViewed)
    res['variance_meanPriceAlsoViewed'] = float(variance_meanPriceAlsoViewed)
    res['numNulls_meanPriceAlsoViewed'] = int(numNulls_meanPriceAlsoViewed)

    res['mean_countAlsoViewed'] = float(mean_countAlsoViewed)
    res['variance_countAlsoViewed'] = float(variance_countAlsoViewed)
    res['numNulls_countAlsoViewed'] = int(numNulls_countAlsoViewed)

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_3')
    return res
Code Example #29
def foreach_jdbc_writer(df, epoch_id):
    df.write.\
    jdbc(url="jdbc:mysql://localhost/world",table="amazon_products",mode='append',properties={"driver":"com.mysql.cj.jdbc.Driver","user":"******"})


spark = SparkSession.builder.master('local[2]').appName(
    'StreamingDemo').getOrCreate()

df = spark.readStream.format('kafka')\
    .option('kafka.bootstrap.servers','localhost:9092')\
    .option('subscribe','amazon')\
    .load()

deser = udf(lambda x: pickle.loads(x), MapType(StringType(), StringType()))

deserlizedDF = df.withColumn('map', deser(df['value']))
parsedDF = deserlizedDF.withColumn('title',element_at('map','productTitle'))\
    .withColumn('Categories',element_at('map','productCategories'))\
    .withColumn('Rating',element_at('map','productRating'))\
    .withColumn('Description',element_at('map','productDescription'))\
    .withColumn('Prices',element_at('map','productPrices'))\
    .withColumn('Min_Price',array_min(split(element_at('map','productPrices'),r'#*\$').cast(ArrayType(FloatType()))))\
    .withColumn('Max_Price',array_max(split(element_at('map','productPrices'),r'#*\$').cast(ArrayType(FloatType()))))

projectedDF = parsedDF.select('title', 'Categories', 'Rating', 'Prices',
                              'Min_Price', 'Max_Price')

result = projectedDF.writeStream.foreachBatch(foreach_jdbc_writer).start()

result.awaitTermination()
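A rough sketch (made-up price string) of the Min_Price/Max_Price parsing above: the regex '#*\$' splits a string like '$12.99#$15.49' into ['', '12.99', '15.49'], the cast to array<float> turns the empty token into null, and array_min/array_max should then skip it and return 12.99 and 15.49:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_max, array_min, col, split
from pyspark.sql.types import ArrayType, FloatType

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([("$12.99#$15.49",)], ["productPrices"])
(demo
 .withColumn("prices", split(col("productPrices"), r"#*\$").cast(ArrayType(FloatType())))
 .select("prices", array_min("prices").alias("Min_Price"), array_max("prices").alias("Max_Price"))
 .show(truncate=False))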
Code Example #30
                 'agg1-street-index',
                 settings=settings,
                 append=False)

# In[8]:

agg_stop_df = read_elastic("agg1-street-index",
                           array_field="reverse_gecode").withColumnRenamed(
                               'coordinates', 'agg_coords')
# .withColumn('reverse_gecode', F.array_distinct("reverse_gecode"))

stop_df = read_elastic('stop-index')

reverse_gecode_df = stop_df.join(
    agg_stop_df,
    (F.round(F.element_at(stop_df.coordinates, 1), 5) == F.round(
        F.element_at(agg_stop_df.agg_coords, 1), 5)) &
    (F.round(F.element_at(stop_df.coordinates, 2), 5) == F.round(
        F.element_at(agg_stop_df.agg_coords, 2), 5)),
    how='left').drop('agg_coords')

settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "actualDelay": {
                "type": "long"
            },