def main(lowtopic_dist, hightopic_dist, output_folder):

    lowtopic_df = spark.read.csv(lowtopic_dist, header=True)
    hightopic_df = spark.read.csv(hightopic_dist, header=True)

    df_init_low = lowtopic_df.select(
        lowtopic_df['BusinessID'], lowtopic_df['BusinessName'],
        join_udf(
            lowtopic_df['topic_distribution']).alias('topic_distribution'))
    df_expl_low = df_init_low.select(df_init_low['BusinessID'], df_init_low['BusinessName'], \
                         functions.posexplode(functions.split(df_init_low['topic_distribution'], ','))) \
                            .withColumnRenamed('pos', 'TopicID') \
                            .withColumnRenamed('col', 'TopicDistribution')

    df_expl_low.coalesce(1).write.csv(output_folder +
                                      '/Topic_low_topic_exploded',
                                      header=True)

    df_init_high = hightopic_df.select(
        hightopic_df['BusinessID'], hightopic_df['BusinessName'],
        join_udf(
            hightopic_df['topic_distribution']).alias('topic_distribution'))
    df_expl_high = df_init_high.select(df_init_high['BusinessID'], df_init_high['BusinessName'], \
                         functions.posexplode(functions.split(df_init_high['topic_distribution'], ','))) \
                            .withColumnRenamed('pos', 'TopicID') \
                            .withColumnRenamed('col', 'TopicDistribution')

    df_expl_high.coalesce(1).write.csv(output_folder +
                                       '/Topic_high_topic_exploded',
                                       header=True)
    def join_urls_and_weighted_word_vectors(
            split_urls_and_coefficients: DataFrame,
            word_vectors: DataFrame) -> DataFrame:
        """
        Explodes split URLs to words and calculates each word's weighted word vector.

        :param split_urls_and_coefficients: A DataFrame of split URLs and coefficients with columns: id, url, split_url,
                                            coefficients.
        :param word_vectors: A DataFrame of words and vectors with columns: word, vector.
        :return: A DataFrame of URLs and weighted word vectors with columns: id, url, pos, word, weighted_word_vector,
                 coefficient.
        """
        combined_split_urls_and_coefficients = split_urls_and_coefficients \
            .rdd.map(URLVectorCalculator.combine_split_urls_and_coefficients).toDF()
        return combined_split_urls_and_coefficients \
            .select("id", "url", posexplode("split_urls_and_coefficients")) \
            .withColumnRenamed("col", "word_and_coefficient") \
            .rdd.map(lambda e:
                     IDURLPosWordCoefficient(
                         e.id, e.url, e.pos, e.word_and_coefficient[0], e.word_and_coefficient[1])) \
            .toDF().join(word_vectors, "word") \
            .rdd.map(lambda e:
                     IDURLPosWordVectorCoefficient(id=e.id, url=e.url, pos=e.pos, word=e.word,
                                                   coefficient=e.coefficient,
                                                   weighted_word_vector=list([float(x) for x in
                                                                              np.multiply(e.vector.values,
                                                                                          e.coefficient)]))) \
            .toDF()
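
# A minimal, hypothetical sketch (toy data, not the project's URLVectorCalculator pipeline) of
# the same pattern: posexplode an array of (word, coefficient) pairs, then join on a word-vector table.
from pyspark.sql import SparkSession
from pyspark.sql.functions import posexplode, col

spark = SparkSession.builder.getOrCreate()

urls = spark.createDataFrame(
    [(1, "a.com/red-shoes", [("red", 0.4), ("shoes", 0.6)])],
    ["id", "url", "words_and_coefficients"])
word_vectors = spark.createDataFrame(
    [("red", [1.0, 0.0]), ("shoes", [0.0, 1.0])], ["word", "vector"])

exploded = (urls
            .select("id", "url", posexplode("words_and_coefficients"))
            .select("id", "url", "pos",
                    col("col._1").alias("word"),
                    col("col._2").alias("coefficient")))
exploded.join(word_vectors, "word").show(truncate=False)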
Example #3
def transform_abstracts_words(dataframe):
    udf_function_clean = udf(generate_cleaned_abstracts, StringType())
    udf_function_sentiment = udf(generate_sentiment, DoubleType())

    df_abstracts = (dataframe.select(
        "paper_id",
        func.posexplode("abstract").alias("pos", "value")).select(
            "paper_id", "pos", "value.text").withColumn(
                "ordered_text",
                func.collect_list("text").over(
                    Window.partitionBy("paper_id").orderBy("pos"))
            ).groupBy("paper_id").agg(
                func.max("ordered_text").alias("sentences")).select(
                    "paper_id",
                    func.array_join("sentences",
                                    " ").alias("abstract")).withColumn(
                                        "words",
                                        func.size(func.split(
                                            "abstract", r"\s+"))))

    df_abstracts = df_abstracts.withColumn("clean_abstract",
                                           udf_function_clean("abstract"))
    df_abstracts = df_abstracts.withColumn(
        "sentiment_abstract", udf_function_sentiment("clean_abstract"))

    return df_abstracts
Example #4
def get_filtered_by_month(data: DataFrame) -> DataFrame:
    """
    Method transforms periods "start_date - end_date" to year and month number

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|     month|
    +---+----------+----------+
    |  5|      2018|         1|
    +---+----------+----------+
    """

    transformed_data = data \
        .withColumn("start_date", F.trunc("start_date", "month")) \
        .withColumn("monthsDiff", F.months_between("end_date", "start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', monthsDiff), ',')")) \
        .select("*", F.posexplode("repeat").alias("date", "val")) \
        .withColumn("month", F.expr("add_months(datab, date)")) \
        .drop('start_date', 'end_date', 'monthsDiff', 'repeat', 'val', 'date') \
        .dropDuplicates()

    return transformed_data
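
# For reference, a self-contained sketch (assuming only a local SparkSession) of the
# repeat/split/posexplode trick used above to generate one row per month of a period:
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

periods = spark.createDataFrame([(5, "2018-01-01", "2018-03-09")],
                                ["key", "start_date", "end_date"])
periods = periods.select("key", F.to_date("start_date").alias("start_date"),
                         F.to_date("end_date").alias("end_date"))

months = (periods
          .withColumn("start_date", F.trunc("start_date", "month"))
          .withColumn("monthsDiff",
                      F.floor(F.months_between("end_date", "start_date")).cast("int"))
          # split(repeat(',', n), ',') yields an array of n + 1 empty strings,
          # so posexplode emits offsets 0..n, i.e. one row per month in the period
          .select("*", F.posexplode(F.expr("split(repeat(',', monthsDiff), ',')"))
                  .alias("offset", "val"))
          .withColumn("month_start", F.expr("add_months(start_date, offset)"))
          .select("key", F.year("month_start").alias("year"),
                  F.month("month_start").alias("month")))

months.show()  # (5, 2018, 1), (5, 2018, 2), (5, 2018, 3)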
Example #5
def with_exploded_hits(df: DataFrame) -> DataFrame:
    return (
        df
        .select(
            F.posexplode('hit_page_ids').alias('hit_position', 'hit_page_id'),
            *df.columns)
        .drop('hit_page_ids')
        .withColumn('clicked', F.expr('array_contains(click_page_ids, hit_page_id)'))
        .drop('click_page_ids'))
Example #6
def samples_from_vcf (vcf):
  """
  Given a dataframe of (filename, data) rows in VCF-like format, returns the list of subjects
  (sample names) together with their column numbers.

  Sample names come from the header line that starts with a single '#'.
  Split that line by "\t" to get an array of column names.
  """
  temp = vcf.filter(vcf.data[0:2] == '#C').select('filename',f.posexplode(f.split('data','\t'))).filter('pos > 9')
  return(temp.select((temp.pos - 1).alias('SAMPLE_IDX'), temp.col.alias('SAMPLE_NAME'), 'filename'))
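
# A generic, self-contained illustration (toy data, not the project's VCF input) of the
# split/posexplode/filter pattern above: keep each token together with its column position.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

header = spark.createDataFrame(
    [("toy.vcf", "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tS1\tS2")],
    ["filename", "data"])

(header
 .select("filename", f.posexplode(f.split("data", "\t")))
 .filter("pos >= 9")  # the nine fixed VCF columns occupy positions 0-8
 .show(truncate=False))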
Example #7
def gts_from_impute (infile):
  # Get the main data and put a unique index on each variant
  maindata = infile.filter(infile.data[0:1] != "#")
  splitdata = maindata.select("filename",f.split(maindata.data,"[\t ]+").alias("split_data"),maindata.lineid.alias("VAR_IDX"))

  gtdata1 = splitdata.select("filename", "VAR_IDX", f.posexplode(splitdata.split_data)).toDF("filename","VAR_IDX","COLUMN_IDX","GTPROB").filter("COLUMN_IDX > 4")
  # Now, get subject ID and which GT
  gtdata2 = gtdata1.select("filename", "VAR_IDX", "GTPROB", "COLUMN_IDX", f.floor((gtdata1.COLUMN_IDX - 5) / 3).alias("SAMPLE_IDX"), ((gtdata1.COLUMN_IDX - 5) % 3).cast(StringType()).alias("GT_IDX"))
  gtdata3 = rkutil.withColumnsRenamed(gtdata2.groupBy("filename","VAR_IDX","SAMPLE_IDX").pivot("GT_IDX",["0","1","2"]).agg(f.collect_list("GTPROB")), ["0","1","2"],["c0","c1","c2"])
  gtdata4 = gtdata3.select("filename","VAR_IDX","SAMPLE_IDX", f.element_at(gtdata3.c0, 1).cast(FloatType()).alias("P11"), f.element_at(gtdata3.c1, 1).cast(FloatType()).alias("P12"), f.element_at(gtdata3.c2, 1).cast(FloatType()).alias("P22"))
  return(gtdata4)
Example #8
    def populate(cls, spark=None):
        sdf = (TrainClickGrouped.query(spark).drop(
            'joblist', 'rel_list', 'id_list', 'jobno_list', 'action_list',
            'text_score').select(
                '*',
                f.posexplode('job_rel_list').alias(
                    'pos_in_list',
                    'job_rel_in_list')).drop('job_rel_list').select(
                        '*', 'job_rel_in_list.*').drop('job_rel_in_list'))

        cls.write(sdf, compression='snappy')
def prepData(category):
    
    df = spark.read.parquet("gs://core_bucket_abell/aspect_ranking/extracted_aspects/Extracted " + category + ".parquet")

    #explode aspects
    spread_df = df.select("*", F.posexplode("indiv_aspects").alias("pos", "split_aspect"))

    # get unique by category/review and split aspect - remove blank aspects
    unique_cat_aspect = spread_df.groupBy(*('asin', "reviews", "main_cat", "categories", "scores", "split_aspect", "idx")).count()\
        .withColumn("split_aspect", F.lower(F.col("split_aspect")))\
        .withColumn("split_aspect", rmStop_udf(F.col("split_aspect")))\
        .filter("split_aspect != ''")
    
    return unique_cat_aspect
Example #10
    def populate(cls, spark=None):
        sdf = (
            TestsetClickProcessed.query(spark)
            # .drop('joblist', 'rel_list', 'id_list', 'jobno_list', 'action_list')
            .select('*',
                    f.posexplode('joblist').alias('pos_in_list',
                                                  'job')).withColumn(
                                                      'rel',
                                                      f.lit(0)).drop('joblist')
            # .select('*', 'job_in_list.*')
            # .drop('job_in_list')
        )

        cls.write(sdf, compression='snappy')
Example #11
    def main(self, sc: SparkContext, *args):
        os.makedirs(os.path.join(DATASET_DIR,
                                 clean_filename(self.filter_city)),
                    exist_ok=True)

        spark = SparkSession(sc)
        train_df = spark.read.csv(self.input()[0].path,
                                  header=True,
                                  inferSchema=True)
        train_df = train_df.withColumn("impressions_array",
                                       F.split(train_df.impressions, "\|"))

        meta_df = spark.read.csv(self.input()[1].path,
                                 header=True,
                                 inferSchema=True)

        # Filter dataset
        if self.filter_city != 'all':
            if self.filter_city == 'recsys':
                train_df = train_df.filter(train_df.city.isin(RECSYS_CITIES))
            else:
                train_df = train_df.filter(train_df.city == self.filter_city)

            # Filter reference
            reference_df = train_df.select("reference").distinct()

            # Filter item impressions
            item_id_df = train_df.select(
                posexplode("impressions_array").alias(
                    "pos_item_idx",
                    "reference")).select("reference").distinct()
            #raise(Exception(train_df.show()))
            item_id_df = item_id_df.union(reference_df).select(
                "reference").distinct()

            meta_df = meta_df.join(
                item_id_df, meta_df.item_id == item_id_df.reference).select(
                    "item_id", "properties")

        if self.sample_size > 0:
            train_df = train_df.sort("timestamp",
                                     ascending=False).limit(self.sample_size)

        # Save
        train_df.toPandas().to_csv(self.output()[0].path, index=False)
        meta_df.toPandas().to_csv(self.output()[1].path, index=False)
Example #12
def gts_from_vcf (vcf, *variant):

  """
  Given a VCF file, and an optional variant file to filter on, returns the gt matrix
  """
  
  if (len(variant)==0):
     maindata = vcf.filter(vcf.data[0:1] != "#")
  else:
    var = variant[0].withColumnRenamed('filename','filename2')
    maindata = vcf.join(var,(var.filename2 == vcf.filename) & (var.VAR_IDX == vcf.lineid), "inner")

  splitdata = maindata.select(maindata.filename,f.split(maindata.data,"[\t ]+").alias("split_data"),maindata.lineid.alias("VAR_IDX"))
    
  # Explode out the genotype data, but leave out the first nine columns
  gt = splitdata.select('filename','VAR_IDX', f.posexplode(splitdata.split_data)).toDF("filename","VAR_IDX","SAMPLE_IDX","GT").filter("SAMPLE_IDX >= 9")
  return(gt)  
 def _transform_rec_info(self, rec_info, item_ids_dataset):
     import pyspark.sql.functions as F
     if self.output_user_embeddings:
         user_embeddings = rec_info.select(
             self.increasing_id_column_name,
             self.recommendation_info_column_name + '.user_embedding')
     rec_info = rec_info.withColumn(
         self.recommendation_info_column_name,
         F.arrays_zip(self.recommendation_info_column_name + '.indices',
                      self.recommendation_info_column_name + '.distances'))
     rec_info = rec_info.select(
         self.increasing_id_column_name,
         (F.posexplode(self.recommendation_info_column_name).alias(
             'pos', self.recommendation_info_column_name)))
     rec_info = rec_info.select(
         self.increasing_id_column_name, 'pos',
         F.col(self.recommendation_info_column_name + '.0').alias('index'),
         F.col(self.recommendation_info_column_name +
               '.1').alias('distance'))
     rec_info = rec_info.join(item_ids_dataset,
                              F.col('index') == F.col('id'))
     w = pyspark.sql.Window.partitionBy(
         self.increasing_id_column_name).orderBy('pos')
     if self.output_item_embeddings:
         rec_info = rec_info.select(
             self.increasing_id_column_name, 'pos',
             (F.struct('name', 'distance', 'item_embedding').alias(
                 self.recommendation_info_column_name)))
     else:
         rec_info = rec_info.select(
             self.increasing_id_column_name, 'pos',
             (F.struct('name', 'distance').alias(
                 self.recommendation_info_column_name)))
     rec_info = rec_info.select(
         self.increasing_id_column_name, 'pos', (F.collect_list(
             self.recommendation_info_column_name).over(w).alias(
                 self.recommendation_info_column_name)))
     rec_info = rec_info.groupBy(self.increasing_id_column_name).agg(
         F.max(self.recommendation_info_column_name).alias(
             self.recommendation_info_column_name))
     if self.output_user_embeddings:
         rec_info = rec_info.join(user_embeddings,
                                  self.increasing_id_column_name)
     return rec_info
Example #14
def get_filtered_by_week(data: DataFrame) -> DataFrame:
    """
    Method transforms periods "start_date - end_date" to year and week number of year

    source:
    +---+----------+----------+
    |key|start_date|  end_date|
    +---+----------+----------+
    |  5|2018-01-01|2018-01-09|
    +---+----------+----------+

    result:
    +---+----------+----------+
    |key|      year|  week_num|
    +---+----------+----------+
    |  5|      2018|         1|
    |  5|      2018|         2|
    +---+----------+----------+
    """

    max_week_number = 53
    transformed_data = data \
        .withColumn('start_week', F.weekofyear('start_date')) \
        .withColumn('weeks_diff', F.ceil(F.datediff(F.col('end_date'),
                                                    F.col('start_date')) / 7)) \
        .withColumn("year", F.year("start_date")) \
        .withColumn("repeat", F.expr("split(repeat(',', weeks_diff), ',')")) \
        .select("*", F.posexplode("repeat").alias("week_add", "val")) \
        .withColumn('total_week_num', F.col('start_week') + F.col('week_add')) \
        .withColumn('add_year', (F.col('total_week_num') / max_week_number).cast(IntegerType())) \
        .withColumn('total_week_num', F.col('total_week_num') - (max_week_number * F.col('add_year'))) \
        .withColumn('week_num',
                    F.when(F.col('total_week_num') == 0, 1)
                    .otherwise(F.col('total_week_num'))) \
        .withColumn('year', F.col('year') + F.col('add_year')) \
        .drop('start_date', 'end_date', 'week_add', 'repeat',
              'val', 'date', 'add_year', 'weeks_diff', 'total_week_num') \
        .dropDuplicates()

    return transformed_data
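
# Hedged usage sketch, assuming an active SparkSession named `spark` plus the imports used
# above (pyspark.sql.functions as F and IntegerType); it mirrors the docstring's source table:
data = spark.createDataFrame([(5, "2018-01-01", "2018-01-09")],
                             ["key", "start_date", "end_date"])
data = data.withColumn("start_date", F.to_date("start_date")) \
           .withColumn("end_date", F.to_date("end_date"))
get_filtered_by_week(data).show()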
Example #15
def Ranking_evaluator (spark,model, val, metric_type):
    
    val.createOrReplaceTempView('val')                        
    val_user = spark.sql('SELECT DISTINCT user_id FROM val')  
    #val_user = val.select('user_id').distinct()
    val_rec = model.recommendForUserSubset(val_user,500)
    #val_rec.printSchema()
    
    val_rec = val_rec.select('user_id','recommendations',f.posexplode('recommendations')).drop('pos').drop('recommendations')
    val_rec = val_rec.select('user_id',f.expr('col.book_id'),f.expr('col.rating'))
    
    w= Window.partitionBy('user_id')
    val_recrank=val_rec.select('user_id',f.collect_list('book_id').over(w).alias('rec_rank')).sort('user_id').distinct()
   
    val = val.sort(f.desc('rating'))
    val_truerank=val.select('user_id', f.collect_list('book_id').over(w).alias('true_rank')).sort('user_id').distinct()
    
    scoreAndLabels = val_recrank.join(val_truerank,on=['user_id'],how='inner')
    
    rankLists=scoreAndLabels.select("rec_rank", "true_rank").rdd.map(lambda x: tuple([x[0],x[1]])).collect()
    ranks = spark.sparkContext.parallelize(rankLists)
    
    metrics = RankingMetrics(ranks)
    
    MAP = metrics.meanAveragePrecision
    Precision = metrics.precisionAt(500)
    NDCG = metrics.ndcgAt(500)
    
    if metric_type == 'Precision':
        return Precision, {'MAP': MAP,'NDCG': NDCG}
    elif metric_type == 'MAP':
        return MAP, {'Precision': Precision,'NDCG': NDCG}
    elif metric_type == 'NDCG':
        return NDCG, {'MAP': MAP, 'Precision': Precision}
    else:
        return None
Example #16
        )\
        .show(5,False)

    companiesJson = [
        """{"company":"NewCo","employees":[{"firstName":"Sidhartha","lastName":"Ray"},{"firstName":"Pratik","lastName":"Solanki"}]}""",
        """{"company":"FamilyCo","employees":[{"firstName":"Jiten","lastName":"Pupta"},{"firstName":"Pallavi","lastName":"Gupta"}]}""",
        """{"company":"OldCo","employees":[{"firstName":"Vivek","lastName":"Garg"},{"firstName":"Nitin","lastName":"Gupta"}]}""",
        """{"company":"ClosedCo","employees":[]}"""
    ]

    companiesRDD = sparkSession.sparkContext.parallelize(companiesJson)
    companiesDF = sparkSession.read.json(companiesRDD)

    companiesDF.show(5, False)
    companiesDF.printSchema()

    employeeDfTemp = companiesDF.select("company",
                                        explode("employees").alias("employee"))
    employeeDfTemp.show()
    employeeDfTemp2 = companiesDF.select(
        "company",
        posexplode("employees").alias("employeePosition", "employee"))
    employeeDfTemp2.show()
    employeeDf = employeeDfTemp.select("company",
                                       expr("employee.firstName as firstName"))
    employeeDf.select("*",
            when(col("company") == "FamilyCo", "Premium")
            .when(col("company") == "OldCo", "Legacy")
            .otherwise("Standard").alias("Tier"))\
    .show(5,False)
Example #17
def mapping_sequence(sdf,
                     join_sdf,
                     column='item_id_collect__sub',
                     join_column=None,
                     how='inner',
                     dtype=ArrayType(ArrayType(FloatType())),
                     fillna=None,
                     drop=False,
                     skew_quantile=0.9,
                     orderby_flag=True):
    """
    Parameters:
    ----------
    sdf: pyspark dataframe to be processed
    join_sdf: pyspark dataframe to be joined with sdf
    column: name of the sdf column to be mapped
    join_column: name of the join_sdf column to join on
    how: join mode used for sdf.join(join_sdf)
    dtype: pyspark.sql.types type of the returned values
    Returns:
    ----------
    sdf: pyspark dataframe resulting from sdf.join(join_sdf) via mapping_sequence
    Example:
    ----------
    sdf=mapping_sequence(sdf,join_sdf,column='item_id_collect__sub')
    """
    if join_column is None:
        join_column = column
    resdtype = len(re.findall(re.compile(r'(ArrayType)'), str(dtype)))
    if resdtype == 1:

        def array_eval_udf(x):
            return F.col(x).cast(dtype)
    else:
        array_eval_udf = F.udf(lambda x: [eval(i) for i in x], dtype)
    join_other_list = [
        c + '__explode' for c in join_sdf.columns if c != join_column
    ]

    def other_list_fun(all_, pop_):
        return list(filter(lambda x: x not in pop_, all_))

    temp1 = sdf.withColumn('increasing_id', F.monotonically_increasing_id())
    temp2 = temp1.select(
        F.posexplode(column).alias(column + '__explodeindex',
                                   column + '__explodecol'), *temp1.columns)
    join_sdf = eval("join_sdf.withColumnRenamed" + ".withColumnRenamed".join([
        f"('{c}','%s')" %
        (f"{c}__explode" if c != join_column else column + '__explodecol')
        for c in join_sdf.columns
    ]))
    if skew_quantile is not None:
        #         temp2=sdf_join_by_dataskew(temp2,join_sdf,raw_column=column+'__explodecol',how=how)
        temp2=sdf_join_by_dataskew_quantile(temp2,join_sdf,raw_column=column+'__explodecol',how=how,\
                                            skew_quantile=skew_quantile)
    else:
        temp2 = temp2.join(join_sdf, on=column + '__explodecol', how=how)
    if fillna is not None:
        temp2 = temp2.fillna(
            dict(zip(join_other_list, [fillna] * len(join_other_list))))
    temp2 = temp2.select(
        other_list_fun(temp2.columns, join_other_list) +
        [cast2str_udf(c).alias(c) for c in join_other_list])
    #     temp2.cache()
    collect_temp2 = collect_orderby(temp2,
                                    join_other_list,
                                    groupby=temp1.columns,
                                    orderby=column +
                                    '__explodeindex' if orderby_flag else None)
    #     return collect_temp2,join_other_list
    #     temp2.unpersist()
    res_other = [
        array_eval_udf(c + '__collect').alias(c + '__collect')
        for c in join_other_list
    ]
    sdf = collect_temp2.select(
        other_list_fun(collect_temp2.columns,
                       [c + '__collect' for c in join_other_list]) +
        res_other).drop('increasing_id')
    #     sdf = sdf.select([c+'__collect' for c in join_other_list]+sdf.columns).drop('increasing_id')
    for c in join_other_list:
        sdf = sdf.withColumnRenamed(c + '__collect',
                                    c.split('__explode')[0] + '__collect')
    if drop:
        sdf = sdf.drop(column)
    return sdf
Example #18
def uniprot_mapping_to_parquet(input_id_mapping, uniprot_fasta_folder,
                               out_path):
    """
    This method reads a tab-delimited idmapping file from UniProt and a set of FASTA files from a directory to
    generate a parquet file with the following five columns:

    - AC: Uniprot Protein Accession
    - ID: Uniprot Protein Name
    - Gene: Gene name for the corresponding protein
    - Organism: Organism for the corresponding protein
    - EvidenceLevel: Protein evidence level following uniprot guidelines


    :param input_id_mapping: input tab-delimited id mapping file from uniprot
    :param uniprot_fasta_folder: folder containing fasta files from uniprot (ext fasta.gz)
    :param out_path: output path for the generated parquet file
    :return:
    """

    # Read the id mapping file and keep only the Uniprot accession and the Protein name
    sql_context = SparkSession.builder.getOrCreate()
    print("======= processing:" + input_id_mapping)
    df = sql_context.read.csv(path=input_id_mapping, sep='\t', header=False)
    df = df.select(df.columns[:21]).toDF(
        "AC", "ID", "GeneID", "RefSeq", "GI", "PDB", "GO", "UniRef100",
        "UniRef90", "UniRef50", "UniParc", "PIR", "NCBI-taxon", "MIM",
        "UniGene", "PubMed", "EMBL", "EMBL-CDS", "Ensembl", "Ensembl_TRS",
        "Ensembl_PRO")

    df_uniprot_id = df.select(col("AC"), col("ID"))
    df_uniprot_id.show(n=30, truncate=False)
    df_ref_seq = df.select(col("AC"),
                           col("RefSeq")).withColumnRenamed("RefSeq", "ID")
    df_ref_seq.show(n=30, truncate=False)
    df_ensembl_seq = df.select(col("AC"), col("Ensembl_PRO"))
    df_ensembl_seq = df_ensembl_seq.select(
        "AC",
        f.split("Ensembl_PRO", "; ").alias("letters"),
        f.posexplode(f.split("Ensembl_PRO",
                             "; ")).alias("pos", "val")).select("AC", "val")
    df_ensembl_seq = df_ensembl_seq.withColumnRenamed("val", "ID")
    # df_ensembl_seq.show(n=40, truncate=False)

    # create list of dataframes
    dfs = [df_uniprot_id, df_ref_seq, df_ensembl_seq]

    # create merged dataframe
    df = reduce(DataFrame.unionAll, dfs)
    # df_ensembl_seq.show(n=3000, truncate=False)

    df_uniprot = df.select([trim(col(c)).alias(c) for c in df.columns])
    df_uniprot.show(n=30, truncate=False)

    print("======= processing:" + uniprot_fasta_folder)
    uniprot_fasta = sql_context.read.text(uniprot_fasta_folder + "*.fasta.gz")
    uniprot_fasta = uniprot_fasta.filter(
        lower(uniprot_fasta.value).contains(">"))
    cols = [
        "Accession", "ProteinName", "Gene", "Organism", "TaxId",
        "EvidenceLevel"
    ]
    uniprot_fasta = uniprot_fasta.rdd.map(uniprot_header_to_df).toDF().toDF(
        *cols)
    # uniprot_fasta.show(n=30)

    complete_uniprot = df_uniprot.join(
        uniprot_fasta, df_uniprot.AC == uniprot_fasta.Accession,
        "left").drop(uniprot_fasta.Accession)

    # Drop duplicated records by accession and ID
    complete_uniprot = complete_uniprot.dropDuplicates(['AC', 'ID'])
    complete_uniprot = complete_uniprot.filter(complete_uniprot.ID.isNotNull())
    complete_uniprot.write.parquet(out_path,
                                   mode='append',
                                   compression='snappy')

    print_df = sql_context.read.parquet(out_path)
    print_df.show(n=3000, truncate=False)
Example #19
    def main(self, sc: SparkContext, *args):
        os.makedirs(DATASET_DIR, exist_ok=True)

        # params
        min_itens_per_session = 2
        max_itens_per_session = self.max_itens_per_session
        min_itens_interactions = self.min_itens_interactions  # tuple interactions
        max_relative_pos = self.max_relative_pos

        spark = SparkSession(sc)
        df = spark.read.option("delimiter", ";").csv(BASE_DATASET_FILE,
                                                     header=True,
                                                     inferSchema=True)
        df = df.withColumnRenamed("sessionId", "SessionID")\
            .withColumnRenamed("eventdate", "Timestamp")\
            .withColumnRenamed("itemId", "ItemID")\
            .withColumn("Timestamp", (col("Timestamp").cast("long") + col("timeframe").cast("long")/1000).cast("timestamp"))\
            .orderBy(col('Timestamp'), col('SessionID'), col('timeframe')).select("SessionID", "ItemID", "Timestamp", "timeframe")

        # Drop duplicate item in that same session
        df = df.dropDuplicates(['SessionID', 'ItemID'])

        # filter date
        max_timestamp = df.select(max(
            col('Timestamp'))).collect()[0]['max(Timestamp)']
        init_timestamp = max_timestamp - timedelta(days=self.sample_days)
        df = df.filter(col('Timestamp') >= init_timestamp).cache()

        df = df.groupby("SessionID").agg(
            max("Timestamp").alias("Timestamp"),
            collect_list("ItemID").alias("ItemIDs"),
            count("ItemID").alias("total"))

        # Filter Interactions
        df = df.filter(df.total >= min_itens_per_session).cache()

        # Filter position in list
        df_pos = df.select(
            col('SessionID').alias('_SessionID'), posexplode(df.ItemIDs))

        # Explode A
        df = df.withColumn("ItemID_A", explode(df.ItemIDs))
        df = df.join(df_pos,
                    (df.SessionID == df_pos._SessionID) & (df.ItemID_A == df_pos.col))\
                .select('SessionID', 'Timestamp', 'ItemID_A', 'pos', 'ItemIDs')\
                .withColumnRenamed('pos', 'pos_A')

        # Explode B
        df = df.withColumn("ItemID_B", explode(df.ItemIDs))
        df = df.join(df_pos,
                    (df.SessionID == df_pos._SessionID) & (df.ItemID_B == df_pos.col))\
                .withColumnRenamed('pos', 'pos_B')

        df = df.withColumn("relative_pos", abs(df.pos_A - df.pos_B))

        # Filter distinct pairs
        df = df.select('SessionID', 'Timestamp', 'ItemID_A', 'pos_A',
                        'ItemID_B', 'pos_B', 'relative_pos')\
                .distinct()\
                .filter(df.ItemID_A != df.ItemID_B).cache()

        # # Filter duplicates
        # udf_join = F.udf(lambda s,x,y : "_".join(sorted([str(s), str(x),str(y)])) , StringType())
        # df = df.withColumn('key', udf_join('SessionID', 'ItemID_A','ItemID_B'))
        # df = df.dropDuplicates(["key"])

        # Calculate and filter pair occurrence probabilities
        df_probs = self.get_df_tuple_probs(df)
        df = df.join(df_probs, (df.ItemID_A == df_probs._ItemID_A) &
                     (df.ItemID_B == df_probs._ItemID_B))

        # Add positive interactions
        df_positive = self.add_positive_interactions(df)

        # Filter confidence
        df = df.filter(col("total_ocr_dupla") >= min_itens_interactions)\
               .filter(col("relative_pos") <= max_relative_pos)\
               .filter(col("pos_A") <= self.max_itens_per_session)

        # df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'pos_A',
        #                 'ItemID_B', 'pos_B', 'relative_pos',
        #                 'total_ocr', 'total_ocr_dupla', 'prob', 'sub_a_b')\
        #         .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right?
        df = df.select("SessionID", 'Timestamp', 'ItemID_A', 'ItemID_B', 'relative_pos',
                        'total_ocr', 'total_ocr_dupla')\
                .dropDuplicates(['ItemID_A', 'ItemID_B', 'relative_pos']) # TODO is it right?

        df.select("ItemID_A").dropDuplicates().toPandas().to_csv(
            self.output()[2].path, index_label="item_idx")
        df.select("SessionID").dropDuplicates().toPandas().to_csv(
            self.output()[3].path, index_label="session_idx")
        df.write.parquet(self.output()[0].path)
        df_positive.to_csv(self.output()[1].path)
Example #20
import pyspark.sql.functions as f

# Import yelp_business.csv as a DataFrame
yelp_b = spark.read.option("header", "true").option("quote", "\"").option(
    "escape", "\"").csv("/Project_BigData/Data/yelp_business.csv")
# yelp_diff_categories has one row per individual category value
yelp_diff_categories = yelp_b.select(
    "business_id",
    f.split("categories", ";").alias("categories"),
    f.posexplode(f.split("categories", ";")).alias("pos", "val"))
#Shows all the categories with count in descending order
yelp_categories = yelp_diff_categories.groupBy("val").count().orderBy(
    "count", ascending=False)
yelp_categories.write.csv(
    "/PopularBusinesses")  # Logic to write the output as CSV
Example #21
    def main(self, sc: SparkContext, *args):
        os.makedirs(DATASET_DIR, exist_ok=True)
        spark = SparkSession(sc)

        df = spark.read.csv(self.input()[0].path,
                            header=True,
                            inferSchema=True)
        item_idx_df = spark.read.csv(self.input()[1][0].path,
                                     header=True,
                                     inferSchema=True)
        item_idx_dict = item_idx_df.toPandas().set_index(
            'item_id')['item_idx'].to_dict()
        item_idx_most_dict = self.most_popular_array(df)
        print(item_idx_most_dict)

        # Expand impressions interactions
        df = df.withColumn("impressions", to_array_int_udf(df.impressions)).\
            withColumn("prices", to_array_float_udf(df.prices))

        sort_array_by_dict = udf(
            lambda x: sorted(x,
                             key=lambda _x: item_idx_most_dict[_x]
                             if _x in item_idx_most_dict else 0,
                             reverse=True), ArrayType(IntegerType()))
        df = df.withColumn("list_mean_price", array_mean(df.prices))

        # Convert item_id to item_idx in impressions
        to_item_idx_from_dict_udf = udf(
            lambda x:
            [item_idx_dict[i] if i in item_idx_dict else 0 for i in x],
            ArrayType(IntegerType()))
        df = df.withColumn("impressions",
                           to_item_idx_from_dict_udf(df.impressions))
        df = df.withColumn("impressions_popularity",
                           sort_array_by_dict(df.impressions))

        # Explode
        df = df.select(
            "*",
            posexplode("impressions").alias("pos_item_idx", "item_idx"))

        df = df.withColumn("price", df["prices"].getItem(df.pos_item_idx)).\
            withColumn("popularity", df["impressions_popularity"].getItem(df.pos_item_idx)).\
            withColumn("clicked", when(df.action_type_item_idx == df.item_idx, 1.0).otherwise(0.0)).\
            withColumn("view", lit(1.0)).orderBy('timestamp')

        df = df.withColumn("is_first_in_impression", df.pos_item_idx == lit(0))
        df = df.withColumn("first_item_idx", df["impressions"].getItem(lit(0)))
        df = df.withColumn("popularity_item_idx",
                           df["impressions_popularity"].getItem(lit(0)))
        df = df.withColumn("diff_price", df.price - df.list_mean_price)

        # Quartile Discretize
        columns = ["price", "popularity"]
        for c in columns:
            discretizer = QuantileDiscretizer(numBuckets=10,
                                              inputCol=c,
                                              outputCol=c + "_BIN")
            df = discretizer.fit(df).transform(df)

        # add feature 'user_view', 'hist_views'
        win_user_item = Window.partitionBy('user_idx', 'item_idx').orderBy('timestamp') \
            .rangeBetween(Window.unboundedPreceding, -1)
        win_user = Window.partitionBy('user_idx').orderBy('timestamp')\
            .rangeBetween(Window.unboundedPreceding, -1)

        df = df.withColumn("user_view", F.sum(col("view")).over(win_user)).\
            withColumn("hist_views", F.sum(col("view")).over(win_user_item)).\
            fillna(0, subset=["user_view", "hist_views"])

        df = df.withColumn("user_view", df.user_view + lit(1)).\
            withColumn("hist_views", df.hist_views + lit(1))

        df = df.withColumn("ps", df.hist_views / df.user_view)

        print(df.toPandas().head())
        df.orderBy('timestamp',
                   "pos_item_idx").toPandas().to_csv(self.output().path,
                                                     index=False)
Example #22
def main():
    logfile = "{0}/log/sentence_segment.log".format(PROJECT_DIR)
    utils.setup_logging(logfile, logging.INFO)
    logging.info(sys.path)

    args = utils.parse_arguments()
    batchsize = args.batchsize

    start_time = time.time()

    # set_env_vars()

    spark_session = spark.create_spark_session()

    es = elastic.check_elasticsearch()
    es_write_conf = spark.broadcast_es_write_config(spark_session)

    s3resource = aws.create_s3_resource()
    keys = aws.get_list_s3_files(s3resource,
                                 filetype=TEXT_FOLDER,
                                 numrows=batchsize)

    pbooks = s3_to_rdd(spark_session, keys)
    # testing_rdd = spark.sparkContext.wholeTextFiles("s3a://jason-b/{0}".format(TEXT_FOLDER), minPartitions=6, use_unicode=False)
    # spark.log_rdd(pbooks)

    pipeline = spark_nlp.setup_pipeline()
    books = spark_nlp.segment_sentences(spark_session, pbooks, pipeline)

    # Go from one book per row to one sentence per row
    sentences = books.select(
        func.monotonically_increasing_id().alias("sentence_id"),
        func.col("fileName"),
        func.posexplode("sentence.result").alias("position", "sentenceText"),
        func.size("sentence.result").alias("numSentencesInBook"),
    )
    # logging.info("Num Sentences: {0}".format(sentences.count()))

    sentences = spark_nlp.tokenize_sentences(sentences)

    count_syllables_udf = func.udf(
        lambda s: _udf_sentence_count_syllables_sentence(s),
        ArrayType(IntegerType()))
    count_sentence_multisyllables_udf = func.udf(
        lambda s: sum(_udf_sentence_count_syllables_sentence(s)),
        IntegerType())
    count_array_multisyllables_udf = func.udf(
        lambda a: sum(_udf_array_count_syllables_sentence(a)), IntegerType())
    sentences = sentences.select(
            "sentence_id",
            "fileName",
            "position",
            "sentenceText",
            "numSentencesInBook",
                    # count_sentence_multisyllables_udf('sentenceText') \
                    # .alias("multiSyllableCount")
            count_array_multisyllables_udf("words") \
                    .alias("multiSyllableCount"),
            func.size("words").alias("numWordsInSentence"),
            )
    sentences.printSchema()

    # pipeline = spark_nlp.setup_sentiment_pipeline()
    # output = spark_nlp.sentiment_analysis(sentence_data, pipeline)

    # Format to load ElasticSearch
    sentences = sentences.select(
        "sentence_id",
        func.to_json(
            func.struct("sentence_id", "fileName", "position", "sentenceText",
                        "numSentencesInBook", "multiSyllableCount",
                        "numWordsInSentence")).alias("value"))
    # sentence = output.select(["sentence_id", "sentence"]).toJSON()
    sentences = sentences.rdd.map(lambda x: elastic.format_data(x))

    # Write to ES
    write_rdd_to_es(sentences, es_write_conf)
    """
    # _read_es()
    # sentences = sentence.rdd.map(lambda s: s.sentence[0].result)
    # sentences = sentence.rdd.flatMap(lambda s: s.sentence)
    # results = sentence.rdd.map(lambda s: s.result).zipWithUniqueId()

    """
    spark_session.stop()
    end_time = time.time()
    logging.info("RUNTIME: {0}".format(end_time - start_time))
Example #23
                 [[0, 1,1,1, 1],[1,5,5,5,1], [1,2,3,2,1],[0, 1, 2,1, 0], [0, 1, 2,1, 0]]],
     'x_values': [[0, 40,75, 100, 150],[0, 40, 75, 100, 150]],
     'y_values': [[10,40,70, 80, 110], [10,40,70, 80, 110]]
     })
dfSpark = spark.createDataFrame(df_arr)
display(dfSpark)

# COMMAND ----------

# DBTITLE 1,Transform to table step 1
from pyspark.sql.functions import posexplode
# Explode the x array and the y array and use posexplode so that we know
# from what index each x- and y-value comes from. We need the index
# to map pairs of x,y to the correct value/position in the matrix containing the z-values.
dfExploded1 = dfSpark.select("id", "matrixdata", "x_values",\
                             "y_values", posexplode("matrixdata"))\
.withColumnRenamed("pos", "y_index").withColumnRenamed("col", "values")
display(dfExploded1)

# COMMAND ----------

dfExploded2 = dfExploded1.select("id", "matrixdata", "x_values",\
                                 "y_values", "y_index", "values",\
                                 posexplode("values"))\
.withColumnRenamed("pos", "x_index").withColumnRenamed("col", "z_value")
display(dfExploded2)

# COMMAND ----------

# Now extract the x- and y-values using our indexes for each row, and then we can
# use the data in an external tool since we have separate columns/dimensions.
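
# COMMAND ----------

# A compact, self-contained sketch (toy data, not the notebook's df_arr above) of the
# double-posexplode pattern described here: matrix -> one row per (y_index, x_index, z_value),
# with the matching x- and y-values looked up by index.
from pyspark.sql import SparkSession
from pyspark.sql.functions import posexplode, expr

spark = SparkSession.builder.getOrCreate()

m = spark.createDataFrame(
    [(1, [[1, 2], [3, 4]], [0, 40], [10, 70])],
    ["id", "matrixdata", "x_values", "y_values"])

rows = (m.select("id", "x_values", "y_values", posexplode("matrixdata"))
         .withColumnRenamed("pos", "y_index").withColumnRenamed("col", "row_values")
         .select("id", "x_values", "y_values", "y_index", posexplode("row_values"))
         .withColumnRenamed("pos", "x_index").withColumnRenamed("col", "z_value")
         # element_at is 1-based while posexplode positions are 0-based
         .withColumn("x", expr("element_at(x_values, x_index + 1)"))
         .withColumn("y", expr("element_at(y_values, y_index + 1)"))
         .select("id", "x", "y", "z_value"))
rows.show()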
  .select("vehicle_array", sort_array("vehicle_array", asc=True)) \
  .show(5, False)

# Use the `array_contains` function to search the array:
from pyspark.sql.functions import array_contains
drivers_array \
  .select("vehicle_array", array_contains("vehicle_array", "Subaru")) \
  .show(5, False)

# Use the `explode` and `posexplode` functions to explode the array:
from pyspark.sql.functions import explode, posexplode
drivers_array \
  .select("vehicle_array", explode("vehicle_array")) \
  .show(5, False)
drivers_array \
  .select("vehicle_array", posexplode("vehicle_array")) \
  .show(5, False)

# Note that you can pass multiple names to the `alias` method:
drivers_array \
  .select("vehicle_array", posexplode("vehicle_array").alias("position", "column")) \
  .show(5, False)

# ## Maps

# Use the
# [create_map](http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark.sql.functions.create_map)
# function to create a map:
from pyspark.sql.functions import lit, create_map
drivers_map = drivers \
  .withColumn("vehicle_map", create_map(lit("make"), "vehicle_make", lit("model"), "vehicle_model")) \
Example #25
from pyspark.sql.functions import  (
concat, concat_ws, collect_list, collect_set, explode, 
explode_outer, flatten, greatest, least, posexplode, posexplode_outer, struct
)


concat(*col)      #  Concatenates multiple input columns together into a single column. The function works with strings, binary and compatible array columns.
concat_ws(sep=";", *col)   #  Concatenates multiple input string columns together into a single string column, using the given separator.
collect_list   ##   df2.agg(collect_list('age')).collect()   Aggregate function: returns a list of objects with duplicates.
collect_set    ###  Aggregate function: returns a set of objects with duplicate elements eliminated.
explode   ## array --> column  eDF.select(explode(eDF.intlist).alias("anInt")).collect()
explode_outer   ### array --> column  Unlike explode, if the array/map is null or empty then null is produced
flatten   ## Collection function: creates a single array from an array of arrays
greatest   # Returns the greatest value of the list of column names  df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect()
least(col1, col2, col3)  # Returns the least value of the list of column names, skipping null values
posexplode(col)  # Returns a new row for each element with position in the given array or map.  eDF.select(posexplode(eDF.intlist)).collect()  (see the short demo after this list)
posexplode_outer  ### like posexplode, but if the array/map is null or empty a (null, null) row is produced
struct  ## new struct column,  df.select(struct('age', 'name').alias("struct")).collect()
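
# Quick runnable demo of the explode / posexplode entries above (a minimal sketch that
# assumes nothing beyond a local SparkSession):
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, posexplode

spark = SparkSession.builder.getOrCreate()
eDF = spark.createDataFrame([(1, [10, 20, 30])], ["id", "intlist"])

eDF.select(explode("intlist").alias("anInt")).show()
# posexplode also returns each element's position within the array
eDF.select(posexplode("intlist").alias("pos", "anInt")).show()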


#### Rows Agg operation  #######################################################
from pyspark.sql.functions import  (
grouping, grouping_id, first, last  )


grouping      #  df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()
grouping_id   # df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()  returns the level of grouping,
first   ###  1st row
last    ###  last row

    if len(sys.argv) != 3:
        print("Usage: parse_arxiv <input> <output_dir>", file=sys.stderr)
        sys.exit(-1)

    # Initialize the spark context.
    spark = SparkSession \
        .builder \
        .appName("ParseArxivDataset") \
        .getOrCreate()

    # Loads the input file. It should be in the format of:
    #     Json record

    join_res_path = "{}/*json".format(sys.argv[1])

    join_res = spark.read.json(join_res_path)


    output = join_res.withColumn("author", functions.explode(functions.col("authors")))\
                .withColumn("category", functions.explode(functions.col("categories")))

    output = output.select(
        "author.id", "year", "n_citation", "title",
        f.posexplode(f.split("category", " ")).alias("pos", "cate"))

    output.printSchema()

    output.write.json(sys.argv[2])

    spark.stop()
Example #27
vessel_types = [30, 1001, 1002, 21, 22, 21, 32, 52, 1023, 1025, 36, 37, 1019, 1021, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 1012, 1013, 1014, 1015, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 1003, 1004, 1016, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 1017, 1024]
pings = pings.where(pings.VesselType.cast("Double").isin(vessel_types))

# Invoke Function for geoHashing Ports:
# see benchmarking results for cell_size_degrees
cell_size_degrees = 0.4
ports = ports.withColumn("GRID_CELLS", polygon_to_gridCell_hashSet_udf(ports.POLYGON10KM, psql.lit(cell_size_degrees)))
ports = ports.withColumn("GRID_CELLS", psql.expr("substring(GRID_CELLS, 2, (length(GRID_CELLS)-1))"))

# Explode geoHash cells from list type to one row per cell, to get the complete set of port geoHash cells
ports_geoHash = ports.select(
		ports.PORT_NAME, \
		ports.POLYGON10KM, \
		ports.LON_LAT, \
        psql.split("GRID_CELLS", ", ").alias("GRID_CELLS"), \
        psql.posexplode(psql.split("GRID_CELLS", ", ")).alias("pos", "val") \
    )
# Take exploded points, separate into integer lat and lon for joining
ports_geoHash = ports_geoHash.withColumn('GRID_POINTS', psql.regexp_extract('val', '(-)*[0-9]+ (-)*[0-9]+', 0)).withColumn('LON_CELL', psql.split('GRID_POINTS', ' ').getItem(0).cast("INT")).withColumn('LAT_CELL', psql.split('GRID_POINTS', " ").getItem(1).cast("INT"))

# GeoHash Pings:
pings = pings.withColumn("GRID_X", psql.floor(pings.LON.cast('Double')/cell_size_degrees))
pings = pings.withColumn("GRID_Y", psql.floor(pings.LAT.cast('Double')/cell_size_degrees))

# Testing: Find Bay Area Hits
# ports_geoHash.filter((psql.col("Y") > 36) & (psql.col("Y") < 39) & (psql.col("X") < -120) & (psql.col("X") > -123)).show()
# pings.filter((psql.col("LAT") > 36) & (psql.col("LAT") < 39) & (psql.col("LON") < -120) & (psql.col("LON") > -123)).select("LON", "LAT", "GRID_X", "GRID_Y").take(20)

# pingsD = pings.filter(pings.Status == 'moored').limit(250) # for testing
# jpings.filter(jpings.PORT_NAME.isNotNull()).count() # for testing
# pings = pings.join(psql.broadcast(ports_geoHash.select("PORT_NAME", "POLYGON10KM", "LON_CELL", "LAT_CELL")), on = (pings.GRID_X == ports_geoHash.LON_CELL) & (pings.GRID_Y == ports_geoHash.LAT_CELL), how = 'inner') # broadcast doesn't seem to improve performance
def insert_time_dim(start_date_id, end_date_id):
    time_begin = datetime.strptime(str(start_date_id), "%Y%m%d").date()
    time_end = datetime.strptime(str(end_date_id), "%Y%m%d").date()

    print('time_begin')
    print(time_begin)

    print('time_end')
    print(time_end)

    # create a dataframe from time_begin and time_end
    data = [(time_begin, time_end)]
    df = spark.createDataFrame(data, ["minDate", "maxDate"])
    # convert the data types and field names
    df = df.select(
        df.minDate.cast(DateType()).alias("minDate"),
        df.maxDate.cast(DateType()).alias("maxDate"))

    # loop to get all the days between minDate and maxDate
    df = df.withColumn("daysDiff", f.datediff("maxDate", "minDate")) \
        .withColumn("repeat", f.expr("split(repeat(',', daysDiff), ',')")) \
        .select("*", f.posexplode("repeat").alias("date", "val")) \
        .withColumn("date", f.expr("to_date(date_add(minDate, date))")) \
        .select('date')

    # convert the date into day/month/year attributes
    df = df.withColumn('id', f.date_format(df.date, "yyyyMMdd")) \
        .withColumn('ngay_trong_thang', f.dayofmonth(df.date)) \
        .withColumn('ngay_trong_tuan', f.from_unixtime(f.unix_timestamp(df.date, "yyyy-MM-dd"), "EEEEE")) \
        .withColumn('tuan_trong_nam', f.weekofyear(df.date)) \
        .withColumn('thang', f.month(df.date)) \
        .withColumn('quy', f.quarter(df.date)) \
        .withColumn('nam', f.year(df.date))
    df = df.withColumn('tuan_trong_thang', (df.ngay_trong_thang - 1) / 7 + 1)

    data_time = DynamicFrame.fromDF(df, glueContext, 'data_time')

    # convert data
    data_time = data_time.resolveChoice(specs=[('tuan_trong_thang',
                                                'cast:int')])

    # select the fields and data types to load into the database
    applymapping1 = ApplyMapping.apply(
        frame=data_time,
        mappings=[("id", "string", "id", "bigint"),
                  ("ngay_trong_thang", 'int', 'ngay_trong_thang', 'int'),
                  ("ngay_trong_tuan", "string", "ngay_trong_tuan", "string"),
                  ("tuan_trong_thang", "int", "tuan_trong_thang", "int"),
                  ("tuan_trong_nam", "int", "tuan_trong_nam", "int"),
                  ("thang", "int", "thang", "int"),
                  ("quy", "int", "quy", "int"), ("nam", "int", "nam", "int"),
                  ("date", "date", "ngay", "timestamp")])

    resolvechoice2 = ResolveChoice.apply(frame=applymapping1,
                                         choice="make_cols",
                                         transformation_ctx="resolvechoice2")
    dropnullfields3 = DropNullFields.apply(
        frame=resolvechoice2, transformation_ctx="dropnullfields3")

    # write the data to the database
    preactions = 'delete student.time_dim where id >= ' + str(start_date_id)
    datasink4 = glueContext.write_dynamic_frame.from_jdbc_conf(
        frame=dropnullfields3,
        catalog_connection="glue_redshift",
        connection_options={
            "preactions": preactions,
            "dbtable": "student.time_dim",
            "database": "student_native_report"
        },
        redshift_tmp_dir=
        "s3n://dts-odin/temp/tu-student_native_report/student/time_dim",
        transformation_ctx="datasink4")
Example #29
def test_posexplode_litarray(data_gen):
    array_lit = gen_scalar(
        ArrayGen(data_gen, min_length=3, max_length=3, nullable=False))
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: four_op_df(spark, data_gen).select(
            f.col('a'), f.col('b'), f.col('c'), f.posexplode(array_lit)))
 def get_urls_exploded_by_split_urls(self,
                                     split_urls: DataFrame) -> DataFrame:
     return split_urls \
         .select("id", "url", posexplode(split_urls.split_url)) \
         .withColumnRenamed("col", "word")