Example #1
def calc_jaccard_sim(df_to_process, df_match, thresh=.3, padded=True):
    if padded:
        df_processed = df_to_process.join(df_match, (F.size(
            F.array_intersect(
                df_to_process.ngrams_pad, df_match.ngrams_pad)) / F.size(
                    F.array_union(df_to_process.ngrams_pad,
                                  df_match.ngrams_pad))) > thresh)
    else:
        df_processed = df_to_process.join(
            df_match,
            (F.size(F.array_intersect(df_to_process.ngrams, df_match.ngrams)) /
             F.size(F.array_union(df_to_process.ngrams, df_match.ngrams))) >
            thresh)
    return df_processed
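A minimal usage sketch for Example #1, assuming F is pyspark.sql.functions and that both inputs carry the array column named in the snippet (ngrams); the toy data below is invented:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Two toy DataFrames with n-gram arrays; pairs whose Jaccard similarity
# exceeds the threshold survive the join.
df_to_process = spark.createDataFrame(
    [("a", ["ab", "bc", "cd"])], ["left_id", "ngrams"])
df_match = spark.createDataFrame(
    [("x", ["bc", "cd", "de"])], ["right_id", "ngrams"])

# Jaccard = 2/4 = 0.5 > 0.3, so the single candidate pair is kept.
calc_jaccard_sim(df_to_process, df_match, thresh=0.3, padded=False).show()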
Example #2
def silver_airport_data():
    df = dlt.read("bronze_airport_data")
    df = df.replace("\\N", None)
    df = df.withColumn("nulls", array())
    for i, c in enumerate(df.columns):
        df = df.withColumn(
            "nulls",
            when(col(c).isNull(),
                 array_union(col("nulls"),
                             array(lit(i)))).otherwise(col("nulls")))
    return df.drop("ingest_timestamp", "ingest_source")
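Example #2 runs inside a Delta Live Tables pipeline (dlt.read), but the null-index trick itself only needs an ordinary DataFrame. A minimal sketch with invented toy columns:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, array_union, col, lit, when

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([("JFK", None), (None, "LHR")], ["code", "city"])

# Start from an empty array and append each column's index whenever its value is null.
df = df.withColumn("nulls", array())
for i, c in enumerate(df.columns):
    df = df.withColumn(
        "nulls",
        when(col(c).isNull(),
             array_union(col("nulls"), array(lit(i)))).otherwise(col("nulls")))

df.show()  # first row -> nulls = [1], second row -> nulls = [0]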
Example #3
def jaccard_index(primary_col: str, secondary_col: str, output_col: str,
                  df: DataFrame):
    """Calculate the intersection and union of two array columns"""

    return df.withColumn(
        output_col,
        F.when(
            F.col(primary_col).isNull() | F.col(secondary_col).isNull(), None).
        otherwise(
            F.size(F.array_intersect(F.col(primary_col), F.col(secondary_col)))
            / F.size(F.array_union(F.col(primary_col), F.col(secondary_col)))),
    )
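A quick call of Example #3's helper, assuming F is pyspark.sql.functions and the function above is in scope; the columns tags_a and tags_b are invented:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(["a", "b"], ["b", "c"]), (None, ["a"])], ["tags_a", "tags_b"])

# Adds a 'jaccard' column: 1/3 for the first row, null for the second
# because one of the inputs is null.
jaccard_index("tags_a", "tags_b", "jaccard", df).show()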
Example #4
def reduce_join(left, right):
    return_vcf = left.join(right, ["#CHROM", "POS"], "full")

    remove_colname = right.columns[2:]
    l_name = left.columns
    r_name = right.columns
    v_name = return_vcf.columns
    name_list = ["REF", "ID", "ALT", "INFO", "FORMAT"]

    # For each field, keep whichever side is non-null; when both sides are
    # present, merge them with array_union. (column_name is a helper defined
    # elsewhere that selects the matching columns from a column list.)
    for name in name_list:
        left_col = column_name(l_name, name)[0]
        right_col = column_name(r_name, name)[0]
        return_vcf = return_vcf.withColumn(
            left_col,
            F.when(F.isnull(left_col), F.col(right_col))
            .when(F.isnull(right_col), F.col(left_col))
            .otherwise(F.array_union(*column_name(v_name, name))))

    return_vcf = return_vcf.drop(*remove_colname)

    return return_vcf
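Example #4 leans on a column_name helper that is not shown in the snippet. A plausible stand-in, purely an assumption about its contract (return the columns from a column list whose name contains the given field):

def column_name(columns, name):
    # Hypothetical helper, not from the original code: pick the columns in
    # the given list whose name contains the field name. Example #4 takes the
    # first match for the left/right column lists and unpacks both matches
    # from the joined column list into array_union.
    return [c for c in columns if name in c]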
Example #5
def similaryBasedOnFollowers(data, minFollowers=20, debug=False):

    # We start by renaming the user column in line with the notation
    # above.
    data = data.withColumnRenamed('follows', 'u1')

    # ==== Step 1 ====
    u1_fu1 = data.groupBy('u1').agg(F.collect_set(
        data.user).alias('fu1')).filter(F.size('fu1') >= minFollowers)

    if (debug):
        print('>> Step 1 :: u1 f(u1) <<')
        u1_fu1.show()

    # ==== Step 2 ====
    # First create a "dual" of data by renaming columns.
    # This will help the subsequent join.
    u2_fu2 = u1_fu1.withColumnRenamed('u1',
                                      'u2').withColumnRenamed('fu1', 'fu2')

    prod = u1_fu1.crossJoin(u2_fu2).filter(u1_fu1.u1 < u2_fu2.u2)

    if (debug):
        print('>> Step 2 :: u1 f(u1) u2 f(u2) <<')
        prod.show()

    # ==== Step 3 ====
    prod2 = prod.withColumn('I',
                            F.array_intersect(prod.fu1, prod.fu2)).withColumn(
                                'U',
                                F.array_union(prod.fu1,
                                              prod.fu2)).drop('fu1', 'fu2')

    if (debug):
        print('>> Step 3 :: u1 u2 I(u1,u2) U(u1,u2) <<')
        #prod2.orderBy('I',ascending=False).show()
        prod2.show()

    # ==== Step 4 ====
    result = prod2.withColumn('JI', F.size('I') / F.size('U')).drop('I', 'U')

    if (debug):
        print('>> Step 4 :: u1 u2 J(u1,u2) <<')
        result.show()
    return result
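A small end-to-end run of Example #5, assuming F is pyspark.sql.functions and an edge list with columns user (the follower) and follows (the account being followed); the toy data is invented:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

edges = spark.createDataFrame(
    [("f1", "alice"), ("f2", "alice"),
     ("f1", "bob"), ("f2", "bob"), ("f3", "bob")],
    ["user", "follows"])

# With minFollowers=2 both accounts qualify; their follower sets {f1, f2}
# and {f1, f2, f3} give a Jaccard index of 2/3.
similaryBasedOnFollowers(edges, minFollowers=2).show()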
Example #6
	def verification(self, candDF, threshold, key1, key2, keep_cols1, keep_cols2):
		""" 
			Input: $candDF is the output DataFrame from the 'filtering' function.
				   $threshold is a float value in (0, 1].
				   $key1 and $key2 are the two array columns whose Jaccard similarity is computed.
				   $keep_cols1 and $keep_cols2 list extra columns to carry through to the result.

			Output: Return a new DataFrame $resultDF that represents the ER result.
					It has the columns id1, id2 and jaccard, plus the kept columns.

			Comments: There are two differences between $candDF and $resultDF
					  (1) $resultDF adds a new column, called jaccard, which stores the jaccard similarity
						  between $key1 and $key2
					  (2) $resultDF removes the rows whose jaccard similarity is smaller than $threshold
		"""
		return candDF.select(
			'id1', 'id2',
			(size(array_intersect(key1,key2))\
			/ size(array_union(key1,key2))).alias('jaccard'),
			# keep certain columns
			*keep_cols1, *keep_cols2
		).where(col('jaccard') >= threshold)
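The core of Example #6 can be tried on a stand-alone candidate-pair DataFrame, with no class around it. A minimal sketch; the data is invented and the column names mirror the snippet:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_intersect, array_union, col, size

spark = SparkSession.builder.getOrCreate()

candDF = spark.createDataFrame(
    [(1, 2, ["ipad", "16gb"], ["ipad", "32gb"])],
    ["id1", "id2", "joinKey1", "joinKey2"])

# Jaccard = |{ipad}| / |{ipad, 16gb, 32gb}| = 1/3, which clears a 0.3 threshold.
resultDF = candDF.select(
    'id1', 'id2',
    (size(array_intersect('joinKey1', 'joinKey2'))
     / size(array_union('joinKey1', 'joinKey2'))).alias('jaccard')
).where(col('jaccard') >= 0.3)

resultDF.show()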
Example #7
def generate_metadata_group(
    experiment_specimen_df: DataFrame,
    impress_df: DataFrame,
    exp_type="experiment",
) -> DataFrame:
    """
    Takes in an Experiment-Specimen DataFrame and the IMPReSS dataframe,
    and generates a hash value with the parameters marked as 'isImportant' on IMPReSS.
    This hash is used to identify experiments that are comparable (i.e. share the same experimental conditions).
    """

    # Explode the experiments by procedureMetadata so each row contains data for each procedureMetadata value
    experiment_metadata = experiment_specimen_df.withColumn(
        "procedureMetadata", explode("procedureMetadata"))

    # Filter the IMPReSS to leave only those that generate a metadata split: isImportant = True
    impress_df_required = impress_df.where(
        (col("parameter.isImportant") == True)
        & (col("parameter.type") == "procedureMetadata"))

    # Join the experiment DF with the IMPReSS DF
    experiment_metadata = experiment_metadata.join(
        impress_df_required,
        ((experiment_metadata["_pipeline"]
          == impress_df_required["pipelineKey"])
         & (experiment_metadata["_procedureID"]
            == impress_df_required["procedure.procedureKey"])
         & (experiment_metadata["procedureMetadata._parameterID"]
            == impress_df_required["parameter.parameterKey"])),
    )

    # Create a new column by concatenating the parameter name and the parameter value
    experiment_metadata = experiment_metadata.withColumn(
        "metadataItem",
        when(
            col("procedureMetadata.value").isNotNull(),
            concat(col("parameter.name"), lit(" = "),
                   col("procedureMetadata.value")),
        ).otherwise(concat(col("parameter.name"), lit(" = "), lit("null"))),
    )

    # Select the right column name for production and phenotyping centre depending on experiment type
    if exp_type == "experiment":
        production_centre_col = "_productionCentre"
        phenotyping_centre_col = "_phenotypingCentre"
    else:
        production_centre_col = "production_centre"
        phenotyping_centre_col = "phenotyping_centre"

    # Create a window for the DataFrame over experiment id, production and phenotyping centre
    window = Window.partitionBy(
        "unique_id", production_centre_col,
        phenotyping_centre_col).orderBy("parameter.name")

    # Use the window to create for every experiment an array containing the set of "parameter =  value" pairs.
    experiment_metadata_input = experiment_metadata.withColumn(
        "metadataItems",
        collect_set(col("metadataItem")).over(window))

    # Add the production centre to the metadata group when it is different from the phenotyping centre.
    # This is because in that case we would like to generate a metadata split among specimens
    # that have been produced and phenotyped at the same centre.
    experiment_metadata_input = experiment_metadata_input.withColumn(
        "metadataItems",
        when(
            (col(production_centre_col).isNotNull())
            & (col(production_centre_col) != col(phenotyping_centre_col)),
            array_union(
                col("metadataItems"),
                array(
                    concat(lit("ProductionCenter = "),
                           col(production_centre_col))),
            ),
        ).otherwise(col("metadataItems")),
    )

    # Create a string with the concatenation of the metadata items "parameter = value" separated by '::'.
    experiment_metadata = experiment_metadata_input.groupBy(
        "unique_id", production_centre_col, phenotyping_centre_col).agg(
            concat_ws("::", sort_array(max(
                col("metadataItems")))).alias("metadataGroupList"))

    # Hash the list to generate a metadata group identifier.
    experiment_metadata = experiment_metadata.withColumn(
        "metadataGroup", md5(col("metadataGroupList")))

    # Select the experiment IDs and the metadata group IDs
    experiment_metadata = experiment_metadata.select("unique_id",
                                                     "metadataGroup")

    # Join the original experiment DataFrame with the result of the metadata group generation
    experiment_specimen_df = experiment_specimen_df.join(
        experiment_metadata, "unique_id", "left_outer")

    # Add the hashed version of an empty string to those rows without a metadata group.
    experiment_specimen_df = experiment_specimen_df.withColumn(
        "metadataGroup",
        when(experiment_specimen_df["metadataGroup"].isNull(),
             md5(lit(""))).otherwise(experiment_specimen_df["metadataGroup"]),
    )
    return experiment_specimen_df
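The metadata-group fingerprint at the heart of Example #7 boils down to sort, join with '::', then md5. A reduced sketch on invented data, skipping the IMPReSS join and the windowing:

from pyspark.sql import SparkSession
from pyspark.sql.functions import concat_ws, md5, sort_array

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("exp1", ["Equipment = scale", "Anesthesia = isoflurane"]),
     ("exp2", ["Anesthesia = isoflurane", "Equipment = scale"])],
    ["unique_id", "metadataItems"])

# Sorting before concatenation makes the hash order-independent, so exp1 and
# exp2 land in the same metadata group.
df = df.withColumn(
    "metadataGroup", md5(concat_ws("::", sort_array("metadataItems"))))
df.show(truncate=False)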
Example #8

            functions.array(functions.collect_list('text').over(window)).alias('text'),
        )

    chains.persist()
    #%%
    if depth == 0:
        chains = chains.select(
            functions.concat('context', 'response').alias('context'),
            functions.concat('sender', 'author_id').alias('sender'), 'rpos',
            'next',
            functions.col('text').alias('tweets'))
    else:
        chains = chains.select(
            functions.concat_ws(',', 'context', 'response').alias('context'),
            functions.concat_ws(',', 'sender', 'author_id').alias('sender'),
            functions.array_union('tweets', 'text').alias('tweets'), 'rpos', 'next')

#%%
    chains = chains.join(data, chains.next == data.tweet_id, 'inner')\
        .select(
            'sender',
            'context',
            'tweets',
            'rpos',
            data.tweet_id.alias('response'),
            data.author_id,
            data.response_tweet_id.alias('next'),
            data.text.alias('text')
        )

    #%%
Example #9
def array_union(a: Column, b: Column) -> Column:
    """Calculate the union of two array columns"""
    return F.array_remove(F.array_union(a, b), "")
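Example #9 wraps array_union so that empty strings never survive the merge. A quick check with invented data, assuming the wrapper above is in scope and F is pyspark.sql.functions:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(["a", ""], ["b", ""])], ["xs", "ys"])

# F.array_union alone would keep the "" element; the wrapper strips it out,
# leaving ["a", "b"].
df.select(array_union(F.col("xs"), F.col("ys")).alias("merged")).show()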
Example #10
readPlaylistsDF.show()
deletePlaylistsDF.unpersist()

print("Update playlists")
updatePlaylistsDF = df_edit.withColumn(
    'Exp_Results', F.explode('update.playlists')).select('Exp_Results.*')
updatePlaylistsDF.show(truncate=False)

# Only update song ids in the playlists when the user id and playlist id matches the source playlists
print("Update playlists Result")
updatePlaylistsDF = updatePlaylistsDF.join(
    readPlaylistsDF, (updatePlaylistsDF.id == readPlaylistsDF.id) &
    (updatePlaylistsDF.user_id == readPlaylistsDF.user_id), 'inner').select(
        updatePlaylistsDF.id, updatePlaylistsDF.user_id,
        F.array_union(
            F.array_intersect(updatePlaylistsDF.song_ids,
                              F.array([F.lit(x) for x in songs])),
            readPlaylistsDF.song_ids).alias("song_ids"))

readPlaylistsDF = readPlaylistsDF.join(
    updatePlaylistsDF, readPlaylistsDF.id == updatePlaylistsDF.id,
    "left").select(
        readPlaylistsDF.id,
        F.coalesce(updatePlaylistsDF.song_ids,
                   readPlaylistsDF.song_ids).alias("song_ids"),
        readPlaylistsDF.user_id)
readPlaylistsDF.show()

updatePlaylistsDF.unpersist()

playlistsDF = readPlaylistsDF.agg(
    F.collect_list(struct("*")).alias('playlists'))
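The merge expression in Example #10 (intersect the incoming song ids with an allow-list, then union them with the stored playlist) can be exercised in isolation. A sketch with an invented playlist row; songs plays the role of the allow-list from the snippet:

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

songs = ["s1", "s2"]  # allow-list of song ids that may be added

df = spark.createDataFrame(
    [(["s1", "s3"], ["s4"])], ["incoming_song_ids", "stored_song_ids"])

# Only the allow-listed incoming id "s1" survives the intersect; the union
# with the stored ids then yields ["s1", "s4"].
df.select(
    F.array_union(
        F.array_intersect("incoming_song_ids",
                          F.array([F.lit(x) for x in songs])),
        "stored_song_ids").alias("song_ids")).show()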
Example #11
def add_null_index_array(df):
    df = df.replace("\\N", None)
    df = df.withColumn("nulls", array())
    for i, c in enumerate(df.columns):
        df = df.withColumn(
            "nulls",
            when(col(c).isNull(),
                 array_union(col("nulls"),
                             array(lit(i)))).otherwise(col("nulls")))
    return df.drop("ingest_timestamp", "ingest_source")
Example #12
MODEL = None


def get_model_magic():
    global MODEL
    if MODEL is None:
        MODEL = hub.load(
            "https://tfhub.dev/google/universal-sentence-encoder/4")
    return MODEL


@udf(returnType=VectorUDT())
def encode_sentence(x):
    model = get_model_magic()
    emb = model([x]).numpy()[0]
    return Vectors.dense(emb)


blocking_df = tokenize(processed_df, ['name', 'description', 'manufacturer'])
blocking_df = tfidf_top_tokens(
    blocking_df,
    [c + '_swRemoved' for c in ['name', 'description', 'manufacturer']])
blocking_df = blocking_df.withColumn('name_encoding', encode_sentence(f.coalesce(f.col('name'), f.lit(''))))\
  .withColumn('description_encoding', encode_sentence(f.coalesce(f.col('description'), f.lit(''))))\
  .withColumn('blocking_keys',
              f.array_union(
                f.array(f.col('name'), f.col('description'), f.col('manufacturer')),
                f.array_union(f.col('name_swRemoved_top_tokens'), f.array_union(f.col('description_swRemoved_top_tokens'), f.col('manufacturer_swRemoved_top_tokens')))
              )
             )\
  .withColumn('uid', f.concat_ws('|', 'source', 'source_id'))
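The nested array_union calls in Example #12 simply fold several token arrays into one set of blocking keys. A stripped-down sketch with invented token columns, leaving out the TF-IDF and sentence-encoding steps:

from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(["ipad"], ["apple", "ipad"], ["apple"])],
    ["name_tokens", "description_tokens", "manufacturer_tokens"])

# array_union is binary, so merging three columns takes two nested calls;
# duplicates across the columns are dropped, giving ["ipad", "apple"].
df.select(
    f.array_union(
        "name_tokens",
        f.array_union("description_tokens", "manufacturer_tokens"),
    ).alias("blocking_keys")).show(truncate=False)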
Example #13
    def process(self):
        """ Read data. """

        self.code = self.input

        # if self.code in self.cleanUp:
        # self.cleanUp_for_code(self.code)
        # self.prepare_dirs_for_code(self.code)

        # Load parquet
        df = spark.read.parquet(
            f"{config.OUTPUT_DATASET}/{self.code}_cards.parquet")

        # Replace text with keywords based on a dictionary
        df = df.withColumn(
            "text_features1",
            preprocess_fn.udf_text_to_keywords("name", "originalText"))

        from_patterns = [
            fn.when(
                fn.regexp_extract("originalText", r"{0}".format(pattern), 0) !=
                "",
                replace,
            ).otherwise("") for pattern, replace in
            preprocess_fn_text_rules.text_patterns.items()
        ]

        df = df.withColumn("text_features2", fn.array(*from_patterns))
        df = df.withColumn("text_features",
                           fn.array_union("text_features1", "text_features2"))

        # df.select("text_features").distinct().show(100, truncate=False)

        # Fetch all the text features from all the cards into one list
        all_text_feats = df.select("text_features").rdd.flatMap(
            lambda x: x).collect()

        filtered_text_feats = [
            items for items in all_text_feats if len(items) > 0
        ]
        filtered_text_feats = list(
            itertools.chain.from_iterable(filtered_text_feats))

        # Encode the text features into ints
        label_encoder = preprocessing.LabelEncoder().fit(filtered_text_feats)
        with open(f"{config.TEMP}/labelencoder_text_feats.pkl", "wb") as fp:
            pickle.dump(label_encoder, fp)

        @fn.udf(returnType=t.ArrayType(t.IntegerType()))
        def text_to_vector(text_features):
            if len(text_features) > 0:
                enc_list = list()
                for item in text_features:
                    item = str(item)
                    encoded = label_encoder.transform([item])
                    encoded = int(encoded[0])
                    enc_list.append(encoded)
                #             print(f"{item} \t {encoded}")
                return enc_list
            return list()

        # if "text_features_vect" in df.columns:
        # df = df.drop("text_features_vect")

        df = df.withColumn("text_features_vect",
                           text_to_vector("text_features"))

        all_text_feats = df.select("text_features").rdd.flatMap(
            lambda x: x).collect()

        filtered_text_feats = [
            items for items in all_text_feats if len(items) > 0
        ]
        filtered_text_feats = list(
            itertools.chain.from_iterable(filtered_text_feats))

        df.createOrReplaceTempView("cards_features")

        tbl = spark.sql("""
            SELECT
                *
            FROM
                cards_features
        """)

        # Save to Parquet
        tbl.write.mode("overwrite").parquet(
            f"{config.TEMP}/{self.code}_cards_text.parquet")

        self.next(self.join)
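The feature-merging step of Example #13 (a keyword-derived array plus a regex-derived array, combined with array_union) can be reproduced on toy text. A minimal sketch; the patterns and card text below are invented stand-ins for preprocess_fn_text_rules.text_patterns:

from pyspark.sql import SparkSession
import pyspark.sql.functions as fn

spark = SparkSession.builder.getOrCreate()

text_patterns = {r"Counter target spell": "COUNTER", r"Draw a card": "DRAW"}

df = spark.createDataFrame(
    [("Cancel", "Counter target spell. Draw a card.")],
    ["name", "originalText"])

# One when/otherwise expression per pattern; patterns that do not match
# contribute an empty string.
from_patterns = [
    fn.when(
        fn.regexp_extract("originalText", pattern, 0) != "", replace
    ).otherwise("")
    for pattern, replace in text_patterns.items()
]

df = df.withColumn("text_features1", fn.array(fn.lit("KW_COUNTERSPELL")))
df = df.withColumn("text_features2", fn.array(*from_patterns))

# -> text_features = [KW_COUNTERSPELL, COUNTER, DRAW]
df = df.withColumn("text_features",
                   fn.array_union("text_features1", "text_features2"))
df.select("name", "text_features").show(truncate=False)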