Example #1
def spk_med(data):
    spk_df = spark.read.json(data)

    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"),
                             spk_df.animals.attributes.house_trained.alias("house_trained"),
                             spk_df.animals.attributes.declawed.alias("declawed"),
                             spk_df.animals.attributes.spayed_neutered.alias("spayed_neutered"),
                             spk_df.animals.attributes.special_needs.alias("special_needs"),
                             spk_df.animals.attributes.shots_current.alias("shots_current"))

    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id",
                                                                     "house_trained",
                                                                     "declawed",
                                                                     "spayed_neutered",
                                                                     "special_needs",
                                                                     "shots_current"))) \
        .select("filtered.id",
                "filtered.house_trained",
                "filtered.declawed",
                "filtered.spayed_neutered",
                "filtered.special_needs",
                "filtered.shots_current")

    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.house_trained.cast(BooleanType()),
                                  cleaned.declawed.cast(BooleanType()),
                                  cleaned.spayed_neutered.cast(BooleanType()),
                                  cleaned.special_needs.cast(BooleanType()),
                                  cleaned.shots_current.cast(BooleanType()))

    return cleaned_cast
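The snippet above follows a common PySpark normalization pattern: zip several parallel arrays with arrays_zip, explode the zipped array, and select the struct fields as flat columns. A minimal, self-contained sketch of that pattern (the toy data and session setup here are illustrative, not part of the original example):

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

# Two parallel arrays per row, as produced by selecting fields from an array of structs.
toy = spark.createDataFrame([([1, 2], [True, False])], ["id", "house_trained"])

normalized = (
    toy.withColumn("zipped", F.explode(F.arrays_zip("id", "house_trained")))
       .select("zipped.id", "zipped.house_trained")
)
normalized.show()  # one output row per zipped element: (1, true) and (2, false)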
Example #2
def spk_temp(data):
    spk_df = spark.read.json(data)

    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"), \
                          spk_df.animals.environment.cats.alias("cats"), \
                          spk_df.animals.environment.dogs.alias("dogs"), \
                          spk_df.animals.environment.children.alias("children"))

    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", \
                                                                     "cats", \
                                                                     "dogs", \
                                                                     "children"))) \
        .select("filtered.id", \
                "filtered.cats", \
                "filtered.dogs", \
                "filtered.children")

    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.children.cast(BooleanType()),
                                  cleaned.dogs.cast(BooleanType()),
                                  cleaned.cats.cast(BooleanType()))

    return cleaned_cast
Example #3
    def transfLineitems(ds, config):
        invoiceDS: DataFrame = ds

        invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
            .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

        if invoiceDS.schema["TypeOfService"].dataType == StringType():
            invoiceDS = invoiceDS.withColumn("TypeOfService", array(col("_LineItems._Description"))) \
                .withColumn("ServiceAmount", array(col("_LineItems._TotalPriceNetto")))

        invoiceDS = invoiceDS.withColumn("ServiceNRate", arrays_zip(col("TypeOfService"), col("ServiceAmount")))\
                    .withColumn("ServiceNRate", explode_outer(col("ServiceNRate")))

        # udf_service = udf(get_genesis_TOS, StringType()) # same code of line 56, remove costTypeList

        billPath = config["Master"]["billPath"]
        CostTypeList = createDatasetFromCSVFile("", billPath).filter(col("VENDOR_NAME").rlike("GENESIS MARINE, LLC"))\
            .rdd.map(lambda x: x.COST_LINE_ITEM_TYPE).collect()

        def udf_service(costType):
            return udf(lambda l: get_genesis_TOS_search(l, costType),
                       StringType())

        invoiceDS = invoiceDS.withColumn("TypeOfService", udf_service(CostTypeList)(regexp_replace(
                                                        col("ServiceNRate.TypeOfService"), "\n", " ")))\
            .withColumn("ServiceAmount", col("ServiceNRate.ServiceAmount")) \

        return invoiceDS
Example #4
def benchmark_extract_top_keywords(posts, n_keywords=10):
    """Given TF-IDF output (as "features" column) extracts out the vocabulary index of the
    10 keywords with highest TF-IDF (for each post)."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    posts = posts.withColumn("extracted_keys",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("extracted_values",
                             extract_values_from_vector_udf("features"))

    posts = posts.withColumn(
        "zipped_truncated",
        slice(
            sort_array(arrays_zip("extracted_values", "extracted_keys"),
                       asc=False), 1, n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(IntegerType()))
    posts = posts.withColumn("top_indices", take_second("zipped_truncated"))

    return posts
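A hedged usage sketch for benchmark_extract_top_keywords; the imports, the Spark session, and the toy sparse "features" vector below are assumptions added for illustration, not part of the original example:

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, slice, sort_array, arrays_zip
from pyspark.sql.types import ArrayType, IntegerType, DoubleType
from pyspark.ml.linalg import Vectors

spark = SparkSession.builder.getOrCreate()

# One post whose TF-IDF vector has weight 0.1 at index 0 and 0.9 at index 3.
posts = spark.createDataFrame([(Vectors.sparse(5, [0, 3], [0.1, 0.9]),)], ["features"])

top = benchmark_extract_top_keywords(posts, n_keywords=2)
top.select("top_indices").show(truncate=False)  # expected: [3, 0] (highest TF-IDF first)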
Example #5
def predict(model,data):
    print('making predictions on data...')
    predictions = model.transform(data)

    print('Converting predictions to pandas df...')
    df = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
    .select(F.expr("cols['0']").alias("token"),
            F.expr("cols['1']").alias("ground_truth"),
            F.expr("cols['2']").alias("prediction")).toPandas()

    print('Evaluation report')
    print(classification_report(df.ground_truth, df.prediction))
    print(accuracy_score(df.ground_truth, df.prediction))
Example #6
def logVendorDetails(config, overAllXMLCnt, vendorList, vendorInitialCnt,
                     vendorProcessedCnt, StartTime, EndTime):
    RunID = singleSession().getSparkSession(config).sparkContext.applicationId

    masterURL = config["SparkSubmit"]["master"]
    xmlPath = config['InvoicePath']['abbyXmlPath']
    selectedColumns = config['Logging']['selectedColumns']
    summaryColumns = config['Logging']['summaryColumns']
    detailsColumns = config['Logging']['detailsColumns']
    loggingPath = config['Logging']['loggingPath']

    overAllXMLProcessedCnt = 0
    for cnt in vendorProcessedCnt:
        overAllXMLProcessedCnt += cnt

    row = [
        RunID, xmlPath, masterURL, overAllXMLCnt, overAllXMLProcessedCnt,
        StartTime, EndTime, vendorList, vendorInitialCnt, vendorProcessedCnt
    ]

    dataSet = singleSession().getSparkSession(config).createDataFrame(
        [row], selectedColumns)

    summary = dataSet.select(*summaryColumns)

    # details part
    details = dataSet.withColumn(
        "Message",
        arrays_zip(col("vendorList"), col("vendorInitialCnt"),
                   col("vendorProcessedCnt")))

    details = details.select(*detailsColumns).withColumn("Message", explode(col("Message")))\
        .withColumn("Message", concat(col("Message.vendorList"), lit(" vendor had "),
                                   col("Message.vendorInitialCnt"), lit(" XMLs and generated ") ,
                                   col("Message.vendorProcessedCnt"), lit(" records")))

    # summary.show(truncate=False)
    # details.show(truncate=False)

    # csv
    writeIntoServingLayer(summary, loggingPath + "summary", mode="Append")
    writeIntoServingLayer(details, loggingPath + "details", mode="Append")

    logSummaryOfRun = config['LoggingTale']['logSummaryOfRun']
    logDetailsOfRun = config['LoggingTale']['logDetailsOfRun']

    # Hive
    # writeIntoHiveServingLayer(summary, logSummaryOfRun)
    # writeIntoHiveServingLayer(details, logDetailsOfRun)

    return None
Example #7
def df_array_zip(df, b, c, d):

    from pyspark.sql.functions import arrays_zip, col, explode

    ###https://stackoverflow.com/questions/41027315/pyspark-split-multiple-array-columns-into-rows
    df = df.withColumn("tmp", arrays_zip(b, c, d))
    df = df.withColumn("tmp", explode("tmp"))
    df = df.select(col("tmp")[b], col("tmp")[c], col("tmp")[d])

    df = df.withColumnRenamed('tmp.{}'.format(d), d)
    df = df.withColumnRenamed('tmp.{}'.format(b), b)
    df = df.withColumnRenamed('tmp.{}'.format(c), c)

    return df
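A hypothetical call to df_array_zip to show what it produces; the column names and sample row are invented:

from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [(["a", "b"], [1, 2], [0.1, 0.2])],
    ["letter", "number", "score"],
)
# One output row per zipped element: ("a", 1, 0.1) and ("b", 2, 0.2).
df_array_zip(sample, "letter", "number", "score").show()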
Example #8
def get_metrics(data, model):

    print("Getting embeddings...")
    data = bert_annotator.transform(data)
    predictions = model.transform(data)

    df = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
    .select(F.expr("cols['1']").alias("target"),
            F.expr("cols['2']").alias("prediction"))

    correct = df.filter(df.target == df.prediction).count()
    total = df.select("target").count()

    accuracy = 100 * correct / total
    print("Accuracy = {}".format(accuracy))
Example #9
def explode_list(df):
    """Explode a list of products into separated rows

    :param df: Input DataFrame
    :return: Output DataFrame
    """
    return (df.withColumn('tmp', arrays_zip(
        'productCode',
        'productName')).withColumn('tmp', explode_outer('tmp')).select(
            'maid', 'siteseq', 'userid', 'transaction_date',
            'transaction_time', 'logtype',
            'tmp.productCode', 'tmp.productName').withColumnRenamed(
                'tmp.productCode',
                'productCode').withColumnRenamed('tmp.productName',
                                                 'productName'))
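A hedged usage sketch for explode_list; the sample row is made up to match the columns the function selects, and it assumes arrays_zip and explode_outer are imported from pyspark.sql.functions:

from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip, explode_outer

spark = SparkSession.builder.getOrCreate()
sample = spark.createDataFrame(
    [("m1", 1, "u1", "2021-01-01", "12:00:00", "purchase",
      ["P001", "P002"], ["apple", "banana"])],
    ["maid", "siteseq", "userid", "transaction_date",
     "transaction_time", "logtype", "productCode", "productName"],
)
# Two output rows: one for ("P001", "apple") and one for ("P002", "banana").
explode_list(sample).show()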
Example #10
 def _transform_rec_info(self, rec_info, item_ids_dataset):
     import pyspark.sql.functions as F
     if self.output_user_embeddings:
         user_embeddings = rec_info.select(
             self.increasing_id_column_name,
             self.recommendation_info_column_name + '.user_embedding')
     rec_info = rec_info.withColumn(
         self.recommendation_info_column_name,
         F.arrays_zip(self.recommendation_info_column_name + '.indices',
                      self.recommendation_info_column_name + '.distances'))
     rec_info = rec_info.select(
         self.increasing_id_column_name,
         (F.posexplode(self.recommendation_info_column_name).alias(
             'pos', self.recommendation_info_column_name)))
     rec_info = rec_info.select(
         self.increasing_id_column_name, 'pos',
         F.col(self.recommendation_info_column_name + '.0').alias('index'),
         F.col(self.recommendation_info_column_name +
               '.1').alias('distance'))
     rec_info = rec_info.join(item_ids_dataset,
                              F.col('index') == F.col('id'))
     w = pyspark.sql.Window.partitionBy(
         self.increasing_id_column_name).orderBy('pos')
     if self.output_item_embeddings:
         rec_info = rec_info.select(
             self.increasing_id_column_name, 'pos',
             (F.struct('name', 'distance', 'item_embedding').alias(
                 self.recommendation_info_column_name)))
     else:
         rec_info = rec_info.select(
             self.increasing_id_column_name, 'pos',
             (F.struct('name', 'distance').alias(
                 self.recommendation_info_column_name)))
     rec_info = rec_info.select(
         self.increasing_id_column_name, 'pos', (F.collect_list(
             self.recommendation_info_column_name).over(w).alias(
                 self.recommendation_info_column_name)))
     rec_info = rec_info.groupBy(self.increasing_id_column_name).agg(
         F.max(self.recommendation_info_column_name).alias(
             self.recommendation_info_column_name))
     if self.output_user_embeddings:
         rec_info = rec_info.join(user_embeddings,
                                  self.increasing_id_column_name)
     return rec_info
 def filter_by_pos(self, pos_col, lemma_col):
     self.data = self.data.select(
             F.expr("*"),
             F.explode(
                 F.arrays_zip(
                     pos_col + '.result',
                     pos_col + '.begin',
                     lemma_col + '.begin',
                 )
             ).alias("cols")
     ) \
     .select(
         F.expr('*'),
         F.expr("cols['0']").alias("pos_result"),
         F.expr("cols['1']").alias("pos_begin"),
         F.expr("cols['2']").alias("token_begin")
     ) \
     .filter((F.col('pos_result') == 'NN') & (F.col('pos_begin') == F.col('token_begin'))) \
     .drop("cols", "pos_result", "pos_begin", "token_begin")
 def filter_by_ner(self, ner_col, lemma_col):
     self.data = self.data.select(
             F.expr("*"),
             F.explode(
                 F.arrays_zip(
                     ner_col + '.result',
                     ner_col + '.begin',
                     lemma_col + '.begin',
                 )
             ).alias("cols")
     ) \
     .select(
         F.expr('*'),
         F.expr("cols['0']").alias("ner_result"),
         F.expr("cols['1']").alias("ner_begin"),
         F.expr("cols['2']").alias("token_begin")
     ) \
     .filter((F.col('ner_result') == 'O') & (F.col('ner_begin') == F.col('token_begin'))) \
     .drop("cols", "ner_result", "ner_begin", "token_begin")
Example #13
def spk_descr(data):

    spk_df = spark.read.json(data)

    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"), \
                          spk_df.animals.description.alias("description"))

    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", \
                                                                     "description"))) \
        .select("filtered.id", \
                "filtered.description")

    # Casting values to correct datatype
    cleaned_cast = cleaned.select(cleaned.id.cast(IntegerType()),
                                  cleaned.description.cast(StringType()))

    return cleaned_cast
Example #14
def zip_explode_cols(df: pyspark.sql.dataframe.DataFrame,
                     cols: list,
                     result_name: str,
                     rename_fields: Dict[str, str] = None):
    """
    Explode multiple equally-sized arrays into one struct by zipping all arrays into one `ArrayType[StructType]`

    Args:
        df: The input Spark DataFrame
        cols: The array columns that should be zipped
        result_name: The name of the column that will contain the newly created struct
        rename_fields: dictionary mapping column names to new struct field names.
            Used to rename columns in the newly created struct.

    Returns: `df.withColumn(result_name, zip(explode(cols)))`

    """
    df = df.withColumn(result_name, f.explode(f.arrays_zip(*cols)))

    if rename_fields:  # create schema of new struct by simply renaming the top-level struct fields
        old_schema: t.StructType = df.schema[result_name].dataType

        # rename field if field is in `old_schema.fieldNames()`
        new_field_names = [
            rename_fields[field] if field in rename_fields else field
            for field in old_schema.fieldNames()
        ]

        new_schema = t.StructType([
            t.StructField(name, field.dataType)
            for name, field in zip(new_field_names, old_schema.fields)
        ])

        df = df.withColumn(result_name, f.col(result_name).cast(new_schema))

        # # old method using withColumn and a new struct; breaks with PySpark 3.0
        # df = df.withColumn(target_struct, f.struct(*[
        #     f.col(target_struct + "." + actualName).alias(targetName)
        #     for targetName, actualName in zip(target_colnames, df.schema[target_struct].dataType.fieldNames())
        # ]))

    return df
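A hedged usage sketch for zip_explode_cols showing the rename_fields mapping; the example DataFrame and the f/t/Dict imports the function relies on are assumptions here:

from typing import Dict

import pyspark
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([1, 2], ["a", "b"])], ["ids", "labels"])

out = zip_explode_cols(df, cols=["ids", "labels"], result_name="pair",
                       rename_fields={"ids": "id", "labels": "label"})
out.select("pair.id", "pair.label").show()  # two rows: (1, "a") and (2, "b")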
Example #15
def read_and_clean_json(spark, input_json_path):
    '''
    Reads json file with products data
    Explodes nested columns
    Selects main columns to dataframe
    Replaces wrong values of character '&'(ampersand)
    Replaces values of character '$' to cast price column to double
    Replaces wrong null values with None
    Casts price column to double
    '''
    df = spark.read.json(input_json_path)
    df = df.withColumn("tmp", arrays_zip("category", "description", "image")) \
        .withColumn("tmp", explode("tmp")) \
        .select("asin", col("tmp.category"), col("tmp.description"), col("tmp.image"), "title", "brand", "main_cat", "price") \
        .withColumn('brand', translate('brand', '&amp;', '&')) \
        .withColumn('category', translate('category', '&amp;', '&')) \
        .withColumn('main_cat', translate('main_cat', '&amp;', '&')) \
        .withColumn('price', regexp_replace('price', '\$', '')) \
        .replace(['null', '', 'None'], None) \
        .withColumn('price', col('price').cast("double"))
    return df
Example #16
def extract_top_keywords(posts, vocabulary, n_keywords=10):
    """Given word count (Count Vectorizer) output (as "features" column) -
    extracts out the vocabulary index of the 10 keywords with highest TF-IDF (for each post)."""
    def extract_keys_from_vector(vector):
        return vector.indices.tolist()

    def extract_values_from_vector(vector):
        return vector.values.tolist()

    extract_keys_from_vector_udf = udf(
        lambda vector: extract_keys_from_vector(vector),
        ArrayType(IntegerType()))
    extract_values_from_vector_udf = udf(
        lambda vector: extract_values_from_vector(vector),
        ArrayType(DoubleType()))

    idf_udf = array_transform(idf_wiki)
    vocab_dict = {k: v for k, v in enumerate(vocabulary)}

    def ix_to_word(ix):
        return vocab_dict[ix]

    vocab_udf = array_transform(ix_to_word)

    posts = posts.withColumn("word_ix",
                             extract_keys_from_vector_udf("features"))
    posts = posts.withColumn("word_count",
                             extract_values_from_vector_udf("features"))
    posts = posts.withColumn('words', vocab_udf(col('word_ix')))
    posts = posts.withColumn("idf", idf_udf(col("words")))
    posts = posts.withColumn(
        "zipped_truncated",
        slice(sort_array(arrays_zip("idf", "words"), asc=False), 1,
              n_keywords))

    take_second = udf(lambda rows: [row[1] for row in rows],
                      ArrayType(StringType()))
    posts = posts.withColumn("top_keywords", take_second("zipped_truncated"))

    return posts['CreationDate', 'top_keywords', 'Tags', 'ParentId']
Example #17
def get_metrics(data):

    print("Getting labels...")
    data = bert_annotator.transform(data)

    print("Loading model...")
    model = NerDLModel.load(model_path)\
        .setInputCols(["sentence", "token", 'embeddings'])\
        .setOutputCol("ner")

    data.show()
    predictions = model.transform(data)

    df = predictions.select(F.explode(F.arrays_zip('token.result','label.result','ner.result')).alias("cols")) \
    .select(F.expr("cols['1']").alias("target"),
            F.expr("cols['2']").alias("prediction"))

    correct = df.filter(df.target == df.prediction).count()
    total = df.select("target").count()

    accuracy = 100 * correct / total
    print("Accuracy = {}".format(accuracy))
Example #18
def spk_status(data):
    spk_df = spark.read.json(data)

    # Selects data from JSON to normalize
    filtered = spk_df.select(spk_df.animals.id.alias("id"), \
                          spk_df.animals.status.alias("status"), \
                          spk_df.animals.status_changed_at.alias("status_changed_at"))

    # Result is cleaned dataframe
    cleaned = filtered.withColumn("filtered", F.explode(F.arrays_zip("id", \
                                                                     "status", \
                                                                     "status_changed_at"))) \
        .select("filtered.id", \
                "filtered.status", \
                "filtered.status_changed_at")

    # Casting values to correct datatype
    cleaned_cast = cleaned.select(
        cleaned.id.cast(IntegerType()), cleaned.status.cast(StringType()),
        cleaned.status_changed_at.cast(TimestampType()))

    return cleaned_cast
Example #19
    def transfLineitems(ds):
        invoiceDS: DataFrame = ds

        invoiceDS = invoiceDS.withColumn("TypeOfService", col("_LineItems._Description")) \
            .withColumn("ServiceAmount", col("_LineItems._TotalPriceNetto"))

        if invoiceDS.schema["TypeOfService"].dataType == StringType():
            invoiceDS = invoiceDS.withColumn("TypeOfService", split(col("TypeOfService"), ",")) \
                .withColumn("ServiceAmount", split(col("ServiceAmount"), ","))

        # invoiceDS.printSchema()
        # import sys
        # sys.exit(3)

        invoiceDS = invoiceDS.withColumn("ServiceNPrice", arrays_zip(col("TypeOfService"), col("ServiceAmount"))) \
            .withColumn("ServiceNPrice", explode_outer(col("ServiceNPrice")))

        invoiceDS = invoiceDS.withColumn("TypeOfService", when(col("ServiceNPrice.TypeOfService").isNotNull(),
                                         regexp_replace(col("ServiceNPrice.TypeOfService"), "[:;]", "")).otherwise(""))\
            .withColumn("ServiceAmount", col("ServiceNPrice.ServiceAmount")) \
            .filter(~(col('TypeOfService').like("Total Amount%")))

        return invoiceDS
Example #20
        "ETL Demo").enableHiveSupport().getOrCreate()
    spark.sql("create database if not exists spotify_demo;")
    spark.sql("use spotify_demo;")
    """
    Notice that the following ETL flow just serves as an example and is not optimal. Also, better execution time can be achieved by changing the order of different logical steps.
    """
    # Load track data and merge
    tracks = merge_tracks_sources(tracks_160k_etl(args),
                                  tracks_1200k_etl(args))
    # preview
    # print("tracks", tracks.count())
    # tracks.show(10, truncate=10)

    # Build artist table
    track_artists = tracks.select(
        "id",
        from_json("artists", ArrayType(StringType(), containsNull=False),
                  options={"allowBackslashEscapingAnyCharacter": True}).alias("artists"),
        from_json("artist_ids", ArrayType(StringType(), containsNull=False)).alias("artist_ids"))\
        .select("id", explode(arrays_zip("artists", "artist_ids")).alias("tuple"))\
        .select("id", col("tuple.artists").alias("a_name"), col("tuple.artist_ids").alias("a_spotify_id"))
    artists = build_dictionary_table(track_artists,
                                     "a_spotify_id",
                                     "a_name",
                                     index_column="a_id")
    artist_genres = genre_etl(args)
    artists = artists.join(artist_genres,
                           artists.a_name == artist_genres.artists,
                           'left').select("a_id", "a_spotify_id", "a_name",
                                          col("genre_list").alias("genres"))
    # preview
    print("artists", artists.count())
    artists.show()
    artists.write.saveAsTable("artist", format="orc",
                              mode="error")  # write_mode="ignore"
Example #21
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(),
                           "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (pyspark.sql.SparkSession.builder.master(
        f"local[{os.cpu_count() - 1}]").config(
            "spark.eventLog.enabled",
            "true").config("spark.eventLog.dir", "/spark-events").config(
                "spark.sql.execution.arrow.pyspark.enabled", "true").config(
                    "spark.driver.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config(
                    "spark.executor.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config("spark.driver.memory", f"{mem_gib}g").config(
                    "spark.history.fs.logDirectory", "/spark-events").config(
                        "spark.sql.execution.arrow.maxRecordsPerBatch",
                        "1").config("spark.jars", tar_jar).config(
                            "spark.local.dir",
                            "/mnt/disks/spark-scratch/").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)

    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"),
         F.col("text_document_id"),
         F.col("licenseurl")])

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_"
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            create_wav_scp(posix_wav_scp, training_sample_rows,
                           FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o  decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 works just fine

    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:

            posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                       temp_dir_name, ctm_out_dir)
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                    temp_dir_name, FLAGS.work_dir)
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir,
                                                 num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
  /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
  --frame-subsampling-factor=3 \
  --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
  --max-active=7000 \
  --beam=15.0 \
  --lattice-beam=0.1 \
  --acoustic-scale=1.0 \
  --cuda-decoder-copy-threads=2 \
  --cuda-worker-threads={os.cpu_count() // num_gpus} \
  --segmentation=true \
  --cuda-use-tensor-cores=true \
  --max-batch-size=150 \
  --num-channels=250 \
  --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
  --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
  /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
  /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
  scp,p:{posix_wav_scp_shard} \
  {posix_ctm_out_dir}
  """
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir,
                                  "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer",
             "--output-cer"])  # , "--output-sws", "--output-levenshtein"])

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt")
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path,
                                      15_000, 3_000)

        ctm_df = (spark.read.format("binaryFile").option(
            "pathGlobFilter", "*.ctm").load(ctm_out_dir))
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(
                F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content",
                                   fix_text_udf(F.col("content"))).drop(
                                       "path", "length", "modificationTime",
                                       "content")

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path,
                                          training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript",
            normalize_english_text_udf(transcripts_df.transcript))
        ctm_df = ctm_df.join(transcripts_df,
                             ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()

        sys.stdout.flush()

        alignments_df.write.mode("overwrite").format("json").save(
            alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)

        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            &
            (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )

        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl")))
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl")))
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier
                    == filtered_licenseurl_df.identifier)
                & (alignments_df.text_document_id
                   == filtered_licenseurl_df.text_document_id),
                how="inner",
            )
            alignments_df = alignments_df.drop(
                filtered_licenseurl_df.identifier).drop(
                    filtered_licenseurl_df.text_document_id)

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType([
                        T.StructField("cer", T.FloatType()),
                        T.StructField("end_ms", T.LongType()),
                        T.StructField("label", T.StringType()),
                        T.StructField("start_ms", T.LongType()),
                        T.StructField("wer", T.FloatType()),
                        T.StructField("duration_ms", T.LongType()),
                    ]))),
        )

        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment:
                (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType([
                    T.StructField("cer", T.ArrayType(T.FloatType())),
                    T.StructField("end_ms", T.ArrayType(T.LongType())),
                    T.StructField("label", T.ArrayType(T.StringType())),
                    T.StructField("start_ms", T.ArrayType(T.LongType())),
                    T.StructField("wer", T.ArrayType(T.FloatType())),
                    T.StructField("duration_ms", T.ArrayType(T.LongType())),
                ])),
        )

        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(
                F.expr(
                    "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"
                )) / 1000.0 / 60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1] # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df,
                                                 audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someat untars the tar files, everything will be dumped into one directory
        # Remove "." becasue it has special meaning in webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not striclty necessary, but convenient)
        name = F.concat(F.col("identifier"), F.lit("/"),
                        F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # Sanity check on the generated names (assumes the joined alignments_audio_df here)
        name_rows = alignments_audio_df.select(name.alias("name")).collect()
        for row in name_rows:
            assert len(row.name) < 4096
            for chunk in row.name.split("/"):
                assert len(chunk) < 256
        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        # )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(
                F.arrays_zip("aligned_chunks.audio_name",
                             "aligned_chunks.audio"))).select(
                                 "col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias(
                    "duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir,
                                          "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir,
                                "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition",
            number_of_rows)
        spark2.conf.set("spark.sql.files.minPartitionNum",
                        FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100)

        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards,
                                             F.col("key"))
        # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(
            tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)
        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys

        # sys.stdout.flush()
        # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir,
                                            "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )))
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) /
             1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df,
                F.col("audio_filepath") == F.col("key")).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir)

    single_manifest_dir = os.path.join(FLAGS.work_dir,
                                       "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")
    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(
            single_manifest_dir)

        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(
            single_tar_dir)
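The next snippet (the explode/flatten/arrays_zip walkthrough below) operates on a df that is not defined in the excerpt. A hedged setup sketch with an invented name/subjects DataFrame and the imports the snippet relies on:

from functools import reduce

from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import arrays_zip, col, explode, flatten

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("James", [["Java", "Scala"], ["Spark", "Java"]]),
     ("Anna", [["C++", "PHP"], ["Python"]])],
    ["name", "subjects"],
)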
df.show(truncate=False)

df_explode = df.select(df.name,
                       explode(df.subjects).alias("Exploded_Subjects"))

df_explode.printSchema()
df_explode.show(truncate=False)

df_flatten = df.select(df.name,
                       flatten(df.subjects).alias("Flattened_Subjects"))

df_flatten.printSchema()
df_flatten.show(truncate=False)

df_flatten_zip = df_flatten \
               .withColumn("tmp", arrays_zip("Flattened_Subjects"))  \
               .withColumn("tmp", explode("tmp")) \
               .select("name", col("tmp.Flattened_Subjects"))

df_flatten_zip.printSchema()
df_flatten_zip.show(truncate=False)
'''The above is not performant; the solution below is preferable if the array size is known.'''
# Length of array
n = 5

# For legacy Python you'll need a separate function
# in place of method accessor
reduce(DataFrame.unionAll,
       (df_flatten.select("name",
                          col("Flattened_Subjects").getItem(i))
        for i in range(n))).toDF("name", "Subjects").show()
Example #23
def getWeightedAgg(all_data, key=['hotelid'], prefix='hotel'):
    """calculate weighter ctr, br and dtob for a dataframe
  """

    global multiplication_array, sum_weights, n_dates

    col_list = ['searchdate'] + key
    df_hotel = all_data.groupby(col_list).agg(
        F.sum('bookingFlag').alias('booking_count'),
        F.sum('detailsFlag').alias('details_count'),
        F.count('fprice').alias('impression_count'))

    # calculate the metrics to be weighted
    col_list_1 = col_list + [
        (F.col('details_count') / F.col('impression_count')).alias('raw_ctr'),
        (F.col('booking_count') / F.col('impression_count')).alias('raw_br'),
        (F.col('booking_count') / F.col('details_count')).alias('raw_dtob')
    ]
    df_hotel = df_hotel.select(*col_list_1)

    # fill missing dates for all keys
    dates_df = df_hotel.select('searchdate').drop_duplicates()
    hotel_dates_df = df_hotel.select(
        *key).drop_duplicates().crossJoin(dates_df)
    n_dates = dates_df.count()

    df_hotel = hotel_dates_df.join(df_hotel, on=col_list, how='left').fillna(0)

    # collect metrics as a list to pass through udf
    df_hotel = df_hotel.groupby(key).agg(
        F.collect_list('raw_ctr').alias('raw_ctr'),
        F.collect_list('raw_br').alias('raw_br'),
        F.collect_list('raw_dtob').alias('raw_dtob'),
        F.collect_list('searchdate').alias('searchdate'))

    # compute the global weighting matrix and sum across rows
    multiplication_array = np.arange(1, n_dates + 1)
    multiplication_array = np.array(
        [multiplication_array - i for i in range(0, n_dates)])
    multiplication_array = 8 - multiplication_array
    multiplication_array = np.where(multiplication_array <= 7,
                                    multiplication_array, 0)
    multiplication_array = np.where(multiplication_array >= 1,
                                    multiplication_array, 0)
    sum_weights = multiplication_array.sum(axis=0)

    # calculate the rolling weighted average of metrics
    select_col = [F.col(i) for i in key] + [
        weightedAverageUdf(F.col('searchdate'),
                           F.col('raw_ctr')).alias('ctr_array'),
        weightedAverageUdf(F.col('searchdate'),
                           F.col('raw_br')).alias('br_array'),
        weightedAverageUdf(F.col('searchdate'),
                           F.col('raw_dtob')).alias('dtob_array')
    ]
    weighted_average_df = df_hotel.select(*select_col)

    select_col = [F.col(i) for i in key] + [
        F.arrays_zip(
            F.col('ctr_array').getItem(0).alias('searchdate'),
            F.col('ctr_array').getItem(1).alias('ctr'),
            F.col('br_array').getItem(1).alias('br'),
            F.col('dtob_array').getItem(1).alias('dtob')).alias('zip_averages')
    ]
    weighted_average_df = weighted_average_df.select(*select_col)

    select_col = [F.col(i) for i in key
                  ] + [F.explode('zip_averages').alias('averages')]
    weighted_average_df = weighted_average_df.select(*select_col)

    select_col = [F.col(i) for i in key] + [
        F.col('averages').getItem('0').alias('searchdate'),
        F.col('averages').getItem('1').alias(prefix + '_ctr'),
        F.col('averages').getItem('2').alias(prefix + '_br'),
        F.col('averages').getItem('3').alias(prefix + '_dtob')
    ]
    weighted_average_df = weighted_average_df.select(*select_col)

    return weighted_average_df
Example #24
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    args = getargs(parser)

    # Initialise Spark session
    spark = init_sparksession(
        name="index_archival_{}_{}".format(args.index_table, args.night),
        shuffle_partitions=2
    )

    # The level here should be controlled by an argument.
    logger = get_fink_logger(spark.sparkContext.appName, args.log_level)

    # debug statements
    inspect_application(logger)

    # Connect to the aggregated science database
    path = '{}/science/year={}/month={}/day={}'.format(
        args.agg_data_prefix,
        args.night[:4],
        args.night[4:6],
        args.night[6:8]
    )
    df = load_parquet_files(path)

    # construct the index view
    index_row_key_name = args.index_table
    columns = index_row_key_name.split('_')
    names = [col(i) for i in columns]
    index_name = '.' + columns[0]

    # Drop partitioning columns
    df = df.drop('year').drop('month').drop('day')

    # Load column names to use in the science portal
    cols_i, cols_d, cols_b = load_science_portal_column_names()

    # Assign each column to a specific column family
    cf = assign_column_family_names(df, cols_i, cols_d, cols_b)

    # Restrict the input DataFrame to the subset of wanted columns.
    if 'upper' in args.index_table:
        df = df.select(
            'objectId',
            'prv_candidates.jd',
            'prv_candidates.fid',
            'prv_candidates.magpsf',
            'prv_candidates.sigmapsf',
            'prv_candidates.diffmaglim'
        )
    else:
        df = df.select(cols_i + cols_d + cols_b)

    # Create and attach the rowkey
    df, _ = attach_rowkey(df)

    common_cols = [
        'objectId', 'candid', 'publisher', 'rcid', 'chipsf', 'distnr',
        'ra', 'dec', 'jd', 'fid', 'nid', 'field', 'xpos', 'ypos', 'rb',
        'ssdistnr', 'ssmagnr', 'ssnamenr', 'jdstarthist', 'jdendhist', 'tooflag',
        'sgscore1', 'distpsnr1', 'neargaia', 'maggaia', 'nmtchps', 'diffmaglim',
        'magpsf', 'sigmapsf', 'magnr', 'sigmagnr', 'magzpsci', 'isdiffpos',
        'cdsxmatch',
        'roid',
        'mulens',
        'snn_snia_vs_nonia', 'snn_sn_vs_all', 'rf_snia_vs_nonia',
        'classtar', 'drb', 'ndethist', 'rf_kn_vs_nonkn', 'tracklet'
    ]

    if columns[0].startswith('pixel'):
        nside = int(columns[0].split('pixel')[1])

        df_index = df.withColumn(
            columns[0],
            ang2pix(
                df['ra'],
                df['dec'],
                lit(nside)
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + ['objectId']
        )
    elif columns[0] == 'class':
        df_index = df.withColumn(
            'class',
            extract_fink_classification(
                df['cdsxmatch'],
                df['roid'],
                df['mulens'],
                df['snn_snia_vs_nonia'],
                df['snn_sn_vs_all'],
                df['rf_snia_vs_nonia'],
                df['ndethist'],
                df['drb'],
                df['classtar'],
                df['jd'],
                df['jdstarthist'],
                df['rf_kn_vs_nonkn'],
                df['tracklet']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )
    elif columns[0] == 'ssnamenr':
        # Flag only objects with likely counterpart in MPC
        df_index = df\
            .filter(df['roid'] == 3)\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'tracklet':
        # For data < 2021-08-10, no tracklet means ''
        # For data >= 2021-08-10, no tracklet means 'null'
        df_index = df\
            .filter(df['tracklet'] != 'null')\
            .filter(df['tracklet'] != '')\
            .select(
                [
                    concat_ws('_', *names).alias(index_row_key_name)
                ] + common_cols
            )
    elif columns[0] == 'upper':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only upper limits
        df_index = df_ex.filter(~df_ex['magpsf'].isNotNull())
        # drop NaN columns
        df_index = df_index.drop(*['magpsf', 'sigmapsf'])
    elif columns[0] == 'uppervalid':
        # This case is the same as the main table
        # but we keep only upper limit measurements.
        index_row_key_name = 'objectId_jd'
        # explode
        df_ex = df.withColumn(
            "tmp",
            arrays_zip("magpsf", "sigmapsf", "diffmaglim", "jd", "fid")
        ).withColumn("tmp", explode("tmp")).select(
            concat_ws('_', 'objectId', 'tmp.jd').alias(index_row_key_name),
            "objectId",
            col("tmp.jd"),
            col("tmp.fid"),
            col("tmp.magpsf"),
            col("tmp.sigmapsf"),
            col("tmp.diffmaglim")
        )

        # take only valid measurements from the history
        df_index = df_ex.filter(df_ex['magpsf'].isNotNull())
    elif columns[0] == 'tns':
        with open('{}/tns_marker.txt'.format(args.tns_folder)) as f:
            tns_marker = f.read().replace('\n', '')

        pdf_tns = download_catalog(os.environ['TNS_API_KEY'], tns_marker)

        # Filter TNS confirmed data
        f1 = ~pdf_tns['type'].isna()
        pdf_tns_filt = pdf_tns[f1]

        pdf_tns_filt_b = spark.sparkContext.broadcast(pdf_tns_filt)

        @pandas_udf(StringType(), PandasUDFType.SCALAR)
        def crossmatch_with_tns(objectid, ra, dec):
            # TNS
            pdf = pdf_tns_filt_b.value
            ra2, dec2, type2 = pdf['ra'], pdf['declination'], pdf['type']

            # create catalogs
            # np.float was removed in NumPy >= 1.24; plain float is equivalent
            catalog_ztf = SkyCoord(
                ra=np.array(ra, dtype=float) * u.degree,
                dec=np.array(dec, dtype=float) * u.degree
            )
            catalog_tns = SkyCoord(
                ra=np.array(ra2, dtype=float) * u.degree,
                dec=np.array(dec2, dtype=float) * u.degree
            )

            # cross-match: for each TNS source, find the nearest ZTF alert
            idx, d2d, d3d = catalog_tns.match_to_catalog_sky(catalog_ztf)

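            # map each TNS source to the objectId/ra/dec of its nearest ZTF alert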
            sub_pdf = pd.DataFrame({
                'objectId': objectid.values[idx],
                'ra': ra.values[idx],
                'dec': dec.values[idx],
            })

            # cross-match the other way: for each ZTF alert, find the nearest TNS source
            idx2, d2d2, d3d2 = catalog_ztf.match_to_catalog_sky(catalog_tns)

            # keep only matches closer than 1.5 arcsec
            sep_constraint2 = d2d2.degree < 1.5 / 3600

            sub_pdf['TNS'] = [''] * len(sub_pdf)
            sub_pdf.loc[idx2[sep_constraint2], 'TNS'] = type2.values[idx2[sep_constraint2]]

            to_return = objectid.apply(
                lambda x: '' if x not in sub_pdf['objectId'].values
                else sub_pdf['TNS'][sub_pdf['objectId'] == x].values[0]
            )

            return to_return

        df = df.withColumn(
            'tns',
            crossmatch_with_tns(
                df['objectId'],
                df['ra'],
                df['dec']
            )
        ).select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols + ['tns']
        ).cache()
        df_index = df.filter(df['tns'] != '').drop('tns')
        # trigger the cache - note: the cache might be a killer for LSST...
        n = df_index.count()
        print('TNS objects: {}'.format(n))
    else:
        df_index = df.select(
            [
                concat_ws('_', *names).alias(index_row_key_name)
            ] + common_cols
        )

    # construct the hbase catalog for the index table
    hbcatalog_index = construct_hbase_catalog_from_flatten_schema(
        df_index.schema,
        args.science_db_name + index_name,
        rowkeyname=index_row_key_name,
        cf=cf
    )

    # Push index table
    df_index.write\
        .options(catalog=hbcatalog_index, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()

    # Construct the schema row (the row key column is renamed in place)
    schema_row_key_name = 'schema_version'
    df_index = df_index.withColumnRenamed(
        index_row_key_name,
        schema_row_key_name
    )

    df_index_schema = construct_schema_row(
        df_index,
        rowkeyname=schema_row_key_name,
        version='schema_{}_{}'.format(fbvsn, fsvsn))

    # construct the hbase catalog for the schema
    hbcatalog_index_schema = construct_hbase_catalog_from_flatten_schema(
        df_index_schema.schema,
        args.science_db_name + index_name,
        rowkeyname=schema_row_key_name,
        cf=cf)

    # Push the data using the shc connector
    df_index_schema.write\
        .options(catalog=hbcatalog_index_schema, newtable=50)\
        .format("org.apache.spark.sql.execution.datasources.hbase")\
        .save()
Ejemplo n.º 25
0
def main(sc):
    """
    Main processing function
    Read in data from PostgreSQL transaction table
    Perform reverse lookup for vin transactions and return input Bitcoin values and addresses
    Perform disjoint set (i.e., union find) algorithm using GraphFrames
    Write out address clustering results to PostgreSQL
    """

    # ---READ IN TRANSACTION DATA AND PERFORM REVERSE TX LOOKUP USING JOINS---

    # create initial SQL query
    # tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {} WHERE height <= 400000 LIMIT 5000000"\
    tx_query = "SELECT txid, height, time, ntx, vin_coinbase, vin_txid, vin_vout, vout_value, vout_n, vout_addresses FROM {}"\
        .format(config.SPARK_CONFIG['PG_TX_TABLE'])

    # read in data from PostgreSQL
    tx_df = spark.read \
        .format(config.SPARK_CONFIG['PG_FORMAT']) \
        .option("url", config.SPARK_CONFIG['PG_URL'] + config.SPARK_CONFIG['PG_PORT'] + "/" + config.SPARK_CONFIG['PG_DB']) \
        .option("user", config.SPARK_CONFIG['PG_USER']) \
        .option("password", config.SPARK_CONFIG['PG_PASSWORD'])\
        .option("query", tx_query) \
        .option("numPartitions", '10000') \
        .load()
    # display_df(tx_df)

    # select priority columns, convert array columns, and zip vin and vout fields
    clean_df = tx_df.withColumn("vin_txid_arr", split(col("vin_txid"), r",\s*")) \
        .withColumn("vin_vout_arr", split(col("vin_vout"), r",\s*")) \
        .withColumn("vin_txid_vout_zip", arrays_zip("vin_txid_arr", "vin_vout_arr")) \
        .withColumn("vout_value_arr", split(col("vout_value"), r",\s*")) \
        .withColumn("vout_n_arr", split(col("vout_n"), r",\s*")) \
        .withColumn("vout_addresses_arr", split(col("vout_addresses"), r",\s*")) \
        .withColumn("vout_value_n_addr_zip", arrays_zip("vout_value_arr", "vout_n_arr", "vout_addresses_arr"))
    # display_df(clean_df)

    # create left side DataFrame
    vin_cols = [
        'txid', 'height', 'time', 'ntx', 'vin_coinbase', 'vin_txid_vout_zip'
    ]
    vin_df = clean_df.select(vin_cols) \
        .withColumn("vin_txid_vout_tup", explode("vin_txid_vout_zip")) \
        .withColumn("vin_txid", col("vin_txid_vout_tup").vin_txid_arr) \
        .withColumn("vin_vout", col("vin_txid_vout_tup").vin_vout_arr) \
        .drop("vin_txid_vout_zip") \
        .drop("vin_txid_vout_tup") \
        .withColumn("left_key", concat(col("vin_txid"), lit("-"), col("vin_vout")))
    # display_df(vin_df)

    # create right side DataFrame
    vout_cols = ['txid', 'vout_value_n_addr_zip']
    vout_df = clean_df.select(vout_cols) \
        .withColumn("vout_value_n_addr_tup", explode("vout_value_n_addr_zip")) \
        .withColumn("vout_value", col("vout_value_n_addr_tup").vout_value_arr) \
        .withColumn("vout_n", col("vout_value_n_addr_tup").vout_n_arr) \
        .withColumn("vout_addr_pre", col("vout_value_n_addr_tup").vout_addresses_arr) \
        .withColumn("vout_addr", translate(col("vout_addr_pre"), '[]', '')) \
        .drop("vout_value_n_addr_zip") \
        .drop("vout_value_n_addr_tup") \
        .drop("vout_addr_pre") \
        .withColumnRenamed("txid", "txid2") \
        .withColumn("right_key", concat(col("txid2"), lit("-"), col("vout_n"))) \
        .drop("txid2")
    # display_df(vout_df)

    # join DataFrames
    join_df = vin_df.join(vout_df, vin_df.left_key == vout_df.right_key, 'left') \
        .drop("left_key") \
        .drop("right_key")
    # display_df(join_df)

    # create temporary table for GraphFrames
    join_df.createOrReplaceTempView("join_result")

    # ---CREATING GRAPHFRAME FOR CONNECTED COMPONENTS ALGORITHM---

    # create vertices DataFrame
    vertices = spark.sql(
        "SELECT DISTINCT(vout_addr) FROM join_result").withColumnRenamed(
            "vout_addr", "id")

    # generate DataFrame with single address connection for all addresses in a given txid group
    w = Window.partitionBy("txid").orderBy("vout_addr")
    first_by_txid_df = join_df.withColumn("rn", row_number().over(w)).where(col("rn") == 1) \
        .withColumnRenamed("txid", "txid2") \
        .withColumnRenamed("vout_addr", "vout_addr_first") \
        .drop("rn") \
        .drop("height")
    # first_by_txid_df.show(100)

    # join DataFrames
    interim_df = join_df.join(first_by_txid_df,
                              join_df.txid == first_by_txid_df.txid2, 'left')

    # create edges DataFrame
    edges = interim_df.select("vout_addr", "vout_addr_first") \
        .withColumnRenamed("vout_addr", "src") \
        .withColumnRenamed("vout_addr_first", "dst") \
        .na.drop()

    # create GraphFrame
    g = GraphFrame(vertices, edges)

    # set checkpoint directory in S3 (required by connectedComponents)
    sc.setCheckpointDir(config.SPARK_CONFIG['S3_CHECKPOINT'])

    # run connected components
    clst_result = g.connectedComponents()
    clst_result.show(100, truncate=False)

    # # ---FOR TESTING ONLY--- show result DataFrame for a specific block to verify correct results
    # clst_result.registerTempTable("clst_table")
    # view_df = spark.sql("SELECT * FROM clst_table ORDER BY clst_table.component")
    # view_df.show(1000, truncate=False)

    # write out to PostgreSQL
    write_clst_to_pg(clst_result)
Ejemplo n.º 26
0
# Desired result: zip b and c element-wise
# +---+---+----+------+
# |  a|  b|  c |  d   |
# +---+---+----+------+
# |  1|  1|  7 |  foo |
# |  1|  2|  8 |  foo |
# |  1|  3|  9 |  foo |
# +---+---+----+------+

# Naive approach (exploding each array column separately) does NOT produce the
# result above: it creates the cross product, curr_row_count x items_in_b x
# items_in_c rows; here 1 x 3 x 3 = 9 rows instead of 3.
df_exploded = df.withColumn('b', explode('b')) \
    .withColumn("c", explode(col("c")))
df_exploded.show(truncate=False)

# Way 1: zip the arrays into an array of structs, then explode once
from pyspark.sql.functions import arrays_zip, array

df = df.withColumn("merged_col", arrays_zip("b", "c"))
df.show(truncate=False)
df.withColumn("added_array", concat(col("b"), col("c"))).show()
flattened_df = df.withColumn("exploded_merged_col", explode(col("merged_col")))
flattened_df.show(truncate=False)
flattened_df.printSchema()
flattened_df.select(col("a"), col("exploded_merged_col.b"),
                    col("exploded_merged_col.c"), col("d")).show()

# Way 2: build the zipped structs manually. Useful when you need to manipulate
# the columns while zipping, or to handle an arbitrary number of columns and
# items; otherwise use arrays_zip(c1, c2) as shown above.

from pyspark.sql import Row

df = spark.createDataFrame([Row(a=1, b=[1, 2, 3], c=[7, 8, 9], d='foo')])


def zip_and_explode(*colnames, n):
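    # The body is cut off in the original snippet. A minimal sketch of the
    # usual implementation of this helper (an assumption, not necessarily the
    # author's exact code): build an array of n structs, where struct i takes
    # element i from every column, then explode that array into rows.
    # Requires: from pyspark.sql.functions import array, col, explode, struct
    return explode(array(*[
        struct(*[col(c).getItem(i).alias(c) for c in colnames])
        for i in range(n)
    ]))

# hypothetical usage, with n = the common array length:
# df.withColumn("tmp", zip_and_explode("b", "c", n=3)) \
#     .select("a", col("tmp.b"), col("tmp.c"), "d").show()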
Ejemplo n.º 27
0
    col("seatRows.features")).withColumn("seat", explode("seatRows.seats"))

# COMMAND ----------

seats.count()

# COMMAND ----------

seats.select("seat.seatId").take(1)

# COMMAND ----------

seat_info = seats.withColumn(
    "info",
    explode(
        F.arrays_zip(col("seat.seatId"), col("seat.features"),
                     col("seat.pricingReferences"))))

# COMMAND ----------

seat_info.select("info").take(1)

# COMMAND ----------

seat_info.count()

# COMMAND ----------

# MAGIC %md
# MAGIC #####

# COMMAND ----------
Ejemplo n.º 28
0
    def test_alignments_filter(self):
        work_dir = "gs://the-peoples-speech-west-europe/forced-aligner/cuda-forced-aligner/output_work_dir_5b/output_work_dir_5b"
        alignments_dir = os.path.join(work_dir, "alignments_json_jul_28")
        spark = self.spark
        alignments_df = spark.read.json(alignments_dir)
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
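        # re-zip the alignment arrays together with the computed duration_ms so
        # that the per-alignment filter below can access alignment.duration_ms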
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType(
                        [
                            T.StructField("cer", T.FloatType()),
                            T.StructField("end_ms", T.LongType()),
                            T.StructField("label", T.StringType()),
                            T.StructField("start_ms", T.LongType()),
                            T.StructField("wer", T.FloatType()),
                            T.StructField("duration_ms", T.LongType()),
                        ]
                    )
                )
            ),
        )
        alignments_df = alignments_df.drop("duration_ms")

        max_duration_ms = 20_000
        max_cer = 36.0
        min_duration_ms = 1_000

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment: (alignment.duration_ms < max_duration_ms)
                & (alignment.cer < max_cer)
                & (alignment.duration_ms > min_duration_ms),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                # Is this the fix?
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType(
                    [
                        T.StructField("cer", T.ArrayType(T.FloatType())),
                        T.StructField("end_ms", T.ArrayType(T.LongType())),
                        T.StructField("label", T.ArrayType(T.StringType())),
                        T.StructField("start_ms", T.ArrayType(T.LongType())),
                        T.StructField("wer", T.ArrayType(T.FloatType())),
                        T.StructField("duration_ms", T.ArrayType(T.LongType())),
                    ]
                )
            ),
        )
        abc = alignments_df.select(
            F.sum(F.expr("aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"))
            / 1000.0
            / 60.0
            / 60.0
        ).collect()
        print("GALVEZ:max_duration_ms=", max_duration_ms)
        print("GALVEZ:max_cer=", max_cer)
        print("GALVEZ:min_duration_ms=", min_duration_ms)
        print("GALVEZ:total number of hours=", abc)
    # fitting data to schema
    df = dsraw.select(F.from_json("value",
                                  schema).alias("data")).select("data.*")

    # creates an offset: adds lagged columns lat2, long2 and time2 (each array shifted by one position, with a leading null), which are needed to calculate distance and speed between consecutive points
    d = df.withColumn("arr_size", F.size("latitude"))\
        .withColumn("lat2", F.expr("slice(latitude,1,arr_size-1)"))\
        .withColumn("long2", F.expr("slice(longitude,1,arr_size-1)"))\
        .withColumn("time2", F.expr("slice(timestamp,1,arr_size-1)"))\
        .withColumn("null_col", F.array([F.lit(None)]))\
        .withColumn("lat2", F.concat(F.col("null_col"),F.col("lat2")))\
        .withColumn("long2", F.concat(F.col("null_col"),F.col("long2")))\
        .withColumn("time2", F.concat(F.col("null_col"),F.col("time2")))

    # explodes all array columns
    d1 = d.withColumn("new", F.arrays_zip("heart_rate","timestamp","latitude","longitude","lat2","long2","time2"))\
            .withColumn("new", F.explode("new"))\
            .select("userId","id",
                    F.col("new.heart_rate").alias("heart_rate"),
                    F.col("new.timestamp").alias("timestamp"),
                    F.col("new.latitude").alias("lat"),
                    F.col("new.longitude").alias("long"),
                    F.col("new.lat2").alias("lat2"),
                    F.col("new.long2").alias("long2"),
                    F.col("new.time2").alias("time2"))

    # haversine formula: distance in miles between consecutive points (3956 is the Earth radius in miles), then speed = distance / elapsed time in hours (assuming timestamps in seconds)
    d2 = d1.withColumn("distance", 3956 *(2 * F.asin(F.sqrt(F.sin((F.radians("lat") - F.radians("lat2"))/2)**2
                                    + F.cos(F.radians("lat")) * F.cos(F.radians("lat2")) * F.sin((F.radians("long")
                                    - F.radians("long2"))/2)**2))))\
           .withColumn("speed", F.col("distance")/((F.col("timestamp") - F.col("time2"))/3600))
Ejemplo n.º 30
0
# merge columns
df = df.withColumn('col', concat('col1', 'col2'))

# concat_ws takes a separator as its first argument; concat does not
df = df.withColumn('col', concat_ws('-', 'col1', 'col2'))



##########################
# explode multiple columns
##########################

# arrays_zip only works in Spark >= 2.4
from pyspark.sql.functions import arrays_zip, col
(df
    .withColumn("tmp", arrays_zip("b", "c"))
    .withColumn("tmp", explode("tmp"))
    .select("a", col("tmp.b"), col("tmp.c"), "d"))


# works for spark <= 2.3
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType
from pyspark.sql.functions import col, udf, explode

zip_ = udf(
        lambda x, y: list(zip(x, y)),
        ArrayType(StructType([
            StructField("first", IntegerType()),
            StructField("second", IntegerType())
            ]))
        )
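
# A usage sketch for the UDF above, assuming the same df as earlier
# (columns a, b: array<int>, c: array<int>, d): zip, explode, then unpack.
(df
    .withColumn("tmp", explode(zip_("b", "c")))
    .select("a", col("tmp.first").alias("b"), col("tmp.second").alias("c"), "d")
    .show())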