Example #1
    def get_column_spec(self, source_df: Optional[DataFrame],
                        current_column: Optional[Column]) -> Column:
        return filter(
            self.column.get_column_spec(source_df=source_df,
                                        current_column=current_column),
            self.func,
        )
    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        if parent_columns is None:
            parent_columns = []
        if current_column is not None:
            parent_columns.append(current_column)

        return filter(
            self.array_field.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ),
            lambda y: exists(
                self.inner_array_field.get_column_spec(
                    source_df=source_df,
                    current_column=y,
                    parent_columns=parent_columns,
                ),
                lambda x: x[self.match_property]
                == self.match_value.get_column_spec(
                    source_df=source_df,
                    current_column=y,
                    parent_columns=parent_columns,
                ),
            ),
        )
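For orientation, here is a minimal, self-contained sketch of the kind of Spark expression such a wrapper produces: pyspark.sql.functions.filter (Spark 3.1+) keeps the array elements that satisfy the lambda. The data and column names below are invented for illustration, not taken from the AutoMapper test suite.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, filter, lit

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(1, [("usual", "id-1"), ("old", "id-2")])],
    "member_id INT, identifier ARRAY<STRUCT<use: STRING, value: STRING>>",
)
# Keep only the identifier structs whose `use` field equals "usual".
df.select(
    filter(col("identifier"), lambda x: x["use"] == lit("usual")).alias("usual_identifiers")
).show(truncate=False)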
Example #3
def test_automapper_filter(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.filter(column=A.column("identifier"),
                     func=lambda x: x["use"] == lit("usual")))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter("b.identifier",
               lambda x: x["use"] == lit("usual")).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)
Example #4
    def get_column_spec(self, source_df: Optional[DataFrame],
                        current_column: Optional[Column]) -> Column:
        return flatten(
            filter(
                self.column.get_column_spec(source_df=source_df,
                                            current_column=current_column),
                lambda x: x.isNotNull(),
            ))
def test_auto_mapper_array_multiple_items_with_null(
    spark_session: SparkSession, ) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(dst2=AutoMapperList(["address1", "address2", None]))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(lit("address1"), lit("address2"), lit(None)).isNotNull(),
            filter(
                coalesce(array(lit("address1"), lit("address2"), lit(None)),
                         array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst2"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][1]
            == "address2")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][1]
            == "address2")
Example #6
    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        return filter(
            self.column.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            ),
            self.func,
        )
def test_automapper_nested_array_filter_simple_with_array(
    spark_session: SparkSession, ) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    environ["LOGLEVEL"] = "DEBUG"

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients").columns(age=A.nested_array_filter(
            array_field=A.column("array1"),
            inner_array_field=A.field("array2"),
            match_property="reference",
            match_value=A.text("bar"),
        ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        filter(
            col("b.array1"),
            lambda y: exists(
                y["array2"], lambda x: x["reference"] == lit("bar").cast(
                    "string")),
        ).alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.printSchema()
    result_df.show(truncate=False)

    assert result_df.count() == 2
    assert result_df.select("age").collect()[0][0] == []
    assert result_df.select(
        "age").collect()[1][0][0]["array2"][0]["reference"] == "bar"
Example #8
    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        if isinstance(self.value, list):  # if the src column is a list then iterate
            inner_array = array(
                *[
                    self.get_value(
                        item,
                        source_df=source_df,
                        current_column=current_column,
                        parent_columns=parent_columns,
                    )
                    for item in self.value
                ]
            )
            return when(
                inner_array.isNotNull(),
                filter(inner_array, lambda x: x.isNotNull() & ~x.eqNullSafe("")),
            )

        # if value is an AutoMapper then ask it for its column spec
        if isinstance(self.value, AutoMapperDataTypeBase):
            child: AutoMapperDataTypeBase = self.value
            inner_child_spec = child.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            )
            return when(
                inner_child_spec.isNotNull(),
                filter(inner_child_spec, lambda x: x.isNotNull() & ~x.eqNullSafe("")),
            )

        raise ValueError(f"value: {self.value} is neither list nor AutoMapper")
def test_automapper_select_one(spark_session: SparkSession) -> None:
    clean_spark_session(spark_session)
    data_dir: Path = Path(__file__).parent.joinpath("./")

    data_json_file: Path = data_dir.joinpath("data.json")

    source_df: DataFrame = spark_session.read.json(str(data_json_file),
                                                   multiLine=True)

    source_df.createOrReplaceTempView("patients")

    source_df.show(truncate=False)

    # Act
    mapper = AutoMapper(view="members", source_view="patients").columns(
        age=A.column("identifier")
        .filter(lambda x: x["system"] == "http://hl7.org/fhir/sid/us-npi")
        .select_one(A.field("_.value"))
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    assert_compare_expressions(
        sql_expressions["age"],
        transform(
            filter(
                "b.identifier",
                lambda x: x["system"] == lit("http://hl7.org/fhir/sid/us-npi"),
            ),
            lambda x: x["value"],
        )[0].alias("age"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    result_df.show(truncate=False)

    assert result_df.select("age").collect()[0][0] == "1730325416"
    assert result_df.select("age").collect()[1][0] == "1467734301"
Example #10
    def get_column_spec(
        self, source_df: Optional[DataFrame], current_column: Optional[Column]
    ) -> Column:
        if isinstance(
            self.value, str
        ):  # if the src column is just string then consider it a sql expression
            return array(lit(self.value))

        if isinstance(self.value, list):  # if the src column is a list then iterate
            return (
                filter(
                    array(
                        *[
                            self.get_value(
                                item, source_df=source_df, current_column=current_column
                            )
                            for item in self.value
                        ]
                    ),
                    lambda x: x.isNotNull(),
                )
                if self.remove_nulls
                else array(
                    *[
                        self.get_value(
                            item, source_df=source_df, current_column=current_column
                        )
                        for item in self.value
                    ]
                )
            )

        # if value is an AutoMapper then ask it for its column spec
        if isinstance(self.value, AutoMapperDataTypeBase):
            child: AutoMapperDataTypeBase = self.value
            return child.get_column_spec(
                source_df=source_df, current_column=current_column
            )

        raise ValueError(f"value: {self.value} is neither str nor AutoMapper")
Example #11
def test_auto_mapper_columns(spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, "Vidal", "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst1="src1",
        dst2=AutoMapperList(["address1"]),
        dst3=AutoMapperList(["address1", "address2"]),
        dst4=AutoMapperList(
            [A.complex(use="usual", family=A.column("last_name"))]),
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    # Assert
    assert len(sql_expressions) == 4
    assert_compare_expressions(sql_expressions["dst1"],
                               lit("src1").alias("dst1"))
    assert_compare_expressions(
        sql_expressions["dst2"],
        when(
            array(lit("address1")).isNotNull(),
            filter(coalesce(array(lit("address1")), array()),
                   lambda x: x.isNotNull()),
        ).alias("dst2"),
    )
    assert_compare_expressions(
        sql_expressions["dst3"],
        when(
            array(lit("address1"), lit("address2")).isNotNull(),
            filter(
                coalesce(array(lit("address1"), lit("address2")), array()),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst3"),
    )
    assert_compare_expressions(
        sql_expressions["dst4"],
        when(
            array(
                struct(
                    lit("usual").alias("use"),
                    col("b.last_name").alias("family"))).isNotNull(),
            filter(
                coalesce(
                    array(
                        struct(
                            lit("usual").alias("use"),
                            col("b.last_name").alias("family"),
                        )),
                    array(),
                ),
                lambda x: x.isNotNull(),
            ),
        ).alias("dst4"),
    )

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert len(result_df.columns) == 5
    assert result_df.where("member_id == 1").select(
        "dst1").collect()[0][0] == "src1"
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            == "address1")

    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][0]
            == "address1")
    assert (result_df.where("member_id == 1").select("dst3").collect()[0][0][1]
            == "address2")

    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [0] == "usual")
    assert (result_df.where("member_id == 1").select("dst4").collect()[0][0][0]
            [1] == "Qureshi")
def test_auto_mapper_list_addition_multiple_items_structs_different_elements_with_schema(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df: DataFrame = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    schema: StructType = StructType(
        [
            StructField("id", StringType(), True),
            StructField("c", StringType(), True),
            StructField("b", StringType(), True),
        ]
    )

    # Act
    mapper = AutoMapper(
        view="members",
        source_view="patients",
        keys=["member_id"],
        drop_key_columns=False,
    ).columns(
        dst2=AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(
                    id_=A.column("first_name"), b=A.column("last_name")
                ),
            ],
            include_null_properties=True,
            children_schema=schema,
        )
        + AutoMapperList(
            [
                AutoMapperDataTypeComplexBase(
                    id_=A.column("first_name"), c=A.column("last_name")
                ),
            ],
            include_null_properties=True,
            children_schema=schema,
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    struct1 = struct(
        col("b.first_name").alias("id"),
        lit(None).alias("c"),
        col("b.last_name").alias("b"),
    )
    struct2 = struct(
        col("b.first_name").alias("id"),
        col("b.last_name").alias("c"),
        lit(None).alias("b"),
    )
    array1 = when(
        array(struct1, struct2).isNotNull(),
        filter(coalesce(array(struct1, struct2), array()), lambda x: x.isNotNull()),
    )
    assert_compare_expressions(sql_expressions["dst2"], array1.alias("dst2"))
    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][0][0]
        == "Imran"
    )
    assert (
        result_df.where("member_id == 1").select("dst2").collect()[0][0][0][2]
        == "Qureshi"
    )
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][0]
        == "Michael"
    )
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1] is None
    )
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(),
                           "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (
        pyspark.sql.SparkSession.builder
        .master(f"local[{os.cpu_count() - 1}]")
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "/spark-events")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.driver.extraJavaOptions",
                "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.executor.extraJavaOptions",
                "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.driver.memory", f"{mem_gib}g")
        .config("spark.history.fs.logDirectory", "/spark-events")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1")
        .config("spark.jars", tar_jar)
        .config("spark.local.dir", "/mnt/disks/spark-scratch/")
        .getOrCreate())
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)

    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"),
         F.col("text_document_id"),
         F.col("licenseurl")])

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_"
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            create_wav_scp(posix_wav_scp, training_sample_rows,
                           FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o  decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 work just fine

    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:

            posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                       temp_dir_name, ctm_out_dir)
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                    temp_dir_name, FLAGS.work_dir)
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir,
                                                 num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
  /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
  --frame-subsampling-factor=3 \
  --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
  --max-active=7000 \
  --beam=15.0 \
  --lattice-beam=0.1 \
  --acoustic-scale=1.0 \
  --cuda-decoder-copy-threads=2 \
  --cuda-worker-threads={os.cpu_count() // num_gpus} \
  --segmentation=true \
  --cuda-use-tensor-cores=true \
  --max-batch-size=150 \
  --num-channels=250 \
  --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
  --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
  /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
  /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
  scp,p:{posix_wav_scp_shard} \
  {posix_ctm_out_dir}
  """
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir,
                                  "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer",
             "--output-cer"])  # , "--output-sws", "--output-levenshtein"])

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt")
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path,
                                      15_000, 3_000)

        ctm_df = (spark.read.format("binaryFile").option(
            "pathGlobFilter", "*.ctm").load(ctm_out_dir))
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(
                F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content",
                                   fix_text_udf(F.col("content"))).drop(
                                       "path", "length", "modificationTime",
                                       "content")

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path,
                                          training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript",
            normalize_english_text_udf(transcripts_df.transcript))
        ctm_df = ctm_df.join(transcripts_df,
                             ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()

        sys.stdout.flush()

        alignments_df.write.mode("overwrite").format("json").save(
            alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)

        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            &
            (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )

        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl")))
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl")))
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier
                    == filtered_licenseurl_df.identifier)
                & (alignments_df.text_document_id
                   == filtered_licenseurl_df.text_document_id),
                how="inner",
            )
            alignments_df = alignments_df.drop(
                filtered_licenseurl_df.identifier).drop(
                    filtered_licenseurl_df.text_document_id)

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType([
                        T.StructField("cer", T.FloatType()),
                        T.StructField("end_ms", T.LongType()),
                        T.StructField("label", T.StringType()),
                        T.StructField("start_ms", T.LongType()),
                        T.StructField("wer", T.FloatType()),
                        T.StructField("duration_ms", T.LongType()),
                    ]))),
        )

        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment:
                (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType([
                    T.StructField("cer", T.ArrayType(T.FloatType())),
                    T.StructField("end_ms", T.ArrayType(T.LongType())),
                    T.StructField("label", T.ArrayType(T.StringType())),
                    T.StructField("start_ms", T.ArrayType(T.LongType())),
                    T.StructField("wer", T.ArrayType(T.FloatType())),
                    T.StructField("duration_ms", T.ArrayType(T.LongType())),
                ])),
        )

        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(
                F.expr(
                    "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"
                )) / 1000.0 / 60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1] # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df,
                                                 audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someat untars the tar files, everything will be dumped into one directory
        # Remove "." becasue it has special meaning in webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not striclty necessary, but convenient)
        name = F.concat(F.col("identifier"), F.lit("/"),
                        F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # Sanity-check segment name lengths (tar and kaldi key limits). Assumes the
        # names come from alignments_audio_df; iterate the collected rows instead of
        # shadowing the `name` Column used below.
        rows_with_names = alignments_audio_df.select(name.alias("name")).collect()
        for row in rows_with_names:
            assert len(row.name) < 4096
            for chunk in row.name.split("/"):
                assert len(chunk) < 256
        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        # )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(
                F.arrays_zip("aligned_chunks.audio_name",
                             "aligned_chunks.audio"))).select(
                                 "col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias(
                    "duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir,
                                          "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir,
                                "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition",
            number_of_rows)
        spark2.conf.set("spark.sql.files.minPartitionNum",
                        FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100)

        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards,
                                             F.col("key"))
        # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(
            tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)
        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys

        # sys.stdout.flush()
        # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir,
                                            "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )))
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) /
             1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df,
                F.col("audio_filepath") == F.col("key")).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir)

    single_manifest_dir = os.path.join(FLAGS.work_dir,
                                       "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")
    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(
            single_manifest_dir)

        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(
            single_tar_dir)
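The alignment-filtering step in the pipeline above (zip parallel arrays into an array of structs, then F.filter on the struct fields) can be exercised on a toy DataFrame. This is only an illustrative sketch; the thresholds and column names are made up.

import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(1, [10.0, 50.0], [500, 30000], [0, 100])],
    "id INT, cer ARRAY<DOUBLE>, end_ms ARRAY<BIGINT>, start_ms ARRAY<BIGINT>",
)
# arrays_zip turns the parallel arrays into ARRAY<STRUCT<cer, end_ms, start_ms>>.
df = df.withColumn("alignments", F.arrays_zip("cer", "end_ms", "start_ms"))
# Keep only short, low-error segments (thresholds are invented).
df = df.withColumn(
    "alignments",
    F.filter(
        df.alignments,
        lambda a: ((a.end_ms - a.start_ms) < 20000) & (a.cer < 36.0),
    ),
)
df.select("id", "alignments").show(truncate=False)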
def test_auto_mapper_concat_multiple_items_structs_different_elements(
    spark_session: SparkSession, ) -> None:
    # Arrange
    clean_spark_session(spark_session)
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran"),
            (2, None, "Michael"),
        ],
        ["member_id", "last_name", "first_name"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients",
        enable_schema_pruning=True).columns(dst2=AutoMapperList([
            AutoMapperDataTypeComplexBase(a=A.column("first_name"),
                                          b=A.column("last_name"))
        ], ).concat(
            AutoMapperList([
                AutoMapperDataTypeComplexBase(a=A.column("first_name"),
                                              c=A.column("last_name")),
            ], )))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    array1 = when(
        array(
            struct(
                col("b.first_name").alias("a"),
                col("b.last_name").alias("b"),
                lit(None).alias("c"),
            ), ).isNotNull(),
        filter(
            coalesce(
                array(
                    struct(
                        col("b.first_name").alias("a"),
                        col("b.last_name").alias("b"),
                        lit(None).alias("c"),
                    ), ),
                array(),
            ),
            lambda x: x.isNotNull(),
        ),
    )
    array2 = when(
        array(
            struct(
                col("b.first_name").alias("a"),
                lit(None).alias("b"),
                col("b.last_name").alias("c"),
            ), ).isNotNull(),
        filter(
            coalesce(
                array(
                    struct(
                        col("b.first_name").alias("a"),
                        lit(None).alias("b"),
                        col("b.last_name").alias("c"),
                    ), ),
                array(),
            ),
            lambda x: x.isNotNull(),
        ),
    )
    assert_compare_expressions(sql_expressions["dst2"],
                               concat(array1, array2).alias("dst2"))

    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [0] == "Imran")
    assert (result_df.where("member_id == 1").select("dst2").collect()[0][0][0]
            [1] == "Qureshi")
    assert (result_df.where("member_id == 2").select("dst2").collect()[0][0][0]
            [0] == "Michael")
    assert (
        result_df.where("member_id == 2").select("dst2").collect()[0][0][0][1]
        is None)
Example #15
    def main(self, sc: SparkContext, *args: Any):
        observations_parquet_path = args[0]
        genotype_phenotype_parquet_path = args[1]
        impc_images_parquet_path = args[2]
        product_parquet_path = args[3]
        gene_core_parquet_path = args[4]
        output_path = args[5]
        spark = SparkSession(sc)

        product_df = spark.read.parquet(product_parquet_path)
        gene_df = spark.read.parquet(gene_core_parquet_path)

        order_df = (
            product_df.groupBy(
                *[
                    col_name
                    for col_name in product_df.columns
                    if col_name not in ["type", "tissue_enquiry_links"]
                ]
            )
            .agg(
                collect_set("type").alias("available_products"),
                collect_set("tissue_enquiry_links").alias("tissue_enquiry_links"),
            )
            .withColumn(
                "available_products",
                when(
                    size("tissue_enquiry_links") > 0,
                    concat("available_products", array(lit("tissue"))),
                ).otherwise(col("available_products")),
            )
        )
        gene_id_symbol = gene_df.select("mgi_accession_id", "marker_symbol").distinct()

        order_df = order_df.join(gene_id_symbol, "marker_symbol")

        gene_order_df = order_df.select(
            "marker_symbol", "allele_name", "allele_description", "available_products"
        ).distinct()
        gene_order_df = gene_order_df.withColumn(
            "_class", lit("org.mousephenotype.web.models.gene.Order")
        )
        gene_order_df.write.format("mongo").mode("append").option(
            "spark.mongodb.output.uri",
            f"{self.mongodb_connection_uri}/admin?replicaSet={self.mongodb_replica_set}",
        ).option("database", str(self.mongodb_database)).option(
            "collection", "gene-order"
        ).save()

        links_fields = [
            "genbank_file",
            "allele_image",
            "allele_simple_image",
            "vector_genbank_file",
            "vector_allele_image",
        ]

        # Accumulate one *_url column per link field instead of restarting from
        # order_df on every iteration.
        allele_summary_df = order_df
        for link_field in links_fields:
            allele_summary_df = allele_summary_df.withColumn(
                f"{link_field}_url",
                filter("other_links", lambda x: x.startswith(f"{link_field}:")),
            )
            allele_summary_df = allele_summary_df.withColumn(
                f"{link_field}_url",
                when(
                    size(f"{link_field}_url") > 0,
                    regexp_extract(
                        col(f"{link_field}_url").getItem(0), f"{link_field}:(.*)", 1
                    ),
                ).otherwise(lit(None)),
            )

        genetic_info_fields = [
            "strain",
            "cassette",
            "cassette_type",
            "parent_es_cell_line",
        ]

        for genetic_info_field in genetic_info_fields:
            allele_summary_df = allele_summary_df.withColumn(
                genetic_info_field,
                filter(
                    "genetic_info", lambda x: x.startswith(f"{genetic_info_field}:")
                ),
            )
            allele_summary_df = allele_summary_df.withColumn(
                genetic_info_field,
                when(
                    size(genetic_info_field) > 0,
                    regexp_extract(
                        col(genetic_info_field).getItem(0),
                        f"{genetic_info_field}:(.*)",
                        1,
                    ),
                ).otherwise(lit(None)),
            )
        ## process by type and then join with the metadata dataframe

        mice_df = (
            allele_summary_df.where(col("type") == "mouse")
            .select(
                col("mgi_accession_id"),
                col("allele_name"),
                col("product_id"),
                col("name").alias("colony_name"),
                col("background_colony_strain"),
                col("production_centre"),
                col("qc_data"),
                col("associated_product_es_cell_name").alias(
                    "es_cell_parent_mouse_colony"
                ),
            )
            .distinct()
        )
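The link-extraction loops above combine filter on an ARRAY<STRING> column with regexp_extract to pull the value out of a "prefix:value" entry. A minimal sketch of that pattern; the other_links layout shown here is assumed for illustration, not taken from the IMPC data.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, filter, lit, regexp_extract, size, when

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(["genbank_file:http://example.org/gb.txt", "allele_image:http://example.org/img.png"],)],
    "other_links ARRAY<STRING>",
)
link_field = "genbank_file"
# Keep only the entries for this link type ...
df = df.withColumn(
    f"{link_field}_url",
    filter("other_links", lambda x: x.startswith(f"{link_field}:")),
)
# ... then strip the prefix, or fall back to NULL when there is no match.
df = df.withColumn(
    f"{link_field}_url",
    when(
        size(f"{link_field}_url") > 0,
        regexp_extract(col(f"{link_field}_url").getItem(0), f"{link_field}:(.*)", 1),
    ).otherwise(lit(None)),
)
df.show(truncate=False)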
Example #16
    def get_column_spec(
        self,
        source_df: Optional[DataFrame],
        current_column: Optional[Column],
        parent_columns: Optional[List[Column]],
    ) -> Column:
        """
        returns a Spark Column definition


        """
        self.ensure_children_have_same_properties(
            skip_null_properties=self.skip_null_properties
        )
        if isinstance(
            self.value, str
        ):  # if the src column is just string then consider it a sql expression
            return array(lit(self.value))

        if isinstance(self.value, list):  # if the src column is a list then iterate
            inner_array = array(
                *[
                    self.get_value(
                        item,
                        source_df=source_df,
                        current_column=current_column,
                        parent_columns=parent_columns,
                    )
                    for item in self.value
                ]
            )
            return (
                when(
                    inner_array.isNotNull(),
                    # coalesce is needed otherwise Spark complains:
                    # pyspark.sql.utils.AnalysisException: cannot resolve
                    # 'filter(NULL, lambdafunction((x IS NOT NULL), x))' due to argument data type mismatch:
                    # argument 1 requires array type, however, 'NULL' is of null type.;
                    filter(coalesce(inner_array, array()), lambda x: x.isNotNull()),
                )
                if self.remove_nulls
                else inner_array
            )

        # if value is an AutoMapper then ask it for its column spec
        if isinstance(self.value, AutoMapperDataTypeBase):
            child: AutoMapperDataTypeBase = self.value
            inner_child_spec = child.get_column_spec(
                source_df=source_df,
                current_column=current_column,
                parent_columns=parent_columns,
            )
            return (
                when(
                    inner_child_spec.isNotNull(),
                    filter(
                        # coalesce is needed otherwise Spark complains:
                        # pyspark.sql.utils.AnalysisException: cannot resolve
                        # 'filter(NULL, lambdafunction((x IS NOT NULL), x))' due to argument data type mismatch:
                        # argument 1 requires array type, however, 'NULL' is of null type.;
                        coalesce(inner_child_spec, array()),
                        lambda x: x.isNotNull(),
                    ),
                )
                if self.remove_nulls
                else inner_child_spec
            )

        raise ValueError(f"value: {self.value} is neither str nor AutoMapper")
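The coalesce(..., array()) guard used above can be seen in isolation: filter(NULL, ...) fails Spark's analysis with a type-mismatch error, while coalescing the possibly-NULL array to an empty array keeps the expression resolvable. A small sketch with assumed literal values:

from pyspark.sql import SparkSession
from pyspark.sql.functions import array, coalesce, filter, lit, when

spark = SparkSession.builder.master("local[1]").getOrCreate()
inner_array = array(lit("address1"), lit(None))
guarded = when(
    inner_array.isNotNull(),
    # Without coalesce, Spark rejects filter(NULL, ...) during analysis.
    filter(coalesce(inner_array, array()), lambda x: x.isNotNull()),
)
spark.range(1).select(guarded.alias("dst")).show(truncate=False)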
Example #17
def test_auto_mapper_fhir_patient_resource_include_null_properties(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        Patient(
            id_=FhirId(A.column("member_id")),
            birthDate=A.date(A.column("date_of_birth")),
            name=FhirList(
                [HumanName(use=NameUseCode("usual"), family=A.column("last_name"))],
                include_null_properties=True,
            ),
            gender=A.if_not_null(
                A.column("my_gender"), AdministrativeGenderCode(A.column("my_gender"))
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert len(sql_expressions) == 21
    assert str(sql_expressions["id"]) == str(
        substring(
            regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "_"), 0, 63
        ).alias("id")
    )
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").alias("resourceType")
    )
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        ).alias("birthDate")
    )
    assert str(sql_expressions["name"]) == str(
        filter(
            array(
                struct(
                    lit("usual").alias("use"),
                    lit(None).alias("text"),
                    col("b.last_name").alias("family"),
                    lit(None).alias("given"),
                    lit(None).alias("prefix"),
                    lit(None).alias("suffix"),
                    lit(None).alias("period"),
                )
            ),
            lambda x: x.isNotNull(),
        ).alias("name")
    )
    assert str(sql_expressions["gender"]) == str(
        when(col("b.my_gender").isNull(), None)
        .otherwise(col("b.my_gender"))
        .alias("gender")
    )

    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 1").selectExpr("name[0].family").collect()[0][0]
        == "Qureshi"
    )

    assert (
        result_df.where("member_id == 2").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].family").collect()[0][0]
        == "Vidal"
    )
    def test_alignments_filter(self):
        work_dir = "gs://the-peoples-speech-west-europe/forced-aligner/cuda-forced-aligner/output_work_dir_5b/output_work_dir_5b"
        alignments_dir = os.path.join(work_dir, "alignments_json_jul_28")
        spark = self.spark
        alignments_df = spark.read.json(alignments_dir)
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType(
                        [
                            T.StructField("cer", T.FloatType()),
                            T.StructField("end_ms", T.LongType()),
                            T.StructField("label", T.StringType()),
                            T.StructField("start_ms", T.LongType()),
                            T.StructField("wer", T.FloatType()),
                            T.StructField("duration_ms", T.LongType()),
                        ]
                    )
                )
            ),
        )
        alignments_df = alignments_df.drop("duration_ms")

        max_duration_ms = 20_000
        max_cer = 36.0
        min_duration_ms = 1_000

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment: (alignment.duration_ms < max_duration_ms)
                & (alignment.cer < max_cer)
                & (alignment.duration_ms > min_duration_ms),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                # Is this the fix?
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType(
                    [
                        T.StructField("cer", T.ArrayType(T.FloatType())),
                        T.StructField("end_ms", T.ArrayType(T.LongType())),
                        T.StructField("label", T.ArrayType(T.StringType())),
                        T.StructField("start_ms", T.ArrayType(T.LongType())),
                        T.StructField("wer", T.ArrayType(T.FloatType())),
                        T.StructField("duration_ms", T.ArrayType(T.LongType())),
                    ]
                )
            ),
        )
        abc = alignments_df.select(
            F.sum(F.expr("aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"))
            / 1000.0
            / 60.0
            / 60.0
        ).collect()
        print("GALVEZ:max_duration_ms=", max_duration_ms)
        print("GALVEZ:max_cer=", max_cer)
        print("GALVEZ:min_duration_ms=", min_duration_ms)
        print("GALVEZ:total number of hours=", abc)
Example #19
def test_automapper_nested_array_filter_with_parent_column(
    spark_session: SparkSession,
) -> None:
    schema = StructType(
        [
            StructField("row_id", dataType=IntegerType(), nullable=False),
            StructField(
                "location",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                        ]
                    )
                ),
            ),
            StructField(
                "schedule",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("name", StringType(), True),
                            StructField(
                                "actor",
                                ArrayType(
                                    StructType(
                                        [StructField("reference", StringType(), True)]
                                    ),
                                    True,
                                ),
                            ),
                        ]
                    )
                ),
            ),
            StructField(
                "single_level",
                dataType=ArrayType(
                    StructType(
                        [
                            StructField("reference", StringType(), True),
                        ]
                    )
                ),
            ),
        ]
    )
    spark_session.createDataFrame(
        [
            (
                1,
                [{"name": "location-100"}, {"name": "location-200"}],
                [
                    {
                        "name": "schedule-1",
                        "actor": [
                            {"reference": "location-100"},
                            {"reference": "practitioner-role-100"},
                        ],
                    },
                    {
                        "name": "schedule-2",
                        "actor": [
                            {"reference": "location-200"},
                            {"reference": "practitioner-role-200"},
                        ],
                    },
                ],
                [{"reference": "location-100"}, {"reference": "location-200"}],
            )
        ],
        schema,
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    mapper = AutoMapper(
        view="schedule", source_view="patients", keys=["row_id"]
    ).columns(
        location=A.column("location").select(
            AutoMapperElasticSearchLocation(
                name=A.field("name"),
                scheduling=A.nested_array_filter(
                    array_field=A.column("schedule"),
                    inner_array_field=A.field("actor"),
                    match_property="reference",
                    match_value=A.field("{parent}.name"),
                ).select_one(AutoMapperElasticSearchSchedule(name=A.field("name"))),
            )
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    print("------COLUMN SPECS------")
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")
    assert_compare_expressions(
        sql_expressions["location"],
        transform(
            col("b.location"),
            lambda l: (
                struct(
                    l["name"].alias("name"),
                    transform(
                        filter(
                            col("b.schedule"),
                            lambda s: exists(
                                s["actor"],
                                lambda a: a["reference"] == l["name"],  # type: ignore
                            ),
                        ),
                        lambda s: struct(s["name"].alias("name")),
                    )[0].alias("scheduling"),
                )
            ),
        ).alias("___location"),
    )
    result_df: DataFrame = mapper.transform(df=source_df)

    # Assert
    # result_df.printSchema()
    # result_df.show(truncate=False)
    location_row = result_df.collect()[0].location
    for index, location in enumerate(location_row):
        location_name = location.name
        location_scheduling = location.scheduling
        assert location_name == f"location-{index + 1}00"
        assert len(location_scheduling) == 1
        assert location_scheduling.name == f"schedule-{index + 1}"
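The expression asserted above correlates the inner filter with the variable of the enclosing transform (l["name"]), which Spark's higher-order functions allow directly. A toy sketch with invented columns and data:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, exists, filter, struct, transform

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame(
    [(
        [("location-100",), ("location-200",)],
        [("schedule-1", [("location-100",)]), ("schedule-2", [("location-200",)])],
    )],
    "location ARRAY<STRUCT<name: STRING>>, "
    "schedule ARRAY<STRUCT<name: STRING, actor: ARRAY<STRUCT<reference: STRING>>>>",
)
df.select(
    transform(
        col("location"),
        lambda l: struct(
            l["name"].alias("name"),
            # For each location, keep the schedules whose actors reference it
            # and take the first match.
            filter(
                col("schedule"),
                lambda s: exists(s["actor"], lambda a: a["reference"] == l["name"]),
            )[0]["name"].alias("schedule_name"),
        ),
    ).alias("location")
).show(truncate=False)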