Example #1
def load_transcripts(
    spark, base_path: str, collected_text_document_rows: List[pyspark.Row]
):
    def fix_name(identifier, text_document_id):
        if (
            identifier == "gov.house.hbs.hrs05H_A1310_100511"
            and text_document_id == "hrs05H_A1310_100511.asr.srt"
        ):
            return "hrs05H_A1310_100511.auto.srt"
        else:
            return text_document_id

    # TODO: Upload this file
    with open("/development/lingvo-source/missing_files.json", "r") as fh:
        missing_text_document_ids = set(json.load(fh))
    text_document_ids = [
        os.path.join(
            base_path, row.identifier, fix_name(row.identifier, row.text_document_id)
        )
        for row in collected_text_document_rows
    ]
    text_document_ids = [
        tid for tid in text_document_ids if tid not in missing_text_document_ids
    ]
    srt_df = spark.read.format("binaryFile").load(text_document_ids)
    # Note the duplication with load_audio_files
    return srt_df.select(
        srt_to_text(fix_text_udf(srt_df.content)).alias("transcript"),
        F.reverse(F.split(srt_df.path, "/"))[0].alias("text_document_id"),
        F.reverse(F.split(srt_df.path, "/"))[1].alias("identifier"),
    )
Example #2
def load_audio_files(spark, base_path: str):
    raw_audio_df = (spark.read.format("binaryFile").option(
        "pathGlobFilter", "*.mp3").option("recursiveFileLookup",
                                          "true").load(base_path))

    return raw_audio_df.select(
        'content',
        F.reverse(F.split(raw_audio_df.path, "[.]"))[0].alias("format"),
        # We will have repeats with this form of ID... It does not fulfill the purpose of a primary key...
        # 44635        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/01-Ml.Z.Ragi-JinnandJadoo18.05.05.asr.srt
        # 53884        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/02-Ml.Z.Ragi-JinnandJadoo25.05.05.asr.srt
        # 55971        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/03-Ml.Z.Ragi-JinnandJadoo01.06.05.asr.srt
        # 48287        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/04-Ml.Z.Ragi-JinnandJadoo08.06.05.asr.srt
        # 44184        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/05-Ml.Z.Ragi-JinnandJadoo22.06.05.asr.srt
        # 29040        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/06-Ml.Z.Ragi-JinnandJadoo29.06.05.asr.srt
        # 53849        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/07-Ml.Z.Ragi-JinnandJadoo20.07.05.asr.srt
        # 54745        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/08-Ml.Z.Ragi-JinnandJadoo27.07.05.asr.srt
        # 44990        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/09-Ml.Z.Ragi-JinnandJadoo03.08.05.asr.srt
        # 47756        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/10-Ml.Z.Ragi-JinnandJadoo10.08.05.asr.srt
        # 46275        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/11-Ml.Z.Ragi-JinnandJadoo07.09.05.asr.srt
        # 35660        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/12-Ml.Z.Ragi-JinnandJadoo14.09.05.asr.srt
        # 50201        gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/13-Ml.Z.Ragi-JinnandJadoo21.09.05.asr.srt
        # I probably ought to use the non-format part of the final file path... That would work.
        F.reverse(F.split(raw_audio_df.path, "/"))[1].alias("audio_document_id"),
        F.monotonically_increasing_id().alias("int64_audio_document_id"))
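A hedged sketch of the fix suggested in the comment above: combining the parent directory with the file name (as the next example does via separate columns) makes the ID unique per file. The bucket and file names below are made up for illustration.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
paths_df = spark.createDataFrame(
    [("gs://bucket/ALL_CAPTIONED_DATA/some_dir/01-part.asr.srt",),
     ("gs://bucket/ALL_CAPTIONED_DATA/some_dir/02-part.asr.srt",)], ["path"])
parts = F.reverse(F.split(paths_df.path, "/"))
paths_df.select(
    parts[1].alias("directory_only_id"),  # same value for both rows -> collisions
    F.concat_ws("/", parts[1], parts[0]).alias("composite_id"),  # unique per file
).show(truncate=False)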
Example #3
def load_audio_files(spark, collected_audio_document_rows, base_path: str):
    audio_document_ids = [
        os.path.join(base_path, row.identifier, row.audio_document_id)
        for row in collected_audio_document_rows
    ]
    raw_audio_df = spark.read.format("binaryFile").load(audio_document_ids)

    return raw_audio_df.select(
        "content",
        F.reverse(F.split(raw_audio_df.path, "[.]"))[0].alias("format"),
        F.reverse(F.split(raw_audio_df.path, "/"))[0].alias("audio_document_id"),
        F.reverse(F.split(raw_audio_df.path, "/"))[1].alias("identifier"),
        F.monotonically_increasing_id().alias("int64_audio_document_id"),
    )
Example #4
def calculate_touchpoints(input_df):
    w1 = Window\
            .partitionBy("fullVisitorId")\
            .orderBy("timestamp")

    first_touchpoint = first(col("trafficSource_source")).over(w1)

    return input_df\
        .orderBy("timestamp")\
        .selectExpr("*",
            "collect_list(trafficSource_source) over (partition by fullVisitorId) as touchpoints")\
          .withColumn("touchpoints_wo_direct", expr("filter(touchpoints, x -> x != '(direct)')"))\
          .orderBy("timestamp")\
          .select("*",
                  first_touchpoint.alias("first_touchpoint"), 
                  when(reverse(col("touchpoints_wo_direct"))[0].isNotNull(), reverse(col("touchpoints_wo_direct"))[0]).otherwise("(direct)").alias("last_touchpoint"))
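A small, self-contained usage sketch for the function above (toy visitor data; column names assumed to match). Every row for visitor "v1" should end up with first_touchpoint "google", and the last_touchpoint logic skips "(direct)" and picks "newsletter".

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr, first, reverse, when
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[1]").getOrCreate()
visits = spark.createDataFrame(
    [("v1", 1, "google"), ("v1", 2, "(direct)"), ("v1", 3, "newsletter")],
    ["fullVisitorId", "timestamp", "trafficSource_source"])
# calculate_touchpoints is the function defined above.
calculate_touchpoints(visits).select(
    "fullVisitorId", "timestamp", "first_touchpoint", "last_touchpoint").show()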
Example #5
    def prepare_final_df(self):
        self.df = self.df.select("date", func.explode("message.updates"))

        selected = self.df.select("date", "col.startDatetime",
                                  "col.updateType", "col.name",
                                  "col.finished").filter(
                                      func.col("name").startswith("["))

        grouped = selected.groupBy("name", "startDatetime").agg(
            func.sort_array(func.collect_list("date")).alias("date_array"),
            func.collect_list("updateType").alias("updateType_array"),
            func.reverse(
                func.collect_list("finished")).getItem(0).alias("finished"),
        )

        preprocessed = (
            grouped.withColumn("start_datetime",
                               grouped.startDatetime.cast(TimestampType()))
            .withColumn("last_update",
                        self.helper.get_last_date_udf(grouped.date_array))
            .withColumn("finished", func.col("finished").cast(BooleanType()))
            .withColumn("meeting_name", func.col("name"))
            .select("start_datetime", "last_update", "finished", "meeting_name")
            .withColumn(
                "duration",
                func.col("last_update").cast(LongType()) -
                func.col("start_datetime").cast(LongType()),
            ))

        preprocessed.printSchema()

        return self.do_post_preprocessing(preprocessed)
Example #6
def addUserFeatures(df: DataFrame) -> DataFrame:
    '''
    Extract user features.
    :param df:
    :return:
    '''

    extractGenresUdf = udf(extractGenres, returnType=ArrayType(IntegerType()))
    print('start user feature')
    samplesWithUserFeatures = df.withColumn('userPositiveHistory',collect_list(when(col('label')==1,col('movieId')).otherwise(lit(None))).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1)))\
        .withColumn('userPositiveHistory',reverse(col('userPositiveHistory'))) \
        .withColumn("userRatedMovie1", col("userPositiveHistory").getItem(0))  \
        .withColumn("userRatedMovie2", col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", col("userPositiveHistory").getItem(4)) \
        .withColumn('userRatingCount',count(lit(1)).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1))) \
        .withColumn('userAvgReleaseYear',avg(col('releaseYear')).over(Window.partitionBy("userId").orderBy(col('timestamp')).rowsBetween(-100, -1)).cast('int')) \
        .withColumn('userReleaseYearStddev',stddev(col('releaseYear')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1))) \
        .withColumn('userAvgRating',format_number(avg(col('rating')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1)),2)) \
        .withColumn('userRatingStddev',    stddev(col("rating")).over(Window.partitionBy("userId").orderBy(col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn('userGenres',extractGenresUdf(collect_list(when(col('label') == 1,col('genres')).otherwise(lit(None))).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100,-1)))).na.fill(0) \
        .withColumn("userRatingStddev", format_number(col("userRatingStddev"), 2)) \
        .withColumn("userReleaseYearStddev", format_number(col("userReleaseYearStddev"), 2)) \
        .withColumn("userGenre1", col("userGenres").getItem(0)) \
        .withColumn("userGenre2", col("userGenres").getItem(1)) \
        .withColumn("userGenre3", col("userGenres").getItem(2)) \
        .withColumn("userGenre4", col("userGenres").getItem(3)) \
        .withColumn("userGenre5", col("userGenres").getItem(4))\
        .drop("genres", "userGenres", "userPositiveHistory").filter(col('userRatingCount') > 1)

    # samplesWithUserFeatures.printSchema()
    samplesWithUserFeatures.show(10, truncate=True)
    return samplesWithUserFeatures
Example #7
def load_transcripts(spark, base_path: str,
                     collected_text_document_rows: List[pyspark.Row]):
    text_document_ids = [
        os.path.join(base_path, row.identifier, row.text_document_id)
        for row in collected_text_document_rows
    ]
    text_document_ids = [
        path for path in text_document_ids
        if "[" not in path and "]" not in path
    ]
    # "[" and "]" are wildcard characters. GCS has very poor support
    # for these. Namely, you can write them but not read them back. More
    # resources here: https://github.com/galv/lingvo-copy/issues/18
    # I simply filter out any files containing these characters for now.
    srt_df = (spark.read.format("binaryFile").load(text_document_ids))
    # Note the duplication with load_audio_files
    return srt_df.select(
        srt_to_text(fix_text_udf(srt_df.content)).alias('transcript'),
        F.reverse(F.split(srt_df.path, "/"))[1].alias("id"))
Example #8
def add_user_features(data):
    # find positive rating list of each userId
    features = data.withColumn("userPositiveHistory",
                               F.collect_list(F.when(F.col("label") == 1, F.col("movieId")).otherwise(F.lit(None)))
                               .over(
                                   sql.Window.partitionBy("userId").orderBy(F.col("timestamp")).rowsBetween(-100, -1)
                               ))\
        .withColumn("userPositiveHistory", F.reverse(F.col("userPositiveHistory"))) \
        .withColumn("userRatedMovie1", F.col("userPositiveHistory").getItem(0)) \
        .withColumn("userRatedMovie2", F.col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", F.col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", F.col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", F.col("userPositiveHistory").getItem(4)) \
        .withColumn("userRatingCount",
                    F.count(F.lit(1)).over(sql.Window.partitionBy("userId")
                    .orderBy(F.col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn("userAvgReleaseYear",
                    F.avg(F.col("releaseYear")).over(sql.Window.partitionBy("userId")
                    .orderBy(F.col("timestamp")).rowsBetween(-100, -1)).cast("integer")) \
        .withColumn("userReleaseYearStddev",
                    F.stddev(F.col("releaseYear")).over(sql.Window.partitionBy("userId")
                    .orderBy(F.col("timestamp")).rowsBetween(-100, -1)).cast("integer")) \
        .withColumn("userAvgRating",
                    F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy("userId")
                    .orderBy("timestamp").rowsBetween(-100, -1)), Config.NUMBER_PRECISION)) \
        .withColumn("userRatingStddev",
                    F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy("userId")
                    .orderBy("timestamp").rowsBetween(-100, -1)), Config.NUMBER_PRECISION)) \
        .withColumn("userGenres",
                    udf_extract_genres(F.collect_list(F.when(F.col("label") == 1, F.col("genres")).otherwise(F.lit(None)))
                    .over(sql.Window.partitionBy("userId").orderBy("timestamp").rowsBetween(-100, -1)))) \
        .na.fill(0) \
        .withColumn("userReleaseYearStddev", F.format_number(F.col("userReleaseYearStddev"), Config.NUMBER_PRECISION)) \
        .withColumn("userGenre1", F.col("userGenres").getItem(0)) \
        .withColumn("userGenre2", F.col("userGenres").getItem(1)) \
        .withColumn("userGenre3", F.col("userGenres").getItem(2)) \
        .withColumn("userGenre4", F.col("userGenres").getItem(3)) \
        .withColumn("userGenre5", F.col("userGenres").getItem(4)) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)
    print_info(features, topN=20)
    return features
Example #9
    def transform_data(self, df):
        # Transform it to a format that will be accepted by the model
        cols_to_drop = ["Customer ID", "Name", "Address", "Phone_no", "Email", "SSN"]
        df = df.drop(*cols_to_drop)
        df = df.withColumn("Customer Lifetime Value", functions.round("Customer Lifetime Value", 2))
        df = df.withColumn("Loss Ratio", functions.round("Loss Ratio", 3))
        df = df.withColumn("Growth Rate", functions.round("Growth Rate", 3))
        df = df.withColumn("Total Claim Amount", functions.round("Total Claim Amount", 3))
        df = df.withColumn("Job", functions.split("Job", ",").getItem(0))
        df = df.withColumn("Company", functions.reverse(functions.split("Company", ",")).getItem(0))
        indexer_list = []
        categ_cols = ['City', 'Response', 'Coverage', 'Education', 'Employment_Status', 'Gender',
                      'Location_Code', 'Marital Status', 'Policy_Type', 'Policy_Rating',
                      'Renew_Offer_Type', 'Sales_Channel', 'Total Claim Amount', 'Feedback',
                      'Job', 'Company', 'Credit Card Provider']
        for i in categ_cols:
            if i == 'City':
                indexer_list.append(StringIndexer(inputCol=i, outputCol=i + "Index"))
            else:
                indexer_list.append(StringIndexer(inputCol=i, outputCol=i + " Index"))
        for j in indexer_list:
            df = j.fit(df).transform(df)
        df = df.select([c for c in df.columns if c not in categ_cols])
        df = df.withColumn("Effective To Date", functions.split("Effective To Date", "-").getItem(2))
        df = df.withColumn("Effective To Date", df["Effective To Date"].cast(types.IntegerType()))
        return df
Example #10
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf(
        "SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(),
                           "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (pyspark.sql.SparkSession.builder.master(
        f"local[{os.cpu_count() - 1}]").config(
            "spark.eventLog.enabled",
            "true").config("spark.eventLog.dir", "/spark-events").config(
                "spark.sql.execution.arrow.pyspark.enabled", "true").config(
                    "spark.driver.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config(
                    "spark.executor.extraJavaOptions",
                    "-Dio.netty.tryReflectionSetAccessible=true",
                ).config("spark.driver.memory", f"{mem_gib}g").config(
                    "spark.history.fs.logDirectory", "/spark-events").config(
                        "spark.sql.execution.arrow.maxRecordsPerBatch",
                        "1").config("spark.jars", tar_jar).config(
                            "spark.local.dir",
                            "/mnt/disks/spark-scratch/").getOrCreate())
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)

    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"),
         F.col("text_document_id"),
         F.col("licenseurl")])

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to change all instances of "/" to "_"
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
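    # For illustration (hypothetical values, not from the real catalogue):
    #   identifier="some show/part 1", audio_document_id="ep 1.mp3"
    #   -> kaldi_normalized_uttid="some_show_part_1-ep_1.mp3"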
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            create_wav_scp(posix_wav_scp, training_sample_rows,
                           FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o  decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 works just fine

    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
                mount_cmd=[
                    "gcsfuse",
                    "--implicit-dirs",
                    FLAGS.input_gcs_bucket.lstrip("gs://"),
                ],
                unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:

            posix_ctm_out_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                       temp_dir_name, ctm_out_dir)
            posix_wav_scp = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                   temp_dir_name, wav_scp)
            posix_work_dir = re.sub(r"^{0}".format(FLAGS.input_gcs_bucket),
                                    temp_dir_name, FLAGS.work_dir)
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir,
                                                 num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
  /opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
  --frame-subsampling-factor=3 \
  --config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
  --max-active=7000 \
  --beam=15.0 \
  --lattice-beam=0.1 \
  --acoustic-scale=1.0 \
  --cuda-decoder-copy-threads=2 \
  --cuda-worker-threads={os.cpu_count() // num_gpus} \
  --segmentation=true \
  --cuda-use-tensor-cores=true \
  --max-batch-size=150 \
  --num-channels=250 \
  --lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
  --word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
  /opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
  /opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
  scp,p:{posix_wav_scp_shard} \
  {posix_ctm_out_dir}
  """
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir,
                                  "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer",
             "--output-cer"])  # , "--output-sws", "--output-levenshtein"])

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt")
        align_udf = prepare_align_udf(dsalign_args, alphabet_normalized_path,
                                      15_000, 3_000)

        ctm_df = (spark.read.format("binaryFile").option(
            "pathGlobFilter", "*.ctm").load(ctm_out_dir))
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(
                F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content",
                                   fix_text_udf(F.col("content"))).drop(
                                       "path", "length", "modificationTime",
                                       "content")

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path,
                                          training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript",
            normalize_english_text_udf(transcripts_df.transcript))
        ctm_df = ctm_df.join(transcripts_df,
                             ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"),
                         ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()

        sys.stdout.flush()

        alignments_df.write.mode("overwrite").format("json").save(
            alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)

        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            &
            (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )

        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl")))
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl")))
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier
                    == filtered_licenseurl_df.identifier)
                & (alignments_df.text_document_id
                   == filtered_licenseurl_df.text_document_id),
                how="inner",
            )
            alignments_df = alignments_df.drop(
                filtered_licenseurl_df.identifier).drop(
                    filtered_licenseurl_df.text_document_id)

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
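        # For example (hypothetical values): end_ms=[5000, 9000] and
        # start_ms=[1000, 6000] yield duration_ms=[4000, 3000].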
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType([
                        T.StructField("cer", T.FloatType()),
                        T.StructField("end_ms", T.LongType()),
                        T.StructField("label", T.StringType()),
                        T.StructField("start_ms", T.LongType()),
                        T.StructField("wer", T.FloatType()),
                        T.StructField("duration_ms", T.LongType()),
                    ]))),
        )

        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment:
                (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType([
                    T.StructField("cer", T.ArrayType(T.FloatType())),
                    T.StructField("end_ms", T.ArrayType(T.LongType())),
                    T.StructField("label", T.ArrayType(T.StringType())),
                    T.StructField("start_ms", T.ArrayType(T.LongType())),
                    T.StructField("wer", T.ArrayType(T.FloatType())),
                    T.StructField("duration_ms", T.ArrayType(T.LongType())),
                ])),
        )

        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(
                F.expr(
                    "aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"
                )) / 1000.0 / 60.0 / 60.0).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1] # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")
        audio_df = (spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length"))

        alignments_audio_df = alignments_df.join(audio_df,
                                                 audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory
        # Remove "." because it has special meaning in the webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient)
        name = F.concat(F.col("identifier"), F.lit("/"),
                        F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        rows = alignments_audio_df.select(name.alias("name")).collect()
        for row in rows:
            assert len(row.name) < 4096
            for chunk in row.name.split("/"):
                assert len(chunk) < 256
        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        # )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(
                F.arrays_zip("aligned_chunks.audio_name",
                             "aligned_chunks.audio"))).select(
                                 "col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias(
                    "duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)

        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir,
                                          "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir,
                                "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition",
            number_of_rows)
        spark2.conf.set("spark.sql.files.minPartitionNum",
                        FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)#.limit(100)

        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards,
                                             F.col("key"))
        # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(
            tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)
        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys

        # sys.stdout.flush()
        # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir,
                                            "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )))
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) /
             1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df,
                F.col("audio_filepath") == F.col("key")).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir)

    single_manifest_dir = os.path.join(FLAGS.work_dir,
                                       "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")
    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(
            single_manifest_dir)

        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(
            single_tar_dir)
Example #11
df_par.printSchema()

block_size = str(1024 * 1024 * 512)
sc._jsc.hadoopConfiguration().set("dfs.block.size", block_size)
sc._jsc.hadoopConfiguration().set("parquet.block.size", block_size)

s3_location_target = 's3://move-dataeng-temp-dev/glue-etl/parquet_block_poc/omtr_pq_block_512'
  
output_folder = s3_location_target # With absolute path
print('output_folder= %s' % (output_folder))
#----  PySpark section ----
from pyspark.sql.functions import lit
from pyspark.sql.functions import reverse, split
#---
df_with_hour = df_par.withColumn("hour", split(reverse(split(reverse(df_par.etl_source_filename), '/')[1] ),'=')[1].cast("string")) 

df_with_day = df_with_hour.withColumn("day", split(reverse(split(reverse(df_with_hour.etl_source_filename), '/')[2] ),'=')[1].cast("string")) 

df_with_month = df_with_day.withColumn("month", split(reverse(split(reverse(df_with_day.etl_source_filename), '/')[3] ),'=')[1].cast("string")) 

df_with_partitions = df_with_month.withColumn("year", split(reverse(split(reverse(df_with_month.etl_source_filename), '/')[4] ),'=')[1].cast("string")) 


#----
codec='snappy'  
partitionby=['year', 'month','day', 'hour']
df = df_with_partitions.filter((df_with_partitions.day.cast('Integer') < 2 ) & ( df_with_partitions.day.cast('Integer') > 0))
#df.repartition(*partitionby).write.partitionBy("hour").mode('overwrite').parquet(output_folder, compression=codec) 
df.repartition(*partitionby).write.partitionBy(['year', 'month','day', 'hour']).mode('overwrite').parquet(output_folder, compression=codec) 
#df_with_partitions.repartition(*partitionby).write.partitionBy(['year', 'month','day', 'hour']).mode('overwrite').parquet(output_folder, compression=codec) 
Example #12
def compile_reverse(t, expr, scope, **kwargs):
    op = expr.op()

    src_column = t.translate(op.arg, scope)
    return F.reverse(src_column)
Example #13
def addUserFeatures(samplesWithMovieFeatures: DataFrame) -> DataFrame:
    """
    Added columns, explained:
    --- All of the following are historical-behavior features computed **before** the rating being scored ---
    1. userPositiveHistory: collect the user's positive ratings into a list. A positive rating is one scored > 3.5
       (taken to mean the user liked the movie). A sliding window gathers only events from before the current
       rating, so no future information is collected.
    2. F.reverse reverses the sequence from step 1, so the most recent ratings come first.
    3. userRatedMovie[0~4]: the 5 movies the user rated most recently.
    4. userRatingCount:                 total number of ratings by the user
    5. userRatedMovieAvgReleaseYear:    average release year of the movies the user has rated
    6. userRatedMovieReleaseYearStddev: unbiased standard deviation of those release years
    7. userAvgRating:                   the user's average rating
    8. userRatingStddev:                unbiased standard deviation of the user's ratings
    9. userGenres:                      aggregated genres of the movies the user has watched
    10. userGenre[0~4]:                 genres of the user's 5 most recently watched movies
    --- Cleanup of columns that are no longer useful ---
    1. drop:
        (1) genres:              the raw movie genres carry no meaning as a historical-behavior feature, so drop them
        (2) userGenres:          time-ordered list of recently watched genres; the first 5 are already extracted, so the original column can be dropped
        (3) userPositiveHistory: time-ordered positive-rating history; the first 5 are already extracted, so the original column can be dropped
    2. filter:
        drop each user's very first rating, since there is no history before it (the cold-start portion)
    :param samplesWithMovieFeatures
    :return: samplesWithUserFeatures
    """
    samplesWithUserFeatures = samplesWithMovieFeatures \
        .withColumn('userPositiveHistory',
                    F.collect_list(F.when(F.col('label') == 1, F.col('movieId')).otherwise(F.lit(None))).over(
                        Window.partitionBy('userId').orderBy(F.col('timestamp')).rowsBetween(-100, -1)
                    )) \
        .withColumn('userPositiveHistory', F.reverse(F.col('userPositiveHistory'))) \
        .withColumn('userRatedMovie0', F.col('userPositiveHistory')[0]) \
        .withColumn('userRatedMovie1', F.col('userPositiveHistory')[1]) \
        .withColumn('userRatedMovie2', F.col('userPositiveHistory')[2]) \
        .withColumn('userRatedMovie3', F.col('userPositiveHistory')[3]) \
        .withColumn('userRatedMovie4', F.col('userPositiveHistory')[4]) \
        .withColumn('userRatingCount',
                    F.count(F.lit(1)).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    ) \
        .withColumn('userRatedMovieAvgReleaseYear',
                    F.avg(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    .cast(IntegerType())) \
        .withColumn('userRatedMovieReleaseYearStddev', F.format_number(
                    F.stddev(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                    NUMBER_PRECISION)) \
        .withColumn('userAvgRating', F.format_number(
                    F.avg(F.col('rating')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                    NUMBER_PRECISION)) \
        .withColumn("userRatingStddev", F.format_number(
                    F.stddev(F.col("rating")).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                    NUMBER_PRECISION)) \
        .withColumn("userGenres", F.udf(extractGenresUDF, ArrayType(StringType()))(
                    F.collect_list(F.when(F.col('label') == 1, F.col('genres')).otherwise(F.lit(None))).over(
                        Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    )) \
        .withColumn("userGenre0", F.col("userGenres")[0]) \
        .withColumn("userGenre1", F.col("userGenres")[1]) \
        .withColumn("userGenre2", F.col("userGenres")[2]) \
        .withColumn("userGenre3", F.col("userGenres")[3]) \
        .withColumn("userGenre4", F.col("userGenres")[4]) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    return samplesWithUserFeatures
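A minimal, self-contained sketch of just the sliding-window "positive history" step that Examples #6, #8, and #13 share (toy ratings, assumed column names): only rows strictly before the current one are collected, only label == 1 survives, and F.reverse puts the most recent movie first.

from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[1]").getOrCreate()
ratings = spark.createDataFrame(
    [(1, 10, 1, 1), (1, 20, 0, 2), (1, 30, 1, 3), (1, 40, 1, 4)],
    ["userId", "movieId", "label", "timestamp"])
window = Window.partitionBy("userId").orderBy("timestamp").rowsBetween(-100, -1)
history = F.reverse(
    F.collect_list(F.when(F.col("label") == 1, F.col("movieId"))).over(window))
# The row with timestamp 4 sees userPositiveHistory = [30, 10].
ratings.withColumn("userPositiveHistory", history).show(truncate=False)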
Example #14
# In[40]:

import pyspark.sql.functions as f

# In[58]:

df.select(f.collect_set(df['state'])).collect()

# In[62]:

df.select(f.countDistinct('state').alias('states')).show()

# In[70]:

df.select(f.md5('street').alias('hash')).collect()

# In[72]:

df.select(f.reverse(df.state).alias('state-reverse')).collect()
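
# f.reverse flips a string column character by character; since Spark 2.4 the same
# function also reverses array columns, e.g. (illustrative, reusing this df):

df.select(f.reverse(f.split(df.street, ' ')).alias('street_words_reversed')).collect()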

# In[75]:

df.select(f.soundex(df.name).alias('soundex')).collect()

# In[76]:

spark.stop()

# In[ ]:
Example #15
def main(argv):
    spark = SparkSession.builder \
                        .master("local[1]") \
                        .appName("Forced Aligner") \
                        .config("spark.sql.execution.arrow.pyspark.enabled", "true")\
                        .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1")\
                        .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")\
                        .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")\
                        .config("spark.driver.memory", "7g")\
                        .config("spark.executor.memory", "7g")\
                        .config("spark.task.maxFailures", "2")\
                        .getOrCreate()
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)
    pyspark.java_gateway.ensure_callback_server_started(
        spark.sparkContext._gateway)
    # spark.sparkContext._gateway.start_callback_server()
    listener = WriteTaskEndListener()
    spark.sparkContext._jsc.sc().addSparkListener(listener)

    vad_out_dir = os.path.join(FLAGS.work_dir, "vad_pcm_tfrecords")
    if FLAGS.stage <= 0:
        audio_df = load_audio_files(spark, FLAGS.input_dir)
        vad_udf = prepare_vad_udf(num_padding_frames=10,
                                  threshold=0.5,
                                  aggressiveness=0,
                                  frame_duration_ms=30)
        vad_df = audio_df.withColumn(
            "vad", vad_udf(audio_df.content, audio_df.format))
        vad_df = vad_df.withColumn("num_utterances_in_audio_document",
                                   F.size(vad_df.vad.voiced_buffer))

        exploded_voiced_buffer_df = vad_df.select(
            vad_df.audio_document_id, vad_df.int64_audio_document_id,
            vad_df.num_utterances_in_audio_document,
            F.posexplode(vad_df.vad.voiced_buffer))

        tfrecord_df = exploded_voiced_buffer_df.select(
            exploded_voiced_buffer_df.audio_document_id,
            exploded_voiced_buffer_df.int64_audio_document_id,
            exploded_voiced_buffer_df.col.alias("frames"),
            lit("-").alias("transcript"),
            F.concat_ws("-", exploded_voiced_buffer_df.audio_document_id,
                        exploded_voiced_buffer_df.pos).alias("uttid"),
            F.monotonically_increasing_id().alias("int64_uttid"),
            exploded_voiced_buffer_df.num_utterances_in_audio_document,
        )

        tfrecord_df = tfrecord_df.withColumn(
            "frames",
            F.expr("transform(frames, x -> float(x) * float(1./32768.))"))
        tfrecord_df.printSchema()

        tfrecord_df.write.mode("overwrite").format("tfrecord").option(
            "recordType", "Example").save(vad_out_dir)

    if FLAGS.stage <= 1:
        # TODO: Compute this automatically
        # https://stackoverflow.com/questions/44082957/how-to-add-a-sparklistener-from-pyspark-in-python
        num_samples_written = listener.value
        if num_samples_written == 0:
            num_samples = spark.read.format("tfrecord").option(
                "recordType", "Example").load(vad_out_dir).count()
        else:
            num_samples = num_samples_written

        # print(f"GALVEZ:num_samples_written={num_samples_written}")
        # print(f"GALVEZ:num_samples={num_samples}")
        # assert num_samples_written == num_samples

        # from IPython import embed; embed()
        # num_samples = 100_000
        # return

        # ctpu_up = subprocess.run(shlex.split("ctpu up -name forced-aligner-tpu -tpu-only -tpu-size v3-8 -tf-version 2.2"))

        TPU_IP = "10.240.1.2"

        # model_dir = "gs://the-peoples-speech-west-europe/PeoplesSpeech/ag_training/1127"
        model_dir = FLAGS.align_model_dir
        # model = "asr.inference_only.InferenceOnly"
        model = "asr.librispeech_ctc.TpuDecoderLibrispeech960Base"

        logits_dir = os.path.join(FLAGS.work_dir, "logits")

        def compute_max_steps(model_dir):
            # That the "train" directory is where the saved models are
            # stored is particular to lingvo. I don't expect this magic
            # constant to change.
            checkpoint_path = tf.train.latest_checkpoint(
                os.path.join(model_dir, "train"))
            step_pattern = r'-(\d+)$'
            checkpoint_step = int(
                re.search(step_pattern, checkpoint_path).group(1))
            max_steps = checkpoint_step + 1
            return max_steps

        #input.file_datasource.file_pattern:part-00000-8853e74a-fd03-46dc-affd-5c2ef87be96c-c000.tfrecord
        #part-00000-c4f0eb22-8f1e-45e2-9437-889428d09bf8-c000.tfrecord
        with tempfile.NamedTemporaryFile("w+") as fh:
            fh.write(f"""\
      input.file_datasource.file_pattern_prefix:{vad_out_dir}
      input.file_datasource.file_pattern:*.tfrecord
      input.num_samples:{num_samples}
      task.log_softmax_output_directory:{logits_dir}
      train.max_steps:{compute_max_steps(model_dir)}
      """)
            # This flush() is required. Otherwise, lingvo/trainer will see
            # an empty params file.
            fh.flush()

            # TODO: Make lingvo:trainer a dependency in the BUILD file. This is silly.
            subprocess.check_call(
                shlex.split(f"""
      lingvo/trainer --logdir={model_dir} \
      --model={model} \
      --logtostderr \
      --tpu=grpc://{TPU_IP}:8470 \
      --job=executor_tpu \
      --lingvo_executor_skip_saving_upon_stop \
      --model_params_file_override={fh.name}
      """))

    if FLAGS.stage <= 2:
        catalogue_df = spark.read.format('json').schema(
            ARCHIVE_ORG_SCHEMA).load(FLAGS.input_catalogue)
        load_transcripts(spark, FLAGS.input_dir, collected_text_document_rows)

        log_probabilities_schema = StructType([
            StructField("int64_uttid", IntegerType()),
            StructField("log_probabilities", ArrayType(FloatType(), True))
        ])

        # log_probabilities_df = spark.read.format("tfrecord").schema(log_probabilities_schema).load(logits_dir)
        log_probabilities_df = spark.read.format("tfrecord").load(logits_dir)
        vad_df = spark.read.format("tfrecord").load(vad_out_dir)
        uttid_integer_mapping_df = vad_df.select(vad_df.int64_uttid,
                                                 vad_df.uttid)
        log_probabilities_df = log_probabilities_df.join(
            uttid_integer_mapping_df, log_probabilities_df.int64_uttid ==
            uttid_integer_mapping_df.int64_uttid, 'inner')
        log_probabilities_df = log_probabilities_df.drop(
            log_probabilities_df.int64_uttid)

        split_col = F.split(F.reverse(log_probabilities_df.uttid), '-', 2)
        log_probabilities_df = log_probabilities_df.withColumn(
            'document_id', split_col.getItem(1))
        log_probabilities_df = log_probabilities_df.withColumn(
            'utterance_id',
            split_col.getItem(0).cast(IntegerType()))
        log_probabilities_df = log_probabilities_df.groupBy('document_id').agg(
            collect_list("log_probabilities"), collect_list("utterance_id"))
        # TODO: Sort each array by utterance_id. array_sort lexicographically with a Struct?

        log_probabilities_df.join(
            text_df,
            col("log_probabilities_df.document_id") == col(
                "transcript_df.document_id"), 'inner')

    if FLAGS.stage <= 3:
        generate_lm_udf = prepare_generate_lm_udf(
            "/install/kenlm/build/bin/",
            "/development/lingvo-source/tmpworkdir",
            FLAGS.mozilla_ds_alphabet_txt)
        df = spark.read.format("json").load(
            "/home/ws15dgalvez/dumpblahblah.json")
        rows = df.select(generate_lm_udf(df.transcript, df.id)).head(1)
        from IPython import embed
        embed()
Example #16
block_size = str(1024 * 1024 * 512)
sc._jsc.hadoopConfiguration().set("dfs.block.size", block_size)
sc._jsc.hadoopConfiguration().set("parquet.block.size", block_size)

s3_location_target = 's3://move-dataeng-temp-dev/glue-etl/parquet_block_poc/omtr_pq_block_512'

output_folder = s3_location_target  # With absolute path
print('output_folder= %s' % (output_folder))
#----  PySpark section ----
from pyspark.sql.functions import lit
from pyspark.sql.functions import reverse, split
#---
df_with_hour = df_par.withColumn(
    "hour",
    split(reverse(split(reverse(df_par.etl_source_filename), '/')[1]),
          '=')[1].cast("string"))

df_with_day = df_with_hour.withColumn(
    "day",
    split(reverse(split(reverse(df_with_hour.etl_source_filename), '/')[2]),
          '=')[1].cast("string"))

df_with_month = df_with_day.withColumn(
    "month",
    split(reverse(split(reverse(df_with_day.etl_source_filename), '/')[3]),
          '=')[1].cast("string"))

df_with_partitions = df_with_month.withColumn(
    "year",
    split(reverse(split(reverse(df_with_month.etl_source_filename), '/')[4]),
          '=')[1].cast("string"))
Example #17
                               sf.col('tid').alias('trackingId'),
                               sf.col('status'),
                               sf.col('paymode'))

display(bookings)

# COMMAND ----------

# DBTITLE 1,Rough
int(datetime.strftime(startDate, '%Y%m'))

# COMMAND ----------

filterInput = t[0].inputData.filterInput
filterInput.schema
# for f in filterInput:
#   print(f)

# COMMAND ----------

[k for k,v in filterInput.asDict().items() if not(v is None or (type(v) == list and len(v) == 0))]

# COMMAND ----------

_s = "dom_base_upr_default"
_s[::-1].split('_')[0][::-1]

# COMMAND ----------

sf.split(sf.reverse("dom_base_upr_default"), "_")[0]
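
# Note: the Python one-liner above returns "default", while the Spark expression
# returns the reversed token, and a bare string argument is treated as a column
# name rather than a literal. A sketch that mirrors the Python version:
sf.reverse(sf.split(sf.reverse(sf.lit("dom_base_upr_default")), "_")[0])
# or, equivalently, take the last element of the ordinary split:
sf.element_at(sf.split(sf.lit("dom_base_upr_default"), "_"), -1)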
Example #18
def process_indicator_data(spark, input_data, output_data):
    """ filepath to wdi data file """
    wdi_data = input_data + 'WDIData.csv'
    """ read wdi data file """
    df_wdi = spark.read.csv(wdi_data, header='true')
    df_wdi.persist()
    """ clean data - remove null columns """
    count_not_null = df_wdi.agg(*[count(c).alias(c) for c in df_wdi.columns])
    not_null_cols = [c for c in count_not_null.columns if count_not_null[[c]].first()[c] > 0]
    df_wdi = df_wdi.select(*not_null_cols)
    """ melt data """
    fixed_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
    df_wdi = melt_df(df_wdi, fixed_columns, 'year')

    """ filepath to world happiness data file """
    happiness_data = input_data + 'world-happiness/'
    """ read happiness data file """
    df_happ = spark.read.csv(happiness_data, header='true').withColumn("file", input_file_name())
    df_happ.persist()
    """ clean and transform data """
    df_happ = df_happ.withColumn("year", reverse(split(reverse(df_happ.file), '/')[0])[0:4])
    df_happ = df_happ.drop('file')
    df_happ = df_happ.toDF(*(c.replace(' ', '') for c in df_happ.columns))
    """ melt data """
    fixed_columns = ['Countryorregion', 'year']
    df_happ = melt_df(df_happ, fixed_columns, 'indicator')

    """ filepath to unece data file """
    unece_data = input_data + 'unece.json'
    """ read unece data file """
    df_unece = spark.read.json(unece_data)
    df_unece.persist()
    """ clean data """
    df_unece = df_unece.toDF(*(c.replace(',', '') for c in df_unece.columns))
    df_unece = df_unece.toDF(*(c.replace('.', '') for c in df_unece.columns))
    """ melt data """
    fixed_columns = ['Country', 'Year']
    df_unece = melt_df(df_unece, fixed_columns, 'indicator')

    #### -------------- DIM TIME ----------------- ####
    """ transform and extract columns to create time table """
    df_time = df_wdi.select('year').dropDuplicates()
    df_time = df_time.withColumn("year", df_time["year"].cast(IntegerType()))

    get_century = udf(lambda x : (x - 1) // 100 + 1)
    df_time = df_time.withColumn("century", get_century(df_time.year))

    get_decade = udf(lambda x : int(str(x)[2]) * 10)
    df_time = df_time.withColumn("decade", get_decade(df_time.year))

    time_table = df_time["year", "decade", "century"]

    """ write time table to parquet files """
    time_table.write.parquet(output_data + 'dim_time.parquet', mode = 'overwrite')
    #### -------------- DIM TIME ----------------- ####

    #### ------------ DIM INDICATOR -------------- ####
    """ extract wdi indicators for indicator table """
    indicator_table_wdi = df_wdi.select(col('Indicator Code').alias('code'), \
        col('Indicator Name').alias('name')).dropDuplicates()

    """ transform and extract column group from column code """
    indicator_table_wdi = indicator_table_wdi.withColumn('group', split(indicator_table_wdi.code, r'\.').getItem(0))

    """ extract happiness indicators for indicator table """
    indicator_table_happ = df_happ.select(col('indicator').alias('code'), \
        col('indicator').alias('name')).dropDuplicates()
    indicator_table_happ = indicator_table_happ.withColumn('group', lit('people_happines'))

    """ extract unece indicators for indicator table """
    indicator_table_unece = df_unece.select(col('indicator').alias('code'), \
        col('indicator').alias('name')).dropDuplicates()
    indicator_table_unece = indicator_table_unece.withColumn('group', lit('unece_indicator'))

    """ write indicators table to parquet files """
    indicator_table = indicator_table_wdi.union(indicator_table_happ)
    indicator_table.write.parquet(output_data + 'dim_indicator.parquet', mode = 'overwrite')
    #### ------------ DIM INDICATOR -------------- ####

    #### -------------- FACT SCORE --------------- ####

    # create score table from the melted wdi data
    df_score = df_wdi.select(col('year').alias('dim_time_year'), col('Country Code'),
                             col('Country Name'), col('Indicator Code').alias('dim_indicator_code'),
                             col("value").cast(DoubleType()))

    # read country parquet files to get the country code / name mapping
    country_df = spark.read.parquet(output_data + 'dim_country.parquet')
    df_score = df_score.join(country_df.select("code", "name"), country_df['code'] == df_score['Country Code'])

    # unece score
    df_unece = df_unece.join(df_score.select("code", "name"), df_unece.Country == df_score.name)
    df_unece_score = df_unece.select(col('Year').alias('dim_time_year'), col('code').alias('dim_country_code'),
                                     col('indicator').alias('dim_indicator_code'), col("value").cast(DoubleType()))

    # happiness score
    df_happ = df_happ.join(df_score.select("code", "name"), df_happ.Countryorregion == df_score.name)
    df_happ_score = df_happ.select(col('year').alias('dim_time_year'), col('code').alias('dim_country_code'),
                                   col('indicator').alias('dim_indicator_code'), col("value").cast(DoubleType()))

    # union of all indicator scores
    df_score = df_score.select(col('dim_time_year'), col('code').alias('dim_country_code'),
                               col('dim_indicator_code'), col("value").cast(DoubleType()))
    score_table = df_score.union(df_unece_score).union(df_happ_score)

    score_table = score_table.withColumn("score_id", monotonically_increasing_id())
    # write score table to parquet files partitioned by year
    score_table.write.partitionBy('dim_time_year').parquet(output_data + 'fact_score.parquet', mode='overwrite')
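process_indicator_data calls a melt_df helper that is not shown in this example. A minimal sketch of what such a helper might look like, assuming the signature melt_df(df, id_cols, var_name) seen above and a "value" output column (the real implementation may differ):

from pyspark.sql import functions as F

def melt_df(df, id_cols, var_name, value_name="value"):
    # Hypothetical reconstruction: unpivot every non-id column into
    # (var_name, value_name) pairs, one output row per melted column.
    value_cols = [c for c in df.columns if c not in id_cols]
    kv = F.explode(F.array(*[
        F.struct(F.lit(c).alias(var_name), F.col(c).cast("string").alias(value_name))
        for c in value_cols
    ])).alias("kv")
    return df.select(*id_cols, kv).select(*id_cols, "kv." + var_name, "kv." + value_name)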
Ejemplo n.º 19
0
def _reverse(col, args):
    return F.reverse(F.col(col))
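A brief usage sketch for the wrapper above, assuming F is pyspark.sql.functions and that args is simply ignored for this operation (the DataFrame here is made up for illustration):

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([("abc",), ("xyz",)], ["name"])
df.select(_reverse("name", []).alias("reversed_name")).show()  # -> cba, zyx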
Ejemplo n.º 20
0
    def prepare_final_df(self):
        self.df = self.df.select("date", "startDatetime",
                                 "message.callInfo.name", "message.callInfo")
        info = self.df.callInfo

        selected = self.df.select(
            "name",
            "startDatetime",
            info.callType.alias("callType"),
            info.distributedInstances.alias("distributedInstances"),
            info.endpointRecording.alias("endpointRecording"),
            info.lockState.alias("lockState"),
            info.participants.alias("participants"),
            info.recording.alias("recording"),
            info.streaming.alias("streaming"),
            info.joinAudioMuteOverride.alias("joinAudioMute"),
            "date",
        ).filter(func.col("name").startswith("["))

        grouped = selected.groupBy("name", "startDatetime").agg(
            func.sort_array(func.collect_list("date")).alias("date_array"),
            func.collect_list("recording").alias("recording_array"),
            func.collect_list("streaming").alias("streaming_array"),
            func.collect_list("lockState").alias("lockState_array"),
            func.reverse(
                func.collect_list("callType")).getItem(0).alias("callType"),
            func.reverse(func.collect_list("participants")).getItem(0).cast(
                IntegerType()).alias("current_participants"),
            func.collect_list("participants").alias("participant_array"),
        )

        grouped.printSchema()

        preprocessed = (
            grouped.withColumn("datetime", self.helper.get_last_date_udf(grouped.date_array))
            .withColumn("meeting_name", func.col("name"))
            .withColumn("time_diff", self.helper.get_time_diff_udf(grouped.date_array))
            .withColumn("recording", self.helper.get_if_active_udf(grouped.recording_array))
            .withColumn("streaming", self.helper.get_if_active_udf(grouped.streaming_array))
            .withColumn("locked", self.helper.get_if_locked_udf(grouped.lockState_array))
            .withColumn("cospace", self.helper.get_if_cospace_udf(grouped.callType))
            .withColumn("adhoc", self.helper.get_if_adhoc_udf(grouped.callType))
            .withColumn("lync_conferencing", self.helper.get_if_lync_udf(grouped.callType))
            .withColumn("forwarding", self.helper.get_if_forwarding_udf(grouped.callType))
            .withColumn("max_participants", self.helper.get_max_udf(grouped.participant_array))
            .withColumn("mean_participants", self.helper.get_mean_udf(grouped.participant_array))
            .withColumn("start_datetime", grouped.startDatetime.cast(TimestampType()))
            .select(
                "datetime",
                "time_diff",
                "start_datetime",
                "recording",
                "streaming",
                "locked",
                "cospace",
                "adhoc",
                "lync_conferencing",
                "forwarding",
                "current_participants",
                "mean_participants",
                "max_participants",
                "meeting_name",
            )
        )

        preprocessed.printSchema()

        return self.do_post_preprocessing(preprocessed)
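prepare_final_df leans on a self.helper object whose UDFs (get_last_date_udf, get_if_active_udf, and the rest) are not shown here. A minimal sketch of how two of them might be defined, purely as an assumption about their behaviour:

from pyspark.sql import functions as func
from pyspark.sql.types import BooleanType, StringType

# Hypothetical helper UDFs matching the names used above; the real implementations may differ.
get_last_date_udf = func.udf(lambda dates: dates[-1] if dates else None, StringType())
get_if_active_udf = func.udf(lambda flags: any(str(f).lower() == "true" for f in flags),
                             BooleanType())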