def load_transcripts(
    spark, base_path: str, collected_text_document_rows: List[pyspark.Row]
):
    def fix_name(identifier, text_document_id):
        if (
            identifier == "gov.house.hbs.hrs05H_A1310_100511"
            and text_document_id == "hrs05H_A1310_100511.asr.srt"
        ):
            return "hrs05H_A1310_100511.auto.srt"
        else:
            return text_document_id

    # TODO: Upload this file
    with open("/development/lingvo-source/missing_files.json", "r") as fh:
        missing_text_document_ids = set(json.load(fh))
    text_document_ids = [
        os.path.join(
            base_path, row.identifier, fix_name(row.identifier, row.text_document_id)
        )
        for row in collected_text_document_rows
    ]
    text_document_ids = [
        tid for tid in text_document_ids if tid not in missing_text_document_ids
    ]
    srt_df = spark.read.format("binaryFile").load(text_document_ids)
    # Note the duplication with load_audio_files
    return srt_df.select(
        srt_to_text(fix_text_udf(srt_df.content)).alias("transcript"),
        F.reverse(F.split(srt_df.path, "/"))[0].alias("text_document_id"),
        F.reverse(F.split(srt_df.path, "/"))[1].alias("identifier"),
    )
def load_audio_files(spark, base_path: str):
    raw_audio_df = (
        spark.read.format("binaryFile")
        .option("pathGlobFilter", "*.mp3")
        .option("recursiveFileLookup", "true")
        .load(base_path)
    )
    return raw_audio_df.select(
        'content',
        F.reverse(F.split(raw_audio_df.path, "[.]"))[0].alias("format"),
        # We will have repeats with this form of ID... It does not fulfill the purpose of a primary key...
        # 44635 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/01-Ml.Z.Ragi-JinnandJadoo18.05.05.asr.srt
        # 53884 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/02-Ml.Z.Ragi-JinnandJadoo25.05.05.asr.srt
        # 55971 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/03-Ml.Z.Ragi-JinnandJadoo01.06.05.asr.srt
        # 48287 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/04-Ml.Z.Ragi-JinnandJadoo08.06.05.asr.srt
        # 44184 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/05-Ml.Z.Ragi-JinnandJadoo22.06.05.asr.srt
        # 29040 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/06-Ml.Z.Ragi-JinnandJadoo29.06.05.asr.srt
        # 53849 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/07-Ml.Z.Ragi-JinnandJadoo20.07.05.asr.srt
        # 54745 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/08-Ml.Z.Ragi-JinnandJadoo27.07.05.asr.srt
        # 44990 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/09-Ml.Z.Ragi-JinnandJadoo03.08.05.asr.srt
        # 47756 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/10-Ml.Z.Ragi-JinnandJadoo10.08.05.asr.srt
        # 46275 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/11-Ml.Z.Ragi-JinnandJadoo07.09.05.asr.srt
        # 35660 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/12-Ml.Z.Ragi-JinnandJadoo14.09.05.asr.srt
        # 50201 gs://the-peoples-speech-west-europe/archive_org/Nov_6_2020/ALL_CAPTIONED_DATA/07Ml.Z.RagiJinnandJadoo20.07.05/13-Ml.Z.Ragi-JinnandJadoo21.09.05.asr.srt
        # I probably ought to use the non-format part of the final file path... That would work.
        F.reverse(F.split(raw_audio_df.path, "/"))[1].alias("audio_document_id"),
        F.monotonically_increasing_id().alias("int64_audio_document_id"))
def load_audio_files(spark, collected_audio_document_rows, base_path: str):
    audio_document_ids = [
        os.path.join(base_path, row.identifier, row.audio_document_id)
        for row in collected_audio_document_rows
    ]
    raw_audio_df = spark.read.format("binaryFile").load(audio_document_ids)
    return raw_audio_df.select(
        "content",
        F.reverse(F.split(raw_audio_df.path, "[.]"))[0].alias("format"),
        F.reverse(F.split(raw_audio_df.path, "/"))[0].alias("audio_document_id"),
        F.reverse(F.split(raw_audio_df.path, "/"))[1].alias("identifier"),
        F.monotonically_increasing_id().alias("int64_audio_document_id"),
    )
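# A minimal, self-contained sketch (not part of the pipeline above) of the
# reversed-split pattern used in load_audio_files/load_transcripts:
# F.reverse(F.split(path, "/"))[0] is the file name and [1] is its parent
# directory. The local SparkSession and the sample path are assumptions made
# purely for illustration.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
demo_df = spark.createDataFrame(
    [("gs://bucket/some_identifier/audio_01.mp3",)], ["path"]
)
demo_df.select(
    F.reverse(F.split(F.col("path"), "/"))[0].alias("file_name"),    # audio_01.mp3
    F.reverse(F.split(F.col("path"), "/"))[1].alias("identifier"),   # some_identifier
    F.reverse(F.split(F.col("path"), "[.]"))[0].alias("format"),     # mp3
).show(truncate=False)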
def calculate_touchpoints(input_df):
    w1 = Window\
        .partitionBy("fullVisitorId")\
        .orderBy("timestamp")
    first_touchpoint = first(col("trafficSource_source")).over(w1)
    return input_df\
        .orderBy("timestamp")\
        .selectExpr("*", "collect_list(trafficSource_source) over (partition by fullVisitorId) as touchpoints")\
        .withColumn("touchpoints_wo_direct", expr("filter(touchpoints, x -> x != '(direct)')"))\
        .orderBy("timestamp")\
        .select("*",
                first_touchpoint.alias("first_touchpoint"),
                when(reverse(col("touchpoints_wo_direct"))[0].isNotNull(),
                     reverse(col("touchpoints_wo_direct"))[0])
                .otherwise("(direct)").alias("last_touchpoint"))
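# A hedged alternative sketch for the last_touchpoint expression in
# calculate_touchpoints above: F.coalesce covers the empty-array case instead
# of when/otherwise. The column name "touchpoints_wo_direct" is taken from the
# snippet; this is illustrative, not the original implementation.
from pyspark.sql import functions as F

last_touchpoint = F.coalesce(
    F.reverse(F.col("touchpoints_wo_direct"))[0],  # most recent non-direct source, if any
    F.lit("(direct)"),                             # otherwise attribute to direct traffic
)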
def prepare_final_df(self): self.df = self.df.select("date", func.explode("message.updates")) selected = self.df.select("date", "col.startDatetime", "col.updateType", "col.name", "col.finished").filter( func.col("name").startswith("[")) grouped = selected.groupBy("name", "startDatetime").agg( func.sort_array(func.collect_list("date")).alias("date_array"), func.collect_list("updateType").alias("updateType_array"), func.reverse( func.collect_list("finished")).getItem(0).alias("finished"), ) preprocessed = (grouped.withColumn( "start_datetime", grouped.startDatetime.cast(TimestampType())).withColumn( "last_update", self.helper.get_last_date_udf(grouped.date_array)).withColumn( "finished", func.col("finished").cast(BooleanType())).withColumn( "meeting_name", func.col("name")).select( "start_datetime", "last_update", "finished", "meeting_name").withColumn( "duration", func.col("last_update").cast(LongType()) - func.col("start_datetime").cast(LongType()), )) preprocessed.printSchema() return self.do_post_preprocessing(preprocessed)
def addUserFeatures(df: DataFrame) -> DataFrame:
    """
    Extract user features.
    :param df:
    :return:
    """
    extractGenresUdf = udf(extractGenres, returnType=ArrayType(IntegerType()))
    print('start user feature')
    samplesWithUserFeatures = df.withColumn('userPositiveHistory', collect_list(when(col('label') == 1, col('movieId')).otherwise(lit(None))).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userPositiveHistory', reverse(col('userPositiveHistory'))) \
        .withColumn("userRatedMovie1", col("userPositiveHistory").getItem(0)) \
        .withColumn("userRatedMovie2", col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", col("userPositiveHistory").getItem(4)) \
        .withColumn('userRatingCount', count(lit(1)).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userAvgReleaseYear', avg(col('releaseYear')).over(Window.partitionBy("userId").orderBy(col('timestamp')).rowsBetween(-100, -1)).cast('int')) \
        .withColumn('userReleaseYearStddev', stddev(col('releaseYear')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1))) \
        .withColumn('userAvgRating', format_number(avg(col('rating')).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1)), 2)) \
        .withColumn('userRatingStddev', stddev(col("rating")).over(Window.partitionBy("userId").orderBy(col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn('userGenres', extractGenresUdf(collect_list(when(col('label') == 1, col('genres')).otherwise(lit(None))).over(Window.partitionBy('userId').orderBy(col('timestamp')).rowsBetween(-100, -1)))) \
        .na.fill(0) \
        .withColumn("userRatingStddev", format_number(col("userRatingStddev"), 2)) \
        .withColumn("userReleaseYearStddev", format_number(col("userReleaseYearStddev"), 2)) \
        .withColumn("userGenre1", col("userGenres").getItem(0)) \
        .withColumn("userGenre2", col("userGenres").getItem(1)) \
        .withColumn("userGenre3", col("userGenres").getItem(2)) \
        .withColumn("userGenre4", col("userGenres").getItem(3)) \
        .withColumn("userGenre5", col("userGenres").getItem(4)) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(col('userRatingCount') > 1)

    # samplesWithUserFeatures.printSchema()
    samplesWithUserFeatures.show(10, truncate=True)

    return samplesWithUserFeatures
def load_transcripts(spark, base_path: str,
                     collected_text_document_rows: List[pyspark.Row]):
    text_document_ids = [
        os.path.join(base_path, row.identifier, row.text_document_id)
        for row in collected_text_document_rows
    ]
    text_document_ids = [
        path for path in text_document_ids if "[" not in path and "]" not in path
    ]
    # "[" and "]" are wildcard (glob) characters. GCS has very poor support
    # for these. Namely, you can write them but not read them back. More
    # resources here: https://github.com/galv/lingvo-copy/issues/18
    # I simply filter out any files containing these characters for now.
    srt_df = spark.read.format("binaryFile").load(text_document_ids)
    # Note the duplication with load_audio_files
    return srt_df.select(
        srt_to_text(fix_text_udf(srt_df.content)).alias('transcript'),
        F.reverse(F.split(srt_df.path, "/"))[1].alias("id"))
def add_user_features(data):
    # find positive rating list of each userId
    features = data.withColumn("userPositiveHistory",
                               F.collect_list(F.when(F.col("label") == 1, F.col("movieId")).otherwise(F.lit(None)))
                               .over(sql.Window.partitionBy("userId").orderBy(F.col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn("userPositiveHistory", F.reverse(F.col("userPositiveHistory"))) \
        .withColumn("userRatedMovie1", F.col("userPositiveHistory").getItem(0)) \
        .withColumn("userRatedMovie2", F.col("userPositiveHistory").getItem(1)) \
        .withColumn("userRatedMovie3", F.col("userPositiveHistory").getItem(2)) \
        .withColumn("userRatedMovie4", F.col("userPositiveHistory").getItem(3)) \
        .withColumn("userRatedMovie5", F.col("userPositiveHistory").getItem(4)) \
        .withColumn("userRatingCount",
                    F.count(F.lit(1)).over(sql.Window.partitionBy("userId")
                                           .orderBy(F.col("timestamp")).rowsBetween(-100, -1))) \
        .withColumn("userAvgReleaseYear",
                    F.avg(F.col("releaseYear")).over(sql.Window.partitionBy("userId")
                                                     .orderBy(F.col("timestamp")).rowsBetween(-100, -1)).cast("integer")) \
        .withColumn("userReleaseYearStddev",
                    F.stddev(F.col("releaseYear")).over(sql.Window.partitionBy("userId")
                                                        .orderBy(F.col("timestamp")).rowsBetween(-100, -1)).cast("integer")) \
        .withColumn("userAvgRating",
                    F.format_number(F.avg(F.col("rating")).over(sql.Window.partitionBy("userId")
                                                                .orderBy("timestamp").rowsBetween(-100, -1)),
                                    Config.NUMBER_PRECISION)) \
        .withColumn("userRatingStddev",
                    F.format_number(F.stddev(F.col("rating")).over(sql.Window.partitionBy("userId")
                                                                   .orderBy("timestamp").rowsBetween(-100, -1)),
                                    Config.NUMBER_PRECISION)) \
        .withColumn("userGenres",
                    udf_extract_genres(F.collect_list(F.when(F.col("label") == 1, F.col("genres")).otherwise(F.lit(None)))
                                       .over(sql.Window.partitionBy("userId").orderBy("timestamp").rowsBetween(-100, -1)))) \
        .na.fill(0) \
        .withColumn("userReleaseYearStddev", F.format_number(F.col("userReleaseYearStddev"), Config.NUMBER_PRECISION)) \
        .withColumn("userGenre1", F.col("userGenres").getItem(0)) \
        .withColumn("userGenre2", F.col("userGenres").getItem(1)) \
        .withColumn("userGenre3", F.col("userGenres").getItem(2)) \
        .withColumn("userGenre4", F.col("userGenres").getItem(3)) \
        .withColumn("userGenre5", F.col("userGenres").getItem(4)) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    print_info(features, topN=20)
    return features
def transform_data(self, df):
    # Transform it to a format that will be accepted by the model
    cols_to_drop = ["Customer ID", "Name", "Address", "Phone_no", "Email", "SSN"]
    df = df.drop(*cols_to_drop)
    df = df.withColumn("Customer Lifetime Value", functions.round("Customer Lifetime Value", 2))
    df = df.withColumn("Loss Ratio", functions.round("Loss Ratio", 3))
    df = df.withColumn("Growth Rate", functions.round("Growth Rate", 3))
    df = df.withColumn("Total Claim Amount", functions.round("Total Claim Amount", 3))
    df = df.withColumn("Job", functions.split("Job", ",").getItem(0))
    df = df.withColumn("Company", functions.reverse(functions.split("Company", ",")).getItem(0))
    indexer_list = []
    categ_cols = ['City', 'Response', 'Coverage', 'Education', 'Employment_Status', 'Gender',
                  'Location_Code', 'Marital Status', 'Policy_Type', 'Policy_Rating',
                  'Renew_Offer_Type', 'Sales_Channel', 'Total Claim Amount', 'Feedback',
                  'Job', 'Company', 'Credit Card Provider']
    for i in categ_cols:
        if i == 'City':
            indexer_list.append(StringIndexer(inputCol=i, outputCol=i + "Index"))
        else:
            indexer_list.append(StringIndexer(inputCol=i, outputCol=i + " Index"))
    for j in indexer_list:
        df = j.fit(df).transform(df)
    df = df.select([c for c in df.columns if c not in categ_cols])
    df = df.withColumn("Effective To Date", functions.split("Effective To Date", "-").getItem(2))
    df = df.withColumn("Effective To Date", df["Effective To Date"].cast(types.IntegerType()))
    return df
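# A hedged sketch of an alternative to fitting each StringIndexer in a Python
# loop as transform_data does above: bundling the same indexer_list into a
# single pyspark.ml Pipeline yields one reusable PipelineModel and keeps the
# code declarative. Variable names reuse the snippet's; this is illustrative,
# not the original code.
from pyspark.ml import Pipeline

def index_categoricals(df, indexer_list):
    # One Pipeline instead of repeated fit/transform calls per indexer
    return Pipeline(stages=indexer_list).fit(df).transform(df)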
def main(argv):
    mem_bytes = os.sysconf("SC_PAGE_SIZE") * os.sysconf("SC_PHYS_PAGES")  # e.g. 4015976448
    mem_gib = int((mem_bytes / (1024.0**3)) * 0.9)
    tar_jar = os.path.join(find_runfiles(), "__main__/galvasr2/spark/tar_spark_datasource.jar")
    spark = (
        pyspark.sql.SparkSession.builder.master(f"local[{os.cpu_count() - 1}]")
        .config("spark.eventLog.enabled", "true")
        .config("spark.eventLog.dir", "/spark-events")
        .config("spark.sql.execution.arrow.pyspark.enabled", "true")
        .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true")
        .config("spark.driver.memory", f"{mem_gib}g")
        .config("spark.history.fs.logDirectory", "/spark-events")
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1")
        .config("spark.jars", tar_jar)
        .config("spark.local.dir", "/mnt/disks/spark-scratch/")
        .getOrCreate()
    )
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    catalogue_df = load_audio_id_text_id_mapping(spark, FLAGS.input_catalogue)
    _, licenseurl_df = load_audio_and_text_dfs(spark, FLAGS.input_catalogue)
    licenseurl_df = licenseurl_df.select(
        [F.col("identifier"), F.col("text_document_id"), F.col("licenseurl")]
    )

    # Kaldi's wav.scp format does not support space characters in the key field of a wav.scp file.
    # We write the transcript to a file called "{kaldi_normalized_uttid}.ctm", so we also need to
    # change all instances of "/" to "_".
    catalogue_df = catalogue_df.withColumn(
        "kaldi_normalized_uttid",
        F.concat_ws(
            "-",
            F.translate(catalogue_df.identifier, " /", "__"),
            F.translate(catalogue_df.audio_document_id, " /", "__"),
        ),
    )
    # key_int_mapping = os.path.join(FLAGS.work_dir, "key_int_mapping_csv")
    if not FLAGS.work_dir.startswith("gs://"):
        os.makedirs(FLAGS.work_dir, exist_ok=True)
    wav_scp = os.path.join(FLAGS.work_dir, "wav.scp")
    ctm_out_dir = os.path.join(FLAGS.work_dir, "decoder_ctm_dir")
    if FLAGS.stage <= 0:
        catalogue_df = catalogue_df.cache()
        # catalogue_df.write.mode("overwrite").format("csv").options(header="true").save(key_int_mapping)
        training_sample_rows = catalogue_df.collect()
        catalogue_df.unpersist()

        with TemporaryMountDirectory(
            mount_cmd=[
                "gcsfuse",
                "--implicit-dirs",
                FLAGS.input_gcs_bucket.lstrip("gs://"),
            ],
            unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_wav_scp = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp
            )
            create_wav_scp(posix_wav_scp, training_sample_rows, FLAGS.input_dir, ctm_out_dir)

    # /development/lingvo-source/output_ctm_dir/

    # nvprof --analysis-metrics -o decoder-analysis.nvprof \
    # We want only the best path, so we set lattice-beam to 0.1
    # --main-q-capacity=35000 \
    # Can get 266x RTF with this configuration. Keep it?
    # batch size of 100 and num channels of 100 works just fine
    if FLAGS.stage <= 1:
        if not FLAGS.work_dir.startswith("gs://"):
            os.makedirs(ctm_out_dir, exist_ok=True)
        with TemporaryMountDirectory(
            mount_cmd=[
                "gcsfuse",
                "--implicit-dirs",
                FLAGS.input_gcs_bucket.lstrip("gs://"),
            ],
            unmount_cmd=["fusermount", "-u"],
        ) as temp_dir_name:
            posix_ctm_out_dir = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, ctm_out_dir
            )
            posix_wav_scp = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, wav_scp
            )
            posix_work_dir = re.sub(
                r"^{0}".format(FLAGS.input_gcs_bucket), temp_dir_name, FLAGS.work_dir
            )
            num_gpus = 4
            posix_wav_scp_shards = split_wav_scp(posix_wav_scp, posix_work_dir, num_gpus)

            executor = ThreadPoolExecutor(max_workers=num_gpus)

            def run_gpu(posix_wav_scp_shard, gpu_number):
                cmd = f"""\
/opt/kaldi/src/cudadecoderbin/batched-wav-nnet3-cuda3 \
--frame-subsampling-factor=3 \
--config=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/conf/online.conf \
--max-active=7000 \
--beam=15.0 \
--lattice-beam=0.1 \
--acoustic-scale=1.0 \
--cuda-decoder-copy-threads=2 \
--cuda-worker-threads={os.cpu_count() // num_gpus} \
--segmentation=true \
--cuda-use-tensor-cores=true \
--max-batch-size=150 \
--num-channels=250 \
--lattice-postprocessor-rxfilename=/development/lingvo-source/lattice_postprocess.conf \
--word-symbol-table=/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/words.txt \
/opt/kaldi/egs/aspire/s5/exp/chain/tdnn_7b/final.mdl \
/opt/kaldi/egs/aspire/s5/exp/tdnn_7b_chain_online/graph_pp/HCLG.fst \
scp,p:{posix_wav_scp_shard} \
{posix_ctm_out_dir}
"""
                env = deepcopy(os.environ)
                env["CUDA_VISIBLE_DEVICES"] = f"{gpu_number}"
                subprocess.check_call(shlex.split(cmd), env=env)

            for i, shard in enumerate(posix_wav_scp_shards):
                executor.submit(run_gpu, shard, i)
            executor.shutdown(wait=True)

    alignments_dir = os.path.join(FLAGS.alignments_work_dir, "alignments_json_jul_28")
    if FLAGS.stage <= 2:
        # TODO: Add options to DSAlign here
        dsalign_args = dsalign_main.parse_args(
            ["--output-wer", "--output-cer"]
        )  # , "--output-sws", "--output-levenshtein"]

        alphabet_normalized_path = (
            "/development/lingvo-source/galvasr2/align/spark/alphabet2.txt"
        )
        align_udf = prepare_align_udf(
            dsalign_args, alphabet_normalized_path, 15_000, 3_000
        )

        ctm_df = (
            spark.read.format("binaryFile")
            .option("pathGlobFilter", "*.ctm")
            .load(ctm_out_dir)
        )
        ctm_df = ctm_df.withColumn(
            "kaldi_normalized_uttid",
            F.regexp_replace(F.reverse(F.split(ctm_df.path, "/"))[0], r"[.]ctm$", ""),
        )
        ctm_df = ctm_df.withColumn("ctm_content", fix_text_udf(F.col("content"))).drop(
            "path", "length", "modificationTime", "content"
        )

        ctm_df = ctm_df.join(catalogue_df, "kaldi_normalized_uttid")
        downsampled_catalogue_df = ctm_df.drop("ctm_content")

        training_sample_rows = downsampled_catalogue_df.collect()
        transcripts_df = load_transcripts(spark, FLAGS.input_gcs_path, training_sample_rows)
        transcripts_df = transcripts_df.withColumn(
            "transcript", normalize_english_text_udf(transcripts_df.transcript)
        )
        ctm_df = ctm_df.join(transcripts_df, ["identifier", "text_document_id"])
        ctm_df = ctm_df.repartition(960)

        # alignments_df = ctm_df.select(align_udf(F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
        #                                         F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
        #                                         ctm_df.transcript, ctm_df.ctm_content))
        alignments_df = ctm_df.withColumn(
            "alignments",
            align_udf(
                F.concat(ctm_df.identifier, F.lit("/"), ctm_df.text_document_id),
                F.concat(ctm_df.identifier, F.lit("/"), ctm_df.audio_document_id),
                ctm_df.transcript,
                ctm_df.ctm_content,
            ),
        ).drop("ctm_content")
        print("GALVEZ:schema")
        alignments_df.printSchema()
        sys.stdout.flush()
        alignments_df.write.mode("overwrite").format("json").save(alignments_dir)

    manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest")
    tars_dir = os.path.join(FLAGS.work_dir, "dataset_tars")
    if FLAGS.stage <= 3:
        duplicate_data_path = "gs://the-peoples-speech-west-europe/forced-aligner/data_deduplication/data_deduplication_v2_lines.json"
        duplicates_df = spark.read.format("json").load(duplicate_data_path)

        alignments_df = spark.read.json(alignments_dir)
        alignments_df = alignments_df.join(
            duplicates_df,
            on=(alignments_df.identifier == duplicates_df.identifier)
            & (alignments_df.text_document_id == duplicates_df.text_document_id),
            how="anti",
        )
        if FLAGS.license_filter == "":
            pass
        else:
            if FLAGS.license_filter == "Not CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    ~is_cc_by_sa(F.col("licenseurl"))
                )
            elif FLAGS.license_filter == "CC-BY-SA":
                filtered_licenseurl_df = licenseurl_df.filter(
                    is_cc_by_sa(F.col("licenseurl"))
                )
            else:
                raise Exception("Unknown license_filter provided.")
            filtered_licenseurl_df = filtered_licenseurl_df.drop("licenseurl")

            alignments_df = alignments_df.join(
                filtered_licenseurl_df,
                on=(alignments_df.identifier == filtered_licenseurl_df.identifier)
                & (alignments_df.text_document_id == filtered_licenseurl_df.text_document_id),
                how="inner",
            )
            alignments_df = alignments_df.drop(filtered_licenseurl_df.identifier).drop(
                filtered_licenseurl_df.text_document_id
            )

        # We would like the number of partitions to be some large multiple
        # of the number of executors. Not every audio file is the same
        # length, so this helps with load balancing.
        alignments_df = alignments_df.withColumn(
            "duration_ms",
            F.expr(
                "transform(arrays_zip(alignments.end_ms, alignments.start_ms), x -> x.end_ms - x.start_ms)"
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.arrays_zip(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.duration_ms,
            ).cast(
                T.ArrayType(
                    T.StructType(
                        [
                            T.StructField("cer", T.FloatType()),
                            T.StructField("end_ms", T.LongType()),
                            T.StructField("label", T.StringType()),
                            T.StructField("start_ms", T.LongType()),
                            T.StructField("wer", T.FloatType()),
                            T.StructField("duration_ms", T.LongType()),
                        ]
                    )
                )
            ),
        )
        alignments_df = alignments_df.drop("duration_ms")

        alignments_df = alignments_df.withColumn(
            "alignments",
            F.filter(
                alignments_df.alignments,
                # Need to select this filter such that total number of
                # hours is 31,400
                lambda alignment: (alignment.duration_ms < FLAGS.max_duration_ms)
                & (alignment.duration_ms >= FLAGS.min_duration_ms)
                & (alignment.cer < FLAGS.max_cer)
                & (alignment.cer >= FLAGS.min_cer),
            ),
        )
        alignments_df = alignments_df.withColumn(
            "alignments",
            F.struct(
                alignments_df.alignments.cer,
                alignments_df.alignments.end_ms,
                alignments_df.alignments.label,
                alignments_df.alignments.start_ms,
                alignments_df.alignments.wer,
                alignments_df.alignments.duration_ms,
            ).cast(
                T.StructType(
                    [
                        T.StructField("cer", T.ArrayType(T.FloatType())),
                        T.StructField("end_ms", T.ArrayType(T.LongType())),
                        T.StructField("label", T.ArrayType(T.StringType())),
                        T.StructField("start_ms", T.ArrayType(T.LongType())),
                        T.StructField("wer", T.ArrayType(T.FloatType())),
                        T.StructField("duration_ms", T.ArrayType(T.LongType())),
                    ]
                )
            ),
        )
        alignments_df = alignments_df.repartition(960)

        abc = alignments_df.select(
            F.sum(F.expr("aggregate(alignments.duration_ms, 0L, (x, acc) -> acc + x)"))
            / 1000.0
            / 60.0
            / 60.0
        ).collect()
        print("GALVEZ:total number of hours=", abc)
        sys.stdout.flush()

        alignments_df = alignments_df.select(
            alignments_df.identifier,
            alignments_df.audio_document_id,
            alignments_df.text_document_id,
            alignments_df.alignments,
        )

        alignments_df = F.broadcast(alignments_df)

        audio_paths = F.concat(
            F.lit(FLAGS.input_gcs_path),
            F.lit("/"),
            F.col("identifier"),
            F.lit("/"),
            F.col("audio_document_id"),
        )
        rows = alignments_df.select(audio_paths).collect()
        paths = [row[0] for row in rows]  # [:1]  # GALVEZ: WARNING test!
        # print(f"number of paths = {len(paths)}")

        audio_df = spark.read.format("binaryFile").load(paths).drop(
            "modificationTime", "length")

        alignments_audio_df = alignments_df.join(audio_df, audio_paths == audio_df.path)
        # from IPython import embed; embed()

        # Remove "/" so that, if someone untars the tar files, everything will be dumped into one directory.
        # Remove "." because it has special meaning in the webdataset format.
        # Remove " " because kaldi keys may not contain " " (this is not strictly necessary, but convenient).
        name = F.concat(F.col("identifier"), F.lit("/"), F.col("audio_document_id"))
        # name = F.regexp_replace(name, r"/", "_SLASH_")
        name = F.regexp_replace(name, r"\.", "_DOT_")
        name = F.regexp_replace(name, r" ", "_SPACE_")
        # glob.glob("**/*.flac")

        # NOTE: the original snippet referenced an undefined `df`, shadowed the
        # `name` column with the loop variable, and called "/".split(name);
        # assuming alignments_audio_df and name_str.split("/") here.
        name_rows = alignments_audio_df.select(name.alias("name")).collect()
        for row in name_rows:
            name_str = row.name
            assert len(name_str) < 4096
            for chunk in name_str.split("/"):
                assert len(chunk) < 256

        # name = F.regexp_replace(F.concat(F.col("identifier"),
        #                                  F.lit("-"),
        #                                  F.col("audio_document_id")),
        #                         r"(\.|/)",
        #                         "_"
        #                         )

        # The name of each thing in the tar file. May not exceed 100 characters in length
        # substr indexes from 1!
        # name = name.substr(
        #     F.length(name) - F.least(F.length(name), F.lit(88)) + 1,
        #     F.least(F.length(name), F.lit(88))
        # )

        alignments_audio_df = alignments_audio_df.withColumn(
            "aligned_chunks",
            create_audio_segments_udf(
                alignments_audio_df.content,
                F.lit("mp3"),
                name,
                alignments_audio_df.alignments.start_ms,
                alignments_audio_df.alignments.end_ms,
                F.lit("flac"),
            ),
        )
        a = alignments_audio_df.select(
            F.explode(F.arrays_zip("aligned_chunks.audio_name", "aligned_chunks.audio"))
        ).select("col.0", "col.1")
        a.write.mode("overwrite").format("tar").save(tars_dir)

        output_df = alignments_audio_df.select(
            alignments_audio_df.identifier,
            alignments_audio_df.audio_document_id,
            alignments_audio_df.text_document_id,
            F.struct(
                alignments_audio_df.alignments.label.alias("label"),
                create_audio_segment_names_udf(
                    # Is F.size right here?
                    name,
                    F.size(alignments_audio_df.alignments.start_ms),
                    F.lit("flac"),
                ).alias("name"),
                alignments_audio_df.alignments.duration_ms.alias("duration_ms"),
            ).alias("training_data"),
        )
        output_df = output_df.coalesce(960)
        # coalesce(1) seems to make the create_audio_segments_udf function run serially
        output_df.write.mode("overwrite").json(manifest_dir)

    repartitioned_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tars")
    tmp_tars_dir = os.path.join(FLAGS.work_dir, "repartitioned_dataset_tmp_dir")
    if FLAGS.stage <= 4:
        tars_df = spark.read.format("tar").load(tars_dir)  # .limit(100)
        number_of_rows = tars_df.count()

        spark2 = spark.newSession()
        spark2.conf.set(
            "spark.sql.execution.rangeExchange.sampleSizePerPartition", number_of_rows
        )
        spark2.conf.set("spark.sql.files.minPartitionNum", FLAGS.number_of_shards)
        # tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        # print("GALVEZ:", tars_df.select(F.col("key")).collect())
        # import sys; sys.exit()
        tars_df = spark2.read.format("tar").load(tars_dir)  # .limit(100)
        tars_df = tars_df.repartitionByRange(FLAGS.number_of_shards, F.col("key"))
        # # May need to write this out to GCS, and then delete it, to prevent different behavior between runs.
        # # tars_df = tars_df.persist()
        tars_df.write.mode("overwrite").format("tar").save(tmp_tars_dir)
        tars_df = spark2.read.format("tar").load(tmp_tars_dir)  # .repartitionByRange()  # coalesce(1024)

        # counts_df = (
        #     tars_df.withColumn("partitionId", F.spark_partition_id())
        #     .groupBy("partitionId")
        #     .count()
        # )
        # num_rows_to_keep = counts_df.select(F.min(F.col("count"))).collect()[0][0]
        # # Consider doing this in java
        # def drop_final_rows(rows):
        #     for _ in range(num_rows_to_keep):
        #         yield next(rows)
        #     for _ in rows:
        #         pass
        #     return

        # print("GALVEZ:before=", tars_df.rdd.getNumPartitions())
        # # , preservesPartitioning=True
        # tars_df = spark2.createDataFrame(
        #     tars_df.rdd.mapPartitions(drop_final_rows), schema=tars_df.schema
        # )
        # print("GALVEZ:after=", tars_df.rdd.getNumPartitions())
        # import sys
        # sys.stdout.flush()

        # # Don't actually write this out right now. It doesn't benefit us unless we are doing nemo training in a specific mode.
        # tars_df.write.mode("overwrite").format("tar").save(repartitioned_tars_dir)

        # manifest_df = spark2.read.json(manifest_dir)
        # number_of_utterances = manifest_df.select(F.explode(F.col("training_data.name"))).count()
        # print(f"GALVEZ:number_of_utterances={number_of_utterances}")
        # utterances_per_shard = number_of_utterances // FLAGS.number_of_shards
        # repartition_tar_files(os.path.join(tars_dir, "*.tar"), repartitioned_tars_dir, utterances_per_shard)

    nemo_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo")
    nemo_single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_nemo_single")

    if FLAGS.stage <= 5:
        json_df = spark.read.format("json").load(manifest_dir)
        nemo_df = json_df.select(
            F.explode(
                F.arrays_zip(
                    F.col("training_data.name").alias("audio_filepath"),
                    F.col("training_data.label").alias("text"),
                    F.col("training_data.duration_ms").alias("duration_ms"),
                )
            )
        )
        nemo_df = nemo_df.select(
            F.col("col.name").alias("audio_filepath"),
            F.col("col.label").alias("text"),
            (F.col("col.duration_ms").cast(T.DoubleType()) / 1000.0).alias("duration"),
            F.lit(-1).alias("shard_id"),
        )
        if False:
            tars_df = spark.read.format("tar").load(repartitioned_tars_dir)
            tars_df = tars_df.select(tars_df.key)
            nemo_df = F.broadcast(nemo_df)
            nemo_df = nemo_df.join(
                tars_df, F.col("audio_filepath") == F.col("key")
            ).drop(F.col("key"))

        # TODO: Join against tar files that have been made to contain the
        # same number of files to filter out removed files
        nemo_df.write.mode("overwrite").format("json").save(nemo_manifest_dir)

        nemo_single_df = spark.read.format("json").load(nemo_manifest_dir)
        nemo_single_df.coalesce(1).write.mode("overwrite").format("json").save(
            nemo_single_manifest_dir
        )

    single_manifest_dir = os.path.join(FLAGS.work_dir, "dataset_manifest_single")
    single_tar_dir = os.path.join(FLAGS.work_dir, "dataset_tars_single")

    # Create single tar file and single json file
    if FLAGS.stage <= 6:
        json_df = spark.read.format("json").load(manifest_dir)
        json_df.coalesce(1).write.format("json").mode("overwrite").save(single_manifest_dir)

        tars_df = spark.read.format("tar").load(tmp_tars_dir)
        tars_df.coalesce(1).write.format("tar").mode("overwrite").save(single_tar_dir)
df_par.printSchema()

block_size = str(1024 * 1024 * 512)
sc._jsc.hadoopConfiguration().set("dfs.block.size", block_size)
sc._jsc.hadoopConfiguration().set("parquet.block.size", block_size)

s3_location_target = 's3://move-dataeng-temp-dev/glue-etl/parquet_block_poc/omtr_pq_block_512'
output_folder = s3_location_target  # With absolute path
print('output_folder= %s' % output_folder)

# ---- PySpark section ----
from pyspark.sql.functions import lit
from pyspark.sql.functions import reverse, split
# ---
df_with_hour = df_par.withColumn(
    "hour",
    split(reverse(split(reverse(df_par.etl_source_filename), '/')[1]), '=')[1].cast("string"))
df_with_day = df_with_hour.withColumn(
    "day",
    split(reverse(split(reverse(df_with_hour.etl_source_filename), '/')[2]), '=')[1].cast("string"))
df_with_month = df_with_day.withColumn(
    "month",
    split(reverse(split(reverse(df_with_day.etl_source_filename), '/')[3]), '=')[1].cast("string"))
df_with_partitions = df_with_month.withColumn(
    "year",
    split(reverse(split(reverse(df_with_month.etl_source_filename), '/')[4]), '=')[1].cast("string"))
# ----
codec = 'snappy'
partitionby = ['year', 'month', 'day', 'hour']
df = df_with_partitions.filter(
    (df_with_partitions.day.cast('Integer') < 2) & (df_with_partitions.day.cast('Integer') > 0))
# df.repartition(*partitionby).write.partitionBy("hour").mode('overwrite').parquet(output_folder, compression=codec)
df.repartition(*partitionby).write.partitionBy(['year', 'month', 'day', 'hour']).mode('overwrite').parquet(output_folder, compression=codec)
# df_with_partitions.repartition(*partitionby).write.partitionBy(['year', 'month', 'day', 'hour']).mode('overwrite').parquet(output_folder, compression=codec)
def compile_reverse(t, expr, scope, **kwargs):
    op = expr.op()
    src_column = t.translate(op.arg, scope)
    return F.reverse(src_column)
def addUserFeatures(samplesWithMovieFeatures: DataFrame) -> DataFrame:
    """
    New columns explained:
    --- All of the following are historical-behavior features recorded *before* the time the current rating was produced ---
    1. userPositiveHistory: collect the user's positive ratings into a list; a positive rating is one > 3.5
       (taken to mean the user liked the movie). A sliding window collects only history points before the
       current rating's timestamp, to avoid leaking future information.
    2. F.reverse reverses the sequence from step 1, so the most recent ratings come first.
    3. userRatedMovie[0~4]: the 5 movies the user rated most recently.
    4. userRatingCount: the user's total number of ratings.
    5. userRatedMovieAvgReleaseYear: average release year of the movies the user has rated.
    6. userRatedMovieReleaseYearStddev: unbiased standard deviation of the release years of the movies the user has rated.
    7. userAvgRating: the user's average rating.
    8. userRatingStddev: unbiased standard deviation of the user's ratings.
    9. userGenres: aggregated genres of the movies the user has watched.
    10. userGenre[0~4]: genres of the user's 5 most recently watched movies.
    --- The following clean up columns that are no longer useful ---
    1. drop:
       (1) genres: the raw movie genres carry no meaning as a historical-behavior feature, so drop them.
       (2) userGenres: the collected, time-ordered list of recently watched genres; the first 5 have been
           picked out, so the original column can be dropped.
       (3) userPositiveHistory: the collected, time-ordered positive-rating sequence; the first 5 have been
           picked out, so the original column can be dropped.
    2. filter: drop each user's very first rating, because there is no prior behavior before it; it belongs
       to the cold-start portion.
    :param samplesWithMovieFeatures
    :return: samplesWithUserFeatures
    """
    samplesWithUserFeatures = samplesWithMovieFeatures \
        .withColumn('userPositiveHistory',
                    F.collect_list(F.when(F.col('label') == 1, F.col('movieId')).otherwise(F.lit(None))).over(
                        Window.partitionBy('userId').orderBy(F.col('timestamp')).rowsBetween(-100, -1)
                    )) \
        .withColumn('userPositiveHistory', F.reverse(F.col('userPositiveHistory'))) \
        .withColumn('userRatedMovie0', F.col('userPositiveHistory')[0]) \
        .withColumn('userRatedMovie1', F.col('userPositiveHistory')[1]) \
        .withColumn('userRatedMovie2', F.col('userPositiveHistory')[2]) \
        .withColumn('userRatedMovie3', F.col('userPositiveHistory')[3]) \
        .withColumn('userRatedMovie4', F.col('userPositiveHistory')[4]) \
        .withColumn('userRatingCount',
                    F.count(F.lit(1)).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))) \
        .withColumn('userRatedMovieAvgReleaseYear',
                    F.avg(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    .cast(IntegerType())) \
        .withColumn('userRatedMovieReleaseYearStddev',
                    F.format_number(
                        F.stddev(F.col('releaseYear')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                        NUMBER_PRECISION)) \
        .withColumn('userAvgRating',
                    F.format_number(
                        F.avg(F.col('rating')).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                        NUMBER_PRECISION)) \
        .withColumn("userRatingStddev",
                    F.format_number(
                        F.stddev(F.col("rating")).over(Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1)),
                        NUMBER_PRECISION)) \
        .withColumn("userGenres",
                    F.udf(extractGenresUDF, ArrayType(StringType()))(
                        F.collect_list(F.when(F.col('label') == 1, F.col('genres')).otherwise(F.lit(None))).over(
                            Window.partitionBy('userId').orderBy('timestamp').rowsBetween(-100, -1))
                    )) \
        .withColumn("userGenre0", F.col("userGenres")[0]) \
        .withColumn("userGenre1", F.col("userGenres")[1]) \
        .withColumn("userGenre2", F.col("userGenres")[2]) \
        .withColumn("userGenre3", F.col("userGenres")[3]) \
        .withColumn("userGenre4", F.col("userGenres")[4]) \
        .drop("genres", "userGenres", "userPositiveHistory") \
        .filter(F.col("userRatingCount") > 1)

    return samplesWithUserFeatures
# In[40]:
import pyspark.sql.functions as f

# In[58]:
df.select(f.collect_set(df['state'])).collect()

# In[62]:
df.select(f.countDistinct('state').alias('states')).show()

# In[70]:
df.select(f.md5('street').alias('hash')).collect()

# In[72]:
df.select(f.reverse(df.state).alias('state-reverse')).collect()

# In[75]:
df.select(f.soundex(df.name).alias('soundex')).collect()

# In[76]:
spark.stop()

# In[ ]:
def main(argv):
    spark = SparkSession.builder \
        .master("local[1]") \
        .appName("Forced Aligner") \
        .config("spark.sql.execution.arrow.pyspark.enabled", "true") \
        .config("spark.sql.execution.arrow.maxRecordsPerBatch", "1") \
        .config("spark.driver.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true") \
        .config("spark.executor.extraJavaOptions", "-Dio.netty.tryReflectionSetAccessible=true") \
        .config("spark.driver.memory", "7g") \
        .config("spark.executor.memory", "7g") \
        .config("spark.task.maxFailures", "2") \
        .getOrCreate()
    spark.sparkContext.setLogLevel("INFO")  # "ALL" for very verbose logging
    logging.getLogger("py4j").setLevel(logging.ERROR)

    pyspark.java_gateway.ensure_callback_server_started(spark.sparkContext._gateway)
    # spark.sparkContext._gateway.start_callback_server()
    listener = WriteTaskEndListener()
    spark.sparkContext._jsc.sc().addSparkListener(listener)

    vad_out_dir = os.path.join(FLAGS.work_dir, "vad_pcm_tfrecords")
    if FLAGS.stage <= 0:
        audio_df = load_audio_files(spark, FLAGS.input_dir)
        vad_udf = prepare_vad_udf(num_padding_frames=10,
                                  threshold=0.5,
                                  aggressiveness=0,
                                  frame_duration_ms=30)
        vad_df = audio_df.withColumn("vad", vad_udf(audio_df.content, audio_df.format))
        vad_df = vad_df.withColumn("num_utterances_in_audio_document",
                                   F.size(vad_df.vad.voiced_buffer))

        exploded_voiced_buffer_df = vad_df.select(
            vad_df.audio_document_id, vad_df.int64_audio_document_id,
            vad_df.num_utterances_in_audio_document,
            F.posexplode(vad_df.vad.voiced_buffer))

        tfrecord_df = exploded_voiced_buffer_df.select(
            exploded_voiced_buffer_df.audio_document_id,
            exploded_voiced_buffer_df.int64_audio_document_id,
            exploded_voiced_buffer_df.col.alias("frames"),
            lit("-").alias("transcript"),
            F.concat_ws("-", exploded_voiced_buffer_df.audio_document_id,
                        exploded_voiced_buffer_df.pos).alias("uttid"),
            F.monotonically_increasing_id().alias("int64_uttid"),
            exploded_voiced_buffer_df.num_utterances_in_audio_document,
        )

        tfrecord_df = tfrecord_df.withColumn(
            "frames", F.expr("transform(frames, x -> float(x) * float(1./32768.))"))

        tfrecord_df.printSchema()

        tfrecord_df.write.mode("overwrite").format("tfrecord").option(
            "recordType", "Example").save(vad_out_dir)

    if FLAGS.stage <= 1:
        # TODO: Compute this automatically
        # https://stackoverflow.com/questions/44082957/how-to-add-a-sparklistener-from-pyspark-in-python
        num_samples_written = listener.value
        if num_samples_written == 0:
            num_samples = spark.read.format("tfrecord").option(
                "recordType", "Example").load(vad_out_dir).count()
        else:
            num_samples = num_samples_written
        # print(f"GALVEZ:num_samples_written={num_samples_written}")
        # print(f"GALVEZ:num_samples={num_samples}")
        # assert num_samples_written == num_samples
        # from IPython import embed; embed()
        # num_samples = 100_000
        # return

        # ctpu_up = subprocess.run(shlex.split("ctpu up -name forced-aligner-tpu -tpu-only -tpu-size v3-8 -tf-version 2.2"))
        TPU_IP = "10.240.1.2"
        # model_dir = "gs://the-peoples-speech-west-europe/PeoplesSpeech/ag_training/1127"
        model_dir = FLAGS.align_model_dir
        # model = "asr.inference_only.InferenceOnly"
        model = "asr.librispeech_ctc.TpuDecoderLibrispeech960Base"
        logits_dir = os.path.join(FLAGS.work_dir, "logits")

        def compute_max_steps(model_dir):
            # That the "train" directory is where the saved models are
            # stored is particular to lingvo. I don't expect this magic
            # constant to change.
            checkpoint_path = tf.train.latest_checkpoint(os.path.join(model_dir, "train"))
            step_pattern = r'-(\d+)$'
            checkpoint_step = int(re.search(step_pattern, checkpoint_path).group(1))
            max_steps = checkpoint_step + 1
            return max_steps

        # input.file_datasource.file_pattern:part-00000-8853e74a-fd03-46dc-affd-5c2ef87be96c-c000.tfrecord
        # part-00000-c4f0eb22-8f1e-45e2-9437-889428d09bf8-c000.tfrecord
        with tempfile.NamedTemporaryFile("w+") as fh:
            fh.write(f"""\
input.file_datasource.file_pattern_prefix:{vad_out_dir}
input.file_datasource.file_pattern:*.tfrecord
input.num_samples:{num_samples}
task.log_softmax_output_directory:{logits_dir}
train.max_steps:{compute_max_steps(model_dir)}
""")
            # This flush() is required. Otherwise, lingvo/trainer will see
            # an empty params file.
            fh.flush()
            # TODO: Make lingvo:trainer a dependency in the BUILD file. This is silly.
            subprocess.check_call(
                shlex.split(f"""
lingvo/trainer --logdir={model_dir} \
--model={model} \
--logtostderr \
--tpu=grpc://{TPU_IP}:8470 \
--job=executor_tpu \
--lingvo_executor_skip_saving_upon_stop \
--model_params_file_override={fh.name}
"""))

    if FLAGS.stage <= 2:
        catalogue_df = spark.read.format('json').schema(ARCHIVE_ORG_SCHEMA).load(
            FLAGS.input_catalogue)
        load_transcripts(spark, FLAGS.input_dir, collected_text_document_rows)

        log_probabilities_schema = StructType([
            StructField("int64_uttid", IntegerType()),
            StructField("log_probabilities", ArrayType(FloatType(), True))
        ])
        # log_probabilities_df = spark.read.format("tfrecord").schema(log_probabilities_schema).load(logits_dir)
        log_probabilities_df = spark.read.format("tfrecord").load(logits_dir)
        vad_df = spark.read.format("tfrecord").load(vad_out_dir)
        uttid_integer_mapping_df = vad_df.select(vad_df.int64_uttid, vad_df.uttid)
        log_probabilities_df = log_probabilities_df.join(
            uttid_integer_mapping_df,
            log_probabilities_df.int64_uttid == uttid_integer_mapping_df.int64_uttid,
            'inner')
        log_probabilities_df = log_probabilities_df.drop(log_probabilities_df.int64_uttid)

        split_col = F.split(F.reverse(log_probabilities_df.uttid), '-', 2)
        log_probabilities_df = log_probabilities_df.withColumn(
            'document_id', split_col.getItem(1))
        log_probabilities_df = log_probabilities_df.withColumn(
            'utterance_id', split_col.getItem(0).cast(IntegerType()))
        log_probabilities_df = log_probabilities_df.groupBy('document_id').agg(
            collect_list("log_probabilities"), collect_list("utterance_id"))
        # TODO: Sort each array by utterance_id. array_sort lexicographically with a Struct?
        log_probabilities_df.join(
            text_df,
            col("log_probabilities_df.document_id") == col("transcript_df.document_id"),
            'inner')

    if FLAGS.stage <= 3:
        generate_lm_udf = prepare_generate_lm_udf(
            "/install/kenlm/build/bin/",
            "/development/lingvo-source/tmpworkdir",
            FLAGS.mozilla_ds_alphabet_txt)
        df = spark.read.format("json").load("/home/ws15dgalvez/dumpblahblah.json")
        rows = df.select(generate_lm_udf(df.transcript, df.id)).head(1)
        from IPython import embed
        embed()
    sf.col('tid').alias('trackingId'),
    sf.col('status'),
    sf.col('paymode'))

display(bookings)

# COMMAND ----------

# DBTITLE 1,Rough
int(datetime.strftime(startDate, '%Y%m'))

# COMMAND ----------

filterInput = t[0].inputData.filterInput
filterInput.schema
# for f in filterInput:
#     print(f)

# COMMAND ----------

[k for k, v in filterInput.asDict().items() if not (v is None or (type(v) == list and len(v) == 0))]

# COMMAND ----------

_s = "dom_base_upr_default"
_s[::-1].split('_')[0][::-1]

# COMMAND ----------

sf.split(sf.reverse("dom_base_upr_default"), "_")[0]
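# COMMAND ----------

# A hedged column-based sketch of the scratch cells above, which pull the last
# "_"-delimited token out of a string: splitting first and reversing the array
# (or using element_at with a negative index) avoids working with reversed text.
# The DataFrame and sample value are assumptions for illustration only.
from pyspark.sql import functions as sf

scratch_df = spark.createDataFrame([("dom_base_upr_default",)], ["ptype"])
scratch_df.select(
    sf.reverse(sf.split(sf.col("ptype"), "_"))[0].alias("last_token"),     # default
    sf.element_at(sf.split(sf.col("ptype"), "_"), -1).alias("also_last"),  # default
).show()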
def process_indicator_data(spark, input_data, output_data):
    """ filepath to wdi data file """
    wdi_data = input_data + 'WDIData.csv'

    """ read wdi data file """
    df_wdi = spark.read.csv(wdi_data, header='true')
    df_wdi.persist()

    """ clean data - remove null columns """
    count_not_null = df_wdi.agg(*[count(c).alias(c) for c in df_wdi.columns])
    not_null_cols = [c for c in count_not_null.columns if count_not_null[[c]].first()[c] > 0]
    df_wdi = df_wdi.select(*not_null_cols)

    """ melt data """
    fixed_columns = ['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code']
    df_wdi = melt_df(df_wdi, fixed_columns, 'year')

    """ filepath to world happiness data file """
    happiness_data = input_data + 'world-happiness/'

    """ read happiness data file """
    df_happ = spark.read.csv(happiness_data, header='true').withColumn("file", input_file_name())
    df_happ.persist()

    """ clean and transform data """
    df_happ = df_happ.withColumn("year", reverse(split(reverse(df_happ.file), '/')[0])[0:4])
    df_happ = df_happ.drop('file')
    df_happ = df_happ.toDF(*(c.replace(' ', '') for c in df_happ.columns))

    """ melt data """
    fixed_columns = ['Countryorregion', 'year']
    df_happ = melt_df(df_happ, fixed_columns, 'indicator')

    """ filepath to unece data file """
    unece_data = input_data + 'unece.json'

    """ read unece data file """
    df_unece = spark.read.json(unece_data)
    df_unece.persist()

    """ clean data """
    df_unece = df_unece.toDF(*(c.replace(',', '') for c in df_unece.columns))
    df_unece = df_unece.toDF(*(c.replace('.', '') for c in df_unece.columns))

    """ melt data """
    fixed_columns = ['Country', 'Year']
    df_unece = melt_df(df_unece, fixed_columns, 'indicator')

    #### -------------- DIM TIME ----------------- ####
    """ transform and extract columns to create time table """
    df_time = df_wdi.select('year').dropDuplicates()
    df_time = df_time.withColumn("year", df_time["year"].cast(IntegerType()))

    get_century = udf(lambda x: (x - 1) // 100 + 1)
    df_time = df_time.withColumn("century", get_century(df_time.year))

    get_decade = udf(lambda x: int(str(x)[2]) * 10)
    df_time = df_time.withColumn("decade", get_decade(df_time.year))

    time_table = df_time["year", "decade", "century"]

    """ write time table to parquet files """
    time_table.write.parquet(output_data + 'dim_time.parquet', mode='overwrite')
    #### -------------- DIM TIME ----------------- ####

    #### ------------ DIM INDICATOR -------------- ####
    """ extract wdi indicators for indicator table """
    indicator_table_wdi = df_wdi.select(col('Indicator Code').alias('code'),
                                        col('Indicator Name').alias('name')).dropDuplicates()

    """ transform and extract column group from column code """
    indicator_table_wdi = indicator_table_wdi.withColumn('group', split(indicator_table_wdi.code, '\.').getItem(0))

    """ extract happiness indicators for indicator table """
    indicator_table_happ = df_happ.select(col('indicator').alias('code'),
                                          col('indicator').alias('name')).dropDuplicates()
    indicator_table_happ = indicator_table_happ.withColumn('group', lit('people_happines'))

    """ extract unece indicators for indicator table """
    indicator_table_unece = df_unece.select(col('indicator').alias('code'),
                                            col('indicator').alias('name')).dropDuplicates()
    indicator_table_unece = indicator_table_unece.withColumn('group', lit('unece_indicator'))

    """ write indicators table to parquet files """
    indicator_table = indicator_table_wdi.union(indicator_table_happ)
    indicator_table.write.parquet(output_data + 'dim_indicator.parquet', mode='overwrite')
    #### ------------ DIM INDICATOR -------------- ####

    #### -------------- FACT SCORE --------------- ####
    """ create score table from indicator files """
    df_score = df_wdi.select(col('year').alias('dim_time_year'), col('Country Code'),
                             col('Country Name'), col('Indicator Code').alias('dim_indicator_code'),
                             col("value").cast(DoubleType()))

    """ read country parquet files to get country code """
    country_df = spark.read.parquet(output_data + 'dim_country.parquet')
    df_score = df_score.join(country_df.select("code", "name"),
                             country_df['code'] == df_score['Country Code'])

    """ unece score """
    df_unece = df_unece.join(df_score.select("code", "name"), df_unece.Country == df_score.name)
    df_unece_score = df_unece.select(col('Year').alias('dim_time_year'), col('code').alias('dim_country_code'),
                                     col('indicator').alias('dim_indicator_code'), col("value").cast(DoubleType()))

    """ happines score """
    df_happ = df_happ.join(df_score.select("code", "name"), df_happ.Countryorregion == df_score.name)
    df_happ_score = df_happ.select(col('year').alias('dim_time_year'), col('code').alias('dim_country_code'),
                                   col('indicator').alias('dim_indicator_code'), col("value").cast(DoubleType()))

    """ union of all indicators score """
    df_score = df_score.select(col('dim_time_year'), col('code').alias('dim_country_code'),
                               col('dim_indicator_code'), col("value").cast(DoubleType()))
    score_table = df_score.union(df_unece_score).union(df_happ_score)
    score_table = score_table.withColumn("score_id", monotonically_increasing_id())

    """ write score table to parquet files partitioned by year """
    score_table.write.partitionBy('dim_time_year').parquet(output_data + 'fact_score.parquet', mode='overwrite')
def _reverse(col, args):
    return F.reverse(F.col(col))
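# A hedged usage sketch for the _reverse helper above: on a string column it
# reverses the characters, on an array column it reverses the element order.
# The SparkSession, DataFrame, column name "word", and the unused `args`
# argument are illustrative assumptions only.
from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()
words_df = spark.createDataFrame([("spark",), ("reverse",)], ["word"])
words_df.select(_reverse("word", args=None).alias("drow")).show()  # kraps, esrever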
def prepare_final_df(self): self.df = self.df.select("date", "startDatetime", "message.callInfo.name", "message.callInfo") info = self.df.callInfo selected = self.df.select( "name", "startDatetime", info.callType.alias("callType"), info.distributedInstances.alias("distributedInstances"), info.endpointRecording.alias("endpointRecording"), info.lockState.alias("lockState"), info.participants.alias("participants"), info.recording.alias("recording"), info.streaming.alias("streaming"), info.joinAudioMuteOverride.alias("joinAudioMute"), "date", ).filter(func.col("name").startswith("[")) grouped = selected.groupBy("name", "startDatetime").agg( func.sort_array(func.collect_list("date")).alias("date_array"), func.collect_list("recording").alias("recording_array"), func.collect_list("streaming").alias("streaming_array"), func.collect_list("lockState").alias("lockState_array"), func.reverse( func.collect_list("callType")).getItem(0).alias("callType"), func.reverse(func.collect_list("participants")).getItem(0).cast( IntegerType()).alias("current_participants"), func.collect_list("participants").alias("participant_array"), ) grouped.printSchema() preprocessed = ( grouped.withColumn( "datetime", self.helper.get_last_date_udf(grouped.date_array)).withColumn( "meeting_name", func.col("name")).withColumn( "time_diff", self.helper.get_time_diff_udf( grouped.date_array)).withColumn( "recording", self.helper.get_if_active_udf( grouped.recording_array)).withColumn( "streaming", self.helper.get_if_active_udf( grouped.streaming_array)). withColumn("locked", self.helper.get_if_locked_udf( grouped.lockState_array)).withColumn( "cospace", self.helper.get_if_cospace_udf( grouped.callType)).withColumn( "adhoc", self.helper.get_if_adhoc_udf( grouped.callType)).withColumn( "lync_conferencing", self.helper.get_if_lync_udf( grouped.callType)). withColumn( "forwarding", self.helper.get_if_forwarding_udf( grouped.callType)).withColumn( "max_participants", self.helper.get_max_udf( grouped.participant_array)).withColumn( "mean_participants", self.helper.get_mean_udf( grouped.participant_array)).withColumn( "start_datetime", grouped.startDatetime.cast( TimestampType())).select( "datetime", "time_diff", "start_datetime", "recording", "streaming", "locked", "cospace", "adhoc", "lync_conferencing", "forwarding", "current_participants", "mean_participants", "max_participants", "meeting_name", )) preprocessed.printSchema() return self.do_post_preprocessing(preprocessed)