def call(self, x: F.DataFrame):
    """Parse textual columns, clean the message text, and sort the result.

    Args:
        x: Input DataFrame. It is passed through
            ``self.parse_textual_cols_step`` and then
            ``self.clean_text_step``.

    Returns:
        The processed DataFrame ordered by ``complaint_id``.
    """
    parsed = self.parse_textual_cols_step(x)
    cleaned = self.clean_text_step(parsed)
    return cleaned.orderBy('complaint_id')
def call(self, x: F.DataFrame):
    """Keep only usable complaint rows, then remove duplicates.

    A row is kept when ``complaint_id`` is not null and ``consumer_message``
    is neither null nor the empty string.

    Args:
        x: Input DataFrame exposing ``complaint_id`` and
            ``consumer_message`` columns.

    Returns:
        The result of ``self.discard_duplicates`` applied to the filtered
        rows with the column names ``'complaint_id'`` and ``'ingested_at'``.
    """
    has_id = x.complaint_id.isNotNull()
    has_message = x.consumer_message.isNotNull()
    message_not_empty = x.consumer_message != ''

    usable = x.where(has_id & has_message & message_not_empty)
    # NOTE(review): presumably keeps one row per complaint_id, ranked by
    # ingested_at -- confirm against discard_duplicates.
    return self.discard_duplicates(usable, 'complaint_id', 'ingested_at')
def _transform(
    self,
    input_df: f.DataFrame,
    contig: pyspark.sql.column.Column,
    start: pyspark.sql.column.Column,
    end: pyspark.sql.column.Column,
    ref: pyspark.sql.column.Column,
    alt: pyspark.sql.column.Column,
    id: pyspark.sql.column.Column,
):
    """
    Runs Ensembl VEP on a Spark DataFrame through Glow's "pipe" transformer.

    The DataFrame needs to provide the following fields:
        - "contigName"
        - "start"
        - "end"
        - "referenceAllele"
        - "alternateAlleles"

    Args:
        input_df: Spark DataFrame with contig name, start, end, ref and alt
        contig: contig name column
        start: variant start position column
        end: variant end position column
        ref: reference allele column
        alt: array of alternate alleles column
        id: array of id's

    Returns:
        Spark DataFrame with single column `text` that contains
        json-formatted VEP output as string
    """
    # Imported at call time rather than at module import.
    import glow

    # Only these columns, in this order, are handed to the piped command.
    piped_columns = [contig, start, end, id, ref, alt]
    variants_df = input_df.select(piped_columns)

    # The command line is taken from `self.call_args`, serialised as JSON.
    piped_command = json.dumps(self.call_args)

    return glow.transform(
        "pipe",
        variants_df,
        cmd=piped_command,
        inputFormatter='vcf',
        inVcfHeader='infer',
        outputFormatter='text',
    )
def _parse_text(self, vep_transformed_df: f.DataFrame):
    """
    Parses the json-formatted VEP output string.

    Args:
        vep_transformed_df: output of `self._transform()`; its `text`
            column holds one JSON document per row

    Returns:
        Spark DataFrame with the schema as defined by `self.output_schema`,
        minus the `input` column
    """
    parsed_json = f.from_json('text', self.output_schema)
    with_struct = vep_transformed_df.withColumn('data', parsed_json)

    # Promote every field of the parsed struct to a top-level column.
    flattened = with_struct.select(f.expr("data.*"))
    return flattened.drop("input")
def parse_textual_cols_step(self, x: F.DataFrame):
    """Convert textual columns into typed columns.

    ``date_received`` is parsed with the pattern ``M/d/yyyy``. ``disputed``
    and ``timely_response`` are each replaced by the result of
    ``confirming_word_as_bool`` on that column.

    Args:
        x: Input DataFrame containing the three columns above.

    Returns:
        The DataFrame with those columns overwritten in place.
    """
    received_on = F.to_date('date_received', 'M/d/yyyy')
    result = x.withColumn('date_received', received_on)

    # NOTE(review): presumably maps yes/no style words to booleans --
    # confirm against confirming_word_as_bool.
    for flag_column in ('disputed', 'timely_response'):
        result = result.withColumn(
            flag_column, confirming_word_as_bool(flag_column))
    return result
def add_tags_step(self, x: F.DataFrame):
    """Attach ingestion and tagging columns to every row.

    Adds three columns:
        - ``ingested_at``: ``F.current_timestamp()``
        - ``tags_trusted_labels``: boolean, ``F.rand() < .1``
        - ``tags_split``: ``'train'`` when ``F.rand() < .5``, else ``'test'``

    The two ``F.rand()`` calls are unseeded, so the tags differ between
    runs.

    Args:
        x: Input DataFrame.

    Returns:
        The DataFrame with the three columns added.
    """
    stamped = x.withColumn('ingested_at', F.current_timestamp())

    is_trusted = F.rand() < 0.1
    tagged = stamped.withColumn('tags_trusted_labels', is_trusted)

    split_name = F.when(F.rand() < 0.5, 'train').otherwise('test')
    return tagged.withColumn('tags_split', split_name)
def clean_text_step(self, x: F.DataFrame):
    """Add a ``text_cleaned`` column derived from ``consumer_message``.

    Args:
        x: Input DataFrame containing a ``consumer_message`` column.

    Returns:
        The DataFrame with ``text_cleaned`` set to
        ``T.functions.clean('consumer_message')``.
    """
    cleaned_message = T.functions.clean('consumer_message')
    return x.withColumn('text_cleaned', cleaned_message)
def call(self, x: F.DataFrame):
    """Repartition by product and issue, then encode the text.

    Args:
        x: Input DataFrame containing ``product`` and ``issue`` columns.

    Returns:
        The output of ``self.encode_text_with_word2vec_step`` applied to
        the repartitioned DataFrame.
    """
    by_product_and_issue = x.repartition('product', 'issue')
    return self.encode_text_with_word2vec_step(by_product_and_issue)