Esempio n. 1
0
    def call(self, x: F.DataFrame):
        """Parse the textual columns, clean the text, and sort the result.

        Args:
            x: input DataFrame; it is passed through
                ``parse_textual_cols_step`` and then ``clean_text_step``.

        Returns:
            The processed DataFrame ordered by its ``complaint_id`` column.
        """
        parsed = self.parse_textual_cols_step(x)
        cleaned = self.clean_text_step(parsed)
        return cleaned.orderBy('complaint_id')
Esempio n. 2
0
    def call(self, x: F.DataFrame):
        """Drop unusable rows, then hand the rest to ``discard_duplicates``.

        A row is kept only when ``complaint_id`` and ``consumer_message`` are
        both non-null and ``consumer_message`` is not the empty string.

        Args:
            x: input DataFrame exposing ``complaint_id`` and
                ``consumer_message`` columns.

        Returns:
            Whatever ``self.discard_duplicates`` returns for the filtered
            rows, called with ``'complaint_id'`` and ``'ingested_at'``.
        """
        has_id = x.complaint_id.isNotNull()
        has_message = x.consumer_message.isNotNull()
        message_not_empty = x.consumer_message != ''

        valid_rows = x.where(has_id & has_message & message_not_empty)

        # NOTE(review): presumably keeps one row per complaint_id, picked by
        # ingested_at -- confirm against discard_duplicates.
        return self.discard_duplicates(valid_rows, 'complaint_id', 'ingested_at')
Esempio n. 3
0
File: vep.py Progetto: Hoeze/firefly
    def _transform(
        self,
        input_df: f.DataFrame,
        contig: pyspark.sql.column.Column,
        start: pyspark.sql.column.Column,
        end: pyspark.sql.column.Column,
        ref: pyspark.sql.column.Column,
        alt: pyspark.sql.column.Column,
        id: pyspark.sql.column.Column,
    ):
        """
        Runs Ensembl VEP on a Spark DataFrame through Glow's "pipe" transformer.

        Only the given columns are selected from `input_df`. The piped command
        is `self.call_args` serialized as JSON; rows are passed with the 'vcf'
        input formatter (header inferred) and the output is read back with the
        'text' output formatter.

        NOTE(review): the previous documentation stated that the selected
        columns need to provide the following fields -- confirm against the
        callers:
            - "contigName"
            - "start"
            - "end"
            - "referenceAllele"
            - "alternateAlleles"

        Args:
            input_df: Spark DataFrame holding the variants
            contig: contig name column
            start: variant start position column
            end: variant end position column
            ref: reference allele column
            alt: array of alternate alleles column
            id: array of id's

        Returns:
            Spark DataFrame with single column `text` that contains json-formatted VEP output as string
        """
        import glow

        # Selection order is contig, start, end, id, ref, alt.
        variant_columns = [contig, start, end, id, ref, alt]
        variants_df = input_df.select(variant_columns)

        pipe_options = dict(
            cmd=json.dumps(self.call_args),
            inputFormatter='vcf',
            inVcfHeader='infer',
            outputFormatter='text',
        )

        return glow.transform("pipe", variants_df, **pipe_options)
Esempio n. 4
0
File: vep.py Progetto: Hoeze/firefly
    def _parse_text(self, vep_transformed_df: f.DataFrame):
        """
        Turns the json-formatted VEP output string into structured columns.

        The `text` column is parsed as JSON using `self.output_schema`, the
        parsed struct is expanded into top-level columns, and the `input`
        column is dropped from the result.

        Args:
            vep_transformed_df: output of `self._transform()`

        Returns:
            Spark DataFrame with the fields of `self.output_schema` as columns,
            without `input`
        """
        parsed_struct = f.from_json('text', self.output_schema)
        with_data = vep_transformed_df.withColumn('data', parsed_struct)
        expanded = with_data.select(f.expr("data.*"))
        return expanded.drop("input")
Esempio n. 5
0
 def parse_textual_cols_step(self, x: F.DataFrame):
     """Convert the textual date and flag columns in place.

     ``date_received`` is parsed with the ``'M/d/yyyy'`` pattern, and
     ``disputed`` and ``timely_response`` are each replaced by the result of
     ``confirming_word_as_bool`` on the same column.

     Args:
         x: input DataFrame with ``date_received``, ``disputed`` and
             ``timely_response`` columns.

     Returns:
         The DataFrame with those three columns overwritten.
     """
     parsed = x.withColumn('date_received', F.to_date('date_received', 'M/d/yyyy'))
     # NOTE(review): presumably maps yes/no style words to booleans --
     # confirm against confirming_word_as_bool.
     for flag_col in ('disputed', 'timely_response'):
         parsed = parsed.withColumn(flag_col, confirming_word_as_bool(flag_col))
     return parsed
Esempio n. 6
0
 def add_tags_step(self, x: F.DataFrame):
     """Attach an ingestion timestamp and two randomly drawn tag columns.

     Added columns:
         - ``ingested_at``: ``F.current_timestamp()``
         - ``tags_trusted_labels``: ``F.rand() < .1``
         - ``tags_split``: ``'train'`` when a second ``F.rand()`` draw is
           below .5, otherwise ``'test'``

     Args:
         x: input DataFrame.

     Returns:
         The DataFrame with the three columns added.
     """
     tagged = x.withColumn('ingested_at', F.current_timestamp())
     tagged = tagged.withColumn('tags_trusted_labels', F.rand() < 0.1)

     # Independent draw from the one above.
     split_label = F.when(F.rand() < 0.5, 'train').otherwise('test')
     return tagged.withColumn('tags_split', split_label)
Esempio n. 7
0
 def clean_text_step(self, x: F.DataFrame):
     """Add a ``text_cleaned`` column built from ``consumer_message``.

     Args:
         x: input DataFrame with a ``consumer_message`` column.

     Returns:
         The DataFrame with ``text_cleaned`` set to the result of
         ``T.functions.clean('consumer_message')``.
     """
     cleaned_message = T.functions.clean('consumer_message')
     return x.withColumn('text_cleaned', cleaned_message)
Esempio n. 8
0
    def call(self, x: F.DataFrame):
        """Repartition by product and issue, then run the word2vec encoding step.

        Args:
            x: input DataFrame with ``product`` and ``issue`` columns.

        Returns:
            The result of ``self.encode_text_with_word2vec_step`` applied to
            the repartitioned DataFrame.
        """
        repartitioned = x.repartition('product', 'issue')
        return self.encode_text_with_word2vec_step(repartitioned)