Example no. 1
def get_df_mincityear_onw_cit(df_ani):
    return (
        df_ani
        .filter(sort_pub_year + ' >= ' + mincityear)
        .withColumn('references_u', func.array_distinct('references'))
        .select(
            func.col('Eid').alias('CitingEid'),
            func.explode('references_u').alias('Eid'),
            func.when(func.col('source.srcid').isin(discontinued_sources),
                      func.lit(1)).otherwise(func.lit(0)).alias('isDiscontinuedCiting'),
            func.col('Au.auid').cast('array<long>').alias('CitingAuids'))
        .join(
            df_ani.select(
                'Eid',
                func.col('Au.auid').cast('array<long>').alias('CitedAuids')),
            ['Eid'])
        .withColumn(
            'overLappingAuthors',
            func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
        .select(
            'CitingEid',
            'Eid',
            'isDiscontinuedCiting',
            func.expr('IF(overLappingAuthors>0,1,0)').alias('isSelfCitation'),
            func.expr('IF(overLappingAuthors>0,NULL,CitingEid)').alias('CitingEidNonSelf'))
        .groupBy('Eid')
        .agg(
            func.count('*').alias('CitationCount'),
            func.sum('isSelfCitation').alias('SelfCitationCount'),
            (func.count('*') - func.sum('isSelfCitation')).alias('CitationCountNonSelf'),
            func.collect_list('CitingEid').alias('CitingEids'),
            func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
            func.sum('isDiscontinuedCiting').alias('CitationCountFromDiscontinuedSources')))
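A minimal, self-contained sketch (toy data and hypothetical Eids/author ids, not part of the original snippet) of the core pattern above: a citation is flagged as a self-citation when the citing and cited author-id arrays overlap, measured with array_intersect.

from pyspark.sql import SparkSession, functions as func

spark = SparkSession.builder.getOrCreate()
pairs = spark.createDataFrame(
    [('e1', 'e2', [1, 2], [2, 3]),   # shared author id 2 -> self-citation
     ('e1', 'e3', [1, 2], [4, 5])],  # no overlap
    ['CitingEid', 'Eid', 'CitingAuids', 'CitedAuids'])

(pairs
 .withColumn('overLappingAuthors',
             func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
 .withColumn('isSelfCitation', func.expr('IF(overLappingAuthors>0,1,0)'))
 .show())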
Example no. 2
def get_column_spec(
    self, source_df: Optional[DataFrame], current_column: Optional[Column]
) -> Column:
    column_spec = array_distinct(
        *[
            col.get_column_spec(source_df=source_df, current_column=current_column)
            for col in self.value
        ]
    )
    return column_spec
Example no. 3
def column_revalue(vcf):
    # INFO values need to be revised
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
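A minimal sketch (toy FORMAT values, assumed schema) of the FORMAT handling above: flatten the nested arrays, de-duplicate, sort, then join into a single "GT:..." string.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame([([["AD", "DP"], ["DP", "GQ"]],)], ["FORMAT"])

(df.withColumn("FORMAT", F.array_sort(F.array_distinct(F.flatten(F.col("FORMAT")))))
   .withColumn("FORMAT", F.concat(F.lit("GT:"), F.array_join(F.col("FORMAT"), ":")))
   .show(truncate=False))  # -> GT:AD:DP:GQ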
Example no. 4
def process_orphanet(orphanet_df: DataFrame) -> DataFrame:
    """
    The JSON Schema format is applied to the df
    """

    # Map association type to sequence ontology ID:
    so_mapping_expr = create_map([lit(x) for x in chain(*CONSEQUENCE_MAP.items())])

    evidence_df = (
        orphanet_df.filter(~col('associationType').isin(EXCLUDED_ASSOCIATIONTYPES))
        .filter(~col('targetFromSourceId').isNull())
        .withColumn('datasourceId', lit('orphanet'))
        .withColumn('datatypeId', lit('genetic_association'))
        .withColumn('alleleOrigins', split(lit('germline'), '_'))
        .withColumn('literature', array_distinct(col('literature')))
        .withColumn(
            'variantFunctionalConsequenceId',
            so_mapping_expr.getItem(col('associationType')),
        )
        .drop('associationType', 'type')
        # Select the evidence relevant fields
        .select(
            'datasourceId',
            'datatypeId',
            'alleleOrigins',
            'confidence',
            'diseaseFromSource',
            'diseaseFromSourceId',
            'literature',
            'targetFromSource',
            'targetFromSourceId',
            'variantFunctionalConsequenceId',
        )
        .persist()
    )

    return evidence_df
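A short sketch of the create_map lookup pattern used above to translate associationType into a sequence ontology ID; the CONSEQUENCE_MAP entry here is only illustrative, not the real mapping.

from itertools import chain
from pyspark.sql import SparkSession
from pyspark.sql.functions import create_map, lit, col

spark = SparkSession.builder.getOrCreate()
# Illustrative stand-in for the module's CONSEQUENCE_MAP.
CONSEQUENCE_MAP = {"Disease-causing germline mutation(s) in": "SO_0002220"}
so_mapping_expr = create_map([lit(x) for x in chain(*CONSEQUENCE_MAP.items())])

df = spark.createDataFrame([("Disease-causing germline mutation(s) in",)], ["associationType"])
df.withColumn("variantFunctionalConsequenceId",
              so_mapping_expr.getItem(col("associationType"))).show(truncate=False)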
Example no. 5
pipelineTitle = Pipeline() \
    .setStages([
        documentAssemblerTitle,
        sentenceDetector,
        regexTokenizer,
        normalizer,
        pos,
        stopwords_cleaner,
        stemmer
    ])

nlp1 = pipelineBody.fit(questions_nlp_base).transform(questions_nlp_base)\
    .select(*questions_nlp_base.columns,
            f.size("sentence.result").alias("body_n_sentences"),
            f.size("normalized.result").alias("body_n_words"),
            f.size(f.array_distinct("stem.result")).alias(
                "body_n_distinct_words"),
            f.size(f.expr("filter(pos.result, x -> x like 'V%')")
                   ).alias("body_n_verbs"),
            f.size(f.expr("filter(pos.result, x -> x like 'N%')")
                   ).alias("body_n_nouns"),
            f.size(f.expr("filter(pos.result, x -> x like 'PR%')")
                   ).alias("body_n_pronouns"),
            f.size(f.expr("filter(pos.result, x -> x like 'J%')")
                   ).alias("body_n_adjectives"),
            f.size(f.expr("filter(pos.result, x -> x like 'RB%')")
                   ).alias("body_n_adverbs"),
            f.array_distinct(f.col("stem.result")).alias("body_words")
            )

questions_nlp = pipelineTitle.fit(nlp1).transform(nlp1)\
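A small, self-contained sketch (toy tokens standing in for the stem.result and pos.result annotations, so it does not need Spark NLP) of the counting pattern above: distinct-word counts via size(array_distinct(...)) and part-of-speech counts via a filter() SQL expression.

from pyspark.sql import SparkSession, functions as f

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame(
    [(["run", "run", "fast"], ["VB", "VB", "RB"])],
    ["stems", "pos"])

toy.select(
    f.size(f.array_distinct("stems")).alias("n_distinct_words"),
    f.size(f.expr("filter(pos, x -> x like 'V%')")).alias("n_verbs"),
    f.size(f.expr("filter(pos, x -> x like 'RB%')")).alias("n_adverbs"),
).show()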
        "time >= '" + start_time + "' AND " +
        "time < '" + end_time + "' AND " +
        "(product_id = 7008 OR product_id = 7009 or product_id = 7010 or product_id = 7011 or product_id = 8462)) alias")

pw_df = spark.read.jdbc(
            url = "jdbc:postgresql://timescale.ghana.powerwatch.io/powerwatch",
            table = query,
            predicates = predicates,
            properties={"user": args.user, "password": args.password, "driver":"org.postgresql.Driver"})

#if you have multiple saves below this prevents reloading the data every time
pw_df.cache()

#We should mark every row with the number of unique sensors reporting in +-5 days so we know the denominator for SAIDI/SAIFI
pw_distinct_core_id = pw_df.select("time","core_id")
pw_distinct_core_id = pw_distinct_core_id.groupBy(F.window("time", '10 days', '1 day')).agg(F.countDistinct("core_id"),F.array_distinct(F.collect_list("core_id")).alias("core_ids_reporting"))
pw_distinct_core_id = pw_distinct_core_id.withColumn("time", F.from_unixtime((F.unix_timestamp(col("window.start")) + F.unix_timestamp(col("window.end")))/2))
pw_distinct_core_id = pw_distinct_core_id.select(col("count(DISTINCT core_id)").alias("sensors_reporting"), "time","core_ids_reporting")
pw_distinct_core_id = pw_distinct_core_id.withColumn("day",F.date_trunc("day","time"))
pw_distinct_core_id = pw_distinct_core_id.select("day","sensors_reporting","core_ids_reporting")

pw_powered_locations = pw_df.select("time","is_powered","core_id","location_latitude","location_longitude")
pw_powered_locations = pw_powered_locations.withColumn("is_powered",col("is_powered").cast(IntegerType()))
pw_powered_locations = pw_powered_locations.groupBy("core_id",F.window("time",'4 minutes', '1 minute')).agg(F.avg("is_powered").alias("avg_power"),
                                                                                                            F.first("location_latitude").alias("location_latitude"),
                                                                                                            F.first("location_longitude").alias("location_longitude"))

pw_powered_locations = pw_powered_locations.filter(col("avg_power") == 1)
pw_powered_locations = pw_powered_locations.withColumn("time", col("window.start"))
pw_powered_locations = pw_powered_locations.select("time","core_id","location_latitude","location_longitude")
pw_powered_locations = pw_powered_locations.withColumn("loc_struct",F.struct("core_id","location_latitude","location_longitude"))
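A compact sketch (toy readings, assumed schema) of the windowed aggregation above: count the distinct sensors per sliding window and also keep the de-duplicated list of ids.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
readings = spark.createDataFrame(
    [("2020-01-01 00:01:00", "a"),
     ("2020-01-01 00:02:00", "a"),
     ("2020-01-01 00:03:00", "b")],
    ["time", "core_id"]).withColumn("time", F.to_timestamp("time"))

(readings
 .groupBy(F.window("time", "10 days", "1 day"))
 .agg(F.countDistinct("core_id").alias("sensors_reporting"),
      F.array_distinct(F.collect_list("core_id")).alias("core_ids_reporting"))
 .show(truncate=False))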
Example no. 7
    def execute(self, conf_path: str, input_path: str, output_path: str,
                on_dbfs: bool) -> None:
        """
        Pipeline that sanitize data, extract drugs and change the data model finally save to a JSON.
        This is the main entrypoint of the package. The parameters are the job's arguments.
        Args:
            conf_path: If DataBricks Filesystem is mounted
            input_path: Folder path to write files
            output_path: Folder path to read raw files
            on_dbfs: File path of the params.json

        Returns: Nothing only modify inplace the instanced class

        """

        self.load_params(conf_path)

        df_dict = Sanitizer.read_files(self.logger, self.spark, self.params,
                                       input_path)
        Sanitizer.clean_strings(self.logger, df_dict)
        df_dict = Sanitizer.clean_date(self.logger, df_dict)
        df_dict = Sanitizer.empty_str_cleaning(self.logger, df_dict)

        Files.merge_write(self.logger, df_dict,
                          self.params.get("merge sanitized rules"),
                          path.join(output_path, "sanitized"), self.spark)

        df_dict = Files.read_delta(
            self.logger, set(self.params.get("csv") + self.params.get("json")),
            path.join(output_path, "sanitized"), self.spark)
        Sanitizer.deduplication(self.logger, df_dict,
                                self.params.get("deduplication rules"))

        DrugsExtractor.to_words(self.logger, df_dict,
                                self.params.get("to words"))

        drug_df_name = self.params.get("names").get("drugs")
        drug_col_name = self.params.get("names").get("drug")

        df_dict[drug_df_name] = df_dict.get(drug_df_name).withColumn(
            drug_col_name,
            lower(col(drug_col_name))).filter(col(drug_col_name).isNotNull())
        # To be refactored: this does not work for very large drug lists because of the collect to the driver (below) and the column creation (above)
        # need to drop duplicates because the same drug can appear with several different ATC codes
        drugs_list = df_dict.get(drug_df_name).select(
            drug_col_name).drop_duplicates().toPandas()[drug_col_name].to_list(
            )
        df_dict.pop(drug_df_name)

        for df in df_dict.values():
            df.cache()
        self.logger.info(
            "Prepared drug list and cached dataframes for following intensive computation: {}"
            .format(df_dict))

        DrugsExtractor.pivot(self.logger, drugs_list, df_dict)

        date = self.params.get("names").get("date")
        id_col = self.params.get("names").get("id")
        journal = self.params.get("names").get("journal")
        columns_kept = [date, id_col, journal]
        df_dict = DrugsExtractor.shift(self.logger, drugs_list, df_dict,
                                       drug_col_name, self.spark, columns_kept)

        # Construct publication objects and journal object
        for df_name in self.params.get("to words").keys():
            df_dict[df_name] = df_dict.get(df_name).withColumn(
                date,
                col(date).cast(StringType()))
            df_dict[df_name] = df_dict.get(df_name).withColumn(id_col, struct(col(date).alias(date), col(id_col).alias(id_col))) \
                .withColumn(journal, struct(col(date).alias(date), col(journal).alias(journal)))
        self.logger.info(
            "Publication objects and journal object constructed: {}".format(
                df_dict))

        trial = self.params.get("names").get("clinical_trials")
        pubmed = self.params.get("names").get("pubmed")
        # For each drug, get the list of journals and publications (we use a set on journals to avoid duplicates)
        merge_trial_df = \
            df_dict.get(trial).groupby(drug_col_name)\
            .agg(collect_set(col(journal)).alias(journal), collect_list(col(id_col)).alias(trial))\
            .withColumn(pubmed, lit(None)
                        .cast(ArrayType(StructType([StructField('date', StringType(), True), StructField('id', StringType(), True)]))))
        self.logger.info("Created publication per drug for trials: {}".format(
            merge_trial_df))
        merge_pub_df = df_dict.get(pubmed).groupby(drug_col_name).agg(
            collect_set(col(journal)).alias(journal),
            collect_list(col(id_col)).alias(pubmed))
        self.logger.info(
            "Created publication per drug for pubmed: {}".format(merge_pub_df))

        # Merge clinical trials publications with pubmed publication by drug with their associated journal (without repetition)
        merge_path = path.join(output_path, "enriched")
        Files.merge_write(self.logger, {trial: merge_trial_df},
                          self.params.get("merge sanitized rules"), merge_path,
                          self.spark)
        delta_path = path.join(merge_path, trial)
        from delta.tables import DeltaTable
        delta_trial = DeltaTable.forPath(self.spark, delta_path)
        update_match = "trial.{0} = pub.{0}".format(drug_col_name)
        update = {
            pubmed:
            col(f"pub.{pubmed}"),
            journal:
            array_distinct(
                concat(col(f"pub.{journal}"), col(f"trial.{journal}")))
        }
        insert = {
            pubmed: col(f"pub.{pubmed}"),
            journal: col(f"pub.{journal}"),
            drug_col_name: col(f"pub.{drug_col_name}"),
            trial: lit(None)
        }
        self.logger.info(
            "Merging publications with the matching rule: {}".format(
                update_match))
        (delta_trial.alias("trial").merge(
            merge_pub_df.alias("pub"), update_match).whenMatchedUpdate(
                set=update).whenNotMatchedInsert(values=insert).execute())
        # Save the end result
        graph_filename = self.params.get("names").get("graph_filename")
        json_df = self.spark.read.format("delta").load(delta_path)

        # To use the filesystem mounted on Databricks from a Python process we need the "/dbfs/" prefix, but Spark processes do not work with this prefix
        pythonic_path = "/dbfs" + output_path if on_dbfs else output_path
        graph_path = path.join(pythonic_path, *graph_filename)
        json_df.withColumn(journal, to_json(col(journal))).withColumn(
            trial, to_json(col(trial))).withColumn(pubmed, to_json(
                col(pubmed))).toPandas().to_json(graph_path,
                                                 orient="records",
                                                 date_format="iso")
        # when reading this file back with Spark, the multiLine option needs to be enabled

        self.logger.info("Wrote the resulting JSON to: {}".format(graph_path))
Example no. 8
    def process_biomarkers(
        self,
        biomarkers_df: DataFrame,
        source_df: DataFrame,
        disease_df: DataFrame,
        drugs_df: DataFrame
    ) -> DataFrame:
        """The diverse steps to prepare and enrich the input table"""

        biomarkers_enriched = (
            biomarkers_df
            .select(
                'Biomarker', 'IndividualMutation',
                array_distinct(split(col('Alteration'), ';')).alias('alterations'),
                array_distinct(split(col('Gene'), ';')).alias('gene'),
                split(col('AlterationType'), ';').alias('alteration_types'),
                array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
                array_distinct(split(col('Drug'), ';|,')).alias('drug'),
                'DrugFullName', 'Association', 'gDNA',
                array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
                array_distinct(split(col('Source'), ';')).alias('source')
            )
            .withColumn('confidence', explode(col('confidence')))
            .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
            .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
            .withColumn('drug', explode(col('drug')))
            .withColumn('drug', translate(col('drug'), '[]', ''))
            .withColumn('gene', explode(col('gene')))
            .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
            .withColumn('gene', upper(col('gene')))
            # At this stage alterations and alteration_types are both arrays
            # Disambiguation when the biomarker consists of multiple alterations is needed
            # This is solved by:
            # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
            # 2. tmp is exploded - tmp consists of the alteration/type tuple
            # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
            .withColumn(
                'tmp',
                self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
            .withColumn('tmp', explode(col('tmp')))
            .withColumn('alteration_type', element_at(col('tmp'), 2))
            .withColumn(
                'alteration',
                when(
                    ~col('IndividualMutation').isNull(),
                    col('IndividualMutation')
                )
                .otherwise(element_at(col('tmp'), 1))
            )
            .drop('tmp')
            # Clean special cases on the alteration string
            .withColumn(
                'alteration',
                when(
                    col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                    col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
                )
                .when(
                    # Cleans strings like 'ARAF:.'
                    col('alteration').contains(':.'),
                    translate(col('alteration'), ':.', '')
                )
                .when(
                    # Fusion genes are described with '__'
                    # biomarker is a cleaner representation when there's one alteration
                    (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                    col('Biomarker')
                )
                .otherwise(col('alteration'))
            )
            # Split source into literature and urls
            # literature contains PMIDs
            # urls are enriched from the source table if not a CT
            .withColumn('source', explode(col('source')))
            .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
            .join(source_df, on='source', how='left')
            .withColumn(
                'literature',
                when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
            )
            .withColumn(
                'urls',
                when(
                    col('source').startswith('NCT'),
                    struct(
                        lit('Clinical Trials').alias('niceName'),
                        concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                    )
                )
                .when(
                    (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                    struct(col('niceName'), col('url'))
                )
            )
            # The previous conditional clause creates a struct regardless of
            # whether any condition is met. The empty struct is replaced with null
            .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
            # Enrich data
            .withColumn('functionalConsequenceId', col('alteration_type'))
            .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
            .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
            .join(disease_df, on='tumor_type', how='left')
            .withColumn('drug', upper(col('drug')))
            .withColumn(
                # drug class is coalesced when the precise name of the medicine is not provided
                'drug',
                when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
            .join(drugs_df, on='drug', how='left')
            .withColumn('drug', initcap(col('drug')))
            # Translate variantId
            .withColumn(
                'variantId',
                when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
            )
            # Assign a GO ID when a gene expression data is reported
            .withColumn(
                'geneExpressionId',
                when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                    'GO_0010628'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                    'GO_0010629'
                )
                .when(
                    (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                    'GO_0010467'
                )
            )
            # Create variant struct
            .withColumn(
                'variant',
                when(
                    col('alteration_type') != 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('variantId').alias('id'),
                        col('functionalConsequenceId')
                    )
                )
            )
            # Create geneExpression struct
            .withColumn(
                'geneExpression',
                when(
                    col('alteration_type') == 'EXPR',
                    struct(
                        col('alteration').alias('name'),
                        col('geneExpressionId').alias('id'))
                )
            )
        )

        pre_evidence = (
            biomarkers_enriched
            .withColumn('datasourceId', lit('cancer_biomarkers'))
            .withColumn('datatypeId', lit('affected_pathway'))
            .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
            .withColumnRenamed('drug', 'drugFromSource')
            # diseaseFromSourceMappedId, drugId populated above
            .withColumnRenamed('Association', 'drugResponse')
            # confidence, literature and urls populated above
            .withColumnRenamed('gene', 'targetFromSourceId')
            .withColumnRenamed('Biomarker', 'biomarkerName')
            # variant, geneExpression populated above
            .drop(
                'tumor_type', 'source', 'alteration', 'alteration_type', 'IndividualMutation', 'geneExpressionId',
                'gDNA', 'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url')
        )

        # Group evidence
        self.evidence = (
            pre_evidence
            .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                     'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                     'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
            .agg(
                collect_set('literature').alias('literature'),
                collect_set('urls').alias('urls'),
                collect_set('variant').alias('variant'),
                collect_set('geneExpression').alias('geneExpression'),
            )
            # Replace empty lists with null values
            .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
            .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
            .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
            .withColumn(
                'geneExpression',
                when(size(col('geneExpression')) == 0, lit(None))
                .otherwise(col('geneExpression')))
            # Collect variant info into biomarkers struct
            .withColumn(
                'biomarkers',
                struct(
                    'variant',
                    'geneExpression'
                ))
            .drop('variant', 'geneExpression')
            .distinct()
        )

        return self.evidence
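A small sketch (toy data) of the disambiguation step described in the comments above, using the built-in arrays_zip/explode (Spark 2.4+) instead of the class's zip_alterations_with_type_udf: pair each alteration with its type, then explode the pairs.

from pyspark.sql import SparkSession
from pyspark.sql.functions import arrays_zip, explode, col

spark = SparkSession.builder.getOrCreate()
biomarkers = spark.createDataFrame(
    [(["V600E", "amplification"], ["MUT", "CNA"])],
    ["alterations", "alteration_types"])

# arrays_zip names the struct fields after the input columns when they are plain column references
(biomarkers
 .withColumn("tmp", explode(arrays_zip("alterations", "alteration_types")))
 .select(col("tmp.alterations").alias("alteration"),
         col("tmp.alteration_types").alias("alteration_type"))
 .show())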
Example no. 9
# COMMAND ----------

# MAGIC %md
# MAGIC To prepare the data for analysis, we perform the following transformations:
# MAGIC - Split multiallelic variants with the ``split_multiallelics`` transformer.
# MAGIC - Calculate the number of alternate alleles for biallelic variants with `genotype_states`.
# MAGIC - Replace any missing values with the mean of the non-missing values using `mean_substitute`.
# MAGIC - Filter out all homozygous SNPs.

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics', base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1))
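
# COMMAND ----------

# MAGIC %md
# MAGIC A tiny sketch (toy genotype values, not part of the original notebook) of the monomorphic filter above: keep only rows whose `values` array contains more than one distinct value.

# COMMAND ----------

from pyspark.sql import functions as fx  # `spark` comes from the notebook session

toy_df = spark.createDataFrame([([0.0, 0.0, 0.0],), ([0.0, 1.0, 2.0],)], ['values'])
display(toy_df.filter(fx.size(fx.array_distinct('values')) > 1))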

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches.

# COMMAND ----------

block_df, sample_blocks = glow.wgr.block_variants_and_samples(
    variant_df, sample_ids, variants_per_block, sample_block_count)

# COMMAND ----------
Example no. 10
regexTokenizer = RegexTokenizer(
    inputCol="refined_text",
    outputCol="words",
    pattern="overfit|underfit|missing values|imbalance|covariate shift|outlier|leakage|calibration|dataset shift|drift",
    gaps=False)

# COMMAND ----------

regexTokenized = regexTokenizer.transform(ff)

display(regexTokenized)

# COMMAND ----------

wo_dupes = regexTokenized.withColumn("words_without_dupes",
                                     array_distinct("words"))

display(wo_dupes)

# COMMAND ----------

countdf = regexTokenized.select('*', size('words').alias('size'))

display(countdf)

# COMMAND ----------

countdf_wo_dupes = wo_dupes.select(
    '*',
    size('words_without_dupes').alias('dupes_wo_size'))
Example no. 11
            #func.max(sort_pub_year).alias('lastyr'),

            # number of cited papers.
            func.sum(
                func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear +
                          ' AND ' + maxyear +
                          ',IF(CitationCountNonSelf>0,1,0),0)')
            ).alias('ns_npcY1Y3'),
            func.sum(
                func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear +
                          ' AND ' + maxyear +
                          ',IF(CitationCount>0,1,0),0)')).alias('ws_npcY1Y3'),
            func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'),
            func.size(
                func.array_distinct(
                    func.flatten(func.collect_list(
                        'CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'),
            func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias(
                'ns_hY3'),
            func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')
                     ).alias('ns_hmY3'),
            func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'),
            func.sum(func.expr(
                'IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'),
            func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias(
                'ns_npsf'),
            func.sum(
                func.expr(
                    'IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')).
            alias('ns_ncsf'),
            func.sum(
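A brief sketch (toy author/citation rows) of the distinct-citing-papers count above: collect the per-paper lists, flatten them, de-duplicate, and take the size.

from pyspark.sql import SparkSession, functions as func

spark = SparkSession.builder.getOrCreate()
citations = spark.createDataFrame(
    [("a1", ["p1", "p2"]), ("a1", ["p2", "p3"])],
    ["auid", "CitingEidsNonSelf"])

(citations
 .groupBy("auid")
 .agg(func.size(func.array_distinct(func.flatten(func.collect_list("CitingEidsNonSelf"))))
      .alias("ns_ncY2Y3_cp"))
 .show())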
Example no. 12
def run(
    plink_path: str,
    traits_path: str,
    covariates_path: str,
    variants_per_block: int,
    sample_block_count: int,
    output_dir: str,
    plink_fam_sep: str = "\t",
    plink_bim_sep: str = "\t",
    alphas: Optional[list] = None,
    contigs: List[str] = None,
):
    """Run Glow WGR"""
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True, exist_ok=False)

    if alphas is None:
        alphas = np.array([])
    else:
        alphas = np.array(alphas).astype(float)

    spark = spark_session()
    logger.info(
        f"Loading PLINK dataset at {plink_path} (fam sep = {plink_fam_sep}, bim sep = {plink_bim_sep}, alphas = {alphas})"
    )
    df = (spark.read.format("plink").option(
        "bimDelimiter",
        plink_bim_sep).option("famDelimiter", plink_fam_sep).option(
            "includeSampleIds", True).option("mergeFidIid",
                                             False).load(plink_path))

    variant_df = df.withColumn(
        "values", mean_substitute(genotype_states(F.col("genotypes")))).filter(
            F.size(F.array_distinct("values")) > 1)
    if contigs is not None:
        variant_df = variant_df.filter(F.col("contigName").isin(contigs))

    sample_ids = get_sample_ids(variant_df)
    logger.info(
        f"Found {len(sample_ids)} samples, first 10: {sample_ids[:10]}")

    ###########
    # Stage 1 #
    ###########

    logger.info(HR)
    logger.info("Calculating variant/sample block info")
    block_df, sample_blocks = block_variants_and_samples(
        variant_df,
        sample_ids,
        variants_per_block=variants_per_block,
        sample_block_count=sample_block_count,
    )

    label_df = pd.read_csv(traits_path, index_col="sample_id")
    label_df = (label_df - label_df.mean()) / label_df.std(ddof=0)
    logger.info(HR)
    logger.info("Trait info:")
    logger.info(_info(label_df))

    cov_df = pd.read_csv(covariates_path, index_col="sample_id")
    cov_df = (cov_df - cov_df.mean()) / cov_df.std(ddof=0)
    logger.info(HR)
    logger.info("Covariate info:")
    logger.info(_info(cov_df))

    stack = RidgeReducer(alphas=alphas)
    reduced_block_df = stack.fit_transform(block_df, label_df, sample_blocks,
                                           cov_df)
    logger.info(HR)
    logger.info("Stage 1: Reduced block schema:")
    logger.info(_schema(reduced_block_df))

    path = output_path / "reduced_blocks.parquet"
    reduced_block_df.write.parquet(str(path), mode="overwrite")
    logger.info(f"Stage 1: Reduced blocks written to {path}")

    # Flatten to scalars for more convenient access w/o Spark
    flat_reduced_block_df = spark.read.parquet(str(path))
    path = output_path / "reduced_blocks_flat.csv.gz"
    flat_reduced_block_df = _flatten_reduced_blocks(flat_reduced_block_df)
    flat_reduced_block_df = flat_reduced_block_df.toPandas()
    flat_reduced_block_df.to_csv(path, index=False)
    # flat_reduced_block_df.write.parquet(str(path), mode='overwrite')
    logger.info(f"Stage 1: Flattened reduced blocks written to {path}")

    ###########
    # Stage 2 #
    ###########

    # Monkey-patch this in until there's a glow release beyond 0.5.0
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustements for glow != 0.5.0 (found {glow_version})"
        )
    # Remove after glow update
    RidgeRegression.transform_loco = transform_loco
    estimator = RidgeRegression(alphas=alphas)
    model_df, cv_df = estimator.fit(reduced_block_df, label_df, sample_blocks,
                                    cov_df)
    logger.info(HR)
    logger.info("Stage 2: Model schema:")
    logger.info(_schema(model_df))
    logger.info("Stage 2: CV schema:")
    logger.info(_schema(cv_df))

    y_hat_df = estimator.transform(reduced_block_df, label_df, sample_blocks,
                                   model_df, cv_df, cov_df)

    logger.info(HR)
    logger.info("Stage 2: Prediction info:")
    logger.info(_info(y_hat_df))
    logger.info(y_hat_df.head(5))

    path = output_path / "predictions.csv"
    y_hat_df.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: Predictions written to {path}")

    y_hat_df_loco = estimator.transform_loco(reduced_block_df, label_df,
                                             sample_blocks, model_df, cv_df,
                                             cov_df)

    path = output_path / "predictions_loco.csv"
    y_hat_df_loco.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: LOCO Predictions written to {path}")

    ###########
    # Stage 3 #
    ###########

    # Do this to correct for the error in Glow at https://github.com/projectglow/glow/issues/257
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustements for glow != 0.5.0 (found {glow_version})"
        )
    cov_arr = cov_df.to_numpy()
    cov_arr = cov_arr.T.ravel(order="C").reshape(cov_arr.shape)

    # Convert the pandas dataframe into a Spark DataFrame
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df)

    # Run GWAS w/o LOCO (this could be for a much larger set of variants)
    wgr_gwas = (variant_df.withColumnRenamed("values", "callValues").crossJoin(
        adjusted_phenotypes.withColumnRenamed(
            "values", "phenotypeValues")).select(
                "start",
                "names",
                "label",
                expand_struct(
                    linear_regression_gwas(F.col("callValues"),
                                           F.col("phenotypeValues"),
                                           F.lit(cov_arr))),
            ))

    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) schema:")
    logger.info(_schema(wgr_gwas))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()
    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (no LOCO) results written to {path}")
    logger.info(HR)
    logger.info("Done")

    # TODO: Enable this once WGR is fully released
    # See: https://github.com/projectglow/glow/issues/256

    # Run GWAS w/ LOCO
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df_loco)
    wgr_gwas = (variant_df.withColumnRenamed("values", "callValues").join(
        adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues"),
        ["contigName"],
    ).select(
        "contigName",
        "start",
        "names",
        "label",
        expand_struct(
            linear_regression_gwas(F.col("callValues"),
                                   F.col("phenotypeValues"), F.lit(cov_arr))),
    ))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()
    logger.info(HR)
    logger.info("Stage 3: GWAS (with LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas_loco.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (with LOCO) results written to {path}")
    logger.info(HR)
    logger.info("Done")
Example no. 13
print(f"Top 10 words {mvv}")

# Preparation for calculating stats

stpwr = set(stopwords.words('english'))
stpwr.update([' ', '  ', '   ', '    ', '', 'like', 'im', 'oh', 'dont', 'im'])

clean_df = newdf.filter(newdf['word'].isin(stpwr) == False)
#clean_df.sort(clean_df.wcount,ascending=False).show(10)

billdf = billdf.withColumn('allWords', f.size(f.split(f.col('Lyrics'), ' ')))

billdf = billdf.withColumn('uniqWords', f.split(f.col('Lyrics'), ' '))

billdf = billdf.withColumn('uniqWords', f.size(f.array_distinct("uniqWords")))
billdf = billdf.withColumn('Gini', f.col("uniqWords") / f.col("allWords"))

df2 = billdf.filter(billdf.allWords > 1)
#df2.show(10)

gini_df = df2.select(['Year',
                      'Gini']).groupby('Year').mean('Gini').withColumnRenamed(
                          'avg(Gini)', "Mean Gini")
#gini_df.orderBy('Year', ascending = True).show(20)

# Scatter plot of Gini coefficient

p_gini_df = gini_df.toPandas()

fig3 = px.scatter(p_gini_df,
Example no. 14
# MAGIC %md
# MAGIC
# MAGIC Extract sample IDs from a variant DataFrame with `glow.wgr.get_sample_ids`.

# COMMAND ----------

sample_ids = glow.wgr.get_sample_ids(base_variant_df)

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics', base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1).alias('variant_df'))

# COMMAND ----------

display(variant_df)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches.

# COMMAND ----------
Example no. 15
    pw_null_merge_time = pw_null_merge_time.select("user_id", "outage_times")
    pw_df = pw_df.union(pw_null_merge_time)

    udfTimestampAverage = udf(timestamp_average, LongType())
    pw_df = pw_df.withColumn("outage_time",
                             udfTimestampAverage("outage_times"))
    pw_df = pw_df.localCheckpoint(eager=True)
    print("Merged to:", pw_df.count())
    print()

#Okay now we have a list of outages, restore_times, locations, user_ids
#First let's calculate some high level metrics

#size of outages
pw_finalized_outages = pw_finalized_outages.withColumn(
    "cluster_size", F.size(F.array_distinct("user_id")))

#standard deviation outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
    "outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [
    F.first(x).alias(x) for x in pw_finalized_outages.columns
    if x != 'outage_times_stddev' and x != 'outage_time'
]
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(
    F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn(
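A compact sketch (toy outage rows) of the metrics above: cluster size as the number of distinct reporting sensors, and the spread of the individual outage timestamps after exploding them.

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
outages = spark.createDataFrame(
    [(1000, ["a", "b", "a"], [990, 1000, 1010])],
    ["outage_time", "user_id", "outage_times"])

(outages
 .withColumn("cluster_size", F.size(F.array_distinct("user_id")))
 .withColumn("t", F.explode("outage_times"))
 .groupBy("outage_time", "cluster_size")
 .agg(F.stddev_pop("t").alias("outage_times_stddev"))
 .show())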
Example no. 16
    lambda row: Row(business_id=row.split(",")[0],
                    avg_stars=row.split(',')[3],
                    user_stars=row.split(',')[4],
                    user_id=row.split(",")[5]))

# remove duplicate rows
data_rdd = data_rdd.map(lambda row: row).distinct()
data_rdd = data_rdd.filter(lambda row: row[1] <= row[2])

# create DataFrame
data_df = spark.createDataFrame(data_rdd)
data_df = data_df.groupby('user_id').agg(F.collect_list('business_id'))

# group restaurants with same name but different location together
data_df = data_df.withColumn("business_id_list",
                             array_distinct("collect_list(business_id)"))

# # Python API docs
fpGrowth = FPGrowth(itemsCol="business_id_list",
                    minSupport=0.001,
                    minConfidence=0.001)
model = fpGrowth.fit(data_df)

# # Display frequent itemsets
model.freqItemsets.orderBy([func.size("items"), "freq"],
                           ascending=[0, 0]).show(20, truncate=False)

# Association Rules
association = model.associationRules
association.orderBy([func.size("antecedent"), "confidence"],
                    ascending=[0, 0]).show(20, truncate=False)
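A minimal sketch (toy baskets) of the pattern above: de-duplicate each user's item list with array_distinct before fitting FPGrowth, which rejects transactions containing repeated items.

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.getOrCreate()
baskets = spark.createDataFrame(
    [("u1", ["b1", "b2", "b1"]), ("u2", ["b1", "b3"])],
    ["user_id", "business_ids"])
baskets = baskets.withColumn("business_id_list", F.array_distinct("business_ids"))

model = FPGrowth(itemsCol="business_id_list", minSupport=0.5, minConfidence=0.5).fit(baskets)
model.freqItemsets.orderBy(F.size("items").desc(), F.col("freq").desc()).show()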
Example no. 17
    pw_df = pw_df.select("core_id","outage_times","restore_time","location")
    pw_null_merge_time = pw_null_merge_time.select("core_id","outage_times","restore_time","location")
    pw_df = pw_df.union(pw_null_merge_time)

    udfTimestampAverage = udf(timestamp_average, LongType())
    pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times"))
    pw_df = pw_df.localCheckpoint(eager = True)
    print("Merged to:", pw_df.count())
    print()

#Okay now we have a list of outages, restore_times, locations, core_ids
#First let's calculate some high level metrics

#size of outages
pw_finalized_outages = pw_finalized_outages.withColumn("cluster_size", F.size(F.array_distinct("core_id")))

#standard deviation outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_stddev", F.explode("outage_times"))

#this expression essentially takes the first value of each column (which should all be the same after the explode)
exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'outage_times_stddev' and x != 'outage_time']
pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"),*exprs)

#range of outage times
pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_range", F.array_max("outage_times") - F.array_min("outage_times"))

#standard deviation and range of restore times
pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time"))
pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))
                                                   "tx", "feeder_id")
    pw_df = pw_df.union(pw_null_merge_time)

    udfTimestampAverage = udf(timestamp_average, LongType())
    pw_df = pw_df.withColumn("outage_time",
                             udfTimestampAverage("outage_times"))
    pw_df = pw_df.localCheckpoint(eager=True)
    print("Merged to:", pw_df.count())
    print()

#Okay now we have a list of outages, restore_times, locations, core_ids
#First let's calculate some high level metrics

#size of outages
pw_finalized_outages = pw_finalized_outages.withColumn(
    "cluster_size", F.size(F.array_distinct("core_id")))

# now filter the outages so that at least two devices went out
pw_finalized_outages = pw_finalized_outages.filter(col("cluster_size") >= 2)

# now explode the outage lists so that every line is a sensor involved in that outage and regroup by transformer and feeder
# then each outage maps to a number of sensors out under each transformer and feeder
# This gives the relative SAIFI contribution of each transformer in each outage
pw_outages_by_feeder = pw_finalized_outages.select("outage_time", "feeder_id")
pw_outages_by_feeder = pw_outages_by_feeder.withColumn("feeder_id",
                                                       F.explode("feeder_id"))
pw_outages_by_feeder = pw_outages_by_feeder.withColumn("size", F.lit(1))
pw_outages_by_feeder = pw_outages_by_feeder.groupBy(
    "outage_time", "feeder_id").agg(F.sum("size").alias("cluster_size"))
pw_outages_by_feeder = pw_outages_by_feeder.select(
    "outage_time", "cluster_size",
Example no. 19
array_subset = shows.select("name", "genres")

array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot_and_index"), 
    F.col("genres")[0].alias("col_and_index"),
    array_subset.genres.getItem(0).alias("dot_and_method"), 
    F.col("genres").getItem(0).alias("col_and_method"),
)

# array_subset.show()
array_subset_repeated = array_subset.select(
    "name",
    F.lit("Comedy").alias("one"),
    F.lit("Horror").alias("two"),
    F.lit("Drama").alias("three"),
    F.col("dot_and_index"),
).select(
    "name",
    F.array("one", "two", "three").alias("Some_Genres"),
    F.array_repeat("dot_and_index", 5).alias("Repeated_Genres"),
)

array_subset_repeated.show(1, False)

array_subset_repeated.select(
    "name", F.size("Some_Genres"), F.size("Repeated_Genres")
).show()

array_subset_repeated.select(
    "name", F.array_distinct("Some_Genres"), F.array_distinct("Repeated_Genres")
).show(1, False)
Example no. 20
    def generate_panelapp_evidence(self, input_file: str, output_file: str,
                                   cache_dir: str) -> None:
        logging.info('Filter and extract the necessary columns.')
        panelapp_df = self.spark.read.csv(input_file, sep=r'\t', header=True)
        # Panel version can be either a single number (e.g. 1), or two numbers separated by a dot (e.g. 3.14). We cast
        # either representation to float to ensure correct filtering below. (Note that conversion to float would not
        # work in the general case, because 3.4 > 3.14, but we only need to compare relative to 1.0.)
        panelapp_df = panelapp_df.withColumn(
            'Panel Version',
            panelapp_df['Panel Version'].cast('float').alias('Panel Version'))
        panelapp_df = (
            panelapp_df.filter((
                (col('List') == 'green') | (col('List') == 'amber'))
                               & (col('Panel Version') >= 1.0)
                               & (col('Panel Status') == 'PUBLIC')).select(
                                   'Symbol', 'Panel Id', 'Panel Name', 'List',
                                   'Mode of inheritance', 'Phenotypes')
            # The full original records are not redundant; however, uniqueness on a subset of fields is not guaranteed.
            .distinct())

        logging.info(
            'Fix typos and formatting errors which would interfere with phenotype splitting.'
        )
        panelapp_df = panelapp_df.withColumn('cleanedUpPhenotypes',
                                             col('Phenotypes'))
        for regexp, replacement in self.PHENOTYPE_BEFORE_SPLIT_RE.items():
            panelapp_df = panelapp_df.withColumn(
                'cleanedUpPhenotypes',
                regexp_replace(col('cleanedUpPhenotypes'), regexp,
                               replacement))

        logging.info('Split and explode the phenotypes.')
        panelapp_df = (panelapp_df.withColumn(
            'cohortPhenotypes',
            array_distinct(split(col('cleanedUpPhenotypes'), ';'))).withColumn(
                'phenotype', explode(col('cohortPhenotypes'))))

        logging.info(
            'Remove specific patterns and phrases which will interfere with ontology extraction and mapping.'
        )
        panelapp_df = panelapp_df.withColumn('diseaseFromSource',
                                             col('phenotype'))
        for regexp in self.PHENOTYPE_AFTER_SPLIT_RE:
            panelapp_df = panelapp_df.withColumn(
                'diseaseFromSource',
                regexp_replace(col('diseaseFromSource'), f'({regexp})', ''))

        logging.info(
            'Extract ontology information, clean up and filter the split phenotypes.'
        )
        panelapp_df = (
            panelapp_df

            # Extract Orphanet/MONDO/HP ontology identifiers and remove them from the phenotype string.
            .withColumn('ontology_namespace', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 1))
            .withColumn('ontology_namespace', regexp_replace(col('ontology_namespace'), 'OrphaNet: ORPHA', 'ORPHA'))
            .withColumn('ontology_id', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 2))
            .withColumn(
                'ontology',
                when(
                    (col('ontology_namespace') != '') & (col('ontology_id') != ''),
                    concat(col('ontology_namespace'), lit(':'), col('ontology_id'))
                )
            )
            .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OTHER_RE})', ''))

            # Extract OMIM identifiers and remove them from the phenotype string.
            .withColumn('omim_id', regexp_extract(col('diseaseFromSource'), self.OMIM_RE, 2))
            .withColumn('omim', when(col('omim_id') != '', concat(lit('OMIM:'), col('omim_id'))))
            .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OMIM_RE})', ''))

            # Choose one of the ontology identifiers, keeping OMIM as a priority.
            .withColumn('diseaseFromSourceId', when(col('omim').isNotNull(), col('omim')).otherwise(col('ontology')))
            .drop('ontology_namespace', 'ontology_id', 'ontology', 'omim_id', 'omim')

            # Clean up the final split phenotypes.
            .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), r'\(\)', ''))
            .withColumn('diseaseFromSource', trim(col('diseaseFromSource')))
            .withColumn('diseaseFromSource', when(col('diseaseFromSource') != '', col('diseaseFromSource')))

            # Remove low quality records, where the name of the phenotype string starts with a question mark.
            .filter(
                ~(
                    (col('diseaseFromSource').isNotNull()) & (col('diseaseFromSource').startswith('?'))
                )
            )

            # Remove duplication caused by cases where multiple phenotypes within the same record fail to generate any
            # phenotype string or ontology identifier.
            .distinct()

            # For records where we were unable to determine either a phenotype string or an ontology identifier,
            # substitute the panel name instead.
            .withColumn(
                'diseaseFromSource',
                when(
                    (col('diseaseFromSource').isNull()) & (col('diseaseFromSourceId').isNull()),
                    col('Panel Name')
                )
                .otherwise(col('diseaseFromSource'))
            )
            .persist()
        )

        logging.info('Fetch and join literature references.')
        all_panel_ids = panelapp_df.select(
            'Panel Id').toPandas()['Panel Id'].unique()
        literature_references = self.fetch_literature_references(all_panel_ids)
        panelapp_df = panelapp_df.join(literature_references,
                                       on=['Panel Id', 'Symbol'],
                                       how='left')

        if self.debug_output_phenotypes_filename:
            logging.info('Output tables for debugging purposes, if requested.')
            (panelapp_df.select(
                'Phenotypes',  # Original, unaltered string with all phenotypes.
                'cleanedUpPhenotypes',  # String with phenotypes after pre-split cleanup.
                'phenotype',  # Individual phenotype after splitting.
                'diseaseFromSource',  # Final cleaned up disease name.
                'diseaseFromSourceId',  # Final cleaned up disease ID.
            ).distinct().toPandas().to_csv(
                self.debug_output_phenotypes_filename, sep='\t', index=False))

        logging.info(
            'Drop unnecessary fields and populate the final evidence string structure.'
        )
        evidence_df = (
            panelapp_df.drop('Phenotypes', 'cleanedUpPhenotypes', 'phenotype')
            # allelicRequirements requires a list, but we always only have one value from PanelApp.
            .withColumn(
                'allelicRequirements',
                when(
                    col('Mode of inheritance').isNotNull(),
                    array(col('Mode of inheritance')))).drop(
                        'Mode of inheritance').withColumnRenamed(
                            'List', 'confidence').withColumn(
                                'datasourceId',
                                lit('genomics_england')).withColumn(
                                    'datatypeId', lit('genetic_literature'))
            # diseaseFromSourceId populated above
            # literature populated above
            .withColumnRenamed('Panel Id', 'studyId').withColumnRenamed(
                'Panel Name',
                'studyOverview').withColumnRenamed('Symbol',
                                                   'targetFromSourceId')

            # Some residual duplication is caused by slightly different representations from `cohortPhenotypes` being
            # cleaned up to the same representation in `diseaseFromSource`, for example "Pontocerebellar hypoplasia type
            # 2D (613811)" and "Pontocerebellar hypoplasia type 2D, 613811".
            .distinct())

        evidence_df = add_efo_mapping(evidence_strings=evidence_df,
                                      spark_instance=self.spark,
                                      ontoma_cache_dir=cache_dir)
        logging.info('Disease mappings have been added.')

        write_evidence_strings(evidence_df, output_file)
        logging.info(
            f'{evidence_df.count()} evidence strings have been saved to {output_file}'
        )
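A short sketch (toy phenotype string) of the split/de-duplicate/explode step used above to turn the semicolon-separated Phenotypes field into individual phenotype rows.

from pyspark.sql import SparkSession
from pyspark.sql.functions import array_distinct, split, explode, col

spark = SparkSession.builder.getOrCreate()
phenotypes = spark.createDataFrame(
    [('Epilepsy;Epilepsy;Ataxia, 208920',)], ['cleanedUpPhenotypes'])

(phenotypes
 .withColumn('cohortPhenotypes', array_distinct(split(col('cleanedUpPhenotypes'), ';')))
 .withColumn('phenotype', explode(col('cohortPhenotypes')))
 .show(truncate=False))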