def get_df_mincityear_onw_cit(df_ani):
    return (
        df_ani.filter(sort_pub_year + ' >= ' + mincityear)
        .withColumn('references_u', func.array_distinct('references'))
        .select(
            func.col('Eid').alias('CitingEid'),
            func.explode('references_u').alias('Eid'),
            func.when(
                func.col('source.srcid').isin(discontinued_sources),
                func.lit(int(1))).otherwise(func.lit(int(0))).alias('isDiscontinuedCiting'),
            func.col('Au.auid').cast('array<long>').alias('CitingAuids'))
        .join(
            df_ani.select(
                'Eid',
                func.col('Au.auid').cast('array<long>').alias('CitedAuids')),
            ["Eid"])
        .withColumn(
            'overLappingAuthors',
            func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
        .select(
            "CitingEid", "Eid", 'isDiscontinuedCiting',
            func.expr("IF(overLappingAuthors>0,1,0)").alias('isSelfCitation'),
            func.expr("IF(overLappingAuthors>0,NULL,CitingEid)").alias('CitingEidNonSelf'),
        )
        .groupBy('Eid')
        .agg(
            func.count('*').alias('CitationCount'),
            func.sum('isSelfCitation').alias('SelfCitationCount'),
            (func.count('*') - func.sum('isSelfCitation')).alias('CitationCountNonSelf'),
            func.collect_list('CitingEid').alias('CitingEids'),
            func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
            func.sum("isDiscontinuedCiting").alias('CitationCountFromDiscontinuedSources')))
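# Illustrative aside (not part of the original pipeline): a minimal, self-contained sketch of the
# array_intersect/size overlap test that drives the isSelfCitation flag above, run on toy author lists.
from pyspark.sql import SparkSession
from pyspark.sql import functions as func

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([([1, 2], [2, 3]), ([1], [3])], ["CitingAuids", "CitedAuids"])
toy.withColumn(
    "overLappingAuthors",
    func.size(func.array_intersect("CitingAuids", "CitedAuids"))).show()
# A shared auid yields overLappingAuthors > 0, which the query above turns into isSelfCitation = 1.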
def get_column_spec(
    self, source_df: Optional[DataFrame], current_column: Optional[Column]
) -> Column:
    column_spec = array_distinct(
        *[
            col.get_column_spec(source_df=source_df, current_column=current_column)
            for col in self.value
        ]
    )
    return column_spec
def column_revalue(vcf):
    # The INFO-related columns need to be recomputed
    name_list = ["ID", "REF", "ALT", "INFO", "FORMAT"]
    for name in name_list:
        if name == "FORMAT":
            vcf = vcf.withColumn(
                name, F.array_sort(F.array_distinct(F.flatten(F.col(name)))))
            vcf = vcf.withColumn(
                name, F.concat(F.lit("GT:"), F.array_join(F.col(name), ":")))
        else:
            vcf = vcf.withColumn(name, F.array_max(F.col(name)))
    return vcf
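# Standalone sketch (hypothetical FORMAT arrays, not taken from the VCF above) showing what the
# flatten -> array_distinct -> array_sort -> array_join chain in column_revalue produces.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame([([["GQ", "DP"], ["DP", "AD"]],)], ["FORMAT"])
demo = demo.withColumn("FORMAT", F.array_sort(F.array_distinct(F.flatten("FORMAT"))))
demo = demo.withColumn("FORMAT", F.concat(F.lit("GT:"), F.array_join("FORMAT", ":")))
demo.show(truncate=False)  # -> GT:AD:DP:GQ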
def process_orphanet(orphanet_df: DataFrame) -> DataFrame:
    """
    The JSON Schema format is applied to the df
    """
    # Map association type to sequence ontology ID:
    so_mapping_expr = create_map([lit(x) for x in chain(*CONSEQUENCE_MAP.items())])

    evidence_df = (
        orphanet_df.filter(~col('associationType').isin(EXCLUDED_ASSOCIATIONTYPES))
        .filter(~col('targetFromSourceId').isNull())
        .withColumn('datasourceId', lit('orphanet'))
        .withColumn('datatypeId', lit('genetic_association'))
        .withColumn('alleleOrigins', split(lit('germline'), '_'))
        .withColumn('literature', array_distinct(col('literature')))
        .withColumn(
            'variantFunctionalConsequenceId',
            so_mapping_expr.getItem(col('associationType')),
        )
        .drop('associationType', 'type')
        # Select the evidence relevant fields
        .select(
            'datasourceId',
            'datatypeId',
            'alleleOrigins',
            'confidence',
            'diseaseFromSource',
            'diseaseFromSourceId',
            'literature',
            'targetFromSource',
            'targetFromSourceId',
            'variantFunctionalConsequenceId',
        )
        .persist()
    )
    return evidence_df
pipelineTitle = Pipeline() \
    .setStages([
        documentAssemblerTitle,
        sentenceDetector,
        regexTokenizer,
        normalizer,
        pos,
        stopwords_cleaner,
        stemmer
    ])

nlp1 = pipelineBody.fit(questions_nlp_base).transform(questions_nlp_base)\
    .select(*questions_nlp_base.columns,
            f.size("sentence.result").alias("body_n_sentences"),
            f.size("normalized.result").alias("body_n_words"),
            f.size(f.array_distinct("stem.result")).alias("body_n_distinct_words"),
            f.size(f.expr("filter(pos.result, x -> x like 'V%')")).alias("body_n_verbs"),
            f.size(f.expr("filter(pos.result, x -> x like 'N%')")).alias("body_n_nouns"),
            f.size(f.expr("filter(pos.result, x -> x like 'PR%')")).alias("body_n_pronouns"),
            f.size(f.expr("filter(pos.result, x -> x like 'J%')")).alias("body_n_adjectives"),
            f.size(f.expr("filter(pos.result, x -> x like 'RB%')")).alias("body_n_adverbs"),
            f.array_distinct(f.col("stem.result")).alias("body_words")
            )

questions_nlp = pipelineTitle.fit(nlp1).transform(nlp1)\
"time >= '" + start_time + "' AND " + "time < '" + end_time + "' AND " + "(product_id = 7008 OR product_id = 7009 or product_id = 7010 or product_id = 7011 or product_id = 8462)) alias") pw_df = spark.read.jdbc( url = "jdbc:postgresql://timescale.ghana.powerwatch.io/powerwatch", table = query, predicates = predicates, properties={"user": args.user, "password": args.password, "driver":"org.postgresql.Driver"}) #if you have multiple saves below this prevents reloading the data every time pw_df.cache() #We should mark every row with the number of unique sensors reporting in +-5 days so we now the denominator for SAIDI/SAIFI pw_distinct_core_id = pw_df.select("time","core_id") pw_distinct_core_id = pw_distinct_core_id.groupBy(F.window("time", '10 days', '1 day')).agg(F.countDistinct("core_id"),F.array_distinct(F.collect_list("core_id")).alias("core_ids_reporting")) pw_distinct_core_id = pw_distinct_core_id.withColumn("time", F.from_unixtime((F.unix_timestamp(col("window.start")) + F.unix_timestamp(col("window.end")))/2)) pw_distinct_core_id = pw_distinct_core_id.select(col("count(DISTINCT core_id)").alias("sensors_reporting"), "time","core_ids_reporting") pw_distinct_core_id = pw_distinct_core_id.withColumn("day",F.date_trunc("day","time")) pw_distinct_core_id = pw_distinct_core_id.select("day","sensors_reporting","core_ids_reporting") pw_powered_locations = pw_df.select("time","is_powered","core_id","location_latitude","location_longitude") pw_powered_locations = pw_powered_locations.withColumn("is_powered",col("is_powered").cast(IntegerType())) pw_powered_locations = pw_powered_locations.groupBy("core_id",F.window("time",'4 minutes', '1 minute')).agg(F.avg("is_powered").alias("avg_power"), F.first("location_latitude").alias("location_latitude"), F.first("location_longitude").alias("location_longitude")) pw_powered_locations = pw_powered_locations.filter(col("avg_power") == 1) pw_powered_locations = pw_powered_locations.withColumn("time", col("window.start")) pw_powered_locations = pw_powered_locations.select("time","core_id","location_latitude","location_longitude") pw_powered_locations = pw_powered_locations.withColumn("loc_struct",F.struct("core_id","location_latitude","location_longitude"))
def execute(self, conf_path: str, input_path: str, output_path: str,
            on_dbfs: bool) -> None:
    """
    Pipeline that sanitizes the data, extracts drugs, changes the data model and
    finally saves to a JSON. This is the main entrypoint of the package.
    The parameters are the job's arguments.

    Args:
        conf_path: File path of the params.json
        input_path: Folder path to read raw files
        output_path: Folder path to write files
        on_dbfs: If the DataBricks Filesystem is mounted

    Returns:
        Nothing, only modifies the instanced class in place
    """
    self.load_params(conf_path)
    df_dict = Sanitizer.read_files(self.logger, self.spark, self.params, input_path)
    Sanitizer.clean_strings(self.logger, df_dict)
    df_dict = Sanitizer.clean_date(self.logger, df_dict)
    df_dict = Sanitizer.empty_str_cleaning(self.logger, df_dict)
    Files.merge_write(self.logger, df_dict,
                      self.params.get("merge sanitized rules"),
                      path.join(output_path, "sanitized"), self.spark)
    df_dict = Files.read_delta(
        self.logger, set(self.params.get("csv") + self.params.get("json")),
        path.join(output_path, "sanitized"), self.spark)
    Sanitizer.deduplication(self.logger, df_dict,
                            self.params.get("deduplication rules"))
    DrugsExtractor.to_words(self.logger, df_dict, self.params.get("to words"))
    drug_df_name = self.params.get("names").get("drugs")
    drug_col_name = self.params.get("names").get("drug")
    df_dict[drug_df_name] = df_dict.get(drug_df_name).withColumn(
        drug_col_name,
        lower(col(drug_col_name))).filter(col(drug_col_name).isNotNull())
    # To be refactored: this doesn't work for really large drug lists because of the
    # collect to the driver (below) and the column creation (above).
    # Duplicates need to be dropped because several drugs can have different ATC codes.
    drugs_list = df_dict.get(drug_df_name).select(
        drug_col_name).drop_duplicates().toPandas()[drug_col_name].to_list()
    df_dict.pop(drug_df_name)
    for df in df_dict.values():
        df.cache()
    self.logger.info(
        "Prepared drug list and cached dataframes for following intensive computation: {}"
        .format(df_dict))
    DrugsExtractor.pivot(self.logger, drugs_list, df_dict)
    date = self.params.get("names").get("date")
    id_col = self.params.get("names").get("id")
    journal = self.params.get("names").get("journal")
    columns_kept = [date, id_col, journal]
    df_dict = DrugsExtractor.shift(self.logger, drugs_list, df_dict,
                                   drug_col_name, self.spark, columns_kept)
    # Construct publication objects and journal object
    for df_name in self.params.get("to words").keys():
        df_dict[df_name] = df_dict.get(df_name).withColumn(
            date, col(date).cast(StringType()))
        df_dict[df_name] = df_dict.get(df_name) \
            .withColumn(id_col, struct(col(date).alias(date), col(id_col).alias(id_col))) \
            .withColumn(journal, struct(col(date).alias(date), col(journal).alias(journal)))
    self.logger.info(
        "Publication objects and journal object constructed: {}".format(df_dict))
    trial = self.params.get("names").get("clinical_trials")
    pubmed = self.params.get("names").get("pubmed")
    # For each drug, get the list of journals and publications
    # (we use a set on journals to avoid duplicates)
    merge_trial_df = \
        df_dict.get(trial).groupby(drug_col_name)\
        .agg(collect_set(col(journal)).alias(journal),
             collect_list(col(id_col)).alias(trial))\
        .withColumn(pubmed,
                    lit(None).cast(ArrayType(StructType([
                        StructField('date', StringType(), True),
                        StructField('id', StringType(), True)]))))
    self.logger.info("Created publication per drug for trials: {}".format(merge_trial_df))
    merge_pub_df = df_dict.get(pubmed).groupby(drug_col_name).agg(
        collect_set(col(journal)).alias(journal),
        collect_list(col(id_col)).alias(pubmed))
    self.logger.info("Created publication per drug for pubmed: {}".format(merge_pub_df))
    # Merge clinical trials publications with pubmed publications by drug,
    # with their associated journals (without repetition)
    merge_path = path.join(output_path, "enriched")
    Files.merge_write(self.logger, {trial: merge_trial_df},
                      self.params.get("merge sanitized rules"), merge_path, self.spark)
    delta_path = path.join(merge_path, trial)
    from delta.tables import DeltaTable
    delta_trial = DeltaTable.forPath(self.spark, delta_path)
    update_match = "trial.{0} = pub.{0}".format(drug_col_name)
    update = {
        pubmed: col(f"pub.{pubmed}"),
        journal: array_distinct(concat(col(f"pub.{journal}"), col(f"trial.{journal}")))
    }
    insert = {
        pubmed: col(f"pub.{pubmed}"),
        journal: col(f"pub.{journal}"),
        drug_col_name: col(f"pub.{drug_col_name}"),
        trial: lit(None)
    }
    self.logger.info(
        "Merging publications with the matching rule: {}".format(update_match))
    (delta_trial.alias("trial").merge(merge_pub_df.alias("pub"), update_match)
     .whenMatchedUpdate(set=update)
     .whenNotMatchedInsert(values=insert)
     .execute())
    # Save the end result
    graph_filename = self.params.get("names").get("graph_filename")
    json_df = self.spark.read.format("delta").load(delta_path)
    # To use the filesystem mounted on Databricks from a Python process we need the "/dbfs/"
    # prefix, but Spark processes don't work with this prefix
    pythonic_path = "/dbfs" + output_path if on_dbfs else output_path
    graph_path = path.join(pythonic_path, *graph_filename)
    json_df.withColumn(journal, to_json(col(journal))) \
        .withColumn(trial, to_json(col(trial))) \
        .withColumn(pubmed, to_json(col(pubmed))) \
        .toPandas().to_json(graph_path, orient="records", date_format="iso")
    # when reading this file back with Spark, multiLine needs to be enabled
    self.logger.info("Wrote the resulting JSON to: {}".format(graph_path))
def process_biomarkers(
    self,
    biomarkers_df: DataFrame,
    source_df: DataFrame,
    disease_df: DataFrame,
    drugs_df: DataFrame
) -> DataFrame:
    """The diverse steps to prepare and enrich the input table"""

    biomarkers_enriched = (
        biomarkers_df
        .select(
            'Biomarker', 'IndividualMutation',
            array_distinct(split(col('Alteration'), ';')).alias('alterations'),
            array_distinct(split(col('Gene'), ';')).alias('gene'),
            split(col('AlterationType'), ';').alias('alteration_types'),
            array_distinct(split(col("PrimaryTumorTypeFullName"), ";")).alias('tumor_type_full_name'),
            array_distinct(split(col('Drug'), ';|,')).alias('drug'),
            'DrugFullName', 'Association', 'gDNA',
            array_distinct(split(col('EvidenceLevel'), ',')).alias('confidence'),
            array_distinct(split(col('Source'), ';')).alias('source')
        )
        .withColumn('confidence', explode(col('confidence')))
        .withColumn('tumor_type_full_name', explode(col('tumor_type_full_name')))
        .withColumn('tumor_type', translate(col('tumor_type_full_name'), ' -', ''))
        .withColumn('drug', explode(col('drug')))
        .withColumn('drug', translate(col('drug'), '[]', ''))
        .withColumn('gene', explode(col('gene')))
        .replace(to_replace=GENENAMESOVERRIDE, subset=['gene'])
        .withColumn('gene', upper(col('gene')))
        # At this stage alterations and alteration_types are both arrays
        # Disambiguation when the biomarker consists of multiple alterations is needed
        # This is solved by:
        # 1. Zipping both fields - tmp consists of a list of alteration/type tuples
        # 2. tmp is exploded - tmp consists of the alteration/type tuple
        # 3. alteration & alteration_type columns are overwritten with the elements in the tuple
        .withColumn(
            'tmp',
            self.zip_alterations_with_type_udf(col('alterations'), col('alteration_types')))
        .withColumn('tmp', explode(col('tmp')))
        .withColumn('alteration_type', element_at(col('tmp'), 2))
        .withColumn(
            'alteration',
            when(
                ~col('IndividualMutation').isNull(),
                col('IndividualMutation')
            )
            .otherwise(element_at(col('tmp'), 1))
        )
        .drop('tmp')
        # Clean special cases on the alteration string
        .withColumn(
            'alteration',
            when(
                col('alteration') == 'NRAS:.12.,.13.,.59.,.61.,.117.,.146.',
                col('Biomarker')  # 'NRAS (12,13,59,61,117,146)'
            )
            .when(
                # Cleans strings like 'ARAF:.'
                col('alteration').contains(':.'),
                translate(col('alteration'), ':.', '')
            )
            .when(
                # Fusion genes are described with '__'
                # biomarker is a cleaner representation when there's one alteration
                (col('alteration').contains('__')) & (~col('Biomarker').contains('+')),
                col('Biomarker')
            )
            .otherwise(col('alteration'))
        )
        # Split source into literature and urls
        # literature contains PMIDs
        # urls are enriched from the source table if not a CT
        .withColumn('source', explode(col('source')))
        .withColumn('source', trim(regexp_extract(col('source'), r'(PMID:\d+)|([\w ]+)', 0).alias('source')))
        .join(source_df, on='source', how='left')
        .withColumn(
            'literature',
            when(col('source').startswith('PMID'), regexp_extract(col('source'), r'(PMID:)(\d+)', 2))
        )
        .withColumn(
            'urls',
            when(
                col('source').startswith('NCT'),
                struct(
                    lit('Clinical Trials').alias('niceName'),
                    concat(lit('https://clinicaltrials.gov/ct2/show/'), col('source')).alias('url')
                )
            )
            .when(
                (~col('source').startswith('PMID')) | (~col('source').startswith('NCIT')),
                struct(col('niceName'), col('url'))
            )
        )
        # The previous conditional clause creates a struct regardless of
        # whether any condition is met. The empty struct is replaced with null
        .withColumn('urls', when(~col('urls.niceName').isNull(), col('urls')))
        # Enrich data
        .withColumn('functionalConsequenceId', col('alteration_type'))
        .replace(to_replace=ALTERATIONTYPE2FUNCTIONCSQ, subset=['functionalConsequenceId'])
        .replace(to_replace=DRUGRESPONSE2EFO, subset=['Association'])
        .join(disease_df, on='tumor_type', how='left')
        .withColumn('drug', upper(col('drug')))
        .withColumn(
            # drug class is coalesced when the precise name of the medicine is not provided
            'drug',
            when(col('drug') == '', col('DrugFullName')).otherwise(col('drug')))
        .join(drugs_df, on='drug', how='left')
        .withColumn('drug', initcap(col('drug')))
        # Translate variantId
        .withColumn(
            'variantId',
            when(~col('gDNA').isNull(), self.get_variantId_udf(col('gDNA')))
        )
        # Assign a GO ID when gene expression data is reported
        .withColumn(
            'geneExpressionId',
            when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('over')),
                'GO_0010628'
            )
            .when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('under')),
                'GO_0010629'
            )
            .when(
                (col('alteration_type') == 'EXPR') & (col('alteration').contains('norm')),
                'GO_0010467'
            )
        )
        # Create variant struct
        .withColumn(
            'variant',
            when(
                col('alteration_type') != 'EXPR',
                struct(
                    col('alteration').alias('name'),
                    col('variantId').alias('id'),
                    col('functionalConsequenceId')
                )
            )
        )
        # Create geneExpression struct
        .withColumn(
            'geneExpression',
            when(
                col('alteration_type') == 'EXPR',
                struct(
                    col('alteration').alias('name'),
                    col('geneExpressionId').alias('id'))
            )
        )
    )

    pre_evidence = (
        biomarkers_enriched
        .withColumn('datasourceId', lit('cancer_biomarkers'))
        .withColumn('datatypeId', lit('affected_pathway'))
        .withColumnRenamed('tumor_type_full_name', 'diseaseFromSource')
        .withColumnRenamed('drug', 'drugFromSource')
        # diseaseFromSourceMappedId, drugId populated above
        .withColumnRenamed('Association', 'drugResponse')
        # confidence, literature and urls populated above
        .withColumnRenamed('gene', 'targetFromSourceId')
        .withColumnRenamed('Biomarker', 'biomarkerName')
        # variant, geneExpression populated above
        .drop(
            'tumor_type', 'source', 'alteration', 'alteration_type',
            'IndividualMutation', 'geneExpressionId', 'gDNA',
            'functionalConsequenceId', 'variantId', 'DrugFullName', 'niceName', 'url')
    )

    # Group evidence
    self.evidence = (
        pre_evidence
        .groupBy('datasourceId', 'datatypeId', 'drugFromSource', 'drugId',
                 'drugResponse', 'targetFromSourceId', 'diseaseFromSource',
                 'diseaseFromSourceMappedId', 'confidence', 'biomarkerName')
        .agg(
            collect_set('literature').alias('literature'),
            collect_set('urls').alias('urls'),
            collect_set('variant').alias('variant'),
            collect_set('geneExpression').alias('geneExpression'),
        )
        # Replace empty lists with null values
        .withColumn('literature', when(size(col('literature')) == 0, lit(None)).otherwise(col('literature')))
        .withColumn('urls', when(size(col('urls')) == 0, lit(None)).otherwise(col('urls')))
        .withColumn('variant', when(size(col('variant')) == 0, lit(None)).otherwise(col('variant')))
        .withColumn(
            'geneExpression',
            when(size(col('geneExpression')) == 0, lit(None))
            .otherwise(col('geneExpression')))
        # Collect variant info into biomarkers struct
        .withColumn('biomarkers', struct('variant', 'geneExpression'))
        .drop('variant', 'geneExpression')
        .distinct()
    )

    return self.evidence
# COMMAND ----------

# MAGIC %md
# MAGIC To prepare the data for analysis, we perform the following transformations:
# MAGIC - Split multiallelic variants with the `split_multiallelics` transformer.
# MAGIC - Calculate the number of alternate alleles for biallelic variants with `genotype_states`.
# MAGIC - Replace any missing values with the mean of the non-missing values using `mean_substitute`.
# MAGIC - Filter out all homozygous SNPs.

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics',
    base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1))

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches.

# COMMAND ----------

block_df, sample_blocks = glow.wgr.block_variants_and_samples(
    variant_df, sample_ids, variants_per_block, sample_block_count)

# COMMAND ----------
inputCol="refined_text", outputCol="words", pattern= "overfit|underfit|missing values|imbalance|covariate shift|outlier|leakage|calibration|dataset shift|drift", gaps=False) # COMMAND ---------- regexTokenized = regexTokenizer.transform(ff) display(regexTokenized) # COMMAND ---------- wo_dupes = regexTokenized.withColumn("words_without_dupes", array_distinct("words")) display(wo_dupes) # COMMAND ---------- countdf = regexTokenized.select('*', size('words').alias('size')) display(countdf) # COMMAND ---------- countdf_wo_dupes = wo_dupes.select( '*', size('words_without_dupes').alias('dupes_wo_size'))
    #func.max(sort_pub_year).alias('lastyr'),
    # number of cited papers.
    func.sum(
        func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear + ' AND ' + maxyear +
                  ',IF(CitationCountNonSelf>0,1,0),0)')).alias('ns_npcY1Y3'),
    func.sum(
        func.expr('IF(' + sort_pub_year + ' BETWEEN ' + minyear + ' AND ' + maxyear +
                  ',IF(CitationCount>0,1,0),0)')).alias('ws_npcY1Y3'),
    func.sum('CitationCountNonSelf').alias('ns_ncY2Y3'),
    func.size(
        func.array_distinct(
            func.flatten(func.collect_list('CitingEidsNonSelf')))).alias('ns_ncY2Y3_cp'),
    func.max(func.expr('IF(ns_r<=CitationCountNonSelf,ns_r,0)')).alias('ns_hY3'),
    func.max(func.expr('IF(ns_r_eff<=CitationCountNonSelf,ns_r_eff,0)')).alias('ns_hmY3'),
    func.sum(func.expr('IF(n_authors=1,1,0)')).alias('ns_nps'),
    func.sum(func.expr('IF(n_authors=1,CitationCountNonSelf,0)')).alias('ns_ncs'),
    func.sum(func.expr('IF(n_authors=1 OR Authorseq=1,1,0)')).alias('ns_npsf'),
    func.sum(
        func.expr('IF(n_authors=1 OR Authorseq=1,CitationCountNonSelf,0)')).alias('ns_ncsf'),
    func.sum(
def run(
    plink_path: str,
    traits_path: str,
    covariates_path: str,
    variants_per_block: int,
    sample_block_count: int,
    output_dir: str,
    plink_fam_sep: str = "\t",
    plink_bim_sep: str = "\t",
    alphas: Optional[list] = None,
    contigs: List[str] = None,
):
    """Run Glow WGR"""
    output_path = Path(output_dir)
    if output_path.exists():
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True, exist_ok=False)

    if alphas is None:
        alphas = np.array([])
    else:
        alphas = np.array(alphas).astype(float)

    spark = spark_session()
    logger.info(
        f"Loading PLINK dataset at {plink_path} (fam sep = {plink_fam_sep}, bim sep = {plink_bim_sep}, alphas = {alphas})"
    )
    df = (spark.read.format("plink")
          .option("bimDelimiter", plink_bim_sep)
          .option("famDelimiter", plink_fam_sep)
          .option("includeSampleIds", True)
          .option("mergeFidIid", False)
          .load(plink_path))

    variant_df = df.withColumn(
        "values", mean_substitute(genotype_states(F.col("genotypes")))).filter(
            F.size(F.array_distinct("values")) > 1)
    if contigs is not None:
        variant_df = variant_df.filter(F.col("contigName").isin(contigs))

    sample_ids = get_sample_ids(variant_df)
    logger.info(f"Found {len(sample_ids)} samples, first 10: {sample_ids[:10]}")

    ###########
    # Stage 1 #
    ###########

    logger.info(HR)
    logger.info("Calculating variant/sample block info")
    block_df, sample_blocks = block_variants_and_samples(
        variant_df,
        sample_ids,
        variants_per_block=variants_per_block,
        sample_block_count=sample_block_count,
    )

    label_df = pd.read_csv(traits_path, index_col="sample_id")
    label_df = (label_df - label_df.mean()) / label_df.std(ddof=0)
    logger.info(HR)
    logger.info("Trait info:")
    logger.info(_info(label_df))

    cov_df = pd.read_csv(covariates_path, index_col="sample_id")
    cov_df = (cov_df - cov_df.mean()) / cov_df.std(ddof=0)
    logger.info(HR)
    logger.info("Covariate info:")
    logger.info(_info(cov_df))

    stack = RidgeReducer(alphas=alphas)
    reduced_block_df = stack.fit_transform(block_df, label_df, sample_blocks, cov_df)
    logger.info(HR)
    logger.info("Stage 1: Reduced block schema:")
    logger.info(_schema(reduced_block_df))

    path = output_path / "reduced_blocks.parquet"
    reduced_block_df.write.parquet(str(path), mode="overwrite")
    logger.info(f"Stage 1: Reduced blocks written to {path}")

    # Flatten to scalars for more convenient access w/o Spark
    flat_reduced_block_df = spark.read.parquet(str(path))
    path = output_path / "reduced_blocks_flat.csv.gz"
    flat_reduced_block_df = _flatten_reduced_blocks(flat_reduced_block_df)
    flat_reduced_block_df = flat_reduced_block_df.toPandas()
    flat_reduced_block_df.to_csv(path, index=False)
    # flat_reduced_block_df.write.parquet(str(path), mode='overwrite')
    logger.info(f"Stage 1: Flattened reduced blocks written to {path}")

    ###########
    # Stage 2 #
    ###########

    # Monkey-patch this in until there's a glow release beyond 0.5.0
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustments for glow != 0.5.0 (found {glow_version})")
    # Remove after glow update
    RidgeRegression.transform_loco = transform_loco
    estimator = RidgeRegression(alphas=alphas)
    model_df, cv_df = estimator.fit(reduced_block_df, label_df, sample_blocks, cov_df)
    logger.info(HR)
    logger.info("Stage 2: Model schema:")
    logger.info(_schema(model_df))
    logger.info("Stage 2: CV schema:")
    logger.info(_schema(cv_df))

    y_hat_df = estimator.transform(reduced_block_df, label_df, sample_blocks,
                                   model_df, cv_df, cov_df)
    logger.info(HR)
    logger.info("Stage 2: Prediction info:")
    logger.info(_info(y_hat_df))
    logger.info(y_hat_df.head(5))

    path = output_path / "predictions.csv"
    y_hat_df.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: Predictions written to {path}")

    y_hat_df_loco = estimator.transform_loco(reduced_block_df, label_df,
                                             sample_blocks, model_df, cv_df, cov_df)
    path = output_path / "predictions_loco.csv"
    y_hat_df_loco.reset_index().to_csv(path, index=False)
    logger.info(f"Stage 2: LOCO Predictions written to {path}")

    ###########
    # Stage 3 #
    ###########

    # Do this to correct for the error in Glow at https://github.com/projectglow/glow/issues/257
    if glow_version != "0.5.0":
        raise NotImplementedError(
            f"Must remove adjustments for glow != 0.5.0 (found {glow_version})")
    cov_arr = cov_df.to_numpy()
    cov_arr = cov_arr.T.ravel(order="C").reshape(cov_arr.shape)

    # Convert the pandas dataframe into a Spark DataFrame
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df)

    # Run GWAS w/o LOCO (this could be for a much larger set of variants)
    wgr_gwas = (variant_df.withColumnRenamed("values", "callValues").crossJoin(
        adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues")).select(
            "start",
            "names",
            "label",
            expand_struct(
                linear_regression_gwas(F.col("callValues"),
                                       F.col("phenotypeValues"), F.lit(cov_arr))),
        ))

    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) schema:")
    logger.info(_schema(wgr_gwas))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()
    logger.info(HR)
    logger.info("Stage 3: GWAS (no LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (no LOCO) results written to {path}")
    logger.info(HR)
    logger.info("Done")

    # TODO: Enable this once WGR is fully released
    # See: https://github.com/projectglow/glow/issues/256
    # Run GWAS w/ LOCO
    adjusted_phenotypes = reshape_for_gwas(spark, label_df - y_hat_df_loco)
    wgr_gwas = (variant_df.withColumnRenamed("values", "callValues").join(
        adjusted_phenotypes.withColumnRenamed("values", "phenotypeValues"),
        ["contigName"],
    ).select(
        "contigName",
        "start",
        "names",
        "label",
        expand_struct(
            linear_regression_gwas(F.col("callValues"),
                                   F.col("phenotypeValues"), F.lit(cov_arr))),
    ))

    # Convert to pandas
    wgr_gwas = wgr_gwas.toPandas()
    logger.info(HR)
    logger.info("Stage 3: GWAS (with LOCO) info:")
    logger.info(_info(wgr_gwas))
    logger.info(wgr_gwas.head(5))

    path = output_path / "gwas_loco.csv"
    wgr_gwas.to_csv(path, index=False)
    logger.info(f"Stage 3: GWAS (with LOCO) results written to {path}")
    logger.info(HR)
    logger.info("Done")
print(f"Top 10 words {mvv}") # Preparation for calculating stats stpwr = set(stopwords.words('english')) stpwr.update([' ', ' ', ' ', ' ', '', 'like', 'im', 'oh', 'dont', 'im']) clean_df = newdf.filter(newdf['word'].isin(stpwr) == False) #clean_df.sort(clean_df.wcount,ascending=False).show(10) billdf = billdf.withColumn('allWords', f.size(f.split(f.col('Lyrics'), ' '))) billdf = billdf.withColumn('uniqWords', f.split(f.col('Lyrics'), ' ')) billdf = billdf.withColumn('uniqWords', f.size(f.array_distinct("uniqWords"))) billdf = billdf.withColumn('Gini', f.col("uniqWords") / f.col("allWords")) df2 = billdf.filter(billdf.allWords > 1) #df2.show(10) gini_df = df2.select(['Year', 'Gini']).groupby('Year').mean('Gini').withColumnRenamed( 'avg(Gini)', "Mean Gini") #gini_df.orderBy('Year', ascending = True).show(20) # Scatter plot of Gini coefficient p_gini_df = gini_df.toPandas() fig3 = px.scatter(p_gini_df,
# MAGIC %md
# MAGIC
# MAGIC Extract sample IDs from a variant DataFrame with `glow.wgr.get_sample_ids`.

# COMMAND ----------

sample_ids = glow.wgr.get_sample_ids(base_variant_df)

# COMMAND ----------

variant_df = (glow.transform(
    'split_multiallelics',
    base_variant_df).withColumn(
        'values',
        glow.mean_substitute(glow.genotype_states('genotypes'))).filter(
            fx.size(fx.array_distinct('values')) > 1).alias('variant_df'))

# COMMAND ----------

display(variant_df)

# COMMAND ----------

# MAGIC %md
# MAGIC
# MAGIC Create the beginning block genotype matrix and sample block ID mapping with `glow.wgr.block_variants_and_samples`.
# MAGIC
# MAGIC Write the block matrix to Delta and the sample blocks to a JSON file so that we can reuse them for multiple phenotype batches.

# COMMAND ----------
pw_null_merge_time = pw_null_merge_time.select("user_id", "outage_times") pw_df = pw_df.union(pw_null_merge_time) udfTimestampAverage = udf(timestamp_average, LongType()) pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times")) pw_df = pw_df.localCheckpoint(eager=True) print("Merged to:", pw_df.count()) print() #Okay now we have a list of outages, restore_times, locations, user_ids #First let's calculate some high level metrics #size of outages pw_finalized_outages = pw_finalized_outages.withColumn( "cluster_size", F.size(F.array_distinct("user_id"))) #standard deviation outage times pw_finalized_outages = pw_finalized_outages.withColumn( "outage_times_stddev", F.explode("outage_times")) #this expression essentially takes the first value of each column (which should all be the same after the explode) exprs = [ F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'outage_times_stddev' and x != 'outage_time' ] pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg( F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"), *exprs) #range of outage times pw_finalized_outages = pw_finalized_outages.withColumn(
    lambda row: Row(business_id=row.split(",")[0],
                    avg_stars=row.split(',')[3],
                    user_stars=row.split(',')[4],
                    user_id=row.split(",")[5]))

# remove duplicate rows
data_rdd = data_rdd.map(lambda row: row).distinct()
data_rdd = data_rdd.filter(lambda row: row[1] <= row[2])

# create DataFrame
data_df = spark.createDataFrame(data_rdd)
data_df = data_df.groupby('user_id').agg(F.collect_list('business_id'))

# group restaurants with the same name but different locations together
data_df = data_df.withColumn("business_id_list", array_distinct("collect_list(business_id)"))

# Python API docs
fpGrowth = FPGrowth(itemsCol="business_id_list", minSupport=0.001, minConfidence=0.001)
model = fpGrowth.fit(data_df)

# Display frequent itemsets
model.freqItemsets.orderBy([func.size("items"), "freq"],
                           ascending=[0, 0]).show(20, truncate=False)

# Association Rules
association = model.associationRules
association.orderBy([func.size("antecedent"), "confidence"],
                    ascending=[0, 0]).show(20, truncate=False)
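# Hedged, toy reproduction (made-up baskets and thresholds) of the FPGrowth call above,
# just to show the pyspark.ml.fpm API end to end on data that fits in one screen.
from pyspark.sql import SparkSession
from pyspark.ml.fpm import FPGrowth

spark = SparkSession.builder.getOrCreate()
baskets = spark.createDataFrame(
    [(0, ["a", "b"]), (1, ["a", "c"]), (2, ["a", "b", "c"])], ["id", "business_id_list"])
fp = FPGrowth(itemsCol="business_id_list", minSupport=0.3, minConfidence=0.5)
toy_model = fp.fit(baskets)
toy_model.freqItemsets.show()
toy_model.associationRules.show()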
pw_df = pw_df.select("core_id","outage_times","restore_time","location") pw_null_merge_time = pw_null_merge_time.select("core_id","outage_times","restore_time","location") pw_df = pw_df.union(pw_null_merge_time) udfTimestampAverage = udf(timestamp_average, LongType()) pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times")) pw_df = pw_df.localCheckpoint(eager = True) print("Merged to:", pw_df.count()) print() #Okay now we have a list of outages, restore_times, locations, core_ids #First let's calculate some high level metrics #size of outages pw_finalized_outages = pw_finalized_outages.withColumn("cluster_size", F.size(F.array_distinct("core_id"))) #standard deviation outage times pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_stddev", F.explode("outage_times")) #this expression essentially takes the first value of each column (which should all be the same after the explode) exprs = [F.first(x).alias(x) for x in pw_finalized_outages.columns if x != 'outage_times_stddev' and x != 'outage_time'] pw_finalized_outages = pw_finalized_outages.groupBy("outage_time").agg(F.stddev_pop("outage_times_stddev").alias("outage_times_stddev"),*exprs) #range of outage times pw_finalized_outages = pw_finalized_outages.withColumn("outage_times_range", F.array_max("outage_times") - F.array_min("outage_times")) #standard deviation and range of restore times pw_finalized_outages = pw_finalized_outages.withColumn("restore_times", col("restore_time")) pw_finalized_outages = pw_finalized_outages.withColumn("restore_time", F.explode("restore_time"))
"tx", "feeder_id") pw_df = pw_df.union(pw_null_merge_time) udfTimestampAverage = udf(timestamp_average, LongType()) pw_df = pw_df.withColumn("outage_time", udfTimestampAverage("outage_times")) pw_df = pw_df.localCheckpoint(eager=True) print("Merged to:", pw_df.count()) print() #Okay now we have a list of outages, restore_times, locations, core_ids #First let's calculate some high level metrics #size of outages pw_finalized_outages = pw_finalized_outages.withColumn( "cluster_size", F.size(F.array_distinct("core_id"))) # now filter the outages so that at least two devices went out pw_finalized_outages = pw_finalized_outages.filter(col("cluster_size") >= 2) # now explode the outage lists so that every line is a sensor involved in that outage and regroup by transformer and feeder # then each outage maps to a number of sensors out under each transformer and feeder # This gives the relative SAIFI contribution of each transformer in each outage pw_outages_by_feeder = pw_finalized_outages.select("outage_time", "feeder_id") pw_outages_by_feeder = pw_outages_by_feeder.withColumn("feeder_id", F.explode("feeder_id")) pw_outages_by_feeder = pw_outages_by_feeder.withColumn("size", F.lit(1)) pw_outages_by_feeder = pw_outages_by_feeder.groupBy( "outage_time", "feeder_id").agg(F.sum("size").alias("cluster_size")) pw_outages_by_feeder = pw_outages_by_feeder.select( "outage_time", "cluster_size",
array_subset = shows.select("name", "genres")

array_subset = array_subset.select(
    "name",
    array_subset.genres[0].alias("dot_and_index"),
    F.col("genres")[0].alias("col_and_index"),
    array_subset.genres.getItem(0).alias("dot_and_method"),
    F.col("genres").getItem(0).alias("col_and_method"),
)

# array_subset.show()

array_subset_repeated = array_subset.select(
    "name",
    F.lit("Comedy").alias("one"),
    F.lit("Horror").alias("two"),
    F.lit("Drama").alias("three"),
    F.col("dot_and_index"),
).select(
    "name",
    F.array("one", "two", "three").alias("Some_Genres"),
    F.array_repeat("dot_and_index", 5).alias("Repeated_Genres"),
)

array_subset_repeated.show(1, False)

array_subset_repeated.select(
    "name", F.size("Some_Genres"), F.size("Repeated_Genres")
).show()

array_subset_repeated.select(
    "name",
    F.array_distinct("Some_Genres"),
    F.array_distinct("Repeated_Genres")
).show(1, False)
def generate_panelapp_evidence(self, input_file: str, output_file: str, cache_dir: str) -> None:
    logging.info('Filter and extract the necessary columns.')
    panelapp_df = self.spark.read.csv(input_file, sep=r'\t', header=True)
    # Panel version can be either a single number (e.g. 1), or two numbers separated by a dot (e.g. 3.14). We cast
    # either representation to float to ensure correct filtering below. (Note that conversion to float would not
    # work in the general case, because 3.4 > 3.14, but we only need to compare relative to 1.0.)
    panelapp_df = panelapp_df.withColumn(
        'Panel Version',
        panelapp_df['Panel Version'].cast('float').alias('Panel Version'))
    panelapp_df = (
        panelapp_df.filter(
            ((col('List') == 'green') | (col('List') == 'amber')) &
            (col('Panel Version') >= 1.0) &
            (col('Panel Status') == 'PUBLIC'))
        .select('Symbol', 'Panel Id', 'Panel Name', 'List', 'Mode of inheritance', 'Phenotypes')
        # The full original records are not redundant; however, uniqueness on a subset of fields is not guaranteed.
        .distinct())

    logging.info('Fix typos and formatting errors which would interfere with phenotype splitting.')
    panelapp_df = panelapp_df.withColumn('cleanedUpPhenotypes', col('Phenotypes'))
    for regexp, replacement in self.PHENOTYPE_BEFORE_SPLIT_RE.items():
        panelapp_df = panelapp_df.withColumn(
            'cleanedUpPhenotypes', regexp_replace(col('cleanedUpPhenotypes'), regexp, replacement))

    logging.info('Split and explode the phenotypes.')
    panelapp_df = (panelapp_df.withColumn(
        'cohortPhenotypes', array_distinct(split(col('cleanedUpPhenotypes'), ';')))
        .withColumn('phenotype', explode(col('cohortPhenotypes'))))

    logging.info('Remove specific patterns and phrases which will interfere with ontology extraction and mapping.')
    panelapp_df = panelapp_df.withColumn('diseaseFromSource', col('phenotype'))
    for regexp in self.PHENOTYPE_AFTER_SPLIT_RE:
        panelapp_df = panelapp_df.withColumn(
            'diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({regexp})', ''))

    logging.info('Extract ontology information, clean up and filter the split phenotypes.')
    panelapp_df = (
        panelapp_df
        # Extract Orphanet/MONDO/HP ontology identifiers and remove them from the phenotype string.
        .withColumn('ontology_namespace', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 1))
        .withColumn('ontology_namespace', regexp_replace(col('ontology_namespace'), 'OrphaNet: ORPHA', 'ORPHA'))
        .withColumn('ontology_id', regexp_extract(col('diseaseFromSource'), self.OTHER_RE, 2))
        .withColumn(
            'ontology',
            when(
                (col('ontology_namespace') != '') & (col('ontology_id') != ''),
                concat(col('ontology_namespace'), lit(':'), col('ontology_id'))
            )
        )
        .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OTHER_RE})', ''))
        # Extract OMIM identifiers and remove them from the phenotype string.
        .withColumn('omim_id', regexp_extract(col('diseaseFromSource'), self.OMIM_RE, 2))
        .withColumn('omim', when(col('omim_id') != '', concat(lit('OMIM:'), col('omim_id'))))
        .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), f'({self.OMIM_RE})', ''))
        # Choose one of the ontology identifiers, keeping OMIM as a priority.
        .withColumn('diseaseFromSourceId', when(col('omim').isNotNull(), col('omim')).otherwise(col('ontology')))
        .drop('ontology_namespace', 'ontology_id', 'ontology', 'omim_id', 'omim')
        # Clean up the final split phenotypes.
        .withColumn('diseaseFromSource', regexp_replace(col('diseaseFromSource'), r'\(\)', ''))
        .withColumn('diseaseFromSource', trim(col('diseaseFromSource')))
        .withColumn('diseaseFromSource', when(col('diseaseFromSource') != '', col('diseaseFromSource')))
        # Remove low quality records, where the name of the phenotype string starts with a question mark.
        .filter(
            ~(
                (col('diseaseFromSource').isNotNull()) & (col('diseaseFromSource').startswith('?'))
            )
        )
        # Remove duplication caused by cases where multiple phenotypes within the same record fail to generate any
        # phenotype string or ontology identifier.
        .distinct()
        # For records where we were unable to determine either a phenotype string or an ontology identifier,
        # substitute the panel name instead.
        .withColumn(
            'diseaseFromSource',
            when(
                (col('diseaseFromSource').isNull()) & (col('diseaseFromSourceId').isNull()),
                col('Panel Name')
            )
            .otherwise(col('diseaseFromSource'))
        )
        .persist()
    )

    logging.info('Fetch and join literature references.')
    all_panel_ids = panelapp_df.select('Panel Id').toPandas()['Panel Id'].unique()
    literature_references = self.fetch_literature_references(all_panel_ids)
    panelapp_df = panelapp_df.join(literature_references, on=['Panel Id', 'Symbol'], how='left')

    if self.debug_output_phenotypes_filename:
        logging.info('Output tables for debugging purposes, if requested.')
        (panelapp_df.select(
            'Phenotypes',           # Original, unaltered string with all phenotypes.
            'cleanedUpPhenotypes',  # String with phenotypes after pre-split cleanup.
            'phenotype',            # Individual phenotype after splitting.
            'diseaseFromSource',    # Final cleaned up disease name.
            'diseaseFromSourceId',  # Final cleaned up disease ID.
        ).distinct().toPandas().to_csv(self.debug_output_phenotypes_filename, sep='\t', index=False))

    logging.info('Drop unnecessary fields and populate the final evidence string structure.')
    evidence_df = (
        panelapp_df.drop('Phenotypes', 'cleanedUpPhenotypes', 'phenotype')
        # allelicRequirements requires a list, but we always only have one value from PanelApp.
        .withColumn(
            'allelicRequirements',
            when(col('Mode of inheritance').isNotNull(), array(col('Mode of inheritance'))))
        .drop('Mode of inheritance')
        .withColumnRenamed('List', 'confidence')
        .withColumn('datasourceId', lit('genomics_england'))
        .withColumn('datatypeId', lit('genetic_literature'))
        # diseaseFromSourceId populated above
        # literature populated above
        .withColumnRenamed('Panel Id', 'studyId')
        .withColumnRenamed('Panel Name', 'studyOverview')
        .withColumnRenamed('Symbol', 'targetFromSourceId')
        # Some residual duplication is caused by slightly different representations from `cohortPhenotypes` being
        # cleaned up to the same representation in `diseaseFromSource`, for example "Pontocerebellar hypoplasia type
        # 2D (613811)" and "Pontocerebellar hypoplasia type 2D, 613811".
        .distinct())

    evidence_df = add_efo_mapping(evidence_strings=evidence_df, spark_instance=self.spark,
                                  ontoma_cache_dir=cache_dir)
    logging.info('Disease mappings have been added.')

    write_evidence_strings(evidence_df, output_file)
    logging.info(
        f'{evidence_df.count()} evidence strings have been saved to {output_file}')