def check_zero_genes(data=None):
    """
    Reports on genes (rows) whose signals equal 0 in every sample (column).

    Prints the shape of the data before and after excluding all-zero rows,
    followed by a preview (first 10 rows and 10 columns) of the all-zero rows.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for genes with values of 0 for signals across all samples.")
    print("These genes are undetectable.")
    print("shape of original data frame: " + str(data.shape))
    # Rows with at least one value different from zero.
    rows_detectable = data.ne(0).any(axis="columns")
    data_detectable = data.loc[rows_detectable, :]
    print(
        "shape of data frame without zero genes: " +
        str(data_detectable.shape)
    )
    print("Now printing a summary of data for genes with all zero signals.")
    # Rows in which every value equals zero.
    rows_silent = data.eq(0).all(axis="columns")
    preview = data.loc[rows_silent, :].iloc[0:10, 0:10]
    print(preview)
def filter_samples_by_signal_threshold(
    data=None,
    threshold=None,
):
    """
    Keeps only the columns (samples) in which at least one value is greater
    than or equal to the threshold.

    Data format should have samples across columns and genes across rows.

    arguments:
        data (object): Pandas data frame of genes' signals across samples
        threshold (float): minimal signal that a sample must reach in at least
            one gene

    raises:

    returns:
        (object): Pandas data frame of genes' signals across samples

    """

    utility.print_terminal_partition(level=2)
    description = (
        "Filter samples to keep only those with signals beyond threshold in "
        "at least one gene. \n"
        "Data format should have samples across columns and genes across rows."
    )
    print(description)
    print("signal threshold: " + str(threshold))
    print("data dimensions before filter: " + str(data.shape))
    # Column-wise flag: does any row reach the threshold?
    samples_pass = (data >= threshold).any(axis="index")
    data_pass = data.loc[:, samples_pass]
    print("data dimensions after filter: " + str(data_pass.shape))
    utility.print_terminal_partition(level=3)
    return data_pass
def check_redundancy_genes(data=None):
    """
    Reports whether any row of the data repeats an earlier row.

    The index is moved into ordinary columns first, so that it takes part in
    the comparison of rows.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for redundant genes in genes' signals.")
    print("Consider names of genes.")
    # Move the index into columns so duplicate detection considers it.
    table = data.reset_index()
    print(table.iloc[0:10, 0:10])
    flags = table.duplicated(subset=None, keep="first").to_list()
    answer = "Yes" if any(flags) else "No"
    print("Redundancy in genes: " + answer)
def execute_procedure(path_dock=None, ):
    """
    Runs the module's main procedure: initialize directories, read source
    tables, organize regression tables, and write the product.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Outline of the procedure (numbering as kept by the original author).
    # 3. read in Coombes' metabolite regression tables (all)
    # 4. iterate on Coombes' metabolite regression tables
    # 5. match metabolite names to Shin 2014 metabolite identifiers
    # 6. merge Coombes' metabolite regression information with metabolite
    #    heritabilities
    # 7. filter metabolites by whether identifiable and SNP-heritability > 0.05
    # 8. calculate Benjamini-Hochberg False-Discovery Rates

    # Announce the run and its version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Brief pause before the procedure starts.
    time.sleep(5.0)

    # Prepare directories and read source tables.
    paths = initialize_directories(restore=False, path_dock=path_dock)
    source = read_source(path_dock=path_dock, report=True)

    # Organize tables for regressions between polygenic estimate metabolites
    # and phenotypes.
    tables = read_organize_polygenic_metabolite_phenotype_regression_tables(
        threshold_metabolite_heritability=0.05,
        path_source_directory=paths["coombes_polygene"],
        table_reference_shin_2014=source["table_reference_shin_2014"],
        table_metabolite_heritabilities=(
            source["table_metabolite_heritabilities"]
        ),
        report=True,
    )

    # Write product information to file.
    write_product(paths=paths, information={"tables": tables})
def standardize_gene_signal(data_gene_signal=None):
    """
    Transforms genes' signals to standard (z-score) space and prints row-wise
    means and standard deviations before and after the transformation.

    Data has genes across rows and samples across columns.

             sample_1 sample_2 sample_3 sample_4 sample_5
    gene_1   ...      ...      ...      ...      ...
    gene_2   ...      ...      ...      ...      ...
    gene_3   ...      ...      ...      ...      ...
    gene_4   ...      ...      ...      ...      ...
    gene_5   ...      ...      ...      ...      ...

    arguments:
        data_gene_signal (object): Pandas data frame of genes' signals across
            samples

    raises:

    returns:
        (object): Pandas data frame of genes' signals across samples

    """

    def print_row_summary(title, frame):
        # Print the first 10 row-wise means and standard deviations.
        utility.print_terminal_partition(level=3)
        print(title)
        means = frame.apply(lambda row: row.mean(), axis="columns")
        print("Mean")
        print(means.iloc[0:10])
        deviations = frame.apply(lambda row: row.std(), axis="columns")
        print("Standard deviation")
        print(deviations.iloc[0:10])

    # Transform signals to standard score space.
    data_standard = calculate_standard_score_gene_signal_by_gene(
        data_gene_signal=data_gene_signal
    )
    print(data_standard.iloc[0:10, 0:10])

    # Compare summary statistics before and after transformation.
    print_row_summary(
        "Summary statistics for gene signals before standardization.",
        data_gene_signal,
    )
    print_row_summary(
        "Summary statistics for gene signals after standardization.",
        data_standard,
    )
    return data_standard
def filter_genes_by_bimodality_thresholds(
    measures=None,
    thresholds=None,
    data_genes_distributions=None,
    direction=None,
):
    """
    Filters genes by a threshold on each measure of bimodality and collects
    the identifiers of genes that pass for each measure.

    arguments:
        measures (list<str>): measures of bimodality
        thresholds (dict<dict<float>>): values of thresholds, accessed first
            by direction and then by measure of bimodality
        data_genes_distributions (object): Pandas data frame of information
            about genes and their measures of bimodality
        direction (str): direction of distribution from which to select,
            lesser or greater

    raises:

    returns:
        (dict<list<str>>): identifiers of genes that pass filtration by
            thresholds on each measure of bimodality

    """

    utility.print_terminal_partition(level=1)
    print(
        "count of genes filtered by probabilities of each bimodality "
        "measurement"
    )
    genes_by_measure = {}
    for measure in measures:
        # Minimal copy of genes' data for this measure.
        data_minimal = copy_split_minimal_gene_data(
            measure=measure,
            data_genes_distributions=data_genes_distributions,
        )
        # Apply this measure's threshold in the requested direction.
        data_pass = filter_genes_by_bimodality_threshold(
            data=data_minimal,
            measure=measure,
            threshold=thresholds[direction][measure],
            direction=direction,
        )
        identifiers = data_pass["identifier"].tolist()
        utility.print_terminal_partition(level=3)
        print(measure + ": " + str(len(identifiers)))
        genes_by_measure[measure] = identifiers
    return genes_by_measure
def extract_genes_modality_sets(
    direction=None,
    measures=None,
    selection=None,
):
    """
    Gathers each measure's selected genes for one direction and selects genes
    by counts of 1, 2, and 3 sets via utility.select_elements_by_sets().

    arguments:
        direction (str): direction of distribution from which to select,
            lesser or greater
        measures (list<str>): measures of modality
        selection (dict): selections of genes, accessed by measure, then
            direction, then "genes"

    raises:

    returns:
        (dict<list<str>>): identifiers of genes under keys "measures_1",
            "measures_2", "measures_3", and the per-measure sets under
            "sets_genes_measures"

    """

    # One set of genes per measure for the requested direction.
    sets_genes = {
        measure: selection[measure][direction]["genes"]
        for measure in measures
    }
    # Select genes for each count of sets (1, 2, then 3).
    genes_by_count = {
        count: utility.select_elements_by_sets(
            names=measures,
            sets=sets_genes,
            count=count,
        )
        for count in (1, 2, 3)
    }
    # Summarize information.
    utility.print_terminal_partition(level=2)
    print("Selection of genes by: " + direction)
    print("... any 1 sets: " + str(len(genes_by_count[1])))
    print("... any 2 sets: " + str(len(genes_by_count[2])))
    print("... any 3 sets: " + str(len(genes_by_count[3])))
    # Collect and return information.
    return {
        "measures_1": genes_by_count[1],
        "measures_2": genes_by_count[2],
        "measures_3": genes_by_count[3],
        "sets_genes_measures": sets_genes,
    }
def collect_report_ontology_parentage_orphan_genes(
    cluster_reports=None,
    genes_query=None,
    report=None,
):
    """
    Collects the genes named in the reports of all clusters (parentage genes)
    and determines which query genes appear in none of them (orphan genes).

    The counts are only printed; the function does not return the genes.

    arguments:
        cluster_reports (dict): reports for each cluster; each entry holds a
            Pandas data frame under "report" with a "Genes" field of
            identifiers separated by ", "
        genes_query (list<str>): identifiers of genes in original enrichment
            query
        report (bool): whether to print reports

    raises:

    returns:

    """

    # Collect genes from every set in every cluster.
    genes_collection = list()
    for key in cluster_reports.keys():
        data_report = cluster_reports[key]["report"]
        # NOTE: this names the index "set" in place, so the caller's data
        # frame is modified. Behavior is kept from the original.
        data_report.rename_axis(
            index="set",
            axis="index",
            copy=False,
            inplace=True,
        )
        records = utility.convert_dataframe_to_records(data=data_report)
        for record in records:
            genes_collection.extend(record["Genes"].split(", "))
    # Collect unique genes from parent sets.
    genes_parentage = utility.collect_unique_elements(
        elements_original=genes_collection,
    )
    # Use a set for membership tests so the orphan filter stays linear in the
    # number of query genes.
    lookup_parentage = set(genes_parentage)
    genes_orphan = [
        gene for gene in genes_query if gene not in lookup_parentage
    ]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique parentage and orphan genes")
        print("parentage genes: " + str(len(genes_parentage)))
        print("orphan genes: " + str(len(genes_orphan)))
        utility.print_terminal_partition(level=2)
def organize_cohort_gene_components(
    cohort=None,
    paths=None,
    report=None,
):
    """
    Reads a cohort's source data and organizes genes' signals across persons.

    The calculation of principal components and the write of products are
    currently disabled by a constant-false guard.

    arguments:
        cohort (str): cohort of persons--selection, respiration, or
            ventilation
        paths (dict<str>): collection of paths to directories for procedure's
            files
        report (bool): whether to print reports

    raises:

    returns:

    """

    if report:
        utility.print_terminal_partition(level=2)
        print("cohort: " + cohort)
    # Read source information from file.
    source = read_source_cohort_gene_components(
        cohort=cohort,
        dock=paths["dock"],
    )
    # Organize data for principal component analysis.
    data_organized = organize_data_cohort_multimodal_genes_signals(
        data_signals_genes_persons=source["data_signals_genes_persons"],
        report=report,
    )
    # Disabled: components calculation and product write.
    if False:
        pail_components = (
            calculate_multimodal_genes_signals_persons_components(
                genes=source["genes_candidacy"]["multimodal"],
                data_signals_genes_persons=data_organized,
                report=report,
            )
        )
        information = {
            "data_persons_genes_components": pail_components[
                "data_observations_components"
            ],
            "data_persons_genes_variances": pail_components[
                "data_components_variances"
            ],
        }
        write_product_cohort_gene_components(
            cohort=cohort,
            information=information,
            paths=paths,
        )
def organize_differential_expression_data_sets(
    comparisons=None,
    data_samples_tissues_patients=None,
    data_gene_count=None,
):
    """
    Organizes one data set per major tissue category for differential
    expression comparisons of that tissue's minor categories.

    arguments:
        comparisons (dict<list<str>>): Minor categories to compare for each
            major category of tissue
        data_samples_tissues_patients (object): Pandas data frame of patients
            and tissues for all samples
        data_gene_count (object): Pandas data frame of genes' counts for all
            samples

    raises:

    returns:
        (list<dict>): Collections of data sets for differential expression
            analyses

    """

    utility.print_terminal_partition(level=2)
    print(
        "Organization of data sets for differential gene expression "
        "comparison of minor categories of tissues."
    )
    # Build one data set per major tissue, in the order of the comparisons.
    data_sets = [
        organize_differential_expression_data_set(
            tissue_major=tissue_major,
            tissues_minor=tissues_minor,
            data_samples_tissues_patients=data_samples_tissues_patients,
            data_gene_count=data_gene_count,
        )
        for tissue_major, tissues_minor in comparisons.items()
    ]
    utility.print_terminal_partition(level=2)
    print("Data sets by major tissues:")
    for data_set in data_sets:
        print(data_set["tissue"])
    return data_sets
def determine_gene_study_comparison_value(
    gene_identifier=None,
    comparison=None,
    data_study=None,
    warn=None,
):
    """
    Determines a gene's value of fold change for a comparison in a study.

    Rows of the study that match the gene are reduced to unique rows. The
    missing values of the comparison are discarded. Multiple remaining values
    are averaged, a single value is returned as is, and no value gives a
    missing value (NaN). The study's table is not modified.

    The function that calls this function already verifies that the study
    includes the gene and the comparison.

    arguments:
        gene_identifier (str): unique identifier of a gene
        comparison (str): name of a comparison
        data_study (object): Pandas data frame of information about comparisons
            across genes in a study
        warn (bool): whether to print warnings

    raises:

    returns:
        (float): gene's value of fold change for a comparison in a study

    """

    # Boolean selection and drop_duplicates() both return new data frames, so
    # a deep copy of the whole study is unnecessary.
    matches_gene = (data_study["identifier"] == gene_identifier)
    data_study_gene = data_study.loc[matches_gene, :].drop_duplicates(
        subset=None,
        keep="first",
    )
    # Keep valid, non-missing values. dropna() also discards None, on which
    # math.isnan() would raise a TypeError.
    values_valid = data_study_gene[comparison].dropna().to_list()
    if len(values_valid) > 1:
        value = statistics.mean(values_valid)
        if warn:
            utility.print_terminal_partition(level=3)
            print(
                "warning: gene has multiple fold change values for a single " +
                "study and comparison. " + gene_identifier
            )
    elif len(values_valid) == 1:
        value = values_valid[0]
    else:
        value = float("nan")
    # Return information.
    return value
def read_source(
    path_dock=None,
    report=None,
):
    """
    Reads two tab-delimited source tables from the dock directory: the
    metabolite reference table and the table of metabolite heritabilities.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files
        report (bool): whether to print reports

    raises:

    returns:
        (dict<object>): Pandas data frames under keys
            "table_reference_shin_2014" and "table_metabolite_heritabilities"

    """

    def read_table(*path_parts):
        # Read a tab-delimited table with a header row, relative to the dock.
        return pandas.read_csv(
            os.path.join(path_dock, *path_parts),
            sep="\t",
            header=0,
        )

    reference = read_table(
        "metabolite_reference", "24816252_shin_2014",
        "table_metabolite_reference.tsv",
    )
    heritabilities = read_table(
        "heritability_correlation_2021-04-12",
        "table_shin_2014_heritabilities.tsv",
    )
    # Report. Only the reference table is printed.
    if report:
        utility.print_terminal_partition(level=2)
        print("report from read_source()")
        print(reference)
        utility.print_terminal_partition(level=2)
    # Compile and return information.
    return {
        "table_reference_shin_2014": reference,
        "table_metabolite_heritabilities": heritabilities,
    }
def execute_procedure(dock=None, count=None):
    """
    Runs the module's main procedure: remove previous shuffles, read source
    information, create shuffle indices, and write them to file.

    arguments:
        dock (str): path to root or dock directory for source and product
            directories and files
        count (int): count of shuffles to create and store

    raises:

    returns:

    """

    # Remove previous files to avoid version or batch confusion.
    utility.remove_directory(path=os.path.join(dock, "shuffle"))
    # Read source information from file.
    source = read_source(dock=dock)
    dimension_zero = source["tissues_selection"]
    dimension_one = source["persons_selection"]
    # Report.
    utility.print_terminal_partition(level=3)
    message_dimensions = (
        "Creating " + str(count) + " shuffles for matrices of dimension " +
        "zero: " + str(dimension_zero) + " by dimension one: " +
        str(dimension_one) + ". "
    )
    message_direction = (
        "Notice that shuffles occur across dimension one (tissues for each "
        "person)."
    )
    print(message_dimensions + message_direction)
    print(
        "Hence, values will stay matched to their respective tissues, but "
        "they will be shuffled with respect to persons."
    )
    utility.print_terminal_partition(level=3)
    # Create shuffle indices.
    shuffles = create_shuffle_indices(
        count=count,
        dimension_zero=dimension_zero,
        dimension_one=dimension_one,
    )
    # Write product information to file.
    write_product(dock=dock, information={"shuffles": shuffles})
def organize_genes_heritability_data(
    data_genes_heritability=None,
    report=None,
):
    """
    Selects a fixed set of columns, in a fixed order, from a copy of the
    heritability data and sorts rows by ascending "probability".

    arguments:
        data_genes_heritability (object): Pandas data frame of genes'
            heritabilities
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of genes' heritabilities

    """

    # Columns to keep, in order of presentation.
    columns_keep = [
        "name",
        "proportion",
        "count",
        "probability",
        "probability_log",
        "discovery",
        "discovery_log",
        "significance",
        "error",
        "confidence_95_interval",
        "confidence_95_low",
        "confidence_95_high",
        "residual",
        "genotype",
        "phenotype",
    ]
    # Work on a copy so that the source data stay unchanged.
    data_copy = data_genes_heritability.copy(deep=True)
    data_sort = data_copy[columns_keep].sort_values(
        by=["probability"],
        axis="index",
        ascending=True,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("data after organization of columns")
        print(data_sort)
    return data_sort
def organize_cohort_components_regressions(
    cohort=None,
    paths=None,
    report=None,
):
    """
    Reads a cohort's source data, regresses across the cohort's components
    with the variables of the cohort's hypothesis model, and writes the
    regression information to file.

    arguments:
        cohort (str): cohort of persons--selection, respiration, or
            ventilation
        paths (dict<str>): collection of paths to directories for procedure's
            files
        report (bool): whether to print reports

    raises:

    returns:

    """

    if report:
        utility.print_terminal_partition(level=2)
        print("cohort: " + cohort)
    # Read source information from file.
    source = read_source_cohort_components_regressions(
        cohort=cohort,
        dock=paths["dock"],
    )
    # Variables for the cohort's hypothesis model.
    variables_model = selection.define_variables()[cohort]["model_hypothesis"]
    # Organize data and regress across components.
    # NOTE(review): the regression always reports (report=True) regardless of
    # this function's "report" argument -- confirm that this is intentional.
    pail_regression = organize_data_regress_cases_report(
        variables_regression=variables_model,
        data_persons_properties=source["data_persons_properties"],
        data_persons_genes_components=source["data_persons_genes_components"],
        data_persons_genes_variances=source["data_persons_genes_variances"],
        threshold_discovery=0.05,
        discovery=False,
        report=True,
    )
    # Write information to file.
    write_product_cohort_components_regressions(
        cohort=cohort,
        information=pail_regression,
        paths=paths,
    )
def validate_report_selection_thresholds(
    measures=None,
    selection=None,
    genes_scores=None,
):
    """
    Derives, for each measure and direction, a validation value from the
    scores of the selected genes and prints it next to the stored threshold.

    For direction "least" the validation is the maximal score among selected
    genes; for direction "greatest" it is the minimal score. The value is
    stored in the selection under "threshold_validation". The selection is
    modified in place and also returned.

    arguments:
        measures (list<str>): measures of modality
        selection (dict): selections of genes
        genes_scores (dict): information about genes' measures of modality

    raises:

    returns:
        (dict): information about selection of genes

    """

    utility.print_terminal_partition(level=2)
    print(
        "Validation of thresholds for selection of unimodal and multimodal "
        "genes."
    )
    # Extreme score that bounds the selection in each direction.
    extremes = (("least", max), ("greatest", min))
    for measure in measures:
        for direction, find_extreme in extremes:
            entry = selection[measure][direction]
            scores = [genes_scores[gene][measure] for gene in entry["genes"]]
            validation = find_extreme(scores)
            entry["threshold_validation"] = validation
            threshold = entry["threshold"]
            utility.print_terminal_partition(level=3)
            print("measure: " + measure)
            print("direction: " + direction)
            print("threshold: " + str(round(threshold, 5)))
            print("validation: " + str(round(validation, 5)))
    # Return information.
    return selection
def select_samples(tissues=None, persons=None, data_gene_signal=None):
    """
    Selects samples of interest for further analyses.

    The data frame must carry index levels "person", "tissue", and "sample".
    Rows are selected first by persons and then by tissues, and the result is
    indexed again by "person", "tissue", and "sample".

    The selection works on new data frames, so the data frame that the caller
    passes in keeps its original index.

    arguments:
        tissues (list<str>): Tissues of interest.
        persons (list<str>): persons of interest.
        data_gene_signal (object): Pandas data frame of genes' signals for all
            samples, tissues, and persons.

    raises:

    returns:
        (object): Pandas data frame of genes' signals for samples from the
            persons and tissues of interest.

    """

    utility.print_terminal_partition(level=2)
    print("Selection of samples from persons and tissues of interest.")
    print("count of samples, original: " + str(data_gene_signal.shape[0]))
    # Select samples from persons of interest.
    # reset_index() and set_index() without "inplace" return new data frames,
    # which leaves the caller's data frame untouched.
    data_persons = data_gene_signal.reset_index(
        level=["person", "tissue", "sample"]
    ).set_index(["person"], append=False, drop=True)
    data_persons = data_persons.loc[persons, :]
    print(
        "count of samples from persons of interest: " +
        str(data_persons.shape[0])
    )
    # Select samples from tissues of interest.
    data_tissues = data_persons.reset_index(
        level=["person"]
    ).set_index(["tissue"], append=False, drop=True)
    data_tissues = data_tissues.loc[tissues, :]
    print(
        "count of samples from tissues of interest: " +
        str(data_tissues.shape[0])
    )
    # Restore the original index levels.
    data_selection = data_tissues.reset_index(
        level=["tissue"]
    ).set_index(["person", "tissue", "sample"], append=False, drop=True)
    return data_selection
def find_intersection_heritability_genes(
    genes_selection=None,
    genes_distribution=None,
    genes_heritability_complete=None,
    path_genes=None,
    report=None,
):
    """
    Determines the genes from selection that also have a successful
    heritability analysis.

    arguments:
        genes_selection (list<str>): identifiers of genes from selection
            procedure
        genes_distribution (list<str>): identifiers of genes with valid
            pan-tissue signal distributions (not used by this function)
        genes_heritability_complete (list<str>): identifiers of genes for which
            the heritability procedure completed
        path_genes (str): path to heritability genes directory
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): identifiers of genes of interest from selection that also
            have valid heritability measurements

    """

    # Genes for which the heritability analysis was successful.
    genes_valid = collect_successful_genes(
        genes=genes_heritability_complete,
        path_genes=path_genes,
    )
    # Genes common to the selection and the valid heritabilities.
    genes_common = utility.filter_common_elements(
        list_one=genes_selection,
        list_two=genes_valid,
    )
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "genes of interest with valid heritabilities: " +
            str(len(genes_common))
        )
        utility.print_terminal_partition(level=2)
    return genes_common
def select_samples_genes(
    persons=None,
    tissues=None,
    data_gene_annotation=None,
    data_gene_signal=None
):
    """
    Selects samples and genes of interest in three successive steps: samples
    by persons and tissues, genes by detection, and genes that encode
    proteins.

    arguments:
        persons (list<str>): persons of interest.
        tissues (list<str>): Tissues of interest.
        data_gene_annotation (object): Pandas data frame of genes' annotations.
        data_gene_signal (object): Pandas data frame of genes' signals for all
            samples, tissues, and persons.

    raises:

    returns:
        (object): Pandas data frame of genes' signals for all samples, tissues,
            and persons.

    """

    utility.print_terminal_partition(level=1)
    print("Selection of samples and genes of interest.")
    # Step 1: samples from persons and tissues of interest.
    data_samples = select_samples(
        persons=persons,
        tissues=tissues,
        data_gene_signal=data_gene_signal,
    )
    # Step 2: genes with detectable signal in those samples.
    data_detection = select_genes_detection(data_gene_signal=data_samples)
    # Step 3: genes that encode proteins.
    data_protein = select_genes_protein(
        data_gene_annotation=data_gene_annotation,
        data_gene_signal=data_detection,
    )
    return data_protein
def check_missing_values(data=None):
    """
    Prints the shape of the data as is, after dropping rows with missing
    values (axis 0), and after dropping columns with missing values (axis 1).

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for missing values in genes' signals.")
    print("shape of original data frame: " + str(data.shape))
    shape_rows_complete = data.dropna(axis=0).shape
    print("shape without missing axis 0: " + str(shape_rows_complete))
    shape_columns_complete = data.dropna(axis=1).shape
    print("shape without missing axis 1: " + str(shape_columns_complete))
def collect_studies_unique_gene_identifiers(
    bin_studies=None,
    report=None,
):
    """
    Collects unique identifiers of genes from all studies, keeping only
    identifiers whose text contains "ENSG".

    arguments:
        bin_studies (dict): collection of information about each study; each
            entry holds a Pandas data frame under "data" with a column
            "identifier"
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): unique identifiers of genes from all studies

    """

    def keep_valid(identifiers):
        # Keep identifiers whose text contains "ENSG".
        return [item for item in identifiers if "ENSG" in str(item)]

    if report:
        utility.print_terminal_partition(level=2)
        print("unique genes from each study")
    # Pool unique identifiers across studies.
    pool = []
    for study, entry in bin_studies.items():
        genes_study = utility.collect_unique_elements(
            elements_original=entry["data"]["identifier"].to_list()
        )
        genes_study_valid = keep_valid(genes_study)
        pool.extend(genes_study)
        if report:
            print("study " + study + " : " + str(len(genes_study_valid)))
    # Filter the pooled identifiers and reduce them to unique values.
    return utility.collect_unique_elements(
        elements_original=keep_valid(pool)
    )
def organize_multimodal_genes_signals_persons_components(
    genes=None,
    data_signals_genes_persons=None,
    report=None,
):
    """
    Selects the columns of genes of interest and passes them to
    utility.calculate_principal_components(), with genes' pan-tissue signals
    as features across persons as instances.

    arguments:
        genes (list<str>): identifiers of genes
        data_signals_genes_persons (object): Pandas data frame of genes'
            pan-tissue signals across persons
        report (bool): whether to print reports

    raises:

    returns:
        (object): result of utility.calculate_principal_components()

    """

    # Work on a copy and keep only the columns for genes of interest.
    data_copy = data_signals_genes_persons.copy(deep=True)
    columns_genes = data_copy.columns.isin(genes)
    data_genes = data_copy.loc[:, columns_genes]
    if report:
        utility.print_terminal_partition(level=2)
        print("Selection of genes with pan-tissue signals across persons.")
        utility.print_terminal_partition(level=3)
        print(data_genes)
    # Count of components is the lesser of the count of genes and the count
    # of rows.
    # NOTE(review): the bound uses the count of requested genes, not the count
    # of columns actually selected -- confirm that all genes are in the data.
    count_genes = int(len(genes))
    count_rows = int(data_genes.shape[0])
    return utility.calculate_principal_components(
        data=data_genes,
        components=min(count_genes, count_rows),
        report=report,
    )
def select_heritable_genes(
    data_genes_heritability=None,
    threshold_proportion=None,
    threshold_probability=None,
    report=None,
):
    """
    Selects genes whose "proportion" is at least the proportion threshold and
    whose "probability" is at most the probability threshold.

    Identifiers of genes come from the index of the data frame. The source
    data frame is not modified.

    arguments:
        data_genes_heritability (object): Pandas data frame of genes'
            heritabilities
        threshold_proportion (float): threshold by proportion of phenotypic
            variance attributable to genotype
        threshold_probability (float): threshold by probability of heritability
            estimate
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): identifiers of heritable genes

    """

    # Boolean selection returns new data frames, so no copy is necessary.
    data_proportion = data_genes_heritability.loc[
        data_genes_heritability["proportion"] >= threshold_proportion
    ]
    data_probability = data_proportion.loc[
        data_proportion["probability"] <= threshold_probability
    ]
    # Extract identifiers of genes.
    genes = data_probability.index.to_list()
    # Report.
    if report:
        count_total = data_genes_heritability.shape[0]
        # An empty table has no meaningful percentage; avoid division by zero.
        if count_total > 0:
            percentage = round((len(genes) / count_total) * 100, 2)
        else:
            percentage = 0.0
        utility.print_terminal_partition(level=2)
        print(
            "count of 'heritable' genes': " + str(len(genes)) +
            " (" + str(percentage) + " %)"
        )
    # Return information.
    return genes
def filter_heritabilities_confidence(
    data_genes_heritability=None,
    threshold=None,
    report=None,
):
    """
    Keeps the genes whose "confidence_95_interval" passes the threshold, as
    determined by determine_confidence_threshold_pass().

    The result is derived from a copy of the source data and includes the
    helper column "threshold".

    arguments:
        data_genes_heritability (object): Pandas data frame of genes'
            heritabilities
        threshold (float): maximal confidence interval
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of genes' heritabilities

    """

    def passes_threshold(value):
        # Evaluate a single confidence interval against the threshold.
        return determine_confidence_threshold_pass(
            value=value,
            threshold=threshold,
        )

    # Work on a copy because a column is added.
    data_copy = data_genes_heritability.copy(deep=True)
    data_copy["threshold"] = data_copy["confidence_95_interval"].apply(
        passes_threshold
    )
    rows_pass = data_copy["threshold"].eq(True)
    data_pass = data_copy.loc[rows_pass, :]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("data after filter by confidence interval")
        print(data_pass)
        utility.print_terminal_partition(level=2)
        print("count of candidate genes': " + str(data_pass.shape[0]))
    # Return information.
    return data_pass
def translate_study_genes_identifiers(
    bin_studies=None,
    data_gene_annotation=None,
    translations_genes=None,
    report=None,
):
    """
    Adds a column "identifier" to each study that lacks one by translating the
    study's column "name" with assembly.translate_gene_name_to_identifier().

    The translation works on a deep copy of the studies.

    arguments:
        bin_studies (dict): collection of information about each study
        data_gene_annotation (object): Pandas data frame of genes' annotations
        translations_genes (dict<str>): pairwise custom translations of genes'
            names to Ensembl identifiers, see
            assembly.read_source_gene_name_identifier_translations()
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about each study with genes'
            identifiers

    """

    def translate_name(gene_name):
        # Translate a single gene's name to its identifier.
        return assembly.translate_gene_name_to_identifier(
            name=gene_name,
            data_gene_annotation=data_gene_annotation,
            translations_genes=translations_genes,
        )

    utility.print_terminal_partition(level=2)
    print("translating genes' names to Ensembl identifiers...")
    print("following genes' names do not match...")
    utility.print_terminal_partition(level=2)
    # Copy studies so that the source collection stays unchanged.
    studies = copy.deepcopy(bin_studies)
    for study in studies.keys():
        table = studies[study]["data"]
        # Skip studies that already include genes' identifiers.
        if "identifier" in table.columns.to_list():
            continue
        table["identifier"] = table["name"].apply(translate_name)
        studies[study]["data"] = table
        if report:
            print(table)
    utility.print_terminal_partition(level=2)
    print("end translation...")
    utility.print_terminal_partition(level=2)
    return studies
def check_zero_samples(data=None):
    """
    Reports on samples (columns) whose values equal 0 for every gene (row) by
    printing the shape of the data before and after excluding such columns.

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    print("Check for samples with values of 0 for all genes' signals.")
    print("shape of original data frame: " + str(data.shape))
    # Columns with at least one value different from zero.
    columns_detectable = data.ne(0).any(axis="index")
    data_detectable = data.loc[:, columns_detectable]
    print(
        "shape of data frame without zero samples: " +
        str(data_detectable.shape)
    )
def summarize_measures_thresholds(
    measures=None,
    scores=None,
    thresholds=None,
):
    """
    Prints, for each measure of bimodality, the mean and deviation of scores
    together with the lesser and greater thresholds.

    arguments:
        measures (list<str>): measures of bimodality
        scores (dict<dict>): information about genes' measures of bimodality,
            with "mean" and "deviation" for each measure
        thresholds (dict<dict<float>>): values of thresholds, accessed by
            "lesser" or "greater" and then by measure of bimodality

    raises:

    returns:

    """

    utility.print_terminal_partition(level=2)
    for name_measure in measures:
        utility.print_terminal_partition(level=3)
        print("measure: " + str(name_measure))
        summary = scores[name_measure]
        print("mean: " + str(summary["mean"]))
        print("deviation: " + str(summary["deviation"]))
        print("threshold lesser: " + str(thresholds["lesser"][name_measure]))
        print(
            "threshold greater: " + str(thresholds["greater"][name_measure])
        )
    utility.print_terminal_partition(level=2)
def summarize_genes_samples_signals(
    genes_samples_signals=None,
    gene="ENSG00000231925",
):
    """
    Summarizes information about genes' samples and signals.

    Prints the count of genes in the collection, the data for a single
    example gene, and that gene's counts of groups by "person" and by
    "tissue_major".

    arguments:
        genes_samples_signals (dict<object>): collection of Pandas data frames
            of signals across samples, one for each gene
        gene (str): identifier of the gene to use as the example; the default
            is the identifier that was previously fixed in the function

    raises:

    returns:

    """

    # Report.
    utility.print_terminal_partition(level=2)
    print("Count of data by genes: " + str(len(genes_samples_signals.keys())))
    print("Access data for a single gene.")
    utility.print_terminal_partition(level=2)
    data = genes_samples_signals[gene]
    print(data)
    utility.print_terminal_partition(level=2)
    print("Determine counts of persons and tissues.")
    print("Split gene's signals by person.")
    persons = len(data.groupby("person"))
    print("Count of groups by person: " + str(persons))
    print("Split gene's signals by major tissue category.")
    tissues = len(data.groupby("tissue_major"))
    print("Count of groups by tissue: " + str(tissues))
def drop_undetectable_genes(data=None):
    """
    Keeps only the genes (rows) that have a value other than 0 in at least
    one sample (column).

    arguments:
        data (object): Pandas data frame of genes' signals for all samples.

    raises:

    returns:
        (object): Pandas data frame of genes' signals for all samples.

    """

    utility.print_terminal_partition(level=2)
    print("Drop genes that are undetectable.")
    # Rows with at least one value different from zero.
    rows_detectable = data.ne(0).any(axis="columns")
    data_detectable = data.loc[rows_detectable, :]
    print("Data without undetectable genes.")
    print(data_detectable.iloc[0:10, 0:10])
    print("data dimensions: " + str(data_detectable.shape))
    return data_detectable
def calculate_report_gene_sample_principal_components(
    data=None,
    data_samples_factors=None,
    components=None,
):
    """
    Calculates the principal components for genes as features and samples as
    observations, prints reports, and associates samples' components to their
    factors.

    arguments:
        data (object): Pandas data frame of signals with features across rows
            and observations across columns
        data_samples_factors (object): Pandas data frame of factors for each
            sample
        components (int): count of principal components; None uses 10, the
            count that the function previously applied to every call

    raises:

    returns:
        (dict<object>): Pandas data frames under keys
            "data_component_variance", "data_sample_component", and
            "data_factor_component"

    """

    # Honor the requested count of components. The previous implementation
    # ignored this argument and always used 10.
    count_components = 10 if components is None else components

    # Normalize and standardize genes' signals for principal components.
    data_normal_standard = normalize_standardize_gene_signal(
        data_gene_signal=data
    )
    print("Data after normalization and standardization.")
    print(data_normal_standard)

    report = calculate_principal_components(
        data=data_normal_standard,
        components=count_components
    )
    utility.print_terminal_partition(level=2)
    print("Report from principal component analysis...")
    print("Explained variance by each principal component...")
    print(report["data_component_variance"])
    utility.print_terminal_partition(level=3)
    print(report["data_sample_component"])

    # Associate samples' components to the samples' factors.
    data_factor_component = assembly.associate_samples_persons_tissues(
        data_samples_tissues_persons=data_samples_factors,
        data_gene_sample=report["data_sample_component"],
    )
    utility.print_terminal_partition(level=3)
    print(data_factor_component)

    # Compile and return information.
    return {
        "data_component_variance": report["data_component_variance"],
        "data_sample_component": report["data_sample_component"],
        "data_factor_component": data_factor_component,
    }