def organize_principal_component_factor_table(
    factors=None,
    prefix=None,
    index=None,
    index_name=None,
    report=None,
):
    """
    Organizes a table of factors or scores from Principal Component Analysis.

    arguments:
        factors (object): NumPy array matrix of Principal Component factors
        prefix (str): prefix for names of component columns
        index (object): NumPy array of indices for table index
        index_name (str): name for table's index column
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of Principal Component factors with one
            named column per component and the index as an ordinary column

    """
    # Copy information.
    factors = numpy.copy(factors)
    # Derive the count of components from the factor matrix's columns and
    # name one column per component, numbering from one.
    # (The previous version referenced an undefined name for the column
    # count.)
    columns = [
        str(prefix + str(count))
        for count in range(1, (factors.shape[1] + 1), 1)
    ]
    table = pandas.DataFrame(
        data=factors,
        index=index,
        columns=columns,
        dtype="float32",
        copy=True,
    )
    table.rename_axis(
        index=index_name,
        axis="index",
        copy=False,
        inplace=True,
    )
    # Promote the index to an ordinary column.
    table.reset_index(
        level=None,
        inplace=True
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Matrix before organization:")
        print(factors)
        utility.print_terminal_partition(level=3)
        print("Table after organization:")
        print(table)
    # Return.
    return table
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """
    # Announce the procedure and confirm the dock path.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Brief pause so the announcement is visible before work begins.
    time.sleep(5.0)
    # Delegate to the importation procedure of the uk_biobank package.
    ukb_importation.execute_procedure(path_dock=path_dock)
    # Confirm completion.
    utility.print_terminal_partition(level=1)
    print("From package 'uk_biobank', procedure 'importation' is complete.")
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """
    # Report version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Brief pause so the report is visible before work begins.
    time.sleep(5.0)
    # Identifiers of phenotype studies to pair against each metabolite study.
    phenotype_studies = [
        "30124842_yengo_2018",
        "30239722_pulit_2018",
        "30482948_walters_2018_all",
        "30482948_walters_2018_eur",
        "30482948_walters_2018_eur_unrel",
        "30718901_howard_2019",
        "29906448_ruderfer_2018_scz_vs_ctl",
        "29906448_ruderfer_2018_scz_bpd_vs_ctl",
        "29906448_ruderfer_2018_scz_vs_bpd",
        "29906448_ruderfer_2018_bpd_vs_ctl",
        "00000000_ripke_2021",
        "31043756_stahl_2019",
        "00000000_mullins_2021_all",
        "00000000_mullins_2021_bpd1",
        "00000000_mullins_2021_bpd2",
    ]
    # Identifiers of metabolite studies.
    metabolite_studies = [
        #"24816252_shin_2014",
        "27005778_kettunen_2016",
        #"33437055_panyard_2021",
    ]
    # Drive collection and report for every pairing of a phenotype study
    # with a metabolite study.
    for study_phenotype in phenotype_studies:
        for study_metabolite in metabolite_studies:
            drive_collection_report_phenotype_metabolite_studies(
                phenotype_study=study_phenotype,
                metabolite_study=study_metabolite,
                path_dock=path_dock,
                report=True,
            )
def read_source(
    path_dock=None,
    report=None,
):
    """
    Reads and organizes source information from file.

    Notice that Pandas does not accommodate missing values within series of
    integer variable types.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files
        report (bool): whether to print reports

    raises:

    returns:
        (object): source information

    """
    # Assemble paths to source files within the dock directory.
    path_organization = os.path.join(path_dock, "organization")
    path_table_phenotypes = os.path.join(
        path_organization, "table_phenotypes.pickle"
    )
    path_table_metabolites_names = os.path.join(
        path_organization, "table_metabolites_names.pickle"
    )
    path_metabolites_valid = os.path.join(
        path_organization, "metabolites_valid.pickle"
    )
    path_table_metabolites_scores = os.path.join(
        path_dock, "aggregation", "selection",
        "table_metabolites_scores_prs_0_0001.pickle"
    )
    # Announce which polygenic-score p-value threshold this run reads.
    utility.print_terminal_partition(level=1)
    print("PRS pvalue: 0.0001")
    utility.print_terminal_partition(level=2)
    # Brief pause so the announcement is visible.
    time.sleep(5.0)
    # Read information from file.
    table_phenotypes = pandas.read_pickle(path_table_phenotypes)
    table_metabolites_scores = pandas.read_pickle(
        path_table_metabolites_scores
    )
    table_metabolites_names = pandas.read_pickle(path_table_metabolites_names)
    with open(path_metabolites_valid, "rb") as file_source:
        metabolites_valid = pickle.load(file_source)
    # Compile and return information.
    return {
        "table_phenotypes": table_phenotypes,
        "table_metabolites_scores": table_metabolites_scores,
        "table_metabolites_names": table_metabolites_names,
        "metabolites_valid": metabolites_valid,
    }
def read_aggregate_test_metabolite_genetic_scores(
    metabolite=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads a metabolite's genetic scores across the UK Biobank from file,
    aggregates scores by Singular Value Decomposition (SVD), and tests this
    method.

    arguments:
        metabolite (str): identifier of a metabolite
        metabolites_files_paths (dict<list<str>>): collection of files and
            paths for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information

    """
    # Aggregate metabolite's genetic scores.
    table_aggregation = read_aggregate_metabolite_genetic_scores(
        metabolite=metabolite,
        metabolites_files_paths=metabolites_files_paths,
        report=report,
    )
    # Copy information.
    table = table_aggregation.copy(deep=True)
    # Remove records without a UK Biobank identifier and promote the
    # identifier to the table's index.
    table.dropna(
        axis="index",
        how="any",
        subset=["identifier_ukb"],
        inplace=True,
    )
    table.set_index(
        "identifier_ukb",
        drop=True,
        inplace=True,
    )
    # Report.
    if report:
        # Column name translations.
        utility.print_terminal_partition(level=2)
        print("reporting from: read_aggregate_test_metabolite_genetic_score()")
        print("blah blah...")
        print(table)
        utility.print_terminal_partition(level=3)
    # Compile information.
    # The previous version returned an empty collection and discarded the
    # organized table; include the table so callers receive the result.
    pail = dict()
    pail["table"] = table
    # Return.
    return pail
def remove_null_records_standardize_variables_scales(
    table=None,
    report=None,
):
    """
    Removes records with null values and standardizes variables' scales.

    arguments:
        table (object): Pandas data frame of dependent and independent
            variables for regression
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of dependent and independent variables
            for regression

    """
    # Work on a deep copy to avoid mutating the caller's table.
    table_complete = table.copy(deep=True)
    # Discard any record (row) that is missing a value in any column.
    table_complete.dropna(
        axis="index",
        how="any",
        subset=None,
        inplace=True,
    )
    # Transform each variable (column) to standard scale.
    table_scale = utility.standardize_table_values_by_column(
        table=table_complete,
        report=report,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report source: remove_null_records_standardize_variables_scales()"
        )
        utility.print_terminal_partition(level=3)
        print("Table after removal of records with null values...")
        print("... and after standardizing scale of each variable:")
        print(table_scale)
    # Return information.
    return table_scale
def organize_principal_component_variance_proportion_table(
    variance_proportions=None,
    prefix=None,
    index_name=None,
    report=None,
):
    """
    Organizes a table of proportion of variance explained by each Eigenvector
    and Principal Component factor.

    arguments:
        variance_proportions (object): NumPy array of proportions of variance
            explained
        prefix (str): prefix for names of component columns
        index_name (str): name for table's index column
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame with one row per component: the
            component's name and its proportion of variance explained

    """
    # Copy information.
    proportions = numpy.copy(variance_proportions).tolist()
    # Build one record per component, numbering components from one.
    records = [
        {
            index_name: str(prefix + str(position)),
            "variance_proportion": proportion,
        }
        for position, proportion in enumerate(proportions, start=1)
    ]
    table = pandas.DataFrame(data=records)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " +
              "organize_principal_component_variance_proportion_table()")
        utility.print_terminal_partition(level=3)
        print("Table after organization:")
        print(table)
    # Return.
    return table
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """
    # Announce the procedure and confirm the dock path.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 3")
    # TODO: steps to complete here...
    # 1. write script to move metabolite GEM PRSs to a new directory in "dock"
    # 2. read all contents of the PRSice output directory (need to move to "dock" first)
    # 3. filter to keep only the relevant files... probably on basis of file suffixes
    # 4. iterate on the files in the directory...
    # --- collect information for each metabolite during iteration (sort of a collection)
    # 5. for each metabolite, read PRS table in Pandas, specifying gzip compression
    # 6. calculate PRS-PCA by modification of Coombes' method
    # 7. keep PC1 and collect within a dataframe for all metabolites
    # Prepare the procedure's directories.
    directories = initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = read_source(
        path_dock=path_dock,
        report=True,
    )
    # Assemble product information for persistence.
    product = dict()
    # Write product information to file.
    write_product(paths=directories, information=product)
def calculate_principal_component_explanation_variance_proportions(
    eigenvalues=None,
    report=None,
):
    """
    Calculates the proportion of variance explained by Eigenvectors of
    Principal Components Analysis (PCA).

    Sum of proportional variance explained across all Eigenvectors is one.

    arguments:
        eigenvalues (object): NumPy array of Eigenvalues
        report (bool): whether to print reports

    raises:

    returns:
        (object): NumPy array of proportions of variance explained

    """
    # Copy information.
    eigenvalues = numpy.copy(eigenvalues)
    # Total variance across all Eigenvectors.
    variance_total = numpy.sum(eigenvalues)
    # Each Eigenvalue's share of the total variance.
    variance_proportions = numpy.divide(eigenvalues, variance_total)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "calculate_principal_component_explanation_variance_proportions()")
        utility.print_terminal_partition(level=2)
        print("Eigenvalues...")
        print(eigenvalues)
        print("Variance proportions...")
        print(variance_proportions)
    # Return.
    return variance_proportions
def calculate_principal_component_loadings(
    eigenvectors=None,
    eigenvalues=None,
    s_singular_values=None,
    vt_right_singular_vectors_rows=None,
    count_samples=None,
    report=None,
):
    """
    Calculates Principal Components Analysis (PCA) loadings.

    Derives loadings by two independent methods and, on request, reports
    whether the two agree; returns the loadings from the Eigen method.

    arguments:
        eigenvectors (object): NumPy matrix of Eigenvectors
        eigenvalues (object): NumPy array of Eigenvalues
        s_singular_values (object): Numpy matrix of Singular Values
        vt_right_singular_vectors_rows (object): Numpy matrix
        count_samples (float): count of samples in the original source matrix
            for Singular Value Decomposition
        report (bool): whether to print reports

    raises:

    returns:
        (object): Numpy array of loadings

    """
    # Method one: from Eigenvectors and Eigenvalues.
    loadings_from_eigen = calculate_loadings_from_eigenvalues_eigenvectors(
        eigenvectors=eigenvectors,
        eigenvalues=eigenvalues,
        report=report,
    )
    # Method two: from raw factors of the decomposition.
    loadings_from_factors = calculate_loadings_from_decomposition_factors(
        s_singular_values=s_singular_values,
        vt_right_singular_vectors_rows=vt_right_singular_vectors_rows,
        count_samples=count_samples,
        report=report,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " + "calculate_principal_component_loadings()")
        utility.print_terminal_partition(level=3)
        print("Shape of Eigen loadings: " + str(loadings_from_eigen.shape))
        print("Shape of factor loadings: " + str(loadings_from_factors.shape))
        utility.print_terminal_partition(level=4)
        print("Compare loadings from both methods: ")
        print(
            numpy.allclose(
                loadings_from_eigen,
                loadings_from_factors,
                rtol=1e-2,
                atol=1e-3,
                equal_nan=False,
            ))
    # Return.
    return loadings_from_eigen
def calculate_loadings_from_eigenvalues_eigenvectors(
    eigenvectors=None,
    eigenvalues=None,
    report=None,
):
    """
    Calculates Principal Components Analysis (PCA) loadings from Eigenvectors
    and Eigenvalues.

    Statsmodels erroneously returns "loadings" that have identical values and
    dimensions as the Eigenvectors; however, Eigenvectors and Loadings are not
    equivalent.

    loadings = eigenvectors [dot] square_root(eigenvalues)

    Loadings include aspects of both direction (eigenvectors) and scale
    (eigenvalues).

    arguments:
        eigenvectors (object): NumPy matrix of Eigenvectors
        eigenvalues (object): NumPy array of Eigenvalues
        report (bool): whether to print reports

    raises:

    returns:
        (object): Numpy array of loadings

    """
    # Copy information.
    vectors = numpy.copy(eigenvectors)
    values = numpy.copy(eigenvalues)
    # Scale each Eigenvector by the square root of its Eigenvalue, by way of
    # a diagonal matrix of the square roots.
    scale_diagonal = numpy.diag(numpy.sqrt(values))
    loadings = numpy.dot(vectors, scale_diagonal)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " +
              "calculate_loadings_from_eigenvalues_eigenvectors()")
        utility.print_terminal_partition(level=3)
        print("Shape of loadings: " + str(loadings.shape))
        utility.print_terminal_partition(level=4)
        print(
            "Loadings = Eigenvectors [dot] square_root(diagonal Eigenvalues)")
        print(loadings)
    # Return.
    return loadings
def calculate_loadings_from_decomposition_factors(
    s_singular_values=None,
    vt_right_singular_vectors_rows=None,
    count_samples=None,
    report=None,
):
    """
    Calculates Principal Components Analysis (PCA) loadings from direct
    factors of Singular Value Decomposition.

    arguments:
        s_singular_values (object): Numpy matrix of Singular Values
        vt_right_singular_vectors_rows (object): Numpy matrix
        count_samples (float): count of samples in the original source matrix
            for Singular Value Decomposition
        report (bool): whether to print reports

    raises:

    returns:
        (object): Numpy array of loadings

    """
    # Copy information.
    singular_values = numpy.copy(s_singular_values)
    vectors_rows = numpy.copy(vt_right_singular_vectors_rows)
    # Scale Singular Values by the square root of the sample degrees of
    # freedom (count of samples minus one).
    quotients = numpy.divide(
        singular_values, math.sqrt(count_samples - 1)
    )
    loadings = numpy.dot(vectors_rows, numpy.diag(quotients))
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " +
              "calculate_loadings_from_decomposition_factors()")
        utility.print_terminal_partition(level=3)
        print("Shape of loadings: " + str(loadings.shape))
        utility.print_terminal_partition(level=4)
        print("Loadings = (V [dot] (S / square_root(samples - 1)))")
        print(loadings)
    # Return.
    return loadings
def combine_organize_phenotype_metabolites_summary_table(
    table_metabolite_reference=None,
    phenotype_heritability=None,
    table_metabolite_heritability=None,
    table_correlations=None,
    threshold_metabolite_heritability=None,
    threshold_false_discovery_rate=None,
    report=None,
):
    """
    Reads, collects, and organizes metabolite heritability estimates.

    Combines metabolite references, metabolite heritability estimates, and
    phenotype-metabolite genetic correlations into a single summary table,
    filters it by identity and heritability thresholds, applies a False
    Discovery Rate correction, and sorts rows and columns for presentation.

    arguments:
        table_metabolite_reference (object): Pandas data frame of
            metabolites' identifiers and names from study
        phenotype_heritability (dict): information about estimation of a
            phenotype's heritability
        table_metabolite_heritability (object): Pandas data frame of
            metabolites' heritability estimates
        table_correlations (object): Pandas data frame of genetic
            correlations
        threshold_metabolite_heritability (float): threshold for metabolite
            heritability
        threshold_false_discovery_rate (float): threshold for false discovery
            rate
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of metabolites' heritability estimates
            and genetic correlation estimates against a phenotype of interest

    """
    # Organize metabolite reference table.
    table_metabolite_reference = organize_metabolite_reference_table(
        table=table_metabolite_reference,
        identifier="identifier_study",
        name="name",
        identity="identity",
    )
    # Merge tables for metabolite references and heritabilities.
    # Merge data tables using database-style join.
    # Alternative is to use DataFrame.join().
    # Outer join keeps metabolites that appear in only one of the tables.
    table_heritability = table_metabolite_reference.merge(
        table_metabolite_heritability,
        how="outer",
        left_on="identifier",
        right_on="identifier",
        suffixes=("_reference", "_heritability"),
    )
    # Merge tables for metabolite heritabilities and correlations.
    # Merge data tables using database-style join.
    # Alternative is to use DataFrame.join().
    table = table_heritability.merge(
        table_correlations,
        how="outer",
        left_on="identifier",
        right_on="identifier",
        suffixes=("_heritability", "_correlation"),
    )
    # Introduce columns for phenotype heritability.
    # The phenotype's heritability is a single estimate, repeated on every
    # row for convenience of reading.
    table["phenotype_heritability"] = phenotype_heritability["heritability"]
    table["phenotype_heritability_error"] = (
        phenotype_heritability["heritability_standard_error"])
    # Select table rows for metabolites with valid identities.
    table = table.loc[(table["identity"] == 1), :]
    # Select table rows for metabolites with valid heritability estimates.
    # Note: this comparison also drops rows with missing heritability.
    table = table.loc[(
        table["heritability"] >= threshold_metabolite_heritability), :]
    # Calculate False Discovery Rates (FDRs) across the correlation
    # probabilities that survive the filters above.
    table = utility.calculate_table_false_discovery_rates(
        threshold=threshold_false_discovery_rate,
        probability="correlation_probability",
        discovery="correlation_discovery",
        significance="correlation_significance",
        table=table,
    )
    # Sort table rows.
    # NOTE(review): the second sort determines the final order; pandas'
    # default sort algorithm is not stable, so the first sort's ordering of
    # ties is not guaranteed to persist — confirm intent.
    table.sort_values(
        by=["correlation_absolute"],
        axis="index",
        ascending=False,
        na_position="last",
        inplace=True,
    )
    table.sort_values(
        by=[
            "correlation_discovery",
        ],
        axis="index",
        ascending=True,
        na_position="last",
        inplace=True,
    )
    # Sort table columns.
    columns_sequence = [
        #"identifier",
        "name",
        "correlation_discovery",
        "correlation",
        "correlation_standard_error",
        "heritability",
        "heritability_standard_error",
        "correlation_absolute",
        "correlation_probability",
        "phenotype_heritability",
        "phenotype_heritability_error",
        "heritability_ratio",
        "heritability_ratio_standard_error",
        "heritability_variants",
        "correlation_significance",
        "correlation_variants",
    ]
    table = table[[*columns_sequence]]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("combine_organize_phenotype_metabolites_summary_table()")
        print(table)
    # Return information.
    return table
def read_source(
    phenotype_study=None,
    metabolite_study=None,
    paths=None,
    report=None,
):
    """
    Reads and organizes source information from file.

    arguments:
        phenotype_study (str): identifier of main phenotype study
        metabolite_study (str): identifier of metabolite study
        paths (dict<str>): collection of paths to directories for procedure's
            files
        report (bool): whether to print reports

    raises:

    returns:
        (object): source information

    """
    # Metabolite reference table.
    path_table_metabolite_reference = os.path.join(
        paths["dock"], "parameters", "psychiatric_metabolism",
        "metabolite_reference", metabolite_study,
        "table_metabolite_reference.tsv"
    )
    table_metabolite_reference = pandas.read_csv(
        path_table_metabolite_reference,
        sep="\t",
        header=0,
        #dtype="string",
    )
    # Phenotype heritability.
    phenotype_heritability = read_extract_phenotype_heritability(
        file="heritability_report.log",
        file_suffix="_heritability_report.log",
        path_source_directory=paths["heritability_studies"][phenotype_study],
    )
    # Metabolite heritability table.
    table_metabolite_heritability = read_collect_metabolites_heritabilities(
        file_suffix="_heritability_report.log",
        path_source_directory=paths["heritability_studies"][metabolite_study],
    )
    # Phenotype-metabolite correlation table.
    table_correlations = (
        read_collect_phenotype_metabolites_genetic_correlations(
            file_suffix="_correlation.log",
            path_source_directory=(
                paths["correlation_studies"][phenotype_study][metabolite_study]
            ),
        ))
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(table_correlations)
        utility.print_terminal_partition(level=2)
    # Compile and return information.
    pail = dict()
    pail["table_metabolite_reference"] = table_metabolite_reference
    pail["phenotype_heritability"] = phenotype_heritability
    pail["table_metabolite_heritability"] = table_metabolite_heritability
    pail["table_correlations"] = table_correlations
    return pail
def read_aggregate_metabolite_genetic_scores(
    metabolite=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads a metabolite's genetic scores across the UK Biobank from file, and
    aggregates these scores by Singular Value Decomposition (SVD).

    This function returns a table for a single metabolite with UK Biobank
    identifiers and a single column of aggregate scores for the metabolite
    across these UK Biobank records.

    arguments:
        metabolite (str): identifier of a metabolite
        metabolites_files_paths (dict<list<str>>): collection of files and
            paths for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of a metabolite's aggregate genetic
            scores across UK Biobank

    """
    # Read raw table of metabolite's genetic scores.
    table_raw = read_source_metabolite_genetic_scores(
        path_file=metabolites_files_paths[metabolite]["path"],
        report=report,
    )
    # Discard the redundant individual identifier column and rename the
    # family identifier to the UK Biobank identifier.
    table_raw.drop(
        labels=["IID",],
        axis="columns",
        inplace=True
    )
    table_raw.rename(
        columns={"FID": "identifier_ukb"},
        inplace=True,
    )
    # Aggregate the scores across all probability thresholds.
    table_aggregation = organize_aggregate_metabolite_genetic_scores(
        identifier=metabolite,
        column_index="identifier_ukb",
        columns_scores=[
            "X5e.08",
            "X1e.07",
            "X1e.06",
            "X1e.05",
            "X0.0001",
            "X0.001",
            "X0.01",
            "X0.05",
            "X0.1",
            "X0.2",
            "X1",
        ],
        table=table_raw,
        report=report,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: read_aggregate_metabolite_genetic_scores()")
        utility.print_terminal_partition(level=2)
        print("Metabolite: " + str(metabolite))
        print(table_aggregation)
        utility.print_terminal_partition(level=3)
    # Return.
    return table_aggregation
def organize_principal_components_positive_sum_loadings(
    threshold_valid_proportion_per_column=None,
    table=None,
    report=None,
):
    """
    Organizes a Principal Components Analysis while forcing loadings to have
    a positive sum.

    NOTE(review): this function appears to be unfinished work in progress.
    Several names used below ("pail_components", "index",
    "pail_organization", "u", "s", "vh", "eigenvalues", "eigenvectors",
    "loadings") are never defined in this scope and will raise NameError at
    run time; the sign flip is also never applied to the returned values.

    arguments:
        threshold_valid_proportion_per_column (float): proportion of rows
            that must have a valid value for a column in order to keep the
            column
        table (object): Pandas data frame of variables (features) across
            columns and samples (cases, observations) across rows with an
            explicit index
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about the singular value
            decomposition

    """
    # Calculate original factors by Singular Value Decomposition (SVD).
    pail_original = (
        organize_principal_components_by_singular_value_decomposition(
            threshold_valid_proportion_per_column=(
                threshold_valid_proportion_per_column
            ),
            table=table,
            report=report,
        ))
    # Determine whether loadings have a positive sum.
    loadings_original = numpy.copy(pail_original["loadings"])
    sum_original = numpy.sum(loadings_original.flatten(order="C"))
    if (sum_original >= 0):
        loading_sign_flip = False
    else:
        loading_sign_flip = True
    # TODO: invert signs of all of Vh (and U???)... then re-calculate derived values.
    # NOTE(review): "pail_components" is undefined here — presumably
    # "pail_original"; the attribute access (".loadings") also does not
    # match the dictionary access used above ("['loadings']") — confirm.
    loadings_novel = numpy.negative(numpy.copy(pail_components.loadings))
    sum_novel = numpy.sum(loadings_novel.flatten(order="C"))
    # Organize principal component factors within table.
    # NOTE(review): "index" is undefined in this scope — TODO confirm the
    # intended source of the index values (perhaps the source table's index).
    table_components = organize_principal_component_factor_table(
        factors=pail_components.factors, # TODO: change to factors after sign adjustment
        prefix="component_",
        index=index,
        index_name="identifier_ukb",
        report=True,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "organize_principal_components_positive_sum_loadings()"
        )
        utility.print_terminal_partition(level=2)
        pass
    # Compile information.
    # NOTE(review): none of the values below ("pail_organization", "u", "s",
    # "vh", "eigenvalues", "eigenvectors", "loadings") are defined in this
    # scope — this section looks copied from another function and needs
    # completion before this function can run.
    pail = dict()
    pail["table_scale"] = pail_organization["table_scale"]
    pail["u"] = u
    pail["singular_values"] = s
    pail["vh"] = vh
    pail["eigenvalues"] = eigenvalues
    pail["eigenvectors"] = eigenvectors
    pail["loadings"] = loadings
    # Return.
    return pail
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """
    # Report version and confirm the dock path.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)
    # Initialize directories.
    paths = ukb_organization.initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = ukb_organization.read_source(
        source="importation",
        path_dock=path_dock,
        report=True,
    )
    # Organize variables for persons' genotypes, sex, age, and body mass index
    # across the UK Biobank.
    pail_basis = ukb_organization.execute_genotype_assessment_basis(
        table=source["table_phenotypes"],
        path_dock=path_dock,
        report=True,
    )
    # NOTE(review): the "if True:" guard below is a development switch; the
    # stages inside it run unconditionally in the current state. Each stage
    # consumes the table produced by the previous stage.
    if True:
        # Organize variables for persons' sex hormones across the UK Biobank.
        pail_hormone = ukb_organization.execute_sex_hormones(
            table=pail_basis["table"],
            path_dock=path_dock,
            report=True,
        )
        # Organize variables for female menstruation across the UK Biobank.
        pail_female = ukb_organization.execute_female_menstruation(
            table=pail_hormone["table"],
            report=True,
        )
        # Organize variables for persons' alcohol consumption across the UK Biobank.
        pail_alcohol = ukb_organization.execute_alcohol(
            table=pail_female["table"],
            report=True,
        )
        # Organize variables for persons' mental health across the UK Biobank.
        pail_psychology = ukb_organization.execute_psychology_psychiatry(
            table=pail_alcohol["table"],
            path_dock=path_dock,
            report=True,
        )
        #print(pail_psychology["table_clean"].columns.to_list())
    # NOTE(review): dead branch — "if False:" never executes; candidate for
    # removal once the development switches settle.
    if False:
        # Organize variables for persons' sex hormones across the UK Biobank.
        pail_hormone = ukb_organization.execute_sex_hormones(
            table=pail_basis["table"],
            path_dock=path_dock,
            report=True,
        )
    # Collect information.
    information = dict()
    information["organization"] = dict()
    #information["organization"]["table_phenotypes"] = pail_basis["table"]
    #information["organization"]["table_phenotypes"] = pail_hormone["table"]
    #information["organization"]["table_phenotypes"] = pail_female["table"]
    # The persisted phenotype table is the output of the final (psychology)
    # stage, which received the output of every prior stage in the chain.
    information["organization"]["table_phenotypes"] = pail_psychology["table"]
    # Write product information to file.
    ukb_organization.write_product(paths=paths, information=information)
    pass
def adjust_singular_value_decomposition_factor_signs(
    matrix=None,
    singular_values=None,
    left_singular_vectors_columns=None,
    right_singular_vectors_rows=None,
    report=None,
):
    """
    Adjusts the otherwise arbitrary signs of factors from Singular Value
    Decomposition (SVD) to reduce directional ambiguity.

    Implements the sign-determination method of Bro, Acar, and Kolda (2008),
    "Resolving the sign ambiguity in the singular value decomposition": for
    each component, aggregate the signed squared projections of the original
    data onto the component's left and right singular vectors; when the two
    aggregates disagree in sign, defer to the side with greater magnitude;
    finally flip both the left and the right singular vector of any component
    whose aggregate sign is negative, which leaves the product
    U [dot] S [dot] VT unchanged.

    arguments:
        matrix (object): NumPy array matrix of original values across samples
            (rows, dimension 0) and variables (columns, dimension 1)
        singular_values (object): NumPy array of Singular Values from SVD
        left_singular_vectors_columns (object): NumPy array matrix with SVD
            left singular vectors as columns, U
        right_singular_vectors_rows (object): NumPy array matrix with SVD
            right singular vectors as rows, VT or Vh
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about the singular value
            decomposition

    """
    # Copy information.
    matrix = numpy.copy(matrix)
    s = numpy.copy(singular_values)
    u = numpy.copy(left_singular_vectors_columns)
    vt = numpy.copy(right_singular_vectors_rows)
    # Organize information.
    matrix_transpose = numpy.transpose(matrix)
    ut = numpy.transpose(u)
    # Project the original data onto each singular vector.
    # ut_y: row k holds the projections of component k's left singular
    # vector onto the variables (columns) of the original matrix.
    ut_y = numpy.dot(ut, matrix)
    # vt_y: row k holds the projections of component k's right singular
    # vector onto the samples (rows) of the original matrix.
    vt_y = numpy.dot(vt, matrix_transpose)
    # Aggregate signed squared projections per component (element-wise
    # product of sign and square, summed across each row).
    # The previous version attempted a matrix product of the sign and
    # square matrices, which is dimensionally invalid for non-square shapes.
    signs_left = numpy.sum(numpy.sign(ut_y) * numpy.square(ut_y), axis=1)
    signs_right = numpy.sum(numpy.sign(vt_y) * numpy.square(vt_y), axis=1)
    # Where the left and right aggregates disagree in sign, defer to the
    # side with the greater magnitude.
    discord = ((signs_left * signs_right) < 0)
    left_smaller = (
        numpy.absolute(signs_left) < numpy.absolute(signs_right)
    )
    signs_left = numpy.where(
        (discord & left_smaller), numpy.negative(signs_left), signs_left
    )
    signs_right = numpy.where(
        (discord & (~left_smaller)), numpy.negative(signs_right), signs_right
    )
    # Flip both U's column and VT's row for any component with a negative
    # aggregate sign; flipping both preserves the factorization.
    flips = numpy.where((signs_left < 0), -1.0, 1.0)
    u_prime = (u * flips[numpy.newaxis, :])
    vt_prime = (vt * flips[:, numpy.newaxis])
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "adjust_singular_value_decomposition_factor_signs()"
        )
        utility.print_terminal_partition(level=2)
        print("Shape of original matrix: " + str(matrix.shape))
        print("rows (dimension 0): samples (cases, observations)")
        print("columns (dimension 1): variables (features)")
        utility.print_terminal_partition(level=4)
        print("Shape of matrix Sigma (singular values): " + str(s.shape))
        print("Shape of matrix U (left singular vectors): " + str(u.shape))
        print(
            "Shape of matrix VT (transpose right singular vectors): " +
            str(vt.shape)
        )
        utility.print_terminal_partition(level=4)
        print("Shape of matrix UT-Y (product): " + str(ut_y.shape))
        print("Shape of matrix VT-Y (product): " + str(vt_y.shape))
        utility.print_terminal_partition(level=4)
        print("Aggregate left signs: " + str(signs_left))
        print("Aggregate right signs: " + str(signs_right))
        print("Sign flips per component: " + str(flips))
        pass
    # Compile information.
    pail = dict()
    pail["matrix"] = matrix
    pail["left_singular_vectors_columns"] = u_prime
    pail["singular_values"] = s
    pail["right_singular_vectors_rows"] = vt_prime
    # Return.
    return pail
def drive_organize_table_regress_linear_ordinary_least_squares( dependence=None, independence=None, standard_scale=None, threshold_samples=None, table=None, report=None, ): """ Drive the organization of a table and regression. Table format must have samples (cases, observations) across rows and dependent and independent variables (features) across columns. arguments: dependence (str): name of table's column for dependent variable independence (list<str>): names of table's columns for independent variables threshold_samples (float): minimal count of samples with non-missing values of dependent and independent variables to perform regression table (object): Pandas data frame of dependent and independent variables for regression report (bool): whether to print reports raises: returns: (dict): collection of regression's residuals and statistics """ # Organize table for regression. pail_organization = organize_table_cohort_model_variables_for_regression( dependence=dependence, independence=independence, standard_scale=standard_scale, table=table, report=False, ) # Determine whether dependent and independent variables (features) have # sufficient observations for regression. if (pail_organization["count_samples"] >= threshold_samples): pail_regression = regress_linear_ordinary_least_squares( dependence=dependence, independence=pail_organization["independence"], table=pail_organization["table"], report=report, ) else: # Report. if report: utility.print_terminal_partition(level=2) print("report: ") function_name = str("drive_organize_table_regress_linear_" + "ordinary_least_squares()") print(function_name) utility.print_terminal_partition(level=5) print("Missing information for model...") print("There may be inadequate samples with non-missing values " + "or adequate variance in relevant variables.") pail_regression = create_regression_missing_values( dependence=dependence, independence=independence, ) # Return information. return pail_regression
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    Stratifies phenotype cohorts from the UK Biobank and writes them in a
    format for genotype analysis in PLINK.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Report version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)
    # Initialize directories.
    paths = ukb_strat.initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = read_source(
        path_dock=path_dock,
        report=True,
    )
    # Select and organize variables across cohorts.
    # Organize phenotypes and covariates in format for analysis in PLINK.
    # The "if True/False" literals below act as manual toggles for which
    # stratification sets to execute in a given run.
    # Reference population.
    if True:
        pail_population = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="reference_population",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_population = dict()
        pass
    # Vitamin D.
    if True:
        pail_vitamin_d_linear = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_linear",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_vitamin_d_linear = dict()
        pass
    if True:
        pail_vitamin_d_logistic = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_logistic",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_vitamin_d_logistic = dict()
        pass
    # Hormones and their regulatory proteins.
    # NOTE(review): the two "hormones" branches below pass the same set names
    # ("vitamin_d_linear", "vitamin_d_logistic") as the Vitamin D branches —
    # this looks like a copy-paste remnant; confirm the intended set names
    # before enabling these branches (both are currently disabled).
    if False:
        pail_hormones_linear = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_linear",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_hormones_linear = dict()
        pass
    if False:
        pail_hormones_logistic = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_logistic",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_hormones_logistic = dict()
        pass
    # Body mass index (BMI) in Bipolar Disorder.
    if False:
        pail_bipolar_linear = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="bipolar_body_linear",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_bipolar_linear = dict()
        pass
    if False:
        pail_bipolar_logistic = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="bipolar_body_logistic",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_bipolar_logistic = dict()
        pass
    # Collect information.
    information = dict()
    information["reference_population"] = pail_population
    information["vitamin_d_linear"] = pail_vitamin_d_linear
    information["vitamin_d_logistic"] = pail_vitamin_d_logistic
    information["hormones_linear"] = pail_hormones_linear
    information["hormones_logistic"] = pail_hormones_logistic
    information["body_bipolar_linear"] = pail_bipolar_linear
    information["body_bipolar_logistic"] = pail_bipolar_logistic
    # Write product information to file.
    ukb_strat.write_genotype_product(paths=paths, information=information)
    pass
def read_select_metabolite_genetic_scores(
    metabolite=None,
    selection=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads a metabolite's genetic scores across the UK Biobank from file, and
    selects the scores to keep.

    This function returns a table for a single metabolite with UK Biobank
    identifiers and a single column of selection scores for the metabolite
    across these UK Biobank records.

    arguments:
        metabolite (str): identifier of a metabolite
        selection (str): name of column for selection from Polygenic Score
            thresholds
        metabolites_files_paths (dict<list<str>>): collection of files and
            paths for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of a metabolite's aggregate genetic scores
            across UK Biobank

    """

    # Read raw table of metabolite's genetic scores.
    metabolite_file_path = metabolites_files_paths[metabolite]["path"]
    table_raw = read_source_metabolite_genetic_scores(
        path_file=metabolite_file_path,
        report=report,
    )
    # Organize the raw table.
    table_raw.drop(
        labels=["IID",],
        axis="columns",
        inplace=True
    )
    # Select scores.
    # Copy the selection explicitly: the ".loc" expression can return a view
    # of "table_raw", and the subsequent in-place rename on a view triggers
    # pandas' SettingWithCopyWarning with unreliable mutation semantics.
    table_selection = table_raw.loc[
        :, table_raw.columns.isin(["FID", selection])
    ].copy(deep=True)
    # Translate column names.
    translations = dict()
    translations["FID"] = "identifier_ukb"
    translations[selection] = metabolite
    table_selection.rename(
        columns=translations,
        inplace=True,
    )
    # Report.
    if report:
        # Column name translations.
        utility.print_terminal_partition(level=2)
        print("Report from: read_select_metabolite_genetic_scores()")
        utility.print_terminal_partition(level=2)
        print("Metabolite: " + str(metabolite))
        print(table_selection)
        utility.print_terminal_partition(level=3)
    # Return.
    return table_selection
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    Reads the raw table of kinship coefficients between pairs of persons in
    the UK Biobank, writes it to file in pickle and text formats, and then
    reads it back through the standard source reader as a check.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Report version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 21")
    # Pause procedure.
    time.sleep(5.0)
    # Initialize directories.
    paths = initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read raw table of kinship coefficients between pairs of persons.
    # Use a raw string for the separator so that "\s" is a regular-expression
    # token rather than an (invalid) string escape sequence.
    path_source = os.path.join(
        path_dock, "access", "ukbiobank_phenotypes",
        "table_kinship_pairs.dat"
    )
    table_kinship_pairs = pandas.read_csv(
        path_source,
        sep=r"\s+",
        header=0,
        dtype={
            "ID1": "string",
            "ID2": "string",
            "HetHet": "float32",
            "IBS0": "float32",
            "Kinship": "float32",
        },
    )
    # Write the table to file in binary (pickle) and text formats.
    # Keep distinct variables for source and product paths so the source path
    # is not silently overwritten.
    path_product_pickle = os.path.join(
        path_dock, "assembly", "table_kinship_pairs.pickle"
    )
    path_product_text = os.path.join(
        path_dock, "assembly", "table_kinship_pairs.tsv"
    )
    table_kinship_pairs.to_pickle(path_product_pickle)
    table_kinship_pairs.to_csv(
        path_or_buf=path_product_text,
        sep="\t",
        header=True,
        index=False,
    )
    # Read source information from file as a round-trip check.
    table_kinship_pairs = ukb_strat.read_source_table_kinship_pairs(
        path_dock=path_dock,
        report=True,
    )
    print(table_kinship_pairs)
    pass
def drive_collection_report_phenotype_metabolite_studies(
    phenotype_study=None,
    metabolite_study=None,
    path_dock=None,
    report=None,
):
    """
    Collects and reports summary information for one pair of phenotype and
    metabolite studies.

    arguments:
        phenotype_study (str): identifier of main phenotype study
        metabolite_study (str): identifier of metabolite study
        path_dock (str): path to dock directory for source and product
            directories and files
        report (bool): whether to print reports

    raises:

    returns:

    """

    if report:
        utility.print_terminal_partition(level=2)
        print("report: drive_collection_report_phenotype_metabolite_studies()")
        print(phenotype_study)
        print(metabolite_study)
    # Prepare directories for this pair of studies.
    paths = initialize_directories(
        phenotype_study=phenotype_study,
        metabolite_study=metabolite_study,
        restore=False,
        path_dock=path_dock,
    )
    # Read source information for both studies from file.
    source = read_source(
        phenotype_study=phenotype_study,
        metabolite_study=metabolite_study,
        paths=paths,
        report=False,
    )
    # TODO: now combine and organize the various information containers from "read_source()"
    # TODO: build summary table.
    # name change?
    table_summary = combine_organize_phenotype_metabolites_summary_table(
        table_metabolite_reference=source["table_metabolite_reference"],
        phenotype_heritability=source["phenotype_heritability"],
        table_metabolite_heritability=source["table_metabolite_heritability"],
        table_correlations=source["table_correlations"],
        threshold_metabolite_heritability=0.05,
        threshold_false_discovery_rate=0.05,
        report=False,
    )
    if report:
        utility.print_terminal_partition(level=5)
        print(table_summary)
    # Persist the product for this pair of studies.
    write_product(
        phenotype_study=phenotype_study,
        metabolite_study=metabolite_study,
        paths=paths,
        information={"table_summary": table_summary},
    )
    pass
def calculate_principal_components_from_singular_value_decomposition( singular_values=None, left_singular_vectors=None, right_singular_vectors=None, which_singular_vectors=None, table=None, report=None, ): """ Calculates Principal Components and relevant information from raw factors of a Singular Value Decomposition (SVD). Reference: "https://stats.stackexchange.com/questions/134282/ relationship-between-svd-and-pca-how-to-use-svd-to-perform-pca" arguments: threshold_valid_proportion_per_column (float): proportion of rows that must have a valid value for a column in order to keep the column table (object): Pandas data frame of variables (features) across columns and samples (cases, observations) across rows with an explicit index, after final scaling and filtering for SVD report (bool): whether to print reports raises: returns: (dict): collection of information about the singular value decomposition """ # https://stats.stackexchange.com/questions/134282/relationship-between-svd-and-pca-how-to-use-svd-to-perform-pca # https://towardsdatascience.com/pca-and-svd-explained-with-numpy-5d13b0d2a4d8 # https://towardsdatascience.com/singular-value-decomposition-and-its-applications-in-principal-component-analysis-5b7a5f08d0bd # http://www.math.ucsd.edu/~gptesler/283/slides/pca_18-handout.pdf # https://www.cc.gatech.edu/~lsong/teaching/CX4240spring16/pca_wall.pdf # Calculate Eigenvalues. eigenvalues = ( calculate_principal_component_eigenvalues_from_singular_values( singular_values=s, count_samples=pail_svd["count_samples"], report=False, )) # TODO: calculate eigenvectors from the factor specified in # "which_singular_vectors" # Calculate Eigenvectors. # Eigenvectors are the right singular vectors of the original matrix. eigenvectors = numpy.copy(numpy.transpose(vh)) # TODO: I'm not entirely sure that I'm sorting the correct dimension of # Eigenvectors... 
# TODO: sort dimension will depend on which singular vector selected # Sort Eigenvectors by order of decreasing Eigenvalues. pail_sort = sort_eigenvectors_by_decreasing_eigenvalues( eigenvectors=eigenvectors, eigenvalues=eigenvalues, report=False, ) # Calculate loadings. loadings = calculate_principal_component_loadings_from_eigen_values_vectors( eigenvectors=pail_sort["eigenvectors"], eigenvalues=pail_sort["eigenvalues"], report=True, ) loadings_direct = ( calculate_principal_component_loadings_from_direct_factors( s=s, vh=vh, count_samples=pail_organization["count_samples"], report=True, )) # Calculate Principal Components. # --> Calculate from U and S # or # --> Calculate from V and S # Report. if report: utility.print_terminal_partition(level=2) print( "Report from: " + "calculate_principal_components_from_singular_value_decomposition()" ) utility.print_terminal_partition(level=2) # Original matrix has shape (M, N) print( "Shape of original matrix: " + str(pail_organization["matrix"].shape) ) # Eigenvalues. print("Shape of Eigenvalues: " + str(eigenvalues.shape)) # Eigenvectors. print("Shape of Eigenvectors: " + str(eigenvectors.shape)) # Loadings. print("Shape of Loadings: " + str(loadings.shape)) print(loadings) print( "Shape of Loadings from SVD factors: " + str(loadings_direct.shape) ) print("Loadings nearly equal by both calculations: ") print(numpy.allclose(loadings, loadings_direct)) pass # Compile information. pail = dict() #pail["table_scale"] = pail_organization["table_scale"] #pail["u"] = u #pail["singular_values"] = s #pail["vh"] = vh pail["eigenvalues"] = eigenvalues pail["eigenvectors"] = eigenvectors pail["loadings"] = loadings # Return. return pail
def read_select_collect_metabolites_genetic_scores(
    selection=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads metabolites' genetic scores across the UK Biobank from file,
    selects scores at a single Polygenic Score threshold, and collects these
    within one table.

    arguments:
        selection (str): name of column for selection from Polygenic Score
            thresholds
        metabolites_files_paths (dict<list<str>>): collection of files and
            paths for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of metabolites' genetic scores across UK
            Biobank cohort

    """

    # Start from an empty table indexed by UK Biobank identifier; each
    # metabolite's scores merge into it as a new column.
    # UK Biobank identifier is in column "FID" within the metabolite tables;
    # it is renamed to "identifier_ukb" by the selection reader.
    collection = pandas.DataFrame(columns=["identifier_ukb"])
    collection.set_index(
        "identifier_ukb",
        drop=True,
        inplace=True,
    )
    for identifier in metabolites_files_paths:
        # Read and select the current metabolite's genetic scores.
        scores = read_select_metabolite_genetic_scores(
            metabolite=identifier,
            selection=selection,
            metabolites_files_paths=metabolites_files_paths,
            report=False,
        )
        # Work on an independent copy.
        scores = scores.copy(deep=True)
        # Discard records without a UK Biobank identifier, then index on it.
        scores.dropna(
            axis="index",
            how="any",
            subset=["identifier_ukb"],
            inplace=True,
        )
        scores.set_index(
            "identifier_ukb",
            drop=True,
            inplace=True,
        )
        # An outer merge keeps identifiers present on either side.
        collection = collection.merge(
            scores,
            how="outer",
            left_on="identifier_ukb",
            right_on="identifier_ukb",
            suffixes=("_original", "_novel"),
        )
        pass
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: read_select_collect_metabolites_genetic_scores()")
        utility.print_terminal_partition(level=2)
        print("selection: " + str(selection))
        print(collection)
        utility.print_terminal_partition(level=3)
    # Return information.
    return collection
def select_organize_metabolites_valid_identities_scores(
    table_names=None,
    table_scores=None,
    report=None,
):
    """
    Selects identifiers of metabolites from Metabolon with valid identities.

    arguments:
        table_names (object): Pandas data frame of metabolites' identifiers
            and names from Metabolon
        table_scores (object): Pandas data frame of metabolites' genetic
            scores across UK Biobank cohort
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about metabolites, their
            identifiers, and their names

    """

    # Copy information.
    table_names = table_names.copy(deep=True)
    table_scores = table_scores.copy(deep=True)
    # Translate column names.
    translations = dict()
    translations["metabolonID"] = "identifier"
    translations["metabolonDescription"] = "name"
    table_names.rename(
        columns=translations,
        inplace=True,
    )
    # Determine whether metabolite has a valid identity.
    table_names["identity"] = table_names.apply(
        lambda row: determine_metabolite_valid_identity(
            name=row["name"],
        ),
        axis="columns",  # apply across rows
    )
    # Select metabolites with valid identities.
    table_identity = table_names.loc[(table_names["identity"] > 0.5), :]
    metabolites_identity = table_identity["identifier"].to_list()
    # Organize table.
    # Assign the conversion back to the column: "astype" returns a new
    # Series and does not modify the table in place.
    table_names["identifier"] = table_names["identifier"].astype("string")
    table_names.set_index(
        "identifier",
        drop=True,
        inplace=True,
    )
    # Remove table columns for metabolites with null genetic scores.
    table_scores.dropna(
        axis="columns",
        how="all",
        subset=None,
        inplace=True,
    )
    # Select metabolites with valid identities and valid genetic scores.
    metabolites_scores = table_scores.columns.to_list()
    metabolites_valid = utility.filter_common_elements(
        list_minor=metabolites_identity,
        list_major=metabolites_scores,
    )
    # Compile information.
    pail = dict()
    pail["table"] = table_names
    pail["metabolites_valid"] = metabolites_valid
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        # Report label corrected to match this function's actual name.
        print(
            "Report from: " +
            "select_organize_metabolites_valid_identities_scores()"
        )
        utility.print_terminal_partition(level=3)
        print(
            "Count of identifiable metabolites: " +
            str(len(metabolites_identity))
        )
        print(
            "Count of identifiable metabolites with scores: " +
            str(len(metabolites_valid))
        )
        utility.print_terminal_partition(level=3)
        print(table_names)
    # Return information.
    return pail
def regress_linear_ordinary_least_squares(
    dependence=None,
    independence=None,
    table=None,
    report=None,
):
    """
    Regresses a quantitative continuous dependent variable against multiple
    independent variables and returns relevant parameters and statistics.

    Table format must have samples (cases, observations) across rows and
    dependent and independent variables (features) across columns.

    Description of formats for StatsModels...

    Format of dependent variable is a vector of scalar values.
    [1.3, 1.5, 1.2, 1.0, 1.7, 1.5, 1.9, 1.1, 1.3, 1.4]

    Format of independent variable(s) is a matrix: a first-dimension vector
    of samples (observations) and for each sample a second-dimension vector
    of variables' (features') scalar values.
    StatsModels also requires a constant for the intercept.
    [
        [1.3, 5.2, 1.0],
        [1.5, 5.1, 1.0],
        [1.2, 5.5, 1.0],
        ...
    ]

    arguments:
        dependence (str): name of table's column for dependent variable
        independence (list<str>): names of table's columns for independent
            variables
        table (object): Pandas data frame of dependent and independent
            variables for regression
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of regression's residuals and statistics

    """

    # Determine count of valid samples (cases, observations).
    count_samples = int(table.shape[0])
    # Extract values of dependent and independent variables.
    values_dependence = table[dependence].to_numpy()
    # Keep independent variables in Pandas dataframe to preserve variables'
    # names.
    #values_independence = data.loc[ :, independence].to_numpy()
    table_independence = table.loc[:, independence]
    # Introduce constant value for intercept.
    # If any column in the independent variables already has constant
    # values, then the function skips it by default.
    # It is necessary to change parameter "has_constant" to avoid this
    # conditional behavior.
    table_independence_intercept = statsmodels.api.add_constant(
        table_independence,
        prepend=True,  # insert intercept constant first
        has_constant="add",  # introduce new intercept constant regardless
    )
    # NOTE(review): "columns_independence" is not used anywhere below in this
    # function as visible here.
    columns_independence = copy.deepcopy(
        table_independence_intercept.columns.to_list())
    #matrix_independence = table.to_numpy()
    # Define model.
    # "missing='drop'" excludes any sample with a missing value from the fit.
    model = statsmodels.api.OLS(
        values_dependence,
        table_independence_intercept,
        missing="drop",
    )
    pail_raw = model.fit()
    # Report.
    if report:
        print("--------------------------------------------------")
        # NOTE(review): the report label below names a different function
        # ("regress_dependent_independent_variables_linear_ordinary()") than
        # this one — presumably a former name retained in the message.
        print("Report source: " +
            "regress_dependent_independent_variables_linear_ordinary()")
        print("--------------------------------------------------")
        print("Version check: TCW 28 September 2021")
        print("Information from regression:")
        print(pail_raw.summary())
        #utility.print_terminal_partition(level=3)
        #print(dir(pail_raw))
        #print(pail_raw.params)
        #print(pail_raw.pvalues)
        pass
    # Organize residuals.
    residuals = pail_raw.resid
    ##########
    # Collect parameters, errors, probabilities, and statistics.
    # Series indexed by variable name; "const" is the intercept introduced by
    # "add_constant" above.
    model_parameters = pandas.Series(data=pail_raw.params)
    model_parameter_errors = pandas.Series(data=pail_raw.bse)
    model_probabilities = pandas.Series(data=pail_raw.pvalues)
    parameters = dict()
    parameter_errors = dict()
    parameter_intervals = dict()
    parameter_ranges = dict()
    probabilities = dict()
    inflations = dict()
    # Intercept parameter: missing "const" yields NaN placeholders.
    if ("const" in model_parameters.index):
        #parameters["intercept_parameter"] = report.params[0]
        parameters["intercept_parameter"] = model_parameters["const"]
    else:
        parameters["intercept_parameter"] = float("nan")
        # Report.
        if report:
            utility.print_terminal_partition(level=4)
            print("Warning: regression data does not have constant intercept.")
            print(independence)
    # Intercept standard error, 95% confidence interval, and textual range.
    if ("const" in model_parameter_errors.index):
        parameter_errors["intercept_error"] = model_parameter_errors["const"]
        # 1.96 standard errors approximate the 95% confidence interval.
        parameter_intervals["intercept_interval_95"] = float(
            1.96 * parameter_errors["intercept_error"])
        parameter_ranges["intercept_range_95"] = (
            determine_confidence_interval_range_text(
                estimate=parameters["intercept_parameter"],
                interval_low=parameter_intervals["intercept_interval_95"],
                interval_high=parameter_intervals["intercept_interval_95"],
        ))
    else:
        parameter_errors["intercept_error"] = float("nan")
        parameter_intervals["intercept_interval_95"] = float("nan")
        parameter_ranges["intercept_range_95"] = str("nan ... nan")
        # Report.
        if report:
            utility.print_terminal_partition(level=4)
            print("Warning: regression data does not have constant intercept.")
            print(independence)
    # Intercept probability.
    if ("const" in model_probabilities.index):
        #probabilities["intercept_probability"] = report.pvalues[0]
        probabilities["intercept_probability"] = (model_probabilities["const"])
    else:
        probabilities["intercept_probability"] = float("nan")
        # Report.
        if report:
            utility.print_terminal_partition(level=4)
            print("Warning: regression data does not have constant intercept.")
            print(independence)
    # Variance Inflation Factor is undefined for the intercept.
    inflations["intercept_inflation"] = float("nan")
    # Iterate on each independent variable.
    # Initiate counter at 1 to assume that intercept is at index 0.
    counter = 1  # Accommodate index for intercept.
    for variable in independence:
        # Coefficient or parameter.
        parameter = str(variable + ("_parameter"))
        #parameters[parameter] = report.params[counter]
        parameters[parameter] = model_parameters[variable]
        # Parameter standard error
        parameter_error = str(variable + ("_error"))
        parameter_errors[parameter_error] = model_parameter_errors[variable]
        # 95% confidence interval and its textual range.
        parameter_interval = str(variable + ("_interval_95"))
        parameter_intervals[parameter_interval] = float(
            1.96 * parameter_errors[parameter_error])
        parameter_range = str(variable + ("_range_95"))
        parameter_ranges[parameter_range] = (
            determine_confidence_interval_range_text(
                estimate=parameters[parameter],
                interval_low=parameter_intervals[parameter_interval],
                interval_high=parameter_intervals[parameter_interval],
        ))
        # Probability.
        probability = str(variable + ("_probability"))
        #probabilities[probability] = report.pvalues[counter]
        probabilities[probability] = model_probabilities[variable]
        # Variance Inflation Factor (VIF).
        inflation = str(variable + ("_inflation"))
        inflation_value = (
            statsmodels.stats.outliers_influence.variance_inflation_factor(
                table_independence_intercept.to_numpy(),
                counter
            ))
        inflations[inflation] = round(inflation_value, 3)
        # Increment index.
        counter += 1
        pass
    # Assemble the flat summary record of the regression's statistics.
    summary = {
        "independence": ";".join(independence),
        "freedom": pail_raw.df_model,
        "observations": pail_raw.nobs,
        "samples": count_samples,
        "r_square": pail_raw.rsquared,
        "r_square_adjust": pail_raw.rsquared_adj,
        "log_likelihood": pail_raw.llf,
        "akaike": pail_raw.aic,
        "bayes": pail_raw.bic,
        "condition": pail_raw.condition_number,
    }
    summary.update(parameters)
    summary.update(parameter_errors)
    summary.update(parameter_intervals)
    summary.update(parameter_ranges)
    summary.update(probabilities)
    summary.update(inflations)
    # Compile information.
    pail = dict()
    pail["summary"] = summary
    pail["residuals"] = residuals
    # Return information.
    return pail
def execute_procedure(
    path_dock=None,
):
    """
    Function to execute module's main behavior.

    Collects metabolites' polygenic scores across the UK Biobank at multiple
    probability thresholds and writes the product tables to file.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Report version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 2")
    # Pause procedure.
    time.sleep(5.0)
    # Initialize directories.
    paths = initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = read_source_initial(
        path_dock=path_dock,
        report=False,
    )
    if False:
        # Test the aggregation method for a single metabolite.
        # M00599: pyruvate
        # M32315: serine
        # M02342: serotonin
        # M00054: tryptophan
        pail_test = read_aggregate_test_metabolite_genetic_scores(
            metabolite="M00054",
            metabolites_files_paths=source["metabolites_files_paths"],
            report=True,
        )
    # Collect metabolites' genetic scores, and aggregate these by singular value
    # decomposition (SVD).
    # pail_metabolites_scores
    table_scores = read_aggregate_collect_metabolites_genetic_scores(
        metabolites_files_paths=source["metabolites_files_paths"],
    )
    print("printing after read_aggregate_collect_metabolites_genetic_scores")
    print(table_scores)
    # TODO: temporarily by-pass the aggregation process...
    # Collect metabolites' genetic scores at multiple PRS p-value thresholds.
    # Column names encode the thresholds: "X1e.05" is p < 1e-05, "X0.0001" is
    # p < 0.0001, and so on.
    table_prs_0_00001 = read_select_collect_metabolites_genetic_scores(
        selection="X1e.05",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_0001 = read_select_collect_metabolites_genetic_scores(
        selection="X0.0001",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_001 = read_select_collect_metabolites_genetic_scores(
        selection="X0.001",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_01 = read_select_collect_metabolites_genetic_scores(
        selection="X0.01",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_1 = read_select_collect_metabolites_genetic_scores(
        selection="X0.1",  # "X0.001", "X0.01", "X0.1"
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    # Collect information.
    information = dict()
    information["metabolites_files_paths"] = source["metabolites_files_paths"]
    information["table_prs_0_00001"] = table_prs_0_00001
    information["table_prs_0_0001"] = table_prs_0_0001
    information["table_prs_0_001"] = table_prs_0_001
    information["table_prs_0_01"] = table_prs_0_01
    information["table_prs_0_1"] = table_prs_0_1
    # TODO: eventually, include a dictionary collection of a table for each
    # metabolite
    #information["pail_metabolites_scores_tables"] = pail
    # Write product information to file.
    write_product(
        paths=paths,
        information=information
    )
    pass
def drive_cohort_model_linear_regression(
    table=None,
    table_cohorts_models=None,
    cohort=None,
    model=None,
    dependence=None,
    report=None,
):
    """
    Drives a linear regression for one cohort, model, and dependent variable.

    Table format must have samples (cases, observations) across rows and
    dependent and independent variables (features) across columns.

    arguments:
        table (object): Pandas data frame of dependent and independent
            variables (features) across columns and samples (cases,
            observations) within a specific cohort across rows
        table_cohorts_models (object): Pandas data frame of cohorts, models,
            dependent variables, and independent variables for regression
        cohort (str): name of a stratification cohort for regression analysis
        model (str): name of a model for regression analysis, normally
            "complex", "simple", or "unadjust"
        dependence (str): name of table's column for dependent variable
        report (bool): whether to print reports

    raises:

    returns:
        (dict): information from regressions

    """

    def _print_report_header(specification):
        # Shared report header for both the match and no-match branches.
        utility.print_terminal_partition(level=2)
        print("report: ")
        print("drive_cohort_model_linear_regression()")
        utility.print_terminal_partition(level=5)
        print("cohort: " + str(cohort))
        print("model: " + str(model))
        print("dependent variable: " + str(dependence))
        print("independent variables: ")
        print(specification["independence"])
        utility.print_terminal_partition(level=5)

    # Look up this cohort's and model's variables in the reference table.
    specification = determine_cohort_model_variables_from_reference_table(
        cohort=cohort,
        model=model,
        dependence=dependence,
        table=table_cohorts_models,
        report=report,
    )
    if not specification["match"]:
        # No matching specification: return placeholder (missing) values.
        if report:
            _print_report_header(specification)
            print("Missing information for model...")
            utility.print_terminal_partition(level=5)
        return create_regression_missing_values(
            dependence=dependence,
            independence=specification["independence"],
        )
    # Matching specification: perform the regression.
    if report:
        _print_report_header(specification)
    return drive_organize_table_regress_linear_ordinary_least_squares(
        dependence=dependence,
        independence=specification["independence"],
        standard_scale=True,
        threshold_samples=50,
        table=table,
        report=report,
    )
def read_source_metabolite_genetic_scores(
    path_file=None,
    report=None,
):
    """
    Reads and organizes source information from file.

    Notice that Pandas does not accommodate missing values within series of
    integer variable types.

    arguments:
        path_file (str): path to a metabolite's file of polygenic scores
        report (bool): whether to print reports

    raises:

    returns:
        (object): source information

    """

    # Read information from file.
    # Columns "X..." hold polygenic scores at different probability
    # thresholds; identifiers read as strings to avoid numeric coercion.
    variables_types = {
        "FID": "string",
        "IID": "string",
        "X5e.08": "float32",
        "X1e.07": "float32",
        "X1e.06": "float32",
        "X1e.05": "float32",
        "X0.0001": "float32",
        "X0.001": "float32",
        "X0.01": "float32",
        "X0.05": "float32",
        "X0.1": "float32",
        "X0.2": "float32",
        "X1": "float32",
    }
    table = pandas.read_csv(
        path_file,
        sep=r"\s+",  # ",", "\t", r"\s+"; raw string avoids invalid "\s" escape
        header=0,
        dtype=variables_types,
        na_values=["NA", "<NA>"],
        keep_default_na=True,
        compression=None,  # "gzip"
    )
    # Report.
    if report:
        # Report only for a few metabolites.
        metabolites = ["M00599", "M32315", "M02342", "M00054"]
        match = any(list(map(
            lambda metabolite: (metabolite in path_file), metabolites
        )))
        if match:
            utility.print_terminal_partition(level=2)
            print("Path: " + str(path_file))
            print("raw table for example metabolites:")
            print(table)
            utility.print_terminal_partition(level=3)
            print(table.columns.to_list())
            utility.print_terminal_partition(level=3)
            print("variable types:")
            print(table.dtypes)
            utility.print_terminal_partition(level=3)
    # Compile and return information.
    return table