# Standard-library and third-party imports that these examples assume;
# "utility", "ukb_importation", "ukb_organization", and "ukb_strat" are
# project-internal modules.
import math
import os
import pickle
import time

import numpy
import pandas

def organize_principal_component_factor_table(
    factors=None,
    prefix=None,
    index=None,
    index_name=None,
    report=None,
):
    """
    Organizes a table of factors or scores from Principal Component Analysis.

    arguments:
        factors (object): NumPy array matrix of Principal Component factors
        prefix (str): prefix for names of component columns
        index (object): NumPy array of indices for table
        index_name (str): name for table's index column
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of Principal Component factors

    """

    # Copy information.
    factors = numpy.copy(factors)
    # Organize information.
    count = 1
    columns = list()
    for component in range(0, factors.shape[1], 1):
        column = str(prefix + str(count))
        columns.append(column)
        count += 1
    table = pandas.DataFrame(
        data=factors,
        index=index,
        columns=columns,
        dtype="float32",
        copy=True,
    )
    table.rename_axis(
        index=index_name,
        axis="index",
        copy=False,
        inplace=True,
    )
    table.reset_index(
        level=None,
        inplace=True
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Matrix before organization:")
        print(factors)
        utility.print_terminal_partition(level=3)
        print("Table after organization:")
        print(table)
    # Return.
    return table
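
# A minimal usage sketch for the function above; the factor matrix and the
# identifiers here are hypothetical, purely for illustration.
def demonstrate_organize_principal_component_factor_table():
    factors = numpy.array([
        [0.5, -1.2],
        [1.1, 0.3],
        [-0.7, 0.9],
        [0.2, -0.4],
    ])
    index = numpy.array(["1001", "1002", "1003", "1004"])
    table = organize_principal_component_factor_table(
        factors=factors,
        prefix="component_",
        index=index,
        index_name="identifier_ukb",
        report=False,
    )
    # Expect columns: "identifier_ukb", "component_1", "component_2".
    print(table)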
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)

    # Execute assembly procedure from uk_biobank package.
    ukb_importation.execute_procedure(path_dock=path_dock)
    utility.print_terminal_partition(level=1)
    print("From package 'uk_biobank', procedure 'importation' is complete.")

    pass
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    # Report version.
    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)

    # Define phenotype studies.
    phenotype_studies = [
        "30124842_yengo_2018",
        "30239722_pulit_2018",
        "30482948_walters_2018_all",
        "30482948_walters_2018_eur",
        "30482948_walters_2018_eur_unrel",
        "30718901_howard_2019",
        "29906448_ruderfer_2018_scz_vs_ctl",
        "29906448_ruderfer_2018_scz_bpd_vs_ctl",
        "29906448_ruderfer_2018_scz_vs_bpd",
        "29906448_ruderfer_2018_bpd_vs_ctl",
        "00000000_ripke_2021",
        "31043756_stahl_2019",
        "00000000_mullins_2021_all",
        "00000000_mullins_2021_bpd1",
        "00000000_mullins_2021_bpd2",
    ]
    # Define metabolite studies.
    metabolite_studies = [
        #"24816252_shin_2014",
        "27005778_kettunen_2016",
        #"33437055_panyard_2021",
    ]
    for phenotype_study in phenotype_studies:
        for metabolite_study in metabolite_studies:
            drive_collection_report_phenotype_metabolite_studies(
                phenotype_study=phenotype_study,
                metabolite_study=metabolite_study,
                path_dock=path_dock,
                report=True,
            )
            pass
        pass
    pass
def read_source(
    path_dock=None,
    report=None,
):
    """
    Reads and organizes source information from file.

    Notice that Pandas does not accommodate missing values within series of
    integer variable types.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files
        report (bool): whether to print reports

    raises:

    returns:
        (object): source information

    """

    # Specify directories and files.
    path_table_phenotypes = os.path.join(path_dock, "organization",
                                         "table_phenotypes.pickle")
    path_table_metabolites_names = os.path.join(
        path_dock, "organization", "table_metabolites_names.pickle")
    path_metabolites_valid = os.path.join(path_dock, "organization",
                                          "metabolites_valid.pickle")

    ##################################
    path_table_metabolites_scores = os.path.join(
        path_dock, "aggregation", "selection",
        "table_metabolites_scores_prs_0_0001.pickle")
    utility.print_terminal_partition(level=1)
    print("PRS pvalue: 0.0001")
    utility.print_terminal_partition(level=2)
    # Pause procedure.
    time.sleep(5.0)
    ##############################################

    # Read information from file.
    table_phenotypes = pandas.read_pickle(path_table_phenotypes)
    table_metabolites_scores = pandas.read_pickle(
        path_table_metabolites_scores)
    table_metabolites_names = pandas.read_pickle(path_table_metabolites_names)
    with open(path_metabolites_valid, "rb") as file_source:
        metabolites_valid = pickle.load(file_source)
    # Compile and return information.
    return {
        "table_phenotypes": table_phenotypes,
        "table_metabolites_scores": table_metabolites_scores,
        "table_metabolites_names": table_metabolites_names,
        "metabolites_valid": metabolites_valid,
    }
def read_aggregate_test_metabolite_genetic_scores(
    metabolite=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads a metabolite's genetic scores across the UK Biobank from file,
    aggregates scores by Singular Value Decomposition (SVD), and tests this
    method.

    arguments:
        metabolite (str): identifier of a metabolite
        metabolites_files_paths (dict<list<str>>): collection of files and paths
            for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information

    """

    # Aggregate metabolite's genetic scores.
    table_aggregation = read_aggregate_metabolite_genetic_scores(
        metabolite=metabolite,
        metabolites_files_paths=metabolites_files_paths,
        report=report,
    )
    # Copy information.
    table = table_aggregation.copy(deep=True)
    # Organize information.
    table.dropna(
        axis="index",
        how="any",
        subset=["identifier_ukb"],
        inplace=True,
    )
    table.set_index(
        "identifier_ukb",
        drop=True,
        inplace=True,
    )
    # Report.
    if report:
        # Column name translations.
        utility.print_terminal_partition(level=2)
        print("reporting from: read_aggregate_test_metabolite_genetic_score()")
        print("blah blah...")
        print(table)
        utility.print_terminal_partition(level=3)
    # Compile information.
    pail = dict()
    pail["table"] = table
    # Return.
    return pail
def remove_null_records_standardize_variables_scales(
    table=None,
    report=None,
):
    """
    Removes records with null values and standardizes variables' scales.

    arguments:
        table (object): Pandas data frame of dependent and independent variables
            for regression
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of dependent and independent variables for
            regression

    """

    # Copy information.
    table = table.copy(deep=True)
    # Drop any rows with null keys.
    table.dropna(
        axis="index",
        how="any",
        subset=None,
        inplace=True,
    )
    # Standardize variables' scales.
    table_scale = utility.standardize_table_values_by_column(
        table=table,
        report=report,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report source: remove_null_records_standardize_variables_scales()"
        )
        utility.print_terminal_partition(level=3)
        print("Table after removal of records with null values...")
        print("... and after standardizing scale of each variable:")
        print(table_scale)
        pass
    # Return information.
    return table_scale
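
# The standardization above delegates to the project-internal utility module.
# A minimal pandas sketch, assuming "standardize_table_values_by_column"
# performs a column-wise z-score transform (subtract mean, divide by standard
# deviation):
def standardize_table_values_by_column_sketch(table=None):
    table = table.copy(deep=True)
    for column in table.columns:
        mean = table[column].mean()
        deviation = table[column].std()
        table[column] = (table[column] - mean) / deviation
    return table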
# Example 7
def organize_principal_component_variance_proportion_table(
    variance_proportions=None,
    prefix=None,
    index_name=None,
    report=None,
):
    """
    Organizes a table of proportion of variance explained by each Eigenvector
    and Principal Component factor.

    arguments:
        variance_proportions (object): NumPy array of proportions of variance
            explained
        prefix (str): prefix for names of component columns
        index_name (str): name for table's index column
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information

    """

    # Copy information.
    variance_proportions = numpy.copy(variance_proportions).tolist()
    # Organize information.
    count = 1
    records = list()
    for variance_proportion in variance_proportions:
        record = dict()
        record[index_name] = str(prefix + str(count))
        record["variance_proportion"] = variance_proportion
        records.append(record)
        count += 1
    table = pandas.DataFrame(data=records)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " +
              "organize_principal_component_variance_proportion_table()")
        utility.print_terminal_partition(level=3)
        print("Table after organization:")
        print(table)
    # Return.
    return table
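
# A brief usage sketch for the function above; the variance proportions are
# hypothetical values from a three-component decomposition.
def demonstrate_organize_variance_proportion_table():
    variance_proportions = numpy.array([0.6, 0.3, 0.1])
    table = organize_principal_component_variance_proportion_table(
        variance_proportions=variance_proportions,
        prefix="component_",
        index_name="component",
        report=False,
    )
    # Expect rows "component_1", "component_2", "component_3" with their
    # respective proportions.
    print(table)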
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 3")

    # TODO: steps to complete here...
    # 1. write script to move metabolite GEM PRSs to a new directory in "dock"
    # 2. read all contents of the PRSice output directory (need to move to "dock" first)
    # 3. filter to keep only the relevant files... probably on basis of file suffixes
    # 4. iterate on the files in the directory...
    # --- collect information for each metabolite during iteration (sort of a collection)
    # 5. for each metabolite, read PRS table in Pandas, specifying gzip compression
    # 6. calculate PRS-PCA by modification of Coombes' method
    # 7. keep PC1 and collect within a dataframe for all metabolites

    # Initialize directories.
    paths = initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = read_source(
        path_dock=path_dock,
        report=True,
    )

    # Collect information.
    information = dict()
    # Write product information to file.
    write_product(paths=paths, information=information)
    pass
# Example 9
def calculate_principal_component_explanation_variance_proportions(
    eigenvalues=None,
    report=None,
):
    """
    Calculates the proportion of variance explained by Eigenvectors of Principal
    Components Analysis (PCA).

    Sum of proportional variance explained across all Eigenvectors is one.

    arguments:
        eigenvalues (object): NumPy array of Eigenvalues
        report (bool): whether to print reports

    raises:

    returns:
        (object): NumPy array of proportions of variance explained

    """
    def divide_by_total(value, total):
        return (value / (total))

    array_divide_by_total = numpy.vectorize(divide_by_total)

    # Copy information.
    eigenvalues = numpy.copy(eigenvalues)
    # Calculate total variance across all Eigenvectors.
    variance_total = numpy.sum(eigenvalues)
    # Calculate proportional variance across Eigenvectors.
    variance_proportions = array_divide_by_total(eigenvalues, variance_total)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "calculate_principal_component_explanation_variance_proportions()")
        utility.print_terminal_partition(level=2)
        print("Eigenvalues...")
        print(eigenvalues)
        print("Variance proportions...")
        print(variance_proportions)
    # Return.
    return variance_proportions
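
# A quick numerical check of the function above: with Eigenvalues 4, 2, and 2,
# the proportions are 0.5, 0.25, and 0.25, and their sum is one.
def demonstrate_variance_proportions():
    eigenvalues = numpy.array([4.0, 2.0, 2.0])
    proportions = (
        calculate_principal_component_explanation_variance_proportions(
            eigenvalues=eigenvalues,
            report=False,
    ))
    assert numpy.isclose(numpy.sum(proportions), 1.0)
    print(proportions)  # [0.5, 0.25, 0.25]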
# Example 10
def calculate_principal_component_loadings(
    eigenvectors=None,
    eigenvalues=None,
    s_singular_values=None,
    vt_right_singular_vectors_rows=None,
    count_samples=None,
    report=None,
):
    """
    Calculates Principal Components Analysis (PCA) loadings.

    arguments:
        eigenvectors (object): NumPy matrix of Eigenvectors
        eigenvalues (object): NumPy array of Eigenvalues
        s_singular_values (object): Numpy matrix of Singular Values
        vt_right_singular_vectors_rows (object): Numpy matrix
        count_samples (float): count of samples in the original source matrix
            for Singular Value Decomposition
        report (bool): whether to print reports

    raises:

    returns:
        (object): Numpy array of loadings

    """

    # Eigenvectors and Eigenvalues.
    loadings_eigen = calculate_loadings_from_eigenvalues_eigenvectors(
        eigenvectors=eigenvectors,
        eigenvalues=eigenvalues,
        report=report,
    )
    # Raw decomposition factors.
    loadings_factor = calculate_loadings_from_decomposition_factors(
        s_singular_values=s_singular_values,
        vt_right_singular_vectors_rows=vt_right_singular_vectors_rows,
        count_samples=count_samples,
        report=report,
    )

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " + "calculate_principal_component_loadings()")
        utility.print_terminal_partition(level=3)
        print("Shape of Eigen loadings: " + str(loadings_eigen.shape))
        print("Shape of factor loadings: " + str(loadings_factor.shape))
        utility.print_terminal_partition(level=4)
        print("Compare loadings from both methods: ")
        print(
            numpy.allclose(
                loadings_eigen,
                loadings_factor,
                rtol=1e-2,
                atol=1e-3,
                equal_nan=False,
            ))
    # Return.
    return loadings_eigen
# Example 11
def calculate_loadings_from_eigenvalues_eigenvectors(
    eigenvectors=None,
    eigenvalues=None,
    report=None,
):
    """
    Calculates Principal Components Analysis (PCA) loadings from Eigenvectors
    and Eigenvalues.

    Statsmodels erroneously returns "loadings" that have identical values and
    dimensions as the Eigenvectors; however, Eigenvectors and Loadings are not
    equivalent.

    loadings = eigenvectors [dot] square_root(eigenvalues)
    Loadings include aspects of both direction (eigenvectors) and scale
    (eigenvalues).

    arguments:
        eigenvectors (object): NumPy matrix of Eigenvectors
        eigenvalues (object): NumPy array of Eigenvalues
        report (bool): whether to print reports

    raises:

    returns:
        (object): Numpy array of loadings

    """

    # Copy information.
    eigenvectors = numpy.copy(eigenvectors)
    eigenvalues = numpy.copy(eigenvalues)
    # Calculate square roots of Eigenvalues.
    # Organize a diagonal matrix of square roots of Eigenvalues.
    eigenvalues_square_root = numpy.sqrt(eigenvalues)
    eigenvalues_root_diagonal = numpy.diag(eigenvalues_square_root)
    # Calculate loadings.
    loadings = numpy.dot(eigenvectors, eigenvalues_root_diagonal)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " +
              "calculate_loadings_from_eigenvalues_eigenvectors()")
        utility.print_terminal_partition(level=3)
        print("Shape of loadings: " + str(loadings.shape))
        utility.print_terminal_partition(level=4)
        print(
            "Loadings = Eigenvectors [dot] square_root(diagonal Eigenvalues)")
        print(loadings)
    # Return.
    return loadings
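
# A small worked example of the formula above; with an identity matrix of
# Eigenvectors (hypothetical), the loadings reduce to the square roots of the
# Eigenvalues on the diagonal.
def demonstrate_loadings_from_eigen():
    eigenvectors = numpy.identity(2)
    eigenvalues = numpy.array([4.0, 1.0])
    loadings = calculate_loadings_from_eigenvalues_eigenvectors(
        eigenvectors=eigenvectors,
        eigenvalues=eigenvalues,
        report=False,
    )
    # Expect [[2.0, 0.0], [0.0, 1.0]].
    print(loadings)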
# Example 12
def calculate_loadings_from_decomposition_factors(
    s_singular_values=None,
    vt_right_singular_vectors_rows=None,
    count_samples=None,
    report=None,
):
    """
    Calculates Principal Components Analysis (PCA) loadings from direct factors
    of Singular Value Decomposition.

    arguments:
        s_singular_values (object): Numpy matrix of Singular Values
        vt_right_singular_vectors_rows (object): Numpy matrix
        count_samples (float): count of samples in the original source matrix
            for Singular Value Decomposition
        report (bool): whether to print reports

    raises:

    returns:
        (object): Numpy array of loadings

    """
    def divide_by_sample_count(value, count_samples):
        return (value / math.sqrt(count_samples - 1))

    array_divide_by_sample_count = numpy.vectorize(divide_by_sample_count)

    # Copy information.
    s = numpy.copy(s_singular_values)
    vt = numpy.copy(vt_right_singular_vectors_rows)
    v = numpy.transpose(vt)
    # Calculate loadings.
    # Loadings = V [dot] diagonal(S / square_root(samples - 1)).
    quotients = array_divide_by_sample_count(s, count_samples)
    quotients_diagonal = numpy.diag(quotients)
    loadings = numpy.dot(v, quotients_diagonal)
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from: " +
              "calculate_loadings_from_decomposition_factors()")
        utility.print_terminal_partition(level=3)
        print("Shape of loadings: " + str(loadings.shape))
        utility.print_terminal_partition(level=4)
        print("Loadings = (V [dot] (S / square_root(samples - 1)))")
        print(loadings)
    # Return.
    return loadings
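
# A sketch to verify that the two loading calculations above agree, assuming
# eigenvalue = (singular value squared / (samples - 1)) and Eigenvectors as
# the right singular vectors (columns of V); the matrix is hypothetical.
def demonstrate_loading_equivalence():
    matrix = numpy.random.default_rng(seed=1).normal(size=(10, 3))
    matrix = matrix - matrix.mean(axis=0)  # center columns
    count_samples = matrix.shape[0]
    u, s, vt = numpy.linalg.svd(matrix, full_matrices=False)
    eigenvalues = numpy.square(s) / (count_samples - 1)
    eigenvectors = numpy.transpose(vt)
    loadings_eigen = calculate_loadings_from_eigenvalues_eigenvectors(
        eigenvectors=eigenvectors,
        eigenvalues=eigenvalues,
        report=False,
    )
    loadings_factor = calculate_loadings_from_decomposition_factors(
        s_singular_values=s,
        vt_right_singular_vectors_rows=vt,
        count_samples=count_samples,
        report=False,
    )
    print(numpy.allclose(loadings_eigen, loadings_factor))  # True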
def combine_organize_phenotype_metabolites_summary_table(
    table_metabolite_reference=None,
    phenotype_heritability=None,
    table_metabolite_heritability=None,
    table_correlations=None,
    threshold_metabolite_heritability=None,
    threshold_false_discovery_rate=None,
    report=None,
):
    """
    Combines and organizes a summary table of metabolites' heritability
    estimates and genetic correlations against a phenotype of interest.

    arguments:
        table_metabolite_reference (object): Pandas data frame of metabolites'
            identifiers and names from study
        phenotype_heritability (dict): information about estimation of a
            phenotype's heritability
        table_metabolite_heritability (object): Pandas data frame of
            metabolites' heritability estimates
        table_correlations (object): Pandas data frame of genetic correlations
        threshold_metabolite_heritability (float): threshold for metabolite
            heritability
        threshold_false_discovery_rate (float): threshold for false discovery
            rate
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of metabolites' heritability estimates and
            genetic correlation estimates against a phenotype of interest

    """

    # Organize metabolite reference table.
    table_metabolite_reference = organize_metabolite_reference_table(
        table=table_metabolite_reference,
        identifier="identifier_study",
        name="name",
        identity="identity",
    )

    # Merge tables for metabolite references and heritabilities.
    # Merge data tables using database-style join.
    # Alternative is to use DataFrame.join().
    table_heritability = table_metabolite_reference.merge(
        table_metabolite_heritability,
        how="outer",
        left_on="identifier",
        right_on="identifier",
        suffixes=("_reference", "_heritability"),
    )

    # Merge tables for metabolite heritabilities and correlations.
    # Merge data tables using database-style join.
    # Alternative is to use DataFrame.join().
    table = table_heritability.merge(
        table_correlations,
        how="outer",
        left_on="identifier",
        right_on="identifier",
        suffixes=("_heritability", "_correlation"),
    )

    # Introduce columns for phenotype heritability.
    table["phenotype_heritability"] = phenotype_heritability["heritability"]
    table["phenotype_heritability_error"] = (
        phenotype_heritability["heritability_standard_error"])

    # Select table rows for metabolites with valid identities.
    table = table.loc[(table["identity"] == 1), :]
    # Select table rows for metabolites with valid heritability estimates.
    table = table.loc[(
        table["heritability"] >= threshold_metabolite_heritability), :]

    # Calculate False Discovery Rates (FDRs).
    table = utility.calculate_table_false_discovery_rates(
        threshold=threshold_false_discovery_rate,
        probability="correlation_probability",
        discovery="correlation_discovery",
        significance="correlation_significance",
        table=table,
    )
    # Sort table rows.
    table.sort_values(
        by=["correlation_absolute"],
        axis="index",
        ascending=False,
        na_position="last",
        inplace=True,
    )
    table.sort_values(
        by=[
            "correlation_discovery",
        ],
        axis="index",
        ascending=True,
        na_position="last",
        inplace=True,
    )
    # Sort table columns.
    columns_sequence = [
        #"identifier",
        "name",
        "correlation_discovery",
        "correlation",
        "correlation_standard_error",
        "heritability",
        "heritability_standard_error",
        "correlation_absolute",
        "correlation_probability",
        "phenotype_heritability",
        "phenotype_heritability_error",
        "heritability_ratio",
        "heritability_ratio_standard_error",
        "heritability_variants",
        "correlation_significance",
        "correlation_variants",
    ]
    table = table[[*columns_sequence]]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("combine_organize_phenotype_metabolites_summary_table()")
        print(table)
    # Return information.
    return table
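
# The merges above use pandas' database-style join; a compact sketch of the
# same pattern with hypothetical tables:
def demonstrate_outer_merge():
    table_left = pandas.DataFrame({
        "identifier": ["m1", "m2"],
        "name": ["alanine", "glycine"],
    })
    table_right = pandas.DataFrame({
        "identifier": ["m2", "m3"],
        "heritability": [0.12, 0.08],
    })
    table = table_left.merge(
        table_right,
        how="outer",
        left_on="identifier",
        right_on="identifier",
    )
    # An outer join keeps all identifiers; missing cells become NaN.
    print(table)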
def read_source(
    phenotype_study=None,
    metabolite_study=None,
    paths=None,
    report=None,
):
    """
    Reads and organizes source information from file.

    arguments:
        phenotype_study (str): identifier of main phenotype study
        metabolite_study (str): identifier of metabolite study
        paths (dict<str>): collection of paths to directories for procedure's
            files
        report (bool): whether to print reports

    raises:

    returns:
        (object): source information

    """

    # Metabolite reference table.
    path_table_metabolite_reference = os.path.join(
        paths["dock"], "parameters", "psychiatric_metabolism",
        "metabolite_reference", metabolite_study,
        "table_metabolite_reference.tsv")
    table_metabolite_reference = pandas.read_csv(
        path_table_metabolite_reference,
        sep="\t",
        header=0,
        #dtype="string",
    )
    # Phenotype heritability.
    phenotype_heritability = read_extract_phenotype_heritability(
        file="heritability_report.log",
        file_suffix="_heritability_report.log",
        path_source_directory=paths["heritability_studies"][phenotype_study],
    )
    # Metabolite heritability table.
    table_metabolite_heritability = read_collect_metabolites_heritabilities(
        file_suffix="_heritability_report.log",
        path_source_directory=paths["heritability_studies"][metabolite_study],
    )
    # Phenotype-metabolite correlation table.
    table_correlations = (
        read_collect_phenotype_metabolites_genetic_correlations(
            file_suffix="_correlation.log",
            path_source_directory=(paths["correlation_studies"]
                                   [phenotype_study][metabolite_study]),
        ))

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(table_correlations)
        utility.print_terminal_partition(level=2)
    # Compile and return information.
    return {
        "table_metabolite_reference": table_metabolite_reference,
        "phenotype_heritability": phenotype_heritability,
        "table_metabolite_heritability": table_metabolite_heritability,
        "table_correlations": table_correlations,
    }
def read_aggregate_metabolite_genetic_scores(
    metabolite=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads a metabolite's genetic scores across the UK Biobank from file,
    and aggregates these scores by Singular Value Decomposition (SVD).

    This function returns a table for a single metabolite with UK Biobank
    identifiers and a single column of aggregate scores for the metabolite
    across these UK Biobank records.

    arguments:
        metabolite (str): identifier of a metabolite
        metabolites_files_paths (dict<list<str>>): collection of files and paths
            for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of a metabolite's aggregate genetic scores
            across UK Biobank

    """

    # Read raw table of metabolite's genetic scores.
    metabolite_file_path = metabolites_files_paths[metabolite]["path"]
    table_raw = read_source_metabolite_genetic_scores(
        path_file=metabolite_file_path,
        report=report,
    )
    # Organize the raw table.
    table_raw.drop(
        labels=["IID",],
        axis="columns",
        inplace=True
    )
    # Translate column names.
    translations = dict()
    translations["FID"] = "identifier_ukb"
    table_raw.rename(
        columns=translations,
        inplace=True,
    )
    # Aggregate scores.
    table_aggregation = organize_aggregate_metabolite_genetic_scores(
        identifier=metabolite,
        column_index="identifier_ukb",
        columns_scores=[
            "X5e.08", "X1e.07", "X1e.06", "X1e.05", "X0.0001", "X0.001",
            "X0.01", "X0.05", "X0.1", "X0.2", "X1",
        ],
        table=table_raw,
        report=report,
    )
    # Report.
    if report:
        # Column name translations.
        utility.print_terminal_partition(level=2)
        print("Report from: read_aggregate_metabolite_genetic_scores()")
        utility.print_terminal_partition(level=2)
        print("Metabolite: " + str(metabolite))
        print(table_aggregation)
        utility.print_terminal_partition(level=3)
    # Return.
    return table_aggregation
def organize_principal_components_positive_sum_loadings(
    threshold_valid_proportion_per_column=None,
    table=None,
    report=None,
):
    """
    Organizes a Principal Components Analysis while forcing loadings to have a
    positive sum.

    arguments:
        threshold_valid_proportion_per_column (float): proportion of rows that
            must have a valid value for a column in order to keep the column
        table (object): Pandas data frame of variables (features) across
            columns and samples (cases, observations) across rows with an
            explicit index
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about the singular value
            decomposition

    """

    # Calculate original factors by Singular Value Decomposition (SVD).
    pail_original = (
        organize_principal_components_by_singular_value_decomposition(
            threshold_valid_proportion_per_column=(
                threshold_valid_proportion_per_column
            ),
            table=table,
            report=report,
    ))
    # Determine whether loadings have a positive sum.
    loadings_original = numpy.copy(pail_original["loadings"])
    sum_original = numpy.sum(loadings_original.flatten(order="C"))
    if (sum_original >= 0):
        loading_sign_flip = False
        loadings = loadings_original
    else:
        loading_sign_flip = True
        # TODO: invert signs of all of Vh (and U???)... then re-calculate
        # derived values.
        loadings = numpy.negative(loadings_original)

    # Organize principal component factors within table.
    # Note: the "factors" and "index" keys assume the collection that the
    # decomposition function returns.
    table_components = organize_principal_component_factor_table(
        factors=pail_original["factors"], # TODO: change to factors after sign adjustment
        prefix="component_",
        index=pail_original["index"],
        index_name="identifier_ukb",
        report=True,
    )

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "organize_principal_components_positive_sum_loadings()"
        )
        utility.print_terminal_partition(level=2)
        pass
    # Compile information.
    # Note: these keys assume the collection that the decomposition function
    # returns.
    pail = dict()
    pail["table_scale"] = pail_original["table_scale"]
    pail["u"] = pail_original["u"]
    pail["singular_values"] = pail_original["singular_values"]
    pail["vh"] = pail_original["vh"]
    pail["eigenvalues"] = pail_original["eigenvalues"]
    pail["eigenvectors"] = pail_original["eigenvectors"]
    pail["loadings"] = loadings
    pail["loading_sign_flip"] = loading_sign_flip
    pail["table_components"] = table_components
    # Return.
    return pail
# Example 17
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)

    # Initialize directories.
    paths = ukb_organization.initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = ukb_organization.read_source(
        source="importation",
        path_dock=path_dock,
        report=True,
    )
    # Organize variables for persons' genotypes, sex, age, and body mass index
    # across the UK Biobank.
    pail_basis = ukb_organization.execute_genotype_assessment_basis(
        table=source["table_phenotypes"],
        path_dock=path_dock,
        report=True,
    )
    if True:
        # Organize variables for persons' sex hormones across the UK Biobank.
        pail_hormone = ukb_organization.execute_sex_hormones(
            table=pail_basis["table"],
            path_dock=path_dock,
            report=True,
        )
        # Organize variables for female menstruation across the UK Biobank.
        pail_female = ukb_organization.execute_female_menstruation(
            table=pail_hormone["table"],
            report=True,
        )
        # Organize variables for persons' alcohol consumption across the UK Biobank.
        pail_alcohol = ukb_organization.execute_alcohol(
            table=pail_female["table"],
            report=True,
        )
        # Organize variables for persons' mental health across the UK Biobank.
        pail_psychology = ukb_organization.execute_psychology_psychiatry(
            table=pail_alcohol["table"],
            path_dock=path_dock,
            report=True,
        )
        #print(pail_psychology["table_clean"].columns.to_list())
    if False:
        # Organize variables for persons' sex hormones across the UK Biobank.
        pail_hormone = ukb_organization.execute_sex_hormones(
            table=pail_basis["table"],
            path_dock=path_dock,
            report=True,
        )

    # Collect information.
    information = dict()
    information["organization"] = dict()
    #information["organization"]["table_phenotypes"] = pail_basis["table"]
    #information["organization"]["table_phenotypes"] = pail_hormone["table"]
    #information["organization"]["table_phenotypes"] = pail_female["table"]
    information["organization"]["table_phenotypes"] = pail_psychology["table"]
    # Write product information to file.
    ukb_organization.write_product(paths=paths, information=information)
    pass
def adjust_singular_value_decomposition_factor_signs(
    matrix=None,
    singular_values=None,
    left_singular_vectors_columns=None,
    right_singular_vectors_rows=None,
    report=None,
):
    """
    Adjusts the otherwise random signs of factors from Singular Value
    Decomposition (SVD) to reduce the directional ambiguity.

    Reference:
    Bro, Acar, & Kolda, "Resolving the sign ambiguity in the singular value
    decomposition", Journal of Chemometrics, 2008.

    arguments:
        matrix (object): NumPy array matrix of original values across samples
            (rows, dimension 0) and variables (columns, dimension 1)
        singular_values (object): NumPy array of Singular Values from SVD
        left_singular_vectors_columns (object): NumPy array matrix with SVD left
            singular vectors as columns, U
        right_singular_vectors_rows (object): NumPy array matrix with SVD right
            singular vectors as rows, VT or Vh
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about the singular value
            decomposition

    """

    # Copy information.
    matrix = numpy.copy(matrix)
    s = numpy.copy(singular_values)
    u = numpy.copy(left_singular_vectors_columns)
    vt = numpy.copy(right_singular_vectors_rows)
    # Organize information.
    matrix_transpose = numpy.transpose(matrix)
    s_diagonal = numpy.diag(s)
    ut = numpy.copy(numpy.transpose(u))
    v = numpy.copy(numpy.transpose(vt))
    # Calculate basic products by matrix multiplication.
    ut_y = numpy.dot(ut, matrix)
    vt_y = numpy.dot(vt, matrix_transpose)
    # Reduce values to indicators of positive and negative signs.
    ut_y_sign = numpy.sign(ut_y)
    vt_y_sign = numpy.sign(vt_y)
    # Calculate squares of matrices.
    # Calculation of square by matrix multiplication is only possible for
    # square matrices.
    # Instead calculate the squares of all individual values in the matrices.
    ut_y_square = numpy.square(ut_y)
    vt_y_square = numpy.square(vt_y)
    # Calculate left and right sign scores for each factor: the sum across
    # samples of sign-weighted squares (Bro, Acar, & Kolda, 2008).
    signs_left = numpy.sum(ut_y_sign * ut_y_square, axis=1)
    signs_right = numpy.sum(vt_y_sign * vt_y_square, axis=1)
    # Where left and right scores disagree, adopt the sign of the score with
    # the greater magnitude.
    signs = numpy.where(
        numpy.abs(signs_left) >= numpy.abs(signs_right),
        numpy.sign(signs_left),
        numpy.sign(signs_right),
    )
    signs[signs == 0] = 1
    # Flip each column of U together with the corresponding row of VT so that
    # the product U [dot] diagonal(S) [dot] VT is unchanged.
    u_prime = u * signs
    vt_prime = vt * signs[:, None]


    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "adjust_singular_value_decomposition_factor_signs()"
        )
        utility.print_terminal_partition(level=2)
        print("Shape of original matrix: " + str(matrix.shape))
        print("rows (dimension 0): samples (cases, observations)")
        print("columns (dimension 1): variables (features)")
        utility.print_terminal_partition(level=4)
        print("Shape of matrix Sigma (singular values): " + str(s.shape))
        utility.print_terminal_partition(level=4)
        print("Shape of matrix U (left singular vectors): " + str(u.shape))
        print(
            "Shape of matrix UT (transpose left singular vectors): " +
            str(ut.shape)
        )
        utility.print_terminal_partition(level=4)
        print(
            "Shape of matrix VT (transpose right singular vectors): " +
            str(vt.shape)
        )
        print("Shape of matrix V (right singular vectors): " + str(v.shape))
        utility.print_terminal_partition(level=4)
        print("Shape of matrix UT-Y (product): " + str(ut_y.shape))
        print("Shape of matrix UT-Y square: " + str(ut_y_square.shape))
        print("Shape of matrix VT-Y (product): " + str(vt_y.shape))
        print("Shape of matrix VT-Y square: " + str(vt_y_square.shape))
        utility.print_terminal_partition(level=4)
        print("Shape of left signs matrix: " + str(signs_left.shape))
        print("Shape of right signs matrix: " + str(signs_right.shape))
        pass
    # Compile information.
    pail = dict()
    pail["matrix"] = matrix
    pail["left_singular_vectors_columns"] = u_prime
    pail["singular_values"] = s
    pail["right_singular_vectors_rows"] = vt_prime
    # Return.
    return pail
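
# A small check of the sign convention above: flipping the sign of one column
# of U together with the corresponding row of VT leaves the reconstruction
# U [dot] diagonal(S) [dot] VT unchanged; the matrix is hypothetical.
def demonstrate_svd_sign_invariance():
    matrix = numpy.random.default_rng(seed=2).normal(size=(6, 3))
    u, s, vt = numpy.linalg.svd(matrix, full_matrices=False)
    signs = numpy.array([1.0, -1.0, 1.0])
    u_prime = u * signs
    vt_prime = vt * signs[:, None]
    reconstruction = numpy.dot(u_prime, numpy.dot(numpy.diag(s), vt_prime))
    print(numpy.allclose(reconstruction, matrix))  # True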
# Example 19
def drive_organize_table_regress_linear_ordinary_least_squares(
    dependence=None,
    independence=None,
    standard_scale=None,
    threshold_samples=None,
    table=None,
    report=None,
):
    """
    Drive the organization of a table and regression.

    Table format must have samples (cases, observations) across rows and
    dependent and independent variables (features) across columns.

    arguments:
        dependence (str): name of table's column for dependent variable
        independence (list<str>): names of table's columns for independent
            variables
        standard_scale (bool): whether to standardize the scale of variables
            to z-scores before regression
        threshold_samples (float): minimal count of samples with non-missing
            values of dependent and independent variables to perform regression
        table (object): Pandas data frame of dependent and independent variables
            for regression
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of regression's residuals and statistics
    """

    # Organize table for regression.
    pail_organization = organize_table_cohort_model_variables_for_regression(
        dependence=dependence,
        independence=independence,
        standard_scale=standard_scale,
        table=table,
        report=False,
    )
    # Determine whether dependent and independent variables (features) have
    # sufficient observations for regression.
    if (pail_organization["count_samples"] >= threshold_samples):
        pail_regression = regress_linear_ordinary_least_squares(
            dependence=dependence,
            independence=pail_organization["independence"],
            table=pail_organization["table"],
            report=report,
        )
    else:
        # Report.
        if report:
            utility.print_terminal_partition(level=2)
            print("report: ")
            function_name = str("drive_organize_table_regress_linear_" +
                                "ordinary_least_squares()")
            print(function_name)
            utility.print_terminal_partition(level=5)
            print("Missing information for model...")
            print("There may be inadequate samples with non-missing values " +
                  "or adequate variance in relevant variables.")
        pail_regression = create_regression_missing_values(
            dependence=dependence,
            independence=independence,
        )
    # Return information.
    return pail_regression
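
# The driver above delegates to regress_linear_ordinary_least_squares(); a
# minimal free-standing sketch of the same technique with statsmodels, using
# hypothetical column names for the dependent and independent variables:
import statsmodels.api

def demonstrate_ordinary_least_squares(table=None):
    # Extract the dependent variable and keep independent variables in a
    # Pandas data frame to preserve their names.
    values_dependence = table["outcome"].to_numpy()
    table_independence = table.loc[:, ["predictor_one", "predictor_two"]]
    # Introduce a constant column for the intercept.
    table_independence = statsmodels.api.add_constant(
        table_independence,
        prepend=True,
    )
    model = statsmodels.api.OLS(values_dependence, table_independence)
    result = model.fit()
    print(result.summary())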
# Example 20
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 1")
    # Pause procedure.
    time.sleep(5.0)

    # Initialize directories.
    paths = ukb_strat.initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = read_source(
        path_dock=path_dock,
        report=True,
    )

    # Select and organize variables across cohorts.
    # Organize phenotypes and covariates in format for analysis in PLINK.

    # Reference population.
    if True:
        pail_population = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="reference_population",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_population = dict()
        pass

    # Vitamin D.
    if True:
        pail_vitamin_d_linear = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_linear",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_vitamin_d_linear = dict()
        pass
    if True:
        pail_vitamin_d_logistic = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_logistic",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_vitamin_d_logistic = dict()
        pass

    # Hormones and their regulatory proteins.
    if False:
        pail_hormones_linear = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_linear",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_hormones_linear = dict()
        pass
    if False:
        pail_hormones_logistic = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="vitamin_d_logistic",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_hormones_logistic = dict()
        pass

    # Body mass index (BMI) in Bipolar Disorder.
    if False:
        pail_bipolar_linear = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="bipolar_body_linear",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_bipolar_linear = dict()
        pass
    if False:
        pail_bipolar_logistic = (
            ukb_strat.execute_stratify_genotype_cohorts_plink_format_set(
                table=source["table_phenotypes"],
                set="bipolar_body_logistic",
                path_dock=path_dock,
                report=True,
            ))
    else:
        pail_bipolar_logistic = dict()
        pass

    # Collect information.
    information = dict()
    information["reference_population"] = pail_population
    information["vitamin_d_linear"] = pail_vitamin_d_linear
    information["vitamin_d_logistic"] = pail_vitamin_d_logistic
    information["hormones_linear"] = pail_hormones_linear
    information["hormones_logistic"] = pail_hormones_logistic
    information["body_bipolar_linear"] = pail_bipolar_linear
    information["body_bipolar_logistic"] = pail_bipolar_logistic
    # Write product information to file.
    ukb_strat.write_genotype_product(paths=paths, information=information)

    pass
def read_select_metabolite_genetic_scores(
    metabolite=None,
    selection=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads a metabolite's genetic scores across the UK Biobank from file,
    and selects the scores to keep.

    This function returns a table for a single metabolite with UK Biobank
    identifiers and a single column of selection scores for the metabolite
    across these UK Biobank records.

    arguments:
        metabolite (str): identifier of a metabolite
        selection (str): name of column for selection from Polygenic Score
            thresholds
        metabolites_files_paths (dict<list<str>>): collection of files and paths
            for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of a metabolite's aggregate genetic scores
            across UK Biobank

    """

    # Read raw table of metabolite's genetic scores.
    metabolite_file_path = metabolites_files_paths[metabolite]["path"]
    table_raw = read_source_metabolite_genetic_scores(
        path_file=metabolite_file_path,
        report=report,
    )
    # Organize the raw table.
    table_raw.drop(
        labels=["IID",],
        axis="columns",
        inplace=True
    )
    # Select scores.
    table_selection = table_raw.loc[
        :, table_raw.columns.isin(["FID", selection])
    ]
    # Translate column names.
    translations = dict()
    translations["FID"] = "identifier_ukb"
    translations[selection] = metabolite
    table_selection.rename(
        columns=translations,
        inplace=True,
    )
    # Report.
    if report:
        # Column name translations.
        utility.print_terminal_partition(level=2)
        print("Report from: read_select_metabolite_genetic_scores()")
        utility.print_terminal_partition(level=2)
        print("Metabolite: " + str(metabolite))
        print(table_selection)
        utility.print_terminal_partition(level=3)
    # Return.
    return table_selection
# Example 22
def execute_procedure(path_dock=None, ):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 21")
    # Pause procedure.
    time.sleep(5.0)

    # Initialize directories.
    paths = initialize_directories(
        restore=True,
        path_dock=path_dock,
    )

    path_table_kinship_pairs = os.path.join(path_dock, "access",
                                            "ukbiobank_phenotypes",
                                            "table_kinship_pairs.dat")
    table_kinship_pairs = pandas.read_csv(
        path_table_kinship_pairs,
        sep="\s+",
        header=0,
        dtype={
            "ID1": "string",
            "ID2": "string",
            "HetHet": "float32",
            "IBS0": "float32",
            "Kinship": "float32",
        },
    )
    path_table_kinship_pairs = os.path.join(path_dock, "assembly",
                                            "table_kinship_pairs.pickle")
    path_table_kinship_pairs_text = os.path.join(path_dock, "assembly",
                                                 "table_kinship_pairs.tsv")
    table_kinship_pairs.to_pickle(path_table_kinship_pairs)
    table_kinship_pairs.to_csv(
        path_or_buf=path_table_kinship_pairs_text,
        sep="\t",
        header=True,
        index=False,
    )

    # Read source information from file.
    table_kinship_pairs = ukb_strat.read_source_table_kinship_pairs(
        path_dock=path_dock,
        report=True,
    )
    print(table_kinship_pairs)

    pass
def drive_collection_report_phenotype_metabolite_studies(
    phenotype_study=None,
    metabolite_study=None,
    path_dock=None,
    report=None,
):
    """
    Drives the collection and report of information for a pair of phenotype
    and metabolite studies.

    arguments:
        phenotype_study (str): identifier of main phenotype study
        metabolite_study (str): identifier of metabolite study
        path_dock (str): path to dock directory for source and product
            directories and files
        report (bool): whether to print reports

    raises:

    returns:

    """

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("report: drive_collection_report_phenotype_metabolite_studies()")
        print(phenotype_study)
        print(metabolite_study)

    # Initialize directories.
    paths = initialize_directories(
        phenotype_study=phenotype_study,
        metabolite_study=metabolite_study,
        restore=False,
        path_dock=path_dock,
    )

    # Read source information from file.
    source = read_source(
        phenotype_study=phenotype_study,
        metabolite_study=metabolite_study,
        paths=paths,
        report=False,
    )

    # TODO: now combine and organize the various information containers from "read_source()"
    # TODO: build summary table.

    # name change?
    table_summary = combine_organize_phenotype_metabolites_summary_table(
        table_metabolite_reference=source["table_metabolite_reference"],
        phenotype_heritability=source["phenotype_heritability"],
        table_metabolite_heritability=source["table_metabolite_heritability"],
        table_correlations=source["table_correlations"],
        threshold_metabolite_heritability=0.05,
        threshold_false_discovery_rate=0.05,
        report=False,
    )

    # Report.
    if report:
        utility.print_terminal_partition(level=5)
        print(table_summary)

    # Collect information.
    information = dict()
    information["table_summary"] = table_summary
    # Write product information to file.
    write_product(phenotype_study=phenotype_study,
                  metabolite_study=metabolite_study,
                  paths=paths,
                  information=information)

    pass
def calculate_principal_components_from_singular_value_decomposition(
    singular_values=None,
    left_singular_vectors=None,
    right_singular_vectors=None,
    which_singular_vectors=None,
    table=None,
    report=None,
):
    """
    Calculates Principal Components and relevant information from raw factors of
    a Singular Value Decomposition (SVD).

    Reference:
    "https://stats.stackexchange.com/questions/134282/
    relationship-between-svd-and-pca-how-to-use-svd-to-perform-pca"

    arguments:
        singular_values (object): NumPy array of Singular Values from SVD
        left_singular_vectors (object): NumPy array matrix with left singular
            vectors as columns, U
        right_singular_vectors (object): NumPy array matrix with right
            singular vectors as rows, VT or Vh
        which_singular_vectors (str): which singular vectors, left or right,
            from which to calculate factors
        table (object): Pandas data frame of variables (features) across
            columns and samples (cases, observations) across rows with an
            explicit index, after final scaling and filtering for SVD
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about the singular value
            decomposition

    """
    # https://stats.stackexchange.com/questions/134282/relationship-between-svd-and-pca-how-to-use-svd-to-perform-pca
    # https://towardsdatascience.com/pca-and-svd-explained-with-numpy-5d13b0d2a4d8
    # https://towardsdatascience.com/singular-value-decomposition-and-its-applications-in-principal-component-analysis-5b7a5f08d0bd
    # http://www.math.ucsd.edu/~gptesler/283/slides/pca_18-handout.pdf
    # https://www.cc.gatech.edu/~lsong/teaching/CX4240spring16/pca_wall.pdf

    # Copy information.
    s = numpy.copy(singular_values)
    vh = numpy.copy(right_singular_vectors)
    count_samples = int(table.shape[0])
    # Calculate Eigenvalues.
    eigenvalues = (
        calculate_principal_component_eigenvalues_from_singular_values(
            singular_values=s,
            count_samples=count_samples,
            report=False,
    ))

    # TODO: calculate eigenvectors from the factor specified in
    # "which_singular_vectors"

    # Calculate Eigenvectors.
    # Eigenvectors are the right singular vectors of the original matrix.
    eigenvectors = numpy.copy(numpy.transpose(vh))

    # TODO: I'm not entirely sure that I'm sorting the correct dimension of
    # Eigenvectors...
    # TODO: sort dimension will depend on which singular vector selected

    # Sort Eigenvectors by order of decreasing Eigenvalues.
    pail_sort = sort_eigenvectors_by_decreasing_eigenvalues(
        eigenvectors=eigenvectors,
        eigenvalues=eigenvalues,
        report=False,
    )
    # Calculate loadings.
    loadings = calculate_principal_component_loadings_from_eigen_values_vectors(
        eigenvectors=pail_sort["eigenvectors"],
        eigenvalues=pail_sort["eigenvalues"],
        report=True,
    )
    loadings_direct = (
        calculate_principal_component_loadings_from_direct_factors(
            s=s,
            vh=vh,
            count_samples=count_samples,
            report=True,
    ))
    # Calculate Principal Components.
    # --> Calculate from U and S
    # or
    # --> Calculate from V and S

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "Report from: " +
            "calculate_principal_components_from_singular_value_decomposition()"
        )
        utility.print_terminal_partition(level=2)

        # Original matrix has shape (M, N)
        print(
            "Shape of original source table: " +
            str(table.shape)
        )
        # Eigenvalues.
        print("Shape of Eigenvalues: " + str(eigenvalues.shape))
        # Eigenvectors.
        print("Shape of Eigenvectors: " + str(eigenvectors.shape))
        # Loadings.
        print("Shape of Loadings: " + str(loadings.shape))
        print(loadings)
        print(
            "Shape of Loadings from SVD factors: " +
            str(loadings_direct.shape)
        )
        print("Loadings nearly equal by both calculations: ")
        print(numpy.allclose(loadings, loadings_direct))
        pass
    # Compile information.
    pail = dict()
    #pail["table_scale"] = pail_organization["table_scale"]
    #pail["u"] = u
    #pail["singular_values"] = s
    #pail["vh"] = vh
    pail["eigenvalues"] = eigenvalues
    pail["eigenvectors"] = eigenvectors
    pail["loadings"] = loadings
    # Return.
    return pail
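
# The references above describe the relationship between SVD and PCA; a
# compact, self-contained sketch of that relationship with NumPy and
# hypothetical data:
def demonstrate_pca_by_svd():
    matrix = numpy.random.default_rng(seed=3).normal(size=(20, 4))
    matrix = matrix - matrix.mean(axis=0)  # center columns
    count_samples = matrix.shape[0]
    u, s, vt = numpy.linalg.svd(matrix, full_matrices=False)
    # Eigenvalues of the covariance matrix derive from singular values.
    eigenvalues = numpy.square(s) / (count_samples - 1)
    # Principal component factors (scores): U [dot] diagonal(S), which equals
    # the original matrix [dot] V.
    factors = numpy.dot(u, numpy.diag(s))
    print(numpy.allclose(factors, numpy.dot(matrix, numpy.transpose(vt))))
    print(eigenvalues)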
def read_select_collect_metabolites_genetic_scores(
    selection=None,
    metabolites_files_paths=None,
    report=None,
):
    """
    Reads metabolites' genetic scores across the UK Biobank from file,
    aggregates scores by Singular Value Decomposition (SVD), and collects these
    within a table.

    arguments:
        selection (str): name of column for selection from Polygenic Score
            thresholds
        metabolites_files_paths (dict<list<str>>): collection of files and paths
            for metabolites
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of metabolites' genetic scores across UK
            Biobank cohort

    """

    # Initialize a table for collection.
    table_collection = pandas.DataFrame(columns=["identifier_ukb"])
    # The UK Biobank identifier is in column "FID" within the metabolite
    # tables; rename it to "identifier_ukb".
    table_collection.set_index(
        "identifier_ukb",
        drop=True,
        inplace=True,
    )

    for metabolite in metabolites_files_paths.keys():
        # Select metabolite's genetic scores.
        table_selection = read_select_metabolite_genetic_scores(
            metabolite=metabolite,
            selection=selection,
            metabolites_files_paths=metabolites_files_paths,
            report=False,
        )
        # Copy information.
        table_metabolite = table_selection.copy(deep=True)
        # Organize information.
        table_metabolite.dropna(
            axis="index",
            how="any",
            subset=["identifier_ukb"],
            inplace=True,
        )
        table_metabolite.set_index(
            "identifier_ukb",
            drop=True,
            inplace=True,
        )
        # Collect information for metabolite.
        table_collection = table_collection.merge(
            table_metabolite,
            how="outer",
            left_on="identifier_ukb",
            right_on="identifier_ukb",
            suffixes=("_original", "_novel"),
        )

        pass
    # Report.
    if report:
        # Column name translations.
        utility.print_terminal_partition(level=2)
        print("Report from: read_select_collect_metabolites_genetic_scores()")
        utility.print_terminal_partition(level=2)
        print("selection: " + str(selection))
        print(table_collection)
        utility.print_terminal_partition(level=3)
    # Compile information.
    #pail = dict()
    # Return information.
    return table_collection
# Example 26
def select_organize_metabolites_valid_identities_scores(
    table_names=None,
    table_scores=None,
    report=None,
):
    """
    Selects identifiers of metabolites from Metabolon with valid identities.

    arguments:
        table_names (object): Pandas data frame of metabolites' identifiers and
            names from Metabolon
        table_scores (object): Pandas data frame of metabolites' genetic scores
            across UK Biobank cohort
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about metabolites, their identifiers,
            and their names

    """

    # Copy information.
    table_names = table_names.copy(deep=True)
    table_scores = table_scores.copy(deep=True)
    # Translate column names.
    translations = dict()
    translations["metabolonID"] = "identifier"
    translations["metabolonDescription"] = "name"
    table_names.rename(
        columns=translations,
        inplace=True,
    )
    # Determine whether metabolite has a valid identity.
    table_names["identity"] = table_names.apply(
        lambda row: determine_metabolite_valid_identity(name=row["name"], ),
        axis="columns",  # apply across rows
    )
    # Select metabolites with valid identities.
    table_identity = table_names.loc[(table_names["identity"] > 0.5), :]
    metabolites_identity = table_identity["identifier"].to_list()
    names_identity = table_identity["name"].to_list()
    # Organize table.
    table_names["identifier"].astype("string")
    table_names.set_index(
        "identifier",
        drop=True,
        inplace=True,
    )
    # Remove table columns for metabolites with null genetic scores.
    table_scores.dropna(
        axis="columns",
        how="all",
        subset=None,
        inplace=True,
    )
    # Select metabolites with valid identities and valid genetic scores.
    metabolites_scores = table_scores.columns.to_list()
    metabolites_valid = utility.filter_common_elements(
        list_minor=metabolites_identity,
        list_major=metabolites_scores,
    )
    # Compile information.
    pail = dict()
    pail["table"] = table_names
    pail["metabolites_valid"] = metabolites_valid
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Report from select_metabolites_with_valid_identities()")
        utility.print_terminal_partition(level=3)
        print("Count of identifiable metabolites: " +
              str(len(metabolites_identity)))
        print("Count of identifiable metabolites with scores: " +
              str(len(metabolites_valid)))
        utility.print_terminal_partition(level=3)
        print(table_names)
    # Return information.
    return pail
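# A minimal sketch, under assumption, of the behavior that
# utility.filter_common_elements() provides at the call site above: keep
# elements of the minor list that also appear in the major list, preserving
# the minor list's order. This stand-in is inferred from usage, not the
# package's actual implementation.
def _demonstrate_filter_common_elements():
    def filter_common_elements(list_minor=None, list_major=None):
        major = set(list_major)
        return [element for element in list_minor if element in major]
    # Only "M00054" has both a valid identity and a column of scores.
    metabolites_identity = ["M00054", "M99999"]
    metabolites_scores = ["M00054", "M00599"]
    return filter_common_elements(
        list_minor=metabolites_identity,
        list_major=metabolites_scores,
    )  # returns ["M00054"]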
def regress_linear_ordinary_least_squares(
    dependence=None,
    independence=None,
    table=None,
    report=None,
):
    """
    Regresses a quantitative continuous dependent variable against multiple
    independent variables and returns relevant parameters and statistics.

    Table format must have samples (cases, observations) across rows and
    dependent and independent variables (features) across columns.

    Description of formats for StatsModels...

    Format of dependent variable is a vector of scalar values.
    [1.3, 1.5, 1.2, 1.0, 1.7, 1.5, 1.9, 1.1, 1.3, 1.4]

    Format of independent variable(s) is a matrix: a first-dimension vector of
    samples (observations) and for each sample a second-dimension vector of
    variables' (features') scalar values.
    StatsModels also requires a constant for the intercept.
    [
        [1.3, 5.2, 1.0],
        [1.5, 5.1, 1.0],
        [1.2, 5.5, 1.0],
        ...
    ]

    arguments:
        dependence (str): name of table's column for dependent variable
        independence (list<str>): names of table's columns for independent
            variables
        table (object): Pandas data frame of dependent and independent variables
            for regression
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of regression's residuals and statistics
    """

    # Determine count of valid samples (cases, observations).
    count_samples = int(table.shape[0])
    # Extract values of dependent and independent variables.
    values_dependence = table[dependence].to_numpy()
    # Keep independent variables in Pandas dataframe to preserve variables'
    # names.
    #values_independence = data.loc[ :, independence].to_numpy()
    table_independence = table.loc[:, independence]
    # Introduce constant value for intercept.
    # If any column in the independent variables already has constant
    # values, then the function skips it by default.
    # It is necessary to change parameter "has_constant" to avoid this
    # conditional behavior.
    table_independence_intercept = statsmodels.api.add_constant(
        table_independence,
        prepend=True,  # insert intercept constant first
        has_constant="add",  # introduce new intercept constant regardless
    )
    columns_independence = copy.deepcopy(
        table_independence_intercept.columns.to_list())
    #matrix_independence = table.to_numpy()
    # Define model.
    model = statsmodels.api.OLS(
        values_dependence,
        table_independence_intercept,
        missing="drop",
    )
    pail_raw = model.fit()
    # Report.
    if report:
        print("--------------------------------------------------")
        print("Report source: " +
              "regress_dependent_independent_variables_linear_ordinary()")
        print("--------------------------------------------------")
        print("Version check: TCW 28 September 2021")
        print("Information from regression:")
        print(pail_raw.summary())
        #utility.print_terminal_partition(level=3)
        #print(dir(pail_raw))
        #print(pail_raw.params)
        #print(pail_raw.pvalues)
        pass

    # Organize residuals.
    residuals = pail_raw.resid

    ##########
    # Collect parameters, errors, probabilities, and statistics.
    model_parameters = pandas.Series(data=pail_raw.params)
    model_parameter_errors = pandas.Series(data=pail_raw.bse)
    model_probabilities = pandas.Series(data=pail_raw.pvalues)
    parameters = dict()
    parameter_errors = dict()
    parameter_intervals = dict()
    parameter_ranges = dict()
    probabilities = dict()
    inflations = dict()
    if ("const" in model_parameters.index):
        #parameters["intercept_parameter"] = report.params[0]
        parameters["intercept_parameter"] = model_parameters["const"]
    else:
        parameters["intercept_parameter"] = float("nan")
        # Report.
        if report:
            utility.print_terminal_partition(level=4)
            print("Warning: regression data does not have constant intercept.")
            print(independence)
    if ("const" in model_parameter_errors.index):
        parameter_errors["intercept_error"] = model_parameter_errors["const"]
        parameter_intervals["intercept_interval_95"] = float(
            1.96 * parameter_errors["intercept_error"])
        parameter_ranges["intercept_range_95"] = (
            determine_confidence_interval_range_text(
                estimate=parameters["intercept_parameter"],
                interval_low=parameter_intervals["intercept_interval_95"],
                interval_high=parameter_intervals["intercept_interval_95"],
            ))
    else:
        parameter_errors["intercept_error"] = float("nan")
        parameter_intervals["intercept_interval_95"] = float("nan")
        parameter_ranges["intercept_range_95"] = str("nan ... nan")
        # Report.
        if report:
            utility.print_terminal_partition(level=4)
            print("Warning: regression data does not have constant intercept.")
            print(independence)
    if ("const" in model_probabilities.index):
        #probabilities["intercept_probability"] = report.pvalues[0]
        probabilities["intercept_probability"] = (model_probabilities["const"])
    else:
        probabilities["intercept_probability"] = float("nan")
        # Report.
        if report:
            utility.print_terminal_partition(level=4)
            print("Warning: regression data does not have constant intercept.")
            print(independence)
    inflations["intercept_inflation"] = float("nan")
    # Iterate on each independent variable.
    # Start the counter at 1 because the intercept constant occupies column
    # index 0 of the design matrix; the counter indexes columns for the
    # Variance Inflation Factor calculation below.
    counter = 1
    for variable in independence:
        # Coefficient or parameter.
        parameter = str(variable + ("_parameter"))
        #parameters[parameter] = report.params[counter]
        parameters[parameter] = model_parameters[variable]
        # Parameter standard error
        parameter_error = str(variable + ("_error"))
        parameter_errors[parameter_error] = model_parameter_errors[variable]
        parameter_interval = str(variable + ("_interval_95"))
        parameter_intervals[parameter_interval] = float(
            1.96 * parameter_errors[parameter_error])
        parameter_range = str(variable + ("_range_95"))
        parameter_ranges[parameter_range] = (
            determine_confidence_interval_range_text(
                estimate=parameters[parameter],
                interval_low=parameter_intervals[parameter_interval],
                interval_high=parameter_intervals[parameter_interval],
            ))
        # Probability.
        probability = str(variable + ("_probability"))
        #probabilities[probability] = report.pvalues[counter]
        probabilities[probability] = model_probabilities[variable]
        # Variance Inflation Factor (VIF).
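        # The VIF for a variable is 1 / (1 - R^2) from regressing that
        # variable on all other predictors; values above roughly 5 to 10
        # conventionally indicate problematic multicollinearity.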
        inflation = str(variable + ("_inflation"))
        inflation_value = (
            statsmodels.stats.outliers_influence.variance_inflation_factor(
                table_independence_intercept.to_numpy(), counter))
        inflations[inflation] = round(inflation_value, 3)
        # Increment index.
        counter += 1
        pass
    summary = {
        "independence": ";".join(independence),
        "freedom": pail_raw.df_model,
        "observations": pail_raw.nobs,
        "samples": count_samples,
        "r_square": pail_raw.rsquared,
        "r_square_adjust": pail_raw.rsquared_adj,
        "log_likelihood": pail_raw.llf,
        "akaike": pail_raw.aic,
        "bayes": pail_raw.bic,
        "condition": pail_raw.condition_number,
    }
    summary.update(parameters)
    summary.update(parameter_errors)
    summary.update(parameter_intervals)
    summary.update(parameter_ranges)
    summary.update(probabilities)
    summary.update(inflations)

    # Compile information.
    pail = dict()
    pail["summary"] = summary
    pail["residuals"] = residuals
    # Return information.
    return pail
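# A minimal, self-contained sketch of the statsmodels conventions that
# regress_linear_ordinary_least_squares() relies on: add_constant() supplies
# the intercept column, missing="drop" excludes incomplete samples, and the
# fit's params and pvalues are indexed by column name, including "const".
# Values here are fabricated for illustration only.
def _demonstrate_ordinary_least_squares():
    import pandas
    import statsmodels.api
    table = pandas.DataFrame({
        "outcome": [1.3, 1.5, 1.2, 1.0, 1.7, 1.5, 1.9, 1.1, 1.3, 1.4],
        "exposure": [5.2, 5.1, 5.5, 5.9, 4.8, 5.0, 4.5, 5.7, 5.3, 5.2],
    })
    predictors = statsmodels.api.add_constant(
        table[["exposure"]],
        prepend=True,  # insert intercept constant first
        has_constant="add",  # introduce new intercept constant regardless
    )
    fit = statsmodels.api.OLS(
        table["outcome"].to_numpy(),
        predictors,
        missing="drop",
    ).fit()
    return (fit.params["const"], fit.params["exposure"], fit.pvalues["exposure"])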
def execute_procedure(
    path_dock=None,
):
    """
    Function to execute module's main behavior.

    arguments:
        path_dock (str): path to dock directory for source and product
            directories and files

    raises:

    returns:

    """

    utility.print_terminal_partition(level=1)
    print(path_dock)
    print("version check: 2")
    # Pause procedure.
    time.sleep(5.0)


    # Initialize directories.
    paths = initialize_directories(
        restore=True,
        path_dock=path_dock,
    )
    # Read source information from file.
    # Exclusion identifiers are "eid".
    source = read_source_initial(
        path_dock=path_dock,
        report=False,
    )

    if False:
        # Test the aggregation method for a single metabolite.
        # M00599: pyruvate
        # M32315: serine
        # M02342: serotonin
        # M00054: tryptophan
        pail_test = read_aggregate_test_metabolite_genetic_scores(
            metabolite="M00054",
            metabolites_files_paths=source["metabolites_files_paths"],
            report=True,
        )

        # Collect metabolites' genetic scores, and aggregate these by singular value
        # decomposition (SVD).
        # pail_metabolites_scores
        table_scores = read_aggregate_collect_metabolites_genetic_scores(
            metabolites_files_paths=source["metabolites_files_paths"],
        )
        print("printing after read_aggregate_collect_metabolites_genetic_scores")
        print(table_scores)

    # TODO: temporarily bypass the aggregation process...
    # Collect metabolites' genetic scores at multiple PRS p-value thresholds.
    table_prs_0_00001 = read_select_collect_metabolites_genetic_scores(
        selection="X1e.05",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_0001 = read_select_collect_metabolites_genetic_scores(
        selection="X0.0001",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_001 = read_select_collect_metabolites_genetic_scores(
        selection="X0.001",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_01 = read_select_collect_metabolites_genetic_scores(
        selection="X0.01",
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )
    table_prs_0_1 = read_select_collect_metabolites_genetic_scores(
        selection="X0.1", # "X0.001", "X0.01", "X0.1"
        metabolites_files_paths=source["metabolites_files_paths"],
        report=True,
    )

    # Collect information.
    information = dict()
    information["metabolites_files_paths"] = source["metabolites_files_paths"]
    information["table_prs_0_00001"] = table_prs_0_00001
    information["table_prs_0_0001"] = table_prs_0_0001
    information["table_prs_0_001"] = table_prs_0_001
    information["table_prs_0_01"] = table_prs_0_01
    information["table_prs_0_1"] = table_prs_0_1
    # TODO: eventually, include a dictionary collection of a table for each
    # metabolite
    #information["pail_metabolites_scores_tables"] = pail
    # Write product information to file.
    write_product(
        paths=paths,
        information=information
    )

    pass
def drive_cohort_model_linear_regression(
    table=None,
    table_cohorts_models=None,
    cohort=None,
    model=None,
    dependence=None,
    report=None,
):
    """
    Organize regressions.

    Table format must have samples (cases, observations) across rows and
    dependent and independent variables (features) across columns.

    arguments:
        table (object): Pandas data frame of dependent and independent variables
            (features) across columns and samples (cases, observations) within
            a specific cohort across rows
        table_cohorts_models (object): Pandas data frame of cohorts, models,
            dependent variables, and independent variables for regression
        cohort (str): name of a stratification cohort for regression analysis
        model (str): name of a model for regression analysis, normally
            "complex", "simple", or "unadjust"
        dependence (str): name of table's column for dependent variable
        report (bool): whether to print reports

    raises:

    returns:
        (dict): information from regressions

    """

    pail_model = determine_cohort_model_variables_from_reference_table(
        cohort=cohort,
        model=model,
        dependence=dependence,
        table=table_cohorts_models,
        report=report,
    )
    if (pail_model["match"]):
        # Report.
        if report:
            utility.print_terminal_partition(level=2)
            print("report: ")
            print("drive_cohort_model_linear_regression()")
            utility.print_terminal_partition(level=5)
            print("cohort: " + str(cohort))
            print("model: " + str(model))
            print("dependent variable: " + str(dependence))
            print("independent variables: ")
            print(pail_model["independence"])
            utility.print_terminal_partition(level=5)
        pail_regression = (
            drive_organize_table_regress_linear_ordinary_least_squares(
                dependence=dependence,
                independence=pail_model["independence"],
                standard_scale=True,
                threshold_samples=50,
                table=table,
                report=report,
            ))
    else:
        # Report.
        if report:
            utility.print_terminal_partition(level=2)
            print("report: ")
            print("drive_cohort_model_linear_regression()")
            utility.print_terminal_partition(level=5)
            print("cohort: " + str(cohort))
            print("model: " + str(model))
            print("dependent variable: " + str(dependence))
            print("independent variables: ")
            print(pail_model["independence"])
            utility.print_terminal_partition(level=5)
            print("Missing information for model...")
            utility.print_terminal_partition(level=5)
        pail_regression = create_regression_missing_values(
            dependence=dependence,
            independence=pail_model["independence"],
        )
    return pail_regression
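# A minimal sketch of the dispatch pattern above: fit a regression only when
# the reference table defines the cohort, model, and dependent variable;
# otherwise return placeholder missing values so that downstream summary
# tables keep a consistent shape. The helper names here are hypothetical.
def _demonstrate_fit_or_placeholder(match=None, fit=None, independence=None):
    if match:
        return fit()
    # One missing entry per expected variable preserves the summary's columns.
    return {
        str(variable + "_parameter"): float("nan") for variable in independence
    }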
def read_source_metabolite_genetic_scores(
    path_file=None,
    report=None,
):
    """
    Reads and organizes source information from file.

    Notice that Pandas does not accommodate missing values within series of
    integer variable types.

    arguments:
        path_file (str): path to file of a metabolite's genetic scores
        report (bool): whether to print reports

    raises:

    returns:
        (object): Pandas data frame of a metabolite's genetic scores

    """

    # Read information from file.
    variables_types = {
        "FID": "string",
        "IID": "string",
        "X5e.08": "float32",
        "X1e.07": "float32",
        "X1e.06": "float32",
        "X1e.05": "float32",
        "X0.0001": "float32",
        "X0.001": "float32",
        "X0.01": "float32",
        "X0.05": "float32",
        "X0.1": "float32",
        "X0.2": "float32",
        "X1": "float32",
    }
    table = pandas.read_csv(
        path_file,
        sep="\s+", # ",", "\t", "\s+"
        header=0,
        dtype=variables_types,
        na_values=["NA", "<NA>"],
        keep_default_na=True,
        compression=None, # "gzip"
    )
    # Report.
    if report:
        # Report only for a few metabolites.
        metabolites = ["M00599", "M32315", "M02342", "M00054"]
        match = any(list(map(
            lambda metabolite: (metabolite in path_file), metabolites
        )))
        if match:
            utility.print_terminal_partition(level=2)
            print("Path: " + str(path_file))
            print("raw table for example metabolites:")
            print(table)
            utility.print_terminal_partition(level=3)
            print(table.columns.to_list())
            utility.print_terminal_partition(level=3)
            print("variable types:")
            print(table.dtypes)
            utility.print_terminal_partition(level=3)
    # Compile and return information.
    return table
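# A minimal sketch of reading a whitespace-delimited score file with an
# explicit dtype map, as read_source_metabolite_genetic_scores() does above.
# The file content is fabricated, and io.StringIO stands in for a real path.
def _demonstrate_read_scores():
    import io
    import pandas
    content = (
        "FID IID X0.001 X0.01\n"
        "1001 1001 0.12 0.15\n"
        "1002 1002 NA 0.09\n"
    )
    table = pandas.read_csv(
        io.StringIO(content),
        sep=r"\s+",
        header=0,
        dtype={
            "FID": "string",
            "IID": "string",
            "X0.001": "float32",
            "X0.01": "float32",
        },
        na_values=["NA", "<NA>"],
        keep_default_na=True,
    )
    # "NA" parses to a missing value; the float32 columns accommodate NaN,
    # whereas plain integer dtypes would not.
    return table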