def collect_studies_unique_comparisons(bin_studies=None, ):
    """
    Gathers the sorted, unique names of comparisons across all studies.

    Every column of a study's data table other than "identifier" and "name"
    is treated as a comparison.

    arguments:
        bin_studies (dict): collection of information about each study; the
            "data" entry of each study is read for its column names
            (presumably a Pandas data frame)

    raises:

    returns:
        (list<str>): sorted unique identifiers of comparisons from all studies

    """

    # Columns that describe the gene rather than a comparison.
    columns_exclusion = ["identifier", "name"]
    names = list()
    for entry in bin_studies.values():
        names.extend([
            column for column in entry["data"].columns.to_list()
            if column not in columns_exclusion
        ])
    names_unique = utility.collect_unique_elements(elements_original=names)
    return sorted(names_unique)
Beispiel #2
0
def collect_orphan_gene_set(
    sets=None,
    genes_query=None,
):
    """
    Collects the query genes that do not belong to the union set.

    Works on a deep copy of the sets, so the caller's collection is not
    modified. The unique query genes that are absent from the "union" entry
    are stored under the new key "orphan".

    arguments:
        sets (dict): sets of genes; must include the entry "union"
        genes_query (list<str>): identifiers of genes in original enrichment
            query

    raises:

    returns:
        (dict): copy of the sets of genes with the additional entry "orphan"

    """

    sets = copy.deepcopy(sets)
    # Build the lookup once; testing membership against the list for every
    # query gene is quadratic.
    genes_union = set(sets["union"])
    orphans_raw = [gene for gene in genes_query if gene not in genes_union]
    sets["orphan"] = utility.collect_unique_elements(
        elements_original=orphans_raw)
    # Return information.
    return sets
Beispiel #3
0
def collect_report_ontology_parentage_orphan_genes(
    cluster_reports=None,
    genes_query=None,
    report=None,
):
    """
    Reports counts of query genes inside and outside the clusters' gene sets.

    Collects the unique genes listed in the "Genes" field of every set of
    every cluster report ("parentage" genes) and determines which query genes
    do not occur among them ("orphan" genes). The function only prints a
    summary of the counts; it does not return the genes.

    Side effect: the index axis of each cluster's report table is named "set"
    in place.

    arguments:
        cluster_reports (dict): reports for each cluster; the "report" entry
            of each cluster is read
        genes_query (list<str>): identifiers of genes in original enrichment
            query
        report (bool): whether to print reports

    raises:

    returns:
        None

    """

    # Collect genes from all sets of all clusters.
    genes_collection = list()
    for cluster in cluster_reports.values():
        # Organize data.
        data_report = cluster["report"]
        data_report.rename_axis(
            index="set",
            axis="index",
            copy=False,
            inplace=True,
        )
        records = utility.convert_dataframe_to_records(data=data_report)
        # Each record stores its genes as one string with delimiter ", ".
        for record in records:
            genes_collection.extend(record["Genes"].split(", "))
    # Collect unique genes from parent.
    genes_parentage = utility.collect_unique_elements(
        elements_original=genes_collection, )
    # Collect orphan genes.
    # Use a set for the lookup to avoid a scan of the list for every gene.
    parentage_lookup = set(genes_parentage)
    genes_orphan = [
        gene for gene in genes_query if gene not in parentage_lookup
    ]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique parentage and orphan genes")
        print("parentage genes: " + str(len(genes_parentage)))
        print("orphan genes: " + str(len(genes_orphan)))
        utility.print_terminal_partition(level=2)
def collect_studies_unique_gene_identifiers(
    bin_studies=None,
    report=None,
):
    """
    Gathers the unique gene identifiers that contain "ENSG" from all studies.

    Reads the "identifier" column of each study's data table. When reporting,
    prints the count of unique matching identifiers for each study.

    arguments:
        bin_studies (dict): collection of information about each study
        report (bool): whether to print reports

    raises:

    returns:
        (list<str>): unique identifiers of genes from all studies

    """

    def is_gene_identifier(identifier):
        # An identifier qualifies when its text contains "ENSG".
        return "ENSG" in str(identifier)

    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique genes from each study")

    identifiers_all = list()
    for study, entry in bin_studies.items():
        identifiers_study = utility.collect_unique_elements(
            elements_original=entry["data"]["identifier"].to_list())
        identifiers_study_valid = [
            identifier for identifier in identifiers_study
            if is_gene_identifier(identifier)
        ]
        identifiers_all.extend(identifiers_study)
        # Report.
        if report:
            print(
                "study " + study + " : " + str(len(identifiers_study_valid)))
    # Keep only the qualifying identifiers before removing duplicates.
    identifiers_valid = [
        identifier for identifier in identifiers_all
        if is_gene_identifier(identifier)
    ]
    return utility.collect_unique_elements(
        elements_original=identifiers_valid)
def translate_study_comparisons_identifiers(
    bin_studies=None,
    report=None,
):
    """
    Prefixes each study's comparison columns with the study's identifier.

    Works on a deep copy of the collection. For each study, every column other
    than "identifier" and "name" is a comparison; the column is renamed to
    "<study identifier>_<comparison>". The study's entry also receives the
    sorted unique comparisons ("comparisons") and the mapping from original
    to new names ("comparisons_translation").

    arguments:
        bin_studies (dict): collection of information about each study
        report (bool): whether to print reports

    raises:

    returns:
        (dict): collection of information about each study with unique
            comparisons in each study

    """

    bin_studies = copy.deepcopy(bin_studies)
    for entry in bin_studies.values():
        prefix = entry["identifier"]
        table = entry["data"]
        names = [
            column for column in table.columns.to_list()
            if column not in ["identifier", "name"]
        ]
        names_unique = sorted(
            utility.collect_unique_elements(elements_original=names))
        entry["comparisons"] = names_unique
        # Map each comparison to its name with the study's prefix.
        entry["comparisons_translation"] = {
            comparison: str(prefix + "_" + comparison)
            for comparison in names_unique
        }
        # Translate comparison columns.
        table.rename(
            columns=entry["comparisons_translation"],
            inplace=True,
        )
        entry["data"] = table
        # Report.
        if report:
            print(table)
    return bin_studies
Beispiel #6
0
def collect_union_gene_set(sets=None, ):
    """
    Adds the union of all sets' elements to a copy of the sets.

    Works on a deep copy of the sets. The elements of every entry are pooled,
    reduced to unique elements, and stored under the key "union".

    arguments:
        sets (dict): sets of genes; the elements of every entry are pooled

    raises:

    returns:
        (dict): copy of the sets of genes with the additional entry "union"

    """

    sets = copy.deepcopy(sets)
    # Pool the elements of every entry in the order of the entries.
    elements_pool = [
        element for name in sets for element in sets[name]
    ]
    sets["union"] = utility.collect_unique_elements(
        elements_original=elements_pool)
    # Return information.
    return sets
Beispiel #7
0
def select_genes_by_modality_measures_ranks(
    genes=None,
    proportion_least=None,
    proportion_greatest=None,
    measures=None,
    data_distribution_report=None,
    report=None,
):
    """
    Selects genes with least and greatest values of measures of modality.

    The counts of genes to select derive from the proportions and the count of
    genes in the argument. Selection itself is positional on the report's rows
    for those genes after sort by each measure, so the positions derive from
    the actual count of rows. This matters when some genes do not occur in the
    report's index. A selection of zero genes has a missing (NaN) threshold.

    arguments:
        genes (list<str>): identifiers of genes for which to consider
            modalities
        proportion_least (float): proportion of genes to select from those with
            least values of modality measures
        proportion_greatest (float): proportion of genes to select from those
            with greatest values of modality measures
        measures (list<str>): measures of modality; each must be a column in
            the report
        data_distribution_report (object): Pandas data frame of information
            about genes and their measures of modality; the index holds genes'
            identifiers and a column "name" must exist
        report (bool): whether to print reports

    raises:

    returns:
        (dict): information about selection of genes; for each measure the
            entries "least" and "greatest" hold "threshold" and "genes"; the
            entry "measures_thresholds" holds each measure's greatest
            threshold

    """

    # Organize data.
    genes = copy.deepcopy(genes)
    # Selection by a boolean mask returns a new frame; the original report
    # stays unmodified.
    data_report_genes = data_distribution_report.loc[
        data_distribution_report.index.isin(genes), :
    ]
    # Calculate count of genes to select from least and greatest extremes.
    count_total = len(genes)
    count_least = round(proportion_least * count_total)
    count_greatest = round(proportion_greatest * count_total)
    # Report.
    if report:
        print(
            "selection percentage least: " +
            str(round((proportion_least * 100), 2))
        )
        print("selection count least: " + str(count_least))
        utility.print_terminal_partition(level=3)
        print(
            "selection percentage greatest: " +
            str(round((proportion_greatest * 100), 2))
        )
        print("selection count greatest: " + str(count_greatest))
    # Determine row positions from the rows that actually exist.
    # The count of genes can exceed the count of rows when the report lacks
    # some genes, and positions beyond the rows select the wrong genes or fail.
    count_rows = data_report_genes.shape[0]
    span_least = max(0, min(count_least, count_rows))
    span_greatest = max(0, min(count_greatest, count_rows))
    start_greatest = count_rows - span_greatest
    # Iterate on measures of modality.
    selection = dict()
    for measure in measures:
        # Sort by values of the measure.
        # NOTE(review): by default Pandas sorts missing values last, so they
        # would fall within the greatest selection -- confirm that the report
        # has no missing values for these genes.
        data_measure = data_report_genes.loc[:, ["name", measure]].sort_values(
            by=[measure],
            axis="index",
            ascending=True,
        )
        # Select least genes.
        # Threshold is the value of the last row within the selection.
        data_least = data_measure.iloc[:span_least]
        if span_least > 0:
            threshold_least = data_measure.iat[(span_least - 1), 1]
        else:
            threshold_least = float("nan")
        genes_least = utility.collect_unique_elements(
            elements_original=data_least.index.to_list()
        )
        # Select greatest genes.
        # Threshold is the value of the first row within the selection.
        data_greatest = data_measure.iloc[start_greatest:]
        if span_greatest > 0:
            threshold_greatest = data_measure.iat[start_greatest, 1]
        else:
            threshold_greatest = float("nan")
        genes_greatest = utility.collect_unique_elements(
            elements_original=data_greatest.index.to_list()
        )
        # Collect information.
        selection[measure] = {
            "least": {
                "threshold": threshold_least,
                "genes": genes_least,
            },
            "greatest": {
                "threshold": threshold_greatest,
                "genes": genes_greatest,
            },
        }
    # Organize measures' thresholds for plot.
    selection["measures_thresholds"] = dict()
    for measure in measures:
        selection["measures_thresholds"][measure] = (
            selection[measure]["greatest"]["threshold"]
        )
    # Return information.
    return selection
Beispiel #8
0
def determine_selection_distribution_genes_valid_modalities(
    genes_selection=None,
    genes_distribution=None,
    data_distribution_report=None,
    report=None,
):
    """
    Determines selection genes with valid modalities from distribution
    procedure.

    A gene has valid modalities when its row in the report has no missing
    value in any of the columns "gene", "coefficient", "mixture", and "dip"
    that exist in the report.

    arguments:
        genes_selection (list<str>): identifiers of genes from selection
        genes_distribution (list<str>): identifiers of genes from distribution
            procedure; only counted for the report
        data_distribution_report (object): Pandas data frame of information
            about genes and their measures of modality; the index holds genes'
            identifiers
        report (bool): whether to print reports

    raises:

    returns:
        (dict): information; "data_distribution_report" holds the report's
            modality columns for genes with valid modalities, and
            "genes_selection_distribution" holds the selection genes common to
            those genes

    """

    # Organize data.
    genes_selection = copy.deepcopy(genes_selection)
    # Select genes with valid distribution modalities.
    # Drop rows without modification in place: in-place removal on a frame
    # that derives from a selection raises Pandas' SettingWithCopyWarning.
    columns_modality = ["gene", "coefficient", "mixture", "dip"]
    data_valid = data_distribution_report.loc[
        :, data_distribution_report.columns.isin(columns_modality)
    ].dropna(
        axis="index",
        how="any",
    )
    genes_distribution_valid = utility.collect_unique_elements(
        elements_original=data_valid.index.to_list()
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "count of all distribution genes: " +
            str(len(genes_distribution))
        )
        utility.print_terminal_partition(level=4)
        print(
            "count of all distribution genes with valid modalities: " +
            str(len(genes_distribution_valid))
        )
        utility.print_terminal_partition(level=2)
    # Select genes from selection procedure with valid distribution modalities.
    genes_selection_distribution_valid = utility.filter_common_elements(
        list_one=genes_selection,
        list_two=genes_distribution_valid,
    )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print(
            "count of all selection genes: " +
            str(len(genes_selection))
        )
        utility.print_terminal_partition(level=4)
        print(
            "count of all selection genes with valid modalities: " +
            str(len(genes_selection_distribution_valid))
        )
        utility.print_terminal_partition(level=2)
    # Compile information.
    information = {
        "data_distribution_report": data_valid,
        "genes_selection_distribution": genes_selection_distribution_valid,
    }
    # Return information.
    return information
Beispiel #9
0
def split_report_write_genes_signals(
    cohort=None,
    persons=None,
    data_samples_tissues_persons=None,
    data_gene_signal=None,
    path_directory=None,
    report=None,
):
    """
    Splits genes' signals by gene for a cohort's persons and writes them.

    Selects the signals for the persons, splits the signals by gene, collects
    the genes' unique identifiers, and passes both to the write procedure.

    arguments:
        cohort (str): cohort of persons--selection, respiration, or ventilation
        persons (list<str>): identifiers of persons
        data_samples_tissues_persons (object): Pandas data frame of persons
            and tissues across samples
        data_gene_signal (object): Pandas data frame of genes' signals across
            samples
        path_directory (str): path to directory for product directories and
            files
        report (bool): whether to print reports about the selection

    raises:

    returns:

    """

    # Report.
    if report:
        utility.print_terminal_partition(level=1)
        print("... Split procedure for: " + str(cohort) + " persons...")
        print("Count persons: " + str(len(persons)))
        utility.print_terminal_partition(level=2)

    # Work on copies of the data.
    data_samples = data_samples_tissues_persons.copy(deep=True)
    data_signals = data_gene_signal.copy(deep=True)
    # Select samples for relevant persons.
    selection = select_samples_signals_persons(
        persons=persons,
        data_samples_tissues_persons=data_samples,
        data_gene_signal=data_signals,
    )
    # Split genes' signals across tissues and patients by gene.
    # NOTE(review): the split receives the complete table of samples, not one
    # from the selection -- confirm that this is the intent.
    genes_samples_signals = split_genes_signals(
        data_samples_tissues_persons=data_samples,
        data_gene_signal=selection["data_gene_signal"],
    )

    # Organize genes' identifiers.
    identifiers = list(genes_samples_signals.keys())
    genes = utility.collect_unique_elements(elements_original=identifiers)

    # Summarize information about the genes' signals.
    if report:
        summarize_genes_samples_signals(
            genes_samples_signals=genes_samples_signals, )
    # Compile and write product information to file.
    write_product(
        information={
            "genes": genes,
            "genes_samples_signals": genes_samples_signals,
        },
        path_directory=path_directory,
    )
Beispiel #10
0
def collect_ontology_enrichment_cluster_gene_sets(
    cluster_reports=None,
    report=None,
):
    """
    Collects the unique genes of all children sets in each parent cluster.

    Reads the "Genes" field of every set in each cluster's report; the field
    is one string of genes with delimiter ", ". Side effect: the index axis
    of each cluster's report table is named "set" in place.

    arguments:
        cluster_reports (dict): reports for each cluster; the entries "name"
            and "report" of each cluster are read
        report (bool): whether to print reports

    raises:

    returns:
        (dict<list<str>>): identifiers of genes in each parent set

    """

    # Collect unique genes of all children sets from each parent cluster.
    parents_genes = dict()
    for cluster in cluster_reports.values():
        # Organize data.
        name = cluster["name"]
        table = cluster["report"]
        table.rename_axis(
            index="set",
            axis="index",
            copy=False,
            inplace=True,
        )
        records = utility.convert_dataframe_to_records(data=table)
        # Pool genes from every children set within the cluster.
        genes_children = [
            gene
            for record in records
            for gene in record["Genes"].split(", ")
        ]
        parents_genes[name] = utility.collect_unique_elements(
            elements_original=genes_children, )
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("unique genes from each parent set")
        for cluster in cluster_reports.values():
            utility.print_terminal_partition(level=3)
            print("parent: " + cluster["name"])
            print("count of children sets: " +
                  str(cluster["report"].shape[0]))
            print("count of children genes: " +
                  str(len(parents_genes[cluster["name"]])))
        utility.print_terminal_partition(level=2)
    # Return information.
    return parents_genes
Beispiel #11
0
def select_covid19_genes_by_studies_fold_directions(
    data_genes_comparisons_studies=None,
    genes_selection=None,
    threshold_studies=None,
    report=None,
):
    """
    Collects and organizes genes that show differential expression in multiple
    studies and comparisons.

    Genes must occur in the selection and in at least the threshold count of
    studies. These genes ("any") divide by direction on the basis of their
    counts of accumulations and depletions:
        "up": accumulations exceed depletions by more than one
        "down": depletions exceed accumulations by more than one
        "mix": accumulations and depletions differ by one or less

    arguments:
        data_genes_comparisons_studies (object): Pandas data frame of genes'
            differential expression in studies; the index holds genes'
            identifiers and the columns "studies", "accumulations", and
            "depletions" are read
        genes_selection (list<str>): identifiers of genes
        threshold_studies (int): minimal count of studies
        report (bool): whether to print reports

    raises:

    returns:
        (dict<list<str>>): sets of genes with entries "any", "up", "down",
            and "mix"

    """

    # Copy data.
    data = data_genes_comparisons_studies.copy(deep=True)
    # Select data for genes that match selection for study.
    data = data.loc[data.index.isin(genes_selection), :]
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Count of differential expression genes that match selection " +
              "of genes for study.")
        print("selection genes DE COVID-19: " + str(data.shape[0]))
    # Select data for genes that match threshold.
    data_studies = data.loc[data["studies"] >= threshold_studies, :]
    genes_any = utility.collect_unique_elements(
        elements_original=data_studies.index.to_list())
    accumulations = data_studies["accumulations"]
    depletions = data_studies["depletions"]
    # Select data for genes that show accumulation in majority of studies.
    data_accumulation = data_studies.loc[accumulations > (depletions + 1), :]
    genes_accumulation = utility.collect_unique_elements(
        elements_original=data_accumulation.index.to_list())
    # Select data for genes that show depletion in majority of studies.
    data_depletion = data_studies.loc[depletions > (accumulations + 1), :]
    genes_depletion = utility.collect_unique_elements(
        elements_original=data_depletion.index.to_list())
    # Select data for genes without clear majority in either direction.
    # Counts of accumulations and depletions are equal or differ by one.
    mask_mix = (
        (accumulations == depletions) |
        (accumulations == (depletions + 1)) |
        (depletions == (accumulations + 1))
    )
    data_mix = data_studies.loc[mask_mix, :]
    genes_mix = utility.collect_unique_elements(
        elements_original=data_mix.index.to_list())
    # Compile information.
    sets_genes = {
        "any": genes_any,
        "up": genes_accumulation,
        "down": genes_depletion,
        "mix": genes_mix,
    }
    # Report.
    if report:
        utility.print_terminal_partition(level=2)
        print("Count of differential expression genes that match threshold.")
        print("threshold count of studies: " + str(threshold_studies))
        print("genes DE COVID-19 any direction: " + str(len(genes_any)))
        print("genes by accumulation: " + str(len(genes_accumulation)))
        print("genes by depletion: " + str(len(genes_depletion)))
        print("genes by mix of folds: " + str(len(genes_mix)))
    # Return information.
    return sets_genes
Beispiel #12
0
def collect_genes_annotations_studies_comparisons_valid_change(
    genes_identifiers=None,
    bin_studies=None,
    data_gene_annotation=None,
    warn=None,
):
    """
    Collects unique studies in which each gene has a valid fold change.

    Only genes with "ENSG" in their identifiers are considered. For each such
    gene, every comparison of every study that mentions the gene contributes
    when the gene's value in the comparison is not missing (NaN). A value of
    one or greater counts as accumulation, and a lesser value counts as
    depletion.

    arguments:
        genes_identifiers (list<str>): unique identifiers of genes in all
            studies
        bin_studies (dict): collection of information about each study; the
            "data" entry of each study is read, with genes' identifiers in
            column "identifier"
        data_gene_annotation (object): Pandas data frame of genes' annotations
        warn (bool): whether to print warnings

    raises:

    returns:
        (object): Pandas data frame of studies for each gene, with index
            "identifier"

    """

    # Organize each study's information once before iteration on genes.
    # The identifiers and comparisons of a study are the same for all genes,
    # and a set supports rapid tests of membership.
    studies_information = dict()
    for study in bin_studies.keys():
        data_study = bin_studies[study]["data"]
        studies_information[study] = {
            "data": data_study,
            "identifiers": set(data_study["identifier"].to_list()),
            "comparisons": [
                column for column in data_study.columns.to_list()
                if column not in ["identifier", "name"]
            ],
        }
    # Iterate across genes.
    records = list()
    for gene_identifier in genes_identifiers:
        # Determine whether gene has a valid identifier.
        if "ENSG" not in str(gene_identifier):
            continue
        # Collect studies and comparisons for the gene.
        studies = list()
        comparisons_gene = list()
        accumulations = list()
        depletions = list()
        for study, information in studies_information.items():
            # Determine whether study mentions current gene.
            if gene_identifier not in information["identifiers"]:
                continue
            # Consider each study comparison for the gene.
            for comparison in information["comparisons"]:
                value = determine_gene_study_comparison_value(
                    gene_identifier=gene_identifier,
                    comparison=comparison,
                    data_study=information["data"],
                    warn=warn,
                )
                # Only values that are not missing are valid fold changes.
                if math.isnan(value):
                    continue
                studies.append(study)
                comparisons_gene.append(comparison)
                if value >= 1:
                    accumulations.append(comparison)
                else:
                    depletions.append(comparison)
        # Collect unique studies.
        studies_unique = sorted(
            utility.collect_unique_elements(elements_original=studies))
        comparisons_gene_unique = sorted(
            utility.collect_unique_elements(
                elements_original=comparisons_gene))
        accumulations_unique = sorted(
            utility.collect_unique_elements(
                elements_original=accumulations))
        depletions_unique = sorted(
            utility.collect_unique_elements(elements_original=depletions))
        # Organize record.
        record = dict()
        record["identifier"] = gene_identifier
        record["studies"] = len(studies_unique)
        record["comparisons"] = len(comparisons_gene_unique)
        record["reference"] = ";".join(studies_unique)
        record["accumulations"] = len(accumulations_unique)
        record["depletions"] = len(depletions_unique)
        annotations = assembly.access_gene_contextual_annotations(
            gene_identifier=gene_identifier,
            data_gene_annotation=data_gene_annotation,
        )
        record.update(annotations)
        records.append(record)
    # Organize data.
    data = pandas.DataFrame(data=records)
    data.set_index(
        "identifier",
        drop=True,
        inplace=True,
    )
    return data