Esempio n. 1
0
def main():
    """Cluster the samples hierarchically and emit the result as JSON.

    Loads the expressions of the requested genes from the sample files,
    transforms them, clusters the samples and hands a payload with the
    linkage, the leaf order, the sample ids and the zero/missing gene and
    sample bookkeeping to ``output_json``.
    """
    args = parse_args()

    expressions = transform(
        get_expressions(fnames=args.sample_files, gene_set=args.genes),
        log2=args.log2,
        normalization=args.normalization,
    )
    zero_genes = get_zero_genes(expressions)
    zero_samples = get_zero_samples(expressions)

    clustering = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance),
        linkage_method=args.linkage,
        ordering_method=args.ordering,
        n_keep=args.n_keep,
        n_trials=args.n_trials,
    )
    linkage, dendrogram = clustering

    # Build the payload in the same key order as it is written out.
    result = {}
    result['linkage'] = linkage.tolist()
    result['sample_ids'] = {
        position: {'id': sample_id}
        for position, sample_id in enumerate(args.sampleids)
    }
    result['order'] = dendrogram['leaves']
    result['zero_gene_symbols'] = zero_genes

    # Requested genes that do not appear in the expression matrix index.
    requested_genes = set(args.genes)
    present_genes = set(expressions.index)
    result['missing_gene_symbols'] = list(
        requested_genes.difference(present_genes))

    # zero_samples is used to index into the sample id list.
    result['zero_sample_ids'] = [args.sampleids[idx] for idx in zero_samples]

    output_json(result, args.output)
Esempio n. 2
0
def main():
    """Compute hierarchical clustering of samples and write it as JSON.

    Steps:

    1. Validate the parsed arguments (matching numbers of files, ids and
       names; at least two samples; enough genes for the chosen metric).
    2. Load the expressions and validate that enough common genes remain.
    3. Transform the expressions (``log2`` / ``z_score``).
    4. If ``args.remove_const`` is set, drop samples with constant
       expression and warn about the ones that were dropped.
    5. Warn about the genes ``get_expressions`` reported as excluded. At most
       three names are listed; ``", ..."`` is appended when more exist.
    6. Cluster and pass ``sample_ids``, ``linkage`` and ``order`` to
       ``output_json``.

    Problems are reported through ``set_error`` and warnings through
    ``send_message(warning(...))``.
    """
    args = parse_args()

    # NOTE(review): the checks below rely on set_error() aborting the run
    # (the code that follows would otherwise use invalid data) -- confirm.
    if len(args.sample_files) != len(args.sample_ids):
        msg = "The number of sample files does not match the number of sample IDs."
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = "The number of sample files does not match the number of sample names."
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = (
            "Select at least two samples to compute hierarchical clustering of samples."
        )
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != "euclidean":
        msg = (
            "Select at least two genes to compute hierarchical clustering of samples with "
            "correlation distance metric or use Euclidean distance metric.")
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files,
                                            gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = "The selected samples do not have any common genes."
        else:
            msg = "None of the selected genes are present in all samples."
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != "euclidean":
        if not args.gene_labels:
            msg = (
                "The selected samples contain only one common gene ({}). At least two common "
                "genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select a different set of samples or use Euclidean "
                "distance metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        else:
            msg = (
                "Only one of the selected genes ({}) is present in all samples but at least two "
                "such genes are required to compute hierarchical clustering of samples with "
                "correlation distance metric. Select more genes or use Euclidean distance "
                "metric.".format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # matches[i] is truthy for the samples that are kept; it is indexed
        # in parallel with args.sample_names / args.sample_ids below.
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = (
                "All of the selected samples have constant expression across genes. Hierarchical "
                "clustering of samples cannot be computed.")
            set_error(msg)
        if len(expressions.columns) == 1:
            # Avoid shadowing the builtin ``id`` with the loop variable.
            sample_name = [
                name for i, name in enumerate(args.sample_names) if matches[i]
            ][0]
            msg = (
                "Only one of the selected samples ({}) has a non-constant expression across "
                "genes. However, hierarchical clustering of samples cannot be computed with "
                "just one sample.".format(sample_name))
            set_error(msg)
        removed = [
            name for i, name in enumerate(args.sample_names) if not matches[i]
        ]
        if removed:
            # Show at most three names and flag the truncation.
            suffix = "" if len(removed) <= 3 else ", ..."
            msg = (
                "{} of the selected samples ({}) have constant expression across genes. "
                "Those samples are excluded from the computation of hierarchical clustering of "
                "samples with correlation distance "
                "metric.".format(len(removed),
                                 ", ".join(removed[:3]) + suffix))
            send_message(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    if excluded:
        # Only the first three excluded genes are named; the suffix tells the
        # reader that the list was truncated (it used to be computed but was
        # never appended to the message).
        suffix = "" if len(excluded) <= 3 else ", ..."
        excluded_names = get_gene_names(excluded[:3], args.source,
                                        args.species)
        shown_names = ", ".join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                msg = (
                    "Gene {} is present in some but not all of the selected samples. This "
                    "gene is excluded from the computation of hierarchical clustering of "
                    "samples.".format(shown_names))
            else:
                msg = (
                    "{} of the selected genes ({}) is missing in at least one of the selected "
                    "samples. This gene is excluded from the computation of hierarchical "
                    "clustering of samples.".format(len(excluded), shown_names))
        else:
            if not args.gene_labels:
                msg = (
                    "{} genes ({}) are present in some but not all of the selected samples. Those "
                    "genes are excluded from the computation of hierarchical clustering of "
                    "samples.".format(len(excluded), shown_names))
            else:
                msg = (
                    "{} of the selected genes ({}) are missing in at least one of the selected "
                    "samples. Those genes are excluded from the computation of hierarchical "
                    "clustering of samples.".format(len(excluded), shown_names))
        send_message(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order,
    )

    # Report only the ids of the samples that took part in the clustering.
    sample_ids = [
        sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]
    ]
    result = {
        "sample_ids": {
            i: {"id": sample_id} for i, sample_id in enumerate(sample_ids)
        },
        "linkage": linkage.tolist(),
        "order": dendrogram["leaves"],
    }
    output_json(result, args.output)
Esempio n. 3
0
def main():
    """Compute hierarchical clustering of genes and write it as JSON.

    Steps:

    1. Validate the parsed arguments (matching numbers of files and names;
       more than one selected gene; enough samples for the chosen metric).
    2. Load the expressions and validate that enough common genes remain.
    3. Transform the expressions (``log2`` / ``z_score``).
    4. If ``args.remove_const`` is set, drop genes with constant expression
       and warn about the ones that were dropped.
    5. Warn about the genes ``get_expressions`` reported as excluded. At most
       three names are listed; ``', ...'`` is appended when more exist.
    6. Cluster and pass ``gene_symbols``, ``linkage`` and ``order`` to
       ``output_json``.

    Problems are reported through ``set_error`` and warnings are printed via
    ``print(warning(...))``.
    """
    args = parse_args()

    # NOTE(review): the checks below rely on set_error() aborting the run
    # (the code that follows would otherwise use invalid data) -- confirm.
    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)

    if len(args.gene_labels) == 1:
        msg = 'Select at least two genes to compute hierarchical clustering of genes.'
        set_error(msg)

    if len(args.sample_files) == 1 and args.distance_metric != 'euclidean':
        msg = (
            'Select at least two samples to compute hierarchical clustering of genes with '
            'correlation distance metric or use Euclidean distance metric.')
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files,
                                            gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        if not args.gene_labels:
            msg = (
                'The selected samples contain only one common gene ({}). At least two common '
                'genes are required to compute hierarchical clustering of genes with '
                'correlation distance metric. Select a different set of samples or use Euclidean '
                'distance metric.'.format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        else:
            msg = (
                'Only one of the selected genes ({}) is present in all samples but at least two '
                'such genes are required to compute hierarchical clustering of genes with '
                'correlation distance metric. Select more genes or use Euclidean distance '
                'metric.'.format(
                    get_gene_names(list(expressions.index), args.source,
                                   args.species)[0]))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # Keep the gene labels of the unfiltered matrix: once `expressions`
        # is replaced by the filtered frame its index only holds the genes
        # that were kept, so the removed ones can no longer be named from it.
        all_genes = list(expressions.index)
        expressions, matches = remove_const_genes(expressions)
        if len(expressions.index) == 0:
            msg = (
                'All of the selected genes have constant expression across samples. '
                'Hierarchical clustering of genes cannot be computed.')
            set_error(msg)
        if len(expressions.index) == 1:
            gene_names = get_gene_names(list(expressions.index), args.source,
                                        args.species)
            msg = (
                'Only one of the selected genes ({}) has a non-constant expression across '
                'samples. However, hierarchical clustering of genes cannot be computed with '
                'just one gene.'.format(gene_names[0]))
            set_error(msg)
        # Assumes `matches` holds one keep-flag per gene of the unfiltered
        # matrix -- TODO confirm against remove_const_genes.
        removed = [
            gene for i, gene in enumerate(all_genes) if not matches[i]
        ]
        if removed:
            # Show at most three names and flag the truncation.
            suffix = '' if len(removed) <= 3 else ', ...'
            removed_names = get_gene_names(removed[:3], args.source,
                                           args.species)
            msg = (
                '{} of the selected genes ({}) have constant expression across samples. '
                'Those genes are excluded from the computation of hierarchical clustering of '
                'genes with correlation distance '
                'metric.'.format(len(removed),
                                 ', '.join(removed_names) + suffix))
            print(warning(msg))

    if excluded:
        # Only the first three excluded genes are named; the suffix tells the
        # reader that the list was truncated (it used to be computed but was
        # never appended to the message).
        suffix = '' if len(excluded) <= 3 else ', ...'
        excluded_names = get_gene_names(excluded[:3], args.source,
                                        args.species)
        shown_names = ', '.join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # The template has a single placeholder, so it must receive
                # the gene name only; passing the count first rendered
                # "Gene 1 is present ...".
                msg = (
                    'Gene {} is present in some but not all of the selected samples. This '
                    'gene is excluded from the computation of hierarchical clustering of '
                    'genes.'.format(shown_names))
            else:
                msg = (
                    '{} of the selected genes ({}) is missing in at least one of the selected '
                    'samples. This gene is excluded from the computation of hierarchical '
                    'clustering of genes.'.format(len(excluded), shown_names))
        else:
            if not args.gene_labels:
                msg = (
                    '{} genes ({}) are present in some but not all of the selected samples. Those '
                    'genes are excluded from the computation of hierarchical clustering of '
                    'genes.'.format(len(excluded), shown_names))
            else:
                msg = (
                    '{} of the selected genes ({}) are missing in at least one of the selected '
                    'samples. Those genes are excluded from the computation of hierarchical '
                    'clustering of genes.'.format(len(excluded), shown_names))
        print(warning(msg))

    linkage, dendrogram = get_clustering(expressions,
                                         distance_metric=get_distance_metric(
                                             args.distance_metric),
                                         linkage_method=args.linkage_method,
                                         order=args.order)

    # The (possibly filtered) matrix index lists the clustered genes.
    result = {
        'gene_symbols': {
            i: {'gene': gene} for i, gene in enumerate(expressions.index)
        },
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)
def main():
    """Compute hierarchical clustering of samples and write it as JSON.

    Steps:

    1. Validate the parsed arguments (matching numbers of files, ids and
       names; at least two samples; enough genes for the chosen metric).
    2. Load the expressions and validate that enough common genes remain.
    3. Transform the expressions (``log2`` / ``z_score``).
    4. If ``args.remove_const`` is set, drop samples with constant
       expression and warn about the ones that were dropped.
    5. Warn about the genes ``get_expressions`` reported as excluded. At most
       three names are listed; ``', ...'`` is appended when more exist.
    6. Cluster and pass ``sample_ids``, ``linkage`` and ``order`` to
       ``output_json``.

    Problems are reported through ``set_error`` and warnings are printed via
    ``print(warning(...))``.
    """
    args = parse_args()

    # NOTE(review): the checks below rely on set_error() aborting the run
    # (the code that follows would otherwise use invalid data) -- confirm.
    if len(args.sample_files) != len(args.sample_ids):
        msg = 'The number of sample files does not match the number of sample IDs.'
        set_error(msg)

    if len(args.sample_files) != len(args.sample_names):
        msg = 'The number of sample files does not match the number of sample names.'
        set_error(msg)

    if len(args.sample_files) < 2:
        msg = 'Select at least two samples to compute hierarchical clustering of samples.'
        set_error(msg)

    if len(args.gene_labels) == 1 and args.distance_metric != 'euclidean':
        msg = ('Select at least two genes to compute hierarchical clustering of samples with '
               'correlation distance metric or use Euclidean distance metric.')
        set_error(msg)

    expressions, excluded = get_expressions(fnames=args.sample_files, gene_set=args.gene_labels)

    if len(expressions.index) == 0:
        if not args.gene_labels:
            msg = 'The selected samples do not have any common genes.'
        else:
            msg = 'None of the selected genes are present in all samples.'
        set_error(msg)

    if len(expressions.index) == 1 and args.distance_metric != 'euclidean':
        if not args.gene_labels:
            msg = ('The selected samples contain only one common gene ({}). At least two common '
                   'genes are required to compute hierarchical clustering of samples with '
                   'correlation distance metric. Select a different set of samples or use Euclidean '
                   'distance metric.'.format(get_gene_names(list(expressions.index), args.source, args.species)[0]))
        else:
            msg = ('Only one of the selected genes ({}) is present in all samples but at least two '
                   'such genes are required to compute hierarchical clustering of samples with '
                   'correlation distance metric. Select more genes or use Euclidean distance '
                   'metric.'.format(get_gene_names(list(expressions.index), args.source, args.species)[0]))
        set_error(msg)

    expressions = transform(expressions, log2=args.log2, z_score=args.z_score)

    if args.remove_const:
        # matches[i] is truthy for the samples that are kept; it is indexed
        # in parallel with args.sample_names / args.sample_ids below.
        expressions, matches = remove_const_samples(expressions)
        if len(expressions.columns) == 0:
            msg = ('All of the selected samples have constant expression across genes. Hierarchical '
                   'clustering of samples cannot be computed.')
            set_error(msg)
        if len(expressions.columns) == 1:
            # Avoid shadowing the builtin ``id`` with the loop variable.
            sample_name = [name for i, name in enumerate(args.sample_names) if matches[i]][0]
            msg = ('Only one of the selected samples ({}) has a non-constant expression across '
                   'genes. However, hierarchical clustering of samples cannot be computed with '
                   'just one sample.'.format(sample_name))
            set_error(msg)
        removed = [name for i, name in enumerate(args.sample_names) if not matches[i]]
        if removed:
            # Show at most three names and flag the truncation.
            suffix = '' if len(removed) <= 3 else ', ...'
            msg = ('{} of the selected samples ({}) have constant expression across genes. '
                   'Those samples are excluded from the computation of hierarchical clustering of '
                   'samples with correlation distance '
                   'metric.'.format(len(removed), ', '.join(removed[:3]) + suffix))
            print(warning(msg))
    else:
        matches = [True] * len(args.sample_files)

    if excluded:
        # Only the first three excluded genes are named; the suffix tells the
        # reader that the list was truncated (it used to be computed but was
        # never appended to the message).
        suffix = '' if len(excluded) <= 3 else ', ...'
        excluded_names = get_gene_names(excluded[:3], args.source, args.species)
        shown_names = ', '.join(excluded_names) + suffix
        if len(excluded) == 1:
            if not args.gene_labels:
                # The template has a single placeholder, so it must receive
                # the gene name only; passing the count first rendered
                # "Gene 1 is present ...".
                msg = ('Gene {} is present in some but not all of the selected samples. This '
                       'gene is excluded from the computation of hierarchical clustering of '
                       'samples.'.format(shown_names))
            else:
                msg = ('{} of the selected genes ({}) is missing in at least one of the selected '
                       'samples. This gene is excluded from the computation of hierarchical '
                       'clustering of samples.'.format(len(excluded), shown_names))
        else:
            if not args.gene_labels:
                msg = ('{} genes ({}) are present in some but not all of the selected samples. Those '
                       'genes are excluded from the computation of hierarchical clustering of '
                       'samples.'.format(len(excluded), shown_names))
            else:
                msg = ('{} of the selected genes ({}) are missing in at least one of the selected '
                       'samples. Those genes are excluded from the computation of hierarchical '
                       'clustering of samples.'.format(len(excluded), shown_names))
        print(warning(msg))

    linkage, dendrogram = get_clustering(
        expressions,
        distance_metric=get_distance_metric(args.distance_metric),
        linkage_method=args.linkage_method,
        order=args.order
    )

    # Report only the ids of the samples that took part in the clustering.
    sample_ids = [sample_id for i, sample_id in enumerate(args.sample_ids) if matches[i]]
    result = {
        'sample_ids': {i: {'id': sample_id} for i, sample_id in enumerate(sample_ids)},
        'linkage': linkage.tolist(),
        'order': dendrogram['leaves'],
    }
    output_json(result, args.output)