def bowtie2(resource, genome):
    """Align reads of the given resource with the ``bowtie2`` aligner.

    The ``alignment-bowtie2`` process is requested once for every sample
    obtained from ``resource``. If reads were already aligned, existing
    objects will be returned. Each alignment is also attached to its
    sample.

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment objects, one per processed sample

    """
    # A single resource is handled as a one-element list.
    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            process_input = {
                'reads': sample.get_reads().id,
                'genome': get_data_id(genome),
            }
            alignment = sample.resolwe.get_or_run(
                slug='alignment-bowtie2',
                input=process_input,
            )
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
Exemple #2
0
def hisat2(resource, genome):
    """Align reads of the given resource with the `Hisat2`_ aligner.

    For every sample obtained from ``resource`` the ``alignment-hisat2``
    process is requested with the sample's reads and the given genome.
    If reads were already aligned, existing objects will be returned.
    Each alignment is also attached to its sample.

    .. _Hisat2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-hisat2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment objects, one per processed sample

    """
    if isinstance(resource, list):
        resources = resource
    else:
        resources = [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            reads_id = sample.get_reads().id
            genome_id = get_data_id(genome)

            alignment = sample.resolwe.get_or_run(
                slug='alignment-hisat2',
                input={'reads': reads_id, 'genome': genome_id},
            )
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def cuffquant(resource,
              gff,
              genome=None,
              mask_file=None,
              library_type=None,
              multi_read_correct=None,
              threads=None):
    """Run Cuffquant_ for samples of the given resource.

    This method runs `Cuffquant`_ process on the bam of every sample
    obtained from ``resource``, with the annotation given in ``gff``.
    Parameters genome, mask_file, library_type, multi_read_correct and
    threads are optional and are only passed to the process when they
    are given.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource whose samples will be processed
    :param gff: annotation file (id or data object)
    :type gff: int or `~resdk.resources.data.Data`
    :param genome: genome file (id or data object) used to run bias
        detection and correction algorithm
    :type genome: int or `~resdk.resources.data.Data`
    :param mask_file: mask file (id or data object)
    :type mask_file: int or `~resdk.resources.data.Data`
    :param str library_type: options are fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param int threads: use this many processor threads
    :return: list of cuffquant objects, one per processed sample

    """
    results = []
    for sample in get_samples(resource):
        inputs = {
            'alignment': sample.get_bam().id,
            'gff': get_data_id(gff),
        }

        # ``genome`` and ``mask_file`` may be given as data objects, so
        # they are reduced to ids the same way as ``gff``.
        if genome is not None:
            inputs['genome'] = get_data_id(genome)

        if mask_file is not None:
            inputs['mask_file'] = get_data_id(mask_file)

        if library_type is not None:
            inputs['library_type'] = library_type

        if multi_read_correct is not None:
            inputs['multi_read_correct'] = multi_read_correct

        if threads is not None:
            inputs['threads'] = threads

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant',
                                                  input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
Exemple #4
0
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ for cuffquants of the given resource.

    This method runs `Cuffnorm`_ process once, on the cuffquant objects
    of all samples obtained from ``resource``, with ``annotation`` and
    ``use_ercc`` parameters specified in arguments.

    If ``resource`` is a collection, the result is added to it. If it
    is a relation, the result is added to the relation's collection.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object

    """
    samples = get_samples(resource)

    # The connection is resolved from all objects taking part in the run.
    resolwe = get_resolwe(annotation, *samples)

    inputs = {
        'cuffquant': [get_data_id(sample.get_cuffquant()) for sample in samples],
        'annotation': get_data_id(annotation),
    }

    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ for cuffquants of the given resource.

    This method runs `Cuffnorm`_ process once, on the cuffquant objects
    of all samples obtained from ``resource``, with ``annotation`` and
    ``use_ercc`` parameters specified in arguments.

    If ``resource`` is a collection, the result is added to it. If it
    is a relation, the result is added to the relation's collection.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object

    """
    samples = get_samples(resource)

    # The connection is resolved from all objects taking part in the run.
    resolwe = get_resolwe(annotation, *samples)

    inputs = {
        'cuffquant': [get_data_id(sample.get_cuffquant()) for sample in samples],
        'annotation': get_data_id(annotation),
    }

    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def cuffquant(resource, annotation, genome=None, mask_file=None,
              library_type=None, multi_read_correct=None):
    """Run Cuffquant_ for samples of the given resource.

    This method runs `Cuffquant`_ process on the bam of every sample
    obtained from ``resource``, with ``annotation`` specified in
    arguments. Parameters genome, mask_file, library_type and
    multi_read_correct are optional and are only passed to the process
    when they are given.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource whose samples will be processed
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param mask_file: mask file to use in process
    :type mask_file: `~resdk.resources.data.Data`
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :return: list of cuffquant objects, one per processed sample

    """
    results = []
    for sample in get_samples(resource):
        inputs = {
            'alignment': sample.get_bam().id,
            'annotation': get_data_id(annotation),
        }

        # ``genome`` and ``mask_file`` are data objects, so they are
        # reduced to ids the same way as ``annotation``.
        if genome is not None:
            inputs['genome'] = get_data_id(genome)

        if mask_file is not None:
            inputs['mask_file'] = get_data_id(mask_file)

        if library_type is not None:
            inputs['library_type'] = library_type

        if multi_read_correct is not None:
            inputs['multi_read_correct'] = multi_read_correct

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant', input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
Exemple #7
0
def bamplot(resource,
            genome,
            input_gff=None,
            input_region=None,
            stretch_input=None,
            color=None,
            sense=None,
            extension=None,
            rpm=None,
            yscale=None,
            names=None,
            plot=None,
            title=None,
            scale=None,
            bed=None,
            multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` must be given. Optional parameters are only passed
    to the process when they are given.

    If ``resource`` is a collection, the result is added to it. If it
    is a relation, the result is added to the relation's collection.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param list resource: resource from which bam objects will be get
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9 and MM10)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands,
        default maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis scaling,
        default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: bed file or list of bed files passed to the process
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :return: bamplot data object
    :raises KeyError: if none or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not valid

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError('Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    # Objects collected here are used to resolve the connection.
    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Plain parameters are forwarded under their own name, but only when
    # they were given, so that process defaults stay in effect.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
        ('input_region', input_region),
    )
    inputs.update(
        (key, value) for key, value in optional_inputs if value is not None
    )

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if bed is not None:
        # A single bed object is wrapped in a list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. All other parameters are optional and are only passed
    to the process when they are given.

    The way the function works depends on the resource. If a collection can
    be determined for it, 'compare' relations of that collection are used,
    otherwise 'compare' relations are filtered by the ids of the given
    samples. For every relation, cuffquants of the given samples are split
    by the partition label ('case' or 'control'); samples of the relation
    that are not among the given samples are discarded. Relations left
    without a case or without a control are skipped.

    If ``resource`` is a collection, results are added to it. If it is
    a relation, results are added to the relation's collection.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples will be compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param str library_normalization: options are: geometric, classic-fpkm,
        quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :return: list of cuffdiff objects, one per processed relation
    :raises ValueError: if a partition of a given sample has a label
        other than 'case' or 'control', or if no relation was processed

    """
    base_inputs = {'annotation': get_data_id(annotation)}

    input_objects = [annotation]

    if genome is not None:
        base_inputs['genome'] = genome.id
        input_objects.append(genome)

    if multi_read_correct is not None:
        base_inputs['multi_read_correct'] = multi_read_correct

    if fdr is not None:
        base_inputs['fdr'] = fdr

    if library_type is not None:
        base_inputs['library_type'] = library_type

    if library_normalization is not None:
        base_inputs['library_normalization'] = library_normalization

    if dispersion_method is not None:
        base_inputs['dispersion_method'] = dispersion_method

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]
    # Set is used for the repeated membership checks below.
    selected_ids = set(sample_ids)

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    collection_id = get_resource_collection(resource)

    relation_filter = {}
    if collection_id:
        relation_filter['collection'] = collection_id
    else:
        relation_filter['entity'] = sample_ids

    relations = resolwe.relation.filter(
        type='compare',
        **relation_filter
    )

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []
        for partition in relation.partitions:
            sample = resolwe.sample.get(partition['entity'])
            label = partition['label']
            if sample.id not in selected_ids:
                continue

            if label == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif label == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Label different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id)
                )

        if not case or not control:
            continue

        # Every run gets its own inputs, so that earlier runs are not
        # affected by values set for later relations.
        inputs = dict(base_inputs, case=case, control=control)

        cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=inputs)
        cuffdiff_objects.append(cuffdiff_obj)

        if is_collection(resource):
            resource.add_data(cuffdiff_obj)
        elif is_relation(resource):
            resource.collection.add_data(cuffdiff_obj)

    if not cuffdiff_objects:
        if not relations:
            raise ValueError("No relation containing all of the given samples was found")
        else:
            raise ValueError(
                "No suitable relation was found (given samples all have either 'case' label "
                "or 'control' label)"
            )

    return cuffdiff_objects
Exemple #9
0
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    This method runs `ROSE2`_ process with ``tss`` and ``stitch``
    parameters specified in arguments.

    Separate process is run for each bed file on the sample. To run
    process only on subset of those files, list them in ``beds``
    argument (a single object is wrapped in a list automatically).

    If ``use_background`` argument is set to ``True``, bam file from
    background sample is passed to the process as the control.
    Background samples themselves are skipped in that case, unless the
    process is run directly on such sample.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: resource (or list of resources) whose samples
        will be processed
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on, if not
        given processes for all bed files will be run
    :return: list of rose2 objects, one per processed bed file

    """
    resources = resource if isinstance(resource, list) else [resource]
    rose_objects = []

    for item in resources:
        # Background lookup is limited to the resource's collection when
        # one can be determined.
        background_kwargs = {}
        if use_background:
            collection_id = get_resource_collection(item)
            if collection_id:
                background_kwargs['collection'] = collection_id

        for sample in get_samples(item):
            process_input = {'rankby': sample.get_bam().id}

            if tss is not None:
                process_input['tss'] = tss
            if stitch is not None:
                process_input['stitch'] = stitch

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if sample.is_background and not is_sample(item):
                    continue

                control_sample = sample.get_background(**background_kwargs)
                process_input['control'] = control_sample.get_bam().id

            macs_beds = sample.get_macs()
            if beds is not None:
                # Convert objects to the list of their ids
                wanted = beds if isinstance(beds, list) else [beds]
                wanted_ids = [get_data_id(wanted_bed) for wanted_bed in wanted]
                macs_beds = macs_beds.filter(id__in=wanted_ids)

            for bed in macs_beds:
                process_input['input'] = bed.id

                rose_obj = sample.resolwe.get_or_run(slug='rose2',
                                                     input=process_input)
                sample.add_data(rose_obj)
                rose_objects.append(rose_obj)

    return rose_objects
Exemple #10
0
    def run_rose2(self, use_background=True, background_slug='',
                  genome='HG19', tss=None, stitch=None, beds=None):
        """Run ``ROSE 2`` process on the sample.

        This method runs `ROSE2`_ process with ``genome``, ``tss`` and
        ``stitch`` parameters specified in arguments.

        Separate process is run for each bed file on the sample. To
        run process only on subset of those files, list them in ``beds``
        argument (a single object is wrapped in a list automatically).

        If ``use_background`` argument is set to ``True``, bam file
        from background sample is passed to the process as the control.
        When no background is returned, the process is run without a
        control and this is logged.

        .. _ROSE2:
            http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

        :param bool use_background: if set to ``True``, background
            sample will be used in the process
        :param str background_slug: passed to ``get_background`` when
            the background sample is looked up
        :param str genome: Genome used in the process (options are HG18,
            HG19, MM9 and MM10), default is HG19
        :param int tss: TSS exclusion used in process
        :param int stitch: Stitch used in process
        :param list beds: subset of bed files to run process on, if
            not given processes for all bed files will be run
        :return: list of rose2 objects, one per processed bed file
        :raises KeyError: if ``genome`` is not one of the valid options

        """
        supported_genomes = ['HG18', 'HG19', 'MM9', 'MM10']
        if genome not in supported_genomes:
            message = ('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(supported_genomes)))
            raise KeyError(message)

        process_input = {'genome': genome, 'rankby': self.get_bam().id}

        if tss is not None:
            process_input['tss'] = tss
        if stitch is not None:
            process_input['stitch'] = stitch

        if use_background:
            control_sample = self.get_background(background_slug,
                                                 fail_silently=True)
            if not control_sample:
                self.logger.info('Rose-2 will run without a control sample.')
            else:
                process_input['control'] = control_sample.get_bam().id

        macs_beds = self.get_macs()
        if beds is not None:
            # Convert objects to the list of their ids
            wanted = beds if isinstance(beds, list) else [beds]
            wanted_ids = [get_data_id(wanted_bed) for wanted_bed in wanted]
            macs_beds = macs_beds.filter(id__in=wanted_ids)

        rose_objects = []
        for bed in macs_beds:
            process_input['input'] = bed.id

            rose_obj = self.resolwe.get_or_run(slug='rose2',
                                               input=process_input)
            self.add_data(rose_obj)
            rose_objects.append(rose_obj)

        return rose_objects
Exemple #11
0
def bowtie2(resource, genome, mode=None, speed=None, use_se=None, discordantly=None, rep_se=None,
            minins=None, maxins=None, trim_5=None, trim_3=None, trim_iter=None, trim_nucl=None,
            rep_mode=None, k_reports=None):
    """Run bowtie2 aligner on given resource.

    Align reads files of given resource to the given genome using the
    `Bowtie2`_ aligner. If reads were already aligned, existing objects
    will be returned. Optional parameters are only passed to the process
    when they are given. Each alignment is also attached to its sample.

    .. _Bowtie2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-bowtie2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :param str mode: alignment mode (options are: --end-to-end,
        --local), default is --end-to-end
    :param str speed: speed vs sensitivity (options are: --very-fast,
        --fast, --sensitive, --very-sensitive), default is --sensitive
    :param bool use_se: map as single-ended (for paired-end reads
        only), default is False
    :param bool discordantly: report discordantly matched read, default
        is True
    :param bool rep_se: report single ended, default is True
    :param int minins: minimum fragment length, default is 0
    :param int maxins: maximum fragment length, default is 500
    :param int trim_5: number of bases to trim from 5', default is 0
    :param int trim_3: number of bases to trim from 3', default is 0
    :param int trim_iter: number of iterations, default is 0
    :param int trim_nucl: number of bases to trim from 3' in each
        iteration, default is 2
    :param str rep_mode: report mode (options are: def, k, a), default
        is def
    :param int k_reports: number of reports (for -k mode only), default
        is 5
    :return: list of alignment objects, one per processed sample

    """
    process_input = {'genome': get_data_id(genome)}

    # Each optional parameter is forwarded under its own name.
    optional_inputs = (
        ('mode', mode),
        ('speed', speed),
        ('use_se', use_se),
        ('discordantly', discordantly),
        ('rep_se', rep_se),
        ('minins', minins),
        ('maxins', maxins),
        ('trim_5', trim_5),
        ('trim_3', trim_3),
        ('trim_iter', trim_iter),
        ('trim_nucl', trim_nucl),
        ('rep_mode', rep_mode),
        ('k_reports', k_reports),
    )
    process_input.update(
        (name, value) for name, value in optional_inputs if value is not None
    )

    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            # Only the reads differ between samples.
            process_input['reads'] = sample.get_reads().id

            alignment = sample.resolwe.get_or_run(
                slug='alignment-bowtie2',
                input=process_input,
            )
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ for cuffquants of the given resource.

    This method runs `Cuffnorm`_ process once, on the cuffquant objects
    of all samples obtained from ``resource``, with ``annotation`` and
    ``use_ercc`` parameters specified in arguments.

    Replicates are determined from 'group' relations: every sample must
    match exactly one such relation, and samples matching the same
    relation get the same replicate number. Numbers are assigned in the
    order in which relations are first seen.

    If ``resource`` is a collection, the result is added to it. If it
    is a relation, the result is added to the relation's collection.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object
    :raises LookupError: if a sample does not match exactly one 'group'
        relation

    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    group_numbers = {}
    replicates = []
    for sample in samples:
        relations = resolwe.relation.filter(
            type='group',
            entity=[sample.id],
            **relation_filter
        )

        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name)
            )

        # First occurrence of a relation assigns it the next number.
        relation_id = relations[0].id
        replicates.append(
            group_numbers.setdefault(relation_id, str(len(group_numbers)))
        )

    process_input = {
        'cuffquant': cuffquant_ids,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        process_input['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=process_input)

    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
Exemple #13
0
def cuffdiff(resource,
             annotation,
             genome=None,
             multi_read_correct=None,
             fdr=None,
             library_type=None,
             library_normalization=None,
             dispersion_method=None,
             threads=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method prepares inputs of the `Cuffdiff`_ process with
    ``annotation`` specified in arguments. All other parameters are
    optional and are only added to the inputs when they are given.

    The way the function works depends on the resource. If a collection can
    be determined for it, 'compare' relations labeled 'case-control' of that
    collection are used, otherwise such relations are filtered by the ids of
    the given samples. Samples in a relation that are not among the given
    samples are discarded; cuffquants of the remaining ones are split by
    their position ('case' or 'control').

    NOTE(review): the visible body ends inside the loop over relations,
    before any process is run or any value is returned - the rest of the
    function appears to be missing from this source.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples will be compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param str library_normalization: options are: geometric, classic-fpkm,
        quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :param int threads: use this many processor threads
    :raises ValueError: if a position other than 'case' or 'control' is
        found for one of the given samples

    """
    inputs = {'annotation': get_data_id(annotation)}

    # Objects collected here are used to resolve the connection.
    input_objects = [annotation]

    if genome is not None:
        # NOTE(review): ``genome`` is stored as given, while ``annotation``
        # is reduced to an id with ``get_data_id`` - confirm this is intended.
        inputs['genome'] = genome
        input_objects.append(genome)

    if multi_read_correct is not None:
        inputs['multi_read_correct'] = multi_read_correct

    if fdr is not None:
        inputs['fdr'] = fdr

    if library_type is not None:
        inputs['library_type'] = library_type

    if library_normalization is not None:
        inputs['library_normalization'] = library_normalization

    if dispersion_method is not None:
        inputs['dispersion_method'] = dispersion_method

    if threads is not None:
        inputs['threads'] = threads

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    collection_id = get_resource_collection(resource)

    # Relations are looked up by collection when one is known, otherwise
    # by the ids of the given samples.
    relation_filter = {}
    if collection_id:
        relation_filter['collection'] = collection_id
    else:
        relation_filter['entity'] = sample_ids

    relations = resolwe.relation.filter(type='compare',
                                        label='case-control',
                                        **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []
        for sample, position in zip(relation.samples, relation.positions):
            # Samples that were not given in ``resource`` are discarded.
            if sample.id not in sample_ids:
                continue

            if position == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif position == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Position different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id))
Exemple #14
0
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None,
                  extension=None, sense=None, skip_plot=None,
                  black_list=None, threads=None):
    """Run ``bamliquidator`` on the bam files of the resource.

    A single `bamliquidator`_ process is run for the bam objects of all
    samples in ``resource``. The analysis type is derived from the
    arguments: ``bin`` when ``bin_size`` is given, otherwise ``gtf`` or
    ``bed`` depending on the process type of ``regions``.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param resource: resource from which bam objects are collected
    :param str cell_type: name of the cell type that will be given in
        counts tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: extend reads by this number of bp. Default
        is 200.
    :param str sense: mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: ``True`` to skip the plot
    :param list black_list: one or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: number of CPUs
    :return: data object returned by ``get_or_run``
    :raises KeyError: if not exactly one of ``bin_size`` and
        ``regions`` is given, if ``regions`` is not a data object or
        if its process type is neither gtf annotation nor bed

    """
    if not xor(bin_size, regions):
        raise KeyError(
            'Exactly one of `bin_size` and `regions` parameters must be given.'
        )

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    # Bam objects are kept to resolve the connection later, while only
    # their ids are passed to the process.
    bam_objects = [sample.get_bam() for sample in get_samples(resource)]
    input_objects = list(bam_objects)
    inputs = {'bam': [get_data_id(bam_object) for bam_object in bam_objects]}

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:
        # Region mode: analysis type follows the type of ``regions``.
        analysis_types = {
            'data:annotation:gtf:': 'gtf',
            'data:bed:': 'bed',
        }
        if regions.process_type not in analysis_types:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )
        inputs['analysis_type'] = analysis_types[regions.process_type]

        input_objects.append(regions)
        inputs['regions_file_gtf'] = get_data_id(regions)

    # Optional parameters are only sent when explicitly given.
    optional_inputs = (
        ('cell_type', cell_type),
        ('extension', extension),
        ('sense', sense),
        ('skip_plot', skip_plot),
        ('black_list', black_list),
        ('threads', threads),
    )
    for input_name, input_value in optional_inputs:
        if input_value is not None:
            inputs[input_name] = input_value

    connection = get_resolwe(*input_objects)
    result = connection.get_or_run(slug='bamliquidator', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
Exemple #15
0
    def test_get_data_id(self):
        """Check ``get_data_id`` for a ``Data`` object and a plain id."""
        data_object = Data(id=1, resolwe=MagicMock())
        # Set explicitly, the value given to the constructor is
        # overridden when the object is initialized.
        data_object.id = 1

        for argument, expected_id in ((data_object, 1), (2, 2)):
            self.assertEqual(get_data_id(argument), expected_id)
def cuffnorm(resource, annotation, use_ercc=None, threads=None):
    """Run `Cuffnorm`_ on cuffquant objects of samples in the resource.

    Every sample must belong to exactly one ``group`` relation with
    label ``replicates`` (searched within the collection of the
    resource, if there is one). Samples sharing a relation are passed
    to the process as replicates of the same group and the relation
    ids are used as group labels.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: int or `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
        (passed to the process as ``useERCC``)
    :param int threads: number of threads passed to the process
    :return: data object returned by ``get_or_run``
    :raises LookupError: if a sample does not have exactly one
        ``replicates`` group relation

    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    group_labels = []
    replicate_groups = []
    # Maps relation id to the sequential group number (as string).
    group_numbers = {}
    for sample in samples:
        relations = resolwe.relation.filter(type='group',
                                            label='replicates',
                                            entity=[sample.id],
                                            **relation_filter)

        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name))
        relation = relations[0]

        # The default is computed before insertion, so a new relation
        # gets the next free group number.
        replicate_groups.append(
            group_numbers.setdefault(relation.id, str(len(group_numbers))))

        label = str(relation.id)
        if label not in group_labels:
            group_labels.append(label)

    inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicate_groups,
        'annotation': get_data_id(annotation),
        'labels': group_labels,
    }

    # Optional parameters are only sent when explicitly given.
    for input_name, input_value in (('useERCC', use_ercc), ('threads', threads)):
        if input_value is not None:
            inputs[input_name] = input_value

    result = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
Exemple #17
0
def bamliquidator(resource,
                  cell_type=None,
                  bin_size=None,
                  regions=None,
                  extension=None,
                  sense=None,
                  skip_plot=None,
                  black_list=None,
                  threads=None):
    """Run ``bamliquidator`` on bam objects of samples in the resource.

    One `bamliquidator`_ process is started with the bam objects of
    all samples. Giving ``bin_size`` selects the ``bin`` analysis type,
    giving ``regions`` selects ``gtf`` or ``bed`` according to the
    process type of that object.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param resource: resource from which bam objects will be taken
    :param str cell_type: the name of cell type that will be given in
        counts tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: extends reads by number of bp. Default is
        200.
    :param str sense: mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: ``True`` to skip the plot
    :param list black_list: one or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: number of CPUs
    :return: data object returned by ``get_or_run``
    :raises KeyError: when the combination of ``bin_size`` and
        ``regions`` is invalid, ``regions`` is not a data object or
        has an unsupported process type

    """
    if not xor(bin_size, regions):
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    sample_bams = [sample.get_bam() for sample in get_samples(resource)]

    # Objects used to determine the connection, the process itself only
    # receives ids.
    input_objects = []
    input_objects.extend(sample_bams)

    inputs = {}
    inputs['bam'] = [get_data_id(sample_bam) for sample_bam in sample_bams]

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:
        # ``regions`` is given, its type decides the analysis type.
        regions_type = regions.process_type
        if regions_type == 'data:annotation:gtf:':
            analysis_type = 'gtf'
        elif regions_type == 'data:bed:':
            analysis_type = 'bed'
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )
        inputs['analysis_type'] = analysis_type

        input_objects.append(regions)
        inputs['regions_file_gtf'] = get_data_id(regions)

    # Forward only those optional parameters that were actually given.
    inputs.update(
        (input_name, input_value)
        for input_name, input_value in (
            ('cell_type', cell_type),
            ('extension', extension),
            ('sense', sense),
            ('skip_plot', skip_plot),
            ('black_list', black_list),
            ('threads', threads),
        )
        if input_value is not None
    )

    resolwe = get_resolwe(*input_objects)
    liquidator_data = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    # Results are added to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(liquidator_data)
    elif is_relation(resource):
        resource.collection.add_data(liquidator_data)

    return liquidator_data
Exemple #18
0
    def run_bamplot(self,
                    bam,
                    genome,
                    input_gff=None,
                    input_region=None,
                    stretch_input=None,
                    color=None,
                    sense=None,
                    extension=None,
                    rpm=None,
                    yscale=None,
                    names=None,
                    plot=None,
                    title=None,
                    scale=None,
                    bed=None,
                    multi_page=None):
        """Run the ``bamplot`` process.

        Exactly one of ``input_gff`` and ``input_region`` must be given.
        Optional parameters are sent to the process only when they are
        not ``None``.

        :param bam: bam object (or id), or a list of them
        :param str genome: one of HG18, HG19, MM8, MM9 and MM10
        :param input_gff: annotation object (or its id)
        :param str input_region: genomic region
        :param stretch_input: passed to the process as ``stretch_input``
        :param color: passed to the process as ``color``
        :param sense: passed to the process as ``sense``
        :param extension: passed to the process as ``extension``
        :param rpm: passed to the process as ``rpm``
        :param yscale: passed to the process as ``yscale``
        :param names: passed to the process as ``names``
        :param plot: passed to the process as ``plot``
        :param title: passed to the process as ``title``
        :param scale: passed to the process as ``scale``
        :param bed: bed object (or id), or a list of them
        :param multi_page: passed to the process as ``multi_page``
        :return: object returned by ``self.get_or_run``
        :raises KeyError: if neither or both of ``input_gff`` and
            ``input_region`` are given, or if ``genome`` is not valid

        """
        if not input_gff and not input_region:
            raise KeyError('Please specify `input_gff` or `input_region`.')
        if input_gff and input_region:
            raise KeyError('Please specify only one of `input_gff` and `input_region`.')

        valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
        if genome not in valid_genomes:
            raise KeyError(
                'Invalid `genome`, please use one of the following: '
                '{}'.format(', '.join(valid_genomes)))

        # A single object is wrapped so that the process always gets a list.
        bam_objects = bam if isinstance(bam, list) else [bam]

        inputs = {
            'genome': genome,
            'bam': [get_data_id(bam_obj) for bam_obj in bam_objects],
        }

        # Each optional parameter is sent under its own name. Previously
        # ``sense`` was written to the ``scale`` key and ``stretch_input``
        # was silently dropped.
        optional_inputs = (
            ('stretch_input', stretch_input),
            ('color', color),
            ('sense', sense),
            ('extension', extension),
            ('rpm', rpm),
            ('yscale', yscale),
            ('names', names),
            ('plot', plot),
            ('title', title),
            ('scale', scale),
            ('multi_page', multi_page),
            ('input_region', input_region),
        )
        for input_name, input_value in optional_inputs:
            if input_value is not None:
                inputs[input_name] = input_value

        if input_gff is not None:
            inputs['input_gff'] = get_data_id(input_gff)

        if bed is not None:
            bed_objects = bed if isinstance(bed, list) else [bed]
            inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

        bamplot = self.get_or_run(slug='bamplot', input=inputs)

        return bamplot
Exemple #19
0
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    The `ROSE2`_ process is run once for every bed (macs) object of
    every sample in ``resource``, with ``tss`` and ``stitch`` passed
    to the process when they are given.

    To run the process only on a subset of bed objects, list them in
    the ``beds`` argument (a single object is wrapped in a list).

    If ``use_background`` is ``True``, bam object of the background
    sample is passed to the process as the control. In that case
    background samples themselves are skipped, unless ``resource`` is
    a sample.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: resource (or a list of resources) whose samples
        will be processed
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: stitch used in process
    :param list beds: subset of bed files to run process on, if not
        given processes for all bed files will be run
    :return: list of objects returned by ``get_or_run``

    """
    resources = resource if isinstance(resource, list) else [resource]

    rose_objects = []
    for single_resource in resources:

        # Background samples are searched within the collection of the
        # resource, when there is one.
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(single_resource)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(single_resource):
            inputs = {'rankby': sample.get_bam().id}

            for input_name, input_value in (('tss', tss), ('stitch', stitch)):
                if input_value is not None:
                    inputs[input_name] = input_value

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if sample.is_background and not is_sample(single_resource):
                    continue

                control_sample = sample.get_background(**background_filter)
                inputs['control'] = control_sample.get_bam().id

            sample_beds = sample.get_macs()
            if beds is not None:
                # Narrow bed objects down to the requested ids.
                requested = beds if isinstance(beds, list) else [beds]
                requested_ids = [get_data_id(requested_bed) for requested_bed in requested]
                sample_beds = sample_beds.filter(id__in=requested_ids)

            for sample_bed in sample_beds:
                inputs['input'] = sample_bed.id

                rose_object = sample.resolwe.get_or_run(slug='rose2', input=inputs)
                sample.add_data(rose_object)
                rose_objects.append(rose_object)

    return rose_objects
Exemple #20
0
    def test_get_data_id(self):
        """Check that ``get_data_id`` handles data objects and raw ids."""
        # The id is assigned again, because the one given to the
        # constructor is overridden when the object is initialized.
        data_object = Data(id=1, resolwe=MagicMock())
        data_object.id = 1

        id_from_object = get_data_id(data_object)
        self.assertEqual(id_from_object, 1)

        id_from_int = get_data_id(2)
        self.assertEqual(id_from_int, 2)
Exemple #21
0
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None, color=None,
            sense=None, extension=None, rpm=None, yscale=None, names=None, plot=None, title=None,
            scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams of all samples in the
    resource, genome and gff or region specified in arguments. Exactly
    one of ``input_gff`` and ``input_region`` must be given. Optional
    parameters are sent to the process only when they are not ``None``.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param resource: resource from which bam objects will be taken
    :param str genome: genome used in the process (options are HG18,
        HG19, MM8, MM9, MM10, RN4 and RN6)
    :param input_gff: annotation object (or its id)
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands,
        default maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis scaling,
        default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: bed object (or id), or a list of them, passed to the
        process as the ``bed`` input
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :return: data object returned by ``get_or_run``
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not valid

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError('Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10', 'RN4', 'RN6']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    # Objects are kept to resolve the connection, the process itself
    # only receives their ids.
    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects = list(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Each optional parameter is sent under its own name. Previously
    # ``sense`` was written to the ``scale`` key and ``stretch_input``
    # was silently dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
        ('input_region', input_region),
    )
    for input_name, input_value in optional_inputs:
        if input_value is not None:
            inputs[input_name] = input_value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if bed is not None:
        # A single object is wrapped so that the process always gets a list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj