def test_get_samples(self):
        """Check ``get_samples`` for collections, data objects and samples."""
        # A single collection: the samples stored on it are expected back.
        coll = Collection(id=1, resolwe=MagicMock())
        coll._samples = ['sample_1', 'sample_2']
        self.assertEqual(get_samples(coll), ['sample_1', 'sample_2'])

        # A list of collections: one flat list of samples is expected.
        first_coll = Collection(id=1, resolwe=MagicMock())
        second_coll = Collection(id=2, resolwe=MagicMock())
        first_coll._samples = ['sample_1']
        second_coll._samples = ['sample_2']
        self.assertEqual(get_samples([first_coll, second_coll]),
                         ['sample_1', 'sample_2'])

        # A single data object: its sample wrapped in a list is expected.
        single_data = Data(id=1, resolwe=MagicMock())
        single_data._sample = 'sample_1'
        self.assertEqual(get_samples(single_data), ['sample_1'])

        # A list of data objects.
        first_data = Data(id=1, resolwe=MagicMock())
        second_data = Data(id=2, resolwe=MagicMock())
        first_data._sample = 'sample1'
        second_data._sample = 'sample2'
        self.assertEqual(get_samples([first_data, second_data]),
                         ['sample1', 'sample2'])

        # A data object without a sample must be rejected.
        filter_stub = {'sample.filter.return_value': None}
        orphan_data = Data(id=1, resolwe=MagicMock(**filter_stub))
        orphan_data._sample = None
        self.assertRaises(TypeError, get_samples, orphan_data)

        # A sample is returned as a one-element list.
        lone_sample = Sample(id=1, resolwe=MagicMock())
        self.assertEqual(get_samples(lone_sample), [lone_sample])

        # A list of samples is returned as given.
        sample_pair = [
            Sample(id=1, resolwe=MagicMock()),
            Sample(id=3, resolwe=MagicMock()),
        ]
        self.assertEqual(get_samples(list(sample_pair)), sample_pair)
Beispiel #2
0
def prepare_geo_rnaseq(resource, name=None):
    """Run ``Prepare GEO - RNA-Seq`` process on the resource.

    The reads and expression data object of every sample obtained from
    ``resource`` are collected and passed to the ``prepare-geo-rnaseq``
    process through ``get_or_run``.

    :param resource: resource on which prepare_geo_rnaseq will be run
        (a single collection or a list of samples)
    :param str name: name of the prepare GEO tarball and table; when
        falsy, the name returned by ``get_name_collection`` is used
    :return: data object returned by ``get_or_run``

    """
    sample_list = get_samples(resource)
    api = get_resolwe(*sample_list)

    read_ids, expression_ids, seen_collections = [], [], set()
    for item in sample_list:
        read_ids.append(item.get_reads().id)
        expression_ids.append(item.get_expression().id)
        seen_collections.add(get_resource_collection(item))

    default_name, target_collection = get_name_collection(seen_collections, api)

    geo_data = api.get_or_run(
        slug='prepare-geo-rnaseq',
        input={
            'reads': read_ids,
            'expressions': expression_ids,
            'name': name if name else default_name,
        }
    )

    # Attach the result only when a collection was determined.
    if target_collection:
        target_collection.add_data(geo_data)

    return geo_data
Beispiel #3
0
def hisat2(resource, genome):
    """Run hisat2 aligner on given resource.

    For every sample of the resource the ``alignment-hisat2`` process
    (`Hisat2`_) is requested through ``get_or_run`` with the sample's
    reads and the given genome, and the resulting data object is added
    to the sample.

    .. _Hisat2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-hisat2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment data objects, one per sample

    """
    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            alignment = sample.resolwe.get_or_run(
                slug='alignment-hisat2',
                input={
                    'reads': sample.get_reads().id,
                    'genome': get_data_id(genome),
                }
            )
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
Beispiel #4
0
def bowtie2(resource, genome):
    """Run bowtie2 aligner on given resource.

    For every sample of the resource the ``alignment-bowtie2`` process
    is requested through ``get_or_run`` with the sample's reads and the
    given genome, and the resulting data object is added to the sample.

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment data objects, one per sample

    """
    if isinstance(resource, list):
        resources = resource
    else:
        resources = [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            process_inputs = {
                'reads': sample.get_reads().id,
                'genome': get_data_id(genome),
            }
            alignment = sample.resolwe.get_or_run(
                slug='alignment-bowtie2', input=process_inputs)

            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def prepare_geo_rnaseq(resource, name=None):
    """Run ``Prepare GEO - RNA-Seq`` process on the resource.

    Gathers the ids of the reads and expression data objects of all
    samples in ``resource`` and submits them to the
    ``prepare-geo-rnaseq`` process via ``get_or_run``.

    :param resource: resource on which prepare_geo_rnaseq will be run
        (a single collection or a list of samples)
    :param str name: name of the prepare GEO tarball and table; the
        name from ``get_name_collection`` is used when this is falsy
    :return: data object returned by ``get_or_run``

    """
    all_samples = get_samples(resource)
    client = get_resolwe(*all_samples)

    reads_ids = []
    expressions_ids = []
    collections_seen = set()
    for one_sample in all_samples:
        reads_ids.append(one_sample.get_reads().id)
        expressions_ids.append(one_sample.get_expression().id)
        collections_seen.add(get_resource_collection(one_sample))

    fallback_name, owner = get_name_collection(collections_seen, client)

    process_inputs = {'reads': reads_ids, 'expressions': expressions_ids}
    process_inputs['name'] = name or fallback_name

    result = client.get_or_run(slug='prepare-geo-rnaseq', input=process_inputs)

    # The result is attached only when a collection was determined.
    if owner:
        owner.add_data(result)

    return result
def cuffquant(resource,
              gff,
              genome=None,
              mask_file=None,
              library_type=None,
              multi_read_correct=None,
              threads=None):
    """Run Cuffquant_ for every sample of the resource.

    This method runs the `Cuffquant`_ process on the bam of each sample
    with the annotation given in ``gff``. All other arguments are
    optional; those left as ``None`` are not sent to the process, so
    the process defaults apply.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource whose samples will be processed
    :param gff: annotation file (id or data object)
    :type gff: int or `~resdk.resources.data.Data`
    :param genome: genome file (id or data object) used to run bias
        detection and correction algorithm
    :type genome: int or `~resdk.resources.data.Data`
    :param mask_file: mask file (id or data object)
    :type mask_file: int or `~resdk.resources.data.Data`
    :param str library_type: options are fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param int threads: use this many processor threads
    :return: list of cuffquant data objects, one per sample

    """
    results = []
    for sample in get_samples(resource):
        inputs = {
            'alignment': sample.get_bam().id,
            'gff': get_data_id(gff),
        }

        # ``genome`` and ``mask_file`` are documented as "int or Data",
        # just like ``gff``, so they are normalized the same way
        # instead of being sent as raw objects.
        if genome is not None:
            inputs['genome'] = get_data_id(genome)

        if mask_file is not None:
            inputs['mask_file'] = get_data_id(mask_file)

        if library_type is not None:
            inputs['library_type'] = library_type

        if multi_read_correct is not None:
            inputs['multi_read_correct'] = multi_read_correct

        if threads is not None:
            inputs['threads'] = threads

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant',
                                                  input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
Beispiel #7
0
def macs(resource, use_background=True, p_value=None):
    """Run ``MACS 1.4`` process on the resource.

    This method runs `MACS 1.4`_ process with ``p-value`` specified in
    arguments and ``bam`` file from the sample. The ``gsize`` input is
    derived with ``gsize_organism`` from the organism stored in the
    sample descriptor.

    If ``use_background`` argument is set to ``True``, ``bam`` file from
    background sample is passed to the process as the control.

    .. _MACS 1.4:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-macs14

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param float p_value: p-value used in the process
    :return: list of macs data objects
    :raises KeyError: if the organism lookup for a sample fails

    """
    # A single dict is reused for all runs; the per-sample keys are
    # overwritten on every iteration.
    process_inputs = {} if p_value is None else {'pvalue': p_value}
    resources = resource if isinstance(resource, list) else [resource]
    outputs = []

    for item in resources:

        bg_kwargs = {}
        if use_background:
            coll_id = get_resource_collection(item)
            if coll_id:
                bg_kwargs['collection'] = coll_id

        for sample in get_samples(item):
            process_inputs['treatment'] = sample.get_bam().id

            try:
                organism = sample.descriptor['sample']['organism']
                process_inputs['gsize'] = gsize_organism(organism)
            except KeyError:
                raise KeyError('{} is not annotated'.format(sample))

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if is_background(sample) and not is_sample(item):
                    continue

                control = sample.get_background(**bg_kwargs)
                process_inputs['control'] = control.get_bam().id

            macs_data = sample.resolwe.get_or_run(slug='macs14',
                                                  input=process_inputs)
            sample.add_data(macs_data)
            outputs.append(macs_data)

    return outputs
Beispiel #8
0
    def test_get_samples(self):
        """Check ``get_samples`` for collections and samples."""
        # A single collection: the samples stored on it are expected back.
        coll = Collection(id=1, resolwe=MagicMock())
        coll._samples = ['sample_1', 'sample_2']
        self.assertEqual(get_samples(coll), ['sample_1', 'sample_2'])

        # A list of collections: one flat list of samples is expected.
        first_coll = Collection(id=1, resolwe=MagicMock())
        second_coll = Collection(id=2, resolwe=MagicMock())
        first_coll._samples = ['sample_1']
        second_coll._samples = ['sample_2']
        self.assertEqual(get_samples([first_coll, second_coll]),
                         ['sample_1', 'sample_2'])

        # A sample is returned as a one-element list.
        lone_sample = Sample(id=1, resolwe=MagicMock())
        self.assertEqual(get_samples(lone_sample), [lone_sample])

        # A list of samples is returned as given.
        sample_pair = [
            Sample(id=1, resolwe=MagicMock()),
            Sample(id=3, resolwe=MagicMock()),
        ]
        self.assertEqual(get_samples(list(sample_pair)), sample_pair)
Beispiel #9
0
def prepare_geo_chipseq(resource, name=None):
    """Run ``Prepare GEO - ChIP-Seq`` process on the resource.

    Reads of all samples and the single ``macs`` data object of every
    non-background sample are collected and passed to the
    ``prepare-geo-chipseq`` process through ``get_or_run``.

    :param resource: resource on which prepare_geo_chipseq will be run
        (a single collection or a list of samples)
    :param str name: name of the prepare GEO tarball and table; when
        falsy, the name returned by ``get_name_collection`` is used
    :return: data object returned by ``get_or_run``
    :raises ValueError: if a non-background sample does not have
        exactly one ``macs`` data object, or if its background is not
        among the samples of ``resource``

    """
    samples = get_samples(resource)
    resolwe = get_resolwe(*samples)

    read_ids = []
    macs_ids = []
    collection_ids = set()

    for sample in samples:
        read_ids.append(sample.get_reads().id)

        # Background samples contribute their reads only.
        if sample.is_background:
            continue

        peaks = sample.get_macs()
        if not peaks:
            raise ValueError(
                "Sample {} has no `macs` data object!".format(sample))
        if len(peaks) != 1:
            raise ValueError(
                "Sample {} has more than one `macs` data objects!".format(
                    sample))
        macs_ids.append(peaks[0].id)

        if sample.background and sample.background not in samples:
            raise ValueError(
                "Background of the sample {} cannot be found in the resource you provided: "
                "{}!".format(sample, resource))

        collection_ids.add(get_resource_collection(sample))

    default_name, target_collection = get_name_collection(collection_ids, resolwe)

    geo_data = resolwe.get_or_run(
        slug='prepare-geo-chipseq',
        input={
            'reads': read_ids,
            'macs': macs_ids,
            'name': name if name else default_name,
        }
    )

    # Attach the result only when a collection was determined.
    if target_collection:
        target_collection.add_data(geo_data)

    return geo_data
Beispiel #10
0
def prepare_geo_chipseq(resource, name=None):
    """Run ``Prepare GEO - ChIP-Seq`` process on the resource.

    Collects the reads of every sample and the single ``macs`` data
    object of every non-background sample, then submits them to the
    ``prepare-geo-chipseq`` process via ``get_or_run``.

    :param resource: resource on which prepare_geo_chipseq will be run
        (a single collection or a list of samples)
    :param str name: name of the prepare GEO tarball and table; the
        name from ``get_name_collection`` is used when this is falsy
    :return: data object returned by ``get_or_run``
    :raises ValueError: if a non-background sample does not have
        exactly one ``macs`` data object, or if its background is not
        among the samples of ``resource``

    """
    all_samples = get_samples(resource)
    client = get_resolwe(*all_samples)

    reads_ids, peaks_ids, collections_seen = [], [], set()

    for one_sample in all_samples:
        reads_ids.append(one_sample.get_reads().id)

        if not one_sample.is_background:
            macs_objects = one_sample.get_macs()
            if not macs_objects:
                raise ValueError("Sample {} has no `macs` data object!".format(one_sample))
            if len(macs_objects) != 1:
                raise ValueError(
                    "Sample {} has more than one `macs` data objects!".format(one_sample)
                )

            peaks_ids.append(macs_objects[0].id)

            # The background must be part of the same resource.
            if one_sample.background and one_sample.background not in all_samples:
                raise ValueError(
                    "Background of the sample {} cannot be found in the resource you provided: "
                    "{}!".format(one_sample, resource)
                )

            collections_seen.add(get_resource_collection(one_sample))

    fallback_name, owner = get_name_collection(collections_seen, client)

    process_inputs = {'reads': reads_ids, 'macs': peaks_ids}
    process_inputs['name'] = name or fallback_name

    result = client.get_or_run(slug='prepare-geo-chipseq', input=process_inputs)

    if owner:
        owner.add_data(result)

    return result
Beispiel #11
0
def macs(resource, use_background=True, p_value=None):
    """Run ``MACS 1.4`` process on the resource.

    This method runs `MACS 1.4`_ process with ``p-value`` specified in
    arguments and the bam obtained with
    ``get_primary_bam(fallback_to_bam=True)`` from the sample.

    If ``use_background`` argument is set to ``True``, the bam obtained
    the same way from the background sample is passed to the process as
    the control.

    .. _MACS 1.4:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-macs14

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param float p_value: p-value used in the process
    :return: list of macs data objects

    """
    # A single dict is reused for all runs; the per-sample keys are
    # overwritten on every iteration.
    process_inputs = {} if p_value is None else {'pvalue': p_value}
    resources = resource if isinstance(resource, list) else [resource]
    outputs = []

    for item in resources:

        bg_kwargs = {}
        if use_background:
            coll_id = get_resource_collection(item)
            if coll_id:
                bg_kwargs['collection'] = coll_id

        for sample in get_samples(item):
            treatment = sample.get_primary_bam(fallback_to_bam=True)
            process_inputs['treatment'] = treatment.id

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if sample.is_background and not is_sample(item):
                    continue

                control_sample = sample.get_background(**bg_kwargs)
                control = control_sample.get_primary_bam(fallback_to_bam=True)
                process_inputs['control'] = control.id

            macs_data = sample.resolwe.get_or_run(slug='macs14',
                                                  input=process_inputs)
            sample.add_data(macs_data)
            outputs.append(macs_data)

    return outputs
Beispiel #12
0
def cuffquant(resource, annotation, genome=None, mask_file=None,
              library_type=None, multi_read_correct=None):
    """Run Cuffquant_ for every sample of the resource.

    This method runs the `Cuffquant`_ process on the bam of each sample
    with the given ``annotation``. All other arguments are optional;
    those left as ``None`` are not sent to the process, so the process
    defaults apply.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource whose samples will be processed
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param mask_file: mask file to use in process
    :type mask_file: `~resdk.resources.data.Data`
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :return: list of cuffquant data objects, one per sample

    """
    results = []
    for sample in get_samples(resource):
        inputs = {
            'alignment': sample.get_bam().id,
            'annotation': get_data_id(annotation),
        }

        # ``genome`` and ``mask_file`` are data objects just like
        # ``annotation``, so they are normalized the same way instead
        # of being sent as raw objects.
        if genome is not None:
            inputs['genome'] = get_data_id(genome)

        if mask_file is not None:
            inputs['mask_file'] = get_data_id(mask_file)

        if library_type is not None:
            inputs['library_type'] = library_type

        if multi_read_correct is not None:
            inputs['multi_read_correct'] = multi_read_correct

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant', input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
Beispiel #13
0
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on the cuffquant objects of the resource's samples.

    This method runs `Cuffnorm`_ process (through ``get_or_run``) with
    the cuffquant data object of every sample, the given ``annotation``
    and, when provided, ``use_ercc``. The result is added to
    ``resource`` if it is a collection, or to ``resource.collection``
    if it is a relation.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object

    """
    # NOTE(review): this filter is assembled but never read below -
    # confirm whether it (and the lookup) can be removed.
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = []
    for sample in samples:
        cuffquant_ids.append(get_data_id(sample.get_cuffquant()))

    process_inputs = {
        'cuffquant': cuffquant_ids,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        process_inputs['useERCC'] = use_ercc

    normalized = resolwe.get_or_run(slug='cuffnorm', input=process_inputs)

    if is_collection(resource):
        resource.add_data(normalized)
    elif is_relation(resource):
        resource.collection.add_data(normalized)

    return normalized
Beispiel #14
0
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on the cuffquant objects of the resource's samples.

    The cuffquant data object of each sample is collected and sent,
    together with ``annotation`` and the optional ``use_ercc`` flag, to
    the `Cuffnorm`_ process via ``get_or_run``. The resulting object is
    added to ``resource`` when it is a collection, or to
    ``resource.collection`` when it is a relation.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object

    """
    # NOTE(review): ``relation_filter`` is built here but not used
    # anywhere below - confirm whether it is still needed.
    relation_filter = {}
    owning_collection = get_resource_collection(resource)
    if owning_collection:
        relation_filter.update(collection=owning_collection)

    all_samples = get_samples(resource)

    client = get_resolwe(*([annotation] + list(all_samples)))

    quant_ids = [get_data_id(one.get_cuffquant()) for one in all_samples]

    payload = {'cuffquant': quant_ids, 'annotation': get_data_id(annotation)}
    if use_ercc is not None:
        payload['useERCC'] = use_ercc

    result = client.get_or_run(slug='cuffnorm', input=payload)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
Beispiel #15
0
    def test_get_samples(self):
        """Check ``get_samples`` for collections, data objects and samples."""
        # A single collection: the samples stored on it are expected back.
        collection = Collection(id=1, resolwe=MagicMock())
        collection._samples = ['sample_1', 'sample_2']
        self.assertEqual(get_samples(collection), ['sample_1', 'sample_2'])

        # A list of collections: one flat list of samples is expected.
        collection_1 = Collection(id=1, resolwe=MagicMock())
        collection_1._samples = ['sample_1']
        collection_2 = Collection(id=2, resolwe=MagicMock())
        collection_2._samples = ['sample_2']
        self.assertEqual(get_samples([collection_1, collection_2]),
                         ['sample_1', 'sample_2'])

        # A single data object: the API stub reports one entity and the
        # sample lookup is stubbed to return 'sample_1'.
        data = Data(id=1, resolwe=MagicMock())
        data.api(data.id).get = MagicMock(return_value={'entities': [7]})
        data.resolwe.sample.get = MagicMock(return_value='sample_1')
        self.assertEqual(get_samples(data), ['sample_1'])

        # A list of data objects. Each object stubs the endpoint for its
        # own id (previously both used ``data.id`` of the object above,
        # a copy-paste slip).
        data1 = Data(id=1, resolwe=MagicMock())
        data1.api(data1.id).get = MagicMock(return_value={'entities': [7]})
        data1.resolwe.sample.get = MagicMock(return_value='sample1')

        data2 = Data(id=2, resolwe=MagicMock())
        data2.api(data2.id).get = MagicMock(return_value={'entities': [8]})
        data2.resolwe.sample.get = MagicMock(return_value='sample2')
        self.assertEqual(get_samples([data1, data2]), ['sample1', 'sample2'])

        # A data object without a sample must be rejected.
        data = Data(id=1,
                    resolwe=MagicMock(**{'sample.get.return_value': None}))
        data._sample = None
        with self.assertRaises(TypeError):
            get_samples(data)

        # A sample is returned as a one-element list.
        sample = Sample(id=1, resolwe=MagicMock())
        self.assertEqual(get_samples(sample), [sample])

        # A list of samples is returned as given.
        sample_1 = Sample(id=1, resolwe=MagicMock())
        sample_2 = Sample(id=3, resolwe=MagicMock())
        self.assertEqual(get_samples([sample_1, sample_2]),
                         [sample_1, sample_2])
Beispiel #16
0
def bamsplit(resource, header=None, header2=None):
    """Run ``Bam split`` process on the resource.

    This method runs `Bam split`_ process (through ``run``) on the bam
    of every sample in the resource, placing the result in the sample's
    collections. According to the process notes, the alignment must
    come from a hybrid genome with a valid build ('hg19_dm6' or
    'mm10_dm6'); this function itself does not check that.

    .. _Bam split:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bam-split

    :param resource: resource (or list of resources) on which bam-split
        will be run
    :param header: SAM header data object for the primary BAM
    :type header: `~resdk.resources.data.Data`
    :param header2: SAM header data object for the secondary BAM
    :type header2: `~resdk.resources.data.Data`
    :return: list of data objects returned by ``run``, one per sample

    """
    # Only truthy headers are sent; the same dict is reused for every
    # run with its 'bam' entry overwritten per sample.
    process_inputs = {}
    for key, value in (('header', header), ('header2', header2)):
        if value:
            process_inputs[key] = value

    resources = resource if isinstance(resource, list) else [resource]

    outputs = []
    for item in resources:
        for sample in get_samples(item):
            process_inputs['bam'] = sample.get_bam().id
            outputs.append(
                sample.resolwe.run(
                    slug='bam-split',
                    input=process_inputs,
                    collections=sample.collections
                )
            )

    return outputs
Beispiel #17
0
def bamsplit(resource, header=None, header2=None):
    """Run ``Bam split`` process on the resource.

    For each sample of the resource the `Bam split`_ process is started
    with ``run`` on the sample's bam, and the result is placed in the
    sample's collections. Per the process notes, the alignment must be
    against a hybrid genome with a valid build ('hg19_dm6' or
    'mm10_dm6'); no such check is made in this function.

    .. _Bam split:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bam-split

    :param resource: resource (or list of resources) on which bam-split
        will be run
    :param header: SAM header data object for the primary BAM
    :type header: `~resdk.resources.data.Data`
    :param header2: SAM header data object for the secondary BAM
    :type header2: `~resdk.resources.data.Data`
    :return: list of data objects returned by ``run``, one per sample

    """
    if isinstance(resource, list):
        resource_list = resource
    else:
        resource_list = [resource]

    # Shared between all runs; only the 'bam' entry changes per sample.
    split_inputs = {}
    if header:
        split_inputs['header'] = header
    if header2:
        split_inputs['header2'] = header2

    split_results = []
    for one_resource in resource_list:
        for one_sample in get_samples(one_resource):
            split_inputs['bam'] = one_sample.get_bam().id
            split_data = one_sample.resolwe.run(
                slug='bam-split', input=split_inputs, collections=one_sample.collections)
            split_results.append(split_data)

    return split_results
Beispiel #18
0
    def test_get_samples(self):
        """Check ``get_samples`` for collections, data objects and samples."""
        # A single collection: the samples stored on it are expected back.
        collection = Collection(id=1, resolwe=MagicMock())
        collection._samples = ['sample_1', 'sample_2']
        self.assertEqual(get_samples(collection), ['sample_1', 'sample_2'])

        # A list of collections: one flat list of samples is expected.
        collection_1 = Collection(id=1, resolwe=MagicMock())
        collection_1._samples = ['sample_1']
        collection_2 = Collection(id=2, resolwe=MagicMock())
        collection_2._samples = ['sample_2']
        self.assertEqual(get_samples([collection_1, collection_2]), ['sample_1', 'sample_2'])

        # A single data object: the API stub reports one entity and the
        # sample lookup is stubbed to return 'sample_1'.
        data = Data(id=1, resolwe=MagicMock())
        data.api(data.id).get = MagicMock(return_value={'entities': [7]})
        data.resolwe.sample.get = MagicMock(return_value='sample_1')
        self.assertEqual(get_samples(data), ['sample_1'])

        # A list of data objects. Each object stubs the endpoint for its
        # own id (previously both used ``data.id`` of the object above,
        # a copy-paste slip).
        data1 = Data(id=1, resolwe=MagicMock())
        data1.api(data1.id).get = MagicMock(return_value={'entities': [7]})
        data1.resolwe.sample.get = MagicMock(return_value='sample1')

        data2 = Data(id=2, resolwe=MagicMock())
        data2.api(data2.id).get = MagicMock(return_value={'entities': [8]})
        data2.resolwe.sample.get = MagicMock(return_value='sample2')
        self.assertEqual(get_samples([data1, data2]), ['sample1', 'sample2'])

        # A data object without a sample must be rejected.
        data = Data(id=1, resolwe=MagicMock(**{'sample.get.return_value': None}))
        data._sample = None
        with self.assertRaises(TypeError):
            get_samples(data)

        # A sample is returned as a one-element list.
        sample = Sample(id=1, resolwe=MagicMock())
        self.assertEqual(get_samples(sample), [sample])

        # A list of samples is returned as given.
        sample_1 = Sample(id=1, resolwe=MagicMock())
        sample_2 = Sample(id=3, resolwe=MagicMock())
        self.assertEqual(get_samples([sample_1, sample_2]), [sample_1, sample_2])
def cuffnorm(resource, annotation, use_ercc=None, threads=None):
    """Run Cuffnorm_ on the cuffquant objects of the resource's samples.

    This method runs `Cuffnorm`_ process (through ``get_or_run``) with
    the cuffquant data object of every sample, the given ``annotation``
    and, when provided, ``use_ercc`` and ``threads``.

    Every sample must belong to exactly one ``group`` relation labeled
    ``replicates``. Samples sharing a relation get the same replicate
    index (indices are assigned in order of first appearance), and the
    relation ids, as strings, are used as labels.

    The result is added to ``resource`` if it is a collection, or to
    ``resource.collection`` if it is a relation.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: int or `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :param int threads: number of threads passed to the process
    :return: cuffnorm data object
    :raises LookupError: if a sample does not have exactly one
        ``replicates`` group relation

    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = []
    for sample in samples:
        cuffquant_ids.append(get_data_id(sample.get_cuffquant()))

    group_labels = []
    replicate_indices = []
    index_by_relation = {}
    for sample in samples:
        matches = resolwe.relation.filter(
            type='group',
            label='replicates',
            entity=[sample.id],
            **relation_filter
        )

        if len(matches) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name))
        relation_id = matches[0].id

        # First time a relation is seen it gets the next free index.
        if relation_id not in index_by_relation:
            index_by_relation[relation_id] = str(len(index_by_relation))
        replicate_indices.append(index_by_relation[relation_id])

        label = str(relation_id)
        if label not in group_labels:
            group_labels.append(label)

    process_inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicate_indices,
        'annotation': get_data_id(annotation),
        'labels': group_labels,
    }
    for key, value in (('useERCC', use_ercc), ('threads', threads)):
        if value is not None:
            process_inputs[key] = value

    normalized = resolwe.get_or_run(slug='cuffnorm', input=process_inputs)

    if is_collection(resource):
        resource.add_data(normalized)
    elif is_relation(resource):
        resource.collection.add_data(normalized)

    return normalized
Beispiel #20
0
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    This method runs `ROSE2`_ process (through ``get_or_run``) with the
    ``tss`` and ``stitch`` parameters specified in arguments.

    A separate process is run for each object returned by the sample's
    ``get_macs()``. To restrict the runs to a subset, list the wanted
    objects in ``beds`` (a single object is wrapped in a list
    automatically).

    If ``use_background`` argument is set to ``True``, bam file from
    background sample is passed to the process as the control.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on; if
        ``None``, the process is run for all of them
    :return: list of rose2 data objects

    """
    resources = resource if isinstance(resource, list) else [resource]
    outputs = []

    for item in resources:

        bg_kwargs = {}
        if use_background:
            coll_id = get_resource_collection(item)
            if coll_id:
                bg_kwargs['collection'] = coll_id

        for sample in get_samples(item):
            process_inputs = {'rankby': sample.get_bam().id}
            for key, value in (('tss', tss), ('stitch', stitch)):
                if value is not None:
                    process_inputs[key] = value

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if sample.is_background and not is_sample(item):
                    continue

                control_sample = sample.get_background(**bg_kwargs)
                process_inputs['control'] = control_sample.get_bam().id

            candidates = sample.get_macs()
            if beds is not None:
                # Convert objects to the list of their ids
                wanted = beds if isinstance(beds, list) else [beds]
                wanted_ids = [get_data_id(one_bed) for one_bed in wanted]
                candidates = candidates.filter(id__in=wanted_ids)

            # The same inputs dict is reused for each bed of the sample;
            # only the 'input' entry changes.
            for candidate in candidates:
                process_inputs['input'] = candidate.id

                rose_data = sample.resolwe.get_or_run(slug='rose2',
                                                      input=process_inputs)
                sample.add_data(rose_data)
                outputs.append(rose_data)

    return outputs
Beispiel #21
0
def bamplot(resource,
            genome,
            input_gff=None,
            input_region=None,
            stretch_input=None,
            color=None,
            sense=None,
            extension=None,
            rpm=None,
            yscale=None,
            names=None,
            plot=None,
            title=None,
            scale=None,
            bed=None,
            multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` has to be given.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param resource: resource from which bam objects will be get; if
        it is a collection (or a relation), the resulting object is
        added to it (or to its collection)
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9 and MM10)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands,
        default maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis scaling,
        default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: single bed object or a list of bed objects passed to
        the process as the ``bed`` input
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not supported
    :return: object returned by ``get_or_run`` for the ``bamplot``
        process

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError(
            'Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    bam_objects = [sample.get_bam() for sample in get_samples(resource)]
    # ``input_objects`` is only used to look up the resolwe connection.
    input_objects = list(bam_objects)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bam_objects],
    }

    # Optional parameters are forwarded only when explicitly given, so
    # that the defaults of the process apply otherwise. ``sense`` used
    # to be stored under the ``scale`` key and ``stretch_input`` was
    # silently dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
        ('input_region', input_region),
    )
    for field, value in optional_inputs:
        if value is not None:
            inputs[field] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if bed is not None:
        # A single bed object is wrapped in a list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ for selected cuffquants.

    This method runs `Cuffnorm`_ process on ``resource`` with
    ``annotation`` and ``use_ercc`` parameters specified in arguments.

    Every sample has to be in exactly one relation of type ``group``
    (searched within the collection of the resource, if there is one).
    Samples sharing such a relation get the same replicate index.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :raises LookupError: if a sample does not have exactly one
        matching ``group`` relation
    :return: object returned by ``get_or_run`` for the ``cuffnorm``
        process

    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    # Maps relation id -> replicate index (as string), numbered in the
    # order in which relations are first encountered.
    group_indices = {}
    replicates = []
    for sample in samples:
        relations = resolwe.relation.filter(
            type='group',
            entity=[sample.id],
            **relation_filter
        )

        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name)
            )

        relation_id = relations[0].id
        replicates.append(
            group_indices.setdefault(relation_id, str(len(group_indices)))
        )

    inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
    }

    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Library type is by default fr-unstranded. Other parameters
    defaults: multi_read_correct=false, fdr=0.05, library_normalization=geometric,
    dispersion_method=pooled, threads=1. Parameter genome is optional.

    The way the function works depends on the resource. If the resource has a
    collection, cuffdiff is run for the 'compare' relations of that collection,
    otherwise for the 'compare' relations matching the given samples. Samples
    of a relation that are not among the samples of the resource are discarded,
    and relations left without a 'case' or without a 'control' are skipped.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource on which cuffdiff will be run
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param str library_normalization: options are: geometric, classic-fpkm,
        quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :raises ValueError: if a partition label is neither 'case' nor
        'control', or if no relation produced a cuffdiff run
    :return: list of objects returned by ``get_or_run``, one per
        processed relation

    """
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        inputs['genome'] = genome.id
        input_objects.append(genome)

    # Forward only the parameters that were explicitly given.
    tunables = (
        ('multi_read_correct', multi_read_correct),
        ('fdr', fdr),
        ('library_type', library_type),
        ('library_normalization', library_normalization),
        ('dispersion_method', dispersion_method),
    )
    for key, value in tunables:
        if value is not None:
            inputs[key] = value

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter = {'collection': collection_id}
    else:
        relation_filter = {'entity': sample_ids}

    relations = resolwe.relation.filter(type='compare', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []
        for partition in relation.partitions:
            sample = resolwe.sample.get(partition['entity'])
            label = partition['label']
            if sample.id not in sample_ids:
                continue

            if label == 'case':
                bucket = case
            elif label == 'control':
                bucket = control
            else:
                raise ValueError(
                    "Label different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id)
                )
            bucket.append(get_data_id(sample.get_cuffquant()))

        # Both sides of the comparison are required.
        if not (case and control):
            continue

        inputs['case'] = case
        inputs['control'] = control

        cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=inputs)
        cuffdiff_objects.append(cuffdiff_obj)

        if is_collection(resource):
            resource.add_data(cuffdiff_obj)
        elif is_relation(resource):
            resource.collection.add_data(cuffdiff_obj)

    if cuffdiff_objects:
        return cuffdiff_objects

    if not relations:
        raise ValueError("No relation containing all of the given samples was found")

    raise ValueError(
        "No suitable relation was found (given samples all have either 'case' label "
        "or 'control' label"
    )
Beispiel #24
0
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    This method runs `ROSE2`_ process with ``tss_exclusion`` and
    ``stitch`` parameters specified in arguments.

    Separate process is run for each bed file on the sample. To run
    process only on subset of those files, list them in ``beds``
    argument (if only one object is given, it will be auto-wrapped in
    list, if it is not already).

    If ``use_background`` argument is set to ``True``, bam file from
    background sample is passed to the process as the control.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: single resource or a list of resources on which
        the process will be run
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on, if
        ``None`` processes for all bed files will be run
    :return: list of objects returned by ``get_or_run``, one per
        processed bed file

    """
    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for single_resource in resources:

        # Background samples are looked up within the collection of the
        # resource, when there is one.
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(single_resource)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(single_resource):
            inputs = {'rankby': sample.get_bam().id}

            for key, value in (('tss', tss), ('stitch', stitch)):
                if value is not None:
                    inputs[key] = value

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if sample.is_background and not is_sample(single_resource):
                    continue

                background = sample.get_background(**background_filter)
                inputs['control'] = background.get_bam().id

            macs_beds = sample.get_macs()
            if beds is not None:
                # Convert objects to the list of their ids
                wanted = beds if isinstance(beds, list) else [beds]
                wanted_ids = [get_data_id(bed) for bed in wanted]
                macs_beds = macs_beds.filter(id__in=wanted_ids)

            for bed in macs_beds:
                inputs['input'] = bed.id

                rose = sample.resolwe.get_or_run(slug='rose2', input=inputs)
                sample.add_data(rose)
                results.append(rose)

    return results
Beispiel #25
0
def bowtie2(resource, genome, mode=None, speed=None, use_se=None, discordantly=None, rep_se=None,
            minins=None, maxins=None, trim_5=None, trim_3=None, trim_iter=None, trim_nucl=None,
            rep_mode=None, k_reports=None):
    """Run bowtie2 aligner on given resource.

    Align reads files of given resource to the given genome using the
    `Bowtie2`_ aligner. If reads were already aligned, existing objects
    will be returned.

    .. _Bowtie2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-bowtie2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :param str mode: alignment mode (options are: --end-to-end,
        --local), default is --end-to-end
    :param str speed: speed vs sensitivity (options are: --very-fast,
        --fast, --sensitive, --very-sensitive), default is --sensitive
    :param bool use_se: map as single-ended (for paired-end reads
        only), default is False
    :param bool discordantly: report discordantly matched read, default
        is True
    :param bool rep_se: report single ended, default is True
    :param int minins: minimum fragment length, default is 0
    :param int maxins: maximum fragment length, default is 500
    :param int trim_5: number of bases to trim from 5', default is 0
    :param int trim_3: number of bases to trim from 3', default is 0
    :param int trim_iter: number of iterations, default is 0
    :param int trim_nucl: number of bases to trim from 3' in each
        iteration, default is 2
    :param str rep_mode: report mode (options are: def, k, a), default
        is def
    :param int k_reports: number of reports (for -k mode only), default
        is 5
    :return: list of objects returned by ``get_or_run``, one per sample

    """
    inputs = {'genome': get_data_id(genome)}

    # Only explicitly given parameters are sent to the process.
    optional_inputs = (
        ('mode', mode),
        ('speed', speed),
        ('use_se', use_se),
        ('discordantly', discordantly),
        ('rep_se', rep_se),
        ('minins', minins),
        ('maxins', maxins),
        ('trim_5', trim_5),
        ('trim_3', trim_3),
        ('trim_iter', trim_iter),
        ('trim_nucl', trim_nucl),
        ('rep_mode', rep_mode),
        ('k_reports', k_reports),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for single_resource in resources:
        for sample in get_samples(single_resource):
            # Same inputs are reused for every sample, only reads differ.
            inputs['reads'] = sample.get_reads().id

            aligned = sample.resolwe.get_or_run(slug='alignment-bowtie2', input=inputs)
            sample.add_data(aligned)
            alignments.append(aligned)

    return alignments
Beispiel #26
0
def bamliquidator(resource,
                  cell_type=None,
                  bin_size=None,
                  regions=None,
                  extension=None,
                  sense=None,
                  skip_plot=None,
                  black_list=None,
                  threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. The mode is ``bin`` when ``bin_size`` is given, otherwise it
    is derived from the process type of ``regions``.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param resource: resource from which bam objects will be get
    :param str cell_type: the name of cell type will be given in counts
        tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is 200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :raises KeyError: if ``bin_size`` and ``regions`` are not given
        exclusively, or if ``regions`` is not a data object of a
        supported type
    :return: object returned by ``get_or_run`` for the
        ``bamliquidator`` process

    """
    if not xor(bin_size, regions):
        raise KeyError(
            'Exactly one of `bin_size` and `regions` parameters must be given.'
        )

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    bam_objects = [sample.get_bam() for sample in get_samples(resource)]
    # ``input_objects`` is only used to look up the resolwe connection.
    input_objects = list(bam_objects)

    inputs = {'bam': [get_data_id(bam) for bam in bam_objects]}

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:  # regions
        process_type = regions.process_type
        if process_type == 'data:annotation:gtf:':
            analysis_type = 'gtf'
        elif process_type == 'data:bed:':
            analysis_type = 'bed'
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )
        inputs['analysis_type'] = analysis_type

        input_objects.append(regions)
        # NOTE(review): same key is used for gtf and bed regions -
        # confirm against the process definition.
        inputs['regions_file_gtf'] = get_data_id(regions)

    optional_inputs = (
        ('cell_type', cell_type),
        ('extension', extension),
        ('sense', sense),
        ('skip_plot', skip_plot),
        ('black_list', black_list),
        ('threads', threads),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    resolwe = get_resolwe(*input_objects)

    bamliquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(bamliquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamliquidator_obj)

    return bamliquidator_obj
Beispiel #27
0
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. ``bin_size`` selects the bin mode; otherwise the mode follows
    from the process type of the ``regions`` object.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param resource: resource from which bam objects will be get
    :param str cell_type: the name of cell type will be given in counts
        tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is 200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :raises KeyError: if ``bin_size`` and ``regions`` are not given
        exclusively, or if ``regions`` is not a data object of a
        supported type
    :return: object returned by ``get_or_run`` for the
        ``bamliquidator`` process

    """
    if not xor(bin_size, regions):
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    alignments = [sample.get_bam() for sample in get_samples(resource)]
    # Objects gathered here only serve to determine the resolwe connection.
    input_objects = []
    input_objects.extend(alignments)

    inputs = {'bam': [get_data_id(alignment) for alignment in alignments]}

    if bin_size:
        inputs.update({'analysis_type': 'bin', 'bin_size': bin_size})
    else:  # regions
        supported_types = (
            ('data:annotation:gtf:', 'gtf'),
            ('data:bed:', 'bed'),
        )
        for process_type, analysis_type in supported_types:
            if regions.process_type == process_type:
                inputs['analysis_type'] = analysis_type
                break
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )

        input_objects.append(regions)
        # NOTE(review): the ``regions_file_gtf`` key is filled for bed
        # regions as well - confirm against the process definition.
        inputs['regions_file_gtf'] = get_data_id(regions)

    # Remaining parameters are forwarded only when explicitly given.
    for key, value in (
            ('cell_type', cell_type),
            ('extension', extension),
            ('sense', sense),
            ('skip_plot', skip_plot),
            ('black_list', black_list),
            ('threads', threads)):
        if value is not None:
            inputs[key] = value

    resolwe = get_resolwe(*input_objects)

    bamliquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(bamliquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamliquidator_obj)

    return bamliquidator_obj
Beispiel #28
0
def cuffdiff(resource,
             annotation,
             genome=None,
             multi_read_correct=None,
             fdr=None,
             library_type=None,
             library_normalization=None,
             dispersion_method=None,
             threads=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Library type is by default fr-unstranded. Other parameters
    defaults: multi_read_correct=false, fdr=0.05, library_normalization=geometric,
    dispersion_method=pooled, threads=1. Parameter genome is optional.

    The way the function works depends on the resource. If it is run on a collection,
    it will perform cuffdiff on every 'compare' relation labeled 'case-control' in
    the selected collection. If it is run on a list of samples (not necessarily in
    the same collection) it will run cuffdiff on all 'compare' relations labeled
    'case-control' containing all of the given samples but will discard those
    samples in a relation that are not in the list of samples.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource on which cuffdiff will be run
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param str library_normalization: options are: geometric, classic-fpkm,
        quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :param int threads: use this many processor threads
    :raises ValueError: if a position in a relation is neither 'case'
        nor 'control'

    """
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        # The genome object itself (not its id) is stored in the inputs.
        inputs['genome'] = genome
        input_objects.append(genome)

    # Forward only the parameters that were explicitly given.
    tunables = (
        ('multi_read_correct', multi_read_correct),
        ('fdr', fdr),
        ('library_type', library_type),
        ('library_normalization', library_normalization),
        ('dispersion_method', dispersion_method),
        ('threads', threads),
    )
    for key, value in tunables:
        if value is not None:
            inputs[key] = value

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter = {'collection': collection_id}
    else:
        relation_filter = {'entity': sample_ids}

    relations = resolwe.relation.filter(type='compare',
                                        label='case-control',
                                        **relation_filter)

    # NOTE(review): the definition ends right after the classification
    # loop below - the collected cuffquants are not passed to any
    # process and the function implicitly returns ``None``. The rest of
    # the body appears to be missing; confirm against the full source.
    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []
        for sample, position in zip(relation.samples, relation.positions):
            if sample.id not in sample_ids:
                continue

            if position == 'case':
                bucket = case
            elif position == 'control':
                bucket = control
            else:
                raise ValueError(
                    "Position different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id))
            bucket.append(get_data_id(sample.get_cuffquant()))
Beispiel #29
0
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None, color=None,
            sense=None, extension=None, rpm=None, yscale=None, names=None, plot=None, title=None,
            scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` has to be given.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param resource: resource from which bam objects will be get; if
        it is a collection (or a relation), the resulting object is
        added to it (or to its collection)
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9, MM10, RN4 and RN6)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands,
        default maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis scaling,
        default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: single bed object or a list of bed objects passed to
        the process as the ``bed`` input
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not supported
    :return: object returned by ``get_or_run`` for the ``bamplot``
        process

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError('Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10', 'RN4', 'RN6']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    bam_objects = [sample.get_bam() for sample in get_samples(resource)]
    # ``input_objects`` is only used to look up the resolwe connection.
    input_objects = list(bam_objects)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bam_objects],
    }

    # Optional parameters are forwarded only when explicitly given, so
    # that the defaults of the process apply otherwise. ``sense`` used
    # to be stored under the ``scale`` key and ``stretch_input`` was
    # silently dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
        ('input_region', input_region),
    )
    for field, value in optional_inputs:
        if value is not None:
            inputs[field] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if bed is not None:
        # A single bed object is wrapped in a list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj