def test_get_samples(self):
    """Check ``get_samples`` for collections, data objects and samples."""
    # A single collection yields the samples stored on it.
    lone_collection = Collection(id=1, resolwe=MagicMock())
    lone_collection._samples = ['sample_1', 'sample_2']
    self.assertEqual(get_samples(lone_collection), ['sample_1', 'sample_2'])

    # A list of collections yields the samples of all of them, in order.
    first_collection = Collection(id=1, resolwe=MagicMock())
    first_collection._samples = ['sample_1']
    second_collection = Collection(id=2, resolwe=MagicMock())
    second_collection._samples = ['sample_2']
    self.assertEqual(
        get_samples([first_collection, second_collection]),
        ['sample_1', 'sample_2'],
    )

    # A single data object yields a one-element list with its sample.
    lone_data = Data(id=1, resolwe=MagicMock())
    lone_data._sample = 'sample_1'
    self.assertEqual(get_samples(lone_data), ['sample_1'])

    # A list of data objects yields one sample per data object.
    first_data = Data(id=1, resolwe=MagicMock())
    first_data._sample = 'sample1'
    second_data = Data(id=2, resolwe=MagicMock())
    second_data._sample = 'sample2'
    self.assertEqual(get_samples([first_data, second_data]), ['sample1', 'sample2'])

    # A data object without a sample is rejected with ``TypeError``.
    orphan_data = Data(id=1, resolwe=MagicMock(**{'sample.filter.return_value': None}))
    orphan_data._sample = None
    with self.assertRaises(TypeError):
        get_samples(orphan_data)

    # A sample is simply wrapped in a list.
    lone_sample = Sample(id=1, resolwe=MagicMock())
    self.assertEqual(get_samples(lone_sample), [lone_sample])

    # A list of samples is returned unchanged.
    first_sample = Sample(id=1, resolwe=MagicMock())
    second_sample = Sample(id=3, resolwe=MagicMock())
    self.assertEqual(
        get_samples([first_sample, second_sample]),
        [first_sample, second_sample],
    )
def prepare_geo_rnaseq(resource, name=None):
    """Run ``Prepare GEO - RNA-Seq`` process on the resource.

    The process can be run on a single collection or on a list of
    samples. Reads and expression objects of every sample are
    collected and passed to the process as inputs.

    :param resource: resource on which prepare_geo_rnaseq will be run
    :param str name: name of the prepare GEO tarball and table; when
        empty, the name returned by ``get_name_collection`` is used
    :return: data object returned by ``get_or_run``

    """
    samples = get_samples(resource)
    resolwe = get_resolwe(*samples)

    read_ids = []
    expression_ids = []
    collection_ids = set()
    for sample in samples:
        read_ids.append(sample.get_reads().id)
        expression_ids.append(sample.get_expression().id)
        collection_ids.add(get_resource_collection(sample))

    auto_name, collection = get_name_collection(collection_ids, resolwe)

    inputs = {
        'reads': read_ids,
        'expressions': expression_ids,
        'name': name or auto_name,
    }
    geo = resolwe.get_or_run(slug='prepare-geo-rnaseq', input=inputs)

    # Attach the result only when a collection was resolved above.
    if collection:
        collection.add_data(geo)

    return geo
def hisat2(resource, genome):
    """Run hisat2 aligner on given resource.

    Reads of every sample in the resource are aligned to the given
    genome with the `Hisat2`_ aligner. The process is started through
    ``get_or_run``, so existing alignments are returned instead of
    being recomputed.

    .. _Hisat2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-hisat2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment data objects, one per sample

    """
    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            inputs = {
                'reads': sample.get_reads().id,
                'genome': get_data_id(genome),
            }
            alignment = sample.resolwe.get_or_run(slug='alignment-hisat2', input=inputs)
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def bowtie2(resource, genome):
    """Run bowtie2 aligner on given resource.

    Reads of every sample in the resource are aligned to the given
    genome with the ``bowtie2`` aligner. The process is started
    through ``get_or_run``, so existing alignments are returned
    instead of being recomputed.

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment data objects, one per sample

    """
    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            inputs = {
                'reads': sample.get_reads().id,
                'genome': get_data_id(genome),
            }
            alignment = sample.resolwe.get_or_run(slug='alignment-bowtie2', input=inputs)
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def cuffquant(resource, gff, genome=None, mask_file=None, library_type=None,
              multi_read_correct=None, threads=None):
    """Run Cuffquant_ on the alignments of the given resource.

    This method runs the `Cuffquant`_ process for every sample in the
    resource, using the sample's bam file and the given annotation.
    ``genome``, ``mask_file``, ``library_type``, ``multi_read_correct``
    and ``threads`` are optional and are only passed to the process
    when they are given.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource on which cuffquant will be run
    :param gff: annotation file (id or data object)
    :type gff: int or `~resdk.resources.data.Data`
    :param genome: genome file used to run bias detection and
        correction algorithm (id or data object)
    :type genome: int or `~resdk.resources.data.Data`
    :param mask_file: mask file (id or data object)
    :type mask_file: int or `~resdk.resources.data.Data`
    :param str library_type: options are fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure
        to more accurately weight reads with multiple genome mappings
    :param int threads: use this many processor threads
    :return: list of cuffquant data objects, one per sample

    """
    # Inputs shared by all samples. Data-like arguments are normalised
    # with ``get_data_id`` exactly like ``gff``, so that both ids and
    # data objects are accepted as documented.
    shared_inputs = {'gff': get_data_id(gff)}
    if genome is not None:
        shared_inputs['genome'] = get_data_id(genome)
    if mask_file is not None:
        shared_inputs['mask_file'] = get_data_id(mask_file)
    if library_type is not None:
        shared_inputs['library_type'] = library_type
    if multi_read_correct is not None:
        shared_inputs['multi_read_correct'] = multi_read_correct
    if threads is not None:
        shared_inputs['threads'] = threads

    results = []
    for sample in get_samples(resource):
        # Fresh dict per sample so that runs do not share state.
        inputs = dict(shared_inputs)
        inputs['alignment'] = sample.get_bam().id

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant', input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
def macs(resource, use_background=True, p_value=None):
    """Run ``MACS 1.4`` process on the resource.

    The `MACS 1.4`_ process is run for every sample, with the
    sample's ``bam`` file as treatment and the optional ``p-value``.
    When ``use_background`` is ``True``, the ``bam`` file of the
    background sample is passed to the process as the control.
    Mappable genome size is derived from the organism stored in the
    sample annotation.

    .. _MACS 1.4:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-macs14

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background
        sample will be used in the process
    :param float p_value: p-value used in the process
    :raises KeyError: if organism cannot be read from the sample
        annotation
    :return: list of macs data objects

    """
    inputs = {}
    if p_value is not None:
        inputs['pvalue'] = p_value

    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for single_resource in resources:
        # Restrict background lookup to the collection of the resource.
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(single_resource)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(single_resource):
            inputs['treatment'] = sample.get_bam().id

            try:
                organism = sample.descriptor['sample']['organism']
                inputs['gsize'] = gsize_organism(organism)
            except KeyError:
                raise KeyError('{} is not annotated'.format(sample))

            if use_background:
                run_directly_on_sample = is_sample(single_resource)
                if is_background(sample) and not run_directly_on_sample:
                    # Don't run process on the background sample,
                    # but let it fail if it is run directly on sample
                    continue

                background = sample.get_background(**background_filter)
                inputs['control'] = background.get_bam().id

            macs_obj = sample.resolwe.get_or_run(slug='macs14', input=inputs)
            sample.add_data(macs_obj)
            results.append(macs_obj)

    return results
def test_get_samples(self):
    """Check ``get_samples`` for collections and samples."""
    # A single collection yields the samples stored on it.
    lone_collection = Collection(id=1, resolwe=MagicMock())
    lone_collection._samples = ['sample_1', 'sample_2']
    self.assertEqual(get_samples(lone_collection), ['sample_1', 'sample_2'])

    # A list of collections yields the samples of all of them, in order.
    first_collection = Collection(id=1, resolwe=MagicMock())
    first_collection._samples = ['sample_1']
    second_collection = Collection(id=2, resolwe=MagicMock())
    second_collection._samples = ['sample_2']
    self.assertEqual(
        get_samples([first_collection, second_collection]),
        ['sample_1', 'sample_2'],
    )

    # A sample is simply wrapped in a list.
    lone_sample = Sample(id=1, resolwe=MagicMock())
    self.assertEqual(get_samples(lone_sample), [lone_sample])

    # A list of samples is returned unchanged.
    first_sample = Sample(id=1, resolwe=MagicMock())
    second_sample = Sample(id=3, resolwe=MagicMock())
    self.assertEqual(
        get_samples([first_sample, second_sample]),
        [first_sample, second_sample],
    )
def prepare_geo_chipseq(resource, name=None):
    """Run ``Prepare GEO - ChIP-Seq`` process on the resource.

    The process can be run on a single collection or on a list of
    samples. Reads of every sample and the single ``macs`` object of
    every non-background sample are passed to the process.

    :param resource: resource on which prepare_geo_chipseq will be run
    :param str name: name of the prepare GEO tarball and table; when
        empty, the name returned by ``get_name_collection`` is used
    :raises ValueError: if a non-background sample does not have
        exactly one ``macs`` object, or if its background is not part
        of the given resource
    :return: data object returned by ``get_or_run``

    """
    samples = get_samples(resource)
    resolwe = get_resolwe(*samples)

    read_ids = []
    macs_ids = []
    collection_ids = set()
    for sample in samples:
        read_ids.append(sample.get_reads().id)

        # Background samples only contribute their reads.
        if sample.is_background:
            continue

        macs_list = sample.get_macs()
        if not macs_list:
            raise ValueError(
                "Sample {} has no `macs` data object!".format(sample))
        if len(macs_list) != 1:
            raise ValueError(
                "Sample {} has more than one `macs` data objects!".format(
                    sample))
        macs_ids.append(macs_list[0].id)

        if sample.background and sample.background not in samples:
            raise ValueError(
                "Background of the sample {} cannot be found in the resource you provided: "
                "{}!".format(sample, resource))

        collection_ids.add(get_resource_collection(sample))

    auto_name, collection = get_name_collection(collection_ids, resolwe)

    inputs = {
        'reads': read_ids,
        'macs': macs_ids,
        'name': name or auto_name,
    }
    geo = resolwe.get_or_run(slug='prepare-geo-chipseq', input=inputs)

    # Attach the result only when a collection was resolved above.
    if collection:
        collection.add_data(geo)

    return geo
def prepare_geo_chipseq(resource, name=None):
    """Run ``Prepare GEO - ChIP-Seq`` process on the resource.

    The process can be run on a single collection or on a list of
    samples. Reads of every sample and the single ``macs`` object of
    every non-background sample are passed to the process.

    :param resource: resource on which prepare_geo_chipseq will be run
    :param str name: name of the prepare GEO tarball and table; when
        empty, the name returned by ``get_name_collection`` is used
    :raises ValueError: if a non-background sample does not have
        exactly one ``macs`` object, or if its background is not part
        of the given resource
    :return: data object returned by ``get_or_run``

    """
    samples = get_samples(resource)
    resolwe = get_resolwe(*samples)

    read_ids = []
    peak_ids = []
    collection_ids = set()
    for sample in samples:
        read_ids.append(sample.get_reads().id)

        # Background samples only contribute their reads.
        if sample.is_background:
            continue

        macs_list = sample.get_macs()
        if not macs_list:
            raise ValueError("Sample {} has no `macs` data object!".format(sample))
        if len(macs_list) != 1:
            raise ValueError("Sample {} has more than one `macs` data objects!".format(sample))
        peak_ids.append(macs_list[0].id)

        if sample.background and sample.background not in samples:
            raise ValueError(
                "Background of the sample {} cannot be found in the resource you provided: "
                "{}!".format(sample, resource)
            )

        collection_ids.add(get_resource_collection(sample))

    auto_name, collection = get_name_collection(collection_ids, resolwe)

    inputs = {
        'reads': read_ids,
        'macs': peak_ids,
        'name': name or auto_name,
    }
    geo = resolwe.get_or_run(slug='prepare-geo-chipseq', input=inputs)

    # Attach the result only when a collection was resolved above.
    if collection:
        collection.add_data(geo)

    return geo
def macs(resource, use_background=True, p_value=None):
    """Run ``MACS 1.4`` process on the resource.

    The `MACS 1.4`_ process is run for every sample, with the
    sample's primary ``bam`` file (falling back to the regular bam)
    as treatment and the optional ``p-value``. When
    ``use_background`` is ``True``, the ``bam`` file of the
    background sample is passed to the process as the control.

    .. _MACS 1.4:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-macs14

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background
        sample will be used in the process
    :param float p_value: p-value used in the process
    :return: list of macs data objects

    """
    inputs = {}
    if p_value is not None:
        inputs['pvalue'] = p_value

    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for single_resource in resources:
        # Restrict background lookup to the collection of the resource.
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(single_resource)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(single_resource):
            treatment = sample.get_primary_bam(fallback_to_bam=True)
            inputs['treatment'] = treatment.id

            if use_background:
                if sample.is_background and not is_sample(single_resource):
                    # Don't run process on the background sample,
                    # but let it fail if it is run directly on sample
                    continue

                background = sample.get_background(**background_filter)
                control = background.get_primary_bam(fallback_to_bam=True)
                inputs['control'] = control.id

            macs_obj = sample.resolwe.get_or_run(slug='macs14', input=inputs)
            sample.add_data(macs_obj)
            results.append(macs_obj)

    return results
def cuffquant(resource, annotation, genome=None, mask_file=None, library_type=None,
              multi_read_correct=None):
    """Run Cuffquant_ on the alignments of the given resource.

    This method runs the `Cuffquant`_ process for every sample in the
    resource, using the sample's bam file and the given annotation.
    ``genome``, ``mask_file``, ``library_type`` and
    ``multi_read_correct`` are optional and are only passed to the
    process when they are given.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource on which cuffquant will be run
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param mask_file: mask file to use in process
    :type mask_file: `~resdk.resources.data.Data`
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure
        to more accurately weight reads with multiple genome mappings
    :return: list of cuffquant data objects, one per sample

    """
    # Inputs shared by all samples. Data-like arguments are normalised
    # with ``get_data_id`` exactly like ``annotation``, instead of
    # passing the raw objects to the process.
    shared_inputs = {'annotation': get_data_id(annotation)}
    if genome is not None:
        shared_inputs['genome'] = get_data_id(genome)
    if mask_file is not None:
        shared_inputs['mask_file'] = get_data_id(mask_file)
    if library_type is not None:
        shared_inputs['library_type'] = library_type
    if multi_read_correct is not None:
        shared_inputs['multi_read_correct'] = multi_read_correct

    results = []
    for sample in get_samples(resource):
        # Fresh dict per sample so that runs do not share state.
        inputs = dict(shared_inputs)
        inputs['alignment'] = sample.get_bam().id

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant', input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on the cuffquant objects of the given resource.

    This method runs the `Cuffnorm`_ process on the cuffquant object
    of every sample in ``resource``, with the given ``annotation``
    and the optional ``use_ercc`` flag.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object

    """
    # NOTE(review): ``relation_filter`` is not used further down in this
    # function. The collection lookup is kept as it was - confirm
    # whether it is still needed.
    relation_filter = {}
    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter['collection'] = collection_id

    samples = get_samples(resource)
    resolwe = get_resolwe(*([annotation] + list(samples)))

    cuffquant_ids = []
    for sample in samples:
        cuffquant_ids.append(get_data_id(sample.get_cuffquant()))

    inputs = {
        'cuffquant': cuffquant_ids,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def test_get_samples(self):
    """Check ``get_samples`` for collections, data objects and samples."""
    # A single collection yields the samples stored on it.
    collection = Collection(id=1, resolwe=MagicMock())
    collection._samples = ['sample_1', 'sample_2']
    self.assertEqual(get_samples(collection), ['sample_1', 'sample_2'])

    # A list of collections yields the samples of all of them, in order.
    collection_1 = Collection(id=1, resolwe=MagicMock())
    collection_1._samples = ['sample_1']
    collection_2 = Collection(id=2, resolwe=MagicMock())
    collection_2._samples = ['sample_2']
    self.assertEqual(get_samples([collection_1, collection_2]), ['sample_1', 'sample_2'])

    # A single data object: the mocked API reports one entity and the
    # mocked ``sample.get`` returns the corresponding sample.
    data = Data(id=1, resolwe=MagicMock())
    data.api(data.id).get = MagicMock(return_value={'entities': [7]})
    data.resolwe.sample.get = MagicMock(return_value='sample_1')
    self.assertEqual(get_samples(data), ['sample_1'])

    # A list of data objects. Each object configures the API mock under
    # its own id (previously the id of the unrelated ``data`` object
    # above was used for both).
    data1 = Data(id=1, resolwe=MagicMock())
    data1.api(data1.id).get = MagicMock(return_value={'entities': [7]})
    data1.resolwe.sample.get = MagicMock(return_value='sample1')
    data2 = Data(id=2, resolwe=MagicMock())
    data2.api(data2.id).get = MagicMock(return_value={'entities': [8]})
    data2.resolwe.sample.get = MagicMock(return_value='sample2')
    self.assertEqual(get_samples([data1, data2]), ['sample1', 'sample2'])

    # A data object without a sample is rejected with ``TypeError``.
    data = Data(id=1, resolwe=MagicMock(**{'sample.get.return_value': None}))
    data._sample = None
    with self.assertRaises(TypeError):
        get_samples(data)

    # A sample is simply wrapped in a list.
    sample = Sample(id=1, resolwe=MagicMock())
    self.assertEqual(get_samples(sample), [sample])

    # A list of samples is returned unchanged.
    sample_1 = Sample(id=1, resolwe=MagicMock())
    sample_2 = Sample(id=3, resolwe=MagicMock())
    self.assertEqual(get_samples([sample_1, sample_2]), [sample_1, sample_2])
def bamsplit(resource, header=None, header2=None):
    """Run ``Bam split`` process on the resource.

    The `Bam split`_ process is run on the alignment of every sample
    in the resource. Samples' alignment data object must be valid.
    Valid alignment data objects are those that were aligned to a
    hybrid genome with a valid build. Valid builds are: 'hg19_dm6'
    and 'mm10_dm6'.

    .. _Bam split:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bam-split

    :param resource: resource (or list of resources) on which
        bam-split will be run
    :param header: SAM header data object for the primary BAM
    :type header: `~resdk.resources.data.Data`
    :param header2: SAM header data object for the secondary BAM
    :type header2: `~resdk.resources.data.Data`
    :return: list of objects returned by the process runs, one per
        sample

    """
    inputs = {}
    # Headers are optional and only passed on when given.
    for key, value in (('header', header), ('header2', header2)):
        if value:
            inputs[key] = value

    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for item in resources:
        for sample in get_samples(item):
            inputs['bam'] = sample.get_bam().id
            primary_bam = sample.resolwe.run(
                slug='bam-split',
                input=inputs,
                collections=sample.collections
            )
            results.append(primary_bam)

    return results
def bamsplit(resource, header=None, header2=None):
    """Run ``Bam split`` process on the resource.

    The `Bam split`_ process is run on the alignment of every sample
    in the resource. Samples' alignment data object must be valid.
    Valid alignment data objects are those that were aligned to a
    hybrid genome with a valid build. Valid builds are: 'hg19_dm6'
    and 'mm10_dm6'.

    .. _Bam split:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bam-split

    :param resource: resource (or list of resources) on which
        bam-split will be run
    :param header: SAM header data object for the primary BAM
    :type header: `~resdk.resources.data.Data`
    :param header2: SAM header data object for the secondary BAM
    :type header2: `~resdk.resources.data.Data`
    :return: list of objects returned by the process runs, one per
        sample

    """
    inputs = {}
    # Headers are optional and only passed on when given.
    for key, value in (('header', header), ('header2', header2)):
        if value:
            inputs[key] = value

    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for item in resources:
        for sample in get_samples(item):
            inputs['bam'] = sample.get_bam().id
            split_result = sample.resolwe.run(
                slug='bam-split', input=inputs, collections=sample.collections)
            results.append(split_result)

    return results
def cuffnorm(resource, annotation, use_ercc=None, threads=None):
    """Run Cuffnorm_ on the cuffquant objects of the given resource.

    This method runs the `Cuffnorm`_ process on ``resource`` with the
    given ``annotation`` and the optional ``use_ercc`` and
    ``threads`` parameters. Replicate groups are determined from
    ``group`` relations labeled ``replicates``: every sample must be
    in exactly one such relation.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: int or `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :param int threads: number of threads passed to the process
    :raises LookupError: if a sample is not in exactly one matching
        relation
    :return: cuffnorm data object

    """
    relation_filter = {}
    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter['collection'] = collection_id

    samples = get_samples(resource)
    resolwe = get_resolwe(*([annotation] + list(samples)))

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    labels = []
    replicates = []
    # Maps relation id to a consecutive replicate group number (as string).
    group_numbers = {}
    for sample in samples:
        relations = resolwe.relation.filter(
            type='group', label='replicates', entity=[sample.id], **relation_filter)
        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name))
        relation = relations[0]

        if relation.id not in group_numbers:
            group_numbers[relation.id] = str(len(group_numbers))
        replicates.append(group_numbers[relation.id])

        # Every relation contributes its id as a label exactly once.
        label = str(relation.id)
        if label not in labels:
            labels.append(label)

    inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
        'labels': labels,
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc
    if threads is not None:
        inputs['threads'] = threads

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    The `ROSE2`_ process is run with the optional ``tss`` and
    ``stitch`` parameters. A separate process is run for each bed
    file on the sample. To run the process only on a subset of those
    files, list them in the ``beds`` argument (a single object is
    auto-wrapped in a list).

    If ``use_background`` is ``True``, the bam file from the
    background sample is passed to the process as the control.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background
        sample will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on, if not
        given processes for all bed files will be run
    :return: list of rose2 data objects

    """
    resources = resource if isinstance(resource, list) else [resource]

    results = []
    for single_resource in resources:
        # Restrict background lookup to the collection of the resource.
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(single_resource)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(single_resource):
            inputs = {'rankby': sample.get_bam().id}
            if tss is not None:
                inputs['tss'] = tss
            if stitch is not None:
                inputs['stitch'] = stitch

            if use_background:
                if sample.is_background and not is_sample(single_resource):
                    # Don't run process on the background sample,
                    # but let it fail if it is run directly on sample
                    continue

                background = sample.get_background(**background_filter)
                inputs['control'] = background.get_bam().id

            bed_list = sample.get_macs()
            if beds is not None:
                # Convert objects to the list of their ids
                wanted = beds if isinstance(beds, list) else [beds]
                bed_filter = [get_data_id(bed) for bed in wanted]
                bed_list = bed_list.filter(id__in=bed_filter)

            for bed in bed_list:
                inputs['input'] = bed.id
                rose = sample.resolwe.get_or_run(slug='rose2', input=inputs)
                sample.add_data(rose)
                results.append(rose)

    return results
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None,
            color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
            plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with the bam files of all samples in
    the resource, the genome and either a gff or a region. Exactly
    one of ``input_gff`` and ``input_region`` must be given. All
    other parameters are optional and are only passed to the process
    when they are given.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param list resource: resource from which bam objects will be get
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9 and MM10)
    :param input_gff: annotation file (id or data object)
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands
    :param int extension: extends reads by n bp
    :param bool rpm: normalizes density to reads per million (rpm)
    :param str yscale: choose either relative or uniform y axis
        scaling
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple
        plots
    :param str title: title for the output plot(s)
    :param str scale: a comma separated list of multiplicative
        scaling factors for your bams
    :param bed: bed file(s) passed to the process; a single object
        is wrapped in a list
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :raises KeyError: if none or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not valid
    :return: bamplot data object

    """
    # Exactly one of the two region sources must be specified.
    if bool(input_gff) == bool(input_region):
        raise KeyError('Please specify `input_gff` or `input_region.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    # Objects from which the ``resolwe`` instance is determined.
    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Plain optional parameters are forwarded under their own name.
    # ``sense`` used to be stored under the ``scale`` key and
    # ``stretch_input`` used to be silently dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
        ('input_region', input_region),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if bed is not None:
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on the cuffquant objects of the given resource.

    This method runs the `Cuffnorm`_ process on ``resource`` with the
    given ``annotation`` and the optional ``use_ercc`` flag.
    Replicate groups are determined from ``group`` relations: every
    sample must be in exactly one such relation.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :raises LookupError: if a sample is not in exactly one matching
        relation
    :return: cuffnorm data object

    """
    relation_filter = {}
    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter['collection'] = collection_id

    samples = get_samples(resource)
    resolwe = get_resolwe(*([annotation] + list(samples)))

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    replicates = []
    # Maps relation id to a consecutive replicate group number (as string).
    group_numbers = {}
    for sample in samples:
        relations = resolwe.relation.filter(
            type='group', entity=[sample.id], **relation_filter)
        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name)
            )
        relation = relations[0]

        if relation.id not in group_numbers:
            group_numbers[relation.id] = str(len(group_numbers))
        replicates.append(group_numbers[relation.id])

    inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    cuffnorm_obj = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    # Attach the result to the collection the resource belongs to.
    if is_collection(resource):
        resource.add_data(cuffnorm_obj)
    elif is_relation(resource):
        resource.collection.add_data(cuffnorm_obj)

    return cuffnorm_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs the `Cuffdiff`_ process with the given
    ``annotation``. All other parameters are optional and are only
    passed to the process when they are given.

    The function looks up ``compare`` relations: when the collection
    of the resource can be determined, relations of that collection
    are used, otherwise relations containing the given samples. For
    every relation, samples labeled ``case`` and ``control`` are
    collected (samples that are not part of the resource are
    skipped) and cuffdiff is run if both groups are non-empty.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource on which cuffdiff will be run
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure
        to more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled,
        per-condition, blind, poisson
    :raises ValueError: if a partition label is neither ``case`` nor
        ``control``, or if no cuffdiff could be run
    :return: list of cuffdiff data objects

    """
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        inputs['genome'] = genome.id
        input_objects.append(genome)

    # Plain optional parameters are forwarded under their own name.
    optional_inputs = (
        ('multi_read_correct', multi_read_correct),
        ('fdr', fdr),
        ('library_type', library_type),
        ('library_normalization', library_normalization),
        ('dispersion_method', dispersion_method),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]

    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    # Filter relations by collection when known, by samples otherwise.
    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter = {'collection': collection_id}
    else:
        relation_filter = {'entity': sample_ids}

    relations = resolwe.relation.filter(type='compare', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        groups = {'case': [], 'control': []}

        for partition in relation.partitions:
            sample = resolwe.sample.get(partition['entity'])
            label = partition['label']

            # Ignore samples that were not given in the resource.
            if sample.id not in sample_ids:
                continue

            if label not in groups:
                raise ValueError(
                    "Label different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id)
                )
            groups[label].append(get_data_id(sample.get_cuffquant()))

        # Both groups are needed for the comparison.
        if not groups['case'] or not groups['control']:
            continue

        inputs['case'] = groups['case']
        inputs['control'] = groups['control']

        cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=inputs)
        cuffdiff_objects.append(cuffdiff_obj)

        if is_collection(resource):
            resource.add_data(cuffdiff_obj)
        elif is_relation(resource):
            resource.collection.add_data(cuffdiff_obj)

    if cuffdiff_objects:
        return cuffdiff_objects

    if not relations:
        raise ValueError("No relation containing all of the given samples was found")
    raise ValueError(
        "No suitable relation was found (given samples all have either 'case' label "
        "or 'control' label"
    )
def bowtie2(resource, genome, mode=None, speed=None, use_se=None, discordantly=None,
            rep_se=None, minins=None, maxins=None, trim_5=None, trim_3=None, trim_iter=None,
            trim_nucl=None, rep_mode=None, k_reports=None):
    """Run bowtie2 aligner on given resource.

    Align reads files of given resource to the given genome using the
    `Bowtie2`_ aligner. The process is started with ``get_or_run``, so
    if reads were already aligned, existing objects will be returned.
    Every resulting object is also added to the sample it belongs to.

    .. _Bowtie2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-bowtie2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :param str mode: alignment mode (options are: --end-to-end,
        --local), default is --end-to-end
    :param str speed: speed vs sensitivity (options are: --very-fast,
        --fast, --sensitive, --very-sensitive), default is --sensitive
    :param bool use_se: map as single-ended (for paired-end reads
        only), default is False
    :param bool discordantly: report discordantly matched read,
        default is True
    :param bool rep_se: report single ended, default is True
    :param int minins: minimum fragment length, default is 0
    :param int maxins: maximum fragment length, default is 500
    :param int trim_5: number of bases to trim from 5', default is 0
    :param int trim_3: number of bases to trim from 3', default is 0
    :param int trim_iter: number of iterations, default is 0
    :param int trim_nucl: number of bases to trim from 3' in each
        iteration, default is 2
    :param str rep_mode: report mode (options are: def, k, a), default
        is def
    :param int k_reports: number of reports (for -k mode only),
        default is 5
    :return: list of alignment data objects, one per sample

    """
    inputs = {'genome': get_data_id(genome)}

    # Only parameters that were explicitly given are sent to the process.
    optional_inputs = (
        ('mode', mode),
        ('speed', speed),
        ('use_se', use_se),
        ('discordantly', discordantly),
        ('rep_se', rep_se),
        ('minins', minins),
        ('maxins', maxins),
        ('trim_5', trim_5),
        ('trim_3', trim_3),
        ('trim_iter', trim_iter),
        ('trim_nucl', trim_nucl),
        ('rep_mode', rep_mode),
        ('k_reports', k_reports),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            # The reads differ per sample, everything else is shared.
            inputs['reads'] = sample.get_reads().id

            alignment = sample.resolwe.get_or_run(slug='alignment-bowtie2', input=inputs)
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. The mode is selected from the arguments: ``bin_size`` selects
    bin mode, a gtf ``regions`` object selects region mode and a bed
    ``regions`` object selects BED mode.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param list resource: resource from which bam objects will be get
    :param str cell_type: the name of cell type will be given in
        counts tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is
        200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to
        skip during bin liquidation. Default is to skip any
        chromosomes that contain any of the following substrings
        `chrUn`, `_random`, `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :return: the bamliquidator data object
    :raises KeyError: if not exactly one of ``bin_size`` and
        ``regions`` is given, if ``regions`` is not a data object or if
        it is of an unsupported type

    """
    # Compare truthiness explicitly: a bitwise xor is not defined for
    # ``None`` or data objects, while "exactly one given" only depends on
    # which of the two arguments is set.
    if bool(bin_size) == bool(regions):
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)
    bams = [get_data_id(bam) for bam in bams]

    inputs = {
        'bam': bams,
    }

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:  # regions
        if regions.process_type == 'data:annotation:gtf:':
            inputs['analysis_type'] = 'gtf'
        elif regions.process_type == 'data:bed:':
            inputs['analysis_type'] = 'bed'
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )

        input_objects.append(regions)
        # NOTE(review): the same input key is used for gtf and bed regions;
        # confirm against the process schema.
        inputs['regions_file_gtf'] = get_data_id(regions)

    if cell_type is not None:
        inputs['cell_type'] = cell_type

    if extension is not None:
        inputs['extension'] = extension

    if sense is not None:
        inputs['sense'] = sense

    if skip_plot is not None:
        inputs['skip_plot'] = skip_plot

    if black_list is not None:
        inputs['black_list'] = black_list

    if threads is not None:
        inputs['threads'] = threads

    resolwe = get_resolwe(*input_objects)

    bamliquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(bamliquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamliquidator_obj)

    return bamliquidator_obj
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. Giving ``bin_size`` selects bin mode; giving ``regions``
    selects region or BED mode, depending on the type of the object.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param list resource: resource from which bam objects will be get
    :param str cell_type: the name of cell type will be given in
        counts tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is
        200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to
        skip during bin liquidation. Default is to skip any
        chromosomes that contain any of the following substrings
        `chrUn`, `_random`, `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :return: the bamliquidator data object
    :raises KeyError: if not exactly one of ``bin_size`` and
        ``regions`` is given, if ``regions`` is not a data object or if
        it is of an unsupported type

    """
    # "Exactly one given" is decided on truthiness, so that ``None`` and
    # data objects are handled without relying on a bitwise operator.
    bin_mode = bool(bin_size)
    region_mode = bool(regions)
    if bin_mode == region_mode:
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    bam_objects = [sample.get_bam() for sample in get_samples(resource)]
    input_objects = list(bam_objects)

    inputs = {
        'bam': [get_data_id(bam) for bam in bam_objects],
    }

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:  # regions
        if regions.process_type == 'data:annotation:gtf:':
            inputs['analysis_type'] = 'gtf'
        elif regions.process_type == 'data:bed:':
            inputs['analysis_type'] = 'bed'
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )

        input_objects.append(regions)
        inputs['regions_file_gtf'] = get_data_id(regions)

    # Optional parameters are only sent when explicitly given.
    optional_inputs = (
        ('cell_type', cell_type),
        ('extension', extension),
        ('sense', sense),
        ('skip_plot', skip_plot),
        ('black_list', black_list),
        ('threads', threads),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    resolwe = get_resolwe(*input_objects)

    bamliquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(bamliquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamliquidator_obj)

    return bamliquidator_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None,
             threads=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Library type is by default fr-unstranded. Other
    parameters defaults: multi_read_correct=false, fdr=0.05,
    library_normalization=geometric, dispersion_method=pooled,
    threads=1. Parameter genome is optional.

    The way the function works depends on the resource. If a collection
    can be determined for the resource, the 'compare' relations labeled
    'case-control' of that collection are used. Otherwise the
    'compare' relations labeled 'case-control' are looked up by the ids
    of the given samples (not necessarily in the same collection).
    Samples in a relation that are not among the given samples are
    discarded.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples will be compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :param int threads: use this many processor threads
    :raises ValueError: if a position in a relation is neither 'case'
        nor 'control'

    """
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        inputs['genome'] = genome
        input_objects.append(genome)

    # Only parameters that were explicitly given are sent to the process.
    optional_inputs = (
        ('multi_read_correct', multi_read_correct),
        ('fdr', fdr),
        ('library_type', library_type),
        ('library_normalization', library_normalization),
        ('dispersion_method', dispersion_method),
        ('threads', threads),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]
    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    # Prefer filtering by collection; fall back to the sample ids.
    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter = {'collection': collection_id}
    else:
        relation_filter = {'entity': sample_ids}

    relations = resolwe.relation.filter(
        type='compare', label='case-control', **relation_filter
    )

    # NOTE(review): within this block the collected case/control ids are
    # not used any further and nothing is returned - the definition looks
    # truncated; confirm against the complete source.
    for relation in relations:
        groups = {'case': [], 'control': []}

        for sample, position in zip(relation.samples, relation.positions):
            if sample.id not in sample_ids:
                continue

            if position not in ('case', 'control'):
                raise ValueError(
                    "Position different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id))

            groups[position].append(get_data_id(sample.get_cuffquant()))
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None,
            color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
            plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` has to be given. Optional parameters are only sent
    to the process when they are given (not ``None``).

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param list resource: resource from which bam objects will be get
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9, MM10, RN4 and RN6)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strand, default
        maps to ``both``
    :param int extension: extends reads by n bp, default value is
        200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis
        scaling, default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple
        plots
    :param str title: title for the output plot(s), default will be
        the coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param list bed: subset of bed files to run process on (a single
        object or a list of them), if empty processes for all bed files
        will be run
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :return: the bamplot data object
    :raises KeyError: if not exactly one of ``input_gff`` and
        ``input_region`` is given or if ``genome`` is not supported

    """
    input_objects = []

    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError('Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10', 'RN4', 'RN6']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)
    bams = [get_data_id(bam) for bam in bams]

    inputs = {
        'genome': genome,
        'bam': bams,
    }

    # Every optional parameter is stored under the key of the same name.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if input_region is not None:
        inputs['input_region'] = input_region

    if bed is not None:
        # A single bed object is accepted as well as a list of them.
        if isinstance(bed, list):
            input_objects.extend(bed)
            inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed]
        else:
            input_objects.append(bed)
            inputs['bed'] = [get_data_id(bed)]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj