def bowtie2(resource, genome):
    """Align reads of the given resource with the ``bowtie2`` aligner.

    Reads files of every sample in ``resource`` are aligned to the given
    genome. If reads were already aligned, existing objects will be
    returned.

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment objects, one per processed sample
    """
    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            process_inputs = {
                'reads': sample.get_reads().id,
                'genome': get_data_id(genome),
            }
            alignment = sample.resolwe.get_or_run(slug='alignment-bowtie2', input=process_inputs)
            # Attach the alignment to the sample it was computed for.
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def hisat2(resource, genome):
    """Align reads of the given resource with the `Hisat2`_ aligner.

    Reads files of every sample in ``resource`` are aligned to the given
    genome. If reads were already aligned, existing objects will be
    returned.

    .. _Hisat2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-hisat2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :return: list of alignment objects, one per processed sample
    """
    if isinstance(resource, list):
        resources = resource
    else:
        resources = [resource]

    aligned_objects = []
    for current in resources:
        for sample in get_samples(current):
            hisat_inputs = {
                'reads': sample.get_reads().id,
                'genome': get_data_id(genome),
            }
            aligned = sample.resolwe.get_or_run(slug='alignment-hisat2', input=hisat_inputs)
            sample.add_data(aligned)
            aligned_objects.append(aligned)

    return aligned_objects
def cuffquant(resource, gff, genome=None, mask_file=None, library_type=None,
              multi_read_correct=None, threads=None):
    """Run Cuffquant_ for the samples of the given resource.

    This method runs `Cuffquant`_ process on the bam of every sample
    with the annotation given in ``gff``. Library type is by default
    fr-unstranded. Other parameters: ``genome``, ``mask_file``,
    ``multi_read_correct`` and ``threads`` are optional and are only
    passed to the process when they are given.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource whose samples will be processed
    :param gff: id of annotation file is given
    :type gff: int or `~resdk.resources.data.Data`
    :param genome: id of genome file is given to run bias detection and
        correction algorithm
    :type genome: int or `~resdk.resources.data.Data`
    :param mask_file: id of mask file is given
    :type mask_file: int or `~resdk.resources.data.Data`
    :param str library_type: options are fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param int threads: use this many processor threads
    :return: list of cuffquant objects, one per processed sample
    """
    results = []

    for sample in get_samples(resource):
        inputs = {
            'alignment': sample.get_bam().id,
            'gff': get_data_id(gff),
        }

        # ``genome`` and ``mask_file`` may be given as ids or data objects,
        # so they are reduced to ids the same way as ``gff``.
        if genome is not None:
            inputs['genome'] = get_data_id(genome)
        if mask_file is not None:
            inputs['mask_file'] = get_data_id(mask_file)

        if library_type is not None:
            inputs['library_type'] = library_type
        if multi_read_correct is not None:
            inputs['multi_read_correct'] = multi_read_correct
        if threads is not None:
            inputs['threads'] = threads

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant', input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on cuffquants of the samples in the resource.

    This method runs `Cuffnorm`_ process on ``resource`` with
    ``annotation`` and ``use_ercc`` parameters specified in arguments.
    When ``resource`` is a collection the result is added to it; when it
    is a relation the result is added to the relation's collection.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object
    """
    # NOTE: the collection filter is prepared here, but nothing below reads it.
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    inputs = {
        'cuffquant': [get_data_id(sample.get_cuffquant()) for sample in samples],
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    result = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
def cuffquant(resource, annotation, genome=None, mask_file=None, library_type=None,
              multi_read_correct=None):
    """Run Cuffquant_ for the samples of the given resource.

    This method runs `Cuffquant`_ process on the bam of every sample
    with ``annotation`` specified in arguments. Library type is by
    default fr-unstranded. Other parameters: ``genome``, ``mask_file``
    and ``multi_read_correct`` are optional and are only passed to the
    process when they are given.

    .. _Cuffquant:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffquant

    :param resource: resource whose samples will be processed
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param mask_file: mask file to use in process
    :type mask_file: `~resdk.resources.data.Data`
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :return: list of cuffquant objects, one per processed sample
    """
    results = []

    for sample in get_samples(resource):
        inputs = {
            'alignment': sample.get_bam().id,
            'annotation': get_data_id(annotation),
        }

        # Data inputs are reduced to ids the same way as ``annotation``,
        # so the process never receives a raw data object.
        if genome is not None:
            inputs['genome'] = get_data_id(genome)
        if mask_file is not None:
            inputs['mask_file'] = get_data_id(mask_file)

        if library_type is not None:
            inputs['library_type'] = library_type
        if multi_read_correct is not None:
            inputs['multi_read_correct'] = multi_read_correct

        cuffquant_obj = sample.resolwe.get_or_run(slug='cuffquant', input=inputs)
        sample.add_data(cuffquant_obj)
        results.append(cuffquant_obj)

    return results
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None,
            color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
            plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` has to be given. Optional parameters are passed to
    the process only when they are not ``None``.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param resource: resource from which bam objects will be taken
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9 and MM10)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands, default
        maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis
        scaling, default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple
        plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: bed object (or list of bed objects) passed to the
        process as the ``bed`` input
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :return: bamplot data object
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not valid
    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region.')
    if input_gff and input_region:
        raise KeyError('Please specify `input_gff` or `input_region.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    # Objects collected here are only used to determine the resolwe instance.
    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Every optional value is stored under the input of the same name, so
    # ``sense`` can no longer end up in ``scale`` and ``stretch_input`` is
    # no longer silently dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
    )
    for name, value in optional_inputs:
        if value is not None:
            inputs[name] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)
    if input_region is not None:
        inputs['input_region'] = input_region

    if bed is not None:
        # A single bed object is wrapped so the process always gets a list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)
    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Library type is by default fr-unstranded. Other
    parameters defaults: multi_read_correct=false, fdr=0.05,
    library_normalization=geometric, dispersion_method=pooled.
    Parameter genome is optional.

    The way the function works depends on the resource. If a collection
    can be determined for the resource, every ``compare`` relation of
    that collection is considered, otherwise ``compare`` relations are
    looked up by the given samples. Within a relation, samples that are
    not part of the resource are discarded, and relations left without
    a ``case`` or without a ``control`` sample are skipped.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples will be compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :return: list of cuffdiff objects, one per processed relation
    :raises ValueError: if a partition label other than ``case`` or
        ``control`` is found, or if no relation could be processed
    """
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        inputs['genome'] = genome.id
        input_objects.append(genome)
    if multi_read_correct is not None:
        inputs['multi_read_correct'] = multi_read_correct
    if fdr is not None:
        inputs['fdr'] = fdr
    if library_type is not None:
        inputs['library_type'] = library_type
    if library_normalization is not None:
        inputs['library_normalization'] = library_normalization
    if dispersion_method is not None:
        inputs['dispersion_method'] = dispersion_method

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]
    input_objects.extend(samples)

    resolwe = get_resolwe(*input_objects)

    collection_id = get_resource_collection(resource)
    relation_filter = {}
    if collection_id:
        relation_filter['collection'] = collection_id
    else:
        relation_filter['entity'] = sample_ids

    relations = resolwe.relation.filter(type='compare', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []

        for partition in relation.partitions:
            sample = resolwe.sample.get(partition['entity'])
            label = partition['label']

            # Samples outside of the given resource are not compared.
            if sample.id not in sample_ids:
                continue

            if label == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif label == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Label different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id)
                )

        # Both sides are needed for a comparison.
        if not case or not control:
            continue

        # Each run gets its own copy, so the shared inputs are never
        # polluted with ``case``/``control`` of a previous relation.
        relation_inputs = dict(inputs)
        relation_inputs['case'] = case
        relation_inputs['control'] = control

        cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=relation_inputs)
        cuffdiff_objects.append(cuffdiff_obj)

        if is_collection(resource):
            resource.add_data(cuffdiff_obj)
        elif is_relation(resource):
            resource.collection.add_data(cuffdiff_obj)

    if not cuffdiff_objects:
        if not relations:
            raise ValueError("No relation containing all of the given samples was found")
        else:
            raise ValueError(
                "No suitable relation was found (given samples all have either 'case' label "
                "or 'control' label)"
            )

    return cuffdiff_objects
def rose2(resource, use_background=True, tss=None, stitch=None, beds=None):
    """Run ``ROSE 2`` process on the resource.

    This method runs `ROSE2`_ process with ``tss_exclusion`` and
    ``stitch`` parameters specified in arguments. Separate process is
    run for each bed file on the sample. To run process only on subset
    of those files, list them in ``beds`` argument (if only one object
    is given, it will be auto-wrapped in list, if it is not already).

    If ``use_background`` argument is set to ``True``, bam file from
    background sample is passed to the process as the control.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param resource: resource (or list of resources) to process
    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on, if empty
        processes for all bed files will be run
    :return: list of ROSE2 data objects
    """
    resources = resource if isinstance(resource, list) else [resource]

    rose_objects = []
    for item in resources:
        # Background lookup is limited to the collection of the resource
        # whenever one can be determined.
        background_filter = {}
        if use_background:
            collection_id = get_resource_collection(item)
            if collection_id:
                background_filter['collection'] = collection_id

        for sample in get_samples(item):
            inputs = {
                'rankby': sample.get_bam().id,
            }
            if tss is not None:
                inputs['tss'] = tss
            if stitch is not None:
                inputs['stitch'] = stitch

            if use_background:
                # Don't run process on the background sample,
                # but let it fail if it is run directly on sample
                if sample.is_background and not is_sample(item):
                    continue

                background = sample.get_background(**background_filter)
                inputs['control'] = background.get_bam().id

            macs_beds = sample.get_macs()
            if beds is not None:
                # Convert objects to the list of their ids
                requested = beds if isinstance(beds, list) else [beds]
                wanted_ids = [get_data_id(requested_bed) for requested_bed in requested]
                macs_beds = macs_beds.filter(id__in=wanted_ids)

            for macs_bed in macs_beds:
                inputs['input'] = macs_bed.id
                rose = sample.resolwe.get_or_run(slug='rose2', input=inputs)
                sample.add_data(rose)
                rose_objects.append(rose)

    return rose_objects
def run_rose2(self, use_background=True, background_slug='', genome='HG19', tss=None,
              stitch=None, beds=None):
    """Run ``ROSE 2`` process on the sample.

    This method runs `ROSE2`_ process with ``tss_exclusion`` and
    ``stitch`` parameters specified in arguments. Separate process is
    run for each bed file on the sample. To run process only on subset
    of those files, list them in ``beds`` argument (if only one object
    is given, it will be auto-wrapped in list, if it is not already).

    If ``use_background`` argument is set to ``True``, bam file from
    background sample is passed to the process as the control. When no
    background sample is found, the process runs without a control and
    this is logged.

    .. _ROSE2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-rose2

    :param bool use_background: if set to ``True``, background sample
        will be used in the process
    :param str background_slug: slug passed to ``get_background`` when
        looking up the background sample
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM9 and MM10), default is HG19
    :param int tss: TSS exclusion used in process
    :param int stitch: Stitch used in process
    :param list beds: subset of bed files to run process on, if empty
        processes for all bed files will be run
    :return: list of ROSE2 data objects
    :raises KeyError: if ``genome`` is not one of the valid options
    """
    valid_genomes = ['HG18', 'HG19', 'MM9', 'MM10']
    if genome not in valid_genomes:
        message = 'Invalid `genome`, please use one of the following: ' '{}'
        raise KeyError(message.format(', '.join(valid_genomes)))

    inputs = {
        'genome': genome,
        'rankby': self.get_bam().id,
    }
    if tss is not None:
        inputs['tss'] = tss
    if stitch is not None:
        inputs['stitch'] = stitch

    if use_background:
        background = self.get_background(background_slug, fail_silently=True)
        if not background:
            self.logger.info('Rose-2 will run without a control sample.')
        else:
            inputs['control'] = background.get_bam().id

    macs_beds = self.get_macs()
    if beds is not None:
        # Convert objects to the list of their ids
        requested = beds if isinstance(beds, list) else [beds]
        wanted_ids = [get_data_id(requested_bed) for requested_bed in requested]
        macs_beds = macs_beds.filter(id__in=wanted_ids)

    rose_objects = []
    for macs_bed in macs_beds:
        inputs['input'] = macs_bed.id
        rose = self.resolwe.get_or_run(slug='rose2', input=inputs)
        self.add_data(rose)
        rose_objects.append(rose)

    return rose_objects
def bowtie2(resource, genome, mode=None, speed=None, use_se=None, discordantly=None,
            rep_se=None, minins=None, maxins=None, trim_5=None, trim_3=None, trim_iter=None,
            trim_nucl=None, rep_mode=None, k_reports=None):
    """Run bowtie2 aligner on given resource.

    Align reads files of given resource to the given genome using the
    `Bowtie2`_ aligner. If reads were already aligned, existing objects
    will be returned. Optional parameters are passed to the process
    only when they are not ``None``.

    .. _Bowtie2:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-alignment-bowtie2

    :param resource: resource (or list of resources) of which reads
        will be aligned
    :param genome: data object with genome that will be used
    :type genome: `~resdk.resources.data.Data`
    :param str mode: alignment mode (options are: --end-to-end,
        --local), default is --end-to-end
    :param str speed: speed vs sensitivity (options are: --very-fast,
        --fast, --sensitive, --very-sensitive), default is --sensitive
    :param bool use_se: map as single-ended (for paired-end reads
        only), default is False
    :param bool discordantly: report discordantly matched read, default
        is True
    :param bool rep_se: report single ended, default is True
    :param int minins: minimum fragment length, default is 0
    :param int maxins: maximum fragment length, default is 500
    :param int trim_5: number of bases to trim from 5', default is 0
    :param int trim_3: number of bases to trim from 3', default is 0
    :param int trim_iter: number of iterations, default is 0
    :param int trim_nucl: number of bases to trim from 3' in each
        iteration, default is 2
    :param str rep_mode: report mode (options are: def, k, a), default
        is def
    :param int k_reports: number of reports (for -k mode only), default
        is 5
    :return: list of alignment objects, one per processed sample
    """
    inputs = {'genome': get_data_id(genome)}

    optional_inputs = (
        ('mode', mode),
        ('speed', speed),
        ('use_se', use_se),
        ('discordantly', discordantly),
        ('rep_se', rep_se),
        ('minins', minins),
        ('maxins', maxins),
        ('trim_5', trim_5),
        ('trim_3', trim_3),
        ('trim_iter', trim_iter),
        ('trim_nucl', trim_nucl),
        ('rep_mode', rep_mode),
        ('k_reports', k_reports),
    )
    for name, value in optional_inputs:
        if value is not None:
            inputs[name] = value

    resources = resource if isinstance(resource, list) else [resource]

    alignments = []
    for item in resources:
        for sample in get_samples(item):
            # Only the reads differ between samples, the rest is shared.
            inputs['reads'] = sample.get_reads().id
            alignment = sample.resolwe.get_or_run(slug='alignment-bowtie2', input=inputs)
            sample.add_data(alignment)
            alignments.append(alignment)

    return alignments
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on cuffquants of the samples in the resource.

    This method runs `Cuffnorm`_ process on ``resource`` with
    ``annotation`` and ``use_ercc`` parameters specified in arguments.
    Every sample has to belong to exactly one ``group`` relation;
    samples of the same relation get the same replicate number.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :return: cuffnorm data object
    :raises LookupError: if a sample does not have exactly one matching
        ``group`` relation
    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquants = [get_data_id(sample.get_cuffquant()) for sample in samples]

    replicates = []
    # Maps relation id to the replicate number assigned to it.
    replicate_numbers = {}
    for sample in samples:
        relations = resolwe.relation.filter(
            type='group',
            entity=[sample.id],
            **relation_filter
        )
        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name)
            )

        relation_id = relations[0].id
        if relation_id not in replicate_numbers:
            # Numbers are handed out in order of first appearance.
            replicate_numbers[relation_id] = str(len(replicate_numbers))
        replicates.append(replicate_numbers[relation_id])

    inputs = {
        'cuffquant': cuffquants,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc

    result = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None,
             threads=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified
    in arguments. Library type is by default fr-unstranded. Other
    parameters defaults: multi_read_correct=false, fdr=0.05,
    library_normalization=geometric, dispersion_method=pooled,
    threads=1. Parameter genome is optional.

    The way the function works depends on the resource. If it is run on
    a collection, it will perform cuffdiff on every 'compare' relation
    labeled 'case-control' in the selected collection. If it is run on
    a list of samples (not necessarily in the same collection) it will
    run cuffdiff on all 'compare' relations labeled 'case-control'
    containing all of the given samples but will discard those samples
    in a relation that are not in the list of samples.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource whose samples will be compared
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and
        correction algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded,
        fr-firststrand, fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :param int threads: use this many processor threads
    :raises ValueError: if a position other than ``case`` or
        ``control`` is found in a relation
    """
    # Process inputs shared by all runs; optional values are added only
    # when they are given.
    inputs = {'annotation': get_data_id(annotation)}
    # Objects collected here are used to determine the resolwe instance.
    input_objects = [annotation]

    if genome is not None:
        inputs['genome'] = genome
        input_objects.append(genome)

    if multi_read_correct is not None:
        inputs['multi_read_correct'] = multi_read_correct

    if fdr is not None:
        inputs['fdr'] = fdr

    if library_type is not None:
        inputs['library_type'] = library_type

    if library_normalization is not None:
        inputs['library_normalization'] = library_normalization

    if dispersion_method is not None:
        inputs['dispersion_method'] = dispersion_method

    if threads is not None:
        inputs['threads'] = threads

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]
    input_objects.extend(samples)

    resolwe = get_resolwe(*input_objects)

    # Relations are looked up by collection when one can be determined,
    # otherwise by the ids of the given samples.
    collection_id = get_resource_collection(resource)
    relation_filter = {}
    if collection_id:
        relation_filter['collection'] = collection_id
    else:
        relation_filter['entity'] = sample_ids

    relations = resolwe.relation.filter(type='compare', label='case-control', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []

        # Split cuffquants of the relation's samples by their position.
        for sample, position in zip(relation.samples, relation.positions):
            # Samples that were not given in the resource are skipped.
            if sample.id not in sample_ids:
                continue

            if position == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif position == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Position different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id))
    # NOTE(review): the visible definition ends here -- ``case``/``control``
    # are collected but no process is run and ``cuffdiff_objects`` is never
    # filled or returned. Confirm whether the rest of the body is missing.
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. Bin mode is used when ``bin_size`` is given; otherwise the
    mode is derived from the process type of ``regions``.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param resource: resource from which bam objects will be taken
    :param str cell_type: the name of cell type will be given in counts
        tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is
        200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :return: bamliquidator data object
    :raises KeyError: if not exactly one of ``bin_size`` and
        ``regions`` is given, or if ``regions`` is not a data object of
        a supported type
    """
    # Exactly one of the two has to be set. Truthiness is compared
    # explicitly so that ``None`` and data objects are handled as well.
    if bool(bin_size) == bool(regions):
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    # Objects collected here are only used to determine the resolwe instance.
    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'bam': [get_data_id(bam) for bam in bams],
    }

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:  # regions
        if regions.process_type == 'data:annotation:gtf:':
            inputs['analysis_type'] = 'gtf'
        elif regions.process_type == 'data:bed:':
            inputs['analysis_type'] = 'bed'
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )

        input_objects.append(regions)
        inputs['regions_file_gtf'] = get_data_id(regions)

    if cell_type is not None:
        inputs['cell_type'] = cell_type
    if extension is not None:
        inputs['extension'] = extension
    if sense is not None:
        inputs['sense'] = sense
    if skip_plot is not None:
        inputs['skip_plot'] = skip_plot
    if black_list is not None:
        inputs['black_list'] = black_list
    if threads is not None:
        inputs['threads'] = threads

    resolwe = get_resolwe(*input_objects)
    bamliquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(bamliquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamliquidator_obj)

    return bamliquidator_obj
def test_get_data_id(self):
    """Check ``get_data_id`` for a data object and for a plain id."""
    data_obj = Data(id=1, resolwe=MagicMock())
    # ``id`` is overridden during initialization, so it is set explicitly.
    data_obj.id = 1

    self.assertEqual(get_data_id(data_obj), 1)
    self.assertEqual(get_data_id(2), 2)
def cuffnorm(resource, annotation, use_ercc=None, threads=None):
    """Run Cuffnorm_ on cuffquants of the samples in the resource.

    This method runs `Cuffnorm`_ process on ``resource`` with
    ``annotation``, ``use_ercc`` and ``threads`` parameters specified
    in arguments. Every sample has to belong to exactly one ``group``
    relation labeled ``replicates``; samples of the same relation get
    the same replicate number, and the ids of those relations are
    passed to the process as labels.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: int or `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization
    :param int threads: use this many threads to align reads
        (default: ``1``)
    :return: cuffnorm data object
    :raises LookupError: if a sample does not have exactly one matching
        relation
    """
    collection_id = get_resource_collection(resource)
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquants = [get_data_id(sample.get_cuffquant()) for sample in samples]

    labels = []
    replicates = []
    # Maps relation id to the replicate number assigned to it.
    replicate_numbers = {}
    for sample in samples:
        relations = resolwe.relation.filter(
            type='group',
            label='replicates',
            entity=[sample.id],
            **relation_filter
        )
        if len(relations) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name))

        relation_id = relations[0].id
        if relation_id not in replicate_numbers:
            # Numbers are handed out in order of first appearance.
            replicate_numbers[relation_id] = str(len(replicate_numbers))
        replicates.append(replicate_numbers[relation_id])

        label = str(relation_id)
        if label not in labels:
            labels.append(label)

    inputs = {
        'cuffquant': cuffquants,
        'replicates': replicates,
        'annotation': get_data_id(annotation),
        'labels': labels,
    }
    if use_ercc is not None:
        inputs['useERCC'] = use_ercc
    if threads is not None:
        inputs['threads'] = threads

    result = resolwe.get_or_run(slug='cuffnorm', input=inputs)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. Bin mode is used when ``bin_size`` is given; otherwise the
    mode is derived from the process type of ``regions``.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param resource: resource from which bam objects will be taken
    :param str cell_type: the name of cell type will be given in counts
        tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is
        200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :return: bamliquidator data object
    :raises KeyError: if not exactly one of ``bin_size`` and
        ``regions`` is given, or if ``regions`` is not a data object of
        a supported type
    """
    # Exactly one of the two has to be set. Truthiness is compared
    # explicitly so that ``None`` and data objects are handled as well.
    if bool(bin_size) == bool(regions):
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    # Objects collected here are only used to determine the resolwe instance.
    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'bam': [get_data_id(bam) for bam in bams],
    }

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:  # regions
        if regions.process_type == 'data:annotation:gtf:':
            inputs['analysis_type'] = 'gtf'
        elif regions.process_type == 'data:bed:':
            inputs['analysis_type'] = 'bed'
        else:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )

        input_objects.append(regions)
        inputs['regions_file_gtf'] = get_data_id(regions)

    optional_inputs = (
        ('cell_type', cell_type),
        ('extension', extension),
        ('sense', sense),
        ('skip_plot', skip_plot),
        ('black_list', black_list),
        ('threads', threads),
    )
    for name, value in optional_inputs:
        if value is not None:
            inputs[name] = value

    resolwe = get_resolwe(*input_objects)
    bamliquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(bamliquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamliquidator_obj)

    return bamliquidator_obj
def run_bamplot(self, bam, genome, input_gff=None, input_region=None, stretch_input=None,
                color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
                plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run the ``bamplot`` process on the given bam object(s).

    Exactly one of ``input_gff`` and ``input_region`` must be given.
    Optional settings are only sent to the process when they are not
    ``None``.

    :param bam: bam object (or id), or a list of them
    :param str genome: one of HG18, HG19, MM8, MM9 and MM10
    :param input_gff: annotation object (or id)
    :param str input_region: genomic region
    :param stretch_input: passed to the process as ``stretch_input``
    :param color: passed to the process as ``color``
    :param sense: passed to the process as ``sense``
    :param extension: passed to the process as ``extension``
    :param rpm: passed to the process as ``rpm``
    :param yscale: passed to the process as ``yscale``
    :param names: passed to the process as ``names``
    :param plot: passed to the process as ``plot``
    :param title: passed to the process as ``title``
    :param scale: passed to the process as ``scale``
    :param bed: bed object (or id), or a list of them
    :param multi_page: passed to the process as ``multi_page``
    :return: whatever ``self.get_or_run`` returns for slug ``bamplot``
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not valid

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region.')
    if input_gff and input_region:
        raise KeyError('Please specify `input_gff` or `input_region.')

    # NOTE(review): the module-level ``bamplot`` helper also accepts RN4 and
    # RN6 - confirm whether this list should match it.
    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
    if genome not in valid_genomes:
        raise KeyError(
            'Invalid `genome`, please use one of the following: {}'.format(
                ', '.join(valid_genomes)))

    # The process always receives a list of bam ids.
    if isinstance(bam, list):
        bam = [get_data_id(bam_obj) for bam_obj in bam]
    else:
        bam = [get_data_id(bam)]

    inputs = {
        'genome': genome,
        'bam': bam,
    }

    # Forward each optional setting under its own key. Previously ``sense``
    # was stored under ``scale`` and ``stretch_input`` was dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
    )
    for field_name, field_value in optional_inputs:
        if field_value is not None:
            inputs[field_name] = field_value

    if input_gff is not None:
        inputs['input_gff'] = get_data_id(input_gff)

    if input_region is not None:
        inputs['input_region'] = input_region

    if bed is not None:
        if isinstance(bed, list):
            inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed]
        else:
            inputs['bed'] = [get_data_id(bed)]

    return self.get_or_run(slug='bamplot', input=inputs)
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None,
            color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
            plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` must be given.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param list resource: resource from which bam objects are collected
    :param str genome: genome used in the process (options are HG18,
        HG19, MM8, MM9, MM10, RN4 and RN6)
    :param input_gff: annotation file, given as id or data object
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands, default
        maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis
        scaling, default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple
        plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: bed object or list of bed objects to run the process on
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :return: the object returned by ``get_or_run`` for the ``bamplot``
        process
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not valid

    """
    # Every object that takes part in the run; handed to ``get_resolwe`` below.
    input_objects = []

    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region.')
    if input_gff and input_region:
        raise KeyError('Please specify `input_gff` or `input_region.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10', 'RN4', 'RN6']
    if genome not in valid_genomes:
        raise KeyError(
            'Invalid `genome`, please use one of the following: {}'.format(
                ', '.join(valid_genomes)))

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Forward each optional setting under its own key. Previously ``sense``
    # was stored under ``scale`` and ``stretch_input`` was dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
    )
    for field_name, field_value in optional_inputs:
        if field_value is not None:
            inputs[field_name] = field_value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if input_region is not None:
        inputs['input_region'] = input_region

    if bed is not None:
        # The process always receives a list of bed ids.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)
    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    # Attach the result to the collection the resource represents, if any.
    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj