def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on the cuffquant objects of the given resource.

    The cuffquant of every sample obtained from ``resource`` is collected
    and passed, together with ``annotation``, to the ``cuffnorm`` process.
    The resulting data object is attached to the collection when
    ``resource`` is a collection or a relation.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization;
        only forwarded to the process when it is not ``None``
    :return: the object returned by ``get_or_run`` for the ``cuffnorm``
        process

    """
    # The result of the collection lookup is not consumed by this variant.
    # The call is kept so that whatever the helper does when it cannot
    # resolve a collection (not visible from here) still happens.
    get_resource_collection(resource)

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    process_inputs = {
        'cuffquant': [get_data_id(sample.get_cuffquant()) for sample in samples],
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        process_inputs['useERCC'] = use_ercc

    result = resolwe.get_or_run(slug='cuffnorm', input=process_inputs)

    # Keep the result next to its inputs when a collection is known.
    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
control.append(get_data_id(sample.get_cuffquant())) else: raise ValueError( "Position different from 'case' or 'control' was found in the " "following relation: {}".format(relation.id)) if not case or not control: continue inputs['case'] = case inputs['control'] = control cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=inputs) cuffdiff_objects.append(cuffdiff_obj) if is_collection(resource): resource.add_data(cuffdiff_obj) elif is_relation(resource): resource.collection.add_data(cuffdiff_obj) if not cuffdiff_objects: if not relations: raise ValueError( "No relation containing all of the given samples was found") else: raise ValueError( "No suitable relation was found (given samples all have either 'case' position " "or 'control' position") return cuffdiff_objects
def cuffnorm(resource, annotation, use_ercc=None, threads=None):
    """Run Cuffnorm_ on the cuffquant objects of the given resource.

    For every sample obtained from ``resource`` the unique ``group``
    relation labelled ``replicates`` is looked up. Samples sharing a
    relation get the same replicate index (``'0'``, ``'1'``, ... in order
    of first appearance) and the relation ids, as strings, are used as
    labels.

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: int or `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization;
        forwarded as ``useERCC`` when not ``None``
    :param int threads: number of threads passed to the process; forwarded
        only when not ``None``
    :raises LookupError: if a sample does not belong to exactly one
        matching relation
    :return: the object returned by ``get_or_run`` for the ``cuffnorm``
        process

    """
    collection_id = get_resource_collection(resource)
    # Restrict the relation search to the resource's collection, if any.
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    group_labels = []
    replicate_indices = []
    index_by_relation = {}
    for sample in samples:
        matches = resolwe.relation.filter(
            type='group',
            label='replicates',
            entity=[sample.id],
            **relation_filter
        )
        if len(matches) != 1:
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name))
        relation = matches[0]

        # First time a relation is seen it gets the next free index.
        if relation.id not in index_by_relation:
            index_by_relation[relation.id] = str(len(index_by_relation))
        replicate_indices.append(index_by_relation[relation.id])

        label = str(relation.id)
        if label not in group_labels:
            group_labels.append(label)

    process_inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicate_indices,
        'annotation': get_data_id(annotation),
        'labels': group_labels,
    }
    for key, value in (('useERCC', use_ercc), ('threads', threads)):
        if value is not None:
            process_inputs[key] = value

    result = resolwe.get_or_run(slug='cuffnorm', input=process_inputs)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None,
            color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
            plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` must be given. Optional arguments are forwarded to
    the process only when they are not ``None``.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param list resource: resource from which bam objects will be get
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9 and MM10)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands, default
        maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis scaling,
        default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: bed object, or list of bed objects, to run the process on
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not supported
    :return: the object returned by ``get_or_run`` for the ``bamplot``
        process

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError('Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Plain pass-through options: each one is sent under its own name and
    # only when the caller supplied a value. Previously ``sense`` was
    # stored under the ``scale`` key and ``stretch_input`` was dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if input_region is not None:
        inputs['input_region'] = input_region

    if bed is not None:
        # A single bed object is treated as a one-element list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj
def cuffdiff(resource, annotation, genome=None, multi_read_correct=None, fdr=None,
             library_type=None, library_normalization=None, dispersion_method=None):
    """Run Cuffdiff_ for selected cuffquants.

    This method runs `Cuffdiff`_ process with ``annotation`` specified in
    arguments. Library type is by default fr-unstranded. Other parameters
    defaults: multi_read_correct=false, fdr=0.05,
    library_normalization=geometric, dispersion_method=pooled. Parameter
    genome is optional. Optional parameters are forwarded to the process
    only when they are not ``None``.

    The way the function works depends on the resource. If a collection
    can be determined for the resource, cuffdiff is run for every
    ``compare`` relation in that collection. Otherwise ``compare``
    relations are looked up by the given samples (not necessarily in the
    same collection). In both cases samples of a relation that are not
    among the given samples are discarded, and relations left without a
    ``case`` or without a ``control`` sample are skipped.

    .. _Cuffdiff:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-cuffdiff

    :param resource: resource from which samples are obtained
    :param annotation: annotation file
    :type annotation: `~resdk.resources.data.Data`
    :param genome: genome object to use for bias detection and correction
        algorithm
    :type genome: `~resdk.resources.data.Data`
    :param bool multi_read_correct: do initial estimation procedure to
        more accurately weight reads with multiple genome mappings
    :param fdr: the allowed false discovery rate
    :type fdr: decimal
    :param str library_type: options are: fr-unstranded, fr-firststrand,
        fr-secondstrand
    :param str library_normalization: options are: geometric,
        classic-fpkm, quartile
    :param str dispersion_method: options are: pooled, per-condition,
        blind, poisson
    :raises ValueError: if a selected sample has a label other than
        ``case`` or ``control``, or if no relation yields both a case and
        a control sample
    :return: list of objects returned by ``get_or_run``, one per processed
        relation

    """
    inputs = {'annotation': get_data_id(annotation)}
    input_objects = [annotation]

    if genome is not None:
        inputs['genome'] = genome.id
        input_objects.append(genome)

    optional_inputs = (
        ('multi_read_correct', multi_read_correct),
        ('fdr', fdr),
        ('library_type', library_type),
        ('library_normalization', library_normalization),
        ('dispersion_method', dispersion_method),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    samples = get_samples(resource)
    sample_ids = [sample.id for sample in samples]
    # Set copy for O(1) membership tests; the list is still what gets sent
    # as the ``entity`` filter below.
    selected_ids = set(sample_ids)
    input_objects.extend(samples)
    resolwe = get_resolwe(*input_objects)

    collection_id = get_resource_collection(resource)
    if collection_id:
        relation_filter = {'collection': collection_id}
    else:
        relation_filter = {'entity': sample_ids}

    relations = resolwe.relation.filter(type='compare', **relation_filter)

    cuffdiff_objects = []
    for relation in relations:
        control = []
        case = []

        for partition in relation.partitions:
            # NOTE(review): assumes ``partition['entity']`` equals the id of
            # the sample it fetches (it is the lookup key used for
            # ``resolwe.sample.get``) - confirm. Checking it first avoids
            # fetching samples that would be discarded anyway.
            entity_id = partition['entity']
            if entity_id not in selected_ids:
                continue

            sample = resolwe.sample.get(entity_id)
            label = partition['label']

            if label == 'case':
                case.append(get_data_id(sample.get_cuffquant()))
            elif label == 'control':
                control.append(get_data_id(sample.get_cuffquant()))
            else:
                raise ValueError(
                    "Label different from 'case' or 'control' was found in the "
                    "following relation: {}".format(relation.id)
                )

        # Both sides are needed for a comparison.
        if not case or not control:
            continue

        # Build a fresh dict per run so that a previous run's inputs are
        # never modified after they were handed to ``get_or_run``.
        run_inputs = dict(inputs, case=case, control=control)

        cuffdiff_obj = resolwe.get_or_run(slug='cuffdiff', input=run_inputs)
        cuffdiff_objects.append(cuffdiff_obj)

        if is_collection(resource):
            resource.add_data(cuffdiff_obj)
        elif is_relation(resource):
            resource.collection.add_data(cuffdiff_obj)

    if not cuffdiff_objects:
        if not relations:
            raise ValueError("No relation containing all of the given samples was found")
        raise ValueError(
            "No suitable relation was found (given samples all have either 'case' label "
            "or 'control' label)"
        )

    return cuffdiff_objects
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. The mode is ``bin`` when ``bin_size`` is given; otherwise it is
    derived from the process type of ``regions``.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param list resource: resource from which bam objects will be get
    :param str cell_type: the name of cell type will be given in counts
        tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is 200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :raises KeyError: if not exactly one of ``bin_size`` and ``regions``
        is given, or if ``regions`` is not a data object of a supported
        type
    :return: the object returned by ``get_or_run`` for the
        ``bamliquidator`` process

    """
    if not xor(bin_size, regions):
        raise KeyError(
            'Exactly one of `bin_size` and `regions` parameters must be given.'
        )
    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    bam_objects = [sample.get_bam() for sample in get_samples(resource)]
    input_objects = list(bam_objects)

    inputs = {'bam': [get_data_id(bam) for bam in bam_objects]}

    if bin_size:
        inputs['analysis_type'] = 'bin'
        inputs['bin_size'] = bin_size
    else:
        # Region mode: the analysis type follows the kind of annotation.
        type_to_mode = {
            'data:annotation:gtf:': 'gtf',
            'data:bed:': 'bed',
        }
        mode = type_to_mode.get(regions.process_type)
        if mode is None:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )
        inputs['analysis_type'] = mode
        input_objects.append(regions)
        # NOTE(review): the same ``regions_file_gtf`` key is used for bed
        # regions as well - confirm against the process definition.
        inputs['regions_file_gtf'] = get_data_id(regions)

    # Remaining options are forwarded only when explicitly set.
    optional_inputs = (
        ('cell_type', cell_type),
        ('extension', extension),
        ('sense', sense),
        ('skip_plot', skip_plot),
        ('black_list', black_list),
        ('threads', threads),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    resolwe = get_resolwe(*input_objects)
    result = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
def cuffnorm(resource, annotation, use_ercc=None):
    """Run Cuffnorm_ on the cuffquant objects of the given resource.

    For every sample obtained from ``resource`` the unique ``group``
    relation containing it is looked up. Samples sharing a relation get
    the same replicate index (``'0'``, ``'1'``, ... in order of first
    appearance).

    .. _Cuffnorm:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-upload-expression-cuffnorm

    :param resource: resource on which cuffnorm will be run
    :param annotation: annotation object used in cuffnorm
    :type annotation: `~resdk.resources.data.Data`
    :param bool use_ercc: use ERCC spike-in controls for normalization;
        forwarded as ``useERCC`` when not ``None``
    :raises LookupError: if a sample does not belong to exactly one
        ``group`` relation
    :return: the object returned by ``get_or_run`` for the ``cuffnorm``
        process

    """
    collection_id = get_resource_collection(resource)
    # Restrict the relation search to the resource's collection, if any.
    relation_filter = {'collection': collection_id} if collection_id else {}

    samples = get_samples(resource)
    resolwe = get_resolwe(annotation, *samples)

    cuffquant_ids = [get_data_id(sample.get_cuffquant()) for sample in samples]

    replicate_indices = []
    index_by_relation = {}
    for sample in samples:
        matches = resolwe.relation.filter(
            type='group',
            entity=[sample.id],
            **relation_filter
        )
        if len(matches) != 1:
            # NOTE(review): the message mentions the label `replicates`,
            # but the query above does not filter on a label - confirm
            # which of the two is intended.
            raise LookupError(
                "Cannot determine unique group relation with label `replicates` for the "
                "following sample: {}".format(sample.name)
            )
        relation = matches[0]

        # First time a relation is seen it gets the next free index.
        if relation.id not in index_by_relation:
            index_by_relation[relation.id] = str(len(index_by_relation))
        replicate_indices.append(index_by_relation[relation.id])

    process_inputs = {
        'cuffquant': cuffquant_ids,
        'replicates': replicate_indices,
        'annotation': get_data_id(annotation),
    }
    if use_ercc is not None:
        process_inputs['useERCC'] = use_ercc

    result = resolwe.get_or_run(slug='cuffnorm', input=process_inputs)

    if is_collection(resource):
        resource.add_data(result)
    elif is_relation(resource):
        resource.collection.add_data(result)

    return result
def bamliquidator(resource, cell_type=None, bin_size=None, regions=None, extension=None,
                  sense=None, skip_plot=None, black_list=None, threads=None):
    """Run ``bamliquidator`` on the resource.

    This method runs `bamliquidator`_ with bams, where three different
    analysis type options are possible: Bin mode, Region mode and BED
    mode. ``bin_size`` selects bin mode; otherwise the process type of
    ``regions`` decides between the gtf and bed modes.

    .. _bamliquidator:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamliquidator

    :param list resource: resource from which bam objects will be get
    :param str cell_type: the name of cell type will be given in counts
        tables
    :param int bin_size: number of base pairs in each bin. Default is
        100000.
    :param regions: gtf or bed annotation object used in region mode
    :type regions: `~resdk.resources.data.Data`
    :param int extension: Extends reads by number of bp. Default is 200.
    :param str sense: Mapping strand to gff file. Use '+' for forward,
        '-' for reverse and '.' for both. Default is both.
    :param bool skip_plot: True for skip plot.
    :param list str black_list: One or more chromosome patterns to skip
        during bin liquidation. Default is to skip any chromosomes that
        contain any of the following substrings `chrUn`, `_random`,
        `Zv9_` or `_hap`.
    :param int threads: Number of CPUs
    :raises KeyError: if not exactly one of ``bin_size`` and ``regions``
        is given, or if ``regions`` is not a data object of a supported
        type
    :return: the object returned by ``get_or_run`` for the
        ``bamliquidator`` process

    """
    exactly_one_given = xor(bin_size, regions)
    if not exactly_one_given:
        raise KeyError('Exactly one of `bin_size` and `regions` parameters must be given.')

    if regions and not is_data(regions):
        raise KeyError('`regions` parameter must be data object.')

    source_bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects = list(source_bams)

    inputs = {'bam': [get_data_id(bam) for bam in source_bams]}

    if bin_size:
        inputs.update([('analysis_type', 'bin'), ('bin_size', bin_size)])
    else:
        # Region mode: pick the analysis type from the annotation kind.
        analysis_by_type = {
            'data:annotation:gtf:': 'gtf',
            'data:bed:': 'bed',
        }
        analysis_type = analysis_by_type.get(regions.process_type)
        if analysis_type is None:
            raise KeyError(
                '`regions` object must be of type `data:annotation:gtf:` or `data:bed:`'
            )
        inputs['analysis_type'] = analysis_type
        input_objects.append(regions)
        # NOTE(review): ``regions_file_gtf`` is used for bed regions too -
        # confirm against the process definition.
        inputs['regions_file_gtf'] = get_data_id(regions)

    # Forward the remaining options only when the caller set them.
    for key, value in (
            ('cell_type', cell_type),
            ('extension', extension),
            ('sense', sense),
            ('skip_plot', skip_plot),
            ('black_list', black_list),
            ('threads', threads)):
        if value is not None:
            inputs[key] = value

    resolwe = get_resolwe(*input_objects)
    liquidator_obj = resolwe.get_or_run(slug='bamliquidator', input=inputs)

    if is_collection(resource):
        resource.add_data(liquidator_obj)
    elif is_relation(resource):
        resource.collection.add_data(liquidator_obj)

    return liquidator_obj
def bamplot(resource, genome, input_gff=None, input_region=None, stretch_input=None,
            color=None, sense=None, extension=None, rpm=None, yscale=None, names=None,
            plot=None, title=None, scale=None, bed=None, multi_page=None):
    """Run ``bamplot`` on the resource.

    This method runs `bamplot`_ with bams, genome and gff or region
    specified in arguments. Exactly one of ``input_gff`` and
    ``input_region`` must be given. Optional arguments are forwarded to
    the process only when they are not ``None``.

    .. _bamplot:
        http://resolwe-bio.readthedocs.io/en/latest/catalog-definitions.html#process-bamplot

    :param list resource: resource from which bam objects will be get
    :param str genome: Genome used in the process (options are HG18,
        HG19, MM8, MM9, MM10, RN4 and RN6)
    :param input_gff: id of annotation file is given
    :type input_gff: int or `~resdk.resources.data.Data`
    :param str input_region: enter a genomic region
    :param int stretch_input: stretch the input regions to a minimum
        length
    :param str color: enter a colon separated list of colors
    :param str sense: map to forward, reverse or both strands, default
        maps to ``both``
    :param int extension: extends reads by n bp, default value is 200bp
    :param bool rpm: normalizes density to reads per million (rpm),
        default is ``False``
    :param str yscale: choose either relative or uniform y axis scaling,
        default is ``relative scaling``
    :param str names: a comma separated list of names for your bams
    :param str plot: choose all lines on a single plot or multiple plots
    :param str title: title for the output plot(s), default will be the
        coordinate region
    :param str scale: a comma separated list of multiplicative scaling
        factors for your bams, default is ``None``
    :param bed: bed object, or list of bed objects, to run the process on
    :param bool multi_page: if flagged will create a new pdf for each
        region
    :raises KeyError: if neither or both of ``input_gff`` and
        ``input_region`` are given, or if ``genome`` is not supported
    :return: the object returned by ``get_or_run`` for the ``bamplot``
        process

    """
    if not input_gff and not input_region:
        raise KeyError('Please specify `input_gff` or `input_region`.')
    if input_gff and input_region:
        raise KeyError('Please specify only one of `input_gff` and `input_region`.')

    valid_genomes = ['HG18', 'HG19', 'MM8', 'MM9', 'MM10', 'RN4', 'RN6']
    if genome not in valid_genomes:
        raise KeyError('Invalid `genome`, please use one of the following: '
                       '{}'.format(', '.join(valid_genomes)))

    input_objects = []

    bams = [sample.get_bam() for sample in get_samples(resource)]
    input_objects.extend(bams)

    inputs = {
        'genome': genome,
        'bam': [get_data_id(bam) for bam in bams],
    }

    # Plain pass-through options: each one is sent under its own name and
    # only when the caller supplied a value. Previously ``sense`` was
    # stored under the ``scale`` key and ``stretch_input`` was dropped.
    optional_inputs = (
        ('stretch_input', stretch_input),
        ('color', color),
        ('sense', sense),
        ('extension', extension),
        ('rpm', rpm),
        ('yscale', yscale),
        ('names', names),
        ('plot', plot),
        ('title', title),
        ('scale', scale),
        ('multi_page', multi_page),
    )
    for key, value in optional_inputs:
        if value is not None:
            inputs[key] = value

    if input_gff is not None:
        input_objects.append(input_gff)
        inputs['input_gff'] = get_data_id(input_gff)

    if input_region is not None:
        inputs['input_region'] = input_region

    if bed is not None:
        # A single bed object is treated as a one-element list.
        bed_objects = bed if isinstance(bed, list) else [bed]
        input_objects.extend(bed_objects)
        inputs['bed'] = [get_data_id(bed_obj) for bed_obj in bed_objects]

    resolwe = get_resolwe(*input_objects)

    bamplot_obj = resolwe.get_or_run(slug='bamplot', input=inputs)

    if is_collection(resource):
        resource.add_data(bamplot_obj)
    elif is_relation(resource):
        resource.collection.add_data(bamplot_obj)

    return bamplot_obj