Example #1
def get_region_counts(BAMorBED, ROIs, chr_prefix=None, chrom_size=300000000):
    """
        Direct ROIs to BAM, BEDgraph or BED file reader.
        Also can work with Wiggle files but these are very slow.
        Return ROIs, the number of read tags, total counts and mapped tags
    """

    rr = RunRecord('get_region_counts')
    if 'bam' in BAMorBED.lower():
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_BAM(BAMorBED, ROIs, chr_prefix)
    elif 'bedgraph' in BAMorBED.lower():
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_BEDgraph(BAMorBED, ROIs, chr_prefix)
    elif 'bed' in BAMorBED.lower():
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_BED(BAMorBED, ROIs, chr_prefix)
    elif 'wig' in BAMorBED.lower():
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_wiggle(BAMorBED, ROIs, chr_prefix, chrom_size)
    elif 'vcf' in BAMorBED.lower():
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_vcf(BAMorBED, ROIs, chr_prefix, chrom_size)
    else:
        rr.dieOnCritical('File not recognised as BAM, BEDgraph, '+\
                'BED, WIG or VCF', BAMorBED)

    rr.addInfo('Number of read tags counted', num_tags)
    rr.addInfo('Number of total bases counted', num_bases)
    rr.addInfo('Number of mapped tags in experiment', mapped_tags)

    return filled_ROIs, num_tags, num_bases, mapped_tags
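
A hedged usage sketch (the ROI list would come from an earlier ROI-building step; the file name is illustrative):

# hypothetical call; `rois` is a pre-built list of ROI objects
filled_ROIs, num_tags, num_bases, mapped_tags = get_region_counts(
        'sample_reads.bam', rois, chr_prefix='chr')
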
Example #2
def gene_expr_diff_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', sig_label='', pval_label='',
        allow_probeset_many_gene=False, validate=True):
    """
        As per gene_expr_to_table() but with the addition of sig_label and
        pval_label columns.
    """
    rr = RunRecord('gene_expr_diff_to_table')

    rr.addInfo('Reading expression diff file', data_path)
    genes, probes, exp, sig, pval, probes_present = _read_data_file(\
            data_path, sep=sep, stable_id_label=stable_id_label,
            probeset_label=probeset_label, exp_label=exp_label,
            sig_label=sig_label, pval_label=pval_label, is_diff=True)

    if probes_present:
        if validate:
            # if probes and exp are mismatched, nuke the gene
            genes, probes, exp, sig, pval =\
                    _validate_probes_scores(genes, probes, exp, sig, pval)

        if not allow_probeset_many_gene:
            # each probe should map to only one gene
            genes, probes, exp, sig, pval =\
                    _remove_multimapped_probesets(genes, probes, exp,
                    sig, pval)

    header = DIFF_HEADER
    rows = [[g, p, e, s, v] for g, p, e, s, v in \
                zip(genes, probes, exp, sig, pval)]

    return Table(header=header, rows=rows)
Example #3
    def load_expr(self,
                  expr_study,
                  db_path,
                  include_targets=None,
                  exclude_targets=None):
        """
            loads expression records from a ChippyDB and also
            ranks by expr
        """
        rr = RunRecord('load_expr')

        sample_name = expr_study.split(' : ')[0]
        session = db_query.make_session(db_path)

        self.expr_genes = []
        #sample_type == 'Expression data: absolute ranked'
        print 'Querying sample from ChippyDB', sample_name

        sample_genes = db_query.get_genes_by_ranked_expr(
            session,
            sample_name,
            biotype='protein_coding',
            data_path=None,
            rank_by='mean',
            include_targets=include_targets,
            exclude_targets=exclude_targets)

        for gene in sample_genes:
            gene_record = ExprGene(gene.MeanScore, gene.Rank, gene.ensembl_id,
                                   sample_name)
            self.expr_genes.append(gene_record)
        rr.addInfo('genes found in ' + sample_name, len(sample_genes))
Example #4
    def _groupNoGeneCounts(self):
        """ Don't group counts. Simply return a PlotLine for each set of
            counts.
            Called by asPlotLines()
        """
        rr = RunRecord('_groupNoGeneCounts')
        counts = self.data_collection.counts
        ranks = self.data_collection.ranks
        labels = self.data_collection.labels
        plot_lines = []
        for c, r, l in zip(counts, ranks, labels):
            if self.counts_func == stdev:
                stdev_ = c.std()
                if stdev_ > 0:
                    c = (c - c.mean()) / stdev_
                    plot_lines.append(
                        PlotLine(c, r, l, study=self.collection_label))
            else:
                plot_lines.append(
                    PlotLine(c, r, l, study=self.collection_label))

        # If no lines were created then there is no data to plot
        if not len(plot_lines):
            rr.dieOnCritical('No data in collection', 'Failure')

        # If a single line is created label it with the collection name
        if len(plot_lines) == 1:
            plot_lines[0].label = [self.collection_label]

        return plot_lines
Example #5
    def _groupNGeneCounts(self, group_size, p=0.0):
        """ Group counts for N genes and return as PlotLines. Defaults to
            _groupAllGeneCounts() if group size is too large.
            Called by asPlotLines()
        """
        rr = RunRecord('_groupNGeneCounts')
        plot_lines = []
        for index, (c,r,l,se) in enumerate(self.data_collection.\
                iterTransformedGroups(group_size=group_size,
                counts_func=self.counts_func, p=p)):
            plot_lines.append(
                PlotLine(c,
                         rank=r,
                         label=l,
                         study=self.collection_label,
                         stderr=se))

        # If no data was returned, default to _groupAllGeneCounts()
        if not len(plot_lines):
            rr.addWarning('Defaulting to ALL features. Not enough '+\
                          'features for group of size', group_size)
            return self._groupAllGeneCounts()

        return plot_lines
Example #6
def get_collection(session,
                   sample_name,
                   feature_type,
                   BAMorBED,
                   chr_prefix,
                   window_upstream,
                   window_downstream,
                   multitest_signif_val,
                   collection_fn,
                   overwrite,
                   tab_delimited,
                   include_targets=None,
                   exclude_targets=None,
                   bedgraph=False,
                   BED_windows=False,
                   chrom_size=300000000,
                   no_overlap=True):
    """
        builds and writes a collection of counts and expression for
        feature_type in given sample genes.
    """
    rr = RunRecord('get_collection')

    if not collection_fn.endswith('.chp'):
        collection_fn += '.chp'  # ChipPy data file

    if not os.path.exists(collection_fn) or overwrite:
        bedgraph_fn = None
        if bedgraph:
            bedgraph_fn = '.'.join(collection_fn.split('.')[:-1]) + '.bedgraph'

        BED_windows_fn = None
        if BED_windows:
            BED_windows_fn = '.'.join(collection_fn.split('.')[:-1]) +\
                    '_regions.BED'

        data_collection = counts_for_genes(
            session,
            sample_name,
            feature_type,
            BAMorBED,
            chr_prefix,
            window_upstream,
            window_downstream,
            include_targets,
            exclude_targets,
            bedgraph_fn,
            multitest_signif_val=multitest_signif_val,
            BED_windows_fn=BED_windows_fn,
            chrom_size=chrom_size,
            no_overlap=no_overlap)

        if data_collection is not None:
            data_collection.writeToFile(collection_fn,
                                        as_table=tab_delimited,
                                        compress_file=True)
        else:
            rr.dieOnCritical('No data collection was returned', 'Failed')
    else:
        print 'Existing output at', collection_fn
Example #7
    def asPlotLines(self, group_size, group_location, p=0.0):
        """
            Returns a list of PlotLine objects from this study.
            'p' is the Chebyshev cut-off if not None
        """
        rr = RunRecord('asPlotLines')
        if p > 0.0:
            rr.addInfo('Applying per-line Chebyshev filtering', p)

        if type(group_size) is str and group_size.lower() == 'all':
            plot_lines = self._groupAllGeneCounts()
        elif type(group_size) is int:
            if group_size == 1:
                plot_lines = self._groupNoGeneCounts()
            else:
                plot_lines = self._groupNGeneCounts(group_size, p=p)
        else:
            rr.dieOnCritical('group_size, wrong type or value',
                             [type(group_size), group_size])

        if group_location.lower() != 'all':
            rr.addInfo('grouping genes from location', group_location)
            plot_lines.sort(key=lambda x: x.rank)
            if group_location.lower() == 'top':
                plot_lines = [plot_lines[0]]
            elif group_location.lower() == 'middle':
                plot_lines = [plot_lines[int(len(plot_lines) / 2)]]
            elif group_location.lower() == 'bottom':
                plot_lines = [plot_lines[-1]]

        rr.addInfo('Plottable lines from study', len(plot_lines))
        return plot_lines
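
A hedged usage sketch, assuming `study` is an instance of the plotting study class these methods belong to:

# one averaged line over all genes, then the top-ranked line from
# 100-gene groups with per-line Chebyshev filtering at p=0.05
all_lines = study.asPlotLines('all', 'all')
top_line = study.asPlotLines(100, 'top', p=0.05)
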
Example #8
def get_genes_by_ranked_diff(session,
                             sample_name,
                             multitest_signif_val=None,
                             biotype='protein_coding',
                             chrom=None,
                             data_path=None,
                             include_targets=None,
                             exclude_targets=None,
                             rank_by='mean'):
    """returns all ranked genes from a sample difference experiment"""
    rr = RunRecord('get_genes_by_ranked_diff')
    records = get_diff_entries(session,
                               sample_name=sample_name,
                               biotype=biotype,
                               data_path=data_path,
                               chrom=chrom,
                               multitest_signif_val=multitest_signif_val)

    genes = []
    for expressed_diff in records:
        gene = expressed_diff.gene
        gene.Scores = expressed_diff.fold_changes
        genes.append(gene)

    # keep only those genes in the include target gene sets if provided
    if include_targets is not None:
        include_genes = get_targetgene_entries(session, include_targets)
        if len(include_genes) > 0:
            include_gene_ids = set(
                [tg.gene.ensembl_id for tg in include_genes])
            genes = [gene for gene in genes if gene.ensembl_id in\
                    include_gene_ids]

    # keep only those genes not in the exclude target gene sets if provided
    if exclude_targets is not None:
        exclude_genes = get_targetgene_entries(session, exclude_targets)
        if len(exclude_genes) > 0:
            exclude_gene_ids = set(
                [tg.gene.ensembl_id for tg in exclude_genes])
            genes = [gene for gene in genes if gene.ensembl_id not in\
                    exclude_gene_ids]

    # set rank
    if rank_by.lower() == 'mean':
        scored = [(g.MeanScore, g) for g in genes]
    elif rank_by.lower() == 'max':
        scored = [(g.MaxScore, g) for g in genes]
    else:
        rr.dieOnCritical('Ranking method not possible', rank_by.lower())

    # Make sure we get highest first
    scored = reversed(sorted(scored))
    genes = []
    for rank, (score, gene) in enumerate(scored):
        gene.Rank = rank + 1
        genes.append(gene)

    return genes
Example #9
    def normaliseByRPM(self):
        """ Normalise counts by per million mapped tags """
        rr = RunRecord('normaliseByRPM')
        norm_factor = 1000000.0 / self.mapped_tags
        rr.addInfo('normalising by RPMs', norm_factor)

        for gene in self.counts_genes:
            gene.promoter_counts *= norm_factor
            gene.coding_counts *= norm_factor
            gene.feature_counts *= norm_factor
            gene.total_counts *= norm_factor
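
The scaling itself is plain arithmetic; a standalone sketch of the same RPM factor, assuming 20 million mapped tags:

mapped_tags = 20000000
norm_factor = 1000000.0 / mapped_tags    # 0.05
raw_promoter_counts = 440.0
rpm_promoter_counts = raw_promoter_counts * norm_factor    # 22.0
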
Example #10
def set_up_series_plots_dir(plot_filename):
    """ Create directory structure for series plots """
    rr = RunRecord('set_up_series_plots_dir')

    save_dir = dirname_or_default(plot_filename)
    basename = os.path.basename(plot_filename)

    plot_series_dir = os.path.join(save_dir,
        '%s-series' % basename[:basename.rfind('.')])
    create_path(plot_series_dir)
    rr.addInfo('Plotting as a series to', plot_series_dir)
    return plot_series_dir
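
A standalone sketch of the naming step, using os.path.dirname in place of ChipPy's dirname_or_default and an illustrative path:

import os

plot_filename = 'plots/tss_signal.pdf'
basename = os.path.basename(plot_filename)                    # 'tss_signal.pdf'
series_name = '%s-series' % basename[:basename.rfind('.')]    # 'tss_signal-series'
plot_series_dir = os.path.join(os.path.dirname(plot_filename), series_name)
# 'plots/tss_signal-series'
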
Example #11
def gene_expr_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', allow_probeset_many_gene=False,
        validate=True):
    """
        Returns a cogent table object

        Deals with a simple tab-delimited representation of gene expression
        data which may have come from either micro-array or mRNA-seq
        experiments.

        Data from micro-arrays will have probeset information for each
        gene and a score to match each probe.

        RNA-seq data will not have probes and simply a single score for each
        gene. In this case we will create a fake probe for each gene of the
        form 'P' + a unique integer.

        Probset id's and expressions scores are separated by the pipe
        -- | -- character. The probset and expression scores are then
        converted to tuples of ints or floats respectively.

        Arguments:
            - probeset_label: name of column containing probesets
            - exp_label: name of column containing expression scores
            - stable_id_label: name of column containing Ensembl stable IDs
            - allow_probeset_many_gene: whether one probeset can map to
                multiple genes. If not we remove probes and scores that multi-
                map.
            - validate: checks that -- stable IDs are unique in the file,
                that for each row the number of probesets equals the
                number of expression scores. Removes the gene entry.
    """

    rr = RunRecord('geneExprDataToTable')

    rr.addInfo('Reading expression data', data_path)
    genes, probes, exp, probes_present = _read_data_file(data_path, sep=sep,
            stable_id_label=stable_id_label, probeset_label=probeset_label,
            exp_label=exp_label)

    if probes_present:
        if validate:
            # if probes and scores are mismatched, nuke the gene
            genes, probes, exp = \
                    _validate_probes_scores(genes, probes, exp)

        if not allow_probeset_many_gene:
            # each probe should map to only one gene
            genes, probes, exp = \
                    _remove_multimapped_probesets(genes, probes, exp)

    rows = [[g,p,e] for g,p,e in zip(genes, probes, exp)]
    return Table(header=EXPR_HEADER, rows=rows)
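
An illustrative sketch of the expected input and call (the file name and column labels are hypothetical):

# expression.txt (tab-separated); probesets and scores are '|'-separated
#   gene      probeset    exp
#   GENE_A    P1|P2       5.31|5.12
#   GENE_B    P3          7.88
expr_table = gene_expr_to_table('expression.txt', stable_id_label='gene',
        probeset_label='probeset', exp_label='exp')
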
Example #12
def main():
    rr = RunRecord('drop_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Drop Expression Data')
    session = db_query.make_session(args.db_path)

    if db_query.drop_sample_records(session, args.sample):
        rr.addInfo('Removing ' + args.sample, 'Success')
    else:
        rr.addWarning('Removing ' + args.sample, 'Failure')

    rr.display()
Example #13
def _get_keep_indices(data, filtered=None):
    rr = RunRecord('_get_keep_indices')
    keep = range(data.shape[0])
    
    if filtered is not None:
        keep = []
        for i in range(data.shape[0]):
            if filtered(data[i]):
                keep.append(i)
        if len(keep) == 0:
            rr.dieOnCritical('No remaining data after filtering', 'Failure')

    return keep
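
A standalone sketch of the filtering behaviour, assuming 2-D numpy count data, a per-row predicate and an importable ChipPy RunRecord:

import numpy

data = numpy.array([[0, 5, 2], [0, 0, 0], [3, 1, 4]])
keep_nonzero_rows = lambda row: row.sum() > 0
indices = _get_keep_indices(data, filtered=keep_nonzero_rows)
# indices == [0, 2]; row 1 is dropped because it sums to zero
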
Example #14
def _check_expr_headers(header_row, stable_id_label='', probeset_label='',
        exp_label=''):
    """
        Check the header labels match for standard expression. Probeset is
        optional and results in probes_present being False
    """
    rr = RunRecord('_check_expr_headers')

    try:
        gene_col = header_row.index(stable_id_label)
    except ValueError:
        rr.dieOnCritical('Stable ID column header not found in', header_row)

    try:
        exp_col = header_row.index(exp_label)
    except ValueError:
        rr.dieOnCritical('Expression score column header not found in',
                header_row)

    try:
        probe_col = header_row.index(probeset_label)
        probes_present = True
    except ValueError:
        rr.addWarning('Probeset column header not found in', header_row)
        probe_col = -1
        probes_present = False

    return gene_col, probe_col, exp_col, probes_present
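
A hedged sketch with an illustrative header row; a missing probeset column would instead return probe_col == -1 and probes_present == False:

header = ['gene', 'probeset', 'exp']
gene_col, probe_col, exp_col, probes_present = _check_expr_headers(
        header, stable_id_label='gene', probeset_label='probeset',
        exp_label='exp')
# gene_col == 0, probe_col == 1, exp_col == 2, probes_present is True
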
Example #15
def safe_line_division(sample_line, dividing_line):
    """ in case of divide-by-zero we need some robustness """
    rr = RunRecord('safe_line_division')
    try:
        sample_line /= dividing_line
    except ZeroDivisionError: # 0 counts at a base position
        # find the smallest non-zero count and substitute it for the zeros
        # before retrying the division (assumes numpy array inputs)
        min_count = 1
        for c in dividing_line:
            if 0 < c < min_count:
                min_count = c
        dividing_line[dividing_line == 0] = min_count
        sample_line /= dividing_line
        rr.addWarning('Zero counts value seen. Setting zeros to',
                min_count)
    return sample_line
Example #16
    def load_counts(self, collection):
        """ loads gene entries from a ChipPy collection """
        rr = RunRecord('load_counts')

        print 'Loading counts collection file', collection
        self.counts_genes = []
        if os.path.isfile(collection):
            try:
                # to load counts data from file
                file1 = gzip.GzipFile(collection, 'rb')
                data = numpy.load(file1)
                d = data.tolist()
                counts = d['counts']
                labels = d['labels']

                for count, label in zip(counts, labels):
                    gene_record = CountsGene(count, str(label), collection)
                    self.counts_genes.append(gene_record)
                rr.addInfo('genes found in ' + collection, len(labels))

            except IOError:  # file exists but could not be read as counts data
                rr.dieOnCritical('file found but could not be read',
                                 collection)
        else:
            rr.dieOnCritical('unrecognised collection file', collection)
Example #17
def get_chroms(session):
    """ return list of chroms from ',' separated string """
    if session is None:
        return ['No connection to DB']
    elif type(session) is str:
        session = make_session(session)
    rr = RunRecord('get_chroms')
    try:
        chroms = session.query(Chroms).one()
        chroms = chroms.chromStr.split(',')
    except NoResultFound:
        chroms = []
        rr.addError('Chroms found', None)
    return chroms
Example #18
def set_counts_function(counts_metric):
    """ Sets the feature counting metric function"""
    rr = RunRecord('set_counts_function')
    if counts_metric.lower() == 'mean':
        counts_func = column_mean
        rr.addInfo('Counts metric set to', 'column_mean')
    elif counts_metric.lower() == 'frequency':
        counts_func = column_sum
        rr.addInfo('Counts metric set to', 'column_sum')
    elif counts_metric.lower() == 'stdev':
        counts_func = stdev
        rr.addInfo('Counts metric set to', 'stdev')
    else:
        rr.dieOnCritical('Invalid count metric', counts_metric)
    return counts_func
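
A hedged note on the metrics, assuming ChipPy's column_mean and column_sum behave like numpy's column-wise reductions:

import numpy

counts = numpy.array([[1.0, 3.0], [5.0, 7.0]])
counts.mean(axis=0)    # array([ 3.,  5.]) -- what the 'mean' metric reduces to
counts.sum(axis=0)     # array([ 6., 10.]) -- what the 'frequency' metric reduces to
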
Example #19
    def __call__(self, x_array, plot_lines=None, clean=False, xlabel=None,
            ylabel=None, title=None, plot_CI=False, ui=None):
        rr = RunRecord('PlottableSingle__call__')

        self.setAxes(plot_lines, plot_CI=plot_CI, test_run=False)
        self.checkYAxisScale(plot_lines, plot_CI=plot_CI)

        self.fig, self.ax = self.getFigureAndAxes(title=title,
              xlabel=xlabel, ylabel=ylabel)

        self.clean = clean

        for i, line in ui.series(enumerate(sorted(plot_lines,
                key=lambda line: (line.study,line.rank), reverse=True)),
                noun='Applying lines to plot'):
            self.ax.plot(x_array, line.counts, color=line.color,
                    linewidth=self.linewidth)

            # Show confidence interval around each line
            if plot_CI:
                #set shading alpha
                alpha = line.color[3]
                if alpha is None:
                    alpha = 0.9
                upper = 1.96 * line.stderr + line.counts
                lower = -1.96 * line.stderr + line.counts
                self.ax.fill_between(x_array, upper, lower, alpha=alpha/2.5,
                        color=line.color)
Example #20
def add_data(session,
             name,
             description,
             path,
             expr_table,
             gene_id_heading='gene',
             probeset_heading='probeset',
             expr_heading='exp',
             sample_type=sample_types['abs_expr'],
             reffile1=None,
             reffile2=None):
    """ A unified interface for adding data to the DB """
    rr = RunRecord('add_data')

    success = add_sample(session, name, description)
    if not success:
        # Check if any sample exists without data
        existing_data, existing_type = check_existing_data(session, name)
        if existing_data > 0:
            rr.addError(name + ' already has data loaded', existing_data)
            rr.addError('data of type', existing_type)
            return False
        else:
            rr.addInfo('now loading data for existing sample', name)

    # either sample was created or existed with no data, so load data now
    if sample_types[sample_type] == sample_types['abs_expr']:
        success = add_expression_study(session, name, path, expr_table)

    elif sample_types[sample_type] == sample_types['diff_expr']:
        # diff between two files, check we got the related files
        assert reffile1 is not None and reffile2 is not None,\
                'To enter differences in gene expression you must specify '\
                'the 2 files that contain the absolute measures.'
        add_expression_diff_study(session, name, path, expr_table, reffile1,
                                  reffile2)

    elif sample_types[sample_type] == sample_types['target_genes']:
        add_target_genes(session,
                         name,
                         path,
                         expr_table,
                         ensembl_id_label=gene_id_heading)
    else:
        rr.dieOnCritical('Unknown sample type', sample_type)

    return success
Example #21
    def filteredChebyshevUpper(self, p=0.05, axis=None):
        """returns a new RegionCollection excluding records with excessive
        reads using a one-sided Chebyshev's inequality"""
        rr = RunRecord('filteredChebyshevUpper')
        if not (0 <= p <= 1):
            rr.dieOnCritical('Probability argument', 'Invalid')

        k = chebyshev_upper(p)
        if axis is None:
            # only bother computing normalised score for max of each row
            data = self.counts.max(axis=1)
            mean = self.counts.mean()
            stdev_ = self.counts.std(ddof=1)
            data -= mean
            data /= stdev_
            indices = data < k
            data = self.counts[indices]
            if self.labels is not None:
                labels = self.labels[indices]
            else:
                labels = None
            
            if self.ranks is not None:
                ranks = self.ranks[indices]
            else:
                ranks = None
            new = self.__class__(counts=data, ranks=ranks, labels=labels,
                    info=self.info)
        else:
            data = normalised_data(self.counts, axis=axis)
            func = lambda x: (x < k).all()
            indices = _get_keep_indices(data, filtered=func)
            new = self.take(indices)

        if self.info is None:
            info = {'filteredChebyshevUpper': p}
        else:
            info = self.info.copy()
            info['filteredChebyshevUpper'] = p
        
        if new.info:
            new.info.update(info)
        else:
            new.info = info

        return new
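
For reference, the one-sided (Cantelli) inequality gives P(X - mean >= k*stdev) <= 1/(1 + k**2), so a cut-off that caps the upper tail at probability p is presumably k = sqrt(1/p - 1); a standalone sketch of that assumption:

from math import sqrt

def chebyshev_upper_sketch(p):
    """ hypothetical stand-in for ChipPy's chebyshev_upper() """
    return sqrt(1.0 / p - 1.0)

chebyshev_upper_sketch(0.05)    # ~4.36 standard deviations
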
Example #22
    def _groupAllGeneCounts(self):
        """ Group counts for all genes and return as a single PlotLine.
            Called by asPlotLines or _groupNGeneCounts().
            Returns a list.
        """
        rr = RunRecord('_groupAllGeneCounts')
        counts, ranks, se = self.data_collection.transformed(\
            counts_func=self.counts_func)
        if not len(counts):
            rr.dieOnCritical('No counts data in', 'Study._groupAllGeneCounts')

        ranks = 0  # rank is irrelevant for 'all' genes

        # Always name single lines by their collection name
        label = self.collection_label
        plot_lines = [PlotLine(counts, ranks, label, study=label, stderr=se)]
        return plot_lines
Example #23
    def filteredByLabel(self, labels):
        """returns a new collection object with data corresponding to the
        provided labels"""
        rr = RunRecord('filteredByLabel')
        if self.labels is None:
            rr.dieOnCritical('No labels', 'Failure')

        if type(labels) == str:
            labels = [labels]
        
        # determine label indices and use self.take
        indices = []
        for i in range(self.counts.shape[0]):
            if self.labels[i] in labels:
                indices.append(i)
        
        return self.take(indices)
Example #24
    def populateLogTable(self):
        """ display ChipPy Log text in the appropriate window """
        rr = RunRecord()
        self.log_table.setRowCount(0)
        try:
            table = rr.getMessageTable(last_n_lines=30, include_date=True)
        except RuntimeError:
            return

        if table is None:
            return
        else:
            for r, row in enumerate(table):
                self.log_table.setRowCount(self.log_table.rowCount() + 1)
                for c, column in enumerate(row):
                    self.log_table.setItem(r, c,
                                           QTableWidgetItem(QString(column)))
Example #25
def _get_targetgene_query(session, sample_name=None, biotype='protein_coding'):
    """ Returns target_gene records for a given sample """
    rr = RunRecord('get_targets')
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if sample is None:
            rr.addError('Using all samples, as no sample matches name',
                        sample_name)
            query = session.query(TargetGene).join(Gene)
        else:
            query = session.query(TargetGene).join(Gene).\
                    filter(TargetGene.sample_id==sample.sample_id)
    else:  # get them all
        query = session.query(TargetGene).join(Gene)

    if biotype:
        query = query.filter(Gene.biotype == biotype)
    return query
Example #26
    def _auto_y_lims(self, minY, maxY, rounding=True, test_run=False):
        """
            Takes a list of plotlines.
            Returns ylims(y_min_limit, y_max_limit)
            Defaults to min = 0.0, max = 1.0
        """
        rr = RunRecord('_auto_y_lims')

        y_floor = minY
        y_ceiling = maxY
        if rounding:
            # Round min/max values to whole values for nice plots

            # For fractional counts then scale the rounding appropriately
            if maxY > 0:
                ypower = log10(maxY) # check scale

                if ypower < 0:
                    rounding_places = 0 - int(floor(ypower))
                    y_ceiling = float(ceil(maxY * (10**rounding_places))/
                                  (10**rounding_places))
                    y_floor = float(floor(minY * (10**rounding_places))/
                                (10**rounding_places))
                elif ypower == 0:
                    y_floor = 0.0
                    y_ceiling = 1.0
                else:
                    # round up to 2 significant digits
                    ypower = ceil(log10(maxY))
                    y_ceiling = ceil( maxY/(10**(ypower-1)) ) * (10**(ypower-1))
                    y_floor = floor(minY)
            elif maxY == 0:
                y_floor = 0.0
                y_ceiling = 1.0
            else:
                rr.dieOnCritical('Negative max y-axis value', maxY)

        if test_run:
            rr.addInfo('Y-axis min', minY)
            rr.addInfo('Y-axis max', maxY)
            rr.addInfo('Y-axis auto floor', y_floor)
            rr.addInfo('Y-axis auto ceiling', y_ceiling)

        return tuple([y_floor, y_ceiling])
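
Worked numbers for the two rounding branches, computed standalone with the same formulas as above:

from math import ceil, floor, log10

# fractional counts (maxY < 1): round at the matching decimal place
minY, maxY = 0.012, 0.037
rounding_places = 0 - int(floor(log10(maxY)))                 # 2
ceil(maxY * 10**rounding_places) / 10**rounding_places        # 0.04
floor(minY * 10**rounding_places) / 10**rounding_places       # 0.01

# counts above 1: round the ceiling up to 2 significant digits
minY, maxY = 12.0, 137.0
ypower = ceil(log10(maxY))                                    # 3.0
ceil(maxY / 10**(ypower - 1)) * 10**(ypower - 1)              # 200.0
floor(minY)                                                   # 12.0
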
Example #27
def add_target_genes(session,
                     sample_name,
                     data_path,
                     table,
                     ensembl_id_label='ENSEMBL'):
    """adds Expression instances into the database from table

    Arguments:
        - data_path: the reference file path
        - table: the actual expression data table
        - ensembl_id_label: label of the column containing Ensembl Stable IDs
    """
    rr = RunRecord('add_target_genes')
    data = []
    sample = _one(session.query(Sample).filter_by(name=sample_name))
    if not sample:
        session.rollback()
        rr.dieOnCritical('querying for sample', 'Failed')

    reffile = session.query(ReferenceFile).filter_by(name=data_path).all()
    if len(reffile) == 0:
        reffile = ReferenceFile(data_path, today)
        reffile.sample = sample
        data.append(reffile)
    else:  # Don't overwrite anything, exit instead
        rr.dieOnCritical('File already loaded', data_path)

    ensembl_ids = table.getRawData(ensembl_id_label)

    for id_chunk in _chunk_id_list(ensembl_ids, 100):
        genes = session.query(Gene).filter(Gene.ensembl_id.in_(id_chunk)).all()
        for gene in genes:
            target = TargetGene()
            target.gene = gene
            target.reference_file = reffile
            target.sample = sample
            data.append(target)

    rr.addInfo('Added target genes from', data_path)
    rr.addInfo('No. genes added', len(data))
    session.add_all(data)
    session.commit()
    return
Example #28
    def applyBinning(self, bin_width):
        """ For every bin_width positions, sum the counts and fill each
            bin with its mean value, giving a normalised score per base.
            The output array size is unchanged. bin_width must be an
            integer factor of the window size.
        """
        rr = RunRecord('apply_binning')
        if bin_width and bin_width > 0:
            if len(self.counts) % bin_width:
                rr.dieOnCritical('Bin width is not an integer '+\
                        'factor of window size', bin_width)
            tmp_array = numpy.array(self.counts)
            for k in range(0, len(self.counts), bin_width):
                bin_sum = 0
                for i in xrange(bin_width):
                    bin_sum += self.counts[k+i]
                for i in xrange(bin_width):
                    tmp_array[k+i] = bin_sum/bin_width
            self.counts = tmp_array
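
The same binning expressed with numpy reshaping, as a standalone check of the loop above (assuming bin_width divides the window evenly):

import numpy

counts = numpy.array([2, 4, 6, 0, 0, 6], dtype=float)
binned = counts.reshape(-1, 3).mean(axis=1).repeat(3)
# array([ 4.,  4.,  4.,  2.,  2.,  2.]) -- same result as the nested loops
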
Example #29
def _get_expression_query(session,
                          sample_name=None,
                          biotype='protein_coding',
                          chrom=None,
                          data_path=None):
    """ Returns expression table query """
    rr = RunRecord('_get_expression_query')
    query = session.query(Expression)
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if sample is None:
            rr.dieOnCritical('Unknown sample name', sample_name)
        query = query.filter(Expression.sample_id == sample.sample_id)

    if data_path is not None:
        # the reference file is used to reconstruct the origin of a sample
        reffile_id = _one(session.query(ReferenceFile.reffile_id).\
                filter(ReferenceFile.name==data_path))
        if not reffile_id:
            rr.dieOnCritical('Unknown data path', data_path)
        reffile_id = reffile_id[0]
        query = query.filter(Expression.reffile_id == reffile_id)

    query = query.join(Gene)

    if chrom is not None:
        query = query.filter(Gene.chrom == chrom)
    if biotype is not None:
        query = query.filter(Gene.biotype == biotype)
    return query
Example #30
def _get_diff_query(session,
                    sample_name=None,
                    biotype='protein_coding',
                    multitest_signif_val=None,
                    chrom=None,
                    data_path=None):
    """ Returns ExpressionDiff table query """
    rr = RunRecord('_get_diff_query')
    query = session.query(ExpressionDiff)
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if not sample:
            rr.dieOnCritical('No sample with name', sample_name)

        query = query.filter(ExpressionDiff.sample_id == sample.sample_id)

    if data_path is not None:
        reffile_id = _one(session.query(ReferenceFile.reffile_id).\
                filter(ReferenceFile.name==data_path))
        if not reffile_id:
            rr.dieOnCritical('Unknown data path', data_path)
        reffile_id = reffile_id[0]
        query = query.filter(ExpressionDiff.reffile_id == reffile_id)

    if multitest_signif_val is not None:
        query = query.filter(ExpressionDiff.multitest_signif==\
                                 multitest_signif_val)
    query = query.join(Gene)
    if chrom is not None:
        query = query.filter(Gene.chrom == chrom)
    if biotype:
        query = query.filter(Gene.biotype == biotype)
    return query