Ejemplo n.º 1
0
def _check_expr_headers(header_row, stable_id_label='', probeset_label='',
        exp_label=''):
    """
        Verify that the expected column labels appear in header_row.

        The stable ID and expression-score labels are mandatory; a missing
        probeset label only produces a warning and probes_present=False.
        Returns (gene_col, probe_col, exp_col, probes_present).
    """
    rr = RunRecord('_check_expr_headers')

    if stable_id_label in header_row:
        gene_col = header_row.index(stable_id_label)
    else:
        rr.dieOnCritical('Stable ID column header not found in', header_row)

    if exp_label in header_row:
        exp_col = header_row.index(exp_label)
    else:
        rr.dieOnCritical('Expression score column header not found in',
                header_row)

    if probeset_label in header_row:
        probe_col = header_row.index(probeset_label)
        probes_present = True
    else:
        rr.addWarning('Probeset column header not found in', header_row)
        probe_col = -1
        probes_present = False

    return gene_col, probe_col, exp_col, probes_present
Ejemplo n.º 2
0
    def asPlotLines(self, group_size, group_location, p=0.0):
        """
            Convert this study's counts into a list of PlotLine objects.
            'p' is the per-line Chebyshev cut-off; 0.0 disables filtering.
        """
        rr = RunRecord('asPlotLines')
        if p > 0.0:
            rr.addInfo('Applying per-line Chebyshev filtering', p)

        # dispatch on group_size: 'all', 1 (no grouping) or N per group
        if type(group_size) is str and group_size.lower() == 'all':
            plot_lines = self._groupAllGeneCounts()
        elif type(group_size) is int:
            plot_lines = self._groupNoGeneCounts() if group_size == 1 \
                    else self._groupNGeneCounts(group_size, p=p)
        else:
            rr.dieOnCritical('group_size, wrong type or value',
                             [type(group_size), group_size])

        location = group_location.lower()
        if location != 'all':
            rr.addInfo('grouping genes from location', group_location)
            plot_lines.sort(key=lambda line: line.rank)
            if location == 'top':
                plot_lines = [plot_lines[0]]
            elif location == 'middle':
                plot_lines = [plot_lines[len(plot_lines) // 2]]
            elif location == 'bottom':
                plot_lines = [plot_lines[-1]]

        rr.addInfo('Plottable lines from study', len(plot_lines))
        return plot_lines
Ejemplo n.º 3
0
    def setAxes(self, plot_lines, plot_CI=False, test_run=False):
        """
            Establish y-axis limits and grid spacing for the plot.

            Invoked from __call__ but also usable directly to re-scale an
            existing plot. Limits and tick spacing are auto-calculated
            whenever they have not been set explicitly.
        """
        rr = RunRecord('setAxes')

        if not self.ylims:
            # derive limits from the data being plotted
            low = self.getMinY(plot_lines, plot_CI)
            high = self.getMaxY(plot_lines, plot_CI)
            self.ylims = self._auto_y_lims(low, high, test_run=test_run)

        y_min_limit, y_max_limit = self.ylims

        # grid-lines / tick marks
        if not self.ytick_space:
            span = y_max_limit - y_min_limit
            self.ytick_space = self._auto_grid_lines(span, test_run=test_run)

        if not self.ytick_interval:
            self.ytick_interval = 2

        rr.addInfo('Y-max plot limit', '{:e}'.format(y_max_limit))
        rr.addInfo('Y-min plot limit', '{:e}'.format(y_min_limit))
        rr.addInfo('Y-grid-line spacing', '{:e}'.format(self.ytick_space))
Ejemplo n.º 4
0
    def __call__(self, x_array, plot_lines=None, clean=False, xlabel=None,
            ylabel=None, title=None, plot_CI=False, ui=None):
        """
            Draw each PlotLine against x_array, optionally shading a 95%
            confidence interval (1.96 * stderr) around every line.
        """
        rr = RunRecord('PlottableSingle__call__')

        self.setAxes(plot_lines, plot_CI=plot_CI, test_run=False)
        self.checkYAxisScale(plot_lines, plot_CI=plot_CI)

        self.fig, self.ax = self.getFigureAndAxes(title=title,
              xlabel=xlabel, ylabel=ylabel)

        self.clean = clean

        # draw in descending (study, rank) order
        ordered = sorted(plot_lines,
                key=lambda line: (line.study, line.rank), reverse=True)
        for i, line in ui.series(enumerate(ordered),
                noun='Applying lines to plot'):
            self.ax.plot(x_array, line.counts, color=line.color,
                    linewidth=self.linewidth)

            if not plot_CI:
                continue
            # shade the confidence band; assumes line.color may carry a
            # None alpha in slot 3 -- TODO confirm against PlotLine
            alpha = line.color[3]
            if alpha is None:
                alpha = 0.9
            upper = 1.96 * line.stderr + line.counts
            lower = -1.96 * line.stderr + line.counts
            self.ax.fill_between(x_array, upper, lower, alpha=alpha/2.5,
                    color=line.color)
Ejemplo n.º 5
0
    def _groupNGeneCounts(self, group_size, p=0.0):
        """ Group counts for N genes and return as PlotLines.

            Falls back to _groupAllGeneCounts() when the collection holds
            too few features to form even one group of group_size.
            Called by asPlotLines().
        """
        rr = RunRecord('_groupNGeneCounts')
        plot_lines = []
        # fixed: dropped unused enumerate index from the original loop
        for c, r, l, se in self.data_collection.iterTransformedGroups(
                group_size=group_size, counts_func=self.counts_func, p=p):
            plot_lines.append(PlotLine(c,
                                       rank=r,
                                       label=l,
                                       study=self.collection_label,
                                       stderr=se))

        # If no data was returned default to groupAllCollectionCounts
        # (also fixes the inconsistent 'plotLines' camelCase local)
        if not plot_lines:
            rr.addWarning('Defaulting to ALL features. Not enough '
                          'features for group of size', group_size)
            return self._groupAllGeneCounts()

        return plot_lines
Ejemplo n.º 6
0
def get_region_counts(BAMorBED, ROIs, chr_prefix=None, chrom_size=300000000):
    """
        Direct ROIs to a BAM, BEDgraph, BED, WIG or VCF file reader,
        chosen by substring match on the file name ('bedgraph' is tested
        before 'bed' since every BEDgraph name also contains 'bed').
        Wiggle files work but are very slow.
        Return ROIs, the number of read tags, total counts and mapped tags.
    """
    rr = RunRecord('get_region_counts')
    fn_lower = BAMorBED.lower()  # hoisted; was recomputed per branch
    if 'bam' in fn_lower:
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_BAM(BAMorBED, ROIs, chr_prefix)
    elif 'bedgraph' in fn_lower:
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_BEDgraph(BAMorBED, ROIs, chr_prefix)
    elif 'bed' in fn_lower:
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_BED(BAMorBED, ROIs, chr_prefix)
    elif 'wig' in fn_lower:
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_wiggle(BAMorBED, ROIs, chr_prefix, chrom_size)
    elif 'vcf' in fn_lower:
        filled_ROIs, num_tags, num_bases, mapped_tags =\
                read_vcf(BAMorBED, ROIs, chr_prefix, chrom_size)
    else:
        # fixed: message previously read 'BEDgraph,BED' (missing space)
        rr.dieOnCritical('File not recognised as BAM, BEDgraph, '
                'BED, WIG or VCF', BAMorBED)

    rr.addInfo('Number of read tags counted', num_tags)
    rr.addInfo('Number of total bases counted', num_bases)
    rr.addInfo('Number of mapped tags in experiment', mapped_tags)

    return filled_ROIs, num_tags, num_bases, mapped_tags
Ejemplo n.º 7
0
def _get_expression_query(session,
                          sample_name=None,
                          biotype='protein_coding',
                          chrom=None,
                          data_path=None):
    """ Returns an Expression table query, optionally filtered by
        sample, reference file, chromosome and biotype.
    """
    rr = RunRecord('_get_expression_query')
    query = session.query(Expression)
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if sample is None:
            rr.dieOnCritical('Unknown sample name', sample_name)
        query = query.filter(Expression.sample_id == sample.sample_id)

    if data_path is not None:
        # used to reconstruct the origin of a sample
        reffile_id = _one(session.query(ReferenceFile.reffile_id).\
                filter(ReferenceFile.name==data_path))
        # bug fix: previously tested 'not data_path', which is always
        # False inside this branch, so an unknown path crashed on the
        # subscript below instead of dying cleanly
        if not reffile_id:
            rr.dieOnCritical('Unknown data path', data_path)
        reffile_id = reffile_id[0]
        query = query.filter(Expression.reffile_id == reffile_id)

    query = query.join(Gene)

    if chrom is not None:
        query = query.filter(Gene.chrom == chrom)
    if biotype is not None:
        query = query.filter(Gene.biotype == biotype)
    return query
Ejemplo n.º 8
0
def main():
    """
        How do counts distributions vary with rank?
    """
    rr = RunRecord('counts_distribution')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='Counts Distribution')

    studies = []
    for fn in args.collections:
        studies.append(CountsStudy(fn))

    fig_details = FigureDetails(x_size=args.fig_width,
                                y_size=args.fig_height,
                                title=args.title,
                                x_text=args.xlabel,
                                y_text=args.ylabel)

    # optional reads-per-million normalisation before scoring
    if args.normalise_by_RPM:
        for study in studies:
            study.normaliseByRPM()

    score_groups = [study.scoresAsRankedArray(metric=args.counts_region,
            log2=args.y_axis_is_log) for study in studies]

    make_plot(score_groups, fig_details, args.plot_type, args.plot_filename)

    rr.display()
Ejemplo n.º 9
0
def _get_diff_query(session,
                    sample_name=None,
                    biotype='protein_coding',
                    multitest_signif_val=None,
                    chrom=None,
                    data_path=None):
    """ Returns an ExpressionDiff table query, optionally filtered by
        sample, reference file, significance, chromosome and biotype.
    """
    rr = RunRecord('_get_diff_query')
    query = session.query(ExpressionDiff)
    if sample_name is not None:
        sample = _get_sample(session, sample_name)
        if not sample:
            rr.dieOnCritical('No sample with name', sample_name)

        query = query.filter(ExpressionDiff.sample_id == sample.sample_id)

    if data_path is not None:
        reffile_id = _one(session.query(ReferenceFile.reffile_id).\
                filter(ReferenceFile.name==data_path))
        # bug fix: previously tested 'not data_path' (always False in
        # this branch), so an unknown path crashed on the subscript below
        if not reffile_id:
            rr.dieOnCritical('Unknown data path', data_path)
        reffile_id = reffile_id[0]
        # bug fix: filtered on Expression.reffile_id although this query
        # selects from ExpressionDiff -- NOTE(review): assumes the
        # ExpressionDiff model declares reffile_id; confirm against schema
        query = query.filter(ExpressionDiff.reffile_id == reffile_id)

    if multitest_signif_val is not None:
        query = query.filter(ExpressionDiff.multitest_signif ==
                                 multitest_signif_val)
    query = query.join(Gene)
    if chrom is not None:
        query = query.filter(Gene.chrom == chrom)
    if biotype:
        query = query.filter(Gene.biotype == biotype)
    return query
Ejemplo n.º 10
0
def get_collection(session,
                   sample_name,
                   feature_type,
                   BAMorBED,
                   chr_prefix,
                   window_upstream,
                   window_downstream,
                   multitest_signif_val,
                   collection_fn,
                   overwrite,
                   tab_delimited,
                   include_targets=None,
                   exclude_targets=None,
                   bedgraph=False,
                   BED_windows=False,
                   chrom_size=300000000,
                   no_overlap=True):
    """
        builds and writes a collection of counts and expression for
        feature_type in given sample genes.
    """
    rr = RunRecord('get_collection')

    if not collection_fn.endswith('.chp'):
        collection_fn += '.chp'  # ChipPy data file

    if not os.path.exists(collection_fn) or overwrite:
        bedgraph_fn = None
        if bedgraph:
            bedgraph_fn = '.'.join(collection_fn.split('.')[:-1]) + '.bedgraph'

        BED_windows_fn = None
        if BED_windows:
            BED_windows_fn = '.'.join(collection_fn.split('.')[:-1]) +\
                    '_regions.BED'

        data_collection = counts_for_genes(
            session,
            sample_name,
            feature_type,
            BAMorBED,
            chr_prefix,
            window_upstream,
            window_downstream,
            include_targets,
            exclude_targets,
            bedgraph_fn,
            multitest_signif_val=multitest_signif_val,
            BED_windows_fn=BED_windows_fn,
            chrom_size=chrom_size,
            no_overlap=no_overlap)

        if data_collection is not None:
            data_collection.writeToFile(collection_fn,
                                        as_table=tab_delimited,
                                        compress_file=True)
        else:
            rr.dieOnCritical('No data collection was returned', 'Failed')
    else:
        print 'Existing output at', collection_fn
Ejemplo n.º 11
0
def gene_expr_diff_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', sig_label='', pval_label='',
        allow_probeset_many_gene=False, validate=True):
    """
        As per gene_expr_to_table() but with the addition of sig_label and
        pval_label columns.
    """
    rr = RunRecord('gene_expr_diff_to_table')

    rr.addInfo('Reading expression diff file', data_path)
    genes, probes, exp, sig, pval, probes_present = _read_data_file(
            data_path, sep=sep, stable_id_label=stable_id_label,
            probeset_label=probeset_label, exp_label=exp_label,
            sig_label=sig_label, pval_label=pval_label, is_diff=True)

    if probes_present:
        if validate:
            # if probes and exp are mismatched, nuke the gene
            genes, probes, exp, sig, pval = _validate_probes_scores(
                    genes, probes, exp, sig, pval)

        if not allow_probeset_many_gene:
            # each probe should map to only one gene
            genes, probes, exp, sig, pval = _remove_multimapped_probesets(
                    genes, probes, exp, sig, pval)

    rows = [list(row) for row in zip(genes, probes, exp, sig, pval)]
    return Table(header=DIFF_HEADER, rows=rows)
Ejemplo n.º 12
0
    def load_expr(self,
                  expr_study,
                  db_path,
                  include_targets=None,
                  exclude_targets=None):
        """
            loads expression records from a ChippyDB and also
            ranks by expr
        """
        rr = RunRecord('load_expr')

        sample_name = expr_study.split(' : ')[0]
        session = db_query.make_session(db_path)

        self.expr_genes = []
        #sample_type == 'Expression data: absolute ranked'
        print 'Querying sample from ChippyDB', sample_name

        sample_genes = db_query.get_genes_by_ranked_expr(
            session,
            sample_name,
            biotype='protein_coding',
            data_path=None,
            rank_by='mean',
            include_targets=include_targets,
            exclude_targets=exclude_targets)

        for gene in sample_genes:
            gene_record = ExprGene(gene.MeanScore, gene.Rank, gene.ensembl_id,
                                   sample_name)
            self.expr_genes.append(gene_record)
        rr.addInfo('genes found in ' + sample_name, len(sample_genes))
Ejemplo n.º 13
0
    def load_counts(self, collection):
        """ loads gene entries from a ChipPy collection """
        rr = RunRecord('load_counts')

        print 'Loading counts collection file', collection
        self.counts_genes = []
        if os.path.isfile(collection):
            try:
                # to load counts data from file
                file1 = gzip.GzipFile(collection, 'rb')
                data = numpy.load(file1)
                d = data.tolist()
                counts = d['counts']
                labels = d['labels']

                for count, label in zip(counts, labels):
                    gene_record = CountsGene(count, str(label), collection)
                    self.counts_genes.append(gene_record)
                rr.addInfo('genes found in ' + collection, len(labels))

            except IOError:  # some exception type
                rr.dieOnCritical('file found but could not be read',
                                 collection)
        else:
            rr.dieOnCritical('unrecognised collection file', collection)
Ejemplo n.º 14
0
    def test_add_commands(self):
        """ test that RunRecord.addCommands correctly logs long lines of text
        """
        logging.disable(logging.NOTSET)
        rr = RunRecord('test_add_commands')
        rr.addCommands([])
        cmd_line = 'This is a list of command arguments that probably '+\
                   'do not exist in the real world'
        cmds = cmd_line.split(' ')
        rr.addCommands(cmds)

        # expected log body after the timestamp field is stripped
        recorded_lines = [
            'ChipPy.test_add_commands\tINFO\tcommand-line\tNo arguments given',
            'ChipPy.test_add_commands\tINFO\tcommand-line\tThis is a list of command arguments',
            'ChipPy.test_add_commands\tINFO\tcommand-line\tthat probably do not exist in the real',
            'ChipPy.test_add_commands\tINFO\tcommand-line\tworld'
        ]

        # bug fix: the log file handle was previously never closed
        with open(LOG_FN, 'r') as log_file:
            for n, line in enumerate(log_file):
                line_parts = [lp.strip() for lp in line.split('\t')]
                # compare everything after the timestamp field verbatim
                assert '\t'.join(line_parts[1:]) == recorded_lines[n]

        logging.disable(logging.CRITICAL)
Ejemplo n.º 15
0
    def filterByCutoff(self, cutoff=None):
        """ keep only results that pass Chebyshev cutoff """
        rr = RunRecord('filterByCutoff')

        rr.addInfo('Starting no. of genes', self.data_collection.N)

        if cutoff is None or cutoff == 0.0:
            rr.addInfo('Outlier cutoff filtering', 'Off')
        else:
            # sanitise cutoff: must parse as a float in [0.0, 1.0)
            try:
                cutoff = float(cutoff)
                if cutoff < 0.0 or cutoff >= 1.0:
                    rr.addError('Cutoff out of range', cutoff)
                    rr.addInfo('Cutoff set to default', 0.05)
                    cutoff = 0.05
            except ValueError:
                rr.addError('Cutoff not given as float', cutoff)
                rr.addInfo('Cutoff set to default', 0.05)
                cutoff = 0.05

            # exclude outlier genes using one-sided Chebyshev filtering
            self.data_collection =\
                    self.data_collection.filteredChebyshevUpper(p=cutoff)
            rr.addInfo('Used Chebyshev filter cutoff', cutoff)
            rr.addInfo('No. genes after normalisation filter',
                       self.data_collection.N)

        if self.data_collection is None or\
                self.data_collection.ranks.max() == 0:
            rr.dieOnCritical('No data after filtering', 'Failure')
Ejemplo n.º 16
0
    def _groupNoGeneCounts(self):
        """ Don't group counts. Simply return a PlotLine for each set of
            counts.
            Called by asPlotLines()
        """
        rr = RunRecord('_groupNoGeneCounts')
        collection = self.data_collection
        plot_lines = []
        for counts, rank, label in zip(collection.counts, collection.ranks,
                collection.labels):
            if self.counts_func == stdev:
                deviation = counts.std()
                # zero-deviation rows cannot be standardised, so skip them
                if deviation <= 0:
                    continue
                counts = (counts - counts.mean()) / deviation
            plot_lines.append(
                PlotLine(counts, rank, label, study=self.collection_label))

        # If no data was returned default to groupAllCollectionCounts
        if not len(plot_lines):
            rr.dieOnCritical('No data in collection', 'Failure')

        # If a single line is created label it with the collection name
        if len(plot_lines) == 1:
            plot_lines[0].label = [self.collection_label]

        return plot_lines
Ejemplo n.º 17
0
def main():
    """ Load an expression study into a ChippyDB. """
    rr = RunRecord('add_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Add Expression to DB')
    session = db_query.make_session(args.db_path)

    name = args.name
    description = args.description
    ref_file = args.expression_data
    sample_type = args.sample_type

    # Check that Sample and Reference File are both unique
    if name in db_query.get_sample_entries(session):
        rr.dieOnCritical('Sample name already exists', name)
    if ref_file in db_query.get_reffile_entries(session,
                                                reffile_name=ref_file):
        rr.dieOnCritical('ReferenceFile already loaded', ref_file)

    this_type = sample_types[sample_type]
    if this_type == sample_types['abs_expr']:
        expr_table = gene_expr_to_table(
            args.expression_data,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=True,
            sep=args.sep)
    elif this_type == sample_types['diff_expr']:
        # validation breaks with some of Rohan's diff files
        # he's included all probesets but only the mean score, once.
        expr_table = gene_expr_diff_to_table(
            args.expression_data,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            sig_label=args.significance_heading,
            pval_label=args.p_value_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=False,
            sep=args.sep)
    elif this_type == sample_types['target_genes']:
        expr_table = LoadTable(args.expression_data, sep=args.sep)
    else:
        rr.dieOnCritical('Unknown sample type', args.sample_type)

    success = add_data(session,
                       name,
                       description,
                       args.expression_data,
                       expr_table,
                       sample_type=args.sample_type,
                       reffile1=args.reffile1,
                       reffile2=args.reffile2)

    rr.addInfo(name + ' added to DB', success)
    rr.display()
Ejemplo n.º 18
0
def get_genes_by_ranked_diff(session,
                             sample_name,
                             multitest_signif_val=None,
                             biotype='protein_coding',
                             chrom=None,
                             data_path=None,
                             include_targets=None,
                             exclude_targets=None,
                             rank_by='mean'):
    """returns all ranked genes from a sample difference experiment"""
    rr = RunRecord('get_genes_by_ranked_diff')
    records = get_diff_entries(session,
                               sample_name=sample_name,
                               biotype=biotype,
                               data_path=data_path,
                               chrom=chrom,
                               multitest_signif_val=multitest_signif_val)

    genes = []
    for record in records:
        gene = record.gene
        # carry the diff scores on the gene object itself
        gene.Scores = record.fold_changes
        genes.append(gene)

    # keep only genes in the include target gene sets, if provided
    if include_targets is not None:
        include_entries = get_targetgene_entries(session, include_targets)
        if len(include_entries) > 0:
            wanted_ids = set(tg.gene.ensembl_id for tg in include_entries)
            genes = [g for g in genes if g.ensembl_id in wanted_ids]

    # drop any genes in the exclude target gene sets, if provided
    if exclude_targets is not None:
        exclude_entries = get_targetgene_entries(session, exclude_targets)
        if len(exclude_entries) > 0:
            unwanted_ids = set(tg.gene.ensembl_id for tg in exclude_entries)
            genes = [g for g in genes if g.ensembl_id not in unwanted_ids]

    # pair each gene with its ranking score
    if rank_by.lower() == 'mean':
        scored = [(g.MeanScore, g) for g in genes]
    elif rank_by.lower() == 'max':
        scored = [(g.MaxScore, g) for g in genes]
    else:
        rr.dieOnCritical('Ranking method not possible', rank_by.lower())

    # rank highest score first
    ranked_genes = []
    for rank, (score, gene) in enumerate(reversed(sorted(scored))):
        gene.Rank = rank + 1
        ranked_genes.append(gene)

    return ranked_genes
Ejemplo n.º 19
0
def add_sample(session, name, description):
    """add a basic sample"""
    rr = RunRecord('add_sample')
    sample = Sample(name, description)
    # commit succeeds only for a genuinely new sample
    if _successful_commit(session, sample):
        rr.addInfo('Sample created in db', name)
        return True
    rr.addInfo('Sample already exists in db', name)
    return False
Ejemplo n.º 20
0
    def normaliseByRPM(self):
        """ Normalise counts by per million mapped tags """
        rr = RunRecord('normaliseByRPM')
        # reads-per-million scaling factor for this experiment
        rpm_factor = 1000000.0 / self.mapped_tags
        rr.addInfo('normalising by RPMs', rpm_factor)

        # scale every counts field of every gene record in place
        for gene in self.counts_genes:
            gene.promoter_counts *= rpm_factor
            gene.coding_counts *= rpm_factor
            gene.feature_counts *= rpm_factor
            gene.total_counts *= rpm_factor
Ejemplo n.º 21
0
def gene_expr_to_table(data_path, sep='\t', stable_id_label='',
        probeset_label='', exp_label='', allow_probeset_many_gene=False,
        validate=True):
    """
        Returns a cogent Table of gene expression data read from a simple
        tab-delimited file.

        Handles data from micro-array experiments (probeset information
        per gene, one score per probe) as well as mRNA-seq experiments
        (no probes, a single score per gene; a fake probe 'P' + unique
        integer is created for each gene).

        Probeset ids and expression scores are pipe ('|') separated
        within a field and are converted to tuples of ints and floats
        respectively.

        Arguments:
            - probeset_label: name of column containing probesets
            - exp_label: name of column containing expression scores
            - stable_id_label: name of column containing Ensembl stable IDs
            - allow_probeset_many_gene: whether one probeset can map to
                multiple genes; if False, multi-mapping probes and scores
                are removed
            - validate: checks that stable IDs are unique in the file and
                that each row has as many probesets as expression scores;
                offending gene entries are removed
    """
    rr = RunRecord('geneExprDataToTable')

    rr.addInfo('Reading expression data', data_path)
    genes, probes, exp, probes_present = _read_data_file(data_path, sep=sep,
            stable_id_label=stable_id_label, probeset_label=probeset_label,
            exp_label=exp_label)

    if probes_present:
        if validate:
            # mismatched probes/scores cause the whole gene to be dropped
            genes, probes, exp = _validate_probes_scores(genes, probes, exp)

        if not allow_probeset_many_gene:
            # each probe should map to only one gene
            genes, probes, exp = _remove_multimapped_probesets(genes,
                    probes, exp)

    rows = [list(row) for row in zip(genes, probes, exp)]
    return Table(header=EXPR_HEADER, rows=rows)
Ejemplo n.º 22
0
def set_up_series_plots_dir(plot_filename):
    """ Create and return the directory for series plots.

        The directory is '<basename-without-extension>-series' under the
        plot's save directory.
    """
    rr = RunRecord('set_up_series_plot_dir')

    save_dir = dirname_or_default(plot_filename)
    basename = os.path.basename(plot_filename)
    # bug fix: rfind('.') returns -1 for extensionless names, which
    # silently chopped the final character; splitext handles that case
    root = os.path.splitext(basename)[0]

    plot_series_dir = os.path.join(save_dir, '%s-series' % root)
    create_path(plot_series_dir)
    rr.addInfo('Plotting as a series to', plot_series_dir)
    return plot_series_dir
Ejemplo n.º 23
0
def main():
    """
        Plot the score differential (y-axis) against the absolute expression
        components (x-axes of each plot).
    """
    rr = RunRecord('diff_abs_plots')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(\
            window_title='Difference vs Absolute Expression Plots')

    groups_dict = dict([('extremes_colour', args.extremes_colour),
                        ('signif_colour', args.signif_colour),
                        ('bulk_colour', args.bulk_colour),
                        ('hide_extremes', args.hide_extremes),
                        ('hide_signif', args.hide_signif),
                        ('hide_bulk', args.hide_bulk)])

    # Should do number restrictions in load step
    # Load all genes into RawPlotData object
    print 'Loading data for first plot'
    raw_plot_data1 = load_sample_genes(args.db_path, args.diff_sample,
                                       args.abs_expr_sample1,
                                       args.sample_extremes)

    print 'Loading data for second plot'
    raw_plot_data2 = load_sample_genes(args.db_path, args.diff_sample,
                                       args.abs_expr_sample2,
                                       args.sample_extremes)

    print 'Building plot points'
    # get back a list of plot_dot objects with 'x', 'y', 'colour', 'area'
    plot_dots1 = build_plot_points(raw_plot_data1, args.ranks, args.num_genes)

    plot_dots2 = build_plot_points(raw_plot_data2, args.ranks, args.num_genes)

    plot_dict = dict([('out_name', args.plot1_name), ('title', args.title),
                      ('y_text', args.ylabel), ('y_units', args.yaxis_units),
                      ('x_text', args.xaxis_text1),
                      ('x_units', args.xaxis_units),
                      ('diff_name', raw_plot_data1.diff_name),
                      ('sample_name', raw_plot_data1.sample_name)])
    print 'Generating plot 1'
    make_plot(plot_dots1, plot_dict, groups_dict)

    plot_dict['sample_name'] = raw_plot_data2.sample_name
    plot_dict['out_name'] = args.plot2_name
    plot_dict['x_text'] = args.xaxis_text2

    print 'Generating plot 2'
    make_plot(plot_dots2, plot_dict, groups_dict)

    rr.display()
Ejemplo n.º 24
0
def _get_keep_indices(data, filtered=None):
    """ Return the row indices of data accepted by the 'filtered'
        predicate; all row indices are kept when no predicate is given.
    """
    rr = RunRecord('_get_keep_indices')
    keep = range(data.shape[0])

    if filtered is not None:
        keep = [i for i in range(data.shape[0]) if filtered(data[i])]
        if len(keep) == 0:
            # typo fix: message previously read 'No remaing data'
            rr.dieOnCritical('No remaining data after filtering', 'Failure')

    return keep
Ejemplo n.º 25
0
def main():
    """ Remove an expression sample and its records from a ChippyDB. """
    rr = RunRecord('drop_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Drop Expression Data')
    session = db_query.make_session(args.db_path)

    outcome = db_query.drop_sample_records(session, args.sample)
    if outcome:
        rr.addInfo('Removing ' + args.sample, 'Success')
    else:
        rr.addWarning('Removing ' + args.sample, 'Failure')

    rr.display()
Ejemplo n.º 26
0
def safe_line_division(sample_line, dividing_line):
    """ Divide sample_line by dividing_line, guarding against
        divide-by-zero by substituting the smallest non-zero count
        for any zero in the divisor.
    """
    rr = RunRecord('safe_line_division')
    try:
        sample_line /= dividing_line
    except ZeroDivisionError: # 0 counts at a base position
        min_count = 1
        for c in dividing_line:
            if 0 < c < min_count:
                min_count = c
        # bug fix: zeros were reported but never replaced, so the retry
        # divided by the identical line and raised again; assumes
        # dividing_line is a mutable indexable sequence -- TODO confirm
        for i, c in enumerate(dividing_line):
            if c == 0:
                dividing_line[i] = min_count
        sample_line /= dividing_line
        rr.addWarning('Zero counts value seen. Setting zeros to',
                min_count)
    return sample_line
Ejemplo n.º 27
0
def chromHandle(species='mouse', chroms=mouse_chroms):
    """chrom_handle provides the safe way of creating an instance of
    Chrom and returning THE existing instance of Chromosomes. """
    rr = RunRecord('chromHandle')
    try:
        chromsInstance = Chromosomes(species, chroms)
        rr.addInfo('Chromosomes instance created', True)
        rr.addInfo('Chromosomes instance species', species)
        rr.addInfo('Chromosomes instance chromosomes', chroms)
    except Chromosomes, c:
        chromsInstance = c
        if species or chroms:
            rr.addWarning('provided species', species)
            rr.addWarning('provided chromosomes', chroms)
Ejemplo n.º 28
0
def load_studies(collections, counts_func):
    """
        Load all collection data as list, and apply filtering if needed.
        Wildcard file names are glob-expanded.
        Return the studies plus their max common up- & down- stream
        window size.
    """
    rr = RunRecord('load_studies')

    if len(collections) == 0:
        rr.dieOnCritical('Number of provided collection files', 0)

    # Parse glob file names; plain paths are used as-is
    collection_fns = []
    for collection_file in collections:
        # bug fix: the wildcard test previously inspected collections[0]
        # for every entry instead of the current file name
        if '*' in collection_file:
            dir_name = os.path.dirname(collection_file)
            base_name = os.path.basename(collection_file)
            glob_file_names = [os.path.join(dir_name, p)
                    for p in glob.glob1(dir_name, base_name)]
            glob_file_names.sort()
            for f in glob_file_names:
                collection_fns.append(os.path.abspath(f))
        else:
            collection_fns.append(os.path.abspath(collection_file))

    windows_upstream = []
    windows_downstream = []
    studies = []
    # Load data from each file
    for collection_fn in collection_fns:
        study = RegionStudy(collection_fn, counts_func)
        if study is None:
            rr.dieOnCritical('Could not load study', collection_fn)
        else:
            studies.append(study)
            windows_upstream.append(study.window_upstream)
            windows_downstream.append(study.window_downstream)

    if not len(studies):
        rr.dieOnCritical('No valid data files', 'Failure')

    # Max common window size is the smallest seen on each side
    window_upstream = int(min(windows_upstream))
    window_downstream = int(min(windows_downstream))

    rr.addInfo('Max common upstream window size', window_upstream)
    rr.addInfo('Max common downstream window size', window_downstream)
    rr.addInfo('Total data collections', len(studies))

    return studies, window_upstream, window_downstream
Ejemplo n.º 29
0
def get_chroms(session):
    """ return list of chroms from ',' separated string """
    if session is None:
        return ['No connection to DB']
    if type(session) is str:
        # a string argument is treated as a DB path
        session = make_session(session)
    rr = RunRecord('get_chroms')
    try:
        record = session.query(Chroms).one()
    except NoResultFound:
        rr.addError('Chroms found', None)
        return []
    return record.chromStr.split(',')
Ejemplo n.º 30
0
def set_counts_function(counts_metric):
    """ Sets the feature counting metric function"""
    rr = RunRecord('set_counts_function')
    metric = counts_metric.lower()
    if metric == 'mean':
        counts_func = column_mean
        rr.addInfo('Counts metric set to', 'column_mean')
    elif metric == 'frequency':
        counts_func = column_sum
        rr.addInfo('Counts metric set to', 'column_sum')
    elif metric == 'stdev':
        counts_func = stdev
        rr.addInfo('Counts metric set to', 'stdev')
    else:
        rr.dieOnCritical('Invalid count metric', counts_metric)
    return counts_func