Beispiel #1
0
    def load_expr(self,
                  expr_study,
                  db_path,
                  include_targets=None,
                  exclude_targets=None):
        """
            loads expression records from a ChippyDB and also
            ranks by expr
        """
        rr = RunRecord('load_expr')

        sample_name = expr_study.split(' : ')[0]
        session = db_query.make_session(db_path)

        self.expr_genes = []
        #sample_type == 'Expression data: absolute ranked'
        print 'Querying sample from ChippyDB', sample_name

        sample_genes = db_query.get_genes_by_ranked_expr(
            session,
            sample_name,
            biotype='protein_coding',
            data_path=None,
            rank_by='mean',
            include_targets=include_targets,
            exclude_targets=exclude_targets)

        for gene in sample_genes:
            gene_record = ExprGene(gene.MeanScore, gene.Rank, gene.ensembl_id,
                                   sample_name)
            self.expr_genes.append(gene_record)
        rr.addInfo('genes found in ' + sample_name, len(sample_genes))
Beispiel #2
0
def _entry_matches(value, entries):
    """ True if value equals an entry, or the `name` attribute of an entry """
    for entry in entries:
        if entry == value or getattr(entry, 'name', None) == value:
            return True
    return False


def main():
    """
        Load an expression, differential-expression or target-gene file
        into the ChipPy DB as a new sample.

        All inputs come from the parsed command line (script_info['args']).
        The run is aborted through RunRecord.dieOnCritical when the sample
        name or the reference file is already registered, or when the
        sample type is not recognised.
    """
    rr = RunRecord('add_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Add Expression to DB')
    session = db_query.make_session(args.db_path)

    name = args.name
    description = args.description
    ref_file = args.expression_data
    sample_type = args.sample_type

    # Check that Sample and Reference File are both unique.
    # NOTE(review): the DB entries appear to be record objects exposing
    # `.name` rather than plain strings, in which case a bare `in` test
    # against a string never matches. Compare against both forms - confirm
    # against db_query.
    if _entry_matches(name, db_query.get_sample_entries(session)):
        rr.dieOnCritical('Sample name already exists', name)
    if _entry_matches(ref_file,
                      db_query.get_reffile_entries(session,
                                                   reffile_name=ref_file)):
        rr.dieOnCritical('ReferenceFile already loaded', ref_file)

    # An unknown key used to raise KeyError here, before the
    # 'Unknown sample type' branch below could report it.
    try:
        sample_kind = sample_types[sample_type]
    except KeyError:
        sample_kind = None

    if sample_kind is None:
        rr.dieOnCritical('Unknown sample type', args.sample_type)
    elif sample_kind == sample_types['abs_expr']:
        expr_table = gene_expr_to_table(
            args.expression_data,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=True,
            sep=args.sep)

    elif sample_kind == sample_types['diff_expr']:
        # validation breaks with some of Rohan's diff files
        # he's included all probesets but only the mean score, once.
        expr_table = gene_expr_diff_to_table(
            args.expression_data,
            stable_id_label=args.gene_id_heading,
            probeset_label=args.probeset_heading,
            exp_label=args.expression_heading,
            sig_label=args.significance_heading,
            pval_label=args.p_value_heading,
            allow_probeset_many_gene=args.allow_probeset_many_gene,
            validate=False,
            sep=args.sep)
    elif sample_kind == sample_types['target_genes']:
        expr_table = LoadTable(args.expression_data, sep=args.sep)

    else:
        rr.dieOnCritical('Unknown sample type', args.sample_type)

    success = add_data(session,
                       name,
                       description,
                       args.expression_data,
                       expr_table,
                       sample_type=args.sample_type,
                       reffile1=args.reffile1,
                       reffile2=args.reffile2)

    rr.addInfo(name + ' added to DB', success)
    rr.display()
Beispiel #3
0
def _create_session():
    """
        Open a DB session on the SQLite file named by the CHIPPY_DB
        environment variable.

        Raises RuntimeError when CHIPPY_DB is not set.
    """
    if 'CHIPPY_DB' not in os.environ:
        raise RuntimeError(
            'You need to set an environment variable '
            'CHIPPY_DB that indicates where to find the database')

    return db_query.make_session('sqlite:///%s' % os.environ['CHIPPY_DB'])
Beispiel #4
0
 def start_chippy_db(self):
     """
         Run start_chippy_db.py to create a new DB and, when the script
         succeeds and its stdout names a valid DB, make that the current DB.
     """
     cmd = self._make_cmd_str('start_chippy_db.py', include_db=False)
     returncode, stdout, stderr = run_command(cmd)

     # nothing to do unless the script succeeded ...
     if returncode != 0:
         return
     # ... and what it printed is accepted as a DB
     if not self.check_valid_db(stdout):
         return

     self.current_db = stdout
     # Check the DB works correctly
     db_session = db_query.make_session(self.current_db)
     self.populateDBInfo(db_session)
     db_session.close()
     self.switch_menu_actions(True)
Beispiel #5
0
    def check_valid_db(self, db_path):
        """
            True if valid data in DB at path.

            db_path: DB location handed to db_query.make_session. None and
                the empty string are rejected without opening anything.

            The DB counts as valid when db_query.get_species returns
            something other than None for it.
        """
        if db_path is None or db_path == '':
            return False

        # test DB is valid; close the session even if the query raises
        session = db_query.make_session(db_path)
        try:
            return db_query.get_species(session) is not None
        finally:
            session.close()
Beispiel #6
0
def main():
    """
        Drop the records of the sample named on the command line and log
        whether db_query.drop_sample_records reported success.
    """
    rr = RunRecord('drop_expression_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Drop Expression Data')
    session = db_query.make_session(args.db_path)

    dropped = db_query.drop_sample_records(session, args.sample)
    label = 'Removing ' + args.sample
    if not dropped:
        rr.addWarning(label, 'Failure')
    else:
        rr.addInfo(label, 'Success')

    rr.display()
Beispiel #7
0
    def open_chippy_db(self):
        """
            Ask for a DB file with a file dialog. If it passes
            check_valid_db, make it the current DB and refresh the DB info,
            DB table and log views; otherwise only log a warning.
        """
        rr = RunRecord('open_chippy_db')
        chosen_path = str(QFileDialog.getOpenFileName())

        if self.check_valid_db(chosen_path):
            self.current_db = os.path.realpath(chosen_path)

            db_session = db_query.make_session(self.current_db)
            self.populateDBInfo(db_session)
            self.populateDBTable(db_session)
            rr.addInfo('DB opened successfully', chosen_path)
            self.populateLogTable()
            db_session.close()
            self.switch_menu_actions(True)
        else:
            rr.addWarning('DB has invalid format', chosen_path)
            self.populateLogTable()
Beispiel #8
0
def main():
    """
        Log a summary of a ChipPy DB.

        Without a sample name the totals for the whole DB are reported;
        with one, only the expression, differential and target-gene counts
        of that sample plus its reference file entries.
    """
    rr = RunRecord('db_summary')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse(window_title='DB Summary')
    session = make_session(args.db_path)
    sample_name = args.sample or None
    whole_db = sample_name is None

    chroms = get_chroms(session)
    species = get_species(session)

    # run every query first, then emit the report in a fixed order
    if whole_db:
        total_samples_count = get_sample_counts(session)
        sample_names = get_all_sample_names(session)
        total_genes_count = get_gene_counts(session)
        total_exon_count = get_exon_counts(session)
        total_expr_count = get_expression_counts(session)
        total_diff_genes_count = get_diff_counts(session)
        total_target_genes_count = get_targetgene_counts(session)
        total_reffiles_count = get_reffile_counts(session)
    else:
        total_expr_count = get_expression_counts(session, sample_name)
        total_diff_genes_count = get_diff_counts(session, sample_name)
        total_target_genes_count = get_targetgene_counts(session, sample_name)
        reffiles_entries = get_reffile_entries(session,
                                               sample_name=sample_name)

    report = [
        ('ChipPy DB name', args.db_path),
        ('Species name', species),
        ('Chroms list', chroms),
    ]
    if whole_db:
        report.extend([
            ('Total # of sample entries', total_samples_count),
            ('Sample names', sample_names),
            ('Total # of gene entries', total_genes_count),
            ('Total # of exon entries', total_exon_count),
        ])
    report.extend([
        ('Total # of absolute-scored gene entries', total_expr_count),
        ('Total # of differential gene entries', total_diff_genes_count),
        ('Total # of target gene entries', total_target_genes_count),
    ])
    if whole_db:
        report.append(('Total # of reference files', total_reffiles_count))

    for label, value in report:
        rr.addInfo(label, value)

    if not whole_db:
        if len(reffiles_entries) > 0:
            rr.addInfo('Reference file name', reffiles_entries)
        else:
            rr.addError('Reference file name', 'Not Available')

    rr.display()
Beispiel #9
0
    def populateDBTable(self, session=None):
        """
            Get all expression set data from self.current_db and show it
            in self.db_table, one row per sample: name, description, type,
            number of genes and reference file names.

            session: optional open DB session. When None, a session is
                opened on self.current_db, provided check_valid_db accepts
                it; otherwise the table is left untouched.

            The session is closed before returning, including one passed
            in by the caller.
        """
        if session is None:
            if not self.check_valid_db(self.current_db):
                return
            session = db_query.make_session(self.current_db)

        rows = []
        try:
            names_descriptions = \
                    db_query.get_sample_names_descriptions(session)

            # These sets do not depend on the sample being listed, so
            # query them once rather than once per sample.
            expr_names = set(db_query.get_expr_sample_names(session))
            diff_names = set(db_query.get_diff_sample_names(session))
            target_names = set(db_query.get_target_gene_names(session))

            for name, description in names_descriptions.items():
                if name in expr_names:
                    sample_type = 'Expression'
                    num_genes = db_query.get_expression_counts(
                        session, sample_name=name)
                elif name in diff_names:
                    sample_type = 'Differential'
                    num_genes = db_query.get_diff_counts(
                        session, sample_name=name)
                elif name in target_names:
                    sample_type = 'Target Genes'
                    num_genes = db_query.get_targetgene_counts(
                        session, sample_name=name)
                else:
                    # Keep a row for samples of no known type; skipping
                    # the type/count here used to shift the columns of
                    # every following row.
                    sample_type = 'Unknown'
                    num_genes = 0

                # reffile_entries returns reffile objects
                reffiles = db_query.get_reffile_entries(session,
                                                        sample_name=name)
                file_names = ', '.join([r.name for r in reffiles])
                rows.append((name, description, sample_type, num_genes,
                             file_names))
        finally:
            session.close()

        # reset to zero rows first so stale items are discarded
        self.db_table.setRowCount(0)
        self.db_table.setRowCount(len(rows))
        for row, (n, d, t, g, f) in enumerate(rows):
            self.db_table.setItem(row, 0, QTableWidgetItem(QString(n)))
            self.db_table.setItem(row, 1, QTableWidgetItem(QString(d)))
            self.db_table.setItem(row, 2, QTableWidgetItem(QString(t)))
            self.db_table.setItem(row, 3, QTableWidgetItem(QString(str(g))))
            self.db_table.setItem(row, 4, QTableWidgetItem(QString(f)))
Beispiel #10
0
def main():
    rr = RunRecord('start_chippy_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse()
    create_path(args.save_db_dir)

    if not os.path.isdir(args.save_db_dir):
        sys.stderr.write('The save_db_dir must be an existing directory.\n')
        return

    release = args.ensembl_release
    species = args.species
    chippy_db_name = args.save_db_prefix + '_chippy_' + str(release) +\
            '_' + species + '.db'
    db_path = os.path.join(args.save_db_dir, chippy_db_name)
    if not os.path.exists(db_path):
        session = make_session(db_path)

        hostname = args.hostname
        username = args.username
        password = args.password

        account = HostAccount(hostname, username, password, port=args.port)
        add_ensembl_gene_data(session,
                              args.species,
                              ensembl_release=args.ensembl_release,
                              account=account)

        success = create_dummy_expr(session)
        if success:
            rr.addInfo('Dummy data added successfully', 'Expr=1.')
        else:
            rr.addError('Dummy data failed to upload to DB',
                        'Expect bigger problems')

        rr.addInfo('Chippy DB written', db_path)
        print os.path.realpath(db_path)
    else:
        rr.addError('Chippy DB with this name already exists', db_path)

    if args.show_log:
        rr.display()
Beispiel #11
0
    def filterByGenes(self,
                      db_path,
                      chrom=None,
                      include_samples=None,
                      exclude_samples=None):
        """
            keep only results that match selected genes

            db_path: handed to make_session to open the DB.
            chrom: restrict to genes on this chromosome, if given.
            include_samples, exclude_samples: passed to get_gene_ids as
                include_targets / exclude_targets.

            Replaces self.data_collection with its filteredByLabel result.
            Returns immediately when no filter is given. Calls
            RunRecord.dieOnCritical when nothing remains after filtering.
        """

        rr = RunRecord('filterByGenes')
        if not include_samples and not exclude_samples and not chrom:
            return

        rr.addInfo('Starting no. of genes', self.data_collection.N)

        session = make_session(db_path)
        try:
            if include_samples:
                for sample in include_samples:
                    rr.addInfo('Restricting plot by include sample', sample)

            if exclude_samples:
                for sample in exclude_samples:
                    rr.addInfo('Restricting plot by exclude sample', sample)

            if chrom is not None:
                rr.addInfo('Restricting plot to chromosome', chrom)

            filter_gene_ids = get_gene_ids(session,
                                           chrom=chrom,
                                           include_targets=include_samples,
                                           exclude_targets=exclude_samples)
        finally:
            # the session was previously left open
            session.close()

        self.data_collection =\
                self.data_collection.filteredByLabel(filter_gene_ids)

        # Only read .N when there is a collection: doing so first made the
        # None check below unreachable (AttributeError instead).
        if self.data_collection is not None:
            rr.addInfo('Remaining genes', self.data_collection.N)

        if self.data_collection is None or\
           len(self.data_collection.ranks) == 0:
            rr.dieOnCritical('Genes remaining after filtering', '0')
Beispiel #12
0
def main():
    """
        Returns a pickle of size window_start to window_finish containing
        chromatin mapping averages per base, one per gene, ranked by
        expression.
    """
    rr = RunRecord('export_counts')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Export Counts')

    session = db_query.make_session(args.db_path)

    sample_name = args.expr_sample
    print 'Loading counts data for', sample_name

    include_name = None
    exclude_name = None
    if args.include_targets:
        include_name = args.include_targets
        rr.addInfo('include gene targets', include_name)

    if args.exclude_targets:
        exclude_name = args.exclude_targets
        rr.addInfo('exclude gene targets', exclude_name)

    if (args.multitest_signif_val is not None) and not \
            (-1 <= args.multitest_signif_val <= 1):
        rr.dieOnCritical('Multitest_signif_val should be -1, 0, 1',
                         args.multitest_signif_val)

    if args.chr_prefix != '':
        # If it writes nothing then cogent.Table fails because it's fragile
        rr.addInfo('BAM/BED chromosome prefix given', args.chr_prefix)

    window_upstream = args.window_upstream
    assert window_upstream > 0, \
            'upstream window must be of at least size 1 bp'
    window_downstream = args.window_downstream
    assert window_downstream > 0, \
            'downstream window must be of at least size 1 bp'

    get_collection(session,
                   sample_name,
                   args.feature_type,
                   args.BAMorBED,
                   args.chr_prefix,
                   window_upstream,
                   window_downstream,
                   args.multitest_signif_val,
                   args.collection,
                   args.overwrite,
                   args.tab_delimited,
                   include_name,
                   exclude_name,
                   bedgraph=args.make_bedgraph,
                   BED_windows=args.BED_windows,
                   chrom_size=args.max_chrom_size,
                   no_overlap=args.no_overlap)

    session.close()
    rr.display()
Beispiel #13
0
    return calc_stat


def summed(data):
    """
        Convert data to frequencies, transform them with column_sum as the
        counts function, and return the first of the two results.
    """
    counts, _ = data.asfreqs().transformed(counts_func=column_sum)
    return counts


def averaged(data):
    """
        Transform data with column_mean as the counts function and return
        the first of the two results.
    """
    counts, _ = data.transformed(counts_func=column_mean)
    return counts


# NOTE: the DB session is opened and queried when this module is loaded,
# so db_path must already be defined at this point.
session = db_query.make_session('sqlite:///%s' % db_path)
samples = db_query.get_target_sample(session)

# Descriptive metadata for this script.
script_info = {}
script_info['title'] = 'Compare read counts between histone variants'
# the two literals used to join as "... centred on on gene TSS ..."
script_info['script_description'] = "Takes read counts that are centred on"\
    " gene TSS, that exist in two separate mapped read samples."
script_info['version'] = __version__
script_info['authors'] = __author__
script_info['output_description'] = "Generates a single pdf figure."

# alternate option organisation

# essential source files
opt_collection1 = make_option('-1', '--collection1',
                              help='path to the plottable data from sample 1'\
Beispiel #14
0
def _query_in_fresh_session(db_path, query, *args, **kwargs):
    """ Run query(session, ...) in its own session, always closing it """
    session = make_session(db_path)
    try:
        return query(session, *args, **kwargs)
    finally:
        session.close()


def load_sample_genes(db_path, diff_sample, sample, sample_extremes):
    """
        Load the gene groups needed for plotting into a RawPlotData with
        attributes: diff_sig_plus1, diff_sig_zero, diff_sig_minus1,
        sample_top, sample_mid, sample_bot.

        db_path: handed to make_session for every query.
        diff_sample: name of the difference sample; queried once for each
            of the multitest significance values 1, -1 and 0.
        sample: name of the absolute expression sample.
        sample_extremes: fraction of the genes, sorted by MeanScore, that
            goes into each of sample_top and sample_bot. Values above 0.5
            or below 0 are replaced by 0.05 with a warning.

        Returns the populated RawPlotData.
    """
    rr = RunRecord('load_sample_genes')
    # convert full identifier to stored name
    diff_sample_name = diff_sample
    sample_name = sample

    problem = None
    if sample_extremes > 0.5:
        problem = 'sample_extremes option '+\
                'must be less than or equal to 0.5'
    elif sample_extremes < 0:
        # a negative fraction gives a negative cutoff, which makes the
        # slices below select the wrong genes
        problem = 'sample_extremes option must not be negative'
    if problem is not None:
        rr.addWarning(problem, sample_extremes)
        sample_extremes = 0.05
        rr.addInfo('setting extremes to default', sample_extremes)

    raw_plot_data = RawPlotData(diff_sample_name, sample_name)

    # NOTE(review): every query keeps its own session, as before;
    # presumably so the gene records of different queries stay
    # independent - confirm before merging them into one session.

    # get diff genes which are significantly up-regulated
    raw_plot_data.diff_sig_plus1 = _query_in_fresh_session(
        db_path, get_genes_by_ranked_diff, diff_sample_name, 1,
        biotype='protein_coding', data_path=None, rank_by='mean')

    # get diff genes which are significantly down-regulated
    raw_plot_data.diff_sig_minus1 = _query_in_fresh_session(
        db_path, get_genes_by_ranked_diff, diff_sample_name, -1,
        biotype='protein_coding', data_path=None, rank_by='mean')

    # get diff genes which are neither up nor down
    raw_plot_data.diff_sig_zero = _query_in_fresh_session(
        db_path, get_genes_by_ranked_diff, diff_sample_name, 0)

    # get absolute expression samples, highest mean score first
    sample_genes = _query_in_fresh_session(
        db_path, get_genes_by_ranked_expr, sample_name)
    sample_genes.sort(key=lambda x: x.MeanScore, reverse=True)
    sample_cutoff = int(len(sample_genes) * sample_extremes)
    rr.addInfo('sample cutoff set', sample_cutoff)

    # set absolute expression middle genes
    raw_plot_data.sample_mid =\
            sample_genes[sample_cutoff:len(sample_genes)-sample_cutoff]

    raw_plot_data.sample_top = sample_genes[:sample_cutoff]
    # [-0:] would select everything, hence the explicit empty list
    raw_plot_data.sample_bot = sample_genes[-sample_cutoff:]\
            if sample_cutoff else []

    # Report diff counts
    rr.addInfo('Difference sample name', raw_plot_data.diff_name)
    rr.addInfo('diff genes for signif 1', len(raw_plot_data.diff_sig_plus1))
    rr.addInfo('diff genes for signif 0', len(raw_plot_data.diff_sig_zero))
    rr.addInfo('diff genes for signif -1', len(raw_plot_data.diff_sig_minus1))

    # Report sample counts
    rr.addInfo('Absolute sample name', raw_plot_data.sample_name)
    rr.addInfo('top extreme genes for sample', len(raw_plot_data.sample_top))
    rr.addInfo('bulk, non-extreme genes for sample',
               len(raw_plot_data.sample_mid))
    rr.addInfo('bottom extreme genes for sample',
               len(raw_plot_data.sample_bot))

    return raw_plot_data
Beispiel #15
0
def _wig_field(step_parts, key, strip_chars):
    """ Value of the first declaration token that starts with key """
    # NOTE(review): str.strip removes any of the given *characters* from
    # both ends, not a literal prefix. Kept exactly as before so that the
    # parsed chromosome names (and hence gene matches) do not change.
    return [val.strip(strip_chars).strip()
            for val in step_parts if val.startswith(key)][0]


def main(ui=None):
    """
        1) Get all protein coding genes from DB.
        2) Read WIG file and if a count record is in a gene then add
            to its total
        3) Write out genes and expr values

        ui: optional progress reporter; when given, its
            display(msg=..., progress=...) is called every 100 lines.

        Output goes to args.exp, or to the WIG file name with its
        extension replaced by '.exp'.
    """
    rr = RunRecord('expr_wig_to_exp')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse(window_title='Expression WIG to EXP')
    chrom_size = args.max_chrom_size
    prefix = args.chr_prefix

    session = db_query.make_session(args.db_path)
    genes = db_query.get_gene_entries(session)

    all_genes = {} # genes indexed by ensembl_id
    genes_by_chrom = {} # chrom: list(gene_id)
    genes_scores = {} # each gene has an expression score
    for gene in genes:
        genes_by_chrom.setdefault(gene.chrom, []).append(gene.ensembl_id)
        genes_scores[gene.ensembl_id] = 0
        all_genes[gene.ensembl_id] = gene

    wig_fn = args.wig
    is_gzipped = wig_fn.endswith('.gz')
    if is_gzipped:
        wig_file = gzip.GzipFile(wig_fn, 'rb')
    else:
        try:
            wig_file = open(wig_fn, 'r')
        except IOError:
            rr.dieOnCritical('Could not open file', wig_fn)

    # get total lines in wig for pacing the progress bar. Lines are not
    # counted for gzipped input, so fall back to 1 there as well (it was
    # left undefined before, breaking the progress message).
    total_lines = 1
    if not is_gzipped:
        command = 'wc -l ' + wig_fn
        returncode, stdout, stderr = run_command(command)
        if returncode:
            rr.addWarning('could not run wc to count WIG lines', 'error')
        else:
            counted = int(stdout.strip().split(' ')[0])
            rr.addInfo('total lines in '+wig_fn, counted)
            # never 0: it is used as a divisor below
            total_lines = max(counted, 1)

    # Read each piece of the file into an artificial chromosome (Numpy array)
    # and slice out the gene regions that we have for each gene in that chrom

    chrom_array = numpy.zeros(chrom_size, dtype=numpy.float32)

    current_chrom = None
    step_type = None # no fixedStep/variableStep declaration seen yet
    pos = None
    step = None
    try:
        for i, line in enumerate(wig_file):
            if ui is not None and i % 100 == 0:
                msg = 'Reading wiggle entries [' + str(i) +\
                      ' / ' + str(total_lines) + ']'
                progress = (float(i)/float(total_lines))
                ui.display(msg=msg, progress=progress)

            if line.startswith('track'):
                continue

            is_fixed = line.startswith('fixed')
            if is_fixed or line.startswith('variable'):
                # fixedStep chrom=chr10 start=56001 step=20 span=20
                step_parts = line.split(' ')
                chrom = _wig_field(step_parts, 'chrom', 'chrom='+prefix)
                if chrom == 'M':
                    chrom = 'MT'

                if current_chrom is not None and current_chrom != chrom:
                    # Empty chrom_array into genes. The array holds the
                    # values of the chromosome just finished, so score it
                    # under that name, not the one that is starting.
                    get_gene_scores_from_chrom(chrom_array, current_chrom,
                            all_genes, genes_by_chrom, genes_scores)
                    chrom_array[:] = 0
                current_chrom = chrom

                if is_fixed:
                    step_type = 'fixed'
                    # NOTE(review): span is not applied; each value is
                    # stored at a single position only.
                    step = int(_wig_field(step_parts, 'step', 'step='))
                    pos = int(_wig_field(step_parts, 'start', 'start='))
                else:
                    step_type = 'variable'
            elif step_type == 'fixed':
                chrom_array[pos] = float(line.strip())
                pos += step
            elif step_type == 'variable':
                if '\t' in line:
                    line_parts = line.split('\t')
                else:
                    line_parts = line.split(' ')
                chrom_array[int(line_parts[0])] = float(line_parts[1].strip())
            # else: line precedes any step declaration; nothing to store
    finally:
        wig_file.close()

    # empty chrom_array into genes_score from the final section
    if current_chrom is not None:
        get_gene_scores_from_chrom(chrom_array, current_chrom, all_genes,
                genes_by_chrom, genes_scores)

    # output genes and scores
    if args.exp:
        out_fn = args.exp
    else:
        if is_gzipped:
            wig_fn = '.'.join(wig_fn.split('.')[:-1])
        out_fn = '.'.join(wig_fn.split('.')[:-1]) # cut off wig extension
        out_fn += '.exp' # add .exp extension

    with open(out_fn, 'w') as out:
        out.write('gene\texp\n') # header
        for gene_id in genes_scores:
            out.write(gene_id + '\t' + str(genes_scores[gene_id]) + '\n')

    session.close()