def diagnostic_yield(api, genes=None, samples=None, group=None, level=10): """Calculate transcripts that aren't completely covered. This metric only applies to one sample in isolation. Otherwise it's hard to know what to do with exons that are covered or not covered across multiple samples. Args: sample_id (str): unique sample id """ threshold = 100 str_level = "completeness_{}".format(level) completeness_col = getattr(TranscriptStat, str_level) all_tx = api.query(Transcript) missed_tx = (api.query(TranscriptStat).filter( completeness_col < threshold).order_by(TranscriptStat.sample_id)) if genes: missed_tx = (missed_tx.join(TranscriptStat.transcript).filter( Transcript.gene_id.in_(genes))) all_tx = all_tx.filter(Transcript.gene_id.in_(genes)) samples_query = api.query(Sample.id) if samples: samples_query = samples_query.filter(Sample.id.in_(samples)) missed_tx = missed_tx.filter(TranscriptStat.sample_id.in_(samples)) elif group: samples_query = samples_query.filter_by(group_id=group) missed_tx = (missed_tx.join( TranscriptStat.sample).filter(Sample.group_id == group)) all_count = all_tx.count() all_samples = [row[0] for row in samples_query.all()] sample_groups = itertools.groupby(missed_tx, key=lambda tx: tx.sample_id) missed_samples = {} for sample_id, tx_models in sample_groups: gene_ids = set() tx_count = 0 for tx_model in tx_models: gene_ids.add(tx_model.transcript.gene_id) tx_count += 1 diagnostic_yield = 100 - (tx_count / all_count * 100) result = {'sample_id': sample_id} result['diagnostic_yield'] = diagnostic_yield result['count'] = tx_count result['total_count'] = all_count result['genes'] = list(gene_ids) missed_samples[sample_id] = result for sample_id in all_samples: if sample_id in missed_samples: yield missed_samples[sample_id] else: # all transcripts are covered! result = {'sample_id': sample_id, 'diagnostic_yield': 100} yield result
def diagnostic_yield(api, genes=None, samples=None, group=None, level=10): """Calculate transcripts that aren't completely covered. This metric only applies to one sample in isolation. Otherwise it's hard to know what to do with exons that are covered or not covered across multiple samples. Args: sample_id (str): unique sample id """ threshold = 100 str_level = "completeness_{}".format(level) completeness_col = getattr(TranscriptStat, str_level) all_tx = api.query(Transcript) missed_tx = (api.query(TranscriptStat) .filter(completeness_col < threshold) .order_by(TranscriptStat.sample_id)) if genes: missed_tx = (missed_tx.join(TranscriptStat.transcript) .filter(Transcript.gene_id.in_(genes))) all_tx = all_tx.filter(Transcript.gene_id.in_(genes)) samples_query = api.query(Sample.id) if samples: samples_query = samples_query.filter(Sample.id.in_(samples)) missed_tx = missed_tx.filter(TranscriptStat.sample_id.in_(samples)) elif group: samples_query = samples_query.filter_by(group_id=group) missed_tx = (missed_tx.join(TranscriptStat.sample).filter(Sample.group_id == group)) all_count = all_tx.count() all_samples = [row[0] for row in samples_query.all()] sample_groups = itertools.groupby(missed_tx, key=lambda tx: tx.sample_id) missed_samples = {} for sample_id, tx_models in sample_groups: gene_ids = set() tx_count = 0 for tx_model in tx_models: gene_ids.add(tx_model.transcript.gene_id) tx_count += 1 diagnostic_yield = 100 - (tx_count / all_count * 100) result = {'sample_id': sample_id} result['diagnostic_yield'] = diagnostic_yield result['count'] = tx_count result['total_count'] = all_count result['genes'] = list(gene_ids) missed_samples[sample_id] = result for sample_id in all_samples: if sample_id in missed_samples: yield missed_samples[sample_id] else: # all transcripts are covered! result = {'sample_id': sample_id, 'diagnostic_yield': 100} yield result
def samplesex_rows(sample_ids): """Generate sex prediction info rows.""" sex_query = (api.query( TranscriptStat.sample_id, Transcript.chromosome, func.avg(TranscriptStat.mean_coverage) ).join( TranscriptStat.transcript ).filter( Transcript.chromosome.in_(['X', 'Y']), TranscriptStat.sample_id.in_(sample_ids) ).group_by( TranscriptStat.sample_id, Transcript.chromosome )) samples = itertools.groupby(sex_query, lambda row: row[0]) for sample_id, chromosomes in samples: chr_coverage = [coverage for _, _, coverage in chromosomes] LOG.debug('predicting sex') predicted_sex = predict_sex(*chr_coverage) sample_obj = Sample.query.get(sample_id) sample_row = { 'sample': sample_obj.name or sample_obj.id, 'group': sample_obj.group_name, 'analysis_date': sample_obj.created_at, 'sex': predicted_sex, 'x_coverage': chr_coverage[0], 'y_coverage': chr_coverage[1], } yield sample_row
def genes(): """Display an overview of genes that are (un)completely covered.""" skip = int(request.args.get('skip', 0)) limit = int(request.args.get('limit', 30)) exonlink = request.args.get('exonlink') sample_ids = request.args.getlist('sample_id') samples_q = Sample.filter(Sample.id.in_(sample_ids)) level = request.args.get('level', 10) raw_gene_ids = request.args.get('gene_id') completeness_col = getattr(TranscriptStat, "completeness_{}".format(level)) query = (api.query(TranscriptStat) .join(TranscriptStat.transcript) .filter(completeness_col < 100) .order_by(completeness_col)) gene_ids = raw_gene_ids.split(',') if raw_gene_ids else [] if raw_gene_ids: query = query.filter(Transcript.gene_id.in_(gene_ids)) if sample_ids: query = query.filter(TranscriptStat.sample_id.in_(sample_ids)) incomplete_left = query.offset(skip).limit(limit) total = query.count() has_next = total > skip + limit return render_template('report/genes.html', incomplete=incomplete_left, level=level, skip=skip, limit=limit, has_next=has_next, gene_ids=gene_ids, exonlink=exonlink, samples=samples_q, sample_ids=sample_ids)
def samplesex_rows(sample_ids): """Generate sex prediction info rows.""" sex_query = (api.query( TranscriptStat.sample_id, Transcript.chromosome, func.avg(TranscriptStat.mean_coverage)).join( TranscriptStat.transcript).filter( Transcript.chromosome.in_(['X', 'Y']), TranscriptStat.sample_id.in_(sample_ids)).group_by( TranscriptStat.sample_id, Transcript.chromosome)) samples = itertools.groupby(sex_query, lambda row: row[0]) for sample_id, chromosomes in samples: chr_coverage = [coverage for _, _, coverage in chromosomes] LOG.debug('predicting sex') predicted_sex = predict_sex(*chr_coverage) sample_obj = Sample.query.get(sample_id) sample_row = { 'sample': sample_obj.name or sample_obj.id, 'group': sample_obj.group_name, 'analysis_date': sample_obj.created_at, 'sex': predicted_sex, 'x_coverage': chr_coverage[0], 'y_coverage': chr_coverage[1], } yield sample_row
def genes(): """Display an overview of genes that are (un)completely covered.""" skip = int(request.args.get('skip', 0)) limit = int(request.args.get('limit', 30)) exonlink = request.args.get('exonlink') sample_ids = request.args.getlist('sample_id') samples_q = Sample.filter(Sample.id.in_(sample_ids)) level = request.args.get('level', 10) raw_gene_ids = request.args.get('gene_id') completeness_col = getattr(TranscriptStat, "completeness_{}".format(level)) query = (api.query(TranscriptStat).join(TranscriptStat.transcript).filter( completeness_col < 100).order_by(completeness_col)) gene_ids = raw_gene_ids.split(',') if raw_gene_ids else [] if raw_gene_ids: query = query.filter(Transcript.gene_id.in_(gene_ids)) if sample_ids: query = query.filter(TranscriptStat.sample_id.in_(sample_ids)) incomplete_left = query.offset(skip).limit(limit) total = query.count() has_next = total > skip + limit return render_template('report/genes.html', incomplete=incomplete_left, level=level, skip=skip, limit=limit, has_next=has_next, gene_ids=gene_ids, exonlink=exonlink, samples=samples_q, sample_ids=sample_ids)
def transcript_coverage(api, gene_id, *sample_ids): """Return coverage metrics per transcript for a given gene.""" query = (api.query(TranscriptStat).join(TranscriptStat.transcript).filter( Transcript.gene_id == gene_id).order_by(TranscriptStat.transcript_id, TranscriptStat.sample_id)) if sample_ids: query = query.filter(TranscriptStat.sample_id.in_(sample_ids)) tx_groups = itertools.groupby(query, key=lambda tx: tx.transcript_id) return tx_groups
def transcript_coverage(api, gene_id, *sample_ids): """Return coverage metrics per transcript for a given gene.""" query = (api.query(TranscriptStat) .join(TranscriptStat.transcript) .filter(Transcript.gene_id == gene_id) .order_by(TranscriptStat.transcript_id, TranscriptStat.sample_id)) if sample_ids: query = query.filter(TranscriptStat.sample_id.in_(sample_ids)) tx_groups = itertools.groupby(query, key=lambda tx: tx.transcript_id) return tx_groups
def keymetrics_rows(samples_ids, genes=None): """Generate key metrics rows.""" fields = ( TranscriptStat, func.avg(TranscriptStat.mean_coverage).label("mean_coverage"), func.avg(TranscriptStat.completeness_10).label("completeness_10"), func.avg(TranscriptStat.completeness_15).label("completeness_15"), func.avg(TranscriptStat.completeness_20).label("completeness_20"), func.avg(TranscriptStat.completeness_50).label("completeness_50"), func.avg(TranscriptStat.completeness_100).label("completeness_100"), ) query = api.query(*fields).filter(TranscriptStat.sample_id.in_(samples_ids)).group_by(TranscriptStat.sample_id) if genes: query = query.join(TranscriptStat.transcript).filter(Transcript.gene_id.in_(genes)) return query
def keymetrics_rows(samples_ids, genes=None): """Generate key metrics rows.""" query = (api.query( TranscriptStat, func.avg(TranscriptStat.mean_coverage).label('mean_coverage'), func.avg(TranscriptStat.completeness_10).label('completeness_10'), func.avg(TranscriptStat.completeness_15).label('completeness_15'), func.avg(TranscriptStat.completeness_20).label('completeness_20'), func.avg(TranscriptStat.completeness_50).label('completeness_50'), func.avg(TranscriptStat.completeness_100).label('completeness_100'), ).filter(TranscriptStat.sample_id.in_(samples_ids)).group_by( TranscriptStat.sample_id)) if genes: query = (query.join(TranscriptStat.transcript).filter( Transcript.gene_id.in_(genes))) return query
def keymetrics_rows(samples_ids, genes=None): """Generate key metrics rows.""" query = ( api.query( TranscriptStat, func.avg(TranscriptStat.mean_coverage).label('mean_coverage'), func.avg(TranscriptStat.completeness_10).label('completeness_10'), func.avg(TranscriptStat.completeness_15).label('completeness_15'), func.avg(TranscriptStat.completeness_20).label('completeness_20'), func.avg(TranscriptStat.completeness_50).label('completeness_50'), func.avg(TranscriptStat.completeness_100).label('completeness_100'), ) .filter(TranscriptStat.sample_id.in_(samples_ids)) .group_by(TranscriptStat.sample_id) ) if genes: query = (query.join(TranscriptStat.transcript) .filter(Transcript.gene_id.in_(genes))) return query
def samplesex_rows(sample_ids): """Generate sex prediction info rows.""" sex_query = ( api.query(TranscriptStat.sample_id, Transcript.chromosome, func.avg(TranscriptStat.mean_coverage)) .join(TranscriptStat.transcript) .filter(Transcript.chromosome.in_(["X", "Y"]), TranscriptStat.sample_id.in_(sample_ids)) .group_by(TranscriptStat.sample_id, Transcript.chromosome) ) samples = itertools.groupby(sex_query, lambda row: row[0]) for sample_id, chromosomes in samples: chr_coverage = [coverage for _, _, coverage in chromosomes] logger.debug("predicting sex") predicted_sex = predict_sex(*chr_coverage) sample_obj = Sample.query.get(sample_id) sample_row = { "sample": sample_obj.name, "group": sample_obj.group_name, "analysis_date": sample_obj.created_at, "sex": predicted_sex, "x_coverage": chr_coverage[0], "y_coverage": chr_coverage[1], } yield sample_row
def index(): sample_objs = api.query(Sample).limit(20) tx_models = api.query(Transcript).distinct(Transcript.gene_id).limit(20) return render_template('index/index.html', samples=sample_objs, transcripts=tx_models)