def init(context, setup, reset, automate): """Walk user through setting up a new config file.""" # print a nice welcome message click.echo(chanjo.__banner__) cov_tresholds = (context.obj.get('sambamba', {}) .get('cov_treshold', [10, 20])) defaults = {'sambamba.cov_treshold': {'value': cov_tresholds, 'prompt': 'sufficient coverage'}, 'database': {'value': context.obj['database'], 'prompt': 'central database path/URI'}} if not automate: questions = [(key, value['prompt'], value['value']) for key, value in iteritems(defaults)] # launch init pipeline init_pipeline('chanjo', context.obj, questions) else: logger.info('setting default config values') for key, value in iteritems(defaults): context.obj.set(key, value['value'], scope=context.obj.user_data) # write to the config file context.obj.save(default_flow_style=False) if setup: chanjo_db = Store(uri=context.obj.user_data['database']) if reset: chanjo_db.tear_down() chanjo_db.set_up()
def load_transcripts(sequence, sample_id=None, group_id=None, source=None,
                     threshold=None):
    """Process a sequence of exon lines.

    Args:
        sequence (sequence): list of chanjo bed lines
        sample_id (Optional[str]): unique sample id, else auto-guessed
        group_id (Optional[str]): id to group samples
        source (Optional[str]): path to coverage source (BAM/Sambamba)
        threshold (Optional[int]): completeness level to disqualify exons

    Returns:
        Result: iterators of `Transcript`, transcripts processed, sample model
    """
    exons = sambamba.depth_output(sequence)
    transcripts = groupby_tx(exons, sambamba=True)
    # lazy generator: per-transcript stats are only computed when the
    # models iterator below is actually consumed
    raw_stats = ((tx_id, tx_stat(tx_id, exons, threshold=threshold))
                 for tx_id, exons in iteritems(transcripts))
    if sample_id is None:
        # auto-guess: take the sample name recorded on the first exon of
        # an arbitrary transcript group
        sample_id = next(iter(itervalues(transcripts)))[0]['sampleName']
    sample_obj = Sample(id=sample_id, group_id=group_id, source=source)
    models = (make_model(sample_obj, tx_id, raw_stat)
              for tx_id, raw_stat in raw_stats)
    return Result(models=models, count=len(transcripts), sample=sample_obj)
def expand_row(header, row):
    """Parse information in row to dict.

    Args:
        header (dict): key/index header dict mapping field names to row
            indices; ``header['thresholds']`` maps each completeness level
            to the row index holding its percentage
        row (List[str]): sambamba BED row

    Returns:
        dict: parsed sambamba output row
    """
    # plain dict.items() replaces the six-style iteritems helper (Python 3)
    thresholds = {threshold: float(row[key])
                  for threshold, key in header['thresholds'].items()}
    data = {
        'chrom': row[0],
        'chromStart': int(row[1]),
        'chromEnd': int(row[2]),
        'sampleName': row[header['sampleName']],
        'readCount': int(row[header['readCount']]),
        'meanCoverage': float(row[header['meanCoverage']]),
        'thresholds': thresholds,
        'extraFields': row[header['extraFields']],
    }
    return data
def init(context, setup, reset, automate, transcripts):
    """Walk user through setting up a new config file."""
    # show the welcome banner first
    click.echo(chanjo.__banner__)
    coverage_levels = context.obj.get('sambamba', {}).get('cov_treshold',
                                                          [10, 20])
    default_settings = {
        'sambamba.cov_treshold': {
            'value': coverage_levels,
            'prompt': 'sufficient coverage',
        },
        'database': {
            'value': str(context.obj['database']),
            'prompt': 'central database path/URI',
        },
    }
    if automate:
        logger.info('setting default config values')
        for option_key, option in default_settings.items():
            context.obj.set(option_key, option['value'],
                            scope=context.obj.user_data)
    else:
        # build one (key, prompt, default) triple per option and hand off
        # to the interactive init pipeline
        prompts = [(option_key, option['prompt'], option['value'])
                   for option_key, option in default_settings.items()]
        init_pipeline('chanjo', context.obj, prompts)
    # persist the config to disk
    context.obj.save(default_flow_style=False)
    if setup:
        tx_only = transcripts or context.obj.get('transcripts') or False
        model_base = TXBASE if tx_only else BASE
        database = Store(uri=context.obj['database'], base=model_base)
        if reset:
            database.tear_down()
        database.set_up()
def link_elements(sequence):
    """Process a sequence of exon lines.

    Args:
        sequence (sequence): list of chanjo bed lines

    Returns:
        Result: iterators of transcript models, number of transcripts processed
    """
    parsed_exons = parse_bed.chanjo(sequence)
    tx_groups = groupby_tx(parsed_exons)
    # one model per transcript, built lazily
    model_iter = (make_model(transcript_id, tx_exons)
                  for transcript_id, tx_exons in iteritems(tx_groups))
    return Result(models=model_iter, count=len(tx_groups))
def tx_stat(transcript_id, exons, threshold=None):
    """Calculate metrics for transcript stats model.

    Args:
        transcript_id (str): unique transcript id
        exons (List[dict]): parsed exon rows linked to one transcript
        threshold (Optional[int]): completeness level to disqualify exons

    Returns:
        dict: aggregated stats over all exons
    """
    sums = {'bases': 0, 'mean_coverage': 0}
    incomplete_exons = []
    # for each of the exons (linked to one transcript)
    for exon in exons:
        # weight every metric by exon length so longer exons dominate
        exon_length = exon['chromEnd'] - exon['chromStart']
        sums['bases'] += exon_length
        sums['mean_coverage'] += exon['meanCoverage'] * exon_length
        # add to the total sum for completeness levels
        for comp_key in [10, 15, 20, 50, 100]:
            if comp_key in exon['thresholds']:
                sums_key = "completeness_{}".format(comp_key)
                if sums_key not in sums:
                    sums[sums_key] = 0
                completeness = exon['thresholds'][comp_key]
                sums[sums_key] += completeness * exon_length
                # record exons that fail the requested completeness level
                if threshold == comp_key and completeness < 100:
                    exon_obj = Exon(exon['chrom'], exon['chromStart'],
                                    exon['chromEnd'], completeness)
                    incomplete_exons.append(exon_obj)
    # NOTE(review): raises ZeroDivisionError for an empty exon list or
    # all-zero-length exons — presumably callers guarantee non-empty input
    fields = {key: (value / sums['bases'])
              for key, value in sums.items() if key != 'bases'}
    fields['incomplete_exons'] = incomplete_exons
    fields['threshold'] = threshold
    return fields
def statistics(data, sample_obj, exon_obj):
    """Create models from a sambamba output row.

    Args:
        data (dict): parsed sambamba output row
        sample_obj (Sample): linked sample model
        exon_obj (Exon): linked exon model

    Returns:
        List[ExonStatistic]: stats models linked to exon and sample
    """
    links = dict(sample=sample_obj, exon=exon_obj)
    # mean coverage comes first, then one stat per completeness level
    models = [ExonStatistic(metric='mean_coverage',
                            value=data['meanCoverage'], **links)]
    for level, fraction in iteritems(data['thresholds']):
        models.append(ExonStatistic(metric="completeness_{}".format(level),
                                    value=fraction, **links))
    return models
def tx_stat(transcript_id, exons, threshold=None):
    """Calculate metrics for transcript stats model.

    Args:
        transcript_id (str): unique transcript id
        exons (List[dict]): parsed exon rows linked to one transcript
        threshold (Optional[int]): completeness level to disqualify exons

    Returns:
        dict: aggregated stats over all exons
    """
    sums = {'bases': 0, 'mean_coverage': 0}
    incomplete_exons = []
    # for each of the exons (linked to one transcript)
    for exon in exons:
        # weight every metric by exon length so longer exons dominate
        exon_length = exon['chromEnd'] - exon['chromStart']
        sums['bases'] += exon_length
        sums['mean_coverage'] += exon['meanCoverage'] * exon_length
        # add to the total sum for completeness levels
        for comp_key in [10, 15, 20, 50, 100]:
            if comp_key in exon['thresholds']:
                sums_key = "completeness_{}".format(comp_key)
                if sums_key not in sums:
                    sums[sums_key] = 0
                completeness = exon['thresholds'][comp_key]
                sums[sums_key] += completeness * exon_length
                # record exons that fail the requested completeness level
                if threshold == comp_key and completeness < 100:
                    exon_obj = Exon(exon['chrom'], exon['chromStart'],
                                    exon['chromEnd'], completeness)
                    incomplete_exons.append(exon_obj)
    # NOTE(review): raises ZeroDivisionError for an empty exon list or
    # all-zero-length exons — presumably callers guarantee non-empty input
    fields = {key: (value / sums['bases'])
              for key, value in sums.items() if key != 'bases'}
    fields['incomplete_exons'] = incomplete_exons
    fields['threshold'] = threshold
    return fields
def expand_row(header, row):
    """Parse information in row to dict.

    Args:
        header (dict): key/index header dict mapping field names to row
            indices; ``header['thresholds']`` maps each completeness level
            to the row index holding its percentage
        row (List[str]): sambamba BED row

    Returns:
        dict: parsed sambamba output row
    """
    # plain dict.items() replaces the six-style iteritems helper (Python 3)
    thresholds = {threshold: float(row[key])
                  for threshold, key in header['thresholds'].items()}
    data = {
        'chrom': row[0],
        'chromStart': int(row[1]),
        'chromEnd': int(row[2]),
        'sampleName': row[header['sampleName']],
        'readCount': int(row[header['readCount']]),
        'meanCoverage': float(row[header['meanCoverage']]),
        'thresholds': thresholds,
        'extraFields': row[header['extraFields']],
    }
    return data
def statistics(data, sample_obj, exon_obj=None, exon_id=None):
    """Create models from a sambamba output row.

    Args:
        data (dict): parsed sambamba output row
        sample_obj (Sample): linked sample model
        exon_obj (Optional[Exon]): linked exon model; takes precedence
        exon_id (Optional[int]): primary key for related Exon, used only
            when no ``exon_obj`` is supplied

    Returns:
        List[ExonStatistic]: stats models linked to exon and sample
    """
    # identity check: avoids misrouting a valid-but-falsy model object
    if exon_obj is not None:
        relationships = dict(sample=sample_obj, exon=exon_obj)
    else:
        relationships = dict(sample=sample_obj, exon_id=exon_id)
    stats = [ExonStatistic(metric='mean_coverage',
                           value=data['meanCoverage'], **relationships)]
    for threshold, value in data['thresholds'].items():
        metric = "completeness_{}".format(threshold)
        stat = ExonStatistic(metric=metric, value=value, **relationships)
        stats.append(stat)
    return stats