예제 #1
0
파일: init.py 프로젝트: MattWellie/chanjo
def init(context, setup, reset, automate):
    """Write a chanjo config file and optionally prepare the database.

    Args:
        context: object whose ``obj`` attribute is the config store
            (presumably a click context -- confirm against the caller)
        setup: when truthy, call ``set_up()`` on the database afterwards
        reset: when truthy (and ``setup`` is truthy), call ``tear_down()``
            on the database before ``set_up()``
        automate: when truthy, store the default values directly instead of
            handing them to ``init_pipeline`` as questions
    """
    click.echo(chanjo.__banner__)

    config = context.obj
    sambamba_section = config.get('sambamba', {})
    default_cutoffs = sambamba_section.get('cov_treshold', [10, 20])

    # each option carries its default value and the text used to ask for it
    defaults = {
        'sambamba.cov_treshold': {
            'value': default_cutoffs,
            'prompt': 'sufficient coverage',
        },
        'database': {
            'value': config['database'],
            'prompt': 'central database path/URI',
        },
    }

    if automate:
        logger.info('setting default config values')
        for option, spec in iteritems(defaults):
            config.set(option, spec['value'], scope=config.user_data)
    else:
        questions = []
        for option, spec in iteritems(defaults):
            questions.append((option, spec['prompt'], spec['value']))
        init_pipeline('chanjo', config, questions)

    # persist the values to the config file
    config.save(default_flow_style=False)

    if not setup:
        return

    chanjo_db = Store(uri=config.user_data['database'])
    if reset:
        chanjo_db.tear_down()
    chanjo_db.set_up()
예제 #2
0
def load_transcripts(sequence,
                     sample_id=None,
                     group_id=None,
                     source=None,
                     threshold=None):
    """Build transcript models for one sample from sambamba output lines.

    Args:
        sequence (sequence): lines passed on to ``sambamba.depth_output``
        sample_id (Optional[str]): unique sample id; when ``None`` it is read
            from the 'sampleName' of the first exon in the first transcript
            group
        group_id (Optional[str]): id to group samples
        source (Optional[str]): path to coverage source (BAM/Sambamba)
        threshold (Optional[int]): completeness level passed to ``tx_stat``

    Returns:
        Result: ``models`` (lazy generator of models), ``count`` (number of
            transcript groups) and ``sample`` (the sample model)
    """
    parsed_exons = sambamba.depth_output(sequence)
    by_transcript = groupby_tx(parsed_exons, sambamba=True)

    if sample_id is None:
        first_group = next(iter(itervalues(by_transcript)))
        sample_id = first_group[0]['sampleName']
    sample_obj = Sample(id=sample_id, group_id=group_id, source=source)

    # stats and models are produced one transcript at a time, on demand
    models = (
        make_model(sample_obj, tx_id,
                   tx_stat(tx_id, tx_exons, threshold=threshold))
        for tx_id, tx_exons in iteritems(by_transcript)
    )
    return Result(models=models, count=len(by_transcript), sample=sample_obj)
예제 #3
0
def load_transcripts(sequence, sample_id=None, group_id=None, source=None,
                     threshold=None):
    """Convert sambamba output lines into transcript models for a sample.

    Args:
        sequence (sequence): lines passed on to ``sambamba.depth_output``
        sample_id (Optional[str]): unique sample id; if ``None``, the
            'sampleName' of the first exon of the first transcript group
            is used
        group_id (Optional[str]): id to group samples
        source (Optional[str]): path to coverage source (BAM/Sambamba)
        threshold (Optional[int]): completeness level passed to ``tx_stat``

    Returns:
        Result: generator of models, number of transcript groups, sample model
    """
    tx_groups = groupby_tx(sambamba.depth_output(sequence), sambamba=True)

    # lazily computed (transcript id, aggregated stats) pairs
    stats_by_tx = (
        (tx_id, tx_stat(tx_id, tx_exons, threshold=threshold))
        for tx_id, tx_exons in iteritems(tx_groups)
    )

    if sample_id is None:
        leading_exons = next(iter(itervalues(tx_groups)))
        sample_id = leading_exons[0]['sampleName']
    sample_obj = Sample(id=sample_id, group_id=group_id, source=source)

    models = (
        make_model(sample_obj, tx_id, stats)
        for tx_id, stats in stats_by_tx
    )
    return Result(models=models, count=len(tx_groups), sample=sample_obj)
예제 #4
0
def expand_row(header, row):
    """Convert one sambamba BED row into a dict of typed values.

    Args:
        header (dict): maps field names to positions in ``row``; its
            'thresholds' entry maps each threshold to its own position
        row (List[str]): sambamba BED row

    Returns:
        dict: parsed row with 'chrom', 'chromStart', 'chromEnd', 'sampleName',
            'readCount', 'meanCoverage', 'thresholds' and 'extraFields'
    """
    # completeness values are parsed first, one float per threshold column
    completeness = {}
    for level, column in iteritems(header['thresholds']):
        completeness[level] = float(row[column])

    return {
        'chrom': row[0],
        'chromStart': int(row[1]),
        'chromEnd': int(row[2]),
        'sampleName': row[header['sampleName']],
        'readCount': int(row[header['readCount']]),
        'meanCoverage': float(row[header['meanCoverage']]),
        'thresholds': completeness,
        'extraFields': row[header['extraFields']],
    }
예제 #5
0
def init(context, setup, reset, automate, transcripts):
    """Write a chanjo config file and optionally prepare the database.

    Args:
        context: object whose ``obj`` attribute is the config store
            (presumably a click context -- confirm against the caller)
        setup: when truthy, call ``set_up()`` on the database afterwards
        reset: when truthy (and ``setup`` is truthy), call ``tear_down()``
            on the database before ``set_up()``
        automate: when truthy, store the default values directly instead of
            handing them to ``init_pipeline`` as questions
        transcripts: when truthy, or when the config has a truthy
            'transcripts' entry, the store is created with ``TXBASE``
            instead of ``BASE``
    """
    click.echo(chanjo.__banner__)

    config = context.obj
    default_cutoffs = config.get('sambamba', {}).get('cov_treshold', [10, 20])
    database_uri = str(config['database'])

    # each option carries its default value and the text used to ask for it
    defaults = {
        'sambamba.cov_treshold': {'value': default_cutoffs,
                                  'prompt': 'sufficient coverage'},
        'database': {'value': database_uri,
                     'prompt': 'central database path/URI'},
    }

    if automate:
        logger.info('setting default config values')
        for option, spec in iteritems(defaults):
            config.set(option, spec['value'], scope=config.user_data)
    else:
        questions = []
        for option, spec in iteritems(defaults):
            questions.append((option, spec['prompt'], spec['value']))
        init_pipeline('chanjo', config, questions)

    # persist the values to the config file
    config.save(default_flow_style=False)

    if not setup:
        return

    # the command-line flag wins; the config entry is only a fallback
    if transcripts or config.get('transcripts'):
        base = TXBASE
    else:
        base = BASE
    chanjo_db = Store(uri=config['database'], base=base)
    if reset:
        chanjo_db.tear_down()
    chanjo_db.set_up()
예제 #6
0
def link_elements(sequence):
    """Group chanjo BED lines by transcript and build one model per group.

    Args:
        sequence (sequence): list of chanjo bed lines

    Returns:
        Result: lazy generator of models and the number of transcript groups
    """
    parsed_exons = parse_bed.chanjo(sequence)
    by_transcript = groupby_tx(parsed_exons)

    # models are created on demand while the caller iterates
    models = (
        make_model(tx_id, tx_exons)
        for tx_id, tx_exons in iteritems(by_transcript)
    )
    return Result(models=models, count=len(by_transcript))
예제 #7
0
def tx_stat(transcript_id, exons, threshold=None):
    """Calculate length-weighted metrics for a transcript stats model.

    Each exon value is multiplied by the exon length
    (``chromEnd - chromStart``), summed over all exons, and divided by the
    total number of bases. When the total is 0 (no exons, or only zero-length
    exons) every metric is reported as 0.0 instead of dividing by zero.

    Args:
        transcript_id (str): unique transcript id (not used in the calculation)
        exons (List[dict]): exon dicts with 'chrom', 'chromStart', 'chromEnd',
            'meanCoverage' and 'thresholds' keys
        threshold (Optional[int]): completeness level to disqualify exons

    Returns:
        dict: 'mean_coverage', one 'completeness_<level>' entry per supported
            level present in the exons, 'incomplete_exons' (exons below 100
            at ``threshold``) and 'threshold'
    """
    total_bases = 0
    weighted_sums = {'mean_coverage': 0}
    incomplete_exons = []

    # for each of the exons (linked to one transcript)
    for exon in exons:
        exon_length = exon['chromEnd'] - exon['chromStart']
        total_bases += exon_length
        weighted_sums['mean_coverage'] += exon['meanCoverage'] * exon_length

        # only these completeness levels are aggregated; any other key in
        # exon['thresholds'] is ignored
        for comp_key in (10, 15, 20, 50, 100):
            if comp_key not in exon['thresholds']:
                continue
            sums_key = "completeness_{}".format(comp_key)
            completeness = exon['thresholds'][comp_key]
            weighted_sums[sums_key] = (weighted_sums.get(sums_key, 0) +
                                       completeness * exon_length)

            if threshold == comp_key and completeness < 100:
                exon_obj = Exon(exon['chrom'], exon['chromStart'],
                                exon['chromEnd'], completeness)
                incomplete_exons.append(exon_obj)

    if total_bases:
        fields = {key: weighted_sums[key] / total_bases
                  for key in weighted_sums}
    else:
        # nothing to average over: avoid ZeroDivisionError
        fields = {key: 0.0 for key in weighted_sums}
    fields['incomplete_exons'] = incomplete_exons
    fields['threshold'] = threshold
    return fields
예제 #8
0
def statistics(data, sample_obj, exon_obj):
    """Build the statistics models for one parsed sambamba row.

    Args:
        data (dict): parsed sambamba output row
        sample_obj (Sample): linked sample model
        exon_obj (Exon): linked exon model

    Returns:
        List[ExonStatistic]: the 'mean_coverage' stat followed by one
            'completeness_<threshold>' stat per entry in ``data['thresholds']``
    """
    links = {'sample': sample_obj, 'exon': exon_obj}

    mean_stat = ExonStatistic(metric='mean_coverage',
                              value=data['meanCoverage'], **links)
    completeness_stats = [
        ExonStatistic(metric="completeness_{}".format(level),
                      value=level_value, **links)
        for level, level_value in iteritems(data['thresholds'])
    ]
    return [mean_stat] + completeness_stats
예제 #9
0
def tx_stat(transcript_id, exons, threshold=None):
    """Calculate length-weighted metrics for a transcript stats model.

    Mean coverage and each completeness level are averaged over all exons,
    weighted by exon length (``chromEnd - chromStart``). If the exons add up
    to 0 bases (empty list or only zero-length exons) all metrics are returned
    as 0.0 rather than raising ``ZeroDivisionError``.

    Args:
        transcript_id (str): unique transcript id (not used in the calculation)
        exons (List[dict]): exon dicts with 'chrom', 'chromStart', 'chromEnd',
            'meanCoverage' and 'thresholds' keys
        threshold (Optional[int]): completeness level to disqualify exons

    Returns:
        dict: aggregated stats over all exons, plus 'incomplete_exons' (exons
            below 100 at ``threshold``) and 'threshold'
    """
    sums = {'mean_coverage': 0}
    bases = 0
    incomplete_exons = []

    # for each of the exons (linked to one transcript)
    for exon in exons:
        exon_length = exon['chromEnd'] - exon['chromStart']
        bases += exon_length
        sums['mean_coverage'] += exon['meanCoverage'] * exon_length

        # add to the total sum for the supported completeness levels; other
        # keys in exon['thresholds'] are not aggregated
        for comp_key in (10, 15, 20, 50, 100):
            if comp_key in exon['thresholds']:
                sums_key = "completeness_{}".format(comp_key)
                sums.setdefault(sums_key, 0)
                completeness = exon['thresholds'][comp_key]
                sums[sums_key] += completeness * exon_length

                if threshold == comp_key and completeness < 100:
                    exon_obj = Exon(exon['chrom'], exon['chromStart'],
                                    exon['chromEnd'], completeness)
                    incomplete_exons.append(exon_obj)

    # guard the weighted average against an empty/zero-length transcript
    fields = {key: (sums[key] / bases if bases else 0.0) for key in sums}
    fields['incomplete_exons'] = incomplete_exons
    fields['threshold'] = threshold
    return fields
예제 #10
0
파일: parse.py 프로젝트: MattWellie/chanjo
def expand_row(header, row):
    """Turn a sambamba BED row into a dict with converted values.

    Args:
        header (dict): maps field names to positions in ``row``; its
            'thresholds' entry maps each threshold to its own position
        row (List[str]): sambamba BED row

    Returns:
        dict: parsed sambamba output row
    """
    # per-threshold completeness values, converted to float
    completeness = {}
    for level, column in iteritems(header['thresholds']):
        completeness[level] = float(row[column])

    # the first three columns are positional; the rest are located via header
    data = {'chrom': row[0]}
    data['chromStart'] = int(row[1])
    data['chromEnd'] = int(row[2])
    data['sampleName'] = row[header['sampleName']]
    data['readCount'] = int(row[header['readCount']])
    data['meanCoverage'] = float(row[header['meanCoverage']])
    data['thresholds'] = completeness
    data['extraFields'] = row[header['extraFields']]
    return data
예제 #11
0
def statistics(data, sample_obj, exon_obj=None, exon_id=None):
    """Create models from a sambamba output row.

    The stats are linked to the exon either through the model itself
    (``exon_obj``) or, when no model is given, through ``exon_id``.

    Args:
        data (dict): parsed sambamba output row
        sample_obj (Sample): linked sample model
        exon_obj (Optional[Exon]): linked exon model; takes precedence over
            ``exon_id`` when provided
        exon_id (Optional[int]): primary key for related Exon, used only when
            ``exon_obj`` is ``None``

    Returns:
        List[ExonStatistic]: stats models linked to exon and sample
    """
    # compare against None explicitly so the choice does not depend on the
    # truthiness of the model object
    if exon_obj is not None:
        relationships = dict(sample=sample_obj, exon=exon_obj)
    else:
        relationships = dict(sample=sample_obj, exon_id=exon_id)

    stats = [ExonStatistic(metric='mean_coverage', value=data['meanCoverage'],
                           **relationships)]
    for threshold, value in iteritems(data['thresholds']):
        metric = "completeness_{}".format(threshold)
        stat = ExonStatistic(metric=metric, value=value, **relationships)
        stats.append(stat)

    return stats