Example #1
0
def align_by_identity(query, reference_fasta, output=None, format='1'):
    """
    Type sequences in a fasta file by finding the closest reference

    Every record read from ``query`` is written to its own temporary fasta,
    aligned against ``reference_fasta``, and the first alignment left after
    sorting and filtering is written to ``output``.  Records without any
    alignment are logged and skipped.

    :param query: path of the sequence file whose records are aligned
    :param reference_fasta: reference fasta handed to the aligner
    :param output: path of the alignment file to write; when None it is
        derived from ``query`` by replacing its extension with ``.m<format>``
    :param format: alignment format passed to the aligner, '1' or '5'
    :return: the output path, after it was passed to ``check_output_file``
    """
    assert format in ['1', '5']
    # If output isn't specified, base it on the query
    if output is None:
        # splitext keeps the whole name when the query has no extension and
        # ignores dots in directory names, instead of truncating the path.
        basename = os.path.splitext(query)[0]
        output = '%s.m%s' % (basename, format)
    ref_count = fasta_size(reference_fasta)
    # Iterate over each Fasta, aligning individually.
    with BlasrWriter(output) as handle:
        # NOTE(review): the header is always 'm1', even when format == '5'
        # -- confirm this is intended.
        handle.write_header('m1')
        for record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' %
                     (record.name, ref_count))
            temp = write_temp_fasta(record)
            try:
                alignments = _align_fasta(temp.name, reference_fasta, format)
                if not alignments:
                    log.info("No hits found for %s" % record.name)
                    continue
                alignments = _sort_alignments(alignments)
                alignments = _filter_alignments(alignments)
                log.info(
                    'Found %s alignments sharing maximum identity with the query' %
                    len(alignments))
                handle.write(alignments[0])
            finally:
                # Remove the temporary fasta on every path out of this
                # iteration, including the "no hits" skip and any error.
                os.unlink(temp.name)
    check_output_file(output)
    return output
Example #2
0
def align_by_identity( query, reference_fasta, output=None, format='1' ):
    """
    Type sequences in a fasta file by finding the closest reference

    Every record read from ``query`` is written to its own temporary fasta,
    aligned against ``reference_fasta``, and the first alignment left after
    sorting and filtering is written to ``output``.  Records without any
    alignment are logged and skipped.

    :param query: path of the sequence file whose records are aligned
    :param reference_fasta: reference fasta handed to the aligner
    :param output: path of the alignment file to write; when None it is
        derived from ``query`` by replacing its extension with ``.m<format>``
    :param format: alignment format passed to the aligner, '1' or '5'
    :return: the output path, after it was passed to ``check_output_file``
    """
    assert format in ['1', '5']
    # If output isn't specified, base it on the query
    if output is None:
        # splitext keeps the whole name when the query has no extension and
        # ignores dots in directory names, instead of truncating the path.
        basename = os.path.splitext(query)[0]
        output = '%s.m%s' % (basename, format)
    ref_count = fasta_size(reference_fasta)
    # Iterate over each Fasta, aligning individually.
    with BlasrWriter(output) as handle:
        # NOTE(review): the header is always 'm1', even when format == '5'
        # -- confirm this is intended.
        handle.write_header('m1')
        for record in read_sequences(query):
            log.info('Aligning %s by identity to %s references' % (record.name, ref_count))
            temp = write_temp_fasta(record)
            try:
                alignments = _align_fasta(temp.name, reference_fasta, format)
                if alignments:
                    alignments = _sort_alignments(alignments)
                    alignments = _filter_alignments(alignments)
                    log.info('Found %s alignments sharing maximum identity with the query' % len(alignments))
                    handle.write(alignments[0])
                else:
                    log.info("No hits found for %s" % record.name)
            finally:
                # The temporary fasta is removed whether or not hits were
                # found, and also when alignment or writing fails.
                os.unlink(temp.name)
    check_output_file(output)
    return output
Example #3
0
def extract_alleles(input_file,
                    output_file=None,
                    reference_file=None,
                    alignment_file=None,
                    method=METHOD,
                    sort=SORT,
                    loci=LOCI):
    """Select the top consensus sequences per group and write them out.

    Alignments for ``input_file`` are grouped according to ``method``
    ('locus', 'barcode', 'both' or 'all'), the groups are ordered by the
    ``sort`` metric ('num_reads' or 'accuracy'), and the sequences picked by
    ``_select_sequences`` are written to ``output_file``.  A falsy ``method``
    or ``loci`` falls back to the module default, and a falsy ``output_file``
    is derived from ``input_file``.

    Raises ValueError for an unknown ``method`` or ``sort``.  Returns the
    path of the file that was written.
    """
    if not method:
        method = METHOD
    if not loci:
        loci = LOCI

    # Work out where the result goes and in which file type
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Locate (or produce) the alignment file and load all of its records
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    hits = list(BlasrReader(alignment_file))

    # Reject unknown grouping policies before dispatching
    if method not in ('locus', 'barcode', 'both', 'all'):
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)
    if method == 'locus':
        groups = _group_by_locus(hits, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(hits)
    elif method == 'both':
        groups = _group_by_both(hits, loci)
    else:
        # 'all': every alignment forms its own group, keyed by query name
        groups = dict((hit.qname, [hit]) for hit in hits)

    # Score every input sequence with the metric requested by `sort`
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        scorer = consensus_size
    elif sort == 'accuracy':
        assert get_file_type(input_file) == 'fastq'
        scorer = record_accuracy
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)
    sorting_data = dict((seq.name, scorer(seq)) for seq in sequences)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    selecting_msg = 'Selecting top sequences from %s according to the "%s" policy'
    log.info(selecting_msg % (input_file, method))
    selected = list(_select_sequences(ordered))
    selected_msg = 'Selected %s sequences from %s total for further analysis'
    log.info(selected_msg % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    chosen_records = list(_subset_sequences(sequences, selected))
    _write_output(chosen_records, output_file, output_type)
    return output_file
Example #4
0
def rename_sequences( sequence_file ):
    """Rename the records of a sequence file if any name ends in '|quiver'.

    When at least one record name (ignoring surrounding whitespace) ends
    with '|quiver', every record is passed through ``rename_record`` and the
    result is written to the path given by ``get_output_file``; that path is
    returned.  Otherwise nothing is written and ``sequence_file`` itself is
    returned.
    """
    records = read_sequences(sequence_file)

    # Build the full list of flags so every record is inspected
    quiver_flags = [rec.name.strip().endswith('|quiver') for rec in records]
    if not any(quiver_flags):
        return sequence_file

    renamed = [rename_record(rec) for rec in records]
    destination = get_output_file(sequence_file)
    write_sequences(renamed, destination)
    return destination
Example #5
0
def extract_alleles( input_file, output_file=None, reference_file=None,
                                                   alignment_file=None,
                                                   method=METHOD,
                                                   sort=SORT,
                                                   loci=LOCI ):
    """Select the top consensus sequences per group and write them out.

    Alignments for ``input_file`` are grouped according to ``method``
    ('locus', 'barcode', 'both' or 'all'), the groups are ordered by the
    ``sort`` metric ('num_reads' or 'accuracy'), and the sequences picked by
    ``_select_sequences`` are written to ``output_file``.  A falsy ``method``
    or ``loci`` falls back to the module default, and a falsy ``output_file``
    is derived from ``input_file``.

    Raises ValueError for an unknown ``method`` or ``sort``.  Returns the
    path of the file that was written.
    """
    if not method:
        method = METHOD
    if not loci:
        loci = LOCI

    # Work out where the result goes and in which file type
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Locate (or produce) the alignment file and load all of its records
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    hits = list(BlasrReader(alignment_file))

    # Reject unknown grouping policies before dispatching
    if method not in ('locus', 'barcode', 'both', 'all'):
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)
    if method == 'locus':
        groups = _group_by_locus(hits, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(hits)
    elif method == 'both':
        groups = _group_by_both(hits, loci)
    else:
        # 'all': every alignment forms its own group, keyed by query name
        groups = dict((hit.qname, [hit]) for hit in hits)

    # Score every input sequence with the metric requested by `sort`
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        scorer = consensus_size
    elif sort == 'accuracy':
        assert get_file_type(input_file) == 'fastq'
        scorer = record_accuracy
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)
    sorting_data = dict((seq.name, scorer(seq)) for seq in sequences)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    selecting_msg = 'Selecting top sequences from %s according to the "%s" policy'
    log.info(selecting_msg % (input_file, method))
    selected = list(_select_sequences(ordered))
    selected_msg = 'Selected %s sequences from %s total for further analysis'
    log.info(selected_msg % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    chosen_records = list(_subset_sequences(sequences, selected))
    _write_output(chosen_records, output_file, output_type)
    return output_file
Example #6
0
def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences in input_file by `trim` bp and write them out.

    When ``trim`` is 0 and no ``output_file`` is given there is nothing to
    do, so ``input_file`` is returned untouched.  Otherwise the records are
    read, passed through ``_trim_sequences`` and written to ``output_file``
    (derived from ``input_file`` when falsy), whose path is returned.
    """
    # Nothing to trim and nowhere new to write: hand the input straight back
    nothing_to_do = trim == 0 and output_file is None
    if nothing_to_do:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Pick the destination and the file type to write it as
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load the records, then trim them
    records = read_sequences(input_file)
    trim_msg = "Trimming sequences by %s bp from each end"
    log.info(trim_msg % trim)
    clipped = _trim_sequences(records, trim)

    write_msg = "Writing the trimmed sequences out to %s"
    log.info(write_msg % output_file)
    _write_output(clipped, output_file, output_type)
    return output_file
Example #7
0
def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences in input_file by `trim` bp and write them out.

    When ``trim`` is 0 and no ``output_file`` is given there is nothing to
    do, so ``input_file`` is returned untouched.  Otherwise the records are
    read, passed through ``_trim_sequences`` and written to ``output_file``
    (derived from ``input_file`` when falsy), whose path is returned.
    """
    # Nothing to trim and nowhere new to write: hand the input straight back
    nothing_to_do = trim == 0 and output_file is None
    if nothing_to_do:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Pick the destination and the file type to write it as
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load the records, then trim them
    records = read_sequences(input_file)
    trim_msg = 'Trimming sequences by %s bp from each end'
    log.info(trim_msg % trim)
    clipped = _trim_sequences(records, trim)

    write_msg = 'Writing the trimmed sequences out to %s'
    log.info(write_msg % output_file)
    _write_output(clipped, output_file, output_type)
    return output_file