Beispiel #1
0
def extract_alleles(input_file,
                    output_file=None,
                    reference_file=None,
                    alignment_file=None,
                    method=METHOD,
                    sort=SORT,
                    loci=LOCI):
    """Select sequences from a sequence file and write them to a new file.

    An alignment file is obtained through ``get_alignment_file`` and its
    records are grouped according to ``method``.  The groups are then ordered
    with per-sequence sorting data derived from ``sort``, the top sequences
    are picked by ``_select_sequences`` and the matching input records are
    written to ``output_file``.

    Args:
        input_file: Path of the sequence file to select from.
        output_file: Destination path.  When falsy it is derived from
            ``input_file`` with ``_get_output_file``.
        reference_file: Forwarded to ``get_alignment_file``.
        alignment_file: Forwarded to ``get_alignment_file``.
        method: Grouping policy: 'locus', 'barcode', 'both' or 'all'.
            A falsy value falls back to ``METHOD``.
        sort: Sorting metric: 'num_reads' or 'accuracy'.  A falsy value
            falls back to ``SORT``.
        loci: Forwarded to the locus-aware grouping helpers.  A falsy value
            falls back to ``LOCI``.

    Returns:
        The path the selected sequences were written to.

    Raises:
        ValueError: If ``method`` or ``sort`` is not a supported value, or
            if ``sort`` is 'accuracy' and the input file is not Fastq.
    """
    # Treat None/empty arguments the same as "not supplied"
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Load every alignment record up front; the grouping helpers need a list
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        # Every query sequence becomes its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Explicit check rather than `assert`, which is removed under -O
        if get_file_type(input_file) != 'fastq':
            msg = 'Sorting by "accuracy" requires Fastq input: %s' % input_file
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' %
             (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' %
             (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
Beispiel #2
0
def extract_alleles(input_file, output_file=None, reference_file=None,
                    alignment_file=None,
                    method=METHOD,
                    sort=SORT,
                    loci=LOCI):
    """Group, rank and select sequences, writing the selection to a file.

    Alignment records read from the file returned by ``get_alignment_file``
    are grouped by ``method``; each group is ordered using a per-sequence
    value chosen by ``sort``; ``_select_sequences`` picks from the ordered
    groups and the corresponding input records are written out.

    Args:
        input_file: Path of the sequence file to select from.
        output_file: Destination path; derived from ``input_file`` via
            ``_get_output_file`` when falsy.
        reference_file: Passed through to ``get_alignment_file``.
        alignment_file: Passed through to ``get_alignment_file``.
        method: One of 'locus', 'barcode', 'both' or 'all'; a falsy value
            means ``METHOD``.
        sort: One of 'num_reads' or 'accuracy'; a falsy value means
            ``SORT``.
        loci: Passed to the locus-based grouping helpers; a falsy value
            means ``LOCI``.

    Returns:
        The output file path.

    Raises:
        ValueError: For an unsupported ``method`` or ``sort``, or when
            'accuracy' sorting is requested for a non-Fastq input file.
    """
    # None/empty arguments fall back to the module-level defaults
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Materialise the alignment records so they can be grouped below
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        # One group per query name, each holding just that alignment
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Validate explicitly: an `assert` here would vanish under -O
        if get_file_type(input_file) != 'fastq':
            msg = 'Sorting by "accuracy" requires Fastq input: %s' % input_file
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy'
             % (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis'
             % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
Beispiel #3
0
def get_output_file(input_file, modifier):
    """
    Build an output file name by inserting a modifier before the file type

    The part of ``input_file`` before its last '.' is kept as the stem and
    the result has the form '<stem>.<modifier>.<type>', where <type> is
    whatever get_file_type() reports for the input file.
    """
    # Everything before the final '.'; empty when the name contains no '.'
    stem, _, _ = input_file.rpartition('.')
    suffix = get_file_type(input_file)
    return '%s.%s.%s' % (stem, modifier, suffix)
Beispiel #4
0
def _get_output_file(input_file):
    """
    Derive the default output file name from the input file name

    Returns '<stem>.oriented.<type>', where <stem> is the input name up to
    its last '.' and <type> is the value of get_file_type() for the input.
    """
    # Drop the final dot-separated component (the extension)
    components = input_file.split('.')
    components.pop()
    stem = '.'.join(components)
    file_type = get_file_type(input_file)
    return '%s.oriented.%s' % (stem, file_type)
Beispiel #5
0
def get_output_file(input_file, modifier):
    """
    Return the input file name with a modifier placed before its file type

    The text after the last '.' of ``input_file`` is dropped and replaced by
    '<modifier>.<type>', with <type> taken from get_file_type().
    """
    # Strip the last dot-separated component to get the stem
    pieces = input_file.split('.')
    del pieces[-1]
    name_parts = ('.'.join(pieces), modifier, get_file_type(input_file))
    return '%s.%s.%s' % name_parts
Beispiel #6
0
def _get_output_file(input_file):
    """
    Build the default '.oriented' output file name for an input file

    The input name is cut at its last '.', and 'oriented' plus the file
    type reported by get_file_type() are appended to what remains.
    """
    # rpartition yields an empty stem when the name contains no '.'
    stem = input_file.rpartition('.')[0]
    return '%s.oriented.%s' % (stem, get_file_type(input_file))
Beispiel #7
0
def _get_output_type(output_file):
    """
    Return the file type of the output file, rejecting unsupported formats

    Raises a TypeError (after logging the message) unless get_file_type()
    reports 'fasta' or 'fastq' for the file.
    """
    file_type = get_file_type(output_file)
    # Guard clause: anything other than the two supported types is an error
    if file_type not in ('fasta', 'fastq'):
        msg = "Output file must be either Fasta or Fastq format"
        log.error(msg)
        raise TypeError(msg)
    return file_type
Beispiel #8
0
def _get_output_type(output_file):
    """
    Look up the output file type and make sure it is Fasta or Fastq

    The type comes from get_file_type(); any other value is logged as an
    error and reported with a TypeError.
    """
    detected = get_file_type(output_file)
    supported = detected == 'fasta' or detected == 'fastq'
    if supported:
        return detected
    msg = "Output file must be either Fasta or Fastq format"
    log.error(msg)
    raise TypeError(msg)
Beispiel #9
0
def _parse_input_records( input_file ):
    """
    Parse the input sequence records with the reader matching the file type

    Returns a list of the records produced by FastaReader for 'fasta' input
    or by FastqReader for 'fastq' input, as decided by get_file_type().

    Raises a TypeError (after logging the message) for any other file type.
    """
    input_type = get_file_type( input_file )
    if input_type == 'fasta':
        return list( FastaReader( input_file ))
    if input_type == 'fastq':
        return list( FastqReader( input_file ))
    msg = 'Input file must be either Fasta or Fastq'
    log.error( msg )
    # Fail loudly instead of falling off the end and returning None, which
    # would only surface later as an unrelated error in the caller
    raise TypeError( msg )
Beispiel #10
0
def exons_to_cDNA(exon_file):
    """
    Merge the exon records of a sequence file into a single mock cDNA

    The file type of ``exon_file`` selects both the output file name and how
    the records are parsed.  The records are sorted, combined and written
    out; nothing is written when no records were parsed.
    """
    file_type = get_file_type(exon_file)
    cDNA_file = _get_output_file(exon_file, file_type)
    exons = _parse_exon_records(exon_file, file_type)
    exon_count = len(exons)
    log.info("Combinging %s exons sequences to cDNA" % exon_count)
    # No exons means there is nothing to combine or write
    if exon_count == 0:
        return
    ordered_exons = _sort_records(exons)
    combined = _combine_records(ordered_exons)
    log.info("Writing cDNA sequence out to %s" % cDNA_file)
    write_sequences(combined, cDNA_file)
Beispiel #11
0
def exons_to_cDNA(exon_file):
    """
    Build a mock cDNA from the exon sequences found in one file

    Exon records are parsed according to the file type of ``exon_file``,
    then sorted and combined into one record that is written to the output
    file.  If the file yields no records, only the first log line is
    emitted and no output is produced.
    """
    seq_type = get_file_type(exon_file)
    destination = _get_output_file(exon_file, seq_type)
    exon_records = _parse_exon_records(exon_file, seq_type)
    log.info("Combinging %s exons sequences to cDNA" % len(exon_records))
    if len(exon_records) > 0:
        # Sort first so the exons are combined in order
        cDNA = _combine_records(_sort_records(exon_records))
        log.info("Writing cDNA sequence out to %s" % destination)
        write_sequences(cDNA, destination)
Beispiel #12
0
def extract_alleles(input_file, min_reads, min_length, output_file=None):
    """
    Filter the sequences of an input file and write the survivors out

    The parsed records are passed through _filter_on_length() with
    ``min_length`` and then _filter_on_numreads() with ``min_reads``.  The
    result is written to ``output_file``, which defaults to the name given
    by _get_output_file(), and that file name is returned.
    """
    # Fall back to a name derived from the input when none was supplied
    if not output_file:
        output_file = _get_output_file(input_file)
    file_type = get_file_type(output_file)

    # Parse, then apply the length filter before the read-count filter
    parsed = _parse_input_records(input_file)
    long_enough = _filter_on_length(parsed, min_length)
    kept = _filter_on_numreads(long_enough, min_reads)

    _write_output(kept, output_file, file_type)
    return output_file
Beispiel #13
0
def extract_exons(input_record, exon_fofn, directory=None):
    """
    Extract exons from a sequence record, given as a file name or a record

    ``input_record`` may be a file name (str), a FastaRecord or a
    FastqRecord.  A file name is read with _read_fasta_record() and its
    output type taken from get_file_type(); record objects map to 'fasta'
    or 'fastq' directly.  The work itself is delegated to _extract_exons(),
    whose result is returned.

    Raises a TypeError (after logging) for any other kind of input.
    """
    # File name: look up the type, then load the record it contains
    if isinstance(input_record, str):
        file_type = get_file_type(input_record)
        record = _read_fasta_record(input_record)
        return _extract_exons(record, exon_fofn, file_type, directory)

    # Record objects carry their type implicitly
    if isinstance(input_record, FastaRecord):
        return _extract_exons(input_record, exon_fofn, 'fasta', directory)
    if isinstance(input_record, FastqRecord):
        return _extract_exons(input_record, exon_fofn, 'fastq', directory)

    msg = 'Input record must be Filename, FastaRecord or FastqRecord'
    log.error(msg)
    raise TypeError(msg)
Beispiel #14
0
def extract_exons(input_record, exon_fofn, directory=None):
    """
    Run exon extraction on a file name, FastaRecord or FastqRecord

    For a file name the output type comes from get_file_type() and the
    record is loaded with _read_fasta_record(); for record objects the
    output type is 'fasta' or 'fastq' according to the record class.  The
    record, ``exon_fofn``, output type and ``directory`` are then handed to
    _extract_exons() and its result is returned.

    Any other input is logged as an error and rejected with a TypeError.
    """
    given_filename = isinstance(input_record, str)

    # Reject unsupported inputs before doing any work
    if not given_filename and not isinstance(input_record,
                                             (FastaRecord, FastqRecord)):
        msg = 'Input record must be Filename, FastaRecord or FastqRecord'
        log.error(msg)
        raise TypeError(msg)

    if given_filename:
        seq_type = get_file_type(input_record)
        input_record = _read_fasta_record(input_record)
    else:
        # FastaRecord is tested first, as in the class checks above
        seq_type = 'fasta' if isinstance(input_record, FastaRecord) else 'fastq'

    return _extract_exons(input_record, exon_fofn, seq_type, directory)
Beispiel #15
0
def trim_alleles(input_file, output_file=None, trim=0):
    """Trim the sequences of an input file and write them to a new file.

    When ``trim`` is 0 and no ``output_file`` is given the step is skipped
    and ``input_file`` is returned unchanged.  Otherwise the sequences are
    passed to _trim_sequences() with ``trim`` and written to ``output_file``
    (default: the name from _get_output_file()), which is then returned.
    """
    # Nothing to trim and nowhere specific to write: hand the input back
    nothing_to_do = trim == 0 and output_file is None
    if nothing_to_do:
        log.info('No trimming necessary for "%s", skipping...' % input_file)
        return input_file

    # Fall back to a name derived from the input file
    if not output_file:
        output_file = _get_output_file(input_file)
    file_type = get_file_type(output_file)

    # Load the records, then trim them
    records = read_sequences(input_file)
    log.info("Trimming sequences by %s bp from each end" % trim)
    trimmed_records = _trim_sequences(records, trim)

    log.info("Writing the trimmed sequences out to %s" % output_file)
    _write_output(trimmed_records, output_file, file_type)
    return output_file
Beispiel #16
0
def trim_alleles(input_file, output_file=None, trim=0):
    """Write a trimmed copy of the sequences in ``input_file``.

    The call is a no-op returning ``input_file`` when ``trim`` is 0 and
    ``output_file`` is None.  In every other case the sequences are read,
    run through _trim_sequences() with ``trim`` and written to
    ``output_file``, whose name is derived with _get_output_file() when it
    was not supplied.  The output file name is returned.
    """
    needs_work = not (trim == 0 and output_file is None)
    if needs_work:
        # Resolve the destination and its type before touching the input
        destination = output_file or _get_output_file(input_file)
        destination_type = get_file_type(destination)

        # Read the input sequences and trim each record
        original = read_sequences(input_file)
        log.info('Trimming sequences by %s bp from each end' % trim)
        shortened = _trim_sequences(original, trim)

        log.info('Writing the trimmed sequences out to %s' % destination)
        _write_output(shortened, destination, destination_type)
        return destination

    log.info('No trimming necessary for "%s", skipping...' % input_file)
    return input_file
Beispiel #17
0
def _get_output_file(input_file):
    """
    Derive the default '.filtered' output file name from the input name

    Returns '<stem>.filtered.<type>': <stem> is the input name up to its
    last '.', <type> is the value of get_file_type() for the input file.
    """
    # The stem is empty when the input name contains no '.'
    stem, _, _ = input_file.rpartition('.')
    return '%s.filtered.%s' % (stem, get_file_type(input_file))
Beispiel #18
0
def _get_output_file(input_file):
    """
    Derive the default '.selected' output file name from the input name

    The last dot-separated component of ``input_file`` is removed and
    'selected' plus the type reported by get_file_type() are appended.
    """
    # Remove the final dot-separated component (the extension)
    parts = input_file.split('.')
    del parts[-1]
    stem = '.'.join(parts)
    selected_type = get_file_type(input_file)
    return '%s.selected.%s' % (stem, selected_type)
Beispiel #19
0
def _get_output_file(input_file):
    """
    Derive the default '.trimmed' output file name from the input name

    Returns '<stem>.trimmed.<type>', where <stem> is ``input_file`` up to
    its last '.' and <type> is what get_file_type() reports for it.
    """
    # Keep only the text before the last '.'
    stem = input_file.rpartition(".")[0]
    trimmed_type = get_file_type(input_file)
    return "%s.trimmed.%s" % (stem, trimmed_type)