Example #1
0
def orient_sequences(input_file,
                     reference_file=None,
                     alignment_file=None,
                     output_file=None):
    """
    Rewrite a sequence file so its records match the direction of their reference.

    The output path defaults to ``_get_output_file(input_file)`` when
    ``output_file`` is falsy.  If ``valid_file`` reports that the output
    already exists, nothing is recomputed.  Otherwise an alignment is obtained
    through ``get_alignment_file``, the reversed sequences are identified from
    it, and the records returned by ``_reverse_records`` are written out.

    Returns the path of the output file in both cases.
    """
    log.info(
        "Reorienting all sequences in %s to the direction of their reference" %
        input_file)

    # Resolve the destination and its format before doing any real work
    if not output_file:
        output_file = _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    if not valid_file(output_file):
        # Obtain the alignment (presumably aligning the input if none was given)
        alignment_file = get_alignment_file(input_file, reference_file,
                                            alignment_file)
        to_flip = _identify_reversed_sequences(alignment_file)
        log.info("Identified %s sequences needing Reverse Complementation" %
                 len(to_flip))
        records = _parse_input_records(input_file)
        oriented = _reverse_records(records, to_flip)
        log.info("Writing out sequences to %s" % output_file)
        _write_output(oriented, output_file, output_type)
    else:
        log.info("Found existing output file %s, skipping orientation step" %
                 output_file)
    return output_file
Example #2
0
def extract_alleles(input_file,
                    output_file=None,
                    reference_file=None,
                    alignment_file=None,
                    method=METHOD,
                    sort=SORT,
                    loci=LOCI):
    """Select the top Amplicon Analysis consensus sequences per group.

    Alignments of the input are grouped according to ``method``, each group
    is ordered by the ``sort`` metric, and the sequences picked by
    ``_select_sequences`` are written to the output file.

    Args:
        input_file: Sequence file to select from.  Must be a fastq file when
            ``sort`` is ``'accuracy'``.
        output_file: Destination path; defaults to
            ``_get_output_file(input_file)`` when falsy.
        reference_file: Passed through to ``get_alignment_file``.
        alignment_file: Passed through to ``get_alignment_file``.
        method: Grouping policy, one of ``'locus'``, ``'barcode'``,
            ``'both'`` or ``'all'``.  Falsy values fall back to ``METHOD``.
        sort: Sorting metric, ``'num_reads'`` or ``'accuracy'``.  Falsy
            values fall back to ``SORT``.
        loci: Loci handed to the locus-based groupings.  Falsy values fall
            back to ``LOCI``.

    Returns:
        The path of the output file.

    Raises:
        ValueError: If ``method`` or ``sort`` is not a recognised option, or
            if ``sort`` is ``'accuracy'`` and the input is not a fastq file.
    """
    # Callers may pass None/"" explicitly (e.g. unset CLI options); treat all
    # three tunables the same way and fall back to the module defaults.
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Obtain alignments to the reference; they drive the grouping below
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    alignments = list(BlasrReader(alignment_file))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus(alignments, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(alignments)
    elif method == 'both':
        groups = _group_by_both(alignments, loci)
    elif method == 'all':
        # Every aligned query forms its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Explicit check instead of `assert`, which is removed under
        # `python -O` and would let non-fastq input slip through.
        input_type = get_file_type(input_file)
        if input_type != 'fastq':
            msg = ('Sorting by "accuracy" requires a fastq input file, '
                   'but %s has type "%s"' % (input_file, input_type))
            log.error(msg)
            raise ValueError(msg)
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' %
             (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' %
             (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
Example #3
0
def extract_alleles( input_file, output_file=None, reference_file=None,
                                                   alignment_file=None,
                                                   method=METHOD,
                                                   sort=SORT,
                                                   loci=LOCI ):
    """Select the top Amplicon Analysis consensus sequences per group.

    Alignments of the input are grouped according to ``method``, each group
    is ordered by the ``sort`` metric, and the sequences picked by
    ``_select_sequences`` are written to the output file.

    Args:
        input_file: Sequence file to select from.  Must be a fastq file when
            ``sort`` is ``'accuracy'``.
        output_file: Destination path; defaults to
            ``_get_output_file( input_file )`` when falsy.
        reference_file: Passed through to ``get_alignment_file``.
        alignment_file: Passed through to ``get_alignment_file``.
        method: Grouping policy, one of ``'locus'``, ``'barcode'``,
            ``'both'`` or ``'all'``.  Falsy values fall back to ``METHOD``.
        sort: Sorting metric, ``'num_reads'`` or ``'accuracy'``.  Falsy
            values fall back to ``SORT``.
        loci: Loci handed to the locus-based groupings.  Falsy values fall
            back to ``LOCI``.

    Returns:
        The path of the output file.

    Raises:
        ValueError: If ``method`` or ``sort`` is not a recognised option, or
            if ``sort`` is ``'accuracy'`` and the input is not a fastq file.
    """
    # Callers may pass None/"" explicitly (e.g. unset CLI options); treat all
    # three tunables the same way and fall back to the module defaults.
    method = method or METHOD
    sort = sort or SORT
    loci = loci or LOCI

    # Set the output file if not specified
    output_file = output_file or _get_output_file( input_file )
    output_type = get_file_type( output_file )

    # Obtain alignments to the reference; they drive the grouping below
    alignment_file = get_alignment_file( input_file, reference_file, alignment_file )
    alignments = list( BlasrReader( alignment_file ))

    # Run the appropriate grouping
    if method == 'locus':
        groups = _group_by_locus( alignments, loci )
    elif method == 'barcode':
        groups = _group_by_barcode( alignments )
    elif method == 'both':
        groups = _group_by_both( alignments, loci )
    elif method == 'all':
        # Every aligned query forms its own single-member group
        groups = {a.qname: [a] for a in alignments}
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error( msg )
        raise ValueError( msg )

    # Read the input sequences and use them to generate our sorting data
    sequences = read_sequences( input_file )
    if sort == 'num_reads':
        sorting_data = {s.name: consensus_size(s) for s in sequences}
    elif sort == 'accuracy':
        # Explicit check instead of `assert`, which is removed under
        # `python -O` and would let non-fastq input slip through.
        input_type = get_file_type( input_file )
        if input_type != 'fastq':
            msg = ('Sorting by "accuracy" requires a fastq input file, '
                   'but %s has type "%s"' % (input_file, input_type))
            log.error( msg )
            raise ValueError( msg )
        sorting_data = {s.name: record_accuracy(s) for s in sequences}
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error( msg )
        raise ValueError( msg )

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups( groups, sorting_data )

    log.info('Selecting top sequences from %s according to the "%s" policy' % (input_file, method))
    selected = list( _select_sequences( ordered ))
    log.info('Selected %s sequences from %s total for further analysis' % (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list( _subset_sequences( sequences, selected ))
    _write_output( subset, output_file, output_type )
    return output_file
Example #4
0
def cdna_from_file(input_file, hmm_fofn, output=None, reference=None, alignment=None):
    """
    Build cDNA records for the sequences in a Fasta/Fastq file and write them out.

    The alignment is obtained through ``get_alignment_file`` using the given
    ``reference`` and ``alignment``.  When ``output`` is falsy the destination
    defaults to ``get_output_file(input_file, 'cDNA')``.  ``hmm_fofn`` is
    parsed with ``parse_locus_dict``.

    Returns the path the records were written to.
    """
    # Resolve the alignment first (presumably aligning the input if needed)
    aligned = get_alignment_file(input_file, reference, alignment)
    if output:
        destination = output
    else:
        destination = get_output_file(input_file, 'cDNA')

    # Gather the three inputs required to compose the cDNA records
    source_records = _parse_input_records(input_file)
    locus_hmms = parse_locus_dict(hmm_fofn)
    locus_calls = _parse_loci(aligned)

    # Materialise the records before handing them to the writer
    write_records(
        list(cdna_from_records(source_records, locus_calls, locus_hmms)),
        destination)
    return destination
Example #5
0
def orient_sequences(input_file,
                     reference_file=None,
                     alignment_file=None,
                     output_file=None):
    """
    Rewrite a sequence file so its records match the direction of their reference.

    The output path defaults to ``_get_output_file(input_file)`` when
    ``output_file`` is falsy.  An output that ``valid_file`` already accepts
    is reused as-is; otherwise the reversed sequences are identified from the
    alignment and the records returned by ``_reverse_records`` are written.

    Returns the path of the output file.
    """
    start_msg = "Reorienting all sequences in %s to the direction of their reference" % input_file
    log.info(start_msg)

    # Decide where the result goes and in which format
    output_file = output_file if output_file else _get_output_file(input_file)
    output_type = _get_output_type(output_file)

    # Nothing to do when a usable output is already present
    if valid_file(output_file):
        skip_msg = "Found existing output file %s, skipping orientation step" % output_file
        log.info(skip_msg)
        return output_file

    # Obtain the alignment (presumably aligning the input if none was given)
    alignment_file = get_alignment_file(input_file, reference_file, alignment_file)
    flipped_names = _identify_reversed_sequences(alignment_file)
    count_msg = "Identified %s sequences needing Reverse Complementation" % len(flipped_names)
    log.info(count_msg)

    original_records = _parse_input_records(input_file)
    oriented_records = _reverse_records(original_records, flipped_names)

    write_msg = "Writing out sequences to %s" % output_file
    log.info(write_msg)
    _write_output(oriented_records, output_file, output_type)
    return output_file
Example #6
0
def cdna_from_file(input_file, hmm_fofn, output=None, reference=None,
                   alignment=None):
    """
    Build cDNA records for the sequences in a Fasta/Fastq file and write them out.

    ``reference`` and ``alignment`` are forwarded to ``get_alignment_file``;
    ``hmm_fofn`` is parsed with ``parse_locus_dict``.  A falsy ``output``
    selects the default path ``get_output_file(input_file, 'cDNA')``.

    Returns the path the records were written to.
    """
    # Resolve the alignment first (presumably aligning the input if needed)
    alignment_path = get_alignment_file(input_file, reference, alignment)
    target_path = output if output else get_output_file(input_file, 'cDNA')

    # Load the records plus the per-locus data used to compose the cDNA
    parsed_records = _parse_input_records(input_file)
    hmm_by_locus = parse_locus_dict(hmm_fofn)
    record_loci = _parse_loci(alignment_path)

    # Build the full list of cDNA records, then write it in one go
    generated = cdna_from_records(parsed_records, record_loci, hmm_by_locus)
    finished_records = list(generated)
    write_records(finished_records, target_path)
    return target_path