Beispiel #1
0
def sort_subreads(subread_file, reference_file):
    """
    Map every aligned subread to the target named in its alignment hit.

    The alignment is cached in 'temp2.m1' (a relative path): when
    valid_file() accepts that file it is reused, otherwise the subreads
    are first aligned to the references with align_best_reference().

    Returns a dict of query name -> target name. If a query name shows
    up more than once, the last hit read wins.
    """
    log.info("Aligning subreads to the two best references")
    cache_file = 'temp2.m1'
    # Only run the alignment when no usable cached result exists
    if not valid_file(cache_file):
        align_best_reference(subread_file, reference_file, cache_file)
    assignments = {}
    for record in BlasrReader(cache_file):
        assignments[record.qname] = record.tname
    return assignments
Beispiel #2
0
def parse_trims( blasr_file, window ):
    """
    Compute a padded query interval for every hit in a Blasr file.

    Each hit's query start/end is widened by `window` on both sides and
    clamped so it never drops below 0 or exceeds the query length.

    Returns a dict of query name -> (start, end); a later hit for the
    same query name replaces an earlier one.
    """
    padded = {}
    for hit in BlasrReader( blasr_file ):
        left_edge = int(hit.qstart) - window
        right_edge = int(hit.qend) + window
        query_length = int(hit.qlength)
        padded[hit.qname] = (max(left_edge, 0), min(right_edge, query_length))
    return padded
Beispiel #3
0
def create_chimeras(input_file,
                    output=None,
                    reference_file=None,
                    alignment_file=None):
    """
    Build chimeric sequences from a Fasta file and write them to a Fasta.

    The input sequences are grouped by locus using a Blasr alignment, the
    groups are filtered, and the resulting chimeras are written out.

    Args:
        input_file: Fasta file holding the source sequences.
        output: destination Fasta path; defaults to the input path with
            its extension replaced by '.chimeras.fasta'.
        reference_file: reference to align the input against when no
            alignment_file is supplied.
        alignment_file: existing Blasr alignment of input_file; when
            given it is used as-is and reference_file is ignored.

    Returns:
        The path of the Fasta file that was written.

    Raises:
        IOError: if neither reference_file nor alignment_file is given.
    """
    import os

    # Check the input files, and align the input file if needed
    if alignment_file is None:
        if reference_file:
            alignment_file = align_best_reference(input_file, reference_file)
        elif reference_file is None:
            # The message used to name extract_alleles (copy/paste slip)
            msg = "create_chimeras requires either an Alignment or a Reference!"
            log.error(msg)
            raise IOError(msg)
    # Set the output file if not specified
    if output is None:
        # splitext only strips a real extension of the final path
        # component, so extension-less names and dotted directory names
        # no longer produce a mangled basename
        basename = os.path.splitext(input_file)[0]
        output = '%s.chimeras.fasta' % basename
    # Parse the alignment data and extract the target sequences
    alignments = list(BlasrReader(alignment_file))
    groups = _group_by_locus(alignments)
    groups = _filter_groups(groups)
    sequences = list(FastaReader(input_file))
    chimeras = list(_create_chimeras(groups, sequences))
    write_fasta(chimeras, output)
    return output
Beispiel #4
0
def extract_alleles(input_file,
                    output_file=None,
                    reference_file=None,
                    alignment_file=None,
                    method=METHOD,
                    sort=SORT,
                    loci=LOCI):
    """
    Select the top consensus sequences per group and write them out.

    The input sequences are grouped via a Blasr alignment according to
    `method` ('locus', 'barcode', 'both' or 'all'), ranked inside each
    group by `sort` ('num_reads' or 'accuracy'), and the sequences picked
    by _select_sequences() are written to `output_file`.
    (Presumably the top two per group -- the count is decided by
    _select_sequences(), not here.)

    Returns the path of the output file.
    Raises ValueError for an unknown `method` or `sort` value; sorting by
    'accuracy' additionally asserts that the input is a fastq file.
    """
    # Fall back to the module defaults when a caller passes None/empty
    method = method or METHOD
    loci = loci or LOCI

    # Resolve the destination and its format up front
    output_file = output_file or _get_output_file(input_file)
    output_type = get_file_type(output_file)

    # Obtain the alignment to work from and load every hit
    alignment_file = get_alignment_file(input_file, reference_file,
                                        alignment_file)
    hits = list(BlasrReader(alignment_file))

    # Partition the hits according to the requested policy
    if method == 'locus':
        groups = _group_by_locus(hits, loci)
    elif method == 'barcode':
        groups = _group_by_barcode(hits)
    elif method == 'both':
        groups = _group_by_both(hits, loci)
    elif method == 'all':
        # Every query forms its own single-hit group
        groups = dict((hit.qname, [hit]) for hit in hits)
    else:
        msg = "Invalid Selection Metric: %s" % method
        log.error(msg)
        raise ValueError(msg)

    # Pick the per-sequence ranking function, then score every sequence
    sequences = read_sequences(input_file)
    if sort == 'num_reads':
        metric = consensus_size
    elif sort == 'accuracy':
        assert get_file_type(input_file) == 'fastq'
        metric = record_accuracy
    else:
        msg = "Invalid Sorting Metric: %s" % sort
        log.error(msg)
        raise ValueError(msg)
    sorting_data = {}
    for seq in sequences:
        sorting_data[seq.name] = metric(seq)

    log.info('Sorting sequences for selection according to "%s"' % sort)
    ordered = _sort_groups(groups, sorting_data)

    log.info('Selecting top sequences from %s according to the "%s" policy' %
             (input_file, method))
    selected = list(_select_sequences(ordered))
    log.info('Selected %s sequences from %s total for further analysis' %
             (len(selected), len(sequences)))

    log.info('Writing the selected sequences out to %s' % output_file)
    subset = list(_subset_sequences(sequences, selected))
    _write_output(subset, output_file, output_type)
    return output_file
Beispiel #5
0
def hits_by_reference(alignment):
    """
    Group the hits of a Blasr alignment file by target name.

    Returns a dict of target name -> list of hits, each list in the
    order the hits were read.
    """
    by_reference = {}
    for hit in BlasrReader(alignment):
        # setdefault replaces a bare `except:` that was only meant to
        # catch the first-seen KeyError but trapped every exception
        # (including KeyboardInterrupt) raised while appending
        by_reference.setdefault(hit.tname, []).append(hit)
    return by_reference
Beispiel #6
0
def _identify_reversed_sequences(blasr_file):
    """
    Identify hits where the query and reference have different orientations

    Returns the set of query names whose query strand differs from the
    target strand in at least one hit.
    """
    return {hit.qname
            for hit in BlasrReader(blasr_file)
            if hit.qstrand != hit.tstrand}
Beispiel #7
0
def _parse_exon_location(alignment_file):
    """
    Parse the most likely Exon location from an Exon-Fasta alignment

    The chosen hit is the one with the highest percent similarity; ties
    are broken by the lowest integer score. Returns its target
    (start, end) as ints. Raises IndexError if the file has no hits.
    """
    hits = list(BlasrReader(alignment_file))
    # Two stable passes: the secondary key (score) is sorted first so it
    # survives as the tie-breaker of the primary key (similarity)
    hits.sort(key=lambda hit: int(hit.score))
    hits.sort(key=lambda hit: float(hit.pctsimilarity), reverse=True)
    best = hits[0]
    return int(best.tstart), int(best.tend)
Beispiel #8
0
def order_references(subread_file, reference_file):
    """
    Rank the reference sequences by the number of subread hits they get

    The alignment is cached in 'temp.m1' (a relative path) and is only
    regenerated when valid_file() rejects the existing file.

    Returns all target names that received a hit, most-hit first.
    """
    log.info("Selecting the best references sequences to use")
    cache_file = 'temp.m1'
    if not valid_file(cache_file):
        align_best_reference(subread_file, reference_file, cache_file)
    tally = Counter(hit.tname for hit in BlasrReader(cache_file))
    ranked = tally.most_common()
    return [name for name, _count in ranked]
Beispiel #9
0
def _parse_blasr_alignment(blasr_file):
    """
    Summarize a Blasr alignment as {base query name: [target, identity]}

    For M1 records the identity is the record's pctsimilarity field,
    passed through unchanged; for M5 records it is computed as a float
    percentage from the match/mismatch/insertion/deletion counts.
    Records of any other type are skipped. A later hit for the same base
    name replaces an earlier one.
    """
    summary = {}
    for hit in BlasrReader(blasr_file):
        key = get_base_sequence_name(hit.qname)
        if isinstance(hit, BlasrM1):
            summary[key] = [hit.tname, hit.pctsimilarity]
            continue
        if not isinstance(hit, BlasrM5):
            continue
        # M5 carries raw counts rather than a ready-made percentage
        errors = int(hit.nmis) + int(hit.nins) + int(hit.ndel)
        aligned = float(int(hit.nmat) + errors)
        summary[key] = [hit.tname, 100 * int(hit.nmat) / aligned]
    return summary
Beispiel #10
0
def _parse_alignment(alignment):
    """
    Parse the genomic typings from the gDNA alignment

    Returns a dict of query name -> list of alignment records for that
    query, each list in the order the records were read.
    """
    hits = {}
    for record in BlasrReader(alignment):
        # setdefault replaces a bare `except:` that was only meant to
        # catch the first-seen KeyError but trapped every exception
        # (including KeyboardInterrupt) raised while appending
        hits.setdefault(record.qname, []).append(record)
    return hits
Beispiel #11
0
def parse_alignment_positions(alignment_file):
    """
    Build a (left, right) pair of position dicts for every hit.

    `left` describes the query from position 1 up to the hit's query
    start; `right` describes the target from the hit's target start up
    to the target length. Each dict has 'name', 'start' and 'end' keys.

    Returns the pairs as a list, in the order the hits were read.
    """
    return [
        ({'name': hit.qname, 'start': 1, 'end': int(hit.qstart)},
         {'name': hit.tname,
          'start': int(hit.tstart),
          'end': int(hit.tlength)})
        for hit in BlasrReader(alignment_file)
    ]
Beispiel #12
0
def _align_sequences(query, reference):
    """
    Align one fasta file of sequences to another

    The alignment is written to a temporary '.m1' file which is always
    removed before returning, even if aligning or parsing fails.

    Returns the list of hits, or None when valid_file() rejects the
    alignment output.
    """
    temp = NamedTemporaryFile(suffix='.m1', delete=False)
    # Only the reserved name is needed; close our handle right away so
    # the descriptor is not leaked (delete=False keeps the file on disk)
    temp.close()
    try:
        align_best_reference(query, reference, output=temp.name)
        if valid_file(temp.name):
            return list(BlasrReader(temp.name))
        return None
    finally:
        # Clean up on every exit path, including exceptions
        os.unlink(temp.name)
Beispiel #13
0
def _parse_alignment(alignment):
    """
    Parse the location of each hit in the alignment file

    Returns a dict of query name -> (start, end, target name). When the
    target strand field is '1' the coordinates are mirrored against the
    target length (presumably to express reverse-strand hits in
    forward coordinates -- confirm against the Blasr format); otherwise
    the target start/end are used directly.
    """
    locations = {}
    for hit in BlasrReader(alignment):
        if hit.tstrand != '1':
            begin, finish = int(hit.tstart), int(hit.tend)
        else:
            target_length = int(hit.tlength)
            begin = target_length - int(hit.tend)
            finish = target_length - int(hit.tstart)
        locations[hit.qname] = (begin, finish, hit.tname)
    return locations
Beispiel #14
0
def parse_alignment_positions(alignment_file):
    """
    Flatten every hit of an alignment file into a plain dict.

    Each dict holds the query name ('name'), target name ('ref'), the
    integer query/target start and end coordinates, and the aligned
    query and target strings ('qstring', 'tstring').

    Returns the dicts as a list, in the order the hits were read.
    """
    return [
        dict(name=hit.qname,
             ref=hit.tname,
             qstart=int(hit.qstart),
             qend=int(hit.qend),
             tstart=int(hit.tstart),
             tend=int(hit.tend),
             qstring=hit.qstring,
             tstring=hit.tstring)
        for hit in BlasrReader(alignment_file)
    ]
Beispiel #15
0
def _parse_orientation(filename):
    """
    Parse the orientations of a list of sequences from a Blasr alignment file

    Returns a dict of query name -> 'forward' (query and target strands
    match) or 'reverse' (they differ).
    Raises ValueError if a query name occurs more than once.
    """
    orientations = {}
    for hit in BlasrReader(filename):
        name = hit.qname
        if name in orientations:
            msg = 'Duplicate record name! (%s)' % name
            log.error(msg)
            raise ValueError(msg)
        same_strand = hit.qstrand == hit.tstrand
        orientations[name] = 'forward' if same_strand else 'reverse'
    return orientations
Beispiel #16
0
def create_m1_reference(m1_file, reference=None):
    """
    Map each query's base sequence name to the target it hit.

    When a non-empty `reference` mapping is supplied, the target's base
    name is translated through it (a missing key raises KeyError);
    otherwise the target's base name itself is stored.

    Returns a dict keyed by base query name.
    Raises KeyError if two hits share the same base query name.
    """
    log.info('Parsing Blasr M1 results from "{0}"'.format(m1_file))
    mapping = {}
    for hit in BlasrReader(m1_file):
        query_id = get_base_sequence_name(hit.qname)
        target_id = get_base_sequence_name(hit.tname)
        if query_id in mapping:
            msg = 'Duplicate sequence ids found! "{0}"'.format(query_id)
            log.info(msg)
            raise KeyError(msg)
        mapping[query_id] = reference[target_id] if reference else target_id
    log.info('Finished reading Blasr results')
    return mapping
Beispiel #17
0
def create_m5_reference(m5_file):
    """
    Map each query's base sequence name to its closest target.

    "Closest" is the hit with the fewest differences (mismatches +
    insertions + deletions); on a tie the hit read first is kept.

    Returns a dict of base query name -> base target name.
    """
    log.info('Parsing Blasr M5 results from "{0}"'.format(m5_file))
    best_target = {}
    fewest_diffs = {}
    for hit in BlasrReader(m5_file):
        query_id = get_base_sequence_name(hit.qname)
        target_id = get_base_sequence_name(hit.tname)
        n_diffs = int(hit.nmis) + int(hit.nins) + int(hit.ndel)
        # Record the hit if it is the first for this query or strictly
        # better than the best one seen so far
        if query_id not in fewest_diffs or fewest_diffs[query_id] > n_diffs:
            best_target[query_id] = target_id
            fewest_diffs[query_id] = n_diffs
    log.info('Finished reading Blasr results')
    return best_target
Beispiel #18
0
def _parse_alignment(alignment):
    """
    Parse the location of each hit in the alignment file

    Query names are truncated to their first three '/'-separated fields
    before being used as keys. When the target strand field is '1' the
    coordinates are mirrored against the target length; otherwise the
    target start/end are used directly.

    Returns a dict of (truncated) query name -> (start, end).
    """
    log.info("Parsing subread locations from alignment data")
    locations = {}
    for hit in BlasrReader(alignment):
        # Splitting a name that has no '/' yields the name itself, so
        # no separate branch is needed for slash-free names
        key = '/'.join(hit.qname.split('/')[:3])
        if hit.tstrand != '1':
            span = (int(hit.tstart), int(hit.tend))
        else:
            target_length = int(hit.tlength)
            span = (target_length - int(hit.tend),
                    target_length - int(hit.tstart))
        locations[key] = span
    return locations
Beispiel #19
0
def _align_fasta(query, reference, format):
    """
    Align a single query sequence to all valid references

    Blasr is asked for as many hits/candidates as there are sequences in
    `reference`, writing '-m <format>' output to a temporary file. The
    temporary file is always removed before returning, even if the
    alignment or the parsing fails.

    Returns the list of alignment records read from the Blasr output.
    """
    suffix = '.m%s' % format
    temp_align = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
    # Only the reserved name is needed; close our handle right away so
    # the descriptor is not leaked (delete=False keeps the file on disk)
    temp_align.close()
    try:
        reference_count = fasta_size(reference)
        blasr_args = {
            'nproc': NPROC,
            'out': temp_align.name,
            'bestn': reference_count,
            'nCandidates': reference_count,
            'm': format,
            'noSplitSubreads': True
        }
        run_blasr(query, reference, blasr_args)
        # Parse the output for return
        return list(BlasrReader(temp_align.name))
    finally:
        # Delete the file on every exit path, including exceptions
        os.unlink(temp_align.name)
Beispiel #20
0
def filter_m5_file(m5_file, filtered_file):
    """
    Filter an M5 alignment file to contain only the alignments with the fewest diffs

    For every query name the hit with the fewest differences
    (mismatches + insertions + deletions) is kept; on a tie the hit read
    first wins. The kept records are written to `filtered_file`, one per
    line, formatted by record_to_string().
    """
    log.info('Filtering Blasr M5 results from "{0}"'.format(m5_file))
    selected = {}
    diffs = {}
    count = 0
    for record in BlasrReader(m5_file):
        count += 1
        diff_count = int(record.nmis) + int(record.nins) + int(record.ndel)
        # Keep the first hit for a query, or any strictly better one
        if record.qname not in diffs or diffs[record.qname] > diff_count:
            selected[record.qname] = record
            diffs[record.qname] = diff_count
    # The arguments were previously swapped, reporting the total as the
    # number selected and vice versa
    log.info('Selected %s records from %s alignments' % (len(selected), count))
    with open(filtered_file, 'w') as output:
        # values() works on both Python 2 and 3, unlike itervalues()
        for record in selected.values():
            output.write('%s\n' % record_to_string(record))
    log.info('Finished filtering Blasr results')
Beispiel #21
0
def _parse_loci( blasr_file ):
    """
    Parse the likely locus of sequences from a Blasr file

    The locus is derived from the part of the target name before the
    first '*': its last character when that part starts with 'HLA-',
    otherwise its second '_'-separated field.

    Returns a dict of query name -> locus.
    Raises ValueError if a query name occurs more than once.
    """
    loci = {}
    for hit in BlasrReader( blasr_file ):
        # Skip records whose target is the literal 'tname'
        # (presumably a header line -- confirm against the file format)
        if hit.tname == 'tname':
            continue
        # Parse the locus from either Tokai or IMGT references
        prefix = hit.tname.split('*')[0]
        if prefix.startswith('HLA-'):
            locus = prefix[-1]
        else:
            locus = prefix.split('_')[1]
        # Save the Locus/Sequence pair unless duplicate
        if entry_is_duplicate(loci, hit.qname):
            msg = 'Duplicate sequence name found "%s"!' % hit.qname
            log.error( msg )
            raise ValueError( msg )
        loci[hit.qname] = locus
    return loci


def entry_is_duplicate(calls, name):
    """Return True when `name` already has a locus call recorded."""
    return name in calls
Beispiel #22
0
def count_hits(filename):
    """
    Return the number of records BlasrReader yields for `filename`.
    """
    total = 0
    for _hit in BlasrReader(filename):
        total += 1
    return total
Beispiel #23
0
def format_blasr_file(input_file, output_file):
    """
    Rewrite a Blasr file through BlasrWriter.

    A header for the reader's detected file type is written first,
    followed by every record from `input_file` in the order read.
    """
    # The writer is opened before the reader, and both are closed on exit
    with BlasrWriter(output_file) as writer, BlasrReader(input_file) as reader:
        writer.write_header(reader.filetype)
        for record in reader:
            writer.write(record)