Example #1
 def __init__(self, batchfile=None, num_cpus=1):
     self._batchfile = batchfile
     self._execstr = ''
     self._instance = hp._THyPhy(join(abspath(hp.__file__), 'res'), num_cpus)
     self._stdout = ''
     self._stderr = ''
     self._warnings = ''
Example #2
    def __init__(self, cwd="", nthreads=1, dsid="my_ds", trid="my_tree", lfid="my_lf"):
        self.__instance = HyPhy._THyPhy(cwd, nthreads)
        self.version = self._get_version()

        self.dsid = dsid
        self.dsfid = self.dsid + "f"
        self.trid = trid
        self.lfid = lfid
Example #3
 def __init__(self, batchfile=None, num_cpus=1):
     self._batchfile = batchfile
     self._execstr = ''
     self._instance = hp._THyPhy(join(abspath(hp.__file__), 'res'),
                                 num_cpus)
     self._stdout = ''
     self._stderr = ''
     self._warnings = ''
Example #4
    def __init__(self,
                 cwd='',
                 nthreads=1,
                 dsid='my_ds',
                 trid='my_tree',
                 lfid='my_lf'):
        self.__instance = HyPhy._THyPhy(cwd, nthreads)
        self.version = self._get_version()

        self.dsid = dsid
        self.dsfid = self.dsid + 'f'
        self.trid = trid
        self.lfid = lfid
Example #5
    def __init__(self, cwd=os.getcwd(), nthreads=1, alphabet='ACGT',
                 gap_open=20, gap_open2=20, gap_extend=10, gap_extend2=10,
                 no_terminal_penalty=1):
        self.__instance = HyPhy._THyPhy(cwd, nthreads)
        self.call('alignOptions = {};')

        # default settings
        self.set_alphabet()
        self.set_matrix()
        self.set_gap_open()
        self.set_gap_open(20, is_first=False)
        self.set_affine()
        self.set_gap_extend()
        self.set_gap_extend(10, is_first=False)
        self.set_terminal()
Example #6
"""
Use project file to punch out genes from FDA amino acid refs
"""
import os
import HyPhy
import hyphyAlign
import json
from seqUtils import convert_fasta

hyphy = HyPhy._THyPhy(os.getcwd(), 1)  # instance of HyPhy
hyphyAlign.change_settings(hyphy)  # default settings

handle = open('fda_hcv_polyprotein.fa', 'rU')
fasta = convert_fasta(handle)
handle.close()

handle = open('/Users/art/git/MiseqPipeline/projects.json', 'rU')
proj = json.load(handle)
handle.close()

h77 = {}
for key in proj['regions'].iterkeys():
    if 'H77' in key and not key.endswith('seed'):
        aa = ''.join(proj['regions'][key]['reference'])
        h77.update({str(key): str(aa)})
        
outfile = open('fda_hcv_coords.fa', 'w')

for h, s in fasta:
    for gene, refseq in h77.iteritems():
        aquery, aref, ascore = hyphyAlign.pair_align(hyphy, refseq, s)
Example #7
# import the HyPhy library
# and standard OS utilities

import os, HyPhy

# first, create a HyPhy interface instance (class _THyPhy)
# the first argument defines the root directory for HyPhy
# and the second - how many threads the computational core
# should spawn

hyphyInstance = HyPhy._THyPhy(os.getcwd(), 2)

# the basic interface command is 'ExecuteBF' which
# executes HyPhy batch language commands in HyPhy
# and returns a string representation of the return value
# (if any) from HYPHY
# The returned object is of type _THyPhyString with
# sData and sLength fields
# HyPhy will take care of disposing of the memory needed
# to store the result

hyphyResult = hyphyInstance.ExecuteBF("return 2+2;")
print("Testing a trivial HyPhy command. 2+2 = ", hyphyResult.sData)

# an optional second argument to ExecuteBF
# can be used to "flush" the current state of the system

# this is the default option for the call of ExecuteBF
# passing the second argument of False or 0 will preserve
# the execution state
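
# A minimal sketch of the preserve-state behaviour described above; the
# two-argument form follows the ExecuteBF(..., False) calls used in
# Examples #9 and #10, and the arithmetic here is purely illustrative.
# Passing False keeps definitions from earlier calls alive, so a variable
# created in one ExecuteBF call can still be read in a later one.
hyphyInstance.ExecuteBF("x = 21;", False)                # define x, keep state
doubled = hyphyInstance.ExecuteBF("return x*2;", False)  # reuse x from the earlier call
print("x*2 computed across two calls = ", doubled.sData)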
Example #8
"""
Use project file to punch out genes from FDA amino acid refs
"""
import os
import HyPhy
import hyphyAlign
import json
from seqUtils import convert_fasta

hyphy = HyPhy._THyPhy(os.getcwd(), 1)  # instance of HyPhy
hyphyAlign.change_settings(hyphy)  # default settings

handle = open('fda_hcv_polyprotein.fa', 'rU')
fasta = convert_fasta(handle)
handle.close()

handle = open('/Users/art/git/MiseqPipeline/projects.json', 'rU')
proj = json.load(handle)
handle.close()

h77 = {}
for key in proj['regions'].iterkeys():
    if 'H77' in key and not key.endswith('seed'):
        aa = ''.join(proj['regions'][key]['reference'])
        h77.update({str(key): str(aa)})

outfile = open('fda_hcv_coords.fa', 'w')

for h, s in fasta:
    for gene, refseq in h77.iteritems():
        aquery, aref, ascore = hyphyAlign.pair_align(hyphy, refseq, s)
Example #9
        elif i[0] == '>' or i[0] == '#':
            if len(sequence) > 0:
                blocks.append([h, sequence])
                sequence = ''    # reset containers
                h = i.strip('\n')[1:]
            else:
                h = i.strip('\n')[1:]
        else:
            sequence += i.strip('\n')
    try:
        blocks.append([h,sequence])    # handle last entry
    except:
        raise Exception("convert_fasta(): Error appending to blocks [{},{}]".format(h, sequence))
    return blocks

hyphy = HyPhy._THyPhy(os.getcwd(), 1)  # @UndefinedVariable

dump = hyphy.ExecuteBF('MESSAGE_LOGGING = 0;', False)

hyphyAlign.change_settings(hyphy,
                           alphabet=hyphyAlign.nucAlphabet,
                           scoreMatrix=hyphyAlign.nucScoreMatrix,
                           gapOpen=20,
                           gapOpen2=20,
                           gapExtend=10,
                           gapExtend2=10,
                           noTerminalPenalty=1)


with open('HCV_REF_2012_genome.fasta', 'rU') as handle:
    genomes = convert_fasta(handle)  # keep one per genotype
Example #10
                sequence = ''  # reset containers
                h = i.strip('\n')[1:]
            else:
                h = i.strip('\n')[1:]
        else:
            sequence += i.strip('\n')
    try:
        blocks.append([h, sequence])  # handle last entry
    except:
        raise Exception(
            "convert_fasta(): Error appending to blocks [{},{}]".format(
                h, sequence))
    return blocks


hyphy = HyPhy._THyPhy(os.getcwd(), 1)  # @UndefinedVariable

dump = hyphy.ExecuteBF('MESSAGE_LOGGING = 0;', False)

hyphyAlign.change_settings(hyphy,
                           alphabet=hyphyAlign.nucAlphabet,
                           scoreMatrix=hyphyAlign.nucScoreMatrix,
                           gapOpen=20,
                           gapOpen2=20,
                           gapExtend=10,
                           gapExtend2=10,
                           noTerminalPenalty=1)

with open('HCV_REF_2012_genome.fasta', 'rU') as handle:
    genomes = convert_fasta(handle)  # keep one per genotype
Example #11
# import the HyPhy library
# and standard OS utilities

from __future__ import print_function  # makes print() a function under Python 2

import os, HyPhy

# first, create a HyPhy interface instance (class _THyPhy)
# the first argument defines the root directory for HyPhy
# and the second - how many threads the computational core
# should spawn

hyphyInstance = HyPhy._THyPhy(os.getcwd(), 2)

# the basic interface command is 'ExecuteBF' which
# executes HyPhy batch language commands in HyPhy
# and returns a string representation of the return value
# (if any) from HYPHY
# The returned object is of type _THyPhyString with
# sData and sLength fields
# HyPhy will take care of disposing of the memory needed
# to store the result

hyphyResult = hyphyInstance.ExecuteBF("return 2+2;")
print("Testing a trivial HyPhy command. 2+2 = ", hyphyResult.sData)

# an optional second argument to ExecuteBF
# can be used to "flush" the current state of the system

# this is the default option for the call of ExecuteBF
# passing the second argument of False or 0 will preserve
# the execution state
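
# A hedged sketch of pulling HyPhy's output streams back into Python. The
# accessor names GetStdout/GetErrors/GetWarnings are an assumption (they
# are suggested by the _stdout/_stderr/_warnings attributes kept in
# Example #1 but are not shown elsewhere on this page); each is expected
# to return a _THyPhyString whose sData field holds the captured text.
hyphyInstance.ExecuteBF('fprintf (stdout, "hello from HBL");', False)
print("captured stdout = ", hyphyInstance.GetStdout().sData)
print("captured errors = ", hyphyInstance.GetErrors().sData)
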
def csf2counts(path, mode, mixture_cutoffs,
               amino_reference_sequence="/usr/local/share/miseq/refs/csf2counts_amino_refseqs.csv"):
    """
    Calculate HXB2-aligned nucleotide and amino acid counts from a CSF.
    """

    import csv, logging, HyPhy, os, sys
    from hyphyAlign import change_settings, get_boundaries, pair_align
    from miseqUtils import ambig_dict, convert_csf, convert_fasta, mixture_dict, translate_nuc

    logger = logging.getLogger()
    hyphy = HyPhy._THyPhy(os.getcwd(), 1)
    change_settings(hyphy) # default gap open penalty 40(20), extension penalty 10(5) - we may need to change these

    amino_alphabet = 'ACDEFGHIKLMNPQRSTVWY*'

    if mode not in ['Amplicon', 'Nextera']:
        return logger.error("{} is an unsupported mode - halting csf2counts".format(mode))

    # set up file paths
    filename = os.path.basename(path)
    root = os.path.dirname(path) if os.path.dirname(path) != '' else '.'
    file_prefix = filename.replace('.csf', '')
    outpath = root + '/' + file_prefix

    # CSF contains sample + region in filename (Ex: F00844_S68.HIV1B-pol.0.csf)
    sample, ref = filename.split('.')[:2]

    # Amino reference sequences in refseqs is used to coordinate normalize our samples
    with open(amino_reference_sequence, "rb") as f:
        input_file = csv.reader(f)
        refseqs = {}
        for row in input_file:
            region, amino = row
            refseqs[region] = amino

    # If we have no reference sequence, we can't align the input sequences
    if ref not in refseqs:
        logger.error("No reference for {} - halting csf2counts".format(ref))
        return

    refseq = refseqs[ref]

    # Load CSF (CSF header, offset, sequence) into fasta data structure
    with open(path, 'rU') as infile:
        fasta, lefts, rights = convert_csf(infile.readlines())

    if len(fasta) == 0:
        # skip empty file
        logger.error('{} is an empty file'.format(filename))
        return

    # CSFs come from self-alignment derived SAMs: the reads are out of frame.
    frame_evidence = {}
    for frame in range(3):
        frame_evidence[frame] = 0

    # Look at first five reads in CSF and vote on correct ORF
    for read_index in range(min(5, len(fasta))): # make this robust to having fewer than 5 reads
        header, seq = fasta[read_index]
        max_score = -999
        best_ORF = 0
        possible_ORFs = [0, 1, 2]

        # Determine best ORF for this read
        prefix = ('-'*lefts[header] if mode == 'Nextera' else '')
        for frame in possible_ORFs:
            p = translate_nuc(prefix + seq, frame)
            aquery, aref, ascore = pair_align(hyphy, refseq, p)
            if ascore > max_score:
                best_ORF = frame
                max_score = ascore

        # Read provides 1 of 5 votes for best ORF
        frame_evidence[best_ORF] += 1

    best_frame = max(frame_evidence, key=lambda n: frame_evidence[n])
    logging.debug('Best ORF = %d' % best_frame)

    nuc_counts = {} # Base counts by self-consensus coordinate
    aa_counts = {}	# Amino counts by self-consensus coordinate
    pcache = []		# Cache protein sequences


    # CSF reads aligned against self-consensus: offset is with respect to self

    # For each sequence in the csf
    for i, (header, seq) in enumerate(fasta):

        # Determine the offset (Amplicon runs have no offset)
        left = lefts[header] if mode == 'Nextera' else 0

        # Amplicons store read counts in the CSF header
        count = 1 if mode == 'Nextera' else int(header.split('_')[1])

        # Determine nuc counts with respect to self-consensus coordinates
        for j, nuc in enumerate(seq):
            pos = left + j
            if pos not in nuc_counts:
                nuc_counts.update({pos: {}})
            if nuc not in nuc_counts[pos]:
                nuc_counts[pos].update({nuc: 0})
            nuc_counts[pos][nuc] += count

        # Determine amino counts with respect to self-consensus coordinates
        p = translate_nuc('-'*left + seq, best_frame)
        pcache.append(p)

        # boundaries of read in amino acid space
        aa_left = (left + best_frame) / 3
        aa_right = (rights[header] + left + best_frame) / 3

        for pos, aa in enumerate(p):
            # Do not store gap information
            #if aa == '-':
            #	continue
            if pos < aa_left or pos >= aa_right:
                continue
            if pos not in aa_counts:
                aa_counts.update({pos: {}})
            if aa not in aa_counts[pos]:
                aa_counts[pos].update({aa: 0})
            aa_counts[pos][aa] += count


    # Generate amino plurality consensus for query to reference coordinate mapping
    aa_coords = aa_counts.keys()
    aa_coords.sort()

    aa_max = ''
    for pos in range(min(aa_coords), max(aa_coords)+1):
        if pos in aa_coords:
            intermed = [(aa_count, amino) for amino, aa_count in aa_counts[pos].iteritems()]
            intermed.sort(reverse=True)
            aa_max += intermed[0][1]
        else:
            aa_max += '?' # no coverage but not a gap

    logger.debug('Amino plurality consensus = ' + aa_max)

    aquery, aref, ascore = pair_align(hyphy, refseq, aa_max)
    left, right = get_boundaries(aref)	# Coords of first/last non-gap character

    logger.debug('Aligned amino plurality conseq = ' + aquery)
    logger.debug('Aligned reference sequence = ' + aref)

    qindex_to_refcoord = {}			# Query <-> reference coordinate mapping
    inserts = []					# Keep track of which aa positions are insertions
    qindex = 0						# Where we are in the query?
    rindex = 0						# Where we are in the reference?
    ref_coords = range(len(aref))

    # For each coordinate on the reference, create a mapping to the query
    for i in ref_coords:

        # Do not consider parts of the query outside of the reference
        if i < left:
            qindex += 1

        elif i >= right:
            break

        # A gap in the reference is an insertion in the query which we want to skip in the mapping
        elif aref[i] == '-':
            inserts.append(qindex)	# Store insert location in query coordinate space
            qindex += 1				# Track along the query

        # If theres a gap in the query we are only effectively tracking along the pre-alignment reference
        elif aquery[i] == '-':
            rindex += 1

        # Normal case: tracking forward on both sequences
        else:
            qindex_to_refcoord[qindex] = rindex
            qindex += 1
            rindex += 1

        #print i, rindex, aref[i], qindex, aquery[i]


    logger.debug('qindex_to_refcoord: ' + str(qindex_to_refcoord))


    # Write inserts to an indels.csv file
    if len(inserts) > 0:
        with open(outpath+".indels.csv", 'w') as indelfile:
            indelfile.write('insertion,count\n')
            indel_counts = {}
            for p in pcache:
                ins_str = str(inserts[0])
                last_i = -1
                for i in inserts:
                    if last_i > -1 and i - last_i > 1:
                        # end of a contiguous indel
                        ins_str += ',%d' % i
                    try:
                        ins_str += p[i]
                    except IndexError:
                        break
                    last_i = i
                if not indel_counts.has_key(ins_str):
                    indel_counts.update({ins_str: 0})
                indel_counts[ins_str] += 1
            for ins_str, count in indel_counts.iteritems():
                indelfile.write('%s,%d\n' % (ins_str, count))

    # Initialize initial (blank) consensus sequence for each mixture rule
    maxcon = ''
    conseqs = ['' for cut in mixture_cutoffs]
    query_codon_pos = 0
    nuc_coords = nuc_counts.keys()		  # nucs[self-coord][nuc] = count
    nuc_coords.sort()

    # account for assembly offset due to extra bases in sample-specific consensus
    nuc_assembly_offset = min(lefts.values())

    # Output nucleotide counts in reference coordinate space to nuc.csv files
    nucfile = open(outpath + '.nuc.freqs', 'w')
    nucfile.write("query.nuc.pos,refSeq.nuc.pos,A,C,G,T\n")

    for query_nuc_pos in nuc_coords:
        nucleotide_counts = [nuc_counts[query_nuc_pos].get(nuc, 0) for nuc in 'ACGT']
        nucleotide_counts_string = ','.join(map(str, nucleotide_counts))

        # Convert nucleotide query index into reference index
        try:
            # best frame is adjusted by shift from query to assembly coordinates
            adjustment = best_frame - (3 - nuc_assembly_offset%3)%3
            query_aa_pos = (query_nuc_pos - nuc_assembly_offset + adjustment) / 3
            query_codon_pos = (query_nuc_pos - nuc_assembly_offset + adjustment) % 3
            ref_aa_pos = qindex_to_refcoord[query_aa_pos]
            ref_nuc_pos = 3*ref_aa_pos + query_codon_pos

            nucfile.write(','.join(map(str, [query_nuc_pos+1, ref_nuc_pos+1, nucleotide_counts_string])))
            nucfile.write('\n')

        except KeyError:
            #logger.debug("No coordinate mapping for query nuc {} / amino {} ({})".format(query_nuc_pos, query_aa_pos, filename))
            logger.debug('No coordinate mapping for query nuc %d / amino %d (%s)' % (query_nuc_pos, query_aa_pos, filename))
            continue


        # Store self-aligned nucleotide plurality conseqs
        intermed = [(count, nuc) for nuc, count in nuc_counts[query_nuc_pos].iteritems()]
        intermed.sort(reverse=True)
        maxcon += intermed[0][1]

        # Determine the number of bases in total at this query position
        total_count = sum([count for count, nuc in intermed])

        for ci, mixture_cutoff in enumerate(mixture_cutoffs):
            mixture = []

            # If a base is greater than the proportion cutoff, the base contributes
            for count, nuc in intermed:
                if float(count) / total_count > mixture_cutoff:
                    mixture.append(nuc)

            # If an N exists with other bases, those bases take precedence
            if 'N' in mixture:
                if len(mixture) > 1:
                    mixture.remove('N')
                else:
                    conseqs[ci] += 'N'
                    #logger.debug("N was the majority base at position {} - {} (mixture_cutoff = {})".format(query_nuc_pos, filename, mixture_cutoff))
                    logger.debug("N was the majority base at position %d - %s (mixture_cutoff = %f)" % (query_nuc_pos, filename, mixture_cutoff))
                    continue

            # If there is a gap, but also bases, those bases take precedence
            if '-' in mixture:
                if len(mixture) > 1:
                    mixture.remove('-')
                else:
                    conseqs[ci] += '-'
                    continue

            # Attach mixture (If one exists) to the conseq with appropriate mixture cutoff rule
            if len(mixture) > 1:
                mixture.sort()
                conseqs[ci] += ambig_dict[''.join(mixture)]
            elif len(mixture) == 1:
                conseqs[ci] += mixture[0]
            else:
                # Mixture of length zero, no bases exceed cutoff
                conseqs[ci] += 'N'
    nucfile.close()

    # Store self-aligned plurality amino sequences in .conseq files
    #with open("{}.conseq".format(outpath), 'w') as confile:
    with open(outpath+'.conseq', 'w') as confile:
        confile.write('>%s_MAX\n%s\n' % (sample, maxcon))
        for ci, cutoff in enumerate(mixture_cutoffs):
            confile.write('>%s_%1.3f\n%s\n' % (sample, cutoff, conseqs[ci]))


    # Write amino acid counts in reference coordinate space in amino.csv files
    #with open("{}.amino.csv".format(outpath), 'w') as aafile:
    with open(outpath+".amino.freqs", 'w') as aafile:
        aafile.write("query.aa.pos,refseq.aa.pos,%s\n" % (','.join(list(amino_alphabet))))

        for qindex, ref_aa_pos in qindex_to_refcoord.iteritems():
            # adjust for assembly offset
            aa_pos = qindex + min(aa_coords)

            # Ignore query inserts
            if aa_pos in inserts:
                logger.debug("%d is an insert - ignoring" % (aa_pos))
                continue

            try:
                #ref_aa_pos = qindex_to_refcoord[aa_pos] + 1	 # FIXME: DO WE NEED TO ADD 1?
                aa_counts_string = ','.join(map(str, [aa_counts[aa_pos].get(aa, 0) for aa in amino_alphabet]))
                # note that we are subtracting the minimum aa_counts key
                aafile.write('%d,%d,%s\n' % (aa_pos, ref_aa_pos+1, aa_counts_string))

            except KeyError:
                logger.debug("No query-ref mapping available for aapos=%d (%s)" % (aa_pos, filename))
def g2p_scoring(csf_path, g2p_alignment_cutoff):
    """
    Take an env (amplicon) CSF and generate a v3prot file.

    Header: contains the G2P FPR and read count
    Sequence: protein aligned V3

    The CSF must be from an amplicon run: column 1 must contain the rank + count.
    """

    import logging, os, sys
    from hyphyAlign import apply2nuc, change_settings, get_boundaries, HyPhy, pair_align, refSeqs
    from minG2P import conan_g2p
    from miseqUtils import translate_nuc

    csf_filename = os.path.basename(csf_path)
    prefix = csf_filename.split('.')[0]
    logger = logging.getLogger()
    hyphy = HyPhy._THyPhy(os.getcwd(), 1)  # HyPhy is used for alignment
    change_settings(hyphy)					# Configure scoring matrix / gap penalties
    refseq = translate_nuc(refSeqs['V3, clinical'], 0)	# V3 ref seq is NON-STANDARD: talk to Guin

    if csf_filename.find("HIV1B-env") == -1 or not csf_path.endswith('.csf'):
        return logger.error("{} is not an HIV1B-env CSF file".format(csf_filename))

    # Store CSF in fasta-like variable called sequences
    sequences = []
    with open(csf_path, 'rU') as csf_file:
        for line in csf_file:
            header, left_offset, seq_no_gaps  = line.strip("\n").split(",")
            sequences.append((header, seq_no_gaps))

    if len(sequences) == 0:
        # skip empty file
        return logger.error('%s is an empty file' % csf_filename)

    # Determine offset from 1st sequence to correct frameshift induced by sample-specific remapping
    seq1 = sequences[0][1].strip("-")
    best_offset = 0
    best_score = -999
    possible_ORFs = [0, 1, 2]
    for offset in possible_ORFs:
        aaEnvSeq = translate_nuc(seq1, offset)
        aquery, aref, ascore = pair_align(hyphy, refseq, aaEnvSeq)
        if ascore > best_score:
            best_offset = offset
            best_score = ascore

    # For each env sequence, extract the V3 nucleotide sequence
    badfile = open(csf_path.replace('.csf', '.badV3'), 'w')
    v3nucs = {}
    for header, seq in sequences:
        count = int(header.split('_')[-1])
        seq = seq.replace("-","")					# Strip dashes at flanking regions generated by alignment
        aaEnvSeq = translate_nuc(seq, best_offset)			# Translate env on correct ORF
        aquery, aref, ascore = pair_align(hyphy, refseq, aaEnvSeq)
        left, right = get_boundaries(aref)				# Get left/right boundaries of V3 protein
        v3prot = aquery[left:right]					# Extract V3 protein
        v3nuc = apply2nuc(seq[(3*left-best_offset):], v3prot,		# Use alignment to extract V3 nuc seq
                aref[left:right], keepIns=True, keepDel=False)

        # Drop V3 data that don't satisfy quality control
        if 'N' in v3nuc or not v3prot.startswith('C') or not v3prot.endswith('C') or '*' in v3prot or ascore < g2p_alignment_cutoff or len(v3prot) < 32 or len(v3prot) > 40:
            badfile.write('>%s_reason_%s\n%s\n' % (header,
                '|'.join(['stopcodon' if '*' in v3prot else '',					# V3 can't have internal stop codon
                'lowscore' if ascore < g2p_alignment_cutoff else '',				# The G2P alignment can't be poor
                'cystines' if not v3prot.startswith('C') or not v3prot.endswith('C') else '',	# V3 must start/end with C
                'ambig' if 'N' in v3nuc else '']),seq))						# There must be no unknown bases
        else:
            # Track the count of each v3 nucleotide sequence
            if v3nucs.has_key(v3nuc):
                v3nucs[v3nuc] += count
            else:
                v3nucs.update({v3nuc: count})
    badfile.close()

    # Calculate g2p scores for each v3 nuc sequence
    v3prots = {}
    for v3nuc, count in v3nucs.iteritems():
        g2p, fpr, aligned = conan_g2p(v3nuc)

        if g2p is None:
            continue

        # Track the count of each protein sequence
        if v3prots.has_key(aligned):
            v3prots[aligned]['count'] += count
        else:
            # Dict within dict - store count and fpr for each sequence
            v3prots.update({aligned: {'count': count, 'fpr': fpr}})

    # Collect v3 prot sequences and their output (v is a dict mapping to count and fpr)
    intermed = [(v['count'], v3prot) for v3prot, v in v3prots.iteritems()]
    intermed.sort(reverse=True)

    # For this sample, write a v3prot file containing the prefix, sequence, rank, count, and fpr
    v3prot_path = csf_path.replace('.csf', '.v3prot')
    logger.info("Writing results to {}".format(v3prot_path))
    with open(v3prot_path, 'w') as v3protfile:
        for i, (count, v3prot) in enumerate(intermed):
            fpr = v3prots[v3prot]['fpr']
            v3protfile.write(">{}_variant_{}_count_{}_fpr_{}\n{}\n".format(prefix, i, count, fpr, v3prot))