Code example #1
File: pileups.py  Project: ajuric/consensus-net
    def _generate_pileups(self):
        bam_file = pysam.AlignmentFile(self.bam_file_path)

        info_of_interest = ['A', 'C', 'G', 'T', 'insertions', 'deletions']
        indel_positions = [4, 5]

        # The extra slot in total_options (the last index, reached through the
        # -1 default in mapping.get below) is for letters other than A, C, G, T.
        mapping = {
            'A': 0,
            'a': 0,
            'C': 1,
            'c': 1,
            'G': 2,
            'g': 2,
            'T': 3,
            't': 3
        }
        total_options = len(info_of_interest) + 1

        pileups = [
            np.zeros((bam_file.get_reference_length(contig_name),
                      len(info_of_interest)))
            for contig_name in bam_file.references
        ]

        y_oh = [
            np.zeros(
                (bam_file.get_reference_length(contig_name), total_options))
            for contig_name in bam_file.references
        ]

        total_length = np.sum([
            bam_file.get_reference_length(contig_name)
            for contig_name in bam_file.references
        ])
        progress_counter = 0
        contig_names = bam_file.references
        with progressbar.ProgressBar(max_value=total_length) as progress_bar:
            for contig_id, contig_name in enumerate(contig_names):
                for record in pysamstats.stat_variation(
                        bam_file,
                        chrom=contig_name,
                        fafile=self.reference_fasta_path):
                    progress_bar.update(progress_counter)
                    progress_counter += 1

                    curr_position = record['pos']

                    for i, info in enumerate(info_of_interest):
                        pileups[contig_id][curr_position][i] += record[info]

                    pileup_argmax = np.argmax(
                        pileups[contig_id][curr_position])
                    if pileup_argmax in indel_positions:
                        y_oh[contig_id][curr_position][pileup_argmax] = 1
                    else:
                        y_oh[contig_id][curr_position][mapping.get(
                            record['ref'], -1)] = 1

        return pileups, y_oh, contig_names
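
Note: each record yielded by pysamstats.stat_variation behaves like a dict keyed by field name. As a quick orientation for the examples that follow, here is a minimal sketch printing the fields they rely on; 'example.bam' and 'ref.fa' are placeholder paths, not files from any project above.

import pysam
import pysamstats

bam = pysam.AlignmentFile('example.bam')
for rec in pysamstats.stat_variation(bam, fafile='ref.fa'):
    # Fields used throughout these examples: 'chrom', 'pos', 'ref',
    # per-base counts 'A'/'C'/'G'/'T', and 'insertions'/'deletions'.
    # The '*_pp' variants (e.g. 'reads_pp') count properly paired reads only.
    print(rec['chrom'], rec['pos'], rec['ref'],
          rec['A'], rec['C'], rec['G'], rec['T'],
          rec['insertions'], rec['deletions'])
    break  # one record is enough for a quick look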
Code example #2
def populateDict():
    global holdingDict
    if not os.path.isfile(JSONProfile):
        for rec in pysamstats.stat_variation(alignmentfile=bamFile, fafile=target):
            # Extract the ARO accession(s) from the chromosome name
            aros = re.findall(r"(?<=ARO:)3\d{6}", rec['chrom'])
            aro = " ".join(aros)
            # Only record positions where matches outnumber mismatches
            if rec['matches'] - rec['mismatches'] > 0:
                # Initialise the per-chromosome dict on first use
                holdingDict.setdefault(rec['chrom'], {})[int(rec['pos'])] = rec['reads_all']
            dotter()

        with open(JSONProfile, "w") as JSONreport:
            json.dump(holdingDict, JSONreport, sort_keys=True, indent=4,
                      separators=(',', ': '))
    else:
        with open(JSONProfile, "r") as jsonReport:
            # Load the cached data
            holdingDict.update(json.load(jsonReport))
Code example #3
def parse_bam(bam, Chr, Pos, End, Ref, Alt, REF):
    result = pysamstats.stat_variation(bam,
                                       REF,
                                       chrom=Chr,
                                       start=Pos,
                                       end=End + 1,
                                       truncate=True,
                                       one_based=True)
    result_dict = {}
    for idx, record in enumerate(result):
        result_dict[idx] = record
    data_df = pd.DataFrame.from_dict(result_dict, orient='index')

    if data_df.shape[0] == 0:
        # No records: build an empty frame with the expected columns (tmp is
        # defined at module level) and fill a single all-zero row;
        # DataFrame.append was removed in pandas 2.0.
        data_df = pd.DataFrame(columns=tmp)
        data_df.loc[0, :] = 0

    if Ref in ['A', 'C', 'G', 'T'] and Alt in ['A', 'C', 'G', 'T']:
        ref_cov, mut_cov, cov = data_df.loc[0,
                                            [Ref + '_pp',
                                             Alt + '_pp',
                                             'reads_pp']]

        if cov != 0:
            mut_per = mut_cov / float(cov)
        else:
            mut_per = 0

    else:
        if data_df.sum()['reads_pp'] > 0 and len(data_df) > 0:
            if Ref == '-' or len(Alt) > len(Ref):
                # insertions
                mut_per = data_df.sum()['insertions_pp'] / float(data_df.sum()['reads_pp'])
                ref_cov = data_df.sum()['matches_pp'] / float(len(data_df))
                mut_cov = data_df.sum()['insertions_pp'] / float(len(data_df))
            elif Alt == '-' or len(Ref) > len(Alt):
                # deletions
                mut_per = data_df.sum(0)['deletions_pp'] / float(data_df.sum()['reads_pp'])
                ref_cov = data_df.sum(0)['matches_pp'] / float(len(data_df))
                mut_cov = data_df.sum(0)['deletions_pp'] / float(len(data_df))
            else:
                return ['Wrong pos'] * 3
        else:
            return ['Off target'] * 3
    return mut_cov, mut_per, ref_cov
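
A hedged usage sketch for parse_bam above; the BAM path, reference and coordinates are hypothetical, and tmp (the list of expected pysamstats columns) must exist at module level, as it does in the original project.

import pysam

# Placeholder column list; the real project defines its own tmp
tmp = ['A_pp', 'C_pp', 'G_pp', 'T_pp', 'reads_pp',
       'matches_pp', 'insertions_pp', 'deletions_pp']
bam = pysam.AlignmentFile('tumor.bam')
# one_based=True inside parse_bam means Pos and End are 1-based coordinates
mut_cov, mut_per, ref_cov = parse_bam(bam, 'chr1', 12345, 12345,
                                      'A', 'G', 'hg19.fa')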
Code example #4
	def filter_germline(chr, t_columns_pos, ref_seq, alt_base):
		n_alt_A = 0; n_alt_C = 0; n_alt_G = 0; n_alt_T = 0; n_refcount = 0
		for rec in pysamstats.stat_variation(n_samfile, ref, chrom=str(chr), start=int(t_columns_pos), end=int(t_columns_pos)+1):
			chr = rec['chrom']; pos = rec['pos']; ref_seq = rec['ref']
			if pos == t_columns_pos:
				if rec['reads_pp'] > 4:
					# Cast to float before dividing so the fraction is not truncated to 0
					if float(rec['mismatches_pp']) / rec['reads_pp'] <= 0.10:
						for n_columns in n_samfile.pileup(chr, int(t_columns_pos), int(t_columns_pos)+1, truncate=True):
							if n_columns.pos == t_columns_pos:
								(n_alt_A, n_alt_C, n_alt_G, n_alt_T, n_refcount) = process_reads(n_columns, n_columns.pos, ref_seq, "N")
					else:
						n_alt_A = n_alt_C = n_alt_G = n_alt_T = rec['mismatches_pp']
						n_refcount = rec['reads_pp'] - rec['mismatches_pp']
				else:
					n_alt_A = n_alt_C = n_alt_G = n_alt_T = 0
					n_refcount = rec['reads_pp']

		if alt_base == "A":
			return n_alt_A, n_refcount
		if alt_base == "G":
			return n_alt_G, n_refcount
		if alt_base == "T":
			return n_alt_T, n_refcount
		if alt_base == "C":
			return n_alt_C, n_refcount
Code example #5
 def parse(self):
     import pysamstats
     import operator
     import numpy
     while True:
         sample, analysistype = self.parsequeue.get()
         # Initialise dictionaries to store parsed data
         matchdict = dict()
         depthdict = dict()
         seqdict = dict()
         snpdict = dict()
         gapdict = dict()
         maxdict = dict()
         mindict = dict()
         deviationdict = dict()
         sample[analysistype].results = dict()
         sample[analysistype].avgdepth = dict()
         sample[analysistype].resultssnp = dict()
         sample[analysistype].resultsgap = dict()
         sample[analysistype].sequences = dict()
         sample[analysistype].maxcoverage = dict()
         sample[analysistype].mincoverage = dict()
         sample[analysistype].standarddev = dict()
         # Variable to store the expected position in gene/allele
         pos = 0
         try:
             # Use the stat_variation function of pysamstats to return records parsed from sorted bam files
             # Values of interest can be retrieved using the appropriate keys
             for rec in pysamstats.stat_variation(alignmentfile=sample[analysistype].sortedbam,
                                                  fafile=sample[analysistype].baitfile,
                                                  max_depth=1000000):
                 # Initialise seqdict with the current gene/allele if necessary with an empty string
                 if rec['chrom'] not in seqdict:
                     seqdict[rec['chrom']] = str()
                     # Since this is the first position in a "new" gene/allele, reset the pos variable to 0
                     pos = 0
                 # Initialise gap dict with 0 gaps
                 if rec['chrom'] not in gapdict:
                     gapdict[rec['chrom']] = 0
                 # If there is a gap in the alignment, record the size of the gap in gapdict
                 if int(rec['pos']) > pos:
                     # Add the gap size to gap dict
                     gapdict[rec['chrom']] += rec['pos'] - pos
                     # Set the expected position to the current position
                     pos = int(rec['pos'])
                 # Increment pos in preparation for the next iteration
                 pos += 1
                 # Initialise snpdict if necessary
                 if rec['chrom'] not in snpdict:
                     snpdict[rec['chrom']] = 0
                 # Initialise the current gene/allele in depthdict with the depth (reads_all) if necessary,
                 # otherwise add the current depth to the running total
                 if rec['chrom'] not in depthdict:
                     depthdict[rec['chrom']] = int(rec['reads_all'])
                 else:
                     depthdict[rec['chrom']] += int(rec['reads_all'])
                 # Dictionary of bases and the number of times each base was observed per position
                 bases = {'A': rec['A'], 'C': rec['C'], 'G': rec['G'], 'T': rec['T']}
                 # If the most prevalent base (calculated with max() and operator.itemgetter()) does not match the
                 # reference base, add this prevalent base to seqdict
                 if max(bases.items(), key=operator.itemgetter(1))[0] != rec['ref']:
                     seqdict[rec['chrom']] += max(bases.items(), key=operator.itemgetter(1))[0]
                     # Increment the running total of the number of SNPs
                     snpdict[rec['chrom']] += 1
                 else:
                     # If the bases match, add the reference base to seqdict
                     seqdict[rec['chrom']] += (rec['ref'])
                     # Initialise posdict if necessary, otherwise, increment the running total of matches
                     if rec['chrom'] not in matchdict:
                         matchdict[rec['chrom']] = 1
                     else:
                         matchdict[rec['chrom']] += 1
                 # Find the max and min coverage for each strain/gene combo
                 try:
                     maxdict[rec['chrom']] = int(rec['reads_all']) if \
                         int(rec['reads_all']) >= maxdict[rec['chrom']] else maxdict[rec['chrom']]
                 except KeyError:
                     maxdict[rec['chrom']] = int(rec['reads_all'])
                 try:
                     mindict[rec['chrom']] = int(rec['reads_all']) if \
                         int(rec['reads_all']) <= mindict[rec['chrom']] else mindict[rec['chrom']]
                 except KeyError:
                     mindict[rec['chrom']] = int(rec['reads_all'])
                 # Create a list of all the depths in order to calculate the standard deviation
                 try:
                     deviationdict[rec['chrom']].append(int(rec['reads_all']))
                 except KeyError:
                     deviationdict[rec['chrom']] = list()
                     deviationdict[rec['chrom']].append(int(rec['reads_all']))
         # If there are no results in the bam file, then pass over the strain
         except ValueError:
             pass
         # Iterate through all the genes/alleles with results above
         for allele in sorted(matchdict):
             # If the length of the match is greater or equal to the length of the gene/allele (multiplied by the
             # cutoff value) as determined using faidx indexing, then proceed
             if matchdict[allele] >= sample[analysistype].faidict[allele] * self.cutoff:
                 # Calculate the average depth by dividing the total number of reads observed by the
                 # length of the gene
                 averagedepth = float(depthdict[allele]) / float(matchdict[allele])
                 percentidentity = float(matchdict[allele]) / float(sample[analysistype].faidict[allele]) * 100
                 # Only report a positive result if this average depth is greater than 10X
                 if averagedepth > 10:
                     # Populate resultsdict with the gene/allele name, the percent identity, and the average depth
                     sample[analysistype].results.update({allele: '{:.2f}'.format(percentidentity)})
                     sample[analysistype].avgdepth.update({allele: '{:.2f}'.format(averagedepth)})
                     # Add the SNP and gap results to dictionaries
                     sample[analysistype].resultssnp.update({allele: snpdict[allele]})
                     sample[analysistype].resultsgap.update({allele: gapdict[allele]})
                     sample[analysistype].sequences.update({allele: seqdict[allele]})
                     sample[analysistype].maxcoverage.update({allele: maxdict[allele]})
                     sample[analysistype].mincoverage.update({allele: mindict[allele]})
                     sample[analysistype]\
                         .standarddev.update({allele: '{:.2f}'.format(numpy.std(deviationdict[allele], ddof=1))})
         self.parsequeue.task_done()
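
The heart of example #5 is summing reads_all per chromosome; a condensed standalone sketch of the same aggregation, with placeholder file names and a defaultdict instead of the try/except initialisation:

from collections import defaultdict

import numpy
import pysam
import pysamstats

bam = pysam.AlignmentFile('sorted.bam')  # placeholder path
depths = defaultdict(list)
for rec in pysamstats.stat_variation(bam, fafile='baits.fa', max_depth=1000000):
    depths[rec['chrom']].append(int(rec['reads_all']))

for chrom, d in depths.items():
    print(chrom, 'avg %.2f' % numpy.mean(d), 'std %.2f' % numpy.std(d, ddof=1))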
Code example #6
File: pileup_mod.py  Project: ctuni/TFM
	ref_genome=''

	if 'mature' in file:
		ref_genome='../../../Reference_Genomes/Modifications_files/Ref_seq/families_tRNA_refgenome.fa'
	if 'PG' in file:
		ref_genome='../../../Reference_Genomes/Modifications_files/Ref_seq/precursor_tRNA_refgenome.fa'
	


	file_base_call=open(sample+'_'+bam_type+'_'+'base_calling_CORRECT_OK.txt','w')
	

	#BASE CALLING
	#pysamstats gives us the base calling for each tRNA (max_depth is needed because, by default, at most 8000 reads are counted per position, so we raise it to get the true read count at every position).
	for record in pysamstats.stat_variation(bamfile,  fafile=ref_genome,  max_depth=10000000):
		ref_base=record['ref']

		tRNA_info='REF-'+str(record['ref'])+':'+str(record[ref_base])+' '+'A:'+str(record['A'])+' '+'C:'+str(record['C'])+' '+'G:'+str(record['G'])+' '+'T:'+str(record['T'])

		tRNA=record['chrom']
		

		pos=''
		ref=record['ref']
		#Positions in the precursor are not the same as positions in the mature genome, since the precursor includes the leading and trailing regions and the introns. So we have to transform precursor positions into mature coordinates.
		if bam_type=='precursor':
			pos_i=record['pos']
			#take into account the leading and trailing regions.
			if pos_i > 49 and pos_i < int(prec_length[tRNA])-50:
				pos=pos_i-49
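
The snippet is cut off above; for clarity, here is a minimal standalone sketch of the coordinate shift its comments describe, under the snippet's own assumption of fixed 50-nt leading and trailing regions (the function name is ours, not the project's):

def precursor_to_mature(pos_i, trna, prec_length, flank=50):
    # Return the mature-coordinate position, or None if pos_i falls in a
    # flank (mirrors `pos_i > 49 and pos_i < length - 50` above).
    if flank - 1 < pos_i < int(prec_length[trna]) - flank:
        return pos_i - (flank - 1)
    return None

# e.g. precursor_to_mature(60, 'tRNA-Ala', {'tRNA-Ala': 150}) -> 11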
Code example #7
def parse_bam(
    bam,
    Chr,
    Pos,
    End,
    Ref,
    Alt,
    REF,
    sig='N',
):
    result = pysamstats.stat_variation(bam,
                                       REF,
                                       chrom=Chr,
                                       start=Pos,
                                       end=End + 1,
                                       truncate=True,
                                       one_based=True)
    result_dict = {}
    for idx, record in enumerate(result):
        result_dict[idx] = record
    t_data_df = pd.DataFrame.from_dict(result_dict, orient='index')

    if t_data_df.shape[0] == 0:
        # No records: build an empty frame with the expected columns (tmp is
        # defined at module level) and fill a single all-zero row;
        # DataFrame.append was removed in pandas 2.0.
        t_data_df = pd.DataFrame(columns=tmp)
        t_data_df.loc[0, :] = 0

    if Ref in ['A', 'C', 'G', 'T'] and Alt in ['A', 'C', 'G', 'T']:
        T_ref_cov, T_mut_cov, T_cov = t_data_df.loc[
            0, [Ref + '_pp', Alt + '_pp', 'reads_pp']]

        if T_cov != 0:
            T_mut_per = T_mut_cov / float(T_cov)
        else:
            T_mut_per = 0

        added_col['%s_mut_per' % sig].append(T_mut_per)
        added_col['%s_ref_cov' % sig].append(T_ref_cov)
        added_col['%s_mut_cov' % sig].append(T_mut_cov)
    else:
        if t_data_df.sum()['reads_pp'] > 0 and len(t_data_df) > 0:
            if Ref == '-' or len(Alt) > len(Ref):
                # insertions
                added_col['%s_mut_per' %
                          sig].append(t_data_df.sum()['insertions_pp'] /
                                      float(t_data_df.sum()['reads_pp']))
                added_col['%s_ref_cov' % sig].append(
                    t_data_df.sum()['matches_pp'] / float(len(t_data_df)))
                added_col['%s_mut_cov' % sig].append(
                    t_data_df.sum()['insertions_pp'] / float(len(t_data_df)))
            elif Alt == '-' or len(Ref) > len(Alt):
                # deletions
                added_col['%s_mut_per' % sig].append(
                    t_data_df.sum(0)['deletions_pp'] /
                    float(t_data_df.sum()['reads_pp']))
                added_col['%s_ref_cov' % sig].append(
                    t_data_df.sum(0)['matches_pp'] / float(len(t_data_df)))
                added_col['%s_mut_cov' % sig].append(
                    t_data_df.sum(0)['deletions_pp'] / float(len(t_data_df)))
            else:
                for _key in [_ for _ in added_col.keys() if sig in _]:
                    added_col[_key].append('Wrong pos')
        else:
            for _key in [_ for _ in added_col.keys() if sig in _]:
                added_col[_key].append('Off target')
Code example #8
import pysam
import pysamstats

bamfile = pysam.AlignmentFile('picared.bam')
# Header: position, per-base counts, insertions, deletions
print("P", "A", "C", "G", "T", "I", "D")
for record in pysamstats.stat_variation(bamfile,
                                        chrom='gi|11|ref|TL|E1E2J6',
                                        fafile="ref_E1E2.fa"):
    print(record['pos'], record['A'], record['C'], record['G'],
          record['T'], record['insertions'], record['deletions'])
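
If a whole table rather than a record stream is wanted, recent pysamstats versions also ship load_* counterparts of the stat_* functions that return a NumPy structured array; a sketch under that assumption:

import pysam
import pysamstats

bamfile = pysam.AlignmentFile('picared.bam')
# load_variation mirrors stat_variation but materialises all rows at once
a = pysamstats.load_variation(bamfile,
                              chrom='gi|11|ref|TL|E1E2J6',
                              fafile="ref_E1E2.fa")
print(a['pos'][:5], a['A'][:5])  # columns addressed by field name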
Code example #9

	foundReg = []
	raws = []
	cov = 0
	ftemp = open(tempfile, 'w')
	print("#chr\tpos\tref\talt\tt_ref\tt_alt\tn_ref\tn_alt\tjudge", file=ftemp)
	print("calling raw variants...")
	for bedline in (raw.strip().split() for raw in open(bed)):
		for rec in pysamstats.stat_variation(t_samfile, ref, chrom=str(bedline[0]), start=int(bedline[1]), end=int(bedline[2])):
			if rec['reads_pp'] > 4:
				if (rec['mismatches_pp'] > 2) and (rec['insertions'] <= 3) and (rec['deletions'] <= 3):
					chr = rec['chrom']; pos = rec['pos']
					ref_seq = rec['ref']
					for t_columns in t_samfile.pileup(chr, int(pos), int(pos)+1, truncate=True):
						# Only process each chromosome/position pair once
						if str(chr)+"\t"+str(t_columns.pos) not in foundReg:
							foundReg.append(str(chr)+"\t"+str(t_columns.pos))
							(t_alt_A, t_alt_C, t_alt_G, t_alt_T, t_refcount, altReadPosE_f_A, altReadPosE_r_A, altReadPosE_f_G, altReadPosE_r_G, altReadPosE_f_C, altReadPosE_r_C, altReadPosE_f_T, altReadPosE_r_T, altReadPosS_f_A, altReadPosS_r_A, altReadPosS_f_G, altReadPosS_r_G, altReadPosS_f_C, altReadPosS_r_C, altReadPosS_f_T, altReadPosS_r_T) = process_reads(t_columns, t_columns.pos, ref_seq, "Y")
							totalcount = t_refcount + t_alt_A + t_alt_G + t_alt_C + t_alt_T
							if totalcount > 4:
								# Cast to float so the allele fraction is not truncated by integer division
								if (t_alt_A > tumor_alt_cutoff) and ((float(t_alt_A)/totalcount) > tumor_alf_cutoff):
									if (all(5 >= i for i in altReadPosS_f_A) and all(5 >= i for i in altReadPosS_r_A)) or (all(5 >= i for i in altReadPosE_f_A) and all(5 >= i for i in altReadPosE_r_A)):
										raw_calls = (chr+"\t"+str(pos+1)+"\t"+ref_seq.upper()+"\t"+"A"+"\t"+str(t_refcount)+"\t"+str(t_alt_A)+"\t"+"0"+"\t"+"0"+"\t"+"clustered_pos")
Code example #10
    def parse(self):
        import pysamstats
        import operator
        while True:
            sample, vtx = self.parsequeue.get()
            # Initialise dictionaries to store parsed data
            matchdict = dict()
            depthdict = dict()
            seqdict = dict()
            resultsdict = dict()
            snpdict = dict()
            gapdict = dict()
            faidict = dict()
            uniqueresults = dict()
            refdict = dict()
            # Variable to store the expected position in gene/allele
            pos = 0
            # Get the fai file into a dictionary to be used in parsing results
            with open(sample[self.analysistype].faifile[vtx], 'r') as faifile:
                for line in faifile:
                    data = line.split('\t')
                    faidict[data[0]] = int(data[1])
            try:
                # Use the stat_variation function of pysamstats to return records parsed from sorted bam files
                # Values of interest can be retrieved using the appropriate keys
                correction = 0
                for rec in pysamstats.stat_variation(alignmentfile=sample[self.analysistype].sortedbam[vtx],
                                                     fafile=sample[self.analysistype].targetfiles[vtx],
                                                     max_depth=1000000):

                    # Add the reference sequence to the dictionary
                    if rec['chrom'] not in refdict:
                        refdict[rec['chrom']] = str()
                    refdict[rec['chrom']] += rec['ref']
                    # Initialise seqdict with the current gene/allele if necessary with an empty string
                    if rec['chrom'] not in seqdict:
                        seqdict[rec['chrom']] = str()
                        # Since this is the first position in a "new" gene/allele, reset the pos variable to 0
                        pos = 0
                        # There seems to be a bug in pysamstats with how gaps at the start of the sequence are treated.
                        # Although the position is correct, the whole reference sequence is still included, rather than
                        # starting at where the gap ends
                        if rec['pos'] > pos:
                            # If there is a gap of 173 bases at the beginning of the match, the reference sequence
                            # still should start at 0, but it starts at 173, therefore, the match actually starts at
                            # 2 * 173 = 346
                            correction = 2 * rec['pos']
                            # The number of gaps is equal to the starting position
                            gapdict[rec['chrom']] = rec['pos']
                            # The actual position will be rec['pos']
                            pos = rec['pos']
                    # Allow the position to reach the calculated correction factor
                    if rec['pos'] >= correction:
                        # Initialise gap dict with 0 gaps
                        if rec['chrom'] not in gapdict:
                            gapdict[rec['chrom']] = 0
                        # If there is a gap in the alignment, record the size of the gap in gapdict
                        if int(rec['pos']) > pos:
                            # Add the gap size to gap dict
                            gapdict[rec['chrom']] += rec['pos'] - pos
                            # Add dashes to the sequence to indicate the gap
                            seqdict[rec['chrom']] += 'N' * (int(rec['pos'] - pos))
                            # Set the expected position to the current position
                            pos = int(rec['pos'])
                        # Increment pos in preparation for the next iteration
                        pos += 1
                        # Initialise snpdict if necessary
                        if rec['chrom'] not in snpdict:
                            snpdict[rec['chrom']] = 0
                        # Initialise the current gene/allele in depthdict with the depth (reads_all) if necessary,
                        # otherwise add the current depth to the running total
                        if rec['chrom'] not in depthdict:
                            depthdict[rec['chrom']] = int(rec['reads_all'])
                        else:
                            depthdict[rec['chrom']] += int(rec['reads_all'])
                        # Dictionary of bases and the number of times each base was observed per position
                        bases = {'A': rec['A'], 'C': rec['C'], 'G': rec['G'], 'T': rec['T']}
                        # Track any deletions prior to the sequence
                        if rec['deletions'] > rec['matches']:
                            seqdict[rec['chrom']] += 'N'
                            # Increment the running total of the number of SNPs
                            snpdict[rec['chrom']] += 1
                        else:
                            if rec['matches'] > 0 or rec['mismatches'] > 0:
                                # If the most prevalent base (calculated with max() and operator.itemgetter())
                                # doesn't match the reference base, add this prevalent base to seqdict
                                if max(bases.items(), key=operator.itemgetter(1))[0] != rec['ref']:
                                    seqdict[rec['chrom']] += max(bases.items(), key=operator.itemgetter(1))[0]
                                    # Increment the running total of the number of SNPs
                                    snpdict[rec['chrom']] += 1
                                else:
                                    # If the bases match, add the reference base to seqdict
                                    seqdict[rec['chrom']] += (rec['ref'])
                                    # Initialise posdict if necessary, otherwise, increment the running total of matches
                                    if rec['chrom'] not in matchdict:
                                        matchdict[rec['chrom']] = 1
                                    else:
                                        matchdict[rec['chrom']] += 1
            # If there are no results in the bam file, then pass over the strain
            except ValueError:
                pass
            # Iterate through all the genes/alleles with results above
            for allele in sorted(matchdict):
                # If the length of the match is greater or equal to the length of the gene/allele (multiplied by the
                # cutoff value) as determined using faidx indexing, then proceed
                # Calculate the average depth by dividing the total number of reads observed by the length of the gene
                averagedepth = float(depthdict[allele]) / float(matchdict[allele])
                percentidentity = float(matchdict[allele]) / float(faidict[allele]) * 100
                if percentidentity >= self.cutoff * 100:
                    # Only report a positive result if this average depth is greater than 4X
                    if averagedepth > 4:
                        # Populate resultsdict with the gene/allele name, the percent identity, and the average depth
                        resultsdict.update({allele: {'{:.2f}'.format(percentidentity): '{:.2f}'.format(averagedepth)}})
            # Add the results to the object
            sample[self.analysistype].allelematches[vtx] = resultsdict
            # Determine if there are alleles without a 100% match
            if not resultsdict:
                for allele in sorted(matchdict):
                    # Filter the alleles to only include the vtx subunit
                    if vtx in allele:
                        percentidentity = float(matchdict[allele]) / float(faidict[allele]) * 100
                        # Use a more relaxed cutoff to find the closest alleles
                        if percentidentity >= self.cutoff * 50:
                            uniqueresults.update({allele: percentidentity})
                try:
                    # Find the best match (highest percent identity)
                    closestallele = max(uniqueresults.items(), key=operator.itemgetter(1))[0]
                    percentidentity = max(uniqueresults.items(), key=operator.itemgetter(1))[1]
                    averagedepth = float(depthdict[closestallele]) / float(matchdict[closestallele])
                    # Populate the metadata with the results
                    sample[self.analysistype].newsequences[vtx] = seqdict[closestallele]
                    sample[self.analysistype].newseqclosestmatch[vtx] = \
                        {closestallele: {'{:.2f}'.format(percentidentity): '{:.2f}'.format(averagedepth)}}
                    sample[self.analysistype].newseqclosestseq[vtx] = {closestallele: refdict[closestallele]}
                except ValueError:
                    pass
            self.parsequeue.task_done()