def multi_align_samples(self, retree=1):
    """Record per-sample sequence IDs, then align the transposed FASTA files with MAFFT.

    First, every FASTA file in ``self.genomeFastas`` is parsed and the IDs of
    its records are stored in ``self.sequence_order`` as
    ``{sample_name: [sequence_id1, sequence_id2, ...]}``.

    Then ``self.ref`` followed by all sample FASTA files is handed to
    ``interhost.transposeChromosomeFiles`` (presumably yielding one file per
    chromosome/segment -- confirm against that helper), and each resulting
    file is aligned with MAFFT. Whatever ``MafftTool.execute`` returns for
    each run is appended to ``self.alignedFastas``.

    Args:
        retree: forwarded unchanged to ``MafftTool.execute`` as ``retree``.
    """
    # Python 2 mappings expose iteritems(); Python 3 mappings only have items().
    if hasattr(self.genomeFastas, "iteritems"):
        genomeKVIterator = self.genomeFastas.iteritems()
    else:
        genomeKVIterator = self.genomeFastas.items()

    for sampleName, fastaFile in genomeKVIterator:
        with util.file.open_or_gzopen(fastaFile, 'r') as inf:
            for seq in Bio.SeqIO.parse(inf, 'fasta'):
                # The default is passed positionally: the built-in
                # dict.setdefault() rejects keyword arguments, so the
                # positional form works for any mapping type.
                self.sequence_order.setdefault(sampleName, []).append(seq.id)

    # The reference goes first, followed by every sample FASTA file.
    inputFastas = [self.ref]
    inputFastas.extend(self.genomeFastas.values())
    transposedFiles = interhost.transposeChromosomeFiles(inputFastas)

    for idx, filePath in enumerate(transposedFiles):
        # A temp file name is requested only to learn which directory
        # temp files live in; the alignment goes to "aligned<idx>.fasta"
        # in that same directory.
        outFile = util.file.mkstempfname('.fasta')
        outFilePath = os.path.dirname(outFile)
        alignedOutFile = tools.mafft.MafftTool().execute(
            inFastas=[os.path.abspath(filePath)],
            outFile=os.path.join(outFilePath, "{}{}.fasta".format("aligned", idx)),
            localpair=False,
            globalpair=True,
            preservecase=True,
            reorder=None,
            gapOpeningPenalty=None,
            offset=None,
            verbose=False,
            outputAsClustal=None,
            maxiters=1000,
            threads=-1,
            retree=retree)
        self.alignedFastas.append(alignedOutFile)
def multi_align_samples(self, retree=1):
    """Record per-sample sequence IDs, then align the transposed FASTA files with MAFFT.

    First, every FASTA file in ``self.genomeFastas`` is parsed and the IDs of
    its records are stored in ``self.sequence_order`` as
    ``{sample_name: [sequence_id1, sequence_id2, ...]}``.

    Then ``self.ref`` followed by all sample FASTA files is handed to
    ``interhost.transposeChromosomeFiles`` (presumably yielding one file per
    chromosome/segment -- confirm against that helper), and each resulting
    file is aligned with MAFFT. Whatever ``MafftTool.execute`` returns for
    each run is appended to ``self.alignedFastas``.

    Args:
        retree: forwarded unchanged to ``MafftTool.execute`` as ``retree``.
    """
    # Python 2 mappings expose iteritems(); Python 3 mappings only have items().
    if hasattr(self.genomeFastas, "iteritems"):
        genomeKVIterator = self.genomeFastas.iteritems()
    else:
        genomeKVIterator = self.genomeFastas.items()

    for sampleName, fastaFile in genomeKVIterator:
        with util.file.open_or_gzopen(fastaFile, 'r') as inf:
            for seq in Bio.SeqIO.parse(inf, 'fasta'):
                # The default is passed positionally: the built-in
                # dict.setdefault() rejects keyword arguments, so the
                # positional form works for any mapping type.
                self.sequence_order.setdefault(sampleName, []).append(seq.id)

    # The reference goes first, followed by every sample FASTA file.
    inputFastas = [self.ref]
    inputFastas.extend(self.genomeFastas.values())
    transposedFiles = interhost.transposeChromosomeFiles(inputFastas)

    for idx, filePath in enumerate(transposedFiles):
        # A temp file name is requested only to learn which directory
        # temp files live in; the alignment goes to "aligned<idx>.fasta"
        # in that same directory.
        outFile = util.file.mkstempfname('.fasta')
        outFilePath = os.path.dirname(outFile)
        alignedOutFile = tools.mafft.MafftTool().execute(
            inFastas=[os.path.abspath(filePath)],
            outFile=os.path.join(outFilePath, "{}{}.fasta".format("aligned", idx)),
            localpair=False,
            globalpair=True,
            preservecase=True,
            reorder=None,
            gapOpeningPenalty=None,
            offset=None,
            verbose=False,
            outputAsClustal=None,
            maxiters=1000,
            threads=-1,
            retree=retree)
        self.alignedFastas.append(alignedOutFile)
def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False):
    """ Write or print pairwise alignment summary information for
        sequences in two FASTA files, including SNPs, ambiguous bases,
        and indels.

        The two inputs are passed through interhost.transposeChromosomeFiles;
        each resulting file is aligned with MUSCLE and the first two rows of
        the alignment are compared column by column. Counters:

            same_unambig   both bases unambiguous and equal
            snp_unambig    both bases unambiguous and different
            indel_unambig  a gap in one row opposite an unambiguous base
            indel_ambig    a gap in one row opposite an 'N'
            ambig_one      'N' in the first row only
            ambig_two      'N' in the second row only
            ambig_both     'N' in both rows
            unambig_both   both bases unambiguous

        "Unambiguous" means membership in IUPACUnambiguousDNA().letters.
        (The first/second alignment rows presumably correspond to
        inFastaFileOne/inFastaFileTwo -- confirm against the aligner's
        output ordering.)

        Args:
            inFastaFileOne: path to the first FASTA file.
            inFastaFileTwo: path to the second FASTA file.
            outfileName: if truthy, a tab-delimited file is written there
                with a header row of counter names and a row of totals.
            printCounts: if true, per-segment counts and overall totals
                are printed to stdout.

        Returns:
            None.
    """
    gap = '-'
    ambiguous = 'N'
    # Loop-invariant: look the alphabet letters up once instead of
    # re-instantiating the alphabet several times per alignment column.
    unambiguous = IUPACUnambiguousDNA().letters

    # (result key, label used when printing), in output order.
    counters = (
        ("same_unambig", "same_unambig "),
        ("snp_unambig", "snp_unambig "),
        ("indel_unambig", "indel_unambig"),
        ("indel_ambig", "indel_ambig "),
        ("ambig_one", "ambig_one "),
        ("ambig_two", "ambig_two "),
        ("ambig_both", "ambig_both "),
        ("unambig_both", "unambig_both "),
    )

    aligner = tools.muscle.MuscleTool()
    per_chr_fastas = interhost.transposeChromosomeFiles([inFastaFileOne, inFastaFileTwo])

    # Totals across all segments; OrderedDict fixes the column order of
    # the optional output file.
    results = OrderedDict((key, 0) for key, _ in counters)

    for chr_fasta in per_chr_fastas:
        counts = dict((key, 0) for key, _ in counters)

        alignOutFileName = util.file.mkstempfname('.fasta')
        aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

        with open(alignOutFileName, "r") as f:
            alignment = Bio.AlignIO.read(f, "clustal")

        for col_idx in range(0, alignment.get_alignment_length()):
            col = alignment[:, col_idx]
            c1 = col[0]
            c2 = col[1]

            c1_ambig = c1 in ambiguous
            c2_ambig = c2 in ambiguous
            c1_unambig = c1 in unambiguous
            c2_unambig = c2 in unambiguous

            if c1_ambig and c2_ambig:
                counts["ambig_both"] += 1
            elif c1_ambig:
                counts["ambig_one"] += 1
            elif c2_ambig:
                counts["ambig_two"] += 1

            if c1_unambig and c2_unambig:
                counts["unambig_both"] += 1
                if c1 == c2:
                    counts["same_unambig"] += 1
                else:
                    counts["snp_unambig"] += 1

            if (c1 == gap and c2_unambig) or (c2 == gap and c1_unambig):
                counts["indel_unambig"] += 1
            if (c1 == gap and c2_ambig) or (c2 == gap and c1_ambig):
                counts["indel_ambig"] += 1

        if printCounts:
            print("Counts for this segment/chromosome:")
            for key, label in counters:
                print(label, counts[key])

        for key, _ in counters:
            results[key] += counts[key]

    if printCounts:
        print("\nCounts for this sample:")
        for key, label in counters:
            print(label, results[key])

    if outfileName:
        with open(outfileName, "wt") as of:
            csvout = csv.writer(of, delimiter='\t')
            csvout.writerow(list(results.keys()))
            csvout.writerow(list(results.values()))
def alignment_summary(inFastaFileOne, inFastaFileTwo, outfileName=None, printCounts=False):
    """ Write or print pairwise alignment summary information for
        sequences in two FASTA files, including SNPs, ambiguous bases,
        and indels.

        The two inputs are passed through interhost.transposeChromosomeFiles;
        each resulting file is aligned with MUSCLE and the first two rows of
        the alignment are compared column by column. Counters:

            same_unambig   both bases unambiguous and equal
            snp_unambig    both bases unambiguous and different
            indel_unambig  a gap in one row opposite an unambiguous base
            indel_ambig    a gap in one row opposite an 'N'
            ambig_one      'N' in the first row only
            ambig_two      'N' in the second row only
            ambig_both     'N' in both rows
            unambig_both   both bases unambiguous

        "Unambiguous" means membership in IUPACUnambiguousDNA().letters.
        (The first/second alignment rows presumably correspond to
        inFastaFileOne/inFastaFileTwo -- confirm against the aligner's
        output ordering.)

        Args:
            inFastaFileOne: path to the first FASTA file.
            inFastaFileTwo: path to the second FASTA file.
            outfileName: if truthy, a tab-delimited file is written there
                with a header row of counter names and a row of totals.
            printCounts: if true, per-segment counts and overall totals
                are printed to stdout.

        Returns:
            None.
    """
    gap = '-'
    ambiguous = 'N'
    # Loop-invariant: look the alphabet letters up once instead of
    # re-instantiating the alphabet several times per alignment column.
    unambiguous = IUPACUnambiguousDNA().letters

    # (result key, label used when printing), in output order.
    counters = (
        ("same_unambig", "same_unambig "),
        ("snp_unambig", "snp_unambig "),
        ("indel_unambig", "indel_unambig"),
        ("indel_ambig", "indel_ambig "),
        ("ambig_one", "ambig_one "),
        ("ambig_two", "ambig_two "),
        ("ambig_both", "ambig_both "),
        ("unambig_both", "unambig_both "),
    )

    aligner = tools.muscle.MuscleTool()
    per_chr_fastas = interhost.transposeChromosomeFiles(
        [inFastaFileOne, inFastaFileTwo])

    # Totals across all segments; OrderedDict fixes the column order of
    # the optional output file.
    results = OrderedDict((key, 0) for key, _ in counters)

    for chr_fasta in per_chr_fastas:
        counts = dict((key, 0) for key, _ in counters)

        alignOutFileName = util.file.mkstempfname('.fasta')
        aligner.execute(chr_fasta, alignOutFileName, fmt="clw")

        with open(alignOutFileName, "r") as f:
            alignment = Bio.AlignIO.read(f, "clustal")

        for col_idx in range(0, alignment.get_alignment_length()):
            col = alignment[:, col_idx]
            c1 = col[0]
            c2 = col[1]

            c1_ambig = c1 in ambiguous
            c2_ambig = c2 in ambiguous
            c1_unambig = c1 in unambiguous
            c2_unambig = c2 in unambiguous

            if c1_ambig and c2_ambig:
                counts["ambig_both"] += 1
            elif c1_ambig:
                counts["ambig_one"] += 1
            elif c2_ambig:
                counts["ambig_two"] += 1

            if c1_unambig and c2_unambig:
                counts["unambig_both"] += 1
                if c1 == c2:
                    counts["same_unambig"] += 1
                else:
                    counts["snp_unambig"] += 1

            if (c1 == gap and c2_unambig) or (c2 == gap and c1_unambig):
                counts["indel_unambig"] += 1
            if (c1 == gap and c2_ambig) or (c2 == gap and c1_ambig):
                counts["indel_ambig"] += 1

        if printCounts:
            print("Counts for this segment/chromosome:")
            for key, label in counters:
                print(label, counts[key])

        for key, _ in counters:
            results[key] += counts[key]

    if printCounts:
        print("\nCounts for this sample:")
        for key, label in counters:
            print(label, results[key])

    if outfileName:
        with open(outfileName, "wt") as of:
            csvout = csv.writer(of, delimiter='\t')
            csvout.writerow(list(results.keys()))
            csvout.writerow(list(results.values()))