class BaseSeq(Helper):
    """Orchestrate the barcode / consensus / variant-calling workflow.

    Each public method drives one stage (or, for ``run``, all stages) by
    delegating to the collaborator classes ``BarCode``, ``Consensus``,
    ``Quark``, ``VCF`` and ``Chain`` and writing timestamped progress lines.

    ``get_time()`` is not defined in this class; it is presumably provided
    by ``Helper`` -- confirm against the base class.
    """

    def __init__(self, bam, barcodes=None, out=None, ref=None, rewritten_bam=None,
                 consensus_reference=None, consensus_genomes=None,
                 haplotype_distribution=None, vcf=None, chain=None, crossmap=None,
                 export=None, rank=None, debug=None):
        """Store the input/output locations used by the workflow methods.

        Args:
            bam: input handed to ``BarCode`` in every stage.
            barcodes: barcode table path; read by ``filter_barcodes`` and
                passed to ``BarCode.load_barcodes`` / ``write_barcodes``.
            out: path opened for writing by ``get_barcodes``,
                ``summary_statistics`` and ``run``; also forwarded to
                ``BarCode`` export helpers.
            ref: passed to ``Reference``; the resulting object's
                ``sequence`` attribute is used by later stages.
            rewritten_bam: destination for ``BarCode.sort_and_rewrite_bam``.
            consensus_reference, consensus_genomes, haplotype_distribution:
                forwarded to the matching ``Consensus`` methods.
            vcf, crossmap: forwarded to ``VCF``.
            chain: forwarded to ``Chain``.
            export: forwarded to ``BarCode.split_bam_into_barcodes``.
            rank: forwarded to ``Quark.rank_it``.
            debug: integer-like debug level; ``None`` is treated as ``0``.
        """
        self.bam = bam
        self.barcodes = barcodes
        self.out = out
        self.ref = Reference(ref)
        self.rewritten_bam = rewritten_bam
        # Derived name of the name-sorted copy ("x.bam" -> "x.sorted.bam").
        self.rewritten_sorted_bam = (
            rewritten_bam.replace(".bam", ".sorted.bam") if rewritten_bam else None
        )
        self.consensus_reference = consensus_reference
        self.consensus_genomes = consensus_genomes
        self.haplotype_distribution = haplotype_distribution
        self.vcf = vcf
        self.chain = chain
        self.crossmap = crossmap
        self.export = export
        self.rank = rank
        # int(None) raises TypeError, and None is the declared default, so
        # fall back to 0 when no debug level was supplied.
        self.debug = int(debug) if debug is not None else 0

    def get_barcodes(self):
        """Detect barcodes and write ``barcode<TAB>id1,id2,...`` lines to ``self.out``.

        Barcodes are written in sorted order, and the ids of each barcode are
        sorted as well.
        """
        # simple approach - align, take soft-clipped, and use the arbitrary 20 bases
        # intermediate approach - use the seed and extend approach
        # The output file is opened (and truncated) before the analysis runs.
        with open(self.out, "w") as out:
            self.bc = BarCode(self.bam)
            self.bc.simple_approach()
            for barcode, reads in sorted(self.bc.barcode_to_read.items()):
                out.write("%s\t%s\n" % (barcode, ",".join(sorted(reads))))

    def error_correction_barcodes(self):
        """Load the barcode table and cluster the barcodes.

        Progress is reported on stdout here, unlike the other stages which
        report on stderr.
        """
        # start analysis
        self.bc = BarCode(self.bam)
        sys.stdout.write("[%s] Starting Error Correction Analysis\n" % (self.get_time(),))
        # load barcodes
        self.bc.load_barcodes(self.barcodes)
        sys.stdout.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
        # cluster barcodes
        self.bc.cluster_barcodes()
        sys.stdout.write("[%s] Clustered BarCodes\n" % (self.get_time(),))

    def filter_barcodes(self, barcode, export="fastq"):
        """Export the entries listed for ``barcode`` in the barcode table.

        The table at ``self.barcodes`` is scanned for the first line whose
        first tab-separated field equals ``barcode``; its second field is
        split on commas to obtain the id list. If no line matches, an empty
        list is passed on.

        Args:
            barcode: barcode string to look up.
            export: export format forwarded to ``BarCode.filter_and_export``.
        """
        list_of_ids = []
        with open(self.barcodes, "r") as f:
            for line in f:
                data = line.strip("\r\n").split("\t")
                if barcode == data[0]:
                    list_of_ids = data[1].split(",")
                    break
        self.bc = BarCode(self.bam)
        self.bc.filter_and_export(list_of_ids, self.out, export=export)

    def sort_and_rewrite_bam(self):
        """Rewrite the BAM using the loaded barcodes, then name-sort it."""
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        # The last argument is the sorted file name with ".bam" stripped,
        # i.e. an output prefix rather than a full file name.
        pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

    def split_bam_by_barcode(self):
        """Delegate splitting of the BAM by barcode to ``BarCode``."""
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting procedure to split BAM by barcode\n" % (self.get_time(),))
        self.bc.split_bam_into_barcodes(self.ref, self.out, self.export)
        sys.stderr.write("[%s] Finished splitting BAM by barcode id\n" % (self.get_time(),))

    def assemble_consensus_genomes(self):
        """Build consensus genomes from the sorted rewritten BAM and emit all reports.

        Runs, in order: consensus building and inference, consensus genome
        and haplotype distribution output, Quark analysis, VCF output,
        summary statistics and chain file output.
        """
        # build consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))
        self.consensus.build(debug=self.debug)
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))
        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))

        self.quark = Quark(self.ref.sequence)
        # Entries of freq_distribution ordered by their value, largest first.
        self.quark.distance_matrix(
            sorted(self.consensus.freq_distribution.items(), key=lambda q: q[1], reverse=True)
        )
        self.quark.graph_it()
        self.quark.rank_it(self.rank)
        sys.stderr.write("[%s] Output Quark Analysis\n" % (self.get_time(),))

        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence, self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))

        self.summary_statistics()
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))

        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref, self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))

    def summary_statistics(self):
        """Write coverage, variant and barcode distributions to ``self.out``.

        Reads ``self.consensus`` and ``self.ovcf``, so those attributes must
        already have been set (``assemble_consensus_genomes`` sets both
        before calling this method).
        """
        # coverage per genome
        # variants per genome
        # estimate PCR and sequencing errors
        # barcode distribution
        with open(self.out, "w") as f_out:
            self.bc = BarCode(self.bam)  # TEMP
            self.bc.load_barcodes(self.barcodes)  # TEMP
            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)

    def run(self):
        """Run the full workflow, phases 1 through 5, in a single call."""
        # Phase 1 - Detection of BarCode
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting BarCode Analysis \n" % (self.get_time(),))
        self.bc.simple_approach()
        sys.stderr.write("[%s] Analyzed BarCodes \n" % (self.get_time(),))
        self.bc.write_barcodes(self.barcodes)
        sys.stderr.write("[%s] Wrote BarCodes\n" % (self.get_time(),))

        # Phase 2 - Rewrite BAM
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))
        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))
        # The same BarCode object is reused from phase 1, so its BAM handle
        # is reset before the rewrite pass.
        self.bc.bam.reset()
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

        # Phase 3 - Build Consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))
        # NOTE(review): unlike assemble_consensus_genomes, no debug level is
        # passed to build() here -- confirm this is intentional.
        self.consensus.build()
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))
        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))

        # Phase 4 - Call Variants and Haplotypes
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))
        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))
        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence, self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))

        # Phase 5 - Summary Statistics and Chain Files
        with open(self.out, "w") as f_out:
            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))
        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref, self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))

    def assemble_genomes(self):
        """Placeholder; not implemented."""
        pass

    def assemble_genomes_from_fastq(self):
        """Placeholder; not implemented."""
        pass