Esempio n. 1
0
class BaseSeq(Helper):
    """Driver that chains the barcode, consensus, variant and chain steps.

    Each public method builds the collaborator it needs (``BarCode``,
    ``Consensus``, ``Quark``, ``VCF``, ``Chain``) from the paths stored on the
    instance and reports progress as timestamped lines.  ``get_time`` is not
    defined in this class; it is presumably inherited from ``Helper``.
    """

    def __init__(self, bam, barcodes=None, out=None, ref=None,
                 rewritten_bam=None,
                 consensus_reference=None,
                 consensus_genomes=None,
                 haplotype_distribution=None,
                 vcf=None,
                 chain=None,
                 crossmap=None,
                 export=None,
                 rank=None,
                 debug=None):
        """Store the input/output locations used by the pipeline steps.

        Args:
            bam: Passed to ``BarCode`` as the input alignment.
            barcodes: Path of the barcode table (read and written as text).
            out: Path of the text output written by several steps.
            ref: Passed to ``Reference``.
            rewritten_bam: Path handed to ``BarCode.sort_and_rewrite_bam``.
            consensus_reference: Passed to ``Consensus.infer_consensus``.
            consensus_genomes: Passed to
                ``Consensus.output_consensus_genomes``.
            haplotype_distribution: Passed to
                ``Consensus.output_haplotype_distribution``.
            vcf: Passed to ``VCF``.
            chain: Passed to ``Chain``.
            crossmap: Passed to ``VCF`` as ``crossmap``.
            export: Passed to ``BarCode.split_bam_into_barcodes``.
            rank: Passed to ``Quark.rank_it``.
            debug: Debug level, coerced with ``int``; ``None`` means 0.
        """
        self.bam = bam
        self.barcodes = barcodes
        self.out = out
        self.ref = Reference(ref)

        self.rewritten_bam = rewritten_bam
        # Sorted sibling of the rewritten BAM; every ".bam" occurrence in the
        # path is substituted, matching how the sort prefix is derived later.
        self.rewritten_sorted_bam = (
            rewritten_bam.replace(".bam", ".sorted.bam")
            if rewritten_bam else None
        )
        self.consensus_reference = consensus_reference
        self.consensus_genomes = consensus_genomes
        self.haplotype_distribution = haplotype_distribution
        self.vcf = vcf
        self.chain = chain
        self.crossmap = crossmap
        self.export = export
        self.rank = rank
        # int(None) raises TypeError, which made the default unusable;
        # an omitted debug level is treated as 0.
        self.debug = int(debug) if debug is not None else 0

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    def _log_step(self, message, stream=None):
        """Write ``[<time>] <message>`` plus a newline to *stream*.

        *stream* defaults to ``sys.stderr``, looked up at call time.
        """
        target = sys.stderr if stream is None else stream
        target.write("[%s] %s\n" % (self.get_time(), message))

    def _sort_rewritten_bam(self):
        """Name-sort the rewritten BAM via ``pysam.sort``.

        NOTE(review): this is the legacy ``sort -n <in> <out-prefix>`` call
        form; newer pysam/samtools releases expect ``-o <out.bam>`` -- confirm
        against the pinned pysam version before changing it.
        """
        sorted_prefix = self.rewritten_sorted_bam.replace(".bam", "")
        pysam.sort("-n", self.rewritten_bam, sorted_prefix)

    def _infer_and_export_consensus(self):
        """Infer the consensus, then write genomes and haplotype distribution."""
        self.consensus.infer_consensus(self.consensus_reference)
        self._log_step("Inferred Consensus")

        self.consensus.output_consensus_genomes(self.consensus_genomes)
        self._log_step("Output Consensus Genomes")

        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        self._log_step("Output Haplotype Distribution")

    def _export_vcf(self):
        """Collect variants against the reference sequence and write the VCF."""
        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence,
                               self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        self._log_step("Output VCF")

    def _export_chain(self):
        """Write the chain file from the inferred consensus and structure."""
        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref,
                                 self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        self._log_step("Output Chain File")

    def _write_summary(self, handle):
        """Write the coverage, variant and barcode sections to *handle*.

        Requires ``self.consensus``, ``self.ovcf`` and ``self.bc`` to be set.
        """
        self.consensus.output_consensus_coverage(handle)
        self.ovcf.output_variants_distribution(handle)
        self.bc.output_reads_in_barcode_distribution(handle)

    # ------------------------------------------------------------------
    # public steps
    # ------------------------------------------------------------------

    def get_barcodes(self):
        """Detect barcodes and write them to ``self.out``.

        One line is written per barcode, in sorted order:
        ``<barcode>\\t<comma-joined sorted read ids>``.
        """
        # simple approach - align, take soft-clipped, and use the arbitrary 20 bases
        # intermediate approach - use the seed and extend approach
        # The output is opened before the analysis (as before), but the
        # ``with`` block guarantees it is closed if the analysis raises.
        with open(self.out, "w") as out:
            self.bc = BarCode(self.bam)
            self.bc.simple_approach()

            for barcode, read_ids in sorted(self.bc.barcode_to_read.items()):
                out.write("%s\t%s\n" % (barcode, ",".join(sorted(read_ids))))

    def error_correction_barcodes(self):
        """Load the barcode table and cluster the barcodes.

        NOTE(review): progress goes to stdout here, while every other step
        logs to stderr; kept as-is because callers may capture this stream.
        """
        # start analysis
        self.bc = BarCode(self.bam)
        self._log_step("Starting Error Correction Analysis", stream=sys.stdout)

        # load barcodes
        self.bc.load_barcodes(self.barcodes)
        self._log_step("Loaded BarCodes", stream=sys.stdout)

        # cluster barcodes
        self.bc.cluster_barcodes()
        self._log_step("Clustered BarCodes", stream=sys.stdout)

    def filter_barcodes(self, barcode, export="fastq"):
        """Export the reads listed for *barcode* in the barcode table.

        The table at ``self.barcodes`` is scanned for the first line whose
        first tab-separated field equals *barcode*; its second field is split
        on commas into read ids.  If no line matches, an empty id list is
        passed on.

        Args:
            barcode: Barcode string to look up.
            export: Forwarded to ``BarCode.filter_and_export``.
        """
        list_of_ids = []
        with open(self.barcodes, "r") as table:
            for line in table:
                fields = line.strip("\r\n").split("\t")
                if fields[0] == barcode:
                    list_of_ids = fields[1].split(",")
                    break

        self.bc = BarCode(self.bam)
        self.bc.filter_and_export(list_of_ids, self.out, export=export)

    def sort_and_rewrite_bam(self):
        """Rewrite the BAM using the loaded barcodes, then name-sort it."""
        self.bc = BarCode(self.bam)
        self._log_step("Starting Sort and Rewrite BAM")

        self.bc.load_barcodes(self.barcodes)
        self._log_step("Loaded BarCodes")

        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        self._sort_rewritten_bam()
        self._log_step("Sort and Rewrite BAM")

    def split_bam_by_barcode(self):
        """Delegate to ``BarCode.split_bam_into_barcodes`` with ref/out/export."""
        self.bc = BarCode(self.bam)
        self._log_step("Starting procedure to split BAM by barcode")

        self.bc.split_bam_into_barcodes(self.ref, self.out, self.export)
        self._log_step("Finished splitting BAM by barcode id")

    def assemble_consensus_genomes(self):
        """Build consensus genomes from the sorted BAM and write all outputs.

        Runs, in order: consensus build/inference and export, the Quark
        distance/graph/rank analysis, VCF output, summary statistics and the
        chain file.
        """
        # build consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        self._log_step("Starting Consensus Building")

        self.consensus.build(debug=self.debug)
        self._log_step("Built and Calculated Consensus")

        self._infer_and_export_consensus()

        self.quark = Quark(self.ref.sequence)
        # Entries are ordered by their frequency value, highest first.
        by_frequency = sorted(self.consensus.freq_distribution.items(),
                              key=lambda q: q[1],
                              reverse=True)
        self.quark.distance_matrix(by_frequency)
        self.quark.graph_it()
        self.quark.rank_it(self.rank)
        self._log_step("Output Quark Analysis")

        self._export_vcf()

        self.summary_statistics()
        self._log_step("Output Summary Statistics")

        self._export_chain()

    def summary_statistics(self):
        """Write summary sections to ``self.out``.

        Expects ``self.consensus`` and ``self.ovcf`` to have been set by an
        earlier step; ``self.bc`` is rebuilt from the barcode table here.
        """
        # coverage per genome
        # variants per genome
        # estimate PCR and sequencing errors
        # barcode distribution
        with open(self.out, "w") as f_out:
            self.bc = BarCode(self.bam)  # TEMP
            self.bc.load_barcodes(self.barcodes)  # TEMP

            self._write_summary(f_out)

    def run(self):
        """Run the whole pipeline: barcodes, BAM rewrite, consensus, outputs."""
        # Phase 1 - Detection of BarCode
        self.bc = BarCode(self.bam)
        self._log_step("Starting BarCode Analysis ")

        self.bc.simple_approach()
        self._log_step("Analyzed BarCodes ")

        self.bc.write_barcodes(self.barcodes)
        self._log_step("Wrote BarCodes")

        # Phase 2 - Rewrite BAM
        self._log_step("Starting Sort and Rewrite BAM")

        self.bc.load_barcodes(self.barcodes)
        self._log_step("Loaded BarCodes")

        # The same BarCode object already went through simple_approach(), so
        # its BAM handle is reset before the rewrite pass.
        self.bc.bam.reset()
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        self._sort_rewritten_bam()
        self._log_step("Sort and Rewrite BAM")

        # Phase 3 - Build Consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        self._log_step("Starting Consensus Building")

        # NOTE(review): assemble_consensus_genomes() passes debug=self.debug
        # here; confirm whether run() should do the same.
        self.consensus.build()
        self._log_step("Built and Calculated Consensus")

        # Phase 4 - Call Variants and Haplotypes
        self._infer_and_export_consensus()
        self._export_vcf()

        # Phase 5 - Summary Statistics and Chain Files
        with open(self.out, "w") as f_out:
            self._write_summary(f_out)
        self._log_step("Output Summary Statistics")

        self._export_chain()

    def assemble_genomes(self):
        """Placeholder; not implemented."""
        pass

    def assemble_genomes_from_fastq(self):
        """Placeholder; not implemented."""
        pass