Exemple #1
0
    def convert_to_fasta(self, fasta_file_prefix):
        """Convert the reads of this BAM file into a FASTA file.

        ``self.file`` is first converted to SAM with Picard's
        ``SamFormatConverter``; every non-header SAM record is then written
        out as a two-line FASTA entry (``>name`` followed by the sequence).

        fasta_file_prefix -- output path without extension; ``.fasta`` is
                             appended to it.

        Returns the name of the FASTA file that was written.
        """
        # Swap only a trailing '.bam' for '.sam'.  str.replace() would also
        # rewrite '.bam' occurring elsewhere in the path and, for a name
        # without that suffix, would make OUTPUT identical to INPUT.
        if self.file.endswith('.bam'):
            samfile_name = self.file[:-len('.bam')] + '.sam'
        else:
            samfile_name = self.file + '.sam'

        fasta_conversion_cmd = [
            'java', '-jar', constant.PICARD + "picard.jar",
            "SamFormatConverter",
            "INPUT=" + self.file,
            "OUTPUT=" + samfile_name,
        ]
        rc = RunCommand(fasta_conversion_cmd)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(fasta_conversion_cmd)
        out = rc.run_command()
        print(out)

        # convert sam file into fasta file of reads
        fasta_file_name = fasta_file_prefix + '.fasta'
        # Context managers close both handles even if a record fails.
        with open(samfile_name, 'r') as sam_file, \
                open(fasta_file_name, 'w') as fasta_file:
            for line in sam_file:
                if line.startswith('@'):
                    # header line, carries no read
                    continue
                fields = line.split('\t')
                if len(fields) < 10:
                    # blank or truncated record: no sequence column to emit
                    continue
                # column 1 is the read name, column 10 the read sequence
                fasta_file.write('>' + fields[0] + '\n' + fields[9] + '\n')
        return fasta_file_name
Exemple #2
0
    def sort_bam(self):
        """Run ``samtools sort`` from the input file to the output file.

        The command result is not inspected; the method always returns 1.
        """
        sort_args = [constant.SAMTOOLS, "sort"]
        sort_args.append(self.get_input_file())
        sort_args.append(self.get_output_file())
        RunCommand(sort_args).run_command()
        return 1
Exemple #3
0
	def align_with_GPU(self):
		"""Align ``self.unaligned_seq`` with the gpas binary.

		The alignment is written to ``<modeldir>out.ali`` using the BLOSUM62
		substitution matrix; the path is stored in ``self.gpas_aligned_file``
		and then converted via ``self.convert_gpas_to_fasta()``.
		"""
		result = self.modeldir + "out.ali"
		# Debug output; the single-argument call form prints the same text on
		# Python 2 and is also valid syntax on Python 3.
		print(os.getcwd())
		print(self.unaligned_seq)
		print(result)
		# NOTE(review): the list elements are command fragments, not separate
		# argv tokens -- presumably RunCommand joins them into one shell
		# string; confirm before changing the tokenisation.
		gpas = RunCommand(["./gpas/gpas -sm gpas/data/substitution_matrices/BLOSUM62.txt -a nwg -i ", self.unaligned_seq ," -o ", result])
		gpas.run()

		self.gpas_aligned_file = result
		self.convert_gpas_to_fasta()
Exemple #4
0
 def is_mapped(self):
     """Check the header lines of the input file.

     For SAM input the header lines are collected with ``grep "^@"``; for
     BAM input with ``samtools view -H``.  Any other format returns 0
     without running a command.  Otherwise the command output is passed to
     ``__check_headers`` and its result is returned.
     """
     if self.__is_sam_format():
         header_cmd = ["grep", "^@", self.get_input_file()]
     elif self.__is_bam_format():
         header_cmd = [constant.SAMTOOLS, "view", "-H", self.get_input_file()]
     else:
         # neither SAM nor BAM
         return 0
     header_text = RunCommand(header_cmd).run_command()
     return self.__check_headers(header_text)
Exemple #5
0
    def convert_to_unmapped_bam(self, java_heap='32G'):
        """Produce the unmapped-BAM output for this file.

        When the read direction is unset or 'FR', the command from
        ``__build_command`` is run; if no command is built, the input is
        symlinked to the output path instead (only when the output does not
        exist yet and differs from the input).  That branch returns 1.

        For any other direction the file is first converted to FASTQ, a
        ``FastqFile`` performs the conversion, the intermediate FASTQ files
        are removed, and the ``FastqFile`` return code is returned.

        java_heap -- heap size string forwarded to the command builders.
        """
        direction = self.__get_direction()
        if direction and direction != 'FR':
            # The first element of the pair returned here is discarded; only
            # the status of the FastqFile conversion below is reported.
            _, fastqs = self.convert_to_fastq(java_heap)
            fq = FastqFile(fastqs, None, None, self.__get_direction(),
                           self.get_output_file())
            status = fq.convert_to_unmapped_bam(java_heap)
            self.__remove_fastq_files(fastqs)
            return status

        cmd = self.__build_command(java_heap)
        if cmd:
            RunCommand(cmd).run_command()
            return 1

        # NOTE(review): `self.is_mapped` is referenced, not called; if it is
        # a method this operand is always true -- confirm whether
        # `self.is_mapped()` was intended.
        if (self.is_mapped
                and not os.path.exists(self.get_output_file())
                and self.get_output_file() != self.get_input_file()):
            os.symlink(self.get_input_file(), self.get_output_file())
        return 1
Exemple #6
0
 def __check_gi_list(self, gi_list):
     """Look up every GI in *gi_list* for which no info is held yet.

     GIs for which ``__have_gi_info`` is false are written, one per line,
     to the scratch file ``tmp.gi_list``; the command built from that file
     is run and its output lines are handed to
     ``__populate_blast_hit_info``.  Nothing happens when every GI is
     already known.

     Exits the process with status -1 if the scratch file cannot be opened
     or the command cannot be built.
     """
     missing_gis = [gi for gi in gi_list if not self.__have_gi_info(gi)]
     if not missing_gis:
         return

     tmp_gi_file = 'tmp.gi_list'
     try:
         out = open(tmp_gi_file, 'w')
     except IOError:
         # Only file-open failures are expected here; a bare except would
         # also swallow KeyboardInterrupt and programming errors.
         print("Can't open tmp gi list file, " + tmp_gi_file)
         sys.exit(-1)
     with out:
         for gi in missing_gis:
             out.write(gi + '\n')

     try:
         command = self.__build_command(tmp_gi_file)
         if not command:
             print("Unable to create blastdbcmd.  Exiting.")
             sys.exit(-1)
         cmd_data = RunCommand(command).run_command(1, 0).split('\n')
         self.__populate_blast_hit_info(cmd_data)
     finally:
         # Remove the scratch file on every path, including the exit above
         # and a failure while running or parsing the command.
         os.remove(tmp_gi_file)
Exemple #7
0
 def __init__(self, bam):
     """Run the command built for *bam* and parse what it prints.

     Stores the BAM name, the ``RunCommand`` instance, the lines read from
     the command's stdout (``flagstat_output``) and the parsed ``stats``.
     """
     self.bam = bam
     runner = RunCommand(self.__build_command(bam))
     self.command = runner
     result = runner.run_command(0)
     self.flagstat_output = result.stdout.readlines()
     self.stats = self.__populate_stats()
Exemple #8
0
class BamFlagstat():
    """This class represents data in samtools flagstat output.

    ``self.stats`` maps a key derived from each flagstat line (for example
    ``'in_total'`` or ``'properly_paired'``) to a ``(qc_pass, qc_fail)``
    tuple of read counts.

    Every public ``num_*`` / ``pct_*`` method takes an optional ``qc_pass``
    selector: ``None`` (default) sums both counts, ``1`` uses only the
    QC-passed count, any other value uses only the QC-failed count.
    ``pct_*`` methods return a string formatted with two decimals, or the
    integer 0 when the percentage is undefined.
    """

    def __init__(self, bam):
        """Run ``samtools flagstat`` on *bam* and parse its output lines."""
        self.bam = bam
        self.command = RunCommand(self.__build_command(bam))
        self.flagstat_output = self.command.run_command(0).stdout.readlines()
        self.stats = self.__populate_stats()

    # private function to make flagstat command
    def __build_command(self, bam):
        return [constant.SAMTOOLS, "flagstat", bam]

    # private function to parse output
    def __populate_stats(self):
        """Parse ``self.flagstat_output`` into ``{key: (qc_pass, qc_fail)}``.

        A line looks like ``<pass> + <fail> <words...> [(extra)]``.  The key
        is the words joined by ``_``; a parenthesised suffix ends the key
        unless it contains ``mapQ``, in which case it is part of the key.
        Lines that do not start with a count (blank lines, stray messages)
        are skipped instead of raising.
        """
        data = {}
        for line in self.flagstat_output:
            if not isinstance(line, str):
                # Pipes opened in binary mode yield bytes on Python 3.
                line = line.decode('utf-8')
            tmp = line.rstrip('\n').split(' ')
            if len(tmp) < 3 or not tmp[0].isdigit():
                continue
            qc_pass = int(tmp[0])
            # Reset per line so a line without '+' cannot inherit the
            # QC-failed count of the previous line.
            qc_fail = int(tmp[2]) if tmp[1].startswith('+') else 0
            words = []
            for word in tmp[3:]:
                if word.startswith('(') and 'mapQ' not in word:
                    break
                words.append(word)
            data['_'.join(words)] = (qc_pass, qc_fail)
        return data

    # private function to get the value from a particular stat
    def __get_value_from_key(self, key, key_tuple):
        return sum(self.stats[key][i] for i in key_tuple)

    # private function to tell whether or not to look at
    # qc_pass, qc_fail or both (default)
    def __get_data_to_check(self, qc_pass=None):
        if qc_pass is None:
            return (0, 1)
        if qc_pass == 1:
            return (0, )
        return (1, )

    # private function to help get the right stat
    def __check_helper(self, key, qc_pass=None):
        return self.__get_value_from_key(key,
                                         self.__get_data_to_check(qc_pass))

    # private function shared by the pct_* methods: part / whole as a
    # two-decimal percentage string, or 0 when whole is 0.  The numerator
    # is only evaluated when the denominator is non-zero.
    def __pct_of(self, part_fn, whole_fn, qc_pass):
        whole = whole_fn(qc_pass)
        if whole == 0:
            return 0
        return "%.2f" % ((float(part_fn(qc_pass)) / whole) * 100)

    # public functions follow to get number value from each key
    def num_total_reads(self, qc_pass=None):
        return self.__check_helper('in_total', qc_pass)

    def num_paired_in_sequencing(self, qc_pass=None):
        return self.__check_helper('paired_in_sequencing', qc_pass)

    def num_properly_paired(self, qc_pass=None):
        return self.__check_helper('properly_paired', qc_pass)

    def num_duplicates(self, qc_pass=None):
        return self.__check_helper('duplicates', qc_pass)

    def num_read1_seqs(self, qc_pass=None):
        return self.__check_helper('read1', qc_pass)

    def num_read2_seqs(self, qc_pass=None):
        return self.__check_helper('read2', qc_pass)

    def num_singletons(self, qc_pass=None):
        return self.__check_helper('singletons', qc_pass)

    def num_mapped(self, qc_pass=None):
        return self.__check_helper('mapped', qc_pass)

    def num_mapped_with_mate(self, qc_pass=None):
        return self.__check_helper('with_itself_and_mate_mapped', qc_pass)

    def num_mates_mapped_to_diff_chr(self, qc_pass=None):
        return self.__check_helper('with_mate_mapped_to_a_different_chr',
                                   qc_pass)

    def num_mates_mapped_to_diff_chr_qual(self, qc_pass=None):
        return self.__check_helper(
            'with_mate_mapped_to_a_different_chr_(mapQ>=5)', qc_pass)

    # public functions follow to get common %
    def pct_mapped(self, qc_pass=None):
        # Relative to total reads, so the "nothing mapped" case needs its
        # own check; the mapped-relative percentages below get it from the
        # zero-denominator test in __pct_of.
        if not self.is_aligned(qc_pass):
            return 0
        return self.__pct_of(self.num_mapped, self.num_total_reads, qc_pass)

    def pct_properly_paired(self, qc_pass=None):
        return self.__pct_of(self.num_properly_paired, self.num_mapped,
                             qc_pass)

    def pct_aligned_with_pair(self, qc_pass=None):
        return self.__pct_of(self.num_mapped_with_mate, self.num_mapped,
                             qc_pass)

    def pct_singletons(self, qc_pass=None):
        return self.__pct_of(self.num_singletons, self.num_mapped, qc_pass)

    def pct_paired_in_sequencing(self, qc_pass=None):
        return self.__pct_of(self.num_paired_in_sequencing,
                             self.num_total_reads, qc_pass)

    def pct_duplicates(self, qc_pass=None):
        return self.__pct_of(self.num_duplicates, self.num_total_reads,
                             qc_pass)

    def pct_chimeras(self, qc_pass=None):
        return self.__pct_of(self.num_mates_mapped_to_diff_chr,
                             self.num_mapped, qc_pass)

    def pct_chimeras_qual(self, qc_pass=None):
        return self.__pct_of(self.num_mates_mapped_to_diff_chr_qual,
                             self.num_mapped, qc_pass)

    # public function to get the stats keys; may be useful for user to peruse.
    def get_keys(self):
        return self.stats.keys()

    # public function to see if bam is mapped or not (prevents division by 0)
    def is_aligned(self, qc_pass=None):
        return self.__check_helper('mapped', qc_pass) != 0

    def is_paired(self):
        """Return 1 if any read is paired in sequencing, else 0."""
        return 1 if float(self.pct_paired_in_sequencing()) > 0 else 0

    def get_bam_name(self):
        return self.bam

    def __parenthesize(self, value):
        return ' (' + str(value) + '%)'

    # private function to render one "<count> (<pct>%)" table cell
    def __count_with_pct(self, count_fn, pct_fn, qc_pass):
        return str(count_fn(qc_pass)) + self.__parenthesize(pct_fn(qc_pass))

    def get_aligned_stats_table(self, qc_pass=None):
        """Return ``[label, value]`` rows: the unaligned rows plus, when any
        read is mapped, the alignment-specific rows."""
        table = self.get_unaligned_stats_table(qc_pass)
        if not self.is_aligned(qc_pass):
            return table
        aligned_rows = (
            ("Mapped", self.num_mapped, self.pct_mapped),
            ("Singletons", self.num_singletons, self.pct_singletons),
            ("Mapped w/ Mate", self.num_mapped_with_mate,
             self.pct_aligned_with_pair),
            ("Properly Paired", self.num_properly_paired,
             self.pct_properly_paired),
            ("Cross-chromosome", self.num_mates_mapped_to_diff_chr,
             self.pct_chimeras),
            ("Cross-chromosome (MQ >= 5)",
             self.num_mates_mapped_to_diff_chr_qual, self.pct_chimeras_qual),
        )
        for label, count_fn, pct_fn in aligned_rows:
            table.append(
                [label, self.__count_with_pct(count_fn, pct_fn, qc_pass)])
        return table

    def get_unaligned_stats_table(self, qc_pass=None):
        """Return ``[label, value]`` rows that do not depend on alignment.

        The read totals are plain integers; the other cells are strings of
        the form ``"<count> (<pct>%)"``.
        """
        return [
            ["Total Reads", self.num_total_reads(qc_pass)],
            ["Paired Reads",
             self.__count_with_pct(self.num_paired_in_sequencing,
                                   self.pct_paired_in_sequencing, qc_pass)],
            ["Duplicates",
             self.__count_with_pct(self.num_duplicates, self.pct_duplicates,
                                   qc_pass)],
            ["Total Read 1", self.num_read1_seqs(qc_pass)],
            ["Total Read 2", self.num_read2_seqs(qc_pass)],
        ]
Exemple #9
0
 def align_with_muscle(self):
   """Align ``self.unaligned_seq`` with muscle.

   The output goes to ``<modeldir>seq.ali``; that path is recorded in
   ``self.fasta_aligned_file`` once the command has been run.
   """
   aligned_path = self.modeldir + "seq.ali"
   muscle_args = ["muscle -in ", self.unaligned_seq, " -out ", aligned_path]
   RunCommand(muscle_args).run()
   self.fasta_aligned_file = aligned_path
Exemple #10
0
 def convert_to_unmapped_bam(self, java_heap='32G'):
     """Run the conversion command built for *java_heap*, if there is one.

     Always returns 1, whether or not a command was built and regardless
     of the command's result.
     """
     unmapped_cmd = self.__build_command(java_heap)
     if not unmapped_cmd:
         # nothing to run
         return 1
     RunCommand(unmapped_cmd).run_command()
     return 1
Exemple #11
0
 def __make_bam_index(self,bam):
     """Run ``samtools index`` on *bam*; the result is not inspected."""
     index_cmd = [constant.SAMTOOLS, "index", bam]
     RunCommand(index_cmd).run_command()
Exemple #12
0
 def __get_header_info(self,bam):
     """Run the header command for *bam* and return the lengths parsed
     from its result by ``__get_lengths_from_headers``."""
     header_runner = RunCommand(self.__build_bam_header_command(bam))
     raw_header = header_runner.run_command(0)
     return self.__get_lengths_from_headers(raw_header)
Exemple #13
0
 def convert_to_unmapped_bam(self, java_heap='32G', stringency='LENIENT'):
     """Run the conversion command built from *java_heap* and *stringency*,
     if there is one.

     Always returns 1, whether or not a command was built and regardless
     of the command's result.
     """
     unmapped_cmd = self.__build_command(java_heap, stringency)
     if not unmapped_cmd:
         # nothing to run
         return 1
     RunCommand(unmapped_cmd).run_command()
     return 1
Exemple #14
0
 def convert_to_fastq(self, java_heap='32G'):
     """Run the FASTQ conversion command, if one was built.

     Returns ``(1, fastqs)`` where *fastqs* is the second value produced
     by ``__build_fastq_command``; the command result is not inspected.
     """
     fastq_cmd, fastqs = self.__build_fastq_command(java_heap)
     if fastq_cmd:
         RunCommand(fastq_cmd).run_command()
     return (1, fastqs)
Exemple #15
0
 def convert_to_fastq(self, java_heap='32G', stringency='LENIENT'):
     """Run the FASTQ conversion command, if one was built.

     Returns ``(1, fastqs)`` where *fastqs* is the second value produced
     by ``__build_fastq_command``; the command result is not inspected.
     """
     fastq_cmd, fastqs = self.__build_fastq_command(java_heap, stringency)
     if fastq_cmd:
         RunCommand(fastq_cmd).run_command()
     return (1, fastqs)
Exemple #16
0
class BamFlagstat():
    """This class represents data in samtools flagstat output"""

    def __init__(self, bam):
        self.bam = bam
        self.command = RunCommand(self.__build_command(bam))
        self.flagstat_output = self.command.run_command(0).stdout.readlines()
        self.stats = self.__populate_stats()
        #self.pysam_stats = self.__pysam_populate_stats()

    # private function to make flagstat command
    def __build_command(self, bam):
        return [constant.SAMTOOLS,"flagstat",bam]

    # private function to parse output
    def __populate_stats(self):
        data = {}
        qc_fail = 0
        for i in self.flagstat_output:           
            tmp = i.rstrip('\n').split(' ')
            if re.match('\+',tmp[1]):
                qc_fail = tmp[2]
            qc_pass = tmp[0]
            key = ""
            for j in range(3,int(len(tmp))):
                if re.match('\(',tmp[j]):
                    if not re.search('mapQ',tmp[j]):
                        break
                    key += tmp[j] + "_"
                else:
                    key += tmp[j] + "_"
            key = key[:-1]
            data[key] = ()
            data[key] = (int(qc_pass), int(qc_fail))
        return data
    # Example dictionary of what data should look like
    # {'properly_paired': (3726986, 101942), 'duplicates': (232520, 13824), 'read1': (2008943, 60581),
    #  'singletons': (28670, 3947), 'with_mate_mapped_to_a_different_chr_(mapQ>=5)': (0, 0), 'read2': (2008906, 60530),
    #  'in_total': (4017849, 121111), 'mapped': (3778676, 106985), 'paired_in_sequencing': (4017849, 121111),
    #  'with_itself_and_mate_mapped': (3750006, 103038), 'with_mate_mapped_to_a_different_chr': (0, 0)}

    # def __pysam_populate_stats(self):
    #     # need to have same keys that __populate_stats produces.
    #     pybam = pysam.AlignmentFile(self.bam, 'rb')
    #     data = {'properly_paired': (), 'duplicates': (), 'read1': (), 'singletons': (),
    #             'with_mate_mapped_to_a_different_chr_(mapQ>=5)': (), 'read2': (), 'in_total': (),
    #             'mapped': (), 'paired_in_sequencing': (), 'with_itself_and_mate_mapped': (),
    #             'with_mate_mapped_to_a_different_chr': ()}
    #     #print("count>" + str(pybam.count(until_eof=True)))
    #     reads = pybam.fetch(until_eof=True)
    #     mapped_list = []
    #     unmapped_list = []
    #     for r in reads:
    #         r_list = str(r).split('\t')
    #         if '*' in r_list[6]:
    #             unmapped_list.append(r_list[0])
    #         else:
    #             mapped_list.append(r_list[0])
    #     print(len(set(mapped_list)) + len(set(unmapped_list)))
    #     return data



    # private function to get the value from a particular stat
    def __get_value_from_key(self,key,key_tuple):
        sum = 0
        for i in key_tuple:
            sum += self.stats[key][i]
        return sum

    # private function to tell whether or not to look at
    # qc_pass, qc_fail or both (default)
    def __get_data_to_check(self,qc_pass=None):
        if qc_pass == None:
            return (0,1)
        if qc_pass == 1:
            return (0,)
        return (1,)

    # private function to help get the right stat
    def __check_helper(self,key, qc_pass=None):
        return self.__get_value_from_key(key,self.__get_data_to_check(qc_pass))

    # public functions follow to get number value from each key
    def num_total_reads(self,qc_pass=None):
        key = 'in_total'
        return self.__check_helper(key,qc_pass)

    def num_paired_in_sequencing(self,qc_pass=None):
        key = 'paired_in_sequencing'
        return self.__check_helper(key,qc_pass)

    def num_properly_paired(self,qc_pass=None):
        key = 'properly_paired'
        return self.__check_helper(key,qc_pass)

    def num_duplicates(self,qc_pass=None):
        key = 'duplicates'
        return self.__check_helper(key,qc_pass)        

    def num_read1_seqs(self,qc_pass=None):
        key = 'read1'
        return self.__check_helper(key,qc_pass)

    def num_read2_seqs(self,qc_pass=None):
        key = 'read2'
        return self.__check_helper(key,qc_pass)

    def num_singletons(self,qc_pass=None):
        key = 'singletons'
        return self.__check_helper(key,qc_pass)    

    def num_mapped(self,qc_pass=None):
        key = 'mapped'
        return self.__check_helper(key,qc_pass)

    def num_mapped_with_mate(self,qc_pass=None):
        key = 'with_itself_and_mate_mapped'
        return self.__check_helper(key,qc_pass)

    def num_mates_mapped_to_diff_chr(self,qc_pass=None):
        key = 'with_mate_mapped_to_a_different_chr'
        return self.__check_helper(key,qc_pass)

    def num_mates_mapped_to_diff_chr_qual(self,qc_pass=None):
        key = 'with_mate_mapped_to_a_different_chr_(mapQ>=5)'
        return self.__check_helper(key,qc_pass)

    # public functions follow to get common %
    def pct_mapped(self,qc_pass=None):
        if self.is_aligned(qc_pass):
            if self.num_total_reads(qc_pass) != 0:
                return "%.2f" % ((float(self.num_mapped(qc_pass))/self.num_total_reads(qc_pass)) * 100)
        return 0
    
    def pct_properly_paired(self,qc_pass=None):
        if self.is_aligned(qc_pass):
            if self.num_mapped(qc_pass) != 0:
                return "%.2f" % ((float(self.num_properly_paired(qc_pass))/self.num_mapped(qc_pass)) * 100)
        return 0

    def pct_aligned_with_pair(self,qc_pass=None):
        if self.is_aligned(qc_pass):
            if self.num_mapped(qc_pass) != 0:
                return "%.2f" % ((float(self.num_mapped_with_mate(qc_pass))/self.num_mapped(qc_pass)) * 100)
        return 0

    def pct_singletons(self,qc_pass=None):
        if self.is_aligned(qc_pass):
            if self.num_mapped(qc_pass) != 0:
                return "%.2f" % ((float(self.num_singletons(qc_pass))/self.num_mapped(qc_pass)) * 100)
        return 0

    def pct_paired_in_sequencing(self,qc_pass=None):
        if self.num_total_reads(qc_pass) != 0:
            return "%.2f" % ((float(self.num_paired_in_sequencing(qc_pass))/self.num_total_reads(qc_pass)) * 100)
        return 0

    def pct_duplicates(self,qc_pass=None):
        if self.num_total_reads(qc_pass) != 0:
            return "%.2f" % ((float(self.num_duplicates(qc_pass))/self.num_total_reads(qc_pass)) * 100)
        return 0
    
    def pct_chimeras(self,qc_pass=None):
        if self.is_aligned(qc_pass):
            if self.num_mapped(qc_pass):
                return "%.2f" % ((float(self.num_mates_mapped_to_diff_chr(qc_pass))/self.num_mapped(qc_pass)) * 100)
        return 0

    def pct_chimeras_qual(self,qc_pass=None):
        if self.is_aligned(qc_pass):
            if self.num_mapped(qc_pass) != 0:
                return "%.2f" % ((float(self.num_mates_mapped_to_diff_chr_qual(qc_pass))/self.num_mapped(qc_pass)) * 100)
        return 0

    # public function to get the stats keys; may be useful for user to peruse.
    def get_keys(self):
        return self.stats.keys()

    # public function to see if bam is mapped or not (prevents division by 0)
    def is_aligned(self,qc_pass=None):
        key = 'mapped'
        return self.__check_helper(key,qc_pass) != 0

    def is_paired(self):
        if float(self.pct_paired_in_sequencing()) > 0:
            return 1
        else:
            return 0

    def get_bam_name(self):
        return self.bam

    def __parenthesize(self, value):
        return ' (' + str(value) + '%)'

    def get_aligned_stats_table(self, qc_pass=None):
        if self.is_aligned(qc_pass):
            aligned_data = self.get_unaligned_stats_table(qc_pass)
            aligned_data.append(["Mapped", str(self.num_mapped(qc_pass)) +
                                                self.__parenthesize(self.pct_mapped(qc_pass))])
            aligned_data.append(["Singletons", str(self.num_singletons(qc_pass)) +
                                                    self.__parenthesize(self.pct_singletons(qc_pass))])
            aligned_data.append(["Mapped w/ Mate", str(self.num_mapped_with_mate(qc_pass)) +
                                                        self.__parenthesize(self.pct_aligned_with_pair(qc_pass))])
            aligned_data.append(["Properly Paired", str(self.num_properly_paired(qc_pass)) +
                                                         self.__parenthesize(self.pct_properly_paired(qc_pass))])
            aligned_data.append(["Cross-chromosome", str(self.num_mates_mapped_to_diff_chr(qc_pass)) +
                                                          self.__parenthesize(self.pct_chimeras(qc_pass))])
            aligned_data.append(["Cross-chromosome (MQ >= 5)", str(self.num_mates_mapped_to_diff_chr_qual(qc_pass)) +
                                                                    self.__parenthesize(self.pct_chimeras_qual(qc_pass))])
            return aligned_data
        return self.get_unaligned_stats_table(qc_pass)

    def get_unaligned_stats_table(self,qc_pass=None):
        unaligned_data = []
        unaligned_data.append(["Total Reads", self.num_total_reads(qc_pass)])
        unaligned_data.append(["Paired Reads", str(self.num_paired_in_sequencing(qc_pass)) +
                                                   self.__parenthesize(self.pct_paired_in_sequencing(qc_pass))])
        unaligned_data.append(["Duplicates",str(self.num_duplicates(qc_pass)) +
                                                self.__parenthesize(self.pct_duplicates(qc_pass))])
        unaligned_data.append(["Total Read 1",self.num_read1_seqs(qc_pass)])
        unaligned_data.append(["Total Read 2",self.num_read2_seqs(qc_pass)])
        return unaligned_data