Example #1
def parseCas(infile, order_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(
        r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    if (runCmd(command=command)):
        for line in runCmd(command=command):
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(
                    1)) - 1  # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
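
All of these parsers stream shell output through a runCmd helper that is not shown on this page. A minimal stand-in, assuming it simply launches the command and either waits for it to finish or returns an iterator over decoded stdout lines (the library's actual runCmd may differ), could look like this:

import shlex
import subprocess

def runCmd(command, wait=False):
    # hypothetical stand-in for the runCmd helper used above: run a shell command and
    # either block until completion or return an iterator over its stdout lines
    process = subprocess.Popen(shlex.split(command), stdout=subprocess.PIPE, universal_newlines=True)
    if wait:
        process.wait()
        return None
    return (line.rstrip("\n") for line in process.stdout)
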
Example #2
def parseCas(infile, order_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs)/100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    if (runCmd(command=command)):
        for line in runCmd(command=command):
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
Example #3
def writeNodesDB(nodesDB, nodesDB_f):
    print BtLog.status_d['5'] % nodesDB_f
    nodes_count = nodesDB['nodes_count']
    i = 0
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count)
        for node in nodesDB:
            if not node == "nodes_count":
                i += 1
                BtLog.progress(i, 1000, nodes_count)
                fh.write("%s\t%s\t%s\t%s\n" % (node, nodesDB[node]['rank'], nodesDB[node]['name'], nodesDB[node]['parent']))
Example #4
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict maps each contig to a list of per-read coverages, since list appending should be faster
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    #base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(
        r"(\d+)M|X|=")  # captures the digit runs preceding M/X/= CIGAR operators
    # run samtools to keep only mapped reads (no optical duplicates, no secondary alignments)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    #import time
    #start = time.time()
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                base_cov_dict[match[2]].append(
                    sum([
                        int(matching)
                        for matching in cigar_match_re.findall(match[5])
                    ]))
                #base_cov_dict[match[2]] += sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
    base_cov_dict = {
        seq_name: sum(base_covs)
        for seq_name, base_covs in base_cov_dict.items()
    }
    #end = time.time()
    #print (end-start)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
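
The base coverage added per read is the sum of the digit runs that cigar_match_re captures in front of M operators, e.g.:

    >>> import re
    >>> cigar_match_re = re.compile(r"(\d+)M|X|=")
    >>> cigar_match_re.findall("60M2I38M")
    ['60', '38']
    >>> sum(int(m) for m in cigar_match_re.findall("60M2I38M"))
    98

Note that bare X and = operators match without filling the capture group, so CIGAR strings written with =/X instead of M fall through to the except branch above.
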
Example #5
def parseCov(infile, set_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}

    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(
                            match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
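
Judging from the header checks and regexes above, the two accepted cov layouts look roughly like this (values are illustrative). New format, with read counts and comment headers:

    ## Total Reads = 1000000
    ## Mapped Reads = 900000
    ## Unmapped Reads = 100000
    contig_1	1523	24.75
    contig_2	87	3.10

Old format, name and base coverage only:

    contig_1	24.75
    contig_2	3.10
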
Example #6
def parseCov(infile, set_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}

    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
Example #7
 def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
     print BtLog.status_d['6'] % ",".join(taxrules)
     tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
     self.lineages = BtTax.getLineages(tree_lists, nodesDB)
     self.taxrules = taxrules
     i = 0
     for blObj in self.dict_of_blobs.values():
         i += 1
         BtLog.progress(i, 100, self.seqs)
         for taxrule in taxrules:
             if (blObj.hits):
                 blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages, min_bitscore_diff, tax_collision_random)
             else:
                 blObj.taxonomy[taxrule] = BtTax.noHit()
     self.set_of_taxIds = set()
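
Every snippet on this page reports progress via BtLog.progress(counter, interval, total). BtLog itself is not shown; a minimal stand-in, assuming it just prints a percentage every `interval` items (the real module does more, and no_limit mirrors the keyword used by the FASTA filter example further down), might be:

import sys

def progress(counter, interval, total, no_limit=False):
    # hypothetical stand-in for BtLog.progress: print a percentage every `interval` items
    if not interval or not total:
        return
    if counter % interval == 0 or counter == total:
        sys.stdout.write("\r\t%.1f%% (%d/%d)" % (counter * 100.0 / total, counter, total))
        sys.stdout.flush()
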
Example #8
def readNodesDB(nodesDB_f):
    nodesDB = {}
    nodesDB_count = 0
    nodes_count = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                nodesDB_count = int(line.lstrip("# nodes_count = ").rstrip("\n"))
            else:
                nodes_count += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank' : rank, 'name' : name, 'parent' : parent}
                if (nodesDB_count):
                    BtLog.progress(nodes_count, 1000, nodesDB_count)
    nodesDB['nodes_count'] = nodes_count
    return nodesDB
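
Together with writeNodesDB above, the two helpers round-trip the same structure; a small usage sketch (assuming BtLog is importable for the status and progress calls):

nodesDB = {
    '1': {'rank': 'no rank', 'name': 'root', 'parent': '1'},
    '9606': {'rank': 'species', 'name': 'Homo sapiens', 'parent': '9605'},
    'nodes_count': 2,
}
writeNodesDB(nodesDB, 'nodesDB.txt')
restored = readNodesDB('nodesDB.txt')
assert restored['9606']['name'] == 'Homo sapiens'
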
Example #9
 def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
     print BtLog.status_d["6"] % ",".join(taxrules)
     tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
     self.lineages = BtTax.getLineages(tree_lists, nodesDB)
     self.taxrules = taxrules
     i = 0
     for blObj in self.dict_of_blobs.values():
         i += 1
         BtLog.progress(i, 100, self.seqs)
         for taxrule in taxrules:
             if blObj.hits:
                 blObj.taxonomy[taxrule] = BtTax.taxRule(
                     taxrule, blObj.hits, self.lineages, min_bitscore_diff, tax_collision_random
                 )
             else:
                 blObj.taxonomy[taxrule] = BtTax.noHit()
     self.set_of_taxIds = set()
Example #10
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict maps each contig to a list of per-read coverages, since list appending should be faster
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    base_cov_dict = {blob : [] for blob in set_of_blobs}
    #base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)M|X|=") # captures the digit runs preceding M/X/= CIGAR operators
    # run samtools to keep only mapped reads (no optical duplicates, no secondary alignments)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    #import time
    #start = time.time()
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                #base_cov_dict[match[2]] += sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    #end = time.time()
    #print (end-start)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Example #11
def main():
    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print BtLog.status_d['1'] % ("list", list_f)
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print BtLog.status_d['22'] % fasta_f
    items_parsed = []
    sequences = 0
    for header, sequence in BtIO.readFasta(fasta_f):
        sequences += 1
        if header in items:
            if not (invert):
                items_parsed.append(header)
                output.append(">%s\n%s\n" % (header, sequence))
        else:
            if (invert):
                items_parsed.append(header)
                output.append(">%s\n%s\n" % (header, sequence))
        BtLog.progress(len(output), 10, items_count, no_limit=True)
    BtLog.progress(items_count, 10, items_count)

    items_parsed_count = len(items_parsed)
    print BtLog.status_d['23'] % ('{:.2%}'.format(items_parsed_count/sequences), "{:,}".format(items_count), "{:,}".format(items_parsed_count), "{:,}".format(sequences))

    items_parsed_count_unique = len(set(items_parsed))
    if not items_parsed_count == items_parsed_count_unique:
        print BtLog.warn_d['8'] % "\n\t\t\t".join(list(set([x for x in items_parsed if items_parsed.count(x) > 1])))

    with open(out_f, "w") as fh:
        print BtLog.status_d['24'] % out_f
        fh.write("".join(output))
Example #12
 def view(self, **kwargs):
     # arguments
     viewObjs = kwargs['viewObjs']
     ranks = kwargs['ranks']
     taxrule = kwargs['taxrule']
     hits_flag = kwargs['hits_flag']
     seqs = kwargs['seqs']
     cov_libs = kwargs['cov_libs']
     progress_bar = kwargs['progressbar']
     # Default sequences if no subset
     if not (seqs):
         seqs = self.order_of_blobs
     # Default cov_libs if no subset
     cov_lib_names = cov_libs
     if not (cov_libs):
         cov_lib_names = [covLib for covLib in self.covLibs]
     tax_lib_names = [taxLib for taxLib in sorted(self.hitLibs)]
     lineages = self.lineages
     # setup
     for viewObj in viewObjs:
         if viewObj.name == 'table':
             viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag, cov_lib_names)
         if viewObj.name == 'concoct_cov':
             viewObj.header = self.getConcoctCovHeader(cov_lib_names)
         if viewObj.name == 'covlib':
             viewObj.header = self.getCovHeader(cov_lib_names)
         if viewObj.name == 'experimental':
             viewObj.covs = {cov_lib : [] for cov_lib in cov_lib_names}
             viewObj.covs["covsum"] = []
             for taxrule in self.taxrules:
                 viewObj.tax[taxrule] = {rank : [] for rank in BtTax.RANKS}
     # bodies
     for i, seq in enumerate(seqs):
         if (progress_bar):
             BtLog.progress(i, 1000, len(seqs))
         blob = self.dict_of_blobs[seq]
         for viewObj in viewObjs:
             if viewObj.name == 'table':
                 viewObj.body.append(self.getTableLine(blob, taxrule, ranks, hits_flag, cov_lib_names, tax_lib_names, lineages))
             if viewObj.name == 'concoct_cov':
                 viewObj.body.append(self.getConcoctCovLine(blob, cov_lib_names))
             if viewObj.name == 'experimental':
                 viewObj.names.append(blob['name'])
                 viewObj.gc.append(blob['gc'])
                 viewObj.length.append(blob['length'])
                 cov_sum = 0.0
                 for cov_lib in blob['covs']:
                     viewObj.covs[cov_lib].append(blob['covs'][cov_lib])
                     cov_sum += blob['covs'][cov_lib]
                 viewObj.covs['covsum'].append(cov_sum)
                 for taxrule in blob['taxonomy']:
                     for rank in blob['taxonomy'][taxrule]:
                         viewObj.tax[taxrule][rank].append(blob['taxonomy'][taxrule][rank]['tax'])
             if viewObj.name == 'concoct_tax':
                 for rank in ranks:
                     if not rank in viewObj.body:
                         viewObj.body[rank] = []
                     viewObj.body[rank].append(self.getConcoctTaxLine(blob, rank, taxrule))
             if viewObj.name == 'covlib':
                 viewObj.body.append(self.getCovLine(blob, cov_lib_names))
     if (progress_bar):
         BtLog.progress(len(seqs), 1000, len(seqs))
     for viewObj in viewObjs:
         viewObj.output()
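
A call into view() then just supplies the keyword arguments read from kwargs at the top of the method; the objects and values below (blobDb, the view objects, the taxrule and rank names) are illustrative only:

blobDb.view(
    viewObjs=[table_view, covlib_view],
    ranks=['phylum'],
    taxrule='bestsum',
    hits_flag=False,
    seqs=[],
    cov_libs=[],
    progressbar=True,
)
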
Example #13
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort,
                      keep_sorted, sort_threads):
    '''
    checkBam returns reads_total and reads_mapped
    parse BAM to extract read pairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (
            sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(
        outfile, include, exclude)
    read_pair_out_fhs = []
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(
                sorted([
                    sequence_to_type_dict[read1[2]],
                    sequence_to_type_dict[read2[2]]
                ]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs,
                                                read_pair_seqs)
                read_pair_seqs = {
                    read_pair_type: tuple()
                    for read_pair_type in read_pair_count
                }
        except StopIteration:
            print BtLog.warn_d['11']
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)),
                        '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count),
                            '{0:.1%}'.format(count / int(seen_reads / 2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print BtLog.status_d['25'] % out_f
            runCmd(command="gzip -f " + out_f, wait=True)

    if not int(reads_total) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_total, seen_reads)
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
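
For reference, the samtools view filters used in these BAM parsers combine standard SAM flag bits:

    -f 1       keep only paired reads
    -F 4       drop unmapped reads
    -F 256     drop secondary alignments
    -F 1024    drop PCR/optical duplicates
    -F 2048    drop supplementary alignments
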
Example #14
 def view(self, **kwargs):
     # arguments
     viewObjs = kwargs["viewObjs"]
     ranks = kwargs["ranks"]
     taxrule = kwargs["taxrule"]
     hits_flag = kwargs["hits_flag"]
     seqs = kwargs["seqs"]
     cov_libs = kwargs["cov_libs"]
     progress_bar = kwargs["progressbar"]
     # Default sequences if no subset
     if not (seqs):
         seqs = self.order_of_blobs
     # Default cov_libs if no subset
     cov_lib_names = cov_libs
     if not (cov_libs):
         cov_lib_names = [covLib for covLib in self.covLibs]
     tax_lib_names = [taxLib for taxLib in sorted(self.hitLibs)]
     lineages = self.lineages
     # setup
     for viewObj in viewObjs:
         if viewObj.name == "table":
             viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag, cov_lib_names)
         if viewObj.name == "concoct_cov":
             viewObj.header = self.getConcoctCovHeader(cov_lib_names)
         if viewObj.name == "covlib":
             viewObj.header = self.getCovHeader(cov_lib_names)
         if viewObj.name == "experimental":
             viewObj.covs = {cov_lib: [] for cov_lib in cov_lib_names}
             viewObj.covs["covsum"] = []
             for taxrule in self.taxrules:
                 viewObj.tax[taxrule] = {rank: [] for rank in BtTax.RANKS}
     # bodies
     for i, seq in enumerate(seqs):
         if progress_bar:
             BtLog.progress(i, 1000, len(seqs))
         blob = self.dict_of_blobs[seq]
         for viewObj in viewObjs:
             if viewObj.name == "table":
                 viewObj.body.append(
                     self.getTableLine(blob, taxrule, ranks, hits_flag, cov_lib_names, tax_lib_names, lineages)
                 )
             if viewObj.name == "concoct_cov":
                 viewObj.body.append(self.getConcoctCovLine(blob, cov_lib_names))
             if viewObj.name == "experimental":
                 viewObj.names.append(blob["name"])
                 viewObj.gc.append(blob["gc"])
                 viewObj.length.append(blob["length"])
                 cov_sum = 0.0
                 for cov_lib in blob["covs"]:
                     viewObj.covs[cov_lib].append(blob["covs"][cov_lib])
                     cov_sum += blob["covs"][cov_lib]
                 viewObj.covs["covsum"].append(cov_sum)
                 for taxrule in blob["taxonomy"]:
                     for rank in blob["taxonomy"][taxrule]:
                         viewObj.tax[taxrule][rank].append(blob["taxonomy"][taxrule][rank]["tax"])
             if viewObj.name == "concoct_tax":
                 for rank in ranks:
                     if not rank in viewObj.body:
                         viewObj.body[rank] = []
                     viewObj.body[rank].append(self.getConcoctTaxLine(blob, rank, taxrule))
             if viewObj.name == "covlib":
                 viewObj.body.append(self.getCovLine(blob, cov_lib_names))
     if progress_bar:
         BtLog.progress(len(seqs), 1000, len(seqs))
     for viewObj in viewObjs:
         viewObj.output()
Example #15
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    '''
    checkBam returns reads_total and reads_mapped
    parse BAM to extract read pairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(outfile, include, exclude)
    read_pair_out_fhs = []
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]]]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {read_pair_type : tuple() for read_pair_type in read_pair_count}
        except StopIteration:
            print BtLog.warn_d['11']
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads/2)), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count/int(seen_reads/2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print BtLog.status_d['25'] % out_f
            runCmd(command="gzip -f " + out_f, wait=True)

    if not int(reads_total) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_total, seen_reads)
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1