Beispiel #1
0
def parseJson(infile):
    '''Read a JSON file and return its byteified content.

    The JSON module is chosen by availability: ujson first, then
    simplejson, then the standard-library json module. The order follows
    http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/

    infile -- path of the JSON file; BtLog.error('0', ...) is called when
              it is not an existing file.

    Returns byteify() applied to the parsed object. A ValueError raised by
    the parser is reported through BtLog.error('37', ...).
    '''
    import time
    if not isfile(infile):
        BtLog.error('0', infile)
    started_at = time.time()
    with open(infile, 'r') as handle:
        print(BtLog.status_d['15'])
        raw_text = handle.read()
    try:
        import ujson as json  # fastest
    except ImportError:
        try:
            import simplejson as json  # fast
            parser_name = 'simplejson'
        except ImportError:
            import json  # default
            parser_name = 'json'
        print(BtLog.status_d['17'] % parser_name)
    else:
        print(BtLog.status_d['16'] % 'ujson')
    try:
        parsed = json.loads(raw_text.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    result = byteify(parsed)
    print(BtLog.status_d['20'] % (time.time() - started_at))
    return result
Beispiel #2
0
    def parseFasta(self, fasta_f, fasta_type):
        '''Register every sequence of a FASTA file as a blob.

        fasta_f    -- path of the assembly FASTA file
        fasta_type -- assembly type used to read coverage from the sequence
                      headers; when falsy, no header coverage is parsed

        Updates self.assembly_f, self.seqs, self.length, self.n_count,
        self.order_of_blobs and self.dict_of_blobs, plus self.covLibs when
        fasta_type is set. A sequence name that is already registered is
        reported via BtLog.error('5', ...); an assembly that ends up with
        no sequences or zero length is reported via BtLog.error('1').
        '''
        print(BtLog.status_d['1'] % ('FASTA', fasta_f))
        self.assembly_f = abspath(fasta_f)
        if fasta_type:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for seq_name, sequence in BtIO.readFasta(fasta_f):
            blob = BlObj(seq_name, sequence)
            if blob.name in self.dict_of_blobs:
                # this name was registered by an earlier record
                BtLog.error('5', blob.name)
                continue

            self.seqs += 1
            self.length += blob.length
            self.n_count += blob.n_count

            if fasta_type:
                header_cov = BtIO.parseCovFromHeader(fasta_type, blob.name)
                self.covLibs[fasta_type].cov_sum += header_cov
                blob.addCov(fasta_type, header_cov)

            self.order_of_blobs.append(blob.name)
            self.dict_of_blobs[blob.name] = blob

        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
Beispiel #3
0
def set_format_scatterplot(axScatter, **kwargs):
    '''Set limits, scales, tick locators and grid of a scatter plot axis.

    axScatter -- axis object to format (presumably a matplotlib Axes)
    kwargs    -- 'plot' selects the layout ('blobplot' or 'covplot');
                 'min_cov' and 'max_cov' define the coverage limits

    An unknown plot type is reported through BtLog.error('34', ...).
    Returns axScatter.
    '''
    plot_type = kwargs['plot']
    min_x, max_x = None, None
    min_y, max_y = None, None
    if plot_type == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif plot_type == 'covplot':
        # both axes carry coverage, so they share limits and log scale
        min_x, max_x = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        # The plot type is passed as a separate argument, as in every other
        # BtLog.error call; "'34' % plot_type" raised TypeError because
        # '34' contains no format specifier.
        BtLog.error('34', plot_type)
    axScatter.set_xlim( (min_x, max_x) )
    axScatter.set_ylim( (min_y, max_y) ) # This sets the max-Coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    # hide the label of the first major tick on the y axis
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
Beispiel #4
0
def parseCas(infile, order_of_blobs):
    '''Parse per-contig coverage from a CAS file via "clc_mapping_info -n".

    infile         -- path of the CAS file; BtLog.error('0', ...) is called
                      when it is not an existing file
    order_of_blobs -- sequence names, indexed by (contig number - 1)

    Returns (cov_dict, reads_total, reads_mapped, read_cov_dict), where the
    two dicts map sequence name to base coverage and to read count, and
    the two totals come from checkCas(infile).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    # NOTE(review): the "." inside the two decimal groups is unescaped and
    # therefore matches any character - confirm whether "\." was intended.
    cas_line_re = re.compile(
        r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    # NOTE(review): infile is concatenated into the command string; how
    # runCmd executes it is not visible here.
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # Call runCmd once and reuse the result instead of invoking it a second
    # time for the loop.
    output = runCmd(command=command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if not cas_line_match:
                continue
            # -1 because index of contig list starts with zero
            idx = int(cas_line_match.group(1)) - 1
            try:
                name = order_of_blobs[idx]
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
            except (IndexError, ValueError):
                # best effort: skip contigs that are unknown or whose
                # numbers cannot be converted
                pass
            else:
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
Beispiel #5
0
 def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
     '''Return the coverage libraries named by a selection string.

     cov_lib_dict      -- dict keyed by coverage library name; each value
                          provides an 'f' entry used in the error listing
     cov_lib_selection -- 'covsum', a single library name, a
                          comma-separated list of names, or a falsy value
                          to select every key of cov_lib_dict

     A name that is not a key of cov_lib_dict triggers
     BtLog.error('33', ...) with a listing of the available libraries.
     'covsum' on its own is accepted without that check.
     '''
     selection_invalid = False
     if not cov_lib_selection:
         chosen = cov_lib_dict.keys()
     elif cov_lib_selection == 'covsum':
         chosen = ['covsum']
     elif "," in cov_lib_selection:
         chosen = cov_lib_selection.split(",")
         selection_invalid = not set(chosen).issubset(set(cov_lib_dict.keys()))
     else:
         chosen = [cov_lib_selection]
         selection_invalid = cov_lib_selection not in cov_lib_dict
     if selection_invalid:
         listing = []
         for lib_name in cov_lib_dict:
             # an empty 'f' entry is shown as the coverage sum
             lib_source = cov_lib_dict[lib_name]['f'] or "sum of coverages from all covlibs"
             listing.append("\t\t%s : %s" % (lib_name, lib_source))
         BtLog.error('33', "\n".join(listing))
     return chosen
Beispiel #6
0
def main():
    '''Filter a BAM file by an include or an exclude list of sequences.

    Options are read with docopt from the module docstring. Passing both
    --include and --exclude is reported through BtLog.error('43') and no
    filtering is done; otherwise BtIO.parseBamForFilter is called with the
    parsed include list, the parsed exclude list, or neither.
    '''
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])

    print(BtLog.status_d['22'] % bam_f)
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        # BtLog.error is called as a plain statement everywhere else; its
        # return value was needlessly printed here.
        BtLog.error('43')
    else:
        # at most one of the two lists is given at this point
        include_list = BtIO.parseList(include_f) if include_f else None
        exclude_list = BtIO.parseList(exclude_f) if exclude_f else None
        BtIO.parseBamForFilter(bam_f, out_f, include_list, exclude_list, gzip,
                               do_sort, keep_sorted, sort_threads)
Beispiel #7
0
def set_format_scatterplot(axScatter, **kwargs):
    '''Set limits, scales, tick locators and grid of a scatter plot axis.

    axScatter -- axis object to format (presumably a matplotlib Axes)
    kwargs    -- 'plot' selects the layout ('blobplot' or 'covplot');
                 'min_cov' and 'max_cov' define the coverage limits

    An unknown plot type is reported through BtLog.error('34', ...).
    Returns axScatter.
    '''
    plot_type = kwargs['plot']
    min_x, max_x = None, None
    min_y, max_y = None, None
    if plot_type == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif plot_type == 'covplot':
        # both axes carry coverage, so they share limits and log scale
        min_x, max_x = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        # The plot type is passed as a separate argument, as in every other
        # BtLog.error call; "'34' % plot_type" raised TypeError because
        # '34' contains no format specifier.
        BtLog.error('34', plot_type)
    axScatter.set_xlim( (min_x, max_x) )
    axScatter.set_ylim( (min_y, max_y) ) # This sets the max-Coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    # hide the label of the first major tick on the y axis
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
Beispiel #8
0
 def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
     '''Return the coverage libraries named by a selection string.

     cov_lib_dict      -- dict keyed by coverage library name; each value
                          provides an 'f' entry used in the error listing
     cov_lib_selection -- 'covsum', a single library name, a
                          comma-separated list of names, or a falsy value
                          to select every key of cov_lib_dict

     A name that is not a key of cov_lib_dict triggers
     BtLog.error('33', ...) with a listing of the available libraries.
     'covsum' on its own is accepted without that check.
     '''
     selection_invalid = False
     if not cov_lib_selection:
         chosen = cov_lib_dict.keys()
     elif cov_lib_selection == 'covsum':
         chosen = ['covsum']
     elif "," in cov_lib_selection:
         chosen = cov_lib_selection.split(",")
         selection_invalid = not set(chosen).issubset(set(cov_lib_dict.keys()))
     else:
         chosen = [cov_lib_selection]
         selection_invalid = cov_lib_selection not in cov_lib_dict
     if selection_invalid:
         listing = []
         for lib_name in cov_lib_dict:
             # an empty 'f' entry is shown as the coverage sum
             lib_source = cov_lib_dict[lib_name]['f'] or "sum of coverages from all covlibs"
             listing.append("\t\t%s : %s" % (lib_name, lib_source))
         BtLog.error('33', "\n".join(listing))
     return chosen
Beispiel #9
0
def main():
    '''Build a BlobDB from the command-line options and write it as JSON.

    Options are read with docopt from the module docstring. The steps are:
    parse the FASTA file, load the nodes database, parse similarity hits
    and compute the taxonomy (only if hit files were given), parse the
    coverage libraries, then dump the BlobDB to the output file.
    BtLog.error('1') is called when neither an assembly type nor any
    coverage file is supplied.
    '''
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    min_bitscore_diff = float(args['--min_diff'])
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile; its name doubles as the title when none was given
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not title:
        title = out_f

    # coverage
    if not (fasta_type or bam_fs or sam_fs or cov_fs or cas_fs):
        BtLog.error('1')
    cov_libs = []
    for lib_type, lib_fs in (('bam', bam_fs), ('sam', sam_fs), ('cas', cas_fs), ('cov', cov_fs)):
        for idx, lib_f in enumerate(lib_fs):
            cov_libs.append(BtCore.CovLibObj(lib_type + str(idx), lib_type, lib_f))

    # taxonomy
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', hit_f) for idx, hit_f in enumerate(hit_fs)]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f, nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if not hit_libs:
        print(BtLog.warn_d['0'])
    else:
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff, tax_collision_random)

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generating BlobDB and writing to file
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
Beispiel #10
0
    def parseFasta(self, fasta_f, fasta_type):
        '''Register every sequence of a FASTA file as a blob.

        fasta_f    -- path of the assembly FASTA file
        fasta_type -- assembly type used to read coverage from the sequence
                      headers; when falsy, no header coverage is parsed

        Updates self.assembly_f, self.seqs, self.length, self.n_count,
        self.order_of_blobs and self.dict_of_blobs, plus self.covLibs when
        fasta_type is set. A sequence name that is already registered is
        reported via BtLog.error("5", ...); an assembly that ends up with
        no sequences or zero length is reported via BtLog.error("1").
        '''
        print(BtLog.status_d["1"] % ("FASTA", fasta_f))
        self.assembly_f = abspath(fasta_f)
        if fasta_type:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for seq_name, sequence in BtIO.readFasta(fasta_f):
            blob = BlObj(seq_name, sequence)
            if blob.name in self.dict_of_blobs:
                # this name was registered by an earlier record
                BtLog.error("5", blob.name)
                continue

            self.seqs += 1
            self.length += blob.length
            self.n_count += blob.n_count

            if fasta_type:
                header_cov = BtIO.parseCovFromHeader(fasta_type, blob.name)
                self.covLibs[fasta_type].cov_sum += header_cov
                blob.addCov(fasta_type, header_cov)

            self.order_of_blobs.append(blob.name)
            self.dict_of_blobs[blob.name] = blob

        if self.seqs == 0 or self.length == 0:
            BtLog.error("1")
Beispiel #11
0
def parseJson(infile):
    '''Read a JSON file and return its byteified content.

    The JSON module is chosen by availability: ujson first, then
    simplejson, then the standard-library json module. The order follows
    http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/

    infile -- path of the JSON file; BtLog.error('0', ...) is called when
              it is not an existing file.

    Returns byteify() applied to the parsed object. A ValueError raised by
    the parser is reported through BtLog.error('37', ...).
    '''
    import time
    if not isfile(infile):
        BtLog.error('0', infile)
    started_at = time.time()
    with open(infile, 'r') as handle:
        print(BtLog.status_d['15'])
        raw_text = handle.read()
    try:
        import ujson as json  # fastest
    except ImportError:
        try:
            import simplejson as json  # fast
            parser_name = 'simplejson'
        except ImportError:
            import json  # default
            parser_name = 'json'
        print(BtLog.status_d['17'] % parser_name)
    else:
        print(BtLog.status_d['16'] % 'ujson')
    try:
        parsed = json.loads(raw_text.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    result = byteify(parsed)
    print(BtLog.status_d['20'] % (time.time() - started_at))
    return result
Beispiel #12
0
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type

    fasta_type -- None, 'spades', 'velvet' or 'platanus'; any other value
                  is reported through BtLog.error('2', ...)
    header     -- FASTA header to parse

    Returns the coverage as float, or None when no branch applies
    (fasta_type is None). A header that does not match the expected layout
    raises IndexError or ValueError.
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if fasta_type not in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        # one scan of the header; the regex used to be run twice and the
        # first result was never used
        return float(re.findall(r"_cov_(\d+\.*\d*)", header)[0])
    if fasta_type == 'velvet':
        # coverage is the last "_"-separated field
        return float(header.split("_")[-1])
    if fasta_type == 'platanus':
        fields = header.rstrip("\n").split("_")
        if len(fields) >= 3:
            return float(fields[2].replace("cov", "")) # scaffold/scaffoldBubble/contig
        return float(fields[1].replace("cov", "")) # gapClosed
    return None
Beispiel #13
0
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type

    fasta_type -- None, 'spades', 'velvet' or 'platanus'; any other value
                  is reported through BtLog.error('2', ...)
    header     -- FASTA header to parse

    Returns the coverage as float, or None when no branch applies
    (fasta_type is None). A header that does not match the expected layout
    raises IndexError or ValueError.
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if fasta_type not in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        # one scan of the header; the regex used to be run twice and the
        # first result was never used
        return float(re.findall(r"_cov_(\d+\.*\d*)", header)[0])
    if fasta_type == 'velvet':
        # coverage is the last "_"-separated field
        return float(header.split("_")[-1])
    if fasta_type == 'platanus':
        fields = header.rstrip("\n").split("_")
        if len(fields) >= 3:
            return float(fields[2].replace("cov", ""))  # scaffold/scaffoldBubble/contig
        return float(fields[1].replace("cov", ""))  # gapClosed
    return None
Beispiel #14
0
def parseCas(infile, order_of_blobs):
    '''Parse per-contig coverage from a CAS file via "clc_mapping_info -n".

    infile         -- path of the CAS file; BtLog.error('0', ...) is called
                      when it is not an existing file
    order_of_blobs -- sequence names, indexed by (contig number - 1)

    Returns (cov_dict, reads_total, reads_mapped, read_cov_dict), where the
    two dicts map sequence name to base coverage and to read count, and
    the two totals come from checkCas(infile).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs)/100)
    # NOTE(review): the "." inside the two decimal groups is unescaped and
    # therefore matches any character - confirm whether "\." was intended.
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    # NOTE(review): infile is concatenated into the command string; how
    # runCmd executes it is not visible here.
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # Call runCmd once and reuse the result instead of invoking it a second
    # time for the loop.
    output = runCmd(command=command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if not cas_line_match:
                continue
            idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
            try:
                name = order_of_blobs[idx]
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
            except (IndexError, ValueError):
                # best effort: skip contigs that are unknown or whose
                # numbers cannot be converted
                pass
            else:
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
Beispiel #15
0
def parseSet(infile):
    '''Return the lines of a file as a set.

    The trailing newline and any leading ">" characters are removed from
    each line. BtLog.error('0', ...) is called when infile is not an
    existing file.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return set(line.rstrip("\n").lstrip(">") for line in fh)
Beispiel #16
0
def parseSet(infile):
    '''Return the lines of a file as a set.

    The trailing newline and any leading ">" characters are removed from
    each line. BtLog.error('0', ...) is called when infile is not an
    existing file.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return set(line.rstrip("\n").lstrip(">") for line in fh)
Beispiel #17
0
def parseList(infile):
    '''Return the lines of a file as a list, in file order.

    Only the trailing newline is removed from each line.
    BtLog.error('0', ...) is called when infile is not an existing file.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return [line.rstrip("\n") for line in fh]
Beispiel #18
0
def parseList(infile):
    '''Return the lines of a file as a list, in file order.

    Only the trailing newline is removed from each line.
    BtLog.error('0', ...) is called when infile is not an existing file.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return [line.rstrip("\n") for line in fh]
Beispiel #19
0
def parseColours(infile):
    '''Read a comma-separated file into a dict.

    Each line maps its first field to its second field (presumably a
    label and its colour); further fields are ignored. A falsy infile
    yields an empty dict. BtLog.error('0', ...) is called when infile is
    not an existing file. A line with fewer than two fields raises
    IndexError.
    '''
    colour_by_label = {}
    if not infile:
        return colour_by_label
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        for line in fh:
            fields = line.rstrip("\n").split(",")
            colour_by_label[fields[0]] = fields[1]
    return colour_by_label
Beispiel #20
0
def parseColours(infile):
    '''Read a comma-separated file into a dict.

    Each line maps its first field to its second field (presumably a
    label and its colour); further fields are ignored. A falsy infile
    yields an empty dict. BtLog.error('0', ...) is called when infile is
    not an existing file. A line with fewer than two fields raises
    IndexError.
    '''
    colour_by_label = {}
    if not infile:
        return colour_by_label
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        for line in fh:
            fields = line.rstrip("\n").split(",")
            colour_by_label[fields[0]] = fields[1]
    return colour_by_label
Beispiel #21
0
def writeNodesDB(nodesDB, nodesDB_f):
    '''Write a nodes database to a tab-separated text file.

    nodesDB   -- dict with a 'nodes_count' entry plus one entry per node,
                 each providing 'rank', 'name' and 'parent'
    nodesDB_f -- path of the file to write

    The first line is a "# nodes_count = N" header; every node follows as
    "node<TAB>rank<TAB>name<TAB>parent". Progress is reported through
    BtLog.progress.
    '''
    print(BtLog.status_d['5'] % nodesDB_f)
    nodes_count = nodesDB['nodes_count']
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count)
        written = 0
        for node in nodesDB:
            if node == "nodes_count":
                # bookkeeping entry, not a node
                continue
            written += 1
            BtLog.progress(written, 1000, nodes_count)
            entry = nodesDB[node]
            fh.write("%s\t%s\t%s\t%s\n" % (node, entry['rank'], entry['name'], entry['parent']))
Beispiel #22
0
def writeNodesDB(nodesDB, nodesDB_f):
    '''Write a nodes database to a tab-separated text file.

    nodesDB   -- dict with a 'nodes_count' entry plus one entry per node,
                 each providing 'rank', 'name' and 'parent'
    nodesDB_f -- path of the file to write

    The first line is a "# nodes_count = N" header; every node follows as
    "node<TAB>rank<TAB>name<TAB>parent". Progress is reported through
    BtLog.progress.
    '''
    print(BtLog.status_d['5'] % nodesDB_f)
    nodes_count = nodesDB['nodes_count']
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count)
        written = 0
        for node in nodesDB:
            if node == "nodes_count":
                # bookkeeping entry, not a node
                continue
            written += 1
            BtLog.progress(written, 1000, nodes_count)
            entry = nodesDB[node]
            fh.write("%s\t%s\t%s\t%s\n" % (node, entry['rank'], entry['name'], entry['parent']))
Beispiel #23
0
def parseCov(infile, set_of_blobs):
    '''Parse a coverage file in either the old or the new format.

    infile       -- path of the coverage file; BtLog.error('0', ...) is
                    called when it is not an existing file
    set_of_blobs -- names of the known sequences; entries for other names
                    are skipped with warning '2'

    A file counts as "new format" from its first "#" line onwards. New
    format data lines are "name<TAB>read_cov<TAB>base_cov" and the
    "## Total Reads", "## Mapped Reads" and "## Unmapped Reads" headers
    provide the read totals. Old format lines are "name<TAB>base_cov".

    Returns (base_cov_dict, reads_total, reads_mapped, reads_unmapped,
    read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    base_cov_dict = {}
    read_cov_dict = {}
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                # any header line switches to the new format for good
                old_format = 0
                if line.startswith("## Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("## Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("## Unmapped Reads"):
                    reads_unmapped = int(line.split(" = ")[1])
            elif old_format == 0:
                match = cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name = match.group(1)
                    read_cov = int(match.group(2))
                    base_cov = float(match.group(3))
                    if name not in set_of_blobs:
                        print(BtLog.warn_d['2'] % (name, infile))
                    else:
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        # Same two arguments as in the new-format branch;
                        # this branch used to pass only the name.
                        print(BtLog.warn_d['2'] % (name, infile))
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
Beispiel #24
0
def readFasta(infile):
    '''Yield (header, sequence) pairs from a FASTA file.

    The header is the text after ">" up to the first whitespace; the
    sequence is the concatenation of the following lines without line
    endings. BtLog.error('0', ...) is called when infile is not an
    existing file (on first iteration, since this is a generator).

    A final pair is always yielded, so an empty file yields ('', '').
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for l in fh:
            if l[0] == '>':
                if header:
                    yield header, ''.join(seqs)
                # Header is split at first whitespace; split() already
                # discards the line ending, so no slicing is needed.
                header, seqs = l[1:].split()[0], []
            else:
                # Strip only line-ending characters: slicing off the last
                # character lost a base when the final line had no newline.
                seqs.append(l.rstrip("\r\n"))
        yield header, ''.join(seqs)
Beispiel #25
0
def parseDict(infile, key, value):
    '''Build a dict from two whitespace-separated columns of a file.

    infile -- path of the file; a falsy value yields an empty dict
    key    -- index of the column used as key (converted with int())
    value  -- index of the column used as value (converted with int())

    BtLog.error('0', ...) is called when infile is not an existing file.
    A line with too few columns raises IndexError.
    '''
    if not infile:
        return {}
    if not isfile(infile):
        BtLog.error('0', infile)
    mapping = {}
    with open(infile) as fh:
        k_idx, v_idx = int(key), int(value)
        for line in fh:
            columns = line.rstrip("\n").split()
            mapping[columns[k_idx]] = columns[v_idx]
    return mapping
Beispiel #26
0
def parseCatColour(infile):
    '''Read "sequence name,category" lines into a dict.

    infile -- path of the file; a falsy value yields an empty dict

    BtLog.error('0', ...) is called when infile is not an existing file.
    A line that does not consist of exactly two comma-separated fields is
    reported through BtLog.error('23', ...).
    '''
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    seq_name, category = l.rstrip("\n").split(",")
                except ValueError:
                    # Only the unpacking can fail here; a bare "except"
                    # would also have swallowed KeyboardInterrupt.
                    BtLog.error('23', infile)
                else:
                    catcolour_dict[seq_name] = category
    return catcolour_dict
Beispiel #27
0
def parseDict(infile, key, value):
    '''Build a dict from two whitespace-separated columns of a file.

    infile -- path of the file; a falsy value yields an empty dict
    key    -- index of the column used as key (converted with int())
    value  -- index of the column used as value (converted with int())

    BtLog.error('0', ...) is called when infile is not an existing file.
    A line with too few columns raises IndexError.
    '''
    if not infile:
        return {}
    if not isfile(infile):
        BtLog.error('0', infile)
    mapping = {}
    with open(infile) as fh:
        k_idx, v_idx = int(key), int(value)
        for line in fh:
            columns = line.rstrip("\n").split()
            mapping[columns[k_idx]] = columns[v_idx]
    return mapping
Beispiel #28
0
def readFasta(infile):
    '''Yield (header, sequence) pairs from a FASTA file.

    The header is the text after ">" up to the first whitespace; the
    sequence is the concatenation of the following lines without line
    endings. BtLog.error('0', ...) is called when infile is not an
    existing file (on first iteration, since this is a generator).

    A final pair is always yielded, so an empty file yields ('', '').
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for l in fh:
            if l[0] == '>':
                if header:
                    yield header, ''.join(seqs)
                # Header is split at first whitespace; split() already
                # discards the line ending, so no slicing is needed.
                header, seqs = l[1:].split()[0], []
            else:
                # Strip only line-ending characters: slicing off the last
                # character lost a base when the final line had no newline.
                seqs.append(l.rstrip("\r\n"))
        yield header, ''.join(seqs)
Beispiel #29
0
def parseCatColour(infile):
    '''Read "sequence name,category" lines into a dict.

    infile -- path of the file; a falsy value yields an empty dict

    BtLog.error('0', ...) is called when infile is not an existing file.
    A line that does not consist of exactly two comma-separated fields is
    reported through BtLog.error('23', ...).
    '''
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    seq_name, category = l.rstrip("\n").split(",")
                except ValueError:
                    # Only the unpacking can fail here; a bare "except"
                    # would also have swallowed KeyboardInterrupt.
                    BtLog.error('23', infile)
                else:
                    catcolour_dict[seq_name] = category
    return catcolour_dict
Beispiel #30
0
def parseCov(infile, set_of_blobs):
    '''Parse a coverage file in either the old or the new format.

    infile       -- path of the coverage file; BtLog.error('0', ...) is
                    called when it is not an existing file
    set_of_blobs -- names of the known sequences; entries for other names
                    are skipped with warning '2'

    A file counts as "new format" from its first "#" line onwards. New
    format data lines are "name<TAB>read_cov<TAB>base_cov" and the
    "## Total Reads", "## Mapped Reads" and "## Unmapped Reads" headers
    provide the read totals. Old format lines are "name<TAB>base_cov".

    Returns (base_cov_dict, reads_total, reads_mapped, reads_unmapped,
    read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    base_cov_dict = {}
    read_cov_dict = {}
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                # any header line switches to the new format for good
                old_format = 0
                if line.startswith("## Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("## Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("## Unmapped Reads"):
                    reads_unmapped = int(line.split(" = ")[1])
            elif old_format == 0:
                match = cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name = match.group(1)
                    read_cov = int(match.group(2))
                    base_cov = float(match.group(3))
                    if name not in set_of_blobs:
                        print(BtLog.warn_d['2'] % (name, infile))
                    else:
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        # Same two arguments as in the new-format branch;
                        # this branch used to pass only the name.
                        print(BtLog.warn_d['2'] % (name, infile))
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
Beispiel #31
0
def parseReferenceCov(infile):
    '''Read "cov_lib,reads_total,reads_mapped" lines into a dict.

    infile -- path of the file; a falsy value yields an empty dict

    Returns {cov_lib: {'reads_total': int, 'reads_mapped': int}}.
    BtLog.error('0', ...) is called when infile is not an existing file.
    A line without exactly three comma-separated fields, or with
    non-integer counts, is reported through BtLog.error('21', ...).
    '''
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                    counts = {'reads_total' : int(reads_total_ref),
                              'reads_mapped' : int(reads_mapped_ref)}
                except ValueError:
                    # Unpacking and int() both fail with ValueError; a bare
                    # "except" would also have swallowed KeyboardInterrupt.
                    BtLog.error('21', infile)
                else:
                    refcov_dict[cov_lib] = counts
    return refcov_dict
Beispiel #32
0
def parseReferenceCov(infile):
    '''Read "cov_lib,reads_total,reads_mapped" lines into a dict.

    infile -- path of the file; a falsy value yields an empty dict

    Returns {cov_lib: {'reads_total': int, 'reads_mapped': int}}.
    BtLog.error('0', ...) is called when infile is not an existing file.
    A line without exactly three comma-separated fields, or with
    non-integer counts, is reported through BtLog.error('21', ...).
    '''
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                    counts = {'reads_total' : int(reads_total_ref),
                              'reads_mapped' : int(reads_mapped_ref)}
                except ValueError:
                    # Unpacking and int() both fail with ValueError; a bare
                    # "except" would also have swallowed KeyboardInterrupt.
                    BtLog.error('21', infile)
                else:
                    refcov_dict[cov_lib] = counts
    return refcov_dict
Beispiel #33
0
    def plotBar(self, cov_lib, out_f):
        '''Plot mapped and unmapped read fractions of one coverage library.

        cov_lib -- name of the coverage library to plot
        out_f   -- output prefix; the figure is saved as
                   "<out_f>.read_cov.<cov_lib>.<self.format>"

        The main axis shows the unmapped and mapped fractions for the
        assembly and, when self.refcov_dict has an entry for cov_lib, for
        the reference. The group axis shows
        self.stats[group]['reads_mapped_perc'][cov_lib] for every group in
        self.plot_order. BtLog.error('40', ...) is called when
        self.refcov_dict is non-empty but lacks cov_lib.
        '''
        def add_bar(bars, label, value, colour):
            # keep the label, value and colour lists of one axis in step
            bars['labels'].append(label)
            bars['values'].append(value)
            bars['colours'].append(colour)

        def draw_bars(ax, x_pos, bars):
            # draw the bars and write each height as a percentage above it
            rects = ax.bar(x_pos, bars['values'], width = 0.5, tick_label=bars['labels'], align='center', color = bars['colours'])
            for rect in rects:
                height = float(rect.get_height())
                ax.text(rect.get_x() + rect.get_width()/2., 0.005 + height, '{:.2f}%'.format(height*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)

        fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
        main_bars = {'labels' : [], 'values' : [], 'colours' : [] }
        group_bars = {'labels' : [], 'values' : [], 'colours' : [] }
        reads_total = self.cov_libs_total_reads_dict[cov_lib]
        reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
        reads_unmapped = reads_total - reads_mapped
        # NOTE(review): these fractions need true division; under Python 2
        # that requires "from __future__ import division" or float counts -
        # confirm at the top of the file.
        add_bar(main_bars, 'Unmapped (assembly)', reads_unmapped/reads_total, DGREY)
        add_bar(main_bars, 'Mapped (assembly)', reads_mapped/reads_total, DGREY)
        if self.refcov_dict:
            if cov_lib not in self.refcov_dict:
                BtLog.error('40', cov_lib)
            else:
                reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
                reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
                reads_unmapped_ref = reads_total_ref - reads_mapped_ref
                add_bar(main_bars, 'Unmapped (ref)', reads_unmapped_ref/reads_total_ref, DGREY)
                add_bar(main_bars, 'Mapped (ref)', reads_mapped_ref/reads_total_ref, DGREY)

        # mapped plotted groups
        for group in self.plot_order:
            add_bar(group_bars, group, self.stats[group]['reads_mapped_perc'][cov_lib], self.colours[group])
        draw_bars(ax_group, x_pos_group, group_bars)
        draw_bars(ax_main, x_pos_main, main_bars)

        ax_main.set_xticklabels(main_bars['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        ax_group.set_xticklabels(group_bars['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        out_f = "%s.read_cov.%s" % (out_f, cov_lib)
        print(BtLog.status_d['8'] % "%s.%s" % (out_f, self.format))
        fig.tight_layout()
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Beispiel #34
0
    def plotBar(self, cov_lib, out_f):
        '''Plot mapped and unmapped read fractions of one coverage library.

        cov_lib -- name of the coverage library to plot
        out_f   -- output prefix; the figure is saved as
                   "<out_f>.read_cov.<cov_lib>.<self.format>"

        The main axis shows the unmapped and mapped fractions for the
        assembly and, when self.refcov_dict has an entry for cov_lib, for
        the reference. The group axis shows
        self.stats[group]['reads_mapped_perc'][cov_lib] for every group in
        self.plot_order. BtLog.error('40', ...) is called when
        self.refcov_dict is non-empty but lacks cov_lib.
        '''
        def add_bar(bars, label, value, colour):
            # keep the label, value and colour lists of one axis in step
            bars['labels'].append(label)
            bars['values'].append(value)
            bars['colours'].append(colour)

        def draw_bars(ax, x_pos, bars):
            # draw the bars and write each height as a percentage above it
            rects = ax.bar(x_pos, bars['values'], width = 0.5, tick_label=bars['labels'], align='center', color = bars['colours'])
            for rect in rects:
                height = float(rect.get_height())
                ax.text(rect.get_x() + rect.get_width()/2., 0.005 + height, '{:.2f}%'.format(height*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)

        fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
        main_bars = {'labels' : [], 'values' : [], 'colours' : [] }
        group_bars = {'labels' : [], 'values' : [], 'colours' : [] }
        reads_total = self.cov_libs_total_reads_dict[cov_lib]
        reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
        reads_unmapped = reads_total - reads_mapped
        # NOTE(review): these fractions need true division; under Python 2
        # that requires "from __future__ import division" or float counts -
        # confirm at the top of the file.
        add_bar(main_bars, 'Unmapped (assembly)', reads_unmapped/reads_total, DGREY)
        add_bar(main_bars, 'Mapped (assembly)', reads_mapped/reads_total, DGREY)
        if self.refcov_dict:
            if cov_lib not in self.refcov_dict:
                BtLog.error('40', cov_lib)
            else:
                reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
                reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
                reads_unmapped_ref = reads_total_ref - reads_mapped_ref
                add_bar(main_bars, 'Unmapped (ref)', reads_unmapped_ref/reads_total_ref, DGREY)
                add_bar(main_bars, 'Mapped (ref)', reads_mapped_ref/reads_total_ref, DGREY)

        # mapped plotted groups
        for group in self.plot_order:
            add_bar(group_bars, group, self.stats[group]['reads_mapped_perc'][cov_lib], self.colours[group])
        draw_bars(ax_group, x_pos_group, group_bars)
        draw_bars(ax_main, x_pos_main, main_bars)

        ax_main.set_xticklabels(main_bars['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        ax_group.set_xticklabels(group_bars['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        out_f = "%s.read_cov.%s" % (out_f, cov_lib)
        print(BtLog.status_d['8'] % "%s.%s" % (out_f, self.format))
        fig.tight_layout()
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Beispiel #35
0
def parseCmdLabels(labels):
    '''
    Build a group -> label-name lookup from command-line label definitions.

    Each entry of labels is expected to look like "name=group1,group2";
    every listed group is mapped to that name. A falsy labels value yields
    an empty dict. Entries that do not contain exactly one "=" (or a
    labels value that cannot be iterated) are reported through
    BtLog.error('17', labels).
    '''
    label_d = {}
    if not labels:
        return label_d
    try:
        for label in labels:
            name, groups = str(label).split("=")
            # split(",") returns [groups] when there is no comma, so single
            # and multiple groups share one code path
            for group in groups.split(","):
                label_d[group] = name
    except (TypeError, ValueError):
        # only malformed input is reported; SystemExit/KeyboardInterrupt
        # are no longer swallowed as they were by a bare except
        BtLog.error('17', labels)
    return label_d
Beispiel #36
0
 def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
     '''
     Assign a taxonomy to every blob for each requested taxrule.

     Lineages are built once from self.set_of_taxIds via BtTax and stored
     on self.lineages. Blobs with hits get BtTax.taxRule(...), blobs
     without hits get BtTax.noHit(). self.set_of_taxIds is reset to an
     empty set afterwards.
     '''
     print(BtLog.status_d['6'] % ",".join(taxrules))
     tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
     self.lineages = BtTax.getLineages(tree_lists, nodesDB)
     self.taxrules = taxrules
     for blob_number, blob in enumerate(self.dict_of_blobs.values(), 1):
         BtLog.progress(blob_number, 100, self.seqs)
         for rule in taxrules:
             if not blob.hits:
                 blob.taxonomy[rule] = BtTax.noHit()
                 continue
             blob.taxonomy[rule] = BtTax.taxRule(rule, blob.hits, self.lineages, min_bitscore_diff, tax_collision_random)
     # the taxIds are only needed to build the lineages above
     self.set_of_taxIds = set()
Beispiel #37
0
def parseCmdLabels(labels):
    '''
    Translate "name=group[,group...]" label arguments into a dict.

    Returns a dict mapping each group to the label name it was listed
    under; an empty dict is returned when labels is empty or None.
    An entry without exactly one "=" (or a non-iterable labels value)
    triggers BtLog.error('17', labels).
    '''
    label_d = {}
    try:
        for label in labels or []:
            name, groups = str(label).split("=")
            # a group string without commas splits into a one-element list
            label_d.update((group, name) for group in groups.split(","))
    except (TypeError, ValueError):
        # restricted to input errors so that SystemExit/KeyboardInterrupt
        # propagate instead of being converted into error 17
        BtLog.error('17', labels)
    return label_d
Beispiel #38
0
def readNodesDB(nodesDB_f):
    '''
    Read a tab-separated nodesDB file into a dict keyed by node id.

    Data lines hold four tab-separated fields: node, rank, name, parent.
    A header line of the form "# nodes_count = N" supplies the expected
    number of nodes, which is only used for progress reporting. Any other
    line starting with "#" is ignored.

    Returns {node: {'rank': ..., 'name': ..., 'parent': ...}} plus the
    extra key 'nodes_count' holding the number of data lines read.
    '''
    nodesDB = {}
    nodesDB_count = 0
    nodes_count = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                # str.lstrip() takes a set of characters, not a prefix, so
                # the "key = value" header is parsed explicitly instead
                key, _, value = line.lstrip("#").partition("=")
                if key.strip() == "nodes_count" and value.strip().isdigit():
                    nodesDB_count = int(value.strip())
                continue
            nodes_count += 1
            node, rank, name, parent = line.rstrip("\n").split("\t")
            nodesDB[node] = {'rank' : rank, 'name' : name, 'parent' : parent}
            if (nodesDB_count):
                BtLog.progress(nodes_count, 1000, nodesDB_count)
    nodesDB['nodes_count'] = nodes_count
    return nodesDB
Beispiel #39
0
def readNodesDB(nodesDB_f):
    '''
    Load the nodes database from the tab-separated file nodesDB_f.

    Every non-comment line must contain node, rank, name and parent
    separated by tabs. The optional "# nodes_count = N" comment gives the
    number of nodes to expect and drives BtLog.progress; comment lines
    that do not carry that count are skipped.

    The returned dict maps node -> {'rank', 'name', 'parent'} and also
    contains 'nodes_count' with the number of node lines parsed.
    '''
    nodesDB = {}
    nodesDB_count = 0
    nodes_count = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                # lstrip("# nodes_count = ") would strip a character set
                # and crash int() on any other comment; split on "=" instead
                label, _, number = line[1:].partition("=")
                number = number.strip()
                if label.strip() == "nodes_count" and number.isdigit():
                    nodesDB_count = int(number)
            else:
                nodes_count += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank' : rank, 'name' : name, 'parent' : parent}
                if (nodesDB_count):
                    BtLog.progress(nodes_count, 1000, nodesDB_count)
    nodesDB['nodes_count'] = nodes_count
    return nodesDB
Beispiel #40
0
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    '''
    Collect per-sequence coverage figures from a SAM file.

    Header lines (starting with "@") are skipped. Every other line counts
    as one read; reads whose reference name (3rd field) is not "*" count
    as mapped. For mapped reads on a sequence in set_of_blobs the read
    counter is increased and, unless no_base_cov_flag is truthy, the
    lengths of all M, = and X CIGAR operations (6th field) are added to
    the base coverage. Reads on unknown sequences trigger warning '2'.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict);
    base_cov_dict holds 0 for every sequence when base coverage is off.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob: 0 for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    # "(\d+)M|X|=" only captured digits before M and produced empty
    # strings for X and =; the character class captures all three
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    reads_total = 0
    reads_mapped = 0
    with open(infile) as fh:
        for line in fh:
            if line.startswith("@"):
                continue
            reads_total += 1
            match = line.split()
            seq_name = match[2]
            if seq_name == '*':
                continue
            reads_mapped += 1
            try:
                if not no_base_cov_flag:
                    base_cov_dict[seq_name] += sum(
                        int(matching)
                        for matching in cigar_match_re.findall(match[5]))
                read_cov_dict[seq_name] += 1
            except (KeyError, IndexError):
                # unknown sequence name or truncated alignment line
                print(BtLog.warn_d['2'] % (seq_name))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Beispiel #41
0
 def computeTaxonomy(self, taxrules, nodesDB, min_bitscore_diff, tax_collision_random):
     '''
     Compute blObj.taxonomy[taxrule] for all blobs and all taxrules.

     Uses BtTax to derive lineages from self.set_of_taxIds (kept on
     self.lineages), applies BtTax.taxRule to blobs that have hits and
     BtTax.noHit to those that do not, then empties self.set_of_taxIds.
     '''
     print(BtLog.status_d["6"] % ",".join(taxrules))
     self.lineages = BtTax.getLineages(
         BtTax.getTreeList(self.set_of_taxIds, nodesDB), nodesDB)
     self.taxrules = taxrules
     processed = 0
     for blob in self.dict_of_blobs.values():
         processed += 1
         BtLog.progress(processed, 100, self.seqs)
         for rule in taxrules:
             blob.taxonomy[rule] = (
                 BtTax.taxRule(rule, blob.hits, self.lineages,
                               min_bitscore_diff, tax_collision_random)
                 if blob.hits else BtTax.noHit()
             )
     # taxIds have served their purpose once the lineages exist
     self.set_of_taxIds = set()
Beispiel #42
0
def main():
    '''
    Command-line entry point: parse coverage libraries for an assembly.

    Reads the docopt arguments, creates one CovLibObj per BAM, SAM and CAS
    file (named "<type><index>"), aborts with error '31' when no coverage
    library was given, then parses the FASTA and the coverage into a
    BlobDb titled 'cov'.
    '''
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    sam_fs = args['--sam']
    prefix = args['--output']
    no_base_cov_flag = args['--no_base_cov']

    # Make covLibs, keeping the order bam, sam, cas
    cov_libs = []
    for lib_type, lib_fs in (('bam', bam_fs), ('sam', sam_fs), ('cas', cas_fs)):
        for idx, lib_f in enumerate(lib_fs):
            cov_libs.append(BtCore.CovLibObj(lib_type + str(idx), lib_type, lib_f))
    if not cov_libs:
        BtLog.error('31')

    blobDb = BtCore.BlobDb('cov')
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=no_base_cov_flag)
Beispiel #43
0
def checkCas(infile):
    '''
    Summarise a CAS mapping file by running `clc_mapping_info -s`.

    Aborts via BtLog.error when infile does not exist ('0') or the
    clc_mapping_info executable is not found ('20'). The command output
    is searched for the "Contigs", "Reads" and "Mapped reads" figures.

    Returns (seqs_total, reads_total, reads_mapped) as integers.
    '''
    print(BtLog.status_d['12'])
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    command = "clc_mapping_info -s " + infile
    output = ''.join(runCmd(command=command))
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    # explicit float division: correct with or without true division, and
    # an empty mapping reports 0.0% instead of raising ZeroDivisionError
    mapped_fraction = float(reads_mapped) / reads_total if reads_total else 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(mapped_fraction)))
    return seqs_total, reads_total, reads_mapped
Beispiel #44
0
def checkCas(infile):
    '''
    Run `clc_mapping_info -s` on a CAS file and extract its read counts.

    Error '0' is raised through BtLog when infile is missing and error
    '20' when clc_mapping_info is not available. Contig, read and
    mapped-read counts are taken from the command output with regexes.

    Returns the tuple (seqs_total, reads_total, reads_mapped).
    '''
    print(BtLog.status_d['12'])
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    output = ''
    for line in runCmd(command="clc_mapping_info -s " + infile):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    if reads_total:
        # float() keeps the ratio fractional even under integer division
        mapping_rate = float(reads_mapped) / reads_total
    else:
        # no reads at all: report 0.0% rather than dividing by zero
        mapping_rate = 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(mapping_rate)))
    return seqs_total, reads_total, reads_mapped
Beispiel #45
0
def readTax(infile, set_of_blobs):
    '''
    Generator over the hits of a similarity-search result file.

    Each line matching hit_line_re yields a dict with the keys 'name',
    'taxId' (kept as string) and 'score' (float); other lines are skipped.
    A name that is not in set_of_blobs is reported with error '19'.

    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    # NOTE(review): "[\;?\d+]*" is a character class (any of ; ? digit +),
    # presumably meant to skip further ";"-separated taxIds - confirm
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
    with open(infile) as fh:
        for line in fh:
            found = hit_line_re.search(line)
            if found is None:
                continue
            name, tax_id, score = found.groups()
            # taxId stays a string on purpose (see original remark)
            hitDict = {'name': name, 'taxId': tax_id, 'score': float(score)}
            if name not in set_of_blobs:
                BtLog.error('19', name, infile)
            # NOTE(review): the regex only captures digits here, so this
            # comparison looks unreachable - confirm
            if tax_id == 'N/A':
                BtLog.error('22', infile)
            yield hitDict
Beispiel #46
0
def checkBam(infile):
    '''
    Count total and mapped primary reads of a BAM file via
    `samtools flagstat`.

    Aborts through BtLog.error when infile is missing ('0'), samtools is
    not found ('7') or the file holds no (mapped) reads ('29').
    Secondary and supplementary alignments are subtracted from both the
    mapped and the total count; a flagstat output without those lines is
    treated as having none.

    Returns (reads_total, reads_mapped).
    '''
    print(BtLog.status_d['10'])
    if not isfile(infile):
        BtLog.error('0', infile)
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    command = "samtools flagstat " + infile
    output = ''.join(runCmd(command=command))
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    secondary_match = reads_secondary_re.search(output)
    reads_secondary = int(secondary_match.group(1)) if secondary_match else 0
    supplementary_match = reads_supplementary_re.search(output)
    reads_supplementary = int(supplementary_match.group(1)) if supplementary_match else 0
    # was "- reads_secondary - reads_secondary": supplementary alignments
    # were never removed from the mapped count
    reads_mapped = reads_mapped - reads_secondary - reads_supplementary
    reads_total = int(reads_total_re.search(output).group(1)) - reads_secondary - reads_supplementary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        # '29' % infile raised TypeError; infile belongs in the arguments
        BtLog.error('29', infile)
    mapped_fraction = float(reads_mapped) / reads_total if reads_total else 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
        '{:,}'.format(reads_total), '{0:.1%}'.format(mapped_fraction)))
    return reads_total, reads_mapped
Beispiel #47
0
def checkBam(infile):
    '''
    Run `samtools flagstat` on infile and derive primary read counts.

    BtLog.error is called with '0' for a missing file, '7' when samtools
    cannot be found and '29' when no reads or no mapped reads remain.
    Both counts exclude secondary and supplementary alignments; missing
    secondary/supplementary lines in the output count as zero.

    Returns (reads_total, reads_mapped).
    '''
    print(BtLog.status_d['10'])
    if not isfile(infile):
        BtLog.error('0', infile)
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    output = ''
    for line in runCmd(command="samtools flagstat " + infile):
        output += line

    def optional_count(regex):
        # flagstat line may be absent; treat that as a count of zero
        found = regex.search(output)
        return int(found.group(1)) if found else 0

    reads_secondary = optional_count(reads_secondary_re)
    reads_supplementary = optional_count(reads_supplementary_re)
    non_primary = reads_secondary + reads_supplementary
    # the original subtracted reads_secondary twice and never removed the
    # supplementary alignments from the mapped count
    reads_mapped = int(reads_mapped_re.search(output).group(1)) - non_primary
    reads_total = int(reads_total_re.search(output).group(1)) - non_primary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        # pass infile as an argument; "'29' % infile" raised TypeError
        BtLog.error('29', infile)
    mapped_fraction = float(reads_mapped) / reads_total if reads_total else 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
        '{:,}'.format(reads_total), '{0:.1%}'.format(mapped_fraction)))
    return reads_total, reads_mapped
Beispiel #48
0
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    '''
    Parse a SAM file into base and read coverage per sequence.

    Lines starting with "@" are headers and ignored. All remaining lines
    are counted as reads, and those with a reference name other than "*"
    as mapped reads. Mapped reads on sequences from set_of_blobs increase
    that sequence's read count; when no_base_cov_flag is falsy the summed
    lengths of the M, = and X CIGAR operations are added to its base
    coverage as well. Warning '2' is printed for other sequence names.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    want_base_cov = not no_base_cov_flag
    base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    # digits before M, X or =; the former pattern "(\d+)M|X|=" returned
    # empty captures for X and =, which made int() fail on such reads
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    reads_total = 0
    reads_mapped = 0
    with open(infile) as fh:
        for line in fh:
            if not line.startswith("@"):
                reads_total += 1
                match = line.split()
                if not match[2] == '*':
                    reads_mapped += 1
                    try:
                        if want_base_cov:
                            aligned = [int(matching) for matching in cigar_match_re.findall(match[5])]
                            base_cov_dict[match[2]] += sum(aligned)
                        read_cov_dict[match[2]] += 1
                    except (KeyError, IndexError):
                        # sequence not in set_of_blobs or line too short
                        print(BtLog.warn_d['2'] % (match[2]))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Beispiel #49
0
def main():
    '''
    Command-line entry point: filter a FASTA file by a list of headers.

    Sequences whose header is in the list are written to the output file;
    with --invert the sequences NOT in the list are written instead.
    Headers that were written more than once are reported with warning
    '8'. The output path comes from BtIO.getOutFile(fasta_f, prefix,
    "filtered.fna").
    '''
    args = docopt(__doc__)
    fasta_f = args['--infile']
    list_f = args['--list']
    invert = args['--invert']
    prefix = args['--out']

    output = []
    out_f = BtIO.getOutFile(fasta_f, prefix, "filtered.fna")

    print(BtLog.status_d['1'] % ("list", list_f))
    items = BtIO.parseSet(list_f)
    items_count = len(items)
    print(BtLog.status_d['22'] % fasta_f)
    seen_headers = set()
    duplicated_headers = set()
    items_parsed_count = 0
    sequences = 0
    for header, sequence in BtIO.readFasta(fasta_f):
        sequences += 1
        # keep listed headers normally, unlisted ones when inverting
        if (header in items) != bool(invert):
            items_parsed_count += 1
            # track repeats while reading instead of list.count() per
            # header afterwards, which was quadratic
            if header in seen_headers:
                duplicated_headers.add(header)
            seen_headers.add(header)
            output.append(">%s\n%s\n" % (header, sequence))
        BtLog.progress(len(output), 10, items_count, no_limit=True)
    BtLog.progress(items_count, 10, items_count)

    # an empty FASTA must not raise ZeroDivisionError
    parsed_fraction = float(items_parsed_count) / sequences if sequences else 0.0
    print(BtLog.status_d['23'] % ('{:.2%}'.format(parsed_fraction), "{:,}".format(items_count), "{:,}".format(items_parsed_count), "{:,}".format(sequences)))

    if duplicated_headers:
        # sorted for a reproducible message (set order is arbitrary)
        print(BtLog.warn_d['8'] % "\n\t\t\t".join(sorted(duplicated_headers)))

    with open(out_f, "w") as fh:
        print(BtLog.status_d['24'] % out_f)
        fh.write("".join(output))
Beispiel #50
0
def main():
    '''
    Command-line entry point: extract read pairs from a BAM file.

    --include and --exclude are mutually exclusive (error '43'). The
    sequence list of whichever option is given is read with
    BtIO.parseList and handed to BtIO.parseBamForFilter; without either
    option both lists are None.
    '''
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])

    print(BtLog.status_d['22'] % bam_f)
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        # BtLog.error is called directly, as everywhere else; the stray
        # print around it only echoed its return value
        BtLog.error('43')
    else:
        include = BtIO.parseList(include_f) if include_f else None
        exclude = BtIO.parseList(exclude_f) if exclude_f else None
        BtIO.parseBamForFilter(bam_f, out_f, include, exclude, gzip, do_sort, keep_sorted, sort_threads)
Beispiel #51
0
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    Collect per-sequence base and read coverage from a BAM file.

    checkBam returns reads_total and reads_mapped. The alignments are
    streamed from `samtools view`; for each alignment on a sequence in
    set_of_blobs the read count is increased and, unless no_base_cov_flag
    is truthy, the lengths of its M, = and X CIGAR operations are added
    to the base coverage. Unknown sequence names trigger warning '2', a
    mismatch between expected and seen reads warning '3'.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    base_cov_dict = {blob: 0 for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    # digits before M, X or =; "(\d+)M|X|=" yielded empty captures for
    # X and =, so such reads failed int() and were silently dropped
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    # execute samtools to get only mapped reads (no optical duplicates, no 2nd-ary alignment)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    for line in runCmd(command=command):
        seen_reads += 1
        match = line.split()
        try:
            if not no_base_cov_flag:
                base_cov_dict[match[2]] += sum(
                    int(matching)
                    for matching in cigar_match_re.findall(match[5]))
            read_cov_dict[match[2]] += 1
        except (KeyError, IndexError):
            # sequence not in set_of_blobs or alignment line too short
            print(BtLog.warn_d['2'] % (match[2]))
        BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_mapped, seen_reads))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Beispiel #52
0
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    Parse a BAM file into base and read coverage per sequence.

    checkBam supplies reads_total and reads_mapped. Alignments come from
    `samtools view`; each one on a sequence of set_of_blobs adds one read
    and, when no_base_cov_flag is falsy, the summed M/=/X CIGAR lengths to
    that sequence. Warning '2' is printed for sequences outside
    set_of_blobs and warning '3' when the number of alignments seen
    differs from reads_mapped.

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    want_base_cov = not no_base_cov_flag
    base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    # the old pattern "(\d+)M|X|=" captured nothing for X and =, which
    # made int('') fail; the character class handles all three operators
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    # execute samtools to get only mapped reads (no optical duplicates, no 2nd-ary alignment)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    for line in runCmd(command=command):
        seen_reads += 1
        match = line.split()
        try:
            if want_base_cov:
                aligned = [int(matching) for matching in cigar_match_re.findall(match[5])]
                base_cov_dict[match[2]] += sum(aligned)
            read_cov_dict[match[2]] += 1
        except (KeyError, IndexError):
            # unknown sequence name or truncated alignment line
            print(BtLog.warn_d['2'] % (match[2]))
        BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_mapped, seen_reads))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Beispiel #53
0
def readTax(infile, set_of_blobs):
    '''
    Yield one hit dict per matching line of a hits file.

    The dict has the keys 'name', 'taxId' (string) and 'score' (float).
    Lines that do not match hit_line_re are ignored; a name missing from
    set_of_blobs results in BtLog.error('19', ...).

    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    # NOTE(review): "[\;?\d+]*" is a character class, not a repeated
    # ";<digits>" group - presumably unintended, confirm
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
    with open(infile) as fh:
        for line in fh:
            hit = hit_line_re.search(line)
            if not hit:
                continue
            hitDict = dict(
                name=hit.group(1),
                taxId=hit.group(2),  # kept as string, see original remark
                score=float(hit.group(3)),
            )
            if hitDict['name'] not in set_of_blobs:
                BtLog.error('19', hitDict['name'], infile)
            # NOTE(review): group 2 only matches digits, so 'N/A' looks
            # unreachable here - confirm
            if hitDict['taxId'] == 'N/A':
                BtLog.error('22', infile)
            yield hitDict
Beispiel #54
0
def parseNodesDB(**kwargs):
    '''
    Load the taxonomy 'nodesDB' dict of dicts from one of three sources.

    Keyword arguments (all required, may be None):
        names, nodes   - names.dmp / nodes.dmp, used when both are given
        nodesDB        - a nodesDB file, used otherwise when given
        nodesDBdefault - the default nodesDB file, used as last resort

    If the default nodesDB file does not exist yet, the loaded nodesDB is
    written to that path. Problems are reported through BtLog.error
    ('0' missing file, '3'/'27' unreadable input, '28' missing default).

    Returns (nodesDB, nodesDB_f), nodesDB_f being the nodesDB file that
    was used (or the value passed in when names/nodes were used).
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']

    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        print(BtLog.status_d['3'] % (nodes_f, names_f))
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except Exception:
            # Exception instead of a bare except, so that SystemExit and
            # KeyboardInterrupt are not turned into a parse error
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print(BtLog.status_d['4'] % (nodesDB_f))
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except Exception:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print(BtLog.status_d['4'] % (nodesDB_default))
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except Exception:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default

    # Write nodesDB if not available; without a default path there is
    # nowhere to write to (and isfile(None) would raise)
    if nodesDB_default and not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)

    return nodesDB, nodesDB_f
Beispiel #55
0
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort,
                      keep_sorted, sort_threads):
    '''
    Parse a BAM file to extract read pairs, classified by where the two
    mates map.

    checkBam returns reads_total and reads_mapped. Each reference name is
    typed 'In', 'Ex' or 'Un' ("*"): with include, listed sequences are
    'In' and all others 'Ex'; with exclude, listed sequences are 'Ex' and
    all others 'In'; with neither, everything mapped is 'In'. A pair's
    type is the sorted concatenation of both mate types. Consecutive
    lines of the `samtools view` output are treated as mates, so the
    input has to be sorted by read name (do_sort does that with
    sort_threads threads; the sorted copy is removed afterwards unless
    keep_sorted is set).

    Pair sequences are written via the project helpers, a summary table
    goes to the "info.txt" file and outputs are gzipped when gzip is
    truthy. Always returns 1.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        # the thread count used to be the literal text "sort_threads"
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (
            sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    # at least 1: fewer than 1000 mapped reads gave 0 and made the modulo
    # below raise ZeroDivisionError
    progress_unit = max(1, int(reads_mapped / 1000))
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(
        outfile, include, exclude)
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude or ():
            sequence_to_type_dict[excl] = 'Ex'
    sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(
                sorted([
                    sequence_to_type_dict[read1[2]],
                    sequence_to_type_dict[read2[2]]
                ]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                # flush the buffered sequences and start with empty buffers
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs,
                                                read_pair_seqs)
                read_pair_seqs = {
                    pair_type: tuple()
                    for pair_type in read_pair_count
                }
        except StopIteration:
            # odd number of lines: the last read has no mate
            print(BtLog.warn_d['11'])
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    total_pairs = int(seen_reads / 2)
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(total_pairs),
                        '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        # no pairs at all: report 0.0% instead of dividing by zero
        fraction = float(count) / total_pairs if total_pairs else 0.0
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count),
                            '{0:.1%}'.format(fraction)))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print(BtLog.status_d['24'] % info_out_f)
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print(BtLog.status_d['25'] % out_f)
            runCmd(command="gzip -f " + out_f, wait=True)

    if not int(reads_total) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_total, seen_reads))
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
Beispiel #56
0
def main():
    '''
    Command-line entry point: build a BlobDB and write it as JSON.

    Steps: read the docopt arguments, set up coverage and hit libraries,
    parse the FASTA, load the nodesDB, parse hits and compute the
    taxonomy (or print warning '0' without hits), parse the coverage and
    finally dump the BlobDb with BtIO.writeJson.
    '''
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    min_bitscore_diff = float(args['--min_diff'])
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile; it doubles as title when none was given
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not title:
        title = out_f

    # coverage: some source of coverage is mandatory
    if not (fasta_type or bam_fs or sam_fs or cov_fs or cas_fs):
        BtLog.error('1')
    cov_libs = []
    for lib_type, lib_fs in (('bam', bam_fs), ('sam', sam_fs),
                             ('cas', cas_fs), ('cov', cov_fs)):
        for idx, lib_f in enumerate(lib_fs):
            cov_libs.append(BtCore.CovLibObj(lib_type + str(idx), lib_type, lib_f))

    # taxonomy
    hit_libs = []
    for idx, lib_f in enumerate(hit_fs):
        hit_libs.append(BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f))

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(
        nodes=nodes_f,
        names=names_f,
        nodesDB=nodesDB_f,
        nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if not hit_libs:
        print(BtLog.warn_d['0'])
    else:
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff,
                               tax_collision_random)

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generating BlobDB and writing to file
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
Beispiel #57
0
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    '''
    Parse a BAM file to extract read pairs and sort them into categories.

    checkBam returns reads_total and reads_mapped. Reference names are
    typed 'In', 'Ex' or 'Un' (for "*"): sequences in include are 'In' and
    the rest 'Ex'; with exclude, listed sequences are 'Ex' and the rest
    'In'; without either list every mapped sequence is 'In'. The type of
    a pair is the sorted concatenation of its two mate types. Mates are
    taken from consecutive `samtools view` lines, so the BAM must be
    sorted by read name; do_sort creates such a copy using sort_threads
    threads and deletes it at the end unless keep_sorted is set.

    Writes the pair sequences through the project helpers, a summary
    table to the "info.txt" file and gzips the outputs when gzip is
    truthy. Always returns 1.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        # interpolate the thread count; the command previously contained
        # the literal text "sort_threads"
        command = 'samtools sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    # never 0, otherwise "seen_reads % progress_unit" raises
    # ZeroDivisionError for BAM files with fewer than 1000 mapped reads
    progress_unit = int(reads_mapped/1000) or 1
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(outfile, include, exclude)
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]]]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                # write out what has been buffered and reset the buffers
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {pair_type : tuple() for pair_type in read_pair_count}
        except StopIteration:
            # the last read had no mate line following it
            print(BtLog.warn_d['11'])
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    pairs_total = int(seen_reads/2)
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(pairs_total), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        if pairs_total:
            pair_fraction = float(count) / pairs_total
        else:
            # empty input: avoid ZeroDivisionError
            pair_fraction = 0.0
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(pair_fraction)))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print(BtLog.status_d['24'] % info_out_f)
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print(BtLog.status_d['25'] % out_f)
            runCmd(command="gzip -f " + out_f, wait=True)

    if not int(reads_total) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_total, seen_reads))
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
Beispiel #58
0
def parseNodesDB(**kwargs):
    '''
    Obtain the 'nodesDB' dict of dicts, trying three sources in order.

    Required keyword arguments (each may be None):
        names / nodes  - names.dmp and nodes.dmp; parsed with
                         readNamesNodes when both are given
        nodesDB        - nodesDB file; read with readNodesDB otherwise
        nodesDBdefault - default nodesDB file; read when neither of the
                         above applies

    When the default nodesDB file is missing, the nodesDB obtained here
    is written to it. Errors go through BtLog.error: '0' for a missing
    input file, '3' / '27' when parsing fails, '28' when the default
    file is needed but absent.

    Returns (nodesDB, nodesDB_f).
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']

    if (nodes_f and names_f):
        for dump_f in (names_f, nodes_f):
            if not isfile(dump_f):
                BtLog.error('0', dump_f)
        print(BtLog.status_d['3'] % (nodes_f, names_f))
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except Exception:
            # not a bare except: SystemExit/KeyboardInterrupt must pass
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print(BtLog.status_d['4'] % (nodesDB_f))
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except Exception:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print(BtLog.status_d['4'] % (nodesDB_default))
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except Exception:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default

    # Write nodesDB if not available; skipped when no default path was
    # supplied, since isfile(None) would raise
    if nodesDB_default and not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)

    return nodesDB, nodesDB_f