Ejemplo n.º 1
0
def set_format_scatterplot(axScatter, **kwargs):
    min_x, max_x = None, None
    min_y, max_y = None, None
    if kwargs['plot'] == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif kwargs['plot'] == 'covplot':
        min_x, max_x = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        BtLog.error('34' % kwargs['plot'])
    axScatter.set_xlim( (min_x, max_x) )
    axScatter.set_ylim( (min_y, max_y) ) # This sets the max-Coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
Ejemplo n.º 2
0
def parseCas(infile, order_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs)/100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    if (runCmd(command=command)):
        for line in runCmd(command=command):
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
Ejemplo n.º 3
0
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        spades_match_re = re.compile(r"_cov_(\d+\.*\d*)")
        cov = re.findall(r"_cov_(\d+\.*\d*)", header)
        return float(spades_match_re.findall(header)[0])
    elif fasta_type == 'velvet':
        return float(header.split("_")[-1])
    #elif fasta_type == 'abyss' or fasta_type == 'soap':
    #    temp = header.split(" ")
    #    return float(temp[2]/(temp[1]+1-75))
    elif fasta_type == 'platanus':
        temp = header.rstrip("\n").split("_")
        if len(temp) >= 3:
            return float(temp[2].replace("cov", "")) # scaffold/scaffoldBubble/contig
        else:
            return float(temp[1].replace("cov", "")) # gapClosed
    else:
        pass
Ejemplo n.º 4
0
def parseJson(infile):
    '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
    if not isfile(infile):
        BtLog.error('0', infile)
    import time
    start = time.time()
    json_parser = ''
    with open(infile, 'r') as fh:
        print BtLog.status_d['15']
        json_string = fh.read()
    try:
        import ujson as json # fastest
        json_parser = 'ujson'
        print BtLog.status_d['16'] % json_parser
    except ImportError:
        try:
            import simplejson as json # fast
            json_parser = 'simplejson'
        except ImportError:
            import json # default
            json_parser = 'json'
        print BtLog.status_d['17'] % json_parser
    try:
        obj = json.loads(json_string.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    data = byteify(obj)
    print BtLog.status_d['20'] % (time.time() - start)
    return data
Ejemplo n.º 5
0
def main():

    #main_dir = dirname(__file__)
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    min_bitscore_diff = float(args['--min_diff'])
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not (title):
        title = out_f

    # coverage
    if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
           [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
           [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
           [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # taxonomy
    hit_libs = [BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f, names=names_f, nodesDB=nodesDB_f, nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if (hit_libs):
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff, tax_collision_random)
    else:
        print BtLog.warn_d['0']

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generating BlobDB and writing to file
    print BtLog.status_d['7'] % out_f
    BtIO.writeJson(blobDb.dump(), out_f)
Ejemplo n.º 6
0
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        spades_match_re = re.compile(r"_cov_(\d+\.*\d*)")
        cov = re.findall(r"_cov_(\d+\.*\d*)", header)
        return float(spades_match_re.findall(header)[0])
    elif fasta_type == 'velvet':
        return float(header.split("_")[-1])
    #elif fasta_type == 'abyss' or fasta_type == 'soap':
    #    temp = header.split(" ")
    #    return float(temp[2]/(temp[1]+1-75))
    elif fasta_type == 'platanus':
        temp = header.rstrip("\n").split("_")
        if len(temp) >= 3:
            return float(temp[2].replace("cov",
                                         ""))  # scaffold/scaffoldBubble/contig
        else:
            return float(temp[1].replace("cov", ""))  # gapClosed
    else:
        pass
Ejemplo n.º 7
0
def main():
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])

    print BtLog.status_d['22'] % bam_f
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        print BtLog.error('43')
    elif include_f:
        sequence_list = BtIO.parseList(include_f)
        BtIO.parseBamForFilter(bam_f, out_f, sequence_list, None, gzip,
                               do_sort, keep_sorted, sort_threads)
    elif exclude_f:
        sequence_list = BtIO.parseList(exclude_f)
        BtIO.parseBamForFilter(bam_f, out_f, None, sequence_list, gzip,
                               do_sort, keep_sorted, sort_threads)
    else:
        BtIO.parseBamForFilter(bam_f, out_f, None, None, gzip, do_sort,
                               keep_sorted, sort_threads)
Ejemplo n.º 8
0
def set_format_scatterplot(axScatter, **kwargs):
    min_x, max_x = None, None
    min_y, max_y = None, None
    if kwargs['plot'] == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif kwargs['plot'] == 'covplot':
        min_x, max_x = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        min_y, max_y = kwargs['min_cov']*0.1, kwargs['max_cov']+1000
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        BtLog.error('34' % kwargs['plot'])
    axScatter.set_xlim( (min_x, max_x) )
    axScatter.set_ylim( (min_y, max_y) ) # This sets the max-Coverage so that all libraries + sum are at the same scale
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
Ejemplo n.º 9
0
 def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
     selected_cov_libs = []
     cov_lib_selection_error = 0
     if (cov_lib_selection):
         if cov_lib_selection == 'covsum':
             selected_cov_libs.append('covsum')
         elif "," in cov_lib_selection:
             selected_cov_libs = cov_lib_selection.split(",")
             if not set(selected_cov_libs).issubset(set(cov_lib_dict.keys())):
                 cov_lib_selection_error = 1
         else:
             selected_cov_libs.append(cov_lib_selection)
             if not cov_lib_selection in cov_lib_dict:
                 cov_lib_selection_error = 1
     else:
         selected_cov_libs = cov_lib_dict.keys()
     if cov_lib_selection_error:
         covlib_string = []
         for covlib in cov_lib_dict:
             cov_lib_f = cov_lib_dict[covlib]['f']
             if not cov_lib_f:
                 cov_lib_f = "sum of coverages from all covlibs"
             covlib_string.append("\t\t%s : %s" % (covlib, cov_lib_f))
         BtLog.error('33', "\n".join(covlib_string))
     return selected_cov_libs
Ejemplo n.º 10
0
 def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
     selected_cov_libs = []
     cov_lib_selection_error = 0
     if (cov_lib_selection):
         if cov_lib_selection == 'covsum':
             selected_cov_libs.append('covsum')
         elif "," in cov_lib_selection:
             selected_cov_libs = cov_lib_selection.split(",")
             if not set(selected_cov_libs).issubset(set(cov_lib_dict.keys())):
                 cov_lib_selection_error = 1
         else:
             selected_cov_libs.append(cov_lib_selection)
             if not cov_lib_selection in cov_lib_dict:
                 cov_lib_selection_error = 1
     else:
         selected_cov_libs = cov_lib_dict.keys()
     if cov_lib_selection_error:
         covlib_string = []
         for covlib in cov_lib_dict:
             cov_lib_f = cov_lib_dict[covlib]['f']
             if not cov_lib_f:
                 cov_lib_f = "sum of coverages from all covlibs"
             covlib_string.append("\t\t%s : %s" % (covlib, cov_lib_f))
         BtLog.error('33', "\n".join(covlib_string))
     return selected_cov_libs
Ejemplo n.º 11
0
    def parseFasta(self, fasta_f, fasta_type):
        print BtLog.status_d["1"] % ("FASTA", fasta_f)
        self.assembly_f = abspath(fasta_f)
        if fasta_type:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for name, seq in BtIO.readFasta(fasta_f):
            blObj = BlObj(name, seq)
            if not blObj.name in self.dict_of_blobs:
                self.seqs += 1
                self.length += blObj.length
                self.n_count += blObj.n_count

                if fasta_type:
                    cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                    self.covLibs[fasta_type].cov_sum += cov
                    blObj.addCov(fasta_type, cov)

                self.order_of_blobs.append(blObj.name)
                self.dict_of_blobs[blObj.name] = blObj
            else:
                BtLog.error("5", blObj.name)

        if self.seqs == 0 or self.length == 0:
            BtLog.error("1")
Ejemplo n.º 12
0
def parseCas(infile, order_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(
        r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    if (runCmd(command=command)):
        for line in runCmd(command=command):
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(
                    1)) - 1  # -1 because index of contig list starts with zero
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
                BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
Ejemplo n.º 13
0
    def parseFasta(self, fasta_f, fasta_type):
        print BtLog.status_d['1'] % ('FASTA', fasta_f)
        self.assembly_f = abspath(fasta_f)
        if (fasta_type):
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for name, seq in BtIO.readFasta(fasta_f):
            blObj = BlObj(name, seq)
            if not blObj.name in self.dict_of_blobs:
                self.seqs += 1
                self.length += blObj.length
                self.n_count += blObj.n_count

                if (fasta_type):
                    cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                    self.covLibs[fasta_type].cov_sum += cov
                    blObj.addCov(fasta_type, cov)

                self.order_of_blobs.append(blObj.name)
                self.dict_of_blobs[blObj.name] = blObj
            else:
                BtLog.error('5', blObj.name)

        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
Ejemplo n.º 14
0
def parseJson(infile):
    '''http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/'''
    if not isfile(infile):
        BtLog.error('0', infile)
    import time
    start = time.time()
    json_parser = ''
    with open(infile, 'r') as fh:
        print BtLog.status_d['15']
        json_string = fh.read()
    try:
        import ujson as json  # fastest
        json_parser = 'ujson'
        print BtLog.status_d['16'] % json_parser
    except ImportError:
        try:
            import simplejson as json  # fast
            json_parser = 'simplejson'
        except ImportError:
            import json  # default
            json_parser = 'json'
        print BtLog.status_d['17'] % json_parser
    try:
        obj = json.loads(json_string.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    data = byteify(obj)
    print BtLog.status_d['20'] % (time.time() - start)
    return data
Ejemplo n.º 15
0
def parseSet(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        items = set()
        for l in fh:
            items.add(l.rstrip("\n").lstrip(">"))
    return items
Ejemplo n.º 16
0
def parseList(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        items = []
        for l in fh:
            items.append(l.rstrip("\n"))
    return items
Ejemplo n.º 17
0
def parseList(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        items = []
        for l in fh:
            items.append(l.rstrip("\n"))
    return items
Ejemplo n.º 18
0
def parseSet(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        items = set()
        for l in fh:
            items.add(l.rstrip("\n").lstrip(">"))
    return items
Ejemplo n.º 19
0
def parseColours(infile):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                temp = l.rstrip("\n").split(",")
                items[temp[0]] = temp[1]
    return items
Ejemplo n.º 20
0
def parseColours(infile):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                temp = l.rstrip("\n").split(",")
                items[temp[0]] = temp[1]
    return items
Ejemplo n.º 21
0
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict is list of coverages for each contigs, since list appending should be faster

    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    #base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(
        r"(\d+)M|X|=")  # only gets digits before M,X,='s
    # execute samtools to get only mapped reads (no optial duplicates, no 2nd-ary alignment)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    #import time
    #start = time.time()
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                base_cov_dict[match[2]].append(
                    sum([
                        int(matching)
                        for matching in cigar_match_re.findall(match[5])
                    ]))
                #base_cov_dict[match[2]] += sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
    base_cov_dict = {
        seq_name: sum(base_covs)
        for seq_name, base_covs in base_cov_dict.items()
    }
    #end = time.time()
    #print (end-start)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Ejemplo n.º 22
0
def parseCov(infile, set_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}

    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(
                            match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
Ejemplo n.º 23
0
def parseDict(infile, key, value):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            items = {}
            k_idx = int(key)
            v_idx = int(value)
            for l in fh:
                temp = l.rstrip("\n").split()
                items[temp[k_idx]] = temp[v_idx]
    return items
Ejemplo n.º 24
0
def readFasta(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for l in fh:
            if l[0] == '>':
                if header:
                    yield header, ''.join(seqs)
                header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace
            else:
                seqs.append(l[:-1])
        yield header, ''.join(seqs)
Ejemplo n.º 25
0
def parseDict(infile, key, value):
    items = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            items = {}
            k_idx = int(key)
            v_idx = int(value)
            for l in fh:
                temp = l.rstrip("\n").split()
                items[temp[k_idx]] = temp[v_idx]
    return items
Ejemplo n.º 26
0
def parseCatColour(infile):
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    seq_name, category = l.rstrip("\n").split(",")
                    catcolour_dict[seq_name] = category
                except:
                    BtLog.error('23', infile)
    return catcolour_dict
Ejemplo n.º 27
0
def parseCatColour(infile):
    catcolour_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    seq_name, category = l.rstrip("\n").split(",")
                    catcolour_dict[seq_name] = category
                except:
                    BtLog.error('23', infile)
    return catcolour_dict
Ejemplo n.º 28
0
def readFasta(infile):
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for l in fh:
            if l[0] == '>':
                if header:
                    yield header, ''.join(seqs)
                header, seqs = l[1:-1].split()[0], [] # Header is split at first whitespace
            else:
                seqs.append(l[:-1])
        yield header, ''.join(seqs)
Ejemplo n.º 29
0
def parseCov(infile, set_of_blobs):
    if not isfile(infile):
        BtLog.error('0', infile)
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}

    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    reads_unmapped = 0
    read_cov_dict = {}

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith('#'):
                    if line.startswith("## Total Reads"):
                        reads_total = int(line.split(" = ")[1])
                    elif line.startswith("## Mapped Reads"):
                        reads_mapped = int(line.split(" = ")[1])
                    elif line.startswith("## Unmapped Reads"):
                        reads_unmapped = int(line.split(" = ")[1])
                    else:
                        pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        else:
                            read_cov_dict[name] = read_cov
                            base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name)
                    else:
                        base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
Ejemplo n.º 30
0
    def plotBar(self, cov_lib, out_f):
        fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
        ax_main_data = {'labels' : [], 'values' : [], 'colours' : [] }
        ax_group_data = {'labels' : [], 'values' : [], 'colours' : [] }
        reads_total = self.cov_libs_total_reads_dict[cov_lib]
        reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
        reads_unmapped = reads_total - self.stats['all']['reads_mapped'][cov_lib]
        ax_main_data['labels'].append('Unmapped (assembly)')
        ax_main_data['values'].append(reads_unmapped/reads_total)
        ax_main_data['colours'].append(DGREY)
        ax_main_data['labels'].append('Mapped (assembly)')
        ax_main_data['values'].append(reads_mapped/reads_total)
        ax_main_data['colours'].append(DGREY)
        if (self.refcov_dict):
            if cov_lib in self.refcov_dict:
                reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
                reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
                reads_unmapped_ref = reads_total_ref - reads_mapped_ref
                ax_main_data['labels'].append('Unmapped (ref)')
                ax_main_data['values'].append(reads_unmapped_ref/reads_total_ref)
                ax_main_data['colours'].append(DGREY)
                ax_main_data['labels'].append('Mapped (ref)')
                ax_main_data['values'].append(reads_mapped_ref/reads_total_ref)
                ax_main_data['colours'].append(DGREY)
            else:
                BtLog.error('40', cov_lib)

        # mapped plotted groups
        for group in self.plot_order:
           ax_group_data['labels'].append(group)
           ax_group_data['values'].append(self.stats[group]['reads_mapped_perc'][cov_lib])
           ax_group_data['colours'].append(self.colours[group])
        rect_group = ax_group.bar(x_pos_group, ax_group_data['values'], width = 0.5, tick_label=ax_group_data['labels'], align='center', color = ax_group_data['colours'])
        for rect_g in rect_group:
            height_g = float(rect_g.get_height())
            ax_group.text(rect_g.get_x() + rect_g.get_width()/2., 0.005 + height_g, '{:.2f}%'.format(height_g*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
        rect_main = ax_main.bar(x_pos_main, ax_main_data['values'], width = 0.5, tick_label=ax_main_data['labels'], align='center', color = ax_main_data['colours'])
        for rect_m in rect_main:
            height_m = float(rect_m.get_height())
            ax_main.text(rect_m.get_x() + rect_m.get_width()/2., 0.005 + height_m, '{:.2f}%'.format(height_m*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)

        ax_main.set_xticklabels(ax_main_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        ax_group.set_xticklabels(ax_group_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        #figsuptitle = fig.suptitle(out_f, verticalalignment='top')
        out_f = "%s.read_cov.%s" % (out_f, cov_lib)
        print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
        fig.tight_layout()
        #fig.savefig("%s.%s" % (out_f, self.format), format=self.format,  bbox_extra_artists=(figsuptitle,))
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Ejemplo n.º 31
0
def parseReferenceCov(infile):
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                    refcov_dict[cov_lib] = {'reads_total' : int(reads_total_ref),
                                            'reads_mapped' : int(reads_mapped_ref)}
                except:
                    BtLog.error('21', infile)
    return refcov_dict
Ejemplo n.º 32
0
    def plotBar(self, cov_lib, out_f):
        fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
        ax_main_data = {'labels' : [], 'values' : [], 'colours' : [] }
        ax_group_data = {'labels' : [], 'values' : [], 'colours' : [] }
        reads_total = self.cov_libs_total_reads_dict[cov_lib]
        reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
        reads_unmapped = reads_total - self.stats['all']['reads_mapped'][cov_lib]
        ax_main_data['labels'].append('Unmapped (assembly)')
        ax_main_data['values'].append(reads_unmapped/reads_total)
        ax_main_data['colours'].append(DGREY)
        ax_main_data['labels'].append('Mapped (assembly)')
        ax_main_data['values'].append(reads_mapped/reads_total)
        ax_main_data['colours'].append(DGREY)
        if (self.refcov_dict):
            if cov_lib in self.refcov_dict:
                reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
                reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
                reads_unmapped_ref = reads_total_ref - reads_mapped_ref
                ax_main_data['labels'].append('Unmapped (ref)')
                ax_main_data['values'].append(reads_unmapped_ref/reads_total_ref)
                ax_main_data['colours'].append(DGREY)
                ax_main_data['labels'].append('Mapped (ref)')
                ax_main_data['values'].append(reads_mapped_ref/reads_total_ref)
                ax_main_data['colours'].append(DGREY)
            else:
                BtLog.error('40', cov_lib)

        # mapped plotted groups
        for group in self.plot_order:
           ax_group_data['labels'].append(group)
           ax_group_data['values'].append(self.stats[group]['reads_mapped_perc'][cov_lib])
           ax_group_data['colours'].append(self.colours[group])
        rect_group = ax_group.bar(x_pos_group, ax_group_data['values'], width = 0.5, tick_label=ax_group_data['labels'], align='center', color = ax_group_data['colours'])
        for rect_g in rect_group:
            height_g = float(rect_g.get_height())
            ax_group.text(rect_g.get_x() + rect_g.get_width()/2., 0.005 + height_g, '{:.2f}%'.format(height_g*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)
        rect_main = ax_main.bar(x_pos_main, ax_main_data['values'], width = 0.5, tick_label=ax_main_data['labels'], align='center', color = ax_main_data['colours'])
        for rect_m in rect_main:
            height_m = float(rect_m.get_height())
            ax_main.text(rect_m.get_x() + rect_m.get_width()/2., 0.005 + height_m, '{:.2f}%'.format(height_m*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)

        ax_main.set_xticklabels(ax_main_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        ax_group.set_xticklabels(ax_group_data['labels'], rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        #figsuptitle = fig.suptitle(out_f, verticalalignment='top')
        out_f = "%s.read_cov.%s" % (out_f, cov_lib)
        print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
        fig.tight_layout()
        #fig.savefig("%s.%s" % (out_f, self.format), format=self.format,  bbox_extra_artists=(figsuptitle,))
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Ejemplo n.º 33
0
def parseReferenceCov(infile):
    refcov_dict = {}
    if infile:
        if not isfile(infile):
            BtLog.error('0', infile)
        with open(infile) as fh:
            for l in fh:
                try:
                    cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                    refcov_dict[cov_lib] = {'reads_total' : int(reads_total_ref),
                                            'reads_mapped' : int(reads_mapped_ref)}
                except:
                    BtLog.error('21', infile)
    return refcov_dict
Ejemplo n.º 34
0
def parseCmdLabels(labels):
    label_d = {}
    name, groups = '', ''
    if labels:
        try:
            for label in labels:
                name, groups = str(label).split("=")
                if "," in groups:
                    for group in groups.split(","):
                        label_d[group] = name
                else:
                    label_d[groups] = name
        except:
            BtLog.error('17', labels)
    return label_d
Ejemplo n.º 35
0
def parseCmdLabels(labels):
    label_d = {}
    name, groups = '', ''
    if labels:
        try:
            for label in labels:
                name, groups = str(label).split("=")
                if "," in groups:
                    for group in groups.split(","):
                        label_d[group] = name
                else:
                    label_d[groups] = name
        except:
            BtLog.error('17', labels)
    return label_d
Ejemplo n.º 36
0
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(
        r"(\d+)M|X|=")  # only gets digits before M,X,='s
    reads_total = 0
    reads_mapped = 0
    if not (no_base_cov_flag):
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            base_cov_dict[match[2]].append(
                                sum([
                                    int(matching) for matching in
                                    cigar_match_re.findall(match[5])
                                ]))
                            read_cov_dict[match[2]] += 1
                        except:
                            print BtLog.warn_d['2'] % (match[2])
    else:
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            read_cov_dict[match[2]] += 1
                        except:
                            print BtLog.warn_d['2'] % (match[2])
    base_cov_dict = {
        seq_name: sum(base_covs)
        for seq_name, base_covs in base_cov_dict.items()
    }
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Ejemplo n.º 37
0
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    checkBam returns reads_total and reads_mapped
    base_cov_dict is list of coverages for each contigs, since list appending should be faster

    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    base_cov_dict = {blob : [] for blob in set_of_blobs}
    #base_cov_dict = {blob : 0 for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
    # execute samtools to get only mapped reads (no optial duplicates, no 2nd-ary alignment)
    command = "samtools view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    #import time
    #start = time.time()
    if not (no_base_cov_flag):
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                #base_cov_dict[match[2]] += sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    else:
        for line in runCmd(command=command):
            seen_reads += 1
            match = line.split()
            try:
                read_cov_dict[match[2]] += 1
            except:
                print BtLog.warn_d['2'] % (match[2])
            BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_mapped, seen_reads)
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    #end = time.time()
    #print (end-start)
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Ejemplo n.º 38
0
def checkCas(infile):
    print BtLog.status_d['12']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    seqs_total, reads_total, reads_mapping, mapping_rate = 0, 0, 0, 0.0
    output = ''
    command = "clc_mapping_info -s " + infile
    for line in runCmd(command=command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return seqs_total, reads_total, reads_mapped
Ejemplo n.º 39
0
def checkCas(infile):
    print BtLog.status_d['12']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    seqs_total, reads_total, reads_mapping, mapping_rate = 0, 0, 0, 0.0
    output = ''
    command = "clc_mapping_info -s " + infile
    for line in runCmd(command=command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return seqs_total, reads_total, reads_mapped
Ejemplo n.º 40
0
def main():
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    sam_fs = args['--sam']
    prefix = args['--output']
    no_base_cov_flag = args['--no_base_cov']

    # Make covLibs
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
           [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
           [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)]
    if not (cov_libs):
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=no_base_cov_flag)
Ejemplo n.º 41
0
def checkBam(infile):
    print BtLog.status_d['10']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    for line in runCmd(command=command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_secondary = int(reads_secondary_re.search(output).group(1))
    reads_supplementary = int(reads_supplementary_re.search(output).group(1))
    reads_mapped = reads_mapped - reads_secondary - reads_secondary
    reads_total = int(reads_total_re.search(output).group(
        1)) - reads_secondary - reads_supplementary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        BtLog.error('29' % infile)
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
        '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return reads_total, reads_mapped
Ejemplo n.º 42
0
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)"
                             )  # TEST TEST , if not split it afterwards
    with open(infile) as fh:
        for line in fh:
            match = hit_line_re.search(line)
            if match:
                hitDict = {
                    'name': match.group(1),
                    'taxId': match.group(
                        2
                    ),  # string because if int, conversion is a nightmare ...
                    'score': float(match.group(3))
                }
                if hitDict['name'] not in set_of_blobs:
                    BtLog.error('19', hitDict['name'], infile)
                if hitDict['taxId'] == 'N/A':
                    BtLog.error('22', infile)
                yield hitDict
Ejemplo n.º 43
0
def checkBam(infile):
    print BtLog.status_d['10']
    if not isfile(infile):
        BtLog.error('0', infile)
    if not which('samtools'):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_secondary_re = re.compile(r"(\d+)\s\+\s\d+\ssecondary")
    reads_supplementary_re = re.compile(r"(\d+)\s\+\s\d+\ssupplementary")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    for line in runCmd(command=command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_secondary = int(reads_secondary_re.search(output).group(1))
    reads_supplementary = int(reads_supplementary_re.search(output).group(1))
    reads_mapped = reads_mapped - reads_secondary - reads_secondary
    reads_total = int(reads_total_re.search(output).group(1)) - reads_secondary - reads_supplementary
    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        BtLog.error('29' % infile)
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), \
        '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return reads_total, reads_mapped
Ejemplo n.º 44
0
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob : [] for blob in set_of_blobs}
    read_cov_dict = {blob : 0 for blob in set_of_blobs}
    cigar_match_re = re.compile(r"(\d+)M|X|=") # only gets digits before M,X,='s
    reads_total = 0
    reads_mapped = 0
    if not (no_base_cov_flag):
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            base_cov_dict[match[2]].append(sum([int(matching) for matching in cigar_match_re.findall(match[5])]))
                            read_cov_dict[match[2]] += 1
                        except:
                            print BtLog.warn_d['2'] % (match[2])
    else:
        with open(infile) as fh:
            for line in fh:
                if line.startswith("@"):
                    pass
                else:
                    reads_total += 1
                    match = line.split()
                    if not match[2] == '*':
                        reads_mapped += 1
                        try:
                            read_cov_dict[match[2]] += 1
                        except:
                            print BtLog.warn_d['2'] % (match[2])
    base_cov_dict = {seq_name: sum(base_covs) for seq_name, base_covs in base_cov_dict.items()}
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
Ejemplo n.º 45
0
def main():
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    gzip = args['--gzip']
    do_sort = args['--sort']
    keep_sorted = args['--keep']
    sort_threads = int(args['--threads'])

    print BtLog.status_d['22'] % bam_f
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        print BtLog.error('43')
    elif include_f:
        sequence_list = BtIO.parseList(include_f)
        BtIO.parseBamForFilter(bam_f, out_f, sequence_list, None, gzip, do_sort, keep_sorted, sort_threads)
    elif exclude_f:
        sequence_list = BtIO.parseList(exclude_f)
        BtIO.parseBamForFilter(bam_f, out_f, None, sequence_list, gzip, do_sort, keep_sorted, sort_threads)
    else:
        BtIO.parseBamForFilter(bam_f, out_f, None, None, gzip, do_sort, keep_sorted, sort_threads)
Ejemplo n.º 46
0
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to hitDict
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)") # TEST TEST , if not split it afterwards
    with open(infile) as fh:
        for line in fh:
            match = hit_line_re.search(line)
            if match:
                hitDict = {
                    'name' : match.group(1),
                    'taxId' : match.group(2), # string because if int, conversion is a nightmare ...
                    'score' : float(match.group(3))
                    }
                if hitDict['name'] not in set_of_blobs:
                    BtLog.error('19', hitDict['name'], infile)
                if hitDict['taxId'] == 'N/A':
                    BtLog.error('22', infile)
                yield hitDict
Ejemplo n.º 47
0
    def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
        data_dict = {}
        read_cov_dict = {}
        max_cov = 0.0
        min_cov = 1000.0
        cov_lib_dict = self.covLibs
        cov_lib_names_l = self.covLibs.keys() # does not include cov_sum
        if len(cov_lib_names_l) > 1:
            # more than one cov_lib, cov_sum_lib has to be created
            cov_lib_dict['covsum'] = CovLibObj('covsum', 'covsum', 'Sum of cov in %s' % basename(self.title)).__dict__ # ugly
            cov_lib_dict['covsum']['reads_total'] = sum([self.covLibs[x]['reads_total'] for x in self.covLibs])
            cov_lib_dict['covsum']['reads_mapped'] = sum([self.covLibs[x]['reads_mapped'] for x in self.covLibs])
            cov_lib_dict['covsum']['cov_sum'] = sum([self.covLibs[x]['cov_sum'] for x in self.covLibs])
            cov_lib_dict['covsum']['mean_cov'] = cov_lib_dict['covsum']['cov_sum']/self.seqs
        for blob in self.dict_of_blobs.values():
            name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
            if (catcolour_dict): # annotation with categories specified in catcolour
                group = str(catcolour_dict[name])
            elif (c_index): # annotation with c_index instead of taxonomic group
                if taxrule not in self.taxrules:
                    BtLog.error('11', taxrule, self.taxrules)
                else:
                    group = str(blob['taxonomy'][taxrule][rank]['c_index'])
            else: # annotation with taxonomic group
                if not (taxrule) or taxrule not in self.taxrules:
                    BtLog.warn_d['9'] % (taxrule, self.taxrules)
                if taxrule in blob['taxonomy']:
                    group = str(blob['taxonomy'][taxrule][rank]['tax'])
            if not group in data_dict:
                data_dict[group] = {
                                    'name' : [],
                                    'length' : [],
                                    'gc' : [],
                                    'covs' : {covLib : [] for covLib in cov_lib_dict.keys()},           # includes cov_sum if it exists
                                    'reads_mapped' : {covLib : 0 for covLib in cov_lib_dict.keys()},    # includes cov_sum if it exists
                                    'count' : 0,
                                    'count_hidden' : 0,
                                    'count_visible' : 0,
                                    'span': 0,
                                    'span_hidden' : 0,
                                    'span_visible' : 0,
                                    }
            data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
            data_dict[group]['span'] = data_dict[group].get('span', 0) + int(length)
            if ((hide_nohits) and group == 'no-hit') or length < min_length: # hidden
                data_dict[group]['count_hidden'] = data_dict[group].get('count_hidden', 0) + 1
                data_dict[group]['span_hidden'] = data_dict[group].get('span_hidden', 0) + int(length)
            else: # visible
                data_dict[group]['count_visible'] = data_dict[group].get('count_visible', 0) + 1
                data_dict[group]['span_visible'] = data_dict[group].get('span_visible', 0) + int(length)
                data_dict[group]['name'].append(name)
                data_dict[group]['length'].append(length)
                data_dict[group]['gc'].append(gc)
                cov_sum = 0.0
                reads_mapped_sum = 0
                for cov_lib in sorted(cov_lib_names_l):
                    cov = float(blob['covs'][cov_lib])
                    if cov < 0.02:
                        cov = 0.02
                    if cov < min_cov:
                        min_cov = cov
                    # increase max_cov
                    if cov > max_cov:
                        max_cov = cov
                    # add cov of blob to group
                    data_dict[group]['covs'][cov_lib].append(cov)
                    cov_sum += cov
                    # add readcov
                    if cov_lib in blob['read_cov']:
                        reads_mapped = blob['read_cov'][cov_lib]
                        data_dict[group]['reads_mapped'][cov_lib] += reads_mapped
                        reads_mapped_sum += reads_mapped
                if len(cov_lib_names_l) > 1:
                    if cov_sum < 0.02 :
                        cov_sum = 0.02
                    data_dict[group]['covs']['covsum'].append(cov_sum)
                    if cov_sum > max_cov:
                        max_cov = cov_sum
                    if (reads_mapped_sum):
                        data_dict[group]['reads_mapped']['covsum'] += reads_mapped_sum

        return data_dict, min_cov, max_cov, cov_lib_dict
Ejemplo n.º 48
0
    def plotScatter(self, cov_lib, info_flag, out_f):

        fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins = self.setupPlot(self.plot)
        # empty handles for big legend
        legend_handles = []
        legend_labels = []
        # marker size scaled by biggest blob (size in points^2)
        max_length = max(array(self.stats['all']['length'])) # length of biggest blob
        max_marker_size = 12500 # marker size for biggest blob, i.e. area of 12500^2 pixel
        for idx, group in enumerate(self.plot_order):
            idx += 1
            lw, alpha = 0.5, 0.8
            if group == 'no-hit':
                alpha = 0.5
            group_length_array = array(self.stats[group]['length'])
            if len(group_length_array) > 0 and group not in self.exclude_groups:
                colour = self.colours[group]
                group_x_array = ''
                group_y_array = ''
                if self.plot == 'blobplot':
                    group_x_array = array(self.stats[group]['gc'])
                    group_y_array = array(self.stats[group]['covs'][cov_lib])
                elif self.plot == 'covplot':
                    group_x_array = array(self.stats[group]['covs'][cov_lib])
                    group_y_array = array([self.cov_y_dict.get(name, 0.02) for name in self.stats[group]['name']])
                else:
                    BtLog.error('34', self.plot)
                marker_size_array = []
                if (self.ignore_contig_length): # no scaling
                    if group == "no-hit":
                        s = 20
                    else:
                        s = 100
                    marker_size_array = [s for length in group_length_array]
                else: # scaling by max_length
                    marker_size_array = [(length/max_length)*max_marker_size for length in group_length_array]
                # generate label for legend
                group_span_in_mb = round(self.stats[group]['span_visible']/1000000, 2)
                group_number_of_seqs = self.stats[group]['count_visible']
                group_n50 = self.stats[group]['n50']
                fmt_seqs = "{:,}".format(group_number_of_seqs)
                fmt_span = "{:,}".format(group_span_in_mb)
                fmt_n50 = "{:,}".format(group_n50)
                label = "%s (%s;%sMB;%snt)" % (group, fmt_seqs, fmt_span, fmt_n50)
                if (info_flag):
                    print BtLog.info_d['0'] % (group, fmt_seqs, fmt_span, fmt_n50)
                legend_handles.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markerfacecolor=colour))
                legend_labels.append(label)

                weights_array = None
                if self.hist_type == "span":
                    weights_array = group_length_array/1000

                axHistx.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                axHisty.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                axScatter.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=BLACK, label=label)
                axLegend.axis('off')
                if (self.multiplot):
                    fig_m, axScatter_m, axHistx_m, axHisty_m, axLegend_m, top_bins, right_bins = self.setupPlot(self.plot)
                    legend_handles_m = []
                    legend_labels_m = []
                    legend_handles_m.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markerfacecolor=colour))
                    legend_labels_m.append(label)
                    axHistx_m.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                    axHisty_m.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                    axScatter_m.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=BLACK, label=label)
                    axLegend_m.axis('off')
                    axLegend_m.legend(legend_handles_m, legend_labels_m, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter_m, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig_m = plot_legend(fig_m, axLegend_m, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    print BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format)
                    fig_m.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                    plt.close(fig_m)
                elif (self.cumulative_flag):
                    axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig.add_axes(axLegend)
                    fig = plot_legend(fig, axLegend, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    if not (self.no_title):
                        fig.suptitle(out_f, fontsize=35, verticalalignment='top')
                    print BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format)
                    fig.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                else:
                    pass
        plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
        axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True, loc=6 )
        out_f = "%s.%s" % (out_f, cov_lib)
        fig.add_axes(axLegend)
        fig = plot_legend(fig, axLegend, out_f, self.legend_flag, self.format, self.cumulative_flag)
        if not (self.no_title):
            fig.suptitle(out_f, fontsize=35, verticalalignment='top')
        print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Ejemplo n.º 49
0
def parseNodesDB(**kwargs):
    '''
    Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
    gets JSON'ed into blobtools/data/nodes_db.json if this file
    does not exist. Nodes_db.json is used if neither "--names" and "--nodes"
    nor "--db" is specified.
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']

    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        print BtLog.status_d['3'] % (nodes_f, names_f)
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except:
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print BtLog.status_d['4'] % (nodesDB_f)
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print BtLog.status_d['4'] % (nodesDB_default)
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default

    # Write nodesDB if not available
    if not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)

    return nodesDB, nodesDB_f
Ejemplo n.º 50
0
    def plotScatter(self, cov_lib, info_flag, out_f):

        fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins = self.setupPlot(self.plot)
        # empty handles for big legend
        legend_handles = []
        legend_labels = []
        # marker size scaled by biggest blob (size in points^2)
        max_length = max(array(self.stats['all']['length'])) # length of biggest blob
        max_marker_size = 12500 # marker size for biggest blob, i.e. area of 12500^2 pixel
        for idx, group in enumerate(self.plot_order):
            idx += 1
            lw, alpha = 0.5, 0.8
            if group == 'no-hit':
                alpha = 0.5
            group_length_array = array(self.stats[group]['length'])
            if len(group_length_array) > 0 and group not in self.exclude_groups:
                colour = self.colours[group]
                group_x_array = ''
                group_y_array = ''
                if self.plot == 'blobplot':
                    group_x_array = array(self.stats[group]['gc'])
                    group_y_array = array(self.stats[group]['covs'][cov_lib])
                elif self.plot == 'covplot':
                    group_x_array = array(self.stats[group]['covs'][cov_lib])
                    group_y_array = array([self.cov_y_dict.get(name, 0.02) for name in self.stats[group]['name']])
                else:
                    BtLog.error('34', self.plot)
                marker_size_array = []
                if (self.ignore_contig_length): # no scaling
                    if group == "no-hit":
                        s = 20
                    else:
                        s = 100
                    marker_size_array = [s for length in group_length_array]
                else: # scaling by max_length
                    marker_size_array = [(length/max_length)*max_marker_size for length in group_length_array]
                # generate label for legend
                group_span_in_mb = round(self.stats[group]['span_visible']/1000000, 2)
                group_number_of_seqs = self.stats[group]['count_visible']
                group_n50 = self.stats[group]['n50']
                fmt_seqs = "{:,}".format(group_number_of_seqs)
                fmt_span = "{:,}".format(group_span_in_mb)
                fmt_n50 = "{:,}".format(group_n50)
                label = "%s (%s;%sMB;%snt)" % (group, fmt_seqs, fmt_span, fmt_n50)
                if (info_flag):
                    print BtLog.info_d['0'] % (group, fmt_seqs, fmt_span, fmt_n50)
                legend_handles.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=BLACK, markerfacecolor=colour))
                legend_labels.append(label)

                weights_array = None
                if self.hist_type == "span":
                    weights_array = group_length_array/1000

                axHistx.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                axHisty.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                axScatter.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=BLACK, label=label)
                axLegend.axis('off')
                if (self.multiplot):
                    fig_m, axScatter_m, axHistx_m, axHisty_m, axLegend_m, top_bins, right_bins = self.setupPlot(self.plot)
                    legend_handles_m = []
                    legend_labels_m = []
                    legend_handles_m.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=BLACK, markerfacecolor=colour))
                    legend_labels_m.append(label)
                    axHistx_m.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                    axHisty_m.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                    axScatter_m.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=BLACK, label=label)
                    axLegend_m.axis('off')
                    axLegend_m.legend(legend_handles_m, legend_labels_m, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter_m, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig_m = plot_legend(fig_m, axLegend_m, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    print BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format)
                    fig_m.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                    plt.close(fig_m)
                elif (self.cumulative_flag):
                    axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig.add_axes(axLegend)
                    fig = plot_legend(fig, axLegend, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    if not (self.no_title):
                        fig.suptitle(out_f, fontsize=35, verticalalignment='top')
                    print BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format)
                    fig.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                else:
                    pass
        plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
        axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True, loc=6 )
        out_f = "%s.%s" % (out_f, cov_lib)
        fig.add_axes(axLegend)
        fig = plot_legend(fig, axLegend, out_f, self.legend_flag, self.format, self.cumulative_flag)
        if not (self.no_title):
            fig.suptitle(out_f, fontsize=35, verticalalignment='top')
        print BtLog.status_d['8'] % "%s.%s" % (out_f, self.format)
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Ejemplo n.º 51
0
def main():

    #main_dir = dirname(__file__)
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    min_bitscore_diff = float(args['--min_diff'])
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not (title):
        title = out_f

    # coverage
    if not (fasta_type
            ) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
           [BtCore.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
           [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
           [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # taxonomy
    hit_libs = [
        BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f)
        for idx, lib_f in enumerate(hit_fs)
    ]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = blobtools.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(blobtools.DATADIR, "nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f,
                                           names=names_f,
                                           nodesDB=nodesDB_f,
                                           nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if (hit_libs):
        blobDb.parseHits(hit_libs)
        blobDb.computeTaxonomy(taxrules, nodesDB, min_bitscore_diff,
                               tax_collision_random)
    else:
        print BtLog.warn_d['0']

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=None)

    # Generating BlobDB and writing to file
    print BtLog.status_d['7'] % out_f
    BtIO.writeJson(blobDb.dump(), out_f)
Ejemplo n.º 52
0
def parseNodesDB(**kwargs):
    '''
    Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
    gets JSON'ed into blobtools/data/nodes_db.json if this file
    does not exist. Nodes_db.json is used if neither "--names" and "--nodes"
    nor "--db" is specified.
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']

    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        print BtLog.status_d['3'] % (nodes_f, names_f)
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except:
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print BtLog.status_d['4'] % (nodesDB_f)
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print BtLog.status_d['4'] % (nodesDB_default)
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default

    # Write nodesDB if not available
    if not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)

    return nodesDB, nodesDB_f
Ejemplo n.º 53
0
def main():
    args = docopt(__doc__)
    blast_f = args['--blast']
    diamond_f = args['--diamond']

    uniref_f = args['--uniref']
    rnacentral_f = args['--rnacentral']
    swissprot_f = args['--swissprot']
    taxid = args['--taxid']

    force = args['--force']
    prefix = args['--out']

    out_f, hit_f, map_f, taxid_d = None, None, None, {}

    # Check if blast_f OR diamond_f is speciefied
    if not (bool(blast_f) + bool(diamond_f) == 1):
        BtLog.error('26')
    elif blast_f:
        hit_f = blast_f
    elif diamond_f:
        hit_f = diamond_f
    else:
        pass

    # Check if taxID or Mapping file is supplied
    if (taxid):
        try:
            taxid = int(taxid)
        except TypeError:
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "tax_%s.out" % taxid)
        taxid_d = defaultdict(lambda: taxid)
    elif (bool(uniref_f) + bool(rnacentral_f) + bool(swissprot_f) == 1):
        if uniref_f:
            print BtLog.status_d['1'] % ("ID-to-taxID Mapping file", uniref_f)
            taxid_d = BtIO.parseDict(uniref_f, 0, 1)
            out_f = BtIO.getOutFile(hit_f, prefix, "uniref.out")
            map_f = uniref_f
        elif rnacentral_f:
            print BtLog.status_d['1'] % ("ID-to-taxID Mapping file",
                                         rnacentral_f)
            taxid_d = BtIO.parseDict(rnacentral_f, 0, 3)
            out_f = BtIO.getOutFile(hit_f, prefix, "rnacentral.out")
            map_f = rnacentral_f
        elif swissprot_f:
            print BtLog.status_d['1'] % ("ID-to-taxID Mapping file",
                                         swissprot_f)
            taxid_d = BtIO.parseDict(swissprot_f, 0, 1)
            out_f = BtIO.getOutFile(hit_f, prefix, "swissprot.out")
            map_f = swissprot_f
        else:
            pass
    else:
        BtLog.error('41')

    output = []
    print BtLog.status_d['1'] % ("hits file", hit_f)

    with open(hit_f) as fh:
        for idx, l in enumerate(fh):
            query_id, bitscore, tax_id, subject_id, rest = None, None, None, None, None
            line = l.rstrip("\n").split()
            query_id = line[0]
            if blast_f:
                bitscore = line[2]
                tax_id = line[1]
                subject_id = line[4]
                rest = "\t".join(line[2:])
            elif diamond_f:
                bitscore = line[11]
                subject_id = line[1]
                rest = "\t".join(line[1:])
            if swissprot_f:
                subject_id = subject_id.split("|")[1]
            if blast_f and not tax_id == "N/A" and not force:  # so that it does not overwrite existing taxIDs
                print BtLog.warn_d['10'] % (idx + 1, line[0], line[1])
                output.append("%s\t%s\t%s\t%s" %
                              (query_id, tax_id, bitscore, rest))
            else:
                try:
                    tax_id = taxid_d[subject_id]
                except KeyError:
                    BtLog.warn_d['12'] % (subject_id, map_f)
                    tax_id = "N/A"
                output.append("%s\t%s\t%s\t%s" %
                              (query_id, tax_id, bitscore, rest))

    if output:
        with open(out_f, "w") as fh:
            print BtLog.status_d['24'] % out_f
            fh.write("\n".join(output))
Ejemplo n.º 54
0
def main():
    #print data_dir
    args = docopt(__doc__)
    blobdb_f = args['--input']
    prefix = args['--out']
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list_f = args['--list']
    concoct = args['--concoct']
    cov = args['--cov']
    notable = args['--notable']
    experimental = args['--experimental']
    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Are ranks sane ?
    if 'all' in ranks:
        temp_ranks = RANKS[0:-1]
        ranks = temp_ranks[::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Does seq_list file exist?
    seqs = []
    if (seq_list_f):
        if isfile(seq_list_f):
            seqs = BtIO.parseList(seq_list_f)
        else:
            BtLog.error('0', seq_list_f)

    # Load BlobDb
    blobDb = BtCore.BlobDb('new')
    print BtLog.status_d['9'] % (blobdb_f)
    blobDb.load(blobdb_f)
    blobDb.version = blobtools.__version__

    # Is taxrule sane and was it computed?
    if (blobDb.hitLibs) and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    # view(s)
    viewObjs = []
    print BtLog.status_d['14']
    if not (notable):
        tableView = BtCore.ViewObj(name="table", out_f=out_f, suffix="table.txt", body=[])
        viewObjs.append(tableView)
    if (experimental):
        experimentalView = BtCore.ExperimentalViewObj(name = "experimental", view_dir=out_f)
        viewObjs.append(experimentalView)
    if (concoct):
        concoctTaxView = BtCore.ViewObj(name="concoct_tax", out_f=out_f, suffix="concoct_taxonomy_info.csv", body=dict())
        viewObjs.append(concoctTaxView)
        concoctCovView = BtCore.ViewObj(name="concoct_cov", out_f=out_f, suffix="concoct_coverage_info.tsv", body=[])
        viewObjs.append(concoctCovView)
    if (cov):
        for cov_lib_name, covLibDict in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(covLibDict['f'], prefix, None)
            covView = BtCore.ViewObj(name="covlib", out_f=out_f, suffix="cov", body=[])
            blobDb.view(viewObjs=[covView], ranks=None, taxrule=None, hits_flag=None, seqs=None, cov_libs=[cov_lib_name], progressbar=True)
    if (viewObjs):
        blobDb.view(viewObjs=viewObjs, ranks=ranks, taxrule=taxrule, hits_flag=hits_flag, seqs=seqs, cov_libs=[], progressbar=True)
    print BtLog.status_d['19']
Ejemplo n.º 55
0
    def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index, catcolour_dict):
        data_dict = {}
        read_cov_dict = {}
        max_cov = 0.0
        min_cov = 1000.0
        cov_lib_dict = self.covLibs
        cov_lib_names_l = self.covLibs.keys()  # does not include cov_sum
        if len(cov_lib_names_l) > 1:
            # more than one cov_lib, cov_sum_lib has to be created
            cov_lib_dict["covsum"] = CovLibObj(
                "covsum", "covsum", "Sum of cov in %s" % basename(self.title)
            ).__dict__  # ugly
            cov_lib_dict["covsum"]["reads_total"] = sum([self.covLibs[x]["reads_total"] for x in self.covLibs])
            cov_lib_dict["covsum"]["reads_mapped"] = sum([self.covLibs[x]["reads_mapped"] for x in self.covLibs])
            cov_lib_dict["covsum"]["cov_sum"] = sum([self.covLibs[x]["cov_sum"] for x in self.covLibs])
            cov_lib_dict["covsum"]["mean_cov"] = cov_lib_dict["covsum"]["cov_sum"] / self.seqs
        for blob in self.dict_of_blobs.values():
            name, gc, length, group = blob["name"], blob["gc"], blob["length"], ""
            if catcolour_dict:  # annotation with categories specified in catcolour
                group = str(catcolour_dict[name])
            elif c_index:  # annotation with c_index instead of taxonomic group
                if taxrule not in self.taxrules:
                    BtLog.error("11", taxrule, self.taxrules)
                else:
                    group = str(blob["taxonomy"][taxrule][rank]["c_index"])
            else:  # annotation with taxonomic group
                if not (taxrule) or taxrule not in self.taxrules:
                    BtLog.warn_d["9"] % (taxrule, self.taxrules)
                if taxrule in blob["taxonomy"]:
                    group = str(blob["taxonomy"][taxrule][rank]["tax"])
            if not group in data_dict:
                data_dict[group] = {
                    "name": [],
                    "length": [],
                    "gc": [],
                    "covs": {covLib: [] for covLib in cov_lib_dict.keys()},  # includes cov_sum if it exists
                    "reads_mapped": {covLib: 0 for covLib in cov_lib_dict.keys()},  # includes cov_sum if it exists
                    "count": 0,
                    "count_hidden": 0,
                    "count_visible": 0,
                    "span": 0,
                    "span_hidden": 0,
                    "span_visible": 0,
                }
            data_dict[group]["count"] = data_dict[group].get("count", 0) + 1
            data_dict[group]["span"] = data_dict[group].get("span", 0) + int(length)
            if ((hide_nohits) and group == "no-hit") or length < min_length:  # hidden
                data_dict[group]["count_hidden"] = data_dict[group].get("count_hidden", 0) + 1
                data_dict[group]["span_hidden"] = data_dict[group].get("span_hidden", 0) + int(length)
            else:  # visible
                data_dict[group]["count_visible"] = data_dict[group].get("count_visible", 0) + 1
                data_dict[group]["span_visible"] = data_dict[group].get("span_visible", 0) + int(length)
                data_dict[group]["name"].append(name)
                data_dict[group]["length"].append(length)
                data_dict[group]["gc"].append(gc)
                cov_sum = 0.0
                reads_mapped_sum = 0
                for cov_lib in sorted(cov_lib_names_l):
                    cov = float(blob["covs"][cov_lib])
                    if cov < 0.02:
                        cov = 0.02
                    if cov < min_cov:
                        min_cov = cov
                    # increase max_cov
                    if cov > max_cov:
                        max_cov = cov
                    # add cov of blob to group
                    data_dict[group]["covs"][cov_lib].append(cov)
                    cov_sum += cov
                    # add readcov
                    if cov_lib in blob["read_cov"]:
                        reads_mapped = blob["read_cov"][cov_lib]
                        data_dict[group]["reads_mapped"][cov_lib] += reads_mapped
                        reads_mapped_sum += reads_mapped
                if len(cov_lib_names_l) > 1:
                    if cov_sum < 0.02:
                        cov_sum = 0.02
                    data_dict[group]["covs"]["covsum"].append(cov_sum)
                    if cov_sum > max_cov:
                        max_cov = cov_sum
                    if reads_mapped_sum:
                        data_dict[group]["reads_mapped"]["covsum"] += reads_mapped_sum

        return data_dict, min_cov, max_cov, cov_lib_dict
Ejemplo n.º 56
0
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort, keep_sorted, sort_threads):
    '''
    checkBam returns reads_total and reads_mapped
    parse BAM to extract readpairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        command = 'samtools sort -@ sort_threads -n -O bam -T temp -o %s.readsorted.bam %s' % (infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped/1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(outfile, include, exclude)
    read_pair_out_fhs = []
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(sorted([sequence_to_type_dict[read1[2]], sequence_to_type_dict[read2[2]]]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
                read_pair_seqs = {read_pair_type : tuple() for read_pair_type in read_pair_count}
        except StopIteration:
                print BtLog.warn_d['11']
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads/2)), '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count), '{0:.1%}'.format(count/int(seen_reads/2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print BtLog.status_d['25'] % out_f
            runCmd(command="gzip -f " + out_f, wait=True)

    if not int(reads_total) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_total, seen_reads)
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
Ejemplo n.º 57
0
def check_input(args):
    blobdb_f = args['--infile']
    rank = args['--rank']
    c_index = args['--cindex']
    min_length = int(args['--length'])
    multiplot = args['--multiplot']
    hide_nohits = args['--nohit']
    out_prefix = args['--out']
    max_group_plot = int(args['--plotgroups'])
    sort_order = args['--sort']
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    labels = args['--label']
    colour_f = args['--colours']
    exclude_groups = args['--exclude']
    format = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']

    if 'blobplot' in args or 'covplot' in args:
        # Are ranks sane ?
        if rank not in BtTax.RANKS:
            BtLog.error('9', rank)
        # is taxrule provided?
        if taxrule not in BtTax.TAXRULES:
            BtLog.error('8', taxrule)
        # Are sort_order and hist_type sane?
        if not sort_order in ['span', 'count']:
            BtLog.error('14', sort_order)
        if not hist_type in ['span', 'count']:
            BtLog.error('15', hist_type)
        if (catcolour_f) and (c_index):
            BtLog.error('24')
        if (cumulative_flag) and (multiplot):
            BtLog.error('32')
    return args
Ejemplo n.º 58
0
def check_input(args):
    blobdb_f = args['--infile']
    rank = args['--rank']
    c_index = args['--cindex']
    min_length = int(args['--length'])
    multiplot = args['--multiplot']
    hide_nohits = args['--nohit']
    out_prefix = args['--out']
    max_group_plot = int(args['--plotgroups'])
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    no_title = args['--notitle']
    ignore_contig_length = args['--noscale']
    labels = args['--label']
    colour_f = args['--colours']
    exclude_groups = args['--exclude']
    format = args['--format']
    no_plot_blobs = args['--noblobs']
    no_plot_reads = args['--noreads']
    refcov_f = args['--refcov']
    catcolour_f = args['--catcolour']
    legend_flag = args['--legend']
    cumulative_flag = args['--cumulative']
    cov_lib_selection = args['--lib']

    #Convert sort_first to a list
    if sort_first:
        args['--sort_first'] = sort_first.split(',')
    else:
        args['--sort_first'] = ()

    if 'blobplot' in args or 'covplot' in args:
        # Are ranks sane ?
        if rank not in BtTax.RANKS:
            BtLog.error('9', rank)
        # is taxrule provided?
        if taxrule not in BtTax.TAXRULES:
            BtLog.error('8', taxrule)
        # Are sort_order and hist_type sane?
        if not sort_order in ['span', 'count']:
            BtLog.error('14', sort_order)
        if not hist_type in ['span', 'count']:
            BtLog.error('15', hist_type)
        if (catcolour_f) and (c_index):
            BtLog.error('24')
        if (cumulative_flag) and (multiplot):
            BtLog.error('32')
    return args
Ejemplo n.º 59
0
def parseBamForFilter(infile, outfile, include, exclude, gzip, do_sort,
                      keep_sorted, sort_threads):
    '''
    checkBam returns reads_total and reads_mapped
    parse BAM to extract readpairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        command = 'samtools sort -@ sort_threads -n -O bam -T temp -o %s.readsorted.bam %s' % (
            infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    command = "samtools view -f 1 -F 1024 -F 256 -F 2048 %s" % infile
    seen_reads = 0
    read_pair_count, read_pair_seqs, read_pair_out_fs = init_read_pairs(
        outfile, include, exclude)
    read_pair_out_fhs = []
    used_fhs = {}
    iterator = runCmd(command=command)
    read_pair_type = None
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'
    for l in iterator:
        read1 = l.split()
        try:
            seen_reads += 2
            read2 = next(iterator).split()
            read_pair_type = "".join(
                sorted([
                    sequence_to_type_dict[read1[2]],
                    sequence_to_type_dict[read2[2]]
                ]))
            print_bam(read_pair_out_fs, read_pair_type, read1, read2)
            read_pair_seqs[read_pair_type] += get_read_pair_seqs(read1, read2)
            read_pair_count[read_pair_type] += 1
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if seen_reads % progress_unit == 0:
                used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs,
                                                read_pair_seqs)
                read_pair_seqs = {
                    read_pair_type: tuple()
                    for read_pair_type in read_pair_count
                }
        except StopIteration:
            print BtLog.warn_d['11']
    used_fhs = write_read_pair_seqs(used_fhs, read_pair_out_fs, read_pair_seqs)
    close_fhs(used_fhs)
    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)),
                        '{0:.1%}'.format(1.00)))
    for read_pair_type, count in read_pair_count.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count),
                            '{0:.1%}'.format(count / int(seen_reads / 2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))
    # gzip
    if gzip:
        if not which('gzip'):
            BtLog.error('43')
        for out_f in used_fhs:
            print BtLog.status_d['25'] % out_f
            runCmd(command="gzip -f " + out_f, wait=True)

    if not int(reads_total) == int(seen_reads):
        print BtLog.warn_d['3'] % (reads_total, seen_reads)
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1