Example #1
0
def readBam(infile, set_of_blobs):
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(int(reads_mapped)/1000) + 1 # +1 guards against a zero progress unit when few reads are mapped
    base_cov_dict = {}
    read_cov_dict = {}
    cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's
    # execute samtools to get only mapped reads
    command = "samtools view -F 4 " + infile
    # TODO: use -F 1028 (unmapped + duplicate) to also exclude reads flagged as duplicates (e.g. marked by Picard)
    #command = "samtools view -F 1028 " + infile
    # only one counter since only yields mapped reads
    parsed_reads = 0 
    for line in runCmd(command):
        match = line.split("\t")
        if len(match) >= 11: # a valid SAM alignment line has at least 11 mandatory fields
            seq_name = match[2]
            base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
            if (base_cov):
                parsed_reads += 1
                if seq_name not in set_of_blobs:
                    print BtLog.warn_d['2'] % (seq_name, infile)
                else:
                    base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov 
                    read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1 
        BtLog.progress(parsed_reads, progress_unit, reads_total)
    BtLog.progress(reads_total, progress_unit, reads_total)
    if not int(reads_mapped) == int(parsed_reads):
        print BtLog.warn_d['3'] % (reads_mapped, parsed_reads)
    return base_cov_dict, reads_total, parsed_reads, read_cov_dict
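
A minimal standalone sketch of the CIGAR handling above: per-read base coverage is the sum of all match ('M') lengths; the CIGAR string and expected value below are made up for illustration.
import re

cigar_match_re = re.compile(r"(\d+)M")  # digits preceding each 'M' operation
cigar = "30M2I68M5S"                    # hypothetical CIGAR string
base_cov = sum(int(m) for m in cigar_match_re.findall(cigar))
print(base_cov)                         # 98 (30 + 68)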
Example #2
0
def readCas(infile, order_of_blobs):
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs)/100)
    cas_line_re = re.compile(r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+\.\d{2})\s+(\d+)\s+(\d+\.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0 
    iterator = runCmd(command) # run clc_mapping_info once and reuse the iterator
    if (iterator):
        for line in iterator:
            cas_line_match = cas_line_re.search(line)
            if cas_line_match:
                idx = int(cas_line_match.group(1)) - 1 # -1 because index of contig list starts with zero 
                try:
                    name = order_of_blobs[idx]
                    reads = int(cas_line_match.group(3))
                    cov = float(cas_line_match.group(6))
                    cov_dict[name] = cov
                    read_cov_dict[name] = reads
                    seqs_parsed += 1
                except:
                    pass
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
        BtLog.progress(seqs_total, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
Example #3
0
    def parseFasta(self, fasta_f, fasta_type):
        print BtLog.status_d['1'] % ('FASTA', fasta_f)
        self.assembly_f = abspath(fasta_f)
        if (fasta_type):
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for name, seq in BtIO.readFasta(fasta_f):
            blObj = BlObj(name, seq)
            if not blObj.name in self.dict_of_blobs:
                self.seqs += 1
                self.length += blObj.length
                self.n_count += blObj.n_count
                
                if (fasta_type):
                    cov = BtIO.parseCovFromHeader(fasta_type, blObj.name)
                    self.covLibs[fasta_type].cov_sum += cov
                    blObj.addCov(fasta_type, cov)

                self.order_of_blobs.append(blObj.name)
                self.dict_of_blobs[blObj.name] = blObj
            else:
                BtLog.error('5', blObj.name)
        
        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
Example #4
0
def readBam(infile, fasta_headers, mq, no_base_cov_flag): # mq: mapping-quality cutoff; no_base_cov_flag: skip base-coverage parsing
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(int(reads_total)/1000)
    base_cov_dict = {}
    cigar_match_re = re.compile(r"(\d+)M") # only gets digits before M's

    read_cov_dict = {}
    # execute samtools to get only mapped reads from primary alignment
    command = "samtools view -q " + str(mq) + " -F 256 -F 4 " + infile
    # only one counter since only yields mapped reads
    parsed_reads = 0
    for line in runCmd(command):
        match = line.split("\t")
        seq_name = match[2]
        if seq_name not in fasta_headers:
            print BtLog.warn_d['2'] % (seq_name, infile)
        else:
            read_cov_dict[seq_name] = read_cov_dict.get(seq_name, 0) + 1
            if not (no_base_cov_flag):
                base_cov = sum([int(matching) for matching in cigar_match_re.findall(match[5])])
                if (base_cov):
                    base_cov_dict[seq_name] = base_cov_dict.get(seq_name, 0) + base_cov
            parsed_reads += 1
        BtLog.progress(parsed_reads, progress_unit, reads_total)
    BtLog.progress(reads_total, progress_unit, reads_total)
    return base_cov_dict, read_cov_dict, reads_total, parsed_reads
Example #5
0
def parseCatColour(catcolour_f):
    catcolour_dict = {}
    with open(catcolour_f) as fh:
        for l in fh:
            try:
                seq_name, category = l.rstrip("\n").split(",")
                catcolour_dict[seq_name] = category
            except:
                BtLog.error('23', catcolour_f)
    return catcolour_dict
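
A small sketch of the catcolour input handled above, run over in-memory lines instead of a file; contig names and categories are hypothetical.
lines = ["contig_1,bacteria\n", "contig_2,host\n"]  # hypothetical two-column CSV rows
catcolour_dict = {}
for l in lines:
    seq_name, category = l.rstrip("\n").split(",")
    catcolour_dict[seq_name] = category
print(catcolour_dict)  # {'contig_1': 'bacteria', 'contig_2': 'host'}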
Example #6
0
def writeNodesDB(nodesDB, nodesDB_f):
    nodes_count = nodesDB['nodes_count']
    i = 0
    with open(nodesDB_f, 'w') as fh:
        fh.write("# nodes_count = %s\n" % nodes_count) 
        for node in nodesDB:
            if not node == "nodes_count": 
                i += 1
                BtLog.progress(i, 1000, nodes_count)
                fh.write("%s\t%s\t%s\t%s\n" % (node, nodesDB[node]['rank'], nodesDB[node]['name'], nodesDB[node]['parent']))
Example #7
0
def parseRefCov(refcov_f):
    refcov_dict = {}
    with open(refcov_f) as fh:
        for l in fh:
            try:
                cov_lib, reads_total_ref, reads_mapped_ref = l.split(",")
                refcov_dict[cov_lib] = {
                                        'reads_total' : int(reads_total_ref), 
                                        'reads_mapped' : int(reads_mapped_ref)
                                       }
            except:
                BtLog.error('21', refcov_f)
    return refcov_dict
Example #8
0
 def computeTaxonomy(self, taxrules, nodesDB):
     tree_lists = BtTax.getTreeList(self.set_of_taxIds, nodesDB)
     self.lineages = BtTax.getLineages(tree_lists, nodesDB)
     self.taxrules = taxrules
     i = 0
     for blObj in self.dict_of_blobs.values():
         i += 1
         BtLog.progress(i, 100, self.seqs)
         for taxrule in taxrules:
             if (blObj.hits):
                 blObj.taxonomy[taxrule] = BtTax.taxRule(taxrule, blObj.hits, self.lineages)
             else:
                 blObj.taxonomy[taxrule] = BtTax.noHit()
Example #9
0
def parseCovFile(cov_f):
    cov_dict = {}
    with open(cov_f) as fh:
        for l in fh:
            try:
                seq_name, cov = l.rstrip("\n").split("\t")
                if float(cov) < 0.02:
                    cov_dict[seq_name] = 0.02
                else:
                    cov_dict[seq_name] = float(cov)
            except:
                BtLog.error('25', cov_f)
    return cov_dict
Example #10
0
def readNodesDB(nodesDB_f):
    nodesDB = {}
    nodes_count = 0
    i = 0
    with open(nodesDB_f) as fh:
        for line in fh:
            if line.startswith("#"):
                nodes_count = int(line.lstrip("# nodes_count = ").rstrip("\n"))
            else:
                i += 1
                node, rank, name, parent = line.rstrip("\n").split("\t")
                nodesDB[node] = {'rank' : rank, 'name' : name, 'parent' : parent}
                BtLog.progress(i, 1000, nodes_count)
    return nodesDB
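
A minimal sketch of the on-disk format consumed here (and produced by writeNodesDB() in Example #6), parsed over in-memory lines with the same logic; the two nodes are illustrative.
lines = ["# nodes_count = 2\n",
         "1\tno rank\troot\t1\n",
         "2\tsuperkingdom\tBacteria\t131567\n"]
nodesDB = {}
for line in lines:
    if line.startswith("#"):
        continue  # header line only carries the node count
    node, rank, name, parent = line.rstrip("\n").split("\t")
    nodesDB[node] = {'rank': rank, 'name': name, 'parent': parent}
print(nodesDB['2']['name'])  # Bacteria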
Example #11
0
def parse_labels(labels):
    label_d = {}
    name, groups = '', ''
    if (labels):
        try:
            for label in labels:
                name, groups = str(label).split("=")
                if "," in groups:
                    for group in groups.split(","):
                        label_d[group] = name
                else:
                    label_d[groups] = name
        except:
            BtLog.error('17', labels)
    return label_d
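
A brief illustration of the label syntax parsed above, with hypothetical group and label names: each "name=group1,group2" argument maps every group back to its label.
labels = ["bacteria=bam0,bam1", "host=cov0"]  # hypothetical --label arguments
label_d = {}
for label in labels:
    name, groups = str(label).split("=")
    for group in groups.split(","):  # works for single groups too
        label_d[group] = name
print(label_d)  # {'bam0': 'bacteria', 'bam1': 'bacteria', 'cov0': 'host'}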
Example #12
0
def checkBam(infile):
    print BtLog.status_d['10']
    if not (which('samtools')):
        BtLog.error('7')
    reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
    reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
    reads_total, reads_mapped = 0, 0
    output = ''
    command = "samtools flagstat " + infile
    for line in runCmd(command):
        output += line
    reads_mapped = int(reads_mapped_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return reads_total, reads_mapped
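
The two regexes above target the "in total" and "mapped" lines of samtools flagstat output; a quick standalone check against a made-up flagstat excerpt.
import re

reads_mapped_re = re.compile(r"(\d+)\s\+\s\d+\smapped")
reads_total_re = re.compile(r"(\d+)\s\+\s\d+\sin total")
output = ("2000 + 0 in total (QC-passed reads + QC-failed reads)\n"
          "1950 + 0 mapped (97.50% : N/A)\n")  # made-up excerpt
print(reads_total_re.search(output).group(1))   # 2000
print(reads_mapped_re.search(output).group(1))  # 1950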
Example #13
0
def readCov(infile, set_of_blobs):
    old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    base_cov_dict = {}

    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    reads_total = 0
    reads_mapped = 0
    read_cov_dict = {}

    seqs_parsed = 0
    progress_unit = 1
    old_format = 1
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                old_format = 0
            if old_format == 0:
                if line.startswith("# Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("# Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("# Unmapped Reads"):
                    pass
                elif line.startswith("# Parameters"):
                    pass
                elif line.startswith("# contig_id"):
                    pass
                else:
                    match = cov_line_re.search(line)
                    if match:
                        seqs_parsed += 1
                        name, read_cov, base_cov = match.group(1), int(match.group(2)), float(match.group(3))
                        if name not in set_of_blobs:
                            print BtLog.warn_d['2'] % (name, infile)
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
            else:
                match = old_cov_line_re.search(line)
                if match:
                    seqs_parsed += 1
                    name, base_cov = match.group(1), float(match.group(2))
                    if name not in set_of_blobs:
                        print BtLog.warn_d['2'] % (name, infile)
                    base_cov_dict[name] = base_cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        #BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
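
A standalone check of the two line formats distinguished above (contig names and values are made up): the old format carries name and base coverage, the newer "#"-headed format carries name, read coverage and base coverage.
import re

old_cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
old_line = "contig_1\t12.5"       # hypothetical old-format line: name, base_cov
new_line = "contig_1\t100\t12.5"  # hypothetical new-format line: name, read_cov, base_cov
print(old_cov_line_re.search(old_line).groups())  # ('contig_1', '12.5')
print(cov_line_re.search(new_line).groups())      # ('contig_1', '100', '12.5')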
Example #14
0
def checkCas(infile):
    print BtLog.status_d['12']
    if not (which('clc_mapping_info')):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+\.\d+)\s+\%")
    seqs_total, reads_total, reads_mapped = 0, 0, 0
    output = ''
    command = "clc_mapping_info -s " + infile
    for line in runCmd(command):
        output += line
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    print BtLog.status_d['11'] % ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total), '{0:.1%}'.format(reads_mapped/reads_total))
    return seqs_total, reads_total, reads_mapped
Example #15
0
def readTax(infile, set_of_blobs):
    '''
    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to hitDict
    '''
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)") # TEST TEST , if not split it afterwards
    with open(infile) as fh:
        for line in fh:
            match = hit_line_re.search(line)
            if match:
                hitDict = {
                    'name' : match.group(1),
                    'taxId' : match.group(2), # string because if int, conversion is a nightmare ...
                    'score' : float(match.group(3))
                    }
                if hitDict['name'] not in set_of_blobs:
                    BtLog.error('19', hitDict['name'], infile)
                if hitDict['taxId'] == 'N/A':
                    BtLog.error('22', infile)
                yield hitDict
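
A standalone check of hit_line_re against a made-up taxified hit line (name, taxID and score as the first three columns).
import re

hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
line = "contig_1\t9606\t523.0\tsome_subject_id"  # hypothetical taxified hit line
match = hit_line_re.search(line)
print("%s %s %s" % (match.group(1), match.group(2), float(match.group(3))))  # contig_1 9606 523.0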
Example #16
0
def parseCovFile(cov_f):
    cov_dict = {}
    old_format = 1
    seq_name = ''
    cov = 0.0
    with open(cov_f) as fh:
        for l in fh:
            if l.startswith("#"):
                old_format = 0
            else:
                try:
                    field = l.rstrip("\n").split("\t")
                    if not (old_format):
                        seq_name, cov = field[0], field[2]
                    else:
                        seq_name, cov = field[0], field[1]
                    if float(cov) < 0.02:
                        cov_dict[seq_name] = 0.02
                    else:
                        cov_dict[seq_name] = float(cov)
                except:
                    BtLog.error('25', cov_f)
    return cov_dict
Example #17
0
def getNodesDB(**kwargs):
    '''
    Parses names.dmp and nodes.dmp into the 'nodes_db' dict of dicts, which
    gets JSON'ed into blobtools/data/nodes_db.json if that file does not
    exist. The JSON file is used if neither "--names"/"--nodes" nor "--db"
    is specified.
    '''
    nodesDB = {}
    nodesDB_f = ''    
    if (kwargs['names'] and kwargs['nodes']):
        print BtLog.status_d['3'] % (kwargs['nodes'], kwargs['names'])
        nodesDB = {}
        nodes_count = 0
        with open(kwargs['nodes']) as fh:
            for line in fh:
                nodes_col = line.split("\t")
                node = {}
                node_id = nodes_col[0] 
                node['parent'] = nodes_col[2]
                node['rank'] = nodes_col[4]
                nodesDB[node_id] = node
                nodes_count += 1
        with open(kwargs['names']) as fh:
            for line in fh:
                names_col = line.split("\t")
                if names_col[6] == "scientific name":
                    nodesDB[names_col[0]]['name'] = names_col[2]
        nodesDB_f = kwargs['nodesDB']
        nodesDB['nodes_count'] = nodes_count
    elif(kwargs['nodesDB']):
        print BtLog.status_d['4'] % (kwargs['nodesDB'])
        nodesDB = readNodesDB(kwargs['nodesDB'])
        nodesDB_f = kwargs['nodesDB']
    else:
        BtLog.error('3')
    return nodesDB, nodesDB_f
Example #18
0
def readCov(infile, set_of_blobs):
    cov_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    cov_dict = {}
    seqs_parsed = 0
    progress_unit = int(len(set_of_blobs)/100)
    with open(infile) as fh:
        for line in fh:
            match = cov_line_re.search(line)
            if match:
                seqs_parsed += 1
                name, cov = match.group(1), float(match.group(2))
                if name not in set_of_blobs:
                    print BtLog.warn_d['2'] % (name, infile)
                cov_dict[name] = cov
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
        BtLog.progress(len(set_of_blobs), progress_unit, len(set_of_blobs))
    return cov_dict
Example #19
0
def parseNodesDB(**kwargs):
    '''
    Parsing names.dmp and nodes.dmp into the 'nodes_db' dict of dicts that
    gets JSON'ed into blobtools/data/nodes_db.json if this file
    does not exist. Nodes_db.json is used if neither "--names" and "--nodes"
    nor "--db" is specified. If all three are specified and "--db" does not
    exist, then write 'nodes_db' to file specified by "--db". If all three
    are specified and "--db" exists, error out.
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']

    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        if (nodesDB_f):
            if isfile(nodesDB_f):
                BtLog.error('47', nodesDB_f)
            print(BtLog.status_d['27'] % (nodesDB_f, nodes_f, names_f))
        else:
            print(BtLog.status_d['3'] % (nodes_f, names_f))
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except:
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print(BtLog.status_d['4'] % (nodesDB_f))
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print(BtLog.status_d['4'] % (nodesDB_default))
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except:
            BtLog.error('27', nodesDB_default)

    # Write nodesDB if names, nodes, nodesDB all given and nodesDB does not
    # exist.  Otherwise, write to nodesDB_default if it does not exist, unless
    # nodesDB given, then do nothing with nodesDB_default.
    if (nodes_f and names_f and nodesDB_f):
        print(BtLog.status_d['28'] % nodesDB_f)
        writeNodesDB(nodesDB, nodesDB_f)
    elif (not nodesDB_f and not isfile(nodesDB_default)):
        nodesDB_f = nodesDB_default
        print(BtLog.status_d['5'] % nodesDB_f)
        writeNodesDB(nodesDB, nodesDB_f)

    return nodesDB, nodesDB_f
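
A compact sketch of the source-selection precedence described in the docstring, with the file-existence checks and I/O stripped out; argument names and the example path are illustrative only.
def pick_nodes_source(names_f, nodes_f, nodesDB_f, nodesDB_default):
    # mirrors the branching above: taxdump files win, then --db, then the bundled default
    if nodes_f and names_f:
        return 'names/nodes'
    elif nodesDB_f:
        return 'nodesDB'
    elif nodesDB_default:
        return 'nodesDB_default'
    return None

print(pick_nodes_source(None, None, None, 'data/nodesDB.txt'))  # nodesDB_default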
Example #20
0
def main():
    #print data_dir
    args = docopt(__doc__)
    blobdb_f = args['--input']
    prefix = args['--out']
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list_f = args['--list']
    concoct = args['--concoct']
    cov = args['--cov']
    notable = args['--notable']
    experimental = args['--experimental']
    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    out_f = BtIO.getOutFile(blobdb_f, prefix, None)

    # Are ranks sane ?
    if 'all' in ranks:
        temp_ranks = RANKS[0:-1]
        ranks = temp_ranks[::-1]
    else:
        for rank in ranks:
            if rank not in RANKS:
                BtLog.error('9', rank)

    # Does seq_list file exist?
    seqs = []
    if (seq_list_f):
        if isfile(seq_list_f):
            seqs = BtIO.parseList(seq_list_f)
        else:
            BtLog.error('0', seq_list_f)

    # Load BlobDb
    blobDb = BtCore.BlobDb('new')
    print BtLog.status_d['9'] % (blobdb_f)
    blobDb.load(blobdb_f)
    blobDb.version = blobtools.__version__

    # Is taxrule sane and was it computed?
    if (blobDb.hitLibs) and taxrule not in blobDb.taxrules:
        BtLog.error('11', taxrule, blobDb.taxrules)

    # view(s)
    viewObjs = []
    print BtLog.status_d['14']
    if not (notable):
        tableView = None
        if len(blobDb.hitLibs) > 1:
            tableView = BtCore.ViewObj(name="table",
                                       out_f=out_f,
                                       suffix="%s.table.txt" % (taxrule),
                                       body=[])
        else:
            tableView = BtCore.ViewObj(name="table",
                                       out_f=out_f,
                                       suffix="table.txt",
                                       body=[])
        viewObjs.append(tableView)
    if (experimental):
        meta = {}
        if isfile(experimental):
            meta = BtIO.readYaml(experimental)
        experimentalView = BtCore.ExperimentalViewObj(name="experimental",
                                                      view_dir=out_f,
                                                      blobDb=blobDb,
                                                      meta=meta)
        viewObjs.append(experimentalView)
    if (concoct):
        concoctTaxView = None
        concoctCovView = None
        if len(blobDb.hitLibs) > 1:
            concoctTaxView = BtCore.ViewObj(
                name="concoct_tax",
                out_f=out_f,
                suffix="%s.concoct_taxonomy_info.csv" % (taxrule),
                body=dict())
            concoctCovView = BtCore.ViewObj(
                name="concoct_cov",
                out_f=out_f,
                suffix="%s.concoct_coverage_info.tsv" % (taxrule),
                body=[])
        else:
            concoctTaxView = BtCore.ViewObj(name="concoct_tax",
                                            out_f=out_f,
                                            suffix="concoct_taxonomy_info.csv",
                                            body=dict())
            concoctCovView = BtCore.ViewObj(name="concoct_cov",
                                            out_f=out_f,
                                            suffix="concoct_coverage_info.tsv",
                                            body=[])
        viewObjs.append(concoctTaxView)
        viewObjs.append(concoctCovView)
    if (cov):
        for cov_lib_name, covLibDict in blobDb.covLibs.items():
            out_f = BtIO.getOutFile(covLibDict['f'], prefix, None)
            covView = BtCore.ViewObj(name="covlib",
                                     out_f=out_f,
                                     suffix="cov",
                                     body=[])
            blobDb.view(viewObjs=[covView],
                        ranks=None,
                        taxrule=None,
                        hits_flag=None,
                        seqs=None,
                        cov_libs=[cov_lib_name],
                        progressbar=True)
    if (viewObjs):
        blobDb.view(viewObjs=viewObjs,
                    ranks=ranks,
                    taxrule=taxrule,
                    hits_flag=hits_flag,
                    seqs=seqs,
                    cov_libs=[],
                    progressbar=True)
    print BtLog.status_d['19']
Example #21
0
    for name in readFasta(infile):
        fasta_order.append(name)
        fasta_dict[name] = 0.0
    return fasta_dict, fasta_order

if __name__ == '__main__':
    main_dir = dirname(__file__)
    #print data_dir
    args = docopt(__doc__)
    assembly_f = args['--infile']
    cov_fs = args['--cov']
    
    fasta_dict = {}
    fasta_order = []
    if not isfile(assembly_f):
        BtLog.error('0', assembly_f)
    else:
        fasta_dict, fasta_order = parseFasta(assembly_f)
    
    for cov_f in cov_fs:
        if not isfile(cov_f):
            BtLog.error('0', cov_f)
        else:
            lib_cov_dict = BtPlot.parseCovFile(cov_f)
            for name in fasta_order:
                fasta_dict[name] = fasta_dict.get(name, 0.0) + lib_cov_dict[name]
                    
    
    for name in fasta_order:
        print "%s\t%s" % (name, fasta_dict[name])
Example #22
0
def main():
    args = docopt(__doc__)
    out_f, hit_f, map_f, taxid_d = None, None, None, {}
    hit_f = args['--hit_file']
    hit_col_qseqid = args['--hit_column_qseqid']
    hit_col_sseqid = args['--hit_column_sseqid']
    hit_col_score = args['--hit_column_score']
    map_f = args['--taxid_mapping_file']
    map_col_sseqid = args['--map_col_sseqid']
    map_col_taxid = args['--map_col_taxid']
    #custom_f = args['--custom']
    custom_taxid = args['--custom_taxid']
    #custom_score = args['--custom_score']
    prefix = args['--out']

    try:
        hit_col_qseqid = int(hit_col_qseqid)
        hit_col_sseqid = int(hit_col_sseqid)
        hit_col_score = int(hit_col_score)
    except ValueError:
        BtLog.error('41', "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score")

    if custom_taxid:
        try:
            custom_taxid = int(custom_taxid)
        except ValueError: # int() on a non-numeric string raises ValueError
            BtLog.error('26')
        out_f = BtIO.getOutFile(hit_f, prefix, "taxID_%s.out" % custom_taxid)
        taxid_d = defaultdict(lambda: custom_taxid)
    elif map_f:
        if map_col_sseqid and map_col_taxid:
            try:
                map_col_sseqid = int(map_col_sseqid)
                map_col_taxid = int(map_col_taxid)
            except ValueError:
                BtLog.error('44')
            print(BtLog.status_d['1'] % ("Mapping file", map_f))
            taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
            out_f = BtIO.getOutFile(hit_f, prefix, "taxified.out")
        else:
            BtLog.error('44')
    else:
        BtLog.error('41')

    output = []
    print(BtLog.status_d['1'] % ("similarity search result", hit_f))
    with open(hit_f) as fh:
        for idx, line in enumerate(fh):
            col = line.rstrip("\n").split()
            qseqid = col[hit_col_qseqid]
            sseqid = col[hit_col_sseqid]
            score = col[hit_col_score]
            tax_id = None
            if custom_taxid:
                tax_id = taxid_d[sseqid]
            else:
                if sseqid not in taxid_d:
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
            output.append("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))
    if output:
        with open(out_f, "w") as fh:
            print(BtLog.status_d['24'] % out_f)
            fh.write("\n".join(output) + "\n")
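
A worked example of the output format produced above (column indices, IDs and values are hypothetical): given one hit line and a subject-to-taxID mapping, each output line is qseqid, taxID, score, sseqid.
taxid_d = {"subj_1": "543"}                    # hypothetical sseqid -> taxID mapping
line = "contig_1\tsubj_1\t87.5\t200\t400.0"    # hypothetical hit line
col = line.rstrip("\n").split()
qseqid, sseqid, score = col[0], col[1], col[4] # assumes qseqid, sseqid and score in columns 0, 1 and 4
tax_id = taxid_d.get(sseqid, "N/A")
print("%s\t%s\t%s\t%s" % (qseqid, tax_id, score, sseqid))  # contig_1  543  400.0  subj_1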
Example #23
0
    def getPlotData(self, rank, min_length, hide_nohits, taxrule, c_index,
                    catcolour_dict):
        data_dict = {}
        read_cov_dict = {}
        max_cov = 0.0
        min_cov = 1000.0
        cov_lib_dict = self.covLibs
        cov_lib_names_l = self.covLibs.keys()  # does not include cov_sum
        if len(cov_lib_names_l) > 1:
            # more than one cov_lib, cov_sum_lib has to be created
            cov_lib_dict['covsum'] = CovLibObj(
                'covsum', 'covsum',
                'Sum of cov in %s' % basename(self.title)).__dict__  # ugly
            cov_lib_dict['covsum']['reads_total'] = sum(
                [self.covLibs[x]['reads_total'] for x in self.covLibs])
            cov_lib_dict['covsum']['reads_mapped'] = sum(
                [self.covLibs[x]['reads_mapped'] for x in self.covLibs])
            cov_lib_dict['covsum']['cov_sum'] = sum(
                [self.covLibs[x]['cov_sum'] for x in self.covLibs])
            cov_lib_dict['covsum']['mean_cov'] = cov_lib_dict['covsum']['cov_sum'] / self.seqs
        for blob in self.dict_of_blobs.values():
            name, gc, length, group = blob['name'], blob['gc'], blob['length'], ''
            if (catcolour_dict):  # annotation with categories specified in catcolour
                group = str(catcolour_dict[name])
            elif (c_index):  # annotation with c_index instead of taxonomic group
                if taxrule not in self.taxrules:
                    BtLog.error('11', taxrule, self.taxrules)
                else:
                    group = str(blob['taxonomy'][taxrule][rank]['c_index'])
            else:  # annotation with taxonomic group
                if not (taxrule) or taxrule not in self.taxrules:
                    print(BtLog.warn_d['9'] % (taxrule, self.taxrules))
                if taxrule in blob['taxonomy']:
                    group = str(blob['taxonomy'][taxrule][rank]['tax'])
            if not group in data_dict:
                data_dict[group] = {
                    'name': [],
                    'length': [],
                    'gc': [],
                    'covs': {covLib: []
                             for covLib in cov_lib_dict.keys()
                             },  # includes cov_sum if it exists
                    'reads_mapped':
                    {covLib: 0
                     for covLib in cov_lib_dict.keys()
                     },  # includes cov_sum if it exists
                    'count': 0,
                    'count_hidden': 0,
                    'count_visible': 0,
                    'span': 0,
                    'span_hidden': 0,
                    'span_visible': 0,
                }
            data_dict[group]['count'] = data_dict[group].get('count', 0) + 1
            data_dict[group]['span'] = data_dict[group].get('span',
                                                            0) + int(length)
            if ((hide_nohits)
                    and group == 'no-hit') or length < min_length:  # hidden
                data_dict[group]['count_hidden'] = data_dict[group].get(
                    'count_hidden', 0) + 1
                data_dict[group]['span_hidden'] = data_dict[group].get(
                    'span_hidden', 0) + int(length)
            else:  # visible
                data_dict[group]['count_visible'] = data_dict[group].get(
                    'count_visible', 0) + 1
                data_dict[group]['span_visible'] = data_dict[group].get(
                    'span_visible', 0) + int(length)
                data_dict[group]['name'].append(name)
                data_dict[group]['length'].append(length)
                data_dict[group]['gc'].append(gc)
                cov_sum = 0.0
                reads_mapped_sum = 0
                for cov_lib in sorted(cov_lib_names_l):
                    cov = float(blob['covs'][cov_lib])
                    if cov < 0.1:
                        cov = 0.1
                    if cov < min_cov:
                        min_cov = cov
                    # increase max_cov
                    if cov > max_cov:
                        max_cov = cov
                    # add cov of blob to group
                    data_dict[group]['covs'][cov_lib].append(cov)
                    cov_sum += cov
                    # add readcov
                    if cov_lib in blob['read_cov']:
                        reads_mapped = blob['read_cov'][cov_lib]
                        data_dict[group]['reads_mapped'][
                            cov_lib] += reads_mapped
                        reads_mapped_sum += reads_mapped
                if len(cov_lib_names_l) > 1:
                    if cov_sum <= 0.1 * len(
                            cov_lib_names_l):  # puts no-cov contigs at 0.1
                        cov_sum = 0.1
                    data_dict[group]['covs']['covsum'].append(cov_sum)
                    if cov_sum > max_cov:
                        max_cov = cov_sum
                    if (reads_mapped_sum):
                        data_dict[group]['reads_mapped'][
                            'covsum'] += reads_mapped_sum

        return data_dict, min_cov, max_cov, cov_lib_dict
Example #24
0
 def view(self, **kwargs):
     # arguments
     viewObjs = kwargs['viewObjs']
     ranks = kwargs['ranks']
     taxrule = kwargs['taxrule']
     hits_flag = kwargs['hits_flag']
     seqs = kwargs['seqs']
     cov_libs = kwargs['cov_libs']
     progress_bar = kwargs['progressbar']
     # Default sequences if no subset
     if not (seqs):
         seqs = self.order_of_blobs
     # Default cov_libs if no subset
     cov_lib_names = cov_libs
     if not (cov_libs):
         cov_lib_names = [covLib for covLib in self.covLibs]
     tax_lib_names = [taxLib for taxLib in sorted(self.hitLibs)]
     lineages = self.lineages
     # setup
     for viewObj in viewObjs:
         if viewObj.name == 'table':
             viewObj.header = self.getTableHeader(taxrule, ranks, hits_flag,
                                                  cov_lib_names)
         if viewObj.name == 'concoct_cov':
             viewObj.header = self.getConcoctCovHeader(cov_lib_names)
         if viewObj.name == 'covlib':
             viewObj.header = self.getCovHeader(cov_lib_names)
         if viewObj.name == 'experimental':
             viewObj.covs = {cov_lib: [] for cov_lib in cov_lib_names}
             viewObj.covs["covsum"] = []
             for taxrule in self.taxrules:
                 viewObj.tax[taxrule] = {rank: [] for rank in BtTax.RANKS}
     # bodies
     for i, seq in enumerate(seqs):
         if (progress_bar):
             BtLog.progress(i, 1000, len(seqs))
         blob = self.dict_of_blobs[seq]
         for viewObj in viewObjs:
             if viewObj.name == 'table':
                 viewObj.body.append(
                     self.getTableLine(blob, taxrule, ranks, hits_flag,
                                       cov_lib_names, tax_lib_names,
                                       lineages))
             if viewObj.name == 'concoct_cov':
                 viewObj.body.append(
                     self.getConcoctCovLine(blob, cov_lib_names))
             if viewObj.name == 'experimental':
                 viewObj.names.append(blob['name'])
                 viewObj.gc.append(blob['gc'])
                 viewObj.length.append(blob['length'])
                 cov_sum = 0.0
                 for cov_lib in blob['covs']:
                     viewObj.covs[cov_lib].append(blob['covs'][cov_lib])
                     cov_sum += blob['covs'][cov_lib]
                 viewObj.covs['covsum'].append(cov_sum)
                 for taxrule in blob['taxonomy']:
                     for rank in blob['taxonomy'][taxrule]:
                         viewObj.tax[taxrule][rank].append(
                             blob['taxonomy'][taxrule][rank]['tax'])
             if viewObj.name == 'concoct_tax':
                 for rank in ranks:
                     if not rank in viewObj.body:
                         viewObj.body[rank] = []
                     viewObj.body[rank].append(
                         self.getConcoctTaxLine(blob, rank, taxrule))
             if viewObj.name == 'covlib':
                 viewObj.body.append(self.getCovLine(blob, cov_lib_names))
     if (progress_bar):
         BtLog.progress(len(seqs), 1000, len(seqs))
     for viewObj in viewObjs:
         viewObj.output()
Example #25
0
def validate_input_create(main_dir, args):
    '''
    Accepts: 
        - main_dir
        - docopt args
    Returns:
        - title
        - fasta_f
        - fasta_type
        - cov_libs
        - hit_libs
        - taxrules
        - nodesDB_f
        - nodes_f
        - names_f
        - out_f
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']

    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--taxfile']
    out_f = args['--out']
    if (out_f):
        out_f = "%s.%s" % (os.path.basename(out_f), "BlobDB.json")
    else:
        out_f = "%s" % ("BlobDB.json")
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    title = args['--title'] if (args['--title']) else out_f
    
    # Do files exist ?
    files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
    for f in files:
        if not os.path.isfile(f):
            BtLog.error('0', f)

    # Is taxonomy provided?
    if nodesDB_f == "data/nodesDB.txt":
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
        BtLog.error('3')
    if not (hit_fs):
        BtLog.error('18')
    # can FASTA parser deal with assemblies
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    # Is coverage provided?
    if not (fasta_type) and not bam_fs and not sam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [bt.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
               [bt.CovLibObj('sam' + str(idx), 'sam', lib_f) for idx, lib_f in enumerate(sam_fs)] + \
               [bt.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
               [bt.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)] 

    hit_libs = [bt.hitLibObj('tax' + str(idx), 'tax', lib_f) for idx, lib_f in enumerate(hit_fs)]

    return title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules, nodesDB_f, nodes_f, names_f, out_f
Example #26
0
def check_input(args):
    rank = args['--rank']
    c_index = args['--cindex']
    multiplot = args['--multiplot']
    sort_order = args['--sort']
    sort_first = args['--sort_first']
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    catcolour_f = args['--catcolour']
    cumulative_flag = args['--cumulative']

    #Convert sort_first to a list
    if sort_first:
        args['--sort_first'] = sort_first.split(',')
    else:
        args['--sort_first'] = ()

    if 'blobplot' in args or 'covplot' in args:
        # Are ranks sane ?
        if rank not in BtTax.RANKS:
            BtLog.error('9', rank)
        # is taxrule provided?
        if taxrule not in BtTax.TAXRULES:
            BtLog.error('8', taxrule)
        # Are sort_order and hist_type sane?
        if not sort_order in ['span', 'count']:
            BtLog.error('14', sort_order)
        if not hist_type in ['span', 'count']:
            BtLog.error('15', hist_type)
        if (catcolour_f) and (c_index):
            BtLog.error('24')
        if (cumulative_flag) and (multiplot):
            BtLog.error('32')
    return args
Example #27
0
def main():

    #main_dir = dirname(__file__)
    args = docopt(__doc__)
    fasta_f = args['--infile']
    fasta_type = args['--type']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--hitsfile']
    prefix = args['--out']
    nodesDB_f = args['--db']
    names_f = args['--names']
    estimate_cov_flag = True if not args['--calculate_cov'] else False
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    try:
        min_bitscore_diff = float(args['--min_diff'])
        min_score = float(args['--min_score'])
    except ValueError:
        BtLog.error('45')
    tax_collision_random = args['--tax_collision_random']
    title = args['--title']

    # outfile
    out_f = BtIO.getOutFile("blobDB", prefix, "json")
    if not (title):
        title = out_f

    # coverage
    if not (fasta_type) and not bam_fs and not cov_fs and not cas_fs:
        BtLog.error('1')
    cov_libs = [BtCore.CovLibObj('bam' + str(idx), 'bam', lib_f) for idx, lib_f in enumerate(bam_fs)] + \
           [BtCore.CovLibObj('cas' + str(idx), 'cas', lib_f) for idx, lib_f in enumerate(cas_fs)] + \
           [BtCore.CovLibObj('cov' + str(idx), 'cov', lib_f) for idx, lib_f in enumerate(cov_fs)]

    # taxonomy
    hit_libs = [
        BtCore.HitLibObj('tax' + str(idx), 'tax', lib_f)
        for idx, lib_f in enumerate(hit_fs)
    ]

    # Create BlobDB object
    blobDb = BtCore.BlobDb(title)
    blobDb.version = interface.__version__
    # Parse FASTA
    blobDb.parseFasta(fasta_f, fasta_type)

    # Parse nodesDB OR names.dmp, nodes.dmp
    nodesDB_default = join(dirname(abspath(__file__)), "../data/nodesDB.txt")
    nodesDB, nodesDB_f = BtIO.parseNodesDB(nodes=nodes_f,
                                           names=names_f,
                                           nodesDB=nodesDB_f,
                                           nodesDBdefault=nodesDB_default)
    blobDb.nodesDB_f = nodesDB_f

    # Parse similarity hits
    if (hit_libs):
        blobDb.parseHits(hit_libs)
        if not taxrules:
            if len(hit_libs) > 1:
                taxrules = ['bestsum', 'bestsumorder']
            else:
                taxrules = ['bestsum']
        blobDb.computeTaxonomy(taxrules, nodesDB, min_score, min_bitscore_diff,
                               tax_collision_random)
    else:
        print(BtLog.warn_d['0'])

    # Parse coverage
    blobDb.parseCoverage(covLibObjs=cov_libs,
                         estimate_cov=estimate_cov_flag,
                         prefix=prefix)

    # Generating BlobDB and writing to file
    print(BtLog.status_d['7'] % out_f)
    BtIO.writeJson(blobDb.dump(), out_f)
Example #28
0
def parseNodesDB(**kwargs):
    '''
    Parses names.dmp and nodes.dmp into the 'nodes_db' dict of dicts, which
    gets JSON'ed into blobtools/data/nodes_db.json if that file does not
    exist. Nodes_db.json is used if neither "--names"/"--nodes" nor "--db"
    is specified.
    '''
    nodesDB = {}
    names_f = kwargs['names']
    nodes_f = kwargs['nodes']
    nodesDB_f = kwargs['nodesDB']
    nodesDB_default = kwargs['nodesDBdefault']

    if (nodes_f and names_f):
        if not isfile(names_f):
            BtLog.error('0', names_f)
        if not isfile(nodes_f):
            BtLog.error('0', nodes_f)
        print BtLog.status_d['3'] % (nodes_f, names_f)
        try:
            nodesDB = readNamesNodes(names_f, nodes_f)
        except:
            BtLog.error('3', nodes_f, names_f)
    elif (nodesDB_f):
        if not isfile(nodesDB_f):
            BtLog.error('0', nodesDB_f)
        print BtLog.status_d['4'] % (nodesDB_f)
        try:
            nodesDB = readNodesDB(nodesDB_f)
        except:
            BtLog.error('27', nodesDB_f)
    elif (nodesDB_default):
        if not isfile(nodesDB_default):
            BtLog.error('28')
        print BtLog.status_d['4'] % (nodesDB_default)
        try:
            nodesDB = readNodesDB(nodesDB_default)
        except:
            BtLog.error('27', nodesDB_default)
        nodesDB_f = nodesDB_default

    # Write nodesDB if not available
    if not isfile(nodesDB_default):
        writeNodesDB(nodesDB, nodesDB_default)

    return nodesDB, nodesDB_f
Example #29
0
def parseBamForFilter(infile, include_unmapped, outfile, include, exclude,
                      gzip, do_sort, keep_sorted, sort_threads):
    '''
    parse BAM to extract readpairs
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    if do_sort:
        command = blobtools.SAMTOOLS + ' sort -@ %s -n -O bam -T temp -o %s.readsorted.bam %s' % (
            sort_threads, infile, infile)
        runCmd(command=command, wait=True)
        infile = "%s.readsorted.bam" % infile

    progress_unit = int(100000)
    #if progress_flag:
    #    reads_total, reads_mapped = checkBam(infile)
    command = blobtools.SAMTOOLS + " view -f 1 -F 256 -F 2048 %s" % infile

    pair_count_by_type, pair_seqs_by_type, out_fs_by_type = init_read_pairs(
        outfile, include_unmapped, include, exclude)
    if include:
        sequence_to_type_dict = defaultdict(lambda: 'Ex')
        for incl in include:
            sequence_to_type_dict[incl] = 'In'
        sequence_to_type_dict['*'] = 'Un'
    elif exclude:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        for excl in exclude:
            sequence_to_type_dict[excl] = 'Ex'
        sequence_to_type_dict['*'] = 'Un'
    else:
        sequence_to_type_dict = defaultdict(lambda: 'In')
        sequence_to_type_dict['*'] = 'Un'

    iterator = ''
    read_pair_type = None
    iterator = runCmd(command=command)
    seen_reads = 0
    sam_lines = []
    print BtLog.status_d['26'] % infile
    for sam_line in iterator:
        sam_lines.append(sam_line)
    print BtLog.status_d['22'] % infile
    reads_total = len(sam_lines)
    for i in xrange(0, len(sam_lines), 2):
        read1 = sam_lines[i].split()
        try:
            seen_reads += 2
            read2 = sam_lines[i + 1].split()
            read_pair_type = "".join(
                sorted([
                    sequence_to_type_dict[read1[2]],
                    sequence_to_type_dict[read2[2]]
                ]))
            BtLog.progress(seen_reads, progress_unit, reads_total)
            if read_pair_type in pair_seqs_by_type:
                #pair_seqs_by_type[read_pair_type] += get_read_pair_seqs(read1, read2)
                pair_seqs_by_type[read_pair_type].append(
                    get_read_pair_seqs(read1, read2))
                pair_count_by_type[read_pair_type] += 1
        except IndexError:
            print BtLog.warn_d['11']
        #print_bam(read_pair_out_fs, read_pair_type, read1, read2) # this prints SAM files for debugging
    if not seen_reads == reads_total:
        BtLog.progress(reads_total, progress_unit, reads_total)
    write_read_pair_seqs(pair_count_by_type, pair_seqs_by_type, out_fs_by_type)
    # info log
    info_string = []
    info_string.append(('Total pairs', "{:,}".format(int(seen_reads / 2)),
                        '{0:.1%}'.format(1.00)))
    for read_pair_type, count in pair_count_by_type.items():
        info_string.append((read_pair_type + ' pairs', "{:,}".format(count),
                            '{0:.1%}'.format(count / int(seen_reads / 2))))
    info_out_f = getOutFile(outfile, None, "info.txt")
    with open(info_out_f, 'w') as info_fh:
        print BtLog.status_d['24'] % info_out_f
        info_fh.write(get_table(info_string))
    if do_sort and not keep_sorted:
        os.remove(infile)
    return 1
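
A brief sketch of the read-pair classification used above: each mate's reference name maps to 'In', 'Ex' or 'Un', and the sorted concatenation of the two gives the pair type; reference names here are made up.
from collections import defaultdict

include = ['contig_1']                             # hypothetical include list
sequence_to_type_dict = defaultdict(lambda: 'Ex')  # anything not included is excluded
for incl in include:
    sequence_to_type_dict[incl] = 'In'
sequence_to_type_dict['*'] = 'Un'                  # '*' marks unmapped mates

read1_ref, read2_ref = 'contig_1', '*'             # hypothetical mate references
read_pair_type = "".join(sorted([sequence_to_type_dict[read1_ref],
                                 sequence_to_type_dict[read2_ref]]))
print(read_pair_type)  # InUn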
Example #30
0
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    plot_title = args['--title']
    ignore_contig_length = args['--noscale']
    #labels = args['--label']
    #colour_f = args['--colours']
    #exclude_groups = args['--exclude']
    format = args['--format'] 
    #no_plot_blobs = args['--noblobs']
    #no_plot_reads = args['--noreads']
    #refcov_f = args['--refcov']
    #catcolour_f = args['--catcolour']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Does cov_f exist ?
    if not isfile(cov_f):
        BtLog.error('0', cov_f)
    # parse cov file in dict 
    cov_dict = BtPlot.parseCovFile(cov_f)
    
    # Are ranks sane ?
    if rank not in RANKS:
        BtLog.error('9', rank)

    # Are sort_order and hist_type sane?
    if not sort_order in ['span', 'count']:
        BtLog.error('14', sort_order)
    if not hist_type in ['span', 'count']:            
Example #31
0
    def plotScatter(self, cov_lib, info_flag, out_f):

        fig, axScatter, axHistx, axHisty, axLegend, top_bins, right_bins = self.setupPlot(self.plot)
        # empty handles for big legend
        legend_handles = []
        legend_labels = []
        # marker size scaled by biggest blob (size in points^2)
        max_length = max(array(self.stats['all']['length'])) # length of biggest blob
        max_marker_size = 12500 # marker size for biggest blob, i.e. area of 12500^2 pixel
        for idx, group in enumerate(self.plot_order):
            idx += 1
            lw, alpha = 0.5, 0.8
            if group == 'no-hit':
                alpha = 0.5
            group_length_array = array(self.stats[group]['length'])
            if len(group_length_array) > 0 and group not in self.exclude_groups:
                colour = self.colours[group]
                group_x_array = ''
                group_y_array = ''
                if self.plot == 'blobplot':
                    group_x_array = array(self.stats[group]['gc'])
                    group_y_array = array(self.stats[group]['covs'][cov_lib])
                elif self.plot == 'covplot':
                    group_x_array = array(self.stats[group]['covs'][cov_lib])
                    group_y_array = array([self.cov_y_dict.get(name, 0.02) for name in self.stats[group]['name']])
                else:
                    BtLog.error('34', self.plot)
                marker_size_array = []
                if (self.ignore_contig_length): # no scaling
                    if group == "no-hit":
                        s = 20
                    else:
                        s = 100
                    marker_size_array = [s for length in group_length_array]
                else: # scaling by max_length
                    marker_size_array = [(length/max_length)*max_marker_size for length in group_length_array]
                # generate label for legend
                group_span_in_mb = round(self.stats[group]['span_visible']/1000000, 2)
                group_number_of_seqs = self.stats[group]['count_visible']
                group_n50 = self.stats[group]['n50']
                fmt_seqs = "{:,}".format(group_number_of_seqs)
                fmt_span = "{:,}".format(group_span_in_mb)
                fmt_n50 = "{:,}".format(group_n50)
                label = "%s (%s;%sMB;%snt)" % (group, fmt_seqs, fmt_span, fmt_n50)
                if (info_flag):
                    print(BtLog.info_d['0'] % (group, fmt_seqs, fmt_span, fmt_n50))
                if group == "other":
                    legend_handles.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=DGREY, markerfacecolor=colour))
                else:
                    legend_handles.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=WHITE, markerfacecolor=colour))
                legend_labels.append(label)

                weights_array = None
                if self.hist_type == "span":
                    weights_array = group_length_array/1000

                axHistx.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                axHisty.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                if group == 'other':
                    axScatter.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=DGREY, label=label)
                else:
                    axScatter.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=WHITE, label=label)
                axLegend.axis('off')
                if (self.multiplot):
                    fig_m, axScatter_m, axHistx_m, axHisty_m, axLegend_m, top_bins, right_bins = self.setupPlot(self.plot)
                    legend_handles_m = []
                    legend_labels_m = []
                    legend_handles_m.append(Line2D([0], [0], linewidth = 0.5, linestyle="none", marker="o", alpha=1, markersize=24, markeredgecolor=WHITE, markerfacecolor=colour))
                    legend_labels_m.append(label)
                    axHistx_m.hist(group_x_array, weights=weights_array, color = colour, bins = top_bins, histtype='step', lw = 3)
                    axHisty_m.hist(group_y_array, weights=weights_array, color = colour, bins = right_bins, histtype='step', orientation='horizontal', lw = 3)
                    if group == 'other':
                        axScatter_m.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=DGREY, label=label)
                    else:
                        axScatter_m.scatter(group_x_array, group_y_array, color = colour, s = marker_size_array, lw = lw, alpha=alpha, edgecolor=WHITE, label=label)
                    axLegend_m.axis('off')
                    axLegend_m.legend(legend_handles_m, legend_labels_m, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter_m, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig_m = plot_legend(fig_m, axLegend_m, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    print(BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format))
                    fig_m.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                    plt.close(fig_m)
                elif (self.cumulative_flag):
                    axLegend.legend(legend_handles, legend_labels, loc=6, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True)
                    plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
                    m_out_f = "%s.%s.%s.%s" % (out_f, cov_lib, idx, group.replace("/", "_").replace(" ", "_"))
                    fig.add_axes(axLegend)
                    fig = plot_legend(fig, axLegend, m_out_f, self.legend_flag, self.format, self.cumulative_flag)
                    if not (self.no_title):
                        fig.suptitle(out_f, fontsize=35, verticalalignment='top')
                    print(BtLog.status_d['8'] % "%s.%s" % (m_out_f, self.format))
                    fig.savefig("%s.%s" % (m_out_f, self.format), format=self.format)
                else:
                    pass
        plot_ref_legend(axScatter, max_length, max_marker_size, self.ignore_contig_length)
        axLegend.legend(legend_handles, legend_labels, numpoints=1, fontsize=LEGEND_FONTSIZE, frameon=True, loc=6 )
        out_f = "%s.%s" % (out_f, cov_lib)
        fig.add_axes(axLegend)
        fig = plot_legend(fig, axLegend, out_f, self.legend_flag, self.format, self.cumulative_flag)
        if not (self.no_title):
            fig.suptitle(out_f, fontsize=35, verticalalignment='top')
        print(BtLog.status_d['8'] % "%s.%s" % (out_f, self.format))
        fig.savefig("%s.%s" % (out_f, self.format), format=self.format)
        plt.close(fig)
Example #32
0
    if (out_f):
        out_f = "%s.%s" % (out_f, "BlobDB.json")
    else:
        out_f = "%s" % ("BlobDB.json")
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    title = args['--title'] if (args['--title']) else os.path.basename(".".join(fasta_f.split('.')[0:-1]))


    # Do files exist ?
    files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
    for f in files:
        if not os.path.isfile(f):
            BtLog.error('0', f)

    # Is taxonomy provided?
    if nodesDB_f == "data/nodesDB.txt":
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
        BtLog.error('3')

    if not (hit_fs):
        BtLog.error('18')

    # can FASTA parser deal with assemblies
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))

    # Is coverage provided?
Example #33
0
    TAXRULES = ['bestsum', 'bestsumorder']
    RANKS = ['species', 'genus', 'family', 'order', 'phylum', 'superkingdom', 'all']

    main_dir = dirname(__file__)
    #print data_dir
    args = docopt(__doc__)
    blobdb_f = args['--input']
    out_f = args['--out'] 
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list = args['--list']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Are ranks sane ?
    for rank in ranks:
        if rank not in RANKS:
            BtLog.error('9', rank)
    if 'all' in ranks:
        ranks = RANKS[0:-1]            

    # Is list a list of sequence names or a file?
    seqs = []
    if (seq_list):
        if isfile(seq_list):
            seqs = BtIO.parseList(seq_list)
        elif "," in seq_list:
            seqs = seq_list.split(",")
Example #34
0
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    plot_title = args['--title']
    ignore_contig_length = args['--noscale']
    #labels = args['--label']
    #colour_f = args['--colours']
    #exclude_groups = args['--exclude']
    format = args['--format']
    #no_plot_blobs = args['--noblobs']
    #no_plot_reads = args['--noreads']
    #refcov_f = args['--refcov']
    #catcolour_f = args['--catcolour']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Does cov_f exist ?
    if not isfile(cov_f):
        BtLog.error('0', cov_f)
    # parse cov file in dict
    cov_dict = BtPlot.parseCovFile(cov_f)

    # Are ranks sane ?
    if rank not in RANKS:
        BtLog.error('9', rank)

    # Are sort_order and hist_type sane?
    if not sort_order in ['span', 'count']:
        BtLog.error('14', sort_order)
    if not hist_type in ['span', 'count']: