コード例 #1
0
def parseCas(infile, order_of_blobs):
    """Collect per-contig coverage from the output of ``clc_mapping_info -n``.

    Args:
        infile: Path to the CAS mapping file.
        order_of_blobs: Indexable sequence of contig names. The 1-based
            index in the first column of each table row is mapped to
            ``order_of_blobs[index - 1]``.

    Returns:
        Tuple ``(cov_dict, reads_total, reads_mapped, read_cov_dict)`` where
        ``cov_dict`` maps contig name to the sixth captured column (float)
        and ``read_cov_dict`` maps contig name to the third captured column
        (int). ``reads_total`` and ``reads_mapped`` come from ``checkCas``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    seqs_total, reads_total, reads_mapped = checkCas(infile)
    progress_unit = int(len(order_of_blobs) / 100)
    cas_line_re = re.compile(
        r"\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+.\d{2})\s+(\d+)\s+(\d+.\d{2})")
    command = "clc_mapping_info -n " + infile
    cov_dict = {}
    read_cov_dict = {}
    seqs_parsed = 0
    # Call runCmd exactly once: the original called it a second time only to
    # test the result for truthiness, so whatever it launches was started
    # twice.
    output = runCmd(command=command)
    if output:
        for line in output:
            cas_line_match = cas_line_re.search(line)
            if not cas_line_match:
                continue
            # -1 because index of contig list starts with zero
            idx = int(cas_line_match.group(1)) - 1
            try:
                name = order_of_blobs[idx]
                reads = int(cas_line_match.group(3))
                cov = float(cas_line_match.group(6))
            except (IndexError, ValueError):
                # Row refers to a contig outside order_of_blobs or holds an
                # unparsable number: skip it (best effort, as before).
                pass
            else:
                cov_dict[name] = cov
                read_cov_dict[name] = reads
                seqs_parsed += 1
            BtLog.progress(seqs_parsed, progress_unit, seqs_total)
    return cov_dict, reads_total, reads_mapped, read_cov_dict
コード例 #2
0
ファイル: BtPlot.py プロジェクト: rjchallis/blobtools
 def subselect_cov_libs(self, cov_lib_dict, cov_lib_selection):
     """Turn the requested coverage-library selection into a list of names.

     An empty selection selects every key of ``cov_lib_dict``; ``'covsum'``
     selects only ``'covsum'``; otherwise the selection is a single name or
     a comma-separated list of names. Names missing from ``cov_lib_dict``
     are reported through ``BtLog.error('33', ...)`` together with a listing
     of the available libraries.
     """
     if not cov_lib_selection:
         return cov_lib_dict.keys()
     if cov_lib_selection == 'covsum':
         return ['covsum']
     if "," in cov_lib_selection:
         chosen = cov_lib_selection.split(",")
         unknown = not set(chosen).issubset(set(cov_lib_dict.keys()))
     else:
         chosen = [cov_lib_selection]
         unknown = cov_lib_selection not in cov_lib_dict
     if unknown:
         listing = []
         for lib_name in cov_lib_dict:
             lib_source = cov_lib_dict[lib_name]['f']
             if not lib_source:
                 lib_source = "sum of coverages from all covlibs"
             listing.append("\t\t%s : %s" % (lib_name, lib_source))
         BtLog.error('33', "\n".join(listing))
     return chosen
コード例 #3
0
def parseCovFromHeader(fasta_type, header):
    '''
    Returns the coverage from the header of a FASTA
    sequence depending on the assembly type.

    - 'spades':   number following "_cov_" in the header
    - 'velvet':   last "_"-separated field of the header
    - 'platanus': third "_"-separated field with "cov" removed, or the
                  second field when the header has fewer than three fields
    - None:       no coverage is parsed and None is returned

    Any other fasta_type is reported through BtLog.error('2', ...).
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'velvet', 'platanus']
    if fasta_type not in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))
    if fasta_type == 'spades':
        # Single scan of the header (the original scanned it twice and
        # discarded the first result).
        return float(re.findall(r"_cov_(\d+\.*\d*)", header)[0])
    if fasta_type == 'velvet':
        return float(header.split("_")[-1])
    if fasta_type == 'platanus':
        fields = header.rstrip("\n").split("_")
        if len(fields) >= 3:
            # scaffold/scaffoldBubble/contig
            return float(fields[2].replace("cov", ""))
        # gapClosed
        return float(fields[1].replace("cov", ""))
    return None
コード例 #4
0
def parseJson(infile):
    '''
    Read infile, parse it as JSON and return the result of byteify().

    The parser is the first importable of ujson, simplejson and json, see
    http://artem.krylysov.com/blog/2015/09/29/benchmark-python-json-libraries/
    Unparsable content is reported through BtLog.error('37', ...).
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    import time
    t_start = time.time()
    with open(infile, 'r') as fh:
        print(BtLog.status_d['15'])
        raw_text = fh.read()
    try:
        import ujson as json  # fastest
        print(BtLog.status_d['16'] % 'ujson')
    except ImportError:
        try:
            import simplejson as json  # fast
            fallback_name = 'simplejson'
        except ImportError:
            import json  # default
            fallback_name = 'json'
        print(BtLog.status_d['17'] % fallback_name)
    try:
        parsed = json.loads(raw_text.decode("ascii"))
    except ValueError:
        BtLog.error('37', infile, "BlobDB")
    result = byteify(parsed)
    print(BtLog.status_d['20'] % (time.time() - t_start))
    return result
コード例 #5
0
ファイル: BtPlot.py プロジェクト: rjchallis/blobtools
def set_format_scatterplot(axScatter, **kwargs):
    """Configure scales, limits, grid and ticks of the scatter axes.

    Keyword Args:
        plot: ``'blobplot'`` (linear x from 0 to 1, log y) or ``'covplot'``
            (log x and log y). Any other value is reported through
            ``BtLog.error('34', ...)``.
        min_cov, max_cov: Coverage bounds; the coverage axes span
            ``min_cov * 0.1`` to ``max_cov + 100``.

    Returns:
        The same ``axScatter`` object that was passed in.
    """
    min_x, max_x = None, None
    min_y, max_y = None, None
    plot_type = kwargs['plot']
    if plot_type == 'blobplot':
        min_x, max_x = 0, 1
        major_xticks = MultipleLocator(0.2)
        minor_xticks = AutoMinorLocator(20)
        min_y, max_y = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 100
        axScatter.set_yscale('log')
        axScatter.set_xscale('linear')
        axScatter.xaxis.set_major_locator(major_xticks)
        axScatter.xaxis.set_minor_locator(minor_xticks)
    elif plot_type == 'covplot':
        min_x, max_x = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 100
        min_y, max_y = kwargs['min_cov'] * 0.1, kwargs['max_cov'] + 100
        axScatter.set_yscale('log')
        axScatter.set_xscale('log')
    else:
        # Pass the offending value as an argument; the original applied
        # '%' to the code string '34', which has no placeholder and
        # therefore raised TypeError before BtLog.error was ever called.
        BtLog.error('34', plot_type)
    axScatter.set_xlim((min_x, max_x))
    # This sets the max-Coverage so that all libraries + sum are at the
    # same scale
    axScatter.set_ylim((min_y, max_y))
    axScatter.grid(True, which="major", lw=2., color=WHITE, linestyle='-')
    axScatter.set_axisbelow(True)
    axScatter.xaxis.labelpad = 20
    axScatter.yaxis.labelpad = 20
    axScatter.yaxis.get_major_ticks()[0].label1.set_visible(False)
    axScatter.tick_params(axis='both', which='both', direction='out')
    return axScatter
コード例 #6
0
def checkBam(infile):
    """Read total and mapped read counts from ``flagstat`` output.

    The command is built from ``blobtools.SAMTOOLS`` and run through
    ``runCmd``. A zero total or zero mapped count is reported through
    ``BtLog.error('29', ...)``.

    Returns:
        Tuple ``(reads_total, reads_mapped)``.
    """
    print(BtLog.status_d['10'])
    if not isfile(infile):
        BtLog.error('0', infile)
    mapped_pattern = re.compile(r"(\d+)\s\+\s\d+\smapped")
    total_pattern = re.compile(r"(\d+)\s\+\s\d+\sin total")
    flagstat_cmd = blobtools.SAMTOOLS + " flagstat " + infile
    flagstat_output = ''.join(runCmd(command=flagstat_cmd))
    reads_mapped = int(mapped_pattern.search(flagstat_output).group(1))
    reads_total = int(total_pattern.search(flagstat_output).group(1))

    # check whether there are reads in BAM
    if not reads_total or not reads_mapped:
        BtLog.error('29', infile)
    summary = ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total),
               '{0:.1%}'.format(reads_mapped/reads_total))
    print(BtLog.status_d['11'] % summary)
    return reads_total, reads_mapped
コード例 #7
0
ファイル: BtCore.py プロジェクト: hyphaltip/blobtools
    def parseFasta(self, fasta_f, fasta_type):
        """Load every sequence of ``fasta_f`` into this object.

        Each new sequence updates ``seqs``, ``length`` and ``n_count`` and
        is stored in ``order_of_blobs`` and ``dict_of_blobs``. When
        ``fasta_type`` is set, coverage is parsed from each header and added
        to a ``CovLibObj`` registered under that type. A repeated sequence
        name is reported with error '5'; an assembly without sequences or
        without length with error '1'.
        """
        print(BtLog.status_d['1'] % ('FASTA', fasta_f))
        self.assembly_f = abspath(fasta_f)
        if fasta_type:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type, fasta_f)

        for seq_name, sequence in BtIO.readFasta(fasta_f):
            blob = BlObj(seq_name, sequence)
            if blob.name in self.dict_of_blobs:
                BtLog.error('5', blob.name)
            else:
                self.seqs += 1
                self.length += blob.length
                self.n_count += blob.n_count

                if fasta_type:
                    header_cov = BtIO.parseCovFromHeader(fasta_type, blob.name)
                    self.covLibs[fasta_type].cov_sum += header_cov
                    blob.addCov(fasta_type, header_cov)

                self.order_of_blobs.append(blob.name)
                self.dict_of_blobs[blob.name] = blob

        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
コード例 #8
0
    def parseFasta(self, fasta_f, fasta_type):
        """Read the assembly FASTA and register one BlObj per sequence.

        Updates the running totals (``seqs``, ``length``, ``n_count``) and
        fills ``order_of_blobs`` / ``dict_of_blobs``. With a ``fasta_type``,
        the coverage parsed from each header is accumulated in the matching
        ``CovLibObj`` and attached to the blob. Duplicate names raise error
        '5'; an empty assembly raises error '1'.
        """
        print(BtLog.status_d['1'] % ('FASTA', fasta_f))
        self.assembly_f = abspath(fasta_f)
        has_header_cov = bool(fasta_type)
        if has_header_cov:
            # Set up CovLibObj for coverage in assembly header
            self.covLibs[fasta_type] = CovLibObj(fasta_type, fasta_type,
                                                 fasta_f)

        for record_name, record_seq in BtIO.readFasta(fasta_f):
            record = BlObj(record_name, record_seq)
            if record.name in self.dict_of_blobs:
                BtLog.error('5', record.name)
            else:
                self.seqs += 1
                self.length += record.length
                self.n_count += record.n_count

                if has_header_cov:
                    cov_value = BtIO.parseCovFromHeader(fasta_type,
                                                        record.name)
                    self.covLibs[fasta_type].cov_sum += cov_value
                    record.addCov(fasta_type, cov_value)

                self.order_of_blobs.append(record.name)
                self.dict_of_blobs[record.name] = record

        if self.seqs == 0 or self.length == 0:
            BtLog.error('1')
コード例 #9
0
ファイル: BtIO.py プロジェクト: mc-assemblage/blobtools
def readTax(infile, set_of_blobs):
    '''
    Yield one dict with the keys 'name', 'taxId' and 'score' for every
    line of infile that matches hit_line_re. Names missing from
    set_of_blobs are reported with error '19'.

    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to the yielded dict
    '''
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
    with open(infile) as fh:
        for line in fh:
            hit = hit_line_re.search(line)
            if not hit:
                continue
            # taxId stays a string because if int, conversion is a
            # nightmare ...
            seq_name, tax_id, raw_score = hit.groups()
            record = {
                'name': seq_name,
                'taxId': tax_id,
                'score': float(raw_score)
            }
            if seq_name not in set_of_blobs:
                BtLog.error('19', seq_name, infile)
            if tax_id == 'N/A':
                BtLog.error('22', infile)
            yield record
コード例 #10
0
def parseList(infile):
    """Return the lines of ``infile`` as a list, without trailing newlines.

    A missing file is reported through ``BtLog.error('0', ...)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return [line.rstrip("\n") for line in fh]
コード例 #11
0
def parseSet(infile):
    """Return the distinct lines of ``infile`` as a set.

    Trailing newlines and leading ``>`` characters are removed from every
    line. A missing file is reported through ``BtLog.error('0', ...)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        return {line.rstrip("\n").lstrip(">") for line in fh}
コード例 #12
0
def readYaml(infile):
    """Parse ``infile`` as YAML and return the resulting object.

    A missing file is reported through ``BtLog.error('0', ...)`` and a
    YAML syntax error through ``BtLog.error('37', ...)``.

    Returns:
        The parsed document, or ``None`` if parsing failed and
        ``BtLog.error`` returned.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        yaml_text = fh.read()
    data = None
    try:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects; consider yaml.safe_load if the file can
        # come from an untrusted source.
        data = yaml.load(yaml_text)
    except yaml.YAMLError:
        BtLog.error('37', infile, "yaml")
    # The original fell off the end here, so callers always received None.
    return data
コード例 #13
0
ファイル: BtPlot.py プロジェクト: evolgenomology/blobtools
def parseCatColour(catcolour_f):
    """Read a two-column, comma-separated file into a dict.

    Each line must have the form ``sequence_name,category``. A line that
    does not split into exactly two fields is reported through
    ``BtLog.error('23', ...)``.

    Returns:
        Dict mapping sequence name to category.
    """
    catcolour_dict = {}
    with open(catcolour_f) as fh:
        for line in fh:
            try:
                seq_name, category = line.rstrip("\n").split(",")
            except ValueError:
                # Only a wrong field count is a format error; the original
                # bare except also trapped KeyboardInterrupt/SystemExit.
                BtLog.error('23', catcolour_f)
            else:
                catcolour_dict[seq_name] = category
    return catcolour_dict
コード例 #14
0
ファイル: BtPlot.py プロジェクト: greatfireball/blobtools
def parseCatColour(catcolour_f):
    """Build a ``{sequence_name: category}`` dict from a CSV-like file.

    Every line is expected to contain exactly one comma. Lines with a
    different number of fields are reported through
    ``BtLog.error('23', ...)``.
    """
    catcolour_dict = {}
    with open(catcolour_f) as fh:
        for row in fh:
            try:
                seq_name, category = row.rstrip("\n").split(",")
            except ValueError:
                # Unpacking fails with ValueError on a malformed row; other
                # exceptions (e.g. KeyboardInterrupt) are no longer
                # swallowed as they were by the bare except.
                BtLog.error('23', catcolour_f)
            else:
                catcolour_dict[seq_name] = category
    return catcolour_dict
コード例 #15
0
def parseColours(infile):
    """Map the first comma-separated field of each line to the second.

    Returns an empty dict when ``infile`` is falsy. A missing file is
    reported through ``BtLog.error('0', ...)``.
    """
    colour_map = {}
    if not infile:
        return colour_map
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        for line in fh:
            fields = line.rstrip("\n").split(",")
            colour_map[fields[0]] = fields[1]
    return colour_map
コード例 #16
0
def parseBam(infile, set_of_blobs, no_base_cov_flag):
    '''
    Count reads and aligned bases per contig from "samtools view" output.

    checkBam returns reads_total and reads_mapped.
    While parsing, base_cov_dict holds a list of per-read aligned base
    counts for each contig (the original author chose list appending for
    speed); the lists are summed before returning.

    infile:            path to the BAM file
    set_of_blobs:      contig names to collect coverage for
    no_base_cov_flag:  when true only reads are counted and every base
                       coverage is reported as 0

    Returns (base_cov_dict, reads_total, reads_mapped, read_cov_dict).
    reads_mapped is replaced by the number of reads actually seen when
    the two disagree.
    '''
    if not isfile(infile):
        BtLog.error('0', infile)
    reads_total, reads_mapped = checkBam(infile)
    progress_unit = int(reads_mapped / 1000)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    # Digits before M, X or =. The original pattern r"(\d+)M|X|=" parsed as
    # three alternatives and captured an empty string for X and =, so
    # int('') failed and such reads were dropped with a misleading warning.
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    # execute samtools to get only mapped reads (no optical duplicates,
    # no secondary alignments)
    command = blobtools.SAMTOOLS + " view -F 1024 -F 4 -F 256 " + infile
    seen_reads = 0
    for line in runCmd(command=command):
        seen_reads += 1
        fields = line.split()
        seq_name = fields[2]
        if seq_name in read_cov_dict:
            if not no_base_cov_flag:
                base_cov_dict[seq_name].append(
                    sum(int(length)
                        for length in cigar_match_re.findall(fields[5])))
            read_cov_dict[seq_name] += 1
        else:
            # Read maps to a contig that is not part of set_of_blobs.
            print(BtLog.warn_d['2'] % (seq_name))
        BtLog.progress(seen_reads, progress_unit, reads_mapped)
    if not int(reads_mapped) == int(seen_reads):
        print(BtLog.warn_d['3'] % (reads_mapped, seen_reads))
        reads_mapped = seen_reads
    base_cov_dict = {
        seq_name: sum(base_covs)
        for seq_name, base_covs in base_cov_dict.items()
    }
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
コード例 #17
0
def parseCov(infile, set_of_blobs):
    """Parse a coverage file in either of its two layouts.

    Until a line starting with ``#`` is seen, lines are read as
    ``name<TAB>base_cov``. From the first ``#`` line onwards, ``## Total
    Reads``, ``## Mapped Reads`` and ``## Unmapped Reads`` header lines
    supply the read totals and data lines are read as
    ``name<TAB>read_cov<TAB>base_cov``. Names missing from ``set_of_blobs``
    only trigger a warning.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, reads_unmapped,
        read_cov_dict)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    legacy_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)")
    current_line_re = re.compile(r"^(\S+)\t(\d+\.*\d*)\t(\d+\.*\d*)")
    base_cov_dict, read_cov_dict = {}, {}
    reads_total = reads_mapped = reads_unmapped = 0
    seqs_parsed = 0
    progress_unit = 1
    header_seen = False
    with open(infile) as fh:
        for line in fh:
            if line.startswith("#"):
                header_seen = True
                if line.startswith("## Total Reads"):
                    reads_total = int(line.split(" = ")[1])
                elif line.startswith("## Mapped Reads"):
                    reads_mapped = int(line.split(" = ")[1])
                elif line.startswith("## Unmapped Reads"):
                    reads_unmapped = int(line.split(" = ")[1])
            elif header_seen:
                hit = current_line_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name = hit.group(1)
                    read_cov = int(hit.group(2))
                    base_cov = float(hit.group(3))
                    if name in set_of_blobs:
                        read_cov_dict[name] = read_cov
                        base_cov_dict[name] = base_cov
                    else:
                        print(BtLog.warn_d['2'] % (name))
            else:
                hit = legacy_line_re.search(line)
                if hit:
                    seqs_parsed += 1
                    name = hit.group(1)
                    base_cov = float(hit.group(2))
                    if name in set_of_blobs:
                        base_cov_dict[name] = base_cov
                    else:
                        print(BtLog.warn_d['2'] % (name))
            BtLog.progress(seqs_parsed, progress_unit, len(set_of_blobs))
    return base_cov_dict, reads_total, reads_mapped, reads_unmapped, read_cov_dict
コード例 #18
0
def parseDict(infile, key, value):
    """Build a dict from two whitespace-separated columns of ``infile``.

    Args:
        infile: Path to the file; a falsy value yields an empty dict.
        key: Zero-based column index (int or numeric string) of the keys.
        value: Zero-based column index (int or numeric string) of the values.
    """
    mapping = {}
    if not infile:
        return mapping
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        key_col, value_col = int(key), int(value)
        for line in fh:
            columns = line.rstrip("\n").split()
            mapping[columns[key_col]] = columns[value_col]
    return mapping
コード例 #19
0
def parseCovFile(cov_f):
    """Read ``name<TAB>coverage`` lines into a dict of floats.

    Coverages below 0.02 are raised to 0.02. A line that does not split
    into exactly two tab-separated fields, or whose coverage is not a
    number, is reported through ``BtLog.error('25', ...)``.
    """
    cov_dict = {}
    with open(cov_f) as fh:
        for line in fh:
            try:
                seq_name, cov_text = line.rstrip("\n").split("\t")
                # Convert once; the original called float() up to three
                # times per line.
                cov = float(cov_text)
            except ValueError:
                # Wrong field count or non-numeric coverage. The original
                # bare except also trapped KeyboardInterrupt/SystemExit.
                BtLog.error('25', cov_f)
            else:
                cov_dict[seq_name] = 0.02 if cov < 0.02 else cov
    return cov_dict
コード例 #20
0
def parseCatColour(infile):
    """Read ``sequence_name,category`` lines of ``infile`` into a dict.

    Returns an empty dict when ``infile`` is falsy. A missing file is
    reported through ``BtLog.error('0', ...)`` and a line that does not
    split into exactly two fields through ``BtLog.error('23', ...)``.
    """
    catcolour_dict = {}
    if not infile:
        return catcolour_dict
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        for line in fh:
            try:
                seq_name, category = line.rstrip("\n").split(",")
            except ValueError:
                # Only the unpacking can fail here; the original bare
                # except also trapped KeyboardInterrupt/SystemExit.
                BtLog.error('23', infile)
            else:
                catcolour_dict[seq_name] = category
    return catcolour_dict
コード例 #21
0
ファイル: BtPlot.py プロジェクト: BioInfoTools/blobtools
def parseCovFile(cov_f):
    """Return ``{sequence_name: coverage}`` parsed from a two-column TSV.

    Values lower than 0.02 are stored as 0.02. Malformed lines (wrong
    number of tab-separated fields or a non-numeric coverage) are reported
    through ``BtLog.error('25', ...)``.
    """
    cov_dict = {}
    with open(cov_f) as fh:
        for row in fh:
            try:
                seq_name, raw_cov = row.rstrip("\n").split("\t")
                # Single conversion instead of repeated float() calls.
                value = float(raw_cov)
            except ValueError:
                # Narrowed from a bare except so that interrupts and
                # unrelated errors are not misreported as format errors.
                BtLog.error('25', cov_f)
            else:
                cov_dict[seq_name] = 0.02 if value < 0.02 else value
    return cov_dict
コード例 #22
0
ファイル: BtPlot.py プロジェクト: evolgenomology/blobtools
def parseRefCov(refcov_f):
    """Read ``cov_lib,reads_total,reads_mapped`` lines into a nested dict.

    Returns:
        Dict mapping each coverage library name to
        ``{'reads_total': int, 'reads_mapped': int}``.

    A line without exactly three comma-separated fields, or with
    non-integer counts, is reported through ``BtLog.error('21', ...)``.
    """
    refcov_dict = {}
    with open(refcov_f) as fh:
        for line in fh:
            try:
                cov_lib, total_text, mapped_text = line.split(",")
                # int() tolerates the trailing newline left on the last
                # field.
                counts = {
                    'reads_total': int(total_text),
                    'reads_mapped': int(mapped_text)
                }
            except ValueError:
                # Wrong field count or non-integer value. The original
                # bare except also trapped KeyboardInterrupt/SystemExit.
                BtLog.error('21', refcov_f)
            else:
                refcov_dict[cov_lib] = counts
    return refcov_dict
コード例 #23
0
ファイル: BtPlot.py プロジェクト: greatfireball/blobtools
def parseRefCov(refcov_f):
    """Parse reference read counts per coverage library.

    Each line of ``refcov_f`` must read
    ``cov_lib,reads_total,reads_mapped``; the result maps ``cov_lib`` to a
    dict with the integer values under ``'reads_total'`` and
    ``'reads_mapped'``. Malformed lines are reported through
    ``BtLog.error('21', ...)``.
    """
    refcov_dict = {}
    with open(refcov_f) as fh:
        for row in fh:
            try:
                cov_lib, reads_total_ref, reads_mapped_ref = row.split(",")
                entry = {
                    'reads_total': int(reads_total_ref),
                    'reads_mapped': int(reads_mapped_ref)
                }
            except ValueError:
                # Both a wrong number of fields and a bad integer raise
                # ValueError; nothing else is caught any more.
                BtLog.error('21', refcov_f)
            else:
                refcov_dict[cov_lib] = entry
    return refcov_dict
コード例 #24
0
ファイル: BtPlot.py プロジェクト: zhou-ran/blobtools
    def plotBar(self, cov_lib, out_f):
        """Draw the read-coverage bar chart for ``cov_lib`` and save it.

        The main axes show the unmapped and mapped fraction of reads for
        the assembly and, when ``self.refcov_dict`` is set, for the
        reference. The group axes show ``reads_mapped_perc`` for every
        group in ``self.plot_order``. The figure is written to
        ``<out_f>.read_cov.<cov_lib>.<self.format>``.
        """
        fig, ax_main, ax_group, x_pos_main, x_pos_group = self.setupPlot('readcov')
        reads_total = self.cov_libs_total_reads_dict[cov_lib]
        reads_mapped = self.stats['all']['reads_mapped'][cov_lib]
        reads_unmapped = reads_total - reads_mapped
        main_bars = [
            ('Unmapped (assembly)', reads_unmapped/reads_total),
            ('Mapped (assembly)', reads_mapped/reads_total),
        ]
        if (self.refcov_dict):
            if cov_lib in self.refcov_dict:
                reads_total_ref = self.refcov_dict[cov_lib]['reads_total']
                reads_mapped_ref = self.refcov_dict[cov_lib]['reads_mapped']
                reads_unmapped_ref = reads_total_ref - reads_mapped_ref
                main_bars.append(('Unmapped (ref)', reads_unmapped_ref/reads_total_ref))
                main_bars.append(('Mapped (ref)', reads_mapped_ref/reads_total_ref))
            else:
                BtLog.error('40', cov_lib)
        main_labels = [label for label, _ in main_bars]
        main_values = [value for _, value in main_bars]
        main_colours = [DGREY] * len(main_bars)

        # mapped plotted groups
        group_labels = list(self.plot_order)
        group_values = [self.stats[group]['reads_mapped_perc'][cov_lib] for group in self.plot_order]
        group_colours = [self.colours[group] for group in self.plot_order]

        # Same drawing code for both axes (group first, as before): bars,
        # then a percentage label just above each bar.
        panels = (
            (ax_group, x_pos_group, group_labels, group_values, group_colours),
            (ax_main, x_pos_main, main_labels, main_values, main_colours),
        )
        for ax, x_pos, labels, values, colours in panels:
            rects = ax.bar(x_pos, values, width = 0.5, tick_label=labels, align='center', color = colours)
            for rect in rects:
                height = float(rect.get_height())
                ax.text(rect.get_x() + rect.get_width()/2., 0.005 + height, '{:.2f}%'.format(height*100), ha='center', va='bottom', fontsize=LEGEND_FONTSIZE)

        ax_main.set_xticklabels(main_labels, rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        ax_group.set_xticklabels(group_labels, rotation=45, ha='center', fontsize=LEGEND_FONTSIZE)
        out_f = "%s.read_cov.%s" % (out_f, cov_lib)
        out_path = "%s.%s" % (out_f, self.format)
        # Format the path first, then the status message. The original
        # chained `msg % "%s.%s" % (...)`, which only worked because '%' is
        # left-associative and the message ended up carrying the template.
        print(BtLog.status_d['8'] % out_path)
        fig.tight_layout()
        fig.savefig(out_path, format=self.format)
        plt.close(fig)
コード例 #25
0
def readFasta(infile):
    """Yield ``(header, sequence)`` for every record of a FASTA file.

    The header is the text after ``>`` up to the first whitespace; the
    sequence is the concatenation of the record's lines, upper-cased.
    Lines before the first header are ignored and a file without any
    header yields nothing. A missing file is reported through
    ``BtLog.error('0', ...)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        header, seqs = '', []
        for line in fh:
            if line.startswith('>'):
                if header:
                    yield header, ''.join(seqs).upper()
                # Header is split at first whitespace. split() already
                # discards the line ending, so nothing is sliced off (the
                # original's [1:-1] cut a character from a header on a
                # final line without newline).
                header, seqs = line[1:].split()[0], []
            else:
                # Strip only line terminators; slicing with [:-1] dropped
                # the last base when the file did not end with a newline.
                seqs.append(line.rstrip("\r\n"))
        if header:
            yield header, ''.join(seqs).upper()
コード例 #26
0
ファイル: BtIO.py プロジェクト: hyphaltip/blobtools
def checkBam(infile):
    """Return ``(reads_total, reads_mapped)`` parsed from ``samtools flagstat``.

    A missing ``samtools`` executable is reported through
    ``BtLog.error('7')``. The counts and the mapped fraction are also
    printed with status message '11'.
    """
    print(BtLog.status_d['10'])
    if not which('samtools'):
        BtLog.error('7')
    mapped_pattern = re.compile(r"(\d+)\s\+\s\d+\smapped")
    total_pattern = re.compile(r"(\d+)\s\+\s\d+\sin total")
    flagstat_output = ''.join(runCmd("samtools flagstat " + infile))
    reads_mapped = int(mapped_pattern.search(flagstat_output).group(1))
    reads_total = int(total_pattern.search(flagstat_output).group(1))
    summary = ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total),
               '{0:.1%}'.format(reads_mapped/reads_total))
    print(BtLog.status_d['11'] % summary)
    return reads_total, reads_mapped
コード例 #27
0
ファイル: BtPlot.py プロジェクト: greatfireball/blobtools
def parse_labels(labels):
    """Convert ``name=group1,group2`` strings into a ``{group: name}`` dict.

    Args:
        labels: Iterable of strings, each holding exactly one ``=``; a
            falsy value yields an empty dict.

    A label that does not split into exactly two parts at ``=`` (or a
    ``labels`` value that cannot be iterated) is reported through
    ``BtLog.error('17', ...)``.
    """
    label_d = {}
    if labels:
        try:
            for label in labels:
                name, groups = str(label).split("=")
                # split(",") returns the whole string when there is no
                # comma, so no separate single-group branch is needed.
                for group in groups.split(","):
                    label_d[group] = name
        except (TypeError, ValueError):
            # Narrowed from a bare except, which also trapped
            # KeyboardInterrupt/SystemExit.
            BtLog.error('17', labels)
    return label_d
コード例 #28
0
def parseCmdLabels(labels):
    """Map every group to its label name.

    Each element of ``labels`` has the form ``name=group`` or
    ``name=group1,group2,...``. Returns an empty dict when ``labels`` is
    falsy. Elements without exactly one ``=`` (or a non-iterable
    ``labels``) are reported through ``BtLog.error('17', ...)``.
    """
    label_d = {}
    if not labels:
        return label_d
    try:
        for label in labels:
            name, groups = str(label).split("=")
            # A comma-free value splits into a single-element list, so the
            # original's two branches collapse into one loop.
            for group in groups.split(","):
                label_d[group] = name
    except (TypeError, ValueError):
        # The original bare except also swallowed KeyboardInterrupt and
        # SystemExit.
        BtLog.error('17', labels)
    return label_d
コード例 #29
0
def main():
    """Parse the command line and hand the BAM file to parseBamForFilter.

    ``--include`` and ``--exclude`` are mutually exclusive (error '43');
    whichever is given is read with ``BtIO.parseList`` and passed on as
    the include or exclude list respectively.
    """
    args = docopt(__doc__)
    bam_f = args['--bam']
    include_f = args['--include']
    exclude_f = args['--exclude']
    out_prefix = args['--out']
    read_format = args['--read_format']
    if read_format not in set(['fq', 'fa']):
        sys.exit("[X] Read format must be fq or fa!")
    noninterleaved = args['--noninterleaved']
    include_unmapped = not args['--exclude_unmapped']
    out_f = BtIO.getOutFile(bam_f, out_prefix, None)
    if include_f and exclude_f:
        print(BtLog.error('43'))
        return
    include_list, exclude_list = None, None
    if include_f:
        include_list = BtIO.parseList(include_f)
    elif exclude_f:
        exclude_list = BtIO.parseList(exclude_f)
    BtIO.parseBamForFilter(bam_f, include_unmapped, noninterleaved, out_f,
                           include_list, exclude_list, read_format)
コード例 #30
0
def parseReferenceCov(infile):
    """Read ``cov_lib,reads_total,reads_mapped`` lines into a nested dict.

    Returns an empty dict when ``infile`` is falsy; otherwise a dict
    mapping each coverage library to
    ``{'reads_total': int, 'reads_mapped': int}``. A missing file is
    reported through ``BtLog.error('0', ...)`` and a malformed line
    through ``BtLog.error('21', ...)``.
    """
    refcov_dict = {}
    if not infile:
        return refcov_dict
    if not isfile(infile):
        BtLog.error('0', infile)
    with open(infile) as fh:
        for line in fh:
            try:
                cov_lib, total_text, mapped_text = line.split(",")
                counts = {
                    'reads_total': int(total_text),
                    'reads_mapped': int(mapped_text)
                }
            except ValueError:
                # Wrong field count or non-integer value. The original
                # bare except also trapped KeyboardInterrupt/SystemExit.
                BtLog.error('21', infile)
            else:
                refcov_dict[cov_lib] = counts
    return refcov_dict
コード例 #31
0
def parseSam(infile, set_of_blobs, no_base_cov_flag):
    """Count reads and aligned bases per contig from a SAM file.

    Lines starting with ``@`` are skipped. Every other line counts as a
    read; it counts as mapped when its third field is not ``*``. Mapped
    reads whose contig is not in ``set_of_blobs`` only trigger a warning.

    Args:
        infile: Path to the SAM file.
        set_of_blobs: Contig names to collect coverage for.
        no_base_cov_flag: When true, only reads are counted and every base
            coverage is reported as 0.

    Returns:
        Tuple ``(base_cov_dict, reads_total, reads_mapped, read_cov_dict)``.
    """
    if not isfile(infile):
        BtLog.error('0', infile)
    base_cov_dict = {blob: [] for blob in set_of_blobs}
    read_cov_dict = {blob: 0 for blob in set_of_blobs}
    # Digits before M, X or =. The original pattern r"(\d+)M|X|=" parsed as
    # three alternatives and captured an empty string for X and =, so
    # int('') failed and such reads were dropped with a misleading warning.
    cigar_match_re = re.compile(r"(\d+)[MX=]")
    reads_total = 0
    reads_mapped = 0
    with open(infile) as fh:
        for line in fh:
            if line.startswith("@"):
                continue
            reads_total += 1
            fields = line.split()
            seq_name = fields[2]
            if seq_name == '*':
                continue
            reads_mapped += 1
            if seq_name not in read_cov_dict:
                print(BtLog.warn_d['2'] % (seq_name))
                continue
            if not no_base_cov_flag:
                base_cov_dict[seq_name].append(
                    sum(int(length)
                        for length in cigar_match_re.findall(fields[5])))
            read_cov_dict[seq_name] += 1
    base_cov_dict = {
        seq_name: sum(base_covs)
        for seq_name, base_covs in base_cov_dict.items()
    }
    return base_cov_dict, reads_total, reads_mapped, read_cov_dict
コード例 #32
0
def main():
    """Build coverage libraries from the command line and parse coverage.

    One ``CovLibObj`` is created per ``--bam`` and per ``--cas`` file (BAM
    first); having none is reported with error '31'. The FASTA given by
    ``--infile`` is loaded into a 'cov' BlobDb before the coverage is
    parsed.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    prefix = args['--output']
    estimate_cov_flag = not args['--calculate_cov']

    # Make covLibs
    cov_libs = []
    for lib_type, lib_files in (('bam', bam_fs), ('cas', cas_fs)):
        for idx, lib_f in enumerate(lib_files):
            cov_libs.append(
                BtCore.CovLibObj(lib_type + str(idx), lib_type, lib_f))
    if not cov_libs:
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = interface.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, estimate_cov=estimate_cov_flag, prefix=prefix)
コード例 #33
0
ファイル: BtIO.py プロジェクト: hyphaltip/blobtools
def checkCas(infile):
    """Return ``(seqs_total, reads_total, reads_mapped)`` for a CAS file.

    The numbers are parsed from the output of ``clc_mapping_info -s``. A
    missing ``clc_mapping_info`` executable is reported through
    ``BtLog.error('20')``.
    """
    print(BtLog.status_d['12'])
    if not which('clc_mapping_info'):
        BtLog.error('20')
    contigs_pattern = re.compile(r"\s+Contigs\s+(\d+)")
    reads_pattern = re.compile(r"\s+Reads\s+(\d+)")
    mapped_pattern = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    report = ''.join(runCmd("clc_mapping_info -s " + infile))
    seqs_total = int(contigs_pattern.search(report).group(1))
    reads_mapped = int(mapped_pattern.search(report).group(1))
    reads_total = int(reads_pattern.search(report).group(1))
    summary = ('{:,}'.format(reads_mapped), '{:,}'.format(reads_total),
               '{0:.1%}'.format(reads_mapped/reads_total))
    print(BtLog.status_d['11'] % summary)
    return seqs_total, reads_total, reads_mapped
コード例 #34
0
ファイル: BtIO.py プロジェクト: mc-assemblage/blobtools
def checkBam(infile):
    """Extract read counts from the output of ``samtools flagstat``.

    Reports a missing ``samtools`` executable through ``BtLog.error('7')``
    and prints the mapped/total counts with status message '11'.

    Returns:
        Tuple ``(reads_total, reads_mapped)``.
    """
    print(BtLog.status_d['10'])
    if not which('samtools'):
        BtLog.error('7')
    patterns = {
        'mapped': re.compile(r"(\d+)\s\+\s\d+\smapped"),
        'total': re.compile(r"(\d+)\s\+\s\d+\sin total"),
    }
    flagstat_lines = [line for line in runCmd("samtools flagstat " + infile)]
    output = ''.join(flagstat_lines)
    reads_mapped = int(patterns['mapped'].search(output).group(1))
    reads_total = int(patterns['total'].search(output).group(1))
    mapped_fraction = '{0:.1%}'.format(reads_mapped / reads_total)
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  mapped_fraction))
    return reads_total, reads_mapped
コード例 #35
0
def main():
    """Build coverage libraries from the command line and parse coverage.

    One ``CovLibObj`` is created per ``--bam``, ``--sam`` and ``--cas``
    file, in that order; having none is reported with error '31'. The
    FASTA given by ``--infile`` is loaded into a 'cov' BlobDb before the
    coverage is parsed.
    """
    args = docopt(__doc__)
    fasta_f = args['--infile']
    bam_fs = args['--bam']
    cas_fs = args['--cas']
    sam_fs = args['--sam']
    prefix = args['--output']
    no_base_cov_flag = args['--no_base_cov']

    # Make covLibs
    cov_libs = []
    for lib_type, lib_files in (('bam', bam_fs), ('sam', sam_fs), ('cas', cas_fs)):
        for idx, lib_f in enumerate(lib_files):
            cov_libs.append(
                BtCore.CovLibObj(lib_type + str(idx), lib_type, lib_f))
    if not cov_libs:
        BtLog.error('31')
    blobDb = BtCore.BlobDb('cov')
    blobDb.version = blobtools.__version__
    blobDb.parseFasta(fasta_f, None)
    blobDb.parseCoverage(covLibObjs=cov_libs, no_base_cov=no_base_cov_flag, prefix=prefix)
コード例 #36
0
ファイル: BtIO.py プロジェクト: mc-assemblage/blobtools
def checkCas(infile):
    '''
    Read contig and read counts of a CAS file via `clc_mapping_info -s`.

    Prints BtLog.status_d['12'] before running and BtLog.status_d['11']
    (mapped reads, total reads, mapping rate) afterwards. Calls
    BtLog.error('20') when which('clc_mapping_info') finds nothing.

    Accepts:
        - infile : path of the CAS file (appended verbatim to the command)
    Returns:
        - seqs_total   : int captured from the "Contigs" line
        - reads_total  : int captured from the "Reads" line
        - reads_mapped : int captured from the "Mapped reads" line
    Raises:
        - AttributeError if the output contains no matching line
          (the regex search returns None)
    '''
    print(BtLog.status_d['12'])
    if not which('clc_mapping_info'):
        BtLog.error('20')
    seqs_total_re = re.compile(r"\s+Contigs\s+(\d+)")
    reads_total_re = re.compile(r"\s+Reads\s+(\d+)")
    reads_mapping_re = re.compile(r"\s+Mapped reads\s+(\d+)\s+(\d+.\d+)\s+\%")
    command = "clc_mapping_info -s " + infile
    # The patterns are searched against the whole output at once.
    output = ''.join(runCmd(command))
    seqs_total = int(seqs_total_re.search(output).group(1))
    reads_mapped = int(reads_mapping_re.search(output).group(1))
    reads_total = int(reads_total_re.search(output).group(1))
    # Force true division: with two ints Python 2 floor-divides, which made
    # the reported rate 0.0% for anything short of a fully mapped library.
    # A mapping with 0 reads is reported as 0.0% instead of crashing.
    if reads_total:
        mapping_rate = float(reads_mapped) / reads_total
    else:
        mapping_rate = 0.0
    print(BtLog.status_d['11'] % ('{:,}'.format(reads_mapped),
                                  '{:,}'.format(reads_total),
                                  '{0:.1%}'.format(mapping_rate)))
    return seqs_total, reads_total, reads_mapped
コード例 #37
0
    def mapping():
        '''
        Attach TaxIDs to the hits of a similarity search and write them out.

        The hit file is `megablast_output`, a name taken from the enclosing
        scope. Every subject ID in it is looked up in the dictionary that
        BtIO.parseDict builds from the hard-coded mapping file; subjects
        without an entry get the TaxID "N/A" and a warning is printed.

        One line "qseqid<TAB>taxid<TAB>score<TAB>sseqid" is produced per hit
        line and written to the path returned by
        BtIO.getOutFile("taxified", hit_f, "out"). Nothing is written when
        the hit file has no lines.
        '''
        out_f, taxid_d = None, {}
        hit_f = megablast_output  # hit file: BLAST similarity search result (TSV format)
        map_f = "/home/nancy/assembly_app/blobtools/blobtools-master/taxon_n"  # mapping file (TSV format), in which one column lists a sequence ID (of a subject) and another the NCBI TaxID
        map_col_sseqid = "0"  # column of mapping file containing sequence IDs (of the subject)
        map_col_taxid = "2"  # column of mapping file containing the TaxID of the subject
        hit_col_qseqid = "0"  # column of the hit file containing query ID
        hit_col_sseqid = "1"  # column of the hit file containing subject ID
        hit_col_score = "11"  # column of the hit file containing (bit)score

        try:
            hit_col_qseqid = int(hit_col_qseqid)
            hit_col_sseqid = int(hit_col_sseqid)
            hit_col_score = int(hit_col_score)
        except ValueError:
            # Pass the option names as a separate argument, the way the other
            # BtLog.error calls do. Formatting the key itself ('41' % ...)
            # raised TypeError because '41' holds no conversion specifier.
            # NOTE(review): assumes message '41' takes one %s - confirm in BtLog.
            BtLog.error(
                '41',
                "--hit_column_qseqid, --hit_column_sseqid and --hit_column_score")

        if map_f:
            if map_col_sseqid and map_col_taxid:
                try:
                    map_col_sseqid = int(map_col_sseqid)
                    map_col_taxid = int(map_col_taxid)
                except ValueError:
                    BtLog.error('44')
                print(BtLog.status_d['1'] % ("Mapping file", map_f))
                taxid_d = BtIO.parseDict(map_f, map_col_sseqid, map_col_taxid)
                out_f = BtIO.getOutFile("taxified", hit_f, "out")
            else:
                BtLog.error('44')
        else:
            BtLog.error('41')

        output = []
        print(BtLog.status_d['1'] % ("similarity search result", hit_f))
        with open(hit_f) as fh:
            for line in fh:
                # split() without an argument breaks on any whitespace run
                col = line.rstrip("\n").split()
                qseqid = col[hit_col_qseqid]
                sseqid = col[hit_col_sseqid]
                score = col[hit_col_score]
                if sseqid not in taxid_d:
                    # The warning used to be formatted and then thrown away;
                    # print it so unmapped subjects are visible to the user.
                    print(BtLog.warn_d['12'] % (sseqid, map_f))
                tax_id = taxid_d.get(sseqid, "N/A")
                output.append("%s\t%s\t%s\t%s" %
                              (qseqid, tax_id, score, sseqid))
        if output:
            with open(out_f, "w") as fh:
                print(BtLog.status_d['24'] % out_f)
                fh.write("\n".join(output) + "\n")
コード例 #38
0
ファイル: BtIO.py プロジェクト: hyphaltip/blobtools
def readTax(infile, set_of_blobs):
    '''
    Generator over the hits of a taxonomy/hit file.

    A line is recognised when it starts with a name, followed by a numeric
    TaxID (optionally trailed by further ';'-joined IDs, which are dropped)
    and a numeric score. Lines that do not fit this shape are skipped.

    Accepts:
        - infile       : path of the hit file
        - set_of_blobs : container of known sequence names
    Yields:
        - dict with 'name' (str), 'taxId' (str, kept as text on purpose)
          and 'score' (float)

    BtLog.error('19', ...) is called for a name missing from set_of_blobs.
    The 'N/A' check is kept, although the pattern only captures digits for
    the TaxID, so it cannot trigger as long as the pattern is unchanged.

    If more fields need to be parsed:
        - change hit_line_re
        - catch matches in variables
        - add as key-value pairs to the yielded dict
    '''
    hit_line_re = re.compile(r"^(\S+)\s+(\d+)[\;?\d+]*\s+(\d+\.*\d*)")
    with open(infile) as handle:
        for record in handle:
            parsed = hit_line_re.search(record)
            if parsed is None:
                continue
            name, tax_id, score = parsed.groups()
            hit = {'name': name, 'taxId': tax_id, 'score': float(score)}
            if name not in set_of_blobs:
                BtLog.error('19', name, infile)
            if tax_id == 'N/A':
                BtLog.error('22', infile)
            yield hit
コード例 #39
0
ファイル: BtPlot.py プロジェクト: evolgenomology/blobtools
def parseCovFile(cov_f):
    '''
    Parse a tab-separated coverage file into {sequence name: coverage}.

    Two layouts are understood. Until a line starting with "#" has been
    seen, coverage is taken from the second column (old format); from the
    first "#" line onwards it is taken from the third column. "#" lines
    themselves carry no data. Coverage values below 0.02 are raised to 0.02.

    Accepts:
        - cov_f : path of the coverage file
    Returns:
        - cov_dict : dict mapping sequence name (str) to coverage (float)

    BtLog.error('25', cov_f) is called for a data line that has too few
    columns or a non-numeric coverage value.
    '''
    min_cov = 0.02
    cov_col = 1  # old format; switched to 2 once a "#" header shows up
    cov_dict = {}
    with open(cov_f) as fh:
        for line in fh:
            if line.startswith("#"):
                cov_col = 2
                continue
            try:
                field = line.rstrip("\n").split("\t")
                seq_name = field[0]
                cov = float(field[cov_col])
            except (IndexError, ValueError):
                # Only malformed lines are reported; a bare except here would
                # also trap KeyboardInterrupt and SystemExit.
                BtLog.error('25', cov_f)
            else:
                cov_dict[seq_name] = max(cov, min_cov)
    return cov_dict
コード例 #40
0
ファイル: BtIO.py プロジェクト: hyphaltip/blobtools
def getNodesDB(**kwargs):
    '''
    Build or load the taxonomy lookup ("nodesDB").

    Keyword arguments read: 'names', 'nodes', 'nodesDB'.

    - If both 'names' and 'nodes' are given, the two tab-split dump files
      are parsed: every line of the nodes file contributes an entry
      {id: {'parent': ..., 'rank': ...}} and every "scientific name" line of
      the names file adds a 'name' to the entry with the same id. The number
      of lines read from the nodes file is stored under 'nodes_count'.
    - Otherwise, if 'nodesDB' is given, it is loaded with readNodesDB().
    - Otherwise BtLog.error('3') is called.

    Returns:
        - nodesDB   : dict as described above (empty if nothing was loaded)
        - nodesDB_f : the 'nodesDB' keyword value ('' if nothing was loaded)
    '''
    if kwargs['names'] and kwargs['nodes']:
        print(BtLog.status_d['3'] % (kwargs['nodes'], kwargs['names']))
        taxonomy = {}
        nodes_count = 0
        with open(kwargs['nodes']) as nodes_fh:
            # columns 1 and 3 are skipped: every other field is a separator
            for nodes_count, row in enumerate(nodes_fh, 1):
                cols = row.split("\t")
                taxonomy[cols[0]] = {'parent': cols[2], 'rank': cols[4]}
        with open(kwargs['names']) as names_fh:
            for row in names_fh:
                cols = row.split("\t")
                if cols[6] != "scientific name":
                    continue
                taxonomy[cols[0]]['name'] = cols[2]
        db_path = kwargs['nodesDB']
        taxonomy['nodes_count'] = nodes_count
        return taxonomy, db_path

    if kwargs['nodesDB']:
        print(BtLog.status_d['4'] % (kwargs['nodesDB']))
        taxonomy = readNodesDB(kwargs['nodesDB'])
        return taxonomy, kwargs['nodesDB']

    BtLog.error('3')
    return {}, ''
コード例 #41
0
ファイル: sumcov.py プロジェクト: evolgenomology/blobtools
    for name in readFasta(infile):
        fasta_order.append(name)
        fasta_dict[name] = 0.0
    return fasta_dict, fasta_order

if __name__ == '__main__':
    # Script entry: sum the per-library coverage of every assembly sequence
    # and print one "name<TAB>total coverage" line per sequence, in the order
    # the sequences were read from the assembly.
    main_dir = dirname(__file__)
    args = docopt(__doc__)
    assembly_f = args['--infile']
    cov_fs = args['--cov']

    fasta_dict = {}
    fasta_order = []
    if isfile(assembly_f):
        fasta_dict, fasta_order = parseFasta(assembly_f)
    else:
        BtLog.error('0', assembly_f)

    for cov_f in cov_fs:
        if not isfile(cov_f):
            BtLog.error('0', cov_f)
            continue
        lib_cov_dict = BtPlot.parseCovFile(cov_f)
        # every assembly sequence is looked up in each coverage library
        for name in fasta_order:
            running_total = fasta_dict.get(name, 0.0)
            fasta_dict[name] = running_total + lib_cov_dict[name]

    for name in fasta_order:
        print("%s\t%s" % (name, fasta_dict[name]))
コード例 #42
0
ファイル: view.py プロジェクト: greatfireball/blobtools
    TAXRULES = ['bestsum', 'bestsumorder']
    RANKS = ['species', 'genus', 'family', 'order', 'phylum', 'superkingdom', 'all']

    main_dir = dirname(__file__)
    #print data_dir
    args = docopt(__doc__)
    blobdb_f = args['--input']
    out_f = args['--out'] 
    ranks = args['--rank']
    taxrule = args['--taxrule']
    hits_flag = args['--hits']
    seq_list = args['--list']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Are ranks sane ?
    for rank in ranks:
        if rank not in RANKS:
            BtLog.error('9', rank)
    if 'all' in ranks:
        ranks = RANKS[0:-1]            

    # Is list a list of sequence names or a file?
    seqs = []
    if (seq_list):
        if isfile(seq_list):
            seqs = BtIO.parseList(seq_list)
        elif "," in seq_list:
            seqs = seq_list.split(",")
コード例 #43
0
ファイル: BtInput.py プロジェクト: evolgenomology/blobtools
def validate_input_create(main_dir, args):
    '''
    Check the command-line arguments of "create" and wrap the input files
    in library objects.

    Accepts:
        - main_dir : directory the default "data/nodesDB.txt" is joined onto
        - args     : docopt args
    Returns (in this order):
        - title     : --title, or out_f when no title was given
        - fasta_f
        - fasta_type
        - cov_libs  : CovLibObj list, ordered bam, sam, cas, cov
        - hit_libs  : hitLibObj list, one per --taxfile
        - taxrules
        - nodesDB_f
        - nodes_f
        - names_f
        - out_f     : "<basename of --out>.BlobDB.json", or "BlobDB.json"
    BtLog.error is called with '0' (missing file), '3' (no taxonomy),
    '18' (no hit file), '2' (unknown assembly type) or '1' (no coverage).
    '''
    ASSEMBLY_TYPES = [None, 'spades', 'soap', 'abyss', 'velvet']

    fasta_f = args['--infile']
    fasta_type = args['--type']
    sam_fs = args['--sam']
    bam_fs = args['--bam']
    cov_fs = args['--cov']
    cas_fs = args['--cas']
    hit_fs = args['--taxfile']
    out_arg = args['--out']
    out_f = ("%s.%s" % (os.path.basename(out_arg), "BlobDB.json")
             if out_arg else "BlobDB.json")
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    title = args['--title'] or out_f

    # Do files exist ? (options that were not given are None and are skipped)
    candidates = ([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs +
                  [names_f] + [nodes_f] + hit_fs)
    for path in candidates:
        if path is not None and not os.path.isfile(path):
            BtLog.error('0', path)

    # Is taxonomy provided? Either the nodesDB file or both dump files.
    if nodesDB_f == "data/nodesDB.txt":
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not (os.path.isfile(nodesDB_f) or (names_f and nodes_f)):
        BtLog.error('3')
    if not hit_fs:
        BtLog.error('18')

    # Can the FASTA parser deal with this assembly type?
    if fasta_type not in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))

    # Is coverage provided? An assembly type counts as a coverage source.
    if not (fasta_type or bam_fs or sam_fs or cov_fs or cas_fs):
        BtLog.error('1')

    cov_libs = []
    for lib_type, lib_files in (('bam', bam_fs), ('sam', sam_fs),
                                ('cas', cas_fs), ('cov', cov_fs)):
        for idx, lib_f in enumerate(lib_files):
            cov_libs.append(bt.CovLibObj(lib_type + str(idx), lib_type, lib_f))

    hit_libs = []
    for idx, lib_f in enumerate(hit_fs):
        hit_libs.append(bt.hitLibObj('tax' + str(idx), 'tax', lib_f))

    return (title, fasta_f, fasta_type, cov_libs, hit_libs, taxrules,
            nodesDB_f, nodes_f, names_f, out_f)
コード例 #44
0
ファイル: comparecov.py プロジェクト: hyphaltip/blobtools
    taxrule = args['--taxrule']
    hist_type = args['--hist']
    plot_title = args['--title']
    ignore_contig_length = args['--noscale']
    #labels = args['--label']
    #colour_f = args['--colours']
    #exclude_groups = args['--exclude']
    format = args['--format'] 
    #no_plot_blobs = args['--noblobs']
    #no_plot_reads = args['--noreads']
    #refcov_f = args['--refcov']
    #catcolour_f = args['--catcolour']

    # Does blobdb_f exist ?
    if not isfile(blobdb_f):
        BtLog.error('0', blobdb_f)

    # Does cov_f exist ?
    if not isfile(cov_f):
        BtLog.error('0', cov_f)
    # parse cov file in dict 
    cov_dict = BtPlot.parseCovFile(cov_f)
    
    # Are ranks sane ?
    if rank not in RANKS:
        BtLog.error('9', rank)

    # Are sort_order and hist_type sane?
    if not sort_order in ['span', 'count']:
        BtLog.error('14', sort_order)
    if not hist_type in ['span', 'count']:            
コード例 #45
0
ファイル: create.py プロジェクト: greatfireball/blobtools
    if (out_f):
        out_f = "%s.%s" % (out_f, "BlobDB.json")
    else:
        out_f = "%s" % ("BlobDB.json")
    nodesDB_f = args['--db']
    names_f = args['--names']
    nodes_f = args['--nodes']
    taxrules = args['--taxrule']
    title = args['--title'] if (args['--title']) else os.path.basename(".".join(fasta_f.split('.')[0:-1]))


    # Do files exist ?
    files = [x for x in list([fasta_f] + sam_fs + bam_fs + cov_fs + cas_fs + [names_f] + [nodes_f] + hit_fs) if x is not None]
    for f in files:
        if not os.path.isfile(f):
            BtLog.error('0', f)

    # Is taxonomy provided?
    if nodesDB_f == "data/nodesDB.txt":
        nodesDB_f = os.path.join(main_dir, nodesDB_f)
    if not os.path.isfile(nodesDB_f) and not ((names_f) and (nodes_f)):
        BtLog.error('3')

    if not (hit_fs):
        BtLog.error('18')

    # can FASTA parser deal with assemblies
    if not fasta_type in ASSEMBLY_TYPES:
        BtLog.error('2', ",".join(ASSEMBLY_TYPES[1:]))

    # Is coverage provided?