Ejemplo n.º 1
0
Archivo: fastq.py Proyecto: kvarq/kvarq
    def lengths(self, Amin, n=1000, points=10):
        '''
        sample the lengths of quality trimmed records

        The ``n`` records are read in ``points`` equally sized batches.
        For plain fastq files every batch starts at a different,
        evenly spaced position within the file; gzipped files are not
        seeked, so all batches are read consecutively from the start.

        :param Amin: minimum PHRED value (passed on to ``.cutoff()``)
        :param n: total number of records to sample
        :param points: number of points within file to scan for records;
            this value is ignored for gzipped fastq files
        :returns: list of quality trimmed record lengths; at most ``n``
            items (records for which ``.cutoff()`` reports a negative
            length are left out, and sampling stops at end of file)
        '''
        self.fd.seek(0)

        if self.gz:
            lo.debug('gzipped fastq : scan %d points at start only' % n)

        # ``n`` counts down while sampling, so the per-point quota has to be
        # derived from the total that was requested; comparing against the
        # shrinking ``n`` itself made the first point consume every record
        total = n
        lengths = []
        for point in range(points):

            if not self.gz and point > 0:
                # floor division : seek() needs an integer offset
                self.fd.seek(os.path.getsize(self.fname) * point // points)
                self.seekback()

            # number of records that must be left over for the later points
            reserved = (points - 1 - point) * total // points
            while n > reserved:
                raw = [self.fd.readline() for j in range(4)]
                if not raw[0]:
                    # end of file : there is nothing left to sample
                    return lengths
                ident, seq, plus, scores = (line.strip() for line in raw)
                pos, length = self.cutoff(scores, Amin)
                if length >= 0:
                    lengths.append(length)

                n -= 1
        return lengths
Ejemplo n.º 2
0
Archivo: phylo.py Proyecto: kvarq/kvarq
    def _analyse(self, coverages):
        '''
        summarize the lineage calls for ``coverages`` in a single string

        A (sub)lineage is reported when the values returned for it by
        ``.score_SNPs()`` sum up to more than one.  Sublineages are
        grouped by name and appended to their main lineage as
        ``main/sub1-sub2``; the co-mutant markers of a sublineage are
        appended to its name as ``sub_markers``.

        :param coverages: dictionary of coverages; the values must
            provide ``.mean(include_margins=False)`` and ``.mixed()``
        :returns: main lineages joined by ``' // '`` (or ``'?'`` if none
            was found), followed by remarks about low or mixed coverage
        '''
        mls = []

        #TODO choose criteria dynamically

        for ml, xs in self.score_SNPs(Lineage.roots, coverages).items():
            lo.debug(str(ml)+' : '+str(xs))

            if sum(xs) <= 1:
                # we need at least two positive SNPs
                continue

            # (SNPs that are not found -- ``0 in xs`` -- are deliberately
            # not flagged in the output)
            label = ml.name

            if ml.children:
                # co-mutants complicate our life somewhat : several
                # entries can share one sublineage name, so the scores
                # are pooled by name before they are evaluated
                slsc_byname = {}
                slsc_comutants = {}
                for sl, xs_ in self.score_SNPs(ml.children, coverages).items():
                    slsc_byname.setdefault(sl.name, []).extend(xs_)
                    if sl.comutant:
                        # one marker per positive SNP of the co-mutant
                        slsc_comutants.setdefault(sl.name, []).extend(
                                [sl.comutant] * sum(xs_))

                sls = []
                for slname, xs_ in slsc_byname.items():
                    comutants = ''.join(slsc_comutants.get(slname, []))
                    lo.debug('sublineage '+slname+' : '+str(xs_)+' comutants '+comutants)
                    if sum(xs_) > 1:
                        if comutants:
                            sls.append(slname + '_' + comutants)
                        else:
                            sls.append(slname)

                if sls:
                    label += '/' + '-'.join(sls)

            mls.append(label)

        depths = sorted(coverage.mean(include_margins=False)
                for coverage in coverages.values())
        remark = ''

        # (upper) median of the mean depths; floor division keeps the index
        # an integer, and the guard avoids an IndexError without coverages
        if depths and depths[len(depths) // 2] < 10:
            remark += ' -- low coverage (median below 10x)'

        mixed = sum(coverage.mixed() for coverage in coverages.values())
        if mixed:
            remark += ' -- mixed coverage'

        if not mls:
            return '?' + remark

        return ' // '.join(mls) + remark
Ejemplo n.º 3
0
Archivo: fastq.py Proyecto: kvarq/kvarq
    def min_max_score_check_file(self, n=1000, points=10):
        '''
        check fastq file format and return min/max PHRED score values

        Every scanned record is validated line by line (identifier, bases,
        separator, PHRED string) and the smallest / largest position of
        its PHRED characters within ``self.ASCII`` is tracked.

        :param n: number of records to scan
        :param points: number of points within file to scan for records;
            this value is ignored for gzipped fastq files
        :returns: minimum and maximum value of PHRED score (index within
            ``ASCII``)
        :raises FastqFileFormatException: if a scanned record violates
            one of the format checks below

        NOTE(review): the code visible here ends without a ``return``
        statement -- the listing looks truncated; confirm against the
        full source
        '''
        # sentinels : any index found within ``self.ASCII`` replaces them
        ret_min = +999
        ret_max = -999
        self.fd.seek(0)

        if self.gz:
            lo.debug('gzipped fastq : scan %d points at start only' % n)

        for point in range(points):

            # plain files only : jump to the next scan position, then let
            # ``.seekback()`` reposition the file (presumably to the start
            # of a record -- confirm against its definition)
            if not self.gz and point > 0:
                # (oversamples small files)
                self.fd.seek(os.path.getsize(self.fname)*point/points)
                self.seekback()

            # NOTE(review): ``n`` is decremented inside this loop but also
            # appears on the right hand side; with integer division the
            # condition stays true for every n >= 1, so it looks as if all
            # records are consumed at the first point -- confirm intent
            while n > (points - 1 - point)*n/points:
                identifier = self.fd.readline().rstrip('\n\r')
                # an empty identifier line is treated as end of data
                if not identifier: break
                if not identifier[0] == '@':
                    raise FastqFileFormatException(
                        'identifier (1st line of record) must begin with "@"')
                bases = self.fd.readline().rstrip('\n\r')
                if not set(bases).issubset(set('AGCTN')):
                    raise FastqFileFormatException(
                        'bases (2nd line of record) must contain only AGCTN')
                plus = self.fd.readline().rstrip('\n\r')
                # NOTE(review): ``plus[0]`` raises IndexError if this line
                # is empty (e.g. file ends in the middle of a record)
                if not (plus == '+' or (plus[0]=='+' and plus[1:] == identifier[1:])):
                    raise FastqFileFormatException(
                        'separator (3rd line of record) must be == "+" or "+(ident)"')
                phredstr = self.fd.readline().rstrip('\n\r')
                # the PHRED string may be one character longer than the
                # bases if that extra last character is '!'
                if not (len(bases) == len(phredstr) or (
                        len(bases) == len(phredstr)-1 and phredstr[-1] == '!' )):
                    raise FastqFileFormatException(
                        'bases must be ~ same length as phred score (2nd, 4th line)')
                try:
                    # ``.index()`` raises ValueError for characters that
                    # are not part of ``self.ASCII``
                    ret_min = min(ret_min, *[self.ASCII.index(x) for x in phredstr])
                    ret_max = max(ret_max, *[self.ASCII.index(x) for x in phredstr])
                except ValueError, e:
                    raise FastqFileFormatException(
                        'phred score (4th line of record) must contain only "%s"'%
                        self.ASCII)
                n -= 1

            # end of data reached inside the inner loop : stop scanning
            if not identifier: break
Ejemplo n.º 4
0
Archivo: main.py Proyecto: kvarq/kvarq
    def __init__(self, testsuite_paths):
        '''
        build the main window : a column of action buttons on the left
        and a scrollable text widget showing the log output on the right

        :param testsuite_paths: stored as ``.testsuite_paths``
        '''
        ThemedTk.__init__(self)

        self.settings = Settings(default_config)
        self.testsuite_paths = testsuite_paths
        self.testsuites = {}

        # -- left hand side : buttons, packed top to bottom
        sidebar = tk.Frame(self)

        def add_button(attribute, text, command):
            # create the button, publish it as attribute of the window
            # and append it to the button column
            button = tk.Button(sidebar, text=text, command=command)
            setattr(self, attribute, button)
            button.pack()

        add_button('scan', 'scan .fastq files', self.do_scan)
        add_button('explore', 'explore .json files', self.do_explore)

        # empty label serves as vertical spacer between button groups
        tk.Label(sidebar).pack()

        # NOTE(review): if ThemedTk derives from tk.Tk, the attributes
        # ``config`` and ``help`` hide inherited names -- confirm that
        # nothing relies on ``self.config(...)``
        add_button('config', 'settings', self.do_config)
        add_button('help', 'help', open_help)

        if logfn:
            add_button('showlog', 'show log file', self.do_showlog)

        sidebar.pack(side='left', padx=10)

        # -- right hand side : framed log output with vertical scrollbar
        log_panel = tk.Frame(self, borderwidth=1, relief='ridge')
        log_panel.pack(side='left', expand=True, fill='both', padx=5, pady=5)
        tk.Label(log_panel, text='kvarq log output').pack()

        text_frame = tk.Frame(log_panel)
        text_frame.pack(expand=True, fill='both')

        self.text = tk.Text(text_frame)
        self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)

        scrollbar = tk.Scrollbar(text_frame, command=self.text.yview)
        scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
        self.text.config(yscrollcommand=scrollbar.set)
        self.text.yscrollbar = scrollbar

        # closing the main window terminates the whole program, so ask first
        def sys_exit():
            confirmed = tkMessageBox.askyesno(
                    'quit KvarQ',
                    'really want to exit KvarQ and close all windows?')
            if confirmed:
                sys.exit(0)
        self.protocol("WM_DELETE_WINDOW", sys_exit)

        # route log records into the text widget
        self.log_handler = GuiLogHandler(self.text, self.scrolldown)
        lo.addHandler(self.log_handler)
        lo.debug('GUI started')
Ejemplo n.º 5
0
 def _analyse(self, coverages):
     '''
     collect the matching tree nodes and report their paths

     The nodes are gathered with ``.walk_tree()``; afterwards every node
     that a node earlier in the list claims via ``.is_ancestor()`` is
     dropped, so that only the longest of common paths remains.

     :param coverages: passed on to ``.walk_tree()``
     :returns: list with the ``.path()`` of every remaining node
     '''
     pending = []
     self.walk_tree(pending, self.root, coverages)

     kept = []
     while pending:
         # compare the first node against everything that follows it and
         # carry over only the nodes that it does not supersede
         head, followers = pending[0], pending[1:]
         pending = []
         for other in followers:
             if head.is_ancestor(other):
                 lo.debug('pruning "%s" < "%s"' % (other.path(), head.path()))
             else:
                 pending.append(other)
         kept.append(head)

     return [node.path() for node in kept]
Ejemplo n.º 6
0
 def walk_tree(self, ret, node, coverages):
     '''
     appends to ``ret`` all tree-nodes below ``node`` (leaves first)
     whose tests all validate against the coverages

     The subtree of a child is only searched if the child itself
     matches; children that match partially are logged and skipped.

     :param ret: list that receives the matching nodes
     :param node: node whose ``.children`` are examined
     :param coverages: mapping from test to the coverage that is handed
         to ``test.template.validate()``
     '''
     for child in node.children:
         required = len(child.tests)

         # every test is validated (no shortcut after the first failure)
         passed = 0
         for test in child.tests:
             coverage = coverages[test]
             passed += 1 if test.template.validate(coverage) else 0

         if passed == required:
             # descend first, so that descendants precede their ancestor
             self.walk_tree(ret, child, coverages)
             ret.append(child)
             continue

         if passed > 0:
             lo.debug('discarding SNPNode "%s" : %d<%d' % (
                     child.path(), passed, len(child.tests)))
Ejemplo n.º 7
0
Archivo: genes.py Proyecto: kvarq/kvarq
    def __init__(self, path, identifier=None, description=None):
        """
        :param path: name of file to read bases from; can be ``.bases``
            file that directly contains base sequence (without any
            whitespace) or a file in FASTA format (only first genome is read)
        :param identifier: short identifier of genome; will be read from
            FASTA file if none specified
        :param description: text description; will also be read from
            FASTA file if none specified

        A file starting with ``>`` is treated as FASTA : its first
        sequence is read into ``.bases`` and the file is closed again.
        For any other file only ``.size`` is determined and ``.f`` is
        left open (presumably for reading bases on demand -- confirm
        against the other methods of this class).
        """
        self.path = path
        self.f = open(path, "r")

        if self.f.read(1) == ">":
            self.fasta = True
            try:
                self.f.seek(0)
                # strip the line ending, it must not become part of the
                # identifier / description
                defline = self.f.readline().rstrip("\n\r")
                # definition line : ">identifier[ description]"
                name, separator, remainder = defline[1:].partition(" ")
                if identifier is None:
                    identifier = name
                if description is None and separator:
                    description = remainder

                # read first sequence into memory; stop at the definition
                # line of the next genome instead of loading the whole file
                chunks = []
                for line in self.f:
                    marker = line.find(">")
                    if marker != -1:
                        lo.info("%s contains several genomes; only first read" % path)
                        chunks.append(line[:marker])
                        break
                    chunks.append(line.rstrip("\n\r"))
                self.bases = "".join(chunks)
                self.size = len(self.bases)
            finally:
                # the file is not needed anymore, also if reading failed
                self.f.close()
            lo.debug('read %d bytes FASTA sequence "%s" into memory' % (self.size, identifier))

        else:
            self.fasta = False
            # size of file == number of bases (file contains bases only)
            self.f.seek(0, 2)
            self.size = self.f.tell()

        self.identifier = identifier
        self.description = description
Ejemplo n.º 8
0
def convert_legacy_data(testsuites, data):
    '''
    :param testsuites: dictionary of :py:class:`kvarq.genes.Testsuite`
    :param data: dictionary as returned by :py:meth:`kvarq.analyse.Analyser.encode`,
        which can be of a previous version of KvarQ
    :returns: object as returned by :py:meth:`kvarq.analyse.Analyser.encode` of
        current version of KvarQ; this is the ``data`` dictionary that was
        passed in, which is modified in place

    tries to convert data from older versions of KvarQ, raises
    :py:class:`kvarq.analyse.VersionConflictException` or
    :py:class:`kvarq.analyse.DataInconcistencyException` if data cannot be converted

    the conversion is done one minor version at a time :

      - 0.10 -> 0.11 : ``testsuites`` / ``tests`` are replaced by
        ``coverages`` (one entry per template) and the per-test
        statistics are remapped accordingly
      - 0.11 -> 0.12 : ``info.fastq`` / ``info.size`` become lists
    '''
    from kvarq.analyse import VersionConflictException, DataInconcistencyException

    kvarq_version = list(StrictVersion(VERSION).version)
    version = list(StrictVersion(data['info']['version']).version)

    # compare major and minor version together
    if tuple(version[:2]) < (0, 10):
        raise VersionConflictException('cannot load files v<0.10')

    # convert tests -> coverages
    if version[0] == 0 and version[1] == 10:

        # load data
        templates_by_testname = {}
        for testsuite in testsuites.values():
            for test in testsuite.tests:
                templates_by_testname[str(test)] = test.template

        coverages_by_testname = {}
        for data_testsuite in data['testsuites'].values():
            coverages_by_testname.update(data_testsuite)

        # convert test-nr -> coverage-nr : ``nrmap[coveragenr]`` is the
        # number of the first test that uses the respective template
        nrmap = []
        coverages = OrderedDict()

        for i, testname in enumerate(data['tests']):

            if testname not in templates_by_testname:
                lo.info('json contains additional test "%s"; discarding.' % testname)
                continue

            templatename = str(templates_by_testname[testname])
            coverage = coverages_by_testname[testname]

            if templatename in coverages:
                # several tests can share a template; their coverages
                # must then be identical
                if coverages[templatename] != coverage:
                    raise DataInconcistencyException(
                            'found contradicting coverages for template "%s" : "%s" / "%s"' %
                            (templatename, coverages[templatename], coverage))
            else:
                coverages[templatename] = coverage
                nrmap.append(i)

        # save to data
        data['coverages'] = list(coverages.items())
        lo.debug('mapping "nseqhits", "nseqbasehits" : (%d) %s' % (len(nrmap), str(nrmap)))
        ntests = len(data['tests'])
        for key in ['nseqhits', 'nseqbasehits']:
            if key not in data['stats']:
                lo.info('no stats/%s found (old json version)' % key)
                continue
            # values are stored per test : first all forward hits, then
            # (offset by number of tests) all reverse hits
            stats = data['stats'][key]
            data['stats'][key] = (
                    [stats[testnr] for testnr in nrmap] + # forward
                    [stats[testnr + ntests] for testnr in nrmap] # reverse
                )

        # clean up
        del data['testsuites']
        del data['tests']
        version[1] += 1

    # convert info.fastq/info.size to lists
    if version[0] == 0 and version[1] == 11:
        data['info']['fastq'] = [data['info']['fastq']]
        data['info']['size'] = [data['info']['size']]
        version[1] += 1

    # raise explicitly : an ``assert`` would yield an AssertionError and
    # is skipped altogether when python runs with ``-O``
    if version[:2] != kvarq_version[:2]:
        raise VersionConflictException('could not elevate version more than to "%d.%d"' %
                (version[0], version[1]))

    return data
Ejemplo n.º 9
0
Archivo: fastq.py Proyecto: kvarq/kvarq
    def __init__(self, fname, variant=None, fd=None, paired=False, quiet=False):
        '''
        open ``.fastq`` or ``.fastq.gz`` file and determine its PHRED
        encoding (setting attributes ``.dQ``, ``.Azero`` and
        ``.variants`` accordingly)

        :param fname: name of file to open
        :param variant: specify one of ``.vendor_variants`` -- if none
            is specified, then the variant is guessed from the range of
            PHRED scores reported by ``.min_max_score_check_file()``
        :param fd: specify a file descriptor to use instead of
            opening ``fname``
        :param paired: include second file in a paired set if it is
            available (i.e. specify "file_1.fastq" as input file and
            "file_2.fastq" will be included in functions ``.filesize()``
            and ``.filenames()``)
        :param quiet: do not log the summary line at the end
        :raises FastqFileFormatException: on wrong file extension, empty
            file, unknown ``variant`` or if no unambiguous variant can
            be guessed
        '''
        self.fname = fname
        self.fd = fd if fd else None

        # the extension decides how the file is read
        gzipped = self.fname.endswith('.fastq.gz')
        if not gzipped and not self.fname.endswith('.fastq'):
            raise FastqFileFormatException(
                        'fastq file must have extension ".fastq" or ".fastq.gz"')
        self.gz = gzipped
        if not self.fd:
            if gzipped:
                self.fd = gzip.GzipFile(self.fname, 'rb')
            else:
                self.fd = open(self.fname, 'rb')

        # "xyz_1.fastq[.gz]" is accompanied by "xyz_2.fastq[.gz]"
        self.fname2 = None
        if paired:
            cut = fname.rindex('.fastq')
            stem, extension = fname[:cut], fname[cut:]
            if stem.endswith('_1'):
                mate = stem[:-2] + '_2' + extension
                if os.path.exists(mate):
                    lo.info('including paired file "%s"' % mate)
                    self.fname2 = mate

        if sum(self.filesizes()) == 0:
            raise FastqFileFormatException('cannot scan empty file')

        # scan some records
        min_pos, max_pos = self.min_max_score_check_file()
        lo.debug('min_pos=%d max_pos=%d' % (min_pos, max_pos))

        if variant and variant not in self.vendor_variants:
            raise FastqFileFormatException(
                    'unknown vendor variant "%s"' % variant)

        # vendor variants whose score range covers both extremes found
        compatible = [
                (name, vendor_variant.dQ)
                for name, vendor_variant in Fastq.vendor_variants.items()
                if (min_pos - vendor_variant.dQ) in vendor_variant.Qrange
                and (max_pos - vendor_variant.dQ) in vendor_variant.Qrange
            ]
        variants = [name for name, dQ in compatible]
        dQs = [dQ for name, dQ in compatible]

        if variant is not None:
            # variant was specified : merely check whether it is plausible
            if variant not in variants:
                lo.warning('specified vendor variant "%s" seems not to be '
                        'compatible with file' % variant)
            self.variants = [variant]
            self.dQ = self.vendor_variants[variant].dQ
        else:
            # guess : all compatible variants must agree on the offset
            if not variants:
                raise FastqFileFormatException(
                        'could not find any suitable fastq vendor variant')
            if len(set(dQs)) > 1:
                raise FastqFileFormatException(
                        'cannot determine dQ with guessed vendor variants "%s"'
                        % str(variants))
            self.variants = variants
            self.dQ = dQs[0]

        self.Azero = self.ASCII[self.dQ]

        # estimate readlength/records_approx from the first record
        self.fd.seek(0)
        first_record = [self.fd.readline() for i in range(4)]
        self.readlength = len(first_record[1].strip('\r\n'))
        if self.gz:
            # size on disk says nothing about the number of records
            self.records_approx = None
        else:
            self.records_approx = os.path.getsize(self.fname) / len(''.join(first_record))
            if self.fname2 is not None:
                self.records_approx *= 2

        if quiet:
            return

        # output some infos
        if self.gz:
            lo.info('gzipped fastq : readlength=? records_approx=? dQ=%d variants=%s' % (
                    self.dQ, str(self.variants)))
        else:
            lo.info('fastq : readlength=%d records_approx=%d dQ=%d variants=%s' % (
                    self.readlength, self.records_approx, self.dQ, str(self.variants)))