def lengths(self, Amin, n=1000, points=10):
    '''
    samples length of quality trimmed records

    The ``n`` records are read in ``points`` batches of (roughly) equal
    size; before every batch but the first, the file position is moved to
    the corresponding fraction of the file size and ``.seekback()`` is
    called.  Gzipped files are not repositioned, so all records are read
    sequentially from the start of the file.

    :param Amin: minimum PHRED value
    :param n: number of records to sample
    :param points: number of points within file to scan for records;
        this value is ignored for gzipped fastq files

    :returns: list of quality trimmed record lengths; at most ``n`` items
        (records for which ``.cutoff()`` reports a negative length are
        left out, and a batch ends early at the end of the file)
    '''
    self.fd.seek(0)
    if self.gz:
        lo.debug('gzipped fastq : scan %d points at start only' % n)

    # the per-point quota must be derived from the requested total; using
    # the running countdown instead lets the first point consume everything
    total = n
    remaining = n

    lengths = []
    for point in range(points):

        if not self.gz and point > 0:
            # floor division keeps the offset an integer on python 3, too
            self.fd.seek(os.path.getsize(self.fname) * point // points)
            self.seekback()

        # number of records to leave over for the points still to come
        reserve = (points - 1 - point) * total // points
        while remaining > reserve:
            ident, seq, plus, scores = (
                    self.fd.readline().strip() for j in range(4))
            if not ident:
                # end of file : nothing more to read at this point
                break
            pos, length = self.cutoff(scores, Amin)
            if length >= 0:
                lengths.append(length)
            remaining -= 1

    return lengths
def _analyse(self, coverages):
    '''
    derives a textual lineage classification from the coverages

    :param coverages: dictionary whose values are coverage objects
        (providing ``.mean()`` and ``.mixed()``); it is also handed on to
        ``.score_SNPs()``

    :returns: string with the names of all main lineages that have more
        than one positive SNP, joined by ``" // "``; every main lineage
        is followed by ``/`` and its ``-`` joined sublineages (again more
        than one positive SNP; co-mutants appended after ``_``) if there
        are any; ``"?"`` if no main lineage qualifies; remarks about low
        (median mean depth below 10) and mixed coverage are appended
    '''
    mls = []

    #TODO choose criteria dynamically
    for ml, xs in self.score_SNPs(Lineage.roots, coverages).items():
        lo.debug(str(ml)+' : '+str(xs))
        if sum(xs) <= 1:
            # we need at least two positive SNPs
            continue
        mls.append(ml.name)
        # (a main lineage with one of its SNPs not found is not flagged)

        if not ml.children:
            continue

        # co-mutants complicate our life somewhat : the scores of all
        # sublineage objects sharing a name are pooled under that name
        scores_byname = {}
        comutants_byname = {}
        for sl, xs_ in self.score_SNPs(ml.children, coverages).items():
            scores_byname.setdefault(sl.name, []).extend(xs_)
            if sl.comutant:
                # one entry per positive SNP of the co-mutant
                comutants_byname.setdefault(sl.name, []).extend(
                        [sl.comutant] * sum(xs_))

        sls = []
        for slname, xs_ in scores_byname.items():
            comutants = ''.join(comutants_byname.get(slname, []))
            lo.debug('sublineage '+slname+' : '+str(xs_)+' comutants '+comutants)
            if sum(xs_) > 1:
                # (flagging missing SNPs does not make sense when using
                # comutants)
                if comutants:
                    sls.append(slname + '_' + comutants)
                else:
                    sls.append(slname)

        if sls:
            mls[-1] += '/' + '-'.join(sls)

    depths = sorted([coverage.mean(include_margins=False)
                     for coverage in coverages.values()])
    remark = ''
    # without any coverage there is no median to judge; floor division
    # keeps the index an integer on python 3, too
    if depths and depths[len(depths) // 2] < 10:
        remark += ' -- low coverage (median below 10x)'

    mixed = sum([coverage.mixed() for coverage in coverages.values()])
    if mixed:
        remark += ' -- mixed coverage'

    if not mls:
        return '?' + remark

    return ' // '.join(mls) + remark
def min_max_score_check_file(self, n=1000, points=10):
    '''
    check fastq file format and return min/max PHRED score values

    The ``n`` records are read in ``points`` batches of (roughly) equal
    size; before every batch but the first, the file position is moved to
    the corresponding fraction of the file size and ``.seekback()`` is
    called.  Gzipped files are not repositioned.  Scanning stops at the
    first empty identifier line (end of file).

    :param n: number of records to scan
    :param points: number of points within file to scan for records;
        this value is ignored for gzipped fastq files

    :returns: minimum and maximum value of PHRED score (index within
        ``ASCII``); ``(999, -999)`` if no score was read at all

    :raises FastqFileFormatException: if a scanned record does not start
        with ``@``, contains bases other than ``AGCTN``, has a malformed
        separator line, has bases and scores of different length, or uses
        score characters that are not part of ``ASCII``
    '''
    ret_min = +999
    ret_max = -999

    self.fd.seek(0)
    if self.gz:
        lo.debug('gzipped fastq : scan %d points at start only' % n)

    valid_bases = set('AGCTN')
    # the per-point quota must be derived from the requested total; using
    # the running countdown instead lets the first point consume everything
    total = n
    remaining = n
    eof = False

    for point in range(points):

        if not self.gz and point > 0:
            # (oversamples small files)
            self.fd.seek(os.path.getsize(self.fname) * point // points)
            self.seekback()

        # number of records to leave over for the points still to come
        reserve = (points - 1 - point) * total // points
        while remaining > reserve:

            identifier = self.fd.readline().rstrip('\n\r')
            if not identifier:
                eof = True
                break
            if not identifier.startswith('@'):
                raise FastqFileFormatException(
                        'identifier (1st line of record) must begin with "@"')

            bases = self.fd.readline().rstrip('\n\r')
            if not valid_bases.issuperset(bases):
                raise FastqFileFormatException(
                        'bases (2nd line of record) must contain only AGCTN')

            plus = self.fd.readline().rstrip('\n\r')
            # startswith() also copes with an empty separator line
            if not (plus == '+' or (
                    plus.startswith('+') and plus[1:] == identifier[1:])):
                raise FastqFileFormatException(
                        'separator (3rd line of record) must be == "+" or "+(ident)"')

            phredstr = self.fd.readline().rstrip('\n\r')
            # a single surplus trailing "!" in the scores is tolerated
            if not (len(bases) == len(phredstr) or (
                    len(bases) == len(phredstr) - 1 and
                    phredstr.endswith('!'))):
                raise FastqFileFormatException(
                        'bases must be ~ same length as phred score (2nd, 4th line)')

            try:
                scores = [self.ASCII.index(x) for x in phredstr]
            except ValueError:
                raise FastqFileFormatException(
                        'phred score (4th line of record) must contain only "%s"'%
                        self.ASCII)
            if scores:
                # (an empty record does not contribute any scores)
                ret_min = min(ret_min, min(scores))
                ret_max = max(ret_max, max(scores))

            remaining -= 1

        if eof:
            break

    return ret_min, ret_max
def __init__(self, testsuite_paths):
    '''
    builds the main window : a column of action buttons on the left and
    a scrollable text pane for the log output on the right; a log handler
    writing into that pane is attached to the logger

    :param testsuite_paths: stored as ``.testsuite_paths``
    '''
    ThemedTk.__init__(self)

    self.settings = Settings(default_config)
    self.testsuite_paths = testsuite_paths
    self.testsuites = {}

    # left : action buttons, stacked in creation order
    sidebar = tk.Frame(self)

    def add_button(caption, callback):
        # creates a button in the sidebar and packs it below the others
        button = tk.Button(sidebar, text=caption, command=callback)
        button.pack()
        return button

    self.scan = add_button('scan .fastq files', self.do_scan)
    self.explore = add_button('explore .json files', self.do_explore)
    # empty label serves as vertical spacer
    tk.Label(sidebar).pack()
    # NOTE(review): if ThemedTk derives from Tkinter's Tk, this attribute
    # hides the inherited ``config()`` method on the instance -- confirm
    # that nothing relies on calling it
    self.config = add_button('settings', self.do_config)
    self.help = add_button('help', open_help)
    if logfn:
        self.showlog = add_button('show log file', self.do_showlog)
    sidebar.pack(side='left', padx=10)

    # right : framed log output
    log_panel = tk.Frame(self, borderwidth=1, relief='ridge')
    log_panel.pack(side='left', expand=True, fill='both', padx=5, pady=5)
    tk.Label(log_panel, text='kvarq log output').pack()

    text_area = tk.Frame(log_panel)
    text_area.pack(expand=True, fill='both')
    self.text = tk.Text(text_area) #, state=tk.DISABLED)
    self.text.pack(side=tk.LEFT, fill=tk.BOTH, expand=True)
    scrollbar = tk.Scrollbar(text_area, command=self.text.yview)
    scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
    self.text.config(yscrollcommand=scrollbar.set)
    self.text.yscrollbar = scrollbar

    def sys_exit():
        # closing the main window terminates the program after confirmation
        confirmed = tkMessageBox.askyesno('quit KvarQ',
                'really want to exit KvarQ and close all windows?')
        if confirmed:
            sys.exit(0)
    self.protocol("WM_DELETE_WINDOW", sys_exit)

    self.log_handler = GuiLogHandler(self.text, self.scrolldown)
    lo.addHandler(self.log_handler)
    lo.debug('GUI started')
def _analyse(self, coverages):
    '''
    collects the matching tree-nodes via ``.walk_tree()`` and returns
    their paths, after dropping every node for which an earlier (kept)
    node answers ``is_ancestor()`` with true

    :param coverages: handed on to ``.walk_tree()``
    :returns: list of ``.path()`` values of the remaining nodes
    '''
    survivors = []
    self.walk_tree(survivors, self.root, coverages)

    # discard all but longest of common paths
    position = 0
    while position < len(survivors):
        keeper = survivors[position]
        unrelated = []
        for candidate in survivors[position + 1:]:
            if keeper.is_ancestor(candidate):
                lo.debug('pruning "%s" < "%s"' % (
                        candidate.path(), keeper.path()))
            else:
                unrelated.append(candidate)
        # only nodes that were not pruned stay behind the keeper
        survivors[position + 1:] = unrelated
        position += 1

    return [survivor.path() for survivor in survivors]
def walk_tree(self, ret, node, coverages):
    '''
    appends to ``ret`` all tree-nodes below ``node`` that match the
    coverages (leaves first)

    A child matches if every one of its tests validates against its
    coverage; only matching children are descended into.  Children with
    some, but not all, tests validating are reported on debug level.

    :param ret: list that receives the matching nodes
    :param node: node whose ``.children`` are examined
    :param coverages: maps every test to the coverage that is passed to
        the ``validate()`` method of the test's template
    '''
    for child in node.children:
        required = len(child.tests)
        passed = sum(
                1 for test in child.tests
                if test.template.validate(coverages[test]))

        if passed == required:
            # descend first, so deeper nodes end up before their parent
            self.walk_tree(ret, child, coverages)
            ret.append(child)
        elif passed > 0:
            lo.debug('discarding SNPNode "%s" : %d<%d' % (
                    child.path(), passed, required))
def __init__(self, path, identifier=None, description=None):
    """
    :param path: name of file to read bases from; can be ``.bases`` file
        that directly contains base sequence (without any whitespace) or
        a file in FASTA format (only first genome is read)
    :param identifier: short identifier of genome; will be read from
        FASTA file if none specified
    :param description: text description; will also be read from FASTA
        file if none specified

    A file starting with ``>`` is treated as FASTA : the whole sequence of
    the first genome is read into ``.bases`` and the file is closed
    again.  For any other file only ``.size`` is determined and the file
    ``.f`` is left open, positioned at its end.
    """
    self.path = path
    self.f = open(path, "r")

    if self.f.read(1) == ">":
        self.fasta = True
        try:
            self.f.seek(0)
            # definition line : ">identifier description"; the line ending
            # must not become part of the identifier or description
            defline = self.f.readline().rstrip("\n\r")
            name, separator, remainder = defline[1:].partition(" ")
            if identifier is None:
                identifier = name
            if description is None and separator:
                description = remainder

            # read whole sequence into memory
            self.bases = "".join(
                [line.rstrip("\n\r") for line in self.f.readlines()])
        finally:
            # the file is not needed anymore, whether reading succeeded
            # or not
            self.f.close()

        if ">" in self.bases:
            lo.info("%s contains several genomes; only first read" % path)
            self.bases = self.bases[: self.bases.index(">")]
        self.size = len(self.bases)
        lo.debug('read %d bytes FASTA sequence "%s" into memory'
                 % (self.size, identifier))

    else:
        self.fasta = False
        # size of the sequence is the size of the file
        self.f.seek(0, 2)
        self.size = self.f.tell()

    self.identifier = identifier
    self.description = description
def convert_legacy_data(testsuites, data):
    '''
    :param testsuites: dictionary of :py:class:`kvarq.genes.Testsuite`
    :param data: dictionary as returned by
        :py:meth:`kvarq.analyse.Analyser.encode`, which can be of a
        previous version of KvarQ
    :returns: object as returned by :py:meth:`kvarq.analyse.Analyser.encode`
        of current version of KvarQ; ``data`` is modified in place and
        returned

    tries to convert data from older versions of KvarQ, raises
    :py:class:`kvarq.analyse.VersionConflictException` or
    :py:class:`kvarq.analyse.DataInconcistencyException` if data cannot
    be converted

    The conversion is done one minor version at a time : 0.10 -> 0.11
    replaces the ``tests``/``testsuites`` entries with ``coverages``
    (and remaps the hit statistics), 0.11 -> 0.12 wraps ``info.fastq``
    and ``info.size`` into lists.
    '''
    from kvarq.analyse import VersionConflictException, DataInconcistencyException

    kvarq_version = list(StrictVersion(VERSION).version)
    version = list(StrictVersion(data['info']['version']).version)

    # compare major and minor together, so that only versions that really
    # predate 0.10 are refused here
    if (version[0], version[1]) < (0, 10):
        raise VersionConflictException('cannot load files v<0.10')

    # convert tests -> coverages
    if version[0] == 0 and version[1] == 10:

        # load data (entries of later testsuites override earlier ones)
        templates_by_testname = {}
        for testsuite in testsuites.values():
            for test in testsuite.tests:
                templates_by_testname[str(test)] = test.template
        coverages_by_testname = {}
        for data_testsuite in data['testsuites'].values():
            coverages_by_testname.update(data_testsuite)

        # convert test-nr -> coverage-nr : nrmap[coverage-nr] is the
        # number of the first test that uses the coverage's template
        nrmap = []
        coverages = OrderedDict()
        for i, testname in enumerate(data['tests']):
            if testname not in templates_by_testname:
                lo.info('json contains additional test "%s"; discarding.'
                        % testname)
                continue

            templatename = str(templates_by_testname[testname])
            coverage = coverages_by_testname[testname]

            if templatename in coverages:
                # tests sharing a template must agree on the coverage
                if coverages[templatename] != coverage:
                    raise DataInconcistencyException(
                            'found contradicting coverages for template "%s" : "%s" / "%s"'
                            % (templatename, coverages[templatename], coverage))
            else:
                coverages[templatename] = coverage
                nrmap.append(i)

        # save to data
        data['coverages'] = list(coverages.items())

        lo.debug('mapping "nseqhits", "nseqbasehits" : (%d) %s' %
                (len(nrmap), str(nrmap)))
        ntests = len(data['tests'])
        for key in ['nseqhits', 'nseqbasehits']:
            if key not in data['stats']:
                lo.info('no stats/%s found (old json version)' % key)
                continue
            old_stats = data['stats'][key]
            # forward hits are followed by reverse hits, which are stored
            # at an offset of the number of tests
            data['stats'][key] = (
                    [old_stats[testnr] for testnr in nrmap] +
                    [old_stats[testnr + ntests] for testnr in nrmap])

        # clean up
        del data['testsuites']
        del data['tests']

        version[1] += 1

    # convert info.fastq/info.size to lists
    if version[0] == 0 and version[1] == 11:

        data['info']['fastq'] = [data['info']['fastq']]
        data['info']['size'] = [data['info']['size']]

        version[1] += 1

    # raised explicitly : an ``assert`` would raise AssertionError instead
    # and would be skipped altogether when running with ``-O``
    if version[0] != kvarq_version[0] or version[1] != kvarq_version[1]:
        raise VersionConflictException(
                'could not elevate version more than to "%d.%d"' %
                (version[0], version[1]))

    return data
def __init__(self, fname, variant=None, fd=None, paired=False, quiet=False):
    '''
    open ``.fastq`` or ``.fastq.gz`` file and determine its variant
    (setting attribute ``.Azero`` accordingly)

    :param fname: name of file to open
    :param variant: specify one of ``.vendor_variants`` -- if none is
        specified (``None`` or empty), then the PHRED score of the fastq
        file is analyzed and the variant is guessed from the range of
        values found
    :param fd: specify a file descriptor to use instead of opening
        ``fname``
    :param paired: include second file in a paired set if it is available
        (i.e. specify "file_1.fastq" as input file and "file_2.fastq" will
        be included in functions ``.filesize()`` and ``.filenames()``)
    :param quiet: do not log the summary of the file on info level

    :raises FastqFileFormatException: if the file name has an unexpected
        extension, the file is empty, ``variant`` is unknown, or no
        unambiguous variant can be guessed; a file that was opened by
        this constructor is closed again before any exception is passed on
    '''
    self.fname = fname
    self.fd = fd if fd else None

    if self.fname.endswith('.fastq.gz'):
        self.gz = True
    elif self.fname.endswith('.fastq'):
        self.gz = False
    else:
        raise FastqFileFormatException(
                'fastq file must have extension ".fastq" or ".fastq.gz"')

    # a file descriptor handed in by the caller stays the caller's business
    opened_here = not self.fd
    if opened_here:
        if self.gz:
            self.fd = gzip.GzipFile(self.fname, 'rb')
        else:
            self.fd = open(self.fname, 'rb')

    try:
        # save second name of base if exists
        self.fname2 = None
        if paired:
            extension_at = fname.rindex('.fastq')
            base = fname[:extension_at]
            if base[-2:] == '_1':
                fname2 = base[:-2] + '_2' + fname[extension_at:]
                if os.path.exists(fname2):
                    lo.info('including paired file "%s"' % fname2)
                    self.fname2 = fname2

        if sum(self.filesizes()) == 0:
            raise FastqFileFormatException('cannot scan empty file')

        # scan some records
        min_pos, max_pos = self.min_max_score_check_file()
        lo.debug('min_pos=%d max_pos=%d' % (min_pos, max_pos))

        # an empty variant means "not specified", just like None
        if not variant:
            variant = None
        if variant is not None and variant not in self.vendor_variants:
            raise FastqFileFormatException(
                    'unknown vendor variant "%s"' % variant)

        # create list of variants compatible with PHRED scores
        variants = []
        dQs = []
        for name, vendor_variant in Fastq.vendor_variants.items():
            if ((min_pos - vendor_variant.dQ) in vendor_variant.Qrange and
                    (max_pos - vendor_variant.dQ) in vendor_variant.Qrange):
                dQs.append(vendor_variant.dQ)
                variants.append(name)

        if variant is None:
            # set variant from guesses
            if not variants:
                raise FastqFileFormatException(
                        'could not find any suitable fastq vendor variant')
            if len(set(dQs)) > 1:
                raise FastqFileFormatException(
                        'cannot determine dQ with guessed vendor variants "%s"'
                        % str(variants))
            self.variants = variants
            self.dQ = dQs[0]

        else:
            # check specified variant
            if variant not in variants:
                lo.warning('specified vendor variant "%s" seems not to be '
                        'compatible with file' % variant)
            self.variants = [variant]
            self.dQ = self.vendor_variants[variant].dQ

        self.Azero = self.ASCII[self.dQ]

        # estimate readlength/records_approx from the first record
        self.fd.seek(0)
        lines = [self.fd.readline() for i in range(4)]
        self.readlength = len(lines[1].strip('\r\n'))
        if self.gz:
            # uncompressed size is not known
            self.records_approx = None
        else:
            # floor division keeps the estimate an integer on python 3, too
            self.records_approx = (
                    os.path.getsize(self.fname) // len(''.join(lines)))
            if self.fname2 is not None:
                self.records_approx *= 2

    except Exception:
        # do not leak the file that was opened above
        if opened_here:
            self.fd.close()
        raise

    # output some infos
    if not quiet:
        if self.gz:
            lo.info('gzipped fastq : readlength=? records_approx=? dQ=%d variants=%s' % (
                    self.dQ, str(self.variants)))
        else:
            lo.info('fastq : readlength=%d records_approx=%d dQ=%d variants=%s' % (
                    self.readlength, self.records_approx, self.dQ, str(self.variants)))