def illustrate(args):
    with open(args.file) as f:
        data = json.load(f)
    testsuite_paths = discover_testsuites(args.testsuite_directory or [])
    testsuites = {}
    update_testsuites(testsuites, data['info']['testsuites'], testsuite_paths)

    analyser = analyse.Analyser()
    lo.info('loading json-file ' + args.file)
    analyser.decode(testsuites, data)
    lo.info('updating testsuites')
    analyser.update_testsuites()

    if args.readlengths:
        rls = analyser.stats['readlengths']
        hist = TextHist()
        print(hist.draw(rls, indexed=True))

    if args.coverage:
        for name, testsuite in analyser.testsuites.items():
            print(name + ':')
            for test in testsuite.tests:
                print(' - %s : %s' % (test, analyser[test]))
            print()

    if args.results:
        for testsuite, results in analyser.results.items():
            print('\n' + testsuite)
            print('-' * len(testsuite))
            pprint(results)
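# A minimal invocation sketch for the CLI handlers in this module
# (assumption: they receive an ``argparse.Namespace`` from the command line
# dispatcher; the file name and flag values below are hypothetical):
#
#   from argparse import Namespace
#   illustrate(Namespace(file='scan.json', testsuite_directory=[],
#                        readlengths=True, coverage=False, results=True))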
def summarize(args):
    js = JsonSummary()
    for fname in args.json:
        lo.info('processing ' + fname)
        js.add(fname)
    js.dump()
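# Same pattern for ``summarize`` -- ``args.json`` is a list of file names
# (sketch; the glob pattern is hypothetical):
#
#   import glob
#   from argparse import Namespace
#   summarize(Namespace(json=sorted(glob.glob('scans/*.json'))))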
def __init__(self, path, identifier=None, description=None):
    """
    :param path: name of file to read bases from; can be a ``.bases`` file
        that directly contains the base sequence (without any whitespace)
        or a file in FASTA format (only first genome is read)
    :param identifier: short identifier of genome; will be read from
        FASTA file if none specified
    :param description: text description; will also be read from FASTA
        file if none specified
    """
    self.path = path
    self.f = open(path, "r")

    if self.f.read(1) == ">":
        self.fasta = True
        self.f.seek(0)
        defline = self.f.readline()
        idx = defline.find(" ")
        if identifier is None:
            if idx == -1:
                identifier = defline[1:]
            else:
                identifier = defline[1:idx]
        if description is None:
            if idx != -1 and idx < len(defline):
                description = defline[idx + 1:]

        # read whole sequence into memory
        self.bases = "".join([line.rstrip("\n\r") for line in self.f.readlines()])
        if ">" in self.bases:
            lo.info("%s contains several genomes; only first read" % path)
            self.bases = self.bases[:self.bases.index(">")]
        self.size = len(self.bases)
        self.f.close()
        lo.debug('read %d bytes FASTA sequence "%s" into memory'
                 % (self.size, identifier))
    else:
        self.fasta = False
        self.f.seek(0, 2)
        self.size = self.f.tell()

    self.identifier = identifier
    self.description = description
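# Usage sketch (the path and FASTA header are hypothetical): for a file
# ``genome.fasta`` starting with the defline
# ``>NC_000962.3 Mycobacterium tuberculosis H37Rv``, the constructor parses
# identifier and description out of the defline and slurps the sequence:
#
#   g = Genome('genome.fasta')
#   g.identifier     # 'NC_000962.3'
#   g.description    # 'Mycobacterium tuberculosis H37Rv\n' (readline() tail)
#   g.bases[:60]     # first 60 bases, line breaks stripped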
def update(args):
    if args.fastq:
        lo.warning('re-reading of hits not currently implemented')

    with open(args.json) as f:
        data = json.load(f)
    testsuite_paths = discover_testsuites(args.testsuite_directory or [])
    testsuites = {}
    update_testsuites(testsuites, data['info']['testsuites'], testsuite_paths)

    analyser = analyse.Analyser()
    analyser.decode(testsuites, data)
    analyser.update_testsuites()

    # save results back to .json
    data = analyser.encode(hits=analyser.hits is not None)
    with codecs.open(args.json, 'w', 'utf-8') as j:
        json.dump(data, j, indent=2)
    lo.info('re-wrote results to file ' + args.json)
def show(args):
    fastq = Fastq(args.file)

    if args.quality:
        Amin = fastq.Q2A(args.quality)
        n = args.number
        points = args.points
        lo.info('determining readlengths with quality>=%d of %s '
                'by reading %d records at %d points' % (
                    args.quality, args.file, n, points))
        rls = fastq.lengths(Amin, n=n, points=points)
        hist = TextHist()
        print(hist.draw(sorted(rls)))

    if args.info:
        print('dQ=' + str(fastq.dQ))
        print('variants=' + str(fastq.variants))
        print('readlength=' + str(fastq.readlength))
        print('records_approx=' + str(fastq.records_approx or '?'))
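# Note on ``Q2A`` as used above: fastq quality scores are PHRED values stored
# as ASCII characters with a vendor-specific offset; judging from
# ``Azero = ASCII[dQ]`` in ``Fastq.__init__`` below, ``dQ`` shifts relative
# to the Sanger base of chr(33). A standalone sketch of the same conversion
# (this helper is illustrative, not part of the module):
#
#   def q2a(q, dq=0):
#       # Sanger encoding stores PHRED 0 as chr(33); dq shifts for other vendors
#       return chr(33 + dq + q)
#
#   q2a(20)  # == '5', i.e. bases with quality >= 20 have score char >= '5'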
def convert_legacy_data(testsuites, data):
    '''
    :param testsuites: dictionary of :py:class:`kvarq.genes.Testsuite`
    :param data: dictionary as returned by
        :py:meth:`kvarq.analyse.Analyser.encode`, which can be of a
        previous version of KvarQ

    :returns: object as returned by
        :py:meth:`kvarq.analyse.Analyser.encode` of current version of KvarQ

    tries to convert data from older versions of KvarQ; raises
    :py:class:`kvarq.analyse.VersionConflictException` or
    :py:class:`kvarq.analyse.DataInconcistencyException` if data cannot
    be converted
    '''
    from kvarq.analyse import VersionConflictException, DataInconcistencyException

    kvarq_version = list(StrictVersion(VERSION).version)
    version = list(StrictVersion(data['info']['version']).version)

    if version[1] < 10:
        raise VersionConflictException('cannot load files v<0.10')

    # convert tests -> coverages
    if version[0] == 0 and version[1] == 10:

        # load data
        templates_by_testname = dict(reduce(lambda x, y: x + y, [[
                (str(test), test.template)
                for test in testsuite.tests
            ] for testsuite in testsuites.values()]))
        coverages_by_testname = dict(reduce(lambda x, y: x + y,
            [list(data_testsuite.items())
                for data_testsuite in data['testsuites'].values()]))

        # convert test-nr -> coverage-nr
        nrmap = []
        coverages = OrderedDict()
        for i, testname in enumerate(data['tests']):
            if testname not in templates_by_testname:
                lo.info('json contains additional test "%s"; discarding.' % testname)
                continue
            templatename = str(templates_by_testname[testname])
            coverage = coverages_by_testname[testname]
            if templatename in coverages:
                if coverages[templatename] != coverage:
                    raise DataInconcistencyException(
                            'found contradicting coverages for template '
                            '"%s" : "%s" / "%s"' % (templatename,
                                coverages[templatename], coverage))
            else:
                coverages[templatename] = coverage
                nrmap.append(i)

        # save to data
        data['coverages'] = [(k, v) for k, v in coverages.items()]

        lo.debug('mapping "nseqhits", "nseqbasehits" : (%d) %s'
                 % (len(nrmap), str(nrmap)))
        for key in ['nseqhits', 'nseqbasehits']:
            if key not in data['stats']:
                lo.info('no stats/%s found (old json version)' % key)
                continue
            data['stats'][key] = [
                    data['stats'][key][nrmap[coveragenr]]
                    for coveragenr in range(len(coverages))  # forward
                ] + [
                    data['stats'][key][nrmap[coveragenr] + len(data['tests'])]
                    for coveragenr in range(len(coverages))  # reverse
                ]

        # clean up
        del data['testsuites']
        del data['tests']

        version[1] += 1

    # convert info.fastq/info.size to lists
    if version[0] == 0 and version[1] == 11:
        data['info']['fastq'] = [data['info']['fastq']]
        data['info']['size'] = [data['info']['size']]
        version[1] += 1

    if version[0] != kvarq_version[0] or version[1] != kvarq_version[1]:
        raise VersionConflictException(
                'could not elevate version more than to "%d.%d"'
                % (version[0], version[1]))

    return data
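# Sketch of the v0.10 -> v0.11 step above (all names and values are
# hypothetical): a v0.10 file stores coverages per *test*,
#
#   data['tests']      == ['lineage::snp1', 'resistance::snp2']
#   data['testsuites'] == {'lineage': {'lineage::snp1': cov1},
#                          'resistance': {'resistance::snp2': cov2}}
#
# while the converted file keys them by *template* and drops the two old
# entries; ``nrmap`` records which old test index supplied each coverage so
# the forward/reverse halves of ``stats['nseqhits']`` and
# ``stats['nseqbasehits']`` can be reordered to match:
#
#   data['coverages']  == [('template1', cov1), ('template2', cov2)]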
analyser = analyse.Analyser()

if not args.force:
    if os.path.exists(args.json):
        lo.error('will not overwrite file ' + args.json)
        sys.exit(ERROR_FILE_EXISTS)
    if args.extract_hits and os.path.exists(args.extract_hits):
        lo.error('will not overwrite file ' + args.extract_hits)
        sys.exit(ERROR_FILE_EXISTS)

# do scanning {{{2

mb = os.path.getsize(args.fastq) / 1024 / 1024
lo.info('scanning {} ({})...'.format(
        ', '.join(fastq.filenames()),
        ', '.join(['%.2f MB' % (filesize / 1024.**2)
                   for filesize in fastq.filesizes()])))
t0 = time.time()

class AnalyseThread(threading.Thread):

    def __init__(self, analyser):
        super(AnalyseThread, self).__init__(name='analyse-thread')
        self.analyser = analyser
        self.finished = False
        self.exception = None
        self.traceback = None

    def run(self):
        try:
            self.analyser.spacing = args.spacing
def __init__(self, fname, variant=None, fd=None, paired=False, quiet=False):
    '''
    open ``.fastq`` or ``.fastq.gz`` file and determine its variant
    (setting attribute ``.Azero`` accordingly)

    :param fname: name of file to open
    :param variant: specify one of ``.vendor_variants`` -- if none is
        specified, then the PHRED scores of the fastq file are analyzed
        and the variant is guessed from them
    :param fd: specify a file descriptor to use instead of opening ``fname``
    :param paired: include second file in a paired set if it is available
        (i.e. specify "file_1.fastq" as input file and "file_2.fastq" will
        be included in functions ``.filesize()`` and ``.filenames()``)
    :param quiet: do not emit the summarizing info log line
    '''
    self.fname = fname

    if fd:
        self.fd = fd
    else:
        self.fd = None

    if self.fname.endswith('.fastq.gz'):
        self.gz = True
        if not self.fd:
            self.fd = gzip.GzipFile(self.fname, 'rb')
    elif self.fname.endswith('.fastq'):
        self.gz = False
        if not self.fd:
            self.fd = open(self.fname, 'rb')
    else:
        raise FastqFileFormatException(
                'fastq file must have extension ".fastq" or ".fastq.gz"')

    # save name of second file of pair if it exists
    self.fname2 = None
    if paired:
        base = fname[:fname.rindex('.fastq')]
        if base[-2:] == '_1':
            fname2 = base[:-2] + '_2' + fname[fname.rindex('.fastq'):]
            if os.path.exists(fname2):
                lo.info('including paired file "%s"' % fname2)
                self.fname2 = fname2

    if sum(self.filesizes()) == 0:
        raise FastqFileFormatException('cannot scan empty file')

    # scan some records
    min_pos, max_pos = self.min_max_score_check_file()
    lo.debug('min_pos=%d max_pos=%d' % (min_pos, max_pos))

    if variant and variant not in self.vendor_variants:
        raise FastqFileFormatException(
                'unknown vendor variant "%s"' % variant)

    # create list of variants compatible with PHRED scores
    variants = []
    dQs = []
    for name, vendor_variant in Fastq.vendor_variants.items():
        if ((min_pos - vendor_variant.dQ) in vendor_variant.Qrange and
                (max_pos - vendor_variant.dQ) in vendor_variant.Qrange):
            dQs.append(vendor_variant.dQ)
            variants.append(name)

    if variant is None:
        # set variant from guesses
        if not variants:
            raise FastqFileFormatException(
                    'could not find any suitable fastq vendor variant')
        if len(set(dQs)) > 1:
            raise FastqFileFormatException(
                    'cannot determine dQ with guessed vendor variants "%s"'
                    % str(variants))
        self.variants = variants
        self.dQ = dQs[0]
    else:
        # check specified variant
        if variant not in variants:
            lo.warning('specified vendor variant "%s" seems not to be '
                       'compatible with file' % variant)
        self.variants = [variant]
        self.dQ = self.vendor_variants[variant].dQ

    self.Azero = self.ASCII[self.dQ]

    # estimate readlength/records_approx
    self.fd.seek(0)
    lines = [self.fd.readline() for i in range(4)]
    self.readlength = len(lines[1].strip('\r\n'))
    if self.gz:
        self.records_approx = None
    else:
        self.records_approx = os.path.getsize(self.fname) // len(''.join(lines))
        if self.fname2 is not None:
            self.records_approx *= 2

    # output some infos
    if not quiet:
        if self.gz:
            lo.info('gzipped fastq : readlength=? records_approx=? dQ=%d variants=%s'
                    % (self.dQ, str(self.variants)))
        else:
            lo.info('fastq : readlength=%d records_approx=%d dQ=%d variants=%s'
                    % (self.readlength, self.records_approx,
                       self.dQ, str(self.variants)))
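# Usage sketch (the file name is hypothetical):
#
#   fq = Fastq('sample_1.fastq', paired=True)  # also picks up sample_2.fastq
#   fq.dQ, fq.variants          # guessed PHRED offset and vendor variant(s)
#   fq.readlength               # length of first record's base sequence
#   fq.records_approx           # file size / size of first record (None if .gz)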