Example #1
def illustrate(args):

    data = json.load(open(args.file))

    testsuite_paths = discover_testsuites(args.testsuite_directory or [])
    testsuites = {}
    update_testsuites(testsuites, data['info']['testsuites'], testsuite_paths)

    analyser = analyse.Analyser()
    lo.info('loading json-file ' + args.file)
    analyser.decode(testsuites, data)
    lo.info('updating testsuites')
    analyser.update_testsuites()

    if args.readlengths:
        rls = analyser.stats['readlengths']

        hist = TextHist()
        print(hist.draw(rls, indexed=True))

    if args.coverage:
        for name, testsuite in analyser.testsuites.items():
            print(name + ':')
            for test in testsuite.tests:
                print('  - %s : %s' % (test, analyser[test]))
            print()

    if args.results:
        for testsuite, results in analyser.results.items():
            print('\n'+testsuite)
            print('-'*len(testsuite))
            pprint(results)
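
The handler above expects an argparse-style ``args`` object. A minimal sketch of driving it directly; the attribute names are taken from the code above, while the file name and flag values are assumptions:

import argparse

# hypothetical invocation; 'scan.json' and the flag values are assumptions
args = argparse.Namespace(
    file='scan.json',              # .json result file from a previous KvarQ scan
    testsuite_directory=None,      # fall back to the default testsuite paths
    readlengths=True,              # print a text histogram of read lengths
    coverage=True,                 # print per-test coverage for every testsuite
    results=False,                 # skip the per-testsuite result dump
)
illustrate(args)
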
Example #2
def summarize(args):

    js = JsonSummary()
    for fname in args.json:
        lo.info('processing ' + fname)
        js.add(fname)

    js.dump()
Example #3
    def __init__(self, path, identifier=None, description=None):
        """
        :param path: name of file to read bases from; can be ``.bases``
            file that directly contains base sequence (without any
            whitespace) or a file in FASTA format (only first genome is read)
        :param identifier: short identifier of genome; will be read from
            FASTA file if none specified
        :param description: text description; will also be read from
            FASTA file if none specified
        """
        self.path = path
        self.f = open(path, "r")

        if self.f.read(1) == ">":
            self.fasta = True
            self.f.seek(0)
            defline = self.f.readline()
            idx = defline.find(" ")
            if identifier is None:
                if idx == -1:
                    identifier = defline[1:]
                else:
                    identifier = defline[1:idx]
            if description is None:
                if idx != -1 and idx < len(defline):
                    description = defline[idx + 1 :]

            # read whole sequence into memory
            self.bases = ""
            self.bases = "".join([line.rstrip("\n\r") for line in self.f.readlines()])
            if ">" in self.bases:
                lo.info("%s contains several genomes; only first read" % path)
                self.bases = self.bases[: self.bases.index(">")]
            self.size = len(self.bases)
            self.f.close()
            lo.debug('read %d bytes FASTA sequence "%s" into memory' % (self.size, identifier))

        else:
            self.fasta = False
            self.f.seek(0, 2)
            self.size = self.f.tell()

        self.identifier = identifier
        self.description = description
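
A short usage sketch for this constructor; the class name ``Genome`` and the FASTA file name are assumptions, since neither appears in the snippet:

# hypothetical usage; the class name Genome and 'reference.fasta' are assumed
genome = Genome('reference.fasta')
print(genome.identifier, genome.size)   # identifier parsed from the defline, sequence length
print(genome.description)               # rest of the defline, if present
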
Example #4
def update(args):

    if args.fastq:
        lo.warning('re-reading of hits not currently implemented')

    data = json.load(open(args.json))

    testsuite_paths = discover_testsuites(args.testsuite_directory or [])
    testsuites = {}
    update_testsuites(testsuites, data['info']['testsuites'], testsuite_paths)

    analyser = analyse.Analyser()
    analyser.decode(testsuites, data)
    analyser.update_testsuites()

    # save results back to .json
    data = analyser.encode(hits=analyser.hits is not None)
    with codecs.open(args.json, 'w', 'utf-8') as j:
        json.dump(data, j, indent=2)
    lo.info('re-wrote results to file ' + args.json)
Example #5
def show(args):

    fastq = Fastq(args.file)

    if args.quality:
        Amin = fastq.Q2A(args.quality)
        n = args.number
        points = args.points
        lo.info('determining readlengths with quality>=%d of %s '
                'by reading %d records at %d points'%(
                args.quality, args.file, n, points))
        rls = fastq.lengths(Amin, n=n, points=points)

        hist = TextHist()
        print(hist.draw(sorted(rls)))

    if args.info:
        print('dQ=' + str(fastq.dQ))
        print('variants=' + str(fastq.variants))
        print('readlength=' + str(fastq.readlength))
        print('records_approx=' + str(fastq.records_approx or '?'))
Example #6
def convert_legacy_data(testsuites, data):
    '''
    Try to convert data from older versions of KvarQ; raise
    :py:class:`kvarq.analyse.VersionConflictException` or
    :py:class:`kvarq.analyse.DataInconcistencyException` if the data
    cannot be converted.

    :param testsuites: dictionary of :py:class:`kvarq.genes.Testsuite`
    :param data: dictionary as returned by :py:meth:`kvarq.analyse.Analyser.encode`,
        possibly written by a previous version of KvarQ
    :returns: object as returned by :py:meth:`kvarq.analyse.Analyser.encode` of
        the current version of KvarQ
    '''
    from kvarq.analyse import VersionConflictException, DataInconcistencyException

    kvarq_version = list(StrictVersion(VERSION).version)
    version = list(StrictVersion(data['info']['version']).version)

    if version[1] < 10:
        raise VersionConflictException('cannot load files v<0.10')

    # convert tests -> coverages
    if version[0] == 0 and version[1] == 10:

        # load data
        templates_by_testname = dict(reduce(lambda x, y: x + y, [[
                (str(test), test.template) for test in testsuite.tests
            ] for testsuite in testsuites.values()]))

        coverages_by_testname = dict(reduce(lambda x, y: x + y,
                [data_testsuite.items() for data_testsuite in data['testsuites'].values()]
            ))

        # convert test-nr -> coverage-nr
        nrmap = []
        coverages = OrderedDict()

        for i, testname in enumerate(data['tests']):

            if testname not in templates_by_testname:
                lo.info('json contains additional test "%s"; discarding.' % testname)
                continue

            templatename = str(templates_by_testname[testname])
            coverage = coverages_by_testname[testname]

            if templatename in coverages:
                assert coverages[templatename] == coverage, DataInconcistencyException(
                        'found contradicting coverages for template "%s" : "%s" / "%s"' %
                        (templatename, coverages[templatename], coverage))
            else:
                coverages[templatename] = coverage
                nrmap.append(i)

        # save to data
        data['coverages'] = [(k, v) for k, v in coverages.items()]
        lo.debug('mapping "nseqhits", "nseqbasehits" : (%d) %s' % (len(nrmap), str(nrmap)))
        for key in ['nseqhits', 'nseqbasehits']:
            if key not in data['stats']:
                lo.info('no stats/%s found (old json version)' % key)
                continue
            data['stats'][key] = [
                    data['stats'][key][nrmap[coveragenr]] 
                    for coveragenr in range(len(coverages)) # forward
                ] + [
                    data['stats'][key][nrmap[coveragenr] + len(data['tests'])] 
                    for coveragenr in range(len(coverages)) # reverse
                ]

        # clean up
        del data['testsuites']
        del data['tests']
        version[1] += 1

    # convert info.fastq/info.size to lists
    if version[0] == 0 and version[1] == 11:
        data['info']['fastq'] = [data['info']['fastq']]
        data['info']['size'] = [data['info']['size']]
        version[1] += 1

    assert version[0] == kvarq_version[0] and version[1] == kvarq_version[1], \
            VersionConflictException('could not upgrade data beyond version "%d.%d"' %
                    (version[0], version[1]))

    return data
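
A hedged sketch of calling the converter on a legacy result file, reusing the helpers shown in the other examples; the file name is hypothetical:

import json

# hypothetical legacy result file written by an older KvarQ version
data = json.load(open('old_scan.json'))

# populate the testsuites dictionary as in the other examples
testsuite_paths = discover_testsuites([])
testsuites = {}
update_testsuites(testsuites, data['info']['testsuites'], testsuite_paths)

# upgrade the data to the current format; raises VersionConflictException or
# DataInconcistencyException if the data cannot be converted
data = convert_legacy_data(testsuites, data)
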
Example #7
    analyser = analyse.Analyser()

    if not args.force:
        if os.path.exists(args.json):
            lo.error('will not overwrite file ' + args.json)
            sys.exit(ERROR_FILE_EXISTS)
        if args.extract_hits and os.path.exists(args.extract_hits):
            lo.error('will not overwrite file ' + args.extract_hits)
            sys.exit(ERROR_FILE_EXISTS)

    # do scanning {{{2

    mb = os.path.getsize(args.fastq) / 1024 / 1024
    lo.info('scanning {} ({})...'.format(
            ', '.join(fastq.filenames()),
            ', '.join(['%.2f MB' % (filesize/1024.**2) for filesize in fastq.filesizes()])
        ))
    t0 = time.time()

    class AnalyseThread(threading.Thread):

        def __init__(self, analyser):
            super(AnalyseThread, self).__init__(name='analyse-thread')
            self.analyser = analyser
            self.finished = False
            self.exception = None
            self.traceback = None

        def run(self):
            try:
                self.analyser.spacing = args.spacing
Example #8
    def __init__(self, fname, variant=None, fd=None, paired=False, quiet=False):
        '''
        open a ``.fastq`` or ``.fastq.gz`` file and determine its
        variant (setting the attribute ``.Azero`` accordingly)

        :param fname: name of the file to open
        :param variant: specify one of ``.vendor_variants`` -- if none
            is specified, the PHRED scores of the fastq file are
            analyzed and the variant is guessed from the compatible
            vendor variants
        :param fd: specify a file descriptor to use instead of
            opening ``fname``
        :param paired: include the second file of a paired set if it is
            available (i.e. specify "file_1.fastq" as input file and
            "file_2.fastq" will be included in ``.filesizes()``
            and ``.filenames()``)
        :param quiet: suppress the informational log message emitted
            after the file has been analyzed
        '''
        self.fname = fname

        if fd:
            self.fd = fd
        else:
            self.fd = None

        if self.fname.endswith('.fastq.gz'):
            self.gz = True
            if not self.fd:
                self.fd = gzip.GzipFile(self.fname, 'rb')
        elif self.fname.endswith('.fastq'):
            self.gz = False
            if not self.fd:
                self.fd = open(self.fname, 'rb')
        else:
            raise FastqFileFormatException(
                        'fastq file must have extension ".fastq" or ".fastq.gz"')

        # save second name of base if exists
        self.fname2 = None
        if paired:
            base = fname[:fname.rindex('.fastq')]
            if base[-2:] == '_1':
                fname2 = base[:-2] + '_2' + fname[fname.rindex('.fastq'):]
                if os.path.exists(fname2):
                    lo.info('including paired file "%s"' % fname2)
                    self.fname2 = fname2

        if sum(self.filesizes()) == 0:
            raise FastqFileFormatException('cannot scan empty file')

        # scan some records
        min_pos, max_pos = self.min_max_score_check_file()
        lo.debug('min_pos=%d max_pos=%d' % (min_pos, max_pos))

        if variant and variant not in self.vendor_variants:
            raise FastqFileFormatException(
                    'unknown vendor variant "%s"' % variant)

        # create list of variants compatible with PHRED scores
        variants = []
        dQs = []
        for name, vendor_variant in Fastq.vendor_variants.items():

            if ((min_pos - vendor_variant.dQ) in vendor_variant.Qrange
                    and (max_pos - vendor_variant.dQ) in vendor_variant.Qrange):
                dQs.append(vendor_variant.dQ)
                variants.append(name)

        if variant is None:
            # set variant from guesses
            if not variants:
                raise FastqFileFormatException(
                        'could not find any suitable fastq vendor variant')
            if len(set(dQs)) > 1:
                raise FastqFileFormatException(
                        'cannot determine dQ with guessed vendor variants "%s"'
                        % str(variants))
            self.variants = variants
            self.dQ = dQs[0]
        else:
            # check specified variant
            if variant not in variants:
                lo.warning('specified vendor variant "%s" seems not to be '
                        'compatible with file' % variant)
            self.variants = [variant]
            self.dQ = self.vendor_variants[variant].dQ


        self.Azero = self.ASCII[self.dQ]

        # estimate readlength/records_approx
        self.fd.seek(0)
        lines = [self.fd.readline() for i in range(4)]
        self.readlength = len(lines[1].strip('\r\n'))
        if self.gz:
            self.records_approx = None
        else:
            self.records_approx = os.path.getsize(self.fname) / len(''.join(lines))
            if self.fname2 is not None:
                self.records_approx *= 2

        # output some infos
        if not quiet:
            if self.gz:
                lo.info('gzipped fastq : readlength=? records_approx=? dQ=%d variants=%s' % (
                        self.dQ, str(self.variants)))
            else:
                lo.info('fastq : readlength=%d records_approx=%d dQ=%d variants=%s' % (
                        self.readlength, self.records_approx, self.dQ, str(self.variants)))
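
A minimal usage sketch for this constructor, assuming it belongs to the ``Fastq`` class used in the other examples and that a file ``reads_1.fastq`` exists (with an optional ``reads_2.fastq`` next to it); both file names are assumptions:

# hypothetical usage; file names are assumptions
fq = Fastq('reads_1.fastq', paired=True)    # also picks up reads_2.fastq if it exists
print(fq.dQ, fq.variants)                   # PHRED offset and compatible vendor variants
print(fq.readlength, fq.records_approx)     # estimated from the first record and the file size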