Example no. 1
    def demultiplex(self, codes, fname):
        '''Actually run the demultiplexing, using demuxIllumina.'''

        # look up adapters from database,
        # write sampleSheet file
        LOGGER.debug("Making sample sheet.")
        sheet = self.make_sample_sheet(codes, fname)
        LOGGER.info("Sample sheet created.")
        # invoke demultiplexer
        cmd = [self.demux_prog, '-d', sheet,
               fname]  # demuxIllumina v2.0 and above
        LOGGER.debug("Command for demultiplexing: %s", " ".join(cmd))
        pout = call_subprocess(cmd, path=self.conf.hostpath)
        fnpat = re.compile(r"tag\s+(\w+):\s+([^\s]+)\s*$")
        fnset = set()
        lostpat = re.compile(r"lost\s+([\/\d]+)\s+reads")
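        # demuxIllumina is expected to emit lines of the form (values
        # hypothetical) "tag ACGTAC: /path/to/out_ACGTAC.fq" and
        # "lost 123/456 reads"; the two regexes above capture these.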
        for line in pout:
            matchobj = fnpat.match(line)
            if matchobj:
                fnset.add(matchobj.group(2))
            else:
                matchobj2 = lostpat.match(line)
                if matchobj2:
                    LOGGER.info("lost %s reads", matchobj2.group(1))

        for outfname in fnset:  # avoid shadowing the fname argument
            set_file_permissions(self.conf.group, outfname)

        # Delete the sample sheet.
        os.unlink(sheet)

    def get_genome_size_file(self, genome):
        '''
    Retrieve the file containing chromosome lengths for a given
    genome. Returns two values: the filename, and a flag indicating
    whether that file should be treated as temporary, i.e. deleted
    once done with. Such deletion is the responsibility of the
    calling code.
    '''
        fnchrlen = os.path.join(self.conf.genomesizedir, genome + ".fa.length")
        if not os.path.exists(fnchrlen):
            tmpfile = NamedTemporaryFile(delete=False, dir=self.conf.tmpdir)
            cmd = "%s %s > %s" % ('fetchChromSizes', genome, tmpfile.name)

            # Note - assumes we're running on our primary host. FIXME?
            call_subprocess(cmd, shell=True, path=self.conf.hostpath)
            tmpfile.close()
            try:
                LOGGER.info("Storing new chromosome sizes file as %s",
                            fnchrlen)
                move(tmpfile.name, fnchrlen)
                set_file_permissions(self.conf.group, fnchrlen)
            except Exception, err:
                LOGGER.warning(
                    "Attempt to store chromosome sizes file" +
                    " as %s failed: %s", fnchrlen, err)
                return (tmpfile.name, True)
        return (fnchrlen, False)
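
    # A minimal caller sketch (hypothetical genome code and downstream step),
    # honouring the contract that temporary files are deleted by the caller:
    #
    #   (sizes_fn, is_tmp) = self.get_genome_size_file('mm10')
    #   try:
    #       run_downstream_step(sizes_fn)
    #   finally:
    #       if is_tmp:
    #           os.unlink(sizes_fn)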
Example no. 3
    def make_bed_graph(self, aln):
        '''
    Code wrapper for makeWiggle.
    '''
        aln = Alignment.objects.get(
            id=aln.id)  # Reload passed object within transaction.
        bed = aln.alnfile_set.filter(filetype=self.bedtype).exclude(
            filename__contains='chr21')[0]

        # Note makeWiggle can read gzipped bed files directly; we use that fact here.
        lib = aln.lane.library
        bedFN = bed.repository_file_path

        # Write to local directory first.
        bgrBASE = os.path.splitext(bed.filename)[0]
        bgrFN = bgrBASE + self.bgrtype.suffix
        cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
        LOGGER.debug(cmd)
        if not self.testMode:
            call_subprocess(cmd, shell=True, path=self.conf.hostpath)
            if not os.path.exists(bgrFN):
                LOGGER.error("Failed to create bgr file '%s'" % (bgrFN, ))
            else:
                chksum = checksum_file(bgrFN)
                bgr = Alnfile(filename=os.path.basename(bgrFN),
                              checksum=chksum,
                              filetype=self.bgrtype,
                              alignment=aln)
                bgrFN = rezip_file(bgrFN)
                move(bgrFN, bgr.repository_file_path)
                set_file_permissions(self.conf.group, bgr.repository_file_path)
                bgr.save()
def _save_file_to_database(fname, aln, chksum):
  '''
  Transaction-managed smallest unit of work that we can do with the
  database to save a file to a given Alignment.
  '''
  aln = Alignment.objects.get(id=aln.id) # Reload passed object within transaction.
  filetype = Filetype.objects.guess_type(fname)
  LOGGER.debug("Found filetype: %s", filetype)
  basefn = os.path.split(fname)[1]
  LOGGER.debug("basefn: '%s'", basefn)
  fnparts = os.path.splitext(basefn)
  if fnparts[1] == CONFIG.gzsuffix:
    basefn = fnparts[0]
  LOGGER.debug("basefn: '%s'", basefn)
  afile = Alnfile.objects.create(filename=basefn,
                                 checksum=chksum, filetype=filetype,
                                 alignment=aln)

  # Move files to permanent locations.
  destname = afile.repository_file_path
  LOGGER.debug("Moving %s to %s", fname, destname)
  move(fname, destname)
  set_file_permissions(CONFIG.group, destname)

  LOGGER.info("Added '%s' to '%s'", fname, aln.lane.library.code)
Example no. 5
def replace_repo_file(bam, newbam):

  bam = Alnfile.objects.get(id=bam.id) # Reload passed object within transaction.
  set_file_permissions(CONFIG.group, newbam)
  checksum = checksum_file(newbam, unzip=False)
  bam.checksum = checksum
  os.unlink(bam.repository_file_path)
  move(newbam, bam.repository_file_path)
  bam.save()
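
# Hypothetical usage of the above: swap a freshly reprocessed bam into the
# repository slot of an existing Alnfile record. The record and path below
# are illustrative; a managed transaction is assumed, given the object
# reload on entry.
def _example_replace_bam(bam_record):
  replace_repo_file(bam_record, '/tmp/reprocessed.bam')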
Example no. 6
def run_qc(fnames, workdir, destination=None, cleanup=True, register=False):

    with LaneFastQCReport(fastqs=fnames, workdir=workdir, lane=0) as qc:
        # Generate qc reports
        qc.run_fastqc(qc.fastqs)
        qc.postprocess_results(qc.fastqs)

        # create list of disk files and if needed compress some of them before.
        dfiles = []
        # NB! This is not elegant; a better test would be
        # "if ftype.gzip and os.path.splitext(fname)[1] != CONFIG.gzsuffix:".
        # However, this code is deliberately set up not to interact with the
        # database directly.
        for fn in qc.output_files:
            if fn.endswith('txt') or fn.endswith('tar'):
                dfn = rezip_file(fn)
                dfiles.append(dfn)
            else:
                dfiles.append(fn)

        if destination is not None:
            # transfer files to destination
            for dfn in dfiles:
                # set permissions
                set_file_permissions(CONFIG.group, dfn)
                # transfer file
                transfer_file(dfn, destination)

        if register:
            # register QC files in repository
            argslist = []
            for (fn, md5) in zip(qc.output_files, qc.output_md5s):
                argslist.append(os.path.basename(fn))
                argslist.append(md5)
            # register files in repository
            cmd = "cs_addFile.py --qcfile -M --program_name %s " % qc.program_name
            cmd += " ".join(argslist)
            print "Executing \"%s\" ..." % cmd
            subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
            (stdout, stderr) = subproc.communicate()
            retcode = subproc.wait()
            if stdout:
                sys.stdout.write(stdout)
            if stderr:
                sys.stderr.write(stderr)

        if cleanup:
            # remove local files
            # assuming fastqc report dir is still around, construct dirname.
            # NB! A cleaner way would be to save the dir name to self.bpath in postprocess_results in LaneQCReport class and use this value.
            #     Even better, perhaps LaneFastQCReport should be implemented to keep track of all temporary files it creates.
            for dfn in dfiles:
                os.remove(dfn)
                if dfn.endswith('pdf'):
                    fqc_dirname = os.path.splitext(dfn)[0]
                    rmtree(fqc_dirname)
                    zipfile = fqc_dirname + '.zip'
                    os.remove(zipfile)
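
# A hypothetical driver for run_qc (paths and destination host are
# illustrative): build FastQC reports for one lane's fastq files, push the
# outputs to a remote destination, and remove the local copies.
def _example_run_qc():
    run_qc(['lib1.r_1.fq.gz', 'lib1.r_2.fq.gz'],
           workdir='/tmp/qcwork',
           destination='qchost:/data/qc',
           cleanup=True)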
        # Bedgraph files are still useful for assessing genome- or
        # exome-wide coverage, so they are generated unconditionally.
        bedgraphs = self.generate_bedgraph_files(beds)

        # Only quantitative library types get (big)wig files.
        wigs = []
        bigwigs = []
        if lib.libtype.code not in self.conf.nonquant_libtypes:
            wigs = self.generate_wig_files(beds)
            bigwigs = self.generate_bigwig_files(bedgraphs, chrom_sizes)

        # We're now done with the chrom_sizes file.
        if chr_istmp:
            os.unlink(chrom_sizes)

        # Set group ownership and permissions appropriately
        grp = self.conf.group
        set_file_permissions(grp, in_fn)
        for bed in beds:
            set_file_permissions(grp, bed)
        for wig in wigs:
            set_file_permissions(grp, wig)
        for bgr in bedgraphs:
            set_file_permissions(grp, bgr)
        for bwig in bigwigs:
            set_file_permissions(grp, bwig)

        # compress bed file(s)
        bedgz = []
        for bed in beds:
            gzname = rezip_file(bed)
            bedgz.append(gzname)
class ExternalDataHandler(object):
    '''
  Class used to add data files to a single lane in the repository. The
  lane in question will be created if it does not already exist.
  '''
    __slots__ = ('config', 'library', 'lanenum', 'keepfastq', 'genome',
                 'facility', 'machine', 'flowcell', 'rundate', 'url',
                 'runnumber')

    def __init__(self,
                 libcode,
                 lanenum,
                 flowcell,
                 rundate,
                 runnumber=None,
                 url=None,
                 keepfastq=False,
                 genome=None,
                 facility='EXT',
                 machine='Unknown'):

        self.config = Config()
        self.library = Library.objects.get(code=libcode)
        self.lanenum = lanenum
        self.keepfastq = keepfastq
        self.facility = Facility.objects.get(code=facility)
        self.machine = machine
        self.flowcell = flowcell
        self.rundate = rundate
        self.url = url
        self.runnumber = runnumber

        if genome is None:
            self.genome = self.library.genome
        else:
            self.genome = Genome.objects.get(code=genome)

    def open_fastq(self, fastq):
        '''
    Return a file handle for a fastq file, transparently handling
    gzipped files.
    '''
        if is_zipped(fastq):
            from gzip import GzipFile
            gen = GzipFile
        else:
            gen = open
        return gen(fastq)

    def fastq_readlength(self, fastq):
        '''
    Guess the length of the reads in the fastq file. Assumes that the
    first read in the file is representative.
    '''
        # Currently just assumes that the second line is the first read, and
        # that it is representative.
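        # A fastq record spans four lines, e.g. (hypothetical read):
        #
        #   @read1
        #   ACGTACGTAC
        #   +
        #   IIIIIIIIII
        #
        # so the second line of the file is the first read's sequence.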
        LOGGER.info("Finding read length from fastq file %s...", fastq)
        rlen = None
        with self.open_fastq(fastq) as reader:
            for _num in range(2):
                line = reader.next()
            rlen = len(line.rstrip('\n'))

        return rlen

    def fastq_readcount(self, fastq):
        '''
    Count the number of reads in the fastq file.
    '''
        LOGGER.info("Counting reads in fastq file %s...", fastq)
        lcount = 0
        with self.open_fastq(fastq) as reader:
            for line in reader:
                lcount += 1

        return lcount / 4

    def create_unsaved_lane(self, fastqs):
        '''
    Gather statistics from the fastq files and generate a Lane object
    not yet saved to the database.
    '''
        status = Status.objects.get(code='complete')
        machine = Machine.objects.get(code__iexact=self.machine)

        # Don't save this to the db just yet.
        lane = Lane(library=self.library,
                    facility=self.facility,
                    lanenum=self.lanenum,
                    machine=machine,
                    flowcell=self.flowcell,
                    rundate=self.rundate,
                    summaryurl=self.url,
                    runnumber=self.runnumber,
                    status=status,
                    flowlane=0)

        if len(fastqs) == 1:
            lane.paired = False
        elif len(fastqs) == 2:
            lane.paired = True
        else:
            raise ValueError(
                "Can only process either one or two fastq files per lane.")

        # Assumes the first fastq is representative.
        lane.readlength = self.fastq_readlength(fastqs[0])
        lane.reads = self.fastq_readcount(fastqs[0])
        lane.passedpf = lane.reads  # This is a bit of an assumption FIXME?

        lanefiles = dict()
        if self.keepfastq:
            fqtype = Filetype.objects.get(code='fq')
            for fastq in fastqs:
                checksum = checksum_file(fastq)

                if is_zipped(fastq):
                    fname = re.sub('%s$' % self.config.gzsuffix, '', fastq)
                else:
                    fname = fastq

                try:
                    lfile = Lanefile.objects.get(lane=lane, filename=fname)
                except Lanefile.DoesNotExist:
                    lfile = Lanefile(lane=lane,
                                     filename=fname,
                                     checksum=checksum,
                                     filetype=fqtype)
                    fastq = self._check_file_zipped(fastq, lfile)
                    lanefiles[fastq] = lfile

        # Returns only the new lanefiles.
        return (lane, lanefiles)

    def create_unsaved_alignment(self, bam, lane):
        '''
    Gather statistics from the bam file, and generate a bed file and a
    new Alignment object unsaved to the database.
    '''
        bamtype = Filetype.objects.get(code='bam')
        bedtype = Filetype.objects.get(code='bed')

        # Generate a bed file from the bam.
        chrom_sizes = os.path.join(self.config.genomesizedir,
                                   self.genome.code + ".fa.length")
        if not os.path.exists(chrom_sizes):
            LOGGER.warning(
                "Unable to find chromosome sizes file %s. BED file reads will be untrimmed.",
                chrom_sizes)
            chrom_sizes = None
        bam2bed = BamToBedConverter(chrom_sizes=chrom_sizes)
        bambase = os.path.splitext(bam)[0]
        bed_fn = bambase + bedtype.suffix
        beds = bam2bed.convert(bam, bed_fn)
        if len(beds) == 1:
            bed = beds[0]
        else:
            raise ValueError("Unexpected results from BAM to BED conversion")

        LOGGER.info("Counting reads in generated bed file %s...", bed)
        (mapped, unique) = count_reads(bed)

        # We don't save this yet because we're not currently within a
        # transaction.
        aln = Alignment(lane=lane,
                        genome=self.genome,
                        total_reads=lane.total_passedpf,
                        mapped=mapped,
                        munique=unique)

        # Create bam Alnfile.
        LOGGER.info("Checksumming bam file %s...", bam)
        checksum = checksum_file(bam)
        bamobj = Alnfile(
            alignment=aln,
            filetype=bamtype,
            filename=bam,  # casually assume no gzipping FIXME
            checksum=checksum)

        # Create bed Alnfile
        LOGGER.info("Checksumming bed file %s...", bed)
        checksum = checksum_file(bed)
        bedobj = Alnfile(
            alignment=aln,
            filetype=bedtype,
            filename=bed,  # only just created, so not zipped
            checksum=checksum)

        bamkey = self._check_file_zipped(bam, bamobj)
        bedkey = self._check_file_zipped(bed, bedobj)

        alnfiles = {bamkey: bamobj, bedkey: bedobj}

        return (aln, alnfiles)

    def add(self, bam, fastqs=None, progname='bwa', progvers=None):
        '''
    Main entry point for the class.
    '''
        try:

            # A pre-existing lane is left almost unmolested.
            lane = Lane.objects.get(library=self.library,
                                    facility=self.facility,
                                    lanenum=self.lanenum)

        except Lane.DoesNotExist:

            # Creation of a new lane parses statistics from the fastq
            # file(s), runs fastqc and, if desired, will store the fastq file
            # itself in the repository.
            if fastqs is None:
                raise ValueError(
                    "Cannot create a new lane in the repository without" +
                    " the fastq files from which to harvest metadata.")
            (lane, lanefiles) = self.create_unsaved_lane(fastqs)
            self._save_lane_to_database(lane, lanefiles)

            if self.keepfastq:
                fastqs = None  # Use the fastqs now stored in the repository.

            # Note: this code doesn't understand updating pre-existing reports.
            with LaneFastQCReport(target=lane,
                                  fastqs=fastqs,
                                  path=os.environ['PATH']) as fastqc:
                fastqc.insert_into_repository()  # database transaction

        # Alignments are always appended to the lane; multiple alignments
        # may be added in this way.
        (aln, alnfiles) = self.create_unsaved_alignment(bam, lane)
        lane.mapped = aln.mapped
        lane.save()

        self._save_aln_to_database(aln, alnfiles, progname, progvers)

    def _check_file_zipped(self, fname, fobj):
        # Logging currently handled by the utilities module.
        zipped = is_zipped(fname)
        if fobj.filetype.gzip and not zipped:
            fname = rezip_file(fname, overwrite=True)
        elif not fobj.filetype.gzip and zipped:
            fname = unzip_file(fname, overwrite=True)
        return fname

    @transaction.atomic
    def _save_lane_to_database(self, lane, lanefiles):

        # Finally, save everything to database. We save before trying to
        # move the files because this step will raise an exception if
        # we're trying to overwrite a pre-existing file.
        lane.save()
        for lfile in lanefiles.values():
            lfile.lane = lane  # ensure lane_id has been set.
            lfile.save()

        # Move fastq files into position, if that's a thing we're doing.
        if self.keepfastq:
            for (fastq, lfile) in lanefiles.iteritems():

                # Handle whether or not we're zipping fastq files.
                fastq = self._check_file_zipped(
                    fastq,
                    lfile)  # defensive coding; this should already be set.
                dest = lfile.repository_file_path
                destdir = os.path.dirname(dest)
                if not os.path.exists(destdir):
                    os.makedirs(destdir)
                move(fastq, dest)
                set_file_permissions(self.config.group, dest)

    @transaction.atomic
    def _save_aln_to_database(self, aln, alnfiles, progname, progvers):
        # Handle the alignment.
        aln.save()
        for alf in alnfiles.values():
            alf.alignment = aln  # ensure alignment_id has been set.
            alf.save()

        if progvers is None:
            alignerinfo = ProgramSummary(progname,
                                         ssh_host=self.config.cluster,
                                         ssh_user=self.config.clusteruser,
                                         ssh_path=self.config.clusterpath,
                                         ssh_port=self.config.clusterport)
            progname = alignerinfo.program
            progvers = alignerinfo.version

        try:
            program = Program.objects.get(program=progname,
                                          version=progvers,
                                          current=True)
        except Program.DoesNotExist, _err:
            raise StandardError(
                "Unable to find current program in database: %s %s" %
                (progname, progvers))

        DataProvenance.objects.create(program=program,
                                      parameters='',
                                      rank_index=1,
                                      data_process=aln)

        for (fname, fobj) in alnfiles.iteritems():
            fname = self._check_file_zipped(
                fname, fobj)  # defensive coding; this should already be set.
            dest = fobj.repository_file_path
            destdir = os.path.dirname(dest)
            if not os.path.exists(destdir):
                os.makedirs(destdir)
            move(fname, dest)
            set_file_permissions(self.config.group, dest)
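
# A minimal usage sketch for ExternalDataHandler (all argument values are
# hypothetical); the handler creates the lane if necessary, then appends an
# alignment built from the bam file:
def _example_external_import():
    handler = ExternalDataHandler(libcode='do1234', lanenum=1,
                                  flowcell='FC0001', rundate='2014-01-01',
                                  keepfastq=True, genome='mm10')
    handler.add('do1234.bam', fastqs=['do1234.r_1.fq.gz'])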
Example no. 10
    def retrieve_fqfile(self, lfile, libname):
        '''
    Given a LimsFile object and a library name, retrieve the actual
    fastq file from the LIMS and store it in self.destination.
    '''
        if not os.path.exists(self.destination):
            LOGGER.error("Destination '%s' does not exist.", self.destination)
            return

        filename = lfile.uri.split('/')[-1]
        LOGGER.info("LIMS File: %s", filename)

        # Current file naming convention:
        # <SLX ID>.<run number>.s_<lane no>.r_<N>.fq.gz
        # Where N=1 indicates single end sequencing
        #       N=2 indicates paired end
        #       N=3 indicates paired end, multiplexed
        #       N=4 indicates dual-indexed.
        #
        # Old file naming convention (still supported):
        # s_<lane no>(?:_<N>)_sequence.txt.gz
        # Where N is only present for non-single-end sequencing
        # and means the same as above.
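        # For example (hypothetical names):
        #   SLX-1234.98765.s_1.r_1.fq.gz   (new convention, single-end)
        #   s_3_3_sequence.txt.gz          (old convention, PE multiplexed)
        #   s_1.tar                        (10X FASTQ tar file)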

        # This regex supports both old and new naming conventions.
        fnpat = re.compile(  # FIXME see parse_incoming_fastq_name
            r"SLX\-\d+\.[\.\w-]+\.s_\d+\.r_(\d+)\.fq\.gz$" +
            r"|s_\d+(?:_(\d+))?_sequence\.txt\.gz$" +
            r"|s_\d+\.(tar)$")  # 10X_FASTQ_TAR
        matchobj = fnpat.search(filename)

        if matchobj is None:
            LOGGER.error(
                "FASTQ file does not conform to" +
                " known naming convention: %s", filename)
            return

        flowpair = 1
        fastqtar = False
        if matchobj.group(1) is not None:  # New naming
            stype = int(matchobj.group(1))
            if stype > 1:  # it's a paired-end-style name
                flowpair = stype
        elif matchobj.group(2) is not None:  # Old naming
            flowpair = int(matchobj.group(2))
        elif matchobj.group(3) == 'tar':  # 10X FASTQ tar file, probably.
            fastqtar = True
        else:
            LOGGER.error("FASTQ file name regex gave unexpected" +
                         " results; probable error in regex code.")
            return

        # The following was added by Margus to deal properly with PE
        # multiplexed lanes, where the second read is suffixed _3.fq
        # rather than _2.fq.
        if flowpair == 3:
            flowpair = 2
        elif flowpair == 4:
            LOGGER.warning(
                "Dual indexed files not yet supported. However, in current" +
                " usage this is likely to be mislabeled by LIMS as part" +
                " of a wider flowcell annotation.")
            #        return   # If we ever start dual indexing this will need to change
            flowpair = 2

        # Do not use underscores in the sample ID.
        sample_id = libname.lower().replace(" ", "")
        dst = build_incoming_fastq_name(sample_id, lfile.lane.flowcell.fcid,
                                        lfile.lane.lane, flowpair)

        if fastqtar:  # s/.fq$/.tar/
            dst = os.path.splitext(dst)[0] + '.tar'

        # If the final file has already been downloaded we skip the download.
        target = os.path.join(self.destination, dst)
        if os.path.exists(target):
            LOGGER.warning("Destination file '%s' exists. Cannot overwrite.",
                           target)
            return

        compressed = False
        if os.path.splitext(lfile.uri)[1] == self.conf.gzsuffix:
            compressed = True
            dst += self.conf.gzsuffix
        target = os.path.join(self.destination, dst)

        # We also refuse to download over an intermediary gzipped
        # file. This is more likely to be an error so we raise an
        # exception here. If the file is good it should have been
        # uncompressed already.
        if os.path.exists(target):
            raise StandardError(
                "Download location '%s' exists. Cannot overwrite." % target)

        # We download these over http now.
        LOGGER.debug("Downloading LIMS file ID %s to %s", lfile.lims_id,
                     target)
        if not self.test_mode:

            # This is actually the preferred download mechanism.
            try:
                self.lims.get_file_by_uri(lfile.uri, target)

            # Fall back to download via LIMS API, if supported.
            except Exception, err:
                if lfile.lims_id is not None:
                    self.lims.get_file_by_id(lfile.lims_id, target)
                else:
                    raise err

            set_file_permissions(self.conf.group, target)
    check_bam_readcount(bam, maln, readcountdicts)

    malnfile = MergedAlnfile.objects.create(alignment=maln,
                                            filename=bam,
                                            filetype=bamtype,
                                            checksum=chksum)

    if archloc is not None:
        malnfile.archive = ArchiveLocation.objects.get(name=archloc)
        malnfile.archive_date = time.strftime('%Y-%m-%d')
        malnfile.save()

    LOGGER.info("Moving file into repository.")
    destname = malnfile.repository_file_path
    move(bam, destname)
    set_file_permissions(CONFIG.group, destname)


def load_merged_bam(bam,
                    genome=None,
                    bamfilter=False,
                    autoaln=False,
                    aligner=None,
                    alignvers=None,
                    alignparams=None,
                    archloc=None):
    '''
  Insert the specified merged bam file into the repository, linking
  against per-lane Alignments as appropriate.
  '''
    if archloc is None: