def demultiplex(self, codes, fname):
  '''Actually run the demultiplexing, using demuxIllumina.'''
  # Look up adapters from database, write sampleSheet file.
  LOGGER.debug("Making sample sheet.")
  sheet = self.make_sample_sheet(codes, fname)
  LOGGER.info("Sample sheet created.")

  # Invoke demultiplexer.
  cmd = [self.demux_prog, '-d', sheet, fname] # demuxIllumina v2.0 and above
  LOGGER.debug("Command for demultiplexing: %s", " ".join(cmd))
  pout = call_subprocess(cmd, path=self.conf.hostpath)

  fnpat = re.compile(r"tag\s+(\w+):\s+([^\s]+)\s*$")
  lostpat = re.compile(r"lost\s+([\/\d]+)\s+reads")
  fnset = set()
  for line in pout:
    matchobj = fnpat.match(line)
    if matchobj:
      fnset.add(matchobj.group(2))
    else:
      matchobj2 = lostpat.match(line)
      if matchobj2:
        LOGGER.info("lost %s reads", matchobj2.group(1))

  for fname in fnset:
    set_file_permissions(self.conf.group, fname)

  # Delete the sample sheet.
  os.unlink(sheet)
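# A quick sanity sketch of how the two regexes above are expected to
# behave. The sample demuxIllumina output lines are illustrative
# assumptions, not captured from the real program:
#
#   fnpat   = re.compile(r"tag\s+(\w+):\s+([^\s]+)\s*$")
#   lostpat = re.compile(r"lost\s+([\/\d]+)\s+reads")
#   assert fnpat.match("tag ACGTAC: sample1.fq").group(2) == "sample1.fq"
#   assert lostpat.match("lost 123/4567 reads").group(1) == "123/4567"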
def get_genome_size_file(self, genome):
  '''
  Retrieve the filename containing chromosome lengths for a given
  genome. Returns two values: the filename, and a flag indicating
  whether that file should be treated as temporary, i.e. deleted once
  the caller is done with it. Such deletion is the responsibility of
  the calling code.
  '''
  fnchrlen = os.path.join(self.conf.genomesizedir, genome + ".fa.length")
  if not os.path.exists(fnchrlen):
    tmpfile = NamedTemporaryFile(delete=False, dir=self.conf.tmpdir)
    cmd = "%s %s > %s" % ('fetchChromSizes', genome, tmpfile.name)
    # Note - assumes we're running on our primary host. FIXME?
    call_subprocess(cmd, shell=True, path=self.conf.hostpath)
    tmpfile.close()
    try:
      LOGGER.info("Storing new chromosome sizes file as %s", fnchrlen)
      move(tmpfile.name, fnchrlen)
      set_file_permissions(self.conf.group, fnchrlen)
    except Exception, err:
      LOGGER.warning("Attempt to store chromosome sizes file"
                     + " as %s failed: %s", fnchrlen, err)
      # Fall back on handing the caller the temporary file.
      return (tmpfile.name, True)
  return (fnchrlen, False)
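# Sketch of the intended calling contract (hypothetical caller; ``proc``
# stands in for whatever object carries this method):
#
#   (sizefile, is_tmp) = proc.get_genome_size_file('hg19')
#   try:
#       pass # use sizefile here
#   finally:
#       if is_tmp:
#           os.unlink(sizefile) # deletion is the caller's responsibility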
def make_bed_graph(self, aln):
  '''
  Code wrapper for makeWiggle.
  '''
  aln = Alignment.objects.get(id=aln.id) # Reload passed object within transaction.
  bed = aln.alnfile_set.filter(filetype=self.bedtype).exclude(
    filename__contains='chr21')[0]

  # Note makeWiggle can read gzipped bed files directly; we use that fact here.
  lib = aln.lane.library
  bedFN = bed.repository_file_path

  # Write to local directory first.
  bgrBASE = os.path.splitext(bed.filename)[0]
  bgrFN = bgrBASE + self.bgrtype.suffix
  cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
  LOGGER.debug(cmd)
  if not self.testMode:
    call_subprocess(cmd, shell=True, path=self.conf.hostpath)
    if not os.path.exists(bgrFN):
      LOGGER.error("Failed to create bgr file '%s'", bgrFN)
    else:
      chksum = checksum_file(bgrFN)
      bgr = Alnfile(filename=os.path.basename(bgrFN), checksum=chksum,
                    filetype=self.bgrtype, alignment=aln)
      bgrFN = rezip_file(bgrFN)
      move(bgrFN, bgr.repository_file_path)
      set_file_permissions(self.conf.group, bgr.repository_file_path)
      bgr.save()
def _save_file_to_database(fname, aln, chksum):
  '''
  Transaction-managed smallest unit of work that we can do with the
  database to save a file to a given Alignment.
  '''
  aln = Alignment.objects.get(id=aln.id) # Reload passed object within transaction.
  filetype = Filetype.objects.guess_type(fname)
  LOGGER.debug("Found filetype: %s", filetype)
  basefn = os.path.split(fname)[1]
  LOGGER.debug("basefn: '%s'", basefn)
  fnparts = os.path.splitext(basefn)
  if fnparts[1] == CONFIG.gzsuffix:
    basefn = fnparts[0]
  LOGGER.debug("basefn: '%s'", basefn)
  afile = Alnfile.objects.create(filename=basefn, checksum=chksum,
                                 filetype=filetype, alignment=aln)

  # Move files to permanent locations.
  destname = afile.repository_file_path
  LOGGER.debug("Moving %s to %s", fname, destname)
  move(fname, destname)
  set_file_permissions(CONFIG.group, destname)
  LOGGER.info("Added '%s' to '%s'", fname, aln.lane.library.code)
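# Minimal usage sketch, assuming the caller provides the transaction as
# the docstring implies (the filename is hypothetical):
#
#   from django.db import transaction
#   with transaction.atomic():
#       _save_file_to_database('/tmp/lib123.bam', aln,
#                              checksum_file('/tmp/lib123.bam'))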
def replace_repo_file(bam, newbam):
  bam = Alnfile.objects.get(id=bam.id) # Reload passed object within transaction.
  set_file_permissions(CONFIG.group, newbam)
  checksum = checksum_file(newbam, unzip=False)
  bam.checksum = checksum
  os.unlink(bam.repository_file_path)
  move(newbam, bam.repository_file_path)
  bam.save()
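# Usage sketch (hypothetical path), e.g. after re-sorting a repository
# BAM outside the repository tree; assumes the caller wraps this in a
# transaction, as with the other helpers here:
#
#   from django.db import transaction
#   with transaction.atomic():
#       replace_repo_file(bamobj, '/tmp/resorted.bam')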
def run_qc(fnames, workdir, destination=None, cleanup=True, register=False):
  with LaneFastQCReport(fastqs=fnames, workdir=workdir, lane=0) as qc:
    # Generate QC reports.
    qc.run_fastqc(qc.fastqs)
    qc.postprocess_results(qc.fastqs)

    # Create the list of disk files, compressing some of them first
    # where needed.
    # NB! This is not elegant; a better test would be
    # "if ftype.gzip and os.path.splitext(fname)[1] != CONFIG.gzsuffix:".
    # However, this code is set up not to interact directly with the
    # database.
    dfiles = []
    for fn in qc.output_files:
      if fn.endswith('txt') or fn.endswith('tar'):
        dfn = rezip_file(fn)
        dfiles.append(dfn)
      else:
        dfiles.append(fn)

    if destination is not None:
      # Transfer files to destination.
      for dfn in dfiles:
        set_file_permissions(CONFIG.group, dfn)
        transfer_file(dfn, destination)

    if register:
      # Register QC files in the repository.
      argslist = []
      for (fn, md5) in zip(qc.output_files, qc.output_md5s):
        argslist.append(os.path.basename(fn))
        argslist.append(md5)
      cmd = "cs_addFile.py --qcfile -M --program_name %s " % qc.program_name
      cmd += " ".join(argslist)
      print "Executing \"%s\" ..." % cmd
      subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
      (stdout, stderr) = subproc.communicate()
      retcode = subproc.wait()
      if stdout:
        sys.stdout.write(stdout)
      if stderr:
        sys.stderr.write(stderr)

    if cleanup:
      # Remove local files. Assuming the fastqc report dir is still
      # around, construct its name.
      # NB! A cleaner way would be to save the dir name to self.bpath in
      # postprocess_results in the LaneQCReport class and use that value.
      # Better still, LaneFastQCReport could keep track of all the
      # temporary files it creates.
      for dfn in dfiles:
        os.remove(dfn)
        if dfn.endswith('pdf'):
          fqc_dirname = os.path.splitext(dfn)[0]
          rmtree(fqc_dirname)
          zipfile = fqc_dirname + '.zip'
          os.remove(zipfile)
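# Example invocation (a sketch; the file and directory names are
# hypothetical, and the destination format is whatever transfer_file
# accepts in this codebase):
#
#   run_qc(['sample_R1.fq.gz', 'sample_R2.fq.gz'],
#          workdir='/tmp/qcwork',
#          destination='qchost:/data/qc',
#          register=True)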
# tired of maintaining it).
wigs, bedgraphs, bigwigs = ([], [], []) # defaults for non-quantitative libraries.
if not lib.libtype.code in self.conf.nonquant_libtypes:
  wigs = self.generate_wig_files(beds)

  # Bedgraph files are still useful for assessing genome- or
  # exome-wide coverage.
  bedgraphs = self.generate_bedgraph_files(beds)
  bigwigs = self.generate_bigwig_files(bedgraphs, chrom_sizes)

# We're now done with the chrom_sizes file.
if chr_istmp:
  os.unlink(chrom_sizes)

# Set group ownership and permissions appropriately.
grp = self.conf.group
set_file_permissions(grp, in_fn)
for bed in beds:
  set_file_permissions(grp, bed)
for wig in wigs:
  set_file_permissions(grp, wig)
for bgr in bedgraphs:
  set_file_permissions(grp, bgr)
for bwig in bigwigs:
  set_file_permissions(grp, bwig)

# Compress bed file(s).
bedgz = []
for bed in beds:
  gzname = rezip_file(bed)
  bedgz.append(gzname)
class ExternalDataHandler(object):
  '''
  Class used to add data files to a single lane in the repository. The
  lane in question will be created if it does not already exist.
  '''
  __slots__ = ('config', 'library', 'lanenum', 'keepfastq', 'genome',
               'facility', 'machine', 'flowcell', 'rundate', 'url',
               'runnumber')

  def __init__(self, libcode, lanenum, flowcell, rundate,
               runnumber=None, url=None, keepfastq=False, genome=None,
               facility='EXT', machine='Unknown'):
    self.config = Config()
    self.library = Library.objects.get(code=libcode)
    self.lanenum = lanenum
    self.keepfastq = keepfastq
    self.facility = Facility.objects.get(code=facility)
    self.machine = machine
    self.flowcell = flowcell
    self.rundate = rundate
    self.url = url
    self.runnumber = runnumber

    if genome is None:
      self.genome = self.library.genome
    else:
      self.genome = Genome.objects.get(code=genome)

  def open_fastq(self, fastq):
    '''
    Return a file handle for a fastq file, transparently handling
    gzipped files.
    '''
    if is_zipped(fastq):
      from gzip import GzipFile
      gen = lambda x: GzipFile(x)
    else:
      gen = lambda x: open(x)
    return gen(fastq)

  def fastq_readlength(self, fastq):
    '''
    Guess the length of the reads in the fastq file. Assumes that the
    first read in the file is representative.
    '''
    # Currently just assumes that the second line is the first read,
    # and that it is representative.
    LOGGER.info("Finding read length from fastq file %s...", fastq)
    rlen = None
    with self.open_fastq(fastq) as reader:
      for _num in range(2):
        line = reader.next()
      rlen = len(line.rstrip('\n'))
    return rlen

  def fastq_readcount(self, fastq):
    '''
    Count the number of reads in the fastq file.
    '''
    LOGGER.info("Counting reads in fastq file %s...", fastq)
    lcount = 0
    with self.open_fastq(fastq) as reader:
      for line in reader:
        lcount += 1
    return lcount / 4

  def create_unsaved_lane(self, fastqs):
    '''
    Gather statistics from the fastq files and generate a Lane object
    not yet saved to the database.
    '''
    status = Status.objects.get(code='complete')
    machine = Machine.objects.get(code__iexact=self.machine)

    # Don't save this to the db just yet.
    lane = Lane(library=self.library,
                facility=self.facility,
                lanenum=self.lanenum,
                machine=machine,
                flowcell=self.flowcell,
                rundate=self.rundate,
                summaryurl=self.url,
                runnumber=self.runnumber,
                status=status,
                flowlane=0)

    if len(fastqs) == 1:
      lane.paired = False
    elif len(fastqs) == 2:
      lane.paired = True
    else:
      raise ValueError("Can only process either one or two fastq files per lane.")

    # Assumes the first fastq is representative.
    lane.readlength = self.fastq_readlength(fastqs[0])
    lane.reads = self.fastq_readcount(fastqs[0])
    lane.passedpf = lane.reads # This is a bit of an assumption. FIXME?

    lanefiles = dict()
    if self.keepfastq:
      fqtype = Filetype.objects.get(code='fq')
      for fastq in fastqs:
        checksum = checksum_file(fastq)

        if is_zipped(fastq):
          fname = re.sub('%s$' % self.config.gzsuffix, '', fastq)
        else:
          fname = fastq

        try:
          lfile = Lanefile.objects.get(lane=lane, filename=fname)
        except Lanefile.DoesNotExist:
          lfile = Lanefile(lane=lane, filename=fname,
                           checksum=checksum, filetype=fqtype)
          fastq = self._check_file_zipped(fastq, lfile)
          lanefiles[fastq] = lfile

    # Returns only the new lanefiles.
    return (lane, lanefiles)

  def create_unsaved_alignment(self, bam, lane):
    '''
    Gather statistics from the bam file, and generate a bed file and a
    new Alignment object unsaved to the database.
    '''
    bamtype = Filetype.objects.get(code='bam')
    bedtype = Filetype.objects.get(code='bed')

    # Generate a bed file from the bam.
    chrom_sizes = os.path.join(self.config.genomesizedir,
                               self.genome.code + ".fa.length")
    if not os.path.exists(chrom_sizes):
      LOGGER.warning("Unable to find chromosome sizes file %s."
                     + " BED file reads will be untrimmed.", chrom_sizes)
      chrom_sizes = None

    bam2bed = BamToBedConverter(chrom_sizes=chrom_sizes)
    bambase = os.path.splitext(bam)[0]
    bed_fn = bambase + bedtype.suffix
    beds = bam2bed.convert(bam, bed_fn)
    if len(beds) == 1:
      bed = beds[0]
    else:
      raise ValueError("Unexpected results from BAM to BED conversion")

    LOGGER.info("Counting reads in generated bed file %s...", bed)
    (mapped, unique) = count_reads(bed)

    # We don't save this yet because we're not currently within a
    # transaction.
    aln = Alignment(lane=lane,
                    genome=self.genome,
                    total_reads=lane.total_passedpf,
                    mapped=mapped,
                    munique=unique)

    # Create bam Alnfile.
    LOGGER.info("Checksumming bam file %s...", bam)
    checksum = checksum_file(bam)
    bamobj = Alnfile(alignment=aln, filetype=bamtype,
                     filename=bam, # casually assume no gzipping FIXME
                     checksum=checksum)

    # Create bed Alnfile.
    LOGGER.info("Checksumming bed file %s...", bed)
    checksum = checksum_file(bed)
    bedobj = Alnfile(alignment=aln, filetype=bedtype,
                     filename=bed, # only just created, so not zipped
                     checksum=checksum)

    bamkey = self._check_file_zipped(bam, bamobj)
    bedkey = self._check_file_zipped(bed, bedobj)

    alnfiles = {bamkey: bamobj, bedkey: bedobj}

    return (aln, alnfiles)

  def add(self, bam, fastqs=None, progname='bwa', progvers=None):
    '''
    Main entry point for the class.
    '''
    try:
      # A pre-existing lane is left almost unmolested.
      lane = Lane.objects.get(library=self.library,
                              facility=self.facility,
                              lanenum=self.lanenum)
    except Lane.DoesNotExist:
      # Creation of a new lane parses statistics from the fastq
      # file(s), runs fastqc and, if desired, will store the fastq file
      # itself in the repository.
      if fastqs is None:
        raise ValueError("Cannot create a new lane in the repository without"
                         + " the fastq files from which to harvest metadata.")
      (lane, lanefiles) = self.create_unsaved_lane(fastqs)
      self._save_lane_to_database(lane, lanefiles)
      if self.keepfastq:
        fastqs = None # Use the fastqs now stored in the repository.

      # Note: this code doesn't understand updating pre-existing reports.
      with LaneFastQCReport(target=lane, fastqs=fastqs,
                            path=os.environ['PATH']) as fastqc:
        fastqc.insert_into_repository() # database transaction

    # Alignments are always appended to the lane; multiple alignments
    # may be added in this way.
    (aln, alnfiles) = self.create_unsaved_alignment(bam, lane)
    lane.mapped = aln.mapped
    lane.save()
    self._save_aln_to_database(aln, alnfiles, progname, progvers)

  def _check_file_zipped(self, fname, fobj):
    # Logging currently handled by the utilities module.
    zipped = is_zipped(fname)
    if fobj.filetype.gzip and not zipped:
      fname = rezip_file(fname, overwrite=True)
    elif not fobj.filetype.gzip and zipped:
      fname = unzip_file(fname, overwrite=True)
    return fname

  @transaction.atomic
  def _save_lane_to_database(self, lane, lanefiles):
    # Finally, save everything to database. We save before trying to
    # move the files because this step will raise an exception if
    # we're trying to overwrite a pre-existing file.
    lane.save()
    for lfile in lanefiles.values():
      lfile.lane = lane # ensure lane_id has been set.
      lfile.save()

    # Move fastq files into position, if that's a thing we're doing.
    if self.keepfastq:
      for (fastq, lfile) in lanefiles.iteritems():
        # Handle whether or not we're zipping fastq files.
        fastq = self._check_file_zipped(fastq, lfile) # defensive coding; this should already be set.
        dest = lfile.repository_file_path
        destdir = os.path.dirname(dest)
        if not os.path.exists(destdir):
          os.makedirs(destdir)
        move(fastq, dest)
        set_file_permissions(self.config.group, dest)

  @transaction.atomic
  def _save_aln_to_database(self, aln, alnfiles, progname, progvers):
    # Handle the alignment.
    aln.save()
    for alf in alnfiles.values():
      alf.alignment = aln # ensure alignment_id has been set.
      alf.save()

    if progvers is None:
      alignerinfo = ProgramSummary(progname,
                                   ssh_host=self.config.cluster,
                                   ssh_user=self.config.clusteruser,
                                   ssh_path=self.config.clusterpath,
                                   ssh_port=self.config.clusterport)
      progname = alignerinfo.program
      progvers = alignerinfo.version

    try:
      program = Program.objects.get(program=progname,
                                    version=progvers,
                                    current=True)
    except Program.DoesNotExist, _err:
      raise StandardError("Unable to find current program in database: %s %s"
                          % (progname, progvers))

    DataProvenance.objects.create(program=program,
                                  parameters='',
                                  rank_index=1,
                                  data_process=aln)

    for (fname, fobj) in alnfiles.iteritems():
      fname = self._check_file_zipped(fname, fobj) # defensive coding; this should already be set.
      dest = fobj.repository_file_path
      destdir = os.path.dirname(dest)
      if not os.path.exists(destdir):
        os.makedirs(destdir)
      move(fname, dest)
      set_file_permissions(self.config.group, dest)
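# End-to-end usage sketch for this class (all argument values are
# hypothetical):
#
#   handler = ExternalDataHandler('do1234', lanenum=1,
#                                 flowcell='C1HKGACXX',
#                                 rundate='2014-01-01',
#                                 keepfastq=True, genome='hg19')
#   handler.add('do1234.bam',
#               fastqs=['do1234.r_1.fq.gz', 'do1234.r_2.fq.gz'],
#               progname='bwa', progvers='0.7.5a')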
def retrieve_fqfile(self, lfile, libname):
  '''
  Given a LimsFile object and a library name, retrieve the actual
  fastq files from the LIMS and store them in self.destination.
  '''
  if not os.path.exists(self.destination):
    LOGGER.error("Destination '%s' does not exist.", self.destination)
    return

  filename = lfile.uri.split('/')[-1]
  LOGGER.info("LIMS File: %s", filename)

  # Current file naming convention:
  # <SLX ID>.<run number>.s_<lane no>.r_<N>.fq.gz
  # Where N=1 indicates single end sequencing
  #       N=2 indicates paired end
  #       N=3 indicates paired end, multiplexed
  #       N=4 indicates dual-indexed.
  #
  # Old file naming convention (still supported):
  # s_<lane no>(?:_<N>)_sequence.txt.gz
  # Where N is only present for non-single-end sequencing and means
  # the same as above.

  # This regex supports both old and new naming conventions.
  fnpat = re.compile( # FIXME see parse_incoming_fastq_name
    r"SLX\-\d+\.[\.\w-]+\.s_\d+\.r_(\d+).fq.gz$"
    + r"|s_\d+(?:_(\d+))?_sequence.txt.gz$"
    + r"|s_\d+\.(tar)$") # 10X_FASTQ_TAR

  matchobj = fnpat.search(filename)

  if matchobj is None:
    LOGGER.error("FASTQ file does not conform to"
                 + " known naming convention: %s", filename)
    return

  flowpair = 1
  fastqtar = False
  if matchobj.group(1) is not None: # New naming
    stype = int(matchobj.group(1))
    if stype > 1: # it's a paired-end-style name
      flowpair = stype
  elif matchobj.group(2) is not None: # Old naming
    flowpair = int(matchobj.group(2))
  elif matchobj.group(3) == 'tar': # 10X FASTQ tar file, probably.
    fastqtar = True
  else:
    LOGGER.error("FASTQ file name regex gave unexpected"
                 + " results; probable error in regex code.")
    return

  # The following if statement was added by Margus to deal properly
  # with PE multiplexed lanes where the second read is suffixed _3.fq
  # rather than _2.fq.
  if flowpair == 3:
    flowpair = 2
  elif flowpair == 4:
    LOGGER.warning("Dual indexed files not yet supported. However, in current"
                   + " usage this is likely to be mislabeled by LIMS as part"
                   + " of a wider flowcell annotation.")
    # return # If we ever start dual indexing this will need to change.
    flowpair = 2

  sample_id = libname.lower().replace(" ", "") # do not use underscores here.
  dst = build_incoming_fastq_name(sample_id, lfile.lane.flowcell.fcid,
                                  lfile.lane.lane, flowpair)

  if fastqtar: # s/.fq$/.tar/
    dst = os.path.splitext(dst)[0] + '.tar'

  # If the final file has already been downloaded we skip the download.
  target = os.path.join(self.destination, dst)
  if os.path.exists(target):
    LOGGER.warning("Destination file '%s' exists. Cannot overwrite.", target)
    return

  compressed = False
  if os.path.splitext(lfile.uri)[1] == self.conf.gzsuffix:
    compressed = True
    dst += self.conf.gzsuffix
  target = os.path.join(self.destination, dst)

  # We also refuse to download over an intermediary gzipped file. This
  # is more likely to be an error so we raise an exception here. If the
  # file is good it should have been uncompressed already.
  if os.path.exists(target):
    raise StandardError("Download location '%s' exists. Cannot overwrite."
                        % target)

  # We download these over http now.
  LOGGER.debug("Downloading LIMS file ID %s to %s", lfile.lims_id, target)
  if not self.test_mode:
    # This is actually the preferred download mechanism.
    try:
      self.lims.get_file_by_uri(lfile.uri, target)

    # Fall back to download via LIMS API, if supported.
    except Exception, err:
      if lfile.lims_id is not None:
        self.lims.get_file_by_id(lfile.lims_id, target)
      else:
        raise err

    set_file_permissions(self.conf.group, target)
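# How the supported filename patterns map onto flowpair under the regex
# above (filenames are illustrative):
#
#   SLX-1234.250.s_3.r_1.fq.gz -> group(1) == '1'   -> flowpair 1
#   SLX-1234.250.s_3.r_3.fq.gz -> group(1) == '3'   -> flowpair 2 (PE mux)
#   s_3_2_sequence.txt.gz      -> group(2) == '2'   -> flowpair 2
#   s_3.tar                    -> group(3) == 'tar' -> 10X fastq tarball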
  check_bam_readcount(bam, maln, readcountdicts)

  malnfile = MergedAlnfile.objects.create(alignment=maln,
                                          filename=bam,
                                          filetype=bamtype,
                                          checksum=chksum)

  if archloc is not None:
    malnfile.archive = ArchiveLocation.objects.get(name=archloc)
    malnfile.archive_date = time.strftime('%Y-%m-%d')
    malnfile.save()

  LOGGER.info("Moving file into repository.")
  destname = malnfile.repository_file_path
  move(bam, destname)
  set_file_permissions(CONFIG.group, destname)

def load_merged_bam(bam, genome=None, bamfilter=False, autoaln=False,
                    aligner=None, alignvers=None, alignparams=None,
                    archloc=None):
  '''
  Insert the specified merged bam file into the repository, linking
  against per-lane Alignments as appropriate.
  '''
  if archloc is None: