def count_reads(fname):
    '''
    Count the number of reads in a bed file.

    This function will handle gzipped bed files seamlessly.

    The fourth column (read name) is inspected for tally-style names of
    the form "<anything>:count_<N>"; when present, the read contributes N
    to the totals instead of 1. Tally detection is switched off for the
    rest of the file as soon as one read name fails to match. A read is
    counted as unique when its fifth column is an integer greater than
    zero.

    Returns a (mapped, unique) tuple of integer counts.

    Raises IndexError if a line has fewer columns than are accessed, and
    ValueError if a count or the fifth column is not an integer.
    '''
    # FIXME consider using external gzip binary where available.
    if is_zipped(fname):
        fdesc = gzip.open(fname, 'rb')
    else:
        fdesc = open(fname, 'rb')

    mapped = 0
    unique = 0
    checktally = True

    # The handle is binary, so all separators are bytes literals. On
    # Python 2 these are plain str; on interpreters where binary reads
    # yield bytes this avoids a str/bytes mismatch.
    try:
        for line in fdesc:
            flds = line.split(b"\t")
            count = 1

            # Account for tally processing of smallRNA-seq data.
            if checktally:
                tally = flds[3].split(b':')
                if len(tally) == 2:
                    tally = tally[1].split(b'_')
                    if len(tally) == 2 and tally[0] == b'count':
                        count = int(tally[1])
                    else:
                        checktally = False  # these are not tally data.
                else:
                    checktally = False  # these are not tally data.

            mapped += count
            if int(flds[4]) > 0:
                unique += count
    finally:
        # Always release the handle, even if a malformed line raises.
        fdesc.close()

    return (mapped, unique)
def _check_file_zipped(self, fname, fobj):
    '''
    Bring the on-disk compression state of fname into line with the
    gzip flag on fobj.filetype, and return the resulting file name.

    The file is rezipped when the filetype wants gzip but the file is
    not zipped, unzipped in the opposite case, and otherwise returned
    unchanged. Both conversions are called with overwrite=True.
    '''
    # Logging currently handled by the utilities module.
    currently_zipped = is_zipped(fname)
    wants_gzip = fobj.filetype.gzip

    if wants_gzip:
        if currently_zipped:
            return fname
        return rezip_file(fname, overwrite=True)

    if currently_zipped:
        return unzip_file(fname, overwrite=True)
    return fname
def create_unsaved_lane(self, fastqs):
    '''
    Build a Lane object from the given fastq files without saving it
    to the database.

    fastqs must hold one (single-end) or two (paired-end) file names;
    anything else raises ValueError. Read length and read count are
    taken from the first file only.

    Returns a (lane, lanefiles) tuple. lanefiles maps fastq file name
    to a newly created Lanefile object; it is only populated when
    self.keepfastq is set, and only with files for which no Lanefile
    was found in the database.
    '''
    complete = Status.objects.get(code='complete')
    sequencer = Machine.objects.get(code__iexact=self.machine)

    # Don't save this to the db just yet.
    lane = Lane(library=self.library,
                facility=self.facility,
                lanenum=self.lanenum,
                machine=sequencer,
                flowcell=self.flowcell,
                rundate=self.rundate,
                summaryurl=self.url,
                runnumber=self.runnumber,
                status=complete,
                flowlane=0)

    nfiles = len(fastqs)
    if nfiles not in (1, 2):
        raise ValueError(
            "Can only process either one or two fastq files per lane.")
    lane.paired = (nfiles == 2)

    # Assumes the first fastq is representative.
    representative = fastqs[0]
    lane.readlength = self.fastq_readlength(representative)
    lane.reads = self.fastq_readcount(representative)
    lane.passedpf = lane.reads  # This is a bit of an assumption FIXME?

    lanefiles = {}
    if not self.keepfastq:
        return (lane, lanefiles)

    fqtype = Filetype.objects.get(code='fq')
    for fastq in fastqs:
        checksum = checksum_file(fastq)

        # Lanefile names are recorded without the gzip suffix.
        fname = fastq
        if is_zipped(fastq):
            fname = re.sub('%s$' % self.config.gzsuffix, '', fastq)

        try:
            Lanefile.objects.get(lane=lane, filename=fname)
        except Lanefile.DoesNotExist:
            new_file = Lanefile(lane=lane,
                                filename=fname,
                                checksum=checksum,
                                filetype=fqtype)
            fastq = self._check_file_zipped(fastq, new_file)
            lanefiles[fastq] = new_file

    # Returns only the new lanefiles.
    return (lane, lanefiles)
def open_fastq(self, fastq):
    '''
    Open a fastq file for reading and return the file handle.

    Files reported as zipped by is_zipped() are opened with
    gzip.GzipFile; everything else goes through the builtin open().
    Both are called with their default mode.
    '''
    if not is_zipped(fastq):
        return open(fastq)

    from gzip import GzipFile
    return GzipFile(fastq)
def add(self, files, final_status=None):
    '''
    Process a list of filenames (files must exist on disk) and store
    them against the alignment derived from the bed file among them.

    The optional final_status argument specifies a models.Status
    object to which the lane should be linked upon completion; it is
    passed straight through to _save_to_repository().

    Returns the alignment object. Raises ValueError when no bed file
    can be identified or when a file's type is not recognised.
    '''
    # We need at least one bed file.
    bed = self.identify_bed_file(files)
    if not bed:
        raise ValueError("Unable to identify any bed files in the input.")

    # Find the appropriate alignment. Note that aln is not yet saved
    # in the database.
    aln, _lane = self.aln_from_bedfile(bed)

    # Do some heavy lifting *outside* of our database transaction, to
    # avoid locking the db for extended periods.
    chksums = {}
    processed = []
    for original in files:
        # Checksum first: if the file is uncompressed, don't waste time
        # zipping it beforehand (checksum_file also works on zipped
        # files). Checksums stay keyed by the name as supplied.
        chksums[original] = checksum_file(original)

        ftype = Filetype.objects.guess_type(original)
        if ftype is None:
            raise ValueError("File type not recognised from database: %s"
                             % original)

        needs_zip = ftype.gzip and not is_zipped(original)
        processed.append(rezip_file(original) if needs_zip else original)

    # All database changes should be handled by the
    # transaction-embedded method below.
    self._save_to_repository(processed, chksums, aln, final_status)

    return aln
def submit_gsnap(self, files, genome, indexdir, number, queue, wait=False):
    '''
    Submit gsnap alignment jobs, samtools merge and cleanup jobs to
    the cluster.

    files is a sequence of one (unpaired) or two (paired) fastq file
    names; the library code is parsed from the basename of the first.
    number is the size of the job array; when None it is derived from
    the read count of the first file (20 jobs per million reads).

    Returns None when wait is True (after waiting on the merge job),
    otherwise the job id of the submitted fastq cleanup job. Raises
    ValueError if the library code cannot be parsed from the file name.
    '''
    assert len(files) in (1, 2)
    paired = len(files) == 2
    keeptype = 'concordant' if paired else 'unpaired'
    first_fastq = files[0]

    # Only test the zippedness of our inputs once.
    use_gunzip = bool(is_zipped(first_fastq))

    if number is None:
        LOGGER.info("Counting reads in fastq input files...")
        million_reads = self.count_fastq_reads(first_fastq, use_gunzip) / 1e6
        number = int(million_reads * 20)
        LOGGER.info("Will start %d jobs on cluster.", number)

    libmatch = re.match(r'^(do\d+)_', os.path.basename(first_fastq))
    if not libmatch:
        raise ValueError("Unable to parse library code from file name %s"
                         % first_fastq)
    libcode = libmatch.group(1)

    # Copy files to cluster workdir.
    clfiles = ['%s_%s' % (self.namespace, os.path.basename(path))
               for path in files]
    for local_path, cluster_path in zip(files, clfiles):
        if self.cluster_file_exists(cluster_path):
            continue
        LOGGER.info("Copying fastq file to cluster working directory...")
        self.submitter.remote_copy_files([local_path], [cluster_path])

    # Launch the gsnap job array.
    prefix = (r'%s/%s_%s.\\\$((\\\$LSB_JOBINDEX - 1))'
              % (self.outdir, self.namespace, libcode))

    # The command is assembled as a list of fragments and joined once.
    # Create the working directory, if it's not already there.
    pieces = [r'mkdir -p %s' % self.outdir]

    # Build the gsnap command. Note that the '-B' option can be
    # switched to 5 if there's too much disk IO; this has the
    # downside of increasing memory requirements though.
    pieces.append(r' && gsnap --part=\\\$((\\\$LSB_JOBINDEX - 1))/%d -d %s -D %s'
                  % (number, genome, indexdir))
    pieces.append(r' -m 2 --trim-mismatch-score=0 -N 1 -n 1')
    pieces.append(r' -E 100 --pairexpect=160 -B 4')
    pieces.append(r' --antistranded-penalty=1 --quality-protocol=sanger')
    pieces.append(r' --nofails -A sam')
    pieces.append(r' --split-output=%s --nthreads=8' % prefix)
    if use_gunzip:
        pieces.append(' --gunzip')
    pieces.append(" " + " ".join(clfiles))

    # The samtools view (sam->bam) step is actually where we assume
    # paired-end sequencing.
    for tag in ('uniq', 'mult'):
        samfile = r'%s.%s_%s' % (prefix, keeptype, tag)
        # Sort here rather than later as it's more efficient.
        pieces.append(
            r' && samtools view -b -S -o - %s | samtools sort -o %s.bam -'
            % (samfile, samfile))

    # Clean up the sam files.
    pieces.append(r' && rm %s.nomapping' % prefix)
    pieces.append(r' && rm %s.' % prefix)
    pieces.append(r'{unpaired,paired,halfmapping,concordant}_')
    pieces.append(
        r'{transloc,mult,circular,uniq,uniq_scr,uniq_long,uniq_inv,uniq_circular}')

    # Skip the whole pipeline for array members whose output bam exists.
    cmd = (r'if [ ! -f %s.%s_mult.bam ]; then %s; else echo Skipping %s; fi'
           % (prefix, keeptype, ''.join(pieces), prefix))

    LOGGER.info("Submitting job array of %d jobs.", number)
    jobname = 'gsnap[1-%d]%%%d' % (number, self.throttle)
    LOGGER.debug("Job name: %s", jobname)
    LOGGER.debug("Command: %s", cmd)

    array_job = self.submitter.submit_command(cmd,
                                              mem=self.memsize * 1024,
                                              queue=queue,
                                              auto_requeue=False,
                                              jobname=jobname)
    bjobs = [array_job]

    mergejob = self._merge_bams(bjobs, libcode, queue,
                                njobs=number, keeptype=keeptype)

    fastq_cleanup = r'rm %s' % " ".join(clfiles)

    if wait is True:
        self.wait_on_cluster([mergejob], fastq_cleanup)
        return

    LOGGER.info("Submitting fastq cleanup job...")
    return self.submitter.submit_command(fastq_cleanup,
                                         queue=queue,
                                         depend_jobs=[mergejob])
def run(self, flowcell, flowlane=None, fcq=None, destdir=None): '''The main entry point for the class.''' multiplexed = {} if destdir is None: destdir = self.conf.incoming # get list of new lanes from flow cell if fcq is None: fcq = FlowCellQuery(flowcell, flowlane, lims=self.lims, trust_lims_adapters=self.trust_lims_adapters) flowlanes = set() if fcq.lims_fc.analysis_status not in self.ready: LOGGER.info("flow cell status '%s'", fcq.lims_fc.analysis_status) sys.exit("Flow cell analysis status not yet completed.") for (lanenum, libset) in fcq.lane_library.items(): if lanenum not in multiplexed: multiplexed[lanenum] = set() for lib in libset: if fcq.lib_status[lib] in ('new') or not self.db_library_check: # Only register lane for demultiplexing if this if lib not # in lane.lims_samples() if not fcq.lane_demuxed[lanenum]: multiplexed[lanenum].add(lib) flowlanes.add((fcq.lims_fc.fcid, lanenum)) if len(flowlanes) == 0: LOGGER.info("No ready lanes for flowcell '%s'", flowcell) sys.exit("No lanes to process.") # We need to set our working directory to something suitable # before we start; otherwise we end up demuxing into a home # directory or similar. pwd = os.getcwd() os.chdir(destdir) downloading = Status.objects.get(code='downloading data') downloaded = Status.objects.get(code='downloaded') # for each lane... path = destdir for (flowcell, flowlane) in flowlanes: # Mark our lane(s) as active (note that each library has its own # version of this lane). for lane in Lane.objects.filter(flowcell=flowcell, flowlane=flowlane): lane.status = downloading lane.save() # retrieve file fetcher = FQFileFetcher(destination=path, lims=self.lims, test_mode=self.test_mode, unprocessed_only=True, force_download=self.force_download) fetcher.fetch(flowcell, flowlane) if self.test_mode: print("Test Mode: skipping download of %s lane %s to %s" % (flowcell, flowlane, path)) continue failed_fnames = {} for fname in fetcher.targets: if len(fname) > 0: # Check file was retrieved. 
if not os.path.exists(fname): LOGGER.error("Can't seem to find expected file '%s'", fname) failed_fnames[fname] = fname else: muxed_libs = multiplexed[flowlane] if len(muxed_libs) > 1: # Demultiplex file if required. Here we unfortunately # have to unzip the data, and we will rezip it # following the process regardless of its input state. if is_zipped(fname): fname = unzip_file(fname) LOGGER.info( "Demultiplexing file %s for libraries: %s", fname, ", ".join(muxed_libs)) self.demultiplex(muxed_libs, fname) for lib in muxed_libs: self.output_files += [ rezip_file(dmf) for dmf in self._demux_files[lib] ] else: LOGGER.info( "File does not require demultiplexing: %s", fname) self.output_files.append(fname) for fname in self.output_files: if fname not in failed_fnames: # The next line will parse regular Fastq filenames or the 10X tarball filenames. (code, flowcell, flowlane, flowpair) = parse_incoming_fastq_name(os.path.basename(fname), ext=r'.(fq.gz|tar)') LOGGER.info( "Changing code=%s, flowcell=%s, flowlane=%s, flowpair=%s to 'downloaded'", code, flowcell, flowlane, flowpair) try: lane = Lane.objects.get(flowcell=flowcell, flowlane=flowlane, library__code=code) lane.status = downloaded lane.save() except Lane.DoesNotExist, _err: try: lib = Library.objects.search_by_name(code) except Library.DoesNotExist, _err: LOGGER.error( "No library %s. Unable to register lane for the library.", code) continue LOGGER.info("Registering lane for %s.", fname) facobj = Facility.objects.get(code='CRI') machine_obj = Machine.objects.get( code__iexact=str('Unknown')) lane = Lane(facility=facobj, library=lib, flowcell=flowcell, flowlane=flowlane, lanenum=Lane.objects.next_lane_number(lib), status=downloaded, rundate='2008-01-01', paired=False, genomicssampleid='', usersampleid=code, runnumber='', seqsamplepf='', seqsamplebad='', failed=False, machine=machine_obj) lane.save()