Example #1
# Requires the standard-library gzip module; is_zipped() is assumed to be
# provided by the project's utilities module.
import gzip

def count_reads(fname):
  '''
  Count the number of reads in a bed file. This function handles gzipped
  bed files transparently.
  '''
  # FIXME consider using an external gzip binary where available.
  if is_zipped(fname):
    fdesc = gzip.open(fname, 'rt')  # text mode, so lines are str on Python 3
  else:
    fdesc = open(fname, 'r')
  mapped = 0
  unique = 0
  checktally = True
  for line in fdesc:
    flds = line.split("\t")
    count = 1

    # Account for tally processing of smallRNA-seq data.
    if checktally:
      tally = flds[3].split(':')
      if len(tally) == 2:
        tally = tally[1].split('_')
        if len(tally) == 2 and tally[0] == 'count':
          count = int(tally[1])
        else:
          checktally = False # these are not tally data.
      else:
        checktally = False # these are not tally data.

    mapped += count
    if int(flds[4]) > 0:
      unique += count
  fdesc.close()
  return (mapped, unique)
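
# The snippets in this section lean on an is_zipped() helper from the
# project's utilities module. A minimal sketch of what it might look like,
# assuming detection via the two-byte gzip magic number (an illustration,
# not the project's actual implementation):
def is_zipped(fname):
    '''Best-effort check for gzip compression using the gzip magic bytes.'''
    with open(fname, 'rb') as fdesc:
        return fdesc.read(2) == b'\x1f\x8b'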
    def _check_file_zipped(self, fname, fobj):
        # Logging is currently handled by the utilities module.
        zipped = is_zipped(fname)
        if fobj.filetype.gzip and not zipped:
            fname = rezip_file(fname, overwrite=True)
        elif not fobj.filetype.gzip and zipped:
            fname = unzip_file(fname, overwrite=True)
        return fname
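
# _check_file_zipped() above relies on unzip_file() and rezip_file() helpers
# that convert a file on disk and return the new filename. A hypothetical
# sketch of the decompression half (the project's real helpers may well
# differ, e.g. by shelling out to an external gzip binary or adding logging):
import gzip
import os
import shutil

def unzip_file(fname, overwrite=False):
    '''Decompress a gzipped file in place, returning the new filename.'''
    newname = fname[:-3] if fname.endswith('.gz') else fname + '.unzipped'
    if os.path.exists(newname) and not overwrite:
        raise IOError("Output file already exists: %s" % newname)
    with gzip.open(fname, 'rb') as fin, open(newname, 'wb') as fout:
        shutil.copyfileobj(fin, fout)
    os.unlink(fname)
    return newname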
    def create_unsaved_lane(self, fastqs):
        '''
        Gather statistics from the fastq files and generate a Lane object
        not yet saved to the database.
        '''
        status = Status.objects.get(code='complete')
        machine = Machine.objects.get(code__iexact=self.machine)

        # Don't save this to the db just yet.
        lane = Lane(library=self.library,
                    facility=self.facility,
                    lanenum=self.lanenum,
                    machine=machine,
                    flowcell=self.flowcell,
                    rundate=self.rundate,
                    summaryurl=self.url,
                    runnumber=self.runnumber,
                    status=status,
                    flowlane=0)

        if len(fastqs) == 1:
            lane.paired = False
        elif len(fastqs) == 2:
            lane.paired = True
        else:
            raise ValueError(
                "Can only process either one or two fastq files per lane.")

        # Assumes the first fastq is representative.
        lane.readlength = self.fastq_readlength(fastqs[0])
        lane.reads = self.fastq_readcount(fastqs[0])
        lane.passedpf = lane.reads  # This is a bit of an assumption FIXME?

        lanefiles = dict()
        if self.keepfastq:
            fqtype = Filetype.objects.get(code='fq')
            for fastq in fastqs:
                checksum = checksum_file(fastq)

                if is_zipped(fastq):
                    fname = re.sub('%s$' % self.config.gzsuffix, '', fastq)
                else:
                    fname = fastq

                try:
                    lfile = Lanefile.objects.get(lane=lane, filename=fname)
                except Lanefile.DoesNotExist:
                    lfile = Lanefile(lane=lane,
                                     filename=fname,
                                     checksum=checksum,
                                     filetype=fqtype)
                    fastq = self._check_file_zipped(fastq, lfile)
                    lanefiles[fastq] = lfile

        # Returns only the new lanefiles.
        return (lane, lanefiles)
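
    # Hypothetical calling pattern for create_unsaved_lane(), deferring all
    # database writes until the caller is ready to commit (names assumed
    # from the code above):
    #
    #   lane, lanefiles = self.create_unsaved_lane(fastqs)
    #   lane.save()
    #   for lfile in lanefiles.values():
    #       lfile.lane = lane
    #       lfile.save()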
    def open_fastq(self, fastq):
        '''
        Return a file handle for a fastq file, transparently handling
        gzipped files.
        '''
        if is_zipped(fastq):
            from gzip import GzipFile
            return GzipFile(fastq)
        return open(fastq)
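
    # Hypothetical usage: counting reads without worrying about compression
    # (a fastq record spans four lines):
    #
    #   with self.open_fastq(fastq) as fdesc:
    #       numreads = sum(1 for _ in fdesc) // 4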
Example #5
  def add(self, files, final_status=None):
    '''
    Process a list of filenames (files must exist on disk). The
    optional final_status argument specifies a
    models.Status object to which the lane should be linked
    upon completion.
    '''

    # We need at least one bed file.
    bed = self.identify_bed_file(files)
    if not bed:
      raise ValueError("Unable to identify any bed files in the input.")

    # Find the appropriate alignment. Note that aln is not yet saved
    # in the database.
    (aln, lane) = self.aln_from_bedfile(bed)

    # Do some heavy lifting *outside* of our database transaction, to
    # avoid locking the db for extended periods.
    chksums = dict()
    processed = []
    for fname in files:

      # Checksum now, before any compression step below: checksum_file()
      # handles both zipped and unzipped inputs.
      chksums[fname] = checksum_file(fname)

      ftype = Filetype.objects.guess_type(fname)
      if ftype is None:
        raise ValueError("File type not recognised from database: %s" % fname)
      if ftype.gzip and not is_zipped(fname):
        fname = rezip_file(fname)

      processed.append(fname)

    # All database changes should be handled by the
    # transaction-embedded method below.
    self._save_to_repository(processed, chksums, aln, final_status)

    return aln
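
# checksum_file() is another utilities-module helper; the comments above
# suggest it produces a digest whether or not the input is compressed. A
# speculative sketch, assuming an MD5 over the uncompressed content:
import gzip
from hashlib import md5

def checksum_file(fname, blocksize=65536):
  '''MD5 digest of a file's uncompressed content (assumed behaviour).'''
  opener = gzip.open if is_zipped(fname) else open
  hasher = md5()
  with opener(fname, 'rb') as fdesc:
    for block in iter(lambda: fdesc.read(blocksize), b''):
      hasher.update(block)
  return hasher.hexdigest()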
    def submit_gsnap(self, files, genome, indexdir, number, queue, wait=False):
        '''
        Submit gsnap alignment jobs, plus samtools merge and cleanup jobs,
        to the cluster.
        '''
        assert len(files) in (1, 2)

        keeptype = 'concordant' if len(files) == 2 else 'unpaired'

        bjobs = []

        # Only test the zippedness of our inputs once.
        use_gunzip = is_zipped(files[0])

        if number is None:
            LOGGER.info("Counting reads in fastq input files...")
            number = int(
                (self.count_fastq_reads(files[0], use_gunzip) / 1e6) * 20)
            LOGGER.info("Will start %d jobs on cluster.", number)

        libmatch = re.match(r'^(do\d+)_', os.path.basename(files[0]))
        if libmatch:
            libcode = libmatch.group(1)
        else:
            raise ValueError("Unable to parse library code from file name %s" %
                             files[0])

        # Copy files to cluster workdir.
        clfiles = [
            '%s_%s' % (self.namespace, os.path.basename(x)) for x in files
        ]
        for filenum in range(len(files)):
            if not self.cluster_file_exists(clfiles[filenum]):
                LOGGER.info(
                    "Copying fastq file to cluster working directory...")
                self.submitter.remote_copy_files([files[filenum]],
                                                 [clfiles[filenum]])

        # Launch the gsnap job array.
        prefix = (r'%s/%s_%s.\\\$((\\\$LSB_JOBINDEX - 1))' %
                  (self.outdir, self.namespace, libcode))

        # Create the working directory, if it's not already there.
        cmd = r'mkdir -p %s' % self.outdir

        # Build the gsnap command. Note that the '-B' option can be
        # switched to 5 if there's too much disk IO; this has the
        # downside of increasing memory requirements though.
        cmd += (
            (r' && gsnap --part=\\\$((\\\$LSB_JOBINDEX - 1))/%d -d %s -D %s' %
             (number, genome, indexdir)) +
            r' -m 2 --trim-mismatch-score=0 -N 1 -n 1' +
            r' -E 100 --pairexpect=160 -B 4' +
            r' --antistranded-penalty=1 --quality-protocol=sanger' +
            r' --nofails -A sam' +
            (r' --split-output=%s --nthreads=8' % prefix))

        if use_gunzip:
            cmd += ' --gunzip'

        cmd += " " + " ".join(clfiles)

        # The samtools view (sam->bam) step is actually where we assume
        # paired-end sequencing.
        for tag in ('uniq', 'mult'):
            samfile = r'%s.%s_%s' % (prefix, keeptype, tag)

            # Sort here rather than later as it's more efficient.
            cmd += (
                r' && samtools view -b -S -o - %s | samtools sort -o %s.bam -'
                % (samfile, samfile))

        # Clean up the sam files.
        cmd += r' && rm %s.nomapping' % prefix
        cmd += (
            (r' && rm %s.' % prefix) +
            r'{unpaired,paired,halfmapping,concordant}_' +
            r'{transloc,mult,circular,uniq,uniq_scr,uniq_long,uniq_inv,uniq_circular}'
        )

        cmd = (
            r'if [ ! -f %s.%s_mult.bam ]; then %s; else echo Skipping %s; fi' %
            (prefix, keeptype, cmd, prefix))

        LOGGER.info("Submitting job array of %d jobs.", number)
        jobname = 'gsnap[1-%d]%%%d' % (number, self.throttle)
        LOGGER.debug("Job name: %s", jobname)
        LOGGER.debug("Command: %s", cmd)
        bjobs.append(
            self.submitter.submit_command(cmd,
                                          mem=self.memsize * 1024,
                                          queue=queue,
                                          auto_requeue=False,
                                          jobname=jobname))

        mergejob = self._merge_bams(bjobs,
                                    libcode,
                                    queue,
                                    njobs=number,
                                    keeptype=keeptype)

        fastq_cleanup = r'rm %s' % " ".join(clfiles)

        if wait:
            self.wait_on_cluster([mergejob], fastq_cleanup)
            return None

        LOGGER.info("Submitting fastq cleanup job...")
        jobid = self.submitter.submit_command(fastq_cleanup,
                                              queue=queue,
                                              depend_jobs=[mergejob])
        return jobid
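
    # Hypothetical invocation of submit_gsnap() (the aligner object, queue
    # name and file names are illustrative only; note that input names must
    # carry a doNNNN_ library-code prefix to satisfy the regex above):
    #
    #   jobid = aligner.submit_gsnap(['do1234_p1.fq.gz', 'do1234_p2.fq.gz'],
    #                                genome='mm10',
    #                                indexdir='/path/to/gmap_db',
    #                                number=None, queue='normal', wait=False)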
Example #7
    def run(self, flowcell, flowlane=None, fcq=None, destdir=None):
        '''The main entry point for the class.'''
        multiplexed = {}

        if destdir is None:
            destdir = self.conf.incoming

        # Get the list of new lanes from the flow cell.
        if fcq is None:
            fcq = FlowCellQuery(flowcell,
                                flowlane,
                                lims=self.lims,
                                trust_lims_adapters=self.trust_lims_adapters)

        flowlanes = set()
        if fcq.lims_fc.analysis_status not in self.ready:
            LOGGER.info("flow cell status '%s'", fcq.lims_fc.analysis_status)
            sys.exit("Flow cell analysis status not yet completed.")

        for (lanenum, libset) in fcq.lane_library.items():
            if lanenum not in multiplexed:
                multiplexed[lanenum] = set()
            for lib in libset:
                if fcq.lib_status[lib] == 'new' or not self.db_library_check:

                    # Only register the lane for demultiplexing if it has not
                    # already been demultiplexed (i.e. lib is not in
                    # lane.lims_samples()).
                    if not fcq.lane_demuxed[lanenum]:
                        multiplexed[lanenum].add(lib)

                    flowlanes.add((fcq.lims_fc.fcid, lanenum))

        if len(flowlanes) == 0:
            LOGGER.info("No ready lanes for flowcell '%s'", flowcell)
            sys.exit("No lanes to process.")

        # We need to set our working directory to something suitable
        # before we start; otherwise we end up demuxing into a home
        # directory or similar.
        pwd = os.getcwd()
        os.chdir(destdir)

        downloading = Status.objects.get(code='downloading data')
        downloaded = Status.objects.get(code='downloaded')

        # Track download failures across all lanes.
        failed_fnames = {}

        # For each lane...
        path = destdir
        for (flowcell, flowlane) in flowlanes:

            # Mark our lane(s) as active (note that each library has its own
            # version of this lane).
            for lane in Lane.objects.filter(flowcell=flowcell,
                                            flowlane=flowlane):
                lane.status = downloading
                lane.save()

            # retrieve file
            fetcher = FQFileFetcher(destination=path,
                                    lims=self.lims,
                                    test_mode=self.test_mode,
                                    unprocessed_only=True,
                                    force_download=self.force_download)
            fetcher.fetch(flowcell, flowlane)

            if self.test_mode:
                print("Test Mode: skipping download of %s lane %s to %s" %
                      (flowcell, flowlane, path))
                continue

            for fname in fetcher.targets:
                if len(fname) > 0:

                    # Check file was retrieved.
                    if not os.path.exists(fname):
                        LOGGER.error("Can't seem to find expected file '%s'",
                                     fname)
                        failed_fnames[fname] = fname
                    else:
                        muxed_libs = multiplexed[flowlane]
                        if len(muxed_libs) > 1:

                            # Demultiplex file if required. Here we unfortunately
                            # have to unzip the data, and we will rezip it
                            # following the process regardless of its input state.
                            if is_zipped(fname):
                                fname = unzip_file(fname)
                            LOGGER.info(
                                "Demultiplexing file %s for libraries: %s",
                                fname, ", ".join(muxed_libs))
                            self.demultiplex(muxed_libs, fname)
                            for lib in muxed_libs:
                                self.output_files += [
                                    rezip_file(dmf)
                                    for dmf in self._demux_files[lib]
                                ]
                        else:
                            LOGGER.info(
                                "File does not require demultiplexing: %s",
                                fname)
                            self.output_files.append(fname)

        for fname in self.output_files:
            if fname not in failed_fnames:
                # The following call parses both regular fastq filenames
                # and the 10X tarball filenames.
                (code, flowcell, flowlane,
                 flowpair) = parse_incoming_fastq_name(os.path.basename(fname),
                                                       ext=r'.(fq.gz|tar)')
                LOGGER.info(
                    "Changing code=%s, flowcell=%s, flowlane=%s, flowpair=%s to 'downloaded'",
                    code, flowcell, flowlane, flowpair)
                try:
                    lane = Lane.objects.get(flowcell=flowcell,
                                            flowlane=flowlane,
                                            library__code=code)
                    lane.status = downloaded
                    lane.save()
                except Lane.DoesNotExist as _err:
                    try:
                        lib = Library.objects.search_by_name(code)
                    except Library.DoesNotExist as _err:
                        LOGGER.error(
                            "No library %s. Unable to register lane for the library.",
                            code)
                        continue
                    LOGGER.info("Registering lane for %s.", fname)
                    facobj = Facility.objects.get(code='CRI')
                    machine_obj = Machine.objects.get(code__iexact='Unknown')
                    lane = Lane(facility=facobj,
                                library=lib,
                                flowcell=flowcell,
                                flowlane=flowlane,
                                lanenum=Lane.objects.next_lane_number(lib),
                                status=downloaded,
                                rundate='2008-01-01',
                                paired=False,
                                genomicssampleid='',
                                usersampleid=code,
                                runnumber='',
                                seqsamplepf='',
                                seqsamplebad='',
                                failed=False,
                                machine=machine_obj)
                    lane.save()
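
# parse_incoming_fastq_name() above is assumed to unpack names of the form
# <libcode>_<flowcell>_<flowlane>p<flowpair><ext>. A speculative sketch (the
# real parser, and the exact naming scheme, may well differ):
import re

def parse_incoming_fastq_name(fname, ext=r'\.fq\.gz'):
    namepat = re.compile(r'^(\w+)_(\w+)_(\d+)p(\d+)(?:%s)$' % ext)
    matchobj = namepat.match(fname)
    if matchobj is None:
        raise ValueError("Unexpected fastq filename: %s" % fname)
    return matchobj.group(1, 2, 3, 4)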