Example #1
    def make_bed_graph(self, aln):
        '''
        Code wrapper for makeWiggle.
        '''
        aln = Alignment.objects.get(
            id=aln.id)  # Reload passed object within transaction.
        bed = aln.alnfile_set.filter(filetype=self.bedtype).exclude(
            filename__contains='chr21')[0]

        # Note makeWiggle can read gzipped bed files directly; we use that fact here.
        lib = aln.lane.library
        bedFN = bed.repository_file_path

        # Write to local directory first.
        bgrBASE = os.path.splitext(bed.filename)[0]
        bgrFN = bgrBASE + self.bgrtype.suffix
        cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
        LOGGER.debug(cmd)
        if not self.testMode:
            call_subprocess(cmd, shell=True, path=self.conf.hostpath)
            if not os.path.exists(bgrFN):
                LOGGER.error("Failed to create bgr file '%s'" % (bgrFN, ))
            else:
                chksum = checksum_file(bgrFN)
                bgr = Alnfile(filename=os.path.basename(bgrFN),
                              checksum=chksum,
                              filetype=self.bgrtype,
                              alignment=aln)
                bgrFN = rezip_file(bgrFN)
                move(bgrFN, bgr.repository_file_path)
                set_file_permissions(self.conf.group, bgr.repository_file_path)
                bgr.save()
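The compress-then-move pattern above hinges on rezip_file, which (judging by its use throughout these examples) gzips a file and returns the new path. A minimal sketch of such a helper, assuming gzip-in-place semantics; the name rezip_file_sketch and the overwrite handling are illustrative, not the real implementation:

import gzip
import os
import shutil

def rezip_file_sketch(fname, overwrite=False):
    # Gzip fname to fname + '.gz', drop the original, return the new path.
    gzname = fname + '.gz'
    if os.path.exists(gzname) and not overwrite:
        raise IOError("Refusing to overwrite %s" % gzname)
    with open(fname, 'rb') as infh:
        with gzip.open(gzname, 'wb') as outfh:
            shutil.copyfileobj(infh, outfh)
    os.remove(fname)
    return gzname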
Example #2
    def _check_file_zipped(self, fname, fobj):
        # Logging currently handled by the utilities module.
        zipped = is_zipped(fname)
        if fobj.filetype.gzip and not zipped:
            fname = rezip_file(fname, overwrite=True)
        elif not fobj.filetype.gzip and zipped:
            fname = unzip_file(fname, overwrite=True)
        return fname
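_check_file_zipped normalises a file's on-disk compression state to match what its filetype expects. The is_zipped test it relies on is most simply a check for the two-byte gzip magic number; a hedged sketch (the real utilities module may test differently):

def is_zipped_sketch(fname):
    # A gzipped file starts with the magic bytes 0x1f 0x8b.
    with open(fname, 'rb') as fh:
        return fh.read(2) == b'\x1f\x8b'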
Example #3
def run_qc(fnames, workdir, destination=None, cleanup=True, register=False):

    with LaneFastQCReport(fastqs=fnames, workdir=workdir, lane=0) as qc:
        # Generate qc reports
        qc.run_fastqc(qc.fastqs)
        qc.postprocess_results(qc.fastqs)

        # Create the list of files on disk, compressing some of them first
        # where needed.
        dfiles = []
        # NB! This is not elegant; a better test would be
        #     "if ftype.gzip and os.path.splitext(fname)[1] != CONFIG.gzsuffix:".
        #     However, this code is set up not to interact with the database directly.
        for fn in qc.output_files:
            if fn.endswith('txt') or fn.endswith('tar'):
                dfn = rezip_file(fn)
                dfiles.append(dfn)
            else:
                dfiles.append(fn)

        if destination is not None:
            # transfer files to destination
            for dfn in dfiles:
                # set permissions
                set_file_permissions(CONFIG.group, dfn)
                # transfer file
                transfer_file(dfn, destination)

        if register:
            # register QC files in repository
            argslist = []
            for (fn, md5) in zip(qc.output_files, qc.output_md5s):
                argslist.append(os.path.basename(fn))
                argslist.append(md5)
            # register files in repository
            cmd = "cs_addFile.py --qcfile -M --program_name %s " % qc.program_name
            cmd += " ".join(argslist)
            print "Executing \"%s\" ..." % cmd
            subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
            (stdout, stderr) = subproc.communicate()
            retcode = subproc.wait()
            if stdout:
                sys.stdout.write(stdout)
            if stderr:
                sys.stderr.write(stderr)
            if retcode != 0:
                sys.stderr.write("cs_addFile.py exited with code %d\n" % retcode)

        if cleanup:
            # Remove local files.
            # Assuming the fastqc report dir is still around, construct its name.
            # NB! A cleaner way would be to save the dir name to self.bpath in
            #     postprocess_results in the LaneQCReport class and use that value.
            #     Better still, LaneFastQCReport could keep track of all the
            #     temporary files it creates.
            for dfn in dfiles:
                os.remove(dfn)
                if dfn.endswith('pdf'):
                    fqc_dirname = os.path.splitext(dfn)[0]
                    rmtree(fqc_dirname)
                    zipfile = fqc_dirname + '.zip'
                    os.remove(zipfile)
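run_qc pairs each output file with an MD5 from qc.output_md5s; elsewhere (Examples #1 and #4) the same digests come from checksum_file, noted there to work on zipped files too. A minimal chunked-MD5 sketch of such a helper (checksum_file_sketch is a hypothetical stand-in):

import hashlib

def checksum_file_sketch(fname, blocksize=1024 * 1024):
    # MD5 over the raw file bytes, read in chunks to bound memory use.
    md5 = hashlib.md5()
    with open(fname, 'rb') as fh:
        for chunk in iter(lambda: fh.read(blocksize), b''):
            md5.update(chunk)
    return md5.hexdigest()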
Example #4
  def add(self, files, final_status=None):

    '''
    Process a list of filenames (files must exist on disk). The
    optional final_status argument specifies a
    models.Status object to which the lane should be linked
    upon completion.
    '''

    # We need at least one bed file.
    bed = self.identify_bed_file(files)
    if not bed:
      raise ValueError("Unable to identify any bed files in the input.")

    # Find the appropriate alignment. Note that aln is not yet saved
    # in the database.
    (aln, lane) = self.aln_from_bedfile(bed)

    # Do some heavy lifting *outside* of our database transaction, to
    # avoid locking the db for extended periods.
    chksums = dict()
    processed = []
    for fname in files:

      # Don't waste time compressing the file just to checksum it:
      # checksum_file works on zipped files too.
      chksums[fname] = checksum_file(fname)

      ftype = Filetype.objects.guess_type(fname)
      if ftype is None:
        raise ValueError("File type not recognised from database: %s" % fname)
      if ftype.gzip and not is_zipped(fname):
        fname = rezip_file(fname)

      processed.append(fname)

    # All database changes should be handled by the
    # transaction-embedded method below.
    self._save_to_repository(processed, chksums, aln, final_status)

    return aln
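The split in add(), with checksums and compression done before the transaction and only database writes inside it, keeps row locks short. With modern Django the transactional half typically looks like the sketch below; the body is hypothetical, since _save_to_repository itself is not shown in these examples:

import os
from django.db import transaction

def _save_to_repository_sketch(self, processed, chksums, aln, final_status=None):
    # Only quick database writes happen while the transaction is open;
    # the slow checksum/compression work has already been done by add().
    with transaction.atomic():
        aln.save()
        for fname in processed:
            Alnfile.objects.create(
                filename=os.path.basename(fname),
                checksum=chksums.get(fname),  # NB: keys are the pre-compression names.
                filetype=Filetype.objects.guess_type(fname),
                alignment=aln)
        if final_status is not None:
            aln.lane.status = final_status
            aln.lane.save()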
Example #5
  def add_bam_to_lane(self, bam, lane, tc1=False, chrom_sizes=None):
    '''
    Generate a bed file from a bam file and add both to the given
    lane. This method is typically used from within an ipython shell to
    handle unusual cases outside the main pipeline. Note that genome
    and data provenance info is passed in via the class attributes
    prog and params.
    '''
    bam_to_bed = BamToBedConverter(tc1=tc1, chrom_sizes=chrom_sizes)
    base       = os.path.splitext(bam)[0]
    
    bedtype = Filetype.objects.get(code='bed')
    bed_fn  = base + bedtype.suffix
    beds    = bam_to_bed.convert(bam, bed_fn)
    chksums = dict( (fname, checksum_file(fname)) for fname in [bam] + beds )

    # First bed file is the main one.
    aln     = self._create_alignment(beds[0], lane)

    if bedtype.gzip:
      bedgz = [ rezip_file(bed) for bed in beds ]
    else:
      bedgz = beds # Files are already in the desired on-disk state.

    self._save_to_repository([bam] + bedgz, chksums, aln)
Example #6
        # Set group ownership and permissions appropriately.
        grp = self.conf.group
        set_file_permissions(grp, in_fn)
        for bed in beds:
            set_file_permissions(grp, bed)
        for wig in wigs:
            set_file_permissions(grp, wig)
        for bgr in bedgraphs:
            set_file_permissions(grp, bgr)
        for bwig in bigwigs:
            set_file_permissions(grp, bwig)

        # compress bed file(s)
        bedgz = []
        for bed in beds:
            gzname = rezip_file(bed)
            bedgz.append(gzname)

        # compress wiggle files
        wigsgz = []
        for wig in wigs:
            gzname = rezip_file(wig)
            wigsgz.append(gzname)

        # compress bedgraph files
        bgrgz = []
        for bgr in bedgraphs:
            gzname = rezip_file(bgr)
            bgrgz.append(gzname)

        # Don't compress bigwig files
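The three compression loops above repeat one pattern; bigwig files are skipped because bigWig is already a compressed binary format. A compact equivalent, under the same rezip_file semantics:

        # Compress every gzip-able output category in one pass;
        # bigwigs stay untouched since the format is already compressed.
        bedgz, wigsgz, bgrgz = ([rezip_file(f) for f in group]
                                for group in (beds, wigs, bedgraphs))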
Example #7
    def run(self, flowcell, flowlane=None, fcq=None, destdir=None):
        '''The main entry point for the class.'''
        multiplexed = {}

        if destdir is None:
            destdir = self.conf.incoming

        # get list of new lanes from flow cell
        if fcq is None:
            fcq = FlowCellQuery(flowcell,
                                flowlane,
                                lims=self.lims,
                                trust_lims_adapters=self.trust_lims_adapters)

        flowlanes = set()
        if fcq.lims_fc.analysis_status not in self.ready:
            LOGGER.info("flow cell status '%s'", fcq.lims_fc.analysis_status)
            sys.exit("Flow cell analysis status not yet completed.")

        for (lanenum, libset) in fcq.lane_library.items():
            if lanenum not in multiplexed:
                multiplexed[lanenum] = set()
            for lib in libset:
                if fcq.lib_status[lib] in ('new',) or not self.db_library_check:

                    # Only register the lane for demultiplexing if this lib is
                    # not in lane.lims_samples().
                    if not fcq.lane_demuxed[lanenum]:
                        multiplexed[lanenum].add(lib)

                    flowlanes.add((fcq.lims_fc.fcid, lanenum))

        if len(flowlanes) == 0:
            LOGGER.info("No ready lanes for flowcell '%s'", flowcell)
            sys.exit("No lanes to process.")

        # We need to set our working directory to something suitable
        # before we start; otherwise we end up demuxing into a home
        # directory or similar.
        pwd = os.getcwd()
        os.chdir(destdir)

        downloading = Status.objects.get(code='downloading data')
        downloaded = Status.objects.get(code='downloaded')

        # For each lane... (failed_fnames must persist across lanes and must
        # exist even in test mode, so initialise it before the loop).
        path = destdir
        failed_fnames = {}
        for (flowcell, flowlane) in flowlanes:

            # Mark our lane(s) as active (note that each library has its own
            # version of this lane).
            for lane in Lane.objects.filter(flowcell=flowcell,
                                            flowlane=flowlane):
                lane.status = downloading
                lane.save()

            # retrieve file
            fetcher = FQFileFetcher(destination=path,
                                    lims=self.lims,
                                    test_mode=self.test_mode,
                                    unprocessed_only=True,
                                    force_download=self.force_download)
            fetcher.fetch(flowcell, flowlane)

            if self.test_mode:
                print("Test Mode: skipping download of %s lane %s to %s" %
                      (flowcell, flowlane, path))
                continue

            for fname in fetcher.targets:
                if len(fname) > 0:

                    # Check file was retrieved.
                    if not os.path.exists(fname):
                        LOGGER.error("Can't seem to find expected file '%s'",
                                     fname)
                        failed_fnames[fname] = fname
                    else:
                        muxed_libs = multiplexed[flowlane]
                        if len(muxed_libs) > 1:

                            # Demultiplex file if required. Here we unfortunately
                            # have to unzip the data, and we will rezip it
                            # following the process regardless of its input state.
                            if is_zipped(fname):
                                fname = unzip_file(fname)
                            LOGGER.info(
                                "Demultiplexing file %s for libraries: %s",
                                fname, ", ".join(muxed_libs))
                            self.demultiplex(muxed_libs, fname)
                            for lib in muxed_libs:
                                self.output_files += [
                                    rezip_file(dmf)
                                    for dmf in self._demux_files[lib]
                                ]
                        else:
                            LOGGER.info(
                                "File does not require demultiplexing: %s",
                                fname)
                            self.output_files.append(fname)

        for fname in self.output_files:
            if fname not in failed_fnames:
                # The next line will parse regular Fastq filenames or the 10X tarball filenames.
                (code, flowcell, flowlane,
                 flowpair) = parse_incoming_fastq_name(os.path.basename(fname),
                                                       ext=r'.(fq.gz|tar)')
                LOGGER.info(
                    "Changing code=%s, flowcell=%s, flowlane=%s, flowpair=%s to 'downloaded'",
                    code, flowcell, flowlane, flowpair)
                try:
                    lane = Lane.objects.get(flowcell=flowcell,
                                            flowlane=flowlane,
                                            library__code=code)
                    lane.status = downloaded
                    lane.save()
                except Lane.DoesNotExist, _err:
                    try:
                        lib = Library.objects.search_by_name(code)
                    except Library.DoesNotExist, _err:
                        LOGGER.error(
                            "No library %s. Unable to register lane for the library.",
                            code)
                        continue
                    LOGGER.info("Registering lane for %s.", fname)
                    facobj = Facility.objects.get(code='CRI')
                    machine_obj = Machine.objects.get(
                        code__iexact=str('Unknown'))
                    lane = Lane(facility=facobj,
                                library=lib,
                                flowcell=flowcell,
                                flowlane=flowlane,
                                lanenum=Lane.objects.next_lane_number(lib),
                                status=downloaded,
                                rundate='2008-01-01',
                                paired=False,
                                genomicssampleid='',
                                usersampleid=code,
                                runnumber='',
                                seqsamplepf='',
                                seqsamplebad='',
                                failed=False,
                                machine=machine_obj)
                    lane.save()
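parse_incoming_fastq_name, used near the end of run() above, pulls the library code, flowcell, flowlane and read pair out of an incoming filename, with ext widened to also accept 10X tarballs. A hedged sketch of such a parser; the code_flowcell_s_lane_pair layout assumed here is illustrative and the real naming convention may differ:

import re

def parse_incoming_fastq_name_sketch(fname, ext=r'.(fq.gz|tar)'):
    # Hypothetical layout: 'do12345_AC123XX_s_3_1.fq.gz' ->
    # ('do12345', 'AC123XX', 3, 1).
    pattern = re.compile(r'^(.+)_([A-Za-z0-9]+)_s_(\d+)_(\d+)%s$' % ext)
    match = pattern.match(fname)
    if match is None:
        raise ValueError("Unexpected fastq filename: %s" % fname)
    (code, flowcell, flowlane, flowpair) = match.groups()[:4]
    return (code, flowcell, int(flowlane), int(flowpair))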
Example #8
    def insert_into_repository(self, move_files=True):
        '''Insert self.output_files into the database.'''

        if len(self.output_files) == 0:
            self.generate()

        params = {self.target_name: self.target}
        qcobj = self.data_process.objects.create(**params)
        DataProvenance.objects.create(program=self._dbprog,
                                      parameters=self.program_params,
                                      rank_index=1,
                                      data_process=qcobj)

        for i, fname in enumerate(self.output_files):
            if len(self.output_md5s) != len(self.output_files):
                checksum = None
            else:
                checksum = self.output_md5s[i]

            LOGGER.info("Inserting %s", fname)
            # Note: this will fail if multiple types match.
            ftype = Filetype.objects.guess_type(fname)

            if os.path.isabs(fname):
                fpath = fname
            else:
                fpath = os.path.join(self.workdir, fname)

            if checksum is None or checksum == '':
                checksum = checksum_file(fpath)

            fparms = {
                self.file_target_name: qcobj,
                'filename': os.path.split(fname)[1],
                'checksum': checksum,
                'filetype': ftype
            }
            fobj = self.data_file(**fparms)

            fobj.save()

            if move_files:
                # Zip up the file if necessary.
                if ftype.gzip and os.path.splitext(
                        fname)[1] != CONFIG.gzsuffix:
                    fpath = rezip_file(fpath)
                if self.move_files:
                    dest = fobj.repository_file_path
                    # destdir = os.path.dirname(dest)
                    # if not os.path.exists(destdir):
                    #    os.makedirs(destdir)
                    # move(fpath, dest)
                    # set_file_permissions(CONFIG.group, dest)
                    if os.path.isabs(dest):
                        dest = os.path.split(dest)[0] + '/'
                    transfer_file(
                        fpath,
                        "%s@%s:%s" % (CONFIG.user, CONFIG.datahost, dest),
                        set_ownership=True
                    )  # Note that transfer_file sets destination file permissions as configured in CONFIG.
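transfer_file, given a user@host:path destination and set_ownership=True, behaves like an scp wrapper that also fixes up remote permissions. A minimal sketch under that assumption (hypothetical; the real helper may use rsync instead, and handles ownership per CONFIG):

from subprocess import check_call

def transfer_file_sketch(fpath, dest):
    # dest may be a local path or a remote scp-style 'user@host:path' target;
    # '-p' preserves modification times and modes on the copy.
    check_call(['scp', '-p', fpath, dest])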