def create_unsaved_alignment(self, bam, lane):
        '''
    Gather statistics from the bam file, and generate a bed file and a
    new Alignment object not yet saved to the database.
    '''
        bamtype = Filetype.objects.get(code='bam')
        bedtype = Filetype.objects.get(code='bed')

        # Generate a bed file from the bam.
        chrom_sizes = os.path.join(self.config.genomesizedir,
                                   self.genome.code + ".fa.length")
        if not os.path.exists(chrom_sizes):
            LOGGER.warning(
                "Unable to find chromosome sizes file %s. BED file reads will be untrimmed.",
                chrom_sizes)
            chrom_sizes = None
        bam2bed = BamToBedConverter(chrom_sizes=chrom_sizes)
        bambase = os.path.splitext(bam)[0]
        bed_fn = bambase + bedtype.suffix
        beds = bam2bed.convert(bam, bed_fn)
        if len(beds) == 1:
            bed = beds[0]
        else:
            raise ValueError("Unexpected results from BAM to BED conversion")

        LOGGER.info("Counting reads in generated bed file %s...", bed)
        (mapped, unique) = count_reads(bed)

        # We don't save this yet because we're not currently within a
        # transaction.
        aln = Alignment(lane=lane,
                        genome=self.genome,
                        total_reads=lane.total_passedpf,
                        mapped=mapped,
                        munique=unique)

        # Create bam Alnfile.
        LOGGER.info("Checksumming bam file %s...", bam)
        checksum = checksum_file(bam)
        bamobj = Alnfile(
            alignment=aln,
            filetype=bamtype,
            filename=bam,  # casually assume no gzipping FIXME
            checksum=checksum)

        # Create bed Alnfile
        LOGGER.info("Checksumming bed file %s...", bed)
        checksum = checksum_file(bed)
        bedobj = Alnfile(
            alignment=aln,
            filetype=bedtype,
            filename=bed,  # only just created, so not zipped
            checksum=checksum)

        bamkey = self._check_file_zipped(bam, bamobj)
        bedkey = self._check_file_zipped(bed, bedobj)

        alnfiles = {bamkey: bamobj, bedkey: bedobj}

        return (aln, alnfiles)
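
Every example on this page uses checksum_file() from the osqpipe utilities. As a rough orientation only (an assumption, not the pipeline's actual implementation), a helper with that behaviour, returning an MD5 hex digest and optionally reading gzipped input transparently, could be sketched as:

import gzip
import hashlib
import os

def checksum_file_sketch(fname, unzip=True, blocksize=1024 * 1024):
    '''Hypothetical stand-in: return the MD5 hex digest of fname,
    decompressing *.gz input first when unzip is True so that the sum
    reflects the uncompressed data.'''
    opener = gzip.open if (unzip and os.path.splitext(fname)[1] == '.gz') else open
    md5 = hashlib.md5()
    with opener(fname, 'rb') as fdesc:
        for chunk in iter(lambda: fdesc.read(blocksize), b''):
            md5.update(chunk)
    return md5.hexdigest()
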
Example 2
    def postprocess_results(self, fns):
        '''Checks for output and adds it to self.output_files. Note that we
    want the compressed archive to be gzipped (*.tar.gz), not zipped. We
    also want the fastqc_report.txt file stored separately and
    uncompressed.'''

        for fpath in fns:

            fname = os.path.split(fpath)[1]
            fname = re.sub(r'\.gz$', '', fname)

            # FastQC strips '.fastq' but not '.fq', so we only remove the former here.
            fname = re.sub(r'\.fastq$', '', fname)

            base = "%s_fastqc" % fname
            bpath = os.path.join(self.workdir, base)

            if not os.path.exists(bpath):
                raise StandardError("Expected output directory not found: %s" %
                                    bpath)

            # Sort out the tar-gzipped archive.
            gzarch = "%s.tar" % bpath
            tar = tarfile.open(gzarch, mode='w')

            # A little jimmying around so we only get the directory we want.
            pwd = os.getcwd()
            os.chdir(self.workdir)
            tar.add(base)
            os.chdir(pwd)

            tar.close()
            self.output_files.append(gzarch)
            self.output_md5s.append(checksum_file(gzarch))

            # The text file containing summary results. Useful for analyses.
            resfile = "%s.txt" % bpath
            copy(os.path.join(bpath, 'fastqc_data.txt'), resfile)
            self.output_files.append(resfile)
            self.output_md5s.append(checksum_file(resfile))

            # Generating a PDF for our end-users.
            html = os.path.join(bpath, 'fastqc_report.html')
            pdf = "%s.pdf" % bpath

            # FIXME resource_filename is a little brittle, would
            # resource_string be better?
            cmd = [
                'wkhtmltopdf-amd64', '--user-style-sheet',
                resource_filename(Requirement.parse('osqpipe'),
                                  'osqpipe/pipeline/fastqc_pdf_styles.css'),
                html, pdf
            ]
            call_subprocess(cmd, path=self.path)
            self.output_files.append(pdf)
            self.output_md5s.append(checksum_file(pdf))
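
As an aside: the docstring above asks for a gzip-compressed archive, while tarfile.open() is called with mode='w', which writes an uncompressed tar (compression presumably happens downstream when the file is moved into the repository). If the tarball were to be gzipped at creation time, the standard library supports that directly; a minimal sketch with hypothetical names:

import os
import tarfile

def make_targz_sketch(workdir, base):
    '''Hypothetical helper: write <workdir>/<base>.tar.gz containing the
    directory <base>, using arcname to avoid the chdir dance above.'''
    gzarch = os.path.join(workdir, base + '.tar.gz')
    with tarfile.open(gzarch, mode='w:gz') as tar:
        tar.add(os.path.join(workdir, base), arcname=base)
    return gzarch
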
def update_library_bam_readgroups(libcode, update=False):
    '''
  Add or Replace the read groups for all the bam files attached to a
  given library. If update is False (the default), picard
  AddOrReplaceReadGroups is used; if update is True then just the bam
  file header is rewritten using an internal pipeline function.
  '''
    lib = Library.objects.get(code=libcode)
    bams = Alnfile.objects.filter(alignment__lane__library__code=libcode,
                                  filetype__code='bam')
    common_args = ('VALIDATION_STRINGENCY=SILENT',
                   'TMP_DIR=%s' % CONFIG.tmpdir)

    for bam in bams:
        LOGGER.info("Updating bam file: %s", bam.filename)
        checksum = checksum_file(bam.repository_file_path, unzip=False)
        if checksum != bam.checksum:
            raise ValueError(
                "Stored bam checksum does not agree with that in the repository."
            )
        if update:
            LOGGER.debug("Rewriting bam file header: %s", bam.filename)
            update_bam_readgroups(bam)
        else:
            tmpfile = "%s.update_rg" % (bam.filename, )
            cmd = ('picard', 'AddOrReplaceReadGroups', 'INPUT=%s' %
                   bam.repository_file_path, 'OUTPUT=%s' % tmpfile, 'RGLB=%s' %
                   lib.code, 'RGSM=%s' % sanitize_samplename(lib.sample.name),
                   'RGCN=%s' % bam.alignment.lane.facility.code,
                   'RGPU=%d' % int(bam.alignment.lane.lanenum),
                   'RGPL=illumina') + common_args

            LOGGER.debug("Running command: %s", " ".join(cmd))
            call_subprocess(cmd, path=os.environ['PATH'])
            update_repo_bamfile(bam, tmpfile)
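
A hedged usage sketch follows; the 'do1234' library code is purely illustrative, and the calls assume an interactive (e.g. ipython) session with the pipeline environment loaded:

# Rewrite only the bam file headers in place via the internal pipeline function:
update_library_bam_readgroups('do1234', update=True)

# Or (the default) rerun picard AddOrReplaceReadGroups and replace the
# repository bam files:
update_library_bam_readgroups('do1234')
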
Example 4
    def make_bed_graph(self, aln):
        '''
    Code wrapper for makeWiggle.
    '''
        aln = Alignment.objects.get(
            id=aln.id)  # Reload passed object within transaction.
        bed = aln.alnfile_set.filter(filetype=self.bedtype).exclude(
            filename__contains='chr21')[0]

        # Note makeWiggle can read gzipped bed files directly; we use that fact here.
        lib = aln.lane.library
        bedFN = bed.repository_file_path

        # Write to local directory first.
        bgrBASE = os.path.splitext(bed.filename)[0]
        bgrFN = bgrBASE + self.bgrtype.suffix
        cmd = BED2BGR % (quote(bedFN), quote(bgrBASE))
        LOGGER.debug(cmd)
        if not self.testMode:
            call_subprocess(cmd, shell=True, path=self.conf.hostpath)
            if not os.path.exists(bgrFN):
                LOGGER.error("Failed to create bgr file '%s'" % (bgrFN, ))
            else:
                chksum = checksum_file(bgrFN)
                bgr = Alnfile(filename=os.path.basename(bgrFN),
                              checksum=chksum,
                              filetype=self.bgrtype,
                              alignment=aln)
                bgrFN = rezip_file(bgrFN)
                move(bgrFN, bgr.repository_file_path)
                set_file_permissions(self.conf.group, bgr.repository_file_path)
                bgr.save()
Example 5
def replace_repo_file(bam, newbam):

  bam = Alnfile.objects.get(id=bam.id) # Reload passed object within transaction.
  set_file_permissions(CONFIG.group, newbam)
  checksum = checksum_file(newbam, unzip=False)
  bam.checksum = checksum
  os.unlink(bam.repository_file_path)
  move(newbam, bam.repository_file_path)
  bam.save()
    def create_unsaved_lane(self, fastqs):
        '''
    Gather statistics from the fastq files and generate a Lane object
    not yet saved to the database.
    '''
        status = Status.objects.get(code='complete')
        machine = Machine.objects.get(code__iexact=self.machine)

        # Don't save this to the db just yet.
        lane = Lane(library=self.library,
                    facility=self.facility,
                    lanenum=self.lanenum,
                    machine=machine,
                    flowcell=self.flowcell,
                    rundate=self.rundate,
                    summaryurl=self.url,
                    runnumber=self.runnumber,
                    status=status,
                    flowlane=0)

        if len(fastqs) == 1:
            lane.paired = False
        elif len(fastqs) == 2:
            lane.paired = True
        else:
            raise ValueError(
                "Can only process either one or two fastq files per lane.")

        # Assumes the first fastq is representative.
        lane.readlength = self.fastq_readlength(fastqs[0])
        lane.reads = self.fastq_readcount(fastqs[0])
        lane.passedpf = lane.reads  # This is a bit of an assumption FIXME?

        lanefiles = dict()
        if self.keepfastq:
            fqtype = Filetype.objects.get(code='fq')
            for fastq in fastqs:
                checksum = checksum_file(fastq)

                if is_zipped(fastq):
                    fname = re.sub('%s$' % self.config.gzsuffix, '', fastq)
                else:
                    fname = fastq

                try:
                    lfile = Lanefile.objects.get(lane=lane, filename=fname)
                except Lanefile.DoesNotExist:
                    lfile = Lanefile(lane=lane,
                                     filename=fname,
                                     checksum=checksum,
                                     filetype=fqtype)
                    fastq = self._check_file_zipped(fastq, lfile)
                    lanefiles[fastq] = lfile

        # Returns only the new lanefiles.
        return (lane, lanefiles)
Example 7
class RepoFileHandler(object):
    '''Class which is almost certainly overkill given the limited
  functionality left in this script, post-refactor.'''
    @staticmethod
    def run(fns, md5files=False, archive=None, md5sums=None):
        '''Main entry point for the class.'''

        arc = None
        arc_date = None

        if archive is not None:
            try:
                arc = ArchiveLocation.objects.get(name=archive)
                arc_date = datetime.date.today()
            except ArchiveLocation.DoesNotExist, _err:
                raise SystemExit("No ArchiveLocation with name '%s'" % archive)

        i = 0
        for fname in fns:
            # The file names in the list may correspond to different lanes,
            # so we look up the lane for each file individually.
            lane = get_lane_for_file(fname)
            # Even though fname was already parsed in get_lane_for_file, we
            # parse it again here because we need the pipeline value.
            (code, facility, lanenum,
             pipeline) = parse_repository_filename(fname)

            # Initialise to None so the fallback below works even when no
            # md5sum source is available.
            chksum = None

            # If md5sums have been provided, use them.
            if md5sums is not None:
                chksum = md5sums[i]
                i += 1
            elif md5files:
                # Otherwise look for an md5sum in a .md5 file alongside the file.
                chksum = checksum_from_file(fname)

            # As a last resort, compute the md5sum directly.
            if chksum is None:
                chksum = checksum_file(fname)
            filetype = Filetype.objects.guess_type(fname)
            basefn = os.path.split(fname)[1]
            fnparts = os.path.splitext(basefn)
            if fnparts[1] == '.gz':
                basefn = fnparts[0]
            LOGGER.debug("basefn: '%s'" % (basefn))

            lanefile = Lanefile(filename=basefn,
                                checksum=chksum,
                                filetype=filetype,
                                lane=lane,
                                pipeline=pipeline,
                                archive=arc,
                                archive_date=arc_date)
            lanefile.save()
            LOGGER.info("Added %s to repository.", basefn)
Example 8
def run_picard(libcode, facility, lanenum=None, genome=None):

  bams = Alnfile.objects.filter(alignment__lane__library__code=libcode,
                                alignment__lane__facility__code=facility,
                                filetype__code='bam')
  if lanenum is not None:
    bams = bams.filter(alignment__lane__lanenum=lanenum)
  if genome is not None:
    bams = bams.filter(alignment__genome__code=genome)

  if len(bams) == 0:
    raise StandardError("Unable to find matching bam file in the repository.")

  for bam in bams:
    LOGGER.info("Confirming file checksum: %s", bam.filename)
    oldsum   = checksum_file(bam.repository_file_path, unzip=False)
    if oldsum != bam.checksum:
      raise ValueError(("MD5 checksum of bam file on disk (%s) does not agree"
                        + " with stored repository value (%s): %s")
                       % (oldsum, bam.checksum, bam.filename))
    newbam   = bam.repository_file_path + '.cleaned'
    postproc = BamPostProcessor(input_fn=bam.repository_file_path,
                                output_fn=newbam)

    # Run CleanSam
    LOGGER.info("Running CleanSam...")
    call_subprocess(postproc.clean_sam(), path=CONFIG.hostpath)

    # Run AddOrReplaceReadGroups
    LOGGER.info("Running AddOrReplaceReadGroups...")
    call_subprocess(postproc.add_or_replace_read_groups(), path=CONFIG.hostpath)
    os.unlink(postproc.cleaned_fn)

    # Run FixMateInformation
    LOGGER.info("Running FixMateInformation...")
    call_subprocess(postproc.fix_mate_information(), path=CONFIG.hostpath)
    os.unlink(postproc.rgadded_fn)

    # Quick sanity check on the output
    newcount = count_bam_reads(newbam)

    # FIXME total_reads should be total reads in bam, not in fastq.
    oldcount = bam.alignment.total_reads
    if bam.alignment.lane.paired:
      oldcount = oldcount * 2
    if newcount != oldcount:
      raise ValueError(("Read count in cleaned bam file (%d) does not agree"
                        + " with total_reads in repository (%d): %s")
                       % (newcount, oldcount, newbam))

    # Clean up and replace the old bam file with the new one.
    LOGGER.info("Replacing old bam file with new: %s", bam.repository_file_path)
    replace_repo_file(bam, newbam)
def update_repo_bamfile(bam, newfile):
    '''
  Carefully replace a bam file in the repository.
  '''
    # Ensure we're using the latest bam object from the database within
    # this transaction.
    bam = Alnfile.objects.get(id=bam.id)
    checksum = checksum_file(newfile, unzip=False)
    deleteme = "%s.bak" % (bam.repository_file_path, )
    move(bam.repository_file_path, deleteme)
    move(newfile, bam.repository_file_path)
    bam.checksum = checksum
    bam.save()
    os.unlink(deleteme)
Example 10
    def _remove_primary_files(self, fobjs):
        '''
    Deletes primary copies of the archived files.
    '''
        # If file has been in Archive for long enough or force_delete,
        # delete the source.
        files_deleted = 0
        for fobj in fobjs:

            archpath = fobj.repository_file_path
            repopath = fobj.original_repository_file_path

            if self.force_delete:
                LOGGER.warning(\
                  "Executing forced deletion. Archive information: date=%s file=%s."
                  + " Removing %s", fobj.archive_date, archpath, repopath)
            else:
                LOGGER.info(\
                  "More than %d days passed since archiving %s."
                  + " (Archive date=%s).",
                  self.archive.host_delete_timelag, repopath, fobj.archive_date)

            # Before deleting the file, check that the file in archive not
            # only exists but has the same md5 sum as recorded in
            # repository. BEAR IN MIND that the way the queries are
            # currently set up this code is run over ALMOST EVERY BAM FILE
            # IN THE REPOSITORY!
            if os.path.exists(repopath):
                if os.path.exists(archpath):
                    checksum = checksum_file(archpath)
                    if checksum != fobj.checksum:
                        LOGGER.error(\
                          "Error: Archive file checksum (%s) not same as in repository (%s)."
                          + " Can not delete the file!", checksum, fobj.checksum)
                    else:
                        LOGGER.warning("Removing %s.", repopath)
                        os.unlink(repopath)
                        files_deleted += 1
                else:
                    LOGGER.error(
                        "File %s recorded to be in archive but missing on disk.",
                        archpath)
                    continue

        if files_deleted == 0:
            LOGGER.info("Zero files deleted from repository.")
        else:
            LOGGER.warning("%d files removed from repository.", files_deleted)
Example 11
    def _register_file_in_archive(self, fobj):
        '''
    Given a file name (or file path), and the name of an archive as recorded
    in the repository, make sure there is a valid copy of the file in the
    archive and delete the file from the primary repository file tree.
    '''
        LOGGER.warning("""Registering '%s' in archive.""", fobj)

        # Reload the Datafile object within this transaction.
        fobj = type(fobj).objects.get(filename=fobj.filename)
        (fobj, previously_archived) = self._set_archive_location(fobj)
        archpath = fobj.repository_file_path

        # Errors here will typically need careful manual investigation.
        if not previously_archived or \
              (previously_archived and (self.force_overwrite or self.force_md5_check)):

            # Check that the transfer to the archive completed successfully.
            LOGGER.info(
                "Comparing md5 sum of %s in archive and in repository ...",
                fobj)

            # Raising an exception rolls back the transaction cleanly.
            if not os.path.exists(archpath):
                raise ArchiveError(
                    "Error: File has not yet appeared in the archive: %s" %
                    fobj)

            checksum = checksum_file(archpath)  # archive path

            if checksum == fobj.checksum:
                LOGGER.info("Md5 sum in repository and for %s are identical.",
                            archpath)

                # Actually record the archiving in the database.
                fobj.save()
                LOGGER.info("Saved file in the %s archive: %s", fobj.archive,
                            fobj)

            else:

                raise ArchiveError(\
                  ("Error: Archive file checksum (%s) not same as in"
                  + " repository (%s) for file %s. Skipping!") % (checksum, fobj.checksum, fobj))

        return None
Example 12
    def restore_file_from_archive(self, fpath):
        '''
    Method restores the file in the archive back into the main
    repository file tree, and resets the archive flag such that the
    repository copy is now considered authoritative. Note that using
    this function may be counterproductive if running a regular cron
    job to archive all files of a specific filetype (subsequent cron
    job runs will simply move the file back into the archive
    again). It is recommended to only restore files which are not
    managed in this fashion.
    '''
        fobj = _find_file(fpath)

        if not fobj.archive:
            LOGGER.error(\
              "File %s is not currently registered to any archive location.", fobj)

        archpath = fobj.repository_file_path

        # This removes all archive metadata (the repository copy will be
        # authoritative once more). Note that we are not deleting the
        # archived file (since the archive will typically not allow this).
        fobj.archive = None
        fobj.archive_date = None
        fobj.save()
        repopath = fobj.repository_file_path

        # This may happen when restoring a file halfway through the
        # transfer process. We default to using the copy already present,
        # not least because if e.g. the archive is unavailable this is the
        # only way to restore access quickly.
        if os.path.exists(repopath):
            LOGGER.warning("File %s already present in repository tree.", fobj)
        else:
            copy2(archpath, repopath)

        checksum = checksum_file(repopath)

        # Another manual investigation type error.
        if checksum != fobj.checksum:
            raise ValueError(\
              "Restored file checksum (%s) does not agree with repository value (%s)."
              % (checksum, fobj.checksum))
Example 13
  def add(self, files, final_status=None):

    '''
    Process a list of filenames (files must exist on disk). The
    optional final_status argument specifies a
    models.Status object to which the lane should be linked
    upon completion.
    '''

    # We need at least one bed file.
    bed = self.identify_bed_file(files)
    if not bed:
      raise ValueError("Unable to identify any bed files in the input.")

    # Find the appropriate alignment. Note that aln is not yet saved
    # in the database.
    (aln, lane) = self.aln_from_bedfile(bed)

    # Do some heavy lifting *outside* of our database transaction, to
    # avoid locking the db for extended periods.
    chksums = dict()
    processed = []
    for fname in files:

      # If the file is uncompressed, don't waste time zipping it prior
      # to checksum.
      chksums[fname] = checksum_file(fname) # also works on zipped file

      ftype = Filetype.objects.guess_type(fname)
      if ftype is None:
        raise ValueError("File type not recognised from database: %s" % fname)
      if ftype.gzip and not is_zipped(fname):
        fname = rezip_file(fname)

      processed.append(fname)

    # All database changes should be handled by the
    # transaction-embedded method below.
    self._save_to_repository(processed, chksums, aln, final_status)

    return aln
Example 14
  def add_bam_to_lane(self, bam, lane, tc1=False, chrom_sizes=None):
    '''
    Generate a bed file from a bam file and add both to the given
    lane. This method is typically used from within an ipython shell to
    handle unusual cases outside the main pipeline. Note that genome
    and data provenance info is passed in via the class attributes
    prog and params.
    '''
    bam_to_bed = BamToBedConverter(tc1=tc1, chrom_sizes=chrom_sizes)
    base       = os.path.splitext(bam)[0]
    
    bedtype = Filetype.objects.get(code='bed')
    bed_fn  = base + bedtype.suffix
    beds    = bam_to_bed.convert(bam, bed_fn)
    chksums = dict( (fname, checksum_file(fname)) for fname in [bam] + beds )

    # First bed file is the main one.
    aln     = self._create_alignment(beds[0], lane)

    # Default to the uncompressed bed files so bedgz is always defined.
    bedgz = beds
    if bedtype.gzip:
      bedgz = [ rezip_file(bed) for bed in beds ]

    self._save_to_repository([bam] + bedgz, chksums, aln)
Example 15
    def gatk_preprocess_libraries(self, libcodes, genome=None, wait=True):
        '''
    Runs our standard GATK preprocessing pipeline on a set of
    libraries. Sanity checks are made that the libraries all come from
    the same sample, are of the same type, and that the alignments are
    all against the same genome.
    '''
        libs = Library.objects.filter(code__in=libcodes)

        if libs.count() == 0:
            raise StandardError("No libraries match libcodes %s" %
                                ",".join(libcodes))

        # Quick sanity check.
        indivs = list(set([lib.sample.name for lib in libs]))
        if len(indivs) > 1:
            raise ValueError(
                "Libraries come from multiple individual samples: %s" %
                ", ".join(indivs))

        bams = Alnfile.objects.filter(
            alignment__lane__library__code__in=libcodes, filetype__code='bam')
        if genome is not None:
            bams = bams.filter(alignment__genome__code=genome)

        # Another sanity check.
        alngens = list(set([bam.alignment.genome.code for bam in bams]))
        if len(alngens) > 1:
            raise ValueError("Alignments found against multiple genomes: %s" %
                             ", ".join(alngens))

        # Another sanity check.
        libtypes = list(
            set([bam.alignment.lane.library.libtype.code for bam in bams]))
        if len(libtypes) > 1:
            raise ValueError(
                "Alignments found against multiple library types: %s" %
                ", ".join(libtypes))

        # Another sanity check.
        tissues = list(
            set([
                bam.alignment.lane.library.sample.tissue.name for bam in bams
            ]))
        if len(tissues) > 1:
            raise ValueError("Alignments found against multiple tissues: %s" %
                             ", ".join(tissues))

        # And yet another sanity check.
        LOGGER.info("Validating bam file checksums.")
        for bam in bams:
            md5 = checksum_file(bam.repository_file_path, unzip=False)
            if md5 != bam.checksum:
                raise ValueError(
                    "Checksum for bam file %s does not agree with repository: (%s, %s)"
                    % (bam.filename, md5, bam.checksum))

        LOGGER.info("Count of %d bam files found for sample individual %s",
                    bams.count(), indivs[0])
        merged_fn = "%s.bam" % (sanitize_samplename(indivs[0]), )

        # Now we merge the files.
        self.samtools_merge_bams([bam.repository_file_path for bam in bams],
                                 merged_fn)

        self.gatk_preprocess_bam(merged_fn, bams[0].alignment, wait=wait)
Example 16
    def postprocess_hicup(self):
        '''Post-processes hicup results. NB! This function is expected to run on the cluster.'''

        # Find html report
        report_file = None
        if not os.path.isdir(self.hicup_output_dir):
            LOGGER.error("No report dir found! Expected %s.",
                         self.hicup_output_dir)
            sys.exit(1)
        for f in os.listdir(self.hicup_output_dir):
            if f.endswith('html'):
                report_file = f
                break
        if report_file is None:
            LOGGER.error("No html report found in %s.", self.hicup_output_dir)
            sys.exit(1)

        # Rename report file
        os.rename(os.path.join(self.hicup_output_dir, report_file),
                  self.hicup_report_fname)

        # Copy report to repository
        # NB! There is a vulnerability below: we assume the input file follows the Odom lab naming convention.
        code = self.fq1.split('_')[0]
        dest_file = os.path.join(self.conf.repositorydir, code,
                                 os.path.basename(self.hicup_report_fname))
        destination = "%s@%s:%s/%s/" % (self.conf.user, self.conf.datahost,
                                        self.conf.repositorydir, code)
        transfer_file(self.hicup_report_fname,
                      destination,
                      set_ownership=False)

        # Register report in repository
        md5 = checksum_file(self.hicup_report_fname, unzip=False)
        cmd = "cs_addFile.py --qcfile --program_name hicup -M %s %s" % (
            os.path.basename(self.hicup_report_fname), md5)

        subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        (stdout, stderr) = subproc.communicate()
        retcode = subproc.wait()
        if stdout:
            LOGGER.info("STDOUT:")
            LOGGER.info(stdout)
        if stderr:
            LOGGER.error("STDERR:")
            LOGGER.error(stderr)
        if retcode != 0:
            LOGGER.error("Failed to execute '%s'\n\n" % cmd)
            sys.exit(1)

        # Set chgrp for the report file
        cmd2 = "ssh -o StrictHostKeyChecking=no %s@%s 'chgrp %s %s'" % (
            self.conf.user, self.conf.datahost, self.conf.group, dest_file)
        subproc = Popen(cmd2, stdout=PIPE, stderr=PIPE, shell=True)
        (stdout, stderr) = subproc.communicate()
        retcode = subproc.wait()
        if stdout:
            LOGGER.info("STDOUT:")
            LOGGER.info(stdout)
        if stderr:
            LOGGER.error("STDERR:")
            LOGGER.error(stderr)
        if retcode != 0:
            LOGGER.error("Failed to execute '%s'\n\n" % cmd2)
            sys.exit(1)

        # Remove report dir
        shutil.rmtree(self.hicup_output_dir)
        # Remove hicup report
        os.remove(self.hicup_report_fname)
        os.remove(self.fq1)
Example 17
    def postprocess_hicup(self):
        '''Post-processes hicup results. NB! This function is expected to run on the cluster.'''

        # Find html report and bam file
        report_file = None
        bam_file = None
        if not os.path.isdir(self.hicup_output_dir):
            LOGGER.error("No report dir found! Expected %s.",
                         self.hicup_output_dir)
            sys.exit(1)
        # Find report file
        for f in os.listdir(self.hicup_output_dir):
            if f.endswith('html'):
                report_file = f
        # Find bam file
        for f in os.listdir(self.hicup_output_dir):
            if f.endswith('bam'):
                bam_file = f

        if report_file is None:
            LOGGER.error("No html report found in %s.", self.hicup_output_dir)
            sys.exit(1)

        if bam_file is None:
            LOGGER.error("No bam output found in %s.", self.hicup_output_dir)
            sys.exit(1)

        # Rename report file
        os.rename(os.path.join(self.hicup_output_dir, report_file),
                  self.hicup_report_fname)

        # Rename bam file
        os.rename(os.path.join(self.hicup_output_dir, bam_file),
                  self.hicup_report_bam)

        # Copy report to repository
        # NB! Vulnerability below! We assume the file name follows the Odom lab convention, which may not always be the case.
        code = self.fq1.split('_')[0]
        dest_file = os.path.join(self.conf.repositorydir, code,
                                 os.path.basename(self.hicup_report_fname))
        destination = "%s@%s:%s/%s/" % (self.conf.user, self.conf.datahost,
                                        self.conf.repositorydir, code)
        transfer_file(self.hicup_report_fname,
                      destination,
                      set_ownership=False)

        # Set chgrp for the report file
        cmd2 = "ssh -o StrictHostKeyChecking=no %s@%s 'chgrp %s %s'" % (
            self.conf.user, self.conf.datahost, self.conf.group, dest_file)
        subproc = Popen(cmd2, stdout=PIPE, stderr=PIPE, shell=True)
        (stdout, stderr) = subproc.communicate()
        retcode = subproc.wait()
        if stdout:
            LOGGER.info("STDOUT:")
            LOGGER.info(stdout)
        if stderr:
            LOGGER.error("STDERR:")
            LOGGER.error(stderr)
        if retcode != 0:
            LOGGER.error("Failed to execute '%s'\n\n" % cmd2)
            sys.exit(1)

        # Register report in repository
        md5 = checksum_file(self.hicup_report_fname, unzip=False)
        cmd = "cs_addFile.py --qcfile --program_name hicup -M %s %s" % (
            os.path.basename(self.hicup_report_fname), md5)

        subproc = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
        (stdout, stderr) = subproc.communicate()
        retcode = subproc.wait()
        if stdout:
            LOGGER.info("STDOUT:")
            LOGGER.info(stdout)
        if stderr:
            LOGGER.error("STDERR:")
            LOGGER.error(stderr)
        if retcode != 0:
            if 'already exists' in stderr:
                LOGGER.info("%s already in repository!" %
                            os.path.basename(self.hicup_report_fname))
            else:
                LOGGER.error("Failed to register %s in repository!\n\n" %
                             os.path.basename(self.hicup_report_fname))

        # Copy bam over for processing
        bam_destination = os.path.join(self.conf.repositorydir, "incoming")
        dest_file = os.path.join(bam_destination,
                                 os.path.basename(self.hicup_report_bam))
        destination = "%s@%s:%s" % (self.conf.user, self.conf.datahost,
                                    bam_destination)
        transfer_file(self.hicup_report_bam, destination, set_ownership=False)

        # Set chgrp for bam (yes, this is not an elegant way of doing this!)
        cmd2 = "ssh -o StrictHostKeyChecking=no %s@%s 'chgrp %s %s && touch %s.done'" % (
            self.conf.user, self.conf.datahost, self.conf.group, dest_file,
            dest_file)
        subproc = Popen(cmd2, stdout=PIPE, stderr=PIPE, shell=True)
        (stdout, stderr) = subproc.communicate()
        retcode = subproc.wait()
        if stdout:
            LOGGER.info("STDOUT:")
            LOGGER.info(stdout)
        if stderr:
            LOGGER.error("STDERR:")
            LOGGER.error(stderr)
        if retcode != 0:
            LOGGER.error("Failed to execute '%s'\n\n" % cmd2)

        # Remove report dir
        shutil.rmtree(self.hicup_output_dir)
        # Remove hicup report
        os.remove(self.hicup_report_fname)
  else:
    lane = lanelist[0]

    if genome is None:
      alns = lane.alignment_set.all()
    else:
      alns = lane.alignment_set.filter(genome__code=genome)

    if alns.count() > 1:
      LOGGER.error("Too many alignments for lane '%s'; consider supplying a genome code.", lane.id)
    else:
      aln = alns[0]

      # Calculate checksum outside the db transaction, since it takes
      # a long time.
      chksum = checksum_file(fname)
      _save_file_to_database(fname, aln, chksum)

###########################################################

if __name__ == '__main__':

  from argparse import ArgumentParser

  PARSER = ArgumentParser(
    description='Append a file to an alignment already existing in the'
    + ' repository. Hints may be provided as to which alignment should'
    + ' be used; otherwise the script will attempt to guess based on'
    + ' the filename.')

  PARSER.add_argument('-f', '--file', dest='file', type=str, required=True,
Example 19
    def insert_into_repository(self, move_files=True):
        '''Insert self.output_files into the database.'''

        if len(self.output_files) == 0:
            self.generate()

        params = {self.target_name: self.target}
        qcobj = self.data_process.objects.create(**params)
        DataProvenance.objects.create(program=self._dbprog,
                                      parameters=self.program_params,
                                      rank_index=1,
                                      data_process=qcobj)

        for i in range(len(self.output_files)):
            fname = self.output_files[i]
            if len(self.output_md5s) != len(self.output_files):
                checksum = None
            else:
                checksum = self.output_md5s[i]

            LOGGER.info("Inserting %s", fname)
            # Note: this will fail if multiple types match.
            ftype = Filetype.objects.guess_type(fname)

            if os.path.isabs(fname):
                fpath = fname
            else:
                fpath = os.path.join(self.workdir, fname)

            if checksum is None or checksum == '':
                checksum = checksum_file(fpath)

            fparms = {
                self.file_target_name: qcobj,
                'filename': os.path.split(fname)[1],
                'checksum': checksum,
                'filetype': ftype
            }
            fobj = self.data_file(**fparms)

            fobj.save()

            if move_files:
                # Zip up the file if necessary.
                if ftype.gzip and os.path.splitext(
                        fname)[1] != CONFIG.gzsuffix:
                    fpath = rezip_file(fpath)
                if self.move_files:
                    dest = fobj.repository_file_path
                    # destdir = os.path.dirname(dest)
                    # if not os.path.exists(destdir):
                    #    os.makedirs(destdir)
                    # move(fpath, dest)
                    # set_file_permissions(CONFIG.group, dest)
                    if os.path.isabs(dest):
                        dest = os.path.split(dest)[0] + '/'
                    transfer_file(
                        fpath,
                        "%s@%s:%s" % (CONFIG.user, CONFIG.datahost, dest),
                        set_ownership=True
                    )  # note that transfer_file sets destination file permissions as in CONF
Example 20
class FQFileFetcher(object):
    '''Class used to query the LIMS for fastq files associated with a
  given flowcell and download them to a destination directory.'''

    __slots__ = ('destination', 'lims', 'targets', 'test_mode', 'conf',
                 'unprocessed_only', 'force_download')

    def __init__(self,
                 destination,
                 lims=None,
                 test_mode=False,
                 unprocessed_only=False,
                 force_download=False):

        self.conf = Config()
        self.test_mode = test_mode
        self.unprocessed_only = unprocessed_only
        self.destination = destination
        self.force_download = force_download
        self.targets = set()
        if lims is None:
            lims = Lims()
        if not lims.running():
            LOGGER.error("Remote LIMS access broken... cannot continue.")
            sys.exit("LIMS not running.")
        self.lims = lims

        if self.test_mode:
            LOGGER.setLevel(DEBUG)
        else:
            LOGGER.setLevel(INFO)

    def retrieve_fqfile(self, lfile, libname):
        '''
    Given a LimsFile object and a library name, retrieve the actual
    fastq files from the LIMS and store it in self.destination.
    '''
        if not os.path.exists(self.destination):
            LOGGER.error("Destination '%s' does not exist.", self.destination)
            return

        filename = lfile.uri.split('/')[-1]
        LOGGER.info("LIMS File: %s", filename)

        # Current file naming convention:
        # <SLX ID>.<run number>.s_<lane no>.r_<N>.fq.gz
        # Where N=1 indicates single end sequencing
        #       N=2 indicates paired end
        #       N=3 indicates paired end, multiplexed
        #       N=4 indicates dual-indexed.
        #
        # Old file naming convention (still supported):
        # s_<lane no>(?:_<N>)_sequence.txt.gz
        # Where N is only present for non-single-end sequencing
        # and means the same as above.

        # This regex supports both old and new naming conventions.
        fnpat = re.compile(  # FIXME see parse_incoming_fastq_name
            r"SLX\-\d+\.[\.\w-]+\.s_\d+\.r_(\d+).fq.gz$" +
            r"|s_\d+(?:_(\d+))?_sequence.txt.gz$" +
            r"|s_\d+\.(tar)$")  # 10X_FASTQ_TAR
        matchobj = fnpat.search(filename)

        if matchobj is None:
            LOGGER.error(
                "FASTQ file does not conform to" +
                " known naming convention: %s", filename)
            return

        flowpair = 1
        fastqtar = False
        if matchobj.group(1) is not None:  # New naming
            stype = int(matchobj.group(1))
            if stype > 1:  # it's a paired-end-style name
                flowpair = stype
        elif matchobj.group(2) is not None:  # Old naming
            flowpair = int(matchobj.group(2))
        elif matchobj.group(3) == 'tar':  # 10X FASTQ tar file, probably.
            fastqtar = True
        else:
            LOGGER.error("FASTQ file name regex gave unexpected" +
                         " results; probable error in regex code.")
            return

        # The following if statement was added by Margus to deal properly
        # with PE multiplexed lanes where the second read is suffixed _3.fq
        # rather than _2.fq.
        if (flowpair == 3):
            flowpair = 2
        elif (flowpair == 4):
            LOGGER.warning(
                "Dual indexed files not yet supported. However, in current" +
                " usage this is likely to be mislabeled by LIMS as part" +
                " of a wider flowcell annotation.")
            #        return   # If we ever start dual indexing this will need to change
            flowpair = 2

        # Do not use underscores here.
        sample_id = libname.lower().replace(" ", "")
        dst = build_incoming_fastq_name(sample_id, lfile.lane.flowcell.fcid,
                                        lfile.lane.lane, flowpair)

        if fastqtar:  # s/.fq$/.tar/
            dst = os.path.splitext(dst)[0] + '.tar'

        # If the final file has already been downloaded we skip the download.
        target = os.path.join(self.destination, dst)
        if os.path.exists(target):
            LOGGER.warning("Destination file '%s' exists. Cannot overwrite.",
                           target)
            return

        compressed = False
        if os.path.splitext(lfile.uri)[1] == self.conf.gzsuffix:
            compressed = True
            dst += self.conf.gzsuffix
        target = os.path.join(self.destination, dst)

        # We also refuse to download over an intermediary gzipped
        # file. This is more likely to be an error so we raise an
        # exception here. If the file is good it should have been
        # uncompressed already.
        if os.path.exists(target):
            raise StandardError(
                "Download location '%s' exists. Cannot overwrite." % target)

        # We download these over http now.
        LOGGER.debug("Downloading LIMS file ID %s to %s", lfile.lims_id,
                     target)
        if not self.test_mode:

            # This is actually the preferred download mechanism.
            try:
                self.lims.get_file_by_uri(lfile.uri, target)

            # Fall back to download via LIMS API, if supported.
            except Exception, err:
                if lfile.lims_id is not None:
                    self.lims.get_file_by_id(lfile.lims_id, target)
                else:
                    raise err

            set_file_permissions(self.conf.group, target)

        if not os.path.exists(target) and not self.test_mode:
            LOGGER.error("Failed to retrieve file '%s'", dst)
        else:

            # Compare the md5sum against those available in upstream LIMS.
            if not self.test_mode and lfile.md5sum is not None:
                md5 = checksum_file(target, unzip=False)
                if md5 != lfile.md5sum:
                    raise StandardError(
                        "File md5sum (%s) disagrees with upstream LIMS (%s): %s"
                        % (md5, lfile.md5sum, target))

        # Files are typically still compressed at this stage. This should
        # be handled seamlessly by downstream code.
        self.targets.add(target)
        LOGGER.info("Downloaded file to %s", target)
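
For reference, a small self-contained sketch showing which branch of the filename regex each naming convention hits; the example filenames below are invented to follow the conventions described in the comments above, not taken from real runs:

import re

fnpat = re.compile(r"SLX\-\d+\.[\.\w-]+\.s_\d+\.r_(\d+).fq.gz$"
                   r"|s_\d+(?:_(\d+))?_sequence.txt.gz$"
                   r"|s_\d+\.(tar)$")

# New naming convention, paired end: group(1) carries the read number.
assert fnpat.search("SLX-1234.10001.s_1.r_2.fq.gz").group(1) == '2'

# Old naming convention, multiplexed paired end: group(2) carries it.
assert fnpat.search("s_1_3_sequence.txt.gz").group(2) == '3'

# 10X FASTQ tar file: group(3) flags the tar case.
assert fnpat.search("s_1.tar").group(3) == 'tar'
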
Example 21
def _edit_bam_readgroup_data(bam,
                             platform_unit=None,
                             library=None,
                             sample=None,
                             center=None):
    '''
  Fairly generic internal function which makes the actual readgroup
  annotation changes. This function is deliberately agnostic about
  where the annotation comes from; it is up to the caller to make that
  decision. Note that it is assumed that the caller function is within
  a transaction; this allows us to be sure that database-derived
  annotation passed to this function will not change during the
  procedure.
  '''
    if bam.filetype.code != 'bam':
        raise ValueError("Function requires bam file, not %s (%s)" %
                         (bam.filetype.code, bam.filename))

    # First, extract the current file header.
    LOGGER.info("Reading current bam file header.")
    cmd = ('samtools', 'view', '-H', bam.repository_file_path)
    subproc = Popen(cmd, stdout=PIPE, stderr=STDOUT)
    header = subproc.communicate()[0]
    retcode = subproc.wait()

    if retcode != 0:
        raise CalledProcessError(retcode, " ".join(cmd))

    if len(header) == 0:
        raise ValueError(
            "The bam file has no header information; is this actually a bam file?"
        )

    newheader = []
    for line in header.split("\n"):

        # Make the actual changes here.
        if re.match('@RG', line):
            LOGGER.info("Editing @RG header line.")
            fields = dict(
                field.split(':', 1) for field in line.split("\t")
                if not re.match('^@', field))
            if platform_unit is not None:
                fields['PU'] = platform_unit
            if library is not None:
                fields['LB'] = library
            if sample is not None:
                fields['SM'] = sample
            if center is not None:
                fields['CN'] = center
            newline = "@RG\t%s" % "\t".join(
                ["%s:%s" % (key, val) for (key, val) in fields.iteritems()])
            newheader.append(newline)
        else:
            newheader.append(line)
    newheader = "\n".join(newheader)

    # Replace the old header with the edited version.
    LOGGER.info("Replacing bam file header.")
    tmpbam = "%s.reheader" % bam.repository_file_path
    move(bam.repository_file_path, tmpbam)
    cmd = 'samtools reheader - %s > %s' % (tmpbam, bam.repository_file_path)

    subproc = Popen(cmd, stdin=PIPE, stdout=PIPE, stderr=STDOUT, shell=True)
    stdout = subproc.communicate(input=newheader)
    retcode = subproc.wait()

    if retcode != 0:
        raise CalledProcessError(retcode, cmd)

    LOGGER.info("Correcting bam file checksum.")
    chksum = checksum_file(bam.repository_file_path, unzip=False)
    bam.checksum = chksum
    bam.save()
    os.unlink(tmpbam)
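
For illustration, a small self-contained sketch of what the @RG edit above does to a representative header line (the tag values here are invented, not taken from a real bam file):

import re

line = "@RG\tID:1\tLB:oldlib\tSM:oldsample\tPL:illumina"

fields = dict(field.split(':', 1) for field in line.split("\t")
              if not re.match('^@', field))
fields['LB'] = 'do1234'      # hypothetical library code
fields['SM'] = 'sample_x'    # hypothetical sanitised sample name

newline = "@RG\t%s" % "\t".join("%s:%s" % (key, val)
                                for (key, val) in fields.items())
# newline is now e.g. '@RG\tID:1\tLB:do1234\tSM:sample_x\tPL:illumina'
# (tag order may vary, since a plain dict does not preserve insertion
# order on older Python versions).
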
                alns += [newaln]
            else:
                raise err

    alns = list(set(alns))

    LOGGER.info("Linking MergedAlignment to %d source Alignments.", len(alns))
    maln = MergedAlignment.objects.create()
    for aln in alns:
        maln.alignments.add(aln)

    # Raise ValidationError if the MergedAlignment contains inconsistencies.
    maln.full_clean()

    LOGGER.info("Calculating bam file MD5 checksum...")
    chksum = checksum_file(bam, unzip=False)

    LOGGER.info("Checking read count in bam file against lane records...")
    check_bam_readcount(bam, maln, readcountdicts)

    malnfile = MergedAlnfile.objects.create(alignment=maln,
                                            filename=bam,
                                            filetype=bamtype,
                                            checksum=chksum)

    if archloc is not None:
        malnfile.archive = ArchiveLocation.objects.get(name=archloc)
        malnfile.archive_date = time.strftime('%Y-%m-%d')
        malnfile.save()

    LOGGER.info("Moving file into repository.")