コード例 #1
0
ファイル: test_functions.py プロジェクト: zzygyx9119/ratatosk
 def _make_source_file_name(target,
                            label,
                            src_suffix,
                            tgt_suffix,
                            src_label=None):
     # If tgt_suffix is list, target suffix should always
     # correspond to tgt_suffix[0]
     source = target
     if isinstance(tgt_suffix, tuple) or isinstance(tgt_suffix, list):
         tgt_suffix = tgt_suffix[0]
     if tgt_suffix and not src_suffix is None:
         if src_label:
             # Trick: remove src_label first if present since
             # the source label addition here corresponds to a
             # "diff" compared to target name
             source = rreplace(rreplace(source, tgt_suffix, "",
                                        1), src_label, "",
                               1) + src_label + src_suffix
         else:
             source = rreplace(source, tgt_suffix, src_suffix, 1)
     if label:
         if source.count(label) > 1:
             print "label '{}' found multiple times in target '{}'; this could be intentional".format(
                 label, source)
         elif source.count(label) == 0:
             print "label '{}' not found in target '{}'; are you sure your target is correctly formatted?".format(
                 label, source)
         source = rreplace(source, label, "", 1)
     return source
コード例 #2
0
ファイル: test_functions.py プロジェクト: percyfal/ratatosk
        def _make_source_file_name(target_cls, source_cls, diff_label=None):
            """Build the source (parent) file name for a target task.

            The target suffix, the target label and the optional
            ``diff_label`` are stripped from ``target_cls.target``; the
            source label (if any) and the source suffix are then appended.

            :param target_cls: object providing ``target``, ``suffix`` and ``label``
            :param source_cls: callable returning an object with ``label`` and ``suffix``
            :param diff_label: extra label to strip from the target name

            :return: source file name
            """
            parent = source_cls()
            src_label = parent.label
            tgt_suffix = target_cls.suffix
            src_suffix = parent.suffix
            # Only the first element of a non-empty suffix sequence is used
            if isinstance(tgt_suffix, (tuple, list)) and len(tgt_suffix) > 0:
                tgt_suffix = tgt_suffix[0]
            if isinstance(src_suffix, (tuple, list)) and len(src_suffix) > 0:
                src_suffix = src_suffix[0]
            # Start by stripping tgt_suffix
            source = target_cls.target
            if tgt_suffix:
                source = rreplace(source, tgt_suffix, "", 1)
            # Then remove the target label and diff_label
            source = rreplace(source, target_cls.label, "", 1)
            if diff_label:
                source = rreplace(source, str(diff_label), "", 1)
            if not src_label:
                return source + str(src_suffix)
            # Use one string form of the label for replacing, appending and counting
            src_label = str(src_label)
            # Trick: remove src_label first if present since
            # the source label addition here corresponds to a
            # "diff" compared to target name
            source = rreplace(source, src_label, "", 1) + src_label + str(src_suffix)
            # The name now always contains src_label at least once, so a
            # "label not found" check can never fire; only duplicates are
            # worth reporting.
            if source.count(src_label) > 1:
                print("label '{}' found multiple times in target '{}'; this could be intentional".format(src_label, source))
            return source
コード例 #3
0
ファイル: job.py プロジェクト: SciLifeLab/ratatosk
    def _make_source_file_name(self):
        """Construct source file name from a target.

        Change target_suffix to source_suffix. Remove label from a
        target file name. A target given to a task must have its file
        name modified for the requirement. This function should
        therefore be called in the requires function. Make sure only
        to replace the last label.

        :return: string
        """
        source = self.target
        if isinstance(self.target_suffix, tuple):
            if self.target_suffix[0] and self.source_suffix:
                source = rreplace(source, self.target_suffix[0], self.source_suffix, 1)
        else:
            if self.target_suffix and self.source_suffix:
                source = rreplace(source, self.target_suffix, self.source_suffix, 1)
        if not self.label:
            return source
        if source.count(self.label) > 1:
            logger.warn("label '{}' found multiple times in target '{}'; this could be intentional".format(self.label, source))
        elif source.count(self.label) == 0:
            logger.warn("label '{}' not found in target '{}'; are you sure your target is correctly formatted?".format(self.label, source))
        return rreplace(source, self.label, "", 1)
コード例 #4
0
ファイル: job.py プロジェクト: zzygyx9119/ratatosk
    def _make_source_file_name(self,
                               parent_cls,
                               diff_label=None,
                               add_label=None):
        """Make source file name for parent tasks. Uses parent_cls to
        get parent class suffix (i.e. source suffix as viewed
        from self). The optional argument diff_label is needed for
        cases where the parent class is several steps up in the
        workflow, meaning that several labels have been added along
        the way. This is an irritating and as of yet unresolved issue.

        :param parent_cls: parent class
        :param diff_label: the "difference" in labels between self and parent.  E.g. if self.target=file.merge.sort.recal.bam depends on task with output file.merge.bam, and self.label=.recal, we would need to set the difference (.sort) here.
        :param add_label: label that should be added to parent source, e.g. read suffix 


        :return: parent task target name (source)
        """
        src_label = parent_cls().label
        tgt_suffix = self.sfx()
        src_suffix = parent_cls().sfx()
        target = self.target
        if isinstance(self.target, tuple) or isinstance(self.target, list):
            target = self.target[self._target_iter]
            self._target_iter += 1
        if isinstance(tgt_suffix, tuple) or isinstance(tgt_suffix, list):
            if len(tgt_suffix) > 0:
                tgt_suffix = tgt_suffix[0]
        if isinstance(src_suffix, tuple) or isinstance(src_suffix, list):
            if len(src_suffix) > 0:
                src_suffix = src_suffix[0]
        # Start by setting source, stripping tgt_suffix if present
        source = target
        if tgt_suffix:
            source = rreplace(target, tgt_suffix, "", 1)
        # Then remove the target label and optional diff_label
        if self.label:
            source = rreplace(source, self.label, "", 1)
        if diff_label:
            source = rreplace(source, str(diff_label), "", 1)
        if add_label:
            source = source + add_label
        if src_label:
            # Trick: remove src_label first if present since
            # the source label addition here corresponds to a
            # "diff" compared to target name
            source = rreplace(source, str(src_label), "",
                              1) + str(src_label) + str(src_suffix)
        else:
            source = source + str(src_suffix)
        if src_label:
            if source.count(str(src_label)) > 1:
                print "label '{}' found multiple times in target '{}'; this could be intentional".format(
                    src_label, source)
            elif source.count(src_label) == 0:
                print "label '{}' not found in target '{}'; are you sure your target is correctly formatted?".format(
                    src_label, source)
        return source
コード例 #5
0
ファイル: bwa.py プロジェクト: SciLifeLab/ratatosk
 def args(self):
     """Assemble the argument list: read group, reference, the two
     inputs, one ``.fastq.gz`` target per input and the output
     redirect. Sets ``self.read_group`` when it is not already set."""
     sai_first = self.input()[0]
     sai_second = self.input()[1]
     fastq_targets = []
     for sai in (sai_first, sai_second):
         fastq_name = rreplace(sai.fn, self.source_suffix, ".fastq.gz", 1)
         fastq_targets.append(luigi.LocalTarget(fastq_name))
     if not self.read_group:
         # Name of the first input without ".sai" doubles as ID and SM
         run_id = sai_first.fn.replace(".sai", "")
         # The platform should be configured elsewhere
         fields = [
             "@RG",
             "ID:{}".format(run_id),
             "SM:{}".format(run_id),
             "PL:{}".format(self.platform),
         ]
         self.read_group = "-r \"{}\"".format("\t".join(fields))
     command = [self.read_group, self.bwaref, sai_first, sai_second]
     command.extend(fastq_targets)
     command.extend([">", self.output()])
     return command
コード例 #6
0
ファイル: job.py プロジェクト: percyfal/ratatosk
    def _make_source_file_name(self, parent_cls, diff_label=None, add_label=None):
        """Make source file name for parent tasks. Uses parent_cls to
        get parent class suffix (i.e. source suffix as viewed
        from self). The optional argument diff_label is needed for
        cases where the parent class is several steps up in the
        workflow, meaning that several labels have been added along
        the way. This is an irritating and as of yet unresolved issue.

        :param parent_cls: parent class
        :param diff_label: the "difference" in labels between self and parent.  E.g. if self.target=file.merge.sort.recal.bam depends on task with output file.merge.bam, and self.label=.recal, we would need to set the difference (.sort) here.
        :param add_label: label that should be added to parent source, e.g. read suffix 


        :return: parent task target name (source)
        """
        src_label = parent_cls().label
        tgt_suffix = self.sfx()
        src_suffix = parent_cls().sfx()
        target = self.target
        if isinstance(self.target, tuple) or isinstance(self.target, list):
            target = self.target[self._target_iter]
            self._target_iter += 1
        if isinstance(tgt_suffix, tuple) or isinstance(tgt_suffix, list):
            if len(tgt_suffix) > 0:
                tgt_suffix = tgt_suffix[0]
        if isinstance(src_suffix, tuple) or isinstance(src_suffix, list):
            if len(src_suffix) > 0:
                src_suffix = src_suffix[0]
        # Start by setting source, stripping tgt_suffix if present
        source = target
        if tgt_suffix:
            source = rreplace(target, tgt_suffix, "", 1)
        # Then remove the target label and optional diff_label
        if self.label:
            source = rreplace(source, self.label, "", 1)
        if diff_label:
            source = rreplace(source, str(diff_label), "", 1)
        if add_label:
            source = source + add_label
        if src_label:
            # Trick: remove src_label first if present since
            # the source label addition here corresponds to a
            # "diff" compared to target name
            source = rreplace(source, str(src_label), "", 1) + str(src_label) + str(src_suffix)
        else:
            source = source + str(src_suffix)
        if src_label:
            if source.count(str(src_label)) > 1:
                print "label '{}' found multiple times in target '{}'; this could be intentional".format(src_label, source)
            elif source.count(src_label) == 0:
                print "label '{}' not found in target '{}'; are you sure your target is correctly formatted?".format(src_label, source)
        return source
コード例 #7
0
ファイル: bwa.py プロジェクト: SciLifeLab/ratatosk
 def requires(self):
     """Return the parent task for the derived source name.

     When the parent is ``ResyncMatesJobTask`` the parent is handed
     both read files, and ``self.is_read1`` records whether the source
     matched ``read1_suffix``.
     """
     parent_task = self.set_parent_task()
     source = self._make_source_file_name()
     if str(fullclassname(parent_task)) not in ["ratatosk.lib.utils.misc.ResyncMatesJobTask"]:
         return parent_task(target=source)
     # Ugly hack for 1 -> 2 dependency: works but should be dealt with otherwise
     if re.search(self.read1_suffix, source):
         self.is_read1 = True
         read_pair = [source, rreplace(source, self.read1_suffix, self.read2_suffix, 1)]
     else:
         self.is_read1 = False
         read_pair = [rreplace(source, self.read2_suffix, self.read1_suffix, 1), source]
     return parent_task(target=read_pair)
コード例 #8
0
ファイル: gatk.py プロジェクト: percyfal/ratatosk
 def requires(self):
     """Return the parent tasks this task depends on.

     Unless ``split_by`` is ``"chromosome"`` a single parent task on
     the first source is returned. Otherwise one parent task is
     returned per reference sequence, with target
     ``{base}-split/{base}-{ref}{ext}`` and ``target_region`` set to
     the reference name. Reference names come from the source bam file
     when it exists, else from the ``.dict`` file next to ``self.ref``;
     when neither the bam file nor ``self.ref`` exists an empty list is
     returned.

     Side effect: creates the ``{base}-split`` directory if missing.
     """
     cls = self.parent()[0]
     bamcls = self.parent()[0]().parent()[0]
     source = self.source()[0]
     if self.split_by != "chromosome":
         return [cls(target=source)]
     # Partition sources by chromosome. Need to get the
     # references from the source bam file, i.e. the source to
     # the parent task
     bamfile = rreplace(source, self.sfx(), bamcls().sfx(), 1)
     if os.path.exists(bamfile):
         samfile = pysam.Samfile(bamfile, "rb")
         try:
             refs = samfile.references
         finally:
             # Close the handle even if reading the header fails
             samfile.close()
     elif os.path.exists(os.path.expanduser(self.ref)):
         dictfile = os.path.expanduser(os.path.splitext(self.ref)[0] + ".dict")
         with open(dictfile) as fh:
             seqdict = [x for x in fh if x.startswith("@SQ")]
         # Take the whole whitespace-delimited SN value. The previous
         # class [a-zA-z0-9] cut names at the first '.', '-', ':' etc.
         # (e.g. "GL000207.1" became "GL000207"). @SQ lines without an
         # SN field are skipped instead of raising AttributeError.
         matches = [re.search(r'SN:(\S+)', x) for x in seqdict]
         refs = [m.group(1) for m in matches if m]
     else:
         return []
     outdir = "{base}-split".format(base=os.path.splitext(self.target)[0])
     if not os.path.exists(outdir):
         os.makedirs(outdir)
     basename = os.path.splitext(os.path.basename(self.target))[0]
     return [
         cls(target=os.path.join(outdir,
                                 "{base}-{ref}{ext}".format(base=basename, ref=chr_ref, ext=self.sfx())),
             target_region=chr_ref)
         for chr_ref in refs
     ]
コード例 #9
0
ファイル: gatk.py プロジェクト: zzygyx9119/ratatosk
 def output(self):
     """Return the output targets as a list.

     With a tuple ``suffix`` one ``luigi.LocalTarget`` is produced per
     suffix by swapping the first suffix in ``self.target`` for each
     suffix in turn; otherwise the list holds ``self.target`` only.
     """
     if not isinstance(self.suffix, tuple):
         return [luigi.LocalTarget(self.target)]
     targets = []
     for sfx in self.suffix:
         name = rreplace(self.target, self.suffix[0], sfx, 1)
         targets.append(luigi.LocalTarget(name))
     return targets
コード例 #10
0
ファイル: gatk.py プロジェクト: percyfal/ratatosk
    def _make_source_file_name(self, parent_cls):
        """Assume pattern is {base}-split/{base}-{ref}{ext}, as in
        CombineVariants.

        The ``{base}-split`` directory level and ``self.label`` are
        removed from ``self.target``, the trailing ``-{ref}{ext}`` piece
        is dropped and the parent suffix is appended.

        FIX ME: well, generalize

        :param parent_cls: parent class; ``parent_cls().sfx()`` supplies the suffix

        :return: source file name
        """
        split_dir = os.path.dirname(self.target)
        flattened = os.path.join(os.path.dirname(split_dir), os.path.basename(self.target))
        parts = rreplace(flattened, self.label, "", 1).split("-")
        # Drop only the last "-"-separated piece and re-join with "-";
        # joining with "" silently removed every hyphen from the
        # directory and base name (e.g. "my-run/s-1" became "myrun/s1").
        return "-".join(parts[:-1]) + parent_cls().sfx()
コード例 #11
0
ファイル: misc.py プロジェクト: SciLifeLab/ratatosk
 def _make_paired_source_file_names(self):
     """Construct source file name from a target.
     """
     source_list = self.target
     for source in source_list:
         if isinstance(self.target_suffix, tuple):
             if self.target_suffix[0] and self.source_suffix:
                 source = rreplace(source, self.target_suffix[0], self.source_suffix, 1)
         else:
             if self.target_suffix and self.source_suffix:
                 source = rreplace(source, self.target_suffix, self.source_suffix, 1)
         if not self.label:
             source_list.append(source)
         if source.count(self.label) > 1:
             logger.warn("label '{}' found multiple times in target '{}'; this could be intentional".format(self.label, source))
         elif source.count(self.label) == 0:
             logger.warn("label '{}' not found in target '{}'; are you sure your target is correctly formatted?".format(self.label, source))
     return [rreplace(x, self.label, "", 1) for x in source_list]
コード例 #12
0
ファイル: gatk.py プロジェクト: zzygyx9119/ratatosk
    def _make_source_file_name(self, parent_cls):
        """Assume pattern is {base}-split/{base}-{ref}{ext}, as in
        CombineVariants.

        The ``{base}-split`` directory level and ``self.label`` are
        removed from ``self.target``, the trailing ``-{ref}{ext}`` piece
        is dropped and the parent suffix is appended.

        FIX ME: well, generalize

        :param parent_cls: parent class; ``parent_cls().sfx()`` supplies the suffix

        :return: source file name
        """
        parent_dir = os.path.dirname(os.path.dirname(self.target))
        flattened = os.path.join(parent_dir, os.path.basename(self.target))
        pieces = rreplace(flattened, self.label, "", 1).split("-")
        # Re-join with "-" so that hyphens in the directory or base name
        # survive; joining with "" stripped all of them.
        return "-".join(pieces[:-1]) + parent_cls().sfx()
コード例 #13
0
ファイル: bwa.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Return the list of parent tasks for this task.

     The first parent gets the first source as target, except when it
     is ``ResyncMates``: then it gets both read files and
     ``self.is_read1`` records which read the source is. Any further
     parents are paired with the remaining sources.

     :raises ValueError: if the first parent is ``ResyncMates`` and the
         read type of the source is neither 1 nor 2
     """
     cls = self.parent()[0]
     source = self.source()[0]
     # Ugly hack for 1 -> 2 dependency: works but should be dealt with otherwise
     if str(fullclassname(cls)) in ["ratatosk.lib.utils.misc.ResyncMates"]:
         rt = determine_read_type(source, self.read1_suffix, self.read2_suffix)
         if rt == 1:
             self.is_read1 = True
             fq1 = source
             fq2 = rreplace(source, self.read1_suffix, self.read2_suffix, 1)
         elif rt == 2:
             self.is_read1 = False
             fq1 = rreplace(source, self.read2_suffix, self.read1_suffix, 1)
             fq2 = source
         else:
             # Previously fell through with fq1/fq2 unbound and died
             # with an opaque UnboundLocalError.
             raise ValueError("cannot determine read type of '{}' from read suffixes '{}' and '{}'".format(
                 source, self.read1_suffix, self.read2_suffix))
         retval = [cls(target=[fq1, fq2])]
     else:
         retval = [cls(target=source)]
     if len(self.parent()) > 1:
         # Builtin zip behaves the same here and exists on Python 2 and 3;
         # distinct loop names avoid clobbering cls/source on Python 2.
         retval += [parent(target=src) for parent, src in zip(self.parent()[1:], self.source()[1:])]
     return retval
コード例 #14
0
ファイル: test_functions.py プロジェクト: percyfal/ratatosk
 def _make_source_file_name(target, label, src_suffix, tgt_suffix, src_label=None):
     # If tgt_suffix is list, target suffix should always
     # correspond to tgt_suffix[0]
     source = target
     if isinstance(tgt_suffix, tuple) or isinstance(tgt_suffix, list):
         tgt_suffix = tgt_suffix[0]
     if tgt_suffix and not src_suffix is None:
         if src_label:
             # Trick: remove src_label first if present since
             # the source label addition here corresponds to a
             # "diff" compared to target name
             source = rreplace(rreplace(source, tgt_suffix, "", 1), src_label, "", 1) + src_label + src_suffix
         else:
             source = rreplace(source, tgt_suffix, src_suffix, 1)
     if label:
         if source.count(label) > 1:
             print "label '{}' found multiple times in target '{}'; this could be intentional".format(label, source)
         elif source.count(label) == 0:
             print "label '{}' not found in target '{}'; are you sure your target is correctly formatted?".format(label, source)
         source = rreplace(source, label, "", 1)
     return source
コード例 #15
0
ファイル: htslib.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Return one task per (parent, source) pair plus a ``Tabix``
     task whose target is the first source with the first parent's
     suffix swapped for the ``Tabix`` suffix."""
     vcfcls = self.parent()[0]
     indexcls = ratatosk.lib.variation.tabix.Tabix
     dependencies = []
     for parent_task, src in izip(self.parent(), self.source()):
         dependencies.append(parent_task(target=src))
     index_target = rreplace(self.source()[0], vcfcls().suffix, indexcls().suffix, 1)
     dependencies.append(indexcls(target=index_target, parent_task=fullclassname(vcfcls)))
     return dependencies
コード例 #16
0
 def args(self):
     """Build the argument list: the adapter sequence (``fiveprime``
     when the first input is read type 2, else ``threeprime``), the
     first input, the output and a redirect to a file name derived from
     the first input's path."""
     parent_task = self.parent()[0]
     adapter = self.threeprime
     read_type = determine_read_type(self.input()[0].path, self.read1_suffix, self.read2_suffix)
     if read_type == 2:
         adapter = self.fiveprime
     command = ["-a", adapter, self.input()[0], "-o", self.output(), ">"]
     redirect = rreplace(self.input()[0].path, str(parent_task().suffix[0]), self.label + self.suffix[1], 1)
     command.append(redirect)
     return command
コード例 #17
0
ファイル: site_functions.py プロジェクト: SciLifeLab/ratatosk
def collect_sample_runs(task):
    """Collect sample runs for a sample. Since it is to be used with
    MergeSamFiles it should return a list of targets.

    Each entry yielded by ``target_generator`` contributes one name:
    its third element followed by the base name of the rewritten task
    target (first element removed, label+target suffix swapped for the
    source suffix).

    :param task: current task

    :return: list of bam files for each sample run in a flowcell directory
    """
    logging.debug("Collecting sample runs for {}".format(task.target))
    sample_dir = os.path.dirname(task.target)
    sample_runs = target_generator(os.path.dirname(sample_dir),
                                   sample=[os.path.basename(sample_dir)])
    bam_list = []
    for run in sample_runs:
        prefix = run[2]
        stripped = task.target.replace(run[0], "")
        old_tail = "{}{}".format(task.label, task.target_suffix)
        renamed = rreplace(stripped, old_tail, task.source_suffix, 1)
        bam_list.append(prefix + os.path.basename(renamed))
    logging.debug("Generated target bamfile list {}".format(bam_list))
    return bam_list
コード例 #18
0
ファイル: bwa.py プロジェクト: zzygyx9119/ratatosk
 def _get_read_group(self):
     if not self.read_group:
         from ratatosk import backend
         cls = self.parent()[0]
         sai1 = self.input()[0]
         rgid = rreplace(rreplace(sai1.path, cls().sfx(), "", 1), self.add_label[0], "", 1)
         smid = rgid
         # Get sample information if present in global vars. Note
         # that this requires the
         # backend.__global_vars__["targets"] be set
         # This is not perfect but works for now
         for tgt in backend.__global_vars__.get("targets", []):
             if smid.startswith(tgt.prefix("sample_run")):
                 smid = tgt.sample_id()
                 break
         # The platform should be configured elsewhere
         rg = "\"{}\"".format("\t".join(["@RG", "ID:{}".format(rgid), "SM:{}".format(smid), "PL:{}".format(self.platform)]))
         if self.pipe:
             return rg.replace("\t", "\\t")
         else:
             return rg
     else:
         return self.read_group
コード例 #19
0
ファイル: gatk.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Task requirements: one task per (parent, source) pair, followed
     by a samtools ``Index`` task whose target is the first source with
     the first parent's suffix swapped for the index suffix. Subclasses
     whose sources follow this scheme need not reimplement it."""
     bamcls = self.parent()[0]
     indexcls = ratatosk.lib.tools.samtools.Index
     dependencies = []
     for parent_task, src in izip(self.parent(), self.source()):
         dependencies.append(parent_task(target=src))
     index_target = rreplace(self.source()[0], bamcls().sfx(), indexcls().sfx(), 1)
     dependencies.append(indexcls(target=index_target, parent_task=fullclassname(bamcls)))
     return dependencies
コード例 #20
0
ファイル: haloplex.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Task requirements: the first parent on the first source, a
     ``CombineVariants`` task targeting ``CombinedVariants.vcf`` in
     ``self.outdir``, and a samtools ``Index`` task for the first
     source."""
     bamcls = self.parent()[0]
     indexcls = ratatosk.lib.tools.samtools.Index
     bam_task = bamcls(target=self.source()[0])
     combined_task = CombineVariants(target=os.path.join(self.outdir, "CombinedVariants.vcf"))
     index_target = rreplace(self.source()[0], bamcls().sfx(), indexcls().sfx(), 1)
     index_task = indexcls(target=index_target, parent_task=fullclassname(bamcls))
     return [bam_task, combined_task, index_task]
コード例 #21
0
def generic_collect_sample_runs(task):
    """Collect sample runs for a sample. Since it is to be used with
    MergeSamFiles it should return a list of targets.

    Names are de-duplicated through a set, so the order of the
    returned list is not defined.

    :param task: current task

    :return: list of bam files for each sample run in a flowcell directory
    """
    logging.debug("Collecting sample runs for {}".format(task.target))
    sample_dir = os.path.dirname(task.target)
    sample_runs = generic_target_generator(os.path.dirname(sample_dir),
                                           sample=[os.path.basename(sample_dir)])
    src_suffix = task.parent()[0]().suffix
    names = []
    for run in sample_runs:
        prefix = run.prefix("sample_run")
        stripped = task.target.replace(run.sample_id(), "")
        old_tail = "{}{}".format(task.label, task.suffix)
        names.append(prefix + os.path.basename(rreplace(stripped, old_tail, src_suffix, 1)))
    bam_list = list(set(names))
    logging.debug("Generated target bamfile list {}".format(bam_list))
    return bam_list
コード例 #22
0
ファイル: test_functions.py プロジェクト: zzygyx9119/ratatosk
        def _make_source_file_name(target_cls, source_cls, diff_label=None):
            """Build the source (parent) file name for a target task.

            The target suffix, the target label and the optional
            ``diff_label`` are stripped from ``target_cls.target``; the
            source label (if any) and the source suffix are then appended.

            :param target_cls: object providing ``target``, ``suffix`` and ``label``
            :param source_cls: callable returning an object with ``label`` and ``suffix``
            :param diff_label: extra label to strip from the target name

            :return: source file name
            """
            parent = source_cls()
            src_label = parent.label
            tgt_suffix = target_cls.suffix
            src_suffix = parent.suffix
            # Only the first element of a non-empty suffix sequence is used
            if isinstance(tgt_suffix, (tuple, list)) and len(tgt_suffix) > 0:
                tgt_suffix = tgt_suffix[0]
            if isinstance(src_suffix, (tuple, list)) and len(src_suffix) > 0:
                src_suffix = src_suffix[0]
            # Start by stripping tgt_suffix
            source = target_cls.target
            if tgt_suffix:
                source = rreplace(source, tgt_suffix, "", 1)
            # Then remove the target label and diff_label
            source = rreplace(source, target_cls.label, "", 1)
            if diff_label:
                source = rreplace(source, str(diff_label), "", 1)
            if not src_label:
                return source + str(src_suffix)
            # Use one string form of the label for replacing, appending and counting
            src_label = str(src_label)
            # Trick: remove src_label first if present since
            # the source label addition here corresponds to a
            # "diff" compared to target name
            source = rreplace(source, src_label, "", 1) + src_label + str(src_suffix)
            # The name now always contains src_label at least once, so a
            # "label not found" check can never fire; only duplicates are
            # worth reporting.
            if source.count(src_label) > 1:
                print("label '{}' found multiple times in target '{}'; this could be intentional".format(src_label, source))
            return source
コード例 #23
0
ファイル: gatk.py プロジェクト: SciLifeLab/ratatosk
 def args(self):
     """Build the PrintReads argument list.

     :raises Exception: if ``self.ref`` is not set
     """
     # The meaning of the input option (-I) depends on whether we
     # recalibrate or not. Pointing the requirements at both
     # IndelRealigner and BaseRecalibrator(parent_task=IndelRealigner)
     # reportedly breaks the dependencies, so with recalibration the
     # task input is passed as -BQSR and the bam file name is derived
     # from it instead.
     # TODO: sort this out - is the above statement really true?
     if not self.recalibrate:
         retval = ["-I", self.input(), "-o", self.output()]
     else:
         inputfile = rreplace(self.input().fn, self.source_suffix, InputBamFile.target_suffix.default, 1)
         retval = ["-BQSR", self.input(), "-o", self.output(), "-I", inputfile]
     if not self.ref:
         raise Exception("need reference for PrintReads")
     retval.append(" -R {}".format(self.ref))
     return retval
コード例 #24
0
def organize_sample_runs(task, cls):
    """Return one bam file name per flowcell directory found next to
    ``task.target``.

    A flowcell directory is a sub-directory of the target's directory
    whose name ends with "XX". ``cls`` is not used.
    """
    # This currently relies on the folder structure sample/fc1,
    # sample/fc2 etc...
    logging.debug("Organizing samples for {}".format(task.target))
    targetdir = os.path.dirname(task.target)
    bam_list = []
    for fc in os.listdir(targetdir):
        fc_dir = os.path.join(targetdir, fc)
        if not (os.path.isdir(fc_dir) and fc_dir.endswith("XX")):
            continue
        logging.debug("Looking in directory {}".format(fc))
        # This assumes only one sample run per flowcell
        old_tail = "{}{}".format(task.label, task.target_suffix)
        bam_name = os.path.basename(rreplace(task.target, old_tail, task.source_suffix, 1))
        bam_list.append(os.path.join(fc_dir, bam_name))
    logging.debug("Generated target bamfile list {}".format(bam_list))
    return bam_list
コード例 #25
0
ファイル: gatk.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Return the parent tasks this task depends on.

     Unless ``split_by`` is ``"chromosome"`` a single parent task on
     the first source is returned. Otherwise one parent task is
     returned per reference sequence, with target
     ``{base}-split/{base}-{ref}{ext}`` and ``target_region`` set to
     the reference name. Reference names come from the source bam file
     when it exists, else from the ``.dict`` file next to ``self.ref``;
     when neither the bam file nor ``self.ref`` exists an empty list is
     returned.

     Side effect: creates the ``{base}-split`` directory if missing.
     """
     cls = self.parent()[0]
     bamcls = self.parent()[0]().parent()[0]
     source = self.source()[0]
     if self.split_by != "chromosome":
         return [cls(target=source)]
     # Partition sources by chromosome. Need to get the
     # references from the source bam file, i.e. the source to
     # the parent task
     bamfile = rreplace(source, self.sfx(), bamcls().sfx(), 1)
     if os.path.exists(bamfile):
         samfile = pysam.Samfile(bamfile, "rb")
         try:
             refs = samfile.references
         finally:
             # Close the handle even if reading the header fails
             samfile.close()
     elif os.path.exists(os.path.expanduser(self.ref)):
         dictfile = os.path.expanduser(
             os.path.splitext(self.ref)[0] + ".dict")
         with open(dictfile) as fh:
             seqdict = [x for x in fh if x.startswith("@SQ")]
         # Capture the complete whitespace-delimited SN value; the old
         # class [a-zA-z0-9] stopped at the first '.', '-', ':' etc.
         # Lines without an SN field are skipped rather than crashing
         # on a None match.
         matches = [re.search(r'SN:(\S+)', x) for x in seqdict]
         refs = [m.group(1) for m in matches if m]
     else:
         return []
     outdir = "{base}-split".format(
         base=os.path.splitext(self.target)[0])
     if not os.path.exists(outdir):
         os.makedirs(outdir)
     basename = os.path.splitext(os.path.basename(self.target))[0]
     return [
         cls(target=os.path.join(
                 outdir,
                 "{base}-{ref}{ext}".format(base=basename,
                                            ref=chr_ref,
                                            ext=self.sfx())),
             target_region=chr_ref)
         for chr_ref in refs
     ]
コード例 #26
0
ファイル: sample.py プロジェクト: vals/ratatosk.ext.scilife
def collect_sample_runs(task):
    """Collect sample runs for a sample. Since it is to be used with
    MergeSamFiles it should return a list of targets.

    Sample runs are taken from ``backend.__global_vars__["targets"]``
    when that is set (filtered on the sample id), otherwise from
    ``target_generator_handler``. Names are de-duplicated through a
    set, so the order of the returned list is not defined.

    :param task: current task

    :return: list of bam files for each sample run in a flowcell directory
    """
    logging.debug("Collecting sample runs for {}".format(task.target))
    sample_dir = os.path.dirname(task.target)
    sample = os.path.basename(sample_dir)
    if not backend.__global_vars__.get("targets", None):
        sample_runs = target_generator_handler(os.path.dirname(sample_dir),
                                               sample=[sample])
    else:
        sample_runs = []
        for tgt in backend.__global_vars__.get("targets"):
            if tgt.sample_id() == sample:
                sample_runs.append(tgt)
    src_suffix = task.parent()[0]().sfx()
    names = []
    for run in sample_runs:
        prefix = run.prefix("sample_run")
        stripped = task.target.replace(run.sample_id(), "")
        old_tail = "{}{}".format(task.label, task.suffix)
        names.append(prefix + os.path.basename(rreplace(stripped, old_tail, src_suffix, 1)))
    bam_list = list(set(names))
    logging.debug("Generated target bamfile list {}".format(bam_list))
    return bam_list
コード例 #27
0
ファイル: site_functions.py プロジェクト: SciLifeLab/ratatosk
def organize_sample_runs(task):
    """Build the list of per-flowcell source files for a sample-level target.

    Every sub-directory of the target's directory whose path ends in
    "XX" is treated as a flowcell directory. For each one, a source
    file path is produced by taking the target's base name with
    ``task.label + task.target_suffix`` replaced by
    ``task.source_suffix``.

    :param task: current task; its ``target``, ``label``,
                 ``target_suffix`` and ``source_suffix`` are read

    :return: list of source file paths, sorted by flowcell directory name
    """
    # This currently relies on the folder structure sample/fc1,
    # sample/fc2 etc... This should possibly also be a
    # configurable function?
    # NB: this is such a pain to get right I'm adding lots of debug right now
    logging.debug("Organizing samples for {}".format(task.target))
    targetdir = os.path.dirname(task.target)
    # The file name does not depend on the flowcell, so derive it once
    source_name = os.path.basename(
        rreplace(task.target,
                 "{}{}".format(task.label, task.target_suffix),
                 task.source_suffix, 1))
    bam_list = []
    # os.listdir returns entries in arbitrary order; sort so that the
    # resulting list is reproducible between runs and machines
    for fc in sorted(os.listdir(targetdir)):
        fc_dir = os.path.join(targetdir, fc)
        if not (os.path.isdir(fc_dir) and fc_dir.endswith("XX")):
            continue
        logging.debug("Looking in directory {}".format(fc))
        # This assumes only one sample run per flowcell
        bam_list.append(os.path.join(fc_dir, source_name))
    logging.debug("Generated target bamfile list {}".format(bam_list))
    return bam_list
コード例 #28
0
ファイル: gatk.py プロジェクト: SciLifeLab/ratatosk
 def requires(self):
     """Return the parent task for the source file plus an IndexBam task.

     The index target is the source name with ``self.source_suffix``
     swapped for ".bai"; the index task is told which parent class
     produced the source.
     """
     parent_cls = self.set_parent_task()
     bam = self._make_source_file_name()
     source_task = parent_cls(target=bam)
     index_task = ratatosk.lib.tools.samtools.IndexBam(
         target=rreplace(bam, self.source_suffix, ".bai", 1),
         parent_task=fullclassname(parent_cls))
     return [source_task, index_task]
コード例 #29
0
ファイル: bwa.py プロジェクト: SciLifeLab/ratatosk
 def requires(self):
     """Return one BwaAln task per read, in read1, read2 order.

     Each target is the source file name with the read suffix inserted
     in front of ``self.source_suffix``.
     """
     # From target name, generate sai1, sai2, fastq1, fastq2
     sai_targets = [
         rreplace(self._make_source_file_name(), self.source_suffix,
                  read_suffix + self.source_suffix, 1)
         for read_suffix in (self.read1_suffix, self.read2_suffix)
     ]
     return [BwaAln(target=sai) for sai in sai_targets]
コード例 #30
0
ファイル: gatk.py プロジェクト: SciLifeLab/ratatosk
 def requires(self):
     """Return the source task, its IndexBam task and a RealignerTargetCreator task.

     The index target swaps ``self.source_suffix`` for ".bai"; the
     intervals target swaps the literal ".bam" for ".intervals".
     """
     parent_cls = self.set_parent_task()
     bam = self._make_source_file_name()
     source_task = parent_cls(target=bam)
     index_task = ratatosk.lib.tools.samtools.IndexBam(
         target=rreplace(bam, self.source_suffix, ".bai", 1),
         parent_task="ratatosk.lib.tools.gatk.InputBamFile")
     intervals_task = ratatosk.lib.tools.gatk.RealignerTargetCreator(
         target=rreplace(bam, ".bam", ".intervals", 1))
     return [source_task, index_task, intervals_task]
コード例 #31
0
ファイル: bwa.py プロジェクト: zzygyx9119/ratatosk
 def args(self):
     """Assemble the argument list from the two inputs of this task.

     For each input a matching luigi.LocalTarget is derived by swapping
     the parent's suffix for the grandparent's suffix; exactly two
     inputs are expected (unpacking fails otherwise).
     """
     sai_cls = self.parent()[0]
     fastq_cls = sai_cls().parent()[0]
     fastq_targets = []
     for sai in self.input():
         fastq_path = rreplace(sai.path, sai_cls().suffix, fastq_cls().sfx(), 1)
         fastq_targets.append(luigi.LocalTarget(fastq_path))
     fastq1, fastq2 = fastq_targets
     command = ["-r", self._get_read_group(), self.bwaref]
     command += [self.input()[0].path, self.input()[1].path]
     command += [fastq1, fastq2, ">", self.output()]
     return command
コード例 #32
0
ファイル: picard.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Return the four metrics tasks required for ``self.target``.

     Each target is ``self.target`` plus the suffix of the respective
     metrics task. The HsMetricsNonDup target first has the
     DuplicationMetrics label removed from ``self.target``.
     """
     insert_task = InsertMetrics(
         target=self.target + str(InsertMetrics().suffix))
     hs_task = HsMetrics(target=self.target + str(HsMetrics().suffix))
     nondup_base = rreplace(self.target, str(DuplicationMetrics().label), "", 1)
     nondup_task = HsMetricsNonDup(
         target=nondup_base + str(HsMetrics().suffix))
     alignment_task = AlignmentMetrics(
         target=self.target + str(AlignmentMetrics().suffix))
     return [insert_task, hs_task, nondup_task, alignment_task]
コード例 #33
0
ファイル: samtools.py プロジェクト: percyfal/ratatosk
 def args(self):
     """Return the first input followed by the output path minus ``self.suffix``.

     The stripped path is wrapped in a luigi.LocalTarget.
     """
     stripped_path = rreplace(self.output().path, self.suffix, "", 1)
     prefix_target = luigi.LocalTarget(stripped_path)
     first_input = self.input()[0]
     return [first_input, prefix_target]
コード例 #34
0
ファイル: picard.py プロジェクト: zzygyx9119/ratatosk
 def output(self):
     """Return two local targets: the target itself and its companion file.

     The companion path is ``self.target`` with ``self.suffix[0]``
     swapped for ``self.suffix[1]``.
     """
     primary = luigi.LocalTarget(self.target)
     companion_path = rreplace(self.target, self.suffix[0], self.suffix[1], 1)
     return [primary, luigi.LocalTarget(companion_path)]
コード例 #35
0
ファイル: picard.py プロジェクト: zzygyx9119/ratatosk
 def args(self):
     """Return the INPUT=/OUTPUT=/METRICS_FILE= argument list.

     The metrics file name is the output path with label plus
     ``self.suffix[0]`` swapped for ``self.suffix[1]``.
     """
     first_input = self.input()[0]
     result = self.output()
     replaced = "{}{}".format(self.label, self.suffix[0])
     metrics_file = rreplace(self.output().path, replaced, self.suffix[1], 1)
     return ["INPUT=", first_input,
             "OUTPUT=", result,
             "METRICS_FILE=", metrics_file]
コード例 #36
0
ファイル: cutadapt.py プロジェクト: percyfal/ratatosk
 def args(self):
     """Return the argument list, choosing the adapter by read type.

     ``self.fiveprime`` is used when determine_read_type reports 2,
     otherwise ``self.threeprime``. The element after ">" is the input
     path with the parent's first suffix swapped for label plus
     ``self.suffix[1]``.
     """
     parent_cls = self.parent()[0]
     read_type = determine_read_type(self.input()[0].path,
                                     self.read1_suffix, self.read2_suffix)
     adapter = self.fiveprime if read_type == 2 else self.threeprime
     first_input = self.input()[0]
     trimmed = self.output()
     report = rreplace(self.input()[0].path, str(parent_cls().suffix[0]),
                       self.label + self.suffix[1], 1)
     return ["-a", adapter, first_input, "-o", trimmed, ">", report]
コード例 #37
0
 def args(self):
     """Return the first input and the output path stripped of ``self.suffix``.

     The stripped path is handed back as a luigi.LocalTarget.
     """
     path_without_suffix = rreplace(self.output().path, self.suffix, "", 1)
     prefix_target = luigi.LocalTarget(path_without_suffix)
     source = self.input()[0]
     return [source, prefix_target]
コード例 #38
0
ファイル: picard.py プロジェクト: SciLifeLab/ratatosk
 def requires(self):
     """Return the four metrics tasks required for ``self.target``.

     Target names are built from the class-level parameter defaults
     (``target_suffix.default`` / ``label.default``); for InsertMetrics
     the first element of the default suffix is used. The
     HsMetricsNonDup target has the DuplicationMetrics label removed.
     """
     hs_suffix = str(HsMetrics.target_suffix.default)
     insert_task = InsertMetrics(
         target=self.target + str(InsertMetrics.target_suffix.default[0]))
     hs_task = HsMetrics(target=self.target + hs_suffix)
     nondup_base = rreplace(self.target,
                            str(DuplicationMetrics.label.default), "", 1)
     nondup_task = HsMetricsNonDup(target=nondup_base + hs_suffix)
     alignment_task = AlignmentMetrics(
         target=self.target + str(AlignmentMetrics.target_suffix.default))
     return [insert_task, hs_task, nondup_task, alignment_task]
コード例 #39
0
ファイル: gatk.py プロジェクト: percyfal/ratatosk
 def requires(self):
     """Return one task per (parent, source) pair plus an index task.

     The index target is the first source with the suffix of the first
     parent swapped for the suffix of the samtools Index task.
     """
     bamcls = self.parent()[0]
     indexcls = ratatosk.lib.tools.samtools.Index
     source_tasks = [
         parent_cls(target=src)
         for parent_cls, src in izip(self.parent(), self.source())
     ]
     index_target = rreplace(self.source()[0], bamcls().sfx(),
                             indexcls().sfx(), 1)
     index_task = indexcls(target=index_target,
                           parent_task=fullclassname(bamcls))
     return source_tasks + [index_task]
コード例 #40
0
ファイル: haloplex.py プロジェクト: percyfal/ratatosk
 def requires(self):
     """Return the source task, a CombineVariants task and an index task.

     CombineVariants targets "CombinedVariants.vcf" inside
     ``self.outdir``. The index target is the first source with the
     first parent's suffix swapped for the samtools Index suffix.
     """
     bamcls = self.parent()[0]
     indexcls = ratatosk.lib.tools.samtools.Index
     source_task = bamcls(target=self.source()[0])
     combined_task = CombineVariants(
         target=os.path.join(self.outdir, "CombinedVariants.vcf"))
     index_target = rreplace(self.source()[0], bamcls().sfx(),
                             indexcls().sfx(), 1)
     index_task = indexcls(target=index_target,
                           parent_task=fullclassname(bamcls))
     return [source_task, combined_task, index_task]
コード例 #41
0
ファイル: tabix.py プロジェクト: zzygyx9119/ratatosk
 def requires(self):
     """Return a Bgzip task for the first source plus its Tabix index task.

     The index target is the first source with the Bgzip suffix swapped
     for the Tabix suffix.
     """
     zipcls = ratatosk.lib.variation.tabix.Bgzip
     indexcls = ratatosk.lib.variation.tabix.Tabix
     zip_task = zipcls(target=self.source()[0])
     index_target = rreplace(self.source()[0], zipcls().sfx(),
                             indexcls().sfx(), 1)
     index_task = indexcls(target=index_target,
                           parent_task=fullclassname(zipcls))
     return [zip_task, index_task]
コード例 #42
0
ファイル: gatk.py プロジェクト: percyfal/ratatosk
 def output(self):
     """Return the local targets produced by this task.

     With a tuple suffix there is one target per suffix, each derived
     from ``self.target`` by swapping the first suffix for that one.
     Otherwise the only target is ``self.target``.
     """
     if not isinstance(self.suffix, tuple):
         return [luigi.LocalTarget(self.target)]
     targets = []
     for alt_suffix in self.suffix:
         # self.suffix[0] is looked up per item so an empty tuple yields []
         alt_path = rreplace(self.target, self.suffix[0], alt_suffix, 1)
         targets.append(luigi.LocalTarget(alt_path))
     return targets