Beispiel #1
0
class Apply_VQSR(GATK):
    """
    Runs GATK ``ApplyRecalibration`` on a vcf using previously produced
    recalibration and tranches files, in either SNP or INDEL mode.
    """
    name = "Apply VQSR"
    mem_req = 8 * 1024
    time_req = 12 * 60
    persist = True

    inputs = ['vcf', 'recal', 'tranches']
    outputs = [TaskFile(name='vcf', persist=True)]

    def cmd(self, i, s, p):
        """
        Build the ApplyRecalibration command template.

        :param i: input files keyed by name; the first 'vcf', 'tranches' and 'recal' entries are used.
        :param s: settings; the template references ``reference_fasta_path``.
        :param p: parameters; ``p['glm']`` selects the mode and must be 'SNP' or 'INDEL'.
        :returns: (str) the unformatted command template.
        :raises ValueError: if ``p['glm']`` is not a supported mode.
        """
        mode = p['glm']
        if mode not in ('SNP', 'INDEL'):
            # Previously an unsupported mode fell through both branches and surfaced as an
            # UnboundLocalError on the return statement; fail with an explicit message instead.
            raise ValueError(
                "Apply_VQSR: unsupported glm {0!r}, expected 'SNP' or 'INDEL'".format(mode))

        # Both modes share every argument except the value passed to -mode, so a single
        # template is used and the validated mode is appended to it.
        return r"""
            {self.bin}
            -T ApplyRecalibration
            -R {s[reference_fasta_path]}
            -input {i[vcf][0]}
            -tranchesFile {i[tranches][0]}
            -recalFile {i[recal][0]}
            -o $OUT.vcf
            --ts_filter_level 99.9
            -mode """ + str(mode) + "\n            "
Beispiel #2
0
class CombineVariants(GATK):
    """
    Runs GATK ``CombineVariants`` over every input vcf, writing one combined vcf.
    """
    name = "Combine Variants"
    mem_req = 3 * 1024
    time_req = 12 * 60
    persist = True

    inputs = ['vcf']
    outputs = [TaskFile(name='vcf', basename='master.vcf')]

    default_params = {'genotypeMergeOptions': 'UNSORTED'}

    def cmd(self, i, s, p):
        """
        Build the CombineVariants command template plus the extra value for ``{inputs}``.

        :param i: input files keyed by name; every entry of ``i['vcf']`` becomes a ``--variant`` argument.
        :param s: settings; the template references ``reference_fasta_path``.
        :param p: parameters; ``genotypeMergeOptions`` is referenced by the template. Select from:
            UNIQUIFY - Make all sample genotypes unique by file. Each sample shared across RODs gets named sample.ROD.
            PRIORITIZE - Take genotypes in priority order (see the priority argument).
            UNSORTED - Take the genotypes in any order.
            REQUIRE_UNIQUE - Require that all samples/genotypes be unique between all inputs.
        :returns: a (template, extra format values) tuple.
        """
        # One "--variant <file>" argument per input vcf, each on its own line.
        variant_args = "\n".join("--variant {0}".format(path) for path in i['vcf'])
        template = r"""
            {self.bin}
            -T CombineVariants
            -R {s[reference_fasta_path]}
            {inputs}
            -o $OUT.vcf
            -genotypeMergeOptions {p[genotypeMergeOptions]}
        """
        return template, {'inputs': variant_args}
Beispiel #3
0
    def __init__(self, path, name=None, fmt=None, *args, **kwargs):
        """
        Register a single persisted TaskFile, built from the given path, as this tool's output.

        :param path: the path to the input file
        :param name: the name or keyword for the input file
        :param fmt: the format of the input file
        :param args: forwarded unchanged to the parent constructor
        :param kwargs: forwarded unchanged to the parent constructor
        """
        super(INPUT, self).__init__(*args, **kwargs)
        input_file = TaskFile(path=path, name=name, fmt=fmt, persist=True)
        self.add_output(input_file)
Beispiel #4
0
class PASTE(Tool):
    """
    Runs the shell ``paste`` command over all 'txt' inputs, redirecting to $OUT.txt.
    """
    inputs = ['txt']
    outputs = [TaskFile(name='txt', basename='paste.txt', persist=True)]
    time_req = 1

    def cmd(self, i, s, p):
        """
        :param i: input files keyed by name; every entry of ``i['txt']`` is passed to paste.
        :param s: settings (unused).
        :param p: parameters (unused).
        :returns: a (template, extra format values) tuple.
        """
        # Space-separated string form of every input, in the order given.
        joined = ' '.join(str(txt) for txt in i['txt'])
        return 'paste {input} > $OUT.txt', {'input': joined}
Beispiel #5
0
class CAT(Tool):
    """
    Runs the shell ``cat`` command over all 'txt' inputs, redirecting to $OUT.txt.
    """
    inputs = ['txt']
    outputs = [TaskFile(fmt='txt', basename='cat.txt')]
    time_req = 1

    def cmd(self, i, s, p):
        """
        :param i: input files keyed by name; every entry of ``i['txt']`` is passed to cat.
        :param s: settings (unused).
        :param p: parameters (unused).
        :returns: a (template, extra format values) tuple.
        """
        # Space-separated string form of every input, in the order given.
        joined = ' '.join(str(txt) for txt in i['txt'])
        return 'cat {input} > $OUT.txt', {'input': joined}
Beispiel #6
0
class MarkDuplicates(Picard):
    """
    Runs the Picard MarkDuplicates jar over the input bams, producing a bam and a metrics file.
    """
    name = "Mark Duplicates"
    jar = 'MarkDuplicates.jar'

    mem_req = 6 * 1024
    time_req = 20 * 60
    persist = True

    inputs = ['bam']
    outputs = [
        TaskFile(name='bam', basename='markdupes.bam'),
        TaskFile(name='metrics', basename='markdupes.metrics'),
    ]

    def cmd(self, i, s, p):
        """
        :param i: input files keyed by name; ``i['bam']`` is rendered via ``list2input``.
        :param s: settings (unused).
        :param p: parameters (unused).
        :returns: a (template, extra format values) tuple.
        """
        template = r"""
            {self.bin}
            {inputs}
            O=$OUT.bam
            METRICS_FILE=$OUT.metrics
            ASSUME_SORTED=True
            CREATE_INDEX=True
        """
        return template, {'inputs': list2input(i['bam'])}
Beispiel #7
0
class SplitFastq(Tool):
    """
    Runs the splitfastq.py script on one of the two fastq inputs, writing into $OUT.dir.
    """
    inputs = ['1.fastq.gz', '2.fastq.gz']
    outputs = [TaskFile(name='dir', persist=True)]
    mem_req = 1000
    time_req = 12 * 60
    persist = True

    def cmd(self, i, s, p):
        """
        :param i: input files keyed by name; the first '1.fastq.gz' or '2.fastq.gz' entry is used.
        :param s: settings; the template references ``genomekey_library_path``.
        :param p: parameters; ``p['pair'] == 1`` selects the first-read file, any other
            value selects the second-read file.
        :returns: a (template, extra format values) tuple.
        """
        if p['pair'] == 1:
            fastq = i['1.fastq.gz'][0]
        else:
            fastq = i['2.fastq.gz'][0]
        template = "python {s[genomekey_library_path]}/scripts/splitfastq.py {input} $OUT.dir"
        return template, {'input': fastq}
Beispiel #8
0
    def __init__(self, stage_name=None, tags=None, dag=None):
        """
        Set up default attributes, instantiate this tool's output TaskFiles from the
        ``outputs`` templates, and validate the declared inputs and outputs.

        :param stage_name: (str) The name of the stage this tool belongs to. Defaults to the tool's name.
        :param tags: (dict) A dictionary of tags. Defaults to a new empty dict.
        :param dag: The dag this task belongs to.
        :raises ToolValidationError: if an element of ``outputs`` is neither a str nor a TaskFile,
            if an element of ``inputs`` is not a str, or if input names or output file names
            contain duplicates.
        """
        #if len(tags)==0: raise ToolValidationError('Empty tag dictionary. All tools should have at least one tag.')
        if not hasattr(self, 'name'): self.name = self.__class__.__name__
        if not hasattr(self, 'output_files'): self.output_files = []
        if not hasattr(self, 'settings'): self.settings = {}
        if not hasattr(self, 'parameters'): self.parameters = {}

        self.stage_name = stage_name if stage_name else self.name
        # A ``tags={}`` default would be one dict shared by every tool created without tags,
        # so mutating one tool's tags would leak into the others; build a fresh dict instead.
        self.tags = {} if tags is None else tags
        self.dag = dag

        # Because defining attributes in python creates a reference to a single instance across all class instance
        # any taskfile instances in self.outputs is used as a template for instantiating a new class
        self.outputs = [
            copy.copy(o) if isinstance(o, TaskFile) else o
            for o in self.outputs
        ]
        self.id = get_id()

        # Create empty output TaskFiles
        for output in self.outputs:
            if isinstance(output, TaskFile):
                self.add_output(output)
            elif isinstance(output, str):
                # A bare string is treated as the format of a new TaskFile.
                tf = TaskFile(fmt=output, persist=self.persist)
                self.add_output(tf)
            else:
                raise ToolValidationError(
                    "{0}.outputs must be a list strs or Taskfile instances.".format(self))

        #validate inputs are strs
        if any(not isinstance(i, str) for i in self.inputs):
            raise ToolValidationError(
                "{0} has elements in self.inputs that are not of type str".format(self))

        if len(self.inputs) != len(set(self.inputs)):
            raise ToolValidationError(
                'Duplicate names in tool.inputs detected in {0}. Perhaps try using [1.ext,2.ext,...]'
                .format(self))

        output_names = [o.name for o in self.output_files]
        if len(output_names) != len(set(output_names)):
            raise ToolValidationError(
                'Duplicate names in tool.output_files detected in {0}. Perhaps try using [1.ext,2.ext,...] when defining outputs'
                .format(self))