Ejemplo n.º 1
0
    def output(self):
        """Return the local ``.sam`` target for this task's sample.

        The path is ``output_fmt`` filled in with ``base_outpath`` and the
        project / sample names that ``pfn`` extracts from ``self.sampleID``,
        with ``.sam`` appended.
        """
        sn = pfn(self.sampleID, 'sample_name')
        pn = pfn(self.sampleID, 'project_name')
        sam_path = output_fmt.format(path=base_outpath, PN=pn, SN=sn) + '.sam'
        return luigi.LocalTarget(sam_path)
Ejemplo n.º 2
0
    def run(self):
        """Trim the reads named by ``self.PE1`` / ``self.PE2`` with Trimmomatic.

        Creates ``<base_outpath>/<project>_result/trim_result`` when it does
        not exist, then runs Trimmomatic in PE mode when ``self.PE2`` is
        truthy and in SE mode otherwise.  The command is executed through
        ``os.system`` and then handed to ``record_cmdline``.
        """
        sample_name = pfn(self.PE1, 'sample_name')
        project_name = pfn(self.PE1, 'project_name')
        log_name = '{base}/{PN}_result/trim_result/{SN}_trimed.log'.format(
            base=base_outpath, PN=project_name, SN=sample_name)
        trim_dir = '{base}/{PN}_result/trim_result'.format(
            base=base_outpath, PN=project_name)
        if not os.path.isdir(trim_dir):
            os.makedirs(trim_dir)

        mate = self.PE2
        # Placeholders shared by both command templates.
        fmt_args = dict(
            input1=self.PE1,
            base_in=base_inpath,
            base_out=os.path.dirname(log_name),
            output=log_name)

        if mate:
            # Paired-end: both mates in, clean + unpaired files out for each.
            fmt_args['input2'] = mate
            template = "java -jar ~/tools/Trimmomatic-0.36/trimmomatic-0.36.jar PE -threads 20 {base_in}/{input1}.fastq.gz {base_in}/{input2}.fastq.gz -trimlog {output} {base_out}/{input1}.clean.fq.gz {base_out}/{input1}.unpaired.fq.gz {base_out}/{input2}.clean.fq.gz {base_out}/{input2}.unpaired.fq.gz ILLUMINACLIP:/home/liaoth/tools/Trimmomatic-0.36/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50"
        else:
            # Single-end: one input, one clean output.
            template = "java -jar ~/tools/Trimmomatic-0.36/trimmomatic-0.36.jar SE -threads 20 {base_in}/{input1}.fastq.gz -trimlog {output} {base_out}/{input1}.clean.fq.gz ILLUMINACLIP:/home/liaoth/tools/Trimmomatic-0.36/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"

        cmdline = template.format(**fmt_args)
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 3
0
    def run(self):
        """Run GATK 3.6 MuTect2 on a normal/tumor pair.

        ``self.sample_IDs`` is a comma-separated string; the ``mt2_for``
        value that ``pfn`` reports for its first entry decides which of the
        two inputs is passed as normal and which as tumor.  When it matches
        neither ``NORMAL_SIG`` nor ``TUMOR_SIG`` both input paths are left
        empty.  Output names are derived from ``self.output().path`` with the
        trailing ``.bam`` removed.
        """
        sampleIDs = self.sample_IDs.split(',')

        out_path = self.output().path
        output_dir = out_path.rpartition('/')[0]
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        first_role = pfn(sampleIDs[0], 'mt2_for')
        if first_role == NORMAL_SIG:
            bams = self.input()
            input_normal, input_tumor = bams[0].path, bams[1].path
        elif first_role == TUMOR_SIG:
            bams = self.input()
            input_normal, input_tumor = bams[1].path, bams[0].path
        else:
            input_normal = input_tumor = ''

        template = '''java -Xmx10g -jar ~/tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar -T MuTect2 --allSitePLs -R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:normal {input_normal} --input_file:tumor {input_tumor} --out {prefix}.vcf --bamOutput {prefix}.bam --log_to_file {prefix}.log'''
        cmdline = template.format(
            REF=REF_file_path,
            cosmic=cos_snp,
            db_snp=db_snp,
            input_tumor=input_tumor,
            input_normal=input_normal,
            prefix=out_path.rpartition('.bam')[0])
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 4
0
    def output(self):
        """Return the ``.mt2.bam`` target of the single-sample MuTect2 run.

        The path is ``somatic_single_output_fmt`` filled in with
        ``base_outpath`` and the project / sample names that ``pfn`` extracts
        from ``self.sample_NT``.
        """
        project = pfn(self.sample_NT, 'project_name')
        sample = pfn(self.sample_NT, 'sample_name')
        bam_base = somatic_single_output_fmt.format(
            path=base_outpath, PN=project, SN=sample)
        return luigi.LocalTarget(bam_base + '.mt2.bam')
Ejemplo n.º 5
0
    def output(self):
        """Return the ``.mt2.bam`` target of the paired MuTect2 run.

        Project and pair names are taken, via ``pfn``, from the first entry
        of the comma-separated ``self.sample_IDs``.
        """
        first_id = self.sample_IDs.split(',')[0]
        project = pfn(first_id, 'project_name')
        pair = pfn(first_id, 'pair_name')
        bam_base = somatic_pair_output_fmt.format(
            path=base_outpath, PN=project, PairN=pair)
        return luigi.LocalTarget(bam_base + '.mt2.bam')
Ejemplo n.º 6
0
    def run(self):
        """Run GATK 3.6 MuTect2 in artifact-detection mode on one sample.

        The first input is passed as the tumor file.  When the ``mt2_for``
        value of ``self.sample_NT`` differs from ``NORMAL_SIG`` the command
        additionally gets ``--tumor_lod 4``.  Output names are derived from
        ``self.output().path`` with the trailing ``.bam`` removed.
        """
        bam_in = self.input()[0].path
        mt2_id = pfn(self.sample_NT, 'mt2_for')
        out_path = self.output().path
        prefix = out_path.rpartition('.bam')[0]
        output_dir = out_path.rpartition('/')[0]

        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # Both variants share this command; note the deliberate trailing blank.
        template = '''java -Xmx10g -jar ~/tools/GenomeAnalysisTK-3.6/GenomeAnalysisTK.jar -T MuTect2 --allSitePLs --artifact_detection_mode -R {REF} --cosmic {cosmic} --dbsnp {db_snp} --input_file:tumor {input_tumor} --out {prefix}.vcf --bamOutput {prefix}.bam --log_to_file {prefix}.log '''
        if mt2_id != NORMAL_SIG:
            template += '--tumor_lod 4 '

        cmdline = template.format(
            REF=REF_file_path,
            cosmic=cos_snp,
            db_snp=db_snp,
            input_tumor=bam_in,
            prefix=prefix)
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 7
0
    def requires(self):
        """Yield one ``Annovar2`` task per sample ID and per pair name.

        ``self.x`` is converted to a string and split on commas to obtain the
        sample IDs.  The IDs are grouped by the ``pair_name`` that ``pfn``
        reports for each of them, and the grouping is stored in the
        module-level ``pair_bucket``.

        A group holding more than two samples is expanded: every sample whose
        ``mt2_for`` equals ``TUMOR_SIG`` is paired with the first sample of
        the group whose ``mt2_for`` equals ``NORMAL_SIG`` and stored under the
        tumor's sample name with ``TUMOR_SIG`` removed.

        Raises:
            IndexError: if a group of more than two samples contains no
                sample whose ``mt2_for`` equals ``NORMAL_SIG``.
        """
        # Must precede the first assignment: a ``global`` statement placed
        # after the name is bound is a SyntaxWarning on Python 2 and a
        # SyntaxError on Python 3.
        global pair_bucket

        samples_IDs = str(self.x).split(',')

        pair_bucket = defaultdict(list)
        for _x in samples_IDs:
            pair_bucket[pfn(_x, 'pair_name')].append(_x)

        adjust_multiple = []
        for members in pair_bucket.values():
            if len(members) <= 2:
                continue
            only_normal = [
                _ for _ in members if pfn(_, 'mt2_for') == NORMAL_SIG
            ][0]
            for _each in members:
                if pfn(_each, 'mt2_for') != TUMOR_SIG:
                    continue
                # Key is the tumor's own name without the tumor marker; when
                # that equals the group's pair name the original entry is
                # simply overwritten by the update below.
                tumor_key = pfn(_each, 'sample_name').replace(TUMOR_SIG, '')
                adjust_multiple.append((tumor_key, [only_normal, _each]))
        pair_bucket.update(dict(adjust_multiple))
        ###{'XK-2': ['XK-2T_S20', 'XK-2W_S17'],'XK-8': ['XK-8T_S21', 'XK-8W_S18']}

        samples_IDs += list(pair_bucket)

        if debug_:
            import pdb
            pdb.set_trace()
        for i in samples_IDs:
            yield Annovar2(sample_ID=i)
Ejemplo n.º 8
0
    def output(self):
        """Return the ``.mt2.bam`` target of the paired run.

        The pair name is the first key of the module-level ``pair_bucket``
        whose member list, compared as a set, equals the set of IDs in
        ``self.sample_IDs``.  An ``IndexError`` is raised when no key matches.
        """
        sampleIDs = self.sample_IDs.split(',')
        project = pfn(sampleIDs[0], 'project_name')

        matching_keys = []
        for key, members in pair_bucket.items():
            if set(members) == set(sampleIDs):
                matching_keys.append(key)
        pair_name = matching_keys[0]

        bam_base = somatic_pair_output_fmt.format(
            path=base_outpath, PN=project, PairN=pair_name)
        return luigi.LocalTarget(bam_base + '.mt2.bam')
Ejemplo n.º 9
0
def formatter_output(args):
    """Build a human-readable summary of the current pipeline settings.

    ``args`` is a sample name or a comma-separated list of names.  Each name
    is put through ``PE1_fmt`` and parsed with ``pfn(..., 'all')``; a
    comma-separated input yields a list of parse results, a single name
    yields one result.  The example fastq path is based on the last name of
    a list, or on the single name given.

    When ``self_adjust_fn`` is truthy the example path is instead the first
    match of a glob for that name under ``base_inpath``; with ``filter_str``
    set, matches containing it are dropped and ``fq_suffix`` is removed from
    the remaining ones.
    """
    if ',' in args:
        names = args.split(',')
        parsed_name = [pfn(PE1_fmt.format(input=name), 'all') for name in names]
        # The example file below is built from the last name of the list.
        val = names[-1]
    else:
        val = args
        parsed_name = pfn(PE1_fmt.format(input=val), 'all')

    if self_adjust_fn:
        candidates = glob.glob(base_inpath + '/*' + val + '*')
        if filter_str:
            candidates = [
                path.replace(fq_suffix, '') for path in candidates
                if filter_str not in path
            ]
        fq_file = candidates[0]
    else:
        fq_file = PE1_fmt.format(input=val)

    if debug_:
        import pdb
        pdb.set_trace()

    report_template = """Current Variants: Please make sure your variants is right.\n\n
    Input path: {b_i}
    output path: {b_o}
    Sig represent NORMAL: {sig_n}
    Sig represent TUMOR: {sig_T}
    Pair file format: {pe_fmt}
    input_fastq_file example:{fq}\n

    One of args filename parsed: {parsed_result}"""
    return report_template.format(
        b_i=base_inpath,
        b_o=base_outpath,
        sig_n=NORMAL_SIG,
        sig_T=TUMOR_SIG,
        pe_fmt=PE1_fmt,
        fq=fq_file,
        parsed_result=str(parsed_name))
Ejemplo n.º 10
0
    def requires(self):
        """Yield one ``Annovar2`` task per sample ID and per pair name.

        ``self.x`` is converted to a string and split on commas to obtain the
        sample IDs.  The IDs are grouped by the ``pair_name`` that ``pfn``
        reports for each of them; the grouping is stored in the module-level
        ``pair_bucket`` and its keys are appended to the list of IDs for
        which tasks are yielded.
        """
        # Must precede the first assignment: a ``global`` statement placed
        # after the name is bound is a SyntaxWarning on Python 2 and a
        # SyntaxError on Python 3.
        global pair_bucket

        samples_IDs = str(self.x).split(',')

        pair_bucket = defaultdict(list)
        for _x in samples_IDs:
            pair_bucket[pfn(_x, 'pair_name')].append(_x)
        ###{'XK-2': ['XK-2T_S20', 'XK-2W_S17'],'XK-8': ['XK-8T_S21', 'XK-8W_S18']}

        samples_IDs += list(pair_bucket)
        for i in samples_IDs:
            yield Annovar2(sample_ID=i)
Ejemplo n.º 11
0
    def run(self):
        """Trim the reads named by ``self.PE1`` / ``self.PE2`` with Trimmomatic.

        The result directory comes from ``trim_fmt`` and is created when
        missing.  PE mode is used when ``self.PE2`` is truthy, SE mode
        otherwise.  In PE mode the output files are named after
        ``PE1_fmt`` / ``PE2_fmt`` applied to the parsed sample names; in SE
        mode the output is named after ``self.PE1`` itself.  The command is
        executed through ``os.system`` and then handed to ``record_cmdline``.
        """
        sample_name = pfn(self.PE1, 'sample_name')
        project_name = pfn(self.PE1, 'project_name')
        trim_r_path = trim_fmt.format(base=base_outpath, PN=project_name)
        log_name = os.path.join(trim_r_path, '%s_trimed.log' % sample_name)

        if not os.path.isdir(trim_r_path):
            os.makedirs(trim_r_path)

        mate = self.PE2
        # Superset of the placeholders used by the two templates below;
        # str.format ignores keyword arguments a template does not mention.
        fmt_args = dict(
            trimmomatic_jar=trimmomatic_jar,
            trimmomatic_jar_dir=os.path.dirname(trimmomatic_jar),
            input1=self.PE1,
            input2=mate,
            base_in=base_inpath,
            base_out=os.path.dirname(log_name),
            output1=PE1_fmt.format(input=pfn(self.PE1, 'sample_name')),
            output2=PE2_fmt.format(input=pfn(self.PE2, 'sample_name')),
            fq_suffix=fq_suffix,
            output=log_name)

        if mate:
            template = "java -jar {trimmomatic_jar} PE -threads 20 {base_in}/{input1}{fq_suffix} {base_in}/{input2}{fq_suffix} -trimlog {output} {base_out}/{output1}.clean.fq.gz {base_out}/{output1}.unpaired.fq.gz {base_out}/{output2}.clean.fq.gz {base_out}/{output2}.unpaired.fq.gz ILLUMINACLIP:{trimmomatic_jar_dir}/adapters/TruSeq3-PE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:50"
        else:
            template = "java -jar {trimmomatic_jar} SE -threads 20 {base_in}/{input1}{fq_suffix} -trimlog {output} {base_out}/{input1}.clean.fq.gz ILLUMINACLIP:{trimmomatic_jar_dir}/adapters/TruSeq3-SE.fa:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36"

        cmdline = template.format(**fmt_args)
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 12
0
    def run(self):
        """Align the trimmed reads with ``bwa mem`` into ``self.output().path``.

        With ``Pair_data`` truthy the second mate's path is derived from the
        input path by replacing ``R1_INDICATOR`` with ``R2_INDICATOR`` and
        both files are aligned; otherwise only the input file is.  The
        directory given by the ``output_dir`` format is created when missing.
        The command is executed through ``os.system`` and then handed to
        ``record_cmdline``.
        """
        sample_name = pfn(self.sampleID, 'sample_name')
        project_name = pfn(self.sampleID, 'project_name')

        reads1 = self.input().path
        fmt_args = dict(SN=sample_name, REF=REF_file_path, i1=reads1)
        if Pair_data:
            fmt_args['i2'] = reads1.replace(R1_INDICATOR, R2_INDICATOR)
            template = "bwa mem -M -t 20 -k 19 -R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' {REF} {i1} {i2} > {o}"
        else:
            template = "bwa mem -M -t 20 -k 19 -R '@RG\\tID:{SN}\\tSM:{SN}\\tPL:illumina\\tLB:lib1\\tPU:L001' {REF} {i1} > {o}"

        target_dir = output_dir.format(
            path=base_outpath, PN=project_name, SN=sample_name)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)

        cmdline = template.format(o=self.output().path, **fmt_args)
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 13
0
    def run(self):
        """Run GATK ``Mutect2`` on a single sample.

        The first input is passed with ``-I`` and the parsed sample name of
        ``self.sample_NT`` with ``-tumor``.  When the sample's ``mt2_for``
        differs from ``NORMAL_SIG`` the command additionally gets
        ``--tumor-lod-to-emit 4``.  When ``bed_file_path`` is set,
        ``--intervals <bed_file_path>`` is appended last.  Output names are
        derived from ``self.output().path`` with the trailing ``.bam``
        removed.
        """
        bam_in = self.input()[0].path
        mt2_id = pfn(self.sample_NT, 'mt2_for')
        out_path = self.output().path
        prefix = out_path.rpartition('.bam')[0]
        output_dir = out_path.rpartition('/')[0]

        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        suffix_str = " --intervals %s" % bed_file_path if bed_file_path else ''

        template = "{gatk} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} --seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam"
        if mt2_id != NORMAL_SIG:
            template += " --tumor-lod-to-emit 4"

        cmdline = template.format(
            gatk=gatk_pro,
            REF=REF_file_path,
            db_snp=db_snp,
            input_tumor=bam_in,
            prefix=prefix,
            T_name=pfn(self.sample_NT, 'sample_name')) + suffix_str
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 14
0
    def run(self):
        """Run GATK ``Mutect2`` on a normal/tumor pair.

        ``self.sample_IDs`` is a comma-separated string; the ``mt2_for``
        value that ``pfn`` reports for its first entry decides which of the
        two inputs (and sample names) is the normal and which the tumor.
        When it matches neither ``NORMAL_SIG`` nor ``TUMOR_SIG`` the paths
        and names stay empty strings.  When ``bed_file_path`` is set,
        ``--intervals <bed_file_path>`` is appended to the command.
        """
        sampleIDs = self.sample_IDs.split(',')

        out_path = self.output().path
        output_dir = out_path.rpartition('/')[0]
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        input_tumor = input_normal = ''
        normal_name = tumor_name = ''

        # Positions of the normal / tumor sample among the two inputs.
        first_role = pfn(sampleIDs[0], 'mt2_for')
        if first_role == NORMAL_SIG:
            normal_idx, tumor_idx = 0, 1
        elif first_role == TUMOR_SIG:
            normal_idx, tumor_idx = 1, 0
        else:
            normal_idx = tumor_idx = None

        if normal_idx is not None:
            bams = self.input()
            input_normal = bams[normal_idx].path
            input_tumor = bams[tumor_idx].path
            normal_name = pfn(sampleIDs[normal_idx], 'sample_name')
            tumor_name = pfn(sampleIDs[tumor_idx], 'sample_name')

        prefix = out_path.rpartition('.bam')[0]
        suffix_str = " --intervals %s" % bed_file_path if bed_file_path else ''

        template = "{gatk} Mutect2 --java-options '-Xmx20g' --native-pair-hmm-threads 20 --reference {REF} -I {input_normal} -normal {N_name} -I {input_tumor} -tumor {T_name} --dbsnp {db_snp} --seconds-between-progress-updates 60 --all-site-pls -stand-call-conf 10 -A Coverage -A DepthPerAlleleBySample -A FisherStrand -A BaseQuality -A QualByDepth -A RMSMappingQuality -A MappingQualityRankSumTest -A ReadPosRankSumTest -A ChromosomeCounts --all-site-pls true --output {prefix}.vcf -bamout {prefix}.bam"
        # ``cosmic`` has no placeholder in the template; it is still passed,
        # as before, and ignored by str.format.
        cmdline = template.format(
            REF=REF_file_path,
            cosmic=cos_snp,
            db_snp=db_snp,
            input_tumor=input_tumor,
            input_normal=input_normal,
            gatk=gatk_pro,
            N_name=normal_name,
            T_name=tumor_name,
            prefix=prefix) + suffix_str
        os.system(cmdline)
        record_cmdline(cmdline)
Ejemplo n.º 15
0
 def output(self):
     """Return the target for the first mate's cleaned fastq file.

     The file is ``<PE1_fmt applied to the parsed sample name>.clean.fq.gz``
     inside the directory produced by ``trim_fmt`` for ``base_outpath`` and
     the parsed project name of ``self.PE1``.
     """
     project_name = pfn(self.PE1, 'project_name')
     output1 = PE1_fmt.format(input=pfn(self.PE1, 'sample_name'))
     # The file name must not start with '/': os.path.join drops every
     # earlier component when a later one is absolute, which would place
     # the target in the filesystem root instead of the trim directory.
     return luigi.LocalTarget(
         os.path.join(trim_fmt.format(base=base_outpath, PN=project_name),
                      '%s.clean.fq.gz' % output1))
def Add_in_vcf_PA(bam_list,
                  vcf_path,
                  output_vcf,
                  fasta_file='/home/liaoth/data/hg19/ucsc.hg19.fasta',
                  N_sig=NORMAL_SIG,
                  T_sig=TUMOR_SIG):
    """
    Annotate a pair-analysis VCF with allele depths computed from BAM files.

    Reads the VCF, registers the extra INFO header entries ``SAD``, ``SAF``
    and ``PoS`` (plus INFO copies of the ``AF`` and ``AD`` FORMAT
    definitions), and writes every record to ``output_vcf``.  For SNP
    records the INFO fields are filled from the coverage returned by
    ``special_cal_cov``; ``PoS`` is set to 2.

    NOTE(review): this function is Python 2 only (``print`` / ``exec``
    statements, list-returning ``dict.values()``).

    :param bam_list: BAM paths; the original author requires the order
        [normal one, tumor one].
    :param vcf_path: path of the VCF (``str``) or an open file object.
    :param output_vcf: path of the annotated VCF to write.
    :param fasta_file: reference fasta handed to ``special_cal_cov``.
    :param N_sig: ``mt2_for`` value that marks the normal sample.
    :param T_sig: not used in this function body.
    :return: None; the result is written to ``output_vcf``.
    """
    # FORMAT fields that are additionally copied into INFO.
    ori_format2info = ['AF', 'AD']
    field1 = "SAD"
    field2 = "SAF"
    field3 = "PoS"
    # Per-BAM 'mt2_for' value and sample name, in bam_list order.
    NT_SIG = [pfn(_bam, 'mt2_for') for _bam in bam_list]
    NT_name = [pfn(_bam, 'sample_name') for _bam in bam_list]
    if type(vcf_path) == str:
        vcf_readed = vcf.Reader(open(vcf_path, 'r'))
    else:
        try:
            vcf_readed = vcf.Reader(fsock=vcf_path)
        except:
            # NOTE(review): the print below can never run because it
            # follows the raise; the bare except also hides the real error.
            raise IOError
            print 'Wrong vcf, it is a %s' % str(type(vcf_path))

    pos_list = parsed_vcf2pos_list(vcf_path)

    # Always False here, so the header block below always runs.
    is_single = False
    right_infos = vcf_readed.infos
    # Any existing INFO entry; its namedtuple type is reused via _make to
    # build the new header entries.
    machine = right_infos.values()[0]
    # Modify the info part.
    if not is_single:
        field1_info = [
            field1, '4', 'Integer',
            "(REF base count, alt base count). Self cal allele depths from bam file. If there are two pair, it is normal-tumore order."
        ]
        field2_info = [
            field2, 'R', 'Float',
            "Alt base count divide the total reads number in this pos. Self cal frequency from bam file. If there are two pair, it is normal-tumore order."
        ]
        field3_info = [
            field3, '1', 'Integer',
            "A field which describe this file is single only analysis or pair analysis. 1 for single analysis, 2 for pair analysis."
        ]

        # Two trailing None values pad each entry to the six fields that
        # the INFO namedtuple is built from.
        right_infos[field1] = machine._make(field1_info + [None, None])
        right_infos[field2] = machine._make(field2_info + [None, None])
        right_infos[field3] = machine._make(field3_info + [None, None])
        for ori_format in ori_format2info:
            _ori_format_info = vcf_readed.formats[ori_format]._asdict().values(
            ) + [None, None]
            _ori_format_info[
                3] += ". If there are two pair, it is normal-tumore order."  # fetch ori format value and ID and fix it into length == 6
            # Override the declared number of values for the INFO copy.
            if ori_format == 'AD':
                _ori_format_info[1] = '4'
            elif ori_format == 'AF':
                _ori_format_info[1] = 'R'
            right_infos[ori_format] = machine._make(_ori_format_info)

    vcf_readed.infos = right_infos
    # Fetch the cov info from bam file, and prepare the writed file.
    all_cov_info = special_cal_cov(bam_list, pos_list, fasta_file)
    vcf_writer = vcf.Writer(open(output_vcf, 'w'), vcf_readed)

    for record in vcf_readed:
        if record.is_snp:
            # Coverage is looked up with a position one less than the
            # record's POS.
            query_for = (record.CHROM, record.POS - 1)
            buckec_SAD = []
            bucket_SAF = []
            # Creates the locals bucket_AF and bucket_AD (Python 2 exec
            # statement).
            for ori_format in ori_format2info:
                exec 'bucket_%s = []' % ori_format

            for sample_call in record.samples:
                # it needs to fix the sample and the cov_info order.
                sample = str(sample_call.sample)

                # Position in NT_SIG of the signature belonging to the BAM
                # whose parsed sample name equals this VCF sample;
                # presumably all_cov_info follows bam_list order - verify
                # against special_cal_cov.
                idx = [
                    NT_SIG.index(s) for s, n in zip(NT_SIG, NT_name)
                    if sample == n
                ][0]
                cov_info = all_cov_info[idx][query_for]

                # First entry is unpacked as the (base, count) of the
                # reference allele.
                ref_base, ref_cov = cov_info[0]
                if len(cov_info) > 2:
                    # NOTE(review): range stops before the last entry, and
                    # alt_base / alt_cov keep a previous (or no) value when
                    # nothing matches record.ALT[0] - confirm intent.
                    for n_i in range(1, len(cov_info) - 1):
                        if cov_info[n_i][0] == record.ALT[0]:
                            alt_base, alt_cov = cov_info[n_i]
                elif len(cov_info) == 1:
                    alt_base = record.ALT[0]
                    alt_cov = 0
                else:
                    alt_base, alt_cov = cov_info[1]
                ### fix the bucket order to normal-tumore order.
                # NOTE(review): this compares a str with a list, which is
                # never equal, so only the else branch below can run.
                if sample == [
                        n for s, n in zip(NT_SIG, NT_name) if s == N_sig
                ]:
                    # Prepend so that the normal sample's values come first.
                    buckec_SAD.insert(0, int(alt_cov))
                    buckec_SAD.insert(0, int(ref_cov))

                    if sum((int(ref_cov), int(alt_cov))) != 0:
                        bucket_SAF.insert(
                            0,
                            round(
                                float(alt_cov) / sum(
                                    (int(ref_cov), int(alt_cov))), 4))
                    else:
                        bucket_SAF.insert(0, 0)
                    # data = dict(sample_call.data._asdict())
                    for ori_format in ori_format2info:
                        if ori_format == 'AD':
                            exec "bucket_{i}.insert(0,tuple(data['{i}'])[0])".format(
                                i=ori_format)
                            exec "bucket_{i}.insert(0,tuple(data['{i}'])[1])".format(
                                i=ori_format)
                        else:
                            exec "bucket_{i}.insert(0,data['{i}'])".format(
                                i=ori_format)
                else:
                    buckec_SAD += [int(ref_cov), int(alt_cov)]

                    # Alt fraction rounded to 4 decimals; 0 when there is
                    # no coverage at all.
                    if sum((int(ref_cov), int(alt_cov))) != 0:
                        bucket_SAF.append(
                            round(
                                float(alt_cov) / sum(
                                    (int(ref_cov), int(alt_cov))), 4))
                    else:
                        bucket_SAF.append(0)
                    # NOTE(review): `data` is not assigned anywhere in this
                    # function (the assignment below is commented out), so
                    # the exec statements raise NameError unless a global
                    # `data` exists - confirm.
                    # data = dict(sample_call.data._asdict())
                    for ori_format in ori_format2info:
                        if ori_format == 'AD':
                            exec "bucket_{i} += list(data['{i}'])".format(
                                i=ori_format)
                        else:
                            exec "bucket_{i}.append(data['{i}'])".format(
                                i=ori_format)
            record.INFO[field1] = buckec_SAD
            record.INFO[field2] = bucket_SAF
            record.INFO[field3] = 2
            for ori_format in ori_format2info:
                exec "record.INFO['{i}'] = bucket_{i}".format(i=ori_format)
        # Non-SNP records are written through unchanged.
        vcf_writer.write_record(record)
    vcf_writer.close()
Ejemplo n.º 17
0
 def output(self):
     """Return the target for the cleaned fastq file of ``self.PE1``.

     The path is
     ``<base_outpath>/<project>_result/trim_result/<PE1>.clean.fq.gz``,
     with the project name parsed from ``self.PE1`` by ``pfn``.
     """
     project_name = pfn(self.PE1, 'project_name')
     clean_fq = '{base}/{PN}_result/trim_result/{input1}.clean.fq.gz'.format(
         base=base_outpath, PN=project_name, input1=self.PE1)
     return luigi.LocalTarget(clean_fq)