Example #1
0
    def genotype_call(reference, vcf, output_file="./combined.vcf", ploidy=2):
        """Run ``gatk GenotypeGVCFs`` on ``vcf`` and write the result to ``output_file``.

        The directory that will contain ``output_file`` is created with the
        ``mkdir`` helper before the inputs are validated. The command itself is
        handed to the ``e`` helper (both helpers are defined outside this block).

        :param reference: reference path, made absolute and passed to ``-R``
        :param vcf: input variants path, made absolute and passed to ``-V``
        :param output_file: path passed to ``-O`` exactly as given
        :param ploidy: value passed to ``-ploidy``
        :return: None
        :raises AssertionError: if the output directory, the reference or the
            vcf does not exist after ``mkdir`` has run
        """
        out_dir = os.path.dirname(os.path.abspath(output_file)) + "/"
        reference = os.path.abspath(reference)
        vcf = os.path.abspath(vcf)

        mkdir(out_dir)

        # Checked in the same order as the command needs them.
        requirements = (
            (out_dir, "could not be created"),
            (reference, "does not exist"),
            (vcf, "does not exist"),
        )
        for required_path, problem in requirements:
            assert os.path.exists(required_path), f'{required_path} {problem}'

        command = f"""gatk GenotypeGVCFs \
        -R "{reference}" -ploidy {ploidy} \
        -V "{vcf}" \
        -O "{output_file}" 
        """
        e(command)
Example #2
0
    def variant_call(wd, reference, alignment, ploidy=2):
        """Call variants with GATK HaplotypeCaller, then genotype the resulting GVCF.

        Two commands are issued through the ``e`` helper (defined outside this
        block): ``HaplotypeCaller`` in GVCF mode writing ``<wd>raw.g.vcf.gz``,
        followed by ``GenotypeGVCFs`` reading that file and writing
        ``<wd>output.vcf.gz``.

        :param wd: working directory; created with the ``mkdir`` helper
        :param reference: reference path, made absolute and passed to ``-R``
        :param alignment: alignment path, made absolute and passed to ``-I``
        :param ploidy: value passed to ``-ploidy`` in both commands
        :return: None
        :raises AssertionError: if ``wd``, the reference or the alignment does
            not exist after ``mkdir`` has run
        """
        wd = os.path.abspath(wd) + "/"
        reference, alignment = (os.path.abspath(p)
                                for p in (reference, alignment))

        mkdir(wd)

        requirements = (
            (wd, "could not be created"),
            (reference, "does not exist"),
            (alignment, "does not exist"),
        )
        for required_path, problem in requirements:
            assert os.path.exists(required_path), f'{required_path} {problem}'

        haplotype_caller_cmd = f"""gatk HaplotypeCaller -ERC GVCF \
         -R "{reference}" -ploidy {ploidy} \
         -I "{alignment}" --output-mode EMIT_ALL_CONFIDENT_SITES \
         -O "{wd}raw.g.vcf.gz"
        """
        genotype_gvcfs_cmd = f"""gatk GenotypeGVCFs \
        -R "{reference}" -ploidy {ploidy} \
        -V "{wd}raw.g.vcf.gz" \
        -O "{wd}output.vcf.gz" 
        """

        e(haplotype_caller_cmd)
        e(genotype_gvcfs_cmd)
Example #3
0
 def init_ref(path):
     """Build the index files for a reference sequence, next to the reference itself.

     Each command is prefixed with ``cd {workdir};`` so its outputs land in the
     reference's own directory. In order: ``bwa index``, Picard
     ``CreateSequenceDictionary``, ``samtools faidx`` and ``makeblastdb``.
     Commands are issued through the ``e`` helper (defined outside this block),
     which receives the template plus ``record_name`` / ``workdir`` keywords.

     :param path: path to the reference file
     :raises AssertionError: if the ``PICARD`` environment variable is not set
     """
     assert "PICARD" in os.environ, "PICARD environment variable not configured"
     workdir, filename = os.path.split(path)

     command_templates = (
         "cd {workdir};bwa index -a is  {record_name}",
         "cd {workdir};java -jar $PICARD CreateSequenceDictionary R={record_name} O={record_name}.dict",
         "cd {workdir};samtools faidx {record_name}",
         "cd {workdir};makeblastdb -dbtype nucl -in {record_name} ",
     )
     for template in command_templates:
         e(template, record_name=filename, workdir=workdir)
Example #4
0
    def clean_reads(work_dir,
                    read1,
                    read2,
                    trim_left=20,
                    trim_qual_right=25,
                    trim_qual_window=25,
                    min_len=35,
                    window_size=5,
                    cpu=1,
                    clip_string=""):
        """
        Trim a pair of FASTQ files with Trimmomatic (PE mode) and run FastQC on the output.

        Every command goes through the ``e`` helper (defined outside this block)
        with ``work_dir`` as its second argument, so output names are given
        relative to it. Paired reads are written to ``trimmed_1.fastq`` and
        ``trimmed_2.fastq``. Any singleton files that exist afterwards are
        appended to ``trimmed_s.fastq`` and then deleted.

        :param work_dir: directory passed to ``e`` for every command
        :param read1: first read file, made absolute
        :param read2: second read file, made absolute
        :param trim_left: value for Trimmomatic ``HEADCROP``
        :param trim_qual_right: value for Trimmomatic ``TRAILING``
        :param trim_qual_window: quality part of ``SLIDINGWINDOW:<size>:<quality>``
        :param min_len: value for Trimmomatic ``MINLEN``
        :param window_size: size part of ``SLIDINGWINDOW:<size>:<quality>``
        :param cpu: value for ``-threads``
        :param clip_string: extra step inserted verbatim before ``MINLEN``,
            example -> ILLUMINACLIP:TruSeq3-SE:2:30:10
        :return: None
        """
        work_dir = os.path.abspath(work_dir) + "/"
        paired_outputs = ("trimmed_1.fastq", "trimmed_2.fastq")
        singleton_outputs = ("trimmed_1_singletons.fastq",
                             "trimmed_2_singletons.fastq")

        # Quality control
        trimmomatic_cmd = (
            'java -jar $TRIMMOMATIC PE -threads {cpu} "{read1_full}" "{read2_full}" '
            ' {pout1} {upout1} {pout2} {upout2} '
            ' HEADCROP:{trim_left}  TRAILING:{trim_qual_right} SLIDINGWINDOW:{window_size}:{trim_qual_window} '
            ' {clip_string}  MINLEN:{min_len} ')
        template_values = dict(
            read1_full=os.path.abspath(read1),
            read2_full=os.path.abspath(read2),
            trim_left=trim_left,
            trim_qual_right=trim_qual_right,
            trim_qual_window=trim_qual_window,
            min_len=min_len,
            cpu=cpu,
            pout1=paired_outputs[0],
            pout2=paired_outputs[1],
            upout1=singleton_outputs[0],
            upout2=singleton_outputs[1],
            window_size=window_size,
            clip_string=clip_string,
        )
        e(trimmomatic_cmd, work_dir, **template_values)

        for fastq in paired_outputs:
            e("fastqc " + fastq, work_dir)

        # Fold whichever singleton files exist into one file, then drop them.
        for singles in singleton_outputs:
            singles_path = work_dir + singles
            if not os.path.exists(singles_path):
                continue
            e("cat " + singles + " >> trimmed_s.fastq", work_dir)
            os.remove(singles_path)
Example #5
0
    def process_strain(ref_fasta, strain, read_paths, work_dir):
        """
        Map one strain's paired and single reads against a reference with
        ``bwa mem`` and produce a merged, sorted and indexed BAM.

        The function changes into ``work_dir`` (creating it if needed), runs
        every step there using relative file names, and always restores the
        original working directory. Results left in ``work_dir`` are
        ``final_bwa.bam``, its ``.bai`` index and ``flagstat.txt``, plus the
        FASTQ files of unmapped reads. The presence of ``final_bwa.bam.bai``
        is treated as "already done": every mapping/merge step is skipped.

        :param ref_fasta: reference FASTA passed to ``bwa mem`` and to
            ``Mapping.realign``
        :param strain: name written into the read group ``SM`` and ``LB`` fields
        :param read_paths: tuple with paths of (path_r1,path_r2,path_singles)
        :param work_dir: directory in which all files are produced
        :return: None
        """

        out_bwa_bam = "final_bwa.bam"
        out_bwa_bam_idx = out_bwa_bam + ".bai"
        # Remember where we started so the finally-clause can go back there.
        cwd = os.getcwd()
        if not os.path.exists(work_dir):
            os.makedirs(work_dir)
        try:
            os.chdir(work_dir)

            # --- Paired-end reads -------------------------------------------
            out_bwa_pe = "bwa_pe.sam"
            out_bwa_pe_bam = "bwa_pe.bam"
            out_unmapped_pe_bam = "unmapped.bam"

            # Skipped when the final index exists or bwa_pe.bam is already there.
            if not os.path.exists(out_bwa_bam_idx) and not os.path.exists(
                    out_bwa_pe_bam):
                e('bwa mem -R "@RG\\tID:illumina\\tSM:{ncepa}\\tLB:{ncepa}"  {ref_fasta} {pe1} {pe2}  >  '
                  + out_bwa_pe,
                  ref_fasta=ref_fasta,
                  ncepa=strain,
                  pe1=read_paths[0],
                  pe2=read_paths[1])
                # -F 4 keeps mapped reads, -f 4 keeps unmapped reads.
                e("samtools view -F 4 -Sbh %s > %s" %
                  (out_bwa_pe, out_bwa_pe_bam))
                e("samtools view -f 4 -Sbh %s > %s" %
                  (out_bwa_pe, out_unmapped_pe_bam))
                unmapped_pair_1 = "unmapped_pair_1.fastq"
                unmapped_pair_2 = "unmapped_pair_2.fastq"
                e("bedtools bamtofastq -i {ubam} -fq {upair}   -fq2 {upair2}",
                  upair2=unmapped_pair_2,
                  upair=unmapped_pair_1,
                  ubam=out_unmapped_pe_bam)
                # From here on out_bwa_pe_bam is whatever Mapping.realign returns.
                # NOTE(review): when this branch is skipped the name stays
                # "bwa_pe.bam" - confirm that is what the merge below expects.
                out_bwa_pe_bam = Mapping.realign(out_bwa_pe_bam, ref_fasta)

            # Drop the paired-end intermediates whether or not the step ran.
            for x in [out_bwa_pe, out_unmapped_pe_bam]:
                if os.path.exists(x):
                    os.remove(x)

            # --- Single reads -----------------------------------------------
            out_bwa_se = "bwa_se.sam"
            out_bwa_se_bam = "bwa_se.bam"
            out_unmapped_se_bam = "unmapped_se.bam"
            if not os.path.exists(out_bwa_bam_idx) and not os.path.exists(
                    out_bwa_se_bam):
                e('bwa mem -R "@RG\\tID:illumina\\tSM:{ncepa}\\tLB:{ncepa}"  {ref_fasta} {s1}   >  '
                  + out_bwa_se,
                  ref_fasta=ref_fasta,
                  ncepa=strain,
                  s1=read_paths[2])
                e("samtools view -F 4 -Sbh %s > %s" %
                  (out_bwa_se, out_bwa_se_bam))
                e("samtools view -f 4 -Sbh %s > %s" %
                  (out_bwa_se, out_unmapped_se_bam))

                unmapped_single = "unmapped_single.fastq"
                e("bedtools bamtofastq -i {ubam} -fq {upair}   ",
                  upair=unmapped_single,
                  ubam=out_unmapped_se_bam)
                # Same rebinding as for the paired-end BAM above.
                out_bwa_se_bam = Mapping.realign(out_bwa_se_bam, ref_fasta)

            for x in [out_bwa_se, out_unmapped_se_bam]:
                if os.path.exists(x):
                    os.remove(x)

            # --- Merge, fix mates, sort and index ---------------------------
            out_bwa_raw_bam = "bwa_raw.bam"
            out_bwa_fm_bam = "bwa_fm.bam"
            out_bwa_fm_sort_bam = "bwa_fm_sort.bam"

            if not os.path.exists(out_bwa_bam_idx):
                e("samtools merge %s %s %s " %
                  (out_bwa_raw_bam, out_bwa_pe_bam, out_bwa_se_bam))
                # fixmate is fed a name-sorted file (sort -n); the result is
                # then sorted again without -n before indexing.
                e("samtools sort -n -o %s %s" %
                  (out_bwa_fm_sort_bam, out_bwa_raw_bam))
                e("samtools fixmate  %s %s" %
                  (out_bwa_fm_sort_bam, out_bwa_fm_bam))
                e("samtools sort -o %s %s" % (out_bwa_bam, out_bwa_fm_bam))
                e("samtools index %s" % out_bwa_bam)

            # Remove every intermediate BAM/index that is still around.
            for x in [
                    out_bwa_raw_bam, out_bwa_fm_bam, out_bwa_fm_sort_bam,
                    out_bwa_pe_bam, out_bwa_se_bam, out_bwa_pe_bam + ".bai",
                    out_bwa_se_bam + ".bai"
            ]:
                if os.path.exists(x):
                    os.remove(x)

            # Mapping statistics are generated once and then reused.
            if not os.path.exists("flagstat.txt"):
                e("samtools flagstat %s > %s" % (out_bwa_bam, "flagstat.txt"))
        finally:
            os.chdir(cwd)
Example #6
0
    def realign(bam_file, ref_fasta):
        """Sort a BAM, realign it around indels twice and remove duplicates.

        Pipeline (every command goes through the ``e`` helper defined outside
        this block): ``samtools sort`` + ``index``, GATK
        ``RealignerTargetCreator`` / ``IndelRealigner`` (first pass), Picard
        ``MarkDuplicates`` with ``REMOVE_DUPLICATES=true``, then a second
        ``RealignerTargetCreator`` / ``IndelRealigner`` pass and a final index.

        All derived names are built by prefixing/suffixing ``bam_file``, so it
        is assumed to be a bare file name in the current directory - TODO
        confirm against callers.

        If ``realigned2_<bam_file>`` already exists the whole pipeline is
        skipped. In every case the input ``bam_file`` and the intermediate
        files are deleted when present.

        :param bam_file: name of the BAM to process (deleted afterwards)
        :param ref_fasta: reference passed to the GATK ``-R`` option
        :return: name of the final BAM, ``"realigned2_" + bam_file``
        """
        sorted_bam = "sorted_" + bam_file
        final_bam = "realigned2_" + bam_file
        intervals_pass1 = bam_file + ".intervals"
        # The second pass gets its own targets file instead of aliasing the
        # first one. The name still ends in ".intervals" so GATK keeps
        # recognising it as an intervals file.
        intervals_pass2 = bam_file + ".pass2.intervals"
        realigned_bam = "realigned_" + bam_file
        duplicates = "duplicates_" + bam_file
        dedup_bam = "iter1_" + bam_file

        if not os.path.exists(final_bam):
            # Sorting happens inside the guard: on a rerun the input has
            # already been deleted by the cleanup below, so sorting it
            # unconditionally would run samtools on a missing file.
            e("samtools sort -o %s %s" % (sorted_bam, bam_file))
            e("samtools index %s" % sorted_bam)
            e("gatk -T RealignerTargetCreator -R {ref} -I {input} -o {out}",
              ref=ref_fasta,
              input=sorted_bam,
              out=intervals_pass1)
            e("gatk -T IndelRealigner -R {ref} -I {input} -targetIntervals {intervals} -o {output}",
              ref=ref_fasta,
              input=sorted_bam,
              intervals=intervals_pass1,
              output=realigned_bam)
            # Running GATK BaseRecalibrator is recommended at this point, but
            # no VCF of common variants is available.
            e("picard MarkDuplicates I={input}   REMOVE_DUPLICATES=true O={output} M={duplicates}",
              input=realigned_bam,
              output=dedup_bam,
              duplicates=duplicates)
            e("samtools index {input}", input=dedup_bam)
            e("gatk -T RealignerTargetCreator -R {ref} -I {input} -o {intervals}",
              ref=ref_fasta,
              input=dedup_bam,
              intervals=intervals_pass2)
            e("gatk -T IndelRealigner -R {ref} -I {input} -targetIntervals {intervals} -o {output}",
              ref=ref_fasta,
              input=dedup_bam,
              intervals=intervals_pass2,
              output=final_bam)
            e("samtools index %s" % final_bam)

        # Remove the input and the intermediates. The sorted BAM and the
        # duplicates metrics file are left in place, as before.
        leftovers = (bam_file, intervals_pass1, intervals_pass2,
                     realigned_bam, dedup_bam, dedup_bam + ".bai")
        for leftover in leftovers:
            if os.path.exists(leftover):
                os.remove(leftover)

        return final_bam
Example #7
0
    def variant_call(work_dir, record, alignment, strain):
        """Call, hard-filter and merge SNPs and indels with GATK (``-T`` style CLI).

        Steps, all run through the ``e`` helper (defined outside this block)
        with ``work_dir`` as its second argument:

        1. ``HaplotypeCaller`` (ploidy 1) -> ``raw_variants.vcf``
        2. ``SelectVariants`` + ``VariantFiltration`` for SNPs
           -> ``filtered_snps.vcf``
        3. ``SelectVariants`` + ``VariantFiltration`` for indels
           -> ``filtered_indels.vcf``
        4. ``CombineVariants`` -> ``concatenated.vcf``
        5. ``sed`` strips the ``<strain>.variant2`` column from the header line
           and writes ``<strain>.vcf``

        :param work_dir: directory passed to ``e`` for every command
        :param record: reference path, made absolute and passed to ``-R``
        :param alignment: alignment path, made absolute and passed to ``-I``
        :param strain: sample name; used in the sed pattern and as the base
            name of the output file
        :return: ``strain + ".vcf"`` - the output file name, not prefixed with
            ``work_dir``
        """
        work_dir = os.path.abspath(work_dir) + "/"
        record = os.path.abspath(record)
        alignment = os.path.abspath(alignment)

        # Call variants in the sequence data
        e("java -jar $GATK -T HaplotypeCaller -R {record_name} -I {alignment} -gt_mode DISCOVERY -ploidy 1 -stand_call_conf 30 -o raw_variants.vcf",
          work_dir,
          record_name=record,
          alignment=alignment)
        # Apply hard filters to a call set
        e("java -jar $GATK -T SelectVariants -R {record_name} -V raw_variants.vcf  -selectType SNP -o raw_snps.vcf",
          work_dir,
          record_name=record)
        e("java -jar $GATK -T VariantFiltration -R {record_name} -V raw_snps.vcf  -filter \"QD < 2.0 || FS > 60.0 || MQ < 40.0 || MQRankSum < -12.5 || ReadPosRankSum < -8.0\" --filterName \"my_snp_filter\" -o filtered_snps.vcf",
          work_dir,
          record_name=record)
        e("java -jar $GATK -T SelectVariants -R {record_name} -V raw_variants.vcf  -selectType INDEL -o raw_indels.vcf",
          work_dir,
          record_name=record)
        e("java -jar $GATK -T VariantFiltration -R {record_name} -V raw_indels.vcf  -filter \"QD < 2.0 || FS > 200.0 || ReadPosRankSum < -20.0\" --filterName \"my_indel_filter\" -o filtered_indels.vcf",
          work_dir,
          record_name=record)
        e("java -jar $GATK -T CombineVariants --assumeIdenticalSamples -R {record_name} -V filtered_snps.vcf -V filtered_indels.vcf -genotypeMergeOptions UNIQUIFY -o concatenated.vcf",
          work_dir,
          record_name=record)
        # Removes column from vcf header.
        # The output is named after the strain so that the file written here
        # is the one whose name is returned. It used to be "final.vcf.vcf",
        # which did not match the return value.
        # NOTE(review): the doubled braces are presumably collapsed by
        # str.format-style templating inside e() - confirm.
        e(
            "sed \'/^#[^#]/ {{s/\\t%s\\.variant2//}}\' concatenated.vcf > %s.vcf"
            % (strain, strain), work_dir)
        return strain + ".vcf"
Example #8
0
    def alignment(work_dir,
                  record,
                  trimmed_1="trimmed_1.fastq",
                  trimmed_2="trimmed_2.fastq",
                  cpus=multiprocessing.cpu_count(),
                  strain="sample1",
                  species=None):
        """Align paired reads with ``bwa mem`` and return a sorted, duplicate-marked BAM.

        Every command goes through the ``e`` helper (defined outside this
        block) with ``work_dir`` as its second argument, so intermediate files
        are named relative to it. Mapped reads are also converted back to
        FASTQ and passed to ``fastqc``.

        :param work_dir: directory passed to ``e`` for every command
        :param record: reference path, made absolute
        :param trimmed_1: first read file handed to ``bwa mem``
        :param trimmed_2: second read file handed to ``bwa mem``
        :param cpus: thread count for ``bwa -t`` and ``samtools -@``; the
            default is evaluated once, when the function is defined
        :param strain: read group ``SM`` value
        :param species: read group ``LB`` value; falls back to ``strain``
        :return: ``work_dir + "dedup_reads.bam"``
        """
        species = species or strain
        work_dir = os.path.abspath(work_dir) + "/"
        record = os.path.abspath(record)

        # Generate a SAM file containing aligned reads
        bwa_cmd = "bwa mem -t {cpus} -M -R \'@RG\\tID:group1\\tSM:{strain}\\tPL:illumina\\tLB:{species}\' {record_name} {trimmed_1} {trimmed_2} > aligned_reads.sam"
        e(bwa_cmd,
          work_dir,
          record_name=record,
          strain=strain,
          species=species,
          cpus=cpus,
          trimmed_1=trimmed_1,
          trimmed_2=trimmed_2)

        # Split into mapped (-F 4) and unmapped (-f 4) reads, converted to BAM
        split_cmds = (
            "samtools view -@ {cpus} -F 4 -S -b -h aligned_reads.sam > mapped_reads.bam",
            "samtools view -@ {cpus} -f 4 -S -b -h aligned_reads.sam > unmapped_reads.bam",
        )
        for split_cmd in split_cmds:
            e(split_cmd, work_dir, cpus=cpus)
        os.remove(f"{work_dir}aligned_reads.sam")

        # Quality control on the mapped reads, then sort, mark duplicates
        # and index
        remaining_cmds = (
            "samtools fastq mapped_reads.bam > mapped_reads.fastq",
            "fastqc mapped_reads.fastq",
            "java -jar $PICARD SortSam INPUT=mapped_reads.bam OUTPUT=sorted_reads.bam SORT_ORDER=coordinate",
            "java -jar $PICARD MarkDuplicates INPUT=sorted_reads.bam OUTPUT=dedup_reads.bam METRICS_FILE=metrics.txt",
            "java -jar $PICARD BuildBamIndex INPUT=dedup_reads.bam",
        )
        for cmd in remaining_cmds:
            e(cmd, work_dir)

        return f"{work_dir}dedup_reads.bam"
Example #9
0
    def alignment(wd,
                  ref,
                  trimmed_1="trimmed_1.fastq",
                  trimmed_2="trimmed_2.fastq",
                  cpus=multiprocessing.cpu_count(),
                  strain="sample1",
                  species=None,
                  force=False,
                  read_group="group1"):
        """Align paired reads with ``bwa mem`` and return a de-duplicated, indexed BAM.

        All outputs are written under ``wd`` using absolute paths. Commands are
        issued through the ``e`` helper and the directory is created with the
        ``mkdir`` helper (both defined outside this block).

        The mapping stage (bwa, mapped/unmapped split, unmapped FASTQ export)
        is skipped when ``<wd>mapped_reads_raw.bam`` already exists, unless
        ``force`` is set. Duplicate marking, sorting, indexing and insert-size
        metrics always run.

        :param wd: working directory
        :param ref: reference path, made absolute
        :param trimmed_1: first read file; must exist as given
        :param trimmed_2: second read file; must exist as given
        :param cpus: thread count for ``bwa -t`` and ``samtools -@``; the
            default is evaluated once, when the function is defined
        :param strain: read group ``SM`` value
        :param species: read group ``LB`` value; falls back to ``strain``
        :param force: redo the mapping stage even if its output is present
        :param read_group: read group ``ID`` value
        :return: ``<wd>mapped_reads.bam``
        :raises AssertionError: if an input is missing or an intermediate file
            is (nearly) empty
        """
        if not species:
            species = strain

        mkdir(wd)

        wd = os.path.abspath(wd) + "/"
        ref = os.path.abspath(ref)

        assert os.path.exists(wd), f'{wd} could not be created'
        assert os.path.exists(ref), f'{ref} does not exist'
        assert os.path.exists(trimmed_1), f'{trimmed_1} does not exist'
        assert os.path.exists(trimmed_2), f'{trimmed_2} does not exist'

        sam = f"{wd}aligned_reads.sam"
        raw_bam = f"{wd}mapped_reads_raw.bam"
        unmapped_bam = f"{wd}unmapped_reads.bam"
        dedup_bam = f"{wd}dedup.bam"
        final_bam = f"{wd}mapped_reads.bam"

        # Mapping stage. The SAM size check lives inside this branch: when the
        # cached raw BAM is reused no SAM exists, and checking its size
        # unconditionally raised FileNotFoundError.
        # The former extra test on "dedup.bam " (trailing space in the path)
        # could never be true, so both original guards reduce to this one.
        if force or not os.path.exists(raw_bam):
            # Generate a SAM file containing aligned reads
            tab = "\\t"
            e(f"bwa mem -t {cpus} -M -R \'@RG{tab}ID:{read_group}{tab}SM:{strain}{tab}PL:illumina{tab}LB:{species}\' {ref} {trimmed_1} {trimmed_2} > {sam}"
              )
            assert os.path.getsize(sam) > 10, f"{sam} cant be empty"
            # Filter mapped reads and convert to BAM
            e(f"samtools view -@ {cpus} -F 4 -S -b -h {sam} | samtools sort - > {raw_bam}"
              )
            e(f"samtools view -@ {cpus} -f 4 -S -b -h {sam} > {unmapped_bam}"
              )
            # The input is addressed through wd like every other file; the
            # bare name was resolved against the current directory instead.
            e(f"bedtools bamtofastq -i {unmapped_bam} -fq {wd}unmapped_1.fastq -fq2 {wd}unmapped_2.fastq"
              )

        for leftover in (unmapped_bam, sam):
            if os.path.exists(leftover):
                os.remove(leftover)

        # Sort and mark duplicates
        e(f"gatk MarkDuplicates -INPUT {raw_bam} -OUTPUT {dedup_bam} -METRICS_FILE {wd}metrics.txt"
          )
        assert os.path.getsize(dedup_bam) > 10, f"{dedup_bam} cant be empty"

        os.remove(raw_bam)
        e(f'samtools sort {dedup_bam} > {final_bam}')
        os.remove(dedup_bam)
        e(f'samtools index {final_bam}')
        e(f"gatk CollectInsertSizeMetrics --I {final_bam} --O {wd}insert_size_metrics.txt --H {wd}insert_size_histogram.pdf --M 0.5"
          )

        return final_bam
Example #10
0
    def init_ref(reference_path):
        """Build the index files for a (possibly gzipped) FASTA reference.

        Commands are issued through the ``e`` helper (defined outside this
        block), in this order: ``bwa index``, ``samtools dict`` (written to the
        reference path with its extension replaced by ``.dict``), a re-compress
        with ``bgzip`` when the reference is gzipped, ``samtools faidx`` and
        ``makeblastdb``.

        :param reference_path: path to the reference; its extension must be
            one of fna, fasta, fa, optionally followed by ``.gz``
        :raises AssertionError: if the extension is not recognised
        """
        parts = reference_path.split(".")
        last = parts[-1]
        # The length guard makes a bare "gz" fail the assertion below instead
        # of raising IndexError.
        if last == "gz" and len(parts) > 1:
            last = parts[-2] + ".gz"
        assert last in ["fna", "fasta", "fa", "fna.gz", "fasta.gz",
                        "fa.gz"], f'unknown extension for {reference_path}'

        # Strip only the trailing extension. str.replace would also rewrite
        # any earlier occurrence of ".<ext>" in the path (for example inside
        # a directory name).
        suffix = "." + last
        dict_path = reference_path[:-len(suffix)] + ".dict"

        e(f"bwa index -a is  {reference_path}")
        e(f"samtools dict {reference_path} > {dict_path}")
        if last.endswith(".gz"):
            # Re-compress in place with bgzip before running samtools faidx.
            e(f"zcat {reference_path}| bgzip > {reference_path}.tmp; cp '{reference_path}.tmp' '{reference_path}' && rm '{reference_path}.tmp'"
              )

        e(f"samtools faidx {reference_path}")
        if reference_path.endswith(".gz"):
            # makeblastdb gets the decompressed stream on stdin ("-in -").
            e(f"zcat {reference_path} | makeblastdb -dbtype nucl -title {reference_path} -input_type fasta -out {reference_path}  -in -"
              )
        else:
            e(f"makeblastdb -dbtype nucl -in {reference_path} ")