Beispiel #1
0
def fq_trimming(pipeline,
                fq1_files,
                fq2_files,
                clinseq_barcode,
                ref,
                outdir,
                maxcores=1):
    """
    Add Skewer trimming jobs for paired fastq files, then concatenate the
    trimmed files per read into one file each.

    One Skewer job is added per (read 1, read 2) pair, followed by two Cat
    jobs (one for read 1, one for read 2). All jobs are flagged as
    intermediate.

    :param pipeline: pipeline the jobs are added to; its ``scratch``
        attribute is handed to every Skewer job
    :param fq1_files: read 1 fastq paths
    :param fq2_files: read 2 fastq paths, paired with fq1_files by position
    :param clinseq_barcode: used in the concatenated file names and in the
        Cat job names
    :param ref: not used by this function
    :param outdir: directory below which the ``skewer`` outputs are placed
    :param maxcores: thread count given to each Skewer job
    :return: tuple (concatenated read 1 path, concatenated read 2 path)
    """
    reads1 = [normpath(path) for path in fq1_files]
    reads2 = [normpath(path) for path in fq2_files]
    logging.debug("Trimming {} and {}".format(reads1, reads2))

    # Pair by position before any job is created; a shorter read 2 list
    # raises IndexError here, surplus read 2 entries are ignored.
    read_pairs = [(first, reads2[idx]) for idx, first in enumerate(reads1)]

    trimmed1 = []
    trimmed2 = []

    for read1, read2 in read_pairs:
        name1 = os.path.basename(read1)

        trim_job = Skewer()
        trim_job.input1 = read1
        trim_job.input2 = read2
        trim_job.output1 = outdir + "/skewer/libs/{}".format(name1)
        trim_job.output2 = outdir + "/skewer/libs/{}".format(
            os.path.basename(read2))
        # Stats file and job name are both keyed on the read 1 file name.
        trim_job.stats = outdir + "/skewer/libs/skewer-stats-{}.log".format(
            name1)
        trim_job.threads = maxcores
        trim_job.jobname = "skewer/{}".format(name1)
        trim_job.scratch = pipeline.scratch
        trim_job.is_intermediate = True
        trimmed1.append(trim_job.output1)
        trimmed2.append(trim_job.output2)
        pipeline.add(trim_job)

    # Concatenate all trimmed read 1 files.
    merge_read1 = Cat()
    merge_read1.input = trimmed1
    merge_read1.output = outdir + "/skewer/{}-concatenated_1.fastq.gz".format(
        clinseq_barcode)
    merge_read1.jobname = "cat1/{}".format(clinseq_barcode)
    merge_read1.is_intermediate = True
    pipeline.add(merge_read1)

    # Concatenate all trimmed read 2 files.
    merge_read2 = Cat()
    merge_read2.input = trimmed2
    merge_read2.jobname = "cat2/{}".format(clinseq_barcode)
    merge_read2.output = outdir + "/skewer/{}-concatenated_2.fastq.gz".format(
        clinseq_barcode)
    merge_read2.is_intermediate = True
    pipeline.add(merge_read2)

    return merge_read1.output, merge_read2.output
Beispiel #2
0
def align_se(pipeline, fq1_files, clinseq_barcode, ref, outdir, maxcores, remove_duplicates=True):
    """
    Add the jobs needed to trim, concatenate and align single end data.

    One Skewer job is added per input file, then one Cat job joining the
    trimmed files, then one Bwa job aligning the concatenated file.

    :param pipeline: pipeline the jobs are added to; its ``scratch``
        attribute is handed to the Skewer and Bwa jobs
    :param fq1_files: fastq paths to align
    :param clinseq_barcode: used for the output file names, the job names
        and the read group (ID directly; SM and LB are derived from it)
    :param ref: reference sequence given to the Bwa job
    :param outdir: directory receiving the ``skewer`` files and the bam
    :param maxcores: thread count for the Skewer and Bwa jobs
    :param remove_duplicates: forwarded to the Bwa job
    :return: path of the bam file the Bwa job writes
    """
    logging.debug("Aligning files: {}".format(fq1_files))
    reads = [normpath(path) for path in fq1_files]

    trimmed = []
    for read in reads:
        read_name = os.path.basename(read)

        trim_job = Skewer()
        trim_job.input1 = read
        trim_job.input2 = None
        trim_job.output1 = outdir + "/skewer/{}".format(read_name)
        # There is no second read; output2 still gets a placeholder path.
        trim_job.output2 = outdir + "/skewer/unused-dummyfq2-{}".format(read_name)
        trim_job.stats = outdir + "/skewer/skewer-stats-{}.log".format(read_name)
        trim_job.threads = maxcores
        trim_job.jobname = "skewer/{}".format(read_name)
        trim_job.scratch = pipeline.scratch
        trim_job.is_intermediate = True
        trimmed.append(trim_job.output1)
        pipeline.add(trim_job)

    # Join all trimmed files; unlike the Skewer jobs this one is not
    # flagged as intermediate.
    merge_job = Cat()
    merge_job.input = trimmed
    merge_job.output = outdir + "/skewer/{}_1.fastq.gz".format(clinseq_barcode)
    merge_job.jobname = "cat/{}".format(clinseq_barcode)
    merge_job.is_intermediate = False
    pipeline.add(merge_job)

    align_job = Bwa()
    align_job.input_fastq1 = merge_job.output
    align_job.input_reference_sequence = ref
    align_job.remove_duplicates = remove_duplicates

    # Read group fields: LB from the prep id, SM from the unique capture.
    prep_id = parse_prep_id(clinseq_barcode)
    sample_name = compose_sample_str(extract_unique_capture(clinseq_barcode))
    align_job.readgroup = "\"@RG\\tID:{rg_id}\\tSM:{rg_sm}\\tLB:{rg_lb}\\tPL:ILLUMINA\"".format(
        rg_id=clinseq_barcode,
        rg_sm=sample_name,
        rg_lb=prep_id)

    align_job.threads = maxcores
    align_job.output = "{}/{}.bam".format(outdir, clinseq_barcode)
    align_job.scratch = pipeline.scratch
    align_job.jobname = "bwa/{}".format(clinseq_barcode)
    align_job.is_intermediate = False
    pipeline.add(align_job)

    return align_job.output
    def __init__(self,
                 genome_resources,
                 outdir,
                 maxcores=1,
                 runner=Shellrunner()):
        """
        Configure the reference-data pipeline and write its JSON manifest.

        Records the locations of local resource files and remote download
        URLs on the instance, runs the ``prepare_*`` steps, and finally
        dumps ``self.reference_data`` to ``<outdir>/autoseq-genome.json``.

        :param genome_resources: directory holding the pre-existing
            resource files referenced below
        :param outdir: output directory; passed through ``normpath`` for
            the base class initialiser
        :param maxcores: stored on the instance as ``self.maxcores``
        :param runner: runner handed to ``PypedreamPipeline.__init__``
        """
        PypedreamPipeline.__init__(self, normpath(outdir), runner=runner)

        # Files expected below the genome resources directory.
        self.genome_resources = genome_resources
        self.input_reference_sequence = "{}/human_g1k_v37_decoy.fasta.gz".format(genome_resources)
        self.cosmic_vcf = "{}/CosmicCodingMuts_v71.vcf.gz".format(genome_resources)
        self.qdnaseq_background = "{}/qdnaseq_background.Rdata".format(genome_resources)
        self.swegene_common_vcf = "{}/swegen_common.vcf.gz".format(genome_resources)
        self.thousand_genome_vcf = "{}/1000G_phase1.indels.b37.vcf.gz".format(genome_resources)
        self.mills_and_1000g_gold_standard = "{}/Mills_and_1000G_gold_standard.indels.b37.vcf.gz".format(
            genome_resources)
        self.brca_exchange = "{}/BrcaExchangeClinvar_15Jan2019_v26_hg19.vcf.gz".format(genome_resources)
        self.oncokb = "{}/OncoKB_6Mar19_v1.9.txt".format(genome_resources)

        # NOTE(review): this stores the outdir exactly as given, while the
        # base class above received normpath(outdir) - confirm intended.
        self.outdir = outdir
        self.maxcores = maxcores
        self.reference_data = {}

        # Remote sources.
        self.exac_remote = "ftp://ftp.broadinstitute.org/pub/ExAC_release/release0.3.1/ExAC.r0.3.1.sites.vep.vcf.gz"
        self.dbsnp_remote = "ftp://ftp.ncbi.nlm.nih.gov/snp/organisms/human_9606_b149_GRCh37p13/VCF/All_20161121.vcf.gz"
        self.clinvar_remote = "ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/archive_1.0/2016/clinvar_20160203.vcf.gz"
        self.icgc_somatic_remote = "https://dcc.icgc.org/api/v1/download?fn=/release_20/Summary/simple_somatic_mutation.aggregated.vcf.gz"
        self.ensembl_version = "75"
        # The Ensembl release number appears twice in the GTF URL.
        self.ensembl_gtf_remote = (
            "ftp://ftp.ensembl.org/pub/release-" + self.ensembl_version +
            "/gtf/homo_sapiens/Homo_sapiens.GRCh37." + self.ensembl_version + ".gtf.gz"
        )
        self.mitranscriptome_remote = "http://mitranscriptome.org/download/mitranscriptome.gtf.tar.gz"

        # Preparation steps, run in this fixed order.
        self.prepare_reference_genome()
        self.prepare_genes()
        self.prepare_sveffect_regions()
        self.prepare_intervals()
        self.prepare_variants()

        # The InstallVep job is only used to derive the vep directory; it
        # is not added to the pipeline (the add call is commented out).
        vep_installer = InstallVep()
        vep_installer.output_dir = "{}/vep/".format(self.outdir)
        #self.add(vep_installer)

        self.reference_data['vep_dir'] = vep_installer.output_dir

        self.make_ref_paths_relative()

        # Write the collected reference data as a sorted, indented manifest.
        manifest_path = "{}/autoseq-genome.json".format(self.outdir)
        with open(manifest_path, "w") as output_file:
            json.dump(self.reference_data, output_file, indent=4, sort_keys=True)
Beispiel #4
0
def find_fastqs(library, libdir):
    """Find fastq files for a given library id in a given directory.

    Lists the directory ``<libdir>/<library>`` and collects the file names
    that end in one of the supported read 1 / read 2 suffixes.

    Returns a tuple with two sorted lists of paths (each path is
    ``os.path.join(libdir, library, filename)``):
    (['foo_1.fastq.gz', 'bar_1.fastq.gz'], # read 1
     ['foo_2.fastq.gz', 'bar_2.fastq.gz']) # read 2

    If ``library`` is empty or None, ``(None, None)`` is returned and the
    file system is not touched.

    Supports the following file naming conventions:
    *_1.fastq.gz / *_2.fastq.gz
    *_1.fq.gz / *_2.fq.gz
    *R1_nnn.fastq.gz / *R2_nnn.fastq.gz

    :param library: library id, i.e. the sub-directory name below libdir
    :param libdir: directory containing one sub-directory per library
    :rtype: tuple[list[str], list[str]]
    """
    if not library:
        return (None, None)

    # Raw strings keep "\." and "\d" as regex escapes instead of (invalid)
    # string escapes. Every literal dot is escaped, and the patterns are
    # anchored at the end of the name so that side-car files such as
    # "foo_1.fastq.gz.md5" are not reported as fastq files.
    regex_fq1 = re.compile(r'(.+)(_1\.fastq\.gz|_1\.fq\.gz|R1_\d{3}\.fastq\.gz)$')
    regex_fq2 = re.compile(r'(.+)(_2\.fastq\.gz|_2\.fq\.gz|R2_\d{3}\.fastq\.gz)$')

    d = normpath(os.path.join(libdir, library))
    logger.debug(
        "Looking for fastq files for library {library} in {libdir}".format(
            library=library, libdir=libdir))

    fq1s = []
    fq2s = []

    for f in os.listdir(d):
        match1 = regex_fq1.search(f)
        if match1:
            fq1s.append(os.path.join(libdir, library, match1.group(0)))
        match2 = regex_fq2.search(f)
        if match2:
            fq2s.append(os.path.join(libdir, library, match2.group(0)))

    # Sort so read 1 and read 2 files line up by position.
    fq1s.sort()
    fq2s.sort()

    logging.debug("Found {}".format((fq1s, fq2s)))
    return fq1s, fq2s
Beispiel #5
0
from autoseq.util.path import normpath

# Output directories below ~/tmp, one per test flavour (presumably used by
# the matching integration tests). Each path is passed through normpath.
alascca_test_outdir, alascca_purity_test_outdir, liqbio_test_outdir = map(
    normpath,
    ("~/tmp/alascca-test",
     "~/tmp/alascca-purity-test",
     "~/tmp/liqbio-test"))
Beispiel #6
0
def align_pe(pipeline, fq1_files, fq2_files, clinseq_barcode, ref, outdir, maxcores=1, remove_duplicates=True):
    """
    Add the jobs needed to trim, concatenate and align paired end data.

    One Skewer job is added per (read 1, read 2) pair, then one Cat job
    per read joining the trimmed files, then one Bwa job aligning the two
    concatenated files.

    :param pipeline: pipeline the jobs are added to; its ``scratch``
        attribute is handed to the Skewer and Bwa jobs
    :param fq1_files: read 1 fastq paths
    :param fq2_files: read 2 fastq paths, paired with fq1_files by position
    :param clinseq_barcode: used for the output file names, the job names
        and the read group (ID directly; SM and LB are derived from it)
    :param ref: reference sequence given to the Bwa job
    :param outdir: directory receiving the ``skewer`` files and the bam
    :param maxcores: thread count for the Skewer and Bwa jobs
    :param remove_duplicates: forwarded to the Bwa job
    :return: path of the bam file the Bwa job writes
    """
    reads1 = [normpath(path) for path in fq1_files]
    reads2 = [normpath(path) for path in fq2_files]
    logging.debug("Trimming {} and {}".format(reads1, reads2))

    # Pair by position before any job is created; a shorter read 2 list
    # raises IndexError here, surplus read 2 entries are ignored.
    read_pairs = [(first, reads2[idx]) for idx, first in enumerate(reads1)]

    trimmed1 = []
    trimmed2 = []

    for read1, read2 in read_pairs:
        name1 = os.path.basename(read1)

        trim_job = Skewer()
        trim_job.input1 = read1
        trim_job.input2 = read2
        trim_job.output1 = outdir + "/skewer/libs/{}".format(name1)
        trim_job.output2 = outdir + "/skewer/libs/{}".format(os.path.basename(read2))
        # Stats file and job name are both keyed on the read 1 file name.
        trim_job.stats = outdir + "/skewer/libs/skewer-stats-{}.log".format(name1)
        trim_job.threads = maxcores
        trim_job.jobname = "skewer/{}".format(name1)
        trim_job.scratch = pipeline.scratch
        trim_job.is_intermediate = True
        trimmed1.append(trim_job.output1)
        trimmed2.append(trim_job.output2)
        pipeline.add(trim_job)

    # Concatenate all trimmed read 1 files.
    merge_read1 = Cat()
    merge_read1.input = trimmed1
    merge_read1.output = outdir + "/skewer/{}-concatenated_1.fastq.gz".format(clinseq_barcode)
    merge_read1.jobname = "cat1/{}".format(clinseq_barcode)
    merge_read1.is_intermediate = True
    pipeline.add(merge_read1)

    # Concatenate all trimmed read 2 files.
    merge_read2 = Cat()
    merge_read2.input = trimmed2
    merge_read2.jobname = "cat2/{}".format(clinseq_barcode)
    merge_read2.output = outdir + "/skewer/{}-concatenated_2.fastq.gz".format(clinseq_barcode)
    merge_read2.is_intermediate = True
    pipeline.add(merge_read2)

    align_job = Bwa()
    align_job.input_fastq1 = merge_read1.output
    align_job.input_fastq2 = merge_read2.output
    align_job.input_reference_sequence = ref
    align_job.remove_duplicates = remove_duplicates

    # Read group fields: LB from the prep id, SM from the unique capture.
    prep_id = parse_prep_id(clinseq_barcode)
    sample_name = compose_sample_str(extract_unique_capture(clinseq_barcode))
    align_job.readgroup = "\"@RG\\tID:{rg_id}\\tSM:{rg_sm}\\tLB:{rg_lb}\\tPL:ILLUMINA\"".format(
        rg_id=clinseq_barcode,
        rg_sm=sample_name,
        rg_lb=prep_id)

    align_job.threads = maxcores
    align_job.output = "{}/{}.bam".format(outdir, clinseq_barcode)
    align_job.jobname = "bwa/{}".format(clinseq_barcode)
    align_job.scratch = pipeline.scratch
    align_job.is_intermediate = False
    pipeline.add(align_job)

    return align_job.output