def removeDuplicates(runCFG, bam_files, threads=1):
    #initial parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    checkexists(os.path.join(outDir, 'rm_dups'))
    outDir = os.path.join(outDir, 'rm_dups')

    #notify starting to remove duplicates
    procTitle('Remove Duplicates', runCFG)
    print('\nSniffles: Removing duplicate reads')
    #get time at start
    start = time.time()

    #generate commands
    cmds = []
    output_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]

        #remove duplicate reads command
        cmd = f'java -Xmx2g -jar /tools/picard.jar MarkDuplicates I=/in_dir/{id}.bam O=/out_dir/{id}.bam REMOVE_DUPLICATES=true M=/out_dir/{id}.removeDupMetrics.txt'
        cmds.append(cmd)

        #add id to finished list
        output_list.append(os.path.join(outDir, f'{id}.bam'))

    #set up multiprocessing
    pool = mp.Pool(processes=threads)

    #denote start of remove duplicate reads in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Removing Duplicates\n')
        #start multiprocessing
        results = pool.starmap_async(
            cd.call, [[cmd, '/reads', {
                path: "/in_dir",
                outDir: "/out_dir"
            }] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get time at end
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished removing duplicates in {runtime} seconds')
    return output_list
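A minimal usage sketch (hypothetical paths, sample name, and thread count; assumes this module's helpers checkexists, procTitle and cd.call are importable and the Picard Docker image is available):

#hypothetical invocation, illustrative values only
runCFG = {'exec': {'outdir': '/data/run1', 'logfile': '/data/run1/sniffles.log'}}
deduped_bams = removeDuplicates(runCFG, ['/data/run1/mapping/sampleA.bam'], threads=4)
#deduped_bams -> ['/data/run1/rm_dups/sampleA.bam']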
Example #2
def mapping(runCFG,param_paths,outDir,threads='1'):

    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])

    num_jobs,num_threads = cpu_count(threads)

    cmds = []
    read_path = ''
    ref_path = ''
    output_bam_list = []
    for param_path in param_paths:
        id = param_path[0]
        read1 = os.path.basename(param_path[1])
        read2 = os.path.basename(param_path[2])
        read_path = os.path.dirname(os.path.abspath(param_path[1]))
        ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
        reference_sequence_name = os.path.basename(param_path[3])

        #check output folder exists
        checkexists(os.path.join(outDir))
        if read2 != '':
            #generate command for paired end
            cmd = f"bash -c \'bowtie2 -x {reference_sequence_name} -1 /reads/{read1} -2 /reads/{read2} -p {num_threads} --local | samtools view -bS | samtools sort -o /output/{id}.bam\'"
        else:
            #generate command for interleaved
            cmd = f"bash -c \'bowtie2 -x {reference_sequence_name} --interleaved /reads/{read1} -p {num_threads} --local | samtools view -bS | samtools sort -o /output/{id}.bam\'"
        cmds.append(cmd)

        #data for next stage
        output_bam_list.append(os.path.join(outDir,f'{id}.bam'))

    #set up multiprocessing
    #start multiprocessing
    pool = mp.Pool(processes=num_jobs)
    #notify starting mapping
    procTitle('Mapping Reads')
    print('\nSniffles: Started mapping')
    #get start time
    start = time.time()
    #denote start of mapping in logs
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Mapping\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/reads',{ref_path:"/reference",read_path:"/reads",outDir:"/output"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished mapping in {runtime} seconds')
    return output_bam_list
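A sketch of the param_paths layout this loop expects, matching the mapping_list tuples assembled in the driver script further down (sample names and paths are hypothetical):

#each entry: (sample id, forward reads, reverse reads ('' for interleaved input), reference fasta)
param_paths = [
    ('sampleA',
     '/data/run1/trimmed/sampleA_trimmed_1P.fastq.gz',
     '/data/run1/trimmed/sampleA_trimmed_2P.fastq.gz',
     '/data/run1/ref_sequence/reference.fasta'),
]
#bam_list = mapping(runCFG, param_paths, runCFG['exec']['outdir'] + '/mapping', threads=4)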
Example #3
def indexing(runCFG,*paths):
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle('Indexing Reference Genome')
    for path in paths:
        reference_sequence_abspath = os.path.abspath(path)
        reference_sequence_name = os.path.basename(reference_sequence_abspath)
        #index reference
        cmd = f'bowtie2-build {reference_sequence_name} {reference_sequence_name}'
        with open(logfile,'a') as outlog:
            outlog.write("*************************\n")
            outlog.write("Bowtie2 indexing the reference\n")
            copyfile(reference_sequence_abspath,os.path.join(outDir,reference_sequence_name))
            outlog.write(cd.call(cmd,'/data',{outDir:"/data"}))
            outlog.write("*************************\n")
def indexing(runCFG, *paths):
    #print('\n-----------------------Sniffles: Indexing reference sequence-----------------------\n')

    logfile = runCFG['exec']['logfile']
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle("Indexing reference sequence", runCFG)

    for path in paths:
        reference_sequence_abspath = os.path.abspath(path)
        reference_sequence_name = os.path.basename(reference_sequence_abspath)
        print(path)
        #index reference
        cmd = f'bowtie2-build {reference_sequence_name} {reference_sequence_name} --quiet'  #' --threads {threads}'
        with open(logfile, 'a') as outlog:
            outlog.write("***********\n")
            outlog.write("Bowtie2 indexing the reference\n")
            copyfile(reference_sequence_abspath,
                     os.path.join(outDir, reference_sequence_name))
            outlog.write(cd.call(cmd, '/data', {outDir: "/data"}))
            outlog.write("***********\n")
Example #5
                       cfg['exec']['outdir'] + '/norm_mapping', numThreads)

#generate consensus
if cfg['exec']['generateConsensus']:
    fasta_list = consensus(cfg, bam_list, numThreads)

    #map reads to consensus
    if cfg['exec']['mapToConsensus']:
        mapping_list = []
        indexing(cfg, *fasta_list)
        for id in readData.runtime['trimmed']:
            for fasta in fasta_list:
                fasta_id = os.path.basename(fasta).split('.')[0]
                if fasta_id == id:
                    mapping_list.append(
                        (id, readData.runtime['trimmed'][id][0],
                         readData.runtime['trimmed'][id][1],
                         os.path.abspath(fasta)))
        mapping(cfg, mapping_list, cfg['exec']['outdir'] + '/map_to_consensus',
                numThreads)

#call snps
if cfg['exec']['callSNPs']:
    snpcaller(cfg, bam_list, numThreads)

sc.procTitle('Finished Sniffles')
end = time.time()
runtime = round(end - start, 2)
runtime = str(datetime.timedelta(seconds=runtime))
print(f'Sniffles: Finished with a total runtime of {runtime}.')
Example #6
def snpcaller(runCFG, bam_files, threads='1'):
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    outDir = os.path.join(outDir, 'snp_calls')
    checkexists(outDir)

    #set reference sequence
    reference_sequence_path = runCFG['exec']['outdir'] + '/ref_sequence'
    reference_sequence_name = os.path.basename(
        runCFG['exec']['referenceSequence'])

    #starting time point
    start = time.time()
    procTitle('SNP Calling')
    print(f'\nSniffles: Started calling SNPs')

    bams = []
    sample_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/' + file_name)

    #generate mpileup
    cmd1 = 'bash -c \'samtools mpileup -ABR -d 1000000 {bams} -f /ref/{reference_sequence_name} > all.mpileup &&'.format(
        bams=' '.join(bams), reference_sequence_name=reference_sequence_name)

    #call snps
    snp_frequency = runCFG['snpcalling']['snpFrequency']
    min_cov = runCFG['snpcalling']['minCoverage']
    snp_qual_threshold = runCFG['snpcalling']['snpQualityThreshold']

    cmd2 = 'java -jar /tools/varscan.jar mpileup2cns all.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{samples}") > all_snps.vcf\''.format(
        min_cov=min_cov,
        snp_qual_threshold=snp_qual_threshold,
        snp_frequency=snp_frequency,
        samples='\n'.join(sample_list))

    #add commands to list for multiprocessing
    cmd = cmd1 + cmd2

    #future code block for annotating aa changes
    #if runCFG['exec']['annotateAAChanges']:
    #pass
    #TODO add annotater for annotating aa changes

    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')
        results = cd.call(cmd, '/outfile', {
            reference_sequence_path: "/ref",
            path: "/infile",
            outDir: "/outfile"
        })
        outlog.write('-----------\n')
        outlog.write(results)
        #denote end of logs
        outlog.write('***********\n')

    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished calling SNPs in {runtime} seconds')
Example #7
def consensus(runCFG,bam_list,threads=1):
    #initial parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir,runCFG['exec']['logfile'])
    outDir = os.path.join(outDir,'consensus')
    checkexists(outDir)

    #notify starting consensus generation
    procTitle('Generate Consensus')
    print('\nSniffles: Started generating consensus vcf')

    #get start time
    overall_start = time.time()
    start = time.time()

    #set reference sequence
    reference_sequence_abspath = os.path.abspath(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(reference_sequence_abspath)
    reference_sequence_dir = runCFG['exec']['outdir'] + '/ref_sequence'

    #command list
    cmds = []
    vcf_list = []
    for path in bam_list:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #run varscan mpileup2cns to generate vcf with consensus information
        minCov = runCFG['snpcalling']['minCoverage']
        quality = runCFG['snpcalling']['snpQualityThreshold']
        freq = runCFG['snpcalling']['consensusFrequency']

        #make multiway pileup using samtools
        cmd1 = f'bash -c \'samtools mpileup -ABd 1000000 /infile/{file_name} -f /ref/{reference_sequence_name} -o {id}.pileup && '

        cmd2 = f'java -jar /tools/varscan.jar mpileup2cns {id}.pileup --min-coverage {minCov} --min-avg-qual {quality} --min-var-freq {freq} --strand-filter 1 --output-vcf 1 > {id}.vcf\''
        cmds.append(cmd1 + cmd2)
        vcf_list.append(os.path.join(outDir,f'{id}.vcf'))

    #setup multiprocessing
    pool = mp.Pool(processes=threads)

    #start multiprocessing
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Generating Consensus\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/outfile',{reference_sequence_dir:"/ref",path:"/infile",outDir:"/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')

    #check if vcf file is empty, if it is skip id and remove vcf file
    filtered_vcf_list = []
    for path in vcf_list:
        try:
            if os.path.getsize(path)>0:
                filtered_vcf_list.append(path)
            else:
                os.remove(path)
        except:
            pass

    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished generating the consensus vcf in {runtime} seconds')
    start = time.time()
    print(f'\nSniffles: Generating consensus fasta')

    #command list for compressing files
    cmds = []
    out_fasta = []
    for vcf in filtered_vcf_list:
        full_path = os.path.abspath(vcf)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #compress vcf file with bgzip
        cmd = f'bash -c \'bgzip {id}.vcf && tabix {id}.vcf.gz && bcftools consensus -f /ref/{reference_sequence_name} {id}.vcf.gz -o {id}.fasta\''
        out_fasta.append(os.path.join(outDir,f'{id}.fasta'))
        cmds.append(cmd)

    #start multiprocessing
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Creating consensus Fasta\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/outfile',{reference_sequence_dir:"/ref",outDir:"/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')

    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished generating consensus fasta in {runtime} seconds')

    #determine runtime of processes
    end = time.time()
    runtime = round(end - overall_start,2)
    print(f'\nSniffles: Finished generating consensus sequence in {runtime} seconds')
    return out_fasta
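# For reference, out_fasta contains one consensus fasta per sample that produced a non-empty vcf,
# e.g. (hypothetical sample ids): ['<outdir>/consensus/sampleA.fasta', '<outdir>/consensus/sampleB.fasta']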
sc.checkexists(outDir)

cfg['exec']['outdir'] = os.path.join(outDir,cfg['exec']['outdir'])

try:
	os.mkdir(cfg['exec']['outdir'])
except FileExistsError:
	cfg['exec']['outdir'] = cfg['exec']['outdir']+'_'+str(int(time.time()))
	os.mkdir(cfg['exec']['outdir'])
outDir = cfg['exec']['outdir']

logfile=os.path.join(outDir,cfg['exec']['logfile'])
cfg['exec']['logfile'] = logfile

startRunMessage = f"Beginning run at {strftime('%a, %d %b %Y %I:%M:%S %p', time.localtime())}"
sc.procTitle(startRunMessage, cfg)

with open(logfile,'a') as outlog:
	outlog.write(startRunMessage + "\n")

cfg['Errors'] = []

for reference, gtf in zip(list(cfg['exec']['referenceSequences']), cfg['postprocessing']['gtfFileNames']):
	sc.procTitle (f"Processing samples for reference sequence {reference.split('.')[0]}", cfg)
		
	#assign reference sequence value and gtf value to current reference and gtf
	cfg['exec']['referenceSequence'] = os.path.join(os.getcwd(), reference)
	cfg['postprocessing']['gtfFileName'] = os.path.join(os.getcwd(), gtf)

	try:
		inDir = os.path.abspath(args.i)
Example #9
def normCoverage(runCFG,bam_files,threads='1'):
    #NOTE: normalizing with bbnorm uses all available memory, thus can only be run serially

    #initial parameters
    outDir = runCFG['exec']['outdir']
    checkexists(os.path.join(outDir,'normalized'))
    logfile = os.path.join(outDir,runCFG['exec']['logfile'])
    outDir = os.path.join(outDir,'normalized')

    #notify starting coverage normalization
    procTitle('Normalize Coverage')
    print('\nSniffles: Normalizing read coverage')

    #get time at start
    start = time.time()

    #denote start of coverage normalization in logs
    with open(logfile,'a') as outlog:
        outlog.write('********************\n')
        outlog.write('Normalizing coverage\n')

        #run normalization
        output_list = []
        for path in bam_files:
            full_path = os.path.abspath(path)
            file_name = os.path.basename(full_path)
            path = os.path.dirname(full_path)
            id = file_name.split('.')[0]

            #get reads from mapped bamfile
            cmd_get_reads = f'bash -c \'samtools fastq /bam_files/{id}.bam -1 /out_dir/{id}_mapped_1.fastq -2 /out_dir/{id}_mapped_2.fastq && '

            #run seqtk to subsample reads
            total_reads = runCFG['exec']['totalReads']
            cmd_normalization = f'seqtk sample -s100 /out_dir/{id}_mapped_1.fastq {total_reads} > {id}_1.fastq && seqtk sample -s100 /out_dir/{id}_mapped_2.fastq {total_reads} > {id}_2.fastq\''

            #start docker containers and run
            outlog.write(f'{id}-----------\n')
            stdout=cd.call(cmd_get_reads+cmd_normalization,'/out_dir',{path:"/bam_files",outDir:"/out_dir"})
            outlog.write(stdout)
            outlog.write(f'-----------\n')

            output_list.append([os.path.join(outDir,f'{id}_1.fastq'),os.path.join(outDir,f'{id}_2.fastq')])


            #cleanup
            try:
                os.remove(f'{outDir}/{id}_mapped_1.fastq')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_2.fastq')
            except:
                pass
        outlog.write('********************\n')

    #get time at end
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished normalizing read coverage in {runtime} seconds')
    return output_list
def normCoverage(runCFG, bam_files, threads='1'):
    #initial parameters
    outDir = runCFG['exec']['outdir']
    checkexists(os.path.join(outDir, 'normalized'))
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir, 'normalized')

    #notify starting coverage normalization
    procTitle("Downsampling with seqtk to normalize coverage", runCFG)
    #print('\n-----------------------Sniffles: Downsampling with seqtk to normalize coverage-----------------------')

    #get time at start
    start = time.time()

    #denote start of coverage normalization in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Downsampling with seqtk to normalize coverage\n')

        #run normalization
        output_list = []
        for path in bam_files:

            full_path = os.path.abspath(path)
            file_name = os.path.basename(full_path)
            path = os.path.dirname(full_path)
            id = file_name.split('.')[0]

            #get reads from mapped bamfile
            cmd_get_reads = f'bash -c \'samtools collate /bam_files/{id}.bam collating && samtools fastq -n /bam_files/{id}.bam -1 /out_dir/{id}_mapped_1.fastq.gz -2 /out_dir/{id}_mapped_2.fastq.gz'

            #run seqtk to subsample reads
            total_reads = runCFG['exec']['totalReads']
            cmd_normalization = f' && seqtk sample -s100 /out_dir/{id}_mapped_1.fastq.gz {total_reads} > {id}_1.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_2.fastq.gz {total_reads} > /out_dir/{id}_2.fastq.gz\''

            if runCFG['exec']['unpaired']:
                cmd_get_reads += f' -0 /out_dir/{id}_mapped_U.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_U.fastq.gz {total_reads} > /out_dir/{id}_U.fastq.gz'
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz'),
                    os.path.join(outDir, f'{id}_U.fastq.gz')
                ])
            else:
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz')
                ])

            #start docker containers and run
            outlog.write(f'{id}\n-----------\n')
            stdout = cd.call(cmd_get_reads + cmd_normalization, '/out_dir', {
                path: "/bam_files",
                outDir: "/out_dir"
            })
            outlog.write(stdout)

            #cleanup
            try:
                os.remove(f'{outDir}/{id}_mapped_1.fastq.gz')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_2.fastq.gz')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_U.fastq.gz')
            except:
                pass
        outlog.write('***********\n')

    #get time at end
    end = time.time()

    #determine runtime of processes
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished normalizing read coverage in {runtime}')
    return output_list
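# For reference, normCoverage returns one list of downsampled fastq paths per sample; a hypothetical
# paired-end run with two samples (illustrative paths) would return:
#   [['<outdir>/normalized/sampleA_1.fastq.gz', '<outdir>/normalized/sampleA_2.fastq.gz'],
#    ['<outdir>/normalized/sampleB_1.fastq.gz', '<outdir>/normalized/sampleB_2.fastq.gz']]
# (a third *_U.fastq.gz entry is added per sample when runCFG['exec']['unpaired'] is set)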
def VCFannotator(runCFG, vcffiles):
    # read in reference sequences and gtfs and store information about protein sequences
    # create a dictionary of gene names and start/stop sites, allowing for more than one start/stop site.

    #import file location parameters from config file
    outDir = os.path.join(runCFG['exec']['outdir'], 'vcf_annotations')
    checkexists(outDir)
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    refseqfasta = runCFG['exec']['referenceSequence']
    refseqname = refseqfasta.split(".")[0]

    if runCFG['exec']['mapToConsensus']:
        refseqfasta = os.path.join(runCFG['exec']['outdir'], 'ref_sequence',
                                   refseqfasta)

    #get start time
    start1 = time.time()

    procTitle('Annotating SNPs', runCFG)

    #Extract coding sequence coordinates from gtf files:
    #coding_regions will be a dictionary of dictionaries
    #(format segment:gene:[[startExon1, stopExon1], [startExon2, stopExon2]])
    coding_regions = {}
    with open(runCFG['postprocessing']['gtfFileName'], "r") as gtf:
        for line in gtf:
            # ignore blank lines (otherwise they throw an index error)
            if line.strip("\n") != "":
                line = line.replace("/", "_")
                lineitems = line.split("\t")
                segment_name = lineitems[0]
                annotation_type = lineitems[2]
                start = int(lineitems[3]) - 1  # -1 for 0 indexing
                stop = int(lineitems[4]) - 1  # -1 for 0 indexing
                gene_name = lineitems[8]
                gene_name = gene_name.split(";")[0]
                gene_name = gene_name.replace("gene_id ", "")
                gene_name = gene_name.replace("\"", "")

                if annotation_type.lower() == "cds":
                    if segment_name not in coding_regions:
                        coding_regions[segment_name] = {}
                        coding_regions[segment_name][gene_name] = [[start, stop]]
                    elif segment_name in coding_regions and gene_name not in coding_regions[segment_name]:
                        coding_regions[segment_name][gene_name] = [[start, stop]]
                    elif gene_name in coding_regions[segment_name]:
                        coding_regions[segment_name][gene_name].append([start, stop])
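    #A hypothetical illustration (made-up segment and gene names) of the structure built above,
    #including a two-exon (spliced) gene:
    #  coding_regions = {'seg7': {'M1': [[0, 758]], 'M2': [[0, 25], [714, 981]]}}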

    # pull in reference fasta file, separate gene segments into a dictionary
    ref_segments = {}

    for seq in SeqIO.parse(refseqfasta, "fasta"):
        refseqname = str(seq.id).replace("/", "_")
        sequence = str(seq.seq).lower()
        ref_segments[refseqname] = sequence

    # use gene coordinates to create coding sequences from reference sequences
    transcripts = {}

    #Reminder of current data structures:
    #coding_regions[segment][gene]:coordinates of genes
    #ref_segments[nameofsegment]:sequence

    for segment in coding_regions:
        for gene in coding_regions[segment]:
            transcripts[gene] = ""
            # define the coding regions for each gene
            coordinates = coding_regions[segment][gene]
            # loop through start/stop sites in coding regions
            for start, stop in coordinates:
                sequence_chunk = ref_segments[segment][start:stop + 1]
                # append each piece of the transcript together
                transcripts[gene] = transcripts[gene] + sequence_chunk

    # loop through each transcript to make sure that it begins with a start codon and ends with a stop codon
    #for t in transcripts:
    #if transcripts[t][0:3] != start_codon:
    #print("WARNING! " + refseqname + " " + t + " does not contain a start codon! The first three nucleotides are " + transcripts[t][0:3])
    #if transcripts[t][-3:] not in stop_codons:
    #print("WARNING! " + refseqname + " " + t + " does not contain a stop codon! These are the last 3 nucleotides: " + transcripts[t][-3:])

    print(vcffiles)
    #accept either a ready-made list of vcf paths or a directory containing vcf files
    #(check for a list first so os.path.isdir is never handed a list, which would raise a TypeError)
    if isinstance(vcffiles, list):
        if vcffiles[0].split(".")[-1] == "vcf":
            pass
    elif os.path.isdir(vcffiles):
        vcffiles = glob.glob(vcffiles + "/*.vcf")
    else:
        print("vcffiles has no vcf files!")

    listofmutstoExport = []
    ##Loop through each vcf file and annotate amino acid changes
    print(vcffiles)
    for i, vcfname in tqdm(enumerate(vcffiles)):
        with open(vcfname, "r") as TextVCF:
            for index, line in enumerate(TextVCF, 0):
                if "#CHROM" in line:
                    rowstoskip = index

        #Reads the vcf file into a pandas DataFrame
        print(vcfname)
        try:
            vcfDF = pd.read_csv(vcfname, sep='\t', skiprows=rowstoskip)
        except OSError:
            print("\n" + vcfname + " did not open appropriately. Please check file.\n")
            continue

        #fix the "/" in the #CHROM field
        vcfDF["#CHROM"] = vcfDF["#CHROM"].str.replace("/", "_")
        #extract frequencies for list of muts to export:
        #In order to make this easier, I'm going to assume each VCF has only one sample. This code DOES NOT WORK for more than one sample per VCF.
        #print(vcfDF.iloc[:,-1].str.split(":").str[6].str.rstrip('%').astype('float')/100)
        freqlocation = vcfDF.loc[0, "FORMAT"].split(":").index("FREQ")
        try:
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(
                ":").str[freqlocation].astype('float')
        except ValueError:
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(
                ":").str[freqlocation].str.rstrip('%').astype('float') / 100
        except:
            raise

        listofmuts = []

        #loop through each line in vcfDF, extract chrom, pos, reference nucleotide, alternate nucleotide
        for chrom, pos, ref, alt, freq in zip(vcfDF['#CHROM'], vcfDF["POS"],
                                              vcfDF["REF"].str.lower(),
                                              vcfDF["ALT"].str.lower(),
                                              vcfDF["FREQ"]):
            pos -= 1  #subtract one from position to convert from VCF's 1 indexing to python's 0
            #loop through each gene potentially applicable to that position (i.e., all on chromosome)
            for gene in coding_regions[chrom].keys():
                priorExonLength = 0
                #print (gene)
                #loop through each exon of gene
                for start, stop in coding_regions[chrom][gene]:
                    #if pos in exon, calculate codon, reference aa, and variant aa
                    #print (f"pos: {pos} start: {start} stop: {stop}")
                    if pos in range(start, stop):
                        #print ('is in range, annotating.')
                        #within gene position is the position in this exon (pos-startOfExon),
                        #plus the length of any prior exons (priorExonLength)
                        within_gene_position = pos - start + priorExonLength
                        codon_pos = (within_gene_position % 3)
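                        #Worked example (hypothetical numbers): for a single-exon gene starting at
                        #reference position 100 and a variant at 0-based pos 105,
                        #within_gene_position = 5 and codon_pos = 2, so the codon spans
                        #transcript bases [3:6] and the change falls in amino acid int(5/3) + 1 = 2.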
                        alternatetranscript = (transcripts[gene][:within_gene_position] + alt +
                                               transcripts[gene][within_gene_position + 1:])

                        codon = transcripts[gene][(within_gene_position - codon_pos):
                                                  (within_gene_position + (3 - codon_pos))]
                        variantcodon = alternatetranscript[(within_gene_position - codon_pos):
                                                           (within_gene_position + (3 - codon_pos))]
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)
                        #Catch errors in annotation calculations where the math results in an incorrect codon
                        if codon[codon_pos] != ref:
                            print(
                                "Something's quite wrong here. The reference SNP is not what it should be."
                            )
                            print(f"\n\nchrom: {chrom}, gene: {gene}")
                            print(
                                f"\npos: {pos}  within_gene_position: {within_gene_position}\ncodon_pos: {codon_pos}  codon: {codon}  variantcodon: {variantcodon}\n\n"
                            )
                            print(
                                f"ref:  {ref} alt: {alt}  ref_aa: {ref_aa} \nvariant_aa: {variant_aa}\n aa_num: {aa_num}\n"
                            )
                            print(transcripts[gene])
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)
                        if ref_aa != variant_aa:
                            listofmuts.append([
                                chrom, gene, pos + 1,
                                str(ref_aa + aa_num + variant_aa)
                            ])
                            if freq > 0.01 and freq < 0.99:
                                listofmutstoExport.append({
                                    "segment": chrom,
                                    'gene': gene,
                                    'position': pos + 1,
                                    'frequency': float(freq),
                                    'AAchange': str(ref_aa + aa_num + variant_aa)
                                })
                        elif ref_aa == variant_aa:
                            listofmuts.append([chrom, gene, pos + 1, "."])
                        break  #if pos is in exon, stop looping though exons
                    else:
                        priorExonLength += (
                            stop + 1 - start
                        )  #The next exon will begin after the length of this exon, i.e., after the stop point minus the start point
                #else statement only executed if for loop finishes without breaking (ie if pos is never within the gene being examined)
                else:
                    listofmuts.append([chrom, gene, pos + 1, "not in ORF"])
                    continue  #continue onto next gene

        AAchange = pd.DataFrame(listofmuts,
                                columns=['#CHROM', 'gene', 'POS', 'AAchange'])

        vcfDF = vcfDF.merge(AAchange, how='left', on=['#CHROM', 'POS'])

        vcfDF['gene'] = vcfDF['gene'].astype(str)
        vcfDF['gene'] = vcfDF['gene'].replace("NA", "NA gene")
        annotatedVCFname = os.path.basename(vcfname).split(
            ".")[0] + ".annotated_vcf"
        outputfile = vcfDF.to_csv(os.path.join(outDir, annotatedVCFname),
                                  sep="\t",
                                  index=None,
                                  header=True)
        vcffiles[i] = annotatedVCFname

    #importantMuts = pd.DataFrame(listofmutstoExport)
    #print (importantMuts)
    #importantMuts = importantMuts.groupby(['segment', 'gene', 'position', 'AAchange'], as_index=False).mean()

    #importantMuts = importantMuts.loc[importantMuts['freq']>0.02 & importantMuts['freq']<0.98]
    #importantMuts['freq'] = importantMuts['freq']/len(vcffiles)
    #importantMutsexport = importantMuts.to_csv(os.path.join(outDir, "allMutationsPresent.tsv"), sep = '\t', index=None, header=True)

    #get end time
    end = time.time()
    #get total runtime
    runtimeSeconds = end - start1
    runtime = datetime.timedelta(seconds=runtimeSeconds)
    print(f'\nSniffles: Finished annotating SNPs in {str(runtime)}')
    return (vcffiles)
Example #12
def RePlow(runCFG, bam_files, threads=1):
	#set parameters
	outDir = runCFG['exec']['outdir']
	logfile = runCFG['exec']['logfile']
	bamfilespath = os.path.dirname(bam_files[0])#os.path.join(outDir, "norm_mapping")
	outDir = os.path.join(outDir,'snp_calls')

	reference_sequence_path = os.path.join(runCFG['exec']['outdir'], 'ref_sequence')
	reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
	# if os.path.isdir(reference_sequence_path) and os.listdir("ref_sequence") != os.listdir(reference_sequence_path) :
	# 	rmtree (reference_sequence_path)
	# 	copytree("ref_sequence", reference_sequence_path)
	# if not os.path.isdir(reference_sequence_path):
	# 	copytree("ref_sequence", reference_sequence_path)
	#starting time point
	start =  time.time()
	procTitle('Analyzing SNPs with RePlow', runCFG)
	#print('\n-----------------------Sniffles: Calling SNPs with RePlow-----------------------')

	bams = []
	sample_list = []
	repDict = {}

	#create list of bam files to run
	print ("bam_files:")
	print (bam_files)
	
	for path in bam_files:
		full_path = os.path.abspath(path)
		file_name = os.path.basename(full_path)
		path = os.path.dirname(full_path)
		id = file_name.split(".")[0]
		sample_list.append(id)
		bams.append('/infile/'+file_name)
		print (file_name)

		#if processing replicate runs, create dictionary samplename:[list of replicate bam files for that sample]
		if runCFG['exec']['replicates']:
			repBreakdown = runCFG['exec']['replicateNotation'].split("_")
			repBreakdown = "_".join(repBreakdown[:-1])
			repBreakdown = repBreakdown[:repBreakdown.find(r"\d")]
			repBreakdown = repBreakdown.split("Sample")
			repKey = file_name[file_name.find(repBreakdown[0])+len(repBreakdown[0]):file_name.find(repBreakdown[1])]

			if repKey not in repDict.keys():
				repDict[repKey] = [file_name]
			else:
				repDict[repKey].append(file_name)
	print (repDict)


	#import SNP quality parameters from config
	snp_qual_threshold=runCFG['snpcalling']['snpQualityThreshold']
	consensus_frequency=runCFG['snpcalling']['consensusFrequency']
	mut_rate=runCFG['replow_settings']['mutrate']
	map_qual_threshold=runCFG['replow_settings']['mapquality']

	#make a default .bed file from reference that instructs replow to call SNPs on the whole genome
	chrom = []
	chromEnd = []
	with open(os.path.join(reference_sequence_path, reference_sequence_name),'r') as refseq:
		for line in refseq.readlines():
			if line[0] == ">":
				chrom.append(line[1:].rstrip())
				
			elif len(chrom) != len(chromEnd):
				chromEnd.append(len(line)-2)  #-1 for the trailing newline, -1 to adjust for zero indexing in a bed file
			else:
				chromEnd[len(chrom)-1] += len(line)-1  #extend the last segment's end by this line's length (minus the newline)
		
		bedfile = pd.DataFrame()
		bedfile["chrom"] = chrom
		bedfile["chromStart"] = 0
		bedfile["chromEnd"] = pd.Series(chromEnd)
	print (reference_sequence_name)
	bedfilename = "".join(reference_sequence_name.split(".")[:-1])+".bed"
	bedcsv = bedfile.to_csv(os.path.join(reference_sequence_path, bedfilename), index=False, header=False, sep="\t")
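	#Hypothetical example: a single 1,500 nt reference segment written on one fasta line would
	#yield a one-row bed file (tab-separated): ref_segment  0  1499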


	#Prep RePlow cmds
	cmds=[]
	keycmds=[]
	indexcmds=[]
	#Adds command to index reference sequence for RePlow
	indexcmds.append(f'samtools index /ref/{reference_sequence_name}')

	#add commands to list for multiprocessing
	###TO BE IMPROVED: Currently I only generate commands if I am running RePlow on replicate sequencing runs.
	### While that is the typical case, I should eventually expand this to cover single runs as well.
	#run through each sample name in replicate dictionary
	# print ("repDict:")
	print (repDict)
	for key in repDict.keys():
		#generate command to index each replicate bam file 
		for i, bam in enumerate(repDict[key]):
			repDict[key][i] = "/data/"+ bam
			indexcmds.append(f'samtools index {repDict[key][i]}')
		
		#create comma-delimited list of replicate bam files, generate unique replow command for each sample
		bamslist=','.join(repDict[key])
		#replowcmd = f'java -cp dependency/*:classes tgil.replow.RePlow -r /ref/{reference_sequence_name} -b {bamslist} -T /ref/{bedfilename} -R /usr/bin/Rscript -f {consensus_frequency} -q {snp_qual_threshold} -Q {map_qual_threshold} -m {mut_rate} -o /output -L {key}'
		outlogHeader = f"\"{key}\n-----------\n\""
		replowcmd = f'bash -c \'printf {outlogHeader} >> {os.path.join("/logfile", os.path.basename(logfile))} && java -jar /source/RePlow-1.1.0.jar -r /ref/{reference_sequence_name} -b {bamslist} -T /ref/{bedfilename} -R /usr/bin/Rscript -f {consensus_frequency} -q {snp_qual_threshold} -Q {map_qual_threshold} -m {mut_rate} -o /output -L {key}\''
		keycmds.append(replowcmd)
	print ("Indexcmds:")
	print (indexcmds)
	print ("Keycmds:")
	print (keycmds)
	#generate multiprocessing pool
	pool = mp.Pool(processes=threads)

	#index files first

	with open(logfile,'a') as outlog:
		outlog.write('***********\n')
		outlog.write('RePlow\n')
		outlog.write('***********\n')
		#run commands in docker containers with multiprocessing
		indexresults = pool.starmap_async(cd.call,[[cmd,'/source',{os.path.join (os.getcwd(), "replow"):"/source", bamfilespath:"/data", outDir:"/output", reference_sequence_path:"/ref",os.path.dirname(logfile):"/logfile"}] for cmd in indexcmds])
		pool.close()
		pool.join()

		pool = mp.Pool(processes=threads)
		results = pool.starmap_async(cd.call,[[cmd,'/source',{os.path.join (os.getcwd(), "replow"):"/source", bamfilespath:"/data", outDir:"/output", reference_sequence_path:"/ref",os.path.dirname(logfile):"/logfile"}] for cmd in keycmds])
		pool.close()
		pool.join()
		stdouts = indexresults.get() + results.get()
		
		for stdout in stdouts:
			#outlog.write('-----------\n')
			outlog.write(stdout)

		#Convert .call into .vcf
		for file in glob.glob(outDir + "/*.call"):
			samplename = calltoVCF(file, outDir)
			outlog.write(f"Call file {samplename} converted to VCF format")

		#denote end of logs
		outlog.write('***********\n')
				
	#get end time
	end = time.time()
	#get total runtime
	runtime = round(end - start,2)
	runtime = str(datetime.timedelta(seconds=runtime))
	print(f'\nSniffles: Finished calling snps in {runtime}')

	return (os.path.join(outDir, samplename +".vcf"))
def snpcaller(runCFG,bam_files,threads=1):
	#set parameters
	outDir = runCFG['exec']['outdir']
	logfile = runCFG['exec']['logfile']
	outDir = os.path.join(outDir,'snp_calls')
	checkexists(outDir)
	
	#set reference sequence
	reference_sequence_path = os.path.dirname(runCFG['exec']['referenceSequence'])
	reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
	

	#starting time point
	start =  time.time()
	if runCFG['exec']['replicates']:
		message = 'Calling replicate SNPs with Varscan'
	else:
		message = 'Calling SNPs with Varscan'

	procTitle(message, runCFG)

	bams = []
	sample_list = []
	listofVCFs = []
	repDict = {}
	
	#Create list of bam files to call SNPs on
	for path in bam_files:
		full_path = os.path.abspath(path)
		file_name = os.path.basename(full_path)
		path = os.path.dirname(full_path)
		id = file_name.split(".")[0]
		sample_list.append(id)
		bams.append('/infile/'+file_name)

		#if processing replicate runs, create dictionary samplename:[list of replicate vcf files for that sample]
		#this dictionary will be used later to merge and average replicate vcfs
		if runCFG['exec']['replicates']:
			repBreakdown = runCFG['exec']['replicateNotation'].split("_")
			repBreakdown = "_".join(repBreakdown[:-1])
			repBreakdown = repBreakdown[:repBreakdown.find(r"\d")]
			repBreakdown = repBreakdown.split("Sample")
			repKey = file_name[file_name.find(repBreakdown[0])+len(repBreakdown[0]):file_name.find(repBreakdown[1])]

			vcf_name = (id+".vcf")
			listofVCFs.append(vcf_name)
			if repKey not in repDict.keys():
				repDict[repKey] = [vcf_name]
			else:
				repDict[repKey].append(vcf_name)

	#import SNP calling quality parameters from config file
	snp_frequency=runCFG['snpcalling']['snpFrequency']
	min_cov=runCFG['snpcalling']['minCoverage']
	snp_qual_threshold=runCFG['snpcalling']['snpQualityThreshold']

	#generate commands to call variants
	cmds=[]
	for bam, sample in zip(bams, sample_list):
		#mpileup command
		outlogHeader = f"{bam.split('/')[-1].split('.')[0]}\n-----------\n"

		cmd1 = f'printf \"{outlogHeader}\" >> {os.path.join("/logfile", os.path.basename(logfile))} && samtools mpileup -ABR -d 1000000 {bam} -f /ref/{reference_sequence_name} > {sample}.mpileup'

		#varscan command
		cmd2 = f'java -jar /tools/varscan.jar mpileup2snp {sample}.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{sample}") > {sample}_temp.vcf'
		#compress and normalize vcf
		cmd3 = f'bcftools norm -c sw -m - -f /ref/{reference_sequence_name} -o {sample}.vcf {sample}_temp.vcf && rm {sample}_temp.vcf'
		
		if not runCFG['exec']['replicates']:
			listofVCFs.append(os.path.join(outDir, f"{sample}.vcf"))
		#add commands to list for multiprocessing
		cmds.append("bash -c \'" + cmd1 + " && " + cmd2 + " && " + cmd3 + "\'")
	
	#initialize multiprocessing pool
	pool = mp.Pool(processes=threads)

	#open logfile
	with open(logfile,'a') as outlog:
		outlog.write('***********\n')
		outlog.write('Calling SNPs\n')

		#run commands with multiprocessing
		results = pool.starmap_async(cd.call,[[cmd, '/outfile',{reference_sequence_path:"/ref",path:"/infile",outDir:"/outfile",os.path.dirname(logfile):"/logfile"}] for cmd in cmds])
		
		pool.close()
		pool.join()
		stdouts = results.get()
		print ('finished all results')
		for stdout in stdouts:
			#outlog.write('-----------\n')
			outlog.write(stdout)
		#if processing duplicate runs, merge and average SNP calls
		if runCFG['exec']['replicates']:
			listofVCFs = (VCFaverager(runCFG, repDict, listofVCFs))

		outlog.write(str(listofVCFs))
		outlog.write('-----------\n')
		
		#Combine sample vcfs into one master VCF:
		#allSNPs = VCFcombiner(runCFG, listofVCFs, "allVarscanSNVs.vcf")
		#outlog.write(f"\nCombined all vcf files into master vcf file allVarscanSNVs.vcf\n")
		outlog.write('-----------\n')
		#denote end of logs
		outlog.write('***********\n')

	#get end time
	end = time.time()
	#get total runtime
	runtime = round(end - start,2)
	runtime = str(datetime.timedelta(seconds=runtime))
	print(f'\nSniffles: Finished calling snps in {runtime}')
	
	return (listofVCFs)
Example #14
def trim(readData, runCFG, threads, ids=''):

    #parameters
    minlength = runCFG['trimmomatic']['minlength']
    windowsize = runCFG['trimmomatic']['windowSize']
    qscore = runCFG['trimmomatic']['qscore']
    adapterpath = "/tools/adapters/" + runCFG['trimmomatic']['adaptersFileName']
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])

    #set up list of ids to trim
    if not ids:
        ids = readData.ids

    #generate commands for each trim job
    cmds = []
    for id in ids:
        #main command
        main_cmd = f'java -jar /tools/trimmomatic.jar '

        #get read path
        if readData.reads[id]:
            read_path = os.path.dirname(os.path.abspath(
                readData.reads[id].fwd))
            read1_basename = os.path.basename(readData.reads[id].fwd)
            read2_basename = os.path.basename(readData.reads[id].rev)

        #determine args
        if runCFG['trimmomatic']['removeAdapters']:
            if runCFG['trimmomatic']['paired']:
                args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                readData.add_runtime(
                    'trimmed', id,
                    f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                    f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
            else:
                args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                readData.add_runtime(
                    'trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')
        else:
            if runCFG['trimmomatic']['paired']:
                args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                readData.add_runtime(
                    'trimmed', id,
                    f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                    f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
            else:
                args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}'
                readData.add_runtime(
                    'trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')

        #prepare command and add to list
        sample_cmd = main_cmd + args
        cmds.append(sample_cmd)

    #make out dir if it doesn't already exist
    try:
        os.mkdir(os.path.join(outDir, 'trimmed'))
    except:
        pass

    #set up multiprocessing
    #start multiprocessing
    pool = mp.Pool(processes=threads)
    #notify starting trimming
    procTitle('Quality Trimming')
    print('\nSniffles: Started quality trimming')
    #start timer
    start = time.time()
    #denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Trimmomatic\n')
        #begin multiprocessing
        results = pool.starmap_async(cd.call, [[
            cmd, '/data', {
                read_path: "/data",
                os.path.join(outDir, 'trimmed'): "/output"
            }
        ] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get time
    end = time.time()
    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished trimming in {runtime} seconds')
Example #15
def trim(readData, runCFG, threads=1, ids=''):

    #parameters
    minlength = runCFG['trimmomatic']['minlength']
    windowsize = runCFG['trimmomatic']['windowSize']
    qscore = runCFG['trimmomatic']['qscore']
    adapterpath = "/Trimmomatic-0.36/adapters/" + runCFG['trimmomatic'][
        'adaptersFileName']
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']

    #set up list of ids to trim
    if not ids:
        ids = readData.ids

    #generate commands for each trim job
    cmds = []
    #rmspacecmds = []
    for id in ids:
        #get read path
        if readData.reads[id]:
            read_path = os.path.dirname(os.path.abspath(
                readData.reads[id].fwd))
            read1_basename = os.path.basename(readData.reads[id].fwd)
            read2_basename = os.path.basename(readData.reads[id].rev)

        #main command
        outlogHeader = f"\"{id}\n-----------\n\""
        containerLogpath = os.path.join("/logfile", os.path.basename(logfile))
        main_cmd = f'bash -c \'printf {outlogHeader} >> {containerLogpath} && java -jar /tools/trimmomatic.jar '
        #regexExpression='s/(^@.*) (.*)/\\1_\\2/g'

        #determine args
        if runCFG['trimmomatic']['removeAdapters']:
            if runCFG['trimmomatic']['paired']:
                args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                readData.add_runtime(
                    'trimmed', id,
                    f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                    f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
                #rmspaces = f"bash -c \'sed -re \"{regexExpression}\" -i /output/{id}_trimmed_1P.fastq.gz && sed -re \"{regexExpression}\" -i /output/{id}_trimmed_2P.fastq\'"
            else:
                args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz ILLUMINACLIP:{adapterpath}:1:30:10 SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                readData.add_runtime(
                    'trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')
                #rmspaces = f"bash -c \'sed -re \"{regexExpression}\" -i /output/{id}_trimmed.fastq\'"
        else:
            if runCFG['trimmomatic']['paired']:
                args = f'PE {read1_basename} {read2_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                readData.add_runtime(
                    'trimmed', id,
                    f'{outDir}/trimmed/{id}_trimmed_1P.fastq.gz',
                    f'{outDir}/trimmed/{id}_trimmed_2P.fastq.gz')
                #rmspaces = f"bash -c \'sed -re \"{regexExpression}\" -i /output/{id}_trimmed_1P.fastq && sed -re \"{regexExpression}\" -i /output/{id}_trimmed_2P.fastq\'"
            else:
                args = f'SE {read1_basename} -baseout /output/{id}_trimmed.fastq.gz SLIDINGWINDOW:{windowsize}:{qscore} MINLEN:{minlength}\''
                readData.add_runtime(
                    'trimmed', id, f'{outDir}/trimmed/{id}_trimmed.fastq.gz')
                #rmspaces = f"bash -c \'sed -re \"{regexExpression}\" -i /output/{id}_trimmed.fastq\'"

        #prepare command and add to list
        sample_cmd = main_cmd + args
        #rmspacecmds.append(rmspaces)
        cmds.append(sample_cmd)
    #debug: print the generated trimmomatic commands once all samples have been processed
    for cmd in cmds:
        print(cmd)

    #make out dir if it doesn't already exist
    try:
        os.mkdir(os.path.join(outDir, 'trimmed'))
    except:
        pass

    #set up multiprocessing
    pool = mp.Pool(processes=threads)
    # pool2 = mp.Pool(processes=threads)
    #notify starting trimming
    procTitle("Started quality trimming", runCFG)

    #start timer
    start = time.time()

    #denote logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Trimmomatic\n')
        outlog.write('***********\n')
        #begin multiprocessing
        results = pool.starmap_async(cd.call, [[
            cmd, '/data', {
                read_path: "/data",
                os.path.join(outDir, 'trimmed'): "/output",
                os.path.dirname(logfile): "/logfile"
            }
        ] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write(stdout)
        # results = pool2.starmap_async(cd.call,[[cmd,'/data',{read_path:"/data",os.path.join(outDir,'trimmed'):"/output",os.path.dirname(logfile):"/logfile"}] for cmd in rmspacecmds])
        # pool.close()
        # pool.join()
        # stdouts = results.get()
        # for stdout in stdouts:
        # 	outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')

    #get time
    end = time.time()
    #determine runtime of processes
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'Sniffles: Finished trimming in {runtime}\n')