def mapping(runCFG,param_paths,outDir,threads='1'):
    """Map each sample's reads against its reference with bowtie2 in docker.

    Args:
        runCFG: parsed run-configuration dict; reads exec.outdir and exec.logfile.
        param_paths: iterable of per-sample tuples (id, read1, read2, reference);
            an empty read2 entry selects bowtie2's --interleaved mode.
        outDir: host directory that receives the sorted BAM files.
        threads: thread budget, split by cpu_count() into jobs x threads-per-job.

    Returns:
        List of paths to the sorted BAM files, one per entry in param_paths.
    """
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    #split the thread budget into number of parallel jobs and threads per job
    num_jobs,num_threads = cpu_count(threads)
    cmds = []
    read_path = ''
    ref_path = ''
    output_bam_list = []
    for param_path in param_paths:
        sample_id = param_path[0]
        read1 = os.path.basename(param_path[1])
        read2 = os.path.basename(param_path[2])
        #host-side directories that get bind-mounted into the container below
        read_path = os.path.dirname(os.path.abspath(param_path[1]))
        ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
        reference_sequence_name = os.path.basename(param_path[3])
        #check output folder exists
        checkexists(os.path.join(outDir))
        if read2 != '':
            #generate command for paired end
            cmd = f"bash -c \'bowtie2 -x {reference_sequence_name} -1 /reads/{read1} -2 /reads/{read2} -p {num_threads} --local | samtools view -bS | samtools sort -o /output/{sample_id}.bam\'"
        else:
            #generate command for interleaved
            cmd = f"bash -c \'bowtie2 -x {reference_sequence_name} --interleaved /reads/{read1} -p {num_threads} --local | samtools view -bS | samtools sort -o /output/{sample_id}.bam\'"
        cmds.append(cmd)
        #data for next stage
        output_bam_list.append(os.path.join(outDir,f'{sample_id}.bam'))
    #set up multiprocessing
    #start multiprocessing
    pool = mp.Pool(processes=num_jobs)
    #notify starting mapping
    procTitle('Mapping Reads')
    print('\nSniffles: Started mapping')
    #get start time
    start = time.time()
    #denote start of mapping in logs
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Mapping\n')
        #start multiprocessing: one container per sample, with the reference,
        #reads, and output directories bind-mounted
        results = pool.starmap_async(cd.call,[[cmd,'/reads',{ref_path:"/reference",read_path:"/reads",outDir:"/output"}] for cmd in cmds])
        #fix: close and join the pool so the worker processes are reaped
        #(previously the pool was leaked; sibling functions already do this)
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished mapping in {runtime} seconds')
    return output_bam_list
def removeDuplicates(runCFG, bam_files, threads='1'):
    """Remove PCR duplicates from each BAM with Picard MarkDuplicates.

    Args:
        runCFG: parsed run-configuration dict; reads exec.outdir and exec.logfile.
        bam_files: iterable of BAM paths; all are assumed to share one directory
            (the container mount uses the directory of the last file).
        threads: number of parallel Picard jobs to run.

    Returns:
        List of de-duplicated BAM paths under <outdir>/rm_dups.
    """
    #initial parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    checkexists(os.path.join(outDir, 'rm_dups'))
    outDir = os.path.join(outDir, 'rm_dups')
    #notify starting to remove duplicates
    procTitle('Remove Duplicates', runCFG)
    print('\nSniffles: Removing duplicate reads')
    #get time at start
    start = time.time()
    #generate commands
    cmds = []
    output_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #remove duplicate reads command; metrics file kept alongside the BAM
        cmd = f'java -Xmx2g -jar /tools/picard.jar MarkDuplicates I=/in_dir/{id}.bam O=/out_dir/{id}.bam REMOVE_DUPLICATES=true M=/out_dir/{id}.removeDupMetrics.txt'
        cmds.append(cmd)
        #add id to finished list
        output_list.append(os.path.join(outDir, f'{id}.bam'))
    #set up multiprocessing
    #fix: threads may arrive as a string (default '1'); Pool requires an int
    pool = mp.Pool(processes=int(threads))
    #denote start of remove duplicate reads in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Removing Duplicates\n')
        #start multiprocessing
        results = pool.starmap_async(
            cd.call,
            [[cmd, '/reads', {
                path: "/in_dir",
                outDir: "/out_dir"
            }] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get time at end
    end = time.time()
    #determine runtime of processes
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished removing duplicates in {runtime} seconds')
    return output_list
def average_depth(runCFG,bam_list,inDir,outDir):
    """Compute per-sample coverage with BBMap pileup.sh and filter BAMs.

    For each BAM, converts to SAM, runs pileup.sh against the reference, and
    parses "Average coverage" and "Percent of reference bases covered" from
    the tool output. Writes a Pass/Fail line per sample to average_depth.log.

    Returns:
        The subset of bam_list meeting the configured minimumAverageDepth and
        percentRefCovered thresholds.
    """
    #check that output folder exists
    checkexists(os.path.join(outDir))
    #setup inital parameters
    ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    #bam file list that will meet threashold
    filtered_bam_list = []
    #create logfile that will hold all average depths
    with open(os.path.join(outDir,'average_depth.log'),'w') as outdepth:
        #loop through each bam
        for bam in bam_list:
            filename = os.path.basename(bam)
            id = filename.split('.')[0]
            #open the log to log output
            with open(logfile,'a') as outlog:
                outlog.write('***********\n')
                outlog.write('Coverage\n')
                #generate command for the current bam file
                #fix: the input path contained a garbled placeholder; view the
                #current BAM, which is bind-mounted under /indata
                cmd = f'bash -c "samtools view /indata/{filename} > {id}.tmp.sam && /tools/bbmap/pileup.sh in={id}.tmp.sam out={id}_coverage.csv ref=/reference/{reference_sequence_name} && rm {id}.tmp.sam"'
                #use docker to run the command
                output = cd.call(cmd,'/outdata',{inDir:"/indata",outDir:"/outdata",ref_path:"/reference"})
                #record the output in the log
                outlog.write(output)
                #denote end of logs
                outlog.write('***********\n')
            #only add isolates that pass average depth and percent of reference covered
            percent_cov = 0
            avg_cov = 0
            #parse lines of stdout for info we need
            for line in output.splitlines():
                if "Percent of reference bases covered:" in line:
                    match = re.search('[0-9,.]+',line)
                    if match:
                        percent_cov = float(match[0])
                if "Average coverage:" in line:
                    match = re.search('[0-9,.]+',line)
                    if match:
                        avg_cov = float(match[0])
            #check against the config for min thresholds
            if avg_cov >= runCFG['exec']['minimumAverageDepth'] and percent_cov >= runCFG['exec']['percentRefCovered']:
                #record
                outdepth.write(f'{id},{avg_cov},{percent_cov},Pass\n')
                filtered_bam_list.append(bam)
            else:
                outdepth.write(f'{id},{avg_cov},{percent_cov},Fail\n')
    #return only bam files that meet coverage requirements
    return filtered_bam_list
def indexing(runCFG,*paths):
    """Build a bowtie2 index for each reference fasta.

    Every fasta in *paths is copied into <outdir>/ref_sequence and indexed
    there by a dockerised bowtie2-build call; tool output goes to the run log.
    """
    logfile = os.path.join(runCFG['exec']['outdir'],runCFG['exec']['logfile'])
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle('Indexing Reference Genome')
    for reference in paths:
        ref_abspath = os.path.abspath(reference)
        ref_name = os.path.basename(ref_abspath)
        #index reference (index files share the fasta's basename)
        cmd = f'bowtie2-build {ref_name} {ref_name}'
        with open(logfile,'a') as outlog:
            outlog.write("*************************\n")
            outlog.write("Bowtie2 indexing the reference\n")
            #stage the fasta next to where bowtie2-build writes its index files
            copyfile(ref_abspath,os.path.join(outDir,ref_name))
            outlog.write(cd.call(cmd,'/data',{outDir:"/data"}))
            outlog.write("*************************\n")
def indexing(runCFG, *paths):
    """Build a bowtie2 index for each reference fasta (quiet variant).

    Every fasta in *paths is copied into <outdir>/ref_sequence and indexed
    there by a dockerised bowtie2-build --quiet call; output goes to the run
    log. Note: runCFG['exec']['logfile'] is expected to already be an
    absolute path (the driver script rewrites it at startup).
    """
    logfile = runCFG['exec']['logfile']
    outDir = runCFG['exec']['outdir'] + '/ref_sequence'
    checkexists(outDir)
    procTitle("Indexing reference sequence", runCFG)
    for path in paths:
        reference_sequence_abspath = os.path.abspath(path)
        reference_sequence_name = os.path.basename(reference_sequence_abspath)
        #fix: removed a stray debug print(path) left over from development
        #index reference
        cmd = f'bowtie2-build {reference_sequence_name} {reference_sequence_name} --quiet'
        with open(logfile, 'a') as outlog:
            outlog.write("***********\n")
            outlog.write("Bowtie2 indexing the reference\n")
            #stage the fasta next to where bowtie2-build writes its index files
            copyfile(reference_sequence_abspath,
                     os.path.join(outDir, reference_sequence_name))
            outlog.write(cd.call(cmd, '/data', {outDir: "/data"}))
            outlog.write("***********\n")
#NOTE(review): this fragment is cut at both ends by the chunk boundary — the
#print_help/exit pair is presumably inside a no-arguments guard defined above,
#and the final except clause's body continues in a later chunk.
parser.print_help()
parser.exit()
args = parser.parse_args()
numThreads = args.t
configFile = args.c
#get start time
start = time.time()
#get input and output paths
try:
    outDir = os.path.abspath(args.o)
except (AttributeError, TypeError) as err:
    #no/invalid -o argument: fall back to the current working directory
    outDir = os.getcwd()
#check if output dir exists if not create it
sc.checkexists(outDir)
try:
    inDir = os.path.abspath(args.i)
except (AttributeError, TypeError) as err:
    #no/invalid -i argument: fall back to the current working directory
    inDir = os.getcwd()
#open config file and store configuation
#NOTE(review): yaml.load without an explicit Loader is deprecated and unsafe
#on untrusted input — consider yaml.safe_load for this config file
with open(configFile, 'r') as ymlFile:
    cfg = yaml.load(ymlFile)
#create outdir
cfg['exec']['outdir'] = os.path.join(outDir, cfg['exec']['outdir'])
try:
    os.mkdir(cfg['exec']['outdir'])
except FileExistsError:  # handler body continues in a later chunk of the file
def snpcaller(runCFG, bam_files, threads='1'):
    """Call SNPs across all samples at once with samtools mpileup + VarScan.

    Builds one multi-sample pileup from every BAM in bam_files, then runs
    VarScan mpileup2cns on it, writing all_snps.vcf into <outdir>/snp_calls.
    `threads` is currently unused (everything runs as one docker call).
    Returns None; results are on disk and in the run log.
    """
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    outDir = os.path.join(outDir, 'snp_calls')
    checkexists(outDir)
    #set reference sequence
    reference_sequence_path = runCFG['exec']['outdir'] + '/ref_sequence'
    reference_sequence_name = os.path.basename(
        runCFG['exec']['referenceSequence'])
    #starting time point
    start = time.time()
    procTitle('SNP Calling')
    print(f'\nSniffles: Started calling SNPs')
    bams = []
    sample_list = []
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        #NOTE: `path` is rebound to the directory of the *last* BAM; the
        #container mount below therefore assumes all BAMs share one directory
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/' + file_name)
    #generate mpileup (a single pileup containing one column per sample)
    cmd1 = 'bash -c \'samtools mpileup -ABR -d 1000000 {bams} -f /ref/{reference_sequence_name} > all.mpileup &&'.format(
        bams=' '.join(bams),
        reference_sequence_name=reference_sequence_name)
    #call snps
    snp_frequency = runCFG['snpcalling']['snpFrequency']
    min_cov = runCFG['snpcalling']['minCoverage']
    snp_qual_threshold = runCFG['snpcalling']['snpQualityThreshold']
    #the <(echo -e ...) process substitution supplies the VCF sample names;
    #it requires bash, hence the surrounding bash -c wrapper
    cmd2 = 'java -jar /tools/varscan.jar mpileup2cns all.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{samples}") > all_snps.vcf\''.format(
        min_cov=min_cov,
        snp_qual_threshold=snp_qual_threshold,
        snp_frequency=snp_frequency,
        samples='\n'.join(sample_list))
    #add commands to list for multiprocessing
    cmd = cmd1 + cmd2
    #future code block for annotating aa changes
    #if runCFG['exec']['annotateAAChanges']:
    #pass
    #TODO add annotater for annotating aa changes
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')
        #single dockerised run: reference, BAM directory, and output mounted
        results = cd.call(cmd, '/outfile', {
            reference_sequence_path: "/ref",
            path: "/infile",
            outDir: "/outfile"
        })
        outlog.write('-----------\n')
        outlog.write(results)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start, 2)
    print(f'\nSniffles: Finished calling snps in {runtime} seconds')
def consensus(runCFG,bam_list,threads='1'):
    """Generate a consensus VCF and consensus fasta per sample.

    Stage 1: samtools mpileup + VarScan mpileup2cns produce a consensus VCF
    per BAM. Empty VCFs are discarded. Stage 2: each surviving VCF is
    bgzipped, tabix-indexed, and applied to the reference with bcftools
    consensus to yield a per-sample fasta in <outdir>/consensus.

    Args:
        runCFG: parsed run-configuration dict.
        bam_list: BAM paths; assumed to share one directory (last wins for
            the container mount).
        threads: number of parallel jobs.

    Returns:
        List of consensus fasta paths.
    """
    #inital parameters
    outDir = runCFG['exec']['outdir']
    logfile = os.path.join(outDir,runCFG['exec']['logfile'])
    outDir = os.path.join(outDir,'consensus')
    checkexists(outDir)
    #notify starting mapping
    procTitle('Generate Consensus')
    print('\nSniffles: Started generating consensus vcf')
    #get start time
    overall_start = time.time()
    start = time.time()
    #set reference sequence
    reference_sequence_abspath = os.path.abspath(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(reference_sequence_abspath)
    reference_sequence_dir = runCFG['exec']['outdir'] + '/ref_sequence'
    #command list
    cmds = []
    vcf_list = []
    for path in bam_list:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #run varscan mpileup2cns to generate vcf with consensus information
        minCov = runCFG['snpcalling']['minCoverage']
        quality = runCFG['snpcalling']['snpQualityThreshold']
        freq = runCFG['snpcalling']['consensusFrequency']
        #make multiway pileup using samtools
        cmd1 = f'bash -c \'samtools mpileup -ABd 1000000 /infile/{file_name} -f /ref/{reference_sequence_name} -o {id}.pileup && '
        cmd2 = f'java -jar /tools/varscan.jar mpileup2cns {id}.pileup --min-coverage {minCov} --min-avg-qual {quality} --min-var-freq {freq} --strand-filter 1 --output-vcf 1 > {id}.vcf\''
        cmds.append(cmd1 + cmd2)
        vcf_list.append(os.path.join(outDir,f'{id}.vcf'))
    #setup multiprocessing
    #fix: threads may arrive as a string (default '1'); Pool requires an int
    pool = mp.Pool(processes=int(threads))
    #start multiprocessing
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Generating Consensus\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/outfile',{reference_sequence_dir:"/ref",path:"/infile",outDir:"/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #check if vcf file is empty, if it is skip id and remove vcf file
    filtered_vcf_list = []
    for path in vcf_list:
        try:
            if os.path.getsize(path)>0:
                filtered_vcf_list.append(path)
            else:
                os.remove(path)
        except OSError:
            #fix: narrowed from a bare except — getsize/remove only raise
            #OSError (e.g. the vcf was never produced); skip those quietly
            pass
    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished generating the consensus vcf in {runtime} seconds')
    start = time.time()
    print(f'\nSniffles: Generating consensus fasta')
    #command list for compressing files
    cmds = []
    out_fasta = []
    for vcf in filtered_vcf_list:
        full_path = os.path.abspath(vcf)
        file_name = os.path.basename(full_path)
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        #compress vcf file with bgzip, index it, and apply it to the reference
        cmd = f'bash -c \'bgzip {id}.vcf && tabix {id}.vcf.gz && bcftools consensus -f /ref/{reference_sequence_name} {id}.vcf.gz -o {id}.fasta\''
        out_fasta.append(os.path.join(outDir,f'{id}.fasta'))
        cmds.append(cmd)
    #start multiprocessing
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Creating consensus Fasta\n')
        #start multiprocessing
        results = pool.starmap_async(cd.call,[[cmd,'/outfile',{reference_sequence_dir:"/ref",outDir:"/outfile"}] for cmd in cmds])
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #fix: close and join the pool so worker processes are reaped (was leaked)
    pool.close()
    pool.join()
    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished generating consensus fasta in {runtime} seconds')
    #determine runtime of processes
    end = time.time()
    runtime = round(end - overall_start,2)
    print(f'\nSniffles: Finished generating consensus sequence in {runtime} seconds')
    return out_fasta
#get input path try: inDir = os.path.abspath(args.i) except (AttributeError, TypeError) as err: inDir = os.getcwd() print(f"Raw reads directory {args.i} cannot be found. Sniffles will look for fasta files in the current working directory {inDir}.\n") #create outdir try: outDir = os.path.abspath(args.o) except (AttributeError, TypeError) as err: outDir = os.getcwd() print(f"Output directory {args.o} cannot be found. Output will be placed in a separate folder in the current working directory {outDir}.\n") sc.checkexists(outDir) cfg['exec']['outdir'] = os.path.join(outDir,cfg['exec']['outdir']) try: os.mkdir(cfg['exec']['outdir']) except FileExistsError: cfg['exec']['outdir'] = cfg['exec']['outdir']+'_'+str(int(time.time())) os.mkdir(cfg['exec']['outdir']) outDir = cfg['exec']['outdir'] logfile=os.path.join(outDir,cfg['exec']['logfile']) cfg['exec']['logfile'] = logfile startRunMessage = f"Beginning run at {strftime('%a, %d %b %Y %I:%M:%S %p', time.localtime())}" sc.procTitle(startRunMessage, cfg)
def normCoverage(runCFG,bam_files,threads='1'):
    """Downsample mapped reads to a fixed total with seqtk.

    For each BAM, the mapped pairs are extracted with samtools fastq and
    subsampled to runCFG['exec']['totalReads'] reads per mate; intermediate
    *_mapped_* fastq files are removed afterwards.

    NOTE: normalizing with bbnorm uses all available memory, thus can only
    be run serially; `threads` is accepted for interface parity but unused.

    Returns:
        List of [R1, R2] fastq path pairs under <outdir>/normalized.
    """
    run_out = runCFG['exec']['outdir']
    checkexists(os.path.join(run_out,'normalized'))
    logfile = os.path.join(run_out,runCFG['exec']['logfile'])
    outDir = os.path.join(run_out,'normalized')
    #announce the stage
    procTitle('Normalize Coverage')
    print('\nSniffles: Normalizing read coverage')
    start = time.time()
    output_list = []
    with open(logfile,'a') as outlog:
        outlog.write('********************\n')
        outlog.write('Normalizing coverage\n')
        #one serial docker call per bam file
        for bam in bam_files:
            bam_abspath = os.path.abspath(bam)
            bam_name = os.path.basename(bam_abspath)
            bam_dir = os.path.dirname(bam_abspath)
            sample = bam_name.split('.')[0]
            #extract the mapped read pairs from the bam...
            extract_cmd = f'bash -c \'samtools fastq /bam_files/{sample}.bam -1 /out_dir/{sample}_mapped_1.fastq -2 /out_dir/{sample}_mapped_2.fastq && '
            total_reads = runCFG['exec']['totalReads']
            #...then subsample both mates with the same seed (-s100) so the
            #pairing stays in sync
            subsample_cmd = f'seqtk sample -s100 /out_dir/{sample}_mapped_1.fastq {total_reads} > {sample}_1.fastq && seqtk sample -s100 /out_dir/{sample}_mapped_2.fastq {total_reads} > {sample}_2.fastq\''
            outlog.write(f'{sample}-----------\n')
            stdout=cd.call(extract_cmd+subsample_cmd,'/out_dir',{bam_dir:"/bam_files",outDir:"/out_dir"})
            outlog.write(stdout)
            outlog.write(f'-----------\n')
            output_list.append([os.path.join(outDir,f'{sample}_1.fastq'),os.path.join(outDir,f'{sample}_2.fastq')])
            #best-effort cleanup of the intermediate fastq files
            for leftover in (f'{outDir}/{sample}_mapped_1.fastq',f'{outDir}/{sample}_mapped_2.fastq'):
                try:
                    os.remove(leftover)
                except:
                    pass
        outlog.write('********************\n')
    end = time.time()
    runtime = round(end - start,2)
    print(f'\nSniffles: Finished normalizing read coverage in {runtime} seconds')
    return output_list
def normCoverage(runCFG, bam_files, threads='1'):
    """Downsample mapped reads with seqtk to normalize coverage (v2).

    Extracts mapped pairs (and, when exec.unpaired is set, singletons) from
    each BAM with samtools fastq and subsamples them to exec.totalReads reads
    per file. `threads` is unused; each sample runs as one serial docker call.

    Returns:
        List of fastq path lists per sample: [R1, R2] or [R1, R2, U].
    """
    #initial parameters
    outDir = runCFG['exec']['outdir']
    checkexists(os.path.join(outDir, 'normalized'))
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir, 'normalized')
    #notify starting to remove duplicates
    procTitle("Downsampling with seqtk to normalize coverage", runCFG)
    #print('\n-----------------------Sniffles: Downsampling with seqtk to normalize coverage-----------------------')
    #get time at start
    start = time.time()
    #denote start of remove duplicate reads in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Downsampling with seqtk to normalize coverage\n')
        #run normalization
        output_list = []
        for path in bam_files:
            full_path = os.path.abspath(path)
            file_name = os.path.basename(full_path)
            path = os.path.dirname(full_path)
            id = file_name.split('.')[0]
            #get reads from mapped bamfile
            #NOTE(review): samtools collate writes to the "collating" prefix
            #but the following samtools fastq reads the *original* bam, so the
            #collated output appears unused — confirm intended input
            cmd_get_reads = f'bash -c \'samtools collate /bam_files/{id}.bam collating && samtools fastq -n /bam_files/{id}.bam -1 /out_dir/{id}_mapped_1.fastq.gz -2 /out_dir/{id}_mapped_2.fastq.gz'
            #run seqtk to subsample reads
            total_reads = runCFG['exec']['totalReads']
            #NOTE(review): seqtk writes plain fastq to stdout; redirecting to
            #*.fastq.gz names produces uncompressed files with a .gz suffix.
            #Also, the first redirect lacks the /out_dir/ prefix and relies on
            #the container working directory being /out_dir.
            cmd_normalization = f' && seqtk sample -s100 /out_dir/{id}_mapped_1.fastq.gz {total_reads} > {id}_1.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_2.fastq.gz {total_reads} > /out_dir/{id}_2.fastq.gz\''
            if runCFG['exec']['unpaired']:
                #also dump and subsample the unpaired/singleton reads (-0)
                cmd_get_reads += f' -0 /out_dir/{id}_mapped_U.fastq.gz && seqtk sample -s100 /out_dir/{id}_mapped_U.fastq.gz {total_reads} > /out_dir/{id}_U.fastq.gz'
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz'),
                    os.path.join(outDir, f'{id}_U.fastq.gz')
                ])
            else:
                output_list.append([
                    os.path.join(outDir, f'{id}_1.fastq.gz'),
                    os.path.join(outDir, f'{id}_2.fastq.gz')
                ])
            #start docker containers and run
            outlog.write(f'{id}\n-----------\n')
            stdout = cd.call(cmd_get_reads + cmd_normalization, '/out_dir', {
                path: "/bam_files",
                outDir: "/out_dir"
            })
            outlog.write(stdout)
            #cleanup of intermediate pre-subsampling fastq files (best effort)
            try:
                os.remove(f'{outDir}/{id}_mapped_1.fastq.gz')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_2.fastq.gz')
            except:
                pass
            try:
                os.remove(f'{outDir}/{id}_mapped_U.fastq.gz')
            except:
                pass
        outlog.write('***********\n')
    #get time at end
    end = time.time()
    #determine runtime of processes
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished normalizing read coverage in {runtime}')
    return output_list
def VCFannotator(runCFG, vcffiles):
    """Annotate amino-acid changes onto VarScan VCFs using a GTF + reference.

    Parses CDS coordinates from the configured GTF, rebuilds each gene's
    transcript from the reference fasta, then for every variant in every VCF
    computes the reference/variant codon and amino-acid change. Each VCF is
    written back out as <sample>.annotated_vcf (tab-separated) in
    <outdir>/vcf_annotations.

    Args:
        runCFG: parsed run-configuration dict (reads referenceSequence,
            mapToConsensus, and postprocessing.gtfFileName).
        vcffiles: either a directory containing .vcf files or a list of paths.

    Returns:
        The (mutated in place) vcffiles list, entries replaced with the
        annotated output filenames.
    """
    # read in reference sequences and gtfs and store information about protein sequences
    # create a dictionary of gene names and start/stop sites, allowing for more than one start/stop site.
    #import file location parameters from config file
    outDir = os.path.join(runCFG['exec']['outdir'], 'vcf_annotations')
    checkexists(outDir)
    logfile = os.path.join(outDir, runCFG['exec']['logfile'])
    refseqfasta = runCFG['exec']['referenceSequence']
    refseqname = refseqfasta.split(".")[0]
    if runCFG['exec']['mapToConsensus']:
        #when mapping to consensus, the working reference lives in the run's
        #ref_sequence directory rather than at the configured path
        refseqfasta = os.path.join(runCFG['exec']['outdir'], 'ref_sequence',
                                   refseqfasta)
    #get start time
    start1 = time.time()
    procTitle('Annotating SNPs', runCFG)
    #Extract coding sequence coordinates from gtf files:
    coding_regions = {
    }  #will be dictionary of dictionaryies (format segment:gene:[[startExon1, stopExon1], [startExon2, stopExon2]])
    with open(runCFG['postprocessing']['gtfFileName'], "r") as gtf:
        for line in gtf:
            if line.strip(
                    "\n") != "":  # ignore blank lines (otherwise throws an index error)
                #"/" is replaced everywhere to match the same replacement done
                #on fasta ids and VCF chromosome names below
                line = line.replace("/", "_")
                lineitems = line.split("\t")
                segment_name = lineitems[0]
                annotation_type = lineitems[2]
                start = int(
                    lineitems[3]) - 1  # adding the -1 here for 0 indexing
                stop = int(
                    lineitems[4]) - 1  # adding the -1 here for 0 indexing
                #strip the gene id out of the attributes column (column 9)
                gene_name = lineitems[8]
                gene_name = gene_name.split(";")[0]
                gene_name = gene_name.replace("gene_id ", "")
                gene_name = gene_name.replace("\"", "")
                if annotation_type.lower() == "cds":
                    #first exon of the first gene on a new segment
                    if segment_name not in coding_regions:
                        coding_regions[segment_name] = {}
                        coding_regions[segment_name][gene_name] = [[
                            start, stop
                        ]]
                    #first exon of a new gene on a known segment
                    elif segment_name in coding_regions and gene_name not in coding_regions[
                            segment_name]:
                        coding_regions[segment_name][gene_name] = [[
                            start, stop
                        ]]
                    #additional exon of a known gene
                    elif gene_name in coding_regions[segment_name]:
                        coding_regions[segment_name][gene_name].append(
                            [start, stop])
    # pull in reference fasta file, separate gene segments into a dictionary
    ref_segments = {}
    for seq in SeqIO.parse(refseqfasta, "fasta"):
        refseqname = str(seq.id).replace("/", "_")
        sequence = str(seq.seq).lower()
        ref_segments[refseqname] = sequence
    # use gene coordinates to create coding sequences from reference sequences
    transcripts = {}
    #Reminder of current data structures:
    #coding_regions[segment][gene]:coordinates of genes
    #ref_segments[nameofsegment]:sequence
    for segment in coding_regions:
        for gene in coding_regions[segment]:
            transcripts[gene] = ""
            coordinates = coding_regions[segment][
                gene]  # define the coding regions for each gene
            for start, stop in coordinates:  # loop through start/stop sites in coding regions
                sequence_chunk = ref_segments[segment][start:stop + 1]
                transcripts[gene] = transcripts[
                    gene] + sequence_chunk  # append each piece of the transcript together
    # loop through each transcript to make sure that it begins with a start codon and ends with a stop codon
    #for t in transcripts:
    #if transcripts[t][0:3] != start_codon:
    #print("WARNING! " + refseqname + " " + t + " does not contain a start codon! The first three nucleotides are " + transcripts[t][0:3])
    #if transcripts[t][-3:] not in stop_codons:
    #print("WARNING! " + refseqname + " " + t + " does not contain a stop codon! These are the last 3 nucleotides: " + transcripts[t][-3:])
    print(vcffiles)
    #accept either a directory of vcfs or an explicit list of vcf paths
    if os.path.isdir(vcffiles):
        vcffiles = glob.glob(vcffiles + "/*.vcf")
    elif type(vcffiles) == list:
        if vcffiles[0].split(".")[-1] == "vcf":
            pass
        else:
            print("vcffiles has no vcf files!")
    listofmutstoExport = []
    ##Loop through each vcf file and annotate amino acid changes
    print(vcffiles)
    for i, vcfname in tqdm(enumerate(vcffiles)):
        #find the header row (#CHROM) so pandas can skip the VCF metadata
        with open(vcfname, "r") as TextVCF:
            for index, line in enumerate(TextVCF, 0):
                if "#CHROM" in line:
                    rowstoskip = index
        #Reads the vcf file into a pandas DataFrame
        print(vcfname)
        try:
            vcfDF = pd.read_csv(vcfname, sep='\t', skiprows=rowstoskip)
        except OSError as inst:
            print("\n" + vcfname +
                  " did not open appropriately. Please check file.\n")
        #fix the / in chrome bug
        vcfDF["#CHROM"] = vcfDF["#CHROM"].str.replace("/", "_")
        #extract frequencies for list of muts to export:
        #In order to make this easier, I'm going to assume each VCF has only one sample. This code DOES NOT WORK for more than one sample per VCF.
        #print(vcfDF.iloc[:,-1].str.split(":").str[6].str.rstrip('%').astype('float')/100)
        freqlocation = vcfDF.loc[0, "FORMAT"].split(":").index("FREQ")
        try:
            #FREQ may already be numeric...
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(
                ":").str[freqlocation].astype('float')
        except ValueError:
            #...or be a percentage string like "12.5%"
            vcfDF["FREQ"] = vcfDF.iloc[:, -1].str.split(
                ":").str[freqlocation].str.rstrip('%').astype('float') / 100
        except:
            raise
        listofmuts = []
        #loop through each line in vcfDF, extract chrom, pos, reference nucleotide, alternate nucleotide
        for chrom, pos, ref, alt, freq in zip(vcfDF['#CHROM'], vcfDF["POS"],
                                              vcfDF["REF"].str.lower(),
                                              vcfDF["ALT"].str.lower(),
                                              vcfDF["FREQ"]):
            pos -= 1  #subtract one from position to convert from VCF's 1 indexing to python's 0
            for gene in coding_regions[chrom].keys(
            ):  #loop through each gene potentially applicable to that position (i.e., all on chromosome)
                priorExonLength = 0
                #print (gene)
                for start, stop in coding_regions[chrom][
                        gene]:  #loop through each exon of gene
                    #if pos in exon, calculate codon, reference aa, and variant aa
                    #print (f"pos: {pos} start: {start} stop: {stop}")
                    #NOTE(review): range(start, stop) excludes `stop`, yet the
                    #transcript slice above uses stop+1 — a variant exactly at
                    #the exon's last base may be skipped; confirm intent
                    if pos in range(start, stop):
                        #print ('is in range, annotating.')
                        within_gene_position = pos - start + priorExonLength  #within gene position is the position in this exon (pos-startOfExon), plus the length of any prior exons (exonstart)
                        codon_pos = (within_gene_position % 3)
                        #splice the ALT base into the transcript, then cut the
                        #surrounding codon out of both versions
                        alternatetranscript = transcripts[
                            gene][:within_gene_position] + alt + transcripts[
                                gene][within_gene_position + 1:]
                        codon = transcripts[gene][(
                            within_gene_position - codon_pos):(
                                within_gene_position + (3 - codon_pos))]
                        variantcodon = alternatetranscript[(
                            within_gene_position - codon_pos):(
                                within_gene_position + (3 - codon_pos))]
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)
                        #Catch errors in annotation calculations where the math results in an incorrect codon
                        if codon[codon_pos] != ref:
                            print(
                                "Something's quite wrong here. The reference SNP is not what it should be."
                            )
                            print(f"\n\nchrom: {chrom}, gene: {gene}")
                            print(
                                f"\npos: {pos} within_gene_position: {within_gene_position}\ncodon_pos: {codon_pos} codon: {codon} variantcodon: {variantcodon}\n\n"
                            )
                            print(
                                f"ref: {ref} alt: {alt} ref_aa: {ref_aa} \nvariant_aa: {variant_aa}\n aa_num: {aa_num}\n"
                            )
                            print(transcripts[gene])
                        ref_aa = Seq(codon).translate()
                        variant_aa = Seq(variantcodon).translate()
                        aa_num = str(int(within_gene_position / 3) + 1)
                        if ref_aa != variant_aa:
                            #non-synonymous change, e.g. "D123N"
                            listofmuts.append([
                                chrom, gene, pos + 1,
                                str(ref_aa + aa_num + variant_aa)
                            ])
                            #only export intermediate-frequency variants
                            if freq > 0.01 and freq < 0.99:
                                listofmutstoExport.append({
                                    "segment": chrom,
                                    'gene': gene,
                                    'position': pos + 1,
                                    'frequency': float(freq),
                                    'AAchange':
                                    str(ref_aa + aa_num + variant_aa)
                                })
                        elif ref_aa == variant_aa:
                            #synonymous change
                            listofmuts.append([chrom, gene, pos + 1, "."])
                        break  #if pos is in exon, stop looping though exons
                    else:
                        priorExonLength += (
                            stop + 1 - start
                        )  #The next exon will begin after the length of this exon, i.e., after the stop point minus the start point
                #else statement only executed if for loop finishes without breaking (ie if pos is never within the gene being examined)
                else:
                    listofmuts.append([chrom, gene, pos + 1, "not in ORF"])
                    continue  #continue onto next gene
        #merge the computed annotations back onto the VCF rows by position
        AAchange = pd.DataFrame(listofmuts,
                                columns=['#CHROM', 'gene', 'POS', 'AAchange'])
        vcfDF = vcfDF.merge(AAchange, how='left', on=['#CHROM', 'POS'])
        vcfDF['gene'] = vcfDF['gene'].astype(str)
        vcfDF['gene'] = vcfDF['gene'].replace("NA", "NA gene")
        annotatedVCFname = os.path.basename(vcfname).split(
            ".")[0] + ".annotated_vcf"
        outputfile = vcfDF.to_csv(os.path.join(outDir, annotatedVCFname),
                                  sep="\t",
                                  index=None,
                                  header=True)
        vcffiles[i] = annotatedVCFname
    #importantMuts = pd.DataFrame(listofmutstoExport)
    #print (importantMuts)
    #importantMuts = importantMuts.groupby(['segment', 'gene', 'position', 'AAchange'], as_index=False).mean()
    #importantMuts = importantMuts.loc[importantMuts['freq']>0.02 & importantMuts['freq']<0.98]
    #importantMuts['freq'] = importantMuts['freq']/len(vcffiles)
    #importantMutsexport = importantMuts.to_csv(os.path.join(outDir, "allMutationsPresent.tsv"), sep = '\t', index=None, header=True)
    #get end time
    end = time.time()
    #get total runtime
    runtimeSeconds = end - start1
    runtime = datetime.timedelta(seconds=runtimeSeconds)
    print(f'\nSniffles: Finished annotating snps in {str(runtime)}')
    return (vcffiles)
def snpcaller(runCFG,bam_files,threads='1'):
    """Call per-sample SNPs with samtools mpileup + VarScan, in parallel (v2).

    Each BAM gets its own mpileup -> VarScan mpileup2snp -> bcftools norm
    pipeline run inside docker. When exec.replicates is set, replicate VCFs
    are grouped by a key parsed from the filename (via exec.replicateNotation)
    and merged/averaged by VCFaverager.

    Returns:
        List of per-sample VCF paths (or the VCFaverager output when
        replicates are enabled).
    """
    #set parameters
    outDir = runCFG['exec']['outdir']
    logfile = runCFG['exec']['logfile']
    outDir = os.path.join(outDir,'snp_calls')
    checkexists(outDir)
    #set reference sequence
    reference_sequence_path = os.path.dirname(runCFG['exec']['referenceSequence'])
    reference_sequence_name = os.path.basename(runCFG['exec']['referenceSequence'])
    #starting time point
    start = time.time()
    if runCFG['exec']['replicates']:
        message = 'Calling replicate SNPs with Varscan'
    else:
        message = 'Calling SNPs with Varscan'
    procTitle(message, runCFG)
    bams = []
    sample_list = []
    listofVCFs = []
    repDict = {}
    #Create list of bam files to call SNPs on
    for path in bam_files:
        full_path = os.path.abspath(path)
        file_name = os.path.basename(full_path)
        #NOTE: `path` ends up as the directory of the last BAM; the container
        #mount below assumes all BAMs share one directory
        path = os.path.dirname(full_path)
        id = file_name.split(".")[0]
        sample_list.append(id)
        bams.append('/infile/'+file_name)
        #if processing replicate runs, create dictionary samplename:[list of replicate vcf files for that sample]
        #this dictionary will be used later to merge and average replicate vcfs
        if runCFG['exec']['replicates']:
            #derive the literal text surrounding the sample number from the
            #replicateNotation pattern, then slice the sample key out of the
            #filename between those two anchors
            repBreakdown = runCFG['exec']['replicateNotation'].split("_")
            repBreakdown = "_".join(repBreakdown[:-1])
            repBreakdown = repBreakdown[:repBreakdown.find(r"\d")]
            repBreakdown = repBreakdown.split("Sample")
            repKey = file_name[file_name.find(repBreakdown[0])+len(repBreakdown[0]):file_name.find(repBreakdown[1])]
            vcf_name = (id+".vcf")
            listofVCFs.append(vcf_name)
            if repKey not in repDict.keys():
                repDict[repKey] = [vcf_name]
            else:
                repDict[repKey].append(vcf_name)
    #import SNP calling quality parameters from config file
    snp_frequency=runCFG['snpcalling']['snpFrequency']
    min_cov=runCFG['snpcalling']['minCoverage']
    snp_qual_threshold=runCFG['snpcalling']['snpQualityThreshold']
    #generate commands to call variants
    cmds=[]
    for bam, sample in zip(bams, sample_list):
        #mpileup command (prefixed by a printf that tags this sample's section
        #in the shared logfile, since parallel jobs interleave their output)
        outlogHeader = f"{bam.split('/')[-1].split('.')[0]}\n-----------\n"
        cmd1 = f'printf \"{outlogHeader}\" >> {os.path.join("/logfile", os.path.basename(logfile))} && samtools mpileup -ABR -d 1000000 {bam} -f /ref/{reference_sequence_name} > {sample}.mpileup'
        #varscan command (<(echo -e ...) requires bash, hence bash -c below)
        cmd2 = f'java -jar /tools/varscan.jar mpileup2snp {sample}.mpileup --min-coverage {min_cov} --min-avg-qual {snp_qual_threshold} --min-var-freq {snp_frequency} --strand-filter 1 --output-vcf 1 --variants --vcf-sample-list <(echo -e "{sample}") > {sample}_temp.vcf'
        #compress and normalize vcf
        cmd3 = f'bcftools norm -c sw -m - -f /ref/{reference_sequence_name} -o {sample}.vcf {sample}_temp.vcf && rm {sample}_temp.vcf'
        if not runCFG['exec']['replicates']:
            listofVCFs.append(os.path.join(outDir, f"{sample}.vcf"))
        #add commands to list for multiprocessing
        cmds.append("bash -c \'" + cmd1 + " && " + cmd2 + " && " + cmd3 + "\'")
    #initialize multiprocessing pool
    #NOTE(review): `threads` defaults to the string '1'; mp.Pool needs an int —
    #callers appear to pass an int, but the default would raise. Confirm.
    pool = mp.Pool(processes=threads)
    #open logfile
    with open(logfile,'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Calling SNPs\n')
        #run commands with mutliprocessing
        results = pool.starmap_async(cd.call,[[cmd, '/outfile',{reference_sequence_path:"/ref",path:"/infile",outDir:"/outfile",os.path.dirname(logfile):"/logfile"}] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        print ('finished all results')
        for stdout in stdouts:
            #outlog.write('-----------\n')
            outlog.write(stdout)
        #if processing duplicate runs, merge and average SNP calls
        if runCFG['exec']['replicates']:
            listofVCFs = (VCFaverager(runCFG, repDict, listofVCFs))
            outlog.write(str(listofVCFs))
            outlog.write('-----------\n')
        #Combine sample vcfs into one master VCF:
        #allSNPs = VCFcombiner(runCFG, listofVCFs, "allVarscanSNVs.vcf")
        #outlog.write(f"\nCombined all vcf files into master vcf file allVarscanSNVs.vcf\n")
        outlog.write('-----------\n')
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start,2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished calling snps in {runtime}')
    return (listofVCFs)
def mapping(runCFG, param_paths, outDir, threads='8'):
    """Map reads with bowtie2 in docker, with optional unpaired handling (v2).

    For each (id, read1, read2, reference[, unpaired]) tuple, builds a shell
    pipeline that maps the paired (or interleaved) reads and, when
    exec.unpaired is set, additionally maps the unpaired reads and merges
    both BAMs into <id>.bam. Per-sample bowtie2 stats go to
    /logfile/<id>mappingstats.log.

    Returns:
        List of merged/sorted BAM paths, one per entry in param_paths.
    """
    seed = 0  # fixed bowtie2 --seed for reproducible mapping
    start = time.time()
    #assert len(param_paths) > 0, "Cannot map reads: No reads provided"
    print(param_paths)
    logfile = runCFG['exec']['logfile']
    #split the thread budget into number of parallel jobs and threads per job
    num_jobs, num_threads = cpu_count(threads)
    cmds = []
    read_path = ''
    ref_path = ''
    output_bam_list = []
    for param_path in param_paths:
        id = param_path[0]
        read1 = os.path.basename(param_path[1])
        read2 = os.path.basename(param_path[2])
        #optional 5th tuple element: a pre-existing unpaired-reads file
        try:
            unpairedReads = os.path.basename(param_path[4])
        except:
            unpairedReads = ""
        print(f"unpairedReads for id {id}: " + unpairedReads)
        #fallback unpaired filenames derived from the mate names, e.g.
        #"<sample>_1.fastq.gz" -> "<sample>_U.fastq.gz"
        read1un = read1.split(".")[0][0:-1] + "U.fastq.gz"
        read2un = read2.split(".")[0][0:-1] + "U.fastq.gz"
        read_path = os.path.dirname(os.path.abspath(param_path[1]))
        ref_path = runCFG['exec']['outdir'] + '/ref_sequence'
        reference_sequence_name = os.path.basename(param_path[3])
        #quoted header that printf writes to tag this sample's log section
        outlogHeader = f"\"{id}\n-----------\n\""
        #check output folder exists
        checkexists(os.path.join(outDir))
        logfilepath = os.path.join("/logfile", os.path.basename(logfile))
        checkexists(os.path.join(outDir, "unmapped"))
        #NOTE: cmd is left WITHOUT its output suffix/closing quote here; the
        #branches below append either "_paired.bam && ..." or ".bam'"
        if read2 != '':
            #generate command for paired end
            cmd = f"bash -c \'printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -1 /reads/{read1} -2 /reads/{read2} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}"
        else:
            #generate command for interleaved
            cmd = f"bash -c \'printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} --interleaved /reads/{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}"
        if runCFG['exec']['unpaired']:
            #map the unpaired reads separately, then merge with the paired BAM
            if unpairedReads != "":
                cmdUn = f"_paired.bam && printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -U /reads/{unpairedReads} --un /output/unmapped/U_{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}_unpaired.bam"
            elif read2 != '':
                #generate command for unpaired
                cmdUn = f"_paired.bam && printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -U /reads/{read1un},/reads/{read2un} --un /output/unmapped/U_{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}_unpaired.bam"
            else:
                #generate command for interleaved
                cmdUn = f"_paired.bam && printf {outlogHeader} >> {logfilepath} && bowtie2 -x {reference_sequence_name} -U /reads/{read1un} --un /output/unmapped/U_{read1} --seed {seed} --no-unal --local 2>> /logfile/{id}mappingstats.log | samtools view -bS | samtools sort -o /output/{id}_unpaired.bam"
            mergecmd = f" && samtools merge /output/{id}.bam /output/{id}_unpaired.bam /output/{id}_paired.bam && samtools index /output/{id}.bam\'"
            cmds.append(cmd + cmdUn + mergecmd)
        else:
            cmds.append(cmd + ".bam\'")
        #data for next stage
        output_bam_list.append(os.path.join(outDir, f'{id}.bam'))
    #set up multiprocessing
    pool = mp.Pool(processes=num_jobs)
    #get start time
    start = time.time()
    #denote start of mapping in logs
    with open(logfile, 'a') as outlog:
        outlog.write('***********\n')
        outlog.write('Mapping\n')
        #start multiprocessing: one container per sample with reference,
        #reads, output, and log directories bind-mounted
        results = pool.starmap_async(cd.call, [[
            cmd, '/reads', {
                ref_path: "/reference",
                read_path: "/reads",
                outDir: "/output",
                os.path.dirname(logfile): "/logfile"
            }
        ] for cmd in cmds])
        pool.close()
        pool.join()
        stdouts = results.get()
        for stdout in stdouts:
            outlog.write('-----------\n')
            outlog.write(stdout)
        #denote end of logs
        outlog.write('***********\n')
    #get end time
    end = time.time()
    #get total runtime
    runtime = round(end - start, 2)
    runtime = str(datetime.timedelta(seconds=runtime))
    print(f'\nSniffles: Finished mapping in {runtime}')
    return output_bam_list