def prepare_vari(workdir, snpEff, snpSift, email, genome, genes):
    """
    Prepare the provean input files for one folder of VCF results.

    The first ``*.filter.vcf`` file found in ``workdir`` is annotated with
    snpEff (unless the annotated file already exists), then, for every entry
    of ``genes``, filtered with snpSift down to HIGH/MODERATE impact variants
    and handed to ``vari_input4provean``.

    * workdir: the folder that has the vcf files
    * snpEff: path to snpEff
    * snpSift: path to snpSift
    * email: address passed to Message() for failure notifications
    * genome: genome name defined in snpEff
    * genes: a list of gene symbols; an empty string means "do not restrict
      the filter to a gene"

    Returns None.

    Raises IndexError when ``workdir`` holds no ``*.filter.vcf`` file, and
    re-raises whatever ``vari_input4provean`` raises.  A snpSift failure is
    reported and that gene is skipped.
    """
    vcfFiles = glob.glob(workdir + '/*.filter.vcf')
    if not vcfFiles:
        # Same exception type as the former vcfFiles[0], but with a message.
        raise IndexError('no *.filter.vcf file found in %s' % (workdir,))
    vcfFile = vcfFiles[0]

    #============= 1. Annotate vcf results using snpEff ================
    # 'name.filter.vcf' -> 'name.filter.eff.vcf'
    annotatedVCF = vcfFile[:-3] + 'eff.vcf'
    if not os.path.exists(annotatedVCF):
        annotatedVCF = snpEff_annotateVCF(vcfFile, snpEff, genome)

    impact_if = "((ANN[*].IMPACT='HIGH') | (ANN[*].IMPACT='MODERATE'))"

    #============= 2. Loop for every gene ================================
    for gene in genes:
        print('%s start to get input files for provean' % (gene,))

        #============= (1). Filter the annotated file ====================
        if gene == '':
            filters = [impact_if]
        else:
            gene_if = "(ANN[*].GENE='{gene}')".format(gene=gene)
            # The second element is the single string '&((...))', exactly
            # as the original adjacent-literal concatenation produced.
            filters = [gene_if, '&' + impact_if]
        try:
            filteredVCF = snpSift_filterVCF(annotatedVCF, snpSift, filters)
        except Exception:
            print('%s snpSift filter failed' % (gene,))
            Message('snpSift filter failed', email)
            # Skip this gene: falling through would use an undefined
            # filteredVCF, or the previous gene's file under this gene's name.
            continue
        if gene != '':
            print('filteredVCF is:  %s' % (filteredVCF,))

        #============= (2). Get input files for provean ==================
        try:
            vari_files = vari_input4provean(filteredVCF)
        except Exception:
            print('%s fail to get provean inputs' % (gene,))
            Message('fail to get provean inputs', email)
            raise
        if vari_files == '':
            print('%s does not have any interested variants' % (gene,))

    print('%s provean input succeed' % (workdir,))
# ---- settings taken from param ----
file_path = param['filePath']
kallisto_index = param['kallisto_index']
trim = param['trim']
phred = param['phred']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']

#===============================================================================
#                    Pipeline
#===============================================================================
# (0) send the start notification, then work inside the data directory
Message(startMessage, email)
os.chdir(file_path)

# (1) collect the fastq files; trim them only when param['trim'] is the
#     literal string 'True'
fastqFiles = list_files(file_path)
print('list file succeeded')
if trim == 'True':
    try:
        fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred,
                                 trimmoAdapter, batch=6)
        print('trim succeed')
        print('fastqFiles is:  %s' % (fastqFiles,))
    except:
        # report the failure, then let the original error propagate
        print('trim failed')
        Message('trim failed', email)
        raise

#=========== (2) run sailfish mapping ====================
# ---- settings taken from param ----
ref_fa = param['refSequence']
file_path = param['filePath']
bwaIndex = param['alignerDb']
trim = param['trim']
phred = param['phred']
picard = param['picard']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']
gatk = param['gatk']
read_group = param['readGroup']
organism = param['organism']

##***************** Part 0. Build index file for bwa and GATK ******
##================= Part I. Preprocess ============================
#======== 1. map and dedupping =====================================
Message(startMessage, email)

#======== (0) make sure the bwa index folder exists, then enter the data dir
# os.path.dirname replaces bwaIndex[:bwaIndex.rfind('/')], which silently
# dropped the last character when the path contained no '/'.  A bare name
# means "index lives in the current directory".
bwa_path = os.path.dirname(bwaIndex) or os.curdir
if not os.path.exists(bwa_path):
    # makedirs also creates missing parent folders (os.mkdir would fail)
    os.makedirs(bwa_path)
if not os.listdir(bwa_path):
    # empty folder: no index yet, build it from the reference
    bwa_Db(bwa_path, ref_fa)
os.chdir(file_path)

#======== (1) read files ================================
fastqFiles = list_files(file_path)
if trim == 'True':
    trim_fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred,
                                  trimmoAdapter, batch=6)
    # NOTE(review): presumably discards the untrimmed inputs -- confirm
    remove(fastqFiles)
trimmoAdapter = param['trimmoAdapter'] #------------- specific parameters for gsnap ------------ host_gsnapDbName = param['host_gsnapDbName'] host_gsnapAnnotation = param['host_gsnapAnnotation'] virus_gsnapDbName = param['virus_gsnapDbName'] virus_gsnapAnnotation = param['virus_gsnapAnnotation'] #------------- blast parameters --------------------------- blast_Db = param['blast_Db'] runBlastAfterAssemble = param['runBlastAfterAssemble'] blast_nt_DB = param['blast_nt_DB'] #=============================================================================== # 1. map to host reference genome #=============================================================================== #======== (0) enter the directory ======================== os.chdir(file_path) Message(startMessage,email) """ #======== (1) read files ================================ fastqFiles = list_files(file_path) if trim == 'True': fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred) print 'list file succeed' print 'fastqFiles is: ',fastqFiles #======== (2) align fastq files to host ================================ try: if aligner == 'gsnap': # check index if os.listdir(host_alignerDb) == []: gsnap_Db(host_fa,host_alignerDb,host_gsnapDbName,host_gsnapAnnotation) map_files = gsnap(fastqFiles,host_alignerDb, host_gsnapDbName,host_gsnapAnnotation,thread) # [file.sam] else:
endMessage = param['endMessage']
# database reference
fastaFile = param['reference']
record_dict = SeqIO.index(fastaFile, 'fasta')
gffFile = param['annotation']
genome = param['genome']
CodonFile = param['CodonFile']
# software parameters
snpSift = param['snpSift']
snpEff = param['snpEff']
provean = param['provean']
support_set_path = param['support_set']
# other parameters
gene_file = param['gene_file']
Message(startMessage, email)
#===============================================================================
#                Variant analysis pipeline
#===============================================================================
## read gene list
# An empty gene_file yields [''], otherwise one symbol per line is read and
# only lines whose first character is a letter are kept.
if gene_file == '':
    genes = ['']
else:
    genes = []
    # 'with' closes the file even if reading fails part-way
    with open(gene_file, 'r') as geneFile:
        for line in geneFile:
            if line[0].isalpha():
                # rstrip() instead of line[:-1]: the slice cut a real
                # character off a last line without a newline and left
                # '\r' / trailing blanks inside the symbol.
                genes.append(line.rstrip())
    # de-duplicate; sorted() keeps the order reproducible between runs
    genes = sorted(set(genes))
# ---- settings taken from param ----
file_path = param['filePath']
starDb = param['alignerDb']
trim = param['trim']
phred = param['phred']
picard = param['picard']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']
gatk = param['gatk']
read_group = param['readGroup']
organism = param['organism']

##***************** Part I. Preprocess ============================
#======== 1. map and dedupping =====================================
# (0) work inside the data directory, then send the start notification
os.chdir(file_path)
Message(startMessage, email)

# (1) collect the fastq files; without trimming the listed files are used
#     as they are
fastqFiles = list_files(file_path)
if trim != 'True':
    trim_fastqFiles = fastqFiles
else:
    trim_fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred,
                                  trimmoAdapter, batch=6)
    # NOTE(review): presumably discards the untrimmed inputs -- confirm
    remove(fastqFiles)
sys.stdout.writelines([
    'list file succeed\n',
    'fastqFiles is: {fq}\n'.format(fq=trim_fastqFiles),
])
# ---- known-sites resources and tool settings taken from param ----
gold_snp = param['dbSNP']
phaseINDEL = param['phase1INDEL']
gold_indel = param['MillINDEL']
omni = param['omni']
hapmap = param['hapMap']
gatk = param['gatk']
read_group = param['readGroup']
organism = param['organism']

##***************** Part I. Preprocess ============================
#======== 1. map and dedupping =====================================
# (0) work inside the data directory, then send the start notification
os.chdir(file_path)
Message(startMessage, email)

# (1) collect the fastq files; trim only when param['trim'] is 'True'
fastqFiles = list_files_human(file_path)
if trim == 'True':
    fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred, trimmoAdapter)
sys.stdout.writelines([
    'list file succeed\n',
    'fastqFiles is: {fq}\n'.format(fq=fastqFiles),
])

# (2) two-pass STAR alignment
# NOTE(review): no re-raise is visible after the failure message here --
# confirm that an alignment failure is meant to let the script continue.
try:
    map_sams = STAR2Pass(fastqFiles, starDb, ref_fa, thread)
    sys.stdout.write('align succeed\n')
    sys.stdout.write('map_sams is: {map}\n'.format(map=map_sams))
except:
    sys.stdout.write('align failed\n')
    Message('align failed', email)
alignerrRNADb = param['alignerrRNADb'] alignerDb = param['alignerDb'] trim = param['trim'] phred = param['phred'] trimmomatic = param['trimmomatic'] trimmoAdapter = param['trimmoAdapter'] annotation = param['annotation'] ##***************** Part 0. Build index file for bowtie ****** ##================= Part I. Preprocess ============================ #======== 1. map reads ================================== #======== (0) enter the directory ======================= os.chdir(file_path) Message(startMessage, email) #======== (1) read and trim files ====================== fastqFiles = list_files(file_path) if trim == 'True': fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred, batch=6) # [[filename.fq.gz]] print 'list file succeed' print 'fastqFiles is: ', fastqFiles #======== (2) align to rRNA ============================= try: noRNA_fqs = bowtie2(fastqFiles, alignerrRNADb, thread, otherParameters=['--un-gz' ]) # [[filename.norna.fq.gz]] print 'extract norRNA succeed'
# ---- settings taken from param ----
trim = param['trim']
phred = param['phred']
trimmomatic = param['trimmomatic']
trimmoAdapter = param['trimmoAdapter']
aligner = param['aligner']
annotation = param['annotation']
output_path = param['htseqOutPath']
db_name = param['gsnapDbName']
gsnap_annotation = param['gsnapAnnotation']
Dict = param['symbolIDFile']
inputpath = file_path

# (0) send the start notification, then work inside the data directory
Message(startMessage, email)
os.chdir(file_path)

# (1) collect the fastq files; trim only when param['trim'] is 'True'
fastqFiles = list_files(file_path)
if trim == 'True':
    fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred, trimmoAdapter)
print('list file succeed')

# (2) map the reads: gsnap when requested, STAR for any other aligner value
map_files = (
    gsnap(fastqFiles, db_path, db_name, gsnap_annotation, thread)
    if aligner == 'gsnap'
    else STAR(fastqFiles, db_path, thread)
)
print('align succeed')

# (3) sort the alignments
sorted_bam = sam2bam_sort(map_files, thread)
print('sorted succeed')
organism = param['organism'] #------------- specific parameters for gsnap ------------ host_gsnapDbName = param['host_gsnapDbName'] host_gsnapAnnotation = param['host_gsnapAnnotation'] virus_gsnapDbName = param['virus_gsnapDbName'] virus_gsnapAnnotation = param['virus_gsnapAnnotation'] #------------- htseqCount parameters -------------------- host_htseqFolder = param['host_htseqOutputFolder'] virus_htseqFolder = param['virus_htseqOutputFolder'] #=============================================================================== # 1. map to host reference genome #=============================================================================== #======== (0) enter the directory ======================== os.chdir(file_path) Message(startMessage,email) #======== (1) read files ================================ fastqFiles = list_files(file_path) if trim == 'True': fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred) # [[fq.gz]] print 'list file succeed' print 'fastqFiles is: ',fastqFiles #======== (2) align fastq files to host ================================ try: if aligner == 'gsnap': map_files = gsnap(fastqFiles,host_alignerDb, host_gsnapDbName,host_gsnapAnnotation,thread) else: map_files = STAR(fastqFiles,host_alignerDb,thread) print 'host align succeed' print 'map_files is: ',map_files # [file.sam] except:
def prepare_fa_vari(workdir, snpEff, snpSift, email, genome, genes,
                    record_dict, gffFile):
    """
    Prepare the protein/variant input files for provean for one folder.

    The first ``*filter.vcf`` file found in ``workdir`` is annotated with
    snpEff (unless the annotated file already exists), then, for every entry
    of ``genes``, filtered with snpSift down to HIGH/MODERATE impact variants
    and handed to ``vcf2input4provean``.  The process working directory is
    changed to ``workdir``.

    * workdir: the folder that has the vcf files
    * snpEff: path to snpEff
    * snpSift: path to snpSift
    * email: address passed to Message() for failure notifications
    * genome: genome name defined in snpEff
    * genes: a list of gene symbols; an empty string means "do not restrict
      the filter to a gene"
    * record_dict: passed through unchanged to vcf2input4provean
    * gffFile: passed through unchanged to vcf2input4provean

    Returns None.

    Raises IndexError when ``workdir`` holds no ``*filter.vcf`` file,
    RuntimeError when a gene yields no variants of interest, and re-raises
    whatever ``vcf2input4provean`` raises.  A snpSift failure is reported and
    that gene is skipped.
    """
    # List the folder once, before chdir, so a relative workdir still
    # resolves (the second listdir used to run after the chdir).
    entries = os.listdir(workdir)
    # f[:-11] drops 'protein.fa' plus the one character in front of it
    gene_rna_lst = [f[:-11] for f in entries if f.endswith('protein.fa')]
    vcfFiles = [f for f in entries if f.endswith('filter.vcf')]
    if not vcfFiles:
        # Same exception type as the former vcfFiles[0], but with a message.
        raise IndexError('no *filter.vcf file found in %s' % (workdir,))
    vcfFile = vcfFiles[0]

    os.chdir(workdir)  # set work directory

    # NOTE: these accumulate the per-gene results but are not returned.
    proteinFiles = []
    variantFiles = []

    #============= 1. Annotate vcf results using snpEff ================
    # 'name.filter.vcf' -> 'name.filter.eff.vcf'
    annotatedVCF = vcfFile[:-3] + 'eff.vcf'
    # cwd is workdir now, so the bare file name is the right path to test;
    # workdir + '/' + name pointed at the wrong place for a relative workdir.
    if not os.path.exists(annotatedVCF):
        annotatedVCF = snpEff_annotateVCF(vcfFile, snpEff, genome)

    impact_if = "((ANN[*].IMPACT='HIGH') | (ANN[*].IMPACT='MODERATE'))"

    #============= 2. Loop for every gene ================================
    for gene in genes:
        print('%s start to get input files for provean' % (gene,))

        #============= (1). Filter the annotated file ====================
        if gene == '':
            filters = [impact_if]
        else:
            gene_if = "(ANN[*].GENE='{gene}')".format(gene=gene)
            # The second element is the single string '&((...))', exactly
            # as the original adjacent-literal concatenation produced.
            filters = [gene_if, '&' + impact_if]
        try:
            filteredVCF = snpSift_filterVCF(annotatedVCF, snpSift, filters)
        except Exception:
            print('%s snpSift filter failed' % (gene,))
            Message('snpSift filter failed', email)
            # Skip this gene: falling through would use an undefined
            # filteredVCF, or the previous gene's file under this gene's name.
            continue
        if gene != '':
            print('filteredVCF is:  %s' % (filteredVCF,))

        #============= (2). Get input files for provean ==================
        try:
            [protein_files, variant_files] = vcf2input4provean(
                filteredVCF, record_dict, gffFile, gene_rna_lst)
        except Exception:
            print('%s fail to get provean inputs' % (gene,))
            Message('fail to get provean inputs', email)
            raise

        if protein_files != '':
            proteinFiles.extend(protein_files)
            variantFiles.extend(variant_files)
            print('%s prepare for provean input finish' % (gene,))
        else:
            print('%s does not have interested variants' % (gene,))
            # An explicit exception replaces the bare `raise`, which has no
            # active exception to re-raise at this point.
            raise RuntimeError(
                '%s does not have interested variants' % (gene,))

    print('%s provean input succeed' % (workdir,))
gffFile, gene_rna_lst) except: print gene, 'fail to get provean inputs' Message('fail to get provean inputs', email) raise if protein_files != '': proteinFiles.extend(protein_files) variantFiles.extend(variant_files) print gene, 'prepare for provean input finish' else: print gene, 'does not have interested variants' raise print workdir, 'provean input succeed' Message(startMessage, email) genes = get_genes_from_file(gene_file) #================= 0. list directories ========================================= os.chdir(pathway) # set work directory folders = get_all_folders(pathway) folders = natsorted(folders) #============= 2. prepare input files for provean ====================================== batch_folders = chunk(folders, int(thread)) for batch in batch_folders: proc = [ Process(target=prepare_fa_vari, args=( pathway + '/' + f, snpEff, snpSift, email,
#------------- software parameters ---------------------- file_path = param['filePath'] aligner = param['aligner'] alignerDb = param['alignerDb'] trim = param['trim'] phred = param['phred'] picard = param['picard'] trimmomatic = param['trimmomatic'] trimmoAdapter = param['trimmoAdapter'] #------------- specific parameters for gsnap ------------ gsnapDbName = param['gsnapDbName'] gsnapAnnotation = param['gsnapAnnotation'] #======== (0) enter the directory ================ os.chdir(file_path) Message(startMessage, email) #======== (1) read files ================================ fastqFiles = list_files(file_path) if trim == 'True': fastqFiles = Trimmomatic(trimmomatic, fastqFiles, phred) print 'list file succeed' print 'fastqFiles is: ', fastqFiles #======== (2) align fastq files to host ================================ try: if aligner == 'gsnap': # check index if os.listdir(alignerDb) == []: gsnap_Db(ref_fa, alignerDb, gsnapDbName, gsnapAnnotation) map_files = gsnap(fastqFiles, alignerDb, gsnapDbName, gsnapAnnotation, thread) # [file.sam] else:
trimmomatic = param['trimmomatic'] trimmoAdapter = param['trimmoAdapter'] aligner = param['aligner'] annotation = param['annotation'] output_path = param['htseqOutPath'] htseqBatch = param['htseqBatch'] db_name = param['gsnapDbName'] gsnap_annotation = param['gsnapAnnotation'] Dict = param['symbolIDFile'] inputpath = file_path #=========== (0) enter the directory ================ os.chdir(file_path) Message(startMessage,email) #=========== (1) reads files and trim =============== fastqFiles = list_files(file_path) print 'list file succeed' if trim == 'True': try: trim_fastqFiles = Trimmomatic(trimmomatic,fastqFiles,phred,trimmoAdapter,batch=6) print 'trim succeed' print 'fastqFiles is: ',fastqFiles remove(fastqFiles) except: print 'trim failed' Message('trim failed',email) raise else: