def transfer_alea_files(fnlist):
    """Pull each file in fnlist from the alea archive server with rsync.

    Returns the local paths of the transferred copies, which all live
    under the alea_transfer/ directory.
    """
    transferDir = 'alea_transfer'
    cmn.mkdir(transferDir)
    newlist = []
    for fn in fnlist:
        print('transfering %s from archive server ...' % fn)
        # -r: the listed entry may be a directory on the archive side
        cmn.run('rsync -r [email protected]:%s %s' % (fn, transferDir))
        newlist.append('%s/%s' % (transferDir, cmn.lastName(fn)))
    return newlist
def parse_ref(seqDict):
    """Write every reference sequence to baits/baitN.fa and bwa-index it.

    Returns a dict mapping each (sanitized) sequence name to the path of
    its fasta file.
    """
    cmn.mkdir('baits')
    baitFiles = {}
    for idx, header in enumerate(seqDict):
        sequence = seqDict[header]
        fnlabel = 'bait%s' % idx
        dn = 'baits/%s.fa' % fnlabel
        # strip characters that would break the shell command / fasta format
        header = header.replace('*', '').replace('"', "'")
        cmn.write_file('>%s\n%s\n' % (header, sequence), dn)
        cmn.run('module add bwa; bwa index %s -p %s' % (dn, fnlabel))
        baitFiles[header] = dn
    return baitFiles
def attempt_to_find_genus_by_abundence(ID, fqlist):
    """Run auto_rebait.py over fqlist in a scratch dir and parse the genus.

    Returns the genus token from picked_bait.txt, or None when the script
    produced no pick.  Side effects: copies mapping_stat.info into
    tmpStat/ and removes the scratch directory before returning.
    """
    tmpdir = 'tmp_%s' % ID
    cmn.mkdir(tmpdir)
    os.chdir(tmpdir)
    cmn.write_lines(fqlist, 'fqlist')
    cmn.run('/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist')
    dn = 'picked_bait.txt'
    genus = None
    if cmn.filexist(dn):
        # picked bait names look like "Genus_... "; keep the first token
        genus = cmn.txt_read(dn).strip().split('_')[0].split()[0]
    os.chdir('..')
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus
# NOTE(review): this sys.exit() most likely terminates a usage/except branch
# that starts before this chunk — confirm against the full file.
sys.exit()

# optional 4th argument: prefix for the generated command files
try:
    pre = sys.argv[4]
except IndexError:  # narrowed from a bare except: only a missing arg is expected
    pre = 'x'

#cores = 32 # in biohpc, it is 32
partition = 'super'
for i, arg in enumerate(sys.argv):
    if arg == '-p':
        partition = sys.argv[i + 1]

tmpdir = '%s_files' % pre
cmn.mkdir(tmpdir)

# BUGFIX: '/' is float division in Python 3 and made the slices below raise
# TypeError; integer division restores the intended bundle size.
bundle = len(cmds) // N
for i in range(N):
    cmd = cmds[i * bundle:(i + 1) * bundle]
    if i == N - 1:
        # the last job also absorbs the remainder commands
        cmd += cmds[(i + 1) * bundle:]
    dn = '%s/%s_%s' % (tmpdir, pre, i)
    cmn.write_file('\n'.join(cmd), dn)
    submit_job(dn, Npara, pre, i, partition)

cwd = os.getcwd()
d_stat = '%s/stat.info' % tmpdir
info = []
info.append('commands are from: %s/%s' % (cwd, sys.argv[1]))
info.append('split into %s jobs' % sys.argv[2])
# NOTE(review): the string below closes a call (presumably cmn.txt_read(...))
# that starts before this chunk — confirm against the full file.
    '/project/biophysics/Nick_lab/wli/sequencing/mapping/SNP_calling/2_gatk/template_gatk.job'
)
# fill the assembly label into the job template
template = template.replace('assembly_v0', ass_label)
fns = cmn.cmd2lines('ls ../1_bwa_align/*.sam | grep -v _v0_')
fns = [os.path.abspath(i) for i in fns]
#good_set = set('LEP18259 3318 3303'.split())
# sample labels that already produced a vcf (first '_' token of dir name)
finished = cmn.cmd2lines("ls */*.vcf|cut -d '/' -f 2|cut -d '_' -f 1")
#group spiecies
group_dict = group_by_species(fns)
for slabel in group_dict:
    if slabel in finished:
        print('skip finished ' + slabel)
        continue
    cmn.mkdir(slabel)
    # merge the sams inside the per-species directory ...
    os.chdir(slabel)
    fns = group_dict[slabel]
    f_sam = merge_sams(slabel, fns)
    # '3377' is the sample-id placeholder inside the job template
    cmd = template.replace('3377', slabel)
    #cmd = cmd.replace('--job-name=gatk', '--job-name=%s' % slabel)
    # ... but write and submit the job file from the parent directory
    os.chdir('..')
    cmn.write_file(cmd, 'gatk%s.job' % slabel)
    cmn.run('sbatch gatk%s.job' % slabel)
# Build bwa-mem alignment commands for each library against the assembly.
try:
    odir, f_ass = sys.argv[1:3]
except:
    print("Usage: *.py filelist assembly_v0.fa", file=sys.stderr)
    print("you should index assembly_v0.fa first with -p assembly_v0", file=sys.stderr)
    print("using command /home2/wli/local/bwa-0.7.12/bwa index ", file=sys.stderr)
    sys.exit()

#fns = cmn.cmd2lines('ls %s/*.fq' % odir)
fns = cmn.getid(odir)
group_dict = separate_by_label(fns)
# e.g. "assembly_v2.fa" -> "v2"
ass_label = cmn.find_between(cmn.lastName(f_ass), 'assembly_', '.fa')
cmn.mkdir('job_files')
cmn.mkdir('cmd_files')
for plabel in group_dict:
    print('processing lib %s' % plabel)
    each = group_dict[plabel]
    #also parse the files inside this function
    #return the file name after parsing
    paired, unpaired = separate_by_pair(plabel, each)
    if paired == None:
        continue
    label = '%s_%s' % (plabel, ass_label)
    #index_label = cmn.lastName(f_ass).replace('.fa', '')
    # NOTE(review): uses the full path (not the basename) as the bwa index
    # prefix — confirm the index was built with this prefix.
    index_label = f_ass.replace('.fa', '')
    cmd = ''
    # loop body continues beyond this chunk
    cmd += '/home2/wli/local/bwa-0.7.12/bwa mem -t 32 -M %s %s %s > %s_paired.sam;\n' % (index_label, paired[0], paired[1], label)
# files worth keeping from each sample directory
kept_spdir_files = 'realigned_reads_step2.bam snp_step2.vcf$'.split()
for each in spdirs:
    print(each)
    wdir_label = cmn.lastName(each)
    dwdir = '%s/%s' % (ddir, wdir_label)
    if os.path.exists(dwdir):
        # never overwrite an existing destination; ask the user to resolve it
        print(
            'the destination directory has already exists! please check manually to choose which one to keep:'
        )
        print('distination dir: %s' % dwdir)
        print('current dir: %s' % each)
        print('\n')
        continue
    cmn.mkdir(dwdir)
    # prefer the step2 bam, fall back to the step1 bam
    fbam = '%s/realigned_reads_step2.bam' % (each)
    if os.path.exists(fbam):
        cmd = 'cp %s/realigned_reads_step2.bam %s' % (each, dwdir)
    else:
        fbam = '%s/realigned_reads.bam' % each
        if os.path.exists(fbam):
            cmd = 'cp %s/realigned_reads.bam %s' % (each, dwdir)
        else:
            # BUGFIX: the original printed this error but then still ran
            # cmn.run(cmd) with a stale cmd from the previous iteration
            # (or an unbound name on the first); skip this sample instead.
            print('Error, can not find bam file!')
            continue
    cmn.run(cmd)
    #print cmd
    # spdirs maps each sample dir to its vcf path
    fvcf = spdirs[each]
    cmd = 'cp %s %s' % (fvcf, dwdir)
    cmn.run(cmd)
# (tail of read_fa — its def and the loop producing `lines` start above this chunk)
        # one fasta record: first line is the defline, the rest is sequence
        defline = lines[0]
        seq = ''.join(lines[1:])
        adict[defline] = seq
    return adict


#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Split a multi-record fasta into one file per sequence.
if __name__ == '__main__':
    #fn = 'all_genomes_noGap.fa'
    #fn = 'all_genomes_charGap.fa'
    try:
        fn = sys.argv[1]
    except:
        print('*.py all_genomes_charGap.fa ')
        sys.exit()

    adict = read_fa(fn)
    fnlabel = cmn.lastName(fn).replace('.fa', '')
    outdir = 'splitS_%s' % fnlabel
    cmn.mkdir(outdir)
    for i, key in enumerate(adict):
        seq = adict[key]
        fasta = '>%s\n%s\n' % (key, seq)
        # output files are numbered in iteration order, not by record name
        dn = '%s/%s_%s.fa' % (outdir, fnlabel, i)
        cmn.write_file(fasta, dn)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: read, fn, direction = sys.argv[1:4] except: print("Usage: *.py", file=sys.stderr) sys.exit() fished_reads = [] cmn.mkdir('grep_out') if True: with open(fn) as fp: for lineN, line in enumerate(fp): if lineN % 4 != 1: #only take the sequence continue line = line.strip() if direction == 'backward': line = line[::-1] #find match forward and + strand i1 = line.find(read) if i1 != -1: fished_reads.append(line[i1:]) #in the reverse strand
# Copy the keep-worthy result files of every sampleRun_* directory from
# one tree (argv[1]) into another (argv[2]).
import sys

python_lib = '/work/00412/mtang/sequencing/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)
import cmn
import os

fromDir = os.path.abspath(sys.argv[1])
toDir = os.path.abspath(sys.argv[2])

wdirs = cmn.cmd2lines('ls %s | grep ^sampleRun_' % fromDir)

#toKeep = ['*.txt', '*.report', 'barcode_count', '*_contig.fa', 'denovo_barcode.fa', 'bait0_denovo.br']
toKeep = [
    '*.txt', '*.report', '*_contig.fa', 'denovo_barcode.fa', 'bait0_denovo.br'
]

for wdir in wdirs:
    eachToDir = '%s/%s' % (toDir, wdir)
    eachFromDir = '%s/%s' % (fromDir, wdir)
    cmn.mkdir(eachToDir)
    # copy each glob pattern separately so a missing pattern only
    # affects its own cp command
    for pattern in toKeep:
        copyCmd = 'cp %s/%s %s' % (eachFromDir, pattern, eachToDir)
        print(copyCmd)
        cmn.run(copyCmd)
# Group fastq files into read pairs keyed by everything before the final
# "_" token of the file name, then build re-pair commands for each pair.
pairDict = {}
for fn in fns:
    key = '_'.join(cmn.lastName(fn).split('_')[:-1])
    #if '250' in key or '500' in key:
    #    print 'skip short lib: %s' % fn
    #    cmn.run('ln -s %s' % fn)
    #    continue
    pairDict.setdefault(key, []).append(fn)

cmn.mkdir('logs')

cmds = []
for key in pairDict:
    members = pairDict[key]
    if len(members) != 2:
        # unpaired library: just link the files into the working dir
        print('cannot find pair for %s' % str(members))
        for iii in members:
            cmn.run('ln -s %s' % iii)
        continue
    members.sort()
    # backgrounded per-pair command; a trailing 'wait' joins them all
    cmds.append(
        '/project/biophysics/Nick_lab/wli/sequencing/scripts/re-pair-reads_wenlin %s %s %s >& logs/%s_run.log &' % (
            members[0], members[1], key, key))
cmds.append('wait')
# Fork finished step3 gatk run directories from another tree into the
# current working tree.
import sys
python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)
import cmn
import os

# job names are the last whitespace-separated token of each input line
jobs = [line.strip().split()[-1] for line in cmn.getid(sys.argv[1])]
fromDir = sys.argv[2].rstrip('/')
#the dir ends with step3
cwd = os.getcwd()
cmn.mkdir('job_files')
cmn.mkdir('step3_gatk')
# parent of fromDir holds the step2 mapping results; link them in
fromPdir = '/'.join(fromDir.split('/')[:-1])
cmn.run('ln -s %s/step2_bwa_mapping' % fromPdir)
fjobs = []
#1. copy the directory to current
for job in jobs:
    # NOTE(review): strips 4 chars from each end of the job name
    # (presumably "gatk" prefix and ".job" suffix) — confirm.
    wdir = job[4:-4]
    current = '%s/%s' % (fromDir, wdir)
    cmd = 'cp -r %s step3_gatk' % current
    print('forking data for %s' % current)
    cmn.run(cmd)
    new = '%s/step3_gatk/%s' % (cwd, wdir)
    user = cmn.cmd2info('echo $USER').strip()
    # first letter of the user name; chunk ends here, usage continues below
    user_label = user[0]
#check if all the files has contains
# Verify every fasta in the list exists locally; files on the alea archive
# are exempt because they will be rsync'ed over instead.
falist = cmn.file2lines(fn)
# BUGFIX: the existence check and the alea classification below used two
# different substrings ('/archive/butterfly/' vs '/archive/butterfly'),
# so paths merely sharing the prefix could be mis-classified; use one tag.
ARCHIVE_TAG = '/archive/butterfly/'
bad_falist = [
    fa for fa in falist if not cmn.filexist(fa) and ARCHIVE_TAG not in fa
]
if len(bad_falist) != 0:
    print('Error!')
    print('the following files are errorous:')
    print('\n'.join(bad_falist))
    sys.exit()

transferDir = 'archiveTransfer'
cmn.mkdir(transferDir)
# split the list into archive-hosted files (to transfer) and local ones
alea_list = [fa for fa in falist if ARCHIVE_TAG in fa]
biohpc_list = set(falist) - set(alea_list)
newlist = transfer_alea_files(alea_list)
newlist += list(biohpc_list)
dn = 'new.falist'
cmn.write_lines(newlist, dn)

#backup this newlist
cmn.mkdir('../falist_info')
dirlabel = os.getcwd().rstrip('/').split('/')[-1]
backFn = '../falist_info/%s.falist' % dirlabel
# Count k-mers in the given fastq files with Jellyfish.
if __name__ == '__main__':
    #options=parse_options()
    try:
        fns = [os.path.abspath(each) for each in sys.argv[3:]]
        #KmerCut = int(sys.argv[1])
        KmerSize = int(sys.argv[1])
        Ncpu = int(sys.argv[2])
    except:
        # BUGFIX: the usage string listed a stale leading KmerCut argument
        # that the parsing above no longer reads (argv[1] is KmerSize,
        # argv[2] is Ncpu, fastq files start at argv[3]).
        print("Usage: *.py KmerSize(19) Ncpu R1.fq R2.fq", file=sys.stderr)
        sys.exit()

    # label output by the sample id (first '_' token of the first fastq)
    outlabel = cmn.lastName(fns[0]).split('_')[0]
    tmpDir = '%s_jf' % outlabel
    cmn.mkdir(tmpDir)

    #step1, run Jellyfish
    print('running Jellyfish to get Kmer count...')
    os.chdir(tmpDir)
    cmd = 'jellyfish count -m %s -t %s -s 10000000000 -c 8 --timing=jf.err --canonical ' % (
        KmerSize, Ncpu)
    cmd += ' '.join(fns)
    cmn.run(cmd)
    cmd = 'jellyfish histo mer_counts.jf > %smer_histo.txt' % KmerSize
    cmn.run(cmd)
    cmd = 'jellyfish dump -c mer_counts.jf > %smer_counts' % KmerSize
    cmn.run(cmd)
    #step2, filter out reads with high Kmers
# (the matching "try:" that parses fsam/fass starts above this chunk)
except:
    print('usage: *.py fsam fass', file=sys.stderr)
    sys.exit()

# index the assembly for samtools and build the picard sequence dictionary
cmd = 'module add samtools; samtools faidx %s' % fass
cmn.run(cmd)
cmd = 'module add picard/1.117; java -jar $PICARD/CreateSequenceDictionary.jar R=%s O=%s.dict' % (
    fass, fass[:-3])
cmn.run(cmd)
template = cmn.txt_read(
    '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/template_gatk_bias_fromSam.job'
)
# fill the template placeholders with the actual inputs
template = template.replace('[WL_ref]', fass)
template = template.replace('[INPUT.sam]', fsam)
# sample id: first '_' token of the sam name, after dropping a highQ_ prefix
sampleId = cmn.lastName(fsam).replace('highQ_', '').split('_')[0]
dnlabel = '%s_%s' % (cmn.lastName(fsam).replace(
    '.sam', ''), cmn.lastName(fass).replace('.fa', ''))
cmn.mkdir(dnlabel)
os.chdir(dnlabel)
cwd = os.getcwd()
# the job first cds into the working directory
pre_cmds = 'cd %s\n' % cwd
# '5642' is the sample-id placeholder inside the job template
template = template.replace('5642', sampleId)
template = template.replace('[WL_preprossing]', pre_cmds)
cmn.write_file(template, 'gatk%s.job' % sampleId)
#cmn.run('sbatch gatk%s.job' % sampleId)
isIndexed = True
# NOTE(review): isIndexed is set True unconditionally here, so the warning
# branch below is dead within this chunk — presumably code before this view
# was meant to flip it; confirm against the full file.
print('###############################################')
if not isIndexed:
    print('**********************************************')
    print('\nimportant!!!')
    print('please re-run this script after all references are indexed!\n')
    print('**********************************************')

###############################
#all the steps below would put into the job files
template = cmn.txt_read(
    '/work/biophysics/mtang/SNP_calling/scripts/templates/template_gatk_unbias4TACC.job'
)
cmn.mkdir('job_files')
fjobs = []
for sp in refdict:
    #if sp.split('_')[0] not in subsetIDs:
    #    continue
    snp_list = refdict[sp]
    for samdir, ref in snp_list:
        # only process sample/reference combos requested in subsetJobs
        label = '%s_%s' % (sp, ref)
        if label not in subsetJobs:
            continue
        print('processing %s' % label)
        #a. make directory
        olabel = '%s_%s' % (sp, ref)
        wdir = '%s/%s' % (cwd, olabel)
        # relative path as seen from inside a sibling job directory
        wdir4TACC = '../%s' % olabel
# (tail of attempt_to_find_genus_by_abundence — its def starts above this chunk)
    # save the mapping stats, then discard the scratch directory
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus


# Group fastq files from the input list by normalized sample ID.
if __name__=='__main__':
    #options=parse_options()
    try:
        #fn, f_table = sys.argv[1:3]
        fn = sys.argv[1]
    except:
        print("Usage: *.py fqlist", file=sys.stderr)
        sys.exit()

    cmn.mkdir('tmpStat')
    IDlist = set([])
    fq_groups = {}
    for line in cmn.file2lines(fn):
        # sample ID is the first '_' token, with vendor prefixes normalized
        Id = cmn.lastName(line).split('_')[0]
        Id = Id.replace('NVG-', '').replace('11-BOA-','').replace('LEP-', 'LEP')
        IDlist.add(Id)
        fq = os.path.abspath(line)
        try:
            fq_groups[Id].append(fq)
        except KeyError:
            fq_groups[Id] = [fq]

    nameDict = get_names_4barcode()
# Group fastqs by sample ID, then set up one mitoD_<sample> working dir
# per sample with symlinked reads and a filled-in job template.
fqlist = cmn.file2lines(fn)
groupDict = {}
for fq in fqlist:
    ID = cmn.lastName(fq).split('_')[0]
    try:
        groupDict[ID].append(fq)
    except KeyError:
        groupDict[ID] = [fq]

for sample in groupDict:
    fqlist = groupDict[sample]
    #fqlist = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/%s*.fastq' % sample)
    #fqlist = cmn.cmd2lines('ls /work/biophysics/wli/workspace/filtered_6313*q')
    wdir = 'mitoD_%s' % sample
    cmn.mkdir(wdir)
    # NOTE(review): chdir into wdir with no visible chdir('..') in this chunk;
    # the return must happen later in the loop (off-view) or subsequent
    # sample dirs would nest — confirm against the full file.
    os.chdir(wdir)
    cwd = os.getcwd()
    # fill the job template placeholders for this sample
    info = template.replace('[cwd]', cwd)
    info = info.replace('[fq_files]', ' '.join(fqlist))
    info = info.replace('[sample]', sample)
    #prepare quake infiles
    fqlist_local = []
    for fq in fqlist:
        cmn.run('ln -s ' + fq)
        fqlist_local.append(cmn.lastName(fq))
    cmn.write_lines(fqlist_local, 'fqlist')
    cmn.run('ln -s fqlist infiles')
    #make fq2fa comand
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Write 10 record-shuffled copies of a fasta file (one-line sequences).
if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except:
        print("Usage: *.py", file=sys.stderr)
        sys.exit()

    # map each defline to its full ">label\nseq\n" record
    adict = {}
    with open(fn) as fp:
        for line in fp:
            stripped = line.strip()
            if line.startswith('>'):
                label = stripped
            else:
                adict[label] = '%s\n%s\n' % (label, stripped)

    times = 10
    keys = list(adict.keys())
    cmn.mkdir('shuffle_genome')
    for roundIdx in range(times):
        random.shuffle(keys)
        shuffled_records = [adict[k] for k in keys]
        dn = 'shuffle_genome/%s_shuffle%s' % (cmn.lastName(fn), roundIdx)
        cmn.write_file(''.join(shuffled_records), dn)
# Collect the bam/pileup/vcf state of the Heli_map gatk runs so that
# step1 commands can be generated only for unfinished samples.
try:
    fn = sys.argv[1]
except:
    print("Usage: *.py filelist", file=sys.stderr)
    sys.exit()

#step 1 including 1. making the pileup file, 2. correct bias, 3. sam map again 4. snp call till last step
#step 2: just the snp call using multiple CPUs
#only put 15 jobs in a node to avoid memmory problem
f_ass = '/project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/assembly_v2.fa'

#should not run for reference genome; this should just build by original snp call
ref_sp = '3935'

step1dir = 'step1_cmds'
cmn.mkdir(step1dir)

#fns = cmn.getid(fn)
fns = cmn.cmd2lines(
    'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/realigned_reads.bam'
)
finished_pileups = cmn.cmd2lines(
    'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/*.pileup'
)
# sample ids that already produced a vcf
finished = set(
    [cmn.lastName(i).split('_')[0] for i in cmn.cmd2lines('ls */*.vcf')])
# working dirs that already completed step1 (realigned bam exists)
step1_finished = set([
    cmn.lastName(i).split('/')[0]
    for i in cmn.cmd2lines('ls */realigned_reads.bam')
])
cwd = os.getcwd() #1. read in info refs = set([]) rdict = {} for line in cmn.file2lines(finfo): sp, fastq, ref = line.strip().split() try: rdict[sp].append((fastq, ref)) except KeyError: rdict[sp] = [(fastq, ref)] refs.add(ref) #2. prepare reference jobs refdir = '/work/biophysics/mtang/SNP_calling/indexed_references' cmn.mkdir(refdir) os.chdir(refdir) index_cmds = ['cd %s' % refdir] for ref in refs: if not os.path.exists(cmn.lastName(ref)): #cmn.run('ln -s %s' % ref) cmn.run('cp %s %s/' % (ref, refdir)) ref = cmn.lastName(ref) reflabel = ref.replace('.fa', '') checkFn = reflabel + '.pac' if cmn.filexist(checkFn): print('found finished ref for %s, skip it' % ref) continue cmd = '/home2/wli/local/bwa-0.7.12/bwa index %s -p %s &' % (ref, reflabel) index_cmds.append(cmd)