def grep_reads(read, f_libs, direction):
    """Collect the reads fished out of every library by grep_reads.py.

    One background ``grep_reads.py`` invocation is queued per library, the
    resulting job script is executed with bash, and everything found under
    ``grep_out/`` is concatenated into ``all_grep_reads.txt``.

    Args:
        read: query handed to grep_reads.py as its first argument.
        f_libs: iterable of library file names; one grep job is made for each.
        direction: handed to grep_reads.py unchanged as its third argument.

    Returns:
        Whatever ``cmn.getid`` yields for ``all_grep_reads.txt``
        (presumably the list of fished reads -- confirm against cmn).
    """
    # NOTE: the read is used exactly as given; an earlier step that reversed
    # it through ``rdict`` is disabled.
    template = '/project/biophysics/Nick_lab/wli/sequencing/scripts/grep_reads.py %s %s %s &'
    job_lines = [template % (read, lib, direction) for lib in f_libs]
    # every grep runs in the background, so the job must wait for all of them
    job_lines.append('\nwait;\n')

    job_script = 'grep_read.job'
    cmn.write_lines(job_lines, job_script)
    cmn.run('bash %s ' % job_script)

    # the output dir of the grep jobs is grep_out
    merged = 'all_grep_reads.txt'
    cmn.run('cat grep_out/* > %s' % merged)
    return cmn.getid(merged)
#options=parse_options() try: fref, fqlist = sys.argv[1:3] except: print("Usage: *.py sampleInfo.baits fqlist", file=sys.stderr) sys.exit() #add primer if not added ref_seqs, toAddDict = read_baits(fref) #log the baits into the dataset log_newBaits_ifPossible(ref_seqs) #index ref here frefs = parse_ref(ref_seqs) fqlist = cmn.getid(fqlist) fq_groups = group_fq(fqlist) N = cmn.cpu_check() bwa_cmds = ['module add bwa'] for reflabel in frefs: fref = frefs[reflabel] fnlabel = cmn.lastName(fref).replace('.fa', '') for sp in fq_groups: R1, R2, single = fq_groups[sp] cmd = 'bwa mem -t %s -B 2 -M %s %s %s | grep "%s" > %s_paired_%s_mapped.sam ' % ( N, fnlabel, R1, R2, reflabel, sp, fnlabel) bwa_cmds.append(cmd) cmd = 'bwa mem -t %s -B 2 -M %s %s | grep "%s" > %s_single_%s_mapped.sam ' % ( N, fnlabel, single, reflabel, sp, fnlabel)
import sys
python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)
import cmn
import os

# Print (but do not run) one `mv` command per VCF listed in the file given as
# argv[1]. Each VCF is renamed to <sample>_<reference>_snp_step2.vcf, where
# <reference> is the one with the highest mapped-read count among the records
# grepped from <root>/step2_bwa_mapping/mapped_reads_count/*.
# Errors go to stderr so that stdout contains nothing but `mv` commands.
vcf_list = [os.path.abspath(fn) for fn in cmn.getid(sys.argv[1])]

for fn in vcf_list:
    items = fn.split('/')
    # the mapping results live three path components above the VCF file name
    parent_dir = '/'.join(items[:-3])
    step2_dir = '%s/step2_bwa_mapping/mapped_reads_count' % parent_dir
    # sample ID = file name up to the first underscore
    sp = cmn.lastName(fn).split('_')[0]
    lines = cmn.cmd2lines('grep %s %s/*' % (sp, step2_dir))

    # keep the reference with the largest mapped-read count
    maxRef = (None, 0)
    for line in lines:
        fields = line.strip().split()
        if not fields:
            # blank output line: nothing to parse
            continue
        # exactly four columns are expected; only the reference name and the
        # mapped count are used
        a, ref, mapN, totalN = fields
        mapped = int(mapN)
        if mapped > maxRef[1]:
            maxRef = [ref, mapped]
    ref = maxRef[0]
    #ref = 'Junonia_v2_withMito'

    if ref is None:
        # No record (or no non-zero count) for this sample: do not emit a
        # rename to a bogus "<sample>_None_snp_step2.vcf".
        print('Error for %s' % fn, file=sys.stderr)
        continue

    parent_dir = '/'.join(items[:-1])
    new_vcf = '%s/%s_%s_snp_step2.vcf' % (parent_dir, sp, ref)
    cmd = 'mv %s %s' % (fn, new_vcf)
    print(cmd)
#options=parse_options() try: fn, fadd = sys.argv[1:3] except: print("Usage: *.py aln repID.file", file=sys.stderr) sys.exit() #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps' #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs' #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID) # if i.strip() != '']) #fadd = 'added_sps' #if cmn.filexist(fadd): # print 'found local list, add them in' goodIDs = set([each for each in cmn.getid(fadd) if each[0] != '#']) dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa', '') + '_taken.fa' dp = open(dn, 'w') new = [] leftIDs = set(goodIDs) with open(fn) as fp: for line in fp: if line[0] == '>': #name = line[1:].strip().strip().split('_')[0].replace('flt', '').split('Dup')[0] name = line[1:].strip().split()[0] if name in goodIDs: isGood = True if name in leftIDs:
import sys
import os
python_lib = '/work/biophysics/mtang/SNP_calling/scripts'
if python_lib not in sys.path:
    sys.path.append(python_lib)
import cmn

# Build the list of map2fasta commands for every map file (listed in the file
# given as argv[1]) that does not yet have a matching *_m2s.fa in the current
# directory.

#1. read in data
fns = cmn.getid(sys.argv[1])
falist = cmn.cmd2lines('ls *m2s.fa')
# name of the map file each finished fasta was produced from
finished_maps = {fa.replace('_m2s.fa', '.map') for fa in falist}

# command templates: mitochondrial maps use their own converter
mito_template = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta_mito.py %s'
plain_template = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta.py %s'

isGood = True  # stays True only when every map has already been converted
cmds = []
for fn in fns:
    label = cmn.lastName(fn)
    if label not in finished_maps:
        isGood = False
        template = mito_template if 'MITO' in label else plain_template
        cmd = template % fn
        cmds.append(cmd)
try: fn, fadd = sys.argv[1:3] except: print("Usage: *.py aln repID.file", file=sys.stderr) sys.exit() #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps' #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs' #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID) # if i.strip() != '']) #fadd = 'added_sps' #if cmn.filexist(fadd): # print 'found local list, add them in' goodIDs = set( [each.split('_')[0] for each in cmn.getid(fadd) if each[0] != '#']) dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa', '') + '_taken.fa' dp = open(dn, 'w') new = [] leftIDs = set(goodIDs) with open(fn) as fp: for line in fp: if line[0] == '>': name = line[1:].strip().strip().split('_')[0].replace( 'flt', '').split('Dup')[0].split('.Lere')[0] #name = line[1:].strip().split()[0] #name = line[1:].strip().split('.Lerema')[0] if name in goodIDs:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: odir, f_ass = sys.argv[1:3] except: print("Usage: *.py filelist assembly_v0.fa", file=sys.stderr) print("you should index assembly_v0.fa first with -p assembly_v0", file=sys.stderr) print("using command /home2/wli/local/bwa-0.7.12/bwa index ", file=sys.stderr) sys.exit() #fns = cmn.cmd2lines('ls %s/*.fq' % odir) fns = cmn.getid(odir) group_dict = separate_by_label(fns) ass_label = cmn.find_between(cmn.lastName(f_ass), 'assembly_', '.fa') cmn.mkdir('job_files') cmn.mkdir('cmd_files') for plabel in group_dict: print('processing lib %s' % plabel) each = group_dict[plabel] #also parse the files inside this function #return the file name after parsing paired, unpaired = separate_by_pair(plabel, each) if paired == None:
sys.path.append(python_lib) import cmn import os import time def get_current_jobs(label, user): cmd = 'squeue| grep %s| grep g%s|wc -l' % (user, label) N = cmn.cmd2info(cmd).split()[0] N = int(N) return N fn = 'forked_jobs.list' jobs = cmn.getid(fn) cores = int(sys.argv[1]) user = cmn.cmd2info('echo $USER').strip() user_label = user[0] currentN = get_current_jobs(user_label, user) os.chdir('job_files') todo = list(jobs) while(len(todo) != 0): fjob = todo[0] currentN = get_current_jobs(user_label, user)
for a0, a1, a2 in [aset, bset]: if a1 == None and a0 != None and a2 != None: return True return False #no indel if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py samlist", file=sys.stderr) sys.exit() fns = cmn.getid(fn) cmn.run('rm hasDeletion 2> /dev/null') read_dict = {} bad_alignments = [] seqDict = {} for fn in fns: print('parsing %s...' % fn) try: samfile = pysam.AlignmentFile(fn) except: print('skip empty file %s' % fn) continue for record in samfile:
sys.path.append(python_lib) import cmn import os def group_list(alist): adict = {} for fn in alist: sp = cmn.lastName(fn).split('_')[0] try: adict[sp].append(fn) except KeyError: adict[sp] = [fn] return adict fq_list = cmn.getid(sys.argv[1]) vcf_list = cmn.getid(sys.argv[2]) samdir_list = cmn.getid(sys.argv[3]) vcfCov_dir = cmn.getid(sys.argv[4])[0] #5737_3311_assembly_v1_stat.report finished = [cmn.lastName(each) for each in cmn.getid(sys.argv[5])] refresh = any([each=='-r' for each in sys.argv]) fq_groups = group_list(fq_list)# group by sp vcf_groups = group_list(vcf_list) cmds = [] for sp in vcf_groups: vcf_fns = vcf_groups[sp]
#main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py gap_see", file=sys.stderr) sys.exit() pop_map = {} fpops = cmn.cmd2lines( 'ls /project/biophysics/Nick_lab/wli/sequencing/general_info/P*IDs') for fpop in fpops: alist = cmn.getid(fpop) popname = cmn.lastName(fpop)[1:-3] for sp in alist: pop_map[sp] = popname #15101E04_snp.codeVcf_father 13629191 1986713 0.145768960168 adict = {} for line in cmn.file2lines(fn): items = line.split() sp = items[0].split('_')[0] gapF = float(items[-1]) try: pop = pop_map[sp] except: continue
print('Error! can not decide the reference for %s' % sp) print('Please remove the duplications in ') print('\n'.join(bestFns)) isbad = True ref = cmn.txt_read(bestFns[0]) adict[sp] = ref rset.add(ref) if isbad: sys.exit() return rset, adict #1. read in data fns = cmn.getid(sys.argv[1]) #bwa_dirs = [line.strip().rstrip('/') for line in cmn.getid(sys.argv[2])] #2. check which reference they used #sp is unique vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns} #6188_3842_assembly_v2_snp_step2.vcf #vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns} sps = list(vcf_dict.keys()) #ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs) ref_genomes, refmapping = set([]), {} for fn in fns: #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
import sys python_lib = '/work/biophysics/mtang/SNP_calling/scripts' if python_lib not in sys.path: sys.path.append(python_lib) import cmn import os jobs = [line.strip().split()[-1] for line in cmn.getid(sys.argv[1])] fromDir = sys.argv[2].rstrip('/') #the dir ends with step3 cwd = os.getcwd() cmn.mkdir('job_files') cmn.mkdir('step3_gatk') fromPdir = '/'.join(fromDir.split('/')[:-1]) cmn.run('ln -s %s/step2_bwa_mapping' % fromPdir) fjobs = [] #1. copy the directory to current for job in jobs: wdir = job[4:-4] current = '%s/%s' % (fromDir, wdir) cmd = 'cp -r %s step3_gatk' % current print('forking data for %s' % current) cmn.run(cmd) new = '%s/step3_gatk/%s' % (cwd, wdir) user = cmn.cmd2info('echo $USER').strip() user_label = user[0]
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fns = sys.argv[1:] except: print("Usage: *.py", file=sys.stderr) sys.exit() fqs = [] for fn in fns: fqs += cmn.getid(fn) sequences = [] for fq in fqs: print('reading %s' % fq) with open(fq) as fp: for i, line in enumerate(fp): if i % 4 == 1: sequences.append(line.strip()) maxlength = max(list(map(len, sequences))) for i in range(3, maxlength): print('checking %s' % i) sample_times = 10 check_seeds = generate_random_sequence(i, sample_times)
print('Error! can not decide the reference for %s' % sp) print('Please remove the duplications in ') print('\n'.join(bestFns)) isbad = True ref = cmn.txt_read(bestFns[0]) adict[sp] = ref rset.add(ref) if isbad: sys.exit() return rset, adict #1. read in data bwa_dirs = cmn.getid(sys.argv[1]) fgood = 'good_maps.txt' fbad = 'bad_maps.txt' mito = set([]) genome = set([]) for line in cmn.file2lines(fgood): sp = line.split('_')[0] if 'MITO' in line: mito.add(sp) else: genome.add(sp) refs, refdict = detect_ref_genomes( genome , bwa_dirs)
import cmn #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #fn = 'coding.fasta' fn = sys.argv[1] sampleIDs = set(cmn.getid(sys.argv[2])) print(sampleIDs) gapped = set([]) adict = {} with open(fn) as fp: for line in fp: if line[0] == '>': defline = line.strip() else: seq = line.strip() adict[defline] = seq if defline.split('_')[0][1:] in sampleIDs: count = 0 for char in seq: if char == '-' or char == 'N' or char == ',' or char == 'X':
print('Error! can not decide the reference for %s' % sp) print('Please remove the duplications in ') print('\n'.join(bestFns)) isbad = True ref = cmn.txt_read(bestFns[0]) adict[sp] = ref rset.add(ref) if isbad: sys.exit() return rset, adict #1. read in data fns = cmn.getid(sys.argv[1]) bwa_dirs = [line.strip().rstrip('/') for line in cmn.getid(sys.argv[2])] #2. check which reference they used #sp is unique vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns} #6188_3842_assembly_v2_snp_step2.vcf #vcf_dict = {cmn.lastName(fn).replace('_snp_step2.vcf', ''): fn for fn in fns} sps = list(vcf_dict.keys()) #ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs) ref_genomes, refmapping = set([]), {} for fn in fns: #../../step3_gatk/5729_3614_assembly_v1/5729_3614_assembly_v1_snp_step2.vcf fnlabel = cmn.lastName(fn).replace('_snp_step2.vcf', '')
adict[sp] = [fn] return adict def check_NA(label): fn = label + '_stat.report' line = cmn.file2lines(fn)[-1] items = line.strip().split() #return isWarnning if len(items) != 10 or 'NA' in items: return True else: return False fq_list = cmn.getid(sys.argv[1]) vcf_list = cmn.getid(sys.argv[2]) samdir_list = cmn.getid(sys.argv[3]) vcfCov_dir = cmn.getid(sys.argv[4])[0] report_files = cmn.cmd2lines('ls *_stat.report') finished_labels = set( [cmn.lastName(each).replace('_stat.report', '') for each in report_files]) refresh = any([each == '-r' for each in sys.argv]) fq_groups = group_list(fq_list) # group by sp vcf_groups = group_list(vcf_list) isGood = True
keys = list(adict.keys()) for key in keys: each = adict[key] if len(each) != 2: print('Error! number of libs is wrong for %s' % key) print('below are the detected libs:') print('\n'.join(each)) print('Please fix!') sys.exit() each.sort() adict[key] = each return adict #---------------main-------------------- fastqs = cmn.getid(sys.argv[1]) findex = sys.argv[2] ad_dict = make_ad_dict(findex) fastq_dict = group_fastqs(fastqs) cmds = [] for key in fastq_dict: R1, R2 = fastq_dict[key] try: ad1 = ad_dict[key] except KeyError: print('Error! missing data for %s' % key) continue
#options=parse_options() try: freftable, mapdir, freq = sys.argv[1:4] except: print( "Usage: *.py ../step1_gather_data/mapping_info.txt ../step2_bwa_mapping ../step1_gather_data/require_SNPs.dict.pkl", file=sys.stderr) sys.exit() cwd = os.getcwd() if not os.path.exists('bad_vcf.list'): print('Error! can not find info for bad vcf files!') sys.exit() badones = set(cmn.getid('bad_vcf.list')) #1. read in info fsams = cmn.cmd2lines('ls %s/*/*/*.sam' % mapdir) #print fsams samdirs = set(['/'.join(fsam.split('/')[:-2]) for fsam in fsams]) #print samdirs require_refs = cmn.pickle_read(freq) fq_dict = {} refdict = {} #1. tell by reftable #make the requirement by the reftable required = {} for line in cmn.file2lines(freftable): items = line.strip().split()
#options=parse_options() try: fn, fadd = sys.argv[1:3] except: print("Usage: *.py aln repID.file", file=sys.stderr) sys.exit() #fID = '/work/biophysics/wli/introgression2/4_filterIntro/rep_sps' #fID = '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/4_build_tree/pureIDs' #goodIDs = set([i.split()[0] for i in cmn.file2lines(fID) # if i.strip() != '']) #fadd = 'added_sps' #if cmn.filexist(fadd): # print 'found local list, add them in' goodIDs = set(cmn.getid(fadd)) dn = cmn.lastName(fn).replace('.fasta', '').replace('.fa', '') + '_taken.fa' dp = open(dn, 'w') new = [] leftIDs = set(goodIDs) with open(fn) as fp: for line in fp: if line[0] == '>': name = line[1:].strip().split('_')[0].split('-')[0] if name in goodIDs: isGood = True if name in leftIDs: leftIDs.remove(name)
if fq == '': print('Error! can not find fastq list file!') sys.exit() else: print('guessing fastq file to be %s' % fq) if fref == '': print('Error! can not find ref table file!') sys.exit() else: print('guessing ref table file to be %s' % fref) fq_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/fastq.filelist' if os.path.exists(fq_all): aset = set(cmn.getid(fq_all)) else: aset = set([]) bset = set(cmn.getid(fq)) newset = aset | bset newset = filter_best_fastq(newset) cmn.write_lines(newset, fq_all) fref_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/refTable.txt' if os.path.exists(fref_all): aset = set(cmn.getid(fref_all)) else: aset = set([])
import cmn
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    # Split an alignment table by read ID: rows whose first column is listed
    # in the good-reads file go to filtered_sam_aln.txt, all other rows go to
    # bad_sam_aln.txt.
    #options=parse_options()
    try:
        fn, fg = sys.argv[1:3]
    except ValueError:
        # fewer than two arguments: the slice cannot be unpacked
        print("Usage: *.py 2nd_sam_aln.txt good_reads.txt", file=sys.stderr)
        sys.exit()

    goodIDs = set(cmn.getid(fg))

    # Context managers close all three files even if a write fails midway.
    with open(fn) as fp, \
            open('filtered_sam_aln.txt', 'w') as dp, \
            open('bad_sam_aln.txt', 'w') as dbad:
        for line in fp:
            fields = line.split()
            if not fields:
                # a blank line carries no read ID; it belongs to neither file
                continue
            Id = fields[0]
            if Id in goodIDs:
                dp.write(line)
            else:
                dbad.write(line)
#options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py map pop1 pop2 ...", file=sys.stderr) sys.exit() poplist = sys.argv[2:] if len(poplist) == 0: print('please specify populations!') sys.exit() popdict = {} for fpop in poplist: IDs = cmn.getid(fpop) name = cmn.lastName(fpop) popdict[name] = IDs #for ID in IDs: # popdict[ID] = name #seqDict = {} #with open(fn) as fp: # for line in fp: # if line[0] == '>': # name = line[1:].split('_')[0] # else: # seq = line.strip() # try: # seqDict[name].append(seq) # except KeyError:
if __name__=='__main__':
    # Write one name per input line to the file 'sampleInfo': the entry of
    # get_names_4barcode() for the line's first token with double quotes
    # removed, or an empty line when the token has no entry.
    #options=parse_options()
    try:
        #fn, f_table = sys.argv[1:3]
        fn = sys.argv[1]
    except:
        print("Usage: *.py RAxML_bestTree.noGap", file=sys.stderr)
        sys.exit()

    nameDict = get_names_4barcode()
    missing = []  # first tokens that have no entry in nameDict
    labels = []
    lines = cmn.getid(fn)
    for line in lines:
        sp = line.strip().split()[0]
        try:
            label = nameDict[sp]
        except KeyError:
            # keep the output aligned with the input: one (empty) row per line
            missing.append(sp)
            label = ''
        else:
            label = label.replace('"', '')
        labels.append(label)

    info = '\n'.join(labels)
    cmn.write_file(info, 'sampleInfo')