def find_tree_file(wdir, name):
    """Return the first tree file listed for *wdir*.

    Files matching ``<wdir>/*renamed`` are preferred; when none are listed,
    files matching ``<wdir>/<name>*tre`` are used instead. Both listings come
    from ``ls -t``, so the first entry is the most recently modified one.

    Prints an error and terminates the program with exit status 1 when
    neither pattern matches anything.
    """
    candidates = cmn.cmd2lines('ls -t %s/*renamed' % wdir)
    if not candidates:
        candidates = cmn.cmd2lines('ls -t %s/%s*tre' % (wdir, name))
    if not candidates:
        print('Error! can not find tree in %s' % wdir)
        # a bare sys.exit() would report success (status 0) to the caller
        sys.exit(1)
    return candidates[0]
def backup_finalStat(wdir):
    """Copy the ``*.report`` files of *wdir* into the final_stats archive.

    Reports whose name (minus ``_stat.report``) has no ``_`` are skipped.
    A report that is not archived yet is copied as is. Otherwise the new
    report replaces the archived one only when ``count_final_stat`` gives it
    fewer NA, or the same NA count and a larger data amount. Finally all
    archived reports are concatenated into ``allstat.txt``.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/final_stats/'
    reports = cmn.cmd2lines('ls %s/*.report| grep -v all_genome' % wdir)
    for fn in reports:
        print('processing %s...' % fn)
        fnlabel = cmn.lastName(fn)

        # don't back up the ones without species
        if '_' not in fnlabel.replace('_stat.report', ''):
            print('skip the fasta without sp for %s' % fn)
            continue

        dn = '%s/%s' % (ddir, fnlabel)
        if not os.path.exists(dn):
            cmn.run('cp %s %s' % (fn, dn))
            continue

        # keep the one with least NA and, on a tie, the larger data amount
        print('merging new and old data for %s' % fnlabel)
        Nold_na, Nold_data = count_final_stat(dn)
        Nnew_na, Nnew_data = count_final_stat(fn)
        fewer_na = Nnew_na < Nold_na
        same_na_more_data = Nnew_na == Nold_na and Nnew_data > Nold_data
        if fewer_na or same_na_more_data:
            cmn.run('cp %s %s' % (fn, dn))

    cmn.run('cd %s; cat *.report > allstat.txt' % ddir)
def old_log_newBaits_ifPossible(seqs):
    """Record new bait sequences and rebuild the verification BLAST database.

    *seqs* maps a sequence name to its sequence. A name that is neither a
    header of ``species_barcodes_4mapping.fa`` nor already stored in
    ``addedBaits_fromPipeline.fa`` is added, keeping only the slice
    ``[20:678]`` of its sequence. The added-baits file is then rewritten, the
    entries of ``all_barcodes_4verify.fa`` are merged in, that file is
    rewritten with cleaned names and ``-`` turned into ``N``, and
    ``makeblastdb`` is re-run on it.
    """
    barcode_dir = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes'
    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    takenNames = {each.strip()[1:] for each in cmn.cmd2lines('grep ">" %s' % fall)}

    fnew = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
    seqDict = read_fa(fnew)

    # Remember which names are added by this call. The original tested
    # "name not in seqDict" while iterating seqDict itself, which can never
    # be true, so the "saving ..." branch below was unreachable.
    newNames = set()
    for name in seqs:
        if name not in takenNames and name not in seqDict:
            seqDict[name] = seqs[name][20:678]
            newNames.add(name)

    with open(fnew, 'w') as dp:
        for name in seqDict:
            if name in newNames:
                print('saving %s into database...' % name)
            dp.write('>%s\n%s\n' % (name, seqDict[name]))

    fverify = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa'
    # entries already in the verify file take precedence over same-named baits
    seqDict.update(read_fa(fverify))
    with open(fverify, 'w') as dp:
        for name in seqDict:
            cleanName = name.replace('(assembled)', '').strip('.')
            dp.write('>%s\n%s\n' % (cleanName, seqDict[name].replace('-', 'N')))

    cmd = 'module add blast;cd %s; makeblastdb -in=all_barcodes_4verify.fa -dbtype=nucl; chmod a+w all_barcodes_4verify.*' % barcode_dir
    cmn.run(cmd)
def read_compare_file(fn, fcheck, ID):
    """Classify the comparison record of *ID* found in *fn*.

    The first line of *fn* that starts with ``ID`` followed by a tab is
    fetched with grep and matched against a fixed sequence of keywords; the
    first match decides the result. Returns ``(label, indices)`` where
    *indices* is a list holding 0, 1 or both. *fcheck* is not used.

    Raises IndexError when grep returns no line for *ID*.
    """
    cmd = 'grep -P \'^%s\t\' %s' % (ID, fn)
    line = cmn.cmd2lines(cmd)[0]

    #TODO: need to rework
    if 'same' in line:
        return 'mostConfident', [0]
    if 'diffGenus' in line:
        return 'diffGenus', [0, 1]
    if 'takenD' in line:
        return 'confident', [0]

    has_denovo = 'completeDenovo' in line
    if has_denovo and 'goodCC' in line:
        return 'confident', [0]
    if has_denovo:
        return 'denovoOnly', [0]
    if 'goodCC' in line:
        return 'goodRef', [1]

    # everything below is a somewhat problematic sample
    if 'Error' in line:
        return 'Error', [0, 1]
    if isHighRatio(line):
        return 'Suspicius', [0, 1]
    if 'Gap0' not in line:
        return 'PoorSample', [1]
    if 'noDenovo' in line:
        return 'refOnly', [0]
    return 'TODO', [0, 1]
def read_baits():
    """Load every ``baits/bait*.fa`` file into a dict.

    Each file must yield exactly two lines (header, sequence). The key is
    the header without its first character; the value is the sequence as a
    list of characters.
    """
    seqDict = {}
    for path in cmn.cmd2lines('ls baits/bait*.fa'):
        header, sequence = cmn.file2lines(path)
        seqDict[header[1:]] = list(sequence)
    return seqDict
def search_for_old_fastq(label, wdirs):
    """List files matching ``<wdir>/<label>*q`` for every directory in *wdirs*.

    Paths containing the module-level ``selfDir`` string are left out.
    ``ls`` errors are discarded, so directories without a match simply add
    nothing.
    """
    found = []
    for wdir in wdirs:
        listing = cmn.cmd2lines('ls %s/%s*q 2> /dev/null' % (wdir, label))
        found.extend(path for path in listing if selfDir not in path)
    return found
def prune_tree(ftree, fseq):
    """Prune the tree in *ftree* down to the IDs found in FASTA file *fseq*.

    The IDs are whatever follows the first ``>`` on each header line. The
    pruned tree is written with ``format=1`` to ``prune_tree.tre`` in the
    current directory, and that file name is returned.
    """
    tree = ete3.Tree(ftree)
    keep_ids = cmn.cmd2lines('grep ">" %s|cut -d ">" -f 2' % fseq)
    tree.prune(keep_ids)

    dn = 'prune_tree.tre'
    newick = tree.write(format=1)
    cmn.write_file(newick, dn)
    return dn
def parse_fastq_dir(wdir):
    """Group the ``<wdir>/*q`` files by the first ``_``-separated name field.

    Returns a dict mapping that field to the list of matching paths, in
    listing order.
    """
    adict = {}
    for path in cmn.cmd2lines('ls %s/*q' % wdir):
        sp = cmn.lastName(path).split('_')[0]
        adict.setdefault(sp, []).append(path)
    return adict
def do_barcode_blast(sequence):
    """BLAST a FASTA record against the species barcode database.

    *sequence* is the FASTA text of the query; its header supplies the name
    of the temporary query file ``/tmp/<label>.fa``. Returns the blastn
    output lines in the tabular format
    ``sseqid qlen slen length pident``.
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_NoN_0.95.fasta'

    # label = first header token without its leading character, cut at the
    # first '|' and '[', with '*' and quotes removed
    header = sequence.split()[0][1:]
    namelabel = header.split()[0].split('|')[0].replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')
    # a '/' in the label would point the query file into a sub-directory of
    # /tmp that does not exist
    namelabel = namelabel.replace('/', '_')

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)

    cmd = 'module add blast; blastn -query %s -db %s ' % (fquery, fdb)
    cmd += '-outfmt \'6 sseqid qlen slen length pident\''
    lines = cmn.cmd2lines(cmd)
    # NOTE: the query file is intentionally left in /tmp (removal is disabled)
    #cmn.run('rm %s' % fquery)
    return lines
def find_annotation(CG):
    """Return the comma-joined annotations grepped for *CG*.

    ``'NA'`` is passed through unchanged. Otherwise every line of
    ``gene_association.fb`` matching *CG* contributes its 10th tab-separated
    field; duplicates are collapsed. The order of the joined values is the
    iteration order of a set.
    """
    if CG == 'NA':
        return 'NA'

    fn = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/gene_association.fb'
    hits = cmn.cmd2lines('grep %s %s' % (CG, fn))
    annos = {hit.split('\t')[9] for hit in hits}
    return ','.join(annos)
def get_names():
    """Build a dict mapping a sample ID to its underscore-joined record.

    Every ``*.sampleData`` file is read in ``ls -tr`` order, so an entry from
    a file listed later overwrites an earlier one with the same ID. The ID
    is the part of the first column after its last ``-``. In the stored
    record the first column is replaced by the ID, ``-`` becomes ``_``,
    parentheses are removed and whitespace runs become single ``_``.
    """
    adict = {}
    fns = cmn.cmd2lines('ls -tr /project/biophysics/Nick_lab/wli/sequencing/scripts/data/*.sampleData')
    for fn in fns:
        for line in cmn.file2lines(fn):
            line = line.strip()
            items = line.split()
            if not items:
                # blank line: there is no first column to take the ID from
                continue
            sp = items[0].split('-')[-1]
            line = line.replace(items[0], sp).replace('-', '_').replace('(', '').replace(')', '')
            adict[sp] = '_'.join(line.split())
    return adict
def do_barcode_blast(sequence, seqDict):
    """BLAST a FASTA record against a database built from *seqDict*.

    The database comes from ``makeBlastDatabase(seqDict)``. The query is
    written to ``/tmp/<label>.fa``, where the label is derived from the
    FASTA header, and the file is removed after the search. Returns the
    blastn output lines in the tabular format
    ``sseqid qlen slen length pident``.
    """
    #fref = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    #fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/added_from_customBaits.baitInfo'
    #fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_NoN_0.95.fasta'
    fdb = makeBlastDatabase(seqDict)

    # label = first header token without its leading character, cut at the
    # first '|' and '[', with '*' and quotes dropped and '/' made path-safe
    header = sequence.split()[0][1:]
    namelabel = header.split()[0].split('|')[0].replace('*', '').split('[')[0]
    for unwanted, substitute in (('"', ''), ("'", ''), ('/', '_')):
        namelabel = namelabel.replace(unwanted, substitute)

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)

    blast_cmd = "module add blast; blastn -query %s -db %s -outfmt '6 sseqid qlen slen length pident'" % (fquery, fdb)
    hits = cmn.cmd2lines(blast_cmd)
    cmn.run('rm %s' % fquery)
    return hits
def get_query_sequence(seqDict, genus, sp):
    """Choose a barcode from *seqDict* for *genus* / *sp*.

    Returns ``(fasta, qlen)``: the chosen record as FASTA text and the
    length of its sequence with every ``N`` removed.

    Selection order:
      1. if grepping *sp* in the Eudaminae reference file gives exactly one
         line, the first token of that line is the chosen name;
      2. otherwise names containing *genus* (all names when there is none),
         narrowed to those containing *sp*, then to those starting with
         ``*`` or, failing that, containing ``*``; each narrowing is applied
         only when it leaves at least one name;
      3. among the remaining names the one with the longest sequence wins.
    """

    def _report(name):
        # build the return value and log the choice
        seq = seqDict[name]
        fasta = '>%s\n%s\n' % (name, seq)
        qlen = len(seq.replace('N', ''))
        print('pick %s for %s %s' % (name, genus, sp))
        return fasta, qlen

    #1. anything in Eudamine file has higher priority
    fEud = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/Eudaminae-barcode-reference.txt'
    lines = cmn.cmd2lines('grep %s %s' % (sp, fEud))
    if len(lines) == 1:
        return _report(lines[0].split()[0])

    #look it up in other files
    names = list(seqDict.keys())
    good_names = [candidate for candidate in names if genus in candidate]
    if not good_names:  #sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        good_names = names

    if len(good_names) > 1:
        #try to refine it
        with_sp = [candidate for candidate in good_names if sp in candidate]
        if with_sp:
            good_names = with_sp

        #try to see if type species is there
        starred = [candidate for candidate in good_names if candidate[0] == '*']
        if not starred:
            starred = [candidate for candidate in good_names if '*' in candidate]
        if starred:
            good_names = starred

    #then pick the first one of maximal length
    # NOTE(review): replacing 'N' by '-' does not change the length, so this
    # ranks by full sequence length -- confirm whether N-free length was meant
    best = max(good_names, key=lambda x: len(seqDict[x].replace('N', '-')))
    return _report(best)
def parse_info_file(wdir):
    """Group the lines of every ``<wdir>/fastq*`` file by sample key.

    Each stripped line is keyed by the first ``_``-separated field of
    ``cmn.lastName(line)``. Returns a dict mapping each key to the set of
    distinct lines seen for it.
    """
    fns = cmn.cmd2lines(' ls %s/fastq*' % wdir)
    adict = {}
    for fn in fns:
        # the context manager closes the file even if a line fails to parse
        with open(fn) as fp:
            for line in fp:
                line = line.strip()
                sp = cmn.lastName(line).split('_')[0]
                adict.setdefault(sp, set()).add(line)
    return adict
def read_rep():
    """Return a dict mapping a scaffold name to its set of repeat positions.

    The result is cached in ``rep.dict.pkl``; when that file exists it is
    loaded and returned directly. Otherwise every
    ``annotation_repeats/*.gff3`` file is parsed: column 1 is the scaffold
    and columns 4 and 5 are the integer bounds ``i`` and ``j`` of a feature,
    which contributes the positions ``range(i, j)``.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # the original read from the undefined name "fn" instead of frep
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            if not items or items[0].startswith('#'):
                # blank lines and GFF3 directive/comment lines carry no feature
                continue
            scaf = items[0]
            i, j = map(int, items[3:5])
            # NOTE(review): range(i, j) leaves out position j -- confirm
            # that an exclusive end is intended for these coordinates
            repdict.setdefault(scaf, set()).update(range(i, j))

    cmn.pickle_write(repdict, dn)
    return repdict
def split_and_order_sp_byTree(sampleIDs):
    """Split *sampleIDs* by tree project and order each part by its tree.

    For every project returned by ``get_newest_tree_dir`` the ``NickList``
    of the chosen directory is read, with ``NVG-`` removed and ``LEP-``
    turned into ``LEP``. When it shares IDs with *sampleIDs*, those IDs are
    ordered with ``order_ID_byTree`` on the project's tree file. Returns a
    dict mapping project name to the ordered IDs; projects without any
    shared ID are left out.
    """
    treeDir = '/project/biophysics/Nick_lab/mtang/building_trees'
    wdir_dict = get_newest_tree_dir(cmn.cmd2lines('ls %s' % treeDir))

    rdict = {}
    for projectName, newest in wdir_dict.items():
        wdir = '%s/%s' % (treeDir, newest)
        print('pick %s for %s' % (wdir, projectName))

        listed = set()
        for raw in cmn.file2lines('%s/NickList' % (wdir)):
            listed.add(raw.replace('NVG-', '').replace('LEP-', 'LEP'))

        overlapIDs = listed & set(sampleIDs)
        if not overlapIDs:
            continue
        ftree = find_tree_file(wdir, projectName)
        rdict[projectName] = order_ID_byTree(ftree, overlapIDs)
    return rdict
def backup_vcf_coverage(wdir):
    """Copy the ``*_vcf.cov`` files of *wdir* into the coverage archive.

    A file whose last line does not have 6 whitespace-separated fields is
    treated as old format and skipped. A file that is not archived yet is
    copied as is; otherwise it replaces the archived copy only when the
    second-to-last field of its last line is the larger number.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/check_vcf_coverage'
    covFiles = cmn.cmd2lines('ls %s/*_vcf.cov' % wdir)

    for fn in covFiles:
        print('processing %s...' % fn)
        lines = cmn.file2lines(fn)

        #1. only back up the new version of cov file
        if len(lines[-1].strip().split()) != 6:
            print('skip old format file %s' % fn)
            continue

        fnlabel = cmn.lastName(fn)
        dn = '%s/%s' % (ddir, fnlabel)
        if not os.path.exists(dn):
            cmn.run('cp %s %s' % (fn, dn))
            continue

        print('merging new and old data for %s' % fnlabel)
        covOld = float(cmn.file2lines(dn)[-1].split()[-2])
        cov = float(lines[-1].split()[-2])
        if cov > covOld:
            cmn.run('cp %s %s' % (fn, dn))
def backup_fasta(wdir):
    """Copy the ``*_m2s.fa`` files of *wdir* into the map2fasta archive.

    Files whose name, minus the ``_snp_step2[_MITO]_m2s.fa`` suffix, has no
    ``_`` are skipped. A file that is not archived yet is copied as is;
    otherwise it replaces the archived copy only when ``count_fasta_nonGap``
    returns a larger value for it.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/map2fasta'
    fastas = cmn.cmd2lines('ls %s/*_m2s.fa| grep -v all_genome' % wdir)

    for fn in fastas:
        print('processing %s...' % fn)
        fnlabel = cmn.lastName(fn)

        # don't back up the ones without species
        stem = fnlabel.replace('_snp_step2_MITO_m2s.fa', '')
        stem = stem.replace('_snp_step2_m2s.fa', '')
        if '_' not in stem:
            print('skip the fasta without sp for %s' % fn)
            continue

        dn = '%s/%s' % (ddir, fnlabel)
        if not os.path.exists(dn):
            cmn.run('cp %s %s' % (fn, dn))
            continue

        # keep the least gapped one
        print('merging new and old data for %s' % fnlabel)
        Nold = count_fasta_nonGap(dn)
        Nnew = count_fasta_nonGap(fn)
        if Nnew > Nold:
            cmn.run('cp %s %s' % (fn, dn))
#main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] Ncores = int(sys.argv[2]) except: print("Usage: *.py fa Ncores", file=sys.stderr) sys.exit() #if the nodes are less than 4 taxa, produce a random tree cmd = "grep '>' %s" % (fn) lines = [ each[1:].strip() for each in cmn.cmd2lines(cmd) if each.strip() != '' ] N = len(lines) if N < 4: print('Warning: fastme can not make tree of less than 4 taxa') print('Warning: so I make a fake tree...') dn = '%s.phylip.fastme.tre' % cmn.lastName(fn) if N == 1: info = '(%s);\n' % lines[0] if N == 2: a, b = lines info = '(%s,%s);\n' % (a, b) elif N == 3: a, b, c = lines info = '((%s,%s),%s);\n' % (a, b, c)
sys.exit() #step 1 including 1. making the pileup file, 2. correct bias, 3. sam map again 4. snp call till last step #step 2: just the snp call using multiple CPUs #only put 15 jobs in a node to avoid memmory problem f_ass = '/project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/assembly_v2.fa' #should not run for reference genome; this should just build by original snp call ref_sp = '3935' step1dir = 'step1_cmds' cmn.mkdir(step1dir) #fns = cmn.getid(fn) fns = cmn.cmd2lines( 'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/realigned_reads.bam' ) finished_pileups = cmn.cmd2lines( 'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/2_gatk/*/*.pileup' ) finished = set( [cmn.lastName(i).split('_')[0] for i in cmn.cmd2lines('ls */*.vcf')]) step1_finished = set([ cmn.lastName(i).split('/')[0] for i in cmn.cmd2lines('ls */realigned_reads.bam') ]) template = cmn.txt_read( '/project/biophysics/Nick_lab/wli/sequencing/myAnalysis/clean_ref_bias/1_gatk_runs/step1_job.template' )
fn=sys.argv[1] except: print("Usage: *.py NickList requred_keywords", file=sys.stderr) sys.exit() words = sys.argv[2:] IDs = set(cmn.file2lines(fn)) #note: cne is equal to 3574_assembly_v1 wdirs = [ '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/map2fasta', '/project/biophysics/Nick_lab/mtang/unbias_SNPs/*/step4_postprocessing/map2fasta' ] alea_list = cmn.cmd2lines('ssh [email protected] "ls /archive/butterfly/unbias_pipeline_info/step4_postprocessing/map2fasta/*MITO*.fa"') missing = [] falist = [] required = '' if len(words) != 0: required = '|' + '|'.join(['grep %s' % word for word in words]) faDict = {} for ID in IDs: taken = [] for wdir in wdirs: cmd = 'ls %s/%s*_m2s.fa 2> /dev/null| grep MITO %s' % (wdir, ID, required) taken += cmn.cmd2lines(cmd)
except: print("Usage: *.py NickList requred_keywords", file=sys.stderr) sys.exit() words = sys.argv[2:] IDs = set(cmn.file2lines(fn)) #note: cne is equal to 3574_assembly_v1 wdirs = [ '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/vcf2map', '/project/biophysics/Nick_lab/mtang/unbias_SNPs/*/step4_postprocessing/vcf2map' ] alea_list = cmn.cmd2lines( 'ssh [email protected] "ls /archive/butterfly/unbias_pipeline_info/step4_postprocessing/vcf2map/*.map| grep -v mitogenome"' ) missing = [] falist = [] required = '' if len(words) != 0: for word in words: word = word.strip() if word == '3574_assembly_v1' or word == 'cne': word = '"3574_assembly_v1\|cne"' required += '| grep %s' % word words = set(words)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Count the records in the *.sam files of the best mapping directory and
# print: sample, best directory, number of aligned records, total records.
if __name__ == '__main__':
    #options=parse_options()
    try:
        wdir = sys.argv[1]
    except IndexError:
        # only a missing argument is a usage error; anything else propagates
        print("Usage: *.py wdir", file=sys.stderr)
        sys.exit()

    total = 0
    unaligned = 0

    bestDir = cmn.txt_read('%s/best_mapping.txt' % wdir)
    fns = cmn.cmd2lines('ls %s/%s/*.sam' % (wdir, bestDir))
    # the sample label is the path component two levels above the sam file
    sp = fns[0].split('/')[-3]

    for fn in fns:
        # the context manager closes the file even if a record is malformed
        with open(fn) as fp:
            for line in fp:
                if line[0] == '@':
                    # header line, not a record
                    continue
                total += 1
                # a '*' in the third column is counted as unaligned
                if line.strip().split()[2] == '*':
                    unaligned += 1

    print(sp, bestDir, (total - unaligned), total)
import sys import os python_lib = '/work/biophysics/mtang/SNP_calling/scripts' if python_lib not in sys.path: sys.path.append(python_lib) import cmn #1. read in data fns = cmn.getid(sys.argv[1]) falist = cmn.cmd2lines('ls *m2s.fa') finished_maps = set([fn.replace('_m2s.fa', '.map') for fn in falist]) isGood = True cmds = [] for fn in fns: label = cmn.lastName(fn) if label in finished_maps: continue isGood = False if 'MITO' in label: cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta_mito.py %s' % fn else: cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta.py %s' % fn cmds.append(cmd)
if python_lib not in sys.path: sys.path.append(python_lib) import cmn import os from fullname_lib import get_names_4barcode #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #infoLines = cmn.cmd2lines('head -n 1 sampleRun*/rescued_read_assembled_mis1*.txt') IDlist = cmn.cmd2lines("ls -d sampleRun_* |grep -v fake|cut -d '_' -f 2") nameDict = get_names_4barcode() for ID in IDlist: items = nameDict[ID].replace('?', '').split() ID, genus, sp = items[:3] print('sampleInfo', ID, genus, sp) fn = 'sampleRun_%s/good_read_assembled.txt' % ID #label = '%s_%s' % (genus, sp) cmd = 'head %s -n 2| grep %s' % (fn, genus) print(cmd) info = cmn.cmd2info(cmd).strip() if info == '': print('please re-run', ID, genus, sp)
def group_fastq(fns): adict = {} for fn in fns: key = cmn.lastName(fn).split('_')[0] try: adict[key].append(fn) except KeyError: adict[key] = [fn] return adict #~~~~~~~~main~~~~~~~~~~~~~~# wdir = sys.argv[1].rstrip('/') selfDir = os.path.abspath(wdir) fastqs = cmn.cmd2lines('ls %s/*q' % wdir) print(fastqs) log_info = [] outdir = '%s/fastq_thisBatch' % wdir cmn.mkdir(outdir) hasCombined = False for fastq in fastqs: label = '.'.join(cmn.lastName(fastq).split('.')[:-1]) print('processing %s' % label) old_fastqs = search_for_old_fastq(label, search_dirs) dn = '%s/%s' % (outdir, cmn.lastName(fastq)) if not os.path.exists(dn): cmn.run('mv %s %s' % (fastq, dn)) else:
except: print( "Usage: *.py ../step1_gather_data/mapping_info.txt ../step2_bwa_mapping ../step1_gather_data/require_SNPs.dict.pkl TACC_IDs", file=sys.stderr) sys.exit() cwd = os.getcwd() #subsetIDs = set(cmn.getid(fsubset)) subsetJobs = set([ cmn.lastName(line.replace('sbatch', '').strip())[4:-4] for line in cmn.file2lines(fsubset) ]) #1. read in info fsams = cmn.cmd2lines('ls %s/*/*/*.sam' % mapdir) #print fsams samdirs = set(['/'.join(fsam.split('/')[:-2]) for fsam in fsams]) #print samdirs require_refs = cmn.pickle_read(freq) fq_dict = {} refdict = {} #1. tell by reftable #make the requirement by the reftable for line in cmn.file2lines(freftable): items = line.strip().split() sp = items[0] fastqs = items[1].split(',') fq_dict[sp] = fastqs
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# For every file in the input directory, link it into split_run_<label>/ and
# generate one RAxML job file under job_files/.
if __name__ == '__main__':
    #options=parse_options()
    try:
        indir, label = sys.argv[1:3]
    except ValueError:
        # fewer than two arguments: the slice cannot be unpacked
        print("Usage: *.py ../1_process_alignment/noGap_splits noGap",
              file=sys.stderr)
        sys.exit()

    # resolve the inputs before changing directory so the links stay valid
    fns = [os.path.abspath(i) for i in cmn.cmd2lines('ls %s/*' % indir)]

    wdir = 'split_run_%s' % label
    cmn.mkdir(wdir)
    os.chdir(wdir)
    cmn.mkdir('job_files')

    for count, fn in enumerate(fns):
        cmn.run('ln -s %s' % fn)
        fn_new = cmn.lastName(fn)
        # files of an earlier run with the same run name are removed first
        cmd = 'rm *%s_%s; /home2/wli/local/RAxML/raxmlHPC-PTHREADS-SSE3 -m GTRGAMMA -s %s -n %s_%s -p 7112 -T 48 ' % (
            label, count, fn_new, label, count)
        dn = 'job_files/sg%s.job' % count
        cmn.run('/home2/wli/my_programs/make_job.py "%s" -p 256GB -t 33 > %s' % (cmd, dn))
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py *.report", file=sys.stderr) sys.exit() fcomp = 'compare.check' cmd = 'grep -v same compare.check' lines = cmn.cmd2lines(cmd) #exclude those with gap0 and nothing in report file badSp = set([]) for line in cmn.file2lines(fn): if line[0] == '#': continue sp = line.split()[0].split('_')[0] badSp.add(sp) for line in lines: items = line.strip().split() sp = items[0] if sp not in badSp and items[2] == 'Gap0': if 'no' in items[-1]: line = 'goodReport:' + line
stackSeq = parse_del_p(stackSeq, del_p) cleanSeq = parse_del_p(cleanSeq, del_p) threadSeq = parse_del_p(threadSeq, del_p) return threadSeq, stackSeq, cleanSeq def parse_del_p(seq, del_p): new = [char for i, char in enumerate(seq) if i not in del_p] return ''.join(new) if __name__ == '__main__': #olines = cmn.cmd2lines('grep "thread\|stack" sampleRun_*/good_read_assembled.txt') #lines = cmn.cmd2lines('grep -H "threaded_\|stack_\|clean_" sampleRun_*/rescued_read_assembled_mis1*.txt') wdirs = cmn.cmd2lines('ls -d sampleRun_*') #cmn.mkdir('sampleRun_fake') #cmn.run('touch sampleRun_fake/good_read_assembled.txt') #cmn.run('touch sampleRun_fake/rescued_read_assembled_mis1.txt') #time.sleep(2) cmn.run('rm cannot_fixed_indel.txt 2> /dev/null') #frecords = cmn.cmd2lines('ls sampleRun_*/pickingLog.txt') stack_lines = {} thread_lines = {} clean_lines = {} leftN = 20 barcodeLength = 658 for wdir in wdirs: