def alnDict2output(aln_dict, dn, order='sorting'):
    """Write a ``name alignment`` table for ``aln_dict`` to the file ``dn``.

    Every row is ``<name> <alignment>``; names are left-justified to the
    width of the longest name.

    Args:
        aln_dict: mapping of sequence name to its alignment (anything that
            ``''.join`` accepts, e.g. a string or a list of characters).
        dn: name of the output file.
        order: ``'sorting'`` ranks rows by ``number4sorting(alignment)``;
            ``'grouping'`` ranks rows by ``group_by_spnames(name)`` (used to
            output inconsistent groups, ranked by species-ID grouping); any
            other value sorts the names themselves.

    Returns:
        None. An empty ``aln_dict`` only ``touch``es ``dn``.
    """
    if len(aln_dict) == 0:
        cmn.run('touch %s' % dn)
        return None

    # Target alignment width for '-' padding. It is pinned to 0, so the
    # padding branch in the loop below never fires and alignments are
    # written exactly as stored.
    pad_width = 0
    name_width = max(len(each) for each in aln_dict)
    row_name = '{:<%s}' % name_width

    if order == 'sorting':
        ordered = sorted(aln_dict, key=lambda x: number4sorting(aln_dict[x]))
    elif order == 'grouping':
        ordered = sorted(aln_dict, key=lambda x: group_by_spnames(x))
    else:
        ordered = sorted(aln_dict)

    rows = []
    for name in ordered:
        aln = aln_dict[name]
        shortfall = pad_width - len(aln)
        if shortfall > 0:
            aln += '-' * shortfall
        rows.append('%s %s\n' % (row_name.format(name), ''.join(aln)))
    cmn.write_file(''.join(rows), dn)
def old_log_newBaits_ifPossible(seqs):
    """Record previously unseen baits and rebuild the verification BLAST db.

    Names from ``seqs`` that are neither headers of the species-barcode
    mapping fasta nor already present in the added-baits fasta are stored
    (slice ``[20:678]`` of their sequence) in the added-baits fasta. The
    added baits are then merged into the verification fasta, which is
    rewritten and re-indexed with ``makeblastdb``.

    Args:
        seqs: mapping of sequence name to sequence (must support slicing).
    """
    fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'
    takenNames = {
        each.strip()[1:] for each in cmn.cmd2lines('grep ">" %s' % fall)
    }

    fnew = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
    seqDict = read_fa(fnew)

    # Remember which names are new *while* adding them. Testing
    # `name not in seqDict` afterwards (as the code used to) is always
    # false, so the "saving" message could never be reported.
    addedNames = set()
    for name in seqs:
        if name not in takenNames and name not in seqDict:
            seqDict[name] = seqs[name][20:678]
            addedNames.add(name)

    # Rewrite the full added-baits file: previous entries plus the new ones.
    with open(fnew, 'w') as dp:
        for name in seqDict:
            if name in addedNames:
                print('saving %s into database...' % name)
            dp.write('>%s\n%s\n' % (name, seqDict[name]))

    fverify = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa'
    # Entries already in the verification file win over the added baits.
    seqDict.update(read_fa(fverify))
    with open(fverify, 'w') as dp:
        for name in seqDict:
            cleanName = name.replace('(assembled)', '').strip('.')
            dp.write('>%s\n%s\n' % (cleanName, seqDict[name].replace('-', 'N')))

    cmd = 'module add blast;cd /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes; makeblastdb -in=all_barcodes_4verify.fa -dbtype=nucl; chmod a+w all_barcodes_4verify.*'
    cmn.run(cmd)
def parse_IDmapping_and_newDict(fn):
    """Rename the fasta ``fn`` and return ``(IDmapping, seqDict)``.

    Runs ``rename_fa_fullname.py`` on ``fn`` (its stdout goes to
    ``tmp.namelog``), reads ``fn + '.renamed'`` with ``read_fa`` and passes
    the resulting names to ``names2IDs``.

    Returns:
        Tuple of the ``names2IDs`` result and the dict read from the renamed
        fasta.
    """
    print('processing fasta names...')
    cmn.run(
        'source /home2/wli/.bash_profile; rename_fa_fullname.py %s > tmp.namelog'
        % fn)
    renamed = read_fa(fn + '.renamed')
    id_map = names2IDs(list(renamed.keys()))
    return id_map, renamed
def compute_mash_distance(f1, f2):
    """Run ``mash dist`` on ``f1`` and ``f2`` and return the reported distance.

    The command output is redirected to ``/tmp/<f1 name>-<f2 name>`` and the
    third whitespace-separated field of that file is returned unchanged,
    i.e. as a string. Uses the module-level ``cpu`` for the ``-p`` option.
    """
    global cpu
    out_path = '/tmp/%s-%s' % (cmn.lastName(f1), cmn.lastName(f2))
    cmn.run('/home2/wli/local/mash-Linux64-v1.1.1/mash dist -p %s %s %s > %s' %
            (cpu, f1, f2, out_path))
    fields = cmn.txt_read(out_path).strip().split()
    return fields[2]
def makeBlastDatabase(seqDict):
    """Write ``seqDict`` to ``db4picking.fa`` and index it with ``makeblastdb``.

    Sequences that are empty once leading/trailing ``N``, ``-`` and ``X``
    characters are stripped are left out of the database.

    Returns:
        The fasta / database name, ``'db4picking.fa'``.
    """
    dn = 'db4picking.fa'
    records = []
    for name, seq in seqDict.items():
        if seq.strip('N-X') == '':
            # nothing but placeholder characters
            continue
        records.append('>%s\n%s\n' % (name, seq))
    cmn.write_file(''.join(records), dn)
    cmn.run('module add blast; makeblastdb -dbtype=nucl -in=%s' % dn)
    return dn
def transfer_alea_files(fnlist):
    """Copy every path in ``fnlist`` into ``transferDir`` with ``rsync -r``.

    Returns:
        List of ``<transferDir>/<last path component>`` strings, in the same
        order as ``fnlist``.
    """
    global transferDir
    local_copies = []
    for remote_path in fnlist:
        print('transfering %s from archive server ...' % remote_path)
        # NOTE(review): the rsync source below looks like an obfuscated
        # placeholder rather than a real user@host -- confirm before use.
        cmn.run('rsync -r [email protected]:%s %s' % (remote_path, transferDir))
        local_copies.append('%s/%s' % (transferDir, cmn.lastName(remote_path)))
    return local_copies
def separate_by_pair(fastqs, wdir):
    """Split fastq files into paired libraries and a merged single-end file.

    File names (last path component, final extension removed) are shortened
    one trailing character at a time; whenever exactly two remaining names
    shorten to the same string they are recorded as a pair under that string
    and removed from further consideration. Whatever is left is treated as
    single-end and concatenated into ``<wdir>/single.fq``.

    Args:
        fastqs: iterable of fastq paths.
        wdir: working directory; also the prefix used for ``cat``-ing the
            single-end files.

    Returns:
        ``(pdict, singleFn)``: ``pdict`` maps the shared name prefix to the
        list of the two paired files, ``singleFn`` is ``<wdir>/single.fq``.

    Exits the interpreter (``sys.exit()``) when no pair can be found, which
    now also covers an empty ``fastqs``.
    """
    print(wdir)
    mapdict = {}
    for fastq in fastqs:
        key = '.'.join(cmn.lastName(fastq).split('.')[:-1])
        mapdict[key] = fastq

    names = list(mapdict)
    pdict = {}
    # default=0 keeps an empty input on the regular "not recognized" error
    # path below instead of raising ValueError from max([]).
    length = max((len(name) for name in names), default=0)
    for i in range(length):
        if not names:
            break
        cut = -1 - i
        count_dict = Counter(name[:cut] for name in names)
        for key, count in count_dict.items():
            if count != 2:
                continue
            # Select exactly the two names whose truncation equals the key.
            # A plain startswith(key) test would also grab longer names that
            # merely share the prefix, producing "pairs" of 3+ files and
            # stealing members from other keys found in the same pass.
            paired_names = [each for each in names if each[:cut] == key]
            pdict[key] = [mapdict[name] for name in paired_names]
            # remove the paired names from the candidates
            for name in paired_names:
                names.remove(name)

    if not pdict:
        print('Error! fastq lib name not recognized, contact Wenlin for help!')
        sys.exit()

    singleLibs = [mapdict[name] for name in names]
    print(singleLibs)
    if len(singleLibs) > 1:
        print(
            'Warnning: more than one lib detected as single lib. below is the single list:'
        )
        print('\n'.join(singleLibs))
        print('Email Wenlin for help')

    singleFn = '%s/single.fq' % wdir
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for fn in singleLibs:
        cmn.run('cat %s/%s >> %s' % (wdir, fn, singleFn))
    return pdict, singleFn
def update_baits(bait_dict):
    """Write each bait to ``baits/bait<i>.fa`` and build a bwa index for it.

    ``i`` is the position of the bait in ``bait_dict``'s iteration order.
    The index prefix passed to ``bwa index -p`` is ``bait<i>``.

    Returns:
        Dict mapping each bait name to the fasta file written for it.
    """
    bait_files = {}
    for index, (name, seq) in enumerate(bait_dict.items()):
        fnlabel = 'bait%s' % index
        dn = 'baits/%s.fa' % fnlabel
        cmn.write_file('>%s\n%s\n' % (name, ''.join(seq)), dn)
        cmn.run('module add bwa; bwa index %s -p %s' % (dn, fnlabel))
        bait_files[name] = dn
    return bait_files
def read_refN(ref_genomes):
    """Return ``{ref: N}`` with the line count of each reference's header file.

    The count is read from ``<ref_dir>/<ref>_scaf_header.lines``. When that
    file is missing it is first produced by running ``wc -l`` on
    ``<ref_dir>/<ref>_scaf.header``. ``ref_dir`` is a module-level name.
    """
    counts = {}
    for ref in ref_genomes:
        count_file = '%s/%s_scaf_header.lines' % (ref_dir, ref)
        if not cmn.filexist(count_file):
            header_file = '%s/%s_scaf.header' % (ref_dir, ref)
            cmn.run('wc -l %s > %s' % (header_file, count_file))
        # first field of the `wc -l` output is the number of lines
        counts[ref] = int(cmn.txt_read(count_file).split()[0])
    return counts
def backup_finalStat(wdir):
    """Back up ``*.report`` files from ``wdir`` into the final_stats archive.

    Reports whose name, minus ``_stat.report``, has no ``_`` are skipped
    (no species in the name). A report already present in the archive is
    only replaced when the new one has fewer NA (first value returned by
    ``count_final_stat``) or, for an equal NA count, more data (second
    value). Finally all archived reports are concatenated into
    ``allstat.txt``.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/final_stats/'
    for fn in cmn.cmd2lines('ls %s/*.report| grep -v all_genome' % wdir):
        print('processing %s...' % fn)
        fnlabel = cmn.lastName(fn)

        # don't back up the ones without species
        if len(fnlabel.replace('_stat.report', '').split('_')) == 1:
            print('skip the fasta without sp for %s' % fn)
            continue

        dn = '%s/%s' % (ddir, fnlabel)
        if not os.path.exists(dn):
            should_copy = True
        else:
            # keep the one with the least NA, then the larger data amount
            print('merging new and old data for %s' % fnlabel)
            Nold_na, Nold_data = count_final_stat(dn)
            Nnew_na, Nnew_data = count_final_stat(fn)
            should_copy = Nnew_na < Nold_na or (Nnew_na == Nold_na
                                                and Nnew_data > Nold_data)
        if should_copy:
            cmn.run('cp %s %s' % (fn, dn))

    cmn.run('cd %s; cat *.report > allstat.txt' % ddir)
def separate_by_pair_old(label, fns):
    """Symlink the two paired files of ``fns`` and merge the rest.

    Files whose path contains ``_pair`` or ``_R`` are the paired ones;
    exactly two are required, otherwise the interpreter exits. Each paired
    file is symlinked into the current directory (an existing entry of the
    same name is unlinked first). All other files are appended to
    ``<label>_single.fq``.

    Returns:
        ``(newPaired, singleFn)``: the local names of the two paired files
        (sorted by original path) and the merged single-end file name.
    """
    paired = sorted(i for i in fns if ('_pair' in i) or ('_R' in i))
    if len(paired) != 2:
        print('error: wrong number of pairs as %s' % str(paired))
        print('from: %s' % str(fns))
        print('need to change the label criterion')
        sys.exit()

    unpaired = set(fns) - set(paired)

    newPaired = []
    for fn in paired:
        local_name = cmn.lastName(fn)
        if os.path.exists(local_name):
            cmn.run('unlink %s;' % cmn.lastName(fn))
        cmn.run('ln -s %s' % fn)
        newPaired.append(cmn.lastName(fn))

    singleFn = '%s_single.fq' % label
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for fn in unpaired:
        cmn.run('cat %s >> %s' % (fn, singleFn))
    return newPaired, singleFn
def parse_inserted_gap(ID, seq, label):
    """Try to repair ``seq`` with blastn when the sample has a bait insertion.

    Nothing happens unless ``sampleRun_<ID>/bait_insertion`` exists. In that
    case ``seq`` (``-`` replaced by ``N``, flanking ``N`` stripped) is
    blasted against the verification barcode database. The first hit whose
    ``(sstart, send, qstart)`` equals ``(1, 658, 21)`` or ``(2, 655, 22)``
    and whose query characters at non-gap subject positions number 658 or
    654 respectively is spliced into ``seq[qstart - 1:qend]``.

    When no hit qualifies, ``ID`` and ``label`` are appended to
    ``cannot_fixed_indel.txt``.

    Returns:
        The repaired sequence, or ``seq`` unchanged.
    """
    if not cmn.filexist('sampleRun_%s/bait_insertion' % ID):
        return seq

    # check what is the right range of sequence
    print('runing blast to fix %s' % ID)
    fquery = 'tmpInput.fa'
    cmn.write_file('>input\n%s\n' % seq.replace('-', 'N').strip('N'), fquery)

    dn = 'tmpBr_%s.txt' % label
    cmd = ''.join([
        'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery,
        '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % dn,
    ])
    cmn.run(cmd)

    # (sstart, send, qstart, required number of kept query characters)
    accepted_hits = ((1, 658, 21, 658), (2, 655, 22, 654))

    isFixed = False
    for line in cmn.file2lines(dn):
        items = line.strip().split()
        qstart, qend, sstart, send = map(int, items[2:6])
        for want_sstart, want_send, want_qstart, want_len in accepted_hits:
            if (sstart, send, qstart) != (want_sstart, want_send, want_qstart):
                continue
            qseq, sseq = items[-2:]
            # drop query characters aligned to a gap in the subject
            new = [char1 for char1, char2 in zip(qseq, sseq) if char2 != '-']
            if len(new) == want_len:
                seq = seq[:qstart - 1] + ''.join(new) + seq[qend:]
                print('solution found for %s' % ID)
                isFixed = True
                break
        if isFixed:
            break

    if not isFixed:
        cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq
def separate_by_pair_vold(fastqs, wdir):
    """Group fastq files into pairs by the shortest common-name truncation.

    File names (last path component, final extension removed) are shortened
    one trailing character at a time. At the first truncation length where
    the most frequent shortened name occurs exactly twice, all files are
    grouped by their shortened name. Groups of exactly two files are the
    paired libraries; files of every other group are concatenated into
    ``<wdir>/single.fq``.

    Returns:
        ``(pdict, singleFn)``: ``pdict`` maps the shortened name to its two
        files, ``singleFn`` is ``<wdir>/single.fq``.

    Exits the interpreter (``sys.exit()``) when no grouping is found, which
    now also covers an empty ``fastqs``.
    """
    mapdict = {}
    for fastq in fastqs:
        key = '.'.join(cmn.lastName(fastq).split('.')[:-1])
        mapdict[key] = fastq

    names = list(mapdict)
    pdict = {}
    # default=0 sends an empty input to the regular error path below
    # instead of raising ValueError from min([]).
    length = min((len(name) for name in names), default=0)
    for i in range(length):
        cut = -1 - i
        count_dict = Counter(name[:cut] for name in names)
        if max(count_dict.values()) == 2:
            # got paired: group every file under its shortened name.
            # setdefault replaces a bare `except:` that would have hidden
            # any error, not just the missing key.
            for name in names:
                pdict.setdefault(name[:cut], []).append(mapdict[name])
            break

    if not pdict:
        print('Error! fastq lib name not recognized, contact Wenlin for help!')
        sys.exit()

    # every group that is not exactly a pair is treated as single-end
    singleLibs = []
    for key in list(pdict):
        libs = pdict[key]
        if len(libs) != 2:
            singleLibs += libs
            del pdict[key]

    singleFn = '%s/single.fq' % wdir
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for fn in singleLibs:
        cmn.run('cat %s >> %s' % (fn, singleFn))
    return pdict, singleFn
def parse_fqlist(fqlist):
    """Read the fastq list file ``fqlist`` and return its R1/R2 entries.

    A line is kept when the last component of its path contains ``R1`` or
    ``R2``. Exactly two such lines are required; otherwise an empty
    ``fastq_error`` marker file is created and the interpreter exits.

    Returns:
        A one-element list holding the list of the two kept lines.
    """
    reads = []
    for line in cmn.file2lines(fqlist):
        label = cmn.lastName(line)
        if 'R1' in label or 'R2' in label:
            reads.append(line)

    if len(reads) == 2:
        return [reads]

    print('Error! can not recoginze fastq names in %s' % fqlist)
    cmn.run('touch fastq_error')
    sys.exit()
def parse_ref(seqDict):
    """Write every reference to ``baits/bait<i>.fa`` and bwa-index it.

    ``*`` is removed from each name and ``"`` is replaced by ``'`` before
    the name is written as fasta header and used as key of the result.

    Returns:
        Dict mapping the cleaned name to the fasta file written for it.
    """
    cmn.mkdir('baits')
    bait_files = {}
    for index, (raw_name, seq) in enumerate(seqDict.items()):
        fnlabel = 'bait%s' % index
        dn = 'baits/%s.fa' % fnlabel
        name = raw_name.replace('*', '').replace('"', "'")
        cmn.write_file('>%s\n%s\n' % (name, seq), dn)
        cmn.run('module add bwa; bwa index %s -p %s' % (dn, fnlabel))
        bait_files[name] = dn
    return bait_files
def get_mash_file(name, seq):
    """Return the mash sketch file for ``name``, creating it on first use.

    Sketch paths are remembered in the module-level ``mash_file_dict``. For
    an unknown ``name`` the sequence (``-`` and ``N`` removed) is written to
    ``/tmp/<name>``, sketched with ``mash sketch`` and the resulting
    ``/tmp/<name>.msh`` path is stored and returned.
    """
    global mash_file_dict, cpu
    try:
        return mash_file_dict[name]
    except KeyError:
        pass

    fasta_path = '/tmp/%s' % name
    cleaned = ''.join(seq).replace('-', '').replace('N', '')
    cmn.write_file('>%s\n%s\n' % (name, cleaned), fasta_path)
    cmn.run('/home2/wli/local/mash-Linux64-v1.1.1/mash sketch -n -p %s %s' %
            (cpu, fasta_path))
    sketch_path = fasta_path + '.msh'
    mash_file_dict[name] = sketch_path
    return sketch_path
def do_barcode_blast(sequence, seqDict):
    """Blast the fasta record ``sequence`` against a database built from ``seqDict``.

    The database is created by ``makeBlastDatabase``. The query is written
    to ``/tmp/<label>.fa``, where the label is derived from the first token
    of the record (leading character dropped, cut at ``|`` and ``[``,
    ``*`` and quotes removed, ``/`` turned into ``_``), and is deleted again
    after the search.

    Returns:
        The blastn output lines (outfmt 6: sseqid qlen slen length pident).
    """
    fdb = makeBlastDatabase(seqDict)

    # derive a file-system friendly label from the fasta header
    header = sequence.split()[0][1:]
    namelabel = header.split()[0].split('|')[0]
    namelabel = namelabel.replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')
    namelabel = namelabel.replace('/', '_')

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)
    cmd = ('module add blast; blastn -query %s -db %s ' % (fquery, fdb)
           + '-outfmt \'6 sseqid qlen slen length pident\'')
    hits = cmn.cmd2lines(cmd)
    cmn.run('rm %s' % fquery)
    return hits
def merge_sams(dn, fns):
    """Merge the files in ``fns`` into ``dn`` via ``filter_and_write_lines``.

    An existing ``dn`` is removed first. The first input is written with
    ``header=True``; the remaining inputs are written with the helper's
    default.

    Args:
        dn: output file name.
        fns: non-empty list of input file names.

    Returns:
        ``dn``.
    """
    print('merging files: %s into %s' % (str(fns), dn))
    if cmn.filexist(dn):
        cmn.run('rm ' + dn)
    # `with` guarantees the output handle is flushed and closed even when
    # one of the inputs cannot be processed.
    with open(dn, "a") as fp_dn:
        filter_and_write_lines(fp_dn, fns[0], header=True)
        for fn in fns[1:]:
            filter_and_write_lines(fp_dn, fn)
    return dn
def do_barcode_blast(sequence):
    """Blast the fasta record ``sequence`` against the verification barcodes.

    The query is written to ``/tmp/<label>.fa`` where the label is derived
    from the first token of the record (leading character dropped, cut at
    ``|``, ``[``, ``(``, ``/`` and ``\\``, with ``*`` and quotes removed).
    Results go to ``<query>.br``; the query file is deleted afterwards, the
    result file is left in place.

    Returns:
        Lines of the result file (outfmt 6: sseqid slen length pident
        qstart qend qseq sseq).
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa'

    # derive a file-system friendly label from the fasta header
    header = sequence.split()[0][1:]
    namelabel = header.split()[0].split('|')[0]
    namelabel = namelabel.replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')
    for separator in ('(', '/', '\\'):
        namelabel = namelabel.split(separator)[0]

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)
    fbr = fquery + '.br'
    cmd = ''.join([
        'module add blast; blastn -max_target_seqs 1000 -query %s -db %s -ungapped ' % (fquery, fdb),
        '-outfmt \'6 sseqid slen length pident qstart qend qseq sseq\'',
        ' -out %s ' % fbr,
    ])
    cmn.run(cmd)
    hits = cmn.file2lines(fbr)
    cmn.run('rm %s' % fquery)
    return hits
def extract_same_genus(genus, fall):
    """Copy the records of one genus from ``fall`` and bwa-index the result.

    A record belongs to ``genus`` when the part of its header before the
    first ``_`` (after dropping the leading character) equals ``genus``.
    Matching records are written to ``genus_for_autoPicking.fa``.

    Args:
        genus: genus label to select.
        fall: path of the fasta file to filter.

    Returns:
        ``(dn, namelist)``: the output fasta name and the headers (without
        the leading character, stripped) of the records that were kept.
    """
    dn = 'genus_for_autoPicking.fa'
    namelist = []
    # Start in the "skip" state so that anything ahead of the first header
    # (blank line, stray sequence) is dropped instead of raising
    # UnboundLocalError.
    isGood = False
    with open(fall) as fp, open(dn, 'w') as dp:
        for line in fp:
            if '>' in line:
                name = line[1:].strip()
                isGood = name.split('_')[0] == genus
                if isGood:
                    namelist.append(name)
            if isGood:
                dp.write(line)
    cmn.run('module add bwa; bwa index %s' % dn)
    return dn, namelist
def merge_sams(label, fns):
    """Merge the SAM files ``fns`` into ``<label>.sam``.

    The first file is copied as-is. From every further file only lines that
    do not start with ``@`` or ``[`` and whose third whitespace-separated
    field is not ``*`` are appended. Lines with fewer than three fields
    (e.g. blank lines) are skipped instead of raising IndexError.

    Args:
        label: output name without the ``.sam`` suffix.
        fns: non-empty list of input file names.

    Returns:
        The output file name.
    """
    dn = '%s.sam' % label
    print('merging files: %s into %s' % (str(fns), dn))
    if cmn.filexist(dn):
        cmn.run('rm ' + dn)
    cmn.run('cp %s %s' % (fns[0], dn))
    # context managers close both handles even if an input is unreadable
    with open(dn, "a") as fp_dn:
        for fn in fns[1:]:
            with open(fn) as fp:
                for line in fp:
                    if line.startswith(('@', '[')):
                        continue
                    fields = line.split()
                    if len(fields) < 3 or fields[2] == "*":
                        continue
                    fp_dn.write(line)
    return dn
def separate_by_pair(label, fns):
    """Symlink the two paired files of ``fns`` and merge the rest.

    Files whose path contains ``_paired`` or ``_R`` are the paired ones.
    When there are not exactly two of them the library is skipped. Each
    paired file is symlinked into the current directory; all other files
    are appended to ``<label>_single.fq``.

    Returns:
        ``(newPaired, singleFn)``: local names of the two paired files
        (sorted by original path) and the merged single-end file name, or
        ``(None, None)`` when the library is skipped.
    """
    paired = [i for i in fns if ('_paired' in i) or ('_R' in i)]
    paired.sort()
    if len(paired) != 2:
        print('error: wrong number of pairs as %s' % str(paired))
        print('from: %s' % str(fns))
        print('skip this lib')
        return None, None

    unpaired = set(fns) - set(paired)

    newPaired = []
    for fn in paired:
        cmn.run('ln -s %s' % fn)
        newPaired.append(cmn.lastName(fn))

    singleFn = '%s_single.fq' % label
    # Call filexist on the target; the bare function object is always
    # truthy, which made `rm` run (and fail) for a missing file.
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for fn in unpaired:
        cmn.run('cat %s >> %s' % (fn, singleFn))
    return newPaired, singleFn
def backup_vcf_coverage(wdir):
    """Back up ``*_vcf.cov`` files from ``wdir`` into the coverage archive.

    Only files whose last line has six whitespace-separated fields are
    considered (others are reported as old format). A file already present
    in the archive is replaced only when the second-to-last field of the
    new file's last line, read as float, is larger than the archived one.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/check_vcf_coverage'
    for fn in cmn.cmd2lines('ls %s/*_vcf.cov' % wdir):
        print('processing %s...' % fn)
        last_line = cmn.file2lines(fn)[-1]

        # 1. only back up the new version of cov file
        if len(last_line.strip().split()) != 6:
            print('skip old format file %s' % fn)
            continue

        fnlabel = cmn.lastName(fn)
        dn = '%s/%s' % (ddir, fnlabel)
        if os.path.exists(dn):
            print('merging new and old data for %s' % fnlabel)
            covOld = float(cmn.file2lines(dn)[-1].split()[-2])
            cov = float(last_line.split()[-2])
            should_copy = cov > covOld
        else:
            should_copy = True
        if should_copy:
            cmn.run('cp %s %s' % (fn, dn))
def grep_reads(read, f_libs, direction):
    """Run ``grep_reads.py`` on every library in parallel and collect the hits.

    One background command per file in ``f_libs`` is written to
    ``grep_read.job`` followed by ``wait``, and the job file is run with
    bash. Everything under ``grep_out/`` is then concatenated into
    ``all_grep_reads.txt``.

    Returns:
        The result of ``cmn.getid`` on ``all_grep_reads.txt``.
    """
    script = '/project/biophysics/Nick_lab/wli/sequencing/scripts/grep_reads.py %s %s %s &'
    job_lines = [script % (read, fn, direction) for fn in f_libs]
    job_lines.append('\nwait;\n')

    f_job = 'grep_read.job'
    cmn.write_lines(job_lines, f_job)
    cmn.run('bash %s ' % f_job)

    # the output dir is grep_out
    dn = 'all_grep_reads.txt'
    cmn.run('cat grep_out/* > %s' % dn)
    return cmn.getid(dn)
def backup_fasta(wdir):
    """Back up ``*_m2s.fa`` files from ``wdir`` into the map2fasta archive.

    Files whose name, minus the ``_snp_step2[_MITO]_m2s.fa`` suffix, has no
    ``_`` are skipped (no species in the name). A fasta already present in
    the archive is replaced only when ``count_fasta_nonGap`` is larger for
    the new file.
    """
    ddir = '/project/biophysics/Nick_lab/mtang/archive/step4_postprocessing/map2fasta'
    for fn in cmn.cmd2lines('ls %s/*_m2s.fa| grep -v all_genome' % wdir):
        print('processing %s...' % fn)
        fnlabel = cmn.lastName(fn)

        # don't back up the ones without species
        stem = fnlabel.replace('_snp_step2_MITO_m2s.fa', '')
        stem = stem.replace('_snp_step2_m2s.fa', '')
        if len(stem.split('_')) == 1:
            print('skip the fasta without sp for %s' % fn)
            continue

        dn = '%s/%s' % (ddir, fnlabel)
        if os.path.exists(dn):
            # keep the least gapped one
            print('merging new and old data for %s' % fnlabel)
            Nold = count_fasta_nonGap(dn)
            Nnew = count_fasta_nonGap(fn)
            should_copy = Nnew > Nold
        else:
            should_copy = True
        if should_copy:
            cmn.run('cp %s %s' % (fn, dn))
def attempt_to_find_genus_by_abundence(ID, fqlist):
    """Run ``auto_rebait.py`` in a scratch directory and return the picked genus.

    The fastq list is written to ``tmp_<ID>/fqlist`` and ``auto_rebait.py``
    is run inside that directory. The genus is the text of
    ``picked_bait.txt`` up to the first ``_`` (first whitespace-separated
    token of that). Afterwards ``mapping_stat.info`` is copied to
    ``tmpStat/<ID>_mapping_stat.info`` and the scratch directory is removed.

    Args:
        ID: sample identifier used in the directory and stat file names.
        fqlist: lines to write into the ``fqlist`` file.

    Returns:
        The genus string, or None when ``picked_bait.txt`` is missing or
        holds no usable token.
    """
    tmpdir = 'tmp_%s' % ID
    cmn.mkdir(tmpdir)
    os.chdir(tmpdir)
    genus = None
    try:
        cmn.write_lines(fqlist, 'fqlist')
        cmd = '/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist'
        cmn.run(cmd)
        dn = 'picked_bait.txt'
        if cmn.filexist(dn):
            tokens = cmn.txt_read(dn).strip().split('_')[0].split()
            # an empty result file means nothing was picked
            if tokens:
                genus = tokens[0]
    finally:
        # always step back out, otherwise a failure here would leave the
        # whole pipeline running inside the scratch directory
        os.chdir('..')
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' %
            (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus
print("Usage: *.py fa Ncores", file=sys.stderr) sys.exit() #if the nodes are less than 4 taxa, produce a random tree cmd = "grep '>' %s" % (fn) lines = [ each[1:].strip() for each in cmn.cmd2lines(cmd) if each.strip() != '' ] N = len(lines) if N < 4: print('Warning: fastme can not make tree of less than 4 taxa') print('Warning: so I make a fake tree...') dn = '%s.phylip.fastme.tre' % cmn.lastName(fn) if N == 1: info = '(%s);\n' % lines[0] if N == 2: a, b = lines info = '(%s,%s);\n' % (a, b) elif N == 3: a, b, c = lines info = '((%s,%s),%s);\n' % (a, b, c) cmn.write_file(info, dn) sys.exit() label = cmn.lastName(fn) cmd = 'rm RAxML_*.%s;' % label cmd += '/home2/wli/local/RAxML/raxmlHPC-PTHREADS-SSE3 -m GTRGAMMA -p 7112 -T %s -s %s -n %s' % ( Ncores, label, label) cmn.run(cmd)
info = info.replace('[WL_preprocessing]', '\n'.join(step1cmds)) #make snp call cmds #f_sam = merge_sams(sp, fsams) info = info.replace('5328', sp) info = info.replace('[WL_cwd]', os.getcwd()) info2 = template2.replace('assembly_selfref', asslabel) info2 = info2.replace('5328', sp) info2 = info2.replace('[WL_cwd]', os.getcwd()) os.chdir('..') fjob = 'job_files/s1_%s.job' % sp cmn.write_file(info, fjob) cmn.run('cd job_files; sbatch s1_%s.job' % sp) if sp not in step1_finished: step1_jobs.append(fjob) fjob = 'job_files/s2_%s.job' % sp cmn.write_file(info2, fjob) step2_jobs.append(fjob) #cmn.run('cd job_files; sbatch sg%s.job' % sp) info = ['bash %s\n' % each for each in step1_jobs] cmn.write_file(''.join(info), 'step1todo.cmds') info = ['sbatch %s\n' % each for each in step2_jobs] cmn.write_file(''.join(info), 'step2todo.cmds')
# Build the ".tmix" table from `lines`: the first two columns of every row
# are dropped, the header row is kept as-is, and every remaining cell is
# encoded as '0,0' ('-'), '1,0' (same as the first cell of its row) or
# '0,1' (different from it). The table is written to fn + '.tmix' and
# gzipped.
header = lines[0].split()[2:]
new = [' '.join(header)]
for line in lines[1:]:
    # reference character of this row = first cell encountered
    firstChar = ''
    items = line.split()[2:]
    newline = []
    for item in items:
        chars = item.split()
        if firstChar == '':
            firstChar = chars[0]
        if len(chars) != 1:
            # have both char
            # NOTE(review): `item` comes from a whitespace split, so
            # item.split() always yields exactly one element and this
            # branch looks unreachable -- confirm the intended separator.
            newline.append('1,1')
            continue
        if chars[0] == '-':
            newline.append('0,0')
        elif chars[0] == firstChar:
            newline.append('1,0')
        else:
            newline.append('0,1')
    new.append(' '.join(newline))

dn = fn + '.tmix'
cmn.write_lines(new, dn)
cmn.run('gzip %s' % dn)
for sample in groupDict: fqlist = groupDict[sample] #fqlist = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/%s*.fastq' % sample) #fqlist = cmn.cmd2lines('ls /work/biophysics/wli/workspace/filtered_6313*q') wdir = 'mitoD_%s' % sample cmn.mkdir(wdir) os.chdir(wdir) cwd = os.getcwd() info = template.replace('[cwd]', cwd) info = info.replace('[fq_files]', ' '.join(fqlist)) info = info.replace('[sample]', sample) #prepare quake infiles fqlist_local = [] for fq in fqlist: cmn.run('ln -s ' + fq) fqlist_local.append(cmn.lastName(fq)) cmn.write_lines(fqlist_local, 'fqlist') cmn.run('ln -s fqlist infiles') #make fq2fa comand quake_fqlist = [each.replace('.fastq', '.cor.fastq') for each in fqlist_local] fq2fa_cmds = ['rm %s.fa 2> /dev/null' % sample] for fq in quake_fqlist: cmd = 'fq2fa %s >> %s.fa;' % (fq, sample) fq2fa_cmds.append(cmd) cmn.write_lines(quake_fqlist, 'fqlist.cor') cmd = '\n'.join(fq2fa_cmds) info = info.replace('[fq2fa_commands]', cmd)