def separate_by_pair_old(label, fns):
    """Legacy splitter: symlink the two paired fastq files and pool the rest.

    Files whose path contains '_pair' or '_R' are treated as the paired
    mates; exactly two are required, otherwise the problem is printed and
    the process exits. Each mate is symlinked into the current directory
    under its ``cmn.lastName`` name, and every remaining file is appended
    to '<label>_single.fq'.

    Returns:
        (list of the two link names in sorted-path order, single-read file name)
    """
    mates = sorted(path for path in fns if '_pair' in path or '_R' in path)
    if len(mates) != 2:
        print('error: wrong number of pairs as %s' % str(mates))
        print('from: %s' % str(fns))
        print('need to change the label criterion')
        sys.exit()

    leftovers = set(fns) - set(mates)

    # replace any stale link of the same name before linking the mate
    linked = []
    for path in mates:
        if os.path.exists(cmn.lastName(path)):
            cmn.run('unlink %s;' % cmn.lastName(path))
        cmn.run('ln -s %s' % path)
        linked.append(cmn.lastName(path))

    # rebuild the pooled single-read file from scratch
    singleFn = '%s_single.fq' % label
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for path in leftovers:
        cmn.run('cat %s >> %s' % (path, singleFn))

    return linked, singleFn
def find_genus_info():
    """Return the genus restriction for this run, or None if there is none.

    The second command-line argument wins; otherwise the stripped content
    of 'restricted_genus.info' is used when ``cmn.filexist`` reports it.
    """
    if len(sys.argv) > 2:
        return sys.argv[2]

    info_file = 'restricted_genus.info'
    if not cmn.filexist(info_file):
        return None
    return cmn.txt_read(info_file).strip()
def find_reference(fn):
    """Return the index label (path without '.fa') of the reference for *fn*.

    Prefers 'assembly_selfref_v2' next to *fn*. When that fasta is not
    reported by ``cmn.filexist``, falls back to the shared indexed reference
    named after the parent directory of *fn* with its first '_'-separated
    token dropped.
    """
    pdir = fn.rpartition('/')[0]
    if cmn.filexist('%s/assembly_selfref_v2.fa' % pdir):
        return '%s/assembly_selfref_v2' % pdir

    print('WARNING: can not find assembly_selfref_v2.fa, use the orginal one')
    parent_name = fn.split('/')[-2]
    reflabel = '_'.join(parent_name.split('_')[1:])
    return '/work/biophysics/mtang/SNP_calling/indexed_references/%s' % reflabel
def separate_by_pair(fastqs, wdir):
    """Group fastq libraries into mate pairs and pool the rest.

    Library names (``cmn.lastName`` of the path with the last extension
    removed) are trimmed from the right one character at a time; whenever
    exactly two remaining names share the same trimmed prefix they are
    recorded as a pair under that prefix and removed from further
    consideration.

    Args:
        fastqs: fastq paths (names relative to *wdir* for the pooling step).
        wdir: working directory; the pooled single reads go to
            '<wdir>/single.fq'.

    Returns:
        (dict prefix -> [fastq, fastq], path of the pooled single-read file)

    Exits the process when no pair can be recognised (including empty input).
    """
    print(wdir)
    pdict = {}
    mapdict = {}
    for fastq in fastqs:
        key = '.'.join(cmn.lastName(fastq).split('.')[:-1])
        mapdict[key] = fastq

    names = list(mapdict)
    # default=0 lets empty input fall through to the error message below
    # instead of dying with ValueError inside max()
    longest = max((len(name) for name in names), default=0)
    for i in range(longest):
        if not names:
            break
        prefix_of = {name: name[:-1 - i] for name in names}
        count_dict = Counter(prefix_of.values())
        for key, hits in count_dict.items():
            if hits != 2:
                continue
            # Select exactly the two names that were counted. The previous
            # startswith(key) test also swallowed longer names that merely
            # begin with the same prefix, producing "pairs" of 3+ files.
            paired_names = [name for name in names if prefix_of[name] == key]
            pdict[key] = [mapdict[name] for name in paired_names]
            for name in paired_names:
                names.remove(name)

    if len(pdict) == 0:
        print('Error! fastq lib name not recognized, contact Wenlin for help!')
        sys.exit()

    singleLibs = [mapdict[name] for name in names]
    print(singleLibs)
    if len(singleLibs) > 1:
        print(
            'Warnning: more than one lib detected as single lib. below is the single list:'
        )
        print('\n'.join(singleLibs))
        print('Email Wenlin for help')

    # rebuild the pooled single-read file from scratch
    singleFn = '%s/single.fq' % wdir
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for fn in singleLibs:
        cmn.run('cat %s/%s >> %s' % (wdir, fn, singleFn))
    return pdict, singleFn
def read_refN(ref_genomes):
    """Map each reference name to the line count of its scaffold header file.

    The count is cached in '<ref_dir>/<ref>_scaf_header.lines'; when
    ``cmn.filexist`` does not report that cache it is produced with
    ``wc -l`` on '<ref_dir>/<ref>_scaf.header'. ``ref_dir`` is a
    module-level name.
    """
    counts = {}
    for ref in ref_genomes:
        cache = '%s/%s_scaf_header.lines' % (ref_dir, ref)
        if not cmn.filexist(cache):
            header = '%s/%s_scaf.header' % (ref_dir, ref)
            cmn.run('wc -l %s > %s' % (header, cache))
        # first whitespace-separated token of the `wc -l` output is the count
        counts[ref] = int(cmn.txt_read(cache).split()[0])
    return counts
def separate_by_pair_vold(fastqs, wdir):
    """Older pairing routine: one global trim length for all libraries.

    Library names are trimmed from the right until the most frequent
    trimmed prefix occurs exactly twice; every library is then grouped by
    that prefix. Groups of size two are pairs, everything else is appended
    to '<wdir>/single.fq'.

    Returns:
        (dict prefix -> [fastq, fastq], path of the pooled single-read file)
    """
    mapdict = {}
    for fastq in fastqs:
        stem = '.'.join(cmn.lastName(fastq).split('.')[:-1])
        mapdict[stem] = fastq
    names = list(mapdict.keys())

    pdict = {}
    shortest = min([len(name) for name in names])
    for i in range(shortest):
        trimmed = [name[:-1-i] for name in names]
        if max(Counter(trimmed).values()) != 2:
            continue
        # first trim length at which a pair shows up: group everything by it
        for name, prefix in zip(names, trimmed):
            pdict.setdefault(prefix, []).append(mapdict[name])
        break

    if len(pdict) == 0:
        print('Error! fastq lib name not recognized, contact Wenlin for help!')
        sys.exit()

    # anything that is not a group of exactly two goes to the single pool
    singleLibs = []
    for prefix in list(pdict.keys()):
        members = pdict[prefix]
        if len(members) != 2:
            singleLibs += members
            del pdict[prefix]

    singleFn = '%s/single.fq' % wdir
    if cmn.filexist(singleFn):
        cmn.run('rm %s' % singleFn)
    for lib in singleLibs:
        cmn.run('cat %s >> %s' % (lib, singleFn))
    return pdict, singleFn
def parse_inserted_gap(ID, seq, label): fn = 'sampleRun_%s/bait_insertion' % ID #if cmn.filexist(fn) or ('N' in seq.replace('-', 'N').strip('N')): if cmn.filexist(fn): #lines = cmn.file2lines(fn) #lines = sorted(lines, key=lambda x: int(x.split(',')[0][1:])) #Ngap = 0 #for line in lines: # items = line.strip().split() # Ngap += len(items[-1]) #check what is the right range of sequence print('runing blast to fix %s' % ID) checkSeq = seq.replace('-', 'N').strip('N') fquery = 'tmpInput.fa' fasta = '>input\n%s\n' % checkSeq cmn.write_file(fasta, fquery) dn = 'tmpBr_%s.txt' % label cmd = 'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery cmd += '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % dn cmn.run(cmd) isFixed = False for line in cmn.file2lines(dn): items = line.strip().split() #print items qstart, qend, sstart, send = list(map(int, items[2:6])) if sstart == 1 and send == 658 and qstart == 21: qseq, sseq = items[-2:] new = [ char1 for char1, char2 in zip(qseq, sseq) if char2 != '-' ] if len(new) == 658: seq = seq[:qstart - 1] + ''.join(new) + seq[qend:] print('solution found for %s' % ID) isFixed = True break if sstart == 2 and send == 655 and qstart == 22: qseq, sseq = items[-2:] new = [ char1 for char1, char2 in zip(qseq, sseq) if char2 != '-' ] if len(new) == 654: seq = seq[:qstart - 1] + ''.join(new) + seq[qend:] print('solution found for %s' % ID) isFixed = True break if not isFixed: cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt') return seq
def parse_bait(fn):
    """Read a three-column bait table (species, name, sequence).

    When a 'bait_insertion' file is reported by ``cmn.filexist``, its indel
    description is applied to every sequence through ``add_indel``.

    Args:
        fn: path of the whitespace-separated bait table.

    Returns:
        (dict name -> sequence, list of names in file order)
    """
    if cmn.filexist('bait_insertion'):
        indel_dict = read_indel_info('bait_insertion')
    else:
        indel_dict = {}

    alist = []
    adict = {}
    for line in cmn.file2lines(fn):
        _sp, name, seq = line.strip().split()
        # The old test `len(indel_dict) != []` compared an int with a list
        # and was always True, so add_indel ran even with nothing to apply.
        if indel_dict:
            seq = add_indel(seq, indel_dict)
        adict[name] = seq
        alist.append(name)
    return adict, alist
def merge_sams(dn, fns):
    """Merge several SAM files into *dn* via ``filter_and_write_lines``.

    The first input is written with ``header=True``; the remaining inputs
    are written with the helper's default. An existing *dn* is removed
    first.

    Args:
        dn: output SAM path.
        fns: input SAM paths; must contain at least one entry.

    Returns:
        *dn*
    """
    #dn = '%s.sam' % label
    print('merging files: %s into %s' % (str(fns), dn))
    if cmn.filexist(dn):
        cmn.run('rm ' + dn)
    # `with` guarantees the output handle is flushed and closed even when a
    # helper call raises part-way through the merge
    with open(dn, "a") as fp_dn:
        filter_and_write_lines(fp_dn, fns[0], header=True)
        for fn in fns[1:]:
            filter_and_write_lines(fp_dn, fn)
    return dn
def attempt_to_find_genus_by_abundence(ID, fqlist):
    """Run auto_rebait.py on *fqlist* in a scratch directory and read its pick.

    Works inside 'tmp_<ID>', copies the run's mapping_stat.info to
    'tmpStat/<ID>_mapping_stat.info', then removes the scratch directory.

    Returns:
        The first token before '_' of 'picked_bait.txt', or None when that
        file is not reported by ``cmn.filexist``.
    """
    scratch = 'tmp_%s' % ID
    cmn.mkdir(scratch)
    os.chdir(scratch)

    cmn.write_lines(fqlist, 'fqlist')
    cmn.run('/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist')

    picked = 'picked_bait.txt'
    genus = None
    if cmn.filexist(picked):
        genus = cmn.txt_read(picked).strip().split('_')[0].split()[0]

    # back to the parent directory before archiving stats and cleaning up
    os.chdir('..')
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (scratch, ID))
    cmn.run('rm -r %s ' % scratch)
    return genus
def read_rep():
    """Load repeat positions per scaffold from 'annotation_repeats/*.gff3'.

    The result is cached in 'rep.dict.pkl' and read back from there when
    ``cmn.filexist`` reports the cache.

    Returns:
        dict scaffold -> set of integer positions ``range(start, end)`` taken
        from columns 4 and 5 of each record.
    """
    dn = 'rep.dict.pkl'
    if cmn.filexist(dn):
        print('loading repeats using precomputed data...')
        return cmn.pickle_read(dn)

    freps = cmn.cmd2lines('ls annotation_repeats/*.gff3')
    repdict = {}
    for frep in freps:
        # Fixed: this used to read `fn`, which is not defined in this
        # function, instead of the gff3 file being iterated.
        for line in cmn.file2lines(frep):
            items = line.strip().split()
            # skip blank lines and '#' directive/comment lines, which have no
            # coordinate columns to parse
            if not items or items[0].startswith('#'):
                continue
            scaf = items[0]
            i, j = map(int, items[3:5])
            # grow the set in place instead of rebuilding a union per record
            repdict.setdefault(scaf, set()).update(range(i, j))
    cmn.pickle_write(repdict, dn)
    return repdict
def merge_sams(label, fns):
    """Merge SAM files into '<label>.sam'.

    The first file is copied verbatim; from every further file only
    alignment lines are appended, i.e. lines that do not start with '@' or
    '[' and whose third column is not '*'.

    Args:
        label: output name without the '.sam' extension.
        fns: input SAM paths; must contain at least one entry.

    Returns:
        The output file name.
    """
    dn = '%s.sam' % label
    print('merging files: %s into %s' % (str(fns), dn))
    if cmn.filexist(dn):
        cmn.run('rm ' + dn)
    cmn.run('cp %s %s' % (fns[0], dn))
    # context managers close both handles even if reading an input fails
    with open(dn, "a") as fp_dn:
        for fn in fns[1:]:
            with open(fn) as fp:
                for line in fp:
                    if line[0] in ('@', '['):
                        continue
                    fields = line.split()
                    # blank or truncated lines have no third column; they
                    # used to raise IndexError here and are now skipped
                    if len(fields) < 3 or fields[2] == '*':
                        continue
                    fp_dn.write(line)
    return dn
def combine_data(fn, label):
    """Read the fastq *fn* and merge it with an older file of the same sample.

    Sets the module-level ``sp`` to the file name of *fn* without its last
    '_'-separated token. When '<old_dir>/<sp>_<label>.fastq' is reported by
    ``cmn.filexist``, the old and new records are merged with
    ``combine_fastq``.

    Args:
        fn: path of the new fastq file.
        label: one of 'R1', 'R2', 'singleton'; anything else exits.

    Returns:
        The (possibly merged) result of ``read_fastq``.
    """
    global ischeck_badID, sp, old_dir
    name_tokens = cmn.lastName(fn).split('_')
    sp = '_'.join(name_tokens[:-1])

    accepted_labels = ['R1', 'R2', 'singleton']
    if label not in accepted_labels:
        print('Error! your indicated label are not accepted')
        print('accepted values are %s' % (','.join(accepted_labels)))
        sys.exit()

    newDict = read_fastq(fn)
    oldFn = '%s/%s_%s.fastq' % (old_dir, sp, label)
    if not cmn.filexist(oldFn):
        return newDict

    print('combine new data with old data for %s' % fn)
    return combine_fastq(read_fastq(oldFn), newDict)
def count_sam_align(fns):
    """Print summed alignment statistics for a group of SAM files.

    Files not reported by ``cmn.filexist`` are skipped. The printed line is
    ``sp ref aligned total half_aligned position_fraction`` where sp and
    ref are the two directory names above the last path in *fns*.

    Args:
        fns: SAM paths that share the same '<sp>/<ref>/' parent directories.

    Returns:
        None. Nothing is printed for an empty list.
    """
    if not fns:
        # nothing to summarise; this used to end in ZeroDivisionError
        return

    totalN = 0
    alignN = 0
    half_alignN = 0  # more than half aligned
    total_pN = 0  # mapped positions
    total_ptN = 0  # total positions
    for fn in fns:
        if not cmn.filexist(fn):
            continue
        # pN and ptN are the counts by positions
        alnN, halfN, tN, pN, ptN = aligned_reads(fn)
        totalN += tN
        alignN += alnN
        half_alignN += halfN
        total_pN += pN
        total_ptN += ptN

    # guard the ratio: every input may have been missing or empty
    pPercent = float(total_pN) / total_ptN if total_ptN else 0.0
    items = fns[-1].split('/')
    sp, ref = items[-3:-1]
    print(sp, ref, alignN, totalN, half_alignN, pPercent)
rdict[sp] = [(fastq, ref)] refs.add(ref) #2. prepare reference jobs refdir = '/work/biophysics/mtang/SNP_calling/indexed_references' cmn.mkdir(refdir) os.chdir(refdir) index_cmds = ['cd %s' % refdir] for ref in refs: if not os.path.exists(cmn.lastName(ref)): #cmn.run('ln -s %s' % ref) cmn.run('cp %s %s/' % (ref, refdir)) ref = cmn.lastName(ref) reflabel = ref.replace('.fa', '') checkFn = reflabel + '.pac' if cmn.filexist(checkFn): print('found finished ref for %s, skip it' % ref) continue cmd = '/home2/wli/local/bwa-0.7.12/bwa index %s -p %s &' % (ref, reflabel) index_cmds.append(cmd) index_cmds.append('\nwait\n') os.chdir(cwd) print('#################################################') if len(index_cmds) != 2: dn = 'index.cmds' cmn.write_lines(index_cmds, dn) fjob = 'index.job' cmd = '/work/biophysics/mtang/SNP_calling/scripts/decorate_job.py %s -p 256GB > %s' % (
# Split `dn` (read as alternating defline / sequence lines) into numbered
# pack files '<dnlabel>_<count>.fa'. A new pack is opened whenever the number
# of lines consumed so far is a multiple of `each_pack`.
# NOTE(review): dn, dnlabel, dnlist and each_pack are defined outside this
# fragment.
count = 1
dnNew = '%s_%s.fa' % (dnlabel, count)
dnlist.append(dnNew)
dp = open(dnNew, 'w')
with open(dn) as fp:
    for i, line in enumerate(fp):
        line = line.strip()
        if i % 2 == 0:
            # even line index: remember the defline for the next line
            defline = line
            continue
        elif i % 2 == 1:
            # odd line index: emit the complete two-line record
            fasta = '%s\n%s\n' % (defline, line)
            dp.write(fasta)
        if (i+1) % each_pack == 0:
            # current pack is full: roll over to the next numbered file
            count += 1
            dp.close()
            dnNew = '%s_%s.fa' % (dnlabel, count)
            dnlist.append(dnNew)
            dp = open(dnNew, 'w')
dp.close()
# NOTE(review): this removes packs for which cmn.filexist is false —
# presumably filexist is false for empty files, so this cleans up an empty
# trailing pack; confirm against cmn.
for each in dnlist:
    if not cmn.filexist(each):
        cmn.run('rm %s' % each)
# the unsplit input is no longer needed
cmn.run('rm %s' % dn)
fns = cmn.cmd2lines('ls %s/*/*/*.sam' % wdir) dirs = set(['/'.join(fn.split('/')[:-2]) for fn in fns]) cov_files = cmn.cmd2lines('ls mapped_reads_count/*_cov.count 2> /dev/null') finished_dirs = set( [cmn.lastName(fn).replace('_cov.count', '') for fn in cov_files]) cwd = os.getcwd() isGood = True cmds = ['cd %s' % wdir] for dir in dirs: sp = cmn.lastName(dir) if sp in finished_dirs and cmn.filexist( 'mapped_reads_count/%s_cov.count' % sp): continue isGood = False cmd = 'python /work/biophysics/mtang/SNP_calling/scripts/tell_best_mapping.py %s &' % dir cmds.append(cmd) cmds.append('\nwait\n') outdir = '%s/mapped_reads_count' % wdir cmn.mkdir(outdir) for dir in dirs: sp = cmn.lastName(dir.rstrip('/')) dn = '%s/%s_cov.count' % (outdir, sp) if sp in finished_dirs and cmn.filexist(dn): continue
# Build the shell command list that creates a Picard sequence dictionary and
# a samtools faidx index for every distinct reference in `refdict` that does
# not yet have '<refdir>/<ref>.dict'.
# NOTE(review): refdir and refdict are defined outside this fragment.
cmds = ['cd %s' % refdir]
cmds.append('module add picard/1.117')
cmds.append('module load java/oracle/jdk1.8.0_65')
todoref_count = 0  # number of references that still need indexing
taken_refs = set([])  # references already handled in this pass
for sublist in list(refdict.values()):
    for samdir, ref in sublist:
        # each reference is processed once, however many samples use it
        if ref in taken_refs:
            continue
        else:
            taken_refs.add(ref)
        fcheck = '%s/%s.dict' % (refdir, ref)
        #print refdir, ref
        if cmn.filexist(fcheck):
            print('skip finished indexed %s' % ref)
            continue
        # this has been finished in bwa mapping
        #/home2/wli/local/bwa-0.7.12/bwa index -p assembly_selfref assembly_selfref.fa > index.log &
        todoref_count += 1
        # both commands end in '&' so they run in the background
        cmds.append(
            'java -jar $PICARD/CreateSequenceDictionary.jar R=%s.fa O=%s.dict &'
            % (ref, ref))
        cmds.append(
            '/home2/wli/local/samtools-1.2/samtools faidx %s.fa &' % ref)
# block until every background job above has finished
cmds.append('\nwait;\n')
isIndexed = False
def get_query_sequence(seqDict, genus, sp):
    """Pick the barcode sequence to use as bait for ``genus sp``.

    Selection order:
      1. a name equal to '<genus>_<sp>' ignoring case is returned directly;
      2. otherwise names whose first '_' token equals the genus (all names
         if none match), narrowed to those containing *sp* as a '_' token;
      3. whenever 'pickingLog.txt' is reported by ``cmn.filexist`` after
         step 2, auto_rebait.py is run and its 'picked_bait.txt' lines
         replace the candidates;
      4. candidates starting with '*' are preferred, then candidates
         containing '*'; among the rest the longest sequence wins.

    'pickingLog.txt' is written as a side effect and records which
    fallbacks were taken.

    Args:
        seqDict: dict barcode name -> sequence.
        genus: genus keyword.
        sp: species keyword.

    Returns:
        (fasta text '>name\\nseq\\n', length of the sequence without 'N')
    """
    #1. anything in Eudamine file has higher priority
    #fEud = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/Eudaminae-barcode-reference.txt'
    #cmd = 'grep %s %s' % (sp, fEud)
    #lines = cmn.cmd2lines(cmd)
    #if len(lines) == 1:
    #    name = lines[0].split()[0]
    #    seq = seqDict[name]
    #    fasta = '>%s\n%s\n' % (name, seq)
    #    qlen = len(seq.replace('N', ''))
    #    print 'pick %s for %s %s' % (name, genus, sp)
    #    return fasta, qlen
    names = list(seqDict.keys())
    #try to look up the exact match first
    expected_name = '%s_%s' % (genus, sp)
    tmp = [name for name in names if name.upper() == expected_name.upper()]
    if len(tmp) != 0:
        name = tmp[0]
        print('found exact match %s' % name)
        seq = seqDict[name]
        fasta = '>%s\n%s\n' % (name, seq)
        qlen = len(seq.replace('N', ''))
        return fasta, qlen
    #look it up in other files
    good_names = [name for name in names
            # if genus.upper() in name.upper().split('_')]
            if genus.upper() == name.upper().split('_')[0]]
    useGenus = False
    if len(good_names) > 0:
        useGenus = True
    # start from a clean log; its existence later triggers the auto pick
    cmn.run('rm pickingLog.txt 2> /dev/null')
    if len(good_names) == 0:#sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        good_names = names
        cmn.write_file('noGenus\n', 'pickingLog.txt')
    if len(good_names) > 1:
        #try to refine it
        tmp = [name for name in good_names
                if sp.upper() in name.upper().split('_')]
        if len(tmp) != 0:
            good_names = tmp
        else:
            cmn.append_file('noSpecies\n', 'pickingLog.txt')
    #############################################
    ####new here, auto pick sequences for those has no info
    #############################################
    if cmn.filexist('pickingLog.txt'):
        print('automatically pick bait by fastq similarity')
        fsp = 'restricted_genus.info'
        # pass the genus to auto_rebait only when it matched a barcode and
        # no 'restricted_genus.info' file is present
        if useGenus and (not cmn.filexist(fsp)):
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist %s' % genus
        else:
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist '
        cmn.run(cmd)
        # NOTE(review): the names read here are later used as seqDict keys;
        # presumably auto_rebait only emits names present in seqDict — confirm.
        good_names = cmn.file2lines('picked_bait.txt')
        cmn.write_file('pickClosed\n', 'pickingLog.txt')
    #############################################
    #############################################
    #############################################
    #try to see if type species is there
    tmp = [name for name in good_names if name[0] == '*']
    if len(tmp) != 0:
        good_names = tmp
    else:
        tmp = [name for name in good_names if '*' in name]
        if len(tmp) != 0:
            good_names = tmp
    #then randomly pick one, get the max length ones
    # NOTE(review): replacing 'N' with '-' does not change the length, so
    # this ranks by total length including N — confirm that is intended.
    name = max(good_names, key=lambda x: len(seqDict[x].replace('N', '-')))
    #name = name.replace('/', '_')
    seq = seqDict[name]
    fasta = '>%s\n%s\n' % (name, seq)
    qlen = len(seq.replace('N', ''))
    print('pick %s for %s %s' % (name, genus, sp))
    return fasta, qlen
#main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py", file=sys.stderr) sys.exit() #check if all the files has contains falist = cmn.file2lines(fn) bad_falist = [ fa for fa in falist if not cmn.filexist(fa) and '/archive/butterfly/' not in fa ] if len(bad_falist) != 0: print('Error!') print('the following files are errorous:') print('\n'.join(bad_falist)) sys.exit() transferDir = 'archiveTransfer' cmn.mkdir(transferDir) alea_list = [fa for fa in falist if '/archive/butterfly' in fa] biohpc_list = set(falist) - set(alea_list)
# For every 'sampleRun_<ID>' directory in `wdirs`, read the assembled-read
# line-up file and store its thread / stack / clean sequences per sample ID.
thread_lines = {}
clean_lines = {}
leftN = 20
barcodeLength = 658
for wdir in wdirs:
    ID = wdir.split('sampleRun_')[-1]
    print('working on %s' % ID)
    try:
        # first match of the rescued assembly file for this sample
        fn = cmn.cmd2lines(
            'ls sampleRun_%s/rescued_read_assembled_mis1*.txt' % ID)[0]
    except:
        print('can not find assembled files for %s' % ID)
        continue
    #print fn
    findel = 'sampleRun_%s/bait_insertion' % ID
    if cmn.filexist(findel):
        print('prasing indel for %s' % ID)
        indel_positions = find_indel_from_reads(findel, fn)
        print('indel_positions', indel_positions)
    else:
        indel_positions = []
    threadSeq, stackSeq, cleanSeq = read_lineup_seq(fn, indel_positions)
    thread_lines[ID] = threadSeq
    # NOTE(review): stack_lines is not initialised in this fragment (only
    # thread_lines and clean_lines are) — confirm it is defined earlier.
    stack_lines[ID] = stackSeq
    clean_lines[ID] = cleanSeq
    #1. if thread and stack show inconsistent, show an X
    #2. if thread has gap, show as lower case
    #3. if both are gap, show an N
def add_in_baits(fref):
    """Write the reference fasta extended with the sample's own baits.

    Baits from 'sampleInfo.baits' whose defline is not a case-insensitive
    substring of any existing reference ID are appended. Sequences of
    length 698 are taken as they are, sequences of length 658 go through
    ``add_primer`` first, any other length exits the process. Indel
    information from 'bait_insertion', when present, is applied to the
    reference text before the comparison. The result is written to
    'species_barcodes_4mapping_withAddon.fa' and indexed with bwa.

    Args:
        fref: path of the reference fasta.

    Returns:
        Name of the written fasta file.
    """
    fbait = 'sampleInfo.baits'
    ref_info = cmn.txt_read(fref)
    indel_dict = (read_indel_info('bait_insertion')
                  if cmn.filexist('bait_insertion') else {})

    #baits added by customBaits are no longer loaded; start from an empty list
    add_lines = []
    if len(indel_dict) != 0:
        ref_info = insert_in_ref_info(ref_info, indel_dict)
        add_lines = insert_in_lines(add_lines, indel_dict)

    # upper-cased IDs already known, for the substring test below
    ref_ids_upper = [
        row[1:].upper() for row in ref_info.split('\n')
        if row.strip() != '' and row[0] == '>'
    ]
    added_ids_upper = [row.split()[1].upper() for row in add_lines]

    for line in cmn.file2lines(fbait):
        sp, defline, seq = line.strip().split()
        wanted = defline.upper()
        if any(wanted in known for known in ref_ids_upper):
            continue
        if any(wanted in known for known in added_ids_upper):
            continue
        if len(seq) == 698:
            add_lines.append(line)
            continue
        if len(seq) != 658:
            print('Error! length of bait barcode is wrong for %s %s' %
                  (sp, defline))
            sys.exit()
        seq = add_primer(seq)
        add_lines.append('%s\t%s\t%s' % (sp, defline, seq))

    #format the added baits as fasta records and append them
    add_fasta = []
    for line in add_lines:
        sp, defline, seq = line.strip().split()
        add_fasta.append('>%s\n%s\n' % (defline, seq))
    ref_info = ref_info + '\n' + ''.join(add_fasta)

    dn = 'species_barcodes_4mapping_withAddon.fa'
    cmn.write_file(ref_info, dn)
    #index it
    cmn.run('module add bwa; bwa index %s' % dn)
    return dn
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': fns = cmn.cmd2lines('ls *.map| grep -v all| grep -v test| grep -v concat') #fns = fns[:1] f_label = '15101E05_snp.vcf' #make the scaffold index #cmd = "grep -v '^#' 15101E05_snp.vcf| cut -f 1,2 > index_header" if not cmn.filexist('index_header'): # cmn.run(cmd) make_index_header('15101E05_snp.vcf', 'assembly_v2_length.txt') header = ['scaffold', 'index'] for fn in fns: label = fn.split('_')[0] header.append(label+'_f') header.append(label+'_m') cmn.write_lines(fns, 'map_name_order') cmn.write_file('\t'.join(header)+'\n', 'table_header') cmd = 'cp table_header all_concat.map;'
# Convert the input fasta to phylip inside a scratch directory and run
# PHYLIP dnadist on it.
# NOTE(review): fn and the default value of ignore_check are set outside
# this fragment.
if len(sys.argv) > 2:
    # a second argument of 'ignore' disables the overwrite protection below
    if sys.argv[2] == 'ignore':
        ignore_check = True
outlabel = cmn.lastName(fn).replace('.fa', '')
wdir = '%s_tmp' % cmn.lastName(fn)
cmn.mkdir(wdir)
# NOTE(review): after this chdir a relative `fn` no longer points at the
# input file, yet it is passed to the converter below — confirm fn is absolute.
os.chdir(wdir)
fphy = cmn.lastName(fn) + '.phylip'
fname = cmn.lastName(fn) + '.phylipNames.dict.pkl'
# refuse to run when these outputs already exist, unless told to ignore
fchecks = ['outfile', 'dist.Tree']
isbad = False
for fcheck in fchecks:
    if not ignore_check and cmn.filexist(fcheck):
        print('Erorr: file %s exists! running pipeline would overwrite the files, please either delete it or move it to another place' % fcheck)
        isbad = True
if isbad:
    sys.exit()
cmd = 'source /home2/wli/.bash_profile;/project/biophysics/Nick_lab/wli/sequencing/scripts/fasta2phylip4dnadist.py %s' % fn
cmn.run(cmd)
# dnadist reads its answers from stdin: the input file name, then 'Y'
dnadistInfo = '%s\nY\n' % fphy
cmn.write_file(dnadistInfo, 'input.dnadist')
cmd = 'rm outfile 2> /dev/null;/home2/wli/local/phylip-3.696/exe/dnadist < input.dnadist > dnadist.log'
#print cmd
cmn.run(cmd)
try: fn = sys.argv[1] except: print("Usage: *.py good_reads_assembled [allowed_mismatch=0]", file=sys.stderr) sys.exit() try: misN = int(sys.argv[2]) except: misN = 1 hasDelLabel = False #NOTE: new feature: rejecting reads that matched mostly to the extended ends #NOTE: change it such that bad reads are not rescued due to gaps if cmn.filexist('hasDeletion'): indel_list = set(cmn.file2lines('hasDeletion')) else: indel_list = set([]) #add primer if not added bait_dict, ordered_baits, stack_seq, thread_seq, good_reads, junk_reads, sampleLabel = read_assembled_file( fn) print('junk1', len(junk_reads)) #use stack to rescue all_rescued = {} while (True): #this means more reads are rescued #continue doing rescue
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Script entry: print every line of '<wdir>/mapping_stat.txt' prefixed with
# the sample name. When that file is missing, the per-directory counting
# script is run instead and the process exits.
if __name__=='__main__':
    #options=parse_options()
    if len(sys.argv) < 2:
        print("Usage: *.py 3935", file=sys.stderr)
        sys.exit()
    wdir = sys.argv[1]

    fn = '%s/mapping_stat.txt' % wdir
    if not cmn.filexist(fn):
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/count_sam_aligns_byDir.py %s' % wdir
        cmn.run(cmd)
        sys.exit()

    sp = cmn.lastName(wdir)
    for line in cmn.file2lines(fn):
        print('\t'.join([sp, line]))
# Decide which reference sub-directory mapped best. With a single candidate
# it is written to best_mapping.txt straight away; otherwise the alignment
# counts of all SAM files are summed per sub-directory.
# NOTE(review): subdirs and wdir are defined outside this fragment.
if len(subdirs) == 1:
    print('only one directory, no need to tell who is best')
    cmn.write_file(subdirs[0], '%s/best_mapping.txt' % wdir)
    sys.exit()
else:
    sizeDict = {}
    for each in subdirs:
        fsams = cmn.cmd2lines('ls %s/%s/*.sam' % (wdir, each))
        total = 0
        mapped = 0
        halfmap = 0
        TpN = 0
        TptN = 0
        for fsam in fsams:
            if not cmn.filexist(fsam):
                continue
            mappedN, halfN, totalN, pN, ptN = aligned_reads(fsam)
            total += totalN
            mapped += mappedN
            halfmap += halfN
            TpN += pN
            TptN += ptN
        # NOTE(review): raises ZeroDivisionError when no SAM file contributed
        # any positions (TptN == 0) — confirm whether that can happen.
        sizeDict[each] = (mapped, total, halfmap, float(TpN) / TptN)
    dn = '%s/mapping_stat.txt' % wdir
    # one tab-separated line per sub-directory: name followed by its counts
    info = [
        '%s\t%s\n' % (name, '\t'.join(map(str, sizeDict[name])))
        for name in sizeDict
    ]
Id = cmn.lastName(line).split('_')[0] Id = Id.replace('NVG-', '').replace('11-BOA-','').replace('LEP-', 'LEP') IDlist.add(Id) fq = os.path.abspath(line) try: fq_groups[Id].append(fq) except KeyError: fq_groups[Id] = [fq] nameDict = get_names_4barcode() fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa' #fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta' seqDict = read_fa(fall) fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa' if cmn.filexist(fadd): seqDict.update(read_fa(fadd)) ftable = '/archive/biophysics/Nick_lab/wli/archive/barcodes/auto_tables/verified_barcodes.fa' seqDict.update(read_autoTable(ftable)) all_genus = set([name.split('_')[0].lower() for name in seqDict]) info = [] missing = [] notFound = [] for sp in IDlist: try: fullname = nameDict[sp] genus = fullname.split()[1] if genus.lower() not in all_genus: notFound.append(sp)