def attempt_to_find_genus_by_abundence(ID, fqlist):
    tmpdir = 'tmp_%s' % ID
    cmn.mkdir(tmpdir)
    os.chdir(tmpdir)
    cmn.write_lines(fqlist, 'fqlist')
    cmd = '/work/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist'
    cmn.run(cmd)
    dn = 'picked_bait.txt'
    if cmn.filexist(dn):
        genus = cmn.txt_read(dn).strip().split('_')[0].split()[0]
    else:
        genus = None
    os.chdir('..')
    cmn.run('cp %s/mapping_stat.info tmpStat/%s_mapping_stat.info' % (tmpdir, ID))
    cmn.run('rm -r %s ' % tmpdir)
    return genus
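# These scripts lean on an unshown helper module `cmn` throughout. Below is a
# minimal stand-in (e.g. saved as cmn.py) inferred purely from how the calls
# are used in this section; every signature and behavior here is an
# assumption, not the real library.
import os
import subprocess

def mkdir(path):
    # assumed: create a directory, tolerating one that already exists
    os.makedirs(path, exist_ok=True)

def run(cmd):
    # assumed: run a shell command string (the scripts pass full pipelines)
    subprocess.run(cmd, shell=True, check=True)

def write_lines(lines, fn):
    # assumed: write one item per line, stringifying non-strings
    with open(fn, 'w') as fp:
        fp.write('\n'.join(map(str, lines)) + '\n')

def write_file(text, fn):
    # assumed: write the text verbatim
    with open(fn, 'w') as fp:
        fp.write(text)

def txt_read(fn):
    # assumed: read the whole file as one string
    with open(fn) as fp:
        return fp.read()

def file2lines(fn):
    # assumed: read a file into a list of newline-stripped lines
    return [line.rstrip('\n') for line in open(fn)]

# assumed: getid() reads IDs one per line, same as file2lines()
getid = file2lines

def cmd2lines(cmd):
    # assumed: capture a shell command's stdout as a list of lines
    out = subprocess.run(cmd, shell=True, capture_output=True, text=True).stdout
    return out.splitlines()

def lastName(path):
    # assumed: basename of a path
    return os.path.basename(path)

def filexist(fn):
    # assumed: True if the file exists and is non-empty
    return os.path.exists(fn) and os.path.getsize(fn) > 0

def filesize(fn):
    # assumed: file size in bytes
    return os.path.getsize(fn)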
def detect_dominated_reads(seqs, seed_i):
    global fraction
    global minN
    sequence_index = seed_i
    form_dict = {}  #record which seq has how many fraction forms
    forms = []  #used for output
    while True:
        subseqs = [seq[:sequence_index] for seq in seqs
                   if len(seq) >= sequence_index]
        if len(subseqs) == 0:
            break
        total = float(len(subseqs))
        adict = Counter(subseqs)
        #print 'seq', seqs
        #print 'sequence_index', sequence_index
        #print 'subseq', subseqs
        #print adict
        maxCount = 0
        maxSeq = ''
        for seq in adict:
            count = adict[seq]
            if count > maxCount:
                maxCount = count
                maxSeq = seq
        forms.append('%.3f\t%s\t%s' % (maxCount / total, len(subseqs), maxSeq))
        #fraction of reads and number of remaining reads
        form_dict[maxSeq] = [maxCount / total, len(subseqs)]
        if maxCount < (fraction * total):
            break
        if len(subseqs) < minN:
            break
        sequence_index += 1
    dn = 'form_stat.txt'
    cmn.write_lines(forms, dn)
    return form_dict
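# A toy run of detect_dominated_reads(), assuming the module-level globals it
# reads (`fraction`, `minN`) are set as below; the values are illustrative,
# not taken from the original scripts.
fraction = 0.8   # the dominant form must cover >= 80% of the remaining reads
minN = 5         # stop once fewer than 5 reads still reach the prefix length

reads = ['ACGTAC', 'ACGTAG', 'ACGTAC', 'ACGTACT', 'ACGTACT', 'ACGT']
stats = detect_dominated_reads(reads, seed_i=4)
# stats maps each dominant prefix to [fraction_of_reads, reads_remaining],
# e.g. {'ACGT': [1.0, 6], 'ACGTA': [1.0, 5], ...}; the prefix grows one base
# per round until the dominant form drops below `fraction` or fewer than
# `minN` reads remain. It also writes the per-round log to form_stat.txt.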
def grep_reads(read, f_libs, direction):
    #reverse the read
    #reverse = ''.join([rdict[i] for i in read[::-1]])
    cmds = []
    for fn in f_libs:
        cmd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/grep_reads.py %s %s %s &' % (
            read, fn, direction)
        cmds.append(cmd)
    cmds.append('\nwait;\n')
    f_job = 'grep_read.job'
    cmn.write_lines(cmds, f_job)
    cmn.run('bash %s ' % f_job)
    #the output dir is grep_out
    dn = 'all_grep_reads.txt'
    cmn.run('cat grep_out/* > %s' % dn)
    fished_reads = cmn.getid(dn)
    return fished_reads
        qseq, sseq = items[-2:]
        if ('-' in qseq) or ('-' in sseq):
            print('detected gap for %s, skip' % sid, file=sys.stderr)
            continue
        for i in range(qhsp_length):
            qI = i + qstart
            sI = i
            codon = hsp[sI]
            try:
                stack_dict[exon][sp][qI].append(codon)
            except KeyError:
                stack_dict[exon][sp][qI] = [codon]

    dn = '%s_%s_endExtend.fa' % (sp, exon)
    cmn.write_lines(end_seqs, dn)
    print('endseq', consensus_seqs(end_seqs))

    maxLength = max([len(seq) for seq in start_seqs])
    pattern = '{:>%s}' % maxLength
    start_seqs = [pattern.format(seq) for seq in start_seqs]
    dn = '%s_%s_startExtend.fa' % (sp, exon)
    cmn.write_lines(start_seqs, dn)
    print('startseq', consensus_seqs(start_seqs))

#tell and output
dn = 'phased_assemblies.contigs'
new = []
cov_dict = {}
biohpc_files = []
for each in falist:
    if '/archive/butterfly/' in each or ('jshen/h' in each):
        alea_files.append(each)
    else:
        biohpc_files.append(each)

new_files = transfer_alea_files(alea_files)
falist = biohpc_files + new_files

#if len(missing) != 0:
#    #try to look for refgenomes
#    fns = cmn.cmd2lines('ls /work/biophysics/mtang/SNP_calling/indexed_references/mitogenomes/*.fa')
#    addback = [fn for fn in fns if cmn.lastName(fn).split('_')[0] in missing]
#    missing = set(missing) - set([cmn.lastName(fn).split('_')[0] for fn in addback])
#    falist += addback

if len(missing) != 0:
    print('ATTENTION! the following IDs are missing sequence!')
    print('\n'.join(missing))
    cmn.write_lines(missing, 'missingMITOs')

falist.append('')  #trailing newline for the output file
cmn.write_lines(falist, 'falist.mito')

#if len(alea_files) != 0:
#    print('the following files need to be transferred from the /archive server')
#    print('\n'.join(alea_files))
header = lines[0].split()[2:]
#cmn.write_lines(header, 'header_names')
#new = [' '.join(['sp%s' % i for i in xrange(len(header))])]
new = [' '.join(header)]
for line in lines[1:]:
    firstChar = ''
    items = line.split()[2:]
    newline = []
    for item in items:
        chars = item.split()
        if firstChar == '':
            firstChar = chars[0]
        if len(chars) == 1:
            if chars[0] == '-':
                newline.append('0,0')
            elif firstChar == chars[0]:
                newline.append('1,0')
            else:
                newline.append('0,1')
        else:
            #has both characters
            newline.append('1,1')
    new.append(' '.join(newline))

dn = fn + '.tmix'
cmn.write_lines(new, dn)
cmn.run('gzip %s' % dn)
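# The two-number encoding above resembles TreeMix-style allele counts (hence
# the gzipped .tmix output). A self-contained illustration, assuming each
# genotype item carries one or two allele characters and the first allele
# seen in a row acts as the reference; encode_item() is a hypothetical helper
# written for this example, not part of the original script.
def encode_item(chars, firstChar):
    if len(chars) == 1:
        if chars[0] == '-':
            return '0,0'          # missing genotype
        elif chars[0] == firstChar:
            return '1,0'          # homozygous for the reference allele
        else:
            return '0,1'          # homozygous for the alternate allele
    return '1,1'                  # heterozygous: one copy of each allele

print(encode_item(['A'], 'A'))       # -> 1,0
print(encode_item(['G'], 'A'))       # -> 0,1
print(encode_item(['A', 'G'], 'A'))  # -> 1,1
print(encode_item(['-'], 'A'))       # -> 0,0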
    label = cmn.lastName(fn)
    if label in finished_maps:
        continue
    isGood = False
    if 'MITO' in label:
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta_mito.py %s' % fn
    else:
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/map2fasta.py %s' % fn
    cmds.append(cmd)

if isGood:
    print('Good news! everything looks good!')
else:
    cmds.append('')
    dn = 'm2fadd.cmds'
    cmn.write_lines(cmds, dn)
    print('Error!!!!!')
    print('There are still %s fasta missing' % (len(cmds) - 1))
    print('please use the following command to submit jobs')
    print('\n>>> /work/biophysics/mtang/SNP_calling/scripts/submit_jobs.py %s [#node] m2fAdd -p 256GB\n' % dn)
    print('-p specifies the partition it is submitted to')
    print('[#node] is the number of nodes; adjust it according to the number of lines in %s' % dn)
    print('\n[IMPORTANT] Please run this check again upon job completion.')
    #/home2/wli/local/bwa-0.7.12/bwa index -p assembly_selfref assembly_selfref.fa > index.log &
    todoref_count += 1
    cmds.append('java -jar $PICARD/CreateSequenceDictionary.jar R=%s.fa O=%s.dict &' % (ref, ref))
    cmds.append('/home2/wli/local/samtools-1.2/samtools faidx %s.fa &' % ref)
    cmds.append('\nwait;\n')

isIndexed = False
print('###############################################')
if todoref_count != 0:
    fcmd = 'gatkIndex.cmd'
    cmn.write_lines(cmds, fcmd)
    fjob = 'gatkIndex.job'
    cmd = '/work/biophysics/mtang/SNP_calling/scripts/decorate_job.py %s -p 256GB > %s' % (
        fcmd, fjob)
    cmn.run(cmd)
    print('please submit %s to the queue for indexing' % fjob)
else:
    print('good news! all references have been indexed')
    isIndexed = True
print('###############################################')

if not isIndexed:
    print('**********************************************')
    print('\nimportant!!!')
    print('please re-run this script after all references are indexed!\n')
    print('**********************************************')
        good_reads.append(name1)
    elif (misM2 + 1) >= misM1:
        #the bad one has more mismatches, so the good one is good!
        #the good one can be 1 bp longer than the bad one
        good_reads.append(name1)
    else:
        if identity >= identity_cut:
            good_reads.append(name1)
        else:
            bad_reads.append((name2, aln1))

print('further classify overlapping reads into:')
print('%s good reads' % len(good_reads))
print('%s bad reads' % len(bad_reads))

#sp2 = name2sp(name2)

#add back the previous IDs
good_reads.append('#' * 100)
for ID in good_IDs:
    name = ID1mapping[ID]
    good_reads.append(name)
cmn.write_lines(good_reads, 'good_reads.txt')

#bad_reads.append('#' * 100)
for ID in bad_IDs:
    name = ID2mapping[ID]
    bad_reads.append((name, seqDict2[name]))

bad_alignments = ['%s %s\n' % (each[0], each[1]) for each in bad_reads]
cmn.write_file(''.join(bad_alignments), 'bad_reads_alignment.txt')
    try:
        clean_seq = clean_lines[ID]
        thread_seq = thread_lines[ID]
        length = min(len(clean_seq), len(thread_seq))
        N = sum([clean_seq[i + 20] != thread_seq[i + 20]
                 for i in range(length - 20)
                 if clean_seq[i + 20] not in gapChars
                 and (thread_seq[i + 20] not in gapChars)])
        Ngap = sum([char in gapChars for char in clean_seq[20:678]])
        if Ngap == 0 and N == 0:
            difflabel += ',goodCC'
        else:
            difflabel += ',CC_%s' % N
            if Ngap != 0:
                difflabel += '[g%s]' % Ngap
    except KeyError:
        difflabel += ',noCC'

    line.append(difflabel)
    try:
        line.append(conta_dict[ID])
    except KeyError:
        line.append('NA')
    line.append(recordLabel)
    new.append('\t'.join(line))

new.append('')
cmn.write_lines(new, 'compare.check')
    line = '%s%s' % (Pname, good_reads[name])
    final.append(line)

### report those filtered by bwa mapping to other species
final.append('#' * 700)
names = sorted(list(bad_dict.keys()),
               key=lambda x: spBased_badnames(x, bad_dict[x]))
#collapsed_names = collapse_same_reads(names, bad_dict, True)
for name in names:
    #try:
    #    Pname = collapsed_names[name]
    #except KeyError:
    #    continue
    Pname = format_name(strformat, name, indel_list)
    line = '%s%s' % (Pname, bad_dict[name])
    final.append(line)

if hasDelLabel:
    dn = 'rescued_read_assembled_mis%s_withDeletion.txt' % misN
else:
    dn = 'rescued_read_assembled_mis%s.txt' % misN
cmn.write_lines(final, dn)

Ngood = len(good_reads)
Njunk = len(junk_reads)
#print junk_reads
statInfo = 'junk:good = %s:%s(%s)\n' % (Njunk, Ngood, float(Njunk) / (Njunk + Ngood))
cmn.write_file(statInfo, 'rescued_ratio_mis%s.txt' % misN)
read = seed_read
Iter = 0
extensions = []
while Iter < upper_iter:
    Iter += 1
    print('running iteration %s' % Iter)
    #prepare the working directory
    wdir = 'extend_iter%s' % Iter
    cmn.mkdir(wdir)
    os.chdir(wdir)
    cmn.write_file(read, 'seed_seq.txt')
    #grep the reads
    fished_reads = grep_reads(read, f_libs, direction)
    cmn.write_lines(fished_reads, 'fished_reads.txt')
    #make stats of the reads
    stat_dict = detect_dominated_reads(fished_reads, len(read))
    #get the longest set under the cutoff
    extended_seq = get_extended(stat_dict)
    cmn.write_file(extended_seq, 'extension_seq.txt')
    os.chdir('..')
    if extended_seq == '':
        print('no extension can be found in iteration %s! exit!' % Iter)
        break
    read = extended_seq[-overlapN:]
    extensions.append(extended_seq)
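# `get_extended()` is not shown in this section. Given how
# detect_dominated_reads() builds form_dict (prefix -> [fraction, coverage]),
# a plausible sketch is: return the longest prefix whose dominant fraction
# and coverage both pass the cutoffs, or '' when nothing qualifies. This is
# an assumption about the original helper, not its actual code.
def get_extended(stat_dict):
    best = ''
    for seq, (frac, coverage) in stat_dict.items():
        # keep only prefixes that pass both the fraction and coverage cutoffs
        if frac >= fraction and coverage >= minN and len(seq) > len(best):
            best = seq
    return best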
        lastPhase = curPhase
        left = move_pointer(left, index + 1, count + 1)
        #print 'left', left

    scafLength = len_dict[scaf]
    if scafLength + 1 != expectIndex:
        gap = scafLength + 1 - expectIndex
        fillN = 'N' * gap
        seq1.append(fillN)
        seq2.append(fillN)

#output the last one
phased_blocks.append(ouput_phased_blocks(lastPosition, right, 'lastOne'))

dnlabel = cmn.lastName(fn).replace('.vcf', '')
sp = dnlabel.split('_')[1]
dn = dnlabel + '_phased.fa'
with open(dn, 'w') as dp:
    dp.write('>%s_ref_or_phase1\n' % sp)
    dp.write(''.join(seq1))
    dp.write('\n')
    dp.write('>%s_called_or_phase2\n' % sp)
    dp.write(''.join(seq2))
    dp.write('\n')

dn = dnlabel + '_phased.blocks'
cmn.write_file(''.join(phased_blocks), dn)

dn = dnlabel + '_phased.letters'
cmn.write_lines(phased_letter, dn)
### report those filtered by bwa mapping to other species
final.append('#' * 700)
names = sorted(list(bad_dict.keys()),
               key=lambda x: spBased_badnames(x, bad_dict[x]))
collapsed_names = collapse_same_reads(names, bad_dict, True)
for name in names:
    try:
        Pname = collapsed_names[name]
    except KeyError:
        continue
    Pname = format_name(strformat, Pname)
    line = '%s%s' % (Pname, bad_dict[name])
    final.append(line)

dn = 'good_read_assembled.txt'
cmn.write_lines(final, dn)

#report those reads inconsistent with the consensus
for name in bad_cc_names:
    #Pname = parse_br_name(name_dict, name)
    Pname = name
    Pname = format_name(strformat, Pname)
    line = '%s%s' % (Pname, seqDict[name])
    info.append(line)
info.append('#' * 200)

#report those reads mapped to other species
names = sorted(bad_reads, key=lambda x: parse_bad_names(x, seqDict[x]))
for name in names:
checklist = list(IDs)
for line in falist:
    sp = cmn.lastName(line).split('_')[0]
    if sp in IDs:
        tmp.append(line)
        try:
            checklist.remove(sp)
        except ValueError:
            pass

#print falist
falist = tmp
#print falist
missing = set(checklist) | set(missing)

alea_files = [each for each in falist
              if '/archive/butterfly/' in each or ('jshen/h' in each)]
if len(alea_files) != 0:
    print('the following files need to be transferred from the /archive server')
    print('\n'.join(alea_files))

cmn.write_lines(falist, 'attempt.fastqlist')

if len(missing) != 0:
    print('ATTENTION! the following IDs are missing sequence!')
    print('\n'.join(missing))
    cmn.write_lines(missing, 'missingIDs')
#add primer if not added
ref_seqs, toAddDict = read_baits(fref)

#log the baits into the dataset
log_newBaits_ifPossible(ref_seqs)

#index the ref here
frefs = parse_ref(ref_seqs)

fqlist = cmn.getid(fqlist)
fq_groups = group_fq(fqlist)

N = cmn.cpu_check()
bwa_cmds = ['module add bwa']
for reflabel in frefs:
    fref = frefs[reflabel]
    fnlabel = cmn.lastName(fref).replace('.fa', '')
    for sp in fq_groups:
        R1, R2, single = fq_groups[sp]
        cmd = 'bwa mem -t %s -B 2 -M %s %s %s | grep "%s" > %s_paired_%s_mapped.sam ' % (
            N, fnlabel, R1, R2, reflabel, sp, fnlabel)
        bwa_cmds.append(cmd)
        cmd = 'bwa mem -t %s -B 2 -M %s %s | grep "%s" > %s_single_%s_mapped.sam ' % (
            N, fnlabel, single, reflabel, sp, fnlabel)
        bwa_cmds.append(cmd)
bwa_cmds.append('\nwait\n')

dn = 'bwa.cmds'
cmn.write_lines(bwa_cmds, dn)
print('guessing fastq file to be %s' % fq)

if fref == '':
    print('Error! cannot find the ref table file!')
    sys.exit()
else:
    print('guessing ref table file to be %s' % fref)

fq_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/fastq.filelist'
if os.path.exists(fq_all):
    aset = set(cmn.getid(fq_all))
else:
    aset = set([])
bset = set(cmn.getid(fq))
newset = aset | bset
newset = filter_best_fastq(newset)
cmn.write_lines(newset, fq_all)

fref_all = '/project/biophysics/Nick_lab/mtang/archive/step1_info/refTable.txt'
if os.path.exists(fref_all):
    aset = set(cmn.getid(fref_all))
else:
    aset = set([])
bset = set(cmn.getid(fref))
newset = aset | bset
cmn.write_lines(newset, fref_all)
mapInfo = ['#both indexes start at 0 (not 1)']
mapDict = {}
new = []
for defline in adict:
    seq = adict[defline]
    label = defline[1:].split('_')[0]
    #goodSeq = [char for i, char in enumerate(seq)
    #           if i not in gapped]
    count = 0
    goodSeq = []
    for i, char in enumerate(seq):
        if i not in gapped:
            #mapInfo.append('%s\t%s\t%s' % (label, count, i))
            mapDict[count] = i
            count += 1
            goodSeq.append(char)
    goodSeq = ''.join(goodSeq)
    new.append('%s\n%s\n' % (defline, goodSeq))

f_label = '%s_%s' % (cmn.lastName(fn).split('.')[0], cmn.lastName(sys.argv[2]))
dn = f_label + '_noGap.fa'
cmn.write_file(''.join(new), dn)

mapInfo += ['%s\t%s' % (key, mapDict[key]) for key in mapDict]
dn = f_label + '_noGap2coding_index.info'
cmn.write_lines(mapInfo, dn)
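# A quick sanity check of the index mapping above, with made-up inputs:
# mapDict maps gap-free positions back to original alignment columns, so
# with gapped = {1, 3} the gap-free sequence keeps columns 0, 2, 4.
gapped = {1, 3}
seq = 'ACGTA'
mapDict, goodSeq = {}, []
count = 0
for i, char in enumerate(seq):
    if i not in gapped:
        mapDict[count] = i
        count += 1
        goodSeq.append(char)
print(''.join(goodSeq), mapDict)  # -> AGA {0: 0, 1: 2, 2: 4}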
seqDict = read_fa(fn)

lengths = [len(seqDict[i]) for i in seqDict]
if len(set(lengths)) != 1:
    print('alignments are not the same length! below is the stat:')
    for i in seqDict:
        print(i, len(seqDict[i]))
    sys.exit()

keys = list(seqDict.keys())
info = [str(len(seqDict))]
for name in keys:
    line = [name]
    for name2 in keys:
        if name2 == name:
            line.append(0.0)
        else:
            dist = compute_distance(name, name2, seqDict)
            line.append(dist)
    info.append('\t'.join(map(str, line)))
info.append('')
cmn.write_lines(info, fn + '.dist')
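# `compute_distance()` is defined elsewhere; a common convention for aligned
# sequences is the uncorrected p-distance over positions where neither
# sequence has a gap or N. This sketch assumes that convention and is not
# the original implementation.
def compute_distance(name, name2, seqDict, gapChars=set('-N')):
    seq1, seq2 = seqDict[name], seqDict[name2]
    # keep only columns where both sequences have a real base
    pairs = [(a, b) for a, b in zip(seq1, seq2)
             if a not in gapChars and b not in gapChars]
    if not pairs:
        return 0.0  # no comparable positions
    diffs = sum(1 for a, b in pairs if a != b)
    return diffs / len(pairs)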
    seqs += adict[name]

for i in range(len(seqs[0])):
    chars = [seq[i] for seq in seqs]
    check_chars = [char for char in chars
                   if (char != '-') and (char != 'N')]
    Nchar = len(check_chars)
    #N = len(set(check_chars))
    count_dict = Counter(check_chars)
    N = len(count_dict)
    if N == 0:
        #skip the all-gapped positions
        continue
    if any([count_dict[char] < Ncut for char in count_dict]):
        #skip non-informative positions
        continue
    elif float(Nchar) / len(chars) < gap_cut:
        #skip gappy positions
        continue
    else:
        line = chars2line(i, chars)
        hetero[N - 1].append(line)

for i, each in enumerate(hetero):
    dn = cmn.lastName(fn) + '.hetero%s' % (i + 1)
    cmn.write_lines(each, dn)
f_list = 'falist'
cmd = 'ls %s/* > %s' % (wdir, f_list)
cmn.run(cmd)
falist = [os.path.abspath(fn) for fn in cmn.file2lines(f_list)]

Njob = 3
#use integer division so the job and core counts stay whole numbers in Python 3
fa_size = cmn.filesize(falist[0]) // 1024 // 1024  #size in MB
Njob = max(Njob, 50 * fa_size // 5000 + 1)
Ncores = 48 * Njob // 100
print('number of cores:', Ncores)
print('number of jobs:', Njob)

cmds = []
outdir = 'making_fastme_trees'
cmn.mkdir(outdir)
for fa in falist:
    cmd = 'cd %s; python /project/biophysics/Nick_lab/wli/sequencing/scripts/fasta2fastmeTree.py %s %s' % (
        outdir, fa, Ncores)
    cmds.append(cmd)
cmn.write_lines(cmds, 'fastme.cmds')

cmd = 'python /home2/wli/my_programs/submit_jobs.py fastme.cmds %s %s -p 256GB' % (
    Njob, project)
cmn.run(cmd)
def fasta2chpInput(fn, fdonor, freceipt):
    ##############################
    gapCut = 0.1
    Napp = 4
    gapChars = set(list('N-X'))
    linkage = 10000000.0  #the larger, the stronger the linkage
    ##############################
    #outlabel = 'chp_%s_%s_%s_gap%s_info%s' % (cmn.lastName(fn), cmn.lastName(fdonor), cmn.lastName(freceipt), gapCut, Napp)
    outlabel = cmn.lastName(fn)

    #parse the sequences
    seqDict, seqLength = read_fa(fn)
    seqGroups = group_seqDict(seqDict)
    included_IDs = set(seqGroups.keys())

    donor_dict = parse_popDef(fdonor, freceipt, included_IDs)
    donor_keys = list(donor_dict.keys())
    random.shuffle(donor_keys)
    print(donor_keys)

    donorIDs = []
    donorF = []
    key_groups = []  #used to fill in gaps for each group
    for key in donor_keys:  #just to guarantee ordering
        IDs = donor_dict[key]
        key_groups.append(IDs)
        donorIDs += list(IDs)
        line = 'p%s %s\n' % (key, len(IDs) * 2)
        donorF.append(line)

    dn = outlabel + '.donor'
    cmn.write_file(''.join(donorF), dn)

    receiptIDs = [line.split()[0] for line in cmn.file2lines(freceipt)]
    receiptIDs = list(set(receiptIDs) & included_IDs)
    receiptInfo = parse_receiptInfo(freceipt, receiptIDs)
    dn = outlabel + '_ind_record.list'
    cmn.write_lines(receiptInfo, dn)

    key_groups.append(receiptIDs)

    ids = []
    ordered_keys = donorIDs + receiptIDs
    phase = [str(len(donorIDs) * 2)]
    phase.append(str(len(donorIDs) + len(receiptIDs)))

    Npos = 0
    Pline = ['P']
    phaseSeqs = [[] for _ in range(len(ordered_keys) * 2)]
    for i in range(seqLength):
        chars = []
        isBad = False
        for keys in key_groups:
            subchars = take_position_chars(seqGroups, keys, i)
            #require that half of the subchars are not gaps
            nonGap_sub = [char for char in subchars if char not in gapChars]
            #print i, subchars, nonGap_sub
            if len(nonGap_sub) < len(subchars) * 0.5:
                isBad = True
                break
            #randomly sample positions to fill in gaps
            newSubChars = []
            for char in subchars:
                if char in gapChars:
                    newChar = random.sample(nonGap_sub, 1)[0]
                else:
                    newChar = char
                newSubChars.append(newChar)
            chars += newSubChars
        #print 'chars', chars
        if isBad:
            continue
        nonGaps = [char for char in chars if char not in gapChars]
        #print i, chars
        #print i, nonGaps
        if len(nonGaps) > ((1 - gapCut) * len(chars)):
        #if len(nonGaps) == len(chars):
            #filter for informative positions
            count_dict = Counter(nonGaps)
            if len(count_dict) != 2:
                continue
            if any([count_dict[key] < Napp for key in count_dict]):
                continue
            #print i, 'isGood'
            Pline.append(i)
            Npos += 1
            for j, char in enumerate(chars):
                if char in gapChars:
                    char = '0'
                phaseSeqs[j].append(char)

    dn = outlabel + '.hap'
    phase.append(str(Npos))
    phase.append(' '.join(map(str, Pline)))
    print('number of positions: %s' % (len(Pline) - 1))
    phase.append('S' * Npos)
    phase += [''.join(map(str, each)) for each in phaseSeqs]
    phase.append('')
    cmn.write_lines(phase, dn)

    #parse out recomb
    linkage = float(linkage)
    positions = Pline[1:]
    recomb = ['pos morgan.dist']
    for i in range(len(positions) - 1):
        p1 = positions[i]
        p2 = positions[i + 1]
        dist = p2 - p1
        morgan = dist / linkage
        recomb.append('%s %s' % (p1, morgan))
    recomb.append('%s 0' % (positions[-1]))
    recomb.append('')
    dn = outlabel + '.recomb'
    cmn.write_lines(recomb, dn)

    return outlabel
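# A tiny worked example of the .recomb construction in fasta2chpInput():
# with linkage = 1e7, adjacent SNP positions map to Morgan distances of
# (p2 - p1) / linkage, and the last position gets 0. The positions below
# are made-up alignment columns.
positions = [10, 250, 900]
linkage = 10000000.0
recomb = ['pos morgan.dist']
for p1, p2 in zip(positions, positions[1:]):
    recomb.append('%s %s' % (p1, (p2 - p1) / linkage))
recomb.append('%s 0' % positions[-1])
# -> ['pos morgan.dist', '10 2.4e-05', '250 6.5e-05', '900 0']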
    hasCombined = True
    if len(old_fastqs) == 0:
        #no old data
        print('no old libs found for %s' % label)
        cmn.run('ln -s %s' % dn)
    else:
        #has old data
        print('combining old libs for %s' % label)
        old_fastqs, dup_fastqs = remove_duplication(old_fastqs)
        cmn.run('cp %s %s' % (dn, wdir))
        log_info.append('%s\t%s\n' % (label, dn))
        comb_fn = '%s/%s' % (wdir, cmn.lastName(dn))
        for old_fastq in old_fastqs:
            cmn.run('cat %s >> %s' % (old_fastq, comb_fn))
            log_info.append('%s\t%s\n' % (label, old_fastq))

if hasCombined:
    cmn.write_file(''.join(log_info), '%s/combined_libs.log' % wdir)

#make statistics for the amount of data
fastq_groups = group_fastq(fastqs)
new = []
for key in fastq_groups:
    fns = fastq_groups[key]
    cmd = 'python /work/biophysics/mtang/SNP_calling/scripts/check_fastq_size.py %s %s' % (
        key, ','.join(fns))
    new.append(cmd)
new.append('')
cmn.write_lines(new, 'fastq_amount.cmds')
dn = outlabel + '.ids'
cmn.write_file(''.join(ids), dn)

phase = [str(Nhap)]
Npos = 0
Pline = ['P']
phaseSeqs = [[] for _ in range(Nhap)]
for i in range(seqLength):
    chars = []
    for key in ordered_keys:
        chars += [seq[i] for seq in seqGroups[key]]
    nonGaps = [char for char in chars if char not in gapChars]
    if len(nonGaps) > ((1 - gapCut) * len(chars)):
        Pline.append(i)
        Npos += 1
        for j, char in enumerate(chars):
            if char in gapChars:
                char = '0'
            phaseSeqs[j].append(char)

dn = outlabel + '.phase'
phase.append(str(Npos))
phase.append(' '.join(map(str, Pline)))
phase += [''.join(map(str, each)) for each in phaseSeqs]
phase.append('')
cmn.write_lines(phase, dn)
#fqlist = cmn.cmd2lines('ls /project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/tmp_link_fastq/%s*.fastq' % sample)
#fqlist = cmn.cmd2lines('ls /work/biophysics/wli/workspace/filtered_6313*q')
wdir = 'mitoD_%s' % sample
cmn.mkdir(wdir)
os.chdir(wdir)
cwd = os.getcwd()

info = template.replace('[cwd]', cwd)
info = info.replace('[fq_files]', ' '.join(fqlist))
info = info.replace('[sample]', sample)

#prepare the quake infiles
fqlist_local = []
for fq in fqlist:
    cmn.run('ln -s ' + fq)
    fqlist_local.append(cmn.lastName(fq))
cmn.write_lines(fqlist_local, 'fqlist')
cmn.run('ln -s fqlist infiles')

#make the fq2fa command
quake_fqlist = [each.replace('.fastq', '.cor.fastq') for each in fqlist_local]
fq2fa_cmds = ['rm %s.fa 2> /dev/null' % sample]
for fq in quake_fqlist:
    cmd = 'fq2fa %s >> %s.fa;' % (fq, sample)
    fq2fa_cmds.append(cmd)
cmn.write_lines(quake_fqlist, 'fqlist.cor')

cmd = '\n'.join(fq2fa_cmds)
info = info.replace('[fq2fa_commands]', cmd)

noWolba_fqlist = ['noWolb_%s' % each for each in quake_fqlist]
info = info.replace('[noWolba_fastq]', ' '.join(noWolba_fqlist))
fns = cmn.cmd2lines('ls *.map | grep -v all | grep -v test | grep -v concat')
#fns = fns[:1]
f_label = '15101E05_snp.vcf'

#make the scaffold index
#cmd = "grep -v '^#' 15101E05_snp.vcf | cut -f 1,2 > index_header"
if not cmn.filexist('index_header'):
    #cmn.run(cmd)
    make_index_header('15101E05_snp.vcf', 'assembly_v2_length.txt')

header = ['scaffold', 'index']
for fn in fns:
    label = fn.split('_')[0]
    header.append(label + '_f')
    header.append(label + '_m')

cmn.write_lines(fns, 'map_name_order')
cmn.write_file('\t'.join(header) + '\n', 'table_header')

cmd = 'cp table_header all_concat.map;'
cmd += 'paste index_header %s >> all_concat.map;' % ' '.join(fns)
if not cmn.filexist('all_concat.map'):
    print(cmd)
    cmn.run(cmd)
else:
    print('the final file all_concat.map already exists, skipping!')
    if len(taken) == 0:
        missing.append(ID)
    elif len(taken) == 1:
        falist.append(taken[0])
    else:
        falist.append(tell_best_fa(taken))
        print('checklog', ID, taken)

alea_files = []
biohpc_files = []
for each in falist:
    if '/archive/butterfly/' in each or ('jshen/h' in each):
        alea_files.append(each)
    else:
        biohpc_files.append(each)

new_files = transfer_alea_files(alea_files)
falist = biohpc_files + new_files

falist.append('')  #trailing newline for the output file
cmn.write_lines(falist, 'statlist')

if len(missing) != 0:
    print('ATTENTION! the following IDs are missing sequence!')
    print('\n'.join(missing))
    cmn.write_lines(missing, 'missingIDs')

#if len(alea_files) != 0:
#    print('the following files need to be transferred from the /archive server')
#    print('\n'.join(alea_files))
        newConfig.append('#define OUTFILE structure.output')
    elif '#define MAXPOPS' in line:
        newConfig.append('#define MAXPOPS %s' % K)
    else:
        newConfig.append(line)
newConfig.append('')

cmds = []
for eachtime in range(rep):
    outdir = 'structureK%s/r%s' % (K, eachtime)
    cmn.mkdir(outdir)
    os.chdir(outdir)
    dn = 'mainparams'
    cmn.write_lines(newConfig, dn)
    cmd = 'cd %s; touch extraparams; /home2/wli/local/Structure/bin/structure > runtime.log & cd %s' % (
        outdir, cwd)
    #cmn.run(cmd)
    cmds.append(cmd)
    cmd = 'sleep 55;'
    cmds.append(cmd)
    os.chdir(cwd)

cmds.append('\nwait\n')
dn = 'masterK%s.bash' % K
cmn.write_lines(cmds, dn)
cmd = 'bash %s' % dn
    #fn = cmn.lastName(fn)
    fqs = cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/1_bwa_align/%s*.fq' % sp)
    fqs += cmn.cmd2lines(
        'ls /project/biophysics/Nick_lab/wli/sequencing/Nick_request/Heli_map/SNP_calling/1_bwa_align/%s*.fastq' % sp)
    if len(fqs) == 0:
        print('cannot find fastq for %s!' % sp)
        os.chdir('..')
        continue

    cmd, fsams = make_bwa_cmds(fqs, fn)
    dn = 'sam_filelist'
    cmn.write_lines(fsams, dn)

    info = template.replace('assembly_selfref', asslabel)
    info = info.replace('[bwa_cmds]', cmd)
    info = info.replace('[WL_sam_filelist]', dn)
    info = info.replace('[WL_preprocessing]', '\n'.join(step1cmds))

    #make the snp-calling cmds
    #f_sam = merge_sams(sp, fsams)
    info = info.replace('5328', sp)
    info = info.replace('[WL_cwd]', os.getcwd())

    info2 = template2.replace('assembly_selfref', asslabel)
    info2 = info2.replace('5328', sp)
    info2 = info2.replace('[WL_cwd]', os.getcwd())
    fq = os.path.abspath(fq)
    ID = cmn.lastName(fq).split('_')[0]
    try:
        groupDict[ID].append(fq)
    except KeyError:
        groupDict[ID] = [fq]

fmitolist = os.path.abspath(fmitolist)

for sample in groupDict:
    fqlist = groupDict[sample]
    wdir = 'mitoRef_%s' % sample
    cmn.mkdir(wdir)
    os.chdir(wdir)
    cwd = os.getcwd()
    info = template.replace('[cwd]', cwd)
    cmn.write_lines(fqlist, 'fqlist')

    cmd = 'cat %s | xargs cat > ref_mito.fa; module add bwa; bwa index ref_mito.fa' % fmitolist
    cmn.run(cmd)

    bwa_commands = []
    if len(fqlist) == 3:
        R1 = [each for each in fqlist if '_R1' in each][0]
        R2 = [each for each in fqlist if '_R2' in each][0]
        single = [each for each in fqlist if '_single' in each][0]
        bwa_commands.append('bwa mem -M ref_mito.fa %s %s > paired.sam' % (R1, R2))
        bwa_commands.append('bwa mem -M ref_mito.fa %s > singleton.sam' % (single))
    elif len(fqlist) == 2:
        R1 = [each for each in fqlist if '_R1' in each][0]
        R2 = [each for each in fqlist if '_R2' in each][0]