def alnDict2output(aln_dict, dn, order='sorting'):
    """Write an alignment dictionary to ``dn`` as ``name sequence`` lines.

    Args:
        aln_dict: mapping of sequence name -> aligned sequence (a string or
            an iterable of characters; it is joined with ``''.join``).
        dn: output file name.
        order: ``'sorting'`` ranks names by ``number4sorting`` applied to the
            alignment, ``'grouping'`` ranks them by ``group_by_spnames``
            applied to the name, anything else sorts names alphabetically.

    Returns:
        None. When ``aln_dict`` is empty, ``dn`` is only touched.
    """
    if len(aln_dict) == 0:
        cmn.run('touch %s' % dn)
        return None

    # Names are left-justified to the width of the longest name.
    name_width = max([len(key) for key in aln_dict])
    name_template = '{:<%s}' % name_width

    if order == 'sorting':
        ordered = sorted(aln_dict, key=lambda key: number4sorting(aln_dict[key]))
    elif order == 'grouping':
        # used to output inconsistent groups: rank by grouping of species IDs
        ordered = sorted(aln_dict, key=lambda key: group_by_spnames(key))
    else:
        ordered = sorted(aln_dict)

    # NOTE: the target length is pinned to 0, so the gap padding below never
    # triggers; it is kept only to mirror the existing behaviour.
    target_length = 0

    rows = []
    for key in ordered:
        aln = aln_dict[key]
        shortfall = target_length - len(aln)
        if shortfall > 0:
            aln += '-' * shortfall
        rows.append('%s %s\n' % (name_template.format(key), ''.join(aln)))
    cmn.write_file(''.join(rows), dn)
def read_baits(fn):
    """Read a whitespace-separated bait table (species, name, sequence).

    Sequences of length 698 are taken as-is. Sequences of length 658 are
    passed through ``add_primer``. Any other length prints an error and
    exits. If at least one sequence was not 698 long, the (fixed) table is
    written back to ``fn``.

    Returns:
        A pair ``(by_key, by_name)`` where ``by_key`` maps
        ``'<species>_<name>'`` to the sequence and ``by_name`` maps
        ``'<name>'`` to the sequence.
    """
    by_key = {}
    by_name = {}
    rewritten = []
    every_bait_has_primer = True

    for record in cmn.file2lines(fn):
        if not record.strip():
            continue
        sp, name, seq = record.split()
        print(len(seq))
        if len(seq) != 698:
            every_bait_has_primer = False
            if len(seq) != 658:
                print('Error! didn\'t recognize the length of the bait %s %s' % (sp, name))
                sys.exit()
            # fixable: 658-long baits get the primer added
            seq = add_primer(seq)
        rewritten.append('%s\t%s\t%s\n' % (sp, name, seq))
        by_key['%s_%s' % (sp, name)] = seq
        by_name[name] = seq

    if not every_bait_has_primer:
        print('revise the input baits to add primer...')
        cmn.write_file(''.join(rewritten), fn)
    return by_key, by_name
def prune_tree(ftree, fseq):
    """Prune the tree in ``ftree`` down to the IDs found in FASTA ``fseq``.

    The IDs are whatever follows ``>`` on the header lines of ``fseq``
    (collected with ``grep``/``cut``). The pruned tree is written with
    ``format=1`` to ``prune_tree.tre``.

    Returns:
        The output file name, ``'prune_tree.tre'``.
    """
    dn = 'prune_tree.tre'
    tree = ete3.Tree(ftree)
    header_cmd = 'grep ">" %s|cut -d ">" -f 2' % fseq
    keep_ids = cmn.cmd2lines(header_cmd)
    tree.prune(keep_ids)
    newick = tree.write(format=1)
    cmn.write_file(newick, dn)
    return dn
def makeBlastDatabase(seqDict):
    """Build a nucleotide BLAST database from ``seqDict``.

    Sequences that are empty after stripping leading/trailing ``N``, ``-``
    and ``X`` characters are left out. The FASTA is written to
    ``db4picking.fa`` and indexed with ``makeblastdb``.

    Returns:
        The FASTA/database file name, ``'db4picking.fa'``.
    """
    dn = 'db4picking.fa'
    records = []
    for name in seqDict:
        if seqDict[name].strip('N-X') == '':
            continue
        records.append('>%s\n%s\n' % (name, seqDict[name]))
    cmn.write_file(''.join(records), dn)
    cmn.run('module add blast; makeblastdb -dbtype=nucl -in=%s' % dn)
    return dn
def do_barcode_blast(sequence):
    """BLAST a single FASTA record against the species barcode database.

    Args:
        sequence: FASTA text (``>header`` line followed by the sequence).

    Returns:
        The lines produced by ``blastn`` in tabular format with the columns
        ``sseqid qlen slen length pident``.

    The query is written to ``/tmp/<label>.fa`` where ``<label>`` is derived
    from the first whitespace-delimited token of the header. The query file
    is intentionally left in place (its removal was disabled).
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa'

    # First header token without the leading '>'.
    header_token = sequence.split()[0][1:]
    namelabel = header_token.split('|')[0].replace('*', '').split('[')[0]
    namelabel = namelabel.replace('"', '').replace("'", '')
    # A '/' in the label would make the temp path point into a non-existent
    # sub-directory of /tmp, so flatten it.
    namelabel = namelabel.replace('/', '_')

    fquery = '/tmp/%s.fa' % namelabel
    cmn.write_file(sequence, fquery)

    cmd = 'module add blast; blastn -query %s -db %s ' % (fquery, fdb)
    cmd += '-outfmt \'6 sseqid qlen slen length pident\''
    return cmn.cmd2lines(cmd)
def update_baits(bait_dict):
    """Write each bait to ``baits/bait<i>.fa`` and index it with bwa.

    Args:
        bait_dict: mapping of bait name -> sequence (string or iterable of
            characters; it is joined with ``''.join``).

    Returns:
        Mapping of bait name -> path of the FASTA file written for it.
    """
    fasta_by_name = {}
    for idx, name in enumerate(bait_dict):
        prefix = 'bait%s' % idx
        fasta_path = 'baits/%s.fa' % prefix
        record = '>%s\n%s\n' % (name, ''.join(bait_dict[name]))
        cmn.write_file(record, fasta_path)
        # the bwa index files use the bare 'bait<i>' prefix
        cmn.run('module add bwa; bwa index %s -p %s' % (fasta_path, prefix))
        fasta_by_name[name] = fasta_path
    return fasta_by_name
def output_matrix(arr, fn=None):
    """Flatten an array into tab-separated ``index... value`` lines.

    Every element yields one line: its index along each axis followed by the
    value, all separated by tabs.

    Args:
        arr: array (anything ``np.ndenumerate`` accepts).
        fn: optional output file name.

    Returns:
        The newline-joined text when ``fn`` is None; otherwise the text is
        written to ``fn`` and None is returned.
    """
    lines = [
        '%s%s' % (''.join('%s\t' % position for position in index), content)
        for index, content in np.ndenumerate(arr)
    ]
    text = '\n'.join(lines)
    if fn is not None:
        cmn.write_file(text, fn)
        return None
    return text
def parse_inserted_gap(ID, seq, label):
    """Try to repair ``seq`` using a BLAST hit when an insertion was logged.

    Nothing is done unless ``sampleRun_<ID>/bait_insertion`` exists. When it
    does, ``seq`` (gaps turned into ``N`` and outer ``N`` stripped) is
    blasted against the verification barcode database. The first hit that
    matches one of the accepted coordinate patterns, and whose query columns
    (those not aligned to a subject gap) have the expected length, replaces
    ``seq[qstart-1:qend]``.

    Samples for which no hit qualifies are appended to
    ``cannot_fixed_indel.txt``.

    Returns:
        The repaired sequence, or ``seq`` unchanged.
    """
    fn = 'sampleRun_%s/bait_insertion' % ID
    if not cmn.filexist(fn):
        return seq

    # check what is the right range of sequence
    print('runing blast to fix %s' % ID)
    trimmed = seq.replace('-', 'N').strip('N')
    fquery = 'tmpInput.fa'
    cmn.write_file('>input\n%s\n' % trimmed, fquery)

    dn = 'tmpBr_%s.txt' % label
    cmd = 'blastn -query %s -db /project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa ' % fquery
    cmd += '-task blastn-short -dust no -outfmt \'6 qseqid sseqid qstart qend sstart send evalue pident qseq sseq\' -out %s' % dn
    cmn.run(cmd)

    # (sstart, send, qstart) -> number of query columns that must remain
    # after dropping the columns aligned to a gap in the subject.
    accepted_hits = {
        (1, 658, 21): 658,
        (2, 655, 22): 654,
    }

    isFixed = False
    for hit in cmn.file2lines(dn):
        items = hit.strip().split()
        qstart, qend, sstart, send = [int(value) for value in items[2:6]]
        expected_length = accepted_hits.get((sstart, send, qstart))
        if expected_length is None:
            continue
        qseq, sseq = items[-2:]
        kept = [q_char for q_char, s_char in zip(qseq, sseq) if s_char != '-']
        if len(kept) != expected_length:
            continue
        seq = seq[:qstart - 1] + ''.join(kept) + seq[qend:]
        print('solution found for %s' % ID)
        isFixed = True
        break

    if not isFixed:
        cmn.append_file('%s\t%s\n' % (ID, label), 'cannot_fixed_indel.txt')
    return seq
def parse_ref(seqDict):
    """Write every reference sequence to ``baits/bait<i>.fa`` and bwa-index it.

    Names are cleaned before use: ``*`` is removed and double quotes are
    turned into single quotes.

    Args:
        seqDict: mapping of reference name -> sequence.

    Returns:
        Mapping of cleaned name -> path of the FASTA file written for it.
    """
    cmn.mkdir('baits')
    fasta_by_name = {}
    for idx, raw_name in enumerate(seqDict):
        prefix = 'bait%s' % idx
        fasta_path = 'baits/%s.fa' % prefix
        clean_name = raw_name.replace('*', '').replace('"', "'")
        cmn.write_file('>%s\n%s\n' % (clean_name, seqDict[raw_name]), fasta_path)
        cmn.run('module add bwa; bwa index %s -p %s' % (fasta_path, prefix))
        fasta_by_name[clean_name] = fasta_path
    return fasta_by_name
def do_barcode_blast(sequence, seqDict):
    """BLAST one FASTA record against a database built from ``seqDict``.

    Args:
        sequence: FASTA text (``>header`` line followed by the sequence).
        seqDict: mapping of name -> sequence; turned into a BLAST database
            with ``makeBlastDatabase`` on every call.

    Returns:
        The ``blastn`` output lines in tabular format with the columns
        ``sseqid qlen slen length pident``.

    The temporary query file ``/tmp/<label>.fa`` is removed afterwards.
    """
    fdb = makeBlastDatabase(seqDict)

    # Derive a file-name-safe label from the first token of the header.
    label = sequence.split()[0][1:].split()[0]
    label = label.split('|')[0].replace('*', '')
    label = label.split('[')[0].replace('"', '').replace("'", '')
    label = label.replace('/', '_')

    fquery = '/tmp/%s.fa' % label
    cmn.write_file(sequence, fquery)

    blast_cmd = 'module add blast; blastn -query %s -db %s ' % (fquery, fdb)
    blast_cmd += '-outfmt \'6 sseqid qlen slen length pident\''
    hits = cmn.cmd2lines(blast_cmd)

    cmn.run('rm %s' % fquery)
    return hits
def get_mash_file(name, seq):
    """Return the mash sketch file for ``name``, creating it on first use.

    Sketch paths are remembered in the module-level ``mash_file_dict``. On a
    miss, the sequence (with ``-`` and ``N`` removed) is written to
    ``/tmp/<name>``, sketched with ``mash sketch`` using the module-level
    ``cpu`` thread count, and ``/tmp/<name>.msh`` is recorded and returned.
    """
    global mash_file_dict, cpu
    try:
        return mash_file_dict[name]
    except KeyError:
        pass

    fasta_path = '/tmp/%s' % name
    cleaned = ''.join(seq).replace('-', '').replace('N', '')
    cmn.write_file('>%s\n%s\n' % (name, cleaned), fasta_path)
    cmn.run('/home2/wli/local/mash-Linux64-v1.1.1/mash sketch -n -p %s %s' % (
        cpu, fasta_path))

    sketch_path = fasta_path + '.msh'
    mash_file_dict[name] = sketch_path
    return sketch_path
def do_barcode_blast(sequence):
    """Ungapped BLAST of one FASTA record against the verification barcodes.

    Args:
        sequence: FASTA text (``>header`` line followed by the sequence).

    Returns:
        The lines of the tabular BLAST report with the columns
        ``sseqid slen length pident qstart qend qseq sseq``.

    Both the temporary query (``/tmp/<label>.fa``) and the report
    (``/tmp/<label>.fa.br``) are removed before returning.
    """
    fdb = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes_4verify.fa'

    # Derive a file-name-safe label from the first token of the header.
    label = sequence.split()[0][1:].split()[0]
    label = label.split('|')[0].replace('*', '')
    label = label.split('[')[0].replace('"', '').replace("'", '')
    for separator in ('(', '/', '\\'):
        label = label.split(separator)[0]

    fquery = '/tmp/%s.fa' % label
    fbr = fquery + '.br'
    cmn.write_file(sequence, fquery)

    blast_cmd = 'module add blast; blastn -max_target_seqs 5000 -query %s -db %s -ungapped ' % (fquery, fdb)
    blast_cmd += '-outfmt \'6 sseqid slen length pident qstart qend qseq sseq\''
    blast_cmd += ' -out %s ' % fbr
    cmn.run(blast_cmd)

    hits = cmn.file2lines(fbr)
    for leftover in (fquery, fbr):
        cmn.run('rm %s' % leftover)
    return hits
def make_index_header(fvcf, flen):
    """Write one ``scaffold<TAB>position`` line per base to ``index_header``.

    Scaffolds are taken from the first column of the lines in ``fvcf`` that
    start with ``scaffold``, in order of first appearance. Their lengths come
    from ``read_length_info(flen)``; positions are 1-based.
    """
    lendict = read_length_info(flen)

    seen = set()
    scaffolds_in_order = []
    with open(fvcf) as handle:
        for record in handle:
            if record.startswith('scaffold'):
                scaf = record.split()[0]
                if scaf not in seen:
                    seen.add(scaf)
                    scaffolds_in_order.append(scaf)

    rows = []
    for scaf in scaffolds_in_order:
        length = lendict[scaf]
        rows.extend('%s\t%s\n' % (scaf, offset + 1) for offset in range(length))

    dn = 'index_header'
    cmn.write_file(''.join(rows), dn)
def check_difference(seq1, seq2):
    """Count the differences between two sequences.

    Equal-length sequences are compared column by column, skipping every
    column where either character is in ``gapChars`` (defined outside this
    function). Otherwise the two sequences are written to ``tmpSeq1.fa`` and
    ``tmpSeq2.fa``, aligned with ``blastn -query/-subject``, and the result
    is ``total - identical`` taken from the first ``Identities = a/b`` entry
    of the report.

    Returns:
        The number of mismatching positions.
    """
    print(len(seq1), len(seq2))

    if len(seq1) == len(seq2):
        mismatches = 0
        for left, right in zip(seq1, seq2):
            if left in gapChars or right in gapChars:
                continue
            if left != right:
                mismatches += 1
        return mismatches

    cmn.write_file(seq1, 'tmpSeq1.fa')
    cmn.write_file(seq2, 'tmpSeq2.fa')
    report = cmn.cmd2info('blastn -query tmpSeq1.fa -subject tmpSeq2.fa')
    # the report contains e.g. "Identities = 656/656 (100%)"
    fraction = cmn.find_between(report, 'Identities = ', ' (')
    identN, totalN = [int(part) for part in fraction.split('/')]
    # NOTE(review): ID is not a parameter or local; it is looked up in the
    # enclosing/module scope when this line runs - confirm it is defined.
    cmn.write_file(report, 'checkTmp%s.br' % (ID))
    return totalN - identN
# Randomly sample N alignment columns (the same columns for every sequence)
# from the FASTA given on the command line and write them to
# <input name without .fasta/.fa>_rd<label>.fa
try:
    fn = sys.argv[1]
except IndexError:
    print("Usage: *.py", file=sys.stderr)
    sys.exit()

N = 1000000  #get this much positions
# Integer division keeps the label as "1M"; true division would produce
# "1.0M" and leak into the output file name.
label = '%sM' % (N // 1000000)

seqDict, length = read_fa(fn)

if length < N:
    print('sequence length is shorter than %s, exit!' % N)
    sys.exit()

# random.sample accepts a range directly, so the full index list does not
# have to be materialised.
positions = sorted(random.sample(range(length), N))

new = []
for name in seqDict:
    seq = seqDict[name]
    newSeq = [seq[i] for i in positions]
    new.append('>%s\n%s\n' % (name, ''.join(newSeq)))

dn = cmn.lastName(fn).replace('.fasta', '').replace(
    '.fa', '') + '_rd%s.fa' % (label)
cmn.write_file(''.join(new), dn)
hasCombined = True if len(old_fastqs) == 0: # no old data print('no old libs found for %s' % label) cmn.run('ln -s %s' % dn) else: #has old data print('combining old libs for %s' % label) old_fastqs, dup_fastqs = remove_duplication(old_fastqs) cmn.run('cp %s %s' % (dn, wdir)) log_info.append('%s\t%s\n' % (label, dn)) comb_fn = '%s/%s' % (wdir, cmn.lastName(dn)) for old_fastq in old_fastqs: cmn.run('cat %s >> %s' % (old_fastq, comb_fn)) log_info.append('%s\t%s\n' % (label, old_fastq)) if hasCombined: cmn.write_file(''.join(log_info), '%s/combined_libs.log' % wdir) #make statistics for data amount fastq_groups = group_fastq(fastqs) new = [] for key in fastq_groups: fns = fastq_groups[key] cmd = 'python /work/biophysics/mtang/SNP_calling/scripts/check_fastq_size.py %s %s' % ( key, ','.join(fns)) new.append(cmd) new.append('') cmn.write_lines(new, 'fastq_amount.cmds')
#for tacc, copy fastq to wdir print('copying fastq for %s...' % olabel) for fastq in fastqs: cmd = 'cp %s %s' % (fastq, wdir) cmn.run(cmd) fastqs = [cmn.lastName(fastq) for fastq in fastqs] cmd, fnewSams = make_bwa_cmds(fastqs, 'assembly_selfref_v2', wdir) info = info.replace('[WL_mapping_cmds]', cmd) #7. merge mapped sams cmd = 'python /work/00412/mtang/sequencing/scripts/merge_mapped_sams.py %s_step2.sam %s' % ( olabel, ' '.join(fnewSams)) info = info.replace('[WL_merge_sam_cmds]', cmd) #8. re-run gatk #this step has been fully included in the template fjob = 'job_files/gatk%s.job' % olabel cmn.write_file(info, fjob) fjobs.append(fjob) print( '##########################################################################' ) print('please use the following cmd to submit unfinished jobs') print('cd job_files') print('\n'.join(['sbatch %s' % cmn.lastName(fjob) for fjob in fjobs])) print( '##########################################################################' )
def get_query_sequence(seqDict, genus, sp):
    """Pick the barcode in ``seqDict`` to use as the query for a sample.

    Selection order:
      1. a name equal to ``<genus>_<sp>`` (case-insensitive) wins outright;
      2. otherwise names whose first ``_``-separated field equals the genus
         (all names when none match, which is logged as ``noGenus``);
      3. if more than one candidate remains, those having ``sp`` as one of
         their ``_``-separated fields (logged as ``noSpecies`` when none);
      4. if ``pickingLog.txt`` exists at that point, ``auto_rebait.py`` is
         run and the candidates are replaced by the lines of
         ``picked_bait.txt``;
      5. names starting with ``*`` are preferred, then names containing
         ``*``; among the remaining candidates the longest sequence wins.

    Returns:
        ``(fasta, qlen)``: the FASTA record of the chosen barcode and the
        length of its sequence with every ``N`` removed.
    """
    log_file = 'pickingLog.txt'

    def as_query(picked):
        # FASTA record plus the count of non-N characters of the sequence
        picked_seq = seqDict[picked]
        record = '>%s\n%s\n' % (picked, picked_seq)
        return record, len(picked_seq.replace('N', ''))

    names = list(seqDict.keys())

    # try to look up the exact match first
    expected_name = '%s_%s' % (genus, sp)
    exact = [each for each in names if each.upper() == expected_name.upper()]
    if exact:
        print('found exact match %s' % exact[0])
        return as_query(exact[0])

    # restrict to names whose leading field is the genus
    good_names = [each for each in names
                  if genus.upper() == each.upper().split('_')[0]]
    useGenus = len(good_names) > 0
    if useGenus:
        cmn.run('rm pickingLog.txt 2> /dev/null')
    else:
        # sp is just 'sp'
        print('can not find barcode for genus keyword "%s"' % genus)
        good_names = names
        cmn.write_file('noGenus\n', log_file)

    if len(good_names) > 1:
        # try to refine it by species keyword
        with_species = [each for each in good_names
                        if sp.upper() in each.upper().split('_')]
        if with_species:
            good_names = with_species
        else:
            cmn.append_file('noSpecies\n', log_file)

    # auto pick sequences for those that have no usable name information
    if cmn.filexist(log_file):
        print('automatically pick bait by fastq similarity')
        fsp = 'restricted_genus.info'
        if useGenus and (not cmn.filexist(fsp)):
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist %s' % genus
        else:
            cmd = '/archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/auto_rebait.py fqlist '
        cmn.run(cmd)
        good_names = cmn.file2lines('picked_bait.txt')
        cmn.write_file('pickClosed\n', log_file)

    # try to see if type species is there
    starred = [each for each in good_names if each[0] == '*']
    if not starred:
        starred = [each for each in good_names if '*' in each]
    if starred:
        good_names = starred

    # among what is left, take the one with the longest sequence
    chosen = max(good_names, key=lambda each: len(seqDict[each].replace('N', '-')))
    result = as_query(chosen)
    print('pick %s for %s %s' % (chosen, genus, sp))
    return result
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Prune the tree file down to the IDs present in a FASTA alignment and write
# the result to <tree name without .tre>_prune.tre
if __name__ == '__main__':
    #options=parse_options()
    if len(sys.argv) != 3:
        print("Usage: *.py ftree fa", file=sys.stderr)
        sys.exit()
    ftree, faln = sys.argv[1:]

    tree = ete3.Tree(ftree)

    # every FASTA header (text after '>') is an ID to keep
    takenIDs = []
    with open(faln) as fp:
        for line in fp:
            if line.startswith('>'):
                takenIDs.append(line[1:].strip())

    tree.prune(takenIDs)

    dn = cmn.lastName(ftree).replace('.tre', '') + '_prune.tre'
    cmn.write_file(tree.write(), dn)
info = info.replace('[WL_sam_filelist]', dn) info = info.replace('[WL_preprocessing]', '\n'.join(step1cmds)) #make snp call cmds #f_sam = merge_sams(sp, fsams) info = info.replace('5328', sp) info = info.replace('[WL_cwd]', os.getcwd()) info2 = template2.replace('assembly_selfref', asslabel) info2 = info2.replace('5328', sp) info2 = info2.replace('[WL_cwd]', os.getcwd()) os.chdir('..') fjob = 'job_files/s1_%s.job' % sp cmn.write_file(info, fjob) cmn.run('cd job_files; sbatch s1_%s.job' % sp) if sp not in step1_finished: step1_jobs.append(fjob) fjob = 'job_files/s2_%s.job' % sp cmn.write_file(info2, fjob) step2_jobs.append(fjob) #cmn.run('cd job_files; sbatch sg%s.job' % sp) info = ['bash %s\n' % each for each in step1_jobs] cmn.write_file(''.join(info), 'step1todo.cmds') info = ['sbatch %s\n' % each for each in step2_jobs]
except: print("Usage: *.py RAxML_bestTree.noGap", file=sys.stderr) sys.exit() nameDict = get_names() #t = ete3.Tree(cmn.txt_read(fn).replace('[&U]', '')) appear = {} table = [] lines = cmn.file2lines(fn) info = [] for line in lines: if line[0] == '>': sp = line[1:].split('_')[0] print(sp) try: newline = '>' + nameDict[sp].replace('\t', '_').replace( ' ', '_') except: newline = line info.append(newline) else: info.append(line) info.append('') info = '\n'.join(info) dn = fn + '.renamed' cmn.write_file(info, dn) cmn.write_file(''.join(table), dn + '.nameTable')
seq.append(char2) else: #different characters if char1 == '-' and char2 == '-': seq.append(char3.lower()) elif char1 == '-': seq.append(char2.lower()) elif char2 == '-': seq.append(char1.lower()) else: #different chars and not a gap seq.append('X') fasta = '>%s\n%s\n' % (Id, ''.join(seq)) refBaseDict[Id] = ''.join(seq) new.append(fasta) cmn.write_file(''.join(new), 'sum_barcodes.fa') #cmn.run('rm -r sampleRun_fake') #check denovo pipeline one fns = cmn.cmd2lines('ls sampleRun_*/denovo_barcode.fa') denovoDict = {} new = [] for fn in fns: Id = cmn.find_between(fn, 'sampleRun_', '/') lines = cmn.file2lines(fn) seq = ''.join(lines[1:]) if seq > 658: tmp = seq.replace('N', '') if len(tmp) == 658: seq = tmp
# For every selected (scaffold, start, end) window, append the window's
# characters - minus the positions listed in repdict for that scaffold - to
# the per-sample sequence in `final`, then write everything as FASTA.
for gene in selected:
    scaf, i, j = gene
    fa = '../introgression/0_process_scaf/scaf2_fastas/%s.fa' % scaf
    seqDict = read_fa(fa)
    # one placeholder per argument; the previous format string had a single
    # placeholder for three values and raised TypeError
    print('parsing %s (%s, %s)' % (fa, i, j))
    try:
        exclusion = repdict[scaf]
    except KeyError:
        # no excluded positions recorded for this scaffold
        exclusion = set()

    for name in seqDict:
        window = seqDict[name][i:j]
        # exclusion holds scaffold coordinates, so count from i
        seq = [char for index, char in enumerate(window, i)
               if index not in exclusion]
        try:
            final[name] += seq
        except KeyError:
            final[name] = seq

dn = 'sampled_seq_t%s.fa' % times
fasta = ['>%s\n%s\n' % (name, ''.join(final[name])) for name in final]
cmn.write_file(''.join(fasta), dn)
cov_dict = {} #get coverage first for exon in stack_dict: cov_dict[exon] = {} for sp in stack_dict[exon]: stacks = stack_dict[exon][sp] Nlist = [len(stacks[key]) for key in stacks] cov = float(sum(Nlist)) / len(Nlist) cov_dict[exon][sp] = cov cov_info = [ '%s\t%s\t%s\n' % (sp, exon, cov_dict[exon][sp]) for exon in cov_dict for sp in cov_dict[exon] ] cmn.write_file(''.join(cov_info), 'cov_info.txt') for exon in stack_dict: length = exon_lengths[exon] spDict = stack_dict[exon] for sp in spDict: stacks = spDict[sp] newSeq = [[], []] cov = cov_dict[exon][sp] for each in range(length): p = each + 1 print(p, exon, sp) try:
defline = lines[0] seq = ''.join(lines[1:]) adict[defline] = seq return adict #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #fn = 'all_genomes_noGap.fa' #fn = 'all_genomes_charGap.fa' try: fn = sys.argv[1] except: print('*.py all_genomes_charGap.fa ') sys.exit() adict = read_fa(fn) fnlabel = cmn.lastName(fn).replace('.fa', '') outdir = 'splitS_%s' % fnlabel cmn.mkdir(outdir) for i, key in enumerate(adict): seq = adict[key] fasta = '>%s\n%s\n' % (key, seq) dn = '%s/%s_%s.fa' % (outdir, fnlabel, i) cmn.write_file(fasta, dn)
#fall = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/all_barcodes.fasta'
seqDict = read_fa(fall)

#fadd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/data/barcodes/addedBaits_fromPipeline.fa'
#if cmn.filexist(fadd):
#    seqDict.update(read_fa(fadd))

#ftable = '/archive/biophysics/Nick_lab/wli/archive/barcodes/auto_tables/verified_barcodes.fa'
#seqDict.update(read_autoTable(ftable))

# For every "sample genus [species]" line of fn: choose a query barcode,
# blast it against the barcode set, pick baits from the hits, and collect
# the formatted bait lines into <fn name>.baits
info = []
for line in cmn.file2lines(fn):
    # e.g. "5077 Autochton zarex"; '?' is treated as a separator
    items = line.strip().replace('?', ' ').split()
    if len(items) >= 3:
        sample, genus, sp = items[:3]
    else:
        # no species given: fall back to the placeholder 'sp'
        sample, genus = items[:2]
        sp = 'sp'

    sp = sp.split('-')[0].split('_')[0]
    genus = parse_name(genus)

    query_sequence, qlen = get_query_sequence(seqDict, genus, sp)
    br_result = do_barcode_blast(query_sequence, seqDict)
    print('\n'.join(br_result))

    baits = pick_barcode_baits(br_result, qlen, seqDict)
    info += format_baits(sample, baits)

dn = cmn.lastName(fn) + '.baits'
cmn.write_file(''.join(info), dn)
taken += cmn.cmd2lines(cmd) taken += [each for each in alea_list if cmn.lastName(each).split('_')[0] == ID] if len(words) != 0: taken = [each for each in taken if all([word in each for word in words])] faDict[ID] = taken #print taken all_fa = sum(list(faDict.values()), []) ass_count = count_ass_appearance(all_fa) best_ass = max(list(ass_count.keys()), key=lambda x: ass_count[x]) print('the most common assembly is %s, only take fa mapped to this assembly' % best_ass) cmn.write_file(best_ass, 'best_assembly.txt') sys.exit() for ID in faDict: alist = faDict[ID] taken = [each for each in alist if best_ass in each.replace('_withMito', '')] #print ID, taken if best_ass == 'cne' and len(taken) == 0: taken += [each for each in alist if '3574_assembly_v1' in each] if best_ass == '3574_assembly_v1' and len(taken) == 0: taken += [each for each in alist
good_reads.append(name1) elif (misM2 + 1) >= misM1: #bad one has more mismatch, good one is good! #good one can be 1 bp more than the bad one good_reads.append(name1) else: if identity >= identity_cut: good_reads.append(name1) else: bad_reads.append((name2, aln1)) print('further classify overlapping reads into:') print('%s good reads' % len(good_reads)) print('%s bad reads' % len(bad_reads)) #sp2 = name2sp(name2) #add back the previous IDs good_reads.append('#' * 100) for ID in good_IDs: name = ID1mapping[ID] good_reads.append(name) cmn.write_lines(good_reads, 'good_reads.txt') #bad_reads.append('#' * 100) for ID in bad_IDs: name = ID2mapping[ID] bad_reads.append((name, seqDict2[name])) bad_alignments = ['%s %s\n' % (each[0], each[1]) for each in bad_reads] cmn.write_file(''.join(bad_alignments), 'bad_reads_alignment.txt')
print("Usage: *.py fa Ncores", file=sys.stderr) sys.exit() #if the nodes are less than 4 taxa, produce a random tree cmd = "grep '>' %s" % (fn) lines = [ each[1:].strip() for each in cmn.cmd2lines(cmd) if each.strip() != '' ] N = len(lines) if N < 4: print('Warning: fastme can not make tree of less than 4 taxa') print('Warning: so I make a fake tree...') dn = '%s.phylip.fastme.tre' % cmn.lastName(fn) if N == 1: info = '(%s);\n' % lines[0] if N == 2: a, b = lines info = '(%s,%s);\n' % (a, b) elif N == 3: a, b, c = lines info = '((%s,%s),%s);\n' % (a, b, c) cmn.write_file(info, dn) sys.exit() label = cmn.lastName(fn) cmd = 'rm RAxML_*.%s;' % label cmd += '/home2/wli/local/RAxML/raxmlHPC-PTHREADS-SSE3 -m GTRGAMMA -p 7112 -T %s -s %s -n %s' % ( Ncores, label, label) cmn.run(cmd)
except: print('usage: *.py fsam fass', file=sys.stderr) sys.exit() cmd = 'module add samtools; samtools faidx %s' % fass cmn.run(cmd) cmd = 'module add picard/1.117; java -jar $PICARD/CreateSequenceDictionary.jar R=%s O=%s.dict' % ( fass, fass[:-3]) cmn.run(cmd) template = cmn.txt_read( '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/template_gatk_bias_fromSam.job' ) template = template.replace('[WL_ref]', fass) template = template.replace('[INPUT.sam]', fsam) sampleId = cmn.lastName(fsam).replace('highQ_', '').split('_')[0] dnlabel = '%s_%s' % (cmn.lastName(fsam).replace( '.sam', ''), cmn.lastName(fass).replace('.fa', '')) cmn.mkdir(dnlabel) os.chdir(dnlabel) cwd = os.getcwd() pre_cmds = 'cd %s\n' % cwd template = template.replace('5642', sampleId) template = template.replace('[WL_preprossing]', pre_cmds) cmn.write_file(template, 'gatk%s.job' % sampleId) #cmn.run('sbatch gatk%s.job' % sampleId)