#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fqlist, fmitolist = sys.argv[1:] except: print("Usage: *.py fqlist refMitoList", file=sys.stderr) sys.exit() #ftemplate = '/work/biophysics/wli/Eudamine/wholeMito_run2/mito_denovo.template' ftemplate = '/project/biophysics/Nick_lab/wli/sequencing/scripts/mito_scripts/mito_refDenovo.template' template = cmn.txt_read(ftemplate) fqlist = cmn.file2lines(fqlist) groupDict = {} for fq in fqlist: fq = os.path.abspath(fq) ID = cmn.lastName(fq).split('_')[0] try: groupDict[ID].append(fq) except KeyError: groupDict[ID] = [fq] fmitolist = os.path.abspath(fmitolist) for sample in groupDict: fqlist = groupDict[sample] wdir = 'mitoRef_%s' % sample cmn.mkdir(wdir)
sys.exit() f_table = '/project/biophysics/Nick_lab/wli/sequencing/scripts/name_table' nameDict = {} for line in cmn.file2lines(f_table): items = line.strip().split() if len(items) == 0: continue label = items[0] name = '_'.join(items) nameDict[label] = name.replace('-', '_') print(list(nameDict.keys())) t = ete3.Tree(cmn.txt_read(fn).replace('[&U]', '')) appear = {} for node in t: name = node.name sp = name.split('_')[0] if sp not in appear: appear[sp] = 1 else: appear[sp] += 1 new_name = '%s_cp%s' % (nameDict[sp], appear[sp]) node.name = new_name info = t.write() print(info)
sys.exit() f_table = '/project/biophysics/Nick_lab/wli/sequencing/scripts/name_table' nameDict = {} for line in cmn.file2lines(f_table): items = line.strip().split() if len(items) == 0: continue label = items[0] name = '_'.join(items[1:]) nameDict[label] = name.replace('-', '_') print(list(nameDict.keys())) info = cmn.txt_read(fn) #hasDash = ('_' in info) hasDash = False for label in nameDict: name = nameDict[label] #data_label = '%s_' % label if label.isdigit(): if hasDash: data_label = '%s_' % label new_name = '%s_%s_' % (label, name) else: data_label = '%s' % label new_name = '%s_%s' % (label, name) else: data_label = '%s' % label
sys.exit() key_cmd = sys.argv[1] node = '1' part = 'super' time_hour = '200' for i, arg in enumerate(sys.argv): if arg == '-n': node = sys.argv[i + 1] elif arg == '-p': part = sys.argv[i + 1] elif arg == '-t': time_hour = sys.argv[i + 1] cwd = os.getcwd() aa = cmn.txt_read('/home2/wli/template/slurm.job') aa = aa.replace('NODE', node) aa = aa.replace('PART', part) aa = aa.replace('TIME_HOUR', time_hour) #aa+="#$ -pe %sway %s\n\n\n" % (cpu, cpu) aa += 'cd %s\n\n' % cwd aa += key_cmd + '\n' print(aa)
#input: path to a Newick tree file (the usage string calls it "bestTree")
#output: the same tree re-serialised by ete3 with format=9, printed to stdout
#algorithm: load the text with cmn.txt_read, parse with ete3.Tree, write back
#author:wenlin; Date:2012-
import sys

python_lib = '/home2/wli/my_programs/python_lib'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
import ete3
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument can fail here; exit non-zero so that calling
        # shells and pipelines can tell the run did nothing.
        print("Usage: *.py bestTree", file=sys.stderr)
        sys.exit(1)

    # ete3.Tree is given the file's text rather than the path
    # (assumes cmn.txt_read returns the file content as a string -- TODO confirm).
    t = ete3.Tree(cmn.txt_read(fn))

    # format=9 is ete3's "leaf names only" Newick flavour.
    print(t.write(format=9))
print('please submit %s to the queue for indexing ' % fjob) else: print('good news! all references have been indexed') isIndexed = True print('###############################################') if not isIndexed: print('**********************************************') print('\nimportant!!!') print('please re-run this script after all references are indexed!\n') print('**********************************************') ############################### #all the steps below would put into the job files template = cmn.txt_read( '/work/biophysics/mtang/SNP_calling/scripts/templates/template_gatk_unbias4TACC.job' ) cmn.mkdir('job_files') fjobs = [] for sp in refdict: print('processing %s' % sp) snp_list = refdict[sp] for samdir, ref in snp_list: #a. make directory olabel = '%s_%s' % (sp, ref) wdir = '%s/%s' % (cwd, olabel) wdir4TACC = '../%s' % olabel cmn.mkdir(wdir)
# NOTE(review): the line below is a whitespace-collapsed fragment and is kept verbatim; it needs its line breaks restored before it can run.
# It contains two pieces:
#   1. load_verified_barcodes(): reads verified_barcodes.fa with read_fa(), passes the sequence names to names2IDs(), and returns (IDmapping, seqDict).
#   2. The start of the __main__ block: it prints a banner when 'compare.check' contains "Error" (the sys.exit() there is commented out, so execution continues),
#      collects replaceIDs from the 'takenD' lines of compare.check that do not contain 'same', reads sum_denovo.fa and sum_barcodes.fa with read_fa(),
#      and opens sum_hybrid.fa for writing. The body of the per-name loop is cut off in this view.
def load_verified_barcodes(): fgood = '/archive/biophysics/Nick_lab/wli/archive/barcodes/auto_tables/verified_barcodes.fa' seqDict = read_fa(fgood) IDmapping = names2IDs(list(seqDict.keys())) return IDmapping, seqDict if __name__=='__main__': fn1 = 'sum_denovo.fa' fn2 = 'sum_barcodes.fa' #fn3 = 'compare.check' if 'Error' in cmn.txt_read('compare.check'): print('##########################################################################') print('Error in running barcode pipeline! please fix lines with "Error" in "compare.check" file!') print('##########################################################################') #sys.exit() replaceIDs = set(cmn.cmd2lines('grep takenD compare.check|grep -v same|cut -f 1')) seqDict1 = read_fa(fn1) seqDict2 = read_fa(fn2) dn = 'sum_hybrid.fa' #newDict = {} with open(dn, 'w') as fp: for name in seqDict2: if name in replaceIDs:
#output the phylip format file seqDict = {ID: ''.join(final[ID]) for ID in final} length = len(seqDict[ID]) new = ['%s\t%s' % (len(seqDict), length)] for name in seqDict: new.append('%s %s' % (name, seqDict[name])) dn = outlabel + '.phylip' cmn.write_lines(new, dn) #write out the partition file ftemplate = '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/partition_finder.cfg.template' fcfg = 'partition_finder.cfg' info = cmn.txt_read(ftemplate) info = info.replace('[input_phylip]', dn) ##Gene3_pos3 = 1452-2208\3; print('assuming all are protein coding genes') blocks = [] for name,i,j in setList: if True: pos = 0 pLabel = '%s_%s' % (name, pos+1) for char in badchars: pLabel = pLabel.replace(char, '_') line = '%s = %s-%s;\n' % (pLabel, i+pos, j) blocks.append(line) info = info.replace('[data_block_input]', ''.join(blocks)) cmn.write_file(info, fcfg)
def add_in_baits(fref):
    """Write and bwa-index a copy of the reference FASTA extended with bait barcodes.

    The text of ``fref`` is read, optionally patched with the indel
    information found in the file ``bait_insertion`` (when that file exists),
    and then extended with every entry of ``sampleInfo.baits`` whose defline
    is not already covered by the reference.

    Each line of ``sampleInfo.baits`` must split into exactly three
    whitespace-separated fields ``sp defline seq``; anything else raises
    ``ValueError`` from the tuple unpacking.

    A bait is skipped when its defline is a case-insensitive substring of any
    reference ID (the text after ``>``) or of any ID already present in the
    add-on list before the loop starts.  Otherwise:

    * a 698-character sequence is taken as is;
    * a 658-character sequence is first passed through ``add_primer``
      (presumably 658 is the bare barcode and 698 the primer-extended
      form -- TODO confirm);
    * any other length prints an error and terminates the program.

    Args:
        fref: path of the reference FASTA file.

    Returns:
        The name of the file written, ``species_barcodes_4mapping_withAddon.fa``
        (relative to the current directory).

    Raises:
        SystemExit: with status 1 when a bait sequence has an unexpected length.
    """
    fbait = 'sampleInfo.baits'
    ref_info = cmn.txt_read(fref)

    if cmn.filexist('bait_insertion'):
        indel_dict = read_indel_info('bait_insertion')
    else:
        indel_dict = {}

    # Loading previously added baits from the customBaits "fadd" table is
    # disabled, so the add-on list always starts out empty.
    add_lines = []

    if len(indel_dict) != 0:
        ref_info = insert_in_ref_info(ref_info, indel_dict)
        add_lines = insert_in_lines(add_lines, indel_dict)

    # Upper-case once, outside the bait loop, for the case-insensitive
    # substring tests below.
    ref_ids_upper = [
        line[1:].upper() for line in ref_info.split('\n')
        if line.startswith('>')
    ]
    # NOTE(review): this is a snapshot taken before the loop; baits appended
    # below are not added to it, so a defline repeated inside sampleInfo.baits
    # is appended more than once (same as the original behaviour).
    added_ids_upper = [line.split()[1].upper() for line in add_lines]

    for line in cmn.file2lines(fbait):
        sp, defline, seq = line.strip().split()
        key = defline.upper()

        if any(key in ref_id for ref_id in ref_ids_upper):
            continue  # already in the reference
        if any(key in added_id for added_id in added_ids_upper):
            continue  # already in the add-on list

        if len(seq) == 698:
            add_lines.append(line)
        elif len(seq) == 658:
            add_lines.append('%s\t%s\t%s' % (sp, defline, add_primer(seq)))
        else:
            print('Error! length of bait barcode is wrong for %s %s' %
                  (sp, defline))
            # Exit non-zero: this is a failure and callers must be able to
            # detect it (the bare sys.exit() used before reported success).
            sys.exit(1)

    # Turn the add-on table into FASTA records and append them to the reference.
    add_fasta = []
    for line in add_lines:
        _, defline, seq = line.strip().split()
        add_fasta.append('>%s\n%s\n' % (defline, seq))
    ref_info += '\n'
    ref_info += ''.join(add_fasta)

    dn = 'species_barcodes_4mapping_withAddon.fa'
    cmn.write_file(ref_info, dn)

    # Index the extended reference for bwa.
    cmd = 'module add bwa; bwa index %s' % dn
    cmn.run(cmd)

    return dn
sys.path.append(python_lib) import cmn #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py falist_file", file=sys.stderr) sys.exit() fns = cmn.getid(fn) template = cmn.txt_read( '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/RAxML_tree.job' ) count = 0 for fn in fns: count += 1 label = cmn.lastName(fn).replace('.fa', '') info = template.replace('[FN]', fn) info = info.replace('[outlabel]', label) dn = 'BTtree%s.job' % count cmn.write_file(info, dn)
cwd = os.getcwd() cmn.mkdir('job_files') cmn.mkdir('step3_gatk') fromPdir = '/'.join(fromDir.split('/')[:-1]) cmn.run('ln -s %s/step2_bwa_mapping' % fromPdir) fjobs = [] #1. copy the directory to current for job in jobs: wdir = job[4:-4] current = '%s/%s' % (fromDir, wdir) cmd = 'cp -r %s step3_gatk' % current print('forking data for %s' % current) cmn.run(cmd) new = '%s/step3_gatk/%s' % (cwd, wdir) user = cmn.cmd2info('echo $USER').strip() user_label = user[0] fjob = '%s/job_files/%s' % (fromDir, job) info = cmn.txt_read(fjob) info = info.replace(fromDir, '%s/step3_gatk' % cwd) fjob = 'job_files/g%s%s.job' % (user_label, wdir) cmn.write_file(info, fjob) fjobs.append(cmn.lastName(fjob)) dn = 'forked_jobs.list' cmn.write_lines(fjobs, dn)
import sys

python_lib = '/home2/wli/my_programs/python_lib'
if python_lib not in sys.path:
    sys.path.append(python_lib)

import cmn
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Print every whitespace-separated word of the input file that equals the
# first '_'-delimited token of some defline in species_barcodes_4mapping.fa.
if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # Only a missing argument can fail here; exit non-zero so that calling
        # shells and pipelines can tell the run did nothing.
        print("Usage: *.py alist", file=sys.stderr)
        sys.exit(1)

    # Deflines of the reference barcode FASTA with the leading '>' cut away.
    all_deflines = cmn.cmd2lines(
        'grep ">" /archive/biophysics/Nick_lab/wli/project/sequencing/scripts/data/barcodes/species_barcodes_4mapping.fa|cut -d ">" -f 2'
    )
    # The text before the first '_' of each defline (the variable name
    # suggests this is the genus).
    all_genus = {each.split('_')[0] for each in all_deflines}

    for word in cmn.txt_read(fn).strip().split():
        if word in all_genus:
            print(word)
#print taken all_fa = sum(list(faDict.values()), []) if len(all_fa) == 0: print('we find nothing... Please ask Wenlin for help') cmn.write_lines(IDs, 'missingMITOonlys') sys.exit() ass_count = count_ass_appearance(all_fa) best_ass = max(list(ass_count.keys()), key=lambda x: ass_count[x]) print( 'the most common assembly is %s, only take fa mapped to this assembly' % best_ass) cmn.write_file(best_ass, 'best_assembly.txt') #cmd = '/project/biophysics/Nick_lab/wli/sequencing/scripts/find_falist_mito_checkOnly.py %s %s' % (fn, ' '.join(words)) #cmn.run(cmd) best_ass = cmn.txt_read('best_assembly.txt').strip() for ID in faDict: alist = faDict[ID] taken = [ each for each in alist if best_ass in each.replace('_withMito', '') ] #print ID, taken if best_ass == 'cne' and len(taken) == 0: taken += [each for each in alist if '3574_assembly_v1' in each] if best_ass == '3574_assembly_v1' and len(taken) == 0: taken += [each for each in alist if 'cne' in each]
def read_genus_info_from_bait(fn):
    """Return a one-entry dict mapping the first token of *fn* to its second.

    The file text is read with ``cmn.txt_read``, stripped and split on
    whitespace; the first two tokens are used as key (ID) and value (genus).
    A file with fewer than two tokens raises ``ValueError`` from the
    unpacking.
    """
    tokens = cmn.txt_read(fn).strip().split()
    sample_id, genus_name = tokens[:2]
    return {sample_id: genus_name}
if __name__ == '__main__': #options=parse_options() try: cmd = sys.argv[1] except: print("Usage: *.py 'seq2ref.py 254780193'", file=sys.stderr) print("the command must contain full python to read it", file=sys.stderr) sys.exit() import cmn argvs = cmd.split() info = cmn.txt_read(argvs[0]) if "__name__=='__main__'" not in info: print("program doesn't contain the line: __name__=='__main__'", file=sys.stderr) print("exit! do nothing", file=sys.stderr) sys.exit() #reformat to make it workable for profiler info = reformat(info, argvs[1:]) dn = 'profile_%s' % argvs[0] cmn.write_file(info, dn) report = cmn.cmd2info('python %s' % dn) dn = '%s_report' % argvs[0]
python_lib = '/work/00412/mtang/sequencing/scripts' if python_lib not in sys.path: sys.path.append(python_lib) import cmn if __name__ == '__main__': import cmn, os, sys k = len(sys.argv) if k == 1: print('usage:make_job.py fn [-n 4 -p 128G -t 24]') sys.exit() fn = sys.argv[1] key_cmd = cmn.txt_read(fn) node = '1' part = 'normal' time_hour = '48' for i, arg in enumerate(sys.argv): if arg == '-n': node = sys.argv[i + 1] elif arg == '-p': part = sys.argv[i + 1] elif arg == '-t': time_hour = sys.argv[i + 1] cwd = os.getcwd() aa = cmn.txt_read('/work/00412/mtang/sequencing/scripts/slurm.job')
# NOTE(review): the line below is a whitespace-collapsed script fragment and is kept verbatim; because it starts with '#', Python currently treats the whole line as a comment.
# What the visible code does: takes a file name from sys.argv[1], loads it with read_fa() (which returns a (seqDict, length) pair), builds one
# '<sequence taxon="NAME">SEQ</sequence>' string per entry of seqDict, substitutes the newline-joined result for '[WLdata]' and cmn.lastName(fn) for '[WLlabel]'
# in beast_template.xml, and writes the result to cmn.lastName(fn) + '.xml'.
# NOTE(review): it cannot be told from the collapsed text whether info.append('') belongs inside the for loop or after it -- confirm against the original file before re-indenting.
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__ == '__main__': #options=parse_options() try: fn = sys.argv[1] except: print("Usage: *.py", file=sys.stderr) sys.exit() seqDict, length = read_fa(fn) template = cmn.txt_read( '/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/beast_template.xml' ) info = [] for name in seqDict: info.append('<sequence taxon="%s">%s</sequence>' % (name, seqDict[name])) info.append('') new = template.replace('[WLdata]', '\n'.join(info)) new = new.replace('[WLlabel]', cmn.lastName(fn)) dn = cmn.lastName(fn) + '.xml' cmn.write_file(new, dn)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fn, ftree = sys.argv[1:3] except: print("Usage: *.py table ftree", file=sys.stderr) sys.exit() #ftree = '/project/biophysics/Nick_lab/wli/sequencing/Eudamine/BEAST_timing/current_tree.newick' t = ete3.Tree(cmn.txt_read(ftree)) order_list = [] nameDict = {} for node in t: name = node.name.split('_')[0].lstrip("'") print(name) if '_cp1' in node.name: order_list.append(name) nameDict[name] = node.name #read in fasta #seqDict = read_fa(fn) table_dict = {} for line in cmn.file2lines(fn):
#main #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ if __name__=='__main__': #options=parse_options() try: fvcf, fbam = list(map(os.path.abspath, sys.argv[1:])) except: print("Usage: *.py *.vcf *.bam", file=sys.stderr) print("Generate the command to phase vcf", file=sys.stderr) sys.exit() template = cmn.txt_read('/project/biophysics/Nick_lab/wli/sequencing/scripts/templates/template_readbackedPhasing.cmds') dnlabel = cmn.lastName(fvcf).replace('.vcf', '') outdir = '%s_wdir' % dnlabel cmn.mkdir(outdir) os.chdir(outdir) cwd = os.getcwd() cmd = 'ln -s %s' % fvcf cmn.run(cmd) fvcf = cmn.lastName(fvcf) cmd = 'ln -s %s' % fbam cmn.run(cmd) fbam = cmn.lastName(fbam)