def compute_fileSize(alist):
    """Return the accumulated size of every file named in ``alist``.

    Paths containing 'archive/butterfly' are sized by running filesize.py
    through ssh and parsing the command output as an integer.  Every other
    path is sized with ``cmn.filesize`` and divided by 1024 twice
    (presumably bytes -> MB; confirm against cmn.filesize).

    An empty ``alist`` yields 0.
    """
    total = 0
    for path in alist:
        if 'archive/butterfly' not in path:
            # local file: ask the helper library directly
            total += cmn.filesize(path) / 1024 / 1024
            continue
        # remote file: the size is reported by a helper script over ssh
        remote_cmd = 'ssh [email protected] "python /home/wenlin/my_programs/filesize.py %s"' % path
        total += int(cmn.cmd2info(remote_cmd).strip())
    return total
def update_SRNP_species(line2):
    """Build an updated 'species|SRNPnumber' name for ``line2``.

    The SRNP number is extracted with ``find_SRNPnumber`` and passed to the
    external updateSRNPnumber.py script (the command is printed first).
    When the script prints nothing, ``line2`` is returned unchanged;
    otherwise the whitespace-separated words of its output are joined with
    '_' and suffixed with '|<srnp>'.
    """
    srnp_id = find_SRNPnumber(line2)
    lookup_cmd = '/home2/wli/anaconda/bin/python /archive/biophysics/Nick_lab/wli/project/sequencing/scripts/barcode_scripts/updateSRNPnumber.py %s' % srnp_id
    print(lookup_cmd)
    species = cmn.cmd2info(lookup_cmd).strip()
    if species == '':
        # nothing known for this number: keep the original name
        return line2
    return '%s|%s' % ('_'.join(species.split()), srnp_id)
def remove_duplication(alist):
    """Split ``alist`` into unique files and duplicates, keyed by line count.

    Each file is measured with ``wc -l``.  The first file seen for a given
    line count is kept; any later file with the same count is reported as a
    duplicate (file contents are not compared).

    Returns a tuple ``(kept_files, duplicate_files)``.
    """
    first_by_count = {}
    duplicates = []
    for path in alist:
        wc_output = cmn.cmd2info('wc -l %s' % path)
        n_lines = int(wc_output.strip().split()[0])
        if n_lines not in first_by_count:
            first_by_count[n_lines] = path
        else:
            duplicates.append(path)
    return list(first_by_count.values()), duplicates
def old_find_reference(fn):
    """Return the reference label recorded in the bwa @PG header line of ``fn``.

    The header is read with ``samtools view -H`` and filtered to @PG lines
    mentioning bwa.  The token that follows the first '-M' token is taken
    as the reference; a trailing '.fa' is stripped.  The result is printed
    and returned.

    Raises:
        ValueError: if no '-M' token followed by another token is found
            (previously this surfaced as an UnboundLocalError or
            IndexError with no hint about the offending file).
    """
    cmd = 'samtools view -H %s| grep "@PG"| grep bwa' % fn
    items = cmn.cmd2info(cmd).strip().split()

    ref = None
    # stop one short of the end so that items[i + 1] always exists
    for i, item in enumerate(items[:-1]):
        if item == '-M':
            ref = items[i + 1]
            break
    if ref is None:
        raise ValueError(
            'can not find a reference after "-M" in the bwa @PG header of %s' % fn)

    if ref[-3:] == '.fa':
        ref = ref[:-3]
    print('found ref: %s' % ref)
    return ref
def check_difference(seq1, seq2):
    """Count the differences between two sequences.

    Both lengths are printed first.  For equal-length inputs the result is
    the number of positions whose characters differ, ignoring every
    position where either character is in the module-level ``gapChars``.

    For unequal lengths both sequences are written to temporary files and
    compared with blastn; the 'Identities = a/b' field of the report gives
    the result ``b - a``.  The raw report is saved to 'checkTmp<ID>.br',
    where ``ID`` is a module-level name.
    """
    len1, len2 = len(seq1), len(seq2)
    print(len1, len2)

    if len1 == len2:
        mismatches = 0
        for left, right in zip(seq1, seq2):
            if left in gapChars or right in gapChars:
                continue
            if left != right:
                mismatches += 1
        return mismatches

    cmn.write_file(seq1, 'tmpSeq1.fa')
    cmn.write_file(seq2, 'tmpSeq2.fa')
    blast_report = cmn.cmd2info('blastn -query tmpSeq1.fa -subject tmpSeq2.fa')
    # the report contains a line such as: Identities = 656/656 (100%)
    fraction = cmn.find_between(blast_report, 'Identities = ', ' (')
    n_identical, n_total = [int(part) for part in fraction.split('/')]
    cmn.write_file(blast_report, 'checkTmp%s.br' % (ID))
    return n_total - n_identical
def read_barcode_inWdir(sampleID):
    """Collect the barcode sequences found in the current working directory.

    Always includes '<sampleID>_threaded': characters 20..677 of the second
    whitespace-separated field of the 'thread' line grepped from
    rescued_read_assembled_mis1*.txt.  Two more entries are added on a
    best-effort basis:

    * '<sampleID>_denovo' - second line of denovo_barcode.fa
    * '<sampleID>_prot'   - the ``sampleID`` record of
      ../all_protBarcodes_complete.fa (read with ``read_fa``)

    Returns ``(barcode_dict, list_of_keys)``.
    """
    grep_output = cmn.cmd2info('grep thread rescued_read_assembled_mis1*.txt')
    refbased = grep_output.strip().split()[1]

    adict = {}
    adict['%s_threaded' % sampleID] = refbased[20:678]

    # NOTE: the bare excepts are intentional best-effort; any failure just
    # leaves the optional entry out.
    try:
        adict['%s_denovo' % sampleID] = cmn.file2lines('denovo_barcode.fa')[1]
    except:
        pass

    try:
        prot_records = read_fa('../all_protBarcodes_complete.fa')
        adict['%s_prot' % sampleID] = prot_records[sampleID]
    except:
        pass

    return adict, list(adict.keys())
sys.path.append(python_lib)
import cmn
import os
from fullname_lib import get_names_4barcode

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# For every sampleRun_<ID> directory (those matching 'fake' are skipped),
# check that the first two lines of good_read_assembled.txt mention the
# genus taken from the sample's name entry; report samples that do not.
if __name__ == '__main__':
    #infoLines = cmn.cmd2lines('head -n 1 sampleRun*/rescued_read_assembled_mis1*.txt')
    # second '_'-separated field of each directory name
    IDlist = cmn.cmd2lines("ls -d sampleRun_* |grep -v fake|cut -d '_' -f 2")
    nameDict = get_names_4barcode()
    for ID in IDlist:
        # drop '?' marks, then split the name entry on whitespace
        items = nameDict[ID].replace('?', '').split()
        # NOTE(review): this rebinds the loop variable ID to the first field
        # of the name entry - presumably the same value; confirm.
        ID, genus, sp = items[:3]
        print('sampleInfo', ID, genus, sp)
        fn = 'sampleRun_%s/good_read_assembled.txt' % ID
        #label = '%s_%s' % (genus, sp)
        cmd = 'head %s -n 2| grep %s' % (fn, genus)
        print(cmd)
        info = cmn.cmd2info(cmd).strip()
        # empty grep output: the genus is not in the first two lines
        if info == '':
            print('please re-run', ID, genus, sp)
def get_current_jobs(label, user):
    """Return how many ``squeue`` lines match both ``user`` and 'g<label>'.

    The count is the first field printed by ``wc -l`` at the end of the
    grep pipeline, converted to int.
    """
    count_cmd = 'squeue| grep %s| grep g%s|wc -l' % (user, label)
    first_field = cmn.cmd2info(count_cmd).split()[0]
    return int(first_field)
import time


def get_current_jobs(label, user):
    """Return how many ``squeue`` lines match both ``user`` and 'g<label>'."""
    cmd = 'squeue| grep %s| grep g%s|wc -l' % (user, label)
    # first field of the `wc -l` output is the line count
    N = cmn.cmd2info(cmd).split()[0]
    N = int(N)
    return N


# Submit the job files listed in forked_jobs.list while keeping the number
# of matching queue entries below the limit given as the first argument.
fn = 'forked_jobs.list'
jobs = cmn.getid(fn)
# maximum number of matching queue entries allowed at once
cores = int(sys.argv[1])
user = cmn.cmd2info('echo $USER').strip()
# first letter of the user name; get_current_jobs greps for 'g' + this letter
user_label = user[0]
currentN = get_current_jobs(user_label, user)
os.chdir('job_files')
todo = list(jobs)
# NOTE(review): the loop body continues beyond this excerpt; nothing shown
# here removes entries from todo or runs the sbatch command.
while(len(todo) != 0):
    fjob = todo[0]
    # re-count on every pass so that finished jobs free up slots
    currentN = get_current_jobs(user_label, user)
    print(currentN)
    if currentN < cores:
        #submit
        cmd = 'sbatch %s' % fjob
if python_lib not in sys.path:
    sys.path.append(python_lib)
import cmn

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#main
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Print one summary line for a VCF file:
#   name, total lines, lines containing HaplotypeScore, how many of those
#   contain LowQual, HaplotypeScore/total ratio, LowQual/HaplotypeScore ratio
if __name__ == '__main__':
    #options=parse_options()
    try:
        fn = sys.argv[1]
    except IndexError:
        # only a missing argument is expected here
        print("Usage: *.py vcf", file=sys.stderr)
        sys.exit()

    total = cmn.cmd2info('wc -l %s' % fn).split()[0]
    # the HaplotypeScore lines are kept in <fn>.tmp so that the LowQual
    # count below can reuse them; the temp file is removed afterwards
    SNPs = cmn.cmd2info('grep HaplotypeScore %s > %s.tmp; wc -l %s.tmp' % (fn, fn, fn)).split()[0]
    lowqual = cmn.cmd2info('grep LowQual %s.tmp|wc -l ; rm %s.tmp' % (fn, fn)).split()[0]

    # an empty file or a file without HaplotypeScore lines would otherwise
    # abort with ZeroDivisionError before anything is reported
    if float(total) != 0:
        snp_ratio = int(SNPs) / float(total)
    else:
        snp_ratio = 'NA'
    if float(SNPs) != 0:
        lowqual_ratio = int(lowqual) / float(SNPs)
    else:
        lowqual_ratio = 'NA'

    print(cmn.lastName(fn), total, SNPs, lowqual, snp_ratio, lowqual_ratio)
def check_fastqlines(fn):
    """Return the line count that ``wc -l`` reports for ``fn``, as an int."""
    wc_fields = cmn.cmd2info('wc -l %s' % fn).strip().split()
    return int(wc_fields[0])
file=sys.stderr) sys.exit() import cmn argvs = cmd.split() info = cmn.txt_read(argvs[0]) if "__name__=='__main__'" not in info: print("program doesn't contain the line: __name__=='__main__'", file=sys.stderr) print("exit! do nothing", file=sys.stderr) sys.exit() #reformat to make it workable for profiler info = reformat(info, argvs[1:]) dn = 'profile_%s' % argvs[0] cmn.write_file(info, dn) report = cmn.cmd2info('python %s' % dn) dn = '%s_report' % argvs[0] cmn.write_file(report, dn) print('results in %s' % dn) dn2 = "%s_sorted" % dn cmd = 'cat %s| sort -r -nk4 > %s' % (dn, dn2) os.system(cmd) print('sorted result by the accumuated time is in %s' % dn2)
# NOTE(review): the original indentation of this section was lost; the
# nesting below is reconstructed - confirm where 'ready' is assigned.
readSizeStr = format_readSize(readSize)
print(readSizeStr)

#2. step2, get the reference and its length
# the vcf name is expected to look like <sp>_<reflabel>_snp_step2.vcf
vcf_label = cmn.lastName(vcf_fn).replace('_snp_step2.vcf', '')
items = vcf_label.split('_')
# text before the first '_' is the species, the remainder is the reference
sp = items[0]
reflabel = '_'.join(items[1:])
ref_length = get_ref_length(reflabel)
print(ref_length)

#3. get percentage of mapping
#../../step2_bwa_mapping
#TODO: if sam data available, recompute it
cmd = 'cat %s/mapped_reads_count/*| grep %s| grep %s' % (samdir, sp, reflabel)
info = cmn.cmd2info(cmd)
items = info.strip().split()
if len(items) == 0:
    # no count line mentions both the species and the reference
    print('Error! can not find map percentage for %s %s' % (sp, reflabel))
    mapPercentage = 'NA'
else:
    if len(items) == 4:
        #old format, ignore the mapN
        mapPercentage = 'oldstat'
        mapN, totalN = list(map(int, items[2:4]))
    else:
        # otherwise the last four fields are parsed as floats
        mapN, totalN, halfN, pPercent = list(map(float, items[-4:]))
        #mapPercentage = float(mapN) / totalN
        mapPercentage = 'ready'
print(mapPercentage)
import os

# Compare the last line of every <wdir>/<label>/*.vcf with the last line of
# the matching <reflabel>_scafLength.txt and record the labels whose first
# two fields disagree in bad_vcf.list.
wdir = os.path.abspath(sys.argv[1].rstrip('/'))
fvcfs = cmn.cmd2lines('ls %s/*/*.vcf' % wdir)
refdir = '/work/biophysics/mtang/SNP_calling/indexed_references'
badones = []
for fvcf in fvcfs:
    # the name of the directory holding the vcf is the label
    label = fvcf.split('/')[-2]
    # everything after the first '_' of the label names the reference
    reflabel = '_'.join(label.split('_')[1:])
    finfo = '%s/%s_scafLength.txt' % (refdir, reflabel)
    if not os.path.exists(finfo):
        # presumably this script creates the missing scafLength file - confirm
        cmd = '/work/biophysics/mtang/SNP_calling/scripts/assembly_scaf_length.py %s/%s.fa ' % (
            refdir, reflabel)
        cmn.run(cmd)
    infoline = cmn.cmd2info('tail -n 1 %s' % finfo).strip()
    Cscaf, Cindex = infoline.split()[:2]
    checkline = cmn.cmd2info('tail -n 1 %s' % fvcf).strip()
    scaf, index = checkline.split()[:2]
    # the first two fields of both last lines must be identical
    if scaf != Cscaf or Cindex != index:
        print('Error! problematic vcf file for %s' % label)
        badones.append(label)
dn = 'bad_vcf.list'
cmn.write_lines(badones, dn)
#sps = mapF_dict.keys() #ref_genomes, refmapping = detect_ref_genomes(sps, bwa_dirs) #3. make the length check ref_dir = '/work/biophysics/mtang/SNP_calling/indexed_references' print('validating map files...') refNdict = read_refN(ref_genomes) good_maps = [] bad_maps = [] for sp in refmapping: ref = refmapping[sp] fmaps = mapF_dict[sp] refN = refNdict[ref] mapN = 0 for fmap in fmaps: N = int(cmn.cmd2info('wc -l %s' % fmap).split()[0]) mapN += N if refN != mapN: print('Error! the line of map doesn\'t agree with reference for %s' % sp) print('Nref vs Nmap: %s %s; ref is %s\n' % (refN, mapN, ref)) bad_maps += fmaps else: good_maps += fmaps cmn.write_lines(good_maps, 'good_maps.txt') cmn.write_lines(bad_maps, 'bad_maps.txt')
# NOTE(review): this excerpt starts in the middle of a larger section; the
# try/except below presumably sits inside a loop over sp/refs - confirm.
# Append the reference set to the list kept for sp, creating it on first use.
try:
    requires[sp].append(set(refs))
except KeyError:
    requires[sp] = [set(refs)]

#check if the ref genome exist
#check if the ref is conflict with the one we already have
refdir = '/work/biophysics/mtang/SNP_calling/indexed_references'
for ref in allrefs:
    if not os.path.exists(ref):
        print('reference %s doesn\'t exist! please email to ask!' % ref)
    # a file of the same name in the shared reference directory
    oldref = '%s/%s' % (refdir, cmn.lastName(ref))
    #print oldref, ref
    if os.path.exists(oldref):
        # number of lines printed by diff; 0 means no differences reported
        check = cmn.cmd2info('diff %s %s| wc -l ' % (oldref, ref))
        if int(check) != 0:
            print('new ref is different from old ref! please email to ask!')
            print('old ref: %s' % oldref)
            print('new ref: %s' % ref)

#addon: check fastq to see if anything has been done before
fdone = '/project/biophysics/Nick_lab/mtang/archive/submission_done'
info_wdir = '/project/biophysics/Nick_lab/mtang/archive/step1_info'
fastq_dir = '/project/biophysics/Nick_lab/mtang/archive/fastq_libs'
info_dict = parse_info_file(info_wdir)
done_dict = parse_done_file(fdone)
#current_fastqs = parse_fastq_dir(fastq_dir)
print('***********************************************************')