def tell_running_mode(fn):
    """Pick the tree-building mode from the size of ``fn``.

    The value returned by ``cmn.filesize(fn)`` is divided by 1024 twice
    (bytes -> MiB, assuming ``cmn.filesize`` reports bytes -- TODO confirm).

    Returns:
        'RAxML' when the scaled size is below 25, otherwise 'ExaML'.
    """
    megabytes = float(cmn.filesize(fn)) / 1024 / 1024  # M
    return 'RAxML' if megabytes < 25 else 'ExaML'
def filter_best_fastq(fns):
    """Keep, for every sample ID, the fastq files from a single directory.

    Files are grouped by sample ID, i.e. the part of ``cmn.lastName(fn)``
    before the first underscore. If all files of a sample share one parent
    directory they are all kept. Otherwise only the files of the directory
    whose summed ``cmn.filesize`` is largest are kept; on a tie the
    directory seen first wins.

    Args:
        fns: iterable of '/'-separated file paths.

    Returns:
        A flat list with the selected paths, sample by sample in first-seen
        order.
    """
    # group them by ID
    by_sample = {}
    for fn in fns:
        sample = cmn.lastName(fn).split('_')[0]
        by_sample.setdefault(sample, []).append(fn)

    selected = []
    for sample_fns in by_sample.values():
        # check how many different parent directories this sample has
        by_dir = {}
        for fn in sample_fns:
            parent = fn.rpartition('/')[0]
            by_dir.setdefault(parent, []).append(fn)

        if len(by_dir) == 1:
            selected += sample_fns
            continue

        # Multiple copies of the data: take the directory with the biggest
        # total file size. max() always returns one of the groups, so a
        # sample whose files are all empty no longer ends in
        # ``newlist += None``.
        selected += max(
            by_dir.values(),
            key=lambda group: sum(cmn.filesize(each) for each in group),
        )
    return selected
def compute_fileSize(alist):
    """Sum the sizes of all files in ``alist``.

    Paths containing 'archive/butterfly' are measured by running a helper
    script over ssh and parsing its output as an integer. Every other path
    is measured with ``cmn.filesize`` and divided by 1024 twice.

    NOTE(review): the remote script presumably reports the same unit as the
    scaled local value -- confirm against filesize.py.

    Returns:
        The accumulated size of all entries.
    """
    total = 0
    for path in alist:
        if 'archive/butterfly' not in path:
            # local file: measure directly and scale
            total += cmn.filesize(path) / 1024 / 1024
            continue
        # archived file: ask the remote host for its size
        remote_cmd = 'ssh [email protected] "python /home/wenlin/my_programs/filesize.py %s"' % path
        total += int(cmn.cmd2info(remote_cmd).strip())
    return total
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Build one fasta2fastmeTree.py command per file found in the directory
# given as the first command-line argument.
wdir = os.path.abspath(sys.argv[1].rstrip('/'))
project = cmn.lastName(os.getcwd()).split('_')[0]

# list the input files through the shell and read the listing back
f_list = 'falist'
cmd = 'ls %s/* > %s' % (wdir, f_list)
cmn.run(cmd)
falist = [os.path.abspath(fn) for fn in cmn.file2lines(f_list)]
if not falist:
    # falist[0] below would otherwise fail with an unhelpful IndexError
    sys.exit('no input files found in %s' % wdir)

# Size the job from the first input file. Floor division keeps Njob and
# Ncores integral: with true division they become floats and a value such
# as 1.44 would be passed on the command line as the core count.
Njob = 3
fa_size = int(cmn.filesize(falist[0])) // 1024 // 1024
Njob = max(Njob, 50 * fa_size // 5000 + 1)
Ncores = 48 * Njob // 100
print('number of cores:', Ncores)
print('number of jobs:', Njob)

cmds = []
outdir = 'making_fastme_trees'
cmn.mkdir(outdir)
for fa in falist:
    cmd = 'cd %s; python /project/biophysics/Nick_lab/wli/sequencing/scripts/fasta2fastmeTree.py %s %s' % (
        outdir, fa, Ncores)
    cmds.append(cmd)
except: print("Usage: *.py", file=sys.stderr) sys.exit() geneRange = read_gene_range(frange) seqDict, order_list = read_fa(fn) stat = [] outdir = '%s_gene_fasta' % cmn.lastName(fn) cmn.mkdir(outdir) for gene in geneRange: i, j = geneRange[gene] print(gene, i, j) stat.append('%s\t%s\n' % (gene, j - i)) dn = '%s/%s.fa' % (outdir, gene) with open(dn, 'w') as dp: for name in order_list: seq = seqDict[name][i:j] if seq.strip('-').strip('N') == '': continue fasta = '>%s\n%s\n' % (name, seq) dp.write(fasta) if cmn.filesize(dn) == 0: print('fileSize0', dn) dn = cmn.lastName(fn) + '_takenRange.info' cmn.write_file(''.join(stat), dn)
todo_jobs = [] for sp in rdict: records = rdict[sp] #print 'processing lib %s' % sp for record in records: fastq, ref = record reflabel = cmn.lastName(ref).replace('.fa', '') outlabel = '%s_%s' % (sp, reflabel) outdir = '%s/%s/%s' % (cwd, sp, reflabel) tmpcheck = cmn.cmd2lines(('ls %s/*sam 2> /dev/null' % outdir)) if len(tmpcheck) > 0: total = 0 for fn in tmpcheck: total += cmn.filesize(fn) if total != 0: print('skip finished mapping %s' % outdir) continue cmn.mkdir(outdir) os.chdir(outdir) #paired is a dict paired, unpaired = separate_by_pair(fastq.split(',')) cmd = 'cd %s;\n' % (refdir) for key in paired: lib1, lib2 = paired[key] cmd += '/home2/wli/local/bwa-0.7.12/bwa mem -t 32 -M %s %s %s > %s/%s_paired.sam;\n' % (