def Batch2JPG(args):
    '''
    %prog Batch2JPG in_dir out_dir

    apply toJPG on a large number of images
    '''
    p = OptionParser(Batch2JPG.__doc__)
    p.add_option('--pattern', default='*.png',
                 help="file pattern of png files under the 'dir_in'")
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=Batch2JPG.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = in_dir_path.glob(opts.pattern)
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\\ ')
        cmd = "python -m schnablelab.ImageProcessing.base toJPG "\
              f"{img_fn} --out_dir {out_dir}"
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print('check %s for all the commands!' % cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
def keras_cnn(args):
    """
    %prog train_dir val_dir num_category model_name_prefix

    Run vgg model
    """
    p = OptionParser(keras_cnn.__doc__)
    p.add_option('--epoch', default=500, help='number of epochs')
    p.add_option('--lr_n', default=1, type='int',
                 help='train model with different learning rates. if n=1: set lr to 0.001. '
                      'if n>1: try different lr from 1e-2 to 1e-5 n times')
    p.set_slurm_opts(gpu=True)
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    train_dir, val_dir, numC, mnp = args  # mnp: model name prefix
    out_fns = fns(mnp, n=opts.lr_n)
    for i in range(int(opts.lr_n)):
        cmd = 'python -m schnablelab.CNN.keras_vgg %s %s %s %s %s %s' % (
            train_dir, val_dir, numC, out_fns.lrs[i], opts.epoch,
            out_fns.model_name[i])
        SlurmHeader = Slurm_gpu_header % (
            opts.time, opts.memory, out_fns.model_name[i],
            out_fns.model_name[i], out_fns.model_name[i])
        SlurmHeader += 'module load anaconda\nsource activate MCY\n'
        SlurmHeader += cmd
        with open('%s.slurm' % out_fns.model_name[i], 'w') as f:
            f.write(SlurmHeader)
        print('slurm file %s.slurm has been created, you can sbatch your job file.'
              % out_fns.model_name[i])
def gentesting(args):
    """
    %prog source_imgs_dir source_imgs_csv training_imgs_csv testing_imgs_per_cls output_dir

    create the balanced testing dataset for each class
    """
    p = OptionParser(gentesting.__doc__)
    p.add_option('--header', default=None,
                 help='specify if the source csv file has a header')
    p.add_option('--comma_sep', default=True,
                 help='specify if the csv file is comma separated')
    p.add_option('--groupby_col', default=1,
                 help='specify the groupby column. 0: 1st column; 1: 2nd column')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    source_dir, source_csv, training_csv, ipc, testing_dir = args
    # ipc: number of images per class.

    # read the source csv file
    if opts.header and opts.comma_sep:  # without header with ,
        df0 = pd.read_csv(source_csv, header=None)
    elif (not opts.header) and opts.comma_sep:  # with header with ,
        df0 = pd.read_csv(source_csv)
    elif not (opts.header and opts.comma_sep):  # with header with tab/space
        df0 = pd.read_csv(source_csv, delim_whitespace=True)
    else:
        print('keke... implement this option first!')
    print('shape of source csv %s: %s' % (source_csv, df0.shape))
def hmp2MVP(args):
    """
    %prog hmp2MVP hmp MVP_prefix

    Convert hmp genotypic data to MVP datasets (*.numeric and *.map).
    """
    p = OptionParser(hmp2MVP.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmp, mvp_pre = args
    f1 = open(hmp)
    f1.readline()
    f2 = open(mvp_pre + '.numeric', 'w')
    f3 = open(mvp_pre + '.map', 'w')
    f3.write('SNP\tChrom\tBP\n')
    for i in f1:
        j = i.split()
        rs = j[0]
        ref, alt = j[1].split('/')[0], j[1].split('/')[1]
        newNUMs = judge(ref, alt, j[11:])
        newline = '\t'.join(newNUMs) + '\n'
        f2.write(newline)
        chro, pos = j[2], j[3]
        f3.write('%s\t%s\t%s\n' % (rs, chro, pos))
    f1.close()
    f2.close()
    f3.close()
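# NOTE: the `judge` helper called by hmp2MVP above (and by hmp2bimbam further
# down) is defined elsewhere in this module. The sketch below is only a
# hypothetical illustration of the expected behavior -- mapping hmp genotype
# calls to numeric codes relative to the ref/alt alleles -- not the actual
# implementation.
def _judge_sketch(ref, alt, genotype_calls):
    """Hypothetical sketch: encode hom-ref as 0, het/missing as 1, hom-alt as 2."""
    hom_ref, hom_alt = ref + ref, alt + alt
    codes = []
    for call in genotype_calls:
        if call == hom_ref:
            codes.append('0')
        elif call == hom_alt:
            codes.append('2')
        else:
            codes.append('1')  # heterozygous or missing ('NN') in this sketch
    return codes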
def cpu(args):
    """
    %prog

    request a cpu node from hcc.
    """
    p = OptionParser(cpu.__doc__)
    p.add_option("--partition", default="jclarke", choices=('batch', 'jclarke'),
                 help="which partition? [default: %default]")
    p.add_option("--memory", default="10240",
                 help="specify how much memory [default: %default]")
    p.add_option("--time", default='20',
                 help="specify the time (hour) [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) == 0:
        print('add --help to see options.\n')
        cmd = 'srun --partition=%s --mem-per-cpu=%s --ntasks-per-node=6 --nodes=1 --time=%s:0:0 --pty $SHELL\n' % (
            opts.partition, opts.memory, opts.time)
        print(cmd)
        # call(cmd, shell=True)
    else:
        sys.exit(not p.print_help())
def merge_files(args):
    """
    %prog merge_files pattern out_fn

    combine split vcf files into a single one.
    Pattern example: 'hmp321_agpv4_chr9.%s.beagle.vcf'
    revise the lambda function to fit your file patterns
    """
    p = OptionParser(merge_files.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pattern, out_fn, = args

    fns = [str(i) for i in list(Path('.').glob(pattern))]
    fns_sorted = sorted(fns, key=lambda x: int(x.split('.')[0][3:]))
    print(fns_sorted)
    print('%s files found!' % len(fns_sorted))

    f = open(out_fn, 'w')
    print(fns_sorted[0])
    with open(fns_sorted[0]) as f1:
        for i in f1:
            f.write(i)
    for i in fns_sorted[1:]:
        print(i)
        with open(i) as f2:
            for j in f2:
                if not j.startswith('#'):
                    f.write(j)
    f.close()
def sortbam(args):
    """
    %prog in_dir out_dir
        in_dir: bam files folder
        out_dir: sorted bam files folder

    sort bam files using samtools/0.1 sort function.
    """
    p = OptionParser(sortbam.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    bams = dir_path.glob('*.bam')
    for bam in bams:
        prf = bam.name.split('.bam')[0]
        sort_bam = prf + '.sorted'
        sort_bam_path = out_path / sort_bam
        cmd = 'samtools sort %s %s' % (bam, sort_bam_path)
        header = Slurm_header % (100, 15000, prf, prf, prf)
        header += 'ml samtools/0.1\n'
        header += cmd
        with open('%s.sortbam.slurm' % prf, 'w') as f:
            f.write(header)
def SubsamplingSMs(args):
    """
    %prog SubsamplingSMs input_vcf SMs.csv

    grep a subset of samples defined in SMs.csv (One sample name per row
    without header) from the input_vcf
    """
    p = OptionParser(SubsamplingSMs.__doc__)
    _, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputvcf, SMcsv, = args
    outputvcf = Path(inputvcf).name.replace('.vcf', '_subSMs.vcf')

    vcf = ParseVCF(inputvcf)
    df_vcf = vcf.AsDataframe()

    IDs = pd.read_csv(SMcsv, header=None)[0].values
    num_IDs = IDs.shape[0]
    print('number of specified Samples: %s' % num_IDs)

    subsm = vcf.SMs_header
    for id in IDs:
        if id not in vcf.SMs:
            print('%s not found in vcf...' % id)
        else:
            subsm.append(id)
    print('%s out of %s found in VCF' % (len(subsm) - 9, num_IDs))

    df_vcf = df_vcf[subsm]
    with open(outputvcf, 'w') as f:
        f.writelines(vcf.HashChunk2)
    df_vcf.to_csv(outputvcf, sep='\t', index=False, mode='a')
    print('Done! check output %s...' % outputvcf)
def gatk(args):
    """
    %prog gatk ref.fa bam_list.txt region.txt out_dir

    run GATK HaplotypeCaller
    """
    p = OptionParser(gatk.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, bams, regions, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    with open(bams) as f:
        inputs = ''.join(['-I %s \\\n' % (i.rstrip()) for i in f])
    with open(regions) as f:
        for reg in f:
            reg = reg.strip()
            if ':0-' in reg:
                reg = reg.replace(':0-', ':1-')
            reg_fn = reg.replace(':', '_')
            reg_fn_vcf = '%s.gatk.vcf' % reg_fn
            reg_fn_vcf_path = out_path / reg_fn_vcf
            cmd = "gatk --java-options '-Xmx13G' HaplotypeCaller \\\n-R %s -L %s \\\n%s-O %s" % (
                ref, reg, inputs, reg_fn_vcf_path)
            header = Slurm_header % (165, 15000, reg_fn, reg_fn, reg_fn)
            header += 'ml gatk4/4.1\n'
            header += cmd
            with open('%s.gatk.slurm' % reg_fn, 'w') as f1:
                f1.write(header)
def EstimateLD(args):
    """
    %prog dir_in dir_out

    run LD decay using tassel
    """
    p = OptionParser(EstimateLD.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*vcf', help='pattern of vcf files')
    p.add_option('--window_size', default='1000',
                 help='specify how many SNPs in the sliding window')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    dir_in, dir_out = args
    dir_out = Path(dir_out)
    if not dir_out.exists():
        dir_out.mkdir()
    for vcf in Path(dir_in).glob(opts.pattern):
        prefix = vcf.name.replace('.vcf', '')
        out_fn = '%s.ld' % prefix
        cmd = 'run_pipeline.pl -Xms512m -Xmx14g -fork1 -vcf %s -ld -ldWinSize %s -ldType SlidingWindow -td_tab %s/%s\n' % (
            vcf, opts.window_size, dir_out, out_fn)
        header = Slurm_header % (opts.time, 15000, prefix, prefix, prefix)
        header += 'ml java/1.8\n'
        header += 'ml tassel/5.2\n'
        header += cmd
        with open('%s.estLD.slurm' % prefix, 'w') as f:
            f.write(header)
        print('slurm file %s.estLD.slurm has been created, you can submit your job file.'
              % prefix)
def SummarizeLD(args):
    """
    %prog dir_in dir_out

    summarize LD decay in log scale
    """
    p = OptionParser(SummarizeLD.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.ld.txt', help='pattern of ld.txt files')
    p.add_option('--max_dist', default='1,000,000', help='the maximum ld distance')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    dir_in, dir_out = args
    dir_out = Path(dir_out)
    if not dir_out.exists():
        dir_out.mkdir()
    num0 = opts.max_dist.count('0')
    for fn in Path(dir_in).glob(opts.pattern):
        prefix = '.'.join(fn.name.split('.')[0:-1])
        out_fn = '%s.sum.csv' % prefix
        cmd = 'python -m schnablelab.SNPcalling.base SummarizeLD %s %s %s/%s\n' % (
            fn, num0, dir_out, out_fn)
        header = Slurm_header % (opts.time, opts.memory, prefix, prefix, prefix)
        header += cmd
        with open('%s.sumLD.slurm' % prefix, 'w') as f:
            f.write(header)
        print('slurm file %s.sumLD.slurm has been created, you can submit your job file.'
              % prefix)
def only_MAF(args):
    """
    %prog in_dir out_dir

    filter MAF
    """
    p = OptionParser(only_MAF.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    p.add_option('--maf', default='0.01', help='maf cutoff')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    vcfs = dir_path.glob(opts.pattern)
    for vcffile in vcfs:
        prefix = '.'.join(vcffile.name.split('.')[0:-1])
        cmd = "python -m schnablelab.SNPcalling.base MAF %s %s\n" % (vcffile, opts.maf)
        with open('%s.maf.slurm' % prefix, 'w') as f:
            header = Slurm_header % (opts.time, opts.memory, prefix, prefix, prefix)
            header += 'ml bcftools\n'
            header += cmd
            f.write(header)
        print('slurm file %s.maf.slurm has been created, you can sbatch your job file.'
              % prefix)
def filterSpeciesTreatment(args):
    """
    %prog filterSpeciesTreatment tissue_csv output_prefix

    tissue_csv (21 columns):
        sb_gene, si_gene, bd_gene,
        sb_cold_1-3, sb_normal_1-3,
        si_cold_1-3, si_normal_1-3,
        bd_cold_1-3, bd_normal_1-3
    """
    p = OptionParser(filterSpeciesTreatment.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    tissue_csv, outprf, = args
    df = pd.read_csv(tissue_csv)
    jdgs = []
    spes, trts = ['sorghum_', 'millet_', 'brachy_'], ['cold_', 'normal_']
    for spe in spes:
        cols = [i for i in df.columns[3:] if spe in i]
        spe_judg = (df[cols] == 0).sum(axis=1) <= 3
        jdgs.append(spe_judg)
    for trt in trts:
        cols = [i for i in df.columns[3:] if trt in i]
        trt_judg = (df[cols] == 0).sum(axis=1) <= 5
        jdgs.append(trt_judg)
    for spe in spes:
        for trt in trts:
            cols = [i for i in df.columns[3:] if spe + trt in i]
            jdg = (df[cols] == 0).sum(axis=1) <= 1
            jdgs.append(jdg)
    final_judg = pd.concat(jdgs, axis=1).sum(axis=1) == 11
    final_df = df[final_judg]
    final_df.to_csv('%s.csv' % outprf, index=False)
def Resize(args):
    '''
    %prog Resize img1 img2 img3 ...

    resize image using PIL. If multiple images are provided, the same resizing
    dimension will be applied to all of them
    '''
    p = OptionParser(Resize.__doc__)
    p.add_option('--output_dim', default='1227,1028',
                 help='the dimension (width,height) after resizing')
    p.add_option('--out_dir', default='.',
                 help='specify the output image directory')
    p.add_option('--to_jpg', default=False, action='store_true',
                 help='save image in jpg format')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())

    dim = ([int(i) for i in opts.output_dim.split(',')])
    for img_fn in args:
        img = ProsImage(img_fn)
        if opts.to_jpg:
            img_out_fn = Path(img.fn).name.replace(f'.{img.format}', '.Rsz.jpg')
            img.resize(dim).convert('RGB').save(Path(opts.out_dir) / img_out_fn)
        else:
            img_out_fn = Path(img.fn).name.replace(f'.{img.format}',
                                                   f'.Rsz.{img.format}')
            img.resize(dim).save(Path(opts.out_dir) / img_out_fn)
def hyp2arr(args):
    '''
    %prog hyp2arr hyp_dir out_fn

    convert hyperspectral images to numpy array
    '''
    p = OptionParser(hyp2arr.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hyp_dir, out_fn, = args

    discard_imgs = ['0_0_0.png', '1_0_0.png']
    dir_path = Path(hyp_dir)
    if not dir_path.exists():
        sys.exit('%s does not exist!' % hyp_dir)
    imgs = list(dir_path.glob('*.png'))
    imgs = sorted(imgs, key=lambda x: int(x.name.split('_')[0]))
    num_imgs = len(imgs)
    print('%s images found.' % num_imgs)

    img_arrs = []
    for i in imgs:
        if i.name not in discard_imgs:
            arr = cv2.imread(str(i), cv2.IMREAD_GRAYSCALE)
            img_arrs.append(arr)
    img_array = np.stack(img_arrs, axis=2)
    print(img_array.shape)
    np.save(out_fn, img_array)
def MLM(args):
    """
    %prog MLM GenoPrefix('*.mean' and '*.annotation') Pheno Outdir

    Run automated GEMMA Mixed Linear Model
    """
    p = OptionParser(MLM.__doc__)
    p.add_option('--kinship', default=False,
                 help='specify the relatedness matrix file name')
    p.add_option('--pca', default=False,
                 help='specify the principal components file name')
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    GenoPrefix, Pheno, Outdir = args
    meanG, annoG = GenoPrefix + '.mean', GenoPrefix + '.annotation'
    outprefix = '.'.join(Pheno.split('/')[-1].split('.')[0:-1])
    cmd = '%s -g %s -p %s -a %s -lmm 4 -outdir %s -o %s' \
        % (gemma, meanG, Pheno, annoG, Outdir, outprefix)
    if opts.kinship:
        cmd += ' -k %s' % opts.kinship
    if opts.pca:
        cmd += ' -c %s' % opts.pca
    print('The command running on the local node:\n%s' % cmd)

    header = Slurm_header % (opts.time, opts.memory, opts.prefix, opts.prefix,
                             opts.prefix)
    header += cmd
    with open('%s.mlm.slurm' % outprefix, 'w') as f:
        f.write(header)
    print('slurm file %s.mlm.slurm has been created, you can sbatch your job file.'
          % outprefix)
def splitVCF(args):
    """
    %prog splitVCF N vcf

    split vcf to N smaller files with equal size
    """
    p = OptionParser(splitVCF.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    N, vcffile, = args
    N = int(N)
    prefix = vcffile.split('.')[0]
    cmd_header = "sed -ne '/^#/p' %s > %s.header" % (vcffile, prefix)
    subprocess.call(cmd_header, shell=True)
    child = subprocess.Popen('wc -l %s' % vcffile, shell=True,
                             stdout=subprocess.PIPE)
    total_line = int(child.communicate()[0].split()[0])
    print('total %s lines' % total_line)
    step = total_line // N
    print(1)
    cmd_first = "sed -n '1,%sp' %s > %s.1.vcf" % (step, vcffile, prefix)
    subprocess.call(cmd_first, shell=True)
    # initialize in case N == 2 and the loop below is skipped
    i, ed = 1, step
    for i in range(2, N):
        print(i)
        st = (i - 1) * step + 1
        ed = i * step
        cmd = "sed -n '%s,%sp' %s > %s.%s.tmp.vcf" % (st, ed, vcffile, prefix, i)
        subprocess.call(cmd, shell=True)
    print(i + 1)
    cmd_last = "sed -n '%s,%sp' %s > %s.%s.tmp.vcf" % (
        (ed + 1), total_line, vcffile, prefix, (i + 1))
    subprocess.call(cmd_last, shell=True)
    for i in range(2, N + 1):
        cmd_cat = 'cat %s.header %s.%s.tmp.vcf > %s.%s.vcf' % (
            prefix, prefix, i, prefix, i)
        subprocess.call(cmd_cat, shell=True)
def align(args):
    """
    %prog align indx_base fq_fn ...

    do alignment using bwa.
    """
    p = OptionParser(align.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref_base = args[0]
    fq_fns = args[1:]
    print(fq_fns)
    sm = Path(fq_fns[0]).name.split('_trim')[0]
    gid = sm.split('R')[0]
    print(gid)
    R = r"'@RG\tID:%s\tSM:%s'" % (gid, gid)
    if len(fq_fns) == 1:
        sam = sm + '.se.sam'
        print('run single-end alignment')
        cmd = 'bwa mem -R %s %s %s > %s \n' % (R, ref_base, fq_fns[0], sam)
        prf = '%s.se.align' % sm
    elif len(fq_fns) == 2:
        sam = sm + '.pe.sam'
        print('run paired-end alignment')
        cmd = 'bwa mem -R %s %s %s %s > %s \n' % (R, ref_base, fq_fns[0],
                                                  fq_fns[1], sam)
        prf = '%s.pe.align' % sm
    else:
        sys.exit('only one or two read files')
    header = Slurm_header % (100, 10000, prf, prf, prf)
    header += 'ml bwa\n'
    header += cmd
    with open('%s.slurm' % prf, 'w') as f:
        f.write(header)
def fetchProSeq(args):
    """
    %prog GeneList seq_file output_prefix

    extract protein sequences of candidate genes
    """
    p = OptionParser(fetchProSeq.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    genelist, SeqFile, out_prefix, = args
    cmd = "grep '>' %s|cut -d ' ' -f 1|cut -d '>' -f 2 > AllGene.names" % SeqFile
    call(cmd, shell=True)
    df_Genes = pd.read_csv(genelist, header=None)
    df_Trans = pd.read_csv('AllGene.names', header=None)
    df_Trans['gene'] = df_Trans[0].str.split('_').str.get(0)
    df1 = df_Trans[df_Trans['gene'].isin(df_Genes[0])]
    df1['gene'] = df1['gene'].astype('category')
    df1['gene'].cat.set_categories(df_Genes[0].tolist(), inplace=True)
    df2 = df1.sort_values(['gene', 0]).reset_index(drop=True)
    df2[0].to_csv('%s.ProSeq.names' % out_prefix, index=False, header=False)
    for i in list(df2[0]):
        print('fetching %s' % i)
        cmd = "%s %s %s >> %s" % (faOneRecord, SeqFile, i, out_prefix + '.seqs')
        call(cmd, shell=True)
    print('Done!')
def split_fa_region(args):
    """
    %prog fa.fai region_size out_fn
        fa.fai: index file for the fa file
        region_size: the size of each split region
        out_fn: the output file

    generate a list of freebayes/bamtools region specifiers
    """
    p = OptionParser(split_fa_region.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    fasta_index_file, region_size, fn_out, = args
    fasta_index_file = open(fasta_index_file)
    region_size = int(region_size)
    fn_out = open(fn_out, 'w')

    for line in fasta_index_file:
        fields = line.strip().split("\t")
        chrom_name = fields[0]
        chrom_length = int(fields[1])
        region_start = 0
        while region_start < chrom_length:
            start = region_start
            end = region_start + region_size
            if end > chrom_length:
                end = chrom_length
            line = chrom_name + ":" + str(region_start) + "-" + str(end) + '\n'
            fn_out.write(line)
            region_start = end
    fn_out.close()
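# Illustration only (hypothetical input, not from the source): given a .fai
# entry "Chr01<TAB>3500<TAB>..." and region_size=1000, split_fa_region above
# would write the region specifiers
#   Chr01:0-1000
#   Chr01:1000-2000
#   Chr01:2000-3000
#   Chr01:3000-3500
# which can then be handed to freebayes/bamtools one region per job.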
def sam2bam(args):
    """
    %prog in_dir out_dir
        in_dir: sam files folder
        out_dir: bam files folder

    convert sam to bam using samtools/0.1.
    """
    p = OptionParser(sam2bam.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    sams = dir_path.glob('*.sam')
    for sam in sams:
        prf = sam.name.split('.sam')[0]
        bam = prf + '.bam'
        bam_path = out_path / bam
        cmd = 'samtools view -bS %s > %s' % (sam, bam_path)
        header = Slurm_header % (100, 15000, prf, prf, prf)
        header += 'ml samtools/0.1\n'
        header += cmd
        with open('%s.sam2bam.slurm' % prf, 'w') as f:
            f.write(header)
def reheader(args):
    """
    %prog reheader input_hmp names.csv

    substitute the sample names in hmp header using sed.
    names.csv:
        comma separated without header line
        1st column is the old name
        2nd column is the new name
    """
    p = OptionParser(reheader.__doc__)
    _, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputhmp, names_csv, = args
    outputhmp = Path(inputhmp).name.replace('.hmp', '_reheader.hmp')

    hmp = ParseHmp(inputhmp)

    cmd = 'sed '
    for _, row in pd.read_csv(names_csv, header=None).iterrows():
        old_nm, new_nm = row[0], row[1]
        if old_nm not in hmp.SMs:
            print('%s was not found in hmp...' % old_nm)
        else:
            cmd += "-e '1s/%s/%s/' " % (old_nm, new_nm)
    cmd += '%s > %s' % (inputhmp, outputhmp)
    print('command:\n%s' % cmd)
    choice = input("Run the above command? (yes/no) ")
    if choice == 'yes':
        call(cmd, shell=True)
        print('Done! check %s' % outputhmp)
def reorgnzGemmaKinship(args):
    """
    %prog reorgnzGemmaKinship GEMMAkinship hmp

    Reorganize the kinship result from GEMMA so it can be used in other
    software, like GAPIT. The hmp file only provides the order of the
    sample names.
    """
    p = OptionParser(reorgnzGemmaKinship.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    gemmaKin, hmpfile, = args

    f = open(hmpfile)
    SMs = f.readline().split()[11:]
    f.close()
    f1 = open(gemmaKin)
    f2 = open('GAPIT.' + gemmaKin, 'w')
    for i, j in zip(SMs, f1):
        newline = i + '\t' + j
        f2.write(newline)
    f1.close()
    f2.close()
    print("Finished! Kinship matrix file for GEMMA 'GAPIT.%s' has been generated."
          % gemmaKin)
def SubsamplingSMs(args):
    """
    %prog SubsamplingSMs input_hmp SMs.csv

    grep a subset of samples defined in SMs.csv (One sample name per row
    without header) from the input_hmp
    """
    p = OptionParser(SubsamplingSMs.__doc__)
    _, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputhmp, SMcsv, = args
    outputhmp = Path(inputhmp).name.replace('.hmp', '_subSMs.hmp')

    hmp = ParseHmp(inputhmp)
    df_hmp = hmp.AsDataframe()

    IDs = pd.read_csv(SMcsv, header=None)[0].values
    num_IDs = IDs.shape[0]
    print('number of specified Samples: %s' % num_IDs)

    subsm = hmp.SMs_header
    for id in IDs:
        if id not in hmp.SMs:
            print('%s was not found in hmp...' % id)
        else:
            subsm.append(id)
    print('%s out of %s found in Hmp' % (len(subsm) - 11, num_IDs))

    df_hmp = df_hmp[subsm]
    df_hmp.to_csv(outputhmp, sep='\t', index=False, na_rep='NA')
    print('Done! check output %s...' % outputhmp)
def gpu(args):
    """
    %prog

    request a gpu node from hcc.
    """
    p = OptionParser(gpu.__doc__)
    p.add_option("--memory", default="12000",
                 help="specify how much memory [default: %default]")
    p.add_option("--time", default='20',
                 help="specify the time (hour) [default: %default]")
    p.add_option("--model", default='gpu_k40',
                 choices=('gpu_p100', 'gpu_k20', 'gpu_k40'),
                 help="specify the gpu model, p100:16gb, k40:12gb, k20:5gb [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) == 0:
        print('add --help to see options.\n')
        cmd = 'srun --partition=schnablelab --gres=gpu --constraint=%s --mem-per-cpu=%s --ntasks-per-node=1 --nodes=1 --time=%s:0:0 --pty $SHELL\n' % (
            opts.model, opts.memory, opts.time)
        print(cmd)
        # call(cmd, shell=True)
    else:
        sys.exit(not p.print_help())
def DownsamplingSNPs(args):
    """
    %prog downsampling input_hmp

    Pick up a subset of SNPs from a huge hmp file using the Linux sed command
    """
    p = OptionParser(DownsamplingSNPs.__doc__)
    p.add_option('--downscale', default=10, help='specify the downscale level')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=DownsamplingSNPs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp', '_ds%s.hmp' % opts.downscale)
    cmd = "sed -n '1~%sp' %s > %s" % (opts.downscale, inputhmp, outputhmp)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
def extract_info(args):
    """
    %prog log_file output_fn

    extract testing and prediction results from dpp log file
    """
    p = OptionParser(extract_info.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    logfile, opp, = args
    with open(logfile) as f0:
        all_lines = f0.readlines()
    test_idx, predict_idx, hist_idx = 0, 0, 0
    for i, j in enumerate(all_lines):
        if 'All test labels:' in j:
            test_idx = i
        if 'All predictions:' in j:
            predict_idx = i
        if 'Histogram of ' in j:
            hist_idx = i
    test_lines = all_lines[test_idx + 1:predict_idx]
    ground_truth = extract_num(test_lines)
    # print(len(ground_truth), '\n', ground_truth)
    predict_lines = all_lines[predict_idx + 1:hist_idx]
    prediction = extract_num(predict_lines)
    # print(len(prediction), '\n', prediction)
    df = pd.DataFrame(
        dict(zip(['groundtruth', 'prediction'], [ground_truth, prediction])))
    df.to_csv(opp, index=False, sep='\t')
    print('Done! check %s' % opp)
def hmp2bimbam(args):
    """
    %prog hmp2bimbam hmp bimbam_prefix

    Convert hmp genotypic data to GEMMA bimbam files (*.mean and *.annotation).
    """
    p = OptionParser(hmp2bimbam.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmp, bim_pre = args
    f1 = open(hmp)
    f1.readline()
    f2 = open(bim_pre + '.mean', 'w')
    f3 = open(bim_pre + '.annotation', 'w')
    for i in f1:
        j = i.split()
        rs = j[0]
        try:
            ref, alt = j[1].split('/')
        except:
            print('omit rs...')
            continue
        newNUMs = judge(ref, alt, j[11:])
        newline = '%s,%s,%s,%s\n' % (rs, ref, alt, ','.join(newNUMs))
        f2.write(newline)
        pos = j[3]
        chro = j[2]
        f3.write('%s,%s,%s\n' % (rs, pos, chro))
    f1.close()
    f2.close()
    f3.close()
def Imgs2Arrs(args):
    '''
    %prog hyp_dir(filepath of hyperspectral image data)

    Returns: numpy array object with shape [x, y, z].
        x,y dims correspond to pixel coordinates for each image
        z dim corresponds to hyperspectral image wavelength.
    '''
    import cv2
    p = OptionParser(Imgs2Arrs.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    imgs = [i for i in os.listdir(mydir) if i.endswith('png')]
    sorted_imgs = sorted(imgs, key=lambda x: int(x.split('_')[0]))
    all_arrs = []
    for i in sorted_imgs[2:]:
        print(i)
        # img = cv2.imread('%s/%s' % (mydir, i), cv2.IMREAD_GRAYSCALE)
        img = np.array(Image.open('%s/%s' % (mydir, i)).convert('L'))
        print(img.shape)
        all_arrs.append(img)
    arrs = np.stack(all_arrs, axis=2)
    np.save('%s.npy' % mydir, arrs)
def SummarizeLD(args):
    """
    %prog ld.csv num0 out.txt
        ld.csv: ld tab delimited file generated from tassel
        num0: 0s in the distance

    summarize ld decay in log scale 0-100kb
    """
    p = OptionParser(SummarizeLD.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ld_fn, num0, out_fn, = args
    df = pd.read_csv(ld_fn, delim_whitespace=True, usecols=['Dist_bp', 'R^2'])
    df = df.dropna().sort_values('Dist_bp').reset_index(drop=True)

    mybin = [10**i for i in np.arange(0, float(num0) + 0.1, 0.1)]
    blockPreIndex = np.histogram(df['Dist_bp'].values, bins=mybin)[0]

    a = list(blockPreIndex)
    a.insert(0, 0)
    boxlist = []
    for idx, ele in enumerate(a):
        st = sum(a[0:idx])
        ed = sum(a[0:idx + 1])
        boxlist.append(df['R^2'][st:ed].values)
    boxlist.pop(0)

    with open(out_fn, 'w') as f:
        for idx, ele in enumerate(boxlist):
            if len(ele) >= 1:
                averageR2, sd = sum(ele) / float(len(ele)), np.var(ele)
            elif len(ele) == 0:
                averageR2, sd = '', ''
            f.write('%s\t%s\t%s\t%s\n' % (10**(idx * 0.1), (10**((idx + 1) * 0.1)),
                                          averageR2, sd))
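# Illustration (assumed values, not from the source): with num0=6 the bin edges
# computed above are [10**0.0, 10**0.1, ..., 10**6.0], i.e. 61 logarithmically
# spaced edges from 1 bp to 1,000,000 bp, so each line written to out.txt
# summarizes the average R^2 within one tenth of a decade of pairwise SNP
# distance.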