def FilterMAF(args):
    """
    %prog FilterMAF input_vcf

    Remove rare MAF SNPs
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(FilterMAF.__doc__)
    parser.add_option(
        '--maf_cutoff', default=0.01, type='float',
        help='specify the MAF rate cutoff, SNPs lower than this cutoff will be removed.')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    inputvcf, = args

    # Only the file name of the input is kept, so the result is written
    # relative to the current working directory.
    outputvcf = Path(inputvcf).name.replace('.vcf', '_maf%s.vcf' % opts.maf_cutoff)
    vcf = ParseVCF(inputvcf)
    removed = 0
    with open(outputvcf, 'w') as out:
        # presumably the leading '#' header lines of the VCF - confirm in ParseVCF
        out.writelines(vcf.HashChunk)
        progress = tqdm(vcf.MAFs, total=vcf.num_SNPs, desc='Filter MAF', position=0)
        for record, maf in progress:
            if maf >= opts.maf_cutoff:
                out.write(record)
            else:
                removed += 1
            # First whitespace-separated field of the record is shown as the chromosome.
            progress.set_description('processing chromosome %s' % record.split()[0])
    print('Done! %s SNPs removed! check output %s...' % (removed, outputvcf))
def Info(args):
    '''
    %prog Info project_folder

    Show summary of images under project_folder
    '''
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(Info.__doc__)
    parser.add_option(
        '--item_idx', default='1,2,3',
        help='the index of sample name, date, and time in each image directory name')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    project_folder, = args

    # --item_idx carries three comma-separated integer positions.
    sm_idx, date_idx, time_idx = map(int, opts.item_idx.split(','))
    project = ParseProject(project_folder, sm_idx, date_idx, time_idx)

    print('Summary of samples:')
    for sample, count in project.sm_counts.items():
        print(sample, count)

    print('Summary of dates:')
    for date, count in project.date_counts.items():
        print(date, count)

    print('Angles for RGB images:')
    # Only the directory stored in row 0 is inspected for 'Vis_*' entries.
    first_image_dir = project.df.loc[0, 'fnpath']
    for angle in first_image_dir.glob('Vis_*'):
        print(angle.name)
def export(args):
    '''
    %prog export proj_id outfile

    - proj_id: The project id of the zooniverse project

    DESC: Fetches an export from the specified zooniverse project id.
    '''
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (two positionals plus options).
    # Returns True after the Zootils export helper has been called.
    # Exits through sys.exit() after printing the help when the number of
    # positional arguments is not exactly two.
    import sys

    from schnablelab.Zooniverse.Zootils import export as exp

    p = OptionParser(export.__doc__)
    p.add_option('-t', '--type', default='classifications',
                 help='Specify the type of export')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        # sys.exit() rather than the builtin exit(): the latter is injected by
        # the `site` module and is not guaranteed to exist in every interpreter.
        sys.exit(not p.print_help())

    projid, outfile = args
    exp(projid, outfile, opts)
    return True
def fastqc(args):
    """
    %prog fastqc in_dir out_dir

    in_dir: the dir where fastq files are located
    out_dir: the dir saving fastqc reports

    generate slurm files for fastqc jobs
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<prefix>.fastqc.slurm' file per matching fastq file into the
    # current working directory. Exits with a message when out_dir is missing,
    # and with the help text when the positional argument count is wrong.
    p = OptionParser(fastqc.__doc__)
    p.add_option("--pattern", default='*.fastq',
                 help="the pattern of fastq files, qutation needed")
    opts, args = p.parse_args(args)
    # Exactly two positionals are unpacked below; anything else gets the help
    # text instead of a ValueError from the unpacking.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    if not Path(out_dir).exists():
        # The directory name is now interpolated into the message.
        sys.exit('%s does not exist...' % out_dir)

    for fq in Path(in_dir).glob(opts.pattern):
        # File name without its last extension, used as the job name.
        prf = '.'.join(fq.name.split('.')[0:-1])
        print(prf)
        cmd = 'fastqc %s -o %s' % (str(fq), out_dir)
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml fastqc\n'
        header += cmd
        with open('%s.fastqc.slurm' % (prf), 'w') as f:
            f.write(header)
def combineHmp(args):
    """
    %prog combineHmp N pattern output

    combine split hmp (1-based) files to a single one. Pattern example:
    hmp321_agpv4_chr%s.hmp
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens: N (number of parts), a %-style file
    # name pattern that receives the 1-based part index, and the output name.
    # Exits with the help text on a wrong positional count and with a message
    # when N is smaller than 1.
    p = OptionParser(combineHmp.__doc__)
    p.add_option('--header', default='yes', choices=('yes', 'no'),
                 help='choose whether add header or not')
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())
    N, hmp_pattern, new_f = args
    N = int(N)
    if N < 1:
        sys.exit('N must be a positive integer')

    # Context managers close every handle even if a part file is missing or a
    # write fails midway.
    with open(new_f, 'w') as out:
        for idx in range(1, N + 1):
            print(idx)
            with open(hmp_pattern % idx) as part:
                # Only the first part may contribute its first (header) line,
                # and only when --header is 'yes'; every other part has its
                # first line dropped.
                if idx > 1 or opts.header == 'no':
                    part.readline()
                out.writelines(part)
def three2two(args):
    '''
    %prog three2two fn_in out_prefix

    convert 3d npy to 2d
    '''
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(three2two.__doc__)
    parser.add_option(
        '--crops',
        help='the coordinates for croping, follow left,upper,right,lower format. 1,80,320,479')
    parser.add_option(
        "--format", default='npy', choices=('npy', 'csv'),
        help="choose the output format")
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    fn_in, out_prefix = args

    cube = np.load(fn_in)
    if opts.crops:
        # left/right bound the second axis, upper/lower bound the first axis.
        left, upper, right, lower = (int(v) for v in opts.crops.split(','))
        cube = cube[upper:lower, left:right, :]

    height, width, depth = cube.shape
    print(height, width, depth)
    # Collapse the first two axes into one; the last axis is kept as columns.
    flat = cube.reshape(height * width, depth)

    if opts.format == 'csv':
        np.savetxt("%s.2d.csv" % out_prefix, flat, delimiter=",")
    else:
        np.save("%s.2d.npy" % out_prefix, flat.astype(np.float64))
    print('Done!')
def fixGTsep(args):
    """
    %prog fixGTsep in_dir out_dir

    replace the allele separator . in freebayes vcf file to / which is required for beagle
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<sample>.fixGT.slurm' file per matching vcf into the current
    # working directory; the job itself writes '<sample>.fixGT.vcf' to out_dir.
    # Exits with a message when out_dir is missing, and with the help text when
    # the positional argument count is wrong.
    p = OptionParser(fixGTsep.__doc__)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        # The directory name is now interpolated into the message.
        sys.exit('%s does not exist...' % out_dir)

    for vcf in Path(in_dir).glob(opts.pattern):
        # File name without its last extension, used as the job name.
        sm = '.'.join(vcf.name.split('.')[0:-1])
        out_fn_path = out_path / (sm + '.fixGT.vcf')
        # Backslashes are escaped explicitly so the literal has no invalid
        # escape sequences; the resulting command text is unchanged (the
        # replacement still contains a real TAB character from '\t').
        cmd = "perl -pe 's/\\s\\.:/\t.\\/.:/g' %s > %s" % (vcf, out_fn_path)
        header = Slurm_header % (10, 10000, sm, sm, sm)
        header += cmd
        with open('%s.fixGT.slurm' % sm, 'w') as f:
            f.write(header)
def index_ref(args):
    """
    %prog index_ref ref.fa

    index the reference genome sequences
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(index_ref.__doc__)
    parser.add_option('--tool', default='bwa', choices=('bwa', 'samtools'),
                      help='tool for indexing reference genome')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    ref_fn, = args
    # Reference file name without its last extension.
    prefix = '.'.join(ref_fn.split('.')[0:-1])

    # Pick the per-tool pieces first, then assemble the job script once.
    if opts.tool == 'bwa':
        cmd = 'bwa index -p %s %s' % (prefix, ref_fn)
        resources = (100, 15000)
        module_line = 'ml bwa\n'
        slurm_fn = '%s.bwa_index.slurm' % prefix
    else:
        cmd = 'samtools faidx %s' % ref_fn
        resources = (10, 10000)
        module_line = 'ml samtools\n'
        slurm_fn = '%s.samtools_index.slurm' % prefix

    print(cmd)
    script = Slurm_header % (resources + (prefix, prefix, prefix))
    script += module_line
    script += cmd
    with open(slurm_fn, 'w') as fh:
        fh.write(script)
def trim_paired(args):
    """
    %prog trim in_dir out_dir

    quality control on the paired reads
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<prefix>.PE.trim.slurm' file per R1/R2 pair into the current
    # working directory. Exits with a message when out_dir is missing or when
    # the forward/reverse file counts differ, and with the help text when the
    # positional argument count is wrong.
    p = OptionParser(trim_paired.__doc__)
    p.add_option('--pattern_r1', default='*_R1.fastq',
                 help='filename pattern for forward reads')
    p.add_option('--pattern_r2', default='*_R2.fastq',
                 help='filename pattern for reverse reads')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)

    # glob() returns matches in arbitrary order, so both lists are sorted to
    # keep the forward and reverse files of one sample at the same position.
    r1_fns = sorted(glob('%s/%s' % (in_dir, opts.pattern_r1)))
    r2_fns = sorted(glob('%s/%s' % (in_dir, opts.pattern_r2)))
    if len(r1_fns) != len(r2_fns):
        # zip() would silently drop the surplus files and could mis-pair the rest.
        sys.exit('found %s forward but %s reverse read files in %s'
                 % (len(r1_fns), len(r2_fns), in_dir))

    for r1_fn, r2_fn in zip(r1_fns, r2_fns):
        r1_path = Path(r1_fn)
        r2_path = Path(r2_fn)
        # Everything before the last '_' of the R1 file name identifies the job.
        prf = '_'.join(r1_path.name.split('_')[0:-1]) + '.PE'
        print(prf)
        r1_fn_out1 = r1_path.name.replace('R1.fastq', 'trim.R1.fastq')
        r1_fn_out2 = r1_path.name.replace('R1.fastq', 'unpaired.R1.fastq')
        r2_fn_out1 = r2_path.name.replace('R2.fastq', 'trim.R2.fastq')
        r2_fn_out2 = r2_path.name.replace('R2.fastq', 'unpaired.R2.fastq')
        cmd = 'java -jar $TM_HOME/trimmomatic.jar PE -phred33 %s %s %s %s %s %s TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40' % (
            r1_fn, r2_fn,
            str(out_path / r1_fn_out1), str(out_path / r1_fn_out2),
            str(out_path / r2_fn_out1), str(out_path / r2_fn_out2))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % (prf), 'w') as f:
            f.write(header)
def download(args):
    '''
    %prog download download_links.csv

    download activated asset links
    '''
    # The docstring above is handed to OptionParser, so it is runtime text;
    # its usage line now names this command.
    #
    # args: list of command-line tokens (one whitespace-delimited table with
    # the columns download_link, asset_type, id and item_type, plus options).
    # Exits with the help text unless exactly one positional argument is given.
    p = OptionParser(download.__doc__)
    p.add_option(
        '--output', default="'infer'",
        help='default to construct the output file name from the API response')
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    links_csv, = args

    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True.
    links_df = pd.read_csv(links_csv, sep=r'\s+')
    client = Client()
    for __, row in links_df.iterrows():
        # Send a GET request to the provided location url
        res = client.ses.get(row['download_link'], stream=True)
        suffix = 'tif' if row['asset_type'] == 'visual' else row['asset_type']
        # NOTE(review): with an explicit --output every row is written to the
        # same file, so only the last download survives - confirm intent.
        output = '%s_%s.%s' % (row['id'], row['item_type'], suffix) \
            if opts.output == "'infer'" else opts.output
        # Save the file
        with open(output, "wb") as f:
            print('download %s...' % output)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    # No per-chunk flush: the file is flushed when the
                    # with-block closes it.
                    f.write(chunk)
def MLM(args):
    """
    %prog MLM GenoPrefix('*.mean' and '*.annotation') Pheno Outdir

    RUN automated GEMMA Mixed Linear Model
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(MLM.__doc__)
    parser.add_option('--kinship', default=False,
                      help='specify the relatedness matrix file name')
    parser.add_option('--pca', default=False,
                      help='specify the principle components file name')
    parser.set_slurm_opts(jn=True)
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    GenoPrefix, Pheno, Outdir = args

    mean_geno = GenoPrefix + '.mean'
    anno_geno = GenoPrefix + '.annotation'
    # Phenotype file name (text after the last '/') without its last extension.
    pheno_name = Pheno.split('/')[-1]
    outprefix = '.'.join(pheno_name.split('.')[0:-1])

    pieces = ['%s -g %s -p %s -a %s -lmm 4 -outdir %s -o %s'
              % (gemma, mean_geno, Pheno, anno_geno, Outdir, outprefix)]
    # Optional inputs are appended only when the user supplied them.
    if opts.kinship:
        pieces.append(' -k %s' % opts.kinship)
    if opts.pca:
        pieces.append(' -c %s' % opts.pca)
    cmd = ''.join(pieces)
    print('The command running on the local node:\n%s' % cmd)

    job = opts.prefix
    script = Slurm_header % (opts.time, opts.memory, job, job, job)
    script += cmd
    with open('%s.mlm.slurm' % outprefix, 'w') as fh:
        fh.write(script)
    print('slurm file %s.mlm.slurm has been created, you can sbatch your job file.' % outprefix)
def item_types(args):
    '''
    %prog item_types

    print all available item types
    '''
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(item_types.__doc__)
    parser.add_option('-o', '--output', default="item_types.csv",
                      help='specify output file')
    parser.add_option('--n', default="all",
                      help='how many rows you wanna see')
    opts, args = parser.parse_args(args)
    # This command takes no positional arguments at all.
    if args:
        sys.exit(not parser.print_help())

    client = Client()
    wanted_columns = ['id', 'display_name', 'display_description']
    df_items = client.get_all_items()[wanted_columns]

    if opts.n == 'all':
        df_items.to_csv(opts.output, index=False, sep='\t')
    else:
        try:
            rows = int(opts.n)
            df_items.head(rows).to_csv(opts.output, index=False, sep='\t')
        except ValueError:
            sys.exit("n must be a number")
    print('check %s!' % (opts.output))
def freebayes(args):
    """
    %prog freebayes region.txt ref.fa bam_list.txt out_dir

    create freebayes slurm jobs for each splitted region defined in region.txt file
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (region file, reference, bam list,
    # out_dir and options). Writes one '<region>.fb.slurm' file per non-empty
    # line of the region file into the current working directory. Exits with a
    # message when out_dir is missing, and with the help text when the
    # positional argument count is wrong.
    p = OptionParser(freebayes.__doc__)
    p.add_option('--max_depth', default=10000,
                 help='cites where the mapping depth higher than this value will be ignored')
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    region, ref, bams, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        # The directory name is now interpolated into the message.
        sys.exit('%s does not exist...' % out_dir)

    with open(region) as f:
        for reg in f:
            reg = reg.strip()
            if not reg:
                # A blank line would otherwise produce a job with an empty region.
                continue
            # ':' is replaced so the region can be used in file names.
            reg_fn = reg.replace(':', '_')
            reg_fn_vcf_path = out_path / ('%s.fb.vcf' % reg_fn)
            # The output path variable was misspelled here before, which
            # raised NameError on the first region.
            cmd = 'freebayes -r %s -f %s -C 1 -F 0.05 -L %s -u -n 2 -g %s > %s\n' % (
                reg, ref, bams, opts.max_depth, reg_fn_vcf_path)
            header = Slurm_header % (165, 50000, reg_fn, reg_fn, reg_fn)
            header += 'ml freebayes/1.3\n'
            header += cmd
            with open('%s.fb.slurm' % reg_fn, 'w') as f1:
                f1.write(header)
            print('slurm files %s.fb.slurm has been created' % reg_fn)
def FilterHetero(args):
    """
    %prog FilterHetero input_vcf

    Remove bad and high heterizygous loci
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(FilterHetero.__doc__)
    parser.add_option(
        '--het_cutoff', default=0.1, type='float',
        help='specify the heterozygous rate cutoff, SNPs higher than this cutoff will be removed.')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    inputvcf, = args

    # Only the file name of the input is kept, so the result is written
    # relative to the current working directory.
    outputvcf = Path(inputvcf).name.replace('.vcf', '_het%s.vcf' % opts.het_cutoff)
    vcf = ParseVCF(inputvcf)
    removed = 0
    with open(outputvcf, 'w') as out:
        # presumably the leading '#' header lines of the VCF - confirm in ParseVCF
        out.writelines(vcf.HashChunk)
        progress = tqdm(vcf.Heteros, total=vcf.num_SNPs,
                        desc='Filter Heterozygous', position=0)
        for record, het in progress:
            if het <= opts.het_cutoff:
                out.write(record)
            else:
                removed += 1
            # First whitespace-separated field of the record is shown as the chromosome.
            progress.set_description('processing chromosome %s' % record.split()[0])
    print('Done! %s SNPs removed! check output %s...' % (removed, outputvcf))
def action1(args):
    """
    %prog dir

    do some tricky actions...
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (one folder plus options).
    # Walks the folder recursively, samples roughly 1/num of the files at
    # random, prints each sampled path and reads its first line.
    # Exits with the help text unless exactly one positional argument is given.
    p = OptionParser(action1.__doc__)
    p.add_option("--num", default='10', help="one num-th files will be read.")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    folder, = args

    all_fns = []
    for dirpath, _dirnames, filenames in os.walk(folder):
        all_fns.extend(os.path.join(dirpath, name) for name in filenames)

    # Never ask random.sample for more items than exist (happens for --num < 1).
    n_pick = min(len(all_fns), int(np.ceil(len(all_fns) / float(opts.num))))
    part_fns = random.sample(all_fns, n_pick)
    for picked in part_fns:
        print(picked)
        # Read from the sampled file itself; previously the last file found by
        # os.walk was opened on every iteration. Binary mode avoids decode
        # errors on non-text files, and the with-block always closes the handle.
        with open(picked, 'rb') as f:
            f.readline()
    print('run away from crim scene !!!')
def FilterMissing(args):
    """
    %prog FilterMissing input_hmp

    Remove SNPs with high missing rate
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(FilterMissing.__doc__)
    parser.add_option(
        '--missing_cutoff', default=0.7, type='float',
        help='specify the missing rate cutoff. SNPs higher than this cutoff will be removed.')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    inputhmp, = args

    # Only the file name of the input is kept, so the result is written
    # relative to the current working directory.
    outputhmp = Path(inputhmp).name.replace('.hmp', '_mis%s.hmp' % opts.missing_cutoff)
    hmp = ParseHmp(inputhmp)
    removed = 0
    with open(outputhmp, 'w') as out:
        out.write(hmp.headerline)
        progress = tqdm(hmp.Missings, total=hmp.numSNPs)
        for record, miss in progress:
            if miss <= opts.missing_cutoff:
                out.write(record)
            else:
                removed += 1
            # Third whitespace-separated field of the record is shown as the chromosome.
            progress.set_description('processing chromosome %s' % record.split()[2])
    print('Done! %s SNPs removed! check output %s...' % (removed, outputhmp))
def hyp2arr_slurms(args):
    '''
    %prog hyp2arr_slurms in_dir out_dir

    generate hyp2arr slurm jobs for all folders under specified dir
    '''
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<folder>.hyp2arr.slurm' file per matching entry of in_dir into
    # the current working directory. Exits with a message when out_dir is
    # missing, and with the help text when the positional count is wrong.
    p = OptionParser(hyp2arr_slurms.__doc__)
    p.add_option('--pattern', default='*',
                 help='hyper dir pattern for folders under dir')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        # The directory name is now interpolated into the message.
        sys.exit('%s does not exist...' % out_dir)

    folders = list(Path(in_dir).glob(opts.pattern))
    print('%s hyper folders found' % len(folders))
    for hyp_dir in folders:
        # A separate name is used so the in_dir argument is not overwritten.
        hyp_in_dir = str(hyp_dir / 'Hyp_SV_90')
        # Spaces are replaced so the folder name is safe as job/file name.
        out_fn = hyp_dir.name.replace(' ', '_')
        out_fn_path = out_path / out_fn
        cmd = 'python -m schnablelab.CNN.Preprocess hyp2arr %s %s' % (hyp_in_dir, out_fn_path)
        print(cmd)
        header = Slurm_header % (10, 5000, out_fn, out_fn, out_fn)
        header += 'conda activate MCY\n'
        header += cmd
        with open('%s.hyp2arr.slurm' % out_fn, 'w') as f:
            f.write(header)
def FilterHetero(args):
    """
    %prog FilterHetero input_hmp

    Remove bad and high heterizygous loci (coducting Missing and MAF first)
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(FilterHetero.__doc__)
    parser.add_option(
        '--het_cutoff', default=0.1, type='float',
        help='specify the heterozygous rate cutoff, SNPs higher than this cutoff will be removed.')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    inputhmp, = args

    # Only the file name of the input is kept, so the result is written
    # relative to the current working directory.
    outputhmp = Path(inputhmp).name.replace('.hmp', '_het%s.hmp' % opts.het_cutoff)
    hmp = ParseHmp(inputhmp)
    removed = 0
    with open(outputhmp, 'w') as out:
        out.write(hmp.headerline)
        progress = tqdm(hmp.Heteros, total=hmp.numSNPs)
        for record, het in progress:
            if het <= opts.het_cutoff:
                out.write(record)
            else:
                removed += 1
            # Third whitespace-separated field of the record is shown as the chromosome.
            progress.set_description('processing chromosome %s' % record.split()[2])
    print('Done! %s SNPs removed! check output %s...' % (removed, outputhmp))
def only_ALT(args):
    """
    %prog in_dir out_dir

    filter number of ALT using bcftools
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<prefix>.alt1.slurm' file per matching vcf into the current
    # working directory; the job writes '<prefix>.alt1.vcf' into out_dir.
    # Exits with a message when out_dir is missing, and with the help text when
    # the positional argument count is wrong.
    p = OptionParser(only_ALT.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        # The directory name is now interpolated into the message.
        sys.exit('%s does not exist...' % out_dir)

    for vcffile in Path(in_dir).glob(opts.pattern):
        # The loop variable is vcffile; the prefix used to be derived from an
        # undefined name `vcf`, which raised NameError on the first file.
        prefix = '.'.join(vcffile.name.split('.')[0:-1])
        # The filtered vcf now lands in out_dir, which was validated above but
        # never used before.
        new_f = out_path / (prefix + '.alt1.vcf')
        cmd = "bcftools view -i 'N_ALT=1' %s > %s" % (vcffile, new_f)
        header = Slurm_header % (opts.time, opts.memory, prefix, prefix, prefix)
        # Module name corrected from 'bacftools' to match the command that is run.
        header += 'ml bcftools\n'
        header += cmd
        with open('%s.alt1.slurm' % prefix, 'w') as f:
            f.write(header)
        print('slurm file %s.alt1.slurm has been created, you can sbatch your job file.' % prefix)
def FilterMAF(args):
    """
    %prog FilterMAF input_hmp

    Remove rare MAF SNPs (conducting Missing filter first)
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(FilterMAF.__doc__)
    parser.add_option(
        '--MAF_cutoff', default=0.01, type='float',
        help='specify the MAF rate cutoff, SNPs lower than this cutoff will be removed.')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    inputhmp, = args

    # Only the file name of the input is kept, so the result is written
    # relative to the current working directory.
    outputhmp = Path(inputhmp).name.replace('.hmp', '_maf%s.hmp' % opts.MAF_cutoff)
    hmp = ParseHmp(inputhmp)
    removed = 0
    with open(outputhmp, 'w') as out:
        out.write(hmp.headerline)
        progress = tqdm(hmp.MAFs, total=hmp.numSNPs)
        for record, maf in progress:
            if maf >= opts.MAF_cutoff:
                out.write(record)
            else:
                removed += 1
            # Third whitespace-separated field of the record is shown as the chromosome.
            progress.set_description('processing chromosome %s' % record.split()[2])
    print('Done! %s SNPs removed! check output %s...' % (removed, outputhmp))
def IndexVCF(args):
    """
    %prog IndexVCF in_dir out_dir

    index vcf using bgzip and tabix
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<sample>.idxvcf.slurm' file per matching vcf into the current
    # working directory; the job writes '<vcf name>.gz' into out_dir and then
    # runs tabix on it. Exits with a message when out_dir is missing, and with
    # the help text when the positional argument count is wrong.
    p = OptionParser(IndexVCF.__doc__)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        # The directory name is now interpolated into the message.
        sys.exit('%s does not exist...' % out_dir)

    for vcf in Path(in_dir).glob(opts.pattern):
        # File name without its last extension, used as the job name.
        sm = '.'.join(vcf.name.split('.')[0:-1])
        out_fn_path = out_path / (vcf.name + '.gz')
        cmd1 = 'bgzip -c %s > %s\n' % (vcf, out_fn_path)
        cmd2 = 'tabix -p vcf %s\n' % (out_fn_path)
        header = Slurm_header % (10, 20000, sm, sm, sm)
        header += 'ml tabix\n'
        header += cmd1
        header += cmd2
        with open('%s.idxvcf.slurm' % sm, 'w') as f:
            f.write(header)
def DownsamplingSNPs(args):
    """
    %prog DownsamplingSNPs input_hmp

    Pick up some SNPs from a huge hmp file using Linux sed command
    """
    # The docstring above is handed to OptionParser, so it is runtime text;
    # its usage line now names this command.
    #
    # args: list of command-line tokens (one hmp file plus options).
    # Prints the sed command and, unless --disable_slurm is given, forwards it
    # to put2slurm together with the parsed options.
    # Exits with the help text unless exactly one positional argument is given.
    p = OptionParser(DownsamplingSNPs.__doc__)
    p.add_option('--downscale', default=10,
                 help='specify the downscale level')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=DownsamplingSNPs.__name__)
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    inputhmp, = args

    # The option is registered as --downscale; reading `opts.downsize` raised
    # AttributeError on every run.
    outputhmp = Path(inputhmp).name.replace('.hmp', '_ds%s.hmp' % opts.downscale)
    # sed's first~step address prints line 1 and then every downscale-th line.
    cmd = "sed -n '1~%sp' %s > %s" % (opts.downscale, inputhmp, outputhmp)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
def trim_single(args):
    """
    %prog trim in_dir out_dir

    quality control on the single end reads
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    #
    # args: list of command-line tokens (in_dir, out_dir and options).
    # Writes one '<prefix>.SE.trim.slurm' file per matching fastq into the
    # current working directory. Exits with a message when out_dir is missing,
    # and with the help text when the positional argument count is wrong.
    #
    # The help is built from this function's own docstring; it previously used
    # trim_paired.__doc__ and so described paired reads.
    p = OptionParser(trim_single.__doc__)
    p.add_option('--pattern', default='*_Unpaired.fastq',
                 help='filename pattern for all single end reads')
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)

    for fn in glob('%s/%s' % (in_dir, opts.pattern)):
        fn_path = Path(fn)
        # Everything before the last '_' of the file name identifies the job.
        prf = '_'.join(fn_path.name.split('_')[0:-1]) + '.SE'
        print(prf)
        fn_out = fn_path.name.replace('Unpaired.fastq', 'trim.Unpaired.fastq')
        cmd = 'java -jar $TM_HOME/trimmomatic.jar SE -phred33 %s %s TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40' % (
            fn, str(out_path / fn_out))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % (prf), 'w') as f:
            f.write(header)
def ped2bed(args):
    """
    %prog ped_prefix

    Convert plink ped/map to binary bed/bim/fam format using Plink
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(ped2bed.__doc__)
    parser.add_option(
        '--disable_slurm', default=False, action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    parser.add_slurm_opts(job_prefix=ped2bed.__name__)
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    ped_prefix, = args

    # Cluster variant: plink comes from the loaded module.
    cmd_header = 'ml plink'
    cmd = 'plink --noweb --file %s --make-bed --out %s' % (ped_prefix, ped_prefix)
    print('cmd on HCC:\n%s\n%s' % (cmd_header, cmd))

    # Local variant: same arguments, executable taken from the `plink` global.
    cmd_local = '%s --noweb --file %s --make-bed --out %s' % (
        plink, ped_prefix, ped_prefix)
    print('cmd on local desktop:\n%s\n' % cmd_local)

    if opts.disable_slurm:
        return
    slurm_settings = vars(opts)
    slurm_settings['cmd_header'] = cmd_header
    put2slurm([cmd], slurm_settings)
def IndePvalue(args):
    """
    %prog IndePvalue plink_bed_prefix output

    calculate the number of independent SNPs (Me) and the bonferroni pvalue
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(IndePvalue.__doc__)
    parser.set_slurm_opts(jn=True)
    parser.add_option(
        '--cutoff', default='0.05', choices=('0.01', '0.05'),
        help='choose the pvalue cutoff for the calculation of bonferroni pvalue')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    bed, output = args

    # Java heap value: the requested memory divided by 1000, minus 2.
    java_heap = int(opts.memory / 1000) - 2
    cmd = 'java -Xmx%sg -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (
        java_heap, GEC, bed, output)

    # The module line is appended to the template before it is filled in.
    template = Slurm_header + 'module load java/1.8\n'
    job = opts.prefix
    script = template % (opts.time, opts.memory, job, job, job)
    script += cmd
    with open('%s.Me_SNP.slurm' % output, 'w') as fh:
        fh.write(script)
    print('slurm file %s.Me_SNP.slurm has been created, you can sbatch your job file.' % output)
def IndePvalue(args):
    """
    %prog IndePvalue bed_prefix output_fn

    Estimate number of idenpendent SNPs using GEC
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(IndePvalue.__doc__)
    parser.add_option(
        '--disable_slurm', default=False, action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    parser.add_slurm_opts(job_prefix=IndePvalue.__name__)
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    bed_prefix, output_fn = args

    cmd = 'java -Xmx18g -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (
        GEC, bed_prefix, output_fn)
    print('cmd:\n%s\n' % cmd)

    if opts.disable_slurm:
        return
    slurm_settings = vars(opts)
    # The memory setting is overridden with a fixed value for this job.
    slurm_settings['memory'] = 20000
    put2slurm([cmd], slurm_settings)
def keras_cnn(args):
    """
    %prog train_dir val_dir num_category model_name_prefix

    Run vgg model
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(keras_cnn.__doc__)
    parser.add_option('--epoch', default=500, help='number of epoches')
    parser.add_option(
        '--lr_n', default=1, type='int',
        help='train model with differnt learning rates. if n=1: set lr to 0.001. if n>1: try differnt lr from 1e-2 to 1e-5 n times')
    parser.set_slurm_opts(gpu=True)
    opts, args = parser.parse_args(args)
    if len(args) != 4:
        sys.exit(not parser.print_help())
    # mnp: model name prefix
    train_dir, val_dir, numC, mnp = args

    out_fns = fns(mnp, n=opts.lr_n)
    # One slurm job file per learning rate / model name pair.
    for idx in range(int(opts.lr_n)):
        learning_rate = out_fns.lrs[idx]
        model_name = out_fns.model_name[idx]
        cmd = 'python -m schnablelab.CNN.keras_vgg %s %s %s %s %s %s' % (
            train_dir, val_dir, numC, learning_rate, opts.epoch, model_name)
        script = Slurm_gpu_header % (
            opts.time, opts.memory, model_name, model_name, model_name)
        script += 'module load anaconda\nsource activate MCY\n'
        script += cmd
        with open('%s.slurm' % model_name, 'w') as fh:
            fh.write(script)
        print('slurm file %s.slurm has been created, you can sbatch your job file.' % model_name)
def genPCA(args):
    """
    %prog genPCA input_hmp N

    Generate first N PCs using tassel
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(genPCA.__doc__)
    parser.add_option(
        '--disable_slurm', default=False, action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    parser.add_slurm_opts(job_prefix=genPCA.__name__)
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    hmpfile, N = args

    # Export prefix: the input file name with '.hmp' removed.
    out_prefix = Path(hmpfile).name.replace('.hmp', '')
    cmd_header = 'ml java/1.8\nml tassel/5.2'
    cmd = 'run_pipeline.pl -Xms28g -Xmx29g -fork1 -h %s -PrincipalComponentsPlugin -ncomponents %s -covariance true -endPlugin -export %s_%sPCA -runfork1\n' % (
        hmpfile, N, out_prefix, N)
    print('cmd:\n%s\n%s' % (cmd_header, cmd))

    if opts.disable_slurm:
        return
    slurm_settings = vars(opts)
    # The memory setting is overridden with a fixed value for this job.
    slurm_settings['memory'] = 30000
    slurm_settings['cmd_header'] = cmd_header
    put2slurm([cmd], slurm_settings)
def divide(args):
    '''
    %prog divide input_dir output_dir_prefix
    '''
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(divide.__doc__)
    parser.add_option('--pattern', default='*.jpg', help='file name pattern')
    parser.add_option('--nimgs_per_folder', type='int', default=700,
                      help='~ number of images (<1000) in each smaller folder')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    input_dir, out_prefix = args

    df = GenDataFrameFromPath(Path(input_dir), pattern=opts.pattern)
    n_images = df.shape[0]
    n_folders = math.ceil(n_images / opts.nimgs_per_folder)
    print('%s will be divided to %s datasets' % (n_images, n_folders))

    # Output folders are numbered from 1: <out_prefix>_1, <out_prefix>_2, ...
    chunks = cutlist(df['fnpath'].values, n_folders)
    for n, (_, grp) in enumerate(chunks, start=1):
        output_folder = Path('%s_%s' % (out_prefix, n))
        print(output_folder, grp.shape[0])
        if not output_folder.exists():
            output_folder.mkdir()
        for img_path in grp:
            copyfile(img_path, output_folder / img_path.name)
def vcf2hmp(args):
    """
    %prog vcf2hmp vcf

    convert vcf generated from beagle to hmp format using tassel
    """
    # The docstring above is handed to OptionParser, so it is runtime text.
    parser = OptionParser(vcf2hmp.__doc__)
    parser.set_slurm_opts(jn=True)
    parser.add_option('--version', default='2', choices=('1', '2'),
                      help='specify the hmp type. 1: hyploid. 2: diploid')
    opts, args = parser.parse_args(args)
    if not args:
        sys.exit(not parser.print_help())
    vcffile, = args
    # Input file name without its last extension; names the slurm file.
    prefix = '.'.join(vcffile.split('.')[0:-1])

    # The two variants differ only in the tassel export type.
    if opts.version == '2':
        cmd = '%s -Xms512m -Xmx10G -fork1 -vcf %s -export -exportType HapmapDiploid\n' % (tassel, vcffile)
    else:
        cmd = '%s -Xms512m -Xmx10G -fork1 -vcf %s -export -exportType Hapmap\n' % (tassel, vcffile)

    job = opts.prefix
    script = Slurm_header % (opts.time, opts.memory, job, job, job)
    script += 'module load java/1.8\n'
    script += cmd
    with open('%s.vcf2hmp.slurm' % prefix, 'w') as fh:
        fh.write(script)
    print('slurm file %s.vcf2hmp.slurm has been created, you can submit your job file.' % prefix)