def BatchFilterMAF(args):
    """
    %prog in_dir

    apply FilterMAF on multiple vcf files
    """
    p = OptionParser(BatchFilterMAF.__doc__)
    p.add_option('--pattern', default='*.vcf',
                 help="file pattern of vcf files in the 'in_dir'")
    p.add_option('--maf_cutoff', default='0.01',
                 help='maf cutoff, SNPs lower than this cutoff will be removed')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchFilterMAF.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, = args
    in_dir_path = Path(in_dir)
    vcfs = in_dir_path.glob(opts.pattern)
    cmds = []
    for vcf in vcfs:
        cmd = "python -m schnablelab.SNPcalling.base FilterMAF %s --maf_cutoff %s" % (vcf, opts.maf_cutoff)
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print('check %s for all the commands!' % cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
def genPCA(args):
    """
    %prog genPCA input_hmp N

    Generate first N PCs using tassel
    """
    p = OptionParser(genPCA.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genPCA.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmpfile, N, = args
    out_prefix = Path(hmpfile).name.replace('.hmp', '')
    cmd_header = 'ml java/1.8\nml tassel/5.2'
    cmd = 'run_pipeline.pl -Xms28g -Xmx29g -fork1 -h %s -PrincipalComponentsPlugin -ncomponents %s -covariance true -endPlugin -export %s_%sPCA -runfork1\n' % (hmpfile, N, out_prefix, N)
    print('cmd:\n%s\n%s' % (cmd_header, cmd))
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['memory'] = 30000
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
def IndePvalue(args):
    """
    %prog IndePvalue bed_prefix output_fn

    Estimate number of independent SNPs using GEC
    """
    p = OptionParser(IndePvalue.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=IndePvalue.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    bed_prefix, output_fn = args
    cmd = 'java -Xmx18g -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (GEC, bed_prefix, output_fn)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['memory'] = 20000
        put2slurm([cmd], put2slurm_dict)
def DownsamplingSNPs(args):
    """
    %prog downsampling input_hmp

    Pick a subset of SNPs from a large hmp file using the Linux sed command
    """
    p = OptionParser(DownsamplingSNPs.__doc__)
    p.add_option('--downscale', default=10,
                 help='specify the downscale level')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=DownsamplingSNPs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp', '_ds%s.hmp' % opts.downscale)
    cmd = "sed -n '1~%sp' %s > %s" % (opts.downscale, inputhmp, outputhmp)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
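# Illustrative sketch (not part of the pipeline): `sed -n '1~Np'` keeps line 1
# (the hmp header row) and then every N-th line after it, i.e. lines 1, 1+N,
# 1+2N, ... The equivalent selection in pure Python, with a hypothetical
# file name, can be used to sanity-check a downsampled file:
def _downsampling_sketch(fn='input.hmp', n=10):
    """return the lines that `sed -n '1~np' fn` would keep"""
    with open(fn) as f:
        return [line for i, line in enumerate(f) if i % n == 0]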
def ped2bed(args):
    """
    %prog ped_prefix

    Convert plink ped/map to binary bed/bim/fam format using Plink
    """
    p = OptionParser(ped2bed.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=ped2bed.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ped_prefix, = args
    cmd_header = 'ml plink'
    cmd = 'plink --noweb --file %s --make-bed --out %s' % (ped_prefix, ped_prefix)
    print('cmd on HCC:\n%s\n%s' % (cmd_header, cmd))
    cmd_local = '%s --noweb --file %s --make-bed --out %s' % (plink, ped_prefix, ped_prefix)
    print('cmd on local desktop:\n%s\n' % cmd_local)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
def Batch2JPG(args):
    '''
    %prog Batch2JPG in_dir out_dir

    apply toJPG on a large number of images
    '''
    p = OptionParser(Batch2JPG.__doc__)
    p.add_option('--pattern', default='*.png',
                 help="file pattern of png files under the 'in_dir'")
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=Batch2JPG.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = in_dir_path.glob(opts.pattern)
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\\ ')  # escape spaces for the shell
        cmd = "python -m schnablelab.ImageProcessing.base toJPG "\
              f"{img_fn} --out_dir {out_dir}"
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print('check %s for all the commands!' % cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
def calculateLD(args):
    """
    %prog vcf_fn/plink_prefix genome_size(Mb) num_SNPs

    calculate LD using Plink

    args:
        vcf_fn/plink_prefix: specify either a vcf/vcf.gz file or the prefix of plink bed/bim/fam files.
        genome_size(Mb): the size of the reference genome in Mb. For reference: sorghum 684Mb
        num_SNPs: the number of SNPs in the genotype file.
    """
    p = OptionParser(calculateLD.__doc__)
    p.add_option('--maf_cutoff', default='0.01',
                 help='only use SNPs with a MAF higher than this cutoff to calculate LD')
    p.add_option('--max_distance', type='int', default=1000000,
                 help='the maximum distance of a pair of SNPs to calculate LD (bp)')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=calculateLD.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_fn, g_size, n_snps, = args
    in_fn, g_size, n_snps = Path(in_fn), int(g_size) * 1000000, int(n_snps)
    if in_fn.name.endswith('.vcf') or in_fn.name.endswith('.vcf.gz'):
        geno_input = f'--vcf {in_fn}'
    else:
        geno_input = f'--bfile {in_fn}'
    # LD windows of 10, 100, 1000, ... SNPs and the physical distance
    # (average SNP spacing * window size) each of them covers
    n = 10
    ld_window, ld_window_bp = [], []
    while True:
        ld_window.append(n)
        dist = g_size // n_snps * n
        ld_window_bp.append(dist)
        n *= 10
        if dist >= 1000000:
            break
    out_fn = Path(in_fn).name.split('.')[0]
    cmds = []
    cmd = f'plink {geno_input} --r2 --ld-window 10 --ld-window-kb {ld_window_bp[0]//1000} --ld-window-r2 0 --maf {opts.maf_cutoff} --out {out_fn}'
    cmds.append(cmd)
    for win_snp, win_bp in zip(ld_window[1:], ld_window_bp[1:]):
        prob = 10 / win_snp
        cmd = f'plink {geno_input} --thin {prob} --r2 --ld-window 10 --ld-window-kb {win_bp//1000} --ld-window-r2 0 --maf {opts.maf_cutoff} --out {out_fn}.thin{prob}'
        cmds.append(cmd)
        print(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml plink'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
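# Illustrative sketch (not part of the pipeline) of the window arithmetic in
# calculateLD, using the sorghum genome size from the docstring (684 Mb) and a
# hypothetical 1,000,000 SNPs: the average SNP spacing is 684 bp, so the loop
# yields windows of 10/100/1000/10000 SNPs spanning roughly 6.8 kb / 68 kb /
# 684 kb / 6.84 Mb, and stops once the span reaches 1 Mb.
def _ld_window_sketch(g_size=684 * 1000000, n_snps=1000000):
    n, pairs = 10, []
    while True:
        dist = g_size // n_snps * n
        pairs.append((n, dist))
        n *= 10
        if dist >= 1000000:
            break
    return pairs  # [(10, 6840), (100, 68400), (1000, 684000), (10000, 6840000)]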
def LinkedSNPs(args):
    """
    %prog input_SNPlist_file bed_prefix r2_cutoff output_prefix

    extract linked SNPs using plink.
    """
    p = OptionParser(LinkedSNPs.__doc__)
    p.add_option('--col_idx', type='int', default=0,
                 help='specify which column contains SNP ID (0-based)')
    p.add_option('--header', default='yes', choices=('yes', 'no'),
                 help="specify 'no' if there is no header in the input SNPlist file")
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=LinkedSNPs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    SNPlist_fn, bedprefix, cutoff, output_prefix, = args
    if opts.header == 'yes':
        df = pd.read_csv(SNPlist_fn, delim_whitespace=True, usecols=[opts.col_idx])
    else:
        df = pd.read_csv(SNPlist_fn, delim_whitespace=True, usecols=[opts.col_idx], header=None)
    pre = Path(SNPlist_fn).name.split('.')[0]
    df.to_csv('%s.SNPs_list.csv' % pre, index=False, header=None)
    cmd_local = '%s --bfile %s --r2 --ld-snp-list %s.SNPs_list.csv --ld-window-kb 5000 --ld-window 99999 --ld-window-r2 %s --noweb --out %s\n' % (plink, bedprefix, pre, cutoff, output_prefix)
    print('cmd on local:\n%s' % cmd_local)
    cmd_header = 'ml plink'
    cmd_hcc = 'plink --bfile %s --r2 --ld-snp-list %s.SNPs_list.csv --ld-window-kb 5000 --ld-window 99999 --ld-window-r2 %s --noweb --out %s\n' % (bedprefix, pre, cutoff, output_prefix)
    print('cmd on HCC:\n%s\n%s' % (cmd_header, cmd_hcc))
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd_hcc], put2slurm_dict)
def align_pe(args):
    """
    %prog align_pe ref_index_base fq_fns.csv output_dir

    paired-end alignment using bwa.

    args:
        ref_index_base: the prefix of reference index files
        fq_fns.csv: the csv file including parsed fq files from the pre_fqs function.
        output_dir: where the generated bam files are saved
    """
    p = OptionParser(align_pe.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=align_pe.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref_base, fq_csv, output_dir = args
    output_dir = Path(output_dir)
    if not output_dir.exists():
        sys.exit(f'output directory {output_dir} does not exist!')
    df = pd.read_csv(fq_csv)
    df_R1, df_R2 = df[::2], df[1::2]
    if df_R1.shape[0] != df_R2.shape[0]:
        sys.exit('number of R1 and R2 files are not consistent!')
    cmds = []
    for (_, r1), (_, r2) in zip(df_R1.iterrows(), df_R2.iterrows()):
        r1_fn, r2_fn, sm = Path(r1['fnpath']), Path(r2['fnpath']), r1['sm']
        # R1 and R2 file names are expected to differ at exactly one character (1 vs 2)
        r1_fn_arr, r2_fn_arr = np.array(list(r1_fn.name)), np.array(list(r2_fn.name))
        bools = (r1_fn_arr != r2_fn_arr)
        if bools.sum() != 1:
            print(r1_fn, r2_fn)
            sys.exit('check fq file names!')
        idx = np.argmax(bools)
        prefix = re.split('[-_]R', r1_fn.name[:idx])[0]
        RG = r"'@RG\tID:%s\tSM:%s'" % (sm, sm)
        bam_fn = f'{prefix}.pe.sorted.bam'
        cmd = f"bwa mem -t {opts.ncpus_per_node} -R {RG} {ref_base} {r1_fn} {r2_fn} | samtools sort -@{opts.ncpus_per_node} -o {output_dir/bam_fn} -"
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml bwa\nml samtools'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
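# Illustrative sketch (hypothetical file names) of how align_pe pairs R1/R2
# reads and derives the output bam prefix: the two names must differ at exactly
# one character (the 1/2 read number), and everything before the '-R'/'_R' read
# tag becomes the prefix of the sorted bam file.
def _pe_prefix_sketch(r1='P001-WA01_R1.fastq.gz', r2='P001-WA01_R2.fastq.gz'):
    bools = np.array(list(r1)) != np.array(list(r2))
    assert bools.sum() == 1        # names differ only at the read-number character
    idx = np.argmax(bools)         # position of that difference
    return re.split('[-_]R', r1[:idx])[0]  # -> 'P001-WA01'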
def pre_ref(args):
    """
    %prog pre_ref ref.fa

    index the reference genome sequences using bwa, samtools, and picard tools
    """
    p = OptionParser(pre_ref.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=pre_ref.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref_fn, = args
    ref_fn, ref_dir = Path(ref_fn), Path(ref_fn).parent
    if not ref_fn.exists():
        sys.exit(f'reference file {ref_fn} does not exist!')
    ref_prefix = re.split('.fa|.fasta', ref_fn.name)[0]
    bwa_idx_exs = ('.amb', '.ann', '.bwt', '.pac', '.sa')
    bwa_bool = sum([(ref_dir / (ref_prefix + bie)).exists() for bie in bwa_idx_exs])
    cmds = []
    if bwa_bool != 5:
        print('bwa index does not exist...')
        cmd = f'ml bwa\nbwa index -p {ref_dir/ref_prefix} {ref_fn}'
        cmds.append(cmd)
    if not (ref_dir / (ref_fn.name + '.fai')).exists():
        print('fai index does not exist...')
        cmd = f'ml samtools\nsamtools faidx {ref_fn}'
        cmds.append(cmd)
    dict_fn = ref_dir / (ref_prefix + '.dict')
    if not dict_fn.exists():
        print('dict index does not exist...')
        cmd = f'ml gatk4/4.1\ngatk CreateSequenceDictionary -R {ref_fn} -O {dict_fn}'
        cmds.append(cmd)
    if len(cmds) > 0:
        if not opts.disable_slurm:
            put2slurm_dict = vars(opts)
            put2slurm(cmds, put2slurm_dict)
        else:
            print('commands running on local:\n%s' % ('\n'.join(cmds)))
    else:
        print('All reference index files already exist!')
def genoGVCFs(args):
    """
    %prog genoGVCFs ref.fa genomicDB_dir out_dir

    create the raw VCFs from GenomicsDB datastores

    args:
        ref.fa: the reference sequence fasta file
        genomicDB_dir: the root directory of the genomicDB workspace
        out_dir: where the vcf files will be saved
    """
    p = OptionParser(genoGVCFs.__doc__)
    p.add_option('--gatk_tmp_dir', default='./gatk_tmp',
                 help='temporary directory to use')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genoGVCFs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, db_dir, out_dir, = args
    out_dir_path = Path(out_dir)
    if not out_dir_path.exists():
        print(f'output directory {out_dir_path} does not exist, creating...')
        out_dir_path.mkdir()
    mem = int(opts.memory) // 1024 - 1
    cmds = []
    for db in Path(db_dir).glob('*'):
        if db.is_dir():
            region = db.name
            vcf_fn = f"{region}.vcf.gz"
            cmd = f"gatk --java-options '-Xmx{mem}g' GenotypeGVCFs "\
                  f"-R {ref} -V gendb://{db} -O {out_dir_path/vcf_fn} --tmp-dir={opts.gatk_tmp_dir}"
            cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml gatk4/4.1'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
def BatchResize(args):
    '''
    %prog BatchResize in_dir out_dir

    apply BatchResize on a large number of images
    '''
    p = OptionParser(BatchResize.__doc__)
    p.add_option('--pattern', default='*.png',
                 help="file pattern of png files under the 'in_dir'")
    p.add_option('--output_dim', default='1227,1028',
                 help='the dimension (width,height) after resizing')
    p.add_option('--to_jpg', default=False, action='store_true',
                 help='save the resized image in jpg format')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchResize.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = in_dir_path.glob(opts.pattern)
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\\ ')  # escape spaces for the shell
        cmd = 'python -m schnablelab.ImageProcessing.base Resize '\
              f'{img_fn} --output_dim {opts.output_dim} --out_dir {out_dir}'
        if opts.to_jpg:
            cmd += ' --to_jpg'
        cmds.append(cmd)
    fn_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    with open(fn_sh, 'w') as f:
        for i in cmds:
            f.write(i + '\n')
    print('check %s for all the commands!' % fn_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
def markdupBam(args):
    """
    %prog markdupBam input_dir output_dir

    mark potential PCR duplicates
    output bams will be indexed automatically

    args:
        input_dir: where the sorted bam files are located
        output_dir: where the duplicate-marked bam files should be saved
    """
    p = OptionParser(markdupBam.__doc__)
    p.add_option('--bam_fn_pattern', default='*.sorted.bam',
                 help='pattern of bam files')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=markdupBam.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    in_dir_path, out_dir_path = Path(in_dir), Path(out_dir)
    if not out_dir_path.exists():
        sys.exit(f'output directory {out_dir_path} does not exist!')
    bams = in_dir_path.glob(opts.bam_fn_pattern)
    cmds = []
    for bam in bams:
        mdup_bam = bam.name.replace('.bam', '.mdup.bam')
        cmd = f'samtools markdup {bam} {out_dir_path/mdup_bam}\nsamtools index {out_dir_path/mdup_bam}'
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml samtools'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
def genKinship(args):
    """
    %prog genKinship genotype.mean

    Calculate kinship matrix file using gemma
    """
    p = OptionParser(genKinship.__doc__)
    p.add_option('--type', default='1', choices=('1', '2'),
                 help='specify the way to calculate the relatedness, 1: centered; 2: standardized')
    p.add_option('--out_dir', default='.',
                 help='specify the output dir')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genKinship.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    geno_mean, = args
    # generate a fake bimbam phenotype based on the genotype
    with open(geno_mean) as f:
        num_SMs = len(f.readline().split(',')[3:])
    mean_prefix = geno_mean.replace('.mean', '')
    tmp_pheno = '%s.tmp.pheno' % mean_prefix
    with open(tmp_pheno, 'w') as f1:
        for i in range(num_SMs):
            f1.write('sm%s\t%s\n' % (i, 20))
    # gemma is the location of the gemma executable file
    cmd = '%s -g %s -p %s -gk %s -outdir %s -o gemma.centered.%s' \
        % (gemma, geno_mean, tmp_pheno, opts.type, opts.out_dir, Path(mean_prefix).name)
    print('The kinship command:\n%s' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
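# Illustrative sketch (hypothetical genotype line, not used by the pipeline):
# genKinship assumes a BIMBAM mean-genotype file whose first three comma-separated
# fields are the SNP id and its two alleles, followed by one dosage column per
# sample; the fake phenotype above is just a constant value per sample so that
# `gemma -gk` has a phenotype file to read.
def _fake_pheno_sketch(geno_line='rs1,A,T,0.02,1.87'):
    num_SMs = len(geno_line.split(',')[3:])  # 2 samples in this hypothetical line
    return ''.join('sm%s\t%s\n' % (i, 20) for i in range(num_SMs))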
def BatchCombo(args):
    '''
    %prog fn_core_csv step_n in_dir out_dir

    distribute Combo jobs on HCC

    args:
        fn_core_csv: csv file with a 'core_fn' column listing the file name cores
        step_n: the step in range(st, ed, step) used to split all fn_cores
    '''
    p = OptionParser(BatchCombo.__doc__)
    p.add_option('--pattern', default='_Vis_SV_%s.Crp.jpg',
                 help="The pattern of file suffix under the 'in_dir'")
    p.add_option('--resize', default='150,150',
                 help='the resolution after resizing for each piece of image')
    p.add_option('--ncpu', default=1, type='int',
                 help='CPU cores if using multiprocessing')
    p.add_slurm_opts(job_prefix=BatchCombo.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    fn_core_csv, step_n, in_dir, out_dir, = args
    df = pd.read_csv(fn_core_csv, usecols=['core_fn'])
    cuts = deque(range(0, len(df), int(step_n)))
    cuts.popleft()
    cmds = []
    for idx, _df in enumerate(np.split(df, cuts), start=1):
        new_csv_fn = fn_core_csv + '_%s' % idx
        _df.to_csv(new_csv_fn, index=False)
        cmd = "python -m schnablelab.ImageProcessing.base Combo "\
              f"{new_csv_fn} {in_dir} {out_dir} --pattern {opts.pattern} --resize {opts.resize} --ncpu {opts.ncpu}"
        cmds.append(cmd)
    put2slurm_dict = vars(opts)
    put2slurm(cmds, put2slurm_dict)
def hmp2vcf(args):
    """
    %prog hmp2vcf input_hmp

    convert hmp to vcf format using tassel
    """
    p = OptionParser(hmp2vcf.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=hmp2vcf.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmpfile, = args
    cmd_header = 'ml tassel/5.2'
    cmd = 'run_pipeline.pl -Xms512m -Xmx10G -fork1 -h %s -export -exportType VCF\n' % (hmpfile)
    print('cmd:\n%s\n%s' % (cmd_header, cmd))
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
def BatchUpload(args):
    '''
    %prog BatchUpload dir1 dir2... project_id subject_id

    upload multiple datasets
    '''
    p = OptionParser(BatchUpload.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchUpload.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    *img_dirs, p_id, s_id = args
    cmds = []
    for img_dir in img_dirs:
        cmd = f'python -m schnablelab.Zooniverse.Zookeeper upload {img_dir} {p_id} {img_dir} --subject_id {s_id}'
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
def indexBam(args):
    """
    %prog indexBam dir1 ...

    index bam files using samtools index

    dir1: where bam files are located
        add more directories if bam files are located in different directories
    """
    p = OptionParser(indexBam.__doc__)
    p.add_option('--bam_fn_pattern', default='*.mdup.bam',
                 help='file pattern of preprocessed bam files')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=indexBam.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    cmds = []
    for bam_dir in args:
        bam_dir = Path(bam_dir)
        if not bam_dir.exists():
            sys.exit(f'{bam_dir} does not exist!')
        bams = bam_dir.glob(opts.bam_fn_pattern)
        # accumulate commands across all provided directories
        cmds.extend([f'samtools index {bam}' for bam in bams])
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml samtools'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
def prediction(args):
    """
    %prog prediction saved_model test_csv test_dir output

    Args:
        saved_model: saved model with either a .pt or .pth file extension
        test_csv: csv file (comma separated with header) containing all testing image filenames
        test_dir: directory where testing images are located
        output: csv file saving prediction results
    """
    p = OptionParser(prediction.__doc__)
    p.add_option('--inputsize', default=224, type='int',
                 help='the input size of image. At least 224 if using pretrained models')
    p.add_option('--batchsize', default=36, type='int', help='batch size')
    p.add_option('--base_mn', default='resnet18',
                 help='base model architectures: vgg16, googlenet, resnet18, resnet152...')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='run directly without generating slurm job')
    p.add_slurm_opts(job_prefix=prediction.__name__)
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    saved_model, test_csv, test_dir, output = args

    # generate slurm file
    if not opts.disable_slurm:
        cmd = "python -m schnablelab.CNN.TransLearning prediction "\
              f"{saved_model} {test_csv} {test_dir} {output} "\
              f"--batchsize {opts.batchsize} --disable_slurm "
        if opts.base_mn:
            cmd += f"--base_mn {opts.base_mn} "
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('device: %s' % device)
    if opts.base_mn:
        model, input_size = initialize_model(model_name=opts.base_mn,
                                             feature_extract=True,
                                             use_pretrained=False,
                                             inputsize=opts.inputsize)
        # turn all gradients off
        for param in model.parameters():
            param.requires_grad = False
    else:
        sys.exit('not implemented yet...')
    model.load_state_dict(torch.load(saved_model, map_location=device))
    model.eval()

    test_dataset = LeafcountingDataset(
        test_csv, test_dir,
        image_transforms(input_size=opts.inputsize)['valid'])
    test_loader = DataLoader(test_dataset, batch_size=opts.batchsize)

    ground_truths, predicts, filenames = [], [], []
    for idx, (inputs, labels, fns) in enumerate(test_loader, 1):  # fns is a tuple
        print('idx %s' % idx)
        inputs = inputs.to(device)
        print('type of inputs: %s' % (type(inputs)))
        outputs = model(inputs)
        ground_truths.append(labels.squeeze().numpy())
        filenames.append(np.array(fns))
        if torch.cuda.is_available():
            predicts.append(outputs.squeeze().to('cpu').numpy())
        else:
            predicts.append(outputs.squeeze().numpy())
    ground_truths = np.concatenate(ground_truths)
    predicts = np.concatenate(predicts)
    filenames = np.concatenate(filenames)
    df = pd.DataFrame(
        dict(zip(['fn', 'groundtruth', 'prediction'],
                 [filenames, ground_truths, predicts])))
    df.to_csv(output, index=False)
def ExtractRGBs(args):
    '''
    %prog ExtractRGBs project_folder

    extract RGB images from project folder
    '''
    p = OptionParser(ExtractRGBs.__doc__)
    p.add_option('--npy_idx',
                 help='specify the numpy file including the indices for extraction')
    p.add_option('--item_idx', default='1,2,3',
                 help='the index of sample name, date, and time in each image directory name')
    p.add_option('--out_dir', default='.',
                 help='specify the output image directory')
    p.add_option('--samples',
                 help='extract particular samples. multiple samples separated by comma without space')
    p.add_option('--dates',
                 help='extract particular dates. multiple dates separated by comma without space.')
    p.add_option('--angle', default='108',
                 help='which viewing angle are you going to extract?')
    p.add_option('--backup_angle',
                 help='specify an alternative viewing angle for RGB images if the above angle does not exist.')
    p.add_option('--copy_only', default=False, action='store_true',
                 help='only do copy without resizing and converting image format')
    p.add_option('--disable_slurm', default=False, action='store_true',
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=ExtractRGBs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    project_folder, = args
    out_dir = Path(opts.out_dir)
    if not out_dir.exists():
        print("The output directory '%s' does not exist, creating.." % out_dir)
        out_dir.mkdir()

    cmd = f'python -m schnablelab.ImageProcessing.HTP ExtractRGBs {project_folder} --out_dir {out_dir} --disable_slurm '
    npy_idx = None
    if opts.npy_idx:
        npy_idx = np.load(opts.npy_idx)
        print(npy_idx)
        cmd += f'--npy_idx {opts.npy_idx} '
    if opts.samples:
        cmd += f'--samples {opts.samples} '
    if opts.dates:
        cmd += f'--dates {opts.dates} '
    if opts.angle:
        cmd += f'--angle {opts.angle} '
    if opts.copy_only:
        cmd += '--copy_only '
    print(cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
        return

    opts.samples = opts.samples.split(',') if opts.samples else opts.samples
    opts.dates = opts.dates.split(',') if opts.dates else opts.dates
    sm_idx, date_idx, time_idx = [int(i) for i in opts.item_idx.split(',')]
    prj = ParseProject(project_folder, sm_idx, date_idx, time_idx)
    for sm, d, hms, path_img_fn in prj.RGB(folder_idx=npy_idx,
                                           samples=opts.samples,
                                           dates=opts.dates,
                                           angle=opts.angle,
                                           backup_angle=opts.backup_angle):
        angle_dir_name = path_img_fn.parts[-2]
        dest_fn = '%s_%s_%s_%s.jpg' % (sm, d, hms, angle_dir_name)
        dest = out_dir / dest_fn
        if dest.exists():
            print(f'{dest} already exists, omit!')
        else:
            if opts.copy_only:
                copyfile(path_img_fn, dest)
            else:
                Image.open(path_img_fn).convert('RGB').resize((1227, 1028)).save(dest)
def aggGVCFs(args):
    """
    %prog aggGVCFs input_dir out_dir

    aggregate GVCF files to a GenomicsDB datastore for each genomic interval

    args:
        input_dir: the directory containing all gvcf files
        out_dir: the output directory. a subdir will be created for each genomic interval
    """
    p = OptionParser(aggGVCFs.__doc__)
    p.add_option('--gvcf_fn_pattern', default='*.g.vcf',
                 help='file pattern of gvcf files')
    p.add_option('--sm_re_pattern',
                 default=r"^P[0-9]{3}[_-]W[A-Z][0-9]{2}[^a-z0-9]",
                 help='the regular expression pattern to pull sample name from filename')
    p.add_option('--gatk_tmp_dir', default='./gatk_tmp',
                 help='temporary directory for genomicsDBImport')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=aggGVCFs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    out_dir_path = Path(out_dir)
    if not in_dir_path.exists():
        sys.exit(f'input directory {in_dir_path} does not exist!')
    if not out_dir_path.exists():
        print(f'output directory {out_dir_path} does not exist, creating...')
        out_dir_path.mkdir()
    tmp_dir = Path(opts.gatk_tmp_dir)
    if not tmp_dir.exists():
        print('tmp directory does not exist, creating...')
        tmp_dir.mkdir()
    # The -Xmx value the tool is run with should be less than the total amount
    # of physical memory available by at least a few GB
    mem = int(opts.memory) // 1024 - 2
    # the environment variable TILEDB_DISABLE_FILE_LOCKING=1 must be set
    if 'TILEDB_DISABLE_FILE_LOCKING' not in os.environ:
        sys.exit('Set the environment variable TILEDB_DISABLE_FILE_LOCKING=1 before running gatk!')

    df = GenDataFrameFromPath(in_dir_path, pattern=opts.gvcf_fn_pattern)
    df['interval'] = df['fn'].apply(lambda x: x.split('.')[0].split('_')[1])
    prog = re.compile(opts.sm_re_pattern)
    df['sm'] = df['fn'].apply(lambda x: find_sm(x, prog))
    cmds = []
    for interval, grp in df.groupby('interval'):
        interval_dir = out_dir_path / (interval.replace(':', '_'))
        # The --genomicsdb-workspace-path must point to a non-existent or empty directory
        if interval_dir.exists():
            if len(list(interval_dir.glob('*'))) != 0:
                sys.exit(f'{interval_dir} is not an empty directory!')
        gvcf_map = str(interval) + '.map'
        print(f'{grp.shape[0]} gvcf files found for interval {interval}, generating the corresponding map file {gvcf_map}...')
        grp[['sm', 'fnpath']].to_csv(gvcf_map, header=None, index=False, sep='\t')
        cmd = f"gatk --java-options '-Xmx{mem}g -Xms{mem}g' GenomicsDBImport "\
              f"--sample-name-map {gvcf_map} --genomicsdb-workspace-path {interval_dir} "\
              f"--batch-size 50 --intervals {interval} "\
              f"--reader-threads {opts.ncpus_per_node} --tmp-dir {tmp_dir}"
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml gatk4/4.1'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
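# Illustrative sketch (hypothetical file name) of how aggGVCFs pulls the
# interval and sample name out of a gvcf file name. File names are assumed to
# follow the genGVCFs convention '<sample>_<interval>.g.vcf'; find_sm() is the
# project's own helper and is approximated here with a plain match on the
# default regex.
def _gvcf_name_sketch(fn='P001-WA01_Chr01.g.vcf'):
    interval = fn.split('.')[0].split('_')[1]                    # 'Chr01'
    prog = re.compile(r"^P[0-9]{3}[_-]W[A-Z][0-9]{2}[^a-z0-9]")
    m = prog.match(fn)
    sm = m.group(0)[:-1] if m else None                          # 'P001-WA01'
    return interval, sm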
def BatchCropObject(args):
    '''
    %prog in_dir out_dir

    apply BatchCropObject on a large number of images
    '''
    p = OptionParser(BatchCropObject.__doc__)
    p.add_option('--pattern', default='*.jpg',
                 help="file pattern of jpg files under the 'in_dir'")
    p.add_option('--pad', type='int', default=5,
                 help='specify the pad size')
    p.add_option('--date_cutoff',
                 help='date (yyyy-mm-dd_hh-mm) separating two zoom levels')
    p.add_option('--frame_zoom1', help='frame coordinates under zoom1')
    p.add_option('--frame_zoom2', help='frame coordinates under zoom2')
    p.add_option('--ncpu', default=1, type='int',
                 help='CPU cores if using multiprocessing on your own desktop (requires python>3.8)')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchCropObject.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = list(in_dir_path.glob(opts.pattern))

    # running on desktop
    if opts.ncpu > 1:
        ncpu = min(multiprocessing.cpu_count(), opts.ncpu)
        print('available CPUs: %s' % ncpu)
        if ncpu > 1 and len(pngs) >= ncpu:
            img_fns, boundrys = [], []
            for img_fn in pngs:
                img_fns.append(str(img_fn))
                if opts.date_cutoff and opts.frame_zoom1 and opts.frame_zoom2:
                    date_cutoff = datetime.strptime(opts.date_cutoff, '%Y-%m-%d_%H-%M')
                    ymd = img_fn.name.split('_')[1]
                    hm = '-'.join(img_fn.name.split('_')[2].split('-')[0:-1])
                    image_date = datetime.strptime('%s_%s' % (ymd, hm), '%Y-%m-%d_%H-%M')
                    if image_date <= date_cutoff:
                        boundrys.append(opts.frame_zoom1)
                    else:
                        boundrys.append(opts.frame_zoom2)
                else:
                    boundrys.append(None)
            print(len(img_fns), len(boundrys))
            pool_args = zip(img_fns, repeat(out_dir), boundrys, repeat(opts.pad))
            with Pool(processes=ncpu) as pool:
                results = pool.starmap(_CropObject, pool_args)
            sys.exit('parallel finish!')
        else:
            sys.exit('not enough files for parallel computing!')

    # running on HCC
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\\ ')  # escape spaces for the shell
        if opts.date_cutoff and opts.frame_zoom1 and opts.frame_zoom2:
            date_cutoff = datetime.strptime(opts.date_cutoff, '%Y-%m-%d_%H-%M')
            ymd = Path(img_fn).name.split('_')[1]
            hm = '-'.join(Path(img_fn).name.split('_')[2].split('-')[0:-1])
            image_date = datetime.strptime('%s_%s' % (ymd, hm), '%Y-%m-%d_%H-%M')
            if image_date <= date_cutoff:
                cmd = 'python -m schnablelab.ImageProcessing.base CropObject '\
                      f'{img_fn} --out_dir {out_dir} --pad {opts.pad} --boundry {opts.frame_zoom1}'
            else:
                cmd = 'python -m schnablelab.ImageProcessing.base CropObject '\
                      f'{img_fn} --out_dir {out_dir} --pad {opts.pad} --boundry {opts.frame_zoom2}'
        else:
            cmd = 'python -m schnablelab.ImageProcessing.base CropObject '\
                  f'{img_fn} --out_dir {out_dir} --pad {opts.pad}'
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.Series(cmds).to_csv(cmd_sh, index=False, header=False)
    print('check %s for all the commands!' % cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
def genGVCFs(args):
    """
    %prog genGVCFs ref.fa bams.csv region.txt out_dir

    run GATK HaplotypeCaller in GVCF mode.
    one g.vcf file per sample; a sample may contain multiple replicates

    args:
        ref.fa: reference sequence file
        bams.csv: csv file containing all bam files and their sample names
        region.txt: genomic intervals defined by each row to speed up GVCF calling.
            example regions: Chr01, Chr01:1-100
        out_dir: where the gVCF files are saved
    """
    p = OptionParser(genGVCFs.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genGVCFs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, bams_csv, region_txt, out_dir, = args
    out_dir_path = Path(out_dir)
    if not out_dir_path.exists():
        print(f'output directory {out_dir_path} does not exist, creating...')
        out_dir_path.mkdir()
    regions = []
    with open(region_txt) as f:
        for i in f:
            regions.append(i.rstrip())
    mem = int(opts.memory) // 1024
    df_bam = pd.read_csv(bams_csv)
    # check if bai files exist
    for bam in df_bam['fnpath']:
        if not Path(bam + '.bai').exists():
            print(f'no index file for {bam}...')
            sys.exit('Index your bam files first!')
    cmds = []
    for sm, grp in df_bam.groupby('sm'):
        print(f'{grp.shape[0]} bam files for sample {sm}')
        input_bam = '-I ' + ' -I '.join(grp['fnpath'].tolist())
        for region in regions:
            output_fn = f'{sm}_{region}.g.vcf'
            cmd = f"gatk --java-options '-Xmx{mem}g' HaplotypeCaller -R {ref} "\
                  f"{input_bam} -O {out_dir_path/output_fn} --sample-name {sm} "\
                  f"--emit-ref-confidence GVCF -L {region}"
            cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    cmd_header = 'ml gatk4/4.1'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
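# Illustrative sketch of the two small text inputs genGVCFs expects (paths are
# hypothetical): bams.csv must provide at least the 'fnpath' and 'sm' columns
# read above, and region.txt lists one genomic interval per line, as in the
# docstring examples.
BAMS_CSV_EXAMPLE = """fnpath,sm
/work/bams/P001-WA01.pe.sorted.mdup.bam,P001-WA01
/work/bams/P001-WA02.pe.sorted.mdup.bam,P001-WA02
"""
REGION_TXT_EXAMPLE = """Chr01
Chr01:1-100
"""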
def regression(args):
    """
    %prog regression train_csv train_dir model_name_prefix

    Args:
        train_csv: csv file (comma separated without header) containing all training image filenames
        train_dir: directory where training images reside
        model_name_prefix: the prefix of the output model name
    """
    p = OptionParser(regression.__doc__)
    p.add_option('--valid_csv', help='csv file for validation if available')
    p.add_option('--valid_dir', help='directory where validation images reside')
    p.add_option('--inputsize', default=224, type='int',
                 help='the input size of image. At least 224 if using pretrained models')
    p.add_option('--batchsize', default=60, type='int', help='batch size')
    p.add_option('--epoch', default=500, type='int', help='number of total epochs')
    p.add_option('--patience', default=50, type='int', help='patience in early stopping')
    p.add_option('--base_mn', default='resnet18',
                 help='base model architectures: vgg16, googlenet, resnet18, resnet152...')
    p.add_option('--tl_type', default='finetuning',
                 choices=('feature_extractor', 'finetuning'),
                 help='transfer learning type. finetuning: initialize the network with a pretrained '
                      'network, like one trained on the imagenet 1000 dataset, then train as usual. '
                      'feature_extractor: freeze the weights of the whole network except the final fully connected layer.')
    p.add_option('--pretrained_mn',
                 help='specify your own pretrained model as feature extractor')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='run directly in the console without generating a slurm job. Do not do this on an HCC login node')
    p.add_slurm_opts(job_prefix=regression.__name__)
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())
    train_csv, train_dir, model_name_prefix = args

    # generate slurm file
    if not opts.disable_slurm:
        cmd = "python -m schnablelab.CNN.TransLearning regression "\
              f"{train_csv} {train_dir} {model_name_prefix} "\
              f"--inputsize {opts.inputsize} --base_mn {opts.base_mn} --disable_slurm "
        if opts.pretrained_mn:
            cmd += f"--pretrained_mn {opts.pretrained_mn} "
        if opts.valid_csv and opts.valid_dir:
            cmd += f"--valid_csv {opts.valid_csv} --valid_dir {opts.valid_dir} "
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    logfile = model_name_prefix + '.log'
    histfile = model_name_prefix + '.hist.csv'
    logger = logging.getLogger(__name__)
    f_handler = logging.FileHandler(logfile, mode='w')
    f_handler.setLevel(logging.DEBUG)
    f_format = logging.Formatter('%(asctime)s:%(name)s:%(funcName)s:%(levelname)s:%(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)
    # another handler could be created for streaming, e.g.
    # c_handler = logging.StreamHandler()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # create a LogRecord and send this info to all the handlers in the logger
    logger.debug('device: %s' % device)
    logger.debug('pytorch version: %s' % torch.__version__)
    logger.debug('cuda version: %s' % torch.version.cuda)

    # prepare training and validation data
    train_dataset = LeafcountingDataset(
        train_csv, train_dir,
        image_transforms(input_size=opts.inputsize)['train'])
    train_loader = DataLoader(train_dataset, batch_size=opts.batchsize)
    dataloaders_dict = {'train': train_loader}
    if opts.valid_csv and opts.valid_dir:
        valid_dataset = LeafcountingDataset(
            opts.valid_csv, opts.valid_dir,
            image_transforms(input_size=opts.inputsize)['valid'])
        valid_loader = DataLoader(valid_dataset, batch_size=opts.batchsize)
        dataloaders_dict['valid'] = valid_loader

    # initialize the pre-trained model
    feature_extract = True if opts.tl_type == 'feature_extractor' else False
    logger.debug('feature extract: %s' % feature_extract)
    if opts.pretrained_mn:
        model, input_size = initialize_model(
            model_name=opts.base_mn,
            feature_extract=True,  # freeze all layers except the fully connected layer
            use_pretrained=False,
            inputsize=opts.inputsize)
        model.load_state_dict(torch.load(opts.pretrained_mn, map_location=device))
    else:
        model, input_size = initialize_model(model_name=opts.base_mn,
                                             feature_extract=feature_extract,
                                             inputsize=opts.inputsize)
    logger.debug(model)

    # trainable parameters
    params_to_update = [param for param in model.parameters() if param.requires_grad]
    sgd_optimizer = optim.SGD(params_to_update, lr=0.001, momentum=0.9)  # optimizer
    criterion = nn.MSELoss()  # loss

    # train and validation
    inception = True if opts.base_mn == 'inception' else False
    since = time.time()
    model_ft, train_hist, valid_hist = train_model_regression(
        model, dataloaders_dict, criterion, sgd_optimizer, model_name_prefix,
        patience=opts.patience, num_epochs=opts.epoch, is_inception=inception)
    time_elapsed = time.time() - since
    logger.debug('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # save training and validation loss
    logger.debug('saving loss history...')
    if opts.valid_csv and opts.valid_dir:
        df = pd.DataFrame(dict(zip(['training', 'validation'], [train_hist, valid_hist])))
    else:
        df = pd.DataFrame(dict(zip(['training'], [train_hist])))
    df.to_csv(histfile, index=False)

    # plot training and validation loss
    logger.debug('plot loss history...')
    import matplotlib.pyplot as plt
    from matplotlib import rcParams
    plt.style.use('bmh')
    rcParams['xtick.direction'] = 'out'
    rcParams['ytick.direction'] = 'out'
    fig, ax = plt.subplots(figsize=(4, 3))
    ax = df.plot(ax=ax)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    plt.tight_layout()
    plt.savefig('%s.loss.png' % model_name_prefix, dpi=200)
def prediction(args):
    """
    %prog prediction saved_model test_csv test_dir output

    Args:
        saved_model: saved model with either a .pt or .pth file extension
        test_csv: csv file (comma separated without header) containing all testing image filenames
        test_dir: directory where testing images are located
        output: csv file saving prediction results
    """
    p = OptionParser(prediction.__doc__)
    p.add_option('--batchsize', default=36, type='int', help='batch size')
    p.add_option('--pretrained_mn', default=None,
                 help='specify the pretrained model name if a pretrained model was used')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='run directly without generating slurm job')
    p.add_slurm_opts(job_prefix=prediction.__name__)
    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    saved_model, test_csv, test_dir, output = args

    # generate slurm file
    if not opts.disable_slurm:
        cmd_header = 'ml singularity'
        cmd = "singularity exec docker://unlhcc/pytorch:1.5.0 "\
              "python3 -m schnablelab.CNN.TransLearning prediction "\
              f"{saved_model} {test_csv} {test_dir} {output} "\
              f"--batchsize {opts.batchsize} --disable_slurm "
        if opts.pretrained_mn:
            cmd += f"--pretrained_mn {opts.pretrained_mn}"
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    if opts.pretrained_mn:
        model, input_size = initialize_model(model_name=opts.pretrained_mn)
        # turn all gradients off
        for param in model.parameters():
            param.requires_grad = False
    else:
        sys.exit('not implemented yet...')
    model.load_state_dict(torch.load(saved_model, map_location=device))
    model.eval()

    test_dataset = LeafcountingDataset(test_csv, test_dir, image_transforms['valid'])
    test_loader = DataLoader(test_dataset, batch_size=opts.batchsize)

    ground_truths, predicts, filenames = [], [], []
    for phase, (inputs, labels, fns) in enumerate(test_loader, 1):  # fns is a tuple
        print('phase %s' % phase)
        inputs = inputs.to(device)
        outputs = model(inputs)
        ground_truths.append(labels.squeeze().numpy())
        filenames.append(np.array(fns))
        if torch.cuda.is_available():
            predicts.append(outputs.squeeze().to('cpu').numpy())
        else:
            predicts.append(outputs.squeeze().numpy())
    ground_truths = np.concatenate(ground_truths)
    predicts = np.concatenate(predicts)
    filenames = np.concatenate(filenames)
    df = pd.DataFrame(dict(zip(['fn', 'groundtruth', 'prediction'],
                               [filenames, ground_truths, predicts])))
    df.to_csv(output, index=False)
def regression(args):
    """
    %prog regression train_csv train_dir valid_csv valid_dir model_name_prefix

    Args:
        train_csv: csv file (comma separated without header) containing all training image filenames
        train_dir: directory where training images are located
        valid_csv: csv file (comma separated without header) containing all validation image filenames
        valid_dir: directory where validation images are located
        model_name_prefix: the prefix of the output model name
    """
    p = OptionParser(regression.__doc__)
    p.add_option('--batchsize', default=36, type='int', help='batch size')
    p.add_option('--epoch', default=200, type='int', help='number of total epochs')
    p.add_option('--patience', default=20, type='int', help='patience in early stopping')
    p.add_option('--pretrained_mn', default='vgg16',
                 help='pretrained model name. Available pretrained models: vgg16, googlenet, resnet18, resnet152...')
    p.add_option('--tl_type', default='feature_extract',
                 choices=('feature_extract', 'finetuning'),
                 help='transfer learning type')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='run directly without generating slurm job')
    p.add_slurm_opts(job_prefix=regression.__name__)
    opts, args = p.parse_args(args)
    if len(args) != 5:
        sys.exit(not p.print_help())
    train_csv, train_dir, valid_csv, valid_dir, model_name_prefix = args

    # generate slurm file
    if not opts.disable_slurm:
        cmd_header = 'ml singularity'
        cmd = "singularity exec docker://unlhcc/pytorch:1.5.0 "\
              "python3 -m schnablelab.CNN.TransLearning regression "\
              f"{train_csv} {train_dir} {valid_csv} {valid_dir} {model_name_prefix} "\
              f"--batchsize {opts.batchsize} --pretrained_mn {opts.pretrained_mn} --disable_slurm"
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    logfile = model_name_prefix + '.log'
    histfile = model_name_prefix + '.hist.csv'
    logging.basicConfig(filename=logfile, level=logging.DEBUG,
                        format="%(asctime)s:%(levelname)s:%(message)s")
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.debug('device: %s' % device)
    logging.debug('pytorch version: %s' % torch.__version__)
    logging.debug('cuda version: %s' % torch.version.cuda)

    # prepare training and validation data
    train_dataset = LeafcountingDataset(train_csv, train_dir, image_transforms['train'])
    valid_dataset = LeafcountingDataset(valid_csv, valid_dir, image_transforms['valid'])
    train_loader = DataLoader(train_dataset, batch_size=opts.batchsize)
    valid_loader = DataLoader(valid_dataset, batch_size=opts.batchsize)
    dataloaders_dict = {'train': train_loader, 'valid': valid_loader}

    # initialize the pre-trained model
    model, input_size = initialize_model(model_name=opts.pretrained_mn)
    logging.debug(model)
    feature_extract = True if opts.tl_type == 'feature_extract' else False
    params_to_update = model.parameters()
    # logging.debug("Params to learn:")
    if feature_extract:
        # only the parameters that still require gradients are trainable
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad == True:
                params_to_update.append(param)
                # logging.debug("\t%s" % name)
    else:
        for name, param in model.named_parameters():
            if param.requires_grad == True:
                pass
                # logging.debug("\t%s" % name)

    # optimizer
    sgd_optimizer = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
    # loss
    criterion = nn.MSELoss()

    # train and validation
    inception = True if opts.pretrained_mn == 'inception' else False
    since = time.time()
    model_ft, train_hist, valid_hist = train_model_regression(
        model, dataloaders_dict, criterion, sgd_optimizer, model_name_prefix,
        patience=opts.patience, num_epochs=opts.epoch, is_inception=inception)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))

    # save training and validation loss
    logging.debug('saving loss history...')
    df = pd.DataFrame(dict(zip(['training', 'validation'], [train_hist, valid_hist])))
    df.to_csv(histfile, index=False)

    # plot training and validation loss
    logging.debug('plot loss history...')
    fig, ax = plt.subplots(figsize=(4, 3))
    ax = df.plot(ax=ax)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    plt.tight_layout()
    plt.savefig('%s.loss.png' % model_name_prefix, dpi=200)