Example #1
def BatchFilterMAF(args):
    """
    %prog in_dir

    apply FilterMAF on multiple vcf files
    """
    p = OptionParser(BatchFilterMAF.__doc__)
    p.add_option('--pattern', default='*.vcf',
                 help="file pattern of vcf files in the 'dir_in'")
    p.add_option('--maf_cutoff', default='0.01',
                 help='maf cutoff, SNPs lower than this cutoff will be removed')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchFilterMAF.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, = args
    in_dir_path = Path(in_dir)
    vcfs = in_dir_path.glob(opts.pattern)
    cmds = []
    for vcf in vcfs:
        cmd = "python -m schnablelab.SNPcalling.base FilterMAF %s --maf_cutoff %s"%(vcf, opts.maf_cutoff)
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh'%(opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print('check %s for all the commands!'%cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
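Throughout these examples, OptionParser, p.add_slurm_opts, and put2slurm are helpers from the schnablelab package itself and are not shown here. Purely as orientation (a hypothetical sketch, not the package's actual implementation), a minimal command-to-slurm submitter could look like the following; all names and sbatch directives are illustrative.

import subprocess

def submit_cmds_to_slurm(cmds, job_prefix='job', time='10:00:00',
                         memory=8000, ncpus=1, partition='batch'):
    """Hypothetical sketch: write one slurm script per command and sbatch it."""
    for i, cmd in enumerate(cmds):
        script = f'{job_prefix}_{i}.slurm'
        with open(script, 'w') as f:
            f.write('#!/bin/bash\n')
            f.write(f'#SBATCH --job-name={job_prefix}_{i}\n')
            f.write(f'#SBATCH --time={time}\n')
            f.write(f'#SBATCH --mem={memory}\n')            # MB
            f.write(f'#SBATCH --cpus-per-task={ncpus}\n')
            f.write(f'#SBATCH --partition={partition}\n')
            f.write(cmd + '\n')
        subprocess.run(['sbatch', script], check=True)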
Example #2
def genPCA(args):
    """
    %prog genPCA input_hmp N

    Generate first N PCs using tassel
    """
    p = OptionParser(genPCA.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genPCA.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmpfile, N, = args
    out_prefix = Path(hmpfile).name.replace('.hmp', '')
    cmd_header = 'ml java/1.8\nml tassel/5.2'
    cmd = 'run_pipeline.pl -Xms28g -Xmx29g -fork1 -h %s -PrincipalComponentsPlugin -ncomponents %s -covariance true -endPlugin -export %s_%sPCA -runfork1\n' % (
        hmpfile, N, out_prefix, N)
    print('cmd:\n%s\n%s' % (cmd_header, cmd))

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['memory'] = 30000
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
Example #3
def IndePvalue(args):
    """
    %prog IndePvalue bed_prefix output_fn

    Estimate the number of independent SNPs using GEC
    """
    p = OptionParser(IndePvalue.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=IndePvalue.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    bed_prefix, output_fn = args
    cmd = 'java -Xmx18g -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (
        GEC, bed_prefix, output_fn)
    print('cmd:\n%s\n' % cmd)

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['memory'] = 20000
        put2slurm([cmd], put2slurm_dict)
Example #4
def DownsamplingSNPs(args):
    """
    %prog downsampling input_hmp

    Pick a subset of SNPs from a huge hmp file using the Linux sed command
    """
    p = OptionParser(DownsamplingSNPs.__doc__)
    p.add_option('--downscale', default=10, help='specify the downscale level')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=DownsamplingSNPs.__name__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp',
                                            '_ds%s.hmp' % opts.downscale)
    cmd = "sed -n '1~%sp' %s > %s" % (opts.downscale, inputhmp, outputhmp)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
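The sed address '1~Np' (GNU sed) prints line 1 and then every N-th line after it, so the header row of the hmp file is always kept. For systems without GNU sed, a pure-Python equivalent might look like this hypothetical helper (not part of the module):

def downsample_hmp(in_fn, out_fn, step=10):
    """Keep line 1 (the hmp header) plus every step-th line after it."""
    with open(in_fn) as fin, open(out_fn, 'w') as fout:
        for i, line in enumerate(fin):
            # 0-based indices 0, step, 2*step, ... match sed -n '1~{step}p'
            if i % step == 0:
                fout.write(line)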
Example #5
def ped2bed(args):
    """
    %prog ped_prefix

    Convert plink ped/map to binary bed/bim/fam format using Plink
    """
    p = OptionParser(ped2bed.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=ped2bed.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ped_prefix, = args
    cmd_header = 'ml plink'
    cmd = 'plink --noweb --file %s --make-bed --out %s' % (ped_prefix,
                                                           ped_prefix)
    print('cmd on HCC:\n%s\n%s' % (cmd_header, cmd))

    cmd_local = '%s --noweb --file %s --make-bed --out %s' % (
        plink, ped_prefix, ped_prefix)
    print('cmd on local desktop:\n%s\n' % cmd_local)

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
Example #6
def Batch2JPG(args):
    '''
    %prog Batch2JPG in_dir out_dir

    apply toJPG on a large number of images
    '''
    p = OptionParser(Batch2JPG.__doc__)
    p.add_option('--pattern',
                 default='*.png',
                 help="file pattern of png files under the 'dir_in'")
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=Batch2JPG.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = in_dir_path.glob(opts.pattern)
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\ ')
        cmd = "python -m schnablelab.ImageProcessing.base toJPG "\
        f"{img_fn} --out_dir {out_dir}"
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print('check %s for all the commands!' % cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
Example #7
def calculateLD(args):
    """
    %prog vcf_fn/plink_prefix genome_size(Mb) num_SNPs

    calculate LD using Plink
    args:
        vcf_fn/plink_prefix: specify either vcf/vcf.gz file or the prefix of plink bed/bim/fam files. 
        genome_size(Mb): the size of the reference genome in Mb. For reference: sorghum 684Mb
        num_SNPs: the number of SNPs in the genotype file.
    """
    p = OptionParser(calculateLD.__doc__)
    p.add_option('--maf_cutoff', default='0.01',
                 help='only use SNP with the MAF higher than this cutoff to calculate LD')
    p.add_option('--max_distance', type='int', default=1000000,
                 help='the maximum distance (bp) between a pair of SNPs when calculating LD')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=calculateLD.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_fn, g_size, n_snps, = args
    in_fn, g_size, n_snps = Path(in_fn), int(g_size)*1000000, int(n_snps)

    if in_fn.name.endswith('.vcf') or in_fn.name.endswith('.vcf.gz'):
        input = f'--vcf {in_fn}'
    else:
        input = f'--bfile {in_fn}'
    n = 10
    ld_window, ld_window_bp = [], [] 
    while True:
        ld_window.append(n)
        dist = g_size//n_snps*n
        ld_window_bp.append(dist)
        n *= 10
        if dist >= opts.max_distance:
            break
    
    out_fn = Path(in_fn).name.split('.')[0]
    cmds = []
    cmd = f'plink {input} --r2 --ld-window 10 --ld-window-kb {ld_window_bp[0]//1000} --ld-window-r2 0 --maf {opts.maf_cutoff} --out {out_fn}'
    cmds.append(cmd)
    for win_snp, win_bp in zip(ld_window[1:], ld_window_bp[1:]):
        prob = 10/win_snp
        cmd = f'plink {input} --thin {prob} --r2 --ld-window 10 --ld-window-kb {win_bp//1000} --ld-window-r2 0 --maf {opts.maf_cutoff} --out {out_fn}.thin{prob}'
        cmds.append(cmd)
        print(cmd)
    cmd_sh = '%s.cmds%s.sh'%(opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')

    cmd_header = 'ml plink'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
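The while loop above builds progressively larger SNP windows (10, 100, 1000, ... SNPs) and converts each to a physical distance using the average SNP spacing (genome size divided by SNP count), stopping once that distance reaches --max_distance (1 Mb by default); the thinned plink runs then keep roughly 10 SNPs per window. The snippet below simply reproduces that calculation for a sorghum-sized genome and a hypothetical 5 million SNPs:

# illustrative numbers only: sorghum ~684 Mb, 5 million SNPs assumed
g_size, n_snps = 684 * 1_000_000, 5_000_000
n, ld_window, ld_window_bp = 10, [], []
while True:
    ld_window.append(n)
    dist = g_size // n_snps * n     # average SNP spacing (bp) * window size (SNPs)
    ld_window_bp.append(dist)
    n *= 10
    if dist >= 1_000_000:           # default --max_distance
        break
print(ld_window)     # [10, 100, 1000, 10000]
print(ld_window_bp)  # [1360, 13600, 136000, 1360000]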
Example #8
def LinkedSNPs(args):
    """
    %prog input_SNPlist_file bed_prefix r2_cutoff output_prefix

    extract linked SNPs using plink.
    """
    p = OptionParser(LinkedSNPs.__doc__)
    p.add_option('--col_idx',
                 type='int',
                 default=0,
                 help='specify which column contains SNP ID (0-based)')
    p.add_option(
        '--header',
        default='yes',
        choices=('yes', 'no'),
        help="specify 'no' if there is no header in the input SNP list file")
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=LinkedSNPs.__name__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    SNPlist_fn, bedprefix, cutoff, output_prefix, = args
    if opts.header == 'yes':
        df = pd.read_csv(SNPlist_fn,
                         delim_whitespace=True,
                         usecols=[opts.col_idx])
    else:
        df = pd.read_csv(SNPlist_fn,
                         delim_whitespace=True,
                         usecols=[opts.col_idx],
                         header=None)
    pre = Path(SNPlist_fn).name.split('.')[0]
    df.to_csv('%s.SNPs_list.csv' % pre, index=False, header=None)

    cmd_local = '%s --bfile %s --r2 --ld-snp-list %s.SNPs_list.csv --ld-window-kb 5000 --ld-window 99999 --ld-window-r2 %s --noweb --out %s\n' % (
        plink, bedprefix, pre, cutoff, output_prefix)
    print('cmd on local:\n%s' % cmd_local)

    cmd_header = 'ml plink'
    cmd_hcc = 'plink --bfile %s --r2 --ld-snp-list %s.SNPs_list.csv --ld-window-kb 5000 --ld-window 99999 --ld-window-r2 %s --noweb --out %s\n' % (
        bedprefix, pre, cutoff, output_prefix)
    print('cmd on HCC:\n%s\n%s' % (cmd_header, cmd_hcc))

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd_hcc], put2slurm_dict)
Example #9
def align_pe(args):
    """
    %prog align_pe ref_index_base fq_fns.csv output_dir

    paired-end alignment using bwa.
    args:
        ref_index_base: the prefix of reference index files
        fq_fns.csv: the csv file including parsed fq files from the pre_fqs function.
        output_dir: where the generated bam files will be saved
    """
    p = OptionParser(align_pe.__doc__)
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=align_pe.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref_base, fq_csv, output_dir = args
    output_dir = Path(output_dir)
    if not output_dir.exists():
        sys.exit(f'output directory {output_dir} does not exist!')
    df = pd.read_csv(fq_csv)

    df_R1, df_R2 = df[::2], df[1::2]
    if df_R1.shape[0] != df_R2.shape[0]:
        sys.exit('numbers of R1 and R2 files are not consistent!')

    cmds = []
    for (_, r1), (_, r2) in zip(df_R1.iterrows(), df_R2.iterrows()):
        r1_fn, r2_fn, sm = Path(r1['fnpath']), Path(r2['fnpath']), r1['sm']
        r1_fn_arr, r2_fn_arr = np.array(list(r1_fn.name)), np.array(
            list(r2_fn.name))
        bools = (r1_fn_arr != r2_fn_arr)
        if bools.sum() != 1:
            print(r1_fn, r2_fn)
            sys.exit('check fq file names!')
        idx = np.argmax(bools)
        prefix = re.split('[-_]R', r1_fn.name[:idx])[0]
        RG = r"'@RG\tID:%s\tSM:%s'" % (sm, sm)
        bam_fn = f'{prefix}.pe.sorted.bam'
        cmd = f"bwa mem -t {opts.ncpus_per_node} -R {RG} {ref_base} {r1_fn} {r2_fn} | samtools sort -@{opts.ncpus_per_node} -o {output_dir/bam_fn} -"
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')

    cmd_header = 'ml bwa\nml samtools'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
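align_pe assumes the csv produced by pre_fqs lists R1 and R2 rows alternately and that the two file names of a pair differ in exactly one character (the read number in _R1/_R2); that position is used to recover the shared sample prefix. A standalone illustration of the check with made-up file names:

import re
import numpy as np
from pathlib import Path

# hypothetical pair of fastq names differing only in the read number
r1_fn, r2_fn = Path('sampleA_R1.fq.gz'), Path('sampleA_R2.fq.gz')
a1, a2 = np.array(list(r1_fn.name)), np.array(list(r2_fn.name))
bools = a1 != a2
assert bools.sum() == 1            # exactly one differing character
idx = np.argmax(bools)             # position of the '1' vs '2'
prefix = re.split('[-_]R', r1_fn.name[:idx])[0]
print(prefix)                      # 'sampleA'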
Example #10
def pre_ref(args):
    """
    %prog pre_ref ref.fa

    index the reference genome sequences using bwa, samtools, and picard tools
    """
    p = OptionParser(pre_ref.__doc__)
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=pre_ref.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref_fn, = args
    ref_fn, ref_dir = Path(ref_fn), Path(ref_fn).parent
    if not ref_fn.exists():
        sys.exit(f'reference file {ref_fn} does not exist!')
    ref_prefix = re.split('.fa|.fasta', ref_fn.name)[0]
    bwa_idx_exs = ('.amb', '.ann', '.bwt', '.pac', '.sa')
    bwa_bool = sum([(ref_dir / (ref_prefix + bie)).exists()
                    for bie in bwa_idx_exs])
    cmds = []
    if bwa_bool != 5:
        print('bwa index does not exist...')
        cmd = f'ml bwa\nbwa index -p {ref_dir/ref_prefix} {ref_fn}'
        cmds.append(cmd)

    if not (ref_dir / (ref_fn.name + '.fai')).exists():
        print('fai index does not exist...')
        cmd = f'ml samtools\nsamtools faidx {ref_fn}'
        cmds.append(cmd)

    dict_fn = ref_dir / (ref_prefix + '.dict')
    if not dict_fn.exists():
        print('dict index does not exist...')
        cmd = f'ml gatk4/4.1\ngatk CreateSequenceDictionary -R {ref_fn} -O {dict_fn}'
        cmds.append(cmd)

    if len(cmds) > 0:
        if not opts.disable_slurm:
            put2slurm_dict = vars(opts)
            put2slurm(cmds, put2slurm_dict)
        else:
            print('commands running on local:\n%s' % ('\n'.join(cmds)))
    else:
        print('All reference index files already exist!')
Example #11
def genoGVCFs(args):
    """
    %prog genoGVCFs ref.fa genomicDB_dir out_dir 

    create the raw VCFs from GenomicsDB datastores
    args:
        ref.fa: the reference sequence fasta file
        genomicDB_dir: the root directory of genomicDB workspace
        out_dir: where the vcf files will be saved
    """
    p = OptionParser(genoGVCFs.__doc__)
    p.add_option('--gatk_tmp_dir',
                 default='./gatk_tmp',
                 help='temporary directory to use')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genoGVCFs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, db_dir, out_dir, = args
    out_dir_path = Path(out_dir)
    if not out_dir_path.exists():
        print(f'output directory {out_dir_path} does not exist, creating...')
        out_dir_path.mkdir()
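    # opts.memory presumably comes in MB from the slurm options; convert to GB for -Xmx, leaving ~1 GB headroom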
    mem = int(opts.memory) // 1024 - 1

    cmds = []
    for db in Path(db_dir).glob('*'):
        if db.is_dir():
            region = db.name
            vcf_fn = f"{region}.vcf.gz"
            cmd = f"gatk --java-options '-Xmx{mem}g' GenotypeGVCFs "\
            f"-R {ref} -V gendb://{db} -O {out_dir_path/vcf_fn} --tmp-dir={opts.gatk_tmp_dir}"
            cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')

    cmd_header = 'ml gatk4/4.1'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
Example #12
def BatchResize(args):
    '''
    %prog BatchResize in_dir out_dir

    apply BatchResize on a large number of images
    '''
    p = OptionParser(BatchResize.__doc__)
    p.add_option('--pattern',
                 default='*.png',
                 help="file pattern of png files under the 'dir_in'")
    p.add_option('--output_dim',
                 default='1227,1028',
                 help='the dimension (width,height) after resizing')
    p.add_option('--to_jpg',
                 default=False,
                 action='store_true',
                 help='save the output image in jpg format')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchResize.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = in_dir_path.glob(opts.pattern)
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\ ')
        cmd = 'python -m schnablelab.ImageProcessing.base Resize '\
        f'{img_fn} --output_dim {opts.output_dim} --out_dir {out_dir}'
        if opts.to_jpg:
            cmd += ' --to_jpg'
        cmds.append(cmd)
    fn_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    with open(fn_sh, 'w') as f:
        for i in cmds:
            f.write(i + '\n')
    print('check %s for all the commands!' % fn_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
Example #13
def markdupBam(args):
    """
    %prog markdupBam input_dir output_dir

    mark potential PCR duplicates
    output bams will be indexed automatically
    args:
        input_dir: where the sorted bam files are located
        output_dir: where the output mark-duplicated bam files should be saved
    """
    p = OptionParser(markdupBam.__doc__)
    p.add_option('--bam_fn_pattern',
                 default='*.sorted.bam',
                 help='pattern of bam files')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=markdupBam.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    in_dir_path, out_dir_path = Path(in_dir), Path(out_dir)
    if not out_dir_path.exists():
        sys.exit(f'output directory {out_dir_path} does not exist!')
    bams = in_dir_path.glob(opts.bam_fn_pattern)
    cmds = []
    for bam in bams:
        mdup_bam = bam.name.replace('.bam', '.mdup.bam')
        cmd = f'samtools markdup {bam} {out_dir_path/mdup_bam}\nsamtools index {out_dir_path/mdup_bam}'
        cmds.append(cmd)

    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')

    cmd_header = 'ml samtools'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
Example #14
def genKinship(args):
    """
    %prog genKinship genotype.mean

    Calculate kinship matrix file using gemma
    """
    p = OptionParser(genKinship.__doc__)
    p.add_option(
        '--type',
        default='1',
        choices=('1', '2'),
        help=
        'specify the way to calculate the relatedness, 1: centered; 2: standardized'
    )
    p.add_option('--out_dir', default='.', help='specify the output dir')
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genKinship.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    geno_mean, = args
    # generate a fake bimbam phenotype based on genotype
    with open(geno_mean) as f:
        num_SMs = len(f.readline().split(',')[3:])
    mean_prefix = geno_mean.replace('.mean', '')
    tmp_pheno = '%s.tmp.pheno' % mean_prefix
    with open(tmp_pheno, 'w') as f1:
        for i in range(num_SMs):
            f1.write('sm%s\t%s\n' % (i, 20))

    # the location of gemma executable file
    cmd = '%s -g %s -p %s -gk %s -outdir %s -o gemma.centered.%s' \
        % (gemma, geno_mean, tmp_pheno, opts.type, opts.out_dir, Path(mean_prefix).name)
    print('The kinship command:\n%s' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
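gemma requires a phenotype file (-p) even for kinship calculation with -gk, which is why a dummy constant phenotype is written above. With three samples, the generated *.tmp.pheno would simply contain one tab-delimited line per sample (values mirror the loop above):

sm0	20
sm1	20
sm2	20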
Example #15
def BatchCombo(args):
    '''
    %prog fn_core_csv step_n in_dir out_dir

    distribute Combo on HCC
    args:
        fn_core_csv: csv file with the first column as fn_core
        step_n: the step in range(st, ed, step) to split all fn_cores
    '''
    p = OptionParser(BatchCombo.__doc__)
    p.add_option('--pattern',
                 default='_Vis_SV_%s.Crp.jpg',
                 help="The pattern of file suffix under the 'dir_in'")
    p.add_option('--resize',
                 default='150,150',
                 help='the resolution after resizing for each piece of image')
    p.add_option('--ncpu',
                 default=1,
                 type='int',
                 help='CPU cores if using multiprocessing')
    p.add_slurm_opts(job_prefix=BatchCombo.__name__)

    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    fn_core_csv, step_n, in_dir, out_dir, = args

    df = pd.read_csv(fn_core_csv, usecols=['core_fn'])
    cuts = deque(range(0, len(df), int(step_n)))
    cuts.popleft()

    cmds = []
    for idx, _df in enumerate(np.split(df, cuts), start=1):
        new_csv_fn = fn_core_csv + '_%s' % idx
        _df.to_csv(new_csv_fn, index=False)
        cmd = "python -m schnablelab.ImageProcessing.base Combo "\
            f"{new_csv_fn} {in_dir} {out_dir} --pattern {opts.pattern} --resize {opts.resize} --ncpu {opts.ncpu}"
        cmds.append(cmd)
    put2slurm_dict = vars(opts)
    put2slurm(cmds, put2slurm_dict)
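The deque of cut points slices the fn_core table into chunks of step_n rows; popping the leading 0 keeps np.split from producing an empty first chunk, and each chunk is then written to its own csv with an index suffix. The same logic, isolated on a small made-up frame:

from collections import deque
import numpy as np
import pandas as pd

df = pd.DataFrame({'core_fn': [f'img_{i}' for i in range(10)]})
cuts = deque(range(0, len(df), 4))   # deque([0, 4, 8])
cuts.popleft()                       # drop the leading 0 -> deque([4, 8])
chunks = np.split(df, cuts)          # rows 0-3, 4-7, 8-9
print([len(c) for c in chunks])      # [4, 4, 2]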
Example #16
def hmp2vcf(args):
    """
    %prog hmp2vcf input_hmp
    convert hmp to vcf format using tassel
    """
    p = OptionParser(hmp2vcf.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=hmp2vcf.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    hmpfile, = args
    cmd_header = 'ml tassel/5.2'
    cmd = 'run_pipeline.pl -Xms512m -Xmx10G -fork1 -h %s -export -exportType VCF\n' % (
        hmpfile)
    print('cmd:\n%s\n%s' % (cmd_header, cmd))
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
Example #17
def BatchUpload(args):
    '''
    %prog BatchUpload dir1 dir2... project_id subject_id

    upload multiple datasets
    '''
    p = OptionParser(BatchUpload.__doc__)
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchUpload.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    *img_dirs, p_id, s_id = args
    cmds = []
    for img_dir in img_dirs:
        cmd = f'python -m schnablelab.Zooniverse.Zookeeper upload {img_dir} {p_id} {img_dir} --subject_id {s_id}'
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh'%(opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
Example #18
def indexBam(args):
    """
    %prog indexBam dir1 ...

    index bam files using samtools index

    dir1: where bam files are located
        add more directories if bam files are located at different directories
    """
    p = OptionParser(indexBam.__doc__)
    p.add_option('--bam_fn_pattern',
                 default='*.mdup.bam',
                 help='file pattern of preprocessed bam files')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=indexBam.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    for bam_dir in args:
        bam_dir = Path(bam_dir)
        if not bam_dir.exists():
            sys.exit(f'{bam_dir} does not exist!')
        bams = bam_dir.glob(opts.bam_fn_pattern)
        cmds = [f'samtools index {bam}' for bam in bams]
        cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
        pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
        print(f'check {cmd_sh} for all the commands!')

        cmd_header = 'ml samtools'
        if not opts.disable_slurm:
            put2slurm_dict = vars(opts)
            put2slurm_dict['cmd_header'] = cmd_header
            put2slurm(cmds, put2slurm_dict)
Example #19
def prediction(args):
    """
    %prog prediction saved_model test_csv test_dir output
    Args:
        saved_model: saved model with either a .pt or .pth file extension
        test_csv: csv file (comma separated with header) containing all testing image filenames
        test_dir: directory where testing images are located
        output: csv file saving prediction results
    """
    p = OptionParser(prediction.__doc__)
    p.add_option(
        '--inputsize',
        default=224,
        type='int',
        help='the input size of image. At least 224 if using pretrained models'
    )
    p.add_option('--batchsize', default=36, type='int', help='batch size')
    p.add_option(
        '--base_mn',
        default='resnet18',
        help=
        'base model architectures: vgg16, googlenet, resnet18, resnet152...')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='run directly without generating slurm job')
    p.add_slurm_opts(job_prefix=prediction.__name__)

    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    saved_model, test_csv, test_dir, output = args

    # generate slurm file
    if not opts.disable_slurm:
        cmd = "python -m schnablelab.CNN.TransLearning prediction "\
            f"{saved_model} {test_csv} {test_dir} {output} "\
            f"--batchsize {opts.batchsize} --disable_slurm "
        if opts.base_mn:
            cmd += f"--base_mn {opts.base_mn} "
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print('device: %s' % device)

    if opts.base_mn:
        model, input_size = initialize_model(model_name=opts.base_mn,
                                             feature_extract=True,
                                             use_pretrained=False,
                                             inputsize=opts.inputsize)
        # turn all gradients off
        for param in model.parameters():
            param.requires_grad = False
    else:
        sys.exit('not implemented yet...')

    model.load_state_dict(torch.load(saved_model, map_location=device))
    model.eval()

    test_dataset = LeafcountingDataset(
        test_csv, test_dir,
        image_transforms(input_size=opts.inputsize)['valid'])
    test_loader = DataLoader(test_dataset, batch_size=opts.batchsize)

    ground_truths, predicts, filenames = [], [], []
    for idx, (inputs, labels, fns) in enumerate(test_loader,
                                                1):  # fns is a tuple
        print('idx %s' % idx)
        inputs = inputs.to(device)
        print('type of inputs: %s' % (type(inputs)))
        outputs = model(inputs)
        ground_truths.append(labels.squeeze().numpy())
        filenames.append(np.array(fns))
        if torch.cuda.is_available():
            predicts.append(outputs.squeeze().to('cpu').numpy())
        else:
            predicts.append(outputs.squeeze().numpy())
    ground_truths = np.concatenate(ground_truths)
    predicts = np.concatenate(predicts)
    filenames = np.concatenate(filenames)
    df = pd.DataFrame(
        dict(
            zip(['fn', 'groundtruth', 'prediction'],
                [filenames, ground_truths, predicts])))
    df.to_csv(output, index=False)
Example #20
def ExtractRGBs(args):
    '''
    %prog ExtractRGBs project_folder

    extract RGB images from project folder
    '''
    p = OptionParser(ExtractRGBs.__doc__)
    p.add_option(
        '--npy_idx',
        help='specify the numpy file including the indices for extraction')
    p.add_option(
        '--item_idx',
        default='1,2,3',
        help=
        'the index of sample name, date, and time in each image directory name'
    )
    p.add_option('--out_dir',
                 default='.',
                 help='specify the output image directory')
    p.add_option(
        '--samples',
        help=
        'extract particular samples. multiple samples separated by comma without space'
    )
    p.add_option(
        '--dates',
        help=
        'extract particular dates. multiple dates separated by comma without space.'
    )
    p.add_option('--angle',
                 default='108',
                 help='which viewing angle are you going to extract?')
    p.add_option(
        '--backup_angle',
        help=
        'specify an alternative viewing angle for RGB images if the above angle does not exist.'
    )
    p.add_option(
        '--copy_only',
        default=False,
        action='store_true',
        help='only do copy without resizing and converting image format')
    p.add_option('--disable_slurm',
                 default=False,
                 action='store_true',
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=ExtractRGBs.__name__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())
    project_folder, = args

    out_dir = Path(opts.out_dir)
    if not out_dir.exists():
        print("The output directory '%s' does not exist, creating.." % out_dir)
        out_dir.mkdir()

    cmd = f'python -m schnablelab.ImageProcessing.HTP ExtractRGBs {project_folder} --out_dir {out_dir} --disable_slurm '
    npy_idx = None
    if opts.npy_idx:
        npy_idx = np.load(opts.npy_idx)
        print(npy_idx)
        cmd += f'--npy_idx {opts.npy_idx} '
    if opts.samples:
        cmd += f'--samples {opts.samples} '
    if opts.dates:
        cmd += f'--dates {opts.dates} '
    if opts.angle:
        cmd += f'--angle {opts.angle} '
    if opts.copy_only:
        cmd += f'--copy_only '
    print(cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
        return

    opts.samples = opts.samples.split(',') if opts.samples else opts.samples
    opts.dates = opts.dates.split(',') if opts.dates else opts.dates

    sm_idx, date_idx, time_idx = [int(i) for i in opts.item_idx.split(',')]
    prj = ParseProject(project_folder, sm_idx, date_idx, time_idx)

    for sm, d, hms, path_img_fn in prj.RGB(folder_idx=npy_idx,
                                           samples=opts.samples,
                                           dates=opts.dates,
                                           angle=opts.angle,
                                           backup_angle=opts.backup_angle):
        angle_dir_name = path_img_fn.parts[-2]
        dest_fn = '%s_%s_%s_%s.jpg' % (sm, d, hms, angle_dir_name)
        dest = out_dir / dest_fn
        if dest.exists():
            print(f'{dest} already exists, omit!')
        else:
            if opts.copy_only:
                copyfile(path_img_fn, dest)
            else:
                Image.open(path_img_fn).convert('RGB').resize(
                    (1227, 1028)).save(dest)
Example #21
def aggGVCFs(args):
    """
    %prog aggGVCFs input_dir out_dir 

    aggregate GVCF files to a GenomicsDB datastore for each genomic interval
    args:
        input_dir: the directory containing all gvcf files
        out_dir: the output directory. a subdir will be created for each genomic interval
    """
    p = OptionParser(aggGVCFs.__doc__)
    p.add_option('--gvcf_fn_pattern',
                 default='*.g.vcf',
                 help='file extension of gvcf files')
    p.add_option(
        '--sm_re_pattern',
        default=r"^P[0-9]{3}[_-]W[A-Z][0-9]{2}[^a-z0-9]",
        help='the regular expression pattern to pull sample name from filename'
    )
    p.add_option('--gatk_tmp_dir',
                 default='./gatk_tmp',
                 help='temporary directory for genomicsDBImport')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=aggGVCFs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    out_dir_path = Path(out_dir)
    if not in_dir_path.exists():
        sys.exit(f'input directory {in_dir_path} does not exist!')
    if not out_dir_path.exists():
        print(f'output directory {out_dir_path} does not exist, creating...')
        out_dir_path.mkdir()
    tmp_dir = Path(opts.gatk_tmp_dir)
    if not tmp_dir.exists():
        print('tmp directory does not exist, creating...')
        tmp_dir.mkdir()

    # The -Xmx value the tool is run with should be less than the total amount of physical memory available by at least a few GB
    mem = int(opts.memory) // 1024 - 2

    # set the environment variable TILEDB_DISABLE_FILE_LOCKING=1
    try:
        os.environ['TILEDB_DISABLE_FILE_LOCKING']
    except KeyError:
        sys.exit(
            'Set the environment variable TILEDB_DISABLE_FILE_LOCKING=1 before running gatk!'
        )

    df = GenDataFrameFromPath(in_dir_path, pattern=opts.gvcf_fn_pattern)
    df['interval'] = df['fn'].apply(lambda x: x.split('.')[0].split('_')[1])
    prog = re.compile(opts.sm_re_pattern)
    df['sm'] = df['fn'].apply(lambda x: find_sm(x, prog))

    cmds = []
    for interval, grp in df.groupby('interval'):
        interval_dir = out_dir_path / (interval.replace(':', '_'))
        # The --genomicsdb-workspace-path must point to a non-existent or empty directory
        if interval_dir.exists():
            if len(list(interval_dir.glob('*'))) != 0:
                sys.exit(f'{interval_dir} is not an empty directory!')
        gvcf_map = str(interval) + '.map'
        print(
            f'{grp.shape[0]} gvcf files found for interval {interval}, generating the corresponding map file {gvcf_map}...'
        )
        grp[['sm', 'fnpath']].to_csv(gvcf_map,
                                     header=None,
                                     index=False,
                                     sep='\t')

        cmd = f"gatk --java-options '-Xmx{mem}g -Xms{mem}g' GenomicsDBImport "\
 f"--sample-name-map {gvcf_map} --genomicsdb-workspace-path {interval_dir} "\
 f"--batch-size 50 --intervals {interval} "\
        f"--reader-threads {opts.ncpus_per_node} --tmp-dir {tmp_dir}"
        cmds.append(cmd)

    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')

    cmd_header = 'ml gatk4/4.1'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
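GenomicsDBImport's --sample-name-map expects a tab-delimited, header-less file with one "sample_name<TAB>path_to_gvcf" line per sample, which is exactly what grp[['sm', 'fnpath']].to_csv(..., sep='\t') writes above. A hypothetical Chr01.map (sample names and paths are made up) might therefore look like:

P001_WA01	/work/gvcfs/P001_WA01_Chr01.g.vcf
P002_WB03	/work/gvcfs/P002_WB03_Chr01.g.vcf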
Example #22
def BatchCropObject(args):
    '''
    %prog in_dir out_dir

    apply BatchCropObject on a large number of images
    '''
    p = OptionParser(BatchCropObject.__doc__)
    p.add_option('--pattern',
                 default='*.jpg',
                 help="file pattern of image files under the 'dir_in'")
    p.add_option('--pad', type='int', default=5, help='specify the pad size')
    p.add_option('--date_cutoff',
                 help='date (yyyy-mm-dd_hh-mm) separating two zoom levels')
    p.add_option('--frame_zoom1', help='frame coordinates under zoom1')
    p.add_option('--frame_zoom2', help='frame coordinates under zoom2')
    p.add_option(
        '--ncpu',
        default=1,
        type='int',
        help=
        'CPU cores if using multiprocessing on own desktop (require python>3.8)'
    )
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=BatchCropObject.__name__)

    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = list(in_dir_path.glob(opts.pattern))

    ## running on desktop
    if opts.ncpu > 1:
        ncpu = min(multiprocessing.cpu_count(), opts.ncpu)
        print('available CPUs: %s' % ncpu)
        if ncpu > 1 and len(pngs) >= ncpu:
            img_fns, boundrys = [], []
            for img_fn in pngs:
                img_fns.append(str(img_fn))
                if opts.date_cutoff and opts.frame_zoom1 and opts.frame_zoom2:
                    date_cutoff = datetime.strptime(opts.date_cutoff,
                                                    '%Y-%m-%d_%H-%M')
                    ymd = img_fn.name.split('_')[1]
                    hm = '-'.join(img_fn.name.split('_')[2].split('-')[0:-1])
                    image_date = datetime.strptime('%s_%s' % (ymd, hm),
                                                   '%Y-%m-%d_%H-%M')
                    if image_date <= date_cutoff:
                        boundrys.append(opts.frame_zoom1)
                    else:
                        boundrys.append(opts.frame_zoom2)
                else:
                    boundrys.append(None)
            print(len(img_fns), len(boundrys))
            pool_args = zip(img_fns, repeat(out_dir), boundrys,
                            repeat(opts.pad))
            with Pool(processes=ncpu) as pool:
                results = pool.starmap(_CropObject, pool_args)
            sys.exit('parallel finish!')
        else:
            sys.exit('not enough files for parallel computing!')

    ## running on HCC
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\ ')
        if opts.date_cutoff and opts.frame_zoom1 and opts.frame_zoom2:
            date_cutoff = datetime.strptime(opts.date_cutoff, '%Y-%m-%d_%H-%M')
            ymd = Path(img_fn).name.split('_')[1]
            hm = '-'.join(Path(img_fn).name.split('_')[2].split('-')[0:-1])
            image_date = datetime.strptime('%s_%s' % (ymd, hm),
                                           '%Y-%m-%d_%H-%M')
            if image_date <= date_cutoff:
                cmd = 'python -m schnablelab.ImageProcessing.base CropObject '\
                      f'{img_fn} --out_dir {out_dir} --pad {opts.pad} --boundry {opts.frame_zoom1}'
            else:
                cmd = 'python -m schnablelab.ImageProcessing.base CropObject '\
                      f'{img_fn} --out_dir {out_dir} --pad {opts.pad} --boundry {opts.frame_zoom2}'
        else:
            cmd = 'python -m schnablelab.ImageProcessing.base CropObject '\
                  f'{img_fn} --out_dir {out_dir} --pad {opts.pad}'
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.Series(cmds).to_csv(cmd_sh, index=False, header=False)
    print('check %s for all the commands!' % cmd_sh)

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
Example #23
def genGVCFs(args):
    """
    %prog genGVCFs ref.fa bams.csv region.txt out_dir

    run GATK HaplotypeCaller in GVCF mode.
    one g.vcf file is generated per sample; a sample may comprise multiple replicate bam files
    args:
        ref.fa: reference sequence file
        bams.csv: csv file containing all bam files and their sample names
        region.txt: genomic intervals defined by each row to speed up GVCF calling. 
            example regions: Chr01, Chr01:1-100
        out_dir: where the gVCF files save to
    """
    p = OptionParser(genGVCFs.__doc__)
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genGVCFs.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, bams_csv, region_txt, out_dir, = args
    out_dir_path = Path(out_dir)
    if not out_dir_path.exists():
        print(f'output directory {out_dir_path} does not exist, creating...')
        out_dir_path.mkdir()

    regions = []
    with open(region_txt) as f:
        for i in f:
            regions.append(i.rstrip())

    mem = int(opts.memory) // 1024

    df_bam = pd.read_csv(bams_csv)

    # check if bai files exist
    for bam in df_bam['fnpath']:
        if not Path(bam + '.bai').exists():
            print(f'no index file for {bam}...')
            sys.exit('Index your bam files first!')

    cmds = []
    for sm, grp in df_bam.groupby('sm'):
        print(f'{grp.shape[0]} bam files for sample {sm}')
        input_bam = '-I ' + ' -I '.join(grp['fnpath'].tolist())
        for region in regions:
            output_fn = f'{sm}_{region}.g.vcf'
            cmd = f"gatk --java-options '-Xmx{mem}g' HaplotypeCaller -R {ref} "\
  f"{input_bam} -O {out_dir_path/output_fn} --sample-name {sm} "\
  f"--emit-ref-confidence GVCF -L {region}"
            cmds.append(cmd)

    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print(f'check {cmd_sh} for all the commands!')

    cmd_header = 'ml gatk4/4.1'
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm(cmds, put2slurm_dict)
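Each line of region.txt becomes one -L interval and therefore one HaplotypeCaller command per sample, so the intervals control how finely the calling is parallelized. A hypothetical region.txt, following the formats mentioned in the docstring (whole chromosome or chromosome:start-end), could be:

Chr01
Chr02:1-50000000
Chr02:50000001-100000000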
Example #24
def regression(args):
    """
    %prog regression train_csv train_dir model_name_prefix
    Args:
        train_csv: csv file (comma separated without header) containing all training image filenames
        train_dir: directory where training images reside
        model_name_prefix: the prefix of the output model name 
    """
    p = OptionParser(regression.__doc__)
    p.add_option('--valid_csv', help='csv file for validation if available')
    p.add_option('--valid_dir',
                 help='directory where validation images reside')
    p.add_option(
        '--inputsize',
        default=224,
        type='int',
        help='the input size of image. At least 224 if using pretrained models'
    )
    p.add_option('--batchsize', default=60, type='int', help='batch size')
    p.add_option('--epoch',
                 default=500,
                 type='int',
                 help='number of total epochs')
    p.add_option('--patience',
                 default=50,
                 type='int',
                 help='patience in early stopping')
    p.add_option(
        '--base_mn',
        default='resnet18',
        help=
        'base model architectures: vgg16, googlenet, resnet18, resnet152...')
    p.add_option(
        '--tl_type',
        default='finetuning',
        choices=('feature_extractor', 'finetuning'),
        help=
        'transfer learning type. finetuning: initialize the network with a pretrained network, like the one that is trained on imagenet 1000 dataset. Rest of the training looks as usual. feature_extractor: freeze the weights for all of the network except that of the final fully connected layer. '
    )
    p.add_option('--pretrained_mn',
                 help='specify your own pretrained model as feature extractor')
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help=
        'run directly in the console without generating slurm job. Do not do this in HCC login node'
    )
    p.add_slurm_opts(job_prefix=regression.__name__)

    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())
    train_csv, train_dir, model_name_prefix = args
    # generate slurm file
    if not opts.disable_slurm:
        cmd = "python -m schnablelab.CNN.TransLearning regression "\
            f"{train_csv} {train_dir} {model_name_prefix} "\
            f"--inputsize {opts.inputsize} --base_mn {opts.base_mn} --disable_slurm "
        if opts.pretrained_mn:
            cmd += f"--pretrained_mn {opts.pretrained_mn} "
        if opts.valid_csv and opts.valid_dir:
            cmd += f"--valid_csv {opts.valid_csv} --valid_dir {opts.valid_dir} "
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    logfile = model_name_prefix + '.log'
    histfile = model_name_prefix + '.hist.csv'

    logger = logging.getLogger(__name__)
    f_handler = logging.FileHandler(logfile, mode='w')
    f_handler.setLevel(logging.DEBUG)
    f_format = logging.Formatter(
        '%(asctime)s:%(name)s:%(funcName)s:%(levelname)s:%(message)s')
    f_handler.setFormatter(f_format)
    logger.addHandler(f_handler)
    # can also create another handler for streaming. e.g. c_handler = logging.StreamHandler()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logger.debug(
        'device: %s' % device
    )  # creat a LogRecord and send this info to all the handlers in the logger
    logger.debug('pytorch version: %s' % torch.__version__)
    logger.debug('cuda version: %s' % torch.version.cuda)

    # prepare training and validation data
    train_dataset = LeafcountingDataset(
        train_csv, train_dir,
        image_transforms(input_size=opts.inputsize)['train'])
    train_loader = DataLoader(train_dataset, batch_size=opts.batchsize)
    dataloaders_dict = {'train': train_loader}

    if opts.valid_csv and opts.valid_dir:
        valid_dataset = LeafcountingDataset(
            opts.valid_csv, opts.valid_dir,
            image_transforms(input_size=opts.inputsize)['valid'])
        valid_loader = DataLoader(valid_dataset, batch_size=opts.batchsize)
        dataloaders_dict['valid'] = valid_loader

    # initialize the pre-trained model
    feature_extract = True if opts.tl_type == 'feature_extractor' else False
    logger.debug('feature extract: %s' % feature_extract)

    if opts.pretrained_mn:
        model, input_size = initialize_model(
            model_name=opts.base_mn,
            feature_extract=
            True,  # set param.requires_grad=True for all layers except the fully connected layer
            use_pretrained=False,
            inputsize=opts.inputsize)
        model.load_state_dict(
            torch.load(opts.pretrained_mn, map_location=device))
    else:
        model, input_size = initialize_model(model_name=opts.base_mn,
                                             feature_extract=feature_extract,
                                             inputsize=opts.inputsize)
    logger.debug(model)

    params_to_update = [
        param for param in model.parameters() if param.requires_grad
    ]  # trainable parameters
    sgd_optimizer = optim.SGD(params_to_update, lr=0.001,
                              momentum=0.9)  # optimizer
    criterion = nn.MSELoss()  # loss
    # train and validation
    inception = True if opts.base_mn == 'inception' else False
    since = time.time()
    model_ft, train_hist, valid_hist = train_model_regression(
        model,
        dataloaders_dict,
        criterion,
        sgd_optimizer,
        model_name_prefix,
        patience=opts.patience,
        num_epochs=opts.epoch,
        is_inception=inception)
    time_elapsed = time.time() - since
    logger.debug('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    # save training and validation loss.
    logger.debug('saving loss history...')
    if opts.valid_csv and opts.valid_dir:
        df = pd.DataFrame(
            dict(zip(['training', 'validation'], [train_hist, valid_hist])))
    else:
        df = pd.DataFrame(dict(zip(['training'], [train_hist])))
    df.to_csv(histfile, index=False)

    # plot training and validation loss
    logger.debug('plot loss history...')
    import matplotlib.pyplot as plt
    from matplotlib import rcParams
    plt.style.use('bmh')
    rcParams['xtick.direction'] = 'out'
    rcParams['ytick.direction'] = 'out'
    fig, ax = plt.subplots(figsize=(4, 3))
    ax = df.plot(ax=ax)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    plt.tight_layout()
    plt.savefig('%s.loss.png' % model_name_prefix, dpi=200)
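The two --tl_type modes differ only in which parameters remain trainable: finetuning updates the whole pretrained network, while feature_extractor freezes everything except the newly attached regression head. A minimal torchvision sketch of that freezing step, under the assumption that initialize_model does something broadly similar (this is not its actual implementation):

import torch.nn as nn
from torchvision import models

def build_resnet18_regressor(feature_extract=True, use_pretrained=True):
    model = models.resnet18(pretrained=use_pretrained)
    if feature_extract:
        # feature_extractor mode: freeze all pretrained weights
        for param in model.parameters():
            param.requires_grad = False
    # replace the classifier with a single-output regression head;
    # its parameters stay trainable in both modes
    model.fc = nn.Linear(model.fc.in_features, 1)
    return model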
Example #25
def prediction(args):
    """
    %prog prediction saved_model test_csv test_dir output
    Args:
        saved_model: saved model with either a .pt or .pth file extension
        test_csv: csv file (comma separated without header) containing all testing image filenames
        test_dir: directory where testing images are located
        output: csv file saving prediction results
    """
    p = OptionParser(prediction.__doc__)
    p.add_option('--batchsize', default=36, type='int', 
                    help='batch size')
    p.add_option('--pretrained_mn', default=None,
                    help='specify the pretrained model name if a pretrained model was used')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='run directly without generating slurm job')
    p.add_slurm_opts(job_prefix=prediction.__name__)

    opts, args = p.parse_args(args)
    if len(args) != 4:
        sys.exit(not p.print_help())
    saved_model, test_csv, test_dir, output = args

    # generate slurm file
    if not opts.disable_slurm:
        cmd_header = 'ml singularity'
        cmd = "singularity exec docker://unlhcc/pytorch:1.5.0 "\
            "python3 -m schnablelab.CNN.TransLearning prediction "\
            f"{saved_model} {test_csv} {test_dir} {output} "\
            f"--batchsize {opts.batchsize} --disable_slurm "
        if opts.pretrained_mn:
            cmd += f"--pretrained_mn {opts.pretrained_mn}"
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if opts.pretrained_mn:
        model, input_size = initialize_model(model_name=opts.pretrained_mn)
        # turn all gradients off
        for param in model.parameters():
            param.requires_grad = False
    else:
        sys.exit('not implemented yet...')

    model.load_state_dict(torch.load(saved_model, map_location=device))
    model.eval()

    test_dataset = LeafcountingDataset(test_csv, test_dir, image_transforms['valid'])
    test_loader = DataLoader(test_dataset, batch_size=opts.batchsize)

    ground_truths, predicts, filenames = [],[],[]
    for phase, (inputs, labels, fns) in enumerate(test_loader, 1): # fns is a tuple
        print('phase %s'%phase)
        inputs = inputs.to(device)
        outputs = model(inputs)
        ground_truths.append(labels.squeeze().numpy())
        filenames.append(np.array(fns))
        if torch.cuda.is_available():
            predicts.append(outputs.squeeze().to('cpu').numpy())
        else:
            predicts.append(outputs.squeeze().numpy())
    ground_truths = np.concatenate(ground_truths)
    predicts = np.concatenate(predicts)
    filenames = np.concatenate(filenames)
    df = pd.DataFrame(dict(zip(['fn', 'groundtruth', 'prediction'], [filenames, ground_truths, predicts])))
    df.to_csv(output, index=False)
Example #26
def regression(args):
    """
    %prog regression train_csv train_dir valid_csv valid_dir model_name_prefix
    Args:
        train_csv: csv file (comma separated without header) containing all training image filenames
        train_dir: directory where training images are located
        valid_csv: csv file (comma separated without header) containing all validation image filenames
        valid_dir: directory where validation images are located
        model_name_prefix: the prefix of the output model name 
    """
    p = OptionParser(regression.__doc__)
    p.add_option('--batchsize', default=36, type='int', 
                    help='batch size')
    p.add_option('--epoch', default=200, type='int', 
                    help='number of total epochs')
    p.add_option('--patience', default=20, type='int', 
                    help='patience in early stopping')
    p.add_option('--pretrained_mn', default='vgg16',
                    help='pretrained model name. Available pretrained models: vgg16, googlenet, resnet18, resnet152...')
    p.add_option('--tl_type', default='feature_extract', choices=('feature_extract', 'finetuning'),
                    help='transfer learning type')
    p.add_option('--disable_slurm', default=False, action="store_true",
                 help='run directly without generating slurm job')
    p.add_slurm_opts(job_prefix=regression.__name__)

    opts, args = p.parse_args(args)
    if len(args) != 5:
        sys.exit(not p.print_help())
    train_csv, train_dir, valid_csv, valid_dir, model_name_prefix = args
    # generate slurm file
    if not opts.disable_slurm:
        cmd_header = 'ml singularity'
        cmd = "singularity exec docker://unlhcc/pytorch:1.5.0 "\
            "python3 -m schnablelab.CNN.TransLearning regression "\
            f"{train_csv} {train_dir} {valid_csv} {valid_dir} {model_name_prefix} "\
            f"--batchsize {opts.batchsize} --pretrained_mn {opts.pretrained_mn} --disable_slurm"
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
        sys.exit()

    logfile = model_name_prefix + '.log'
    histfile = model_name_prefix + '.hist.csv'
    logging.basicConfig(filename=logfile, level=logging.DEBUG, format="%(asctime)s:%(levelname)s:%(message)s")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    logging.debug('device: %s'%device)
    logging.debug('pytorch version: %s'%torch.__version__)
    logging.debug('cuda version: %s'%torch.version.cuda)

    # prepare training and validation data
    train_dataset = LeafcountingDataset(train_csv, train_dir, image_transforms['train'])
    valid_dataset = LeafcountingDataset(valid_csv, valid_dir, image_transforms['valid'])
    train_loader = DataLoader(train_dataset, batch_size=opts.batchsize)
    valid_loader = DataLoader(valid_dataset, batch_size=opts.batchsize)
    dataloaders_dict = {'train': train_loader, 'valid': valid_loader}

    # initialize the pre-trained model
    model, input_size = initialize_model(model_name=opts.pretrained_mn)
    logging.debug(model)

    feature_extract = True if opts.tl_type == 'feature_extract' else False

    params_to_update = model.parameters()
    #logging.debug("Params to learn:")
    if feature_extract:
        params_to_update = []
        for name, param in model.named_parameters():
            if param.requires_grad == True:
                params_to_update.append(param)
                #logging.debug("\t%s"%name)
    else:
        for name, param in model.named_parameters():
            if param.requires_grad == True:
                pass
                #logging.debug("\t%s"%name)
    # optimizer
    sgd_optimizer = optim.SGD(params_to_update, lr=0.001, momentum=0.9)
    # loss
    criterion = nn.MSELoss()
    # train and validation
    inception = True if opts.pretrained_mn=='inception' else False
    since = time.time()
    model_ft, train_hist, valid_hist = train_model_regression(model, dataloaders_dict, 
                                                            criterion, sgd_optimizer,
                                                            model_name_prefix, 
                                                            patience=opts.patience, 
                                                            num_epochs=opts.epoch, 
                                                            is_inception=inception)
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    
    # save training and validation loss.
    logging.debug('saving loss history...')
    df = pd.DataFrame(dict(zip(['training', 'validation'], [train_hist, valid_hist])))
    df.to_csv(histfile, index=False)
    
    # plot training and validation loss
    logging.debug('plot loss history...')
    fig, ax = plt.subplots(figsize=(4, 3))
    ax = df.plot(ax=ax)
    ax.set_xlabel('Epoch', fontsize=12)
    ax.set_ylabel('Loss', fontsize=12)
    plt.tight_layout()
    plt.savefig('%s.loss.png'%model_name_prefix, dpi=200)