Example #1
def trim_paired(args):
    """
    %prog trim_paired in_dir out_dir
    quality control on paired-end reads
    """
    p = OptionParser(trim_paired.__doc__)
    p.add_option('--pattern_r1', default='*_R1.fastq',
                 help='filename pattern for forward reads')
    p.add_option('--pattern_r2', default='*_R2.fastq',
                 help='filename pattern for reverse reads')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)
    r1_fns = glob('%s/%s' % (in_dir, opts.pattern_r1))
    r2_fns = glob('%s/%s' % (in_dir, opts.pattern_r2))
    for r1_fn, r2_fn in zip(r1_fns, r2_fns):
        r1_path = Path(r1_fn)
        r2_path = Path(r2_fn)
        prf = '_'.join(r1_path.name.split('_')[:-1]) + '.PE'
        print(prf)
        r1_fn_out1 = r1_path.name.replace('R1.fastq', 'trim.R1.fastq')
        r1_fn_out2 = r1_path.name.replace('R1.fastq', 'unpaired.R1.fastq')
        r2_fn_out1 = r2_path.name.replace('R2.fastq', 'trim.R2.fastq')
        r2_fn_out2 = r2_path.name.replace('R2.fastq', 'unpaired.R2.fastq')
        cmd = ('java -jar $TM_HOME/trimmomatic.jar PE -phred33 %s %s %s %s %s %s '
               'TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40'
               % (r1_fn, r2_fn,
                  str(out_path / r1_fn_out1), str(out_path / r1_fn_out2),
                  str(out_path / r2_fn_out1), str(out_path / r2_fn_out2)))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % prf, 'w') as f:
            f.write(header)
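All of these job generators fill in a module-level Slurm_header template with five placeholders: walltime, memory, and a job name reused for the stdout and stderr file names. The template itself is not shown on this page; a minimal sketch of what it might look like, matching call sites such as Slurm_header % (10, 10000, prf, prf, prf):

# hypothetical sketch of the Slurm_header template assumed above;
# the exact SBATCH directives are not shown on this page
Slurm_header = '''#!/bin/sh
#SBATCH --time=%s:00:00
#SBATCH --mem-per-cpu=%s
#SBATCH --job-name=%s
#SBATCH --error=./%s.err
#SBATCH --output=./%s.out

'''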
Example #2
def PredictSlurmGPU(args):
    """
    %prog model_name npyPattern("CM*.npy") job_n
    generate prediction GPU jobs for all npy files
    """
    p = OptionParser(PredictSlurmGPU.__doc__)
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())
    mn, npy_pattern, jobn = args
    if opts.prefix == 'myjob':
        print('specify job name prefix!')
        sys.exit()

    npys = glob(npy_pattern)
    print(len(npys))
    grps = cutlist(npys, int(jobn))
    for gn, grp in grps:
        # turn the inclusive 'start-end' label into the end-exclusive range
        # that Predict's --range option slices with
        st, ed = gn.split('-')
        ed = int(ed) + 1
        gn = '%s-%s' % (st, ed)
        cmd = "python -m schnablelab.CNN.Predict Predict %s '%s' --range %s\n" % (
            mn, npy_pattern, gn)
        opt = '%s.%s' % (opts.prefix, gn)
        header = Slurm_gpu_header % (opts.time, opts.memory, opt, opt, opt)
        header += "ml anaconda\nsource activate MCY\n"
        header += cmd
        with open('%s.gpu.slurm' % opt, 'w') as f:
            f.write(header)
        print('%s.gpu.slurm prediction GPU job file generated!' % opt)
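This function (and PlantHullBatch in Example #10) depends on a cutlist helper that is not shown on this page. A minimal sketch of its likely behavior, assuming the 'start-end' labels are inclusive index ranges as the split-and-increment logic above suggests:

# hypothetical sketch of cutlist: split a list into n roughly equal groups,
# yielding an inclusive 'start-end' index label alongside each group
def cutlist(lst, n):
    size = max(1, -(-len(lst) // n))  # ceiling division, at least 1
    for start in range(0, len(lst), size):
        grp = lst[start:start + size]
        yield '%s-%s' % (start, start + len(grp) - 1), grp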
Example #3
def CallHeightBatch(args):
    """
    %prog imagePattern("CM*.polish.png")
    generate height call jobs for all polished image files
    """
    p = OptionParser(CallHeightBatch.__doc__)
    p.set_slurm_opts(array=False)
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    pattern, = args
    all_pngs = glob(pattern)
    for i in all_pngs:
        out_prefix = i.split('/')[-1].split('.polish.png')[0]
        jobname = out_prefix + '.Height'
        cmd = 'python -m schnablelab.CNN.CallHeight CallHeight %s %s\n' % (
            i, out_prefix)
        header = Slurm_header % (opts.time, opts.memory, jobname, jobname,
                                 jobname)
        header += "ml anaconda\nsource activate %s\n" % opts.env
        header += cmd
        with open('%s.CallHeight.slurm' % out_prefix, 'w') as f:
            f.write(header)
        print('%s.CallHeight.slurm call height job file generated!' % out_prefix)
Example #4
def trim_single(args):
    """
    %prog trim_single in_dir out_dir
    quality control on single-end reads
    """
    p = OptionParser(trim_single.__doc__)
    p.add_option('--pattern', default='*_Unpaired.fastq',
                 help='filename pattern for all single-end reads')
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)
    fns = glob('%s/%s' % (in_dir, opts.pattern))
    for fn in fns:
        fn_path = Path(fn)
        prf = '_'.join(fn_path.name.split('_')[:-1]) + '.SE'
        print(prf)
        fn_out = fn_path.name.replace('Unpaired.fastq', 'trim.Unpaired.fastq')
        cmd = ('java -jar $TM_HOME/trimmomatic.jar SE -phred33 %s %s '
               'TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40'
               % (fn, str(out_path / fn_out)))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % prf, 'w') as f:
            f.write(header)
Example #5
def Predict(args):
    """
    %prog model_name npy_pattern('CM*.npy')
    use your trained model to make predictions on selected npy files.
    Each npy file holds a NumPy array with the same number of columns as the training data.
    """
    from keras.models import load_model
    import scipy.misc as sm
    p = OptionParser(Predict.__doc__)
    p.add_option(
        '--range',
        default='all',
        help="specify the range of the testing images, hcc job range style")
    p.add_option('--opf',
                 default='infer',
                 help="specify the prefix of the output file names")
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    model, npy_pattern = args
    opf = model.split('/')[-1].split('.')[0] if opts.opf == 'infer' else opts.opf

    npys = glob(npy_pattern)
    if opts.range != 'all':
        start = int(opts.range.split('-')[0])
        end = int(opts.range.split('-')[1])
        npys = npys[start:end]
    print('%s npys will be predicted this time.' % len(npys))

    my_model = load_model(model)
    for npy in npys:
        print(npy)
        test_npy = np.load(npy)
        npy_shape = test_npy.shape
        test_npy_2d = test_npy.reshape(npy_shape[0] * npy_shape[1],
                                       npy_shape[2])
        print('testing data shape:', test_npy_2d.shape)
        pre_prob = my_model.predict(test_npy_2d)
        predictions = pre_prob.argmax(axis=1)  # this is a numpy array
        predictions = predictions.reshape(npy_shape[0], npy_shape[1])
        df = pd.DataFrame(predictions)
        # class -> color value per RGB channel
        # (0: background, 1: leaf, 2: stem, 3: panicle)
        df1 = df.replace(0, 255).replace(1, 127).replace(2, 253).replace(3, 190)  # R
        df2 = df.replace(0, 255).replace(1, 201).replace(2, 192).replace(3, 174)  # G
        df3 = df.replace(0, 255).replace(1, 127).replace(2, 134).replace(3, 212)  # B
        arr = np.stack([df1.values, df2.values, df3.values], axis=2)
        opt = npy.split('/')[-1].split('.npy')[0] + '.prd'
        sm.imsave('%s.%s.png' % (opf, opt), arr)  # scipy.misc.imsave was removed in SciPy 1.2
        print('Done!')
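The chained replace calls above are a per-channel color lookup; an equivalent and more compact sketch using a NumPy lookup table (same colors; palette is our name, and predictions is the (H, W) label array computed above):

# rows are classes (0: background, 1: leaf, 2: stem, 3: panicle), columns RGB
palette = np.array([[255, 255, 255],
                    [127, 201, 127],
                    [253, 192, 134],
                    [190, 174, 212]], dtype=np.uint8)
arr = palette[predictions]  # (H, W) integer labels -> (H, W, 3) RGB image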
Example #6
def Plot(args):
    """
    %prog dir
    plot the training process
    You can load the dict back using pickle.load(open('*.p', 'rb'))
    """

    p = OptionParser(Plot.__doc__)
    p.add_option(
        "--pattern",
        default="History_*.p",
        help="specify the pattern of your pickle object files, remember to "
             "add quotes [default: %default]")
    p.set_slurm_opts()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())
    mydir, = args
    pickles = glob('%s/%s' % (mydir, opts.pattern))
    print('total %s pickle objects.' % len(pickles))
    #print(pickles)
    for pkl in pickles:
        fs, es = opts.pattern.split('*')
        fn = pkl.split(fs)[-1].split(es)[0]
        myp = pickle.load(open(pkl, 'rb'))

        mpl.rcParams['figure.figsize'] = [7.5, 3.25]
        fig, axes = plt.subplots(nrows=1, ncols=2)

        # summarize history for accuracy
        ax1 = axes[0]
        ax1.plot(myp['acc'])
        ax1.plot(myp['val_acc'])
        ax1.set_title('model accuracy')
        ax1.set_ylabel('accuracy')
        ax1.set_xlabel('epoch')
        ax1.set_ylim(0, 1.01)
        ax1.legend(['train', 'validation'], loc='lower right')
        max_acc = max(myp['val_acc'])
        # summarize history for loss
        ax2 = axes[1]
        ax2.plot(myp['loss'])
        ax2.plot(myp['val_loss'])
        ax2.set_title('model loss')
        ax2.set_ylabel('loss')
        ax2.set_xlabel('epoch')
        ax2.legend(['train', 'validation'], loc='upper right')
        plt.tight_layout()
        plt.savefig('%s_%s.png' % (max_acc, fn))
        plt.clf()
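The pickles this function consumes are Keras History.history dicts, i.e. plain dicts mapping metric names to per-epoch lists. A minimal sketch of a compatible producer (the file name and values here are made up; a real training script would dump model.fit(...).history):

import pickle

history_dict = {'acc': [0.71, 0.83], 'val_acc': [0.68, 0.80],
                'loss': [0.91, 0.52], 'val_loss': [0.97, 0.61]}
with open('History_demo.p', 'wb') as f:
    pickle.dump(history_dict, f)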
Example #7
def submit(args):
    """
    %prog dir

    Submit some or all of the slurm jobs in the dir
    """
    p = OptionParser(submit.__doc__)
    p.add_option(
        "--pattern",
        default="*.slurm",
        help="specify the pattern of your slurm jobs, remember to add quotes "
             "[default: %default]")
    p.add_option(
        "--partition",
        default='jclarke',
        choices=('batch', 'jclarke', 'gpu', 'schnablelab'),
        help="choose the partition to submit to [default: %default]")
    p.add_option(
        "--range",
        default='all',
        help="which jobs to submit this time, e.g. '1-10', '11-20', or 'all'; "
             "1-based and inclusive")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    #partition = '' if opts.partition=='batch' else '-p %s'%opts.partition
    partition = '-p %s' % opts.partition
    alljobs = [
        'sbatch %s %s' % (partition, i)
        # the project's glob helper takes a (directory, pattern) pair
        for i in glob(folder, opts.pattern)
    ]
    print("Total %s jobs under '%s'" % (len(alljobs), folder))

    if opts.range == 'all':
        for i in alljobs:
            print(i)
            call(i, shell=True)
    else:
        start, end = (int(i) for i in opts.range.split('-'))
        if end <= len(alljobs):
            for i in alljobs[start - 1:end]:
                print(i)
                call(i, shell=True)
            print('%s of total %s were submitted. [%s to %s] this time.' \
                %(len(alljobs[start-1 : end]), len(alljobs), start, end))
        else:
            print('requested range exceeds the %s available jobs' % len(alljobs))
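A hypothetical invocation, using the options defined above: submit the first ten *.slurm files found under jobs_dir to the default jclarke partition:

submit(['jobs_dir', '--range', '1-10'])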
Example #8
def combineFQ(args):
    """
    %prog combineFQ pattern(with quotation) fn_out
    """
    p = OptionParser(combineFQ.__doc__)
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    fq_pattern, fn_out = args
    fns = glob(fq_pattern)
    cmd = 'cat %s > %s' % (' '.join(fns), fn_out)
    print(cmd)
    run(cmd, shell=True)
Example #9
def Imgs2ArrsBatch(args):
    """
    %prog HyperDirPattern("CM*")
    generate img2arr jobs for all hyperspectral image dirs
    """
    p = OptionParser(Imgs2ArrsBatch.__doc__)
    p.set_slurm_opts()
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    pattern, = args
    all_dirs = [i for i in glob(pattern) if os.path.isdir(i)]
    for i in all_dirs:
        cmd = 'python -m schnablelab.CNN.Predict Imgs2Arrs %s\n' % i
        jobname = i + '.img2npy'
        header = Slurm_header % (opts.time, opts.memory, jobname, jobname, jobname)
        #header += "ml anaconda\nsource activate MCY\n"
        header += cmd
        with open('%s.img2arr.slurm' % i, 'w') as f:
            f.write(header)
        print('slurm job for %s has been generated.' % i)
Example #10
def PlantHullBatch(args):
    """
    %prog PlantHullBatch Pattern("*.png") job_n
    generate PlantHull jobs for all image files
    """
    p = OptionParser(PlantHullBatch.__doc__)
    p.add_option('--mode',
                 default='real',
                 choices=['real', 'simu'],
                 help="real image or simulated image.")
    p.set_slurm_opts()
    opts, args = p.parse_args(args)
    if len(args) != 2:
        sys.exit(not p.print_help())
    pattern, jobn = args
    all_imgs = glob(pattern)
    all_cmds = []
    for img in all_imgs:
        imgpath = Path(img)
        outpre = str(imgpath.stem)
        if opts.mode == 'real':
            cmd = 'python -m schnablelab.ImgPros.Preprocess PlantHull %s --crop True --segmentation True --border 80,10,10,10\n' % img
        else:
            cmd = 'python -m schnablelab.ImgPros.Preprocess PlantHull %s --border 0,40,10,0 --thresh_cutoff 160\n' % img
        print(cmd)
        all_cmds.append(cmd)
    grps = cutlist(all_cmds, int(jobn))
    for gn, grp in grps:
        header = Slurm_header % (opts.time, opts.memory, gn, gn, gn)
        header += "ml anaconda\nsource activate MCY\n"
        for cmd in grp:
            header += cmd
        jobname = '%s.ppnum.slurm' % gn
        with open(jobname, 'w') as f:
            f.write(header)
        print('%s job file generated!' % jobname)
from schnablelab.apps.base import glob
from pathlib import Path
import sys
from subprocess import run

pfx_r1 = glob('*_trim.R1.fastq')
pfx_r2 = glob('*_trim.R2.fastq')
pfx_un = glob('*_trim.Unpaired.fastq')

# files are paired by position, so the sample-name check below guards
# against mismatched ordering between the R1 and R2 lists
for r1, r2, un in zip(pfx_r1, pfx_r2, pfx_un):
    print(r1, r2)
    sm1 = r1.split('_trim')[0]
    sm2 = r2.split('_trim')[0]
    if sm1 == sm2:
        cmd = 'python -m schnablelab.SNPcalling.Preprocess align /work/schnablelab/cmiao/TimeSeriesGWAS/Genotype_GBS/Reference_Genome_4th/Sbicolor_454_v3.0.1 %s %s' % (
            r1, r2)
        run(cmd, shell=True)
    else:
        sys.exit('sm1 != sm2')
    print(un)
    cmd1 = 'python -m schnablelab.SNPcalling.Preprocess align /work/schnablelab/cmiao/TimeSeriesGWAS/Genotype_GBS/Reference_Genome_4th/Sbicolor_454_v3.0.1 %s' % un
    run(cmd1, shell=True)