Example 1
def Batch2JPG(args):
    '''
    %prog Batch2JPG in_dir out_dir

    apply toJPG on a large number of images
    '''
    p = OptionParser(Batch2JPG.__doc__)
    p.add_option('--pattern',
                 default='*.png',
                 help="file pattern of png files under the 'dir_in'")
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=Batch2JPG.__name__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    in_dir_path = Path(in_dir)
    pngs = in_dir_path.glob(opts.pattern)
    cmds = []
    for img_fn in pngs:
        img_fn = str(img_fn).replace(' ', '\\ ')  # escape spaces for the shell
        cmd = 'python -m schnablelab.ImageProcessing.base toJPG '\
              f'{img_fn} --out_dir {out_dir}'
        cmds.append(cmd)
    cmd_sh = '%s.cmds%s.sh' % (opts.job_prefix, len(cmds))
    pd.DataFrame(cmds).to_csv(cmd_sh, index=False, header=None)
    print('check %s for all the commands!' % cmd_sh)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm(cmds, put2slurm_dict)
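
Several of these examples hand a command list to put2slurm, which is defined elsewhere in the package. A minimal sketch of what such a helper could look like, assuming it only needs the command list and a dict of slurm options (job_prefix, time and memory are hypothetical keys inferred from the call sites):

def put2slurm(cmds, config):
    """Sketch only, not the package's implementation: write one .slurm file
    per command using hypothetical config keys inferred from usage."""
    for i, cmd in enumerate(cmds):
        job_name = '%s_%s' % (config.get('job_prefix', 'job'), i)
        script = '#!/bin/sh\n'
        script += '#SBATCH --time=%s:00:00\n' % config.get('time', 10)
        script += '#SBATCH --mem-per-cpu=%s\n' % config.get('memory', 10000)
        script += '#SBATCH --job-name=%s\n' % job_name
        script += cmd + '\n'
        with open('%s.slurm' % job_name, 'w') as f:
            f.write(script)
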
Example 2
def keras_cnn(args):
    """
    %prog train_dir val_dir num_category model_name_prefix
    
    Run VGG model
    """
    p = OptionParser(keras_cnn.__doc__)
    p.add_option('--epoch', default=500, help='number of epochs')
    p.add_option('--lr_n', default=1, type='int',
        help='train model with different learning rates. if n=1: set lr to 0.001. if n>1: try different lr from 1e-2 to 1e-5 n times')
    p.set_slurm_opts(gpu=True)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    train_dir, val_dir, numC, mnp = args #mnp:model name prefix
    out_fns = fns(mnp, n=opts.lr_n)
    for i in range(int(opts.lr_n)):
        cmd = 'python -m schnablelab.CNN.keras_vgg %s %s %s %s %s %s'%(train_dir, val_dir, numC, out_fns.lrs[i], opts.epoch, out_fns.model_name[i]) 
        SlurmHeader = Slurm_gpu_header%(opts.time, opts.memory, out_fns.model_name[i], out_fns.model_name[i], out_fns.model_name[i])
        SlurmHeader += 'module load anaconda\nsource activate MCY\n'
        SlurmHeader += cmd
        with open('%s.slurm' % out_fns.model_name[i], 'w') as f:
            f.write(SlurmHeader)
        print('slurm file %s.slurm has been created, you can sbatch your job file.'%out_fns.model_name[i])
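
Slurm_gpu_header here (and Slurm_header in the later examples) is a module-level template interpolated with five values: time, memory, and the job prefix repeated for the job name and the stderr/stdout files. A plausible sketch, assuming the usual sbatch directives; the real templates live elsewhere in the package:

Slurm_header = '''#!/bin/sh
#SBATCH --time=%s:00:00
#SBATCH --mem-per-cpu=%s
#SBATCH --job-name=%s
#SBATCH --error=./%s.err
#SBATCH --output=./%s.out

'''
# hypothetical GPU variant adding partition/gres lines
Slurm_gpu_header = Slurm_header.replace(
    '#!/bin/sh', '#!/bin/sh\n#SBATCH --partition=schnablelab\n#SBATCH --gres=gpu')
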
Example 3
def gentesting(args):
    """
    %prog source_imgs_dir source_imgs_csv training_imgs_csv testing_imgs_per_cls output_dir

    create the balanced testing dataset for each class
    """
    p = OptionParser(gentesting.__doc__)
    p.add_option('--header',
                 default=None,
                 help='specify if the source csv file has a header')
    p.add_option('--comma_sep',
                 default=True,
                 help='specify if the csv file is comma separated')
    p.add_option('--groupby_col',
                 default=1,
                 help='specify the groupby column. 0: 1st column; 1: 2nd column')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    source_dir, source_csv, training_csv, ipc, testing_dir = args  # ipc: number of images per class.

    # read the source csv file
    if opts.header and opts.comma_sep:  # comma separated, no header row
        df0 = pd.read_csv(source_csv, header=None)
    elif (not opts.header) and opts.comma_sep:  # comma separated, with header row
        df0 = pd.read_csv(source_csv)
    elif not (opts.header and opts.comma_sep):  # tab/space separated
        df0 = pd.read_csv(source_csv, delim_whitespace=True)
    else:
        sys.exit('this combination of --header and --comma_sep is not implemented yet!')
    print('shape of source csv %s: %s' % (source_csv, df0.shape))
Example 4
def hmp2MVP(args):
    """
    %prog hmp2MVP hmp MVP_prefix

    Convert hmp genotypic data to MVP datasets (*.numeric and *.map).
    """
    p = OptionParser(hmp2MVP.__doc__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    hmp, mvp_pre = args
    f1 = open(hmp)
    f1.readline()
    f2 = open(mvp_pre + '.numeric', 'w')
    f3 = open(mvp_pre + '.map', 'w')
    f3.write('SNP\tChrom\tBP\n')
    for i in f1:
        j = i.split()
        rs = j[0]
        ref, alt = j[1].split('/')
        newNUMs = judge(ref, alt, j[11:])
        newline = '\t'.join(newNUMs) + '\n'
        f2.write(newline)
        chro, pos = j[2], j[3]
        f3.write('%s\t%s\t%s\n' % (rs, chro, pos))
    f1.close()
    f2.close()
    f3.close()
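
judge is imported from elsewhere in the package; its call sites (here and in hmp2bimbam below) suggest it maps hmp genotype calls to numeric strings given the ref and alt alleles. A hedged sketch assuming a 0/1/2 alt-allele dosage coding, which may differ from the real function:

def judge(ref, alt, genotypes):
    """Sketch only: map hmp genotype calls such as 'AA' or 'AT' to numeric
    strings, assuming 0/1/2 alt-allele dosage; anything else becomes 'NA'."""
    mapping = {ref + ref: '0', ref + alt: '1', alt + ref: '1', alt + alt: '2'}
    return [mapping.get(g, 'NA') for g in genotypes]
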
Example 5
def cpu(args):
    """
    %prog  
    request a cpu node from hcc.
    """
    p = OptionParser(cpu.__doc__)
    p.add_option("--partition",
                 default="jclarke",
                 choices=('batch', 'jclarke'),
                 help="which partition? [default: %default]")
    p.add_option("--memory",
                 default="10240",
                 help="specify the how much memory [default: %default]")
    p.add_option("--time",
                 default='20',
                 help="specify the time (hour) [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) == 0:
        print('add --help to see options.\n')
        cmd = 'srun --partition=%s --mem-per-cpu=%s --ntasks-per-node=6 --nodes=1 --time=%s:0:0 --pty $SHELL\n' % (
            opts.partition, opts.memory, opts.time)
        print(cmd)
        #call(cmd, shell=True)
    else:
        sys.exit(not p.print_help())
Example 6
def merge_files(args):
    """
    %prog merge_files pattern out_fn
    combine split vcf files into a single one. Pattern example: 'hmp321_agpv4_chr9.%s.beagle.vcf'
    revise the lambda function to fit your file patterns
    """

    p = OptionParser(merge_files.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    pattern, out_fn = args

    fns = [str(i) for i in list(Path('.').glob(pattern))]
    fns_sorted = sorted(fns, key=lambda x: int(x.split('.')[0][3:]))
    print(fns_sorted)
    print('%s files found!'%len(fns_sorted))

    with open(out_fn, 'w') as f:
        print(fns_sorted[0])
        with open(fns_sorted[0]) as f1:
            for i in f1:
                f.write(i)
        for i in fns_sorted[1:]:
            print(i)
            with open(i) as f2:
                for j in f2:
                    if not j.startswith('#'):
                        f.write(j)
Example 7
def sortbam(args):
    """
    %prog in_dir out_dir
        in_dir: bam files folder
        out_dir: sorted bam files folder

    sort bam files using samtools/0.1 sort function.
    """
    p = OptionParser(sortbam.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args

    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    bams = dir_path.glob('*.bam')
    for bam in bams:
        prf = bam.name.split('.bam')[0]
        sort_bam = prf + '.sorted'
        sort_bam_path = out_path / sort_bam
        cmd = 'samtools sort %s %s' % (bam, sort_bam_path)
        header = Slurm_header % (100, 15000, prf, prf, prf)
        header += 'ml samtools/0.1\n'
        header += cmd
        with open('%s.sortbam.slurm' % prf, 'w') as f:
            f.write(header)
Example 8
def SubsamplingSMs(args):
    """
    %prog SubsamplingSMs input_vcf SMs.csv
    grep the subset of samples listed in SMs.csv (one sample name per row, no header) from the input_vcf
    """
    p = OptionParser(SubsamplingSMs.__doc__)
    _, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputvcf, SMcsv, = args
    outputvcf = Path(inputvcf).name.replace('.vcf', '_subSMs.vcf')

    vcf = ParseVCF(inputvcf)
    df_vcf = vcf.AsDataframe()

    IDs = pd.read_csv(SMcsv, header=None)[0].values
    num_IDs = IDs.shape[0]
    print('number of specified Samples: %s'%num_IDs)

    subsm = vcf.SMs_header
    for id in IDs:
        if id not in vcf.SMs:
            print('%s not found in vcf...'%id)
        else:
            subsm.append(id)
    print('%s out of %s found in VCF'%(len(subsm)-9, num_IDs))

    df_vcf = df_vcf[subsm]
    with open(outputvcf, 'w') as f:
        f.writelines(vcf.HashChunk2)
    df_vcf.to_csv(outputvcf, sep='\t', index=False, mode='a')
    print('Done! check output %s...'%outputvcf)   
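
ParseVCF is defined elsewhere; the attributes used above (SMs, SMs_header, HashChunk2, AsDataframe) suggest a small wrapper around the VCF text. A minimal sketch consistent with those call sites, under the assumption that HashChunk2 holds the '##' meta lines:

import pandas as pd

class ParseVCF:
    """Sketch inferred from usage, not the package's implementation."""
    def __init__(self, fn):
        self.fn = fn
        self.HashChunk2 = []  # the '##' meta-information lines
        with open(fn) as f:
            for line in f:
                if line.startswith('##'):
                    self.HashChunk2.append(line)
                elif line.startswith('#CHROM'):
                    # 9 fixed VCF columns followed by the sample names
                    self.SMs_header = line.rstrip().split('\t')
                    self.SMs = self.SMs_header[9:]
                    break

    def AsDataframe(self):
        # the '#CHROM' line becomes the dataframe header
        return pd.read_csv(self.fn, sep='\t', skiprows=len(self.HashChunk2))
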
Example 9
def gatk(args):
    """
    %prog gatk ref.fa bam_list.txt region.txt out_dir

    run GATK HaplotypeCaller
    """
    p = OptionParser(gatk.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ref, bams, regions, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    with open(bams) as f:
        inputs = ''.join(['-I %s \\\n'%(i.rstrip()) for i in f])
    with open(regions) as f:
        for reg in f:
            reg = reg.strip()
            if ':0-' in reg:
                reg = reg.replace(':0-', ':1-')
            reg_fn = reg.replace(':','_')
            reg_fn_vcf = '%s.gatk.vcf'%reg_fn
            reg_fn_vcf_path = out_path/reg_fn_vcf
            cmd = "gatk --java-options '-Xmx13G' HaplotypeCaller \\\n-R %s -L %s \\\n%s-O %s"%(ref, reg, inputs, reg_fn_vcf_path)
            header = Slurm_header%(165, 15000, reg_fn, reg_fn, reg_fn)
            header += 'ml gatk4/4.1\n'
            header += cmd
            with open('%s.gatk.slurm'%reg_fn, 'w') as f1:
                f1.write(header)
Example 10
def EstimateLD(args):
    """
    %prog dir_in dir_out
    run LD decay analysis using Tassel
    """
    p = OptionParser(EstimateLD.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*vcf', help='pattern of vcf files')
    p.add_option('--window_size',
                 default='1000',
                 help='specify how many SNPs in the sliding window')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    dir_in, dir_out = args
    dir_out = Path(dir_out)
    if not dir_out.exists():
        dir_out.mkdir()
    for vcf in Path(dir_in).glob(opts.pattern):
        prefix = vcf.name.replace('.vcf', '')
        out_fn = '%s.ld' % prefix
        cmd = 'run_pipeline.pl -Xms512m -Xmx14g -fork1 -vcf %s -ld -ldWinSize %s -ldType SlidingWindow -td_tab %s/%s\n' % (
            vcf, opts.window_size, dir_out, out_fn)
        header = Slurm_header % (opts.time, 15000, prefix, prefix, prefix)
        header += 'ml java/1.8\n'
        header += 'ml tassel/5.2\n'
        header += cmd
        with open('%s.estLD.slurm' % prefix, 'w') as f:
            f.write(header)
        print(
            'slurm file %s.estLD.slurm has been created, you can submit your job file.'
            % prefix)
Example 11
def SummarizeLD(args):
    """
    %prog dir_in dir_out
    summarize LD decay in log scale
    """
    p = OptionParser(SummarizeLD.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern',
                 default='*.ld.txt',
                 help='pattern of ld.txt files')
    p.add_option('--max_dist',
                 default='1,000,000',
                 help='the maximum ld distance')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    dir_in, dir_out = args
    dir_out = Path(dir_out)
    if not dir_out.exists():
        dir_out.mkdir()
    num0 = opts.max_dist.count('0')

    for fn in Path(dir_in).glob(opts.pattern):
        prefix = '.'.join(fn.name.split('.')[0:-1])
        out_fn = '%s.sum.csv' % prefix
        cmd = 'python -m schnablelab.SNPcalling.base SummarizeLD %s %s %s/%s\n' % (
            fn, num0, dir_out, out_fn)
        header = Slurm_header % (opts.time, opts.memory, prefix, prefix,
                                 prefix)
        header += cmd
        with open('%s.sumLD.slurm' % prefix, 'w') as f:
            f.write(header)
        print(
            'slurm file %s.sumLD.slurm has been created, you can submit your job file.'
            % prefix)
Example 12
def only_MAF(args):
    """
    %prog in_dir out_dir

    filter MAF
    """
    p = OptionParser(only_MAF.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern',
                 default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    p.add_option('--maf', default='0.01', help='maf cutoff')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    vcfs = dir_path.glob(opts.pattern)
    for vcffile in vcfs:
        prefix = '.'.join(vcffile.name.split('.')[0:-1])
        cmd = "python -m schnablelab.SNPcalling.base MAF %s %s\n" % (vcffile,
                                                                     opts.maf)
        with open('%s.maf.slurm' % prefix, 'w') as f:
            header = Slurm_header % (opts.time, opts.memory, prefix, prefix,
                                     prefix)
            header += 'ml bcftools\n'
            header += cmd
            f.write(header)
            print(
                'slurm file %s.maf.slurm has been created, you can sbatch your job file.'
                % prefix)
Example 13
def filterSpeciesTreatment(args):
    """
    %prog filterSpeciesTreatment tissue_csv output_prefix
    tissue_csv (21 columns): sb_gene, si_gene, bd_gene, sb_cold_1-3, sb_normal_1-3, si_cold_1-3, si_normal_1-3, bd_cold_1-3, bd_normal_1-3
    """
    p = OptionParser(filterSpeciesTreatment.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    tissue_csv, outprf, = args
    df = pd.read_csv(tissue_csv)

    jdgs = []
    spes, trts = ['sorghum_', 'millet_', 'brachy_'], ['cold_', 'normal_']
    for spe in spes:
        cols = [i for i in df.columns[3:] if spe in i]
        spe_judg = (df[cols] == 0).sum(axis=1) <= 3
        jdgs.append(spe_judg)
    for trt in trts:
        cols = [i for i in df.columns[3:] if trt in i]
        trt_judg = (df[cols] == 0).sum(axis=1) <= 5
        jdgs.append(trt_judg)
    for spe in spes:
        for trt in trts:
            cols = [i for i in df.columns[3:] if spe + trt in i]
            jdg = (df[cols] == 0).sum(axis=1) <= 1
            jdgs.append(jdg)
    final_judg = pd.concat(jdgs, axis=1).sum(axis=1) == 11
    final_df = df[final_judg]
    final_df.to_csv('%s.csv' % outprf, index=False)
Example 14
def Resize(args):
    '''
    %prog Resize img1 img2 img3 ...

    resize image using PIL.
    If multiple images are provided, the same resizing dimensions will be applied to all of them
    '''
    p = OptionParser(Resize.__doc__)
    p.add_option('--output_dim',
                 default='1227,1028',
                 help='the dimension (width,height) after resizing')
    p.add_option('--out_dir',
                 default='.',
                 help='specify the output image directory')
    p.add_option('--to_jpg',
                 default=False,
                 action='store_true',
                 help='save image in jpg format')
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())

    dim = [int(i) for i in opts.output_dim.split(',')]
    for img_fn in args:
        img = ProsImage(img_fn)
        if opts.to_jpg:
            img_out_fn = Path(img.fn).name.replace(f'.{img.format}',
                                                   '.Rsz.jpg')
            img.resize(dim).convert('RGB').save(
                Path(opts.out_dir) / img_out_fn)
        else:
            img_out_fn = Path(img.fn).name.replace(f'.{img.format}',
                                                   f'.Rsz.{img.format}')
            img.resize(dim).save(Path(opts.out_dir) / img_out_fn)
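
ProsImage is the package's image wrapper; the attributes used above (fn, format) and the chained resize/convert/save calls suggest a thin layer over PIL. A sketch under those assumptions:

from PIL import Image

class ProsImage:
    """Sketch only: thin PIL wrapper exposing the attributes used above."""
    def __init__(self, fn):
        self.fn = fn
        self.format = fn.split('.')[-1]
        self.img = Image.open(fn)

    def resize(self, dim):
        # returns a PIL image, so .convert()/.save() chain as above
        return self.img.resize(tuple(dim))
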
Example 15
def hyp2arr(args):
    '''
    %prog hyp2arr hyp_dir out_fn

    convert hyperspectral images to numpy array
    '''
    p = OptionParser(hyp2arr.__doc__)
    opts, args = p.parse_args(args)
    if len(args)==0:
        sys.exit(not p.print_help())
    hyp_dir, out_fn, = args

    discard_imgs = ['0_0_0.png', '1_0_0.png']
    dir_path = Path(hyp_dir)
    if not dir_path.exists():
        sys.exit('%s does not exist!'%hyp_dir)
    imgs = list(dir_path.glob('*.png'))
    imgs = sorted(imgs, key=lambda x: int(x.name.split('_')[0]))
    num_imgs = len(imgs)
    print('%s images found.'%num_imgs)
    img_arrs = []
    for i in imgs:
        if i.name not in discard_imgs:
            arr = cv2.imread(str(i), cv2.IMREAD_GRAYSCALE)
            img_arrs.append(arr)
    img_array = np.stack(img_arrs, axis=2)
    print(img_array.shape)
    np.save(out_fn, img_array)
Example 16
def MLM(args):
    """
    %prog MLM GenoPrefix('*.mean' and '*.annotation') Pheno Outdir
    Run automated GEMMA Mixed Linear Model
    """ 
    p = OptionParser(MLM.__doc__)
    p.add_option('--kinship', default=False,
        help='specify the relatedness matrix file name')
    p.add_option('--pca', default=False,
        help='specify the principal components file name')
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)
    
    if len(args) == 0:
        sys.exit(not p.print_help())
    GenoPrefix, Pheno, Outdir = args
    meanG, annoG = GenoPrefix+'.mean', GenoPrefix+'.annotation'
    outprefix = '.'.join(Pheno.split('/')[-1].split('.')[0:-1])
    cmd = '%s -g %s -p %s -a %s -lmm 4 -outdir %s -o %s' \
        %(gemma, meanG, Pheno, annoG, Outdir, outprefix)
    if opts.kinship:
        cmd += ' -k %s'%opts.kinship
    if opts.pca:
        cmd += ' -c %s'%opts.pca
    print('The command running on the local node:\n%s'%cmd)

    h = Slurm_header
    header = h%(opts.time, opts.memory, opts.prefix, opts.prefix, opts.prefix)
    header += cmd
    with open('%s.mlm.slurm' % outprefix, 'w') as f:
        f.write(header)
    print('slurm file %s.mlm.slurm has been created, you can sbatch your job file.'%outprefix)
Example 17
def splitVCF(args):
    """
    %prog splitVCF N vcf
    split vcf to N smaller files with equal size
    """
    p = OptionParser(splitVCF.__doc__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())
    N, vcffile, = args
    N = int(N)
    prefix = vcffile.split('.')[0]
    cmd_header = "sed -ne '/^#/p' %s > %s.header" % (vcffile, prefix)
    subprocess.call(cmd_header, shell=True)
    child = subprocess.Popen('wc -l %s' % vcffile, shell=True, stdout=subprocess.PIPE)
    total_line = int(child.communicate()[0].split()[0])
    print('total %s lines' % total_line)
    step = total_line // N  # integer division; sed needs whole line numbers
    print(1)
    cmd_first = "sed -n '1,%sp' %s > %s.1.vcf" % (step, vcffile, prefix)
    subprocess.call(cmd_first, shell=True)
    i, ed = 1, step  # defaults so the N == 2 case still works below
    for i in range(2, N):
        print(i)
        st = (i - 1) * step + 1
        ed = i * step
        cmd = "sed -n '%s,%sp' %s > %s.%s.tmp.vcf" % (st, ed, vcffile, prefix, i)
        subprocess.call(cmd, shell=True)
    print(i + 1)
    cmd_last = "sed -n '%s,%sp' %s > %s.%s.tmp.vcf" % ((ed + 1), total_line, vcffile, prefix, (i + 1))
    subprocess.call(cmd_last, shell=True)
    for i in range(2, N + 1):
        cmd_cat = 'cat %s.header %s.%s.tmp.vcf > %s.%s.vcf' % (prefix, prefix, i, prefix, i)
        subprocess.call(cmd_cat, shell=True)
Example 18
def align(args):
    """
    %prog align indx_base fq_fn ...

    do alignment using bwa.
    """
    p = OptionParser(align.__doc__)
    opts, args = p.parse_args(args)
    if len(args)==0:
        sys.exit(not p.print_help())
    ref_base = args[0]
    fq_fns = args[1:]
    print(fq_fns)
    sm = Path(fq_fns[0]).name.split('_trim')[0]
    gid = sm.split('R')[0]
    print(gid)
    R = r"'@RG\tID:%s\tSM:%s'"%(gid, gid)
    if len(fq_fns)==1:
        sam = sm+'.se.sam'
        print('run single-end alignment')
        cmd = 'bwa mem -R %s %s %s > %s \n'%(R, ref_base, fq_fns[0], sam)
        prf = '%s.se.align'%sm
    elif len(fq_fns)==2:
        sam = sm+'.pe.sam'
        print('run paired-end alignment')
        cmd = 'bwa mem -R %s %s %s %s > %s \n'%(R, ref_base, fq_fns[0], fq_fns[1], sam)
        prf = '%s.pe.align'%sm
    else:
        sys.exit('only one or two read files')
    header = Slurm_header%(100, 10000, prf, prf, prf)
    header += 'ml bwa\n'
    header += cmd
    with open('%s.slurm'%prf, 'w') as f:
        f.write(header)
Example 19
def fetchProSeq(args):
    """
    %prog GeneList seq_file output_prefix

    extract protein sequences of candidate genes
    """
    p = OptionParser(fetchProSeq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())
    genelist, SeqFile, out_prefix, = args
    cmd = "grep '>' %s|cut -d ' ' -f 1|cut -d '>' -f 2 > AllGene.names" % SeqFile
    call(cmd, shell=True)

    df_Genes = pd.read_csv(genelist, header=None)
    df_Trans = pd.read_csv('AllGene.names', header=None)
    df_Trans['gene'] = df_Trans[0].str.split('_').str.get(0)
    df1 = df_Trans[df_Trans['gene'].isin(df_Genes[0])].copy()  # copy to avoid SettingWithCopyWarning
    df1['gene'] = df1['gene'].astype('category')
    df1['gene'] = df1['gene'].cat.set_categories(df_Genes[0].tolist())
    df2 = df1.sort_values(['gene', 0]).reset_index(drop=True)
    df2[0].to_csv('%s.ProSeq.names' % out_prefix, index=False, header=False)

    for i in list(df2[0]):
        print('fetching %s' % i)
        cmd = "%s %s %s >> %s" % (faOneRecord, SeqFile, i,
                                  out_prefix + '.seqs')
        call(cmd, shell=True)
    print('Done!')
Example 20
def split_fa_region(args):
    """
    %prog fa.fai region_size out_fn
        fa.fai: index file for the fa file
        region_size: the size of each split region
        out_fn: the output file

    generate a list of freebayes/bamtools region specifiers
    """
    p = OptionParser(split_fa_region.__doc__)
    opts, args = p.parse_args(args)
    if len(args)==0:
        sys.exit(not p.print_help())
    fasta_index_file, region_size, fn_out, = args
    fasta_index_file = open(fasta_index_file)
    region_size = int(region_size)
    fn_out = open(fn_out, 'w')
    for line in fasta_index_file:
        fields = line.strip().split("\t")
        chrom_name = fields[0]
        chrom_length = int(fields[1])
        region_start = 0
        while region_start < chrom_length:
            end = min(region_start + region_size, chrom_length)
            fn_out.write('%s:%s-%s\n' % (chrom_name, region_start, end))
            region_start = end
    fn_out.close()
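
A quick sanity check of the splitting logic, using a toy index entry (sequence chr1 of length 250 with region_size 100); each region starts where the previous one ended:

chrom_name, chrom_length, region_size = 'chr1', 250, 100
region_start = 0
while region_start < chrom_length:
    end = min(region_start + region_size, chrom_length)
    print('%s:%s-%s' % (chrom_name, region_start, end))
    region_start = end
# prints chr1:0-100, chr1:100-200, chr1:200-250
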
Example 21
def sam2bam(args):
    """
    %prog in_dir out_dir
        in_dir: sam files folder
        out_dir: bam files folder

    convert sam to bam using samtools/0.1.
    """
    p = OptionParser(sam2bam.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    in_dir, out_dir, = args

    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('%s does not exist...' % out_dir)
    dir_path = Path(in_dir)
    sams = dir_path.glob('*.sam')
    for sam in sams:
        prf = sam.name.split('.sam')[0]
        bam = prf + '.bam'
        bam_path = out_path / bam
        cmd = 'samtools view -bS %s > %s' % (sam, bam_path)
        header = Slurm_header % (100, 15000, prf, prf, prf)
        header += 'ml samtools/0.1\n'
        header += cmd
        with open('%s.sam2bam.slurm' % prf, 'w') as f:
            f.write(header)
Example 22
def reheader(args):
    """
    %prog reheader input_hmp names.csv

    substitute the sample names in the hmp header using sed.
    names.csv:
        comma separated without a header line
        1st column is the old name
        2nd column is the new name
    """
    p = OptionParser(reheader.__doc__)
    _, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputhmp, names_csv, = args
    outputhmp = Path(inputhmp).name.replace('.hmp', '_reheader.hmp')

    hmp = ParseHmp(inputhmp)

    cmd = 'sed '
    for _, row in pd.read_csv(names_csv, header=None).iterrows():
        old_nm, new_nm = row[0], row[1]
        if old_nm not in hmp.SMs:
            print('%s was not found in hmp...' % old_nm)
        else:
            cmd += "-e '1s/%s/%s/' " % (old_nm, new_nm)
    cmd += '%s > %s' % (inputhmp, outputhmp)
    print('command:\n%s' % cmd)
    choice = input("Run the above command? (yes/no) ")
    if choice == 'yes':
        call(cmd, shell=True)
        print('Done! check %s' % outputhmp)
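
Like ParseVCF above, ParseHmp is defined elsewhere; the call sites here and in the later SubsamplingSMs example imply SMs, SMs_header (11 fixed hmp columns plus samples) and AsDataframe. A sketch under those assumptions:

import pandas as pd

class ParseHmp:
    """Sketch inferred from usage: hmp files are tab-separated with one
    header line whose first 11 columns are fixed."""
    def __init__(self, fn):
        self.fn = fn
        with open(fn) as f:
            self.SMs_header = f.readline().rstrip().split('\t')
        self.SMs = self.SMs_header[11:]

    def AsDataframe(self):
        return pd.read_csv(self.fn, sep='\t')
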
Example 23
def reorgnzGemmaKinship(args):
    """
    %prog reorgnzGemmaKinship GEMMAkinship hmp

    Reorganize kinship result from GEMMA so it can be used in other software, like GAPIT.
    The hmp file only provides the order of the sample names.
    """
    p = OptionParser(reorgnzGemmaKinship.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    gemmaKin, hmpfile, = args

    f = open(hmpfile)
    SMs = f.readline().split()[11:]
    f.close()
    f1 = open(gemmaKin)
    f2 = open('GAPIT.' + gemmaKin, 'w')
    for i, j in zip(SMs, f1):
        newline = i + '\t' + j
        f2.write(newline)
    f1.close()
    f2.close()
    print(
        "Finished! Kinship matrix file 'GAPIT.%s' for GAPIT has been generated."
        % gemmaKin)
Example 24
def SubsamplingSMs(args):
    """
    %prog SubsamplingSMs input_hmp SMs.csv
    grep the subset of samples listed in SMs.csv (one sample name per row, no header) from the input_hmp
    """
    p = OptionParser(SubsamplingSMs.__doc__)
    _, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    inputhmp, SMcsv, = args
    outputhmp = Path(inputhmp).name.replace('.hmp', '_subSMs.hmp')

    hmp = ParseHmp(inputhmp)
    df_hmp = hmp.AsDataframe()

    IDs = pd.read_csv(SMcsv, header=None)[0].values
    num_IDs = IDs.shape[0]
    print('number of specified Samples: %s' % num_IDs)

    subsm = hmp.SMs_header
    for id in IDs:
        if id not in hmp.SMs:
            print('%s was not found in hmp...' % id)
        else:
            subsm.append(id)
    print('%s out of %s found in Hmp' % (len(subsm) - 11, num_IDs))

    df_hmp = df_hmp[subsm]
    df_hmp.to_csv(outputhmp, sep='\t', index=False, na_rep='NA')
    print('Done! check output %s...' % outputhmp)
Example 25
def gpu(args):
    """
    %prog
    request a gpu node from hcc.
    """
    p = OptionParser(gpu.__doc__)
    p.add_option("--memory",
                 default="12000",
                 help="specify the how much memory [default: %default]")
    p.add_option("--time",
                 default='20',
                 help="specify the time (hour) [default: %default]")
    p.add_option(
        "--model",
        default='gpu_k40',
        choices=('gpu_p100', 'gpu_k20', 'gpu_k40'),
        help=
        "specify gpu mode, p100:16gb, k40:12gb, k20:5bg [default: %default]")
    opts, args = p.parse_args(args)
    if len(args) == 0:
        print('add --help to see options.\n')
        cmd = 'srun --partition=schnablelab --gres=gpu --constraint=%s --mem-per-cpu=%s --ntasks-per-node=1 --nodes=1 --time=%s:0:0 --pty $SHELL\n' % (
            opts.model, opts.memory, opts.time)
        print(cmd)
        #call(cmd, shell=True)
    else:
        sys.exit(not p.print_help())
Example 26
def DownsamplingSNPs(args):
    """
    %prog downsampling input_hmp

    Pick up some SNPs from a huge hmp file using Linux sed command
    """
    p = OptionParser(DownsamplingSNPs.__doc__)
    p.add_option('--downscale', default=10, help='specify the downscale level')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=DownsamplingSNPs.__name__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp',
                                            '_ds%s.hmp' % opts.downscale)
    cmd = "sed -n '1~%sp' %s > %s" % (opts.downscale, inputhmp, outputhmp)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
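
The sed address used above relies on the GNU 'first~step' extension: it keeps line 1 (the hmp header) and every downscale-th line after it. A pure-Python equivalent, shown only to document the semantics (file names are placeholders):

downscale = 10
with open('input.hmp') as fin, open('input_ds10.hmp', 'w') as fout:
    for idx, line in enumerate(fin, start=1):
        if (idx - 1) % downscale == 0:  # keeps lines 1, 11, 21, ...
            fout.write(line)
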
Example 27
def extract_info(args):
    """
    %prog log_file output_fn
    
    extract testing and prediction results from dpp log file
    """
    p = OptionParser(extract_info.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    logfile, opp, = args

    f0 = open(logfile)
    all_lines = f0.readlines()
    test_idx, predict_idx, hist_idx = 0, 0, 0
    for i, j in enumerate(all_lines):
        if 'All test labels:' in j:
            test_idx = i
        if 'All predictions:' in j:
            predict_idx = i
        if 'Histogram of ' in j:
            hist_idx = i

    test_lines = all_lines[test_idx + 1:predict_idx]
    ground_truth = extract_num(test_lines)
    #print(len(ground_truth), '\n', ground_truth)

    predict_lines = all_lines[predict_idx + 1:hist_idx]
    prediction = extract_num(predict_lines)
    #print(len(prediction), '\n', prediction)

    df = pd.DataFrame(
        dict(zip(['groundtruth', 'prediction'], [ground_truth, prediction])))
    df.to_csv(opp, index=False, sep='\t')
    print('Done! check %s' % opp)
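
extract_num is not shown; given that dpp logs print the label and prediction arrays across several lines, a plausible sketch is a regex pull of every numeric token from the captured lines (an assumption, not the original code):

import re

def extract_num(lines):
    """Sketch only: collect all numbers appearing in a block of log lines."""
    nums = []
    for line in lines:
        nums.extend(float(n) for n in re.findall(r'-?\d+\.?\d*', line))
    return nums
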
Example 28
def hmp2bimbam(args):
    """
    %prog hmp2bimbam hmp bimbam_prefix
    Convert hmp genotypic data to GEMMA bimbam files (*.mean and *.annotation).
    """
    p = OptionParser(hmp2bimbam.__doc__)
    opts, args = p.parse_args(args)

    if len(args) == 0:
        sys.exit(not p.print_help())

    hmp, bim_pre = args
    f1 = open(hmp)
    f1.readline()
    f2 = open(bim_pre + '.mean', 'w')
    f3 = open(bim_pre + '.annotation', 'w')
    for i in f1:
        j = i.split()
        rs = j[0]
        try:
            ref, alt = j[1].split('/')
        except ValueError:
            print('omit %s...' % rs)
            continue
        newNUMs = judge(ref, alt, j[11:])
        newline = '%s,%s,%s,%s\n' % (rs, ref, alt, ','.join(newNUMs))
        f2.write(newline)
        pos = j[3]
        chro = j[2]
        f3.write('%s,%s,%s\n' % (rs, pos, chro))
    f1.close()
    f2.close()
    f3.close()
Example 29
def Imgs2Arrs(args):
    '''
    %prog hyp_dir(filepath of hyperspectral image data) 
    Returns: numpy array object with shape [x, y, z].
        x,y dims correspond to pixel coordinates for each image
        z dim corresponds to hyperspectral image wavelength.
    '''
    import cv2

    p = OptionParser(Imgs2Arrs.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    mydir, = args
    imgs = [i for i in os.listdir(mydir) if i.endswith('png')]
    sorted_imgs = sorted(imgs, key=lambda x: int(x.split('_')[0]))
    all_arrs = []
    for i in sorted_imgs[2:]:
        print(i)
        #img = cv2.imread('%s/%s'%(mydir, i), cv2.IMREAD_GRAYSCALE)
        img = np.array(Image.open('%s/%s' % (mydir, i)).convert('L'))
        print(img.shape)
        all_arrs.append(img)
    arrs = np.stack(all_arrs, axis=2)
    np.save('%s.npy' % mydir, arrs)
Example 30
def SummarizeLD(args):
    """
    %prog ld.csv num0 out.txt
    ld.csv: ld tab delimited file generated from tassel
    num0: 0s in the distance

    summarize ld decay in log scale 0-100kb
    """
    p = OptionParser(SummarizeLD.__doc__)
    opts, args = p.parse_args(args)
    if len(args) == 0:
        sys.exit(not p.print_help())
    ld_fn, num0, out_fn = args
    df = pd.read_csv(ld_fn, delim_whitespace=True, usecols=['Dist_bp', 'R^2'])
    df = df.dropna().sort_values('Dist_bp').reset_index(drop=True)

    mybin = [10**i for i in np.arange(0, float(num0)+0.1, 0.1)]
    blockPreIndex = np.histogram(df['Dist_bp'].values, bins=mybin)[0]

    a = list(blockPreIndex)
    a.insert(0,0)
    boxlist = []
    for idx,ele in enumerate(a):
        st = sum(a[0:idx])
        ed = sum(a[0:idx+1])
        boxlist.append(df['R^2'][st:ed].values)
    boxlist.pop(0)
    
    with open(out_fn, 'w') as f:
        for idx, ele in enumerate(boxlist):
            if len(ele) >= 1:
                # np.std, since the written column is a standard deviation
                averageR2, sd = sum(ele) / float(len(ele)), np.std(ele)
            else:
                averageR2, sd = '', ''
            f.write('%s\t%s\t%s\t%s\n' % (10**(idx*0.1), 10**((idx+1)*0.1), averageR2, sd))
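
To make the log-scale binning concrete: for num0=5 (a 100 kb maximum distance) the edges run from 1 bp to 10**5 bp in 0.1-log10 steps, i.e. 51 bin boundaries. A quick check:

import numpy as np

num0 = 5  # five zeros -> max distance 100,000 bp
mybin = [10**i for i in np.arange(0, float(num0) + 0.1, 0.1)]
print(len(mybin), mybin[0], round(mybin[-1]))  # 51 1.0 100000
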