Exemple #1
0
def FilterMAF(args):
    """
    %prog FilterMAF input_vcf
    Remove rare MAF SNPs
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is kept verbatim.
    #
    # args: list of command-line tokens (options plus exactly one positional,
    #       the input vcf path).
    # Output: '<input file name>_maf<cutoff>.vcf', written to the current
    #         working directory (only the file name of the input is reused).
    # Exits through sys.exit after printing the help when the number of
    # positional arguments is not exactly one.
    p = OptionParser(FilterMAF.__doc__)
    p.add_option('--maf_cutoff', default = 0.01, type='float',
        help = 'specify the MAF rate cutoff, SNPs lower than this cutoff will be removed.')
    opts, args = p.parse_args(args)
    # Exactly one positional is unpacked below; any other count used to end
    # in a ValueError traceback instead of the usage message.
    if len(args) != 1:
        sys.exit(not p.print_help())
    inputvcf, = args
    outputvcf = Path(inputvcf).name.replace('.vcf', '_maf%s.vcf'%opts.maf_cutoff)

    vcf = ParseVCF(inputvcf)
    n_removed = 0
    with open(outputvcf, 'w') as f:
        # HashChunk is copied through untouched (presumably the '#' header
        # lines of the vcf -- confirm against ParseVCF).
        f.writelines(vcf.HashChunk)
        pbar = tqdm(vcf.MAFs, total=vcf.num_SNPs, desc='Filter MAF', position=0)
        for line, maf in pbar:
            if maf >= opts.maf_cutoff:
                f.write(line)
            else:
                n_removed += 1
            # first whitespace-separated field of the record is shown as the chromosome
            pbar.set_description('processing chromosome %s'%line.split()[0])
    print('Done! %s SNPs removed! check output %s...'%(n_removed, outputvcf))
Exemple #2
0
def Info(args):
    '''
    %prog Info project_folder

    Show summary of images under project_folder
    '''
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; exactly one positional argument
    #       (the project folder) is expected.
    # Prints the contents of prj.sm_counts and prj.date_counts, then the names
    # of the 'Vis_*' entries found under the path stored in row 0 of the
    # 'fnpath' column of prj.df.
    p = OptionParser(Info.__doc__)
    p.add_option(
        '--item_idx',
        default='1,2,3',
        help=
        'the index of sample name, date, and time in each image directory name'
    )
    opts, args = p.parse_args(args)

    # One positional is unpacked below; reject every other count with the
    # usage message instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    project_folder, = args

    # --item_idx carries three comma-separated integers
    sm_idx, date_idx, time_idx = [int(tok) for tok in opts.item_idx.split(',')]
    prj = ParseProject(project_folder, sm_idx, date_idx, time_idx)
    print('Summary of samples:')
    for sample, count in prj.sm_counts.items():
        print(sample, count)
    print('Summary of dates:')
    for date, count in prj.date_counts.items():
        print(date, count)
    print('Angles for RGB images:')
    first_dir = prj.df.loc[0, 'fnpath']
    for angle in first_dir.glob('Vis_*'):
        print(angle.name)
Exemple #3
0
def export(args):
    '''
    %prog export proj_id outfile

    - proj_id: The project id of the zooniverse project

    DESC: Fetches an export from the specified zooniverse project id.
    '''
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (proj_id, outfile).
    # Returns True after delegating to Zootils.export with the parsed options.
    p = OptionParser(export.__doc__)
    p.add_option('-t', '--type', default='classifications',
                 help='Specify the type of export')

    opts, args = p.parse_args(args)

    if len(args) != 2:
        # Raise SystemExit directly: the bare `exit` name is injected by the
        # `site` module for interactive use and is not guaranteed to exist
        # (e.g. under `python -S` or in frozen applications).
        raise SystemExit(not p.print_help())

    # Imported only after the arguments are validated, so that printing the
    # usage does not depend on the Zooniverse helpers being importable.
    from schnablelab.Zooniverse.Zootils import export as exp

    projid, outfile = args
    exp(projid, outfile, opts)

    return True
Exemple #4
0
def fastqc(args):
    """
    %prog fastqc in_dir out_dir
        in_dir: the dir where fastq files are located
        out_dir: the dir saving fastqc reports

    generate slurm files for fastqc jobs
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every file in in_dir matching --pattern, writes
    # '<prefix>.fastqc.slurm' into the current working directory.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    p = OptionParser(fastqc.__doc__)
    p.add_option("--pattern",
                 default='*.fastq',
                 help="the pattern of fastq files, qutation needed")
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args

    if not Path(out_dir).exists():
        # the placeholder was previously never filled in
        sys.exit('%s does not exist...' % out_dir)
    for fq in Path(in_dir).glob(opts.pattern):
        # file name without its last dot-separated component
        prf = '.'.join(fq.name.split('.')[0:-1])
        print(prf)
        cmd = 'fastqc %s -o %s' % (str(fq), out_dir)
        # Slurm_header is a template taking five values; the last three are
        # the job prefix (first two presumably time and memory -- confirm).
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml fastqc\n'
        header += cmd
        with open('%s.fastqc.slurm' % (prf), 'w') as f:
            f.write(header)
Exemple #5
0
def combineHmp(args):
    """
    %prog combineHmp N pattern output
    combine split hmp (1-based) files to a single one. Pattern example: hmp321_agpv4_chr%s.hmp
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; three positionals:
    #       N (number of split files), a '%s'-style file name pattern that is
    #       filled with 1..N, and the output file name.
    # The first line of every split file is treated as its header: it is
    # dropped for files 2..N, and for file 1 as well when --header is 'no'.
    # Exits through sys.exit on a wrong argument count or when N < 1.
    p = OptionParser(combineHmp.__doc__)
    p.add_option('--header', default='yes', choices=('yes', 'no'),
                 help='choose whether add header or not')
    opts, args = p.parse_args(args)
    # Three positionals are unpacked below; anything else gets the usage text.
    if len(args) != 3:
        sys.exit(not p.print_help())
    N, hmp_pattern, new_f = args
    N = int(N)
    if N < 1:
        sys.exit('N must be at least 1')

    # Context managers make sure every handle is closed even if one of the
    # split files is missing or a write fails half way through.
    with open(new_f, 'w') as out:
        for idx in range(1, N + 1):
            print(idx)
            with open(hmp_pattern % idx) as part:
                keep_header = idx == 1 and opts.header == 'yes'
                if not keep_header:
                    part.readline()
                out.writelines(part)
Exemple #6
0
def three2two(args):
    '''
    %prog three2two fn_in out_prefix 

    convert 3d npy to 2d
    '''
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (input .npy file and
    #       output prefix).
    # Loads a 3-d array (h, w, d), optionally crops the first two axes, and
    # saves it reshaped to (h*w, d) as '<out_prefix>.2d.npy' (float64) or
    # '<out_prefix>.2d.csv'.
    p = OptionParser(three2two.__doc__)
    p.add_option('--crops',
        help='the coordinates for croping, follow left,upper,right,lower format. 1,80,320,479')
    p.add_option("--format", default='npy', choices=('npy', 'csv'),
        help="choose the output format")
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())
    fn_in, out_prefix = args
    npy = np.load(fn_in)
    if opts.crops:
        left, up, right, down = [int(v) for v in opts.crops.split(',')]
        # rows are sliced with upper:lower, columns with left:right
        npy = npy[up:down, left:right, :]
    h, w, d = npy.shape
    print(h, w, d)
    npy_2d = npy.reshape(h*w, d)
    if opts.format == 'csv':
        np.savetxt("%s.2d.csv"%out_prefix, npy_2d, delimiter=",")
    else:
        np.save("%s.2d.npy"%out_prefix, npy_2d.astype(np.float64))
    print('Done!')
Exemple #7
0
def fixGTsep(args):
    """
    %prog fixGTsep in_dir out_dir

    replace the allele separator . in freebayes vcf file to / which is required for beagle
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every file in in_dir matching --pattern, writes '<sm>.fixGT.slurm'
    # into the current working directory; the job redirects the perl output
    # to '<out_dir>/<sm>.fixGT.vcf'.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    p = OptionParser(fixGTsep.__doc__)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # the placeholder was previously never filled in
        sys.exit('%s does not exist...' % out_dir)
    for vcf in Path(in_dir).glob(opts.pattern):
        # file name without its last dot-separated component
        sm = '.'.join(vcf.name.split('.')[0:-1])
        out_fn_path = out_path/(sm+'.fixGT.vcf')
        # Backslashes are escaped explicitly so the literal no longer relies
        # on invalid escape sequences (\s, \., \/); the generated command is
        # byte-identical to before, including the real TAB produced by \t.
        cmd = "perl -pe 's/\\s\\.:/\t.\\/.:/g' %s > %s"%(vcf, out_fn_path)
        header = Slurm_header%(10, 10000, sm, sm, sm)
        header += cmd
        with open('%s.fixGT.slurm'%sm, 'w') as f:
            f.write(header)
Exemple #8
0
def index_ref(args):
    """
    %prog index_ref ref.fa

    index the reference genome sequences
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (reference fasta).
    # Writes '<prefix>.bwa_index.slurm' or '<prefix>.samtools_index.slurm',
    # where prefix is ref.fa without its last dot-separated component.
    p = OptionParser(index_ref.__doc__)
    p.add_option('--tool', default='bwa', choices=('bwa', 'samtools'),
            help = 'tool for indexing reference genome')
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    ref_fn, = args
    prefix = '.'.join(ref_fn.split('.')[0:-1])
    # The two tools differ only in command, header resources, module name
    # and slurm file tag; everything else is shared below.
    if opts.tool == 'bwa':
        cmd = 'bwa index -p %s %s'%(prefix, ref_fn)
        resources, module, tag = (100, 15000), 'bwa', 'bwa_index'
    else:
        cmd = 'samtools faidx %s'%ref_fn
        resources, module, tag = (10, 10000), 'samtools', 'samtools_index'
    print(cmd)
    header = Slurm_header%(resources + (prefix, prefix, prefix))
    header += 'ml %s\n'%module
    header += cmd
    with open('%s.%s.slurm'%(prefix, tag), 'w') as f:
        f.write(header)
Exemple #9
0
def trim_paired(args):
    """
    %prog trim in_dir out_dir
    quality control on the paired reads
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every R1/R2 pair found in in_dir, writes '<prefix>.PE.trim.slurm'
    # into the current working directory; the trimmomatic outputs are placed
    # under out_dir.
    # Exits through sys.exit on a wrong argument count, a missing out_dir, or
    # when the numbers of R1 and R2 files differ.
    p = OptionParser(trim_paired.__doc__)
    p.add_option('--pattern_r1', default = '*_R1.fastq',
            help='filename pattern for forward reads')
    p.add_option('--pattern_r2', default = '*_R2.fastq',
            help='filename pattern for reverse reads')
    opts, args = p.parse_args(args)

    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...'%out_dir)
    # glob() returns matches in arbitrary order; sort both lists so that the
    # zip below pairs each R1 file with the R2 file of the same sample.
    r1_fns = sorted(glob('%s/%s'%(in_dir, opts.pattern_r1)))
    r2_fns = sorted(glob('%s/%s'%(in_dir, opts.pattern_r2)))
    if len(r1_fns) != len(r2_fns):
        # zip would silently drop files and shift the pairing
        sys.exit('found %s R1 files but %s R2 files in %s'
                 % (len(r1_fns), len(r2_fns), in_dir))
    for r1_fn, r2_fn in zip(r1_fns, r2_fns):
        r1_name = Path(r1_fn).name
        r2_name = Path(r2_fn).name
        # R1 file name up to its last '_' names the job
        prf = '_'.join(r1_name.split('_')[0:-1])+'.PE'
        print(prf)
        outputs = (
            out_path/r1_name.replace('R1.fastq', 'trim.R1.fastq'),
            out_path/r1_name.replace('R1.fastq', 'unpaired.R1.fastq'),
            out_path/r2_name.replace('R2.fastq', 'trim.R2.fastq'),
            out_path/r2_name.replace('R2.fastq', 'unpaired.R2.fastq'),
        )
        cmd = 'java -jar $TM_HOME/trimmomatic.jar PE -phred33 %s %s %s %s %s %s TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40'%(
            (r1_fn, r2_fn) + tuple(str(o) for o in outputs))
        header = Slurm_header%(10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm'%(prf), 'w') as f:
            f.write(header)
Exemple #10
0
def download(args):
    '''
    %prog download download_links.csv

    download activated asset links 
    '''
    # The docstring above is the usage text given to OptionParser; its first
    # line previously advertised the wrong sub-command name ('activate').
    #
    # args: list of command-line tokens; one positional, a whitespace
    #       delimited table with (at least) the columns 'download_link',
    #       'asset_type', 'id' and 'item_type'.
    # Each row is fetched through client.ses.get(..., stream=True) and
    # written to disk in 1024-byte chunks.
    # NOTE(review): when --output is given explicitly every row is written to
    # that same file, so only the last download survives -- confirm intent.
    p = OptionParser(download.__doc__)
    p.add_option(
        '--output',
        default="'infer'",
        help='default to construct the output file name from the API response')
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())
    links_csv, = args
    # sep=r'\s+' is the documented replacement for the deprecated
    # delim_whitespace=True
    links_df = pd.read_csv(links_csv, sep=r'\s+')
    client = Client()

    infer_name = opts.output == "'infer'"
    # Send a GET request to the provided location url,
    for __, row in links_df.iterrows():
        res = client.ses.get(row['download_link'], stream=True)
        # 'visual' assets are saved with a .tif suffix; other asset types use
        # their own name as the suffix
        suffix = 'tif' if row['asset_type'] == 'visual' else row['asset_type']
        if infer_name:
            output = '%s_%s.%s'%(row['id'], row['item_type'], suffix)
        else:
            output = opts.output
        # Save the file
        with open(output, "wb") as f:
            print('download %s...' % output)
            for chunk in res.iter_content(chunk_size=1024):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
Exemple #11
0
def MLM(args):
    """
    %prog MLM GenoPrefix('*.mean' and '*.annotation') Pheno Outdir
    RUN automated GEMMA Mixed Linear Model
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; three positionals
    #       (GenoPrefix, Pheno, Outdir).
    # Writes '<outprefix>.mlm.slurm' into the current working directory, where
    # outprefix is the Pheno file name without its last dot-separated part.
    p = OptionParser(MLM.__doc__)
    p.add_option('--kinship', default=False, 
        help = 'specify the relatedness matrix file name')
    p.add_option('--pca', default=False, 
        help = 'specify the principle components file name')
    p.set_slurm_opts(jn=True)
    opts, args = p.parse_args(args)

    # Three positionals are unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 3:
        sys.exit(not p.print_help())
    GenoPrefix, Pheno, Outdir = args
    meanG, annoG = GenoPrefix+'.mean', GenoPrefix+'.annotation'
    outprefix = '.'.join(Pheno.split('/')[-1].split('.')[0:-1])
    # `gemma` is a name defined elsewhere in the module (the executable to run)
    cmd = '%s -g %s -p %s -a %s -lmm 4 -outdir %s -o %s' \
        %(gemma, meanG, Pheno, annoG, Outdir, outprefix)
    if opts.kinship:
        cmd += ' -k %s'%opts.kinship
    if opts.pca:
        cmd += ' -c %s'%opts.pca
    print('The command running on the local node:\n%s'%cmd)

    header = Slurm_header%(opts.time, opts.memory, opts.prefix, opts.prefix, opts.prefix)
    header += cmd
    # context manager closes the handle even if the write fails
    with open('%s.mlm.slurm'%outprefix, 'w') as f:
        f.write(header)
    print('slurm file %s.mlm.slurm has been created, you can sbatch your job file.'%outprefix)
Exemple #12
0
def item_types(args):
    '''
    %prog item_types
    print all available item types
    '''
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; no positional argument is accepted.
    # Writes the 'id', 'display_name' and 'display_description' columns of
    # client.get_all_items() to a tab-separated file (--output), optionally
    # limited to the first --n rows.
    parser = OptionParser(item_types.__doc__)
    parser.add_option('-o',
                      '--output',
                      default="item_types.csv",
                      help='specify output file')
    parser.add_option('--n', default="all", help='how many rows you wanna see')
    opts, args = parser.parse_args(args)
    if args:
        sys.exit(not parser.print_help())

    wanted_columns = ['id', 'display_name', 'display_description']
    client = Client()
    df_items = client.get_all_items()[wanted_columns]
    if opts.n == 'all':
        df_items.to_csv(opts.output, index=False, sep='\t')
    else:
        try:
            n_rows = int(opts.n)
            df_items.head(n_rows).to_csv(opts.output, index=False, sep='\t')
        except ValueError:
            sys.exit("n must be a number")
    print('check %s!' % (opts.output))
Exemple #13
0
def freebayes(args):
    """
    %prog freebayes region.txt ref.fa bam_list.txt out_dir

    create freebayes slurm jobs for each splitted region defined in region.txt file
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; four positionals
    #       (region file, reference, bam list, out_dir).
    # For every non-empty line of the region file, writes '<region>.fb.slurm'
    # (':' replaced by '_') into the current working directory; the job
    # redirects freebayes output to '<out_dir>/<region>.fb.vcf'.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    p = OptionParser(freebayes.__doc__)
    p.add_option('--max_depth', default=10000,
            help = 'cites where the mapping depth higher than this value will be ignored')
    opts, args = p.parse_args(args)
    # Four positionals are unpacked below; anything else gets the usage text.
    if len(args) != 4:
        sys.exit(not p.print_help())
    region, ref, bams, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # the placeholder was previously never filled in
        sys.exit('%s does not exist...' % out_dir)

    with open(region) as f:
        for reg in f:
            reg = reg.strip()
            if not reg:
                # a blank line would otherwise yield a job without a region
                continue
            reg_fn = reg.replace(':','_')
            reg_fn_vcf_path = out_path/('%s.fb.vcf'%reg_fn)
            # (the original referenced a misspelled, undefined variable here)
            cmd = 'freebayes -r %s -f %s -C 1 -F 0.05 -L %s -u -n 2 -g %s > %s\n'%(
                reg, ref, bams, opts.max_depth, reg_fn_vcf_path)
            header = Slurm_header%(165, 50000, reg_fn, reg_fn, reg_fn)
            header += 'ml freebayes/1.3\n'
            header += cmd
            with open('%s.fb.slurm'%reg_fn, 'w') as f1:
                f1.write(header)
            print('slurm files %s.fb.slurm has been created'%reg_fn)
Exemple #14
0
def FilterHetero(args):
    """
    %prog FilterHetero input_vcf
    Remove bad and high heterizygous loci
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (input vcf path).
    # Output: '<input file name>_het<cutoff>.vcf' in the current working
    #         directory; records whose rate exceeds --het_cutoff are dropped.
    p = OptionParser(FilterHetero.__doc__)
    p.add_option('--het_cutoff', default = 0.1, type='float',
        help = 'specify the heterozygous rate cutoff, SNPs higher than this cutoff will be removed.')
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    inputvcf, = args
    outputvcf = Path(inputvcf).name.replace('.vcf', '_het%s.vcf'%opts.het_cutoff)

    vcf = ParseVCF(inputvcf)
    n_removed = 0
    with open(outputvcf, 'w') as f:
        # HashChunk is copied through untouched (presumably the '#' header
        # lines of the vcf -- confirm against ParseVCF).
        f.writelines(vcf.HashChunk)
        pbar = tqdm(vcf.Heteros, total=vcf.num_SNPs, desc='Filter Heterozygous', position=0)
        for line, het in pbar:
            if het <= opts.het_cutoff:
                f.write(line)
            else:
                n_removed += 1
            # first whitespace-separated field of the record is shown as the chromosome
            pbar.set_description('processing chromosome %s'%line.split()[0])
    print('Done! %s SNPs removed! check output %s...'%(n_removed, outputvcf))
Exemple #15
0
def action1(args):
    """
    %prog dir

    do some tricky actions...
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (a folder).
    # Walks the folder recursively, randomly samples ceil(n_files / --num)
    # of the files found, prints each sampled path and reads its first line.
    p = OptionParser(action1.__doc__)
    p.add_option("--num", default='10', help="one num-th files will be read.")
    opts, args = p.parse_args(args)
    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    all_fns = [
        os.path.join(dirpath, filename)
        for dirpath, _dirnames, filenames in os.walk(folder)
        for filename in filenames
    ]
    n_sample = int(np.ceil(len(all_fns) / float(opts.num)))
    for sampled_fn in random.sample(all_fns, n_sample):
        print(sampled_fn)
        # Open the sampled file itself; previously the stale variable from the
        # directory walk was used, so the last file found was read every time.
        with open(sampled_fn) as f:
            f.readline()
    print('run away from crim scene !!!')
Exemple #16
0
def FilterMissing(args):
    """
    %prog FilterMissing input_hmp
    Remove SNPs with high missing rate
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (input hmp path).
    # Output: '<input file name>_mis<cutoff>.hmp' in the current working
    #         directory; lines whose rate exceeds --missing_cutoff are dropped.
    p = OptionParser(FilterMissing.__doc__)
    p.add_option(
        '--missing_cutoff',
        default=0.7,
        type='float',
        help=
        'specify the missing rate cutoff. SNPs higher than this cutoff will be removed.'
    )
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp',
                                            '_mis%s.hmp' % opts.missing_cutoff)

    hmp = ParseHmp(inputhmp)
    n_removed = 0
    with open(outputhmp, 'w') as f:
        f.write(hmp.headerline)
        pbar = tqdm(hmp.Missings, total=hmp.numSNPs)
        for line, miss in pbar:
            if miss <= opts.missing_cutoff:
                f.write(line)
            else:
                n_removed += 1
            # third whitespace-separated field of the line is shown as the chromosome
            pbar.set_description('processing chromosome %s' % line.split()[2])
    print('Done! %s SNPs removed! check output %s...' % (n_removed, outputhmp))
Exemple #17
0
def hyp2arr_slurms(args):
    '''
    %prog hyp2arr_slurms in_dir out_dir
    
    generate hyp2arr slurm jobs for all folders under specified dir
    '''
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every entry of in_dir matching --pattern, writes
    # '<name>.hyp2arr.slurm' (spaces in the name replaced by '_') into the
    # current working directory; the job converts '<entry>/Hyp_SV_90' into
    # '<out_dir>/<name>'.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    p = OptionParser(hyp2arr_slurms.__doc__)
    p.add_option('--pattern', default='*',
                 help='hyper dir pattern for folders under dir')
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # the placeholder was previously never filled in
        sys.exit('%s does not exist...' % out_dir)
    folders = list(Path(in_dir).glob(opts.pattern))
    print('%s hyper folders found'%len(folders))
    for hyp_dir in folders:
        # separate name so the in_dir argument is not shadowed inside the loop
        hyp_in = str(hyp_dir/'Hyp_SV_90')
        out_fn = hyp_dir.name.replace(' ', '_')
        out_fn_path = out_path/out_fn
        cmd = 'python -m schnablelab.CNN.Preprocess hyp2arr %s %s'%(hyp_in, out_fn_path)
        print(cmd)
        header = Slurm_header%(10, 5000, out_fn, out_fn, out_fn)
        header += 'conda activate MCY\n'
        header += cmd
        with open('%s.hyp2arr.slurm'%out_fn, 'w') as f:
            f.write(header)
Exemple #18
0
def FilterHetero(args):
    """
    %prog FilterHetero input_hmp
    Remove bad and high heterizygous loci (coducting Missing and MAF first)
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (input hmp path).
    # Output: '<input file name>_het<cutoff>.hmp' in the current working
    #         directory; lines whose rate exceeds --het_cutoff are dropped.
    p = OptionParser(FilterHetero.__doc__)
    p.add_option(
        '--het_cutoff',
        default=0.1,
        type='float',
        help=
        'specify the heterozygous rate cutoff, SNPs higher than this cutoff will be removed.'
    )
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp',
                                            '_het%s.hmp' % opts.het_cutoff)

    hmp = ParseHmp(inputhmp)
    n_removed = 0
    with open(outputhmp, 'w') as f:
        f.write(hmp.headerline)
        pbar = tqdm(hmp.Heteros, total=hmp.numSNPs)
        for line, het in pbar:
            if het <= opts.het_cutoff:
                f.write(line)
            else:
                n_removed += 1
            # third whitespace-separated field of the line is shown as the chromosome
            pbar.set_description('processing chromosome %s' % line.split()[2])
    print('Done! %s SNPs removed! check output %s...' % (n_removed, outputhmp))
Exemple #19
0
def only_ALT(args):
    """
    %prog in_dir out_dir

    filter number of ALT using bcftools
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every file in in_dir matching --pattern, writes
    # '<prefix>.alt1.slurm' into the current working directory; the job
    # redirects the bcftools output to '<out_dir>/<prefix>.alt1.vcf'.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    p = OptionParser(only_ALT.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # the placeholder was previously never filled in
        sys.exit('%s does not exist...' % out_dir)
    for vcffile in Path(in_dir).glob(opts.pattern):
        # (the original read `vcf.name`, an undefined name, and crashed here)
        prefix = '.'.join(vcffile.name.split('.')[0:-1])
        # Results go to the validated out_dir; before, out_dir was checked
        # but the output path was relative to the job's working directory.
        new_f = out_path/(prefix + '.alt1.vcf')
        cmd = "bcftools view -i 'N_ALT=1' %s > %s"%(vcffile, new_f)
        header = Slurm_header%(opts.time, opts.memory, prefix, prefix, prefix)
        # module name fixed: it was misspelled 'bacftools'
        header += 'ml bcftools\n'
        header += cmd
        with open('%s.alt1.slurm'%prefix, 'w') as f:
            f.write(header)
        print('slurm file %s.alt1.slurm has been created, you can sbatch your job file.'%prefix)
Exemple #20
0
def FilterMAF(args):
    """
    %prog FilterMAF input_hmp
    Remove rare MAF SNPs (conducting Missing filter first)
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (input hmp path).
    # Output: '<input file name>_maf<cutoff>.hmp' in the current working
    #         directory; lines whose MAF is below --MAF_cutoff are dropped.
    p = OptionParser(FilterMAF.__doc__)
    p.add_option(
        '--MAF_cutoff',
        default=0.01,
        type='float',
        help=
        'specify the MAF rate cutoff, SNPs lower than this cutoff will be removed.'
    )
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    inputhmp, = args
    outputhmp = Path(inputhmp).name.replace('.hmp',
                                            '_maf%s.hmp' % opts.MAF_cutoff)

    hmp = ParseHmp(inputhmp)
    n_removed = 0
    with open(outputhmp, 'w') as f:
        f.write(hmp.headerline)
        pbar = tqdm(hmp.MAFs, total=hmp.numSNPs)
        for line, maf in pbar:
            if maf >= opts.MAF_cutoff:
                f.write(line)
            else:
                n_removed += 1
            # third whitespace-separated field of the line is shown as the chromosome
            pbar.set_description('processing chromosome %s' % line.split()[2])
    print('Done! %s SNPs removed! check output %s...' % (n_removed, outputhmp))
Exemple #21
0
def IndexVCF(args):
    """
    %prog IndexVCF in_dir out_dir

    index vcf using bgzip and tabix
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every file in in_dir matching --pattern, writes '<sm>.idxvcf.slurm'
    # into the current working directory; the job compresses the vcf to
    # '<out_dir>/<vcf name>.gz' with bgzip and indexes it with tabix.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    p = OptionParser(IndexVCF.__doc__)
    p.add_option('--pattern', default='*.vcf',
                 help='file pattern for vcf files in dir_in')
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        # the placeholder was previously never filled in
        sys.exit('%s does not exist...' % out_dir)
    for vcf in Path(in_dir).glob(opts.pattern):
        # file name without its last dot-separated component
        sm = '.'.join(vcf.name.split('.')[0:-1])
        out_fn_path = out_path/(vcf.name+'.gz')
        header = Slurm_header%(10, 20000, sm, sm, sm)
        header += 'ml tabix\n'
        header += 'bgzip -c %s > %s\n'%(vcf, out_fn_path)
        header += 'tabix -p vcf %s\n'%(out_fn_path)
        with open('%s.idxvcf.slurm'%sm, 'w') as f:
            f.write(header)
Exemple #22
0
def DownsamplingSNPs(args):
    """
    %prog downsampling input_hmp

    Pick up some SNPs from a huge hmp file using Linux sed command
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (input hmp path).
    # Builds a `sed -n '1~<downscale>p'` command that writes to
    # '<input file name>_ds<downscale>.hmp', prints it and, unless
    # --disable_slurm is given, hands it to put2slurm together with the
    # parsed options.
    p = OptionParser(DownsamplingSNPs.__doc__)
    p.add_option('--downscale', default=10, help='specify the downscale level')
    p.add_option('--disable_slurm',
                 default=False,
                 action="store_true",
                 help='do not convert commands to slurm jobs')
    p.add_slurm_opts(job_prefix=DownsamplingSNPs.__name__)
    opts, args = p.parse_args(args)

    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())

    inputhmp, = args
    # The option is registered as --downscale; reading `opts.downsize` (as
    # before) raised AttributeError on every call.
    downscale = opts.downscale
    outputhmp = Path(inputhmp).name.replace('.hmp', '_ds%s.hmp' % downscale)
    cmd = "sed -n '1~%sp' %s > %s" % (downscale, inputhmp, outputhmp)
    print('cmd:\n%s\n' % cmd)
    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm([cmd], put2slurm_dict)
Exemple #23
0
def trim_single(args):
    """
    %prog trim in_dir out_dir
    quality control on the single end reads
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals (in_dir, out_dir).
    # For every file in in_dir matching --pattern, writes
    # '<prefix>.SE.trim.slurm' into the current working directory; the
    # trimmomatic output is placed under out_dir.
    # Exits through sys.exit on a wrong argument count or a missing out_dir.
    #
    # Use this function's own docstring for the help text; it previously
    # borrowed trim_paired's, showing the paired-end description.
    p = OptionParser(trim_single.__doc__)
    p.add_option('--pattern',
                 default='*_Unpaired.fastq',
                 help='filename pattern for all single end reads')
    opts, args = p.parse_args(args)

    # Two positionals are unpacked below; anything else gets the usage text.
    if len(args) != 2:
        sys.exit(not p.print_help())
    in_dir, out_dir = args
    out_path = Path(out_dir)
    if not out_path.exists():
        sys.exit('output dir %s does not exist...' % out_dir)
    for fn in glob('%s/%s' % (in_dir, opts.pattern)):
        fn_name = Path(fn).name
        # file name up to its last '_' names the job
        prf = '_'.join(fn_name.split('_')[0:-1]) + '.SE'
        print(prf)
        fn_out = fn_name.replace('Unpaired.fastq', 'trim.Unpaired.fastq')
        cmd = 'java -jar $TM_HOME/trimmomatic.jar SE -phred33 %s %s TRAILING:20 SLIDINGWINDOW:4:20 MINLEN:40' % (
            fn, str(out_path / fn_out))
        header = Slurm_header % (10, 10000, prf, prf, prf)
        header += 'ml trimmomatic\n'
        header += cmd
        with open('%s.trim.slurm' % (prf), 'w') as f:
            f.write(header)
Exemple #24
0
def ped2bed(args):
    """
    %prog ped_prefix

    Convert plink ped/map to binary bed/bim/fam format using Plink
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (ped file prefix).
    # Prints the cluster and the local variant of the plink command and,
    # unless --disable_slurm is given, hands the cluster variant to put2slurm
    # together with the parsed options and the 'ml plink' header line.
    p = OptionParser(ped2bed.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=ped2bed.__name__)
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    ped_prefix, = args
    cmd_header = 'ml plink'
    # both variants share their arguments and differ only in the executable
    plink_args = '--noweb --file %s --make-bed --out %s' % (ped_prefix,
                                                            ped_prefix)
    cmd = 'plink %s' % plink_args
    print('cmd on HCC:\n%s\n%s' % (cmd_header, cmd))

    # `plink` is a name defined elsewhere in the module (local executable)
    cmd_local = '%s %s' % (plink, plink_args)
    print('cmd on local desktop:\n%s\n' % cmd_local)

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
Exemple #25
0
def IndePvalue(args):
    """
    %prog IndePvalue plink_bed_prefix output

    calculate the number of independent SNPs (Me) and the bonferroni pvalue
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals
    #       (plink bed prefix, output name).
    # Writes '<output>.Me_SNP.slurm' into the current working directory.
    # NOTE(review): --cutoff is parsed but never used in this function.
    p = OptionParser(IndePvalue.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option(
        '--cutoff',
        default='0.05',
        choices=('0.01', '0.05'),
        help='choose the pvalue cutoff for the calculation of bonferroni pvalue'
    )
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())

    bed, output = args
    # java heap in GB: the requested memory divided by 1000, minus 2
    mem = int(opts.memory / 1000) - 2
    # `GEC` is a name defined elsewhere in the module (the jar to run)
    cmd = 'java -Xmx%sg -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (
        mem, GEC, bed, output)
    # the module line is appended to the template before it is filled in
    h = Slurm_header
    h += 'module load java/1.8\n'
    header = h % (opts.time, opts.memory, opts.prefix, opts.prefix,
                  opts.prefix)
    header += cmd
    # context manager closes the handle even if the write fails
    with open('%s.Me_SNP.slurm' % output, 'w') as f:
        f.write(header)
    print(
        'slurm file %s.Me_SNP.slurm has been created, you can sbatch your job file.'
        % output)
Exemple #26
0
def IndePvalue(args):
    """
    %prog IndePvalue bed_prefix output_fn

    Estimate number of idenpendent SNPs using GEC
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals
    #       (bed prefix, output file name).
    # Prints the java command and, unless --disable_slurm is given, hands it
    # to put2slurm together with the parsed options (memory forced to 20000).
    p = OptionParser(IndePvalue.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=IndePvalue.__name__)
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())
    bed_prefix, output_fn = args
    # `GEC` is a name defined elsewhere in the module (the jar to run)
    cmd = 'java -Xmx18g -jar %s --noweb --effect-number --plink-binary %s --genome --out %s' % (
        GEC, bed_prefix, output_fn)
    print('cmd:\n%s\n' % cmd)

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        # overrides whatever memory was given on the command line; the java
        # heap above is fixed at 18g
        put2slurm_dict['memory'] = 20000
        put2slurm([cmd], put2slurm_dict)
Exemple #27
0
def keras_cnn(args):
    """
    %prog train_dir val_dir num_category model_name_prefix
    
    Run vgg model
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; four positionals
    #       (train_dir, val_dir, number of categories, model name prefix).
    # Writes one '<model_name>.slurm' file per learning rate into the current
    # working directory; model names and learning rates come from
    # fns(mnp, n=opts.lr_n).
    p = OptionParser(keras_cnn.__doc__)
    p.add_option('--epoch', default=500, help = 'number of epoches')
    p.add_option('--lr_n', default=1, type='int',
        help = 'train model with differnt learning rates. if n=1: set lr to 0.001. if n>1: try differnt lr from 1e-2 to 1e-5 n times')
    p.set_slurm_opts(gpu=True)
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    train_dir, val_dir, numC, mnp = args #mnp:model name prefix
    out_fns = fns(mnp, n=opts.lr_n)
    for i in range(int(opts.lr_n)):
        model_name = out_fns.model_name[i]
        cmd = 'python -m schnablelab.CNN.keras_vgg %s %s %s %s %s %s'%(
            train_dir, val_dir, numC, out_fns.lrs[i], opts.epoch, model_name)
        SlurmHeader = Slurm_gpu_header%(opts.time, opts.memory, model_name, model_name, model_name)
        SlurmHeader += 'module load anaconda\nsource activate MCY\n'
        SlurmHeader += cmd
        # context manager closes the handle even if the write fails
        with open('%s.slurm'%model_name, 'w') as f:
            f.write(SlurmHeader)
        print('slurm file %s.slurm has been created, you can sbatch your job file.'%model_name)
Exemple #28
0
def genPCA(args):
    """
    %prog genPCA input_hmp N

    Generate first N PCs using tassel
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals
    #       (input hmp file, number of components N).
    # Prints the tassel command and, unless --disable_slurm is given, hands
    # it to put2slurm together with the parsed options (memory forced to
    # 30000) and the module-loading header lines.
    p = OptionParser(genPCA.__doc__)
    p.add_option(
        '--disable_slurm',
        default=False,
        action="store_true",
        help='add this option to disable converting commands to slurm jobs')
    p.add_slurm_opts(job_prefix=genPCA.__name__)
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())
    hmpfile, N = args
    out_prefix = Path(hmpfile).name.replace('.hmp', '')
    cmd_header = 'ml java/1.8\nml tassel/5.2'
    cmd = 'run_pipeline.pl -Xms28g -Xmx29g -fork1 -h %s -PrincipalComponentsPlugin -ncomponents %s -covariance true -endPlugin -export %s_%sPCA -runfork1\n' % (
        hmpfile, N, out_prefix, N)
    print('cmd:\n%s\n%s' % (cmd_header, cmd))

    if not opts.disable_slurm:
        put2slurm_dict = vars(opts)
        # the java heap above is capped at 29g, so 30000 is requested
        # regardless of the command-line value
        put2slurm_dict['memory'] = 30000
        put2slurm_dict['cmd_header'] = cmd_header
        put2slurm([cmd], put2slurm_dict)
Exemple #29
0
def divide(args):
    '''
    %prog divide input_dir output_dir_prefix
    '''
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; two positionals
    #       (input dir, output dir prefix).
    # Splits the files of input_dir matching --pattern into
    # ceil(n_files / --nimgs_per_folder) groups and copies group k into the
    # folder '<output_dir_prefix>_<k>' (k starts at 1), creating the folder
    # when it does not exist.
    p = OptionParser(divide.__doc__)
    p.add_option('--pattern', default='*.jpg',
                 help='file name pattern')
    p.add_option('--nimgs_per_folder', type='int', default=700,
                 help='~ number of images (<1000) in each smaller folder')
    opts, args = p.parse_args(args)
    # Two positionals are unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())
    input_dir, out_prefix = args

    df = GenDataFrameFromPath(Path(input_dir), pattern=opts.pattern)
    n_images = df.shape[0]
    n_folders = math.ceil(n_images/opts.nimgs_per_folder)
    print('%s will be divided to %s datasets'%(n_images, n_folders))
    # cutlist yields pairs; only the second item (the group of paths) is used
    for n, (_, grp) in enumerate(cutlist(df['fnpath'].values, n_folders), 1):
        output_folder = Path('%s_%s'%(out_prefix,n))
        print(output_folder, grp.shape[0])
        if not output_folder.exists():
            output_folder.mkdir()
        for src in grp:
            copyfile(src, output_folder/src.name)
Exemple #30
0
def vcf2hmp(args):
    """
    %prog vcf2hmp vcf
    convert vcf generated from beagle to hmp format using tassel
    """
    # The docstring above is the usage text given to OptionParser; it is kept
    # verbatim.
    #
    # args: list of command-line tokens; one positional (vcf file).
    # Writes '<prefix>.vcf2hmp.slurm', where prefix is the vcf path without
    # its last dot-separated component.
    p = OptionParser(vcf2hmp.__doc__)
    p.set_slurm_opts(jn=True)
    p.add_option('--version',
                 default='2',
                 choices=('1', '2'),
                 help='specify the hmp type. 1: hyploid. 2: diploid')
    opts, args = p.parse_args(args)
    # One positional is unpacked below; anything else gets the usage text
    # instead of an unpacking ValueError.
    if len(args) != 1:
        sys.exit(not p.print_help())
    vcffile, = args
    prefix = '.'.join(vcffile.split('.')[0:-1])
    # the two versions differ only in the tassel export type
    export_type = 'HapmapDiploid' if opts.version == '2' else 'Hapmap'
    # `tassel` is a name defined elsewhere in the module (the executable)
    cmd = '%s -Xms512m -Xmx10G -fork1 -vcf %s -export -exportType %s\n' % (
        tassel, vcffile, export_type)
    header = Slurm_header % (opts.time, opts.memory, opts.prefix, opts.prefix,
                             opts.prefix)
    header += 'module load java/1.8\n'
    header += cmd
    # context manager closes the handle even if the write fails
    with open('%s.vcf2hmp.slurm' % prefix, 'w') as f:
        f.write(header)
    print(
        'slurm file %s.vcf2hmp.slurm has been created, you can submit your job file.'
        % prefix)