import argparse
import os

# Project-local modules (filetree, md, bd) and the _-prefixed helpers used
# below are assumed to be imported/defined elsewhere in this module.


def prepare_seekr():
    parser = argparse.ArgumentParser(
        description="top level SEEKR program used to prepare and generate "
                    "all input files necessary for a SEEKR calculation")
    parser.add_argument('input_filename', metavar='INPUT_FILENAME', type=str,
                        help="name of SEEKR input file")
    args = parser.parse_args()  # parse command line arguments
    args = vars(args)  # convert to a dictionary
    inputs = _get_inputs(args)
    sys_params = _get_sys_params(inputs)  # parse general system parameters (structures, forcefield files, directory names) from the input file
    md_milestones = _parse_milestone_inputs(inputs)  # parse milestone CV parameters and milestone values from the input file
    milestones, md_anchor_list = _generate_milestone_lists(md_milestones)  # generate upper/lower bounds of a SINGLE CV for each anchor/Voronoi cell
    # TODO: generate anchor lists for multiple milestone CVs
    # TODO: BD milestones
    #print(milestones)
    _generate_filetree(inputs, sys_params)  # create/clear the top level directory
    filetree_settings = _get_filetree_settings(md_anchor_list)
    md_filetree_settings_all = {**filetree_settings, **sys_params}
    anchor_dirlist, md_file_paths = filetree.md_filetree(md_filetree_settings_all)
    md_settings = _get_md_settings(inputs, md_file_paths)  # parse parameters for MD simulations from the input file
    md_settings_all = {**md_settings, **sys_params, **filetree_settings}
    md.main(md_settings_all, milestones)
    print(milestones[-1])
    milestone_filename = os.path.join(sys_params['rootdir'], 'milestones.xml')
    anchor_list = _group_milestones_to_anchor(milestones, anchor_dirlist, md_file_paths)
    print('Anchor List', anchor_list)
    _write_milestone_file(anchor_list, md_settings['master_temperature'],
                          sys_params['md_time_factor'], sys_params['bd_time_factor'],
                          milestone_filename)
    structures = _load_structures(inputs, sys_params)
    bd_settings = _get_bd_settings(inputs, sys_params, structures)
    bd_milestone = anchor_list[-1]
    print(bd_milestone)
    #bd_milestone_pair = bd_milestone['%s_pair_list' % bd_milestone['key']][-1]
    bd_lower_bound = bd_milestone['milestone_params'][0]['lower_bound']
    bd_lower_bound_index = bd_milestone['milestone_params'][0]['lower_milestone_index']
    bd_index = bd_milestone['milestone_params'][0]['upper_milestone_index']
    b_surf_distance = bd_milestone['milestone_params'][0]['upper_bound']
    #bd_milestone_index =
    bd_settings.update({
        'b_surf_distance': b_surf_distance,
        'bd_lower_bound': bd_lower_bound,
        'bd_lower_bound_index': bd_lower_bound_index,
        'bd_index': bd_index,
    })
    print("BD b surface distance", b_surf_distance)
    bd.main(bd_settings)
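# A minimal sketch of a script entry point for the function above (an
# assumption; the original file's __main__ guard is not shown in this
# excerpt). The script name and input filename in the example are
# hypothetical: the single positional argument is the SEEKR input file.
if __name__ == '__main__':
    prepare_seekr()  # e.g.  $ python prepare_seekr.py my_system.seekr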
#!/bin/env python
'''
usage: genie r1 --fname=FILE [--out=PREFIX] [--nojob] [--dry-run]

options:
  --fname=FILE  a blank file
  --out=PREFIX  outname prefix [default: r1]
  --nojob       no job
  --dry-run     dry run
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../library')
import md

if __name__ == '__main__':
    md.main(docopt(__doc__), ['r1'])
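# For orientation, a hedged sketch of what docopt(__doc__) hands to md.main
# for a hypothetical invocation "genie r1 --fname=blank.txt --nojob".
# docopt returns a plain dict keyed by command and option names, with the
# [default: ...] values filled in for options that were not supplied:
#
#   {'r1': True, '--fname': 'blank.txt', '--out': 'r1',
#    '--nojob': True, '--dry-run': False}
#
# The same pattern applies to all of the docopt wrappers below.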
#!/bin/env python
'''ipsych cojo

usage: ipsych cojo [options] <summary-file>

options:
  --out=PREFIX       outname prefix [default: ipsych_cojo]
  --ldfile=FILE      ld reference plink-file [default: ....]
  --nsamples=NUMBER  number of samples
  --nojob            run in front end
'''
from docopt import docopt
import md

if __name__ == '__main__':
    md.main(docopt(__doc__))
#!/bin/env python
'''
usage: ldsc munge [options] --sumstats=FILE

options:
  --sumstats=FILE    daner formatted summary file
  --out=PREFIX       outname prefix [default: out]
  --Nsamples=NUMBER  sample size
  --other=ARGUMENTS  other arguments to pass to ldsc within quotes
  --daner            if using daner format
  --nojob            run in front end
  --dry-run          show the analysis plan without running it
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['munge'])
#!/bin/env python
'''
usage: gseq bwa [options] --fqlist=FILE

options:
  --fqlist=FILE      fastq list
  --ref-genome=FILE  reference fasta file [default: |resources/bwa/reference/b37/human.b37.fasta]
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['bwa'])
#!/bin/env python3
'''
usage: predixcan simu [options] --gene=FILE

options:
  --gene=FILE     a file with gene names, one per line
  --out=PREFIX    outname prefix [default: out]
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
  --int
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['simu'])
#!/bin/env python
'''
usage: prs prsice [options] --base=FILE --target=FILE

options:
  --base=FILE        training sample file or file.list
  --target=FILE      target sample file or file.list
  --slower=NUMBER    lower limit [default: 0]
  --supper=NUMBER    upper limit [default: 0.5]
  --sinc=NUMBER      p-value increments [default: 0.1]
  --out=PREFIX       outname prefix [default: prs]
  --clump            if clumping needs to be done
  --clump_p1=NUMBER  clump p1 threshold [default: 1]
  --clump_p2=NUMBER  clump p2 threshold [default: 1]
  --clump_r2=NUMBER  clump r2 threshold [default: 0.1]
  --clump_kb=NUMBER  clump distance [default: 250]
  --remove_mhc       if you want to remove the MHC region
  --nojob            run in front end
  --njobs=NUMBER     number of parallel jobs when running in front end
  --dry-run          dry run
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['prsice'])
  --reml-priors=N1     two numbers
  --reml-alg=NUMBER    0 (AI), 1 (Fisher scoring), or 2 (EM)
  --reml-no-constrain  allow negative estimates
  --reml-maxit=NUMBER  maximum number of iterations
  --gxe                test gxe
  --covar=FILE         covariate file
  --qcovar=FILE        quantitative covariate file
  --reml-lrt=NUMBER    see GCTA doc
  --reml-no-lrt        turn off the LRT
  --prevalence=NUMBER  prevalence of the disease
  --dry-run            show only the code
  --nojob              front end

description: The arguments here are exactly the same arguments used in the
GCTA software. For details on how to use them, see the GCTA web page
(http://cnsgenomics.com/software/gcta/reml.html). A typical submission
command for a case-control trait would be:

    genie gcta variance --grm=test --pheno=test.phen --prevalence 0.01 --out test_null
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['variance'])
#!/bin/env python3
'''
usage: predixcan qc-samples [options] --bfile=FILE --ref=FILE

options:
  --bfile=FILE    plink file basename or list
  --ref=FILE      reference plink file (only basename)
  --out=PREFIX    outname prefix [default: predixcan]
  --skip-sex      skip sex checks
  --overlap       if the ref is within bfile
  --cluster=NAME  cluster name [default: open]
  --nojob         run in front end
  --dry-run       just show the codes
  --int           control job submission from front end
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['qc-samples'])
#!/bin/env python3
'''
usage: predixcan gcta [options] --grm=FILE --pheno=FILE

options:
  --grm=FILE      basename of grm file
  --pheno=FILE    file or list of plink format phenotype files
  --out=PREFIX    outname prefix
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
  --int
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['gcta'])
#!/bin/env python3
'''
usage: predixcan magma [options] --gwas=FILE --geneannot=FILE --ref=FILE

options:
  --gwas=FILE       gwas summary statistics
  --geneannot=FILE  geneannot file
  --ref=FILE        genotype reference file
  --out=PREFIX      outname prefix [default: magma]
  --nojob           run in front end
  --dry-run         just show the codes
  --int             submit jobs from front end
  --njobs=NUMBER    number of parallel jobs; applicable only when running in front end
  --cluster=NAME    cluster name [default: open]
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['magma'])
#!/bin/env python3
'''
usage: mvp gdsforpca [options] --pgen=FILE

options:
  --pgen=FILE         pgen file or .list of pgen files
  --nvariants=NUMBER  final number of variants [default: 40000]
  --out=PREFIX        outname prefix [default: predixcan]
  --cluster=NAME      minerva or genomedk [default: minerva]
  --nojob             run in front end
  --dry-run           just show the codes
  --int               submit jobs from front end
  --njobs=NUMBER      number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['gdsforpca'])
#!/bin/env python3
'''
usage: predixcan qc-dna [options] --vcf=FILE

options:
  --vcf=FILE            bgzipped and indexed vcf file or list
  --ref=FILE            bgzipped and indexed reference vcf file
  --ifilter=EXPRESSION  filter expression [default: MAF>0.01 & R2>0.8]
  --renameINFO
  --out=PREFIX          outname prefix [default: predixcan]
  --cluster=NAME        cluster name [default: open]
  --nojob               run in front end
  --dry-run             just show the codes
  --int                 control job submission from front end
  --njobs=NUMBER        number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['qc-dna'])
#!/bin/env python
'''
usage: sumstats dbsnp [options] --sumstats=FILE

options:
  --sumstats=FILE  summary statistics file
  --out=PREFIX     output name prefix [default: out]
  --chr=NAME       chromosome column name [default: CHR]
  --pos=NAME       position column name [default: BP]
  --ref=NAME       ref allele column name [default: A1]
  --alt=NAME       alternate allele column name [default: A2]
  --dry-run        dry run
  --nojob          run in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['dbsnp'])
#!/bin/env python
'''
usage: sumstats impute [options] --sumstats=FILE --ld=PLINKFILE --N=SAMPLESIZE

options:
  --sumstats=FILE  munged summary statistics file
  --out=PREFIX     output name prefix [default: clump_out]
  --ld=PLINKFILE   plink file without extension
  --bed=BEDFILE    bed file [default: |resources/sumstats/impute/fizi/locations.bed]
  --N=SAMPLESIZE   sample size
  --nojob          run in front end
  --int
  --njobs=NUMBER   number of parallel jobs when running in front end
  --dry-run        dry run snakemake
  --cluster=NAME   cluster name [default: open]
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['impute'])
        'launches_per_config': inp['fwd_rev_launches_per_config'],
        'frame_chunk_size': inp['fwd_rev_frame_chunk_size']
        },
    'md_file_paths': md_file_paths,  # file paths to the MD directories in the anchor file
    'prods_per_anchor': 1,  # number of simulations per anchor
    #'one_equil_per_anchor': True,  # True: all prod simulations are started from portions of one single equilibration. False: all 50 productions have their own equilibration.
    }
md_settings['absolute_mode'] = str(absolute_mode)
md_settings['temp_equil_settings']['temp_range'] = np.concatenate((
    np.arange(md_settings['temp_equil_settings']['start_temp'],
              md_settings['temp_equil_settings']['peak_temp'],
              md_settings['temp_equil_settings']['temp_increment']),
    np.arange(md_settings['temp_equil_settings']['peak_temp'],
              md_settings['temp_equil_settings']['end_temp'],
              -md_settings['temp_equil_settings']['temp_increment']),
    [md_settings['temp_equil_settings']['end_temp']]))
md_settings_all = {**md_settings, **sys_params, **tcl}
md.main(md_settings_all)
#print("md_settings:", md_settings)

if sys_params['bd']:
    bd_receptor_dry_pqr = parser.get_structure(
        'bd_receptor_dry_pqr', sys_params['bd_rec_pqr_filename'], pqr=True)
    bd_settings = {  # settings for the BD model
        'rec_struct': bd_receptor_dry_pqr,
        'lig_configs': lig_configs,
        'temperature': master_temperature,
        'threads': int(inp['bd_threads']),
        'fhpd_numtraj': inp['fhpd_numtraj'],
        # reaction sites information in milestone_settings below
        'browndye_bin_dir': inp['browndye_bin_dir'],
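# A standalone worked sketch of the temp_range ladder built above, with
# assumed values (the real start/peak/end temperatures and the increment
# come from the input file): the ladder heats from start_temp to peak_temp,
# cools back toward end_temp, then lands exactly on end_temp.
import numpy as np

start_temp, peak_temp, end_temp, temp_increment = 300.0, 350.0, 300.0, 10.0
temp_range = np.concatenate((
    np.arange(start_temp, peak_temp, temp_increment),   # heat: 300, 310, ..., 340
    np.arange(peak_temp, end_temp, -temp_increment),    # cool: 350, 340, ..., 310
    [end_temp]))                                        # finish exactly at end_temp
# temp_range -> [300. 310. 320. 330. 340. 350. 340. 330. 320. 310. 300.]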
#!/bin/env python
'''
usage: gcta cojo (--cojo-slct | --cojo-joint | --cojo-cond=FILE) --cojo-file=SUMMARY [options]

options:
  --cojo-file=SUMMARY  Summary file
  --bfile=PLINK        Plink file [default: |resources/magma/g1000_eur]
  --chr=NUMBER         Chromosome number
  --maf=NUMBER         Minor allele frequency
  --out=PREFIX         Outname prefix [default: genie_cojo]
  --extract=LIST       A file with a list of SNPs
  --cojo-slct          Select multiple associated SNPs through a stepwise selection procedure
  --cojo-joint         Estimate the joint effects of a subset of SNPs
  --cojo-cond=FILE     Perform single-SNP association analyses conditional on a set of SNPs
  --nojob              Run in front end
  --dry-run            Dry run
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['cojo'])
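# Hedged sketch of the required group in the usage pattern above: docopt
# enforces that exactly one of the three --cojo-* modes is given. The
# invocation below is hypothetical, not from the source:
#
#   $ gcta cojo --cojo-slct --cojo-file=summary.txt --chr=2
#
# Supplying none (or more than one) of --cojo-slct/--cojo-joint/--cojo-cond
# makes docopt print the usage string and exit.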
#!/bin/env python3
'''
usage: predixcan chunkvcf [options] --vcf=FILE

options:
  --vcf=FILE          vcf file or .list of files
  --nvariants=NUMBER  number of variants [default: 500000]
  --nsamples=NUMBER   number of samples [default: 5000]
  --out=PREFIX        outname prefix [default: predixcan]
  --cluster=NAME      minerva or genomedk [default: minerva]
  --nojob             run in front end
  --dry-run           just show the codes
  --int               submit jobs from front end
  --njobs=NUMBER      number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['chunkvcf'])
#!/bin/env python
'''
usage: gwas genabel [options] --pheno=FILE

options:
  --pheno=FILE    phenotype file
  --covar=FILE    covariate file
  --geno=FILE     genotype file or .list in genabel format
  --out=PREFIX    outname prefix [default: genabel_out]
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['genabel'])
#!/bin/env python
'''
usage: prs hub [options] --pheno=FILE --covar=FILE

options:
  --pheno=FILE    phenotype file
  --covar=FILE    covariate file
  --score=FILE    score file or .list
  --out=PREFIX    outname prefix [default: hub_out]
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['hub'])
#!/bin/env python3
'''
usage: predixcan wgcna [options] --expr=RDS

options:
  --expr=RDS      expression rds file
  --beta=NUMBER   beta value [default: NULL]
  --out=PREFIX    outname prefix [default: WGCNA]
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
  --int
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['wgcna'])
#!/bin/env python
'''
usage: genie hess [options] <sumstat>

options:
  --out=PREFIX  outname prefix [default: genie_gsmr]
  --nojob       run on front end
  --dry-run     dry run
  <sumstat>     sumstat files, split by chromosome; use * in the filename in place of the chromosome number
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../library')
import md

if __name__ == '__main__':
    md.main(docopt(__doc__), ['hess'])
'''
usage: geneset magma [options] <summary-file>

options:
  --out=PREFIX       outname prefix [default: output]
  --geneloc=FILE     geneloc file [default: |resources/magma/NCBI37.3.gene.loc.symbols]
  --ldfile=FILE      reference file in plink format [default: |resources/magma/g1000_eur]
  --ncol=NAME        name of the column containing the sample size [default: Neff]
  --nsamples=NUMBER  alternatively, the number of samples can be assigned like this
  --genesets=FILE    genesets file in gmt format [default: |resources/magma/gmt.list]
  --nojob            run on the front end
  --dry-run          dry run
  --njobs=NUMBER     number of parallel jobs [default: 1]
  <summary-file>     a single summary file or a text file (with extension .list) containing the list of summary files
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['magma', 'vegas', 'fastbat'])
'''
usage: sumstats clump [options] --sumstats=FILE --ld=PLINKFILE

options:
  --sumstats=FILE    munged summary statistics file
  --out=PREFIX       output name prefix [default: clump_out]
  --ld=PLINKFILE     plink file without extension
  --p1=NUMBER        pvalue threshold 1 [default: 1]
  --p2=NUMBER        pvalue threshold 2 [default: 1]
  --r2=NUMBER        rsquared value [default: 0.1]
  --distance=NUMBER  window distance in kb [default: 250]
  --nojob            run in front end
  --int
  --njobs=NUMBER     number of parallel jobs when running in front end
  --dry-run          dry run snakemake
  --other=ARGS       other arguments to pass to plink within quotes
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md
from md import process_list

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['clump'])
#!/bin/env python3
'''
usage: predixcan train_module [options] --module=FILE --out=PREFIX

options:
  --gds=FILE      genotype gds file [default: merged.gds]
  --bfile=NAME    plink base name [default: merged]
  --module=FILE   file with list of module genes [default: module.names.list]
  --out=PREFIX    outname prefix
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
  --int
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['train_module'])
#!/bin/env python3
'''
usage: gwas plink [options] --pheno=FILE --covar=FILE

options:
  --pheno=FILE    phenotype file
  --covar=FILE    covariate file
  --geno=FILE     genotype file or .list (without plink extension)
  --out=PREFIX    outname prefix [default: genabel_out]
  --nojob         run in front end
  --dry-run       just show the codes
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['plink'])
'''
usage: predixcan train [options] --gds=FILE --expr=FILE

options:
  --gds=FILE       genotype gds file
  --expr=FILE      expression file
  --genes=FILE     a file with list of genes with co-ordinates
  --priors=FILE    priors file
  --grouping=FILE  grouping file
  --nested
  --out=PREFIX     outname prefix [default: predixcan]
  --cluster=NAME   genomedk or minerva [default: minerva]
  --nojob          run in front end
  --dry-run        just show the codes
  --njobs=NUMBER   number of parallel jobs; applicable only when running in front end
  --int
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['train'])
#!/bin/env python3
'''
usage: predixcan predict [options] --gds=FILE --db=FILE

options:
  --gds=FILE      genotype gds file
  --db=FILE       prediction model db file
  --genes=FILE    a file with list of genes with co-ordinates
  --out=PREFIX    outname prefix [default: predixcan]
  --nojob         run in front end
  --cluster=NAME  cluster name [default: minerva]
  --dry-run       just show the codes
  --int           submit jobs from front end
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['predict'])
#!/bin/env python3
'''
usage: predixcan novel [options] --gwas=FILE

options:
  --gwas=FILE     gwas summary statistics
  --annot=FILE    gencode annot [default: |resources/predixcan/gencode.v27.build37.txt]
  --out=PREFIX    outname prefix [default: predixcan]
  --nojob         run in front end
  --dry-run       just show the codes
  --int           submit jobs from front end
  --njobs=NUMBER  number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['novel'])
#!/bin/env python3
'''
usage: predixcan covar [options] --expr=FILE

options:
  --expr=FILE      expression matrix file
  --thold=NUMBER   correlation value to filter [default: 0.8]
  --cutoff=NUMBER  filtering cutoff, 20 or 50 percent [default: 0.2]
  --out=PREFIX     outname prefix [default: predixcan]
  --nojob          run in front end
  --dry-run        just show the codes
  --int            submit jobs from front end
  --njobs=NUMBER   number of parallel jobs; applicable only when running in front end
'''
from docopt import docopt
import sys
sys.path.insert(1, sys.path[0] + '/../../../library')
import md

arguments = docopt(__doc__)

if __name__ == '__main__':
    md.main(arguments, ['covar'])