def _finished_file_state(path):
    """Classify a completion-marker file by its size.

    Returns:
        "present" if the file exists and is non-empty, "empty" if it exists
        with zero bytes, "missing" if its size cannot be read.
    """
    try:
        size = os.path.getsize(path)
    except OSError:
        # Only filesystem errors mean "not there"; anything else (for
        # example a KeyboardInterrupt) must propagate, not be swallowed.
        return "missing"
    if size != 0:
        return "present"
    return "empty"


def _check_report(state, samp, chrom, expected_out_file):
    """Build the --check message for a marker that is empty or missing.

    *chrom* is None when whole samples are processed. The message formats
    differ slightly between the two modes and are kept as they were.
    """
    if chrom is None:
        if state == "empty":
            return "File is empty: %s,%s" % (samp, expected_out_file)
        return "Does not exist: %s, %s" % (samp, expected_out_file)
    fields = (samp, chrom, expected_out_file)
    if state == "empty":
        return "File is empty: %s,%s,%s" % fields
    return "Does not exist: %s,%s,%s" % fields


def _remove_stale_marker(path):
    """Delete a marker left by a previous run without going through a shell."""
    try:
        os.remove(path)
    except OSError as err:
        # Best effort, like the former "rm": report and carry on.
        sys.stderr.write("Could not remove %s: %s\n" % (path, err))


def _database_args(options, sqlite_db_dir):
    """Return the trailing database arguments of the command line.

    A sqlite directory takes precedence over the MySQL settings; --passwd
    is only forwarded when it is not the empty string.
    """
    if options.sqlite_db_dir:
        return "--sqlite_db_dir %s" % sqlite_db_dir
    if options.passwd == "":
        return "--host %s --user %s" % (options.host, options.user)
    return "--host %s --user %s --passwd %s" % (options.host,
                                                options.user,
                                                options.passwd)


def main():
    """Build and launch one SCRIPT command per sample (or sample/chromosome).

    For every sample (and, with --by_chr, every chromosome returned by
    getChr) the function creates the output subdirectory, changes into it,
    and looks for a non-empty "<job>_finished.txt" marker there.

    * --check only reports markers that are empty or missing.
    * Without --force, jobs whose marker is present are skipped; with
      --force the old marker is deleted and the job is run again.
    * --print_cmd prints the command instead of running it.
    * --LSF hands the command to runLSF; otherwise it is run locally:
      every num_processes-th command is run with os.system (which waits),
      the others are started with Popen and not waited on.

    Always terminates the interpreter through sys.exit.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "-s", dest="samples", type="string",
        help=("Comma separated list of samples to run or a file with "
              "the names of the samples to run. If a file is given, the "
              "first column will be used as the sample column and is "
              "assumed tab-delimited"),
        default=None)
    opt_parser.add_option(
        "-i", dest="input_dir", type="string",
        help="Root of directory containing input files for all samples.",
        default=None)
    opt_parser.add_option(
        "-o", dest="output_dir", type="string",
        help=("Root of directory that will place all output into "
              "subdirectories."),
        default=None)
    opt_parser.add_option(
        "--sqlite_db_dir", dest="sqlite_db_dir", type="string",
        help=("Location of sqlite databases. If sqlite databases are "
              "used, will override usage of a MySQL database."),
        default=None)
    opt_parser.add_option(
        "--host", dest="host", type="string",
        help="MySQL database host. Def='%s'" % HOST,
        default=HOST)
    opt_parser.add_option(
        "--user", dest="user", type="string",
        help="MySQL database user. Def='%s'" % USER,
        default=USER)
    opt_parser.add_option(
        "--passwd", dest="passwd", type="string",
        help="MySQL database password. Def='%s'" % PASSWD,
        default=PASSWD)
    opt_parser.add_option(
        "--txt_db1", dest="txt_db1", type="string",
        help=("Database of transcript annotations derived from a gtf "
              "file. Used to define exon and intron and gene "
              "coordinates."),
        default=None)
    opt_parser.add_option(
        "--txt_db2", dest="txt_db2", type="string",
        help=("Database of transcript annotations derived from a gtf "
              "file. Used to identify alternative first and last exons. "
              "Can be the same or different as txt_db1. This annotation "
              "should be fairly clean of fragmented transcripts."),
        default=None)
    opt_parser.add_option(
        "--txt_db3", dest="txt_db3", type="string",
        help=("Database of transcript annotations derived from a gtf "
              "file. Used for annotating gene names and whether an "
              "intron/junction is annotated or not. By default, txt_db1 "
              "will be used for this information."),
        default=None)
    opt_parser.add_option(
        "--jcn_seq_len", dest="jcn_seq_len", type="int",
        help=("I recommmend this value to be (read_length-6)*2 which "
              "assumes reads aligned to junctions with at least a 6pb "
              "overhang. If tophat was used for the alignment and you "
              "used the -a option with something < 6, give the value "
              "(read_length-(anchor length)*2."),
        default=None)
    opt_parser.add_option(
        "-p", dest="num_processes", type="int",
        help=("Will run getASEventReadCounts.py simultaneously with this "
              "many samples. Default=%d" % DEF_NUM_PROCESSES),
        default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--nice", dest="nice", action="store_true",
        help="When running locally, use nice",
        default=False)
    opt_parser.add_option(
        "--LSF", dest="run_lsf", action="store_true",
        help="Will launch jobs on LSF. Default is running on local.",
        default=False)
    opt_parser.add_option(
        "--week", dest="week", action="store_true",
        help="Will launch jobs on LSF using week queue.",
        default=False)
    opt_parser.add_option(
        "--by_chr", dest="by_chr", action="store_true",
        help="Indicates that input files are broken up by chromosome",
        default=False)
    opt_parser.add_option(
        "--force", dest="force", action="store_true",
        help=("By default, will check for the existence of the final "
              "output before running commands. This option will force "
              "all runs."),
        default=False)
    opt_parser.add_option(
        "--check", dest="check", action="store_true",
        help=("Will check samples that are not done and print out which "
              "need to still be run"),
        default=False)
    opt_parser.add_option(
        "--print_cmd", dest="print_cmd", action="store_true",
        help=("Will print commands that will be run, but will not run "
              "them. Used for debugging."),
        default=False)
    opt_parser.add_option(
        "--keep_intermediate", dest="keep_interm", action="store_true",
        help=("Will remove intermediate files by default. Use this "
              "option to keep them."),
        default=False)

    options, _ = opt_parser.parse_args()

    # validate the command line arguments
    for required in ("-s", "-i", "-o", "--txt_db1", "--txt_db2",
                     "--jcn_seq_len"):
        opt_parser.check_required(required)

    samples = getSampleNames(options.samples)

    if not os.path.exists(options.input_dir):
        print("Input directory does not exist.")
        opt_parser.print_help()
        sys.exit(1)
    # Paths are made absolute because the loop below changes directory.
    input_dir = os.path.abspath(options.input_dir).rstrip("/")

    sqlite_db_dir = None
    if options.sqlite_db_dir:
        sqlite_db_dir = formatDir(options.sqlite_db_dir)

    if os.path.exists(options.output_dir):
        output_dir = os.path.abspath(options.output_dir)
    else:
        os.mkdir(options.output_dir)
        output_dir = os.path.abspath(options.output_dir)
        print("Creating output directory: %s" % output_dir)
    output_dir = output_dir.rstrip("/")

    num_processes = options.num_processes

    # A single None stands for "whole sample" so that both modes share one
    # loop body.
    chroms = getChr(input_dir) if options.by_chr else [None]
    db_args = _database_args(options, sqlite_db_dir)

    ctr = 0
    for samp in samples:
        # Check for output subdirectory
        samp_dir = output_dir + "/" + samp
        if not os.path.exists(samp_dir):
            os.mkdir(samp_dir)

        for chrom in chroms:
            if chrom is None:
                job_name = samp
                work_dir = samp_dir
                pseudo_name = "pseudo"
                samp_prefix = "%s/%s/%s" % (input_dir, samp, samp)
                # Week queue if running whole samples
                queue = "week"
            else:
                job_name = samp + "_" + chrom
                work_dir = samp_dir + "/" + job_name
                pseudo_name = "pseudo_" + chrom
                samp_prefix = "%s/%s/%s/%s" % (input_dir, samp,
                                               job_name, job_name)
                queue = "week" if options.week else "hour"
                if not os.path.exists(work_dir):
                    os.mkdir(work_dir)
            pseudo_prefix = "%s/%s/%s" % (input_dir, pseudo_name,
                                          pseudo_name)

            # The marker is looked up, and the command later launched,
            # relative to the job's own output directory.
            os.chdir(work_dir)

            expected_out_file = "%s_finished.txt" % job_name
            state = _finished_file_state(expected_out_file)

            if options.check:
                if state != "present":
                    print(_check_report(state, samp, chrom,
                                        expected_out_file))
                continue

            if state == "present":
                if not options.force:
                    continue
                # Delete previous expected out file to prevent confusions
                # with previous runs when rechecking again
                _remove_stale_marker(expected_out_file)

            ctr += 1

            parts = [
                "python %s" % SCRIPT,
                "--jcn1 %s_junctions.bed" % pseudo_prefix,
                "--jcn2 %s_junctions.bed" % samp_prefix,
                "--genome_reads1 %s_genome_reads.txt.gz" % pseudo_prefix,
                "--genome_reads2 %s_genome_reads.txt.gz" % samp_prefix,
                "--ie1 %s_intron_exon_junction_counts.txt" % pseudo_prefix,
                "--ie2 %s_intron_exon_junction_counts.txt" % samp_prefix,
                "-p %s" % job_name,
                "--txt_db1 %s" % options.txt_db1,
                "--txt_db2 %s" % options.txt_db2,
            ]
            if options.txt_db3:
                parts.append("--txt_db3 %s" % options.txt_db3)
            parts.append("--jcn_seq_len %d" % options.jcn_seq_len)
            if chrom is not None:
                parts.append("--by_chr %s" % chrom)
            if options.keep_interm:
                parts.append("--keep_intermediate")
            # Now for databases
            parts.append(db_args)
            cmd = " ".join(parts)

            if options.print_cmd:
                if options.nice and not options.run_lsf:
                    cmd = "nice " + cmd
                print(cmd)
                continue

            if options.run_lsf:
                runLSF(cmd,
                       "%s.getASEventReadCounts.bsub.out" % job_name,
                       job_name,
                       queue)
                continue

            if options.nice:
                cmd = "nice " + cmd

            if ctr % num_processes == 0:
                # Blocking call: holds the loop until this job returns.
                os.system(cmd)
            else:
                print(cmd)
                Popen(cmd, shell=True, executable=SHELL)

    sys.exit(0)
def _chromosomes_from_tmp_beds(input_dir):
    """Derive chromosome names from the ".bed" files directly in *input_dir*.

    For each non-directory entry ending in ".bed", the text before
    "preProcess" and after the last "tmp" is taken and surrounding
    underscores are stripped, e.g.
    "tmp_chr1_preProcess_getASEventReadCounts_step2.bed" -> "chr1".
    """
    chroms = []
    for entry in os.listdir(input_dir):
        if os.path.isdir(input_dir + "/" + entry):
            continue
        if not entry.endswith(".bed"):
            continue
        stem = entry.split("preProcess")[0].split("tmp")[-1]
        chroms.append(stem.strip("_"))
    return chroms


def _ie_output_state(path):
    """Classify an expected output file by its size.

    Returns:
        "present" if the file exists and is non-empty, "empty" if it exists
        with zero bytes, "missing" if its size cannot be read.
    """
    try:
        size = os.path.getsize(path)
    except OSError:
        # Only filesystem errors mean "not there"; anything else (for
        # example a KeyboardInterrupt) must propagate, not be swallowed.
        return "missing"
    if size == 0:
        return "empty"
    return "present"


def main():
    """Launch SCRIPT once per (sample directory, chromosome) pair.

    Chromosomes come from the ".bed" files found directly in --input_dir;
    samples are the subdirectories of --input_dir. A pair is considered
    done when its "*_intron_exon_junction_counts.txt" file is non-empty.

    * --check only reports files that are empty or missing.
    * Without --force, pairs that are already done are skipped.
    * With --num_processes set, commands run locally: every
      num_processes-th command is run with os.system (which waits), the
      others are started with Popen and not waited on. Otherwise each
      command is handed to runLSF.

    Always terminates the interpreter through sys.exit.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "--input_dir", dest="input_dir", type="string",
        help="Root directory of all input files.",
        default=None)
    opt_parser.add_option(
        "--LSF", dest="lsf_group", type="string",
        help=("Launches jobs on LSF by default. It will use this "
              "specified group. DEF=%s" % DEF_GROUP),
        default=DEF_GROUP)
    opt_parser.add_option(
        "--lsf_queue", dest="lsf_queue", type="string",
        help=("If launching jobs on LSF, it will use the specified "
              "queue. DEF=%s" % DEF_QUEUE),
        default=DEF_QUEUE)
    opt_parser.add_option(
        "--num_processes", dest="num_processes", type="int",
        help=("If jobs should be run locally, indicate by putting the "
              "number of processes to batch."),
        default=None)
    opt_parser.add_option(
        "--min_overhang", dest="min_overhang", type="int",
        help=("Minimum overhang used to determine a junction alignment "
              "(Used as input to find intron-exon junctions. "
              "Default=%d" % DEF_OVERHANG),
        default=DEF_OVERHANG)
    opt_parser.add_option(
        "--force", dest="force", action="store_true",
        help=("By default, will check for the existence of the output "
              "file. If it is non-zero, then it will not run the sample. "
              "This option forces runs of everything"),
        default=False)
    opt_parser.add_option(
        "--check", dest="check", action="store_true",
        help=("Will check samples that are not done and print out which "
              "need to still be run"),
        default=False)
    opt_parser.add_option(
        "--nice", dest="nice", action="store_true",
        help="Will run locally, using nice",
        default=False)

    options, _ = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--input_dir")

    input_dir = formatDir(options.input_dir)
    num_processes = options.num_processes

    # Will use the tmp files in the input directory to determine
    # chromosomes to process.
    chr_list = _chromosomes_from_tmp_beds(input_dir)

    # Same value for every LSF submission, so build it once.
    tmp_file = "%s/tmp.txt" % os.curdir

    # Now go through each sample directory to get to subdirectory, then
    # run command
    ctr = 0  # For num processes
    for sample_dir in os.listdir(input_dir):
        if not os.path.isdir(input_dir + "/" + sample_dir):
            continue

        for this_chr in chr_list:
            job_name = "%s_%s" % (sample_dir, this_chr)
            expected_out_file = (
                "%s/%s/%s_%s/%s_%s_intron_exon_junction_counts.txt"
                % (input_dir, sample_dir, sample_dir, this_chr,
                   sample_dir, this_chr))

            state = _ie_output_state(expected_out_file)

            if options.check:
                if state == "empty":
                    print("File is empty: %s" % expected_out_file)
                elif state == "missing":
                    print("Does not exist: %s" % expected_out_file)
                continue

            if state == "present" and not options.force:
                continue

            ctr += 1

            cmd = " ".join([
                "python %s" % SCRIPT,
                "-i %s/%s" % (input_dir, sample_dir),
                "-n %s" % job_name,
                "-t %s/tmp_%s_preProcess_getASEventReadCounts_step2.bed"
                % (input_dir, this_chr),
                "--min_overhang %d" % options.min_overhang,
            ])

            if num_processes:
                if options.nice:
                    cmd = "nice " + cmd
                if ctr % num_processes == 0:
                    # Blocking call: holds the loop until this job returns.
                    os.system(cmd)
                else:
                    print(cmd)
                    Popen(cmd, shell=True, executable=SHELL)
            else:
                runLSF(cmd,
                       "%s.preProcess_3.bsub.out" % job_name,
                       job_name,
                       options.lsf_queue,
                       group=options.lsf_group,
                       tmp_file_name=tmp_file)

    sys.exit(0)
def _finished_file_state(path):
    """Classify a completion-marker file by its size.

    Returns:
        "present" if the file exists and is non-empty, "empty" if it exists
        with zero bytes, "missing" if its size cannot be read.
    """
    try:
        size = os.path.getsize(path)
    except OSError:
        # Only filesystem errors mean "not there"; anything else (for
        # example a KeyboardInterrupt) must propagate, not be swallowed.
        return "missing"
    if size != 0:
        return "present"
    return "empty"


def _check_report(state, samp, chrom, expected_out_file):
    """Build the --check message for a marker that is empty or missing.

    *chrom* is None when whole samples are processed. The message formats
    differ slightly between the two modes and are kept as they were.
    """
    if chrom is None:
        if state == "empty":
            return "File is empty: %s,%s" % (samp, expected_out_file)
        return "Does not exist: %s, %s" % (samp, expected_out_file)
    fields = (samp, chrom, expected_out_file)
    if state == "empty":
        return "File is empty: %s,%s,%s" % fields
    return "Does not exist: %s,%s,%s" % fields


def _remove_stale_marker(path):
    """Delete a marker left by a previous run without going through a shell."""
    try:
        os.remove(path)
    except OSError as err:
        # Best effort, like the former "rm": report and carry on.
        sys.stderr.write("Could not remove %s: %s\n" % (path, err))


def _database_args(options, sqlite_db_dir):
    """Return the trailing database arguments of the command line.

    A sqlite directory takes precedence over the MySQL settings; --passwd
    is only forwarded when it is not the empty string.
    """
    if options.sqlite_db_dir:
        return "--sqlite_db_dir %s" % sqlite_db_dir
    if options.passwd == "":
        return "--host %s --user %s" % (options.host, options.user)
    return "--host %s --user %s --passwd %s" % (options.host,
                                                options.user,
                                                options.passwd)


def main():
    """Build and launch one SCRIPT command per sample (or sample/chromosome).

    For every sample (and, with --by_chr, every chromosome returned by
    getChr) the function creates the output subdirectory, changes into it,
    and looks for a non-empty "<job>_finished.txt" marker there.

    * --check only reports markers that are empty or missing.
    * Without --force, jobs whose marker is present are skipped; with
      --force the old marker is deleted and the job is run again.
    * --print_cmd prints the command instead of running it.
    * --LSF hands the command to runLSF; otherwise it is run locally:
      every num_processes-th command is run with os.system (which waits),
      the others are started with Popen and not waited on.

    Always terminates the interpreter through sys.exit.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "-s", dest="samples", type="string",
        help=("Comma separated list of samples to run or a file with "
              "the names of the samples to run. If a file is given, the "
              "first column will be used as the sample column and is "
              "assumed tab-delimited"),
        default=None)
    opt_parser.add_option(
        "-i", dest="input_dir", type="string",
        help="Root of directory containing input files for all samples.",
        default=None)
    opt_parser.add_option(
        "-o", dest="output_dir", type="string",
        help=("Root of directory that will place all output into "
              "subdirectories."),
        default=None)
    opt_parser.add_option(
        "--sqlite_db_dir", dest="sqlite_db_dir", type="string",
        help=("Location of sqlite databases. If sqlite databases are "
              "used, will override usage of a MySQL database."),
        default=None)
    opt_parser.add_option(
        "--host", dest="host", type="string",
        help="MySQL database host. Def='%s'" % HOST,
        default=HOST)
    opt_parser.add_option(
        "--user", dest="user", type="string",
        help="MySQL database user. Def='%s'" % USER,
        default=USER)
    opt_parser.add_option(
        "--passwd", dest="passwd", type="string",
        help="MySQL database password. Def='%s'" % PASSWD,
        default=PASSWD)
    opt_parser.add_option(
        "--txt_db1", dest="txt_db1", type="string",
        help=("Database of transcript annotations derived from a gtf "
              "file. Used to define exon and intron and gene "
              "coordinates."),
        default=None)
    opt_parser.add_option(
        "--txt_db2", dest="txt_db2", type="string",
        help=("Database of transcript annotations derived from a gtf "
              "file. Used to identify alternative first and last exons. "
              "Can be the same or different as txt_db1. This annotation "
              "should be fairly clean of fragmented transcripts."),
        default=None)
    opt_parser.add_option(
        "--txt_db3", dest="txt_db3", type="string",
        help=("Database of transcript annotations derived from a gtf "
              "file. Used for annotating gene names and whether an "
              "intron/junction is annotated or not. By default, txt_db1 "
              "will be used for this information."),
        default=None)
    opt_parser.add_option(
        "--jcn_seq_len", dest="jcn_seq_len", type="int",
        help=("I recommmend this value to be (read_length-6)*2 which "
              "assumes reads aligned to junctions with at least a 6pb "
              "overhang. If tophat was used for the alignment and you "
              "used the -a option with something < 6, give the value "
              "(read_length-(anchor length)*2."),
        default=None)
    opt_parser.add_option(
        "-p", dest="num_processes", type="int",
        help=("Will run getASEventReadCounts.py simultaneously with this "
              "many samples. Default=%d" % DEF_NUM_PROCESSES),
        default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--nice", dest="nice", action="store_true",
        help="When running locally, use nice",
        default=False)
    opt_parser.add_option(
        "--LSF", dest="run_lsf", action="store_true",
        help="Will launch jobs on LSF. Default is running on local.",
        default=False)
    opt_parser.add_option(
        "--week", dest="week", action="store_true",
        help="Will launch jobs on LSF using week queue.",
        default=False)
    opt_parser.add_option(
        "--by_chr", dest="by_chr", action="store_true",
        help="Indicates that input files are broken up by chromosome",
        default=False)
    opt_parser.add_option(
        "--force", dest="force", action="store_true",
        help=("By default, will check for the existence of the final "
              "output before running commands. This option will force "
              "all runs."),
        default=False)
    opt_parser.add_option(
        "--check", dest="check", action="store_true",
        help=("Will check samples that are not done and print out which "
              "need to still be run"),
        default=False)
    opt_parser.add_option(
        "--print_cmd", dest="print_cmd", action="store_true",
        help=("Will print commands that will be run, but will not run "
              "them. Used for debugging."),
        default=False)
    opt_parser.add_option(
        "--keep_intermediate", dest="keep_interm", action="store_true",
        help=("Will remove intermediate files by default. Use this "
              "option to keep them."),
        default=False)

    options, _ = opt_parser.parse_args()

    # validate the command line arguments
    for required in ("-s", "-i", "-o", "--txt_db1", "--txt_db2",
                     "--jcn_seq_len"):
        opt_parser.check_required(required)

    samples = getSampleNames(options.samples)

    if not os.path.exists(options.input_dir):
        print("Input directory does not exist.")
        opt_parser.print_help()
        sys.exit(1)
    # Paths are made absolute because the loop below changes directory.
    input_dir = os.path.abspath(options.input_dir).rstrip("/")

    sqlite_db_dir = None
    if options.sqlite_db_dir:
        sqlite_db_dir = formatDir(options.sqlite_db_dir)

    if os.path.exists(options.output_dir):
        output_dir = os.path.abspath(options.output_dir)
    else:
        os.mkdir(options.output_dir)
        output_dir = os.path.abspath(options.output_dir)
        print("Creating output directory: %s" % output_dir)
    output_dir = output_dir.rstrip("/")

    num_processes = options.num_processes

    # A single None stands for "whole sample" so that both modes share one
    # loop body.
    chroms = getChr(input_dir) if options.by_chr else [None]
    db_args = _database_args(options, sqlite_db_dir)

    ctr = 0
    for samp in samples:
        # Check for output subdirectory
        samp_dir = output_dir + "/" + samp
        if not os.path.exists(samp_dir):
            os.mkdir(samp_dir)

        for chrom in chroms:
            if chrom is None:
                job_name = samp
                work_dir = samp_dir
                pseudo_name = "pseudo"
                samp_prefix = "%s/%s/%s" % (input_dir, samp, samp)
                # Week queue if running whole samples
                queue = "week"
            else:
                job_name = samp + "_" + chrom
                work_dir = samp_dir + "/" + job_name
                pseudo_name = "pseudo_" + chrom
                samp_prefix = "%s/%s/%s/%s" % (input_dir, samp,
                                               job_name, job_name)
                queue = "week" if options.week else "hour"
                if not os.path.exists(work_dir):
                    os.mkdir(work_dir)
            pseudo_prefix = "%s/%s/%s" % (input_dir, pseudo_name,
                                          pseudo_name)

            # The marker is looked up, and the command later launched,
            # relative to the job's own output directory.
            os.chdir(work_dir)

            expected_out_file = "%s_finished.txt" % job_name
            state = _finished_file_state(expected_out_file)

            if options.check:
                if state != "present":
                    print(_check_report(state, samp, chrom,
                                        expected_out_file))
                continue

            if state == "present":
                if not options.force:
                    continue
                # Delete previous expected out file to prevent confusions
                # with previous runs when rechecking again
                _remove_stale_marker(expected_out_file)

            ctr += 1

            parts = [
                "python %s" % SCRIPT,
                "--jcn1 %s_junctions.bed" % pseudo_prefix,
                "--jcn2 %s_junctions.bed" % samp_prefix,
                "--genome_reads1 %s_genome_reads.txt.gz" % pseudo_prefix,
                "--genome_reads2 %s_genome_reads.txt.gz" % samp_prefix,
                "--ie1 %s_intron_exon_junction_counts.txt" % pseudo_prefix,
                "--ie2 %s_intron_exon_junction_counts.txt" % samp_prefix,
                "-p %s" % job_name,
                "--txt_db1 %s" % options.txt_db1,
                "--txt_db2 %s" % options.txt_db2,
            ]
            if options.txt_db3:
                parts.append("--txt_db3 %s" % options.txt_db3)
            parts.append("--jcn_seq_len %d" % options.jcn_seq_len)
            if chrom is not None:
                parts.append("--by_chr %s" % chrom)
            if options.keep_interm:
                parts.append("--keep_intermediate")
            # Now for databases
            parts.append(db_args)
            cmd = " ".join(parts)

            if options.print_cmd:
                if options.nice and not options.run_lsf:
                    cmd = "nice " + cmd
                print(cmd)
                continue

            if options.run_lsf:
                runLSF(cmd,
                       "%s.getASEventReadCounts.bsub.out" % job_name,
                       job_name,
                       queue)
                continue

            if options.nice:
                cmd = "nice " + cmd

            if ctr % num_processes == 0:
                # Blocking call: holds the loop until this job returns.
                os.system(cmd)
            else:
                print(cmd)
                Popen(cmd, shell=True, executable=SHELL)

    sys.exit(0)
def _chromosomes_from_tmp_beds(input_dir):
    """Derive chromosome names from the ".bed" files directly in *input_dir*.

    For each non-directory entry ending in ".bed", the text before
    "preProcess" and after the last "tmp" is taken and surrounding
    underscores are stripped, e.g.
    "tmp_chr1_preProcess_getASEventReadCounts_step2.bed" -> "chr1".
    """
    chroms = []
    for entry in os.listdir(input_dir):
        if os.path.isdir(input_dir + "/" + entry):
            continue
        if not entry.endswith(".bed"):
            continue
        stem = entry.split("preProcess")[0].split("tmp")[-1]
        chroms.append(stem.strip("_"))
    return chroms


def _ie_output_state(path):
    """Classify an expected output file by its size.

    Returns:
        "present" if the file exists and is non-empty, "empty" if it exists
        with zero bytes, "missing" if its size cannot be read.
    """
    try:
        size = os.path.getsize(path)
    except OSError:
        # Only filesystem errors mean "not there"; anything else (for
        # example a KeyboardInterrupt) must propagate, not be swallowed.
        return "missing"
    if size == 0:
        return "empty"
    return "present"


def main():
    """Launch SCRIPT once per (sample directory, chromosome) pair.

    Chromosomes come from the ".bed" files found directly in --input_dir;
    samples are the subdirectories of --input_dir. A pair is considered
    done when its "*_intron_exon_junction_counts.txt" file is non-empty.

    * --check only reports files that are empty or missing.
    * Without --force, pairs that are already done are skipped.
    * With --num_processes set, commands run locally: every
      num_processes-th command is run with os.system (which waits), the
      others are started with Popen and not waited on. Otherwise each
      command is handed to runLSF.

    Always terminates the interpreter through sys.exit.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "--input_dir", dest="input_dir", type="string",
        help="Root directory of all input files.",
        default=None)
    opt_parser.add_option(
        "--LSF", dest="lsf_group", type="string",
        help=("Launches jobs on LSF by default. It will use this "
              "specified group. DEF=%s" % DEF_GROUP),
        default=DEF_GROUP)
    opt_parser.add_option(
        "--lsf_queue", dest="lsf_queue", type="string",
        help=("If launching jobs on LSF, it will use the specified "
              "queue. DEF=%s" % DEF_QUEUE),
        default=DEF_QUEUE)
    opt_parser.add_option(
        "--num_processes", dest="num_processes", type="int",
        help=("If jobs should be run locally, indicate by putting the "
              "number of processes to batch."),
        default=None)
    opt_parser.add_option(
        "--min_overhang", dest="min_overhang", type="int",
        help=("Minimum overhang used to determine a junction alignment "
              "(Used as input to find intron-exon junctions. "
              "Default=%d" % DEF_OVERHANG),
        default=DEF_OVERHANG)
    opt_parser.add_option(
        "--force", dest="force", action="store_true",
        help=("By default, will check for the existence of the output "
              "file. If it is non-zero, then it will not run the sample. "
              "This option forces runs of everything"),
        default=False)
    opt_parser.add_option(
        "--check", dest="check", action="store_true",
        help=("Will check samples that are not done and print out which "
              "need to still be run"),
        default=False)
    opt_parser.add_option(
        "--nice", dest="nice", action="store_true",
        help="Will run locally, using nice",
        default=False)

    options, _ = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("--input_dir")

    input_dir = formatDir(options.input_dir)
    num_processes = options.num_processes

    # Will use the tmp files in the input directory to determine
    # chromosomes to process.
    chr_list = _chromosomes_from_tmp_beds(input_dir)

    # Same value for every LSF submission, so build it once.
    tmp_file = "%s/tmp.txt" % os.curdir

    # Now go through each sample directory to get to subdirectory, then
    # run command
    ctr = 0  # For num processes
    for sample_dir in os.listdir(input_dir):
        if not os.path.isdir(input_dir + "/" + sample_dir):
            continue

        for this_chr in chr_list:
            job_name = "%s_%s" % (sample_dir, this_chr)
            expected_out_file = (
                "%s/%s/%s_%s/%s_%s_intron_exon_junction_counts.txt"
                % (input_dir, sample_dir, sample_dir, this_chr,
                   sample_dir, this_chr))

            state = _ie_output_state(expected_out_file)

            if options.check:
                if state == "empty":
                    print("File is empty: %s" % expected_out_file)
                elif state == "missing":
                    print("Does not exist: %s" % expected_out_file)
                continue

            if state == "present" and not options.force:
                continue

            ctr += 1

            cmd = " ".join([
                "python %s" % SCRIPT,
                "-i %s/%s" % (input_dir, sample_dir),
                "-n %s" % job_name,
                "-t %s/tmp_%s_preProcess_getASEventReadCounts_step2.bed"
                % (input_dir, this_chr),
                "--min_overhang %d" % options.min_overhang,
            ])

            if num_processes:
                if options.nice:
                    cmd = "nice " + cmd
                if ctr % num_processes == 0:
                    # Blocking call: holds the loop until this job returns.
                    os.system(cmd)
                else:
                    print(cmd)
                    Popen(cmd, shell=True, executable=SHELL)
            else:
                runLSF(cmd,
                       "%s.preProcess_3.bsub.out" % job_name,
                       job_name,
                       options.lsf_queue,
                       group=options.lsf_group,
                       tmp_file_name=tmp_file)

    sys.exit(0)
def _count_tables_complete(paths):
    """Return True only if every file in *paths* exists and is non-empty."""
    for path in paths:
        try:
            if os.path.getsize(path) == 0:
                return False
        except OSError:
            # A file whose size cannot be read counts as not finished,
            # regardless of how many earlier files were fine.
            return False
    return True


def main():
    """Launch SCRIPT once per chromosome returned by getChr(input_dir).

    A chromosome is considered done when all six
    "tmp_createAS_CountTables_<chr>_*" count files in the root directory
    exist and are non-empty.

    * --check only reports chromosomes that are not done.
    * Without --force, chromosomes that are done are skipped.
    * --print_cmd prints the command instead of running it.
    * --run_LSF hands the command to runLSF; otherwise it is run locally:
      every num_processes-th command is run with os.system (which waits),
      the others are started with Popen and not waited on.

    Always terminates the interpreter through sys.exit.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "-d", dest="root_dir", type="string",
        help=("Root directory that contains subdirectoires with output "
              "from getASEventReadCounts"),
        default=None)
    opt_parser.add_option(
        "-i", dest="input_dir", type="string",
        help=("Directory containing original input files to "
              "getASEventReadCounts.py. This is used to obtain the "
              "chromosome information."),
        default=None)
    opt_parser.add_option(
        "--jcn_seq_len", dest="jcn_seq_len", type="int",
        help="Value used in getASEventReadCounts",
        default=None)
    opt_parser.add_option(
        "-s", dest="samples", type="string",
        help=("Comma separated list of the samples that will be used or "
              "a file of sample names, one per line. The order which "
              "they are given is the order in the output of the file."),
        default=None)
    opt_parser.add_option(
        "--num_processes", dest="num_processes", type="int",
        help=("Will run each chromosome in batches using this number of "
              "parallel processes. DEF=%d" % DEF_NUM_PROCESSES),
        default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--print_cmd", dest="print_cmd", action="store_true",
        help="Will not run any processes, but print the commands",
        default=False)
    opt_parser.add_option(
        "--check", dest="check", action="store_true",
        help="Will check which samples are not finished.",
        default=False)
    opt_parser.add_option(
        "--force", dest="force", action="store_true",
        help=("By default, will only run jobs that need to be completed. "
              "This will force to run all jobs."),
        default=False)
    opt_parser.add_option(
        "--run_LSF", dest="run_lsf", action="store_true",
        help="Will run everything through LSF",
        default=False)

    options, _ = opt_parser.parse_args()

    # validate the command line arguments
    for required in ("-d", "-i", "-s", "--jcn_seq_len"):
        opt_parser.check_required(required)

    root_dir = formatDir(options.root_dir)
    input_dir = formatDir(options.input_dir)

    # Change to the root directory to make sure output files are put here
    os.chdir(root_dir)

    samples = options.samples
    num_processes = options.num_processes

    output_suffixes = (
        "_AS_exclusion_inclusion_counts.txt",
        "_left_intron_counts.txt",
        "_right_intron_counts.txt",
        "_AS_exclusion_inclusion_counts_lenNorm.txt",
        "_left_intron_counts_lenNorm.txt",
        "_right_intron_counts_lenNorm.txt",
    )

    ctr = 0
    for this_chr in getChr(input_dir):
        out_prefix = "%s/tmp_createAS_CountTables_%s" % (root_dir, this_chr)
        expected_out_files = [out_prefix + suffix
                              for suffix in output_suffixes]

        # Previously a missing file that followed a good one left the
        # chromosome flagged as finished, so it was silently skipped.
        files_are_present = _count_tables_complete(expected_out_files)

        if options.check:
            if not files_are_present:
                print("Cannot find files for: %s" % this_chr)
            continue

        if files_are_present and not options.force:
            continue

        ctr += 1

        cmd = " ".join([
            "python %s" % SCRIPT,
            "-d %s" % root_dir,
            "-o %s" % out_prefix,
            "-s %s" % samples,
            "--jcn_seq_len %d" % options.jcn_seq_len,
            "--which_chr %s" % this_chr,
        ])

        if options.print_cmd:
            print(cmd)
            continue

        if options.run_lsf:
            runLSF(cmd,
                   "%s.createAS_CountTables.bsub.out" % this_chr,
                   samples.replace(",", "-") + "_" + this_chr,
                   "hour")
            continue

        if ctr % num_processes == 0:
            # Blocking call: holds the loop until this job returns.
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    sys.exit(0)
def main():
    """Launch one count-table job per chromosome, skipping finished ones.

    Parses the command line, changes into the root directory, and walks the
    chromosomes returned by ``getChr(input_dir)``.  For each chromosome the
    six expected ``tmp_createAS_CountTables_<chr>_*`` output files are
    inspected; a chromosome counts as finished only when every one of them
    exists and is non-empty.

    Depending on the options, each unfinished chromosome (or every
    chromosome with ``--force``) is then:

    * only reported (``--check``),
    * only printed as a command line (``--print_cmd``),
    * submitted through ``runLSF`` (``--run_LSF``), or
    * run locally, with every ``num_processes``-th job run through
      ``os.system`` and the others started with ``Popen``.

    The function always ends by calling ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "-d",
        dest="root_dir",
        type="string",
        help=(
            "Root directory that contains subdirectoires with output "
            "from getASEventReadCounts"
        ),
        default=None,
    )
    opt_parser.add_option(
        "-i",
        dest="input_dir",
        type="string",
        help=(
            "Directory containing original input files to "
            "getASEventReadCounts.py. This is used to obtain the "
            "chromosome information."
        ),
        default=None,
    )
    opt_parser.add_option(
        "--jcn_seq_len",
        dest="jcn_seq_len",
        type="int",
        help="Value used in getASEventReadCounts",
        default=None,
    )
    opt_parser.add_option(
        "-s",
        dest="samples",
        type="string",
        help=(
            "Comma separated list of the samples that will be used or a "
            "file of sample names, one per line. The order which they are "
            "given is the order in the output of the file."
        ),
        default=None,
    )
    # opt_parser.add_option("--lengthNorm",
    #                       dest="lengthNorm",
    #                       action="store_true",
    #                       help="""Flag to indicate length normalization was
    #                               done on the counts. Used for splitting the IR
    #                               counts back into left and right counts""",
    #                       default=False)
    opt_parser.add_option(
        "--num_processes",
        dest="num_processes",
        type="int",
        help=(
            "Will run each chromosome in batches using this number of "
            "parallel processes. DEF=%d"
        ) % DEF_NUM_PROCESSES,
        default=DEF_NUM_PROCESSES,
    )
    opt_parser.add_option(
        "--print_cmd",
        dest="print_cmd",
        action="store_true",
        help="Will not run any processes, but print the commands",
        default=False,
    )
    opt_parser.add_option(
        "--check",
        dest="check",
        action="store_true",
        help="Will check which samples are not finished.",
        default=False,
    )
    opt_parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help=(
            "By default, will only run jobs that need to be completed. "
            "This will force to run all jobs."
        ),
        default=False,
    )
    opt_parser.add_option(
        "--run_LSF",
        dest="run_lsf",
        action="store_true",
        help="Will run everything through LSF",
        default=False,
    )

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    # NOTE(review): check_required is not defined in this block; presumably
    # it aborts when the named option was left at its None default.
    for required_flag in ("-d", "-i", "-s", "--jcn_seq_len"):
        opt_parser.check_required(required_flag)

    root_dir = formatDir(options.root_dir)
    input_dir = formatDir(options.input_dir)

    # Change to the root directory to make sure output files are put here
    os.chdir(root_dir)

    samples = options.samples
    jcn_seq_len = options.jcn_seq_len
    # lengthNorm = options.lengthNorm

    print_cmd = options.print_cmd
    check = options.check
    force = options.force

    num_processes = options.num_processes
    run_lsf = options.run_lsf

    chr_list = getChr(input_dir)

    # Suffixes of the per-chromosome files a finished job leaves behind.
    expected_suffixes = (
        "_AS_exclusion_inclusion_counts.txt",
        "_left_intron_counts.txt",
        "_right_intron_counts.txt",
        "_AS_exclusion_inclusion_counts_lenNorm.txt",
        "_left_intron_counts_lenNorm.txt",
        "_right_intron_counts_lenNorm.txt",
    )

    ctr = 0
    for this_chr in chr_list:
        out_prefix = "%s/tmp_createAS_CountTables_%s" % (root_dir, this_chr)
        expected_out_files = [out_prefix + suffix
                              for suffix in expected_suffixes]

        # A chromosome is finished only if EVERY expected file exists and is
        # non-empty.  getsize raises OSError for a missing file; that must
        # mark the chromosome as unfinished even when earlier files in the
        # list were fine, otherwise partial output is mistaken for complete
        # output and the job is never rerun.
        try:
            files_are_present = all(
                os.path.getsize(expect_file) > 0
                for expect_file in expected_out_files
            )
        except OSError:
            files_are_present = False

        if check:
            # Report-only mode: never launch anything.
            if not files_are_present:
                print("Cannot find files for: %s" % this_chr)
            continue

        if files_are_present and not force:
            continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-d %s " % root_dir
        cmd += "-o %s " % out_prefix
        # cmd += "--left_intron tmp_createAS_CountTables_%s_left_intron.out " % this_chr
        # cmd += "--right_intron tmp_createAS_CountTables_%s_right_intron.out " % this_chr
        cmd += "-s %s " % samples
        cmd += "--jcn_seq_len %d " % jcn_seq_len
        # if lengthNorm:
        #     cmd += "--lengthNorm "
        cmd += "--which_chr %s" % this_chr

        if print_cmd:
            print(cmd)
            continue

        if run_lsf:
            runLSF(
                cmd,
                "%s.createAS_CountTables.bsub.out" % this_chr,
                samples.replace(",", "-") + "_" + this_chr,
                "hour",
            )
            continue

        # Local batching: os.system waits for its command to finish, while
        # Popen returns immediately, so every num_processes-th job acts as a
        # pause before more jobs are started.
        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    sys.exit(0)