def main():
    """Split a GTF file per chromosome, or build the database from the pieces.

    With ``--initialize`` the GTF file given by ``-g`` is split into one
    temporary ``<prefix>_<chr>.gtf`` file per chromosome inside ``--tmp_dir``
    and SCRIPT is run once with ``--initialize`` for the database ``-d``.

    Without ``--initialize`` SCRIPT is launched once for every temporary file
    in ``--tmp_dir`` whose name contains the GTF prefix, either locally or
    through LSF (``--LSF``).

    The function never returns normally: it always ends in ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "--initialize",
        dest="initialize",
        action="store_true",
        help=("Will split up the gtf file into separate temp files and "
              "initalize the database."),
        default=False)
    opt_parser.add_option(
        "--tmp_dir",
        dest="tmp_dir",
        type="string",
        help=("Directory to place temporary files and to look for "
              "temporary files."),
        default=None)
    opt_parser.add_option(
        "--keep_temp",
        dest="keep_temp",
        action="store_true",
        help=("TEMP FILES ARE KEPT FOR NOW. Will keep the temporary gtf "
              "files. Default is to delete them."),
        default=False)
    opt_parser.add_option(
        "-g",
        dest="gtf_file",
        type="string",
        help="GTF annotation file.",
        default=None)
    opt_parser.add_option(
        "--use_gene_name",
        dest="use_gene_name",
        action="store_true",
        help=("By default, the gene_id attribute will be used for the gene "
              "name used in the database, but the gene_name attribute can "
              "be used instead."),
        default=False)
    # May revisit this option, but do not need now
    # opt_parser.add_option("-f",
    #                       dest="genome_file_name",
    #                       type="string",
    #                       help="""Fasta file containing all chromosome
    #                               sequences. If this option is given, exon and
    #                               intron sequences will be stored in the
    #                               database as well. Chromosome names must be the
    #                               same format as in the gtf file.""",
    #                       default=None)
    opt_parser.add_option(
        "-d",
        dest="db_name",
        type="string",
        help="Name of the new database",
        default=None)
    opt_parser.add_option(
        "--sqlite_db_dir",
        dest="sqlite_db_dir",
        type="string",
        help="Location to put sqlite database. Default=%s" % DB_DIR,
        default=DB_DIR)
    opt_parser.add_option(
        "-p",
        dest="num_processes",
        type="int",
        help=("Will run getASEventReadCounts.py simultaneously with this "
              "many samples. Default=%d" % DEF_NUM_PROCESSES),
        default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--LSF",
        dest="run_lsf",
        action="store_true",
        help="Will launch jobs on LSF. \nDefault is running on local.",
        default=False)
    opt_parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help=("By default, will check for the existence of the final "
              "output before running commands. This option will force all "
              "runs."),
        default=False)
    opt_parser.add_option(
        "--check",
        dest="check",
        action="store_true",
        help=("Will check samples that are not done and print out which "
              "need to still be run"),
        default=False)
    opt_parser.add_option(
        "--print_cmd",
        dest="print_cmd",
        action="store_true",
        help=("Will print commands that will be run, but will not run "
              "them. Used for debugging."),
        default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-g")
    opt_parser.check_required("--tmp_dir")
    opt_parser.check_required("-d")

    gtf_file_name = options.gtf_file
    tmp_dir = formatDir(options.tmp_dir)
    db_name = options.db_name
    sqlite_db_dir = options.sqlite_db_dir
    num_processes = options.num_processes
    run_lsf = options.run_lsf
    force = options.force
    check = options.check
    print_cmd = options.print_cmd

    ##############
    # INITIALIZE #
    ##############
    # If initializing: split the gtf file, initialize the database, and exit.
    if options.initialize:
        gtf_file_path = gtf_file_name
        gtf_file_name = gtf_file_name.split("/")[-1]
        gtf_file_prefix = ".".join(gtf_file_name.split(".")[:-1])

        chr2lines = {}
        with open(gtf_file_path) as gtf_file:
            for line in gtf_file:
                # Header/comment lines ("#...") and blank lines have no
                # chromosome column; using them as a key would create a
                # bogus temp file named after the whole line.
                if line.startswith("#") or not line.strip():
                    continue
                this_chr = line.split("\t")[0]
                updateDictOfLists(chr2lines, this_chr, line)

        for this_chr in chr2lines:
            tmp_chr_path = "%s/%s_%s.gtf" % (tmp_dir, gtf_file_prefix,
                                             this_chr)
            with open(tmp_chr_path, "w") as tmp_chr_file:
                tmp_chr_file.writelines(chr2lines[this_chr])

        # Now initialize the database
        # NOTE(review): --sqlite_db_dir is not forwarded here although it is
        # forwarded for the per-chromosome builds below -- confirm intended.
        cmd = "python %s " % SCRIPT
        cmd += "--initialize -d %s" % db_name
        os.system(cmd)
        sys.exit(0)

    ##################
    # BUILD DATABASE #
    ##################
    db = DB(sqlite_db_dir)

    # Use gtf file to figure out temp file names, Build the database from them
    gtf_file_name = gtf_file_name.split("/")[-1]
    gtf_file_prefix = ".".join(gtf_file_name.split(".")[:-1])

    tmp_file_list = []
    for this_file in os.listdir(tmp_dir):
        # Substring match on the prefix; the original GTF itself is skipped
        # in case it lives in the temp directory.
        if gtf_file_prefix in this_file and this_file != gtf_file_name:
            tmp_file_list.append(this_file)

    # Now run script for every chromosome file
    ctr = 0
    for tmp_file in tmp_file_list:
        this_chr = getChr(tmp_dir + "/" + tmp_file)

        if (not force) or check:
            # For now, just checks that records exist in the database, It is
            # better to force since it difficult to really know if a
            # chromosome was built or not.
            chr_built = checkChr(db, db_name, this_chr)
            if chr_built and not force:
                continue

        if check:
            # --check only reports; it never launches a build.
            if not chr_built:
                print("Chromosome %s not built" % this_chr)
            continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-g %s/%s " % (tmp_dir, tmp_file)
        cmd += "-d %s " % db_name
        if options.use_gene_name:
            cmd += "--use_gene_name "
        cmd += "--sqlite_db_dir %s" % sqlite_db_dir

        if print_cmd:
            print(cmd)
            continue

        if run_lsf:
            runLSF(cmd,
                   "%s.build_DB.bsub.out" % this_chr,
                   this_chr + "build_DB",
                   "hour")
            continue

        # Batching: every num_processes-th command runs in the foreground
        # (os.system blocks), the others are started in the background.
        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    # Remove temp files, but first check that exons are returned from the same
    # chromosome in the database
    # if not options.keep_temp:

    sys.exit(0)
def _split_gtf_by_chromosome(gtf_file_path, tmp_dir, gtf_file_prefix):
    """Write one ``<tmp_dir>/<gtf_file_prefix>_<chr>.gtf`` file per chromosome.

    The chromosome is the first tab-separated column of each GTF line.
    Comment/header lines (starting with ``#``) and blank lines are skipped
    because they have no chromosome column.
    """
    chr2lines = {}
    with open(gtf_file_path) as gtf_file:
        for line in gtf_file:
            if not line.strip() or line.startswith("#"):
                continue
            updateDictOfLists(chr2lines, line.split("\t")[0], line)

    for this_chr, chr_lines in chr2lines.items():
        out_path = "%s/%s_%s.gtf" % (tmp_dir, gtf_file_prefix, this_chr)
        with open(out_path, "w") as tmp_chr_file:
            for line in chr_lines:
                tmp_chr_file.write(line)


def main():
    """Command-line entry point: split a GTF file or build the database.

    ``--initialize``: split the ``-g`` GTF file into per-chromosome temp
    files under ``--tmp_dir``, run SCRIPT with ``--initialize -d <db>``,
    and exit.

    Otherwise: for every temp file in ``--tmp_dir`` whose name contains the
    GTF prefix, run SCRIPT to load that chromosome into the database.
    ``--check`` only reports chromosomes that are not built, ``--print_cmd``
    only prints the commands, ``--LSF`` submits them through runLSF, and the
    default runs them locally in batches of ``-p`` processes.

    Always terminates through ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("--initialize",
                          dest="initialize",
                          action="store_true",
                          help="Will split up the gtf file into separate "
                               "temp files and initalize the database.",
                          default=False)
    opt_parser.add_option("--tmp_dir",
                          dest="tmp_dir",
                          type="string",
                          help="Directory to place temporary files and to "
                               "look for temporary files.",
                          default=None)
    opt_parser.add_option("--keep_temp",
                          dest="keep_temp",
                          action="store_true",
                          help="TEMP FILES ARE KEPT FOR NOW. Will keep the "
                               "temporary gtf files. Default is to delete "
                               "them.",
                          default=False)
    opt_parser.add_option("-g",
                          dest="gtf_file",
                          type="string",
                          help="GTF annotation file.",
                          default=None)
    opt_parser.add_option("--use_gene_name",
                          dest="use_gene_name",
                          action="store_true",
                          help="By default, the gene_id attribute will be "
                               "used for the gene name used in the "
                               "database, but the gene_name attribute can "
                               "be used instead.",
                          default=False)
    # May revisit this option, but do not need now
    # opt_parser.add_option("-f",
    #                       dest="genome_file_name",
    #                       type="string",
    #                       help="""Fasta file containing all chromosome
    #                               sequences. If this option is given, exon and
    #                               intron sequences will be stored in the
    #                               database as well. Chromosome names must be the
    #                               same format as in the gtf file.""",
    #                       default=None)
    opt_parser.add_option("-d",
                          dest="db_name",
                          type="string",
                          help="Name of the new database",
                          default=None)
    opt_parser.add_option("--sqlite_db_dir",
                          dest="sqlite_db_dir",
                          type="string",
                          help="Location to put sqlite database. Default=%s"
                               % DB_DIR,
                          default=DB_DIR)
    opt_parser.add_option("-p",
                          dest="num_processes",
                          type="int",
                          help=("Will run getASEventReadCounts.py "
                                "simultaneously with this many samples. "
                                "Default=%d") % DEF_NUM_PROCESSES,
                          default=DEF_NUM_PROCESSES)
    opt_parser.add_option("--LSF",
                          dest="run_lsf",
                          action="store_true",
                          help="Will launch jobs on LSF. \n"
                               "Default is running on local.",
                          default=False)
    opt_parser.add_option("--force",
                          dest="force",
                          action="store_true",
                          help="By default, will check for the existence of "
                               "the final output before running commands. "
                               "This option will force all runs.",
                          default=False)
    opt_parser.add_option("--check",
                          dest="check",
                          action="store_true",
                          help="Will check samples that are not done and "
                               "print out which need to still be run",
                          default=False)
    opt_parser.add_option("--print_cmd",
                          dest="print_cmd",
                          action="store_true",
                          help="Will print commands that will be run, but "
                               "will not run them. Used for debugging.",
                          default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    for required in ("-g", "--tmp_dir", "-d"):
        opt_parser.check_required(required)

    tmp_dir = formatDir(options.tmp_dir)
    db_name = options.db_name
    sqlite_db_dir = options.sqlite_db_dir
    num_processes = options.num_processes
    run_lsf = options.run_lsf
    force = options.force
    check = options.check
    print_cmd = options.print_cmd

    # Both modes need the GTF base name and its name without the extension.
    gtf_file_path = options.gtf_file
    gtf_file_name = gtf_file_path.split("/")[-1]
    gtf_file_prefix = ".".join(gtf_file_name.split(".")[:-1])

    ##############
    # INITIALIZE #
    ##############
    if options.initialize:
        _split_gtf_by_chromosome(gtf_file_path, tmp_dir, gtf_file_prefix)

        # Now initialize the database
        cmd = "python %s " % SCRIPT
        cmd += "--initialize -d %s" % db_name
        os.system(cmd)
        sys.exit(0)

    ##################
    # BUILD DATABASE #
    ##################
    db = DB(sqlite_db_dir)

    # Temp files are recognized by containing the GTF prefix; the original
    # GTF file itself is excluded.
    tmp_file_list = [this_file
                     for this_file in os.listdir(tmp_dir)
                     if gtf_file_prefix in this_file
                     and this_file != gtf_file_name]

    # Now run script for every chromosome file
    ctr = 0
    for tmp_file in tmp_file_list:
        this_chr = getChr(tmp_dir + "/" + tmp_file)

        if (not force) or check:
            # For now, just checks that records exist in the database, It is
            # better to force since it difficult to really know if a
            # chromosome was built or not.
            chr_built = checkChr(db, db_name, this_chr)
            if chr_built and not force:
                continue

        if check:
            # Report-only mode: nothing is launched.
            if not chr_built:
                print("Chromosome %s not built" % this_chr)
            continue

        ctr += 1

        cmd = "python %s " % SCRIPT
        cmd += "-g %s/%s " % (tmp_dir, tmp_file)
        cmd += "-d %s " % db_name
        if options.use_gene_name:
            cmd += "--use_gene_name "
        cmd += "--sqlite_db_dir %s" % sqlite_db_dir

        if print_cmd:
            print(cmd)
            continue

        if run_lsf:
            runLSF(cmd,
                   "%s.build_DB.bsub.out" % this_chr,
                   this_chr + "build_DB",
                   "hour")
            continue

        # Every num_processes-th command blocks (os.system); the rest are
        # started in the background, which bounds the batch size.
        if ctr % num_processes == 0:
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    # Remove temp files, but first check that exons are returned from the same
    # chromosome in the database
    # if not options.keep_temp:

    sys.exit(0)
def main():
    """Launch one Cufflinks run per sample listed in the ``-i`` file.

    Each line of the input file is ``<sample id><TAB><bam path>``.  For every
    sample a ``<out_dir>/<sample id>_cufflinks`` directory is created if
    needed and, unless a non-empty ``transcripts.gtf`` is already there (or
    ``--force`` is given), a Cufflinks command is built and dispatched.

    Dispatch depends on ``--num_processes``: when it is set the commands run
    locally in batches of that size, otherwise they are submitted with
    runLSF.  The ``--LSF`` flag is parsed but not consulted.

    ``--check`` only reports the samples that still need to run;
    ``--print_cmd`` only prints the commands.

    Exits with status 1 when the output directory does not exist, otherwise
    always ends in ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "-i",
        dest="input",
        type="string",
        help=("Tab-delimited file that specifies sample name and bam "
              "location"),
        default=None,
    )
    opt_parser.add_option(
        "--force",
        dest="force",
        action="store_true",
        help=("By default, will only run Cufflinks if no output file "
              "exists. This option forces the runs on every sample."),
        default=False,
    )
    opt_parser.add_option(
        "--txt_ref",
        dest="txt_ref",
        type="string",
        help="Transcript reference used for assembly.",
        default=None,
    )
    opt_parser.add_option(
        "--quantitate",
        dest="quantitate",
        action="store_true",
        help=("Will quantitate against reference transcript annotations "
              "instead of assembly"),
        default=False,
    )
    opt_parser.add_option(
        "--out_dir",
        dest="out_dir",
        type="string",
        help="Root output directory of cufflinks runs",
        default=None,
    )
    opt_parser.add_option(
        "--check",
        dest="check",
        action="store_true",
        help=("Will check samples that are not done and print out which "
              "need to still be run"),
        default=False,
    )
    opt_parser.add_option(
        "--num_processes",
        dest="num_processes",
        type="int",
        help=("If running locally, indicate the number of processes to "
              "batch. \nDef=%d" % DEF_NUM_PROCESSES),
        default=None,
    )
    opt_parser.add_option(
        "--nice",
        dest="nice",
        action="store_true",
        help="If running locally, run using nice",
        default=False,
    )
    opt_parser.add_option(
        "--LSF",
        dest="run_lsf",
        action="store_true",
        help="Run through LSF",
        default=None,
    )
    opt_parser.add_option(
        "--print_cmd",
        dest="print_cmd",
        action="store_true",
        help="Print the commands to run, but do not run",
        default=False,
    )

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-i")
    opt_parser.check_required("--out_dir")
    opt_parser.check_required("--txt_ref")

    out_dir = formatDir(options.out_dir)
    if not os.path.exists(out_dir):
        print("Output directory does not exist: %s" % out_dir)
        opt_parser.print_help()
        sys.exit(1)

    num_processes = options.num_processes
    nice = options.nice
    quantitate = options.quantitate
    print_cmd = options.print_cmd
    force = options.force
    check = options.check

    ctr = 0
    with open(options.input) as sample_file:
        for line in sample_file:
            line = formatLine(line)
            # A blank line cannot be unpacked into (sample, bam); skip it.
            # Presumably formatLine strips the line terminator -- verify.
            if not line:
                continue
            s_id, bam = line.split("\t")

            # Make subdir
            subdir = out_dir + "/" + s_id + "_cufflinks"
            if not os.path.exists(subdir):
                os.mkdir(subdir)

            # Output counts as present only if it exists and is non-empty.
            file_is_present = False
            try:
                if os.path.getsize(subdir + "/transcripts.gtf") == 0:
                    if check:
                        print("Need to run %s" % s_id)
                else:
                    file_is_present = True
            except OSError:
                # File doesn't exist (or cannot be stat'ed)
                if check:
                    print("Need to run %s" % s_id)

            if check:
                continue

            if file_is_present and not force:
                continue

            ctr += 1

            cmd = "%s -o %s " % (CUFF_EXEC, subdir)
            if quantitate:
                cmd += "-G %s " % options.txt_ref
            else:
                cmd += "-g %s " % options.txt_ref
            cmd += "-u %s" % bam

            if num_processes:
                # Local execution
                if nice:
                    cmd = "nice " + cmd

                if print_cmd:
                    print(cmd)
                    continue

                # Every num_processes-th command blocks (os.system); the
                # others are started in the background.
                print(cmd)
                if ctr % num_processes == 0:
                    os.system(cmd)
                else:
                    Popen(cmd, shell=True, executable=SHELL)
            else:
                # LSF submission
                if print_cmd:
                    print(cmd)
                    continue

                tmp_file = "%s/tmp_cuff_%s.txt" % (os.curdir, s_id)
                runLSF(cmd,
                       "%s.cufflinks.bsub.out" % s_id,
                       "cuff_%s" % s_id,
                       "week",
                       tmp_file_name=tmp_file)

    sys.exit(0)
def _has_cufflinks_output(subdir):
    """Return True when ``<subdir>/transcripts.gtf`` exists and is non-empty."""
    try:
        return os.path.getsize(subdir + "/transcripts.gtf") != 0
    except OSError:
        # Missing (or unreadable) file counts as "no output yet".
        return False


def main():
    """Run Cufflinks for every sample in the tab-delimited ``-i`` file.

    Input lines have the form ``<sample id><TAB><bam path>``.  Output for a
    sample goes to ``<out_dir>/<sample id>_cufflinks``, which is created when
    missing.  Samples that already have a non-empty ``transcripts.gtf`` are
    skipped unless ``--force`` is given.

    With ``--num_processes`` the commands run on the local machine in batches
    of that size (optionally prefixed by ``nice``); without it each command
    is handed to runLSF.  ``--check`` prints which samples still need to run
    and launches nothing; ``--print_cmd`` prints the commands instead of
    running them.

    Exits with status 1 if the output directory is missing, else status 0.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="input",
                          type="string",
                          help="Tab-delimited file that specifies sample "
                               "name and bam location",
                          default=None)
    opt_parser.add_option("--force",
                          dest="force",
                          action="store_true",
                          help="By default, will only run Cufflinks if no "
                               "output file exists. This option forces the "
                               "runs on every sample.",
                          default=False)
    opt_parser.add_option("--txt_ref",
                          dest="txt_ref",
                          type="string",
                          help="Transcript reference used for assembly.",
                          default=None)
    opt_parser.add_option("--quantitate",
                          dest="quantitate",
                          action="store_true",
                          help="Will quantitate against reference "
                               "transcript annotations instead of assembly",
                          default=False)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Root output directory of cufflinks runs",
                          default=None)
    opt_parser.add_option("--check",
                          dest="check",
                          action="store_true",
                          help="Will check samples that are not done and "
                               "print out which need to still be run",
                          default=False)
    opt_parser.add_option("--num_processes",
                          dest="num_processes",
                          type="int",
                          help=("If running locally, indicate the number of "
                                "processes to batch. \n"
                                "Def=%d") % DEF_NUM_PROCESSES,
                          default=None)
    opt_parser.add_option("--nice",
                          dest="nice",
                          action="store_true",
                          help="If running locally, run using nice",
                          default=False)
    opt_parser.add_option("--LSF",
                          dest="run_lsf",
                          action="store_true",
                          help="Run through LSF",
                          default=None)
    opt_parser.add_option("--print_cmd",
                          dest="print_cmd",
                          action="store_true",
                          help="Print the commands to run, but do not run",
                          default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    for required in ("-i", "--out_dir", "--txt_ref"):
        opt_parser.check_required(required)

    out_dir = formatDir(options.out_dir)
    if not os.path.exists(out_dir):
        print("Output directory does not exist: %s" % out_dir)
        opt_parser.print_help()
        sys.exit(1)

    num_processes = options.num_processes
    nice = options.nice
    print_cmd = options.print_cmd
    force = options.force
    check = options.check

    # -G quantitates against the reference, -g uses it to guide assembly.
    if options.quantitate:
        ref_flag = "-G"
    else:
        ref_flag = "-g"

    ctr = 0
    with open(options.input) as sample_file:
        for line in sample_file:
            line = formatLine(line)
            # Skip blank lines; they cannot be split into (sample, bam).
            # Presumably formatLine strips the line terminator -- verify.
            if not line:
                continue
            s_id, bam = line.split("\t")

            # Make subdir
            subdir = out_dir + "/" + s_id + "_cufflinks"
            if not os.path.exists(subdir):
                os.mkdir(subdir)

            file_is_present = _has_cufflinks_output(subdir)

            if check:
                # Report-only mode: never launches anything.
                if not file_is_present:
                    print("Need to run %s" % s_id)
                continue

            if file_is_present and not force:
                continue

            ctr += 1

            cmd = "%s -o %s " % (CUFF_EXEC, subdir)
            cmd += "%s %s " % (ref_flag, options.txt_ref)
            cmd += "-u %s" % bam

            # "nice" only applies to local runs.
            if num_processes and nice:
                cmd = "nice " + cmd

            if print_cmd:
                print(cmd)
                continue

            if not num_processes:
                # No local batch size given: submit through LSF.
                tmp_file = "%s/tmp_cuff_%s.txt" % (os.curdir, s_id)
                runLSF(cmd,
                       "%s.cufflinks.bsub.out" % s_id,
                       "cuff_%s" % s_id,
                       "week",
                       tmp_file_name=tmp_file)
                continue

            # Local batching: every num_processes-th command blocks
            # (os.system); the others are started in the background.
            print(cmd)
            if ctr % num_processes == 0:
                os.system(cmd)
            else:
                Popen(cmd, shell=True, executable=SHELL)

    sys.exit(0)
def main():
    """Run SCRIPT once per chromosome, locally in batches or through LSF.

    The process changes into the ``-d`` root directory so that the
    per-chromosome ``tmp_clusterASExons2_*`` output files named on the
    command lines end up there.  The chromosome list comes from
    ``getChr`` applied to the ``-i`` input directory.

    Always ends in ``sys.exit(0)``.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option(
        "-d",
        dest="root_dir",
        type="string",
        help=("Root directory that contains subdirectoires with output "
              "from getASEventReadCounts"),
        default=None)
    opt_parser.add_option(
        "-i",
        dest="input_dir",
        type="string",
        help=("Directory containing original input files to "
              "getASEventReadCounts.py. This is used to obtain the "
              "chromosome information."),
        default=None)
    opt_parser.add_option(
        "-s",
        dest="samples",
        type="string",
        help=("Comma separated list of the samples that will be used. The "
              "order which they are given is the order in the output of "
              "the file."),
        default=None)
    opt_parser.add_option(
        "--lengthNorm",
        dest="lengthNorm",
        action="store_true",
        help=("Flag to indicate length normalization was done on the "
              "counts. Used for splitting the IR counts back into left and "
              "right counts"),
        default=False)
    opt_parser.add_option(
        "--num_processes",
        dest="num_processes",
        type="int",
        help=("Will run each chromosome in batches using this number of "
              "parallel processes. \nDEF=%d" % DEF_NUM_PROCESSES),
        default=DEF_NUM_PROCESSES)
    opt_parser.add_option(
        "--run_LSF",
        dest="run_lsf",
        action="store_true",
        help="Will run everything through LSF",
        default=False)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    for required in ("-d", "-i", "-s"):
        opt_parser.check_required(required)

    root_dir = formatDir(options.root_dir)

    # Change to the root directory to make sure output files are put here
    os.chdir(root_dir)

    input_dir = formatDir(options.input_dir)
    samples = options.samples
    lengthNorm = options.lengthNorm
    num_processes = options.num_processes
    run_lsf = options.run_lsf

    chr_list = getChr(input_dir)

    for ctr, this_chr in enumerate(chr_list, 1):
        # Assemble the command from its space-separated pieces.
        pieces = [
            "python %s" % SCRIPT,
            "-d %s" % root_dir,
            "-o tmp_clusterASExons2_%s.out" % this_chr,
            "--left_intron tmp_clusterASExons2_%s_left_intron.out" % this_chr,
            "--right_intron tmp_clusterASExons2_%s_right_intron.out"
            % this_chr,
            "-s %s" % samples,
        ]
        if lengthNorm:
            pieces.append("--lengthNorm")
        pieces.append("--which_chr %s" % this_chr)
        cmd = " ".join(pieces)

        if run_lsf:
            job_name = samples.replace(",", "-") + "_" + this_chr
            runLSF(cmd,
                   "%s.clusterASExons2.bsub.out" % this_chr,
                   job_name,
                   "hour")
        elif ctr % num_processes == 0:
            # Blocking call closes the current batch.
            os.system(cmd)
        else:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)

    sys.exit(0)
def main():
    """Dispatch one SCRIPT invocation for each chromosome.

    Options are read from the command line; ``-d``, ``-i`` and ``-s`` are
    required.  After changing into the root directory, a command is built
    for every chromosome returned by ``getChr(input_dir)`` and is either
    submitted with runLSF (``--run_LSF``) or executed locally, where every
    ``--num_processes``-th command is run with the blocking ``os.system``
    and the rest are started with ``Popen``.

    Terminates the interpreter with ``sys.exit(0)``.
    """
    # Add Options. Required options should have default=None
    option_table = [
        ("-d", dict(
            dest="root_dir",
            type="string",
            help="Root directory that contains subdirectoires with output "
                 "from getASEventReadCounts",
            default=None)),
        ("-i", dict(
            dest="input_dir",
            type="string",
            help="Directory containing original input files to "
                 "getASEventReadCounts.py. This is used to obtain the "
                 "chromosome information.",
            default=None)),
        ("-s", dict(
            dest="samples",
            type="string",
            help="Comma separated list of the samples that will be used. "
                 "The order which they are given is the order in the "
                 "output of the file.",
            default=None)),
        ("--lengthNorm", dict(
            dest="lengthNorm",
            action="store_true",
            help="Flag to indicate length normalization was done on the "
                 "counts. Used for splitting the IR counts back into left "
                 "and right counts",
            default=False)),
        ("--num_processes", dict(
            dest="num_processes",
            type="int",
            help=("Will run each chromosome in batches using this number "
                  "of parallel processes. \n"
                  "DEF=%d") % DEF_NUM_PROCESSES,
            default=DEF_NUM_PROCESSES)),
        ("--run_LSF", dict(
            dest="run_lsf",
            action="store_true",
            help="Will run everything through LSF",
            default=False)),
    ]

    opt_parser = OptionParser()
    for flag, settings in option_table:
        opt_parser.add_option(flag, **settings)

    (options, args) = opt_parser.parse_args()

    # validate the command line arguments
    opt_parser.check_required("-d")
    opt_parser.check_required("-i")
    opt_parser.check_required("-s")

    root_dir = formatDir(options.root_dir)

    # Change to the root directory to make sure output files are put here
    os.chdir(root_dir)

    input_dir = formatDir(options.input_dir)
    samples = options.samples

    # The optional flag keeps its trailing space so the command text is
    # unchanged whether or not it is present.
    if options.lengthNorm:
        length_norm_flag = "--lengthNorm "
    else:
        length_norm_flag = ""

    launched = 0
    for this_chr in getChr(input_dir):
        launched += 1

        out_stem = "tmp_clusterASExons2_%s" % this_chr
        cmd = "".join([
            "python %s " % SCRIPT,
            "-d %s " % root_dir,
            "-o " + out_stem + ".out ",
            "--left_intron " + out_stem + "_left_intron.out ",
            "--right_intron " + out_stem + "_right_intron.out ",
            "-s %s " % samples,
            length_norm_flag,
            "--which_chr %s" % this_chr,
        ])

        if options.run_lsf:
            runLSF(cmd,
                   "%s.clusterASExons2.bsub.out" % this_chr,
                   samples.replace(",", "-") + "_" + this_chr,
                   "hour")
            continue

        if launched % options.num_processes != 0:
            print(cmd)
            Popen(cmd, shell=True, executable=SHELL)
        else:
            os.system(cmd)

    sys.exit(0)