def main():
    """Run the selected quantification step for every library in the table.

    The command line supplies a configuration XML file, a library table
    and a GTF file (all three must exist), plus the optional ``--mode``,
    ``--dryrun``, ``--keep-tmp`` and ``-o/--output-dir`` switches.  Each
    library whose BAM file exists gets a sub-directory named after its
    library id under the output directory and is passed to
    ``htseq_count`` or ``cufflinks`` according to ``--mode``; libraries
    with a missing BAM file are logged and skipped.

    Returns:
        0 once every library in the table has been visited.
    """
    # logging setup
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.DEBUG, format=log_format)
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")

    # command line definition
    cli = argparse.ArgumentParser()
    cli.add_argument("--mode", dest="mode",
                     choices=['htseq', 'cufflinks'],
                     default='htseq')
    cli.add_argument("--dryrun", dest="dryrun",
                     action="store_true", default=False)
    cli.add_argument("--keep-tmp", dest="keep_tmp",
                     action="store_true", default=None)
    cli.add_argument('-o', '--output-dir', dest="output_dir", default=None)
    cli.add_argument("config_xml_file")
    cli.add_argument('library_table')
    cli.add_argument("gtf_file")
    opts = cli.parse_args()

    # every input file must exist; cli.error() exits on the first miss,
    # so the checks run in the same order as they are listed here
    required_inputs = (
        ("config xml file '%s' not found", opts.config_xml_file),
        ("library table file '%s' not found", opts.library_table),
        ("gtf file '%s' not found", opts.gtf_file),
    )
    for template, path in required_inputs:
        if not os.path.exists(path):
            cli.error(template % (path))
    annotation_path = os.path.abspath(opts.gtf_file)

    # load configuration and apply the command line overrides
    logging.info("Reading configuration file")
    config = Config.from_xml(opts.config_xml_file)
    config.output_dir = (os.getcwd() if opts.output_dir is None
                         else os.path.abspath(opts.output_dir))
    config.keep_tmp = opts.keep_tmp
    config.dryrun = opts.dryrun

    # make sure the output directory is there
    if not os.path.exists(config.output_dir):
        os.makedirs(config.output_dir)

    # walk the library table
    logging.info("Parsing library table")
    processed = 0
    for library in Library.from_file(opts.library_table):
        if os.path.exists(library.bam_file):
            # per-library working directory
            library_dir = os.path.join(config.output_dir, library.library_id)
            if not os.path.exists(library_dir):
                os.makedirs(library_dir)
            if opts.mode == 'htseq':
                htseq_count(library, annotation_path, config)
            elif opts.mode == 'cufflinks':
                cufflinks(library, annotation_path, config)
            processed += 1
        else:
            logging.warning("\t[SKIPPED] Library %s BAM file not found" %
                            (library.library_id))
    logging.info("Found %d libraries" % (processed))
    logging.info("Done")
    return 0
def main():
    """Entry point: hand each usable library to the chosen counting tool.

    Positional arguments are the configuration XML file, the library
    table and the GTF file; the program exits through the argument
    parser's error handler if any of them is missing on disk.  The
    configuration loaded from the XML file has its ``output_dir``,
    ``keep_tmp`` and ``dryrun`` attributes overwritten from the command
    line (the output directory falls back to the current working
    directory).  For every library whose BAM file exists, a directory
    named after the library id is created below the output directory and
    ``htseq_count`` (``--mode htseq``) or ``cufflinks``
    (``--mode cufflinks``) is called.

    Returns:
        Always 0.
    """
    # configure logging and print the banner
    logging.basicConfig(
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        level=logging.DEBUG)
    banner = "AssemblyLine %s" % (assemblyline.__version__)
    logging.info(banner)
    logging.info("----------------------------------")

    # declare and parse the command line
    ap = argparse.ArgumentParser()
    ap.add_argument("--mode", dest="mode", default='htseq',
                    choices=['htseq', 'cufflinks'])
    ap.add_argument("--dryrun", dest="dryrun", default=False,
                    action="store_true")
    ap.add_argument("--keep-tmp", dest="keep_tmp", default=None,
                    action="store_true")
    ap.add_argument('-o', '--output-dir', dest="output_dir", default=None)
    ap.add_argument("config_xml_file")
    ap.add_argument('library_table')
    ap.add_argument("gtf_file")
    ns = ap.parse_args()

    # refuse to continue when an input file is absent
    config_found = os.path.exists(ns.config_xml_file)
    if not config_found:
        ap.error("config xml file '%s' not found" % (ns.config_xml_file))
    table_found = os.path.exists(ns.library_table)
    if not table_found:
        ap.error("library table file '%s' not found" % (ns.library_table))
    gtf_found = os.path.exists(ns.gtf_file)
    if not gtf_found:
        ap.error("gtf file '%s' not found" % (ns.gtf_file))
    gtf_abspath = os.path.abspath(ns.gtf_file)

    # configuration: XML values first, command line on top
    logging.info("Reading configuration file")
    settings = Config.from_xml(ns.config_xml_file)
    if ns.output_dir is None:
        settings.output_dir = os.getcwd()
    else:
        settings.output_dir = os.path.abspath(ns.output_dir)
    settings.keep_tmp = ns.keep_tmp
    settings.dryrun = ns.dryrun

    # output directory
    output_dir_missing = not os.path.exists(settings.output_dir)
    if output_dir_missing:
        os.makedirs(settings.output_dir)

    # libraries
    logging.info("Parsing library table")
    handled = 0
    for entry in Library.from_file(ns.library_table):
        # guard: nothing to do without the alignments
        if not os.path.exists(entry.bam_file):
            skip_message = ("\t[SKIPPED] Library %s BAM file not found" %
                            (entry.library_id))
            logging.warning(skip_message)
            continue
        # directory for this library's results
        entry_dir = os.path.join(settings.output_dir, entry.library_id)
        if not os.path.exists(entry_dir):
            os.makedirs(entry_dir)
        # run the tool picked with --mode
        if ns.mode == 'htseq':
            htseq_count(entry, gtf_abspath, settings)
        elif ns.mode == 'cufflinks':
            cufflinks(entry, gtf_abspath, settings)
        handled = handled + 1
    summary = "Found %d libraries" % (handled)
    logging.info(summary)
    logging.info("Done")
    return 0
def main():
    """Aggregate the reference GTF and all library GTF files into one GTF.

    Steps, in order:

    1. Parse and validate the command line (input files must exist,
       ``--min-transcript-length`` and ``--random-test-frac`` must not be
       negative).
    2. Create the run and tmp directories exposed by
       ``config.AssemblylineResults(output_dir)`` if they are missing.
    3. Read the library table.  Libraries whose GTF file is missing are
       logged and skipped; the remaining ones get new ids ``L1, L2, ...``
       and their samples get ``S1, S2, ...``.  Each ``new_id<TAB>old_id``
       pair is written to the library / sample id-map files.
    4. Open the unsorted temporary GTF, the dropped-transcripts GTF and
       the stats table (with its header line), then pass them to
       ``add_reference_gtf_file`` and, per library, ``filter_transcripts``.
    5. Sort the temporary GTF into ``results.transcripts_gtf_file`` with
       ``sort_gtf`` and delete the temporary file.  If sorting fails the
       partially written output file is removed.

    All files are opened in ``with`` blocks so that every handle,
    including the dropped-transcripts file, is flushed and closed before
    sorting starts and on any error path.

    Returns:
        The return code of ``sort_gtf`` (0 is treated as success).
    """
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    logging.info("AssemblyLine %s", assemblyline.__version__)
    logging.info("----------------------------------")

    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('--min-transcript-length', type=int,
                        dest="min_transcript_length",
                        metavar="N",
                        default=config.MIN_TRANSCRIPT_LENGTH,
                        help="Skip ab initio transcripts equal to or below "
                        "this length [default=%(default)s]")
    parser.add_argument("--gtf-score-attr", dest="gtf_score_attr",
                        default="FPKM", metavar="ATTR",
                        help="GTF attribute field containing transcript "
                        "expression [default='%(default)s']")
    parser.add_argument('-o', '--output-dir', dest="output_dir",
                        default="assemblyline_out",
                        help="directory to store assemblyline results and "
                        "intermediate files [default=%(default)s]")
    parser.add_argument("--random-test-frac", dest="random_test_frac",
                        default=0.1, metavar="FRAC", type=float,
                        help="if no user-defined tests are specified "
                        "using '--tests' randomly designate a fraction "
                        "of reference transcripts as test data for use "
                        "in classification [default=%(default)s]")
    parser.add_argument("--tests", dest="test_file", default=None,
                        help="(optional) text file containing "
                        "reference 'gene_id' attributes "
                        "(one per line) that define test cases "
                        "to use for validation purposes")
    parser.add_argument('ref_gtf_file')
    parser.add_argument('library_table_file')
    args = parser.parse_args()

    # check command line parameters (parser.error() exits the program)
    if not os.path.exists(args.library_table_file):
        parser.error("library table file %s not found" %
                     (args.library_table_file))
    if args.min_transcript_length < 0:
        parser.error("min_transcript_length < 0")
    if not os.path.exists(args.ref_gtf_file):
        parser.error("reference GTF file %s not found" % (args.ref_gtf_file))
    if (args.test_file is not None) and (not os.path.exists(args.test_file)):
        parser.error("test file %s not found" % (args.test_file))
    if args.random_test_frac < 0:
        parser.error("cannot set --random-test-frac < 0")

    # show parameters
    logging.info("Parameters:")
    logging.info("min transcript length: %d", args.min_transcript_length)
    logging.info("gtf score attr: %s", args.gtf_score_attr)
    logging.info("output directory: %s", args.output_dir)
    logging.info("reference GTF file: %s", args.ref_gtf_file)
    logging.info("test file: %s", args.test_file)
    logging.info("library table file: %s", args.library_table_file)
    logging.info("----------------------------------")

    # setup results
    results = config.AssemblylineResults(args.output_dir)

    # create output directories
    if not os.path.exists(results.run_dir):
        logging.debug("Creating output directory '%s'", results.run_dir)
        os.makedirs(results.run_dir)
    if not os.path.exists(results.tmp_dir):
        logging.info("Creating tmp directory '%s'", results.tmp_dir)
        os.makedirs(results.tmp_dir)

    # parse library table, renaming library and sample ids on the way
    logging.info("Parsing library table")
    libraries = []
    sample_id_map = {}
    with open(results.library_id_map, 'w') as library_map_fileh, \
            open(results.sample_id_map, 'w') as sample_map_fileh:
        for library in Library.from_file(args.library_table_file):
            # exclude libraries without a GTF file
            if not os.path.exists(library.gtf_file):
                logging.warning("Library '%s' GTF file not found",
                                library.library_id)
                continue
            # rename library id; numbering follows the accepted libraries
            new_library_id = "L%d" % (len(libraries) + 1)
            library_map_fileh.write(
                '\t'.join([new_library_id, library.library_id]) + '\n')
            library.library_id = new_library_id
            # rename sample id; libraries of one sample share the new id
            old_sample_id = library.sample_id
            if old_sample_id not in sample_id_map:
                new_sample_id = "S%d" % (len(sample_id_map) + 1)
                sample_map_fileh.write(
                    '\t'.join([new_sample_id, old_sample_id]) + '\n')
                sample_id_map[old_sample_id] = new_sample_id
            library.sample_id = sample_id_map[old_sample_id]
            libraries.append(library)

    # read test transcripts (blank lines are not gene ids and are skipped)
    test_gene_ids = set()
    if args.test_file is not None:
        with open(args.test_file) as fileh:
            stripped_lines = (line.strip() for line in fileh)
            test_gene_ids.update(gene_id for gene_id in stripped_lines
                                 if gene_id)
        logging.info("Read %d test genes", len(test_gene_ids))

    # header of the per-library stats table
    header_fields = ['#library_id']
    header_fields.extend(config.TRANSCRIPT_STATS_FIELDS)
    header_fields.extend([("failed_q%d" % x)
                          for x in config.TRANSCRIPT_SCORE_QUANTILES])
    header_fields.extend([("passed_q%d" % x)
                          for x in config.TRANSCRIPT_SCORE_QUANTILES])

    # setup output files; all three are closed when the block is left so
    # the data is on disk before the sort step reads the temporary file
    tmp_file = os.path.join(results.tmp_dir, "transcripts.unsorted.gtf")
    with open(tmp_file, "w") as tmpfileh, \
            open(results.transcripts_dropped_gtf_file, "w") as dropfileh, \
            open(results.transcript_stats_file, 'w') as statsfileh:
        statsfileh.write('\t'.join(header_fields) + '\n')
        # read reference GTF file and aggregate
        logging.info("Adding reference GTF file")
        add_reference_gtf_file(args.ref_gtf_file, test_gene_ids,
                               args.random_test_frac, tmpfileh)
        # add the transcripts of every accepted library
        logging.info("Adding libraries")
        for library in libraries:
            t_dict = read_gtf_file(library, args.gtf_score_attr)
            logging.debug("Read %s transcripts from file %s",
                          len(t_dict), library.gtf_file)
            if not t_dict:
                logging.warning("Library %s has no transcripts",
                                library.library_id)
            else:
                filter_transcripts(library.library_id, t_dict,
                                   tmpfileh, dropfileh, statsfileh,
                                   args.min_transcript_length)

    # sort the aggregated transcripts into the final GTF file
    logging.info("Sorting GTF")
    retcode = sort_gtf(tmp_file, results.transcripts_gtf_file,
                       tmp_dir=results.tmp_dir)
    if retcode != 0:
        logging.error("sort GTF failed")
        # do not leave a partially written result behind
        if os.path.exists(results.transcripts_gtf_file):
            os.remove(results.transcripts_gtf_file)
    os.remove(tmp_file)
    logging.info("Done")
    return retcode