def merge_transcripts(results):
    """Merge per-library classification results into the final outputs.

    Reads the per-library category statistics, writes a tab-delimited
    classification report that combines each library's intergenic and
    intronic performance files, then merge-sorts the per-library
    expressed/background GTF files (plus the reference GTF) into the
    final expressed and background GTF outputs.

    results: pipeline configuration object providing the attributes
        category_stats_file, classify_dir, library_id_map,
        classify_report_file, ref_gtf_file, expressed_gtf_file,
        background_gtf_file and tmp_dir (exact semantics defined by
        the caller elsewhere in this project).

    Returns 0 on success.
    """
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    library_ids = []
    expressed_gtf_files = []
    background_gtf_files = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        library_ids.append(library_id)
        prefix = os.path.join(results.classify_dir, library_id)
        expressed_gtf_files.append(prefix + ".expr.gtf")
        background_gtf_files.append(prefix + ".bkgd.gtf")
    # map library id -> library name (two-column tab-delimited file)
    library_id_map = {}
    for line in open(results.library_id_map):
        fields = line.strip().split('\t')
        library_id_map[fields[0]] = fields[1]
    # make a classification report
    logging.info("Writing classification report")
    fileh = open(results.classify_report_file, 'w')
    header_fields = ["library_id", "library_name", "category",
                     "train.auc", "test.auc", "train.cutoff",
                     "train.tp", "train.fp", "train.fn", "train.tn",
                     "train.sens", "train.spec", "train.balacc",
                     "test.tp", "test.fp", "test.fn", "test.tn",
                     "test.sens", "test.spec", "test.balacc"]
    print >> fileh, '\t'.join(header_fields)
    for library_id in library_ids:
        prefix = os.path.join(results.classify_dir, library_id)
        library_name = library_id_map[library_id]
        # the intergenic and intronic performance files share the same
        # format, so process both with one loop instead of duplicating
        # the parsing code per category
        for category in ("intergenic", "intronic"):
            perf_file = prefix + ".%s.perf.txt" % (category)
            input_fileh = open(perf_file)
            # skip the header line of the performance file
            input_fileh.next()
            for line in input_fileh:
                fields = ([library_id, library_name, category] +
                          line.strip().split('\t'))
                print >> fileh, '\t'.join(fields)
            input_fileh.close()
    fileh.close()
    # add reference gtf file
    expressed_gtf_files.append(results.ref_gtf_file)
    background_gtf_files.append(results.ref_gtf_file)
    # merge sort gtf files
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expressed_gtf_files, results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(background_gtf_files, results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0
def split_gtf_file(gtf_file, split_dir, ref_gtf_file, category_stats_file,
                   bufsize=(1 << 30)):
    """Split a combined GTF file into one GTF file per library.

    Reference features (attribute REF truthy) are diverted to
    ref_gtf_file; all other features are routed to a per-library file
    in split_dir keyed by the LIBRARY_ID attribute. While splitting,
    per-library category counts and score totals are accumulated from
    'transcript' features and written to category_stats_file.

    gtf_file: path to the combined input GTF
    split_dir: directory receiving the per-library "<library_id>.gtf" files
    ref_gtf_file: output path for reference features
    category_stats_file: output path for the per-library statistics table
    bufsize: in-memory buffer size (bytes) for the file splitter;
             defaults to 1 GiB
    """
    # split input gtf by library and mark test ids
    def keyfunc(myid):
        # map a library id to its output GTF path within split_dir
        return os.path.join(split_dir, "%s.gtf" % (myid))
    bufobj = BufferedFileSplitter(keyfunc, bufsize)
    ref_fileh = open(ref_gtf_file, 'w')
    # defaultdict can call the class directly; no lambda wrapper needed
    stats_dict = collections.defaultdict(CategoryStats)
    logging.info("Splitting transcripts by library")
    for line in open(gtf_file):
        f = GTFFeature.from_string(line)
        is_ref = bool(int(f.attrs[GTFAttr.REF]))
        if is_ref:
            # reference features go to their own file, not a library file
            print >> ref_fileh, str(f)
            continue
        library_id = f.attrs[GTFAttr.LIBRARY_ID]
        # keep statistics (only 'transcript' features carry them)
        if f.feature_type == 'transcript':
            is_test = bool(int(f.attrs[GTFAttr.TEST]))
            if is_test:
                # test transcripts are counted under SAME_STRAND
                category = Category.SAME_STRAND
            else:
                category = int(f.attrs[GTFAttr.CATEGORY])
            score = float(f.attrs[GTFAttr.SCORE])
            statsobj = stats_dict[library_id]
            statsobj.library_id = library_id
            statsobj.counts[category] += 1
            statsobj.signal[category] += score
        # write features from each library to separate files
        bufobj.write(library_id, line)
    # close open file handles
    ref_fileh.close()
    bufobj.close()
    logging.debug("Buffer flushes: %d" % (bufobj.flushes))
    # write library category statistics
    logging.info("Writing category statistics")
    fh = open(category_stats_file, "w")
    print >> fh, '\t'.join(CategoryStats.header_fields())
    for statsobj in stats_dict.itervalues():
        fields = statsobj.to_fields()
        print >> fh, '\t'.join(map(str, fields))
    fh.close()
def split_gtf_file(gtf_file, split_dir, ref_gtf_file, category_stats_file,
                   bufsize=(1 << 30)):
    """Partition a combined GTF file into per-library GTF files.

    Features flagged as reference (REF attribute truthy) are written to
    ref_gtf_file; every other feature is routed by its LIBRARY_ID
    attribute into "<library_id>.gtf" under split_dir. Category counts
    and score totals gathered from 'transcript' features are written to
    category_stats_file when the split completes.
    """
    # route each library id to its own GTF file under split_dir
    path_for_library = lambda myid: os.path.join(split_dir, "%s.gtf" % (myid))
    splitter = BufferedFileSplitter(path_for_library, bufsize)
    ref_out = open(ref_gtf_file, "w")
    per_library_stats = collections.defaultdict(lambda: CategoryStats())
    logging.info("Splitting transcripts by library")
    for raw_line in open(gtf_file):
        feature = GTFFeature.from_string(raw_line)
        # reference features bypass the per-library split entirely
        if bool(int(feature.attrs[GTFAttr.REF])):
            print >> ref_out, str(feature)
            continue
        lib_id = feature.attrs[GTFAttr.LIBRARY_ID]
        # accumulate statistics from 'transcript' features only
        if feature.feature_type == "transcript":
            if bool(int(feature.attrs[GTFAttr.TEST])):
                cat = Category.SAME_STRAND
            else:
                cat = int(feature.attrs[GTFAttr.CATEGORY])
            cat_score = float(feature.attrs[GTFAttr.SCORE])
            stats = per_library_stats[lib_id]
            stats.library_id = lib_id
            stats.counts[cat] += 1
            stats.signal[cat] += cat_score
        # route the raw line to this library's output file
        splitter.write(lib_id, raw_line)
    # flush and release all output handles
    ref_out.close()
    splitter.close()
    logging.debug("Buffer flushes: %d" % (splitter.flushes))
    # emit the per-library statistics table
    logging.info("Writing category statistics")
    stats_out = open(category_stats_file, "w")
    print >> stats_out, "\t".join(CategoryStats.header_fields())
    for stats in per_library_stats.itervalues():
        print >> stats_out, "\t".join(map(str, stats.to_fields()))
    stats_out.close()
def classify_transcripts(results, num_processors):
    """Classify transcripts of every library in parallel.

    Builds one (library_id, classify_dir) task per library found in the
    category statistics file and fans the tasks out to a multiprocessing
    pool running classify_library_transcripts.

    results: pipeline configuration object providing
        category_stats_file and classify_dir
    num_processors: number of worker processes for the pool

    Returns 0 if every library classified successfully, 1 otherwise.
    """
    # read library category statistics
    stats_list = list(CategoryStats.from_file(results.category_stats_file))
    # build one task per library
    tasks = []
    for statsobj in stats_list:
        library_id = statsobj.library_id
        tasks.append((library_id, results.classify_dir))
    # use multiprocessing to parallelize
    pool = multiprocessing.Pool(processes=num_processors)
    result_iter = pool.imap_unordered(classify_library_transcripts, tasks)
    errors = False
    for retcode, library_id in result_iter:
        # non-zero return code means that library failed; record which one
        if retcode != 0:
            logging.error("Classification failed for library %s" %
                          (library_id))
            errors = True
    pool.close()
    pool.join()
    if errors:
        logging.error("Errors occurred during classification")
    return int(errors)
def merge_transcripts(results):
    """Combine per-library classification outputs into the final files.

    Collects the per-library expressed/background GTF paths from the
    category statistics, writes the tab-delimited classification report
    from each library's intergenic and intronic performance files, and
    merge-sorts the expressed and background GTF files (including the
    reference GTF) into the final outputs.

    Returns 0 on success.
    """
    # gather per-library file paths from the category statistics
    all_stats = list(CategoryStats.from_file(results.category_stats_file))
    lib_ids = []
    expr_gtfs = []
    bkgd_gtfs = []
    for stats in all_stats:
        lib_id = stats.library_id
        lib_ids.append(lib_id)
        lib_prefix = os.path.join(results.classify_dir, lib_id)
        expr_gtfs.append(lib_prefix + ".expr.gtf")
        bkgd_gtfs.append(lib_prefix + ".bkgd.gtf")
    # load the library id -> library name mapping
    id_to_name = {}
    for map_line in open(results.library_id_map):
        cols = map_line.strip().split("\t")
        id_to_name[cols[0]] = cols[1]
    # emit the classification report
    logging.info("Writing classification report")
    report_fh = open(results.classify_report_file, "w")
    header = ["library_id", "library_name", "category",
              "train.auc", "test.auc", "train.cutoff",
              "train.tp", "train.fp", "train.fn", "train.tn",
              "train.sens", "train.spec", "train.balacc",
              "test.tp", "test.fp", "test.fn", "test.tn",
              "test.sens", "test.spec", "test.balacc"]
    print >> report_fh, "\t".join(header)
    for lib_id in lib_ids:
        lib_prefix = os.path.join(results.classify_dir, lib_id)
        lib_name = id_to_name[lib_id]
        # both performance files use the same layout, so one loop
        # handles the intergenic and intronic sections in order
        for cat in ("intergenic", "intronic"):
            perf_fh = open(lib_prefix + ".%s.perf.txt" % (cat))
            perf_fh.next()  # discard header row
            for perf_line in perf_fh:
                row = [lib_id, lib_name, cat] + perf_line.strip().split("\t")
                print >> report_fh, "\t".join(row)
            perf_fh.close()
    report_fh.close()
    # the reference GTF participates in both merges
    expr_gtfs.append(results.ref_gtf_file)
    bkgd_gtfs.append(results.ref_gtf_file)
    logging.info("Merging and sorting expressed GTF files")
    merge_sort_gtf_files(expr_gtfs, results.expressed_gtf_file,
                         tmp_dir=results.tmp_dir)
    logging.info("Merging and sorting background GTF files")
    merge_sort_gtf_files(bkgd_gtfs, results.background_gtf_file,
                         tmp_dir=results.tmp_dir)
    return 0