def filter_transcripts(classify_dir, min_prec, min_rec, min_spec,
                       opt_variable, tmp_dir):
    """Filter transcripts per category and merge into expressed/background GTFs.

    classify_dir: directory holding LIB_COUNTS_FILE and the per-category
                  GTF files (paths resolved through CategoryInfo)
    min_prec/min_rec/min_spec: filtering cutoffs forwarded to filter_category
    opt_variable: optimization target forwarded to filter_category
    tmp_dir:      scratch directory used for sorting/merging

    Side effects: writes merged, sorted EXPR_GTF_FILE and BKGD_GTF_FILE
    under classify_dir.
    """
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    # filter each category
    expr_gtf_files = []
    bkgd_gtf_files = []
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        # only libraries with at least one transcript in this category
        # participate in the category's filtering run
        category_lib_counts = [x for x in lib_counts_list
                               if x.category_counts[category_key] > 0]
        library_ids = [x.library_id for x in category_lib_counts]
        cinfo = CategoryInfo.create(library_ids, category_key,
                                    category_str, classify_dir)
        filter_category(category_lib_counts, cinfo, min_prec, min_rec,
                        min_spec, opt_variable, tmp_dir)
        expr_gtf_files.append(cinfo.unann_expr_gtf_file)
        bkgd_gtf_files.append(cinfo.unann_bkgd_gtf_file)
    # only need one set of annotated gtf files, taken from the last
    # category's cinfo (assumes the annotated files are identical across
    # categories — TODO confirm; requires CATEGORIES to be non-empty)
    # BUGFIX: previously BOTH annotated files were extended onto
    # expr_gtf_files, merging annotated *background* transcripts into the
    # *expressed* output; route each annotated file to its matching list.
    expr_gtf_files.append(cinfo.ann_expr_gtf_file)
    bkgd_gtf_files.append(cinfo.ann_bkgd_gtf_file)
    # merge transcripts
    logging.info("Merging filtered transcripts")
    expr_gtf_file = os.path.join(classify_dir, EXPR_GTF_FILE)
    bkgd_gtf_file = os.path.join(classify_dir, BKGD_GTF_FILE)
    merge_sort_gtf_files(expr_gtf_files, expr_gtf_file, tmp_dir=tmp_dir)
    merge_sort_gtf_files(bkgd_gtf_files, bkgd_gtf_file, tmp_dir=tmp_dir)
def classify_transcripts(classify_dir, num_processors, gtf_score_attr, tmp_dir):
    """Run the transcript classifier over every category.

    classify_dir:    directory containing LIB_COUNTS_FILE and the
                     per-category GTF/result files (paths come from
                     CategoryInfo)
    num_processors:  worker count forwarded to classify_category
    gtf_score_attr:  name of the GTF attribute used as the transcript
                     score; a missing attribute defaults to 0.0
    tmp_dir:         scratch directory for classification and sorting

    Side effects: writes one tab-delimited classifier input file per
    library per category, runs the classifier on libraries that have
    transcripts in the category, sorts each category's result tree file,
    and removes the unsorted intermediate.
    """
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    library_ids = [x.library_id for x in lib_counts_list]
    category_info_dict = {}
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        cinfo = CategoryInfo.create(library_ids, category_key,
                                    category_str, classify_dir)
        # remembered for the classification pass below
        category_info_dict[category_key] = cinfo
        # write input files for classifier
        logging.info("Writing classification input files category='%s'" %
                     (cinfo.category_str))
        for transcripts in parse_gtf(open(cinfo.output_gtf_file)):
            for t in transcripts:
                # set transcript score
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
                library_id = t.attrs[GTFAttr.LIBRARY_ID]
                fields = get_classification_fields(t)
                # lookup file handle and open new file if necessary
                if not library_id in cinfo.result_fh_dict:
                    cinfo.result_fh_dict[library_id] = open(
                        cinfo.result_file_dict[library_id], "w")
                    # first write to this library's file: emit header row
                    print >> cinfo.result_fh_dict[library_id], '\t'.join(
                        get_classification_header())
                # write to file
                print >> cinfo.result_fh_dict[library_id], '\t'.join(
                    map(str, fields))
        # close open file handles
        for fh in cinfo.result_fh_dict.itervalues():
            fh.close()
    for category_key, cinfo in category_info_dict.iteritems():
        classify_tasks = []
        for lib_counts in lib_counts_list:
            # see if can run classifier on this file
            if lib_counts.category_counts[category_key] > 0:
                filename = cinfo.result_file_dict[lib_counts.library_id]
                classify_tasks.append((lib_counts.library_id, filename))
        # run classification
        logging.info("Classifying transcripts category='%s'" %
                     (cinfo.category_str))
        classify_category(cinfo, classify_tasks, num_processors, tmp_dir)
        # sort results
        sort_classification_results(cinfo.ctree_file,
                                    cinfo.sorted_ctree_file, tmp_dir)
        # drop the unsorted intermediate once the sorted copy exists
        os.remove(cinfo.ctree_file)
def classify_transcripts(classify_dir, num_processors, gtf_score_attr, tmp_dir):
    """Classify transcripts for each category.

    Builds one tab-delimited classifier input file per library per
    category from the category's GTF, runs the classifier for libraries
    that have transcripts in the category, then sorts the result tree
    file and deletes the unsorted intermediate.

    NOTE(review): this definition duplicates an earlier
    classify_transcripts in this file and shadows it at import time —
    one of the two copies should be removed.
    """
    counts_path = os.path.join(classify_dir, LIB_COUNTS_FILE)
    all_lib_counts = list(LibCounts.from_file(counts_path))
    all_library_ids = [lc.library_id for lc in all_lib_counts]
    cinfo_by_category = {}
    for cat_key in CATEGORIES:
        cat_name = category_int_to_str[cat_key]
        info = CategoryInfo.create(all_library_ids, cat_key, cat_name,
                                   classify_dir)
        cinfo_by_category[cat_key] = info
        # emit classifier input rows, one file per library
        logging.info("Writing classification input files category='%s'" %
                     (info.category_str))
        for locus_transcripts in parse_gtf(open(info.output_gtf_file)):
            for tx in locus_transcripts:
                # score comes from a GTF attribute (0.0 when absent)
                tx.score = float(tx.attrs.get(gtf_score_attr, 0.0))
                lib_id = tx.attrs[GTFAttr.LIBRARY_ID]
                row = get_classification_fields(tx)
                fh = info.result_fh_dict.get(lib_id)
                if fh is None:
                    # first row for this library: open file, write header
                    fh = open(info.result_file_dict[lib_id], "w")
                    info.result_fh_dict[lib_id] = fh
                    print >>fh, '\t'.join(get_classification_header())
                print >>fh, '\t'.join(map(str, row))
        # release every handle opened above
        for fh in info.result_fh_dict.itervalues():
            fh.close()
    for cat_key, info in cinfo_by_category.iteritems():
        # classify only libraries with transcripts in this category
        tasks = []
        for lc in all_lib_counts:
            if lc.category_counts[cat_key] > 0:
                tasks.append((lc.library_id,
                              info.result_file_dict[lc.library_id]))
        logging.info("Classifying transcripts category='%s'" %
                     (info.category_str))
        classify_category(info, tasks, num_processors, tmp_dir)
        # sort the classifier output, then drop the unsorted version
        sort_classification_results(info.ctree_file, info.sorted_ctree_file,
                                    tmp_dir)
        os.remove(info.ctree_file)