def filter_transcripts(classify_dir, min_prec, min_rec, min_spec, 
                       opt_variable, tmp_dir):
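    """Apply classification cutoffs to each transcript category and merge
    the surviving transcripts into expressed and background GTF files.

    Per-library category counts are read from LIB_COUNTS_FILE under
    `classify_dir`; filter_category() is run for every category using the
    min_prec/min_rec/min_spec cutoffs and the optimization variable, and
    the per-category results are merged with merge_sort_gtf_files().
    """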
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    # filter each category
    expr_gtf_files = []
    bkgd_gtf_files = []
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        category_lib_counts = [x for x in lib_counts_list
                               if x.category_counts[category_key] > 0]
        library_ids = [x.library_id for x in category_lib_counts]
        cinfo = CategoryInfo.create(library_ids, category_key, 
                                    category_str, classify_dir)
        filter_category(category_lib_counts, cinfo, min_prec, min_rec, 
                        min_spec, opt_variable, tmp_dir)
        expr_gtf_files.append(cinfo.unann_expr_gtf_file)
        bkgd_gtf_files.append(cinfo.unann_bkgd_gtf_file)
    # the annotated gtf files are shared across all categories, so only
    # one copy (from the last category's cinfo) needs to be merged; both
    # annotated files are added to the expressed output
    expr_gtf_files.extend([cinfo.ann_expr_gtf_file,
                           cinfo.ann_bkgd_gtf_file])
    # merge transcripts
    logging.info("Merging filtered transcripts")
    expr_gtf_file = os.path.join(classify_dir, EXPR_GTF_FILE)
    bkgd_gtf_file = os.path.join(classify_dir, BKGD_GTF_FILE)
    merge_sort_gtf_files(expr_gtf_files, expr_gtf_file, tmp_dir=tmp_dir)
    merge_sort_gtf_files(bkgd_gtf_files, bkgd_gtf_file, tmp_dir=tmp_dir)


def classify_transcripts(classify_dir, num_processors, gtf_score_attr,
                         tmp_dir):
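    """Write classifier input files for every library/category and run
    the classifier on each category.

    Transcripts from each category GTF are converted to tab-delimited
    classification records (one file per library), classify_category()
    is run on the libraries that have transcripts in that category, and
    the resulting classification output (ctree files) is sorted.
    """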
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    library_ids = [x.library_id for x in lib_counts_list]
    category_info_dict = {}
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        cinfo = CategoryInfo.create(library_ids, category_key, category_str,
                                    classify_dir)
        category_info_dict[category_key] = cinfo
        # write input files for classifier
        logging.info("Writing classification input files category='%s'" %
                     (cinfo.category_str))
        for transcripts in parse_gtf(open(cinfo.output_gtf_file)):
            for t in transcripts:
                # set transcript score
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
                library_id = t.attrs[GTFAttr.LIBRARY_ID]
                fields = get_classification_fields(t)
                # lookup file handle and open new file if necessary
                if library_id not in cinfo.result_fh_dict:
                    cinfo.result_fh_dict[library_id] = open(
                        cinfo.result_file_dict[library_id], "w")
                    print >> cinfo.result_fh_dict[library_id], '\t'.join(
                        get_classification_header())
                # write to file
                print >> cinfo.result_fh_dict[library_id], '\t'.join(
                    map(str, fields))
        # close open file handles
        for fh in cinfo.result_fh_dict.itervalues():
            fh.close()
    for category_key, cinfo in category_info_dict.iteritems():
        classify_tasks = []
        for lib_counts in lib_counts_list:
            # see if can run classifier on this file
            if lib_counts.category_counts[category_key] > 0:
                filename = cinfo.result_file_dict[lib_counts.library_id]
                classify_tasks.append((lib_counts.library_id, filename))
        # run classification
        logging.info("Classifying transcripts category='%s'" %
                     (cinfo.category_str))
        classify_category(cinfo, classify_tasks, num_processors, tmp_dir)
        # sort results
        sort_classification_results(cinfo.ctree_file, cinfo.sorted_ctree_file,
                                    tmp_dir)
        os.remove(cinfo.ctree_file)
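

# ---------------------------------------------------------------------------
# Hypothetical usage sketch (not part of the original pipeline code): one way
# the two steps above could be driven from the command line, assuming
# classification runs before filtering.  All option names, defaults, and the
# logging setup below are illustrative assumptions, not the project's real
# CLI.
# ---------------------------------------------------------------------------
def _main():
    import argparse
    parser = argparse.ArgumentParser(
        description="classify assembled transcripts and filter by category")
    parser.add_argument("classify_dir",
                        help="directory holding classification inputs/outputs")
    parser.add_argument("--num-processors", dest="num_processors",
                        type=int, default=1)
    parser.add_argument("--gtf-score-attr", dest="gtf_score_attr",
                        default="score",
                        help="GTF attribute holding the transcript score "
                             "(assumed default)")
    parser.add_argument("--min-prec", dest="min_prec", type=float, default=0.0)
    parser.add_argument("--min-rec", dest="min_rec", type=float, default=0.0)
    parser.add_argument("--min-spec", dest="min_spec", type=float, default=0.0)
    parser.add_argument("--opt-variable", dest="opt_variable", default=None,
                        help="statistic optimized when choosing cutoffs")
    parser.add_argument("--tmp-dir", dest="tmp_dir", default=None)
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO,
                        format="%(asctime)s - %(levelname)s - %(message)s")
    # step 1: write per-library classifier inputs and run the classifier
    classify_transcripts(args.classify_dir, args.num_processors,
                         args.gtf_score_attr, args.tmp_dir)
    # step 2: apply cutoffs to each category and merge the filtered GTFs
    filter_transcripts(args.classify_dir, args.min_prec, args.min_rec,
                       args.min_spec, args.opt_variable, args.tmp_dir)


if __name__ == "__main__":
    _main()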