def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Build a transcript ID -> FPKM dictionary from a StringTie GTF file.

    The file is parsed with `gtfparse.read_gtf_as_dict`, asking for the FPKM
    column to be converted to float. Only rows whose feature column equals
    "transcript" and whose transcript ID is neither None nor empty are kept.
    If the same ID occurs on several kept rows, the last row wins.

    Parameters
    ----------
    gtf_path : path of the GTF file to read
    transcript_id_column_name : column holding the reference transcript IDs
        (Ensembl IDs, per the original documentation of this function)
    fpkm_column_name : column holding the abundance values
    feature_column_name : column holding the feature type of each row

    Returns a dict mapping transcript ID to a float FPKM value.
    """
    parsed = gtfparse.read_gtf_as_dict(
        gtf_path,
        column_converters={fpkm_column_name: float})

    # Column lookup is delegated to a helper defined elsewhere in the file;
    # presumably it reports a missing column -- TODO confirm.
    ids = _get_gtf_column(transcript_id_column_name, gtf_path, parsed)
    abundances = _get_gtf_column(fpkm_column_name, gtf_path, parsed)
    kinds = _get_gtf_column(feature_column_name, gtf_path, parsed)

    row_count = len(ids)
    logging.info("Loaded %d rows from %s" % (row_count, gtf_path))
    transcript_count = sum(kind == "transcript" for kind in kinds)
    logging.info("Found %s transcript entries" % transcript_count)

    def _is_usable(transcript_id, kind):
        # Same test order as always: ID present, ID non-empty, then feature.
        return (
            transcript_id is not None
            and len(transcript_id) > 0
            and kind == "transcript")

    fpkm_by_transcript = {
        transcript_id: float(fpkm)
        for transcript_id, fpkm, kind in zip(ids, abundances, kinds)
        if _is_usable(transcript_id, kind)
    }

    kept_count = len(fpkm_by_transcript)
    logging.info(
        "Keeping %d transcript rows with reference IDs" % (kept_count,))
    return fpkm_by_transcript
def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Read transcript-level abundances from a GTF file produced by StringTie.

    Parsing is done by `gtfparse.read_gtf_as_dict`, with the FPKM column
    converted to float. A row contributes to the result only when its
    transcript ID is not None, is non-empty, and its feature column is
    exactly "transcript". Later rows overwrite earlier ones with the same ID.

    Parameters
    ----------
    gtf_path : location of the GTF file
    transcript_id_column_name : name of the column with reference
        transcript IDs (documented by the original author as Ensembl IDs)
    fpkm_column_name : name of the column with FPKM values
    feature_column_name : name of the column with each row's feature type

    Returns a dictionary from transcript ID to FPKM (float).
    """
    gtf_columns = gtfparse.read_gtf_as_dict(
        gtf_path,
        column_converters={fpkm_column_name: float})

    # `_get_gtf_column` lives elsewhere in this module; it is handed the
    # path as well, presumably for error reporting -- TODO confirm.
    id_column = _get_gtf_column(
        transcript_id_column_name, gtf_path, gtf_columns)
    fpkm_column = _get_gtf_column(fpkm_column_name, gtf_path, gtf_columns)
    feature_column = _get_gtf_column(
        feature_column_name, gtf_path, gtf_columns)

    logging.info("Loaded %d rows from %s" % (len(id_column), gtf_path))
    logging.info("Found %s transcript entries" % sum(
        entry == "transcript" for entry in feature_column))

    transcript_to_fpkm = {}
    for ref_id, fpkm, entry in zip(id_column, fpkm_column, feature_column):
        # Skip rows that carry no usable reference ID.
        if ref_id is None or len(ref_id) == 0:
            continue
        if entry == "transcript":
            transcript_to_fpkm[ref_id] = float(fpkm)

    logging.info("Keeping %d transcript rows with reference IDs" % (
        len(transcript_to_fpkm),))
    return transcript_to_fpkm
def test_read_refseq_gtf_as_dict():
    """Parse the GTF at REFSEQ_GTF_PATH and check its required columns."""
    parsed_columns = read_gtf_as_dict(REFSEQ_GTF_PATH)
    # The actual assertions live in the shared helper.
    _check_required_columns(parsed_columns)
def test_read_string_gtf_as_dict_float_values():
    """
    Parse the GTF at B16_GTF_PATH with float converters for the "cov" and
    "FPKM" columns, then run the required-column and float-value checks.
    """
    float_converters = {"cov": float, "FPKM": float}
    parsed_columns = read_gtf_as_dict(
        B16_GTF_PATH,
        column_converters=float_converters)
    _check_required_columns(parsed_columns)
    # Presumably verifies that the converted columns hold floats; the helper
    # is defined elsewhere -- TODO confirm.
    _check_float_cov_and_FPKM(parsed_columns)
def test_read_string_gtf_as_dict():
    """
    Parse the GTF at B16_GTF_PATH without any column converters, then run
    the required-column and string-value checks.
    """
    parsed_columns = read_gtf_as_dict(B16_GTF_PATH)
    _check_required_columns(parsed_columns)
    # Presumably verifies that "cov" and "FPKM" were left as strings; the
    # helper is defined elsewhere -- TODO confirm.
    _check_string_cov_and_FPKM(parsed_columns)