Example #1
0
def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Build a {transcript ID: FPKM} dictionary from a StringTie GTF file
    holding transcript-level abundance estimates.

    Rows are kept only when their feature is "transcript" and their
    transcript ID (expected to be an Ensembl ID) is neither None nor
    empty. Progress is reported through `logging.info`.
    """
    parsed = gtfparse.read_gtf_as_dict(
        gtf_path, column_converters={fpkm_column_name: float})

    def fetch(column_name):
        # All three lookups share the same path and parsed columns.
        return _get_gtf_column(column_name, gtf_path, parsed)

    ids = fetch(transcript_id_column_name)
    abundances = fetch(fpkm_column_name)
    kinds = fetch(feature_column_name)

    logging.info("Loaded %d rows from %s" % (len(ids), gtf_path))
    n_transcripts = sum(kind == "transcript" for kind in kinds)
    logging.info("Found %s transcript entries" % n_transcripts)

    fpkm_by_id = {}
    for tid, fpkm, kind in zip(ids, abundances, kinds):
        # Skip rows that carry no usable transcript identifier.
        if tid is None or len(tid) == 0:
            continue
        if kind == "transcript":
            fpkm_by_id[tid] = float(fpkm)

    logging.info("Keeping %d transcript rows with reference IDs" %
                 (len(fpkm_by_id), ))
    return fpkm_by_id
Example #2
0
def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Read transcript-level abundance estimates from a GTF file produced
    by StringTie and return them as a dictionary whose keys are
    transcript IDs (expected to be Ensembl IDs) and whose values are
    FPKM floats.

    Only "transcript" feature rows with a non-None, non-empty transcript
    ID contribute to the result.
    """
    def is_usable_row(row_id, row_feature):
        # A row counts only if it names a transcript and has an ID.
        return (row_id is not None
                and len(row_id) > 0
                and row_feature == "transcript")

    gtf_columns = gtfparse.read_gtf_as_dict(
        gtf_path,
        column_converters={fpkm_column_name: float})

    # Columns are fetched in the order: IDs, FPKM values, features.
    wanted_columns = (
        transcript_id_column_name,
        fpkm_column_name,
        feature_column_name,
    )
    id_column, fpkm_column, feature_column = [
        _get_gtf_column(wanted, gtf_path, gtf_columns)
        for wanted in wanted_columns
    ]

    row_count = len(id_column)
    logging.info("Loaded %d rows from %s" % (row_count, gtf_path))

    transcript_count = sum(
        entry == "transcript" for entry in feature_column)
    logging.info("Found %s transcript entries" % transcript_count)

    rows = zip(id_column, fpkm_column, feature_column)
    fpkm_lookup = {
        row_id: float(row_fpkm)
        for (row_id, row_fpkm, row_feature) in rows
        if is_usable_row(row_id, row_feature)
    }

    kept_count = len(fpkm_lookup)
    logging.info(
        "Keeping %d transcript rows with reference IDs" % (kept_count,))
    return fpkm_lookup
Example #3
0
def test_read_refseq_gtf_as_dict():
    """Parse the RefSeq GTF fixture and check the required columns."""
    _check_required_columns(read_gtf_as_dict(REFSEQ_GTF_PATH))
def test_read_string_gtf_as_dict_float_values():
    """
    Parse the B16 GTF fixture with float converters on "cov" and "FPKM",
    then run the required-column and float-value checks on the result.
    """
    float_converters = {"cov": float, "FPKM": float}
    parsed = read_gtf_as_dict(
        B16_GTF_PATH, column_converters=float_converters)
    _check_required_columns(parsed)
    _check_float_cov_and_FPKM(parsed)
def test_read_string_gtf_as_dict():
    """
    Parse the B16 GTF fixture without converters, then run the
    required-column and string-value checks on the result.
    """
    parsed = read_gtf_as_dict(B16_GTF_PATH)
    _check_required_columns(parsed)
    _check_string_cov_and_FPKM(parsed)
Example #6
0
def test_read_refseq_gtf_as_dict():
    """Load the RefSeq GTF fixture as a dict and check required columns."""
    refseq_columns = read_gtf_as_dict(REFSEQ_GTF_PATH)
    _check_required_columns(refseq_columns)