def test_parse_tag_attributes_with_usecols():
    """Parse GTF_TEXT keeping only the 'tag' attribute and check its value.

    The resulting 'tag' column must hold exactly one entry, equal to the
    comma-joined string 'cds_end_NF,mRNA_end_NF'.
    """
    gtf_handle = StringIO(GTF_TEXT)
    result = parse_gtf_and_expand_attributes(
        gtf_handle,
        restrict_attribute_columns=["tag"])
    tag_values = result["tag"]
    # length is verified before indexing so a wrong row count is reported
    # as an assertion failure on the count itself
    eq_(len(tag_values), 1)
    eq_(tag_values[0], 'cds_end_NF,mRNA_end_NF')
def test_parse_tag_attributes_with_usecols_other_column():
    """Parse GTF_TEXT keeping only 'exon_id' and check 'tag' is absent.

    Looks up 'tag' with .get() so that a missing column shows up as None
    rather than raising.
    """
    result = parse_gtf_and_expand_attributes(
        StringIO(GTF_TEXT),
        restrict_attribute_columns=["exon_id"])
    failure_message = "Expected 'tag' to get dropped but got %s" % (result,)
    assert result.get("tag") is None, failure_message
def test_parse_gtf_lines_with_expand_attributes():
    """Parse the gtf_text fixture and check column names and column values.

    Verifies the ordered list of result keys, the seqname/start/end values,
    that every score is NaN, and the gene_id/transcript_id values.
    """
    result = parse_gtf_and_expand_attributes(StringIO(gtf_text))

    # names of the attribute keys expected to appear as their own columns
    attribute_columns = [
        "gene_id",
        "gene_name",
        "gene_source",
        "gene_biotype",
        "transcript_id",
        "transcript_name",
        "transcript_source",
    ]
    # excluding 'attribute' column from required names
    expected_columns = REQUIRED_COLUMNS[:8] + attribute_columns

    # Py3's dict.keys() is a distinct collection type, so turn it into a list
    eq_(list(result.keys()), expected_columns)

    # columns are converted to lists because numerical ones may be NumPy arrays
    leading_checks = (
        ("seqname", ["1", "1"]),
        ("start", [11869, 11869]),
        ("end", [14409, 14409]),
    )
    for column, expected in leading_checks:
        eq_(list(result[column]), expected)

    # NaN never compares equal to itself, so use isnan instead of equality
    score_values = list(result["score"])
    assert np.isnan(score_values).all(), "Unexpected scores: %s" % score_values

    trailing_checks = (
        ("gene_id", ["ENSG00000223972", "ENSG00000223972"]),
        ("transcript_id", ["", "ENST00000456328"]),
    )
    for column, expected in trailing_checks:
        eq_(list(result[column]), expected)
# inferred
# Three-line GTF fixture: a '#'-prefixed header line followed by two
# tab-separated records (a stop_codon and an exon, both with seqname 18).
# Each record is written as adjacent string literals, which Python
# concatenates into a single line.
GTF_TEXT = "\n".join((
    "# seqname biotype feature start end score strand frame attribute",
    (
        '18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\t'
        'gene_id "ENSG00000134779"; transcript_id "ENST00000334295"; '
        'exon_number "7"; gene_name "C18orf10"; '
        'transcript_name "C18orf10-201";'
    ),
    (
        '18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\t'
        'gene_id "ENSG00000150477"; transcript_id "ENST00000383055"; '
        'exon_number "1"; gene_name "KIAA1328"; '
        'transcript_name "KIAA1328-202";'
    ),
))

# Module-level fixture: GTF_TEXT run through parse_gtf_and_expand_attributes
# once, when this module is imported, so tests can share the parsed result.
GTF_DATAFRAME = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT))


def test_create_missing_features_identity():
    """Calling create_missing_features with an empty dict keeps the row count.

    Only the length of the returned object is compared against the length
    of GTF_DATAFRAME.
    """
    result = create_missing_features(GTF_DATAFRAME, {})
    rows_before = len(GTF_DATAFRAME)
    rows_after = len(result)
    assert rows_before == rows_after, "GTF DataFrames should be same size"


def _check_expanded_dataframe(df):
    assert "gene" in set(df["feature"]), \
        "Extended GTF should contain gene feature"
    assert "transcript" in set(df["feature"]), \
        "Extended GTF should contain transcript feature"

    C18orf10_201_transcript_mask = ((df["feature"] == "transcript") &
def test_parse_tag_attributes():
    """Parse GTF_TEXT with all attributes expanded and check the 'tag' column.

    Expects a single entry whose value is the comma-joined string
    'cds_end_NF,mRNA_end_NF'.
    """
    result = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT))
    tag_values = result["tag"]
    n_entries = len(tag_values)
    eq_(n_entries, 1)
    first_entry = tag_values[0]
    eq_(first_entry, 'cds_end_NF,mRNA_end_NF')