def test_parse_tag_attributes_with_usecols():
    # Expand only the "tag" attribute and check that the single parsed row
    # carries the expected comma-separated tag string.
    gtf_handle = StringIO(GTF_TEXT)
    expanded = parse_gtf_and_expand_attributes(
        gtf_handle,
        restrict_attribute_columns=["tag"])
    column = expanded["tag"]
    eq_(len(column), 1)
    eq_(column[0], 'cds_end_NF,mRNA_end_NF')
def test_parse_tag_attributes_with_usecols_other_column():
    # When attribute expansion is restricted to "exon_id", the result is
    # expected to have no "tag" entry at all.
    gtf_handle = StringIO(GTF_TEXT)
    expanded = parse_gtf_and_expand_attributes(
        gtf_handle,
        restrict_attribute_columns=["exon_id"])
    maybe_tag = expanded.get("tag")
    assert maybe_tag is None, \
        "Expected 'tag' to get dropped but got %s" % (expanded,)
def test_parse_gtf_lines_with_expand_attributes():
    # Parse gtf_text with attribute expansion, then verify the column order
    # and the per-column values of the two parsed rows.
    result = parse_gtf_and_expand_attributes(StringIO(gtf_text))

    # The raw 'attribute' column is excluded from the required names; the
    # expanded attribute keys are expected to follow the first eight columns.
    attribute_names = [
        "gene_id",
        "gene_name",
        "gene_source",
        "gene_biotype",
        "transcript_id",
        "transcript_name",
        "transcript_source",
    ]
    expected_columns = REQUIRED_COLUMNS[:8] + attribute_names

    # convert to list since Py3's dictionary keys are a distinct collection type
    eq_(list(result.keys()), expected_columns)

    # convert to list for comparison since numerical columns may be NumPy arrays
    leading_expectations = [
        ("seqname", ["1", "1"]),
        ("start", [11869, 11869]),
        ("end", [14409, 14409]),
    ]
    for column_name, expected_values in leading_expectations:
        eq_(list(result[column_name]), expected_values)

    # NaN never compares equal to itself, so test the scores with np.isnan
    scores = list(result["score"])
    assert np.isnan(scores).all(), "Unexpected scores: %s" % scores

    trailing_expectations = [
        ("gene_id", ["ENSG00000223972", "ENSG00000223972"]),
        ("transcript_id", ["", "ENST00000456328"]),
    ]
    for column_name, expected_values in trailing_expectations:
        eq_(list(result[column_name]), expected_values)
# inferred GTF_TEXT = "\n".join([ "# seqname biotype feature start end score strand frame attribute", "".join([ """18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\t""", """gene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7";""" """ gene_name "C18orf10";""", """ transcript_name "C18orf10-201";""" ]), "".join([ """18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; """, """transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; """, """transcript_name "KIAA1328-202";""" ]), ]) GTF_DATAFRAME = parse_gtf_and_expand_attributes(StringIO(GTF_TEXT)) def test_create_missing_features_identity(): df_should_be_same = create_missing_features(GTF_DATAFRAME, {}) assert len(GTF_DATAFRAME) == len(df_should_be_same), \ "GTF DataFrames should be same size" def _check_expanded_dataframe(df): assert "gene" in set(df["feature"]), \ "Extended GTF should contain gene feature" assert "transcript" in set(df["feature"]), \ "Extended GTF should contain transcript feature" C18orf10_201_transcript_mask = ((df["feature"] == "transcript") &
def test_parse_tag_attributes():
    # Expand all attributes (no column restriction) and check that the single
    # parsed row carries the expected comma-separated tag string.
    gtf_handle = StringIO(GTF_TEXT)
    expanded = parse_gtf_and_expand_attributes(gtf_handle)
    column = expanded["tag"]
    eq_(len(column), 1)
    eq_(column[0], 'cds_end_NF,mRNA_end_NF')