def test_parse_gtf_lines_without_expand_attributes():
    """Parse the sample lines with ``expand_attribute_column=False``.

    Checks that the result has exactly the required columns, that the
    seqname/start/end values match the two sample records, that every score
    is NaN, and that the raw ``attribute`` column holds two entries.
    """
    columns = parse_gtf_lines(gtf_lines, expand_attribute_column=False)

    # On Python 3, dict.keys() is a view rather than a list, so materialize
    # it before comparing against the expected column names.
    eq_(list(columns.keys()), REQUIRED_COLUMNS)
    eq_(columns["seqname"], ["1", "1"])

    # Numeric columns may be NumPy arrays; compare them as plain lists.
    for column_name, expected_values in (
            ("start", [11869, 11869]),
            ("end", [14409, 14409])):
        eq_(list(columns[column_name]), expected_values)

    # NaN never compares equal to itself, so test with isnan instead of eq_.
    scores = list(columns["score"])
    assert np.isnan(scores).all(), "Unexpected scores: %s" % scores

    assert len(columns["attribute"]) == 2
def test_parse_gtf_lines_with_expand_attributes():
    """Parse the sample lines with ``expand_attribute_column=True``.

    Checks that the ``attribute`` column is replaced by one column per
    attribute key, and that the fixed columns and the expanded ``gene_id`` /
    ``transcript_id`` columns hold the expected values for the two records.
    """
    columns = parse_gtf_lines(gtf_lines, expand_attribute_column=True)

    # The first eight required names are kept; the 'attribute' column is
    # excluded and the individual attribute keys are expected in its place.
    attribute_keys = [
        "gene_id",
        "gene_name",
        "gene_source",
        "gene_biotype",
        "transcript_id",
        "transcript_name",
        "transcript_source",
    ]
    expected_columns = REQUIRED_COLUMNS[:8] + attribute_keys

    # On Python 3, dict.keys() is a view rather than a list, so materialize
    # it before comparing.
    eq_(list(columns.keys()), expected_columns)
    eq_(columns["seqname"], ["1", "1"])

    # Numeric columns may be NumPy arrays; compare them as plain lists.
    for column_name, expected_values in (
            ("start", [11869, 11869]),
            ("end", [14409, 14409])):
        eq_(list(columns[column_name]), expected_values)

    # NaN never compares equal to itself, so test with isnan instead of eq_.
    scores = list(columns["score"])
    assert np.isnan(scores).all(), "Unexpected scores: %s" % scores

    # The first record is expected to have an empty transcript_id.
    for column_name, expected_values in (
            ("gene_id", ["ENSG00000223972", "ENSG00000223972"]),
            ("transcript_id", ["", "ENST00000456328"])):
        eq_(columns[column_name], expected_values)
from gtfparse import create_missing_features, parse_gtf_lines import pandas # two lines from the Ensembl 54 human GTF containing only a stop_codon and # exon features, but from which gene and transcript information could be # inferred GTF_DATA = """ # seqname biotype feature start end score strand frame attribute 18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\tgene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7"; gene_name "C18orf10"; transcript_name "C18orf10-201"; 18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; transcript_name "KIAA1328-202"; """ GTF_LINES = GTF_DATA.split("\n") GTF_DICT = parse_gtf_lines(GTF_LINES) GTF_DATAFRAME = pandas.DataFrame(GTF_DICT) def test_create_missing_features_identity(): df_should_be_same = create_missing_features(GTF_DATAFRAME, {}) assert len(GTF_DATAFRAME) == len(df_should_be_same), \ "GTF DataFrames should be same size" def _check_expanded_dataframe(df): assert "gene" in set(df["feature"]), \ "Extended GTF should contain gene feature" assert "transcript" in set(df["feature"]), \ "Extended GTF should contain transcript feature" C18orf10_201_transcript_mask = ((df["feature"] == "transcript") &
from gtfparse import create_missing_features, parse_gtf_lines
import pandas

# Fixture: two records from the Ensembl 54 human GTF. Only a stop_codon and an
# exon feature are present, but their attributes carry enough information to
# infer the corresponding gene and transcript entries.
GTF_DATA = """
# seqname biotype feature start end score strand frame attribute
18\tprotein_coding\tstop_codon\t32630766\t32630768\t.\t-\t0\tgene_id "ENSG00000134779"; transcript_id "ENST00000334295"; exon_number "7"; gene_name "C18orf10"; transcript_name "C18orf10-201";
18\tprotein_coding\texon\t32663078\t32663157\t.\t+\t.\tgene_id "ENSG00000150477"; transcript_id "ENST00000383055"; exon_number "1"; gene_name "KIAA1328"; transcript_name "KIAA1328-202";
"""

# The fixture is parsed once at import time and shared by the tests below.
GTF_LINES = GTF_DATA.split("\n")
GTF_DICT = parse_gtf_lines(GTF_LINES)
GTF_DATAFRAME = pandas.DataFrame(GTF_DICT)


def test_create_missing_features_identity():
    """With an empty feature mapping the DataFrame keeps its row count."""
    unchanged_df = create_missing_features(GTF_DATAFRAME, {})
    rows_before = len(GTF_DATAFRAME)
    rows_after = len(unchanged_df)
    assert rows_before == rows_after, "GTF DataFrames should be same size"


def _check_expanded_dataframe(df):
    """Assert that ``df`` contains both gene and transcript features.

    Also builds a boolean mask selecting transcript rows whose
    ``transcript_name`` is "C18orf10-201".
    """
    feature_names = set(df["feature"])
    assert "gene" in feature_names, \
        "Extended GTF should contain gene feature"
    assert "transcript" in feature_names, \
        "Extended GTF should contain transcript feature"

    is_transcript = df["feature"] == "transcript"
    is_C18orf10_201 = df["transcript_name"] == "C18orf10-201"
    # NOTE(review): this mask is not used in the visible part of the helper;
    # the name is kept unchanged in case later checks refer to it.
    C18orf10_201_transcript_mask = is_transcript & is_C18orf10_201
def test_parse_gtf_lines_error_too_few_fields():
    """Parsing lines whose tabs were replaced by spaces raises ParsingError."""
    # Replacing every tab with a space removes the tab delimiters; presumably
    # the parser then sees too few fields per line (see the test name).
    space_delimited_lines = [
        gtf_line.replace("\t", " ")
        for gtf_line in gtf_lines
    ]
    # pylint: disable=no-value-for-parameter
    with assert_raises(ParsingError):
        parse_gtf_lines(space_delimited_lines)