def _load_gtf_as_dataframe(self, usecols=None, features=None):
    """
    Parse this genome source's GTF file and load it as a Pandas DataFrame
    """
    logger.info("Reading GTF from %s", self.gtf_path)
    df = read_gtf(
        self.gtf_path,
        column_converters={
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        },
        infer_biotype_column=True,
        usecols=usecols,
        features=features)

    column_names = set(df.keys())

    expect_gene_feature = features is None or "gene" in features
    expect_transcript_feature = features is None or "transcript" in features
    observed_features = set(df["feature"])

    # older Ensembl releases don't have "gene" or "transcript"
    # features, so fill in those rows if they're missing
    if expect_gene_feature and "gene" not in observed_features:
        # if we have to reconstruct gene feature rows then
        # fill in values for 'gene_name' and 'gene_biotype'
        # but only if they're actually present in the GTF
        logger.info("Creating missing gene features...")
        df = create_missing_features(
            dataframe=df,
            unique_keys={"gene": "gene_id"},
            extra_columns={
                "gene": {
                    "gene_name",
                    "gene_biotype"
                }.intersection(column_names),
            },
            missing_value="")
        logger.info("Done.")

    if expect_transcript_feature and "transcript" not in observed_features:
        logger.info("Creating missing transcript features...")
        df = create_missing_features(
            dataframe=df,
            unique_keys={"transcript": "transcript_id"},
            extra_columns={
                "transcript": {
                    "gene_id",
                    "gene_name",
                    "gene_biotype",
                    "transcript_name",
                    "transcript_biotype",
                    "protein_id",
                }.intersection(column_names)
            },
            missing_value="")
        logger.info("Done.")

    return df
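# Illustrative sketch (not part of the loader above): what the `usecols` and
# `features` arguments buy you when calling gtfparse's read_gtf directly,
# mirroring how _load_gtf_as_dataframe passes them through. The GTF path and
# column list are hypothetical examples; read_gtf with `usecols`/`features`
# is assumed to behave as in the gtfparse versions this loader targets.
def _example_load_gene_rows_only(gtf_path="Homo_sapiens.GRCh37.75.gtf"):
    from gtfparse import read_gtf
    # keep only "gene" rows and a small set of columns instead of parsing
    # every feature and attribute in the file
    return read_gtf(
        gtf_path,
        usecols=["seqname", "start", "end", "strand", "gene_id", "gene_name"],
        features={"gene"})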
def _load_full_dataframe_from_gtf(self):
    """
    Parse this genome source's GTF file and load it as a Pandas DataFrame
    """
    print("Reading GTF from %s" % self.gtf_path)
    df = read_gtf_as_dataframe(
        self.gtf_path,
        column_converters={
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        },
        infer_biotype_column=True)

    features = set(df["feature"])
    column_names = set(df.keys())

    # older Ensembl releases don't have "gene" or "transcript"
    # features, so fill in those rows if they're missing
    if "gene" not in features:
        # if we have to reconstruct gene feature rows then
        # fill in values for 'gene_name' and 'gene_biotype'
        # but only if they're actually present in the GTF
        df = create_missing_features(
            dataframe=df,
            unique_keys={"gene": "gene_id"},
            extra_columns={
                "gene": {
                    "gene_name",
                    "gene_biotype"
                }.intersection(column_names),
            },
            missing_value="")

    if "transcript" not in features:
        df = create_missing_features(
            dataframe=df,
            unique_keys={"transcript": "transcript_id"},
            extra_columns={
                "transcript": {
                    "gene_id",
                    "gene_name",
                    "gene_biotype",
                    "transcript_name",
                    "transcript_biotype",
                    "protein_id",
                }.intersection(column_names)
            },
            missing_value="")

    return df
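# Minimal, self-contained sketch of the reconstruction step both loaders rely
# on: given only transcript/exon rows, create_missing_features synthesizes one
# "gene" row per gene_id. The toy DataFrame and its values are hypothetical;
# create_missing_features is assumed importable from gtfparse, as above.
def _example_reconstruct_missing_gene_rows():
    import pandas as pd
    from gtfparse import create_missing_features

    # two rows describing a single transcript of one gene, with the standard
    # GTF columns plus the attribute-derived columns we want to carry over
    toy_df = pd.DataFrame({
        "seqname": ["17", "17"],
        "source": ["ensembl", "ensembl"],
        "feature": ["transcript", "exon"],
        "start": [100, 100],
        "end": [500, 300],
        "score": [".", "."],
        "strand": ["+", "+"],
        "frame": [".", "."],
        "gene_id": ["ENSG_TOY", "ENSG_TOY"],
        "gene_name": ["TOY1", "TOY1"],
        "transcript_id": ["ENST_TOY", "ENST_TOY"],
    })

    expanded = create_missing_features(
        dataframe=toy_df,
        unique_keys={"gene": "gene_id"},
        extra_columns={"gene": {"gene_name"}},
        missing_value="")

    # a "gene" row is appended, carrying over gene_name and spanning the
    # coordinate range of the rows that share its gene_id
    assert "gene" in set(expanded["feature"])
    return expanded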
def test_create_missing_features():
    assert "gene" not in set(GTF_DATAFRAME["feature"]), \
        "Original GTF should not contain gene feature"
    assert "transcript" not in set(GTF_DATAFRAME["feature"]), \
        "Original GTF should not contain transcript feature"
    df_extra_features = create_missing_features(
        GTF_DATAFRAME,
        unique_keys={
            "gene": "gene_id",
            "transcript": "transcript_id"
        },
        extra_columns={
            "gene": {"gene_name"},
            "transcript": {"gene_id", "gene_name", "transcript_name"},
        })
    _check_expanded_dataframe(df_extra_features)
def test_create_missing_features_identity():
    df_should_be_same = create_missing_features(GTF_DATAFRAME, {})
    assert len(GTF_DATAFRAME) == len(df_should_be_same), \
        "GTF DataFrames should be same size"