def filter_regex(
    self,
    annot: Union[str, pd.DataFrame],
    regex: Optional[str] = ".*",
    invert_match: Optional[bool] = False,
    column: Union[str, int] = 0,
) -> pd.DataFrame:
    """
    Keep (or discard) the rows of an annotation whose column matches a regex.

    Parameters
    ----------
    annot : str or pd.DataFrame
        annotation to filter: "bed", "gtf" or a pandas dataframe
    regex : str, optional
        regular expression to match (default: ".*")
    invert_match : bool, optional
        keep the rows that do NOT match the regex instead
    column : str or int, optional
        name or number of the column to filter on
        (default: 0, the 1st column, i.e. the contig name)

    Returns
    -------
    pd.DataFrame
        the filtered dataframe
    """
    # NOTE(review): the call below is expected to resolve to the helper
    # function of the same name (a different object than this method);
    # confirm against the file's imports.
    dataframe = _parse_annot(self, annot)
    filtered = filter_regex(dataframe, regex, invert_match, column)
    return filtered
def from_attributes(
    self, field, annot: Union[str, pd.DataFrame] = "gtf", check=True
):
    """
    Extract one attribute field of a GTF as a pandas series.

    Parameters
    ----------
    field : str
        field from the GTF's attribute column. It is interpolated into the
        pattern used to search the attribute column.
    annot : str or pd.DataFrame, optional
        any GTF in dataframe format, or the default GTF.
    check : bool, optional
        if True, first restrict the GTF to the rows whose attribute column
        contains `field`.

    Returns
    -------
    pd.Series
        named `field`, indexed like the (filtered) input GTF, holding the
        text found between the double quotes that follow `field`.

    Raises
    ------
    ValueError
        if `check` is True and no row contains `field`.
    """
    gtf = _parse_annot(self, annot)

    if check:
        has_field = gtf["attribute"].str.contains(field)
        gtf = gtf[has_field]
        if len(gtf) == 0:
            raise ValueError(f"{field} not in GTF attributes!")

    # capture the (non-greedy) text between the quotes directly after the field
    values = gtf["attribute"].str.extract(fr'{field} "(.*?)"', expand=False)
    values.name = field
    return values
def map_locations(
    self, annot: Union[str, pd.DataFrame], to: str, drop=True
) -> Union[None, pd.DataFrame]:
    """
    Translate the contig names of an annotation to those of another provider.

    The contig mapping is joined onto the annotation with an inner join,
    so rows whose contig has no counterpart in the mapping are dropped.

    Parameters
    ----------
    annot : str or pd.DataFrame
        annotation to map: "bed", "gtf" or a pandas dataframe.
        Needs a column (or index) named "chrom" or "seqname".
    to : str
        target provider (UCSC, Ensembl or NCBI)
    drop : bool, optional
        if True, replace the chromosome column.
        If False, add a 2nd chromosome column.

    Returns
    -------
    pd.DataFrame or None
        the annotation with mapped contig names, carrying its original
        index if it had a non-default one.
        None if no mapping could be obtained.

    Raises
    ------
    AttributeError
        if the annotation has no readme file.
    ValueError
        if there is no column or index named "chrom" or "seqname".
    """
    if self.readme_file is None:
        raise AttributeError(
            "Can only map genomepy annotations (a readme file is required)"
        )

    # NOTE(review): this call is expected to resolve to the helper function
    # of the same name, not to this method; confirm against the imports.
    parent_dir = os.path.dirname(self.genome_dir)
    contig_map = map_locations(self.name, to, parent_dir)
    if contig_map is None:
        return None

    table = _parse_annot(self, annot)
    original_index = table.index.name
    available_names = [original_index] + table.columns.to_list()
    if {"chrom", "seqname"}.isdisjoint(available_names):
        raise ValueError(
            "Location mapping requires a column named 'chrom' or 'seqname'."
        )

    # anything other than a plain 0..n-1 index is moved into the columns,
    # so it survives the join and can be restored afterwards
    has_custom_index = table.index.to_list() != list(range(table.shape[0]))
    if has_custom_index:
        table = table.reset_index(level=original_index)

    contig_col = "chrom" if "chrom" in table.columns else "seqname"
    merged = contig_map.join(table.set_index(contig_col), how="inner")
    merged = merged.reset_index(drop=drop)
    # whatever ends up first after the join gets the contig column's name
    merged.columns = [contig_col] + merged.columns.to_list()[1:]

    if not has_custom_index:
        return merged
    # an unnamed index was turned into a column called "index" above
    return merged.set_index(original_index if original_index else "index")
def gtf_dict(
    self, key, value, string_values=True, annot: Union[str, pd.DataFrame] = "gtf"
):
    """
    Create a dictionary based on the columns or attribute fields in a GTF.

    `key` and `value` are each looked up as a column name first; when no such
    column exists they are treated as a tag in the "attribute" column. Rows
    in which an attribute tag cannot be found are skipped.

    Attribute tags are matched as whole names directly followed by their own
    quoted value, e.g. asking for "gene" does not pick up the value of
    "gene_id", and a tag with an unquoted value is treated as missing.

    Parameters
    ----------
    key : str
        column name or attribute field (e.g. "seqname", "gene_name")
    value : str
        column name or attribute field (e.g. "gene_id", "transcript_name")
    string_values : bool, optional
        attempt to format the dict values as strings
        (only happens if all value lists are length 1)
    annot : str or pd.DataFrame, optional
        annotation to use: "gtf" or a pandas dataframe

    Returns
    -------
    dict
        unique values per key, as lists. If string_values is True and
        all lists are length 1, the values are the single items instead.
    """
    import re  # function-scope import keeps this block self-contained

    df = _parse_annot(self, annot)
    key_is_column = key in df.columns
    value_is_column = value in df.columns

    # marks "tag not present in this row" (None/NaN may be real column values)
    missing = object()

    def _make_getter(item, is_column):
        """Build the function that turns one zipped cell into `item`'s value."""
        if is_column:
            return lambda cell: cell

        # example attribute: '...; gene_name "TP53"; ...'
        # item="gene_name" yields "TP53". The tag may not be preceded by a
        # word character and must be followed by whitespace (or a colon)
        # and then its quoted value.
        pattern = re.compile(rf'(?<!\w){re.escape(item)}[\s:]+"([^"]*)"')

        def _from_attribute(cell):
            found = pattern.search(cell)
            return found.group(1) if found else missing

        return _from_attribute

    get_key = _make_getter(key, key_is_column)
    get_val = _make_getter(value, value_is_column)

    # pd.DataFrame.iterrows() is slow; zipping two columns is not.
    cells = zip(
        df[key] if key_is_column else df.attribute,
        df[value] if value_is_column else df.attribute,
    )

    # unique values per key
    grouped = {}
    for key_cell, value_cell in cells:
        found_key = get_key(key_cell)
        if found_key is missing:
            continue
        found_value = get_val(value_cell)
        if found_value is missing:
            continue
        grouped.setdefault(found_key, set()).add(found_value)

    # return values as str if all values are length 1
    # and string_values is True, else return values as list
    single_valued = string_values and all(
        len(values) == 1 for values in grouped.values()
    )
    if single_valued:
        return {k: next(iter(values)) for k, values in grouped.items()}
    return {k: list(values) for k, values in grouped.items()}
def _map_genes(
    self,  # noqa
    field: str,
    product: str = "protein",
    annot: Union[str, pd.DataFrame] = "bed",
) -> pd.DataFrame:
    """
    Use mygene.info to map gene identifiers to any specified `field`.

    Returns the dataframe with a remapped "name" column.
    Rows whose identifier could not be mapped are dropped.

    Parameters
    ----------
    field : str
        Identifier for gene annotation. Uses mygene.info to map ids. Valid fields
        are: ensembl.gene, entrezgene, symbol, name, refseq.
        Note that refseq will return the protein refseq_id by default,
        use `product="rna"` to return the RNA refseq_id.
        Currently, mapping to Ensembl transcript ids is not supported.
    product : str, optional
        Either "protein" or "rna". Only used when `field="refseq"`
    annot : str or pd.DataFrame, optional
        Annotation dataframe to map (a pandas dataframe or "bed").
        Must contain a column named "name", which is the column that is mapped.

    Returns
    -------
    pd.DataFrame
        remapped gene annotation with the columns of the input annotation,
        or an empty dataframe if the query yielded nothing.

    Raises
    ------
    AttributeError
        if `self.tax_id` is not an integer.
    ValueError
        if the annotation could not be parsed or lacks a "name" column.
    """
    if not isinstance(self.tax_id, int):
        raise AttributeError(
            "A taxonomy identifier is required. "
            "You can set 'Annotation.tax_id' manually"
        )

    to, product = _parse_mygene_input(field, product)
    annotation = _parse_annot(self, annot)
    if annotation is None:
        raise ValueError("Argument 'annot' must be 'bed' or a pandas dataframe.")

    original_columns = annotation.columns  # restored before returning
    if "name" not in original_columns:
        raise ValueError("Column 'name' is required to map to.")

    # query with the part of each gene ID before the first dot
    # (i.e. without version numbers)
    unversioned = annotation["name"].str.split(r"\.", expand=True)[0]
    annotation = annotation.assign(split_id=unversioned.values)

    # NOTE(review): the query uses the raw `field`, while the result column
    # is looked up via the parsed `to` below; confirm both agree.
    unique_ids = sorted(set(unversioned))
    hits = _filter_query(query_mygene(unique_ids, self.tax_id, field))
    if len(hits) == 0:
        logger.warning("Could not map using mygene.info")
        return pd.DataFrame()
    annotation = annotation.join(hits, on="split_id")

    # Only in case of RefSeq annotation the product needs to be specified.
    target = f"{to}.translation.{product}" if to == "refseq" else to

    # overwrite "name", then get rid of the extra columns from the query
    remapped = annotation.assign(name=annotation[target].values)
    return remapped[original_columns].dropna()