def filter_regex(
        self,
        annot: Union[str, pd.DataFrame],
        regex: str = ".*",
        invert_match: bool = False,
        column: Union[str, int] = 0,
    ) -> pd.DataFrame:
        """
        Filter a dataframe by any column using regex.

        Parameters
        ----------
        annot : str or pd.DataFrame
            annotation to filter: "bed", "gtf" or a pandas dataframe
        regex : str, optional
            regex string to match (default matches everything)
        invert_match : bool, optional
            keep contigs NOT matching the regex string
        column : str or int, optional
            column name or number to filter (default: 1st, contig name)

        Returns
        -------
        pd.DataFrame
            filtered dataframe
        """
        # Optional[...] removed from the annotations: None was never a
        # valid/handled value for regex or invert_match.
        df = _parse_annot(self, annot)
        # delegate to the module-level filter_regex helper
        return filter_regex(df, regex, invert_match, column)
    def from_attributes(self,
                        field,
                        annot: Union[str, pd.DataFrame] = "gtf",
                        check=True):
        """
        Convert the specified GTF attribute field to a pandas series

        Parameters
        ----------
        field : str
            field from the GTF's attribute column.
        annot : str or pd.DataFrame, optional
            any GTF in dataframe format, or the default GTF.
        check : bool, optional
            filter the GTF for rows containing field?

        Returns
        -------
        pd.Series
            with the same index as the input GTF and the field column

        Raises
        ------
        ValueError
            if check is True and no attribute row contains the field
        """
        df = _parse_annot(self, annot)
        if check:
            # regex=False: match the field name literally, so a field
            # containing regex metacharacters cannot accidentally widen
            # the match (the old default treated `field` as a pattern)
            df = df[df["attribute"].str.contains(field, regex=False)]
            if len(df) == 0:
                raise ValueError(f"{field} not in GTF attributes!")

        # extract the text between the quotes, e.g. `gene_name "TP53";`
        # NOTE(review): assumes `field` contains no regex metacharacters
        series = df["attribute"].str.extract(fr'{field} "(.*?)"', expand=False)
        series.name = field
        return series
    def map_locations(self,
                      annot: Union[str, pd.DataFrame],
                      to: str,
                      drop=True) -> Union[None, pd.DataFrame]:
        """
        Map chromosome mapping from one assembly to another.

        Uses the NCBI assembly reports to find contigs.
        Drops missing contigs.

        Parameters
        ----------
        annot : str or pd.DataFrame
            annotation to map: "bed", "gtf" or a pandas dataframe.
        to : str
            target provider (UCSC, Ensembl or NCBI)
        drop : bool, optional
            if True, replace the chromosome column.
            If False, add a 2nd chromosome column.

        Returns
        -------
        pandas.DataFrame or None
            annotation with remapped contig names
            (None if no mapping could be generated).

        Raises
        ------
        AttributeError
            if no readme file is present (required to identify the assembly)
        ValueError
            if the annotation has no 'chrom' or 'seqname' column
        """
        if self.readme_file is None:
            raise AttributeError(
                "Can only map genomepy annotations (a readme file is required)"
            )
        # parent of the genome directory — where all local genomes live
        genomes_dir = os.path.dirname(self.genome_dir)
        # module-level helper; presumably a dataframe indexed on the current
        # contig names (it is joined on the contig index below) — TODO confirm
        mapping = map_locations(self.name, to, genomes_dir)
        if mapping is None:
            return

        df = _parse_annot(self, annot)
        index_name = df.index.name
        # the contig column may be either the index or a regular column
        if not set([index_name] + df.columns.to_list()) & {"chrom", "seqname"}:
            raise ValueError(
                "Location mapping requires a column named 'chrom' or 'seqname'."
            )

        # join mapping on chromosome column and return with original index
        # heuristic: a default RangeIndex (0..n-1) means the df is unindexed
        is_indexed = df.index.to_list() != list(range(df.shape[0]))
        if is_indexed:
            df = df.reset_index(level=index_name)
        index_col = "chrom" if "chrom" in df.columns else "seqname"
        df = df.set_index(index_col)
        # inner join: contigs without a mapping are dropped
        df = mapping.join(df, how="inner")
        df = df.reset_index(drop=drop)
        # first column now holds the mapped names; rename it back to the
        # original contig column name
        df.columns = [index_col] + df.columns.to_list()[1:]
        if is_indexed:
            df = df.set_index(index_name if index_name else "index")

        return df
    def gtf_dict(self,
                 key,
                 value,
                 string_values=True,
                 annot: Union[str, pd.DataFrame] = "gtf"):
        """
        Create a dictionary based on the columns or attribute fields in a GTF.

        Parameters
        ----------
        key : str
            column name or attribute field (e.g. "seqname", "gene_name")
        value : str
            column name or attribute field (e.g. "gene_id", "transcript_name")
        string_values : bool, optional
            attempt to format the dict values as strings
            (only happens if all value lists are length 1)
        annot : str or pd.DataFrame, optional
            annotation to filter: "gtf" or a pandas dataframe

        Returns
        -------
        dict
            with values as lists. If string_values is True
            and all lists are length 1, values will be strings.
        """
        df = _parse_annot(self, annot)
        # renamed from k/v: the old names were shadowed by the loop
        # variables below, which only worked because get_key/get_val
        # were bound before the loop started
        key_is_column = key in df.columns
        value_is_column = value in df.columns
        # pd.DataFrame.iterrows() is slow. this is not.
        attributes = zip(
            df[key] if key_is_column else df.attribute,
            df[value] if value_is_column else df.attribute,
        )

        def _get_attr_item(series, item):
            """
            example attribute string: '...; gene_name "TP53"; ...'
            item="gene_name" would return "TP53"
            """
            split = series.split(item)
            return split[1].split('"')[1]  # IndexError if item is absent

        def _get_col_item(series, _):
            return series

        get_key = _get_col_item if key_is_column else _get_attr_item
        get_val = _get_col_item if value_is_column else _get_attr_item
        a_dict = dict()
        for row in attributes:
            try:
                k = get_key(row[0], key)
                v = get_val(row[1], value)
            except IndexError:
                # this row does not contain the requested attribute field
                continue

            # unique values per key
            if k in a_dict:
                a_dict[k].add(v)
            else:
                a_dict[k] = {v}

        # return values as str if all values are length 1
        # and string_values is True, else return values as list
        all_len_1 = string_values and all(len(v) == 1 for v in a_dict.values())
        for k, v in a_dict.items():
            a_dict[k] = list(v)[0] if all_len_1 else list(v)

        return a_dict
# --- example boundary (non-code scraping artifact removed) ---
def _map_genes(
    self,  # noqa
    field: str,
    product: str = "protein",
    annot: Union[str, pd.DataFrame] = "bed",
) -> pd.DataFrame:
    """
    Use mygene.info to map gene identifiers to any specified `field`.

    Returns the dataframe with remapped "name" column.
    Drops missing identifiers.

    Parameters
    ----------
    field : str
        Identifier for gene annotation. Uses mygene.info to map ids. Valid fields
        are: ensembl.gene, entrezgene, symbol, name, refseq, entrezgene. Note that
        refseq will return the protein refseq_id by default, use `product="rna"` to
        return the RNA refseq_id. Currently, mapping to Ensembl transcript ids is
        not supported.
    product : str, optional
        Either "protein" or "rna". Only used when `field="refseq"`
    annot : str or pd.DataFrame, optional
        Annotation dataframe to map (a pandas dataframe or "bed").
        Is mapped to a column named "name" (required).

    Returns
    -------
    pandas.DataFrame
        remapped gene annotation (empty if nothing could be mapped)

    Raises
    ------
    AttributeError
        if self.tax_id is not an integer
    ValueError
        if annot is not "bed"/a dataframe, or lacks a "name" column
    """
    if not isinstance(self.tax_id, int):
        raise AttributeError("A taxonomy identifier is required. "
                             "You can set 'Annotation.tax_id' manually")
    to, product = _parse_mygene_input(field, product)
    df = _parse_annot(self, annot)
    if df is None:
        raise ValueError(
            "Argument 'annot' must be 'bed' or a pandas dataframe.")

    cols = df.columns  # starting columns
    if "name" not in cols:
        raise ValueError("Column 'name' is required to map to.")

    # remove version numbers from gene IDs (e.g. "ENSG01.2" -> "ENSG01")
    split_id = df["name"].str.split(r"\.", expand=True)[0]
    df = df.assign(split_id=split_id.values)
    genes = sorted(set(split_id))

    result = query_mygene(genes, self.tax_id, field)
    result = _filter_query(result)
    if len(result) == 0:
        logger.warning("Could not map using mygene.info")
        return pd.DataFrame()
    # join the query results on the version-stripped gene IDs
    df = df.join(result, on="split_id")

    # Only in case of RefSeq annotation the product needs to be specified.
    if to == "refseq":
        to = f"{to}.translation.{product}"

    # Get rid of extra columns from query
    df = df.assign(name=df[to].values)  # df["name"] = df[to]
    df = df[cols].dropna()
    return df