Example #1
def combine_group_rows_on_char(df, group_on, combine_cols=None, char='|'):
    """
    Performs a groupby on a dataframe, then collapses each group into a single row with values joined by the character `char`.

    Primarily supports grouping on columns; other methods have not been tested.

    :param df:  The dataframe to group
    :param group_on: the column name or list of column names to group by
    :param combine_cols: a list of column names whose values will be joined with `char`; if None, all columns
        with multiple values per group are combined. Providing only the columns of interest can save computation time.
    :param char: the character to join the values with. Defaults to `|`.

    :return: Dataframe with one row per group, with values from the grouped rows joined by the given character.

    """
    col_order = df.columns

    if isinstance(group_on, (str, int, float)):
        group_on = [group_on]

    grouped = df.groupby(group_on)

    if combine_cols is None:
        combine_cols = find_cols_with_multi_values(grouped)

    out_df = grouped.first()
    for col in tqdm(combine_cols, desc='total_progress'):
        tqdm.pandas(desc=col)
        out_df[col] = grouped[col].progress_apply(char_combine_col, char=char)

    return out_df.reset_index()[col_order]
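A minimal, self-contained sketch of the same groupby-then-join idea on a toy frame, without the helper functions (find_cols_with_multi_values, char_combine_col) that the snippet above assumes; the column names and data here are made up for illustration.

import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc="joining rows")

df = pd.DataFrame({
    "order_id": [1, 1, 2],
    "item": ["apple", "pear", "milk"],
})

# collapse each group to one row, joining the string values with "|"
out = (
    df.groupby("order_id")["item"]
    .progress_apply(lambda s: "|".join(s.astype(str)))
    .reset_index()
)
print(out)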
Example #2
def create_farm_id_translate_table(
        df: pd.DataFrame,
        columns=["kommunenr", "gaardsnummer", "bruksnummer", "festenummer"]):
    """
    For a dataframe with farmers, each identified by kommunenr, gårdsnummer, bruksnummer, and festenummer,
    returns a new dataframe with updated ids for every farmer, queried against the geonorge commune reform API.

    Settle in, this could take a while...
    """

    from tqdm.autonotebook import tqdm

    old_farms = df[columns].copy()  # explicit copy before renaming columns below

    tqdm.pandas(desc="Creating translate table...", ncols=100)

    def apply_func(farm):
        return get_updated_commune_and_farm_id(*farm)

    new_farms = old_farms.progress_apply(apply_func,
                                         axis=1,
                                         result_type="broadcast")

    new_farms.columns = list(map(lambda c: c + "_new", columns))
    old_farms.columns = list(map(lambda c: c + "_old", columns))

    return old_farms.merge(new_farms, left_index=True, right_index=True)
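A small sketch of the row-wise progress_apply(..., result_type="broadcast") pattern used above, with a toy stand-in for get_updated_commune_and_farm_id (which is not shown here); the data below is invented.

import pandas as pd
from tqdm import tqdm

tqdm.pandas(desc="Creating translate table...", ncols=100)

old = pd.DataFrame({"kommunenr": [101, 102], "gaardsnummer": [1, 2]})

def fake_update(row):
    # hypothetical stand-in: just offsets both ids
    return [row["kommunenr"] + 1000, row["gaardsnummer"] + 1]

# result_type="broadcast" keeps the original index and column labels
new = old.progress_apply(fake_update, axis=1, result_type="broadcast")
print(old.join(new, rsuffix="_new"))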
Example #3
    def _dask_apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, *args, **kwds):
        samp = self._obj.iloc[: self._npartitions * 2, :]
        meta = samp.apply(
            func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
        )
        try:
            if broadcast:
                result_type = "broadcast"
            elif reduce:
                result_type = "reduce"

            tmp_df = (
                dd.from_pandas(samp, npartitions=self._npartitions)
                .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                .compute(scheduler=self._scheduler)
            )
            assert tmp_df.equals(meta)
            if self._progress_bar:
                with TQDMDaskProgressBar(desc="Dask Apply"):
                    return (
                        dd.from_pandas(self._obj, npartitions=self._npartitions)
                        .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                        .compute(scheduler=self._scheduler)
                    )
            else:
                return (
                    dd.from_pandas(self._obj, npartitions=self._npartitions)
                    .apply(func, *args, axis=axis, raw=raw, result_type=result_type, meta=meta, **kwds)
                    .compute(scheduler=self._scheduler)
                )
        except (AssertionError, AttributeError, ValueError, TypeError) as e:
            if self._progress_bar:
                tqdm.pandas(desc="Pandas Apply")
                return self._obj.progress_apply(
                    func,
                    axis=axis,
                    broadcast=broadcast,
                    raw=raw,
                    reduce=reduce,
                    result_type=result_type,
                    args=args,
                    **kwds
                )
            else:
                return self._obj.apply(
                    func,
                    axis=axis,
                    broadcast=broadcast,
                    raw=raw,
                    reduce=reduce,
                    result_type=result_type,
                    args=args,
                    **kwds
                )
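TQDMDaskProgressBar appears to be a swifter-internal helper; a rough standalone sketch of the same dask fallback idea, using dask's own diagnostics bar and a toy function, might look like this (partition count and scheduler are arbitrary choices).

import dask.dataframe as dd
import pandas as pd
from dask.diagnostics import ProgressBar
from tqdm import tqdm

df = pd.DataFrame({"a": range(1000), "b": range(1000)})

def add_cols(row):
    return row["a"] + row["b"]

# compute meta on a small sample so dask knows the output schema
meta = df.iloc[:4].apply(add_cols, axis=1)

try:
    with ProgressBar():  # dask's own progress bar, not tqdm's
        result = (
            dd.from_pandas(df, npartitions=4)
            .apply(add_cols, axis=1, meta=meta)
            .compute(scheduler="threads")
        )
except Exception:
    # fall back to a plain pandas apply with a tqdm bar
    tqdm.pandas(desc="Pandas Apply")
    result = df.progress_apply(add_cols, axis=1)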
Example #4
    def apply(self, func, convert_dtype=True, args=(), **kwds):
        """
        Apply the function to the Series using swifter
        """
        samp = self._obj.iloc[:self._npartitions * 2]
        # check if input is string or if the user is overriding the string processing default
        allow_dask_processing = True if self._allow_dask_on_strings else (
            samp.dtype != "object")

        if "axis" in kwds.keys():
            kwds.pop("axis")
            warnings.warn(
                "Axis keyword not necessary because applying on a Series.")

        try:  # try to vectorize
            tmp_df = func(samp, *args, **kwds)
            assert samp.apply(func,
                              convert_dtype=convert_dtype,
                              args=args,
                              **kwds).equals(tmp_df)
            return func(self._obj, *args, **kwds)
        except (
                AssertionError,
                AttributeError,
                ValueError,
                TypeError,
                TypingError,
        ):  # if can't vectorize, estimate time to pandas apply
            wrapped = self._wrapped_apply(func,
                                          convert_dtype=convert_dtype,
                                          args=args,
                                          **kwds)
            n_repeats = 3
            timed = timeit.timeit(wrapped, number=n_repeats)
            samp_proc_est = timed / n_repeats
            est_apply_duration = samp_proc_est / self._SAMP_SIZE * self._obj.shape[
                0]

            # if pandas apply takes too long and not performing str processing, use dask
            if (est_apply_duration >
                    self._dask_threshold) and allow_dask_processing:
                return self._dask_apply(func, convert_dtype, *args, **kwds)
            else:  # use pandas
                if self._progress_bar:
                    tqdm.pandas(desc="Pandas Apply")
                    return self._obj.progress_apply(
                        func, convert_dtype=convert_dtype, args=args, **kwds)
                else:
                    return self._obj.apply(func,
                                           convert_dtype=convert_dtype,
                                           args=args,
                                           **kwds)
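The decision logic above (try a vectorized call, check it against a sampled apply, otherwise time the sample before falling back to a progress-bar apply) can be sketched in isolation roughly as follows; the sample size and time threshold are arbitrary.

import timeit

import numpy as np
import pandas as pd
from tqdm import tqdm

s = pd.Series(np.random.rand(10_000))

def transform(x):
    return x ** 2 + 1

samp = s.iloc[:100]
try:
    # vectorized path: call the function on the whole sample at once
    vec = transform(samp)
    assert samp.apply(transform).equals(vec)
    result = transform(s)
except (AssertionError, AttributeError, ValueError, TypeError):
    # element-wise path: time the sampled apply to estimate the full cost
    per_run = timeit.timeit(lambda: samp.apply(transform), number=3) / 3
    est_duration = per_run / len(samp) * len(s)
    if est_duration > 1.0:  # arbitrary threshold in seconds
        print(f"estimated {est_duration:.1f}s; a dask fallback may be worth it")
    tqdm.pandas(desc="Pandas Apply")
    result = s.progress_apply(transform)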
Example #5
def _apply_func(iterable, func, tqdm_obj=None):
    """
    Applies a function to an iterable without mutating it.
    """
    if isinstance(iterable, (pd.DataFrame, pd.Series)):
        tqdm.pandas()
        return iterable.progress_apply(func)
    else:
        def update(*args):
            tqdm_obj.update()
            return func(*args)

        return map(update, iterable)
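A short usage sketch for both branches of _apply_func as defined above; note that for plain iterables a tqdm_obj must be supplied, since tqdm_obj.update() would otherwise fail on None, and the returned map must be consumed for the bar to advance.

import pandas as pd
from tqdm import tqdm

# pandas branch: _apply_func patches progress_apply itself
squares = _apply_func(pd.Series([1, 2, 3]), lambda x: x ** 2)

# plain-iterable branch: pass an explicit tqdm object and drain the map
data = [1, 2, 3]
with tqdm(total=len(data)) as bar:
    doubled = list(_apply_func(data, lambda x: x * 2, tqdm_obj=bar))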
Example #6
    def apply(self, func, *args, **kwds):
        """
        Apply the function to the transformed swifter object
        """
        # estimate time to pandas apply
        wrapped = self._wrapped_apply(func, *args, **kwds)
        n_repeats = 3
        timed = timeit.timeit(wrapped, number=n_repeats)
        samp_proc_est = timed / n_repeats
        est_apply_duration = samp_proc_est / self._SAMP_SIZE * self._nrows

        # if pandas apply takes too long, use dask
        if est_apply_duration > self._dask_threshold:
            return self._dask_apply(func, *args, **kwds)
        else:  # use pandas
            if self._progress_bar:
                tqdm.pandas(desc="Pandas Apply")
                return self._obj_pd.apply(func, *args, **kwds)
            else:
                return self._obj_pd.apply(func, *args, **kwds)
Example #7
def assemble_library(
    spacers: pd.DataFrame,
    on_target_score_threshold: int = 0,
    off_target_score_threshold: int = 0,
    spacers_per_feature: int = 6,
) -> pd.DataFrame:
    """Creates a final list of protospacers for synthesis

    Parameters
    ----------
    spacers : :class:`~pd.DataFrame`
        Dataframe with all spacers found by :module:`~find_spacers.find_spacers`,
        scores added by :module:`~on_target_scoring.on_target_scoring` and
        :module:`~off_target_scoring.off_target_scoring`
    on_target_score_threshold : int, optional (default: 0)
        Spacers with an on-target score below this threshold will be removed
    off_target_score_threshold : int, optional (default: 0)
        Spacers with an off-target score below this threshold will be removed
    spacers_per_feature : int, optional (default: 6)
        The number of spacers to return for each gene

    Return
    ------
    :class:`~pd.DataFrame` with the final spacer sequences for synthesis
    """

    spacers = spacers[spacers["on_target_score"] > on_target_score_threshold]
    spacers = spacers[spacers["off_target_score"] > off_target_score_threshold]
    spacers = spacers.drop(labels=["seq_hash", "hash"],
                           axis="columns").drop_duplicates()
    if spacers_per_feature == 0:
        # "seq_hash" and "hash" were already dropped above, so just return
        return spacers
    else:
        tqdm.pandas(desc="Assembling library", unit="spacers")
        grouped = (
            spacers.groupby("gene_name").progress_apply(lambda x: x.nlargest(
                spacers_per_feature, "on_target_score")).reset_index(drop=True)
            # .drop(labels=["seq_hash", "hash"], axis="columns")
        )
        return grouped
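The core step, taking the top N rows per group with a progress bar, can be reproduced on a toy frame as follows; the column names mirror the ones above but the data is made up.

import pandas as pd
from tqdm import tqdm

spacers = pd.DataFrame({
    "gene_name": ["A", "A", "A", "B", "B"],
    "on_target_score": [10, 50, 30, 70, 20],
})

tqdm.pandas(desc="Assembling library", unit="spacers")
top2 = (
    spacers.groupby("gene_name")
    .progress_apply(lambda g: g.nlargest(2, "on_target_score"))
    .reset_index(drop=True)
)
print(top2)  # the two highest-scoring rows for each gene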
Example #8
def generate_unique_filepaths(outfile=None, nrows=None):
    '''
    Create a list of unique filepaths for all case json in the PACER folder and export to .csv
    Inputs:
        - outfile (str or Path) - the output file name (.csv), relative to the project root; if None, no file is written
        - nrows (int) - no. of cases to use (for testing)
    Outputs:
        DataFrame of file metadata (also written to outfile if one is given)
    '''
    import pandas as pd
    tqdm.pandas()

    case_jsons = [court_dir.glob('json/*.json') for court_dir in settings.PACER_PATH.glob('*')
                    if court_dir.is_dir()]

    file_iter = chain(*case_jsons)

    df = convert_filepaths_list(file_iter=file_iter, nrows=nrows)

    # Write the file
    if outfile:
        df.to_csv(std_path(outfile))

    return df
Example #9
def filtrations(df, with_dots=False):
    stopWords = set(stopwords.words('english'))

    if with_dots:
        tqdm.pandas(desc="WITH DOTS: ")

        df = df[df.lemma.progress_apply(lambda lemma: str(lemma) not in string.punctuation.replace('.', ''))]
    else:
        tqdm.pandas(desc="WITHOUT DOTS: ")

        df = df[df.lemma.progress_apply(lambda lemma: str(lemma) not in string.punctuation)]

    mask = (~df.lemma.isin(stopWords)) & (df.ner_tag != '[]') & (df.ner_tag != '') & (df.lemma != '') & (df.token != '')

    df = df[mask]

    tqdm.pandas(desc="")

    return df
Example #10
    def apply(self, func, axis=0, broadcast=None, raw=False, reduce=None, result_type=None, args=(), **kwds):
        """
        Apply the function to the DataFrame using swifter
        """
        samp = self._obj.iloc[: self._npartitions * 2, :]
        # check if input is string or if the user is overriding the string processing default
        str_processing = ("object" in samp.dtypes.values) if not self._allow_dask_on_strings else False

        try:  # try to vectorize
            tmp_df = func(samp, *args, **kwds)
            assert samp.apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
            ).equals(tmp_df)
            return func(self._obj, *args, **kwds)
        except (
            AssertionError,
            AttributeError,
            ValueError,
            TypeError,
            TypingError,
        ) as e:  # if can't vectorize, estimate time to pandas apply
            wrapped = self._wrapped_apply(
                func, axis=axis, broadcast=broadcast, raw=raw, reduce=reduce, result_type=result_type, args=args, **kwds
            )
            n_repeats = 3
            timed = timeit.timeit(wrapped, number=n_repeats)
            samp_proc_est = timed / n_repeats
            est_apply_duration = samp_proc_est / self._SAMP_SIZE * self._obj.shape[0]

            # if pandas apply takes too long and not performing str processing, use dask
            if (est_apply_duration > self._dask_threshold) and (not str_processing):
                if axis == 0:
                    raise NotImplementedError(
                        "Swifter cannot perform axis=0 applies on large datasets.\n"
                        "Dask currently does not have an axis=0 apply implemented.\n"
                        "More details at https://github.com/jmcarpenter2/swifter/issues/10"
                    )
                return self._dask_apply(func, axis, broadcast, raw, reduce, result_type, *args, **kwds)
            else:  # use pandas
                if self._progress_bar:
                    tqdm.pandas(desc="Pandas Apply")
                    return self._obj.progress_apply(
                        func,
                        axis=axis,
                        broadcast=broadcast,
                        raw=raw,
                        reduce=reduce,
                        result_type=result_type,
                        args=args,
                        **kwds
                    )
                else:
                    return self._obj.apply(
                        func,
                        axis=axis,
                        broadcast=broadcast,
                        raw=raw,
                        reduce=reduce,
                        result_type=result_type,
                        args=args,
                        **kwds
                    )
Example #11
# In[5]:


import pandas as pd
import urllib

import numpy as np

import json

from tqdm.autonotebook import tqdm

#%matplotlib inline

tqdm.pandas()  # register progress_apply / progress_map on pandas objects

import dask.dataframe as dd

from dask.multiprocessing import get
from dask.diagnostics import ProgressBar

from datetime import datetime
import matplotlib.pyplot as plt

from IPython.display import display


# In[10]:

Example #12
# %%
import glob
import os
from functools import partial

import pandas as pd
from pymatgen.core import Composition, Structure
from tqdm.autonotebook import tqdm

from aviary.cgcnn.utils import get_cgcnn_input
from aviary.wren.utils import count_wyks, get_aflow_label_spglib

tqdm.pandas()  # prime progress_apply functionality

final_dir = os.path.dirname(os.path.abspath(__file__))

idx_list = []
structs = []
E_vasp_list = []
meta_list = []
ht_paths = []

for f in glob.glob(final_dir + "/raw/*.poscar", recursive=True):
    task_id = f.split("/")[-1].split(".")[0]

    with open(f) as s:
        s = s.read()
        struct = Structure.from_str(s, fmt="poscar")

        lines = s.split("\n")
Example #13
def additional_features(df):
    tqdm.pandas(desc="IS TITLE: ")

    df['is_title'] = df.token.progress_apply(lambda x: int(str(x).istitle()))

    tqdm.pandas(desc="CONTAINS DIGITS: ")

    df['contains_digits'] = df.token.progress_apply(lambda x: int(not str(x).isalpha()))

    tqdm.pandas(desc="WORD LENGTH: ")

    df['word_len'] = df.token.progress_apply(lambda x: len(str(x)))

    tqdm.pandas(desc="SUFFIX: ")

    df['suffix'] = df.lemma.progress_apply(lambda x: str(x)[-3:])

    tqdm.pandas(desc="PREFIX: ")

    df['prefix'] = df.lemma.progress_apply(lambda x: str(x)[0:3])

    tqdm.pandas(desc="")

    df['prev_pos_tag'] = np.roll(df.pos_tag.values, 1)

    df['prev_is_title'] = np.roll(df.is_title.values, 1)

    df['prev_contains_digits'] = np.roll(df.contains_digits.values, 1)

    df['prev_word_len'] = np.roll(df.word_len.values, 1)

    df['prev_suffix'] = np.roll(df.suffix.values, 1)

    df['prev_prefix'] = np.roll(df.prefix.values, 1)

    df['next_pos_tag'] = np.roll(df.pos_tag.values, -1)

    df['next_is_title'] = np.roll(df.is_title.values, -1)

    df['next_contains_digits'] = np.roll(df.contains_digits.values, -1)

    df['next_word_len'] = np.roll(df.word_len.values, -1)

    df['next_suffix'] = np.roll(df.suffix.values, -1)

    df['next_prefix'] = np.roll(df.prefix.values, -1)

    return df
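One point worth keeping in mind about the np.roll-based context features above: roll wraps around, so the first row's prev_* values come from the last row (and the last row's next_* values come from the first). A tiny sketch, with a comparison to pandas' shift(), which pads with NaN instead:

import numpy as np
import pandas as pd

df = pd.DataFrame({"token": ["The", "cat", "sat"]})
df["prev_token"] = np.roll(df.token.values, 1)   # ['sat', 'The', 'cat']
df["next_token"] = np.roll(df.token.values, -1)  # ['cat', 'sat', 'The']
df["prev_token_shift"] = df.token.shift(1)       # [NaN, 'The', 'cat']
print(df)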
Example #14
def create_library(
    input_sequences: str = None,
    output_library: str = None,
    reference: str = None,
    restriction_sites: str = None,
    largeindex: bool = False,
    on_target_rule_set: Optional[str] = None,
    on_target_score_threshold: int = 0,
    off_target_rule_set: Optional[str] = None,
    off_target_score_threshold: int = 0,
    off_target_count_threshold: int = 100,
    number_mismatches_to_consider: int = 3,
    nuclease: str = "SpCas9",
    spacers_per_feature: int = 9,
    reject: bool = False,
    paired: bool = False,
    number_upstream_spacers: int = 0,
    number_downstream_spacers: int = 0,
    cores: int = 0,
    chunks: int = 8,
    verbose: bool = False,
    write_early_exit: bool = False,
) -> None:
    """Build a CRISPR library
    \f

    Parameters
    ----------
    :param input_sequences:
    :param output_library:
    :param reference:
    :param restriction_sites:
    :param largeindex:
    :param on_target_rule_set:
    :param on_target_score_threshold:
    :param off_target_rule_set:
    :param off_target_score_threshold:
    :param off_target_count_threshold:
    :param number_mismatches_to_consider:
    :param nuclease:
    :param spacers_per_feature:
    :param reject:
    :param paired:
    :param number_upstream_spacers:
    :param number_downstream_spacers:
    :param cores:
    :param chunks:
    :param verbose:
    :param write_early_exit:

    Return
    ------
    :return: None; the assembled library is written to ``output_library``.
    """
    targets = pyfaidx.Fasta(input_sequences)

    global NUCLEASES
    nuc = NUCLEASES[NUCLEASES["nuclease"] == nuclease].to_dict(
        orient="records")[0]

    spacers_df = find_spacers(
        itemlist=targets,
        nuclease_info=nuc,
        restriction_sites=restriction_sites,
        chunks=chunks,
    )
    if write_early_exit:
        spacers_df.to_csv(
            "/Users/milessmith/workspace/mc_human_files/early_exit.csv")
        sys.exit(0)
    initialnumber = spacers_df.shape[0]

    # thank the gods for the tutorial at
    # https://www.machinelearningplus.com/python/parallel-processing-python/
    # scoring_pool = Pool(cores)
    chunked_spacer_dfs = np.array_split(spacers_df, chunks * 10)

    scoring_partial = partial(
        on_target_scoring,
        rule_set=on_target_rule_set,
        on_target_score_threshold=on_target_score_threshold,
    )

    spacers_df = pd.concat(p_umap(scoring_partial, chunked_spacer_dfs))

    # scoring_pool.close()
    # scoring_pool.join()
    # scoring_pool.clear()

    if spacers_df.shape[0] == 0:
        print("Sorry, no spacers matching that criteria were found")
        exit()
    else:
        if verbose:
            print(
                f"Finished scoring spacers. {spacers_df.shape[0]} of {initialnumber} "
                f"spacers have an on-target score above the cutoff threshold of "
                f"{on_target_score_threshold}.")

    tqdm.pandas(desc="Adding tracking hashes", unit="spacers")
    spacers_df["hash"] = spacers_df.progress_apply(lambda x: hash(tuple(x)),
                                                   axis=1)
    if verbose:
        print("\nBeginning Bowtie alignment...")
    off_target_results_file = off_target_discovery(
        spacers_df=spacers_df,
        nuclease_info=nuc,
        cpus=cores,
        refgenome=reference,
        large_index_size=largeindex,
        reject=reject,
        number_mismatches_to_consider=number_mismatches_to_consider,
        verbose=verbose,
    )

    spacers_df = off_target_scoring(
        otrf=off_target_results_file,
        spacers_df=spacers_df,
        nuclease_info=nuc,
        rule_set=off_target_rule_set,
        off_target_score_threshold=off_target_score_threshold,
        off_target_count_threshold=off_target_count_threshold,
        verbose=verbose,
    )

    if paired:
        guide_library = assemble_paired_library(
            spacers=spacers_df,
            on_target_score_threshold=on_target_score_threshold,
            off_target_score_threshold=off_target_score_threshold,
            number_upstream_spacers=number_upstream_spacers,
            number_downstream_spacers=number_downstream_spacers,
        )
    else:
        guide_library = assemble_library(
            spacers=spacers_df,
            on_target_score_threshold=on_target_score_threshold,
            off_target_score_threshold=off_target_score_threshold,
            spacers_per_feature=spacers_per_feature,
        )
    guide_library.to_csv(output_library)
    print("Finished.")
Example #15
def assemble_paired_library(
    spacers: pd.DataFrame,
    on_target_score_threshold: int = 100,
    off_target_score_threshold: int = 100,
    number_upstream_spacers: int = 3,
    number_downstream_spacers: int = 3,
    # min_paired_distance: int = 30, #reenable once I figure it out
    mix_and_match: bool = True,
) -> pd.DataFrame:
    """Creates a final list of protospacers for synthesis.  Used to create
    excision libraries, where two spacers are necessary to cause cuts at either
    side of a feature. `assemble_paired_library()` will take a set of upstream
    and set of downstream spacers, generate all permutations for those
    originating for the same feature, and assemble them in a synthetic
    SpCas9 spacer array

    Parameters
    ----------
    spacers : :class:`~pd.DataFrame`
        Dataframe with all spacers found by :module:`~find_spacers.find_spacers`,
        scores added by :module:`~on_target_scoring.on_target_scoring` and
        :module:`~off_target_scoring.off_target_scoring`
    on_target_score_threshold : int, optional (default: 100)
        Spacers with an on-target score below this threshold will be removed
    off_target_score_threshold : int, optional (default: 100)
        Spacers with an off-target score below this threshold will be removed
    number_upstream_spacers : int, optional (default: 3)
        Number of spacers upstream of a gene to use
    number_downstream_spacers : int, optional (default: 3)
        Number of spacers downstream of a gene to use
    mix_and_match : bool, optional (default: True)
        If `True`, permutations of the final upstream and downstream spacers
        will be assembled into a larger synthetic spacer array construct.

    Return
    ------
    :class:`~pd.DataFrame` with the final spacer sequences for synthesis.
    If `mix_and_match` is `True`, then this will correspond to the spacer
    arrays; if `False`, then this will be a listing of the final upstream and
    downstream spacers.
    """

    spacers = spacers[spacers["on_target_score"] > on_target_score_threshold]
    spacers = spacers[spacers["off_target_score"] > off_target_score_threshold]
    upstream_spacers = spacers[spacers["gene_name"].str.contains("upstream")]
    downstream_spacers = spacers[spacers["gene_name"].str.contains(
        "downstream")]

    tqdm.pandas(desc="finding upstream spacers with highest on-target scores")
    grouped_upstream = (upstream_spacers.groupby("seq_hash").progress_apply(
        lambda x: x.nlargest(number_upstream_spacers, "on_target_score")).
                        reset_index(drop=True))

    tqdm.pandas(
        desc="finding downstream spacers with highest on-target scores")
    grouped_downstream = (downstream_spacers.groupby(
        "seq_hash").progress_apply(lambda x: x.nlargest(
            number_downstream_spacers, "on_target_score")).reset_index(
                drop=True))

    if mix_and_match:
        original_targets = spacers["seq_hash"].drop_duplicates().values

        combo_df = pd.DataFrame(columns=[
            "gene_name",
            "feature_id",
            "strand",
            "spacer",
            "upstream_on_target_score",
            "downstream_on_target_score",
            "upstream_off_target_score",
            "downstream_off_target_score",
            "seq_hash",
            "upstream_hash",
            "downstream_hash",
        ])

        for target_hash in original_targets:
            tmp_upstream_spacers = grouped_upstream[
                grouped_upstream["seq_hash"] == target_hash]
            tmp_downstream_spacers = grouped_downstream[
                grouped_downstream["seq_hash"] == target_hash]

            for permuted_indices in product(tmp_upstream_spacers.index,
                                            tmp_downstream_spacers.index):
                upstream_index, downstream_index = permuted_indices
                instance_df = pd.DataFrame({
                    # str.strip() removes a set of characters, not a suffix,
                    # so use replace() to drop the "-upstream" label
                    "gene_name":
                    tmp_upstream_spacers["gene_name"].drop_duplicates().item()
                    .replace("-upstream", ""),
                    "feature_id":
                    tmp_upstream_spacers["feature_id"],
                    "strand":
                    tmp_upstream_spacers["strand"],
                    "spacer":
                    "".join([
                        BSMBI_ARM_5,
                        RIGHT_EXTRA_SPACER,
                        tmp_upstream_spacers.loc[upstream_index, "spacer"],
                        DIRECT_REPEAT,
                        LEFT_EXTRA_SPACER,
                        tmp_downstream_spacers.loc[downstream_index, "spacer"],
                        BSMBI_ARM_3,
                    ]),
                    "upstream_on_target_score":
                    tmp_upstream_spacers.loc[upstream_index,
                                             "on_target_score"],
                    "downstream_on_target_score":
                    tmp_downstream_spacers.loc[downstream_index,
                                               "on_target_score"],
                    "upstream_off_target_score":
                    tmp_upstream_spacers.loc[upstream_index,
                                             "off_target_score"],
                    "downstream_off_target_score":
                    tmp_downstream_spacers.loc[downstream_index,
                                               "off_target_score"],
                    "seq_hash":
                    tmp_upstream_spacers.loc[upstream_index, "seq_hash"],
                    "upstream_hash":
                    tmp_upstream_spacers.loc[upstream_index, "hash"],
                    "downstream_hash":
                    tmp_downstream_spacers.loc[downstream_index, "hash"],
                })
                combo_df = pd.concat([combo_df, instance_df]).drop_duplicates()
        return combo_df
    else:
        return pd.concat([grouped_upstream, grouped_downstream])
Example #16
import os  # used below for MODEL_PATH
import pandas as pd
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from gensim.models.ldamulticore import LdaMulticore
from gensim.models import Phrases
from gensim.models import CoherenceModel
# from gensim.models import ldaseqmodel
from gensim.corpora import Dictionary
from datetime import datetime
from .ploty_template import plot_title
from .eda import Documents
from . import models
from tqdm.autonotebook import tqdm
tqdm.pandas()

import warnings
warnings.filterwarnings("ignore")

MODEL_PATH = os.path.abspath(os.path.dirname(models.__file__))


################## TODO: Temporal Topic Modelling ##################
class DynTM:
    def __init__(self, documents_object, num_topics=None, algo=None):
        if isinstance(documents_object, Documents):
            self.doc_obj = documents_object
            self.raw_df = documents_object.raw_df
            self.stop_words = documents_object.stop_words
Example #17
def off_target_scoring(
    otrf: str,
    spacers_df: pd.DataFrame,
    nuclease_info: Dict[str, Any],
    rule_set: Optional[str] = None,
    off_target_score_threshold: int = 0,
    off_target_count_threshold: Optional[int] = 100,
    verbose: bool = False,
) -> object:
    """Calculate a cumulative off-target score for a protospacer
    \f
    Parameters
    -----------
    otrf : `str`
        Path to the results from Bowtie
    spacers_df : :class:`~pandas.DataFrame`
        Dataframe containing spacers.  Format should be `{'gene_name',
        'feature_id', 'start','stop','strand','spacer'}`
    nuclease_info : `dict`
        dictionary with nuclease characteristics from nuclease_list.csv
    rule_set : `str`, optional
        rule set used when scoring off-targets
    off_target_score_threshold : `int`
        Total off-target score threshold below which a spacer is rejected.
        Ranges from 0 to 100.
    off_target_count_threshold : `int`, default: 100
        Number of potential mismatches that should be tolerated.  Spacers
        exceeding the threshold will be discarded
    verbose : `bool`

    Return
    -------
    :class:`~pandas.DataFrame` matching the one passed to spacers_df containing
    off-target scores
    """

    bowtie_results = pd.read_csv(
        otrf,
        header=None,
        names=[
            "hash",
            "strand",
            "refseq",
            "position",
            "seq",
            "readquality",
            "aligncount",
            "mismatches",
        ],
        usecols=["hash", "mismatches"],
        dtype={"hash": "int64", "mismatches": "str"},
        na_filter=False,
        skip_blank_lines=True,
        sep="\t",
        memory_map=True,
    )

    if verbose:
        print(f"Total alignments from Bowtie: {bowtie_results.shape[0]}")

    # We need to reduce the number of spacers we examine.  For the most part,
    # those with a lot of potential off-targets (>1000?) have really low
    # scores and are worthless.  Some have >10,000 (!) potential off-targets
    # and should just be thrown out.
    results_count = bowtie_results.groupby("hash").agg("count").reset_index()
    filtered_results = bowtie_results[
        bowtie_results["hash"].isin(
            results_count[results_count["mismatches"] < off_target_count_threshold][
                "hash"
            ]
        )
    ].copy()  # copy so the "locations" column can be added below without a chained-assignment warning

    # Keep only those spacers that have fewer than our cutoff
    spacers_df = spacers_df[spacers_df["hash"].isin(filtered_results["hash"])]

    mmpos = regex.compile("[0-9]{1,}")
    tqdm.pandas(desc="converting mismatches", unit="spacers")
    filtered_results["locations"] = filtered_results["mismatches"].progress_apply(
        mmpos.findall
    )

    tqdm.pandas(desc="collapsing mismatches", unit="spacers")
    collapsed_results = (
        filtered_results.groupby("hash")
        .progress_apply(lambda x: x["locations"].values)
        .reset_index()
        .rename(index=str, columns={0: "locations"})
    )

    tqdm.pandas(desc="scoring mismatches", unit="spacers")
    collapsed_results["off_target_score"] = collapsed_results.apply(
        lambda x: sumofftargets(x["locations"], rule_set=rule_set), axis=1
    )
    spacers_df = spacers_df.merge(collapsed_results, on="hash")

    tqdm.pandas("counting off-targets", unit="spacers")
    spacers_df["off_targets"] = spacers_df.progress_apply(
        lambda x: len(x["locations"]) - 1, axis=1
    )
    spacers_df = spacers_df.drop(columns=["locations"])

    spacers_df = spacers_df[spacers_df["off_target_score"] > off_target_score_threshold]
    return spacers_df
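A compact sketch of the count-then-filter step above (count Bowtie alignments per spacer hash and keep only spacers under the off-target count threshold), on invented data:

import pandas as pd

bowtie_results = pd.DataFrame({
    "hash": [1, 1, 1, 2],
    "mismatches": ["0:A>T", "3:C>G", "", "1:G>A"],
})
off_target_count_threshold = 3

# number of reported alignments per spacer hash
counts = bowtie_results.groupby("hash").agg("count").reset_index()
keep = counts[counts["mismatches"] < off_target_count_threshold]["hash"]
filtered = bowtie_results[bowtie_results["hash"].isin(keep)]
print(filtered)  # hash 1 has 3 alignments and is dropped; hash 2 is kept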