Example #1
0
def get_spliceRack_bp(pseudocount=0.05):
    """Load the two U12 matrices stored under ``BR_SPLICE_RACK_PATH`` as PWMs.

    # Arguments
        pseudocount: Value added to every entry of each loaded matrix
            before it is wrapped in a `PWM`.

    # Returns
        List of two `PWM` instances named "AT_AC_U12" and "GT_AG_U12"
        (in that order).
    """
    motif_names = ("AT_AC_U12", "GT_AG_U12")
    # The pseudocount has to be applied to the numeric matrix, not to the
    # PWM object itself (same pattern as `PWM(0.05 + np.loadtxt(...), ...)`
    # used for the branchpoint PWM list).
    return [
        PWM(pseudocount + np.loadtxt(BR_SPLICE_RACK_PATH + "/" + name + ".txt"),
            name)
        for name in motif_names
    ]
Example #2
0
def get_metadata():
    """
    Build a pandas.DataFrame describing the PWMs in the HOCOMOCO motif db.

    The motif names (sorted) are split on "_" and "."; the first four pieces
    are labelled as below, any further pieces keep their integer column
    label. Columns:

    - PWM_id: full motif name (pass to get_pwm_list() to get the pwm)
    - TF
    - Organism
    - DB
    - info
    - consensus: consensus sequence reported by PWM.get_consensus()
    """
    motifs = load_motif_db(HOCOMOCO_PWM)
    motif_names = sorted(motifs.keys())

    # split the motif name into its components
    column_labels = {0: "TF", 1: "Organism", 2: "DB", 3: "info"}
    name_parts = pd.Series(motif_names).str.split(pat="_|\\.", expand=True)
    df = name_parts.rename(columns=column_labels)

    # the full name goes first as the PWM identifier
    df.insert(0, "PWM_id", motif_names)

    # one consensus string per motif, in the same order as the rows
    consensus_strings = []
    for motif_name in motif_names:
        consensus_strings.append(PWM(motifs[motif_name]).get_consensus())
    df["consensus"] = pd.Series(consensus_strings)
    return df
Example #3
0
def read_transfac(file_path, ignore_motif_name=True):
    """Parse a tab-separated TRANSFAC-style file into a dict of PWMs.

    Lines starting with "DE" open a new motif, lines starting with "XX"
    close it, and every other line is collected as a matrix row.

    # Arguments
        file_path: Path of the file to read.
        ignore_motif_name: If True, motifs are keyed by a running counter
            ("1", "2", ...). If False, they are keyed by the second
            tab-separated field of the "DE" line; rows outside a DE/XX
            section are then collected under the key "None".

    # Returns
        Dict mapping `str(key)` to a `PWM` built from the row values
        (first and last field of each row dropped) plus 0.01.
    """
    from concise.utils.pwm import PWM
    from collections import defaultdict

    use_names = not ignore_motif_name
    rows_by_motif = defaultdict(list)
    # named mode: no current motif yet; counter mode: start counting at 0
    current = None if use_names else 0

    with open(file_path) as handle:
        for raw_line in handle:
            if raw_line.startswith("DE"):
                if use_names:
                    # original author's note: all are called motif 1
                    current = raw_line.split("\t")[1].strip()
                else:
                    current += 1
                continue
            if raw_line.startswith("XX"):
                if use_names:
                    current = None
                continue
            rows_by_motif[current].append(raw_line.split("\t"))

    pwms = {}
    for key, rows in rows_by_motif.items():
        # drop the first and the last field of every row
        values = np.array(rows)[:, 1:-1].astype(float)
        pwms[str(key)] = PWM(values + 0.01)
    return pwms
Example #4
0
    def __init__(self, pwm_list=None, stddev=0.05, seed=None):
        """Store the PWM list together with `stddev` and `seed`.

        # Arguments
            pwm_list: List of `PWM` instances, or a list of config dicts
                (detected from the first element) that are converted with
                `PWM.from_config`. `None` (default) means an empty list.
            stddev: Stored as `self.stddev`.
            seed: Stored as `self.seed`.
        """
        # Use a fresh list per instance: a `[]` default would be one shared
        # object stored on every default-constructed instance.
        if pwm_list is None:
            pwm_list = []

        if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
            pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

        self.stddev = stddev
        self.seed = seed
        self.pwm_list = pwm_list
        _check_pwm_list(pwm_list)
Example #5
0
def consensus_dist(seqs, seq_distance_fn, **kwargs):
    """Distance of every sequence in `seqs` to their common consensus.

    The consensus is obtained from a `PWM` built on the mean of `seqs`
    over axis 0 (plus 0.001). Each sequence is converted to a string with
    `one_hot2string(seqs, DNA)` and compared to that consensus.

    # Arguments
        seqs: Array of sequences accepted by `one_hot2string`
            (presumably one-hot encoded DNA - confirm against callers).
        seq_distance_fn: Callable invoked as
            `seq_distance_fn(consensus, sequence_string, **kwargs)`.
        **kwargs: Forwarded unchanged to `seq_distance_fn`.

    # Returns
        np.array with one distance value per sequence.
    """
    from concise.utils.pwm import PWM
    from concise.preprocessing.sequence import one_hot2string, DNA

    mean_profile = seqs.mean(0) + 0.001
    consensus = PWM(mean_profile).get_consensus()

    distances = []
    for seq_string in one_hot2string(seqs, DNA):
        distances.append(seq_distance_fn(consensus, seq_string, **kwargs))
    return np.array(distances)
Example #6
0
def read_meme_motifs(meme_file):
    """Read the motifs of a MEME file into a dict of PWMs.

    # Arguments
        meme_file: Path of the file, parsed with `basepair.external.meme.read`.

    # Returns
        Dict keyed by the 1-based motif position as a string ("1", "2", ...).
        Each value is a `PWM` built from the motif's A, C, G, T columns
        plus 0.01, with the motif's `num_occurrences` used as its name.
    """
    from basepair.external.meme import read
    from concise.utils.pwm import PWM

    with open(meme_file) as handle:
        record = read(handle)

    motifs = {}
    for position, motif in enumerate(record, start=1):
        # fix the column order to A, C, G, T
        probabilities = pd.DataFrame(motif.pwm)[list("ACGT")].values
        motifs[str(position)] = PWM(probabilities + 0.01,
                                    name=motif.num_occurrences)
    return motifs
Example #7
0
    def __init__(self, pwm_list=None, kernel_size=None, mean_max_scale=0.):
        """Store the PWM list, kernel size and `mean_max_scale`.

        # Arguments
            pwm_list: List of `PWM` instances, or a list of config dicts
                (detected from the first element) that are converted with
                `PWM.from_config`. `None` (default) means an empty list.
            kernel_size: Stored as `self.kernel_size`; defaults to
                `len(pwm_list)` when None.
            mean_max_scale: Stored as `self.mean_max_scale`.
        """
        # Use a fresh list per instance: a `[]` default would be one shared
        # object stored on every default-constructed instance.
        if pwm_list is None:
            pwm_list = []

        # handle pwm_list as a dictionary
        if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
            pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

        if kernel_size is None:
            kernel_size = len(pwm_list)

        self.pwm_list = pwm_list
        self.kernel_size = kernel_size
        self.mean_max_scale = mean_max_scale
        _check_pwm_list(pwm_list)
Example #8
0
def get_branchpoint_pwm_list(cache=True):
    """Return the "U2_branchpoint" PWM followed by "GT_AG_U12_branchpoint".

    # Arguments
        cache: If True and the file `BR_PWM` exists, the first PWM is
            restored from that JSON config. Otherwise it is recomputed from
            the branchpointer training table and written to `BR_PWM`.

    # Returns
        List of two `PWM` instances.
    """
    if os.path.isfile(BR_PWM) and cache:
        u2_pwm = PWM.from_config(read_json(BR_PWM))
    else:
        table = pd.read_csv(
            DATA_ROOT +
            "/Splice_branchpoints/processed/branchpointer/train/filteredDescr.csv"
        )
        # keep only the rows whose `set` column equals "HC"
        table = table[table.set == "HC"]
        seq_columns = table.columns[table.columns.str.match("^seq_")]
        shifted = table[seq_columns] - 1
        # column means, reshaped to 4 values per row
        matrix = np.array(shifted.mean()).reshape((-1, 4))
        assert np.allclose(matrix.sum(1), 1)
        u2_pwm = PWM(matrix, name="U2_branchpoint")
        write_json(u2_pwm.get_config(), BR_PWM)

    u12_pwm = PWM(0.05 + np.loadtxt(BR_SPLICE_RACK_PATH + "/GT_AG_U12.txt"),
                  "GT_AG_U12_branchpoint")
    return [u2_pwm, u12_pwm]
Example #9
0
def get_pwm_list(motif_name_list, pseudocountProb=0.0001):
    """Get a list of PWM's for the requested motif names.

    Motifs returned by `_load_motifs()` are looked up by the first
    whitespace-separated token of their full name.

    # Arguments
        motif_name_list: List of id's from the `PWM_id` column in
            `get_metadata()` table
        pseudocountProb: Added pseudocount probabilities to the PWM

    # Returns
        List of `concise.utils.pwm.PWM` instances, in the order of
        `motif_name_list`.
    """
    motifs_by_id = {}
    for full_name, matrix in _load_motifs().items():
        motifs_by_id[full_name.split()[0]] = matrix

    return [
        PWM(motifs_by_id[motif_name] + pseudocountProb, name=motif_name)
        for motif_name in motif_name_list
    ]
Example #10
0
    def __init__(self,
                 pwm_list=None,
                 stddev=0.05,
                 seed=None,
                 background_probs=DEFAULT_BASE_BACKGROUND,
                 add_noise_before_Pwm2Pssm=True):
        """Store the PWM list and the remaining settings on the instance.

        # Arguments
            pwm_list: List of `PWM` instances, or a list of config dicts
                (detected from the first element) that are converted with
                `PWM.from_config`. `None` (default) means an empty list.
            stddev: Stored as `self.stddev`.
            seed: Stored as `self.seed`.
            background_probs: Stored as `self.background_probs`.
            add_noise_before_Pwm2Pssm: Stored as
                `self.add_noise_before_Pwm2Pssm`.
        """
        # Use a fresh list per instance: a `[]` default would be one shared
        # object stored on every default-constructed instance.
        if pwm_list is None:
            pwm_list = []

        if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
            pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

        self.pwm_list = pwm_list
        _check_pwm_list(pwm_list)
        self.stddev = stddev
        self.seed = seed
        self.background_probs = background_probs
        self.add_noise_before_Pwm2Pssm = add_noise_before_Pwm2Pssm
Example #11
0
def get_metadata():
    """Get pandas.DataFrame with metadata about the PWM's.

    The sorted motif names from `_load_motifs()` are split on whitespace;
    the first three pieces become the columns below. Columns:

    - PWM_id (id of the PWM - pass to get_pwm_list() for getting the pwm)
    - info1 - additional information about the motifs
    - info2
    - consensus: PWM consensus sequence
    """
    motifs = _load_motifs()
    motif_names = sorted(motifs.keys())

    column_labels = {0: "PWM_id", 1: "info1", 2: "info2"}
    name_parts = pd.Series(motif_names).str.split(expand=True)
    df = name_parts.rename(columns=column_labels)

    # one consensus string per motif, in the same order as the rows
    consensus_strings = []
    for motif_name in motif_names:
        consensus_strings.append(PWM(motifs[motif_name]).get_consensus())
    df["consensus"] = pd.Series(consensus_strings)
    return df
Example #12
0
    def __init__(self,
                 pwm_list=None,
                 kernel_size=None,
                 mean_max_scale=0.,
                 background_probs=DEFAULT_BASE_BACKGROUND):
        """Store the PWM list, kernel size and remaining settings.

        # Arguments
            pwm_list: List of `PWM` instances, or a list of config dicts
                (detected from the first element) that are converted with
                `PWM.from_config`. `None` (default) means an empty list.
            kernel_size: Stored as `self.kernel_size`; defaults to
                `len(pwm_list)` when None.
            mean_max_scale: Stored as `self.mean_max_scale`.
            background_probs: Stored as `self.background_probs`.
        """
        # Use a fresh list per instance: a `[]` default would be one shared
        # object stored on every default-constructed instance.
        if pwm_list is None:
            pwm_list = []

        # handle pwm_list as a dictionary
        if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
            pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

        if kernel_size is None:
            kernel_size = len(pwm_list)

        _check_pwm_list(pwm_list)
        self.pwm_list = pwm_list
        self.kernel_size = kernel_size
        self.mean_max_scale = mean_max_scale
        self.background_probs = background_probs
Example #13
0
def read_homer(file_path):
    """Read a single-motif HOMER file into a `PWM`.

    The first line of the file is the header; the remaining lines are read
    as a tab-separated numeric matrix.

    # Arguments
        file_path: Path of the motif file.

    # Returns
        `PWM` built from the matrix plus 0.01, named "P=<value>" where
        <value> is the text following ",P:" in the header line.

    # Raises
        ValueError: If the header line contains no ",P:" field.
    """
    # Read the header from the file that was actually requested (not from a
    # hard-coded file name), and before it is used.
    with open(file_path) as f:
        header = f.readline()

    marker = ",P:"
    if marker not in header:
        raise ValueError(
            "No '{}' field found in the header of {}".format(marker, file_path))
    p_value = header.split(marker)[1].strip()

    matrix = pd.read_csv(file_path, skiprows=1, header=None, sep='\t').values
    return PWM(matrix + 0.01, name="P={}".format(p_value))