def get_spliceRack_bp(pseudocount=0.05):
    """Load the two U12 splice-rack matrices as PWM objects.

    The matrices are read with `np.loadtxt` from `AT_AC_U12.txt` and
    `GT_AG_U12.txt` located under `BR_SPLICE_RACK_PATH`.

    # Arguments
        pseudocount: Value added to every entry of the loaded matrix
            before the PWM is constructed.

    # Returns
        List of two `PWM` instances named "AT_AC_U12" and "GT_AG_U12".
    """
    motif_names = ("AT_AC_U12", "GT_AG_U12")
    # The pseudocount has to be added to the raw matrix: adding a float to an
    # already constructed PWM object is not an array operation.
    return [
        PWM(pseudocount + np.loadtxt(BR_SPLICE_RACK_PATH + "/" + name + ".txt"),
            name)
        for name in motif_names
    ]
def get_metadata():
    """Build a pandas.DataFrame describing the available PWMs.

    Motif names are sorted and split on "_" or "."; the first four pieces
    are exposed as named columns.

    Columns:
    - PWM_id: full motif name (pass it to get_pwm_list() to get the PWM)
    - TF
    - Organism
    - DB
    - info
    - consensus: consensus sequence returned by `PWM.get_consensus()`
    """
    motifs = load_motif_db(HOCOMOCO_PWM)
    motif_names = sorted(list(motifs.keys()))

    # one column per name fragment
    name_parts = pd.Series(motif_names).str.split(pat="_|\\.", expand=True)
    column_names = {0: "TF", 1: "Organism", 2: "DB", 3: "info"}
    df = name_parts.rename(columns=column_names)

    # the full motif name goes first
    df.insert(0, "PWM_id", motif_names)

    # consensus sequence of every motif, in the same order as the rows
    consensus_strings = []
    for motif_name in motif_names:
        consensus_strings.append(PWM(motifs[motif_name]).get_consensus())
    df["consensus"] = pd.Series(consensus_strings)
    return df
def read_transfac(file_path, ignore_motif_name=True):
    """Parse a tab-separated TRANSFAC-style file into a dict of PWMs.

    Lines starting with "DE" open a new motif, lines starting with "XX"
    close it, and every other line is collected as a matrix row for the
    motif that is current at that point.

    # Arguments
        file_path: Path of the file to read.
        ignore_motif_name: If True, motifs are keyed by a running counter
            that is incremented on every "DE" line. If False, the key is the
            second tab-separated field of the "DE" line, and rows found
            outside a DE/XX section are collected under the key `None`.

    # Returns
        Dict mapping `str(key)` to a `PWM` built from the collected rows with
        the first and last column dropped and 0.01 added to every entry.
    """
    from concise.utils.pwm import PWM
    from collections import defaultdict

    rows_by_motif = defaultdict(list)
    # key under which matrix rows are currently being collected
    current = 0 if ignore_motif_name else None

    with open(file_path) as handle:
        for row in handle:
            if row.startswith("DE"):
                if ignore_motif_name:
                    current += 1
                else:
                    current = row.split("\t")[1].strip()
                continue
            if row.startswith("XX"):
                if not ignore_motif_name:
                    current = None
                continue
            rows_by_motif[current].append(row.split("\t"))

    pwms = {}
    for key, rows in rows_by_motif.items():
        # drop the first and the last column, keep the numeric ones in between
        values = np.array(rows)[:, 1:-1].astype(float)
        pwms[str(key)] = PWM(values + 0.01)
    return pwms
def __init__(self, pwm_list=None, stddev=0.05, seed=None):
    """Store the PWM list and the noise settings.

    # Arguments
        pwm_list: List of `PWM` instances, or a list of config dicts that
            are converted with `PWM.from_config`. `None` (the default)
            means an empty list.
        stddev: Stored as `self.stddev`.
        seed: Stored as `self.seed`.
    """
    # Use a None sentinel instead of a `[]` default: a list default is
    # created once and would be shared by every instance that relies on it.
    if pwm_list is None:
        pwm_list = []

    # accept serialized PWMs (list of config dicts)
    if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
        pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

    self.stddev = stddev
    self.seed = seed
    self.pwm_list = pwm_list
    _check_pwm_list(pwm_list)
def consensus_dist(seqs, seq_distance_fn, **kwargs):
    """Distance of every sequence to the consensus of the whole set.

    The consensus is taken from a `PWM` built from the mean of `seqs` over
    axis 0 plus 0.001. Each sequence is converted to a string with
    `one_hot2string(seqs, DNA)` before being compared.

    # Arguments
        seqs: Array of sequences; must support `.mean(0)` and be accepted by
            `one_hot2string`.
        seq_distance_fn: Callable invoked as
            `seq_distance_fn(consensus, sequence_string, **kwargs)`.
        **kwargs: Forwarded unchanged to `seq_distance_fn`.

    # Returns
        np.ndarray with one distance per sequence.
    """
    from concise.utils.pwm import PWM
    from concise.preprocessing.sequence import one_hot2string, DNA

    mean_profile = seqs.mean(0) + 0.001
    consensus = PWM(mean_profile).get_consensus()

    distances = []
    for seq_string in one_hot2string(seqs, DNA):
        distances.append(seq_distance_fn(consensus, seq_string, **kwargs))
    return np.array(distances)
def read_meme_motifs(meme_file):
    """Read the motifs of a MEME file into a dict of PWMs.

    # Arguments
        meme_file: Path of the file, parsed with `basepair.external.meme.read`.

    # Returns
        Dict keyed by the 1-based motif position as a string ("1", "2", ...).
        Each value is a `PWM` built from the motif's A, C, G, T columns plus
        0.01, with `name` set to the motif's `num_occurrences`.
    """
    from basepair.external.meme import read
    from concise.utils.pwm import PWM

    base_order = list("ACGT")
    motifs = {}
    with open(meme_file) as handle:
        record = read(handle)
        for position, motif in enumerate(record, start=1):
            # fix the column order to A, C, G, T
            matrix = pd.DataFrame(motif.pwm)[base_order].values
            motifs[str(position)] = PWM(matrix + 0.01,
                                        name=motif.num_occurrences)
    return motifs
def __init__(self, pwm_list=None, kernel_size=None, mean_max_scale=0.):
    """Store the PWM list and the kernel settings.

    # Arguments
        pwm_list: List of `PWM` instances, or a list of config dicts that
            are converted with `PWM.from_config`. `None` (the default)
            means an empty list.
        kernel_size: Stored as `self.kernel_size`; when `None` it is set to
            `len(pwm_list)`.
        mean_max_scale: Stored as `self.mean_max_scale`.
    """
    # Use a None sentinel instead of a `[]` default: a list default is
    # created once and would be shared by every instance that relies on it.
    if pwm_list is None:
        pwm_list = []

    # handle pwm_list given as a list of config dictionaries
    if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
        pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

    if kernel_size is None:
        kernel_size = len(pwm_list)

    self.pwm_list = pwm_list
    self.kernel_size = kernel_size
    self.mean_max_scale = mean_max_scale
    _check_pwm_list(pwm_list)
def get_branchpoint_pwm_list(cache=True):
    """Return the branchpoint PWMs: "U2_branchpoint" and "GT_AG_U12_branchpoint".

    The U2 PWM is loaded from the JSON file `BR_PWM` when that file exists
    and `cache` is true. Otherwise it is recomputed from the branchpointer
    training table and written to `BR_PWM` (also when `cache` is False).

    # Arguments
        cache: Whether an existing `BR_PWM` file may be used.

    # Returns
        List of two `PWM` instances: the U2 PWM followed by the U12 one.
    """
    if os.path.isfile(BR_PWM) and cache:
        u2_pwm = PWM.from_config(read_json(BR_PWM))
    else:
        table = pd.read_csv(
            DATA_ROOT +
            "/Splice_branchpoints/processed/branchpointer/train/filteredDescr.csv"
        )
        # keep only the rows labelled "HC"
        table = table[table.set == "HC"]
        seq_columns = table.columns[table.columns.str.match("^seq_")]
        # NOTE(review): the "- 1" presumably maps a 1/2 indicator encoding to
        # 0/1 - confirm against the data preparation step.
        indicators = table[seq_columns] - 1
        # column means, arranged as one row of 4 values per position
        freqs = np.array(indicators.mean()).reshape((-1, 4))
        assert np.allclose(freqs.sum(1), 1)
        u2_pwm = PWM(freqs, name="U2_branchpoint")
        write_json(u2_pwm.get_config(), BR_PWM)

    u12_matrix = 0.05 + np.loadtxt(BR_SPLICE_RACK_PATH + "/GT_AG_U12.txt")
    u12_pwm = PWM(u12_matrix, "GT_AG_U12_branchpoint")
    return [u2_pwm, u12_pwm]
def get_pwm_list(motif_name_list, pseudocountProb=0.0001):
    """Get a list of ENCODE PWM's.

    # Arguments
        motif_name_list: List of id's from the `PWM_id` column in the
            `get_metadata()` table, i.e. the first whitespace-separated
            token of each loaded motif name.
        pseudocountProb: Added pseudocount probabilities to the PWM

    # Returns
        List of `concise.utils.pwm.PWM` instances.
    """
    # re-key the loaded motifs by the first token of their full name
    motifs_by_id = {}
    for full_name, matrix in _load_motifs().items():
        motifs_by_id[full_name.split()[0]] = matrix

    return [
        PWM(motifs_by_id[motif_id] + pseudocountProb, name=motif_id)
        for motif_id in motif_name_list
    ]
def __init__(self, pwm_list=None, stddev=0.05, seed=None,
             background_probs=DEFAULT_BASE_BACKGROUND,
             add_noise_before_Pwm2Pssm=True):
    """Store the PWM list, the noise settings and the background.

    # Arguments
        pwm_list: List of `PWM` instances, or a list of config dicts that
            are converted with `PWM.from_config`. `None` (the default)
            means an empty list.
        stddev: Stored as `self.stddev`.
        seed: Stored as `self.seed`.
        background_probs: Stored as `self.background_probs`.
        add_noise_before_Pwm2Pssm: Stored as
            `self.add_noise_before_Pwm2Pssm`.
    """
    # Use a None sentinel instead of a `[]` default: a list default is
    # created once and would be shared by every instance that relies on it.
    if pwm_list is None:
        pwm_list = []

    # accept serialized PWMs (list of config dicts)
    if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
        pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

    self.pwm_list = pwm_list
    _check_pwm_list(pwm_list)
    self.stddev = stddev
    self.seed = seed
    self.background_probs = background_probs
    self.add_noise_before_Pwm2Pssm = add_noise_before_Pwm2Pssm
def get_metadata():
    """Build a pandas.DataFrame describing the available PWMs.

    Motif names are sorted and split on whitespace; the first three pieces
    are exposed as named columns.

    Columns:
    - PWM_id: id of the PWM (pass it to get_pwm_list() to get the PWM)
    - info1: additional information about the motifs
    - info2
    - consensus: PWM consensus sequence
    """
    motifs = _load_motifs()
    motif_names = sorted(list(motifs.keys()))

    # one column per whitespace-separated token of the motif name
    name_parts = pd.Series(motif_names).str.split(expand=True)
    column_names = {0: "PWM_id", 1: "info1", 2: "info2"}
    df = name_parts.rename(columns=column_names)

    # consensus sequence of every motif, in the same order as the rows
    consensus_strings = []
    for motif_name in motif_names:
        consensus_strings.append(PWM(motifs[motif_name]).get_consensus())
    df["consensus"] = pd.Series(consensus_strings)
    return df
def __init__(self, pwm_list=None, kernel_size=None, mean_max_scale=0.,
             background_probs=DEFAULT_BASE_BACKGROUND):
    """Store the PWM list, the kernel settings and the background.

    # Arguments
        pwm_list: List of `PWM` instances, or a list of config dicts that
            are converted with `PWM.from_config`. `None` (the default)
            means an empty list.
        kernel_size: Stored as `self.kernel_size`; when `None` it is set to
            `len(pwm_list)`.
        mean_max_scale: Stored as `self.mean_max_scale`.
        background_probs: Stored as `self.background_probs`.
    """
    # Use a None sentinel instead of a `[]` default: a list default is
    # created once and would be shared by every instance that relies on it.
    if pwm_list is None:
        pwm_list = []

    # handle pwm_list given as a list of config dictionaries
    if len(pwm_list) > 0 and isinstance(pwm_list[0], dict):
        pwm_list = [PWM.from_config(pwm) for pwm in pwm_list]

    if kernel_size is None:
        kernel_size = len(pwm_list)

    _check_pwm_list(pwm_list)
    self.pwm_list = pwm_list
    self.kernel_size = kernel_size
    self.mean_max_scale = mean_max_scale
    self.background_probs = background_probs
def read_homer(file_path):
    """Read a single-motif HOMER file into a PWM.

    The first line is the header; the text following ",P:" in it is used for
    the PWM name. The remaining lines are read as a tab-separated matrix.

    # Arguments
        file_path: Path of the HOMER motif file.

    # Returns
        `PWM` built from the matrix plus 0.01, named "P=<value>" where
        <value> is the stripped header text after ",P:".

    # Raises
        ValueError: If the header line contains no ",P:" field.
    """
    marker = ",P:"
    # Read the header from the file being parsed (not from a fixed file
    # name), and do so before the header text is used.
    with open(file_path) as f:
        header = f.readline()
    if marker not in header:
        raise ValueError(
            "No '{}' field in the header of {}".format(marker, file_path))
    p_value = header.split(marker)[1].strip()

    # matrix rows follow the one-line header
    matrix = pd.read_csv(file_path, skiprows=1, header=None, sep='\t').values
    return PWM(matrix + 0.01, name="P={}".format(p_value))