def append_pfm_to_database(self, meme_db, trim_frac=0.08, freqs=[0.27, 0.23, 0.23, 0.27]):
    """Append this pattern's trimmed PFM to a MEME-format motif file.

    The motif is stored under the identifier ``PWM_<self.short_name>``.
    If ``meme_db`` already holds a ``MOTIF`` line with exactly that
    identifier, a message is printed and the file is left untouched.
    If ``meme_db`` is missing or empty, the MEME header (version,
    alphabet, strands, background frequencies) is written first.

    Args:
        meme_db: path of the MEME file to create or append to.
        trim_frac: passed to ``trim_pssm_idx`` as ``frac`` to pick the
            trimming boundaries from ``self.get_seq_ic()``.
        freqs: background frequencies in A, C, G, T order. Only used
            when the header is written. The default list is read-only
            here (never mutated).

    Returns:
        None
    """
    motif_name = "PWM_{}".format(self.short_name)

    has_content = os.path.exists(meme_db) and os.path.getsize(meme_db) > 0
    if has_content:
        with open(meme_db, "r") as fp:
            for line in fp:
                # Compare the identifier of "MOTIF <id> [alt-name]" lines
                # exactly; a substring test would confuse PWM_m1 with PWM_m10.
                if line.split()[:2] == ["MOTIF", motif_name]:
                    print("{} already exists".format(motif_name))
                    return None
    else:
        # New (or empty) database: start it with the MEME header.
        with open(meme_db, "w") as fp:
            fp.write("MEME version 4\n\nALPHABET= ACGT\n\nstrands: + -\n\nBackground letter frequencies\n\n")
            fp.write("A {0} C {1} G {2} T {3}\n\n".format(freqs[0], freqs[1], freqs[2], freqs[3]))

    i, j = trim_pssm_idx(self.get_seq_ic(), frac=trim_frac)
    trimmed_pattern = self.trim(i, j)
    pfm = trimmed_pattern.seq

    with open(meme_db, 'a') as fp:
        fp.write("MOTIF {}\n".format(motif_name))
        # nsites and E are fixed placeholder values in the matrix header.
        fp.write("letter-probability matrix: alength= 4 w= {} nsites= 20 E= 0e+0\n".format(pfm.shape[0]))
        for row in pfm:
            fp.write('%.5f %.5f %.5f %.5f\n' % tuple(row))
        fp.write("\n")
    return None
def fetch_tomtom_matches(self, background=[0.27, 0.23, 0.23, 0.27], tomtom_exec_path='tomtom', motifs_db='HOCOMOCOv11_full_HUMAN_mono_meme_format.meme', save_report=False, report_dir='./', temp_dir='./', trim_frac=0.08):
    """Fetch matches for this pattern from a motif database using TomTom.

    The pattern is trimmed, written to a temporary MEME query file
    (``<temp_dir>/query_file``), and compared against ``motifs_db`` by
    running the TomTom executable. The query file is removed afterwards,
    even if TomTom or the parsing fails.

    Args:
        background: ACGT background probabilities, forwarded to
            ``write_meme_file`` of the trimmed pattern.
        tomtom_exec_path: path to (or name of) the TomTom executable.
        motifs_db: path to the motif database in MEME format.
        save_report: if True, TomTom writes its report into
            ``report_dir`` and results are read from ``tomtom.tsv``
            there; otherwise results are parsed from TomTom's text
            output on stdout.
        report_dir: output directory used when ``save_report`` is True.
        temp_dir: directory in which the temporary query file is created.
        trim_frac: passed to ``trim_pssm_idx`` as ``frac`` to pick the
            trimming boundaries from ``self.get_seq_ic()``.

    Returns:
        list: one dictionary per TomTom result row, with keys
        'Target ID', 'p-value', 'E-value' and 'q-value', in the order
        TomTom reported them.

    Raises:
        subprocess.CalledProcessError: if TomTom exits with a non-zero
            status.
    """
    fname = os.path.join(temp_dir, 'query_file')

    try:
        # trim and prepare meme file
        i, j = trim_pssm_idx(self.get_seq_ic(), frac=trim_frac)
        trimmed_pattern = self.trim(i, j)
        trimmed_pattern.write_meme_file(background, fname)

        # run tomtom
        # NOTE: the command goes through the shell, so paths containing
        # spaces or shell metacharacters must be quoted by the caller.
        if save_report:
            cmd = '{0} -no-ssc -oc {1} -verbosity 1 -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 {2} {3}'.format(tomtom_exec_path, report_dir, fname, motifs_db)
            print(cmd)
            subprocess.check_output(cmd, shell=True)
            # tomtom.tsv ends with '#' comment lines; skip them so they
            # do not turn into all-NaN result rows.
            df = pd.read_table(os.path.join(report_dir, "tomtom.tsv"), comment='#')
            df = df[['Target_ID', 'p-value', 'E-value', 'q-value']]
            schema = list(df.columns)
            # DataFrame.get_values() was removed in pandas 1.0.
            dat = df.values
        else:
            cmd = "{0} -no-ssc -oc . -verbosity 1 -text -min-overlap 5 -mi 1 -dist pearson -evalue -thresh 10.0 {1} {2}".format(tomtom_exec_path, fname, motifs_db)
            print(cmd)
            out = subprocess.check_output(cmd, shell=True)
            dat = [x.split('\t') for x in out.strip().decode("utf-8").split('\n')]
            schema = dat[0]
            dat = dat[1:]

        tget_idx = schema.index('Target_ID')
        pval_idx = schema.index('p-value')
        eval_idx = schema.index('E-value')
        qval_idx = schema.index('q-value')

        r = []
        for t in dat:
            # A short row (e.g. a blank or trailing comment line in the
            # text output) marks the end of the result table.
            if len(t) < 4:
                break
            r.append({
                'Target ID': t[tget_idx],
                'p-value': float(t[pval_idx]),
                'E-value': float(t[eval_idx]),
                'q-value': float(t[qval_idx]),
            })
    finally:
        # Portable cleanup that also runs when TomTom fails.
        if os.path.exists(fname):
            os.remove(fname)
    return r
def append_pwm_to_database(self, pwm_db, trim_frac=0.08):
    """Append this pattern's trimmed PWM to a plain-text PWM database.

    The entry is stored as a ``>PWM_<self.short_name>`` header line
    followed by one line of four values per position, taken from
    ``get_seq_ic()`` of the trimmed pattern. If ``pwm_db`` already
    contains a header with exactly that name, a message is printed and
    the file is left untouched.

    Args:
        pwm_db: path of the database file to create or append to.
        trim_frac: passed to ``trim_pssm_idx`` as ``frac`` to pick the
            trimming boundaries from ``self.get_seq_ic()``.

    Returns:
        None
    """
    pwm_name = "PWM_{}".format(self.short_name)

    if os.path.exists(pwm_db):
        # search whether the pwm already exists in the database
        with open(pwm_db, 'r') as fp:
            for line in fp:
                # Match the name on ">name" header lines exactly; a
                # substring test would confuse PWM_m1 with PWM_m10.
                if line.startswith(">") and line[1:].split()[:1] == [pwm_name]:
                    print("{} already exists".format(pwm_name))
                    return None

    i, j = trim_pssm_idx(self.get_seq_ic(), frac=trim_frac)
    trimmed_pattern = self.trim(i, j)
    pssm = trimmed_pattern.get_seq_ic()

    with open(pwm_db, 'a') as fp:
        fp.write(">{}\n".format(pwm_name))
        for row in pssm:
            fp.write('%.5f %.5f %.5f %.5f\n' % tuple(row))
    return None
def get_trim_idx(self, pattern):
    """Compute the trimming indices for ``pattern``.

    ``pattern`` is split on "/" and the pieces are passed positionally
    to ``self.mr.get_pssm``; the resulting PSSM is handed to
    ``trim_pssm_idx`` together with ``self.trim_frac`` as ``frac``.

    Returns:
        Whatever ``trim_pssm_idx`` returns for that PSSM.
    """
    fetch_pssm = self.mr.get_pssm
    pssm = fetch_pssm(*pattern.split("/"))
    fraction = self.trim_frac
    return trim_pssm_idx(pssm, frac=fraction)
def _trim_seq_ic_ij(self, trim_frac=0.0):
    """Return the result of ``trim_pssm_idx`` on ``self.get_seq_ic()``.

    Args:
        trim_frac: passed to ``trim_pssm_idx`` as ``frac``.
    """
    seq_ic = self.get_seq_ic()
    return trim_pssm_idx(seq_ic, frac=trim_frac)