def parse(self, fo):
    """
    Convert ChIPMunk output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing ChIPMunk output.

    Returns
    -------
    motifs : list
        List of Motif instances. Empty when no line starting with ``A|``
        is found.
    """
    # Skip ahead to the "A|" row. readline() returns "" at end-of-file, so
    # the loop must also stop there instead of spinning forever when the
    # output contains no matrix.
    line = fo.readline()
    while line and not line.startswith("A|"):
        line = fo.readline()
    if not line:
        return []

    # The matrix is read as four consecutive rows starting at the "A|" row;
    # the values follow the "|" and are separated by single spaces.
    rows = []
    for _ in range(4):
        rows.append([float(x) for x in line.strip().split("|")[1].split(" ")])
        line = fo.readline()

    # Transpose: four rows of N values become N rows of four values.
    matrix = [[rows[n][pos] for n in range(4)] for pos in range(len(rows[0]))]

    m = Motif(matrix)
    m.id = "ChIPMunk_w%s" % len(m)
    return [m]
def parse_out(self, fo):
    """
    Build motifs from tab-separated site records.

    Lines starting with ``#`` and lines of at most one character are
    skipped. For every other line the ninth tab-separated column must hold
    two ``;``-separated ``key "value"`` pairs: the motif id followed by the
    site sequence. Sites containing ``N`` (case-insensitive) are ignored.

    Parameters
    ----------
    fo : file-like
        File object containing the site records.

    Returns
    -------
    motifs : list
        One Motif per motif id, with ``align``, ``pwm``, ``pfm`` and ``id``
        set from the collected sites.
    """
    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    pseudo = 0.0  # Should be 1/sqrt(# of seqs)

    # Collect the sites of every motif, keyed by motif id.
    sites_by_motif = {}
    for line in fo.readlines():
        if line.startswith("#") or len(line) <= 1:
            continue
        fields = line.strip().split("\t")
        attributes = [
            item.strip().split(" ")[1].replace('"', "")
            for item in fields[8].split(";")
            if item
        ]
        motif_id, site = attributes
        if "N" not in site.upper():
            sites_by_motif.setdefault(motif_id, []).append(site)

    motifs = []
    for motif_id, sites in sites_by_motif.items():
        # Count matrix: one row per position, one column per nucleotide.
        counts = [[0] * 4 for _ in range(len(sites[0]))]
        for site in sites:
            for pos, base in enumerate(site):
                counts[pos][base_index[base]] += 1

        n_sites = float(len(sites))
        freqs = [
            [(count + pseudo / 4) / n_sites + pseudo for count in position]
            for position in counts
        ]

        motif = Motif()
        motif.align = sites[:]
        motif.pwm = freqs[:]
        motif.pfm = counts[:]
        motif.id = motif_id
        motifs.append(motif)
    return motifs
def match(args):
    """
    Print the closest database match for every sample motif.

    Motifs are read from ``args.pwmfile`` and ``args.dbpwmfile``, compared
    with ``MotifComparer`` ("partial" match, "wic" metric, "mean" combine)
    and reported as a tab-separated table on stdout. When ``args.img`` is
    set, every motif/match pair is additionally passed to ``match_plot``.

    Parameters
    ----------
    args : object
        Must provide the attributes ``pwmfile``, ``dbpwmfile`` and ``img``.
    """
    sample = dict((m.id, m) for m in pwmfile_to_motifs(args.pwmfile))
    db = dict((m.id, m) for m in pwmfile_to_motifs(args.dbpwmfile))

    mc = MotifComparer()
    result = mc.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean")

    # print() with one pre-formatted string argument is valid both as a
    # Python 2 statement and as a Python 3 function call; the former bare
    # print statements were a SyntaxError on Python 3.
    print("Motif\tMatch\tScore\tP-value")
    for motif, best in result.items():
        pval, pos, orient = mc.compare_motifs(
            sample[motif], db[best[0]], "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (motif, best[0], best[1][0], pval))

    if args.img:
        plotdata = []
        for query, best in result.items():
            motif = sample[query]
            dbmotif = db[best[0]]
            pval, pos, orient = mc.compare_motifs(
                motif, dbmotif, "partial", "wic", "mean", pval=True)

            # Use the reverse complement of the match, keeping its id.
            if orient == -1:
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp

            # Left-pad one of the two motifs with uniform columns according
            # to the reported offset; the padded copy keeps the original id.
            if pos < 0:
                tmp = motif.id
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
                motif.id = tmp
            elif pos > 0:
                tmp = dbmotif.id
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
                dbmotif.id = tmp

            plotdata.append((motif, dbmotif, pval))
        match_plot(plotdata, args.img)
def test6_pcc(self):
    # Two motifs built from identical 4-position count matrices; the first
    # element of max_pcc() is expected to be 4.
    rows = [[5, 0, 0, 0], [0, 5, 0, 0], [0, 5, 0, 0], [0, 0, 0, 5]]
    first = Motif([list(row) for row in rows])
    second = Motif([list(row) for row in rows])

    self.assertEqual(4, first.max_pcc(second)[0])
def parse(self, fo):
    """
    Convert MDmodule output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing MDmodule output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    # Frequency row: an integer followed by four decimal percentages.
    freq_row = re.compile(
        r"(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")
    # Site row: ">...", a "b"/"f" token with digits, then the site sequence.
    site_row = re.compile(r">.+\s+[bf]\d+\s+(\w+)")

    motifs = []
    current_id = ""
    freqs, counts, sites = [], [], []

    for line in fo.readlines():
        if line.startswith("Motif"):
            # A new header closes the motif collected so far.
            if current_id:
                motif = Motif()
                motifs.append(motif)
                motif.id = current_id
                motif.pwm = freqs
                motif.pfm = counts
                motif.align = sites
                freqs, counts, sites = [], [], []
            current_id = line.split("\t")[0]
            continue

        hit = freq_row.search(line)
        if hit:
            # Percentages are stored as fractions.
            freqs.append([float(hit.group(col)) / 100 for col in (2, 3, 4, 5)])

        hit = site_row.search(line)
        if hit:
            site = hit.group(1)
            if not counts:
                counts = [[0] * 4 for _ in range(len(site))]
            for pos, base in enumerate(site):
                counts[pos][base_index[base]] += 1
            sites.append(site)

    # The last motif has no following header; emit it if it has a matrix.
    if freqs:
        motif = Motif()
        motifs.append(motif)
        motif.id = current_id
        motif.pwm = freqs
        motif.pfm = counts
        motif.align = sites

    return motifs
def test5_motif_to_img(self):
    """ Motif to img """
    seqlogo = which("seqlogo")
    if not seqlogo:
        # Without the seqlogo executable there is nothing to check.
        print("seqlogo not found, skipping.")
        return

    image = "test/test.png"
    motif = Motif(self.pfm)
    motif.to_img(image, fmt="png", seqlogo=seqlogo)
    self.assertTrue(os.path.exists(image))
    # Remove the generated file again.
    os.unlink(image)
def test8_pwm_to_str(self):
    # _pwm_to_str() output is checked at two precisions: tab-separated
    # values, one matrix row per line.
    motif = Motif([[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]])
    expected = (
        (2, "0.01\t0.01\t0.01\t0.97\n0.12\t0.46\t0.22\t0.20"),
        (3, "0.010\t0.010\t0.010\t0.970\n0.123\t0.456\t0.222\t0.199"),
    )
    for precision, text in expected:
        self.assertEqual(text, motif._pwm_to_str(precision=precision))
def parse(self, fo):
    """
    Convert MotifSampler output to motifs

    Header lines have the form ``#key = value``; the keys ``Consensus``,
    ``W`` and ``ID`` are required for every motif. Matrix rows are
    tab-separated floats and a blank line terminates a motif.

    Parameters
    ----------
    fo : file-like
        File object containing MotifSampler output.

    Returns
    -------
    motifs : list
        List of Motif instances.

    Raises
    ------
    KeyError
        If a matrix is found without the required header keys.
    """
    motifs = []
    pwm = []
    info = {}

    # The trailing "" sentinel acts as a final blank line, so the last
    # matrix is emitted even when the file does not end with one.
    for line in list(fo.readlines()) + [""]:
        if line.startswith("#"):
            vals = line.strip()[1:].split(" = ")
            if len(vals) > 1:
                info[vals[0]] = vals[1]
        elif line.strip():
            pwm.append([float(x) for x in line.strip().split("\t")])
        elif pwm:
            # Blank line: close the current motif. Blank lines without a
            # pending matrix (leading or repeated ones) are ignored rather
            # than producing empty motifs or a KeyError.
            motif = Motif()
            motif.consensus = info["Consensus"]
            motif.width = info["W"]
            motif.id = info["ID"]
            motif.pwm = pwm
            motifs.append(motif)
            pwm = []

    return motifs
def test11_slice_motif(self):
    pfm = [
        [120, 0, 0, 0],
        [120, 0, 0, 0],
        [0, 60, 60, 0],
        [0, 0, 0, 120],
        [0, 0, 0, 120],
    ]
    motif = Motif(pfm)
    motif.to_consensus()

    # take slice: drop the first and the last position
    inner = slice(1, -1)
    sliced = motif[inner]

    self.assertEqual("AST", sliced.consensus.upper())
    self.assertEqual(pfm[inner], sliced.pfm)
def get_gimmemotif(self):
    """
    Get gimmemotif object for motif

    Reads counts from self.counts, stores the number of positions in
    ``self.length`` and the generated Motif in ``self.gimme_obj``.

    Returns
    -------
    self
        The instance itself.
    """
    self.length = len(self.counts[0])

    # each row represents one position in motif ( A C G T )
    motif_rows = [
        [self.counts[letter][pos_id] for letter in range(4)]
        for pos_id in range(self.length)
    ]

    # generate gimmemotif motif instance
    self.gimme_obj = Motif(motif_rows)
    self.gimme_obj.id = self.id + " " + self.name
    return self
def parse(self, fo):
    """
    Parse "Motif"-delimited sections into Motif objects.

    Every line starting with ``Motif`` opens a new section whose id is the
    text before the first tab. Within a section, frequency rows (an integer
    followed by four decimal percentages) build ``pwm`` and site rows build
    ``pfm`` and ``align``.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    column_of = {"A": 0, "C": 1, "G": 2, "T": 3}
    percent_row = re.compile(
        r'(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
    aligned_site = re.compile(r'>.+\s+[bf]\d+\s+(\w+)')

    motifs = []
    motif_id = ""
    pwm = []
    pfm = []
    align = []

    for line in fo.readlines():
        if not line.startswith("Motif"):
            found = percent_row.search(line)
            if found:
                # Percentages are converted to fractions.
                pwm.append([float(value) / 100 for value in found.groups()[1:]])

            found = aligned_site.search(line)
            if found:
                site = found.group(1)
                if not pfm:
                    pfm = [[0] * 4 for _ in site]
                for pos, base in enumerate(site):
                    pfm[pos][column_of[base]] += 1
                align.append(site)
        else:
            # Header of the next section: store the previous one first.
            if motif_id:
                finished = Motif()
                motifs.append(finished)
                finished.id = motif_id
                finished.pwm = pwm
                finished.pfm = pfm
                finished.align = align
                pwm, pfm, align = [], [], []
            motif_id = line.split("\t")[0]

    # Store the final section when it contains frequency rows.
    if pwm:
        finished = Motif()
        motifs.append(finished)
        finished.id = motif_id
        finished.pwm = pwm
        finished.pfm = pfm
        finished.align = align

    return motifs
def parse(self, fo):
    """
    Convert "Motif #"-delimited percentage matrices to motifs.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output. Matrix rows start with an
        integer followed by four decimal percentages.

    Returns
    -------
    motifs : list
        List of Motif instances with ids of the form
        ``BioProspector_w<width>_<number>``.
    """
    row_pattern = re.compile(
        r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

    motifs = []
    rows = []
    motif_id = ""

    for line in fo.readlines():
        if not line.startswith("Motif #"):
            found = row_pattern.search(line)
            if found:
                # Percentages are stored as fractions.
                rows.append([float(value) / 100.0 for value in found.groups()])
            continue

        # A header closes the matrix collected so far (if any).
        if rows:
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), motif_id)
            motifs.append(motif)
        # The id is the text between "#" and ":" on the header line.
        motif_id = line.split("#")[1].split(":")[0]
        rows = []

    # The last matrix is not followed by a header.
    if rows:
        motif = Motif(rows)
        motif.id = "BioProspector_w%s_%s" % (len(motif), motif_id)
        motifs.append(motif)

    return motifs
def test9_logodds_matrix(self):
    # The logodds attribute of a two-position motif is compared against
    # reference values with 5 decimals of precision.
    motif = Motif([[0.5, 0.4, 0.1, 0.0], [0.25, 0.25, 0.25, 0.25]])
    expected = np.array([
        [0.69813, 0.47623, -0.89160, -4.60517],
        [0.00995, 0.00995, 0.00995, 0.00995],
    ])
    observed = np.array(motif.logodds)

    np.testing.assert_almost_equal(expected, observed, decimal=5)
def parse(self, fo):
    """
    Convert six-line matrix records to motifs.

    Every record consists of six lines: the first carries a label, the
    second is ignored and the remaining four are tab-separated rows of
    floats that are transposed into one row per position.

    Parameters
    ----------
    fo : file-like
        File object containing the records.

    Returns
    -------
    motifs : list
        List of Motif instances, trimmed with ``trim(0.25)`` and named
        ``<self.name>_<1-based index>``.
    """
    motifs = []
    while True:
        record = [fo.readline() for _ in range(6)]
        if not record[0]:
            # readline() returned "": no further record.
            break

        rows = [
            [float(value) for value in text.strip().split("\t")]
            for text in record[2:]
        ]
        # Transpose: four rows of N values become N rows of four values.
        positions = [[rows[n][pos] for n in range(4)] for pos in range(len(rows[0]))]

        motif = Motif(positions)
        motif.id = record[0].strip().split(" ")[-1]
        motifs.append(motif)

    # Replace the parsed labels by sequential ids and trim the motifs.
    for number, motif in enumerate(motifs, 1):
        motif.id = "%s_%s" % (self.name, number)
        motif.trim(0.25)

    return motifs
def match(args):
    """
    Print the best database matches for every sample motif.

    Motifs are read from ``args.pfmfile`` and ``args.dbpfmfile``; the
    ``args.nmatches`` best matches per motif are determined with
    ``MotifComparer`` ("partial" match, "seqcor" metric, "mean" combine)
    and reported as a tab-separated table on stdout. When ``args.img`` is
    set, every motif/match pair is aligned, padded to equal length and
    passed to ``match_plot``.

    Parameters
    ----------
    args : object
        Must provide the attributes ``pfmfile``, ``dbpfmfile``,
        ``nmatches`` and ``img``.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for best in matches:
            pval, pos, orient = mc.compare_motifs(
                sample[motif_name], db[best[0]], "partial", "seqcor", "mean", pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, best[0], best[1][0], pval))

            if not args.img:
                continue

            motif = sample[motif_name]
            dbmotif = db[best[0]]

            # Use the reverse complement of the match, keeping its id.
            if orient == -1:
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp

            # Left-pad one of the two motifs with uniform columns according
            # to the reported offset.
            if pos < 0:
                tmp = motif.id
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
                motif.id = tmp
            elif pos > 0:
                tmp = dbmotif.id
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
                dbmotif.id = tmp

            # Right-pad the shorter motif so both have the same length. The
            # id is carried over to the padded copy, as in the steps above;
            # previously it was dropped here. Nothing is rebuilt when the
            # lengths already agree.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                tmp = dbmotif.id
                dbmotif = Motif(dbmotif.pwm + [[0.25, 0.25, 0.25, 0.25]] * diff)
                dbmotif.id = tmp
            elif diff < 0:
                tmp = motif.id
                motif = Motif(motif.pwm + [[0.25, 0.25, 0.25, 0.25]] * -diff)
                motif.id = tmp

            plotdata.append((motif, dbmotif, pval))

    if args.img:
        match_plot(plotdata, args.img)
def parse(self, fo):
    """
    Convert a four-row, space-separated matrix to a single motif.

    Parameters
    ----------
    fo : file-like
        File object whose first four lines each hold one row of floats
        separated by single spaces.

    Returns
    -------
    motifs : list
        List with one Motif whose id is ``self.name``.
    """
    rows = []
    for _ in range(4):
        rows.append([float(token) for token in fo.readline().strip().split(" ")])

    # Transpose: four rows of N values become N rows of four values.
    matrix = [[row[pos] for row in rows] for pos in range(len(rows[0]))]

    motif = Motif(matrix)
    motif.id = self.name
    return [motif]
def parse(self, fo):
    """
    Convert six-line matrix records to motifs.

    Each record is six lines long: a label line, one ignored line and four
    tab-separated rows of floats, which are transposed into one row per
    position.

    Parameters
    ----------
    fo : file-like
        File object containing the records.

    Returns
    -------
    motifs : list
        List of Motif instances, trimmed with ``trim(0.25)`` and named
        ``<self.name>_<1-based index>``.
    """

    def next_record():
        # Read the six lines of one record; "" entries signal end-of-file.
        return [fo.readline() for _ in range(6)]

    motifs = []
    record = next_record()
    while record[0]:
        by_nucleotide = [
            [float(cell) for cell in row.strip().split("\t")] for row in record[2:]
        ]
        width = len(by_nucleotide[0])
        by_position = [[by_nucleotide[n][col] for n in range(4)] for col in range(width)]

        parsed = Motif(by_position)
        parsed.id = record[0].strip().split(" ")[-1]
        motifs.append(parsed)
        record = next_record()

    # Assign sequential ids and trim every motif.
    index = 0
    for motif in motifs:
        index += 1
        motif.id = "%s_%s" % (self.name, index)
        motif.trim(0.25)

    return motifs
def parse(self, fo):
    """
    Convert GADEM output to motifs

    The output is read in blocks of five lines: a header line followed by
    four matrix rows. When a file ``<number>.seq`` (number taken from the
    header) exists in the working directory, its sites are used to fill
    ``pfm`` and ``align``.

    Parameters
    ----------
    fo : file-like
        File object containing GADEM output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    lines = fo.readlines()
    motifs = []

    for start in range(0, len(lines), 5):
        # Header without its first character is the motif id; the number
        # is the part before the first "_" minus its first character.
        m_id = lines[start].strip()[1:]
        number = m_id.split("_")[0][1:]
        seq_file = "%s.seq" % number

        sites = []
        counts = []
        if os.path.exists(seq_file):
            with open(seq_file) as handle:
                for raw in handle:
                    # Lines containing "x" or "n" are not used as sites.
                    if "x" in raw or "n" in raw:
                        continue
                    site = raw.strip().upper()
                    sites.append(site)
                    if not counts:
                        counts = [[0] * 4 for _ in range(len(site))]
                    for pos, base in enumerate(site):
                        counts[pos][base_index[base]] += 1

        # Matrix rows: the tab-separated values follow the first space.
        rows = [
            text.strip().split(" ")[1].split("\t")
            for text in lines[start + 1:start + 5]
        ]
        freqs = [[float(rows[n][pos]) for n in range(4)] for pos in range(len(rows[0]))]

        motif = Motif(freqs)
        motifs.append(motif)
        motif.id = "{}_{}".format(self.name, m_id)
        if sites:
            motif.pfm = counts
            motif.align = sites

    return motifs
def parse(self, fo):
    """
    Convert ChIPMunk output to motifs

    Example of the relevant part of the output::

        KDIC|6.124756232026243
        A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
        C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
        G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
        T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
        N|999.9999999999998

    Parameters
    ----------
    fo : file-like
        File object containing ChIPMunk output.

    Returns
    -------
    motifs : list
        List with a single Motif, or an empty list when no line starting
        with ``A|`` is found.
    """
    # readline() returns "" at end-of-file; without the explicit check the
    # search for the "A|" row never terminates on output without a matrix.
    line = fo.readline()
    while line and not line.startswith("A|"):
        line = fo.readline()
    if not line:
        return []

    # Four consecutive rows (A, C, G, T in the example above); the values
    # follow the "|" and are separated by single spaces.
    matrix = []
    for _ in range(4):
        matrix.append([float(x) for x in line.strip().split("|")[1].split(" ")])
        line = fo.readline()

    # Transpose to one row per position.
    matrix = [[matrix[x][y] for x in range(4)] for y in range(len(matrix[0]))]

    m = Motif(matrix)
    m.id = "ChIPMunk_w%s" % len(m)
    return [m]
def match(args):
    """
    Print the closest database match for every sample motif.

    Motifs are read from ``args.pwmfile`` and ``args.dbpwmfile``, compared
    with ``MotifComparer`` ("partial" match, "wic" metric, "mean" combine)
    and reported as a tab-separated table on stdout. When ``args.img`` is
    set, every motif/match pair is additionally passed to ``match_plot``.

    Parameters
    ----------
    args : object
        Must provide the attributes ``pwmfile``, ``dbpwmfile`` and ``img``.
    """

    def uniform_columns(count):
        # `count` uniform positions used to pad a motif on the left.
        return [[0.25, 0.25, 0.25, 0.25]] * count

    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    comparer = MotifComparer()
    result = comparer.get_closest_match(
        sample.values(), db.values(), "partial", "wic", "mean")

    print("Motif\tMatch\tScore\tP-value")
    for name, hit in result.items():
        pval, pos, orient = comparer.compare_motifs(
            sample[name], db[hit[0]], "partial", "wic", "mean", pval=True)
        print("%s\t%s\t%0.2f\t%0.3e" % (name, hit[0], hit[1][0], pval))

    if not args.img:
        return

    plotdata = []
    for query, hit in result.items():
        motif = sample[query]
        dbmotif = db[hit[0]]
        pval, pos, orient = comparer.compare_motifs(
            motif, dbmotif, "partial", "wic", "mean", pval=True)

        if orient == -1:
            # Plot the reverse complement of the match under its own id.
            original_id = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = original_id

        if pos < 0:
            original_id = motif.id
            motif = Motif(uniform_columns(-pos) + motif.pwm)
            motif.id = original_id
        elif pos > 0:
            original_id = dbmotif.id
            dbmotif = Motif(uniform_columns(pos) + dbmotif.pwm)
            dbmotif.id = original_id

        plotdata.append((motif, dbmotif, pval))

    match_plot(plotdata, args.img)
def test_motif_export_import(self):
    pfm = [
        [120, 0, 0, 0],
        [120, 0, 0, 0],
        [0, 60, 60, 0],
        [0, 0, 0, 120],
        [0, 0, 0, 120],
    ]
    motif = Motif(pfm)
    motif.id = "test_motif"

    # (export method, format passed to read_motifs) pairs; the motevo
    # export is read back with fmt="transfac".
    round_trips = (
        ("to_transfac", "transfac"),
        ("to_meme", "meme"),
        ("to_motevo", "transfac"),
    )
    for exporter, fmt in round_trips:
        exported = StringIO(getattr(motif, exporter)())
        motif_from_file = read_motifs(exported, fmt=fmt)[0]
        self.assertEqual("AASTT", motif_from_file.to_consensus().upper())
        self.assertEqual("test_motif", motif_from_file.id)
def parse(self, fo):
    """
    Convert MEME output to motifs

    A motif starts at a ``MOTIF ... MEME-<n> width = <w> sites = <s>`` line
    and normally ends at the next line starting with ``//``. Site sequences
    are taken from lines matching ``)`` followed by whitespace and a run of
    capital letters. Letters other than A, C, G and T add 0.25 to every
    nucleotide count at their position.

    Parameters
    ----------
    fo : file-like
        File object containing MEME output.

    Returns
    -------
    motifs : list
        List of Motif instances. Motif sections without any site are
        skipped.
    """
    motifs = []
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    p = re.compile(
        r"MOTIF.+MEME-(\d+)\s*width\s*=\s*(\d+)\s+sites\s*=\s*(\d+)")
    pa = re.compile(r"\)\s+([A-Z]+)")

    line = fo.readline()
    while line:
        m = p.search(line)
        if m:
            motif_id = "%s_%s_w%s" % (self.name, m.group(1), m.group(2))
            align = []
            pfm = None
            # Also stop at end-of-file (readline() returns ""), otherwise a
            # section without a closing "//" line loops forever.
            while line and not line.startswith("//"):
                ma = pa.search(line)
                if ma:
                    site = ma.group(1)
                    align.append(site)
                    if not pfm:
                        pfm = [[0 for _ in range(4)] for _ in range(len(site))]
                    for pos, base in enumerate(site):
                        if base in nucs:
                            pfm[pos][nucs[base]] += 1
                        else:
                            # Unknown letter: spread one count evenly.
                            for i in range(4):
                                pfm[pos][i] += 0.25
                line = fo.readline()

            # Without sites there is no count matrix to build a motif from.
            if align:
                motifs.append(Motif(pfm))
                motifs[-1].id = motif_id
                motifs[-1].align = align
        line = fo.readline()

    return motifs
def parse(self, fo):
    """
    Convert ChIPMunk output to motifs

    Example of the relevant part of the output::

        KDIC|6.124756232026243
        A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
        C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
        G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
        T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
        N|999.9999999999998

    Parameters
    ----------
    fo : file-like
        File object containing ChIPMunk output.

    Returns
    -------
    motifs : list
        List with a single Motif, or an empty list when no line starting
        with ``A|`` is found.
    """
    # Look for the first matrix row. An exhausted file yields "" from
    # readline(), which has to end the search as well; otherwise input
    # without a matrix hangs the parser.
    line = fo.readline()
    while line:
        if line.startswith("A|"):
            break
        line = fo.readline()
    else:
        return []

    # Read the four rows (A, C, G, T in the example above).
    by_nucleotide = []
    for _ in range(4):
        values = line.strip().split("|")[1].split(" ")
        by_nucleotide.append([float(x) for x in values])
        line = fo.readline()

    # Transpose to one row per position.
    width = len(by_nucleotide[0])
    matrix = [[by_nucleotide[n][pos] for n in range(4)] for pos in range(width)]

    m = Motif(matrix)
    m.id = "ChIPMunk_w%s" % len(m)
    return [m]
def parse(self, fo):
    """
    Convert "Motif ...:"-delimited weight matrices to motifs.

    A line that starts with ``Motif`` and ends with ``:`` opens a new
    motif; its id is ``<self.name>_<text before the first ":">``. Matrix
    rows are lines containing an integer followed by four numeric columns.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances; empty when the input contains neither a
        motif header nor a matrix row.
    """
    motifs = []
    # Example of a non-matrix line in the output:
    # 160: 112 CACGTGC 7.25 chr14:32308489-32308689
    p = re.compile(r'\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)')
    wm = []
    name = ""
    for line in fo.readlines():
        if line.startswith("Motif") and line.strip().endswith(":"):
            # A new header closes the motif collected so far.
            if name:
                motifs.append(Motif(wm))
                motifs[-1].id = name
                wm = []
            name = "%s_%s" % (self.name, line.split(":")[0])
        else:
            m = p.search(line)
            if m:
                wm.append([float(m.group(x)) for x in range(1, 5)])

    # Emit the trailing motif only if something was parsed; previously an
    # anonymous, empty motif was returned for input without any motif.
    if name or wm:
        motifs.append(Motif(wm))
        motifs[-1].id = name
    return motifs
def parse(self, fo):
    """
    Convert AMD output to motifs

    A line that starts with ``Motif`` and ends with ``:`` opens a new
    motif; its id is ``<self.name>_<text before the first ":">``. Matrix
    rows are lines containing an integer followed by four numeric columns.

    Parameters
    ----------
    fo : file-like
        File object containing AMD output.

    Returns
    -------
    motifs : list
        List of Motif instances; empty when the output contains neither a
        motif header nor a matrix row.
    """
    motifs = []
    # Example of a non-matrix line in the output:
    # 160: 112 CACGTGC 7.25 chr14:32308489-32308689
    p = re.compile(r"\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)")
    wm = []
    name = ""

    for line in fo.readlines():
        is_header = line.startswith("Motif") and line.strip().endswith(":")
        if not is_header:
            m = p.search(line)
            if m:
                wm.append([float(m.group(x)) for x in range(1, 5)])
            continue

        # A new header closes the motif collected so far.
        if name:
            motifs.append(Motif(wm))
            motifs[-1].id = name
            wm = []
        name = "%s_%s" % (self.name, line.split(":")[0])

    # Emit the trailing motif only if something was parsed; previously an
    # anonymous, empty motif was returned for output without any motif.
    if name or wm:
        motifs.append(Motif(wm))
        motifs[-1].id = name

    return motifs
def parse(self, fo):
    """
    Convert "Motif #"-delimited percentage matrices to motifs.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output. Matrix rows start with an
        integer followed by four decimal percentages.

    Returns
    -------
    motifs : list
        List of Motif instances with ids of the form
        ``BioProspector_w<width>_<number>``.
    """
    matrix_row = re.compile(r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')

    # Group the matrix rows by the header they follow: [id, rows] pairs.
    sections = [["", []]]
    for line in fo.readlines():
        if line.startswith("Motif #"):
            # The id is the text between "#" and ":" on the header line.
            sections.append([line.split("#")[1].split(":")[0], []])
            continue
        found = matrix_row.search(line)
        if found:
            # Percentages are stored as fractions.
            sections[-1][1].append(
                [float(found.group(col)) / 100.0 for col in (1, 2, 3, 4)])

    # Sections without rows do not yield a motif.
    motifs = []
    for motif_id, rows in sections:
        if rows:
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), motif_id)
            motifs.append(motif)
    return motifs
def parse(self, fo, width, seed=None):
    """
    Convert Posmo output to motifs

    The output is read in records of six lines: a label line, one ignored
    line and four tab-separated rows of floats.

    Parameters
    ----------
    fo : file-like
        File object containing Posmo output.
    width : int or str
        Value inserted after ``_w`` in the motif ids.
    seed : optional
        When truthy, appended to the width in the motif ids.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    motifs = []
    while True:
        record = [fo.readline() for _ in range(6)]
        if not record[0]:
            # readline() returned "": no further record.
            break

        rows = [
            [float(value) for value in text.strip().split("\t")]
            for text in record[2:]
        ]
        # Transpose: four rows of N values become N rows of four values.
        positions = [[rows[n][pos] for n in range(4)] for pos in range(len(rows[0]))]

        motif = Motif(positions)
        motif.trim(0.1)
        motif.id = record[0].strip().split(" ")[-1]
        motifs.append(motif)

    # Assign the final ids and trim a second time with a higher cutoff.
    for number, motif in enumerate(motifs, 1):
        motif.id = (
            "%s_w%s.%s_%s" % (self.name, width, seed, number)
            if seed
            else "%s_w%s_%s" % (self.name, width, number)
        )
        motif.trim(0.25)

    return motifs
def test8_pwm_to_str_hash(self):
    # The same matrix written with and without trailing zeros must give
    # the same hash.
    expected = "1f260320cac8c26a"
    matrices = (
        [[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]],
        [
            [0.010000, 0.010000, 0.010000, 0.970000],
            [0.12300, 0.45600, 0.22200, 0.19900],
        ],
    )
    for pwm in matrices:
        self.assertEqual(expected, Motif(pwm).hash())
def parse_out(self, fo):
    """
    Convert MotifSampler output to motifs

    Lines starting with ``#`` and lines of at most one character are
    skipped. The ninth tab-separated column of every other line must hold
    two ``;``-separated ``key "value"`` pairs: the motif id followed by the
    site sequence. Sites containing ``N`` (case-insensitive) are ignored.

    Parameters
    ----------
    fo : file-like
        File object containing MotifSampler output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    pseudo = 0.0  # Should be 1/sqrt(# of seqs)

    def unquoted_value(pair):
        # 'key "value"' -> 'value'
        return pair.strip().split(" ")[1].replace('"', "")

    # Pass 1: group the sites by motif id.
    aligns = {}
    for line in fo.readlines():
        if line.startswith("#"):
            continue
        if len(line) > 1:
            columns = line.strip().split("\t")
            m_id, site = [unquoted_value(pair) for pair in columns[8].split(";") if pair]
            if site.upper().find("N") < 0:
                aligns.setdefault(m_id, []).append(site)

    # Pass 2: build counts and frequencies for every motif.
    motifs = []
    for m_id, align in aligns.items():
        pfm = [[0, 0, 0, 0] for _ in align[0]]
        for site in align:
            for pos, base in enumerate(site):
                pfm[pos][nucs[base]] += 1

        total = float(len(align))
        pwm = []
        for counts in pfm:
            pwm.append([(count + pseudo / 4) / total + (pseudo) for count in counts])

        m = Motif()
        m.align = align[:]
        m.pwm = pwm[:]
        m.pfm = pfm[:]
        m.id = m_id
        motifs.append(m)

    return motifs
def parse(self, fo):
    """
    Convert blank-line separated, annotated matrices to motifs.

    Header lines have the form ``#key = value``; the keys ``Consensus``,
    ``W`` and ``ID`` are required for every motif. Matrix rows are
    tab-separated floats and a blank line terminates a motif.

    Parameters
    ----------
    fo : file-like
        File object containing the matrices.

    Returns
    -------
    motifs : list
        List of Motif instances.

    Raises
    ------
    KeyError
        If a matrix is found without the required header keys.
    """
    motifs = []
    pwm = []
    info = {}

    def flush():
        # Turn the pending rows into a Motif described by the header info.
        motif = Motif()
        motif.consensus = info["Consensus"]
        motif.width = info["W"]
        motif.id = info["ID"]
        motif.pwm = pwm[:]
        motifs.append(motif)
        del pwm[:]

    for line in fo.readlines():
        if line.startswith("#"):
            vals = line.strip()[1:].split(" = ")
            if len(vals) > 1:
                info[vals[0]] = vals[1]
        elif line.strip():
            pwm.append([float(x) for x in line.strip().split("\t")])
        elif pwm:
            # Blank line after a matrix. Blank lines without pending rows
            # (leading or repeated ones) are ignored instead of raising
            # KeyError or producing empty motifs.
            flush()

    # The last matrix may not be followed by a blank line.
    if pwm:
        flush()

    return motifs
def parse(self, fo):
    """
    Convert HMS output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing HMS output. The first four lines are read
        as rows of floats separated by single spaces.

    Returns
    -------
    motifs : list
        List of Motif instances (a single motif whose id is ``self.name``).
    """
    by_nucleotide = []
    for _ in range(4):
        tokens = fo.readline().strip().split(" ")
        by_nucleotide.append([float(token) for token in tokens])

    # Transpose: four rows of N values become N rows of four values.
    first, second, third, fourth = by_nucleotide
    matrix = [
        [first[pos], second[pos], third[pos], fourth[pos]]
        for pos in range(len(first))
    ]

    motif = Motif(matrix)
    motif.id = self.name
    return [motif]
def parse(self, fo):
    """
    Convert five-line matrix blocks to motifs.

    The input is read in blocks of five lines: a header line followed by
    four matrix rows. The motif id is the header without its first
    character. When a file ``<number>.seq`` (number derived from the id)
    exists in the working directory, its sites are used to fill ``pfm``
    and ``align``.

    Parameters
    ----------
    fo : file-like
        File object containing the matrix blocks.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    motifs = []
    nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
    lines = fo.readlines()

    for i in range(0, len(lines), 5):
        align = []
        pfm = []
        motif_id = lines[i].strip()[1:]
        number = motif_id.split("_")[0][1:]

        seq_file = "%s.seq" % number
        if os.path.exists(seq_file):
            # The with-statement closes the file; it used to be left open.
            with open(seq_file) as handle:
                for seq_line in handle:
                    # Lines containing "x" or "n" are not used as sites.
                    if "x" in seq_line or "n" in seq_line:
                        continue
                    site = seq_line.strip().upper()
                    align.append(site)
                    if not pfm:
                        pfm = [[0 for _ in range(4)] for _ in range(len(site))]
                    for pos, base in enumerate(site):
                        pfm[pos][nucs[base]] += 1

        # Matrix rows: the tab-separated values follow the first space.
        rows = [
            row.strip().split(" ")[1].split("\t") for row in lines[i + 1:i + 5]
        ]
        pwm = [[float(rows[x][y]) for x in range(4)] for y in range(len(rows[0]))]

        motifs.append(Motif(pwm))
        motifs[-1].id = motif_id
        # Only override the count matrix when sites were actually read.
        if align:
            motifs[-1].pfm = pfm
            motifs[-1].align = align

    return motifs
def parse(self, fo):
    """
    Convert site listings terminated by "====" lines to motifs.

    Site lines are collected until a line starting with ``====`` is found;
    the sites seen so far then form one motif named ``Weeder_<n>`` whose
    ``pfm`` holds the per-position nucleotide counts and whose ``pwm``
    holds those counts divided by the number of sites.

    Parameters
    ----------
    fo : file-like
        File object containing the tool output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    base_index = {"A": 0, "C": 1, "G": 2, "T": 3}
    site_line = re.compile(r'(\s*\d+\s+.\s+([ACGT]+)\s+.+\))')

    motifs = []
    sites = []
    for line in fo.readlines():
        found = site_line.search(line)
        if found:
            sites.append(found.group(2))
            continue
        if not line.startswith("===="):
            continue

        # Separator line: build a motif from the sites collected so far.
        motif = Motif()
        motifs.append(motif)
        motif.id = "Weeder_%s" % len(motifs)
        motif.align = sites[:]

        counts = [[0] * 4 for _ in range(len(sites[0]))]
        for site in sites:
            for pos, base in enumerate(site):
                counts[pos][base_index[base]] += 1

        n_sites = float(len(sites))
        motif.pwm = [[count / n_sites for count in position] for position in counts]
        motif.pfm = counts[:]
        sites = []

    return motifs
def parse(self, fo):
    """
    Convert Improbizer output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing Improbizer output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # Motif header: "<int> @ <float> sd <float> <word>" at the end of a line.
    header = re.compile(r"\d+\s+@\s+\d+\.\d+\s+sd\s+\d+\.\d+\s+(\w+)$")
    motifs = []

    line = fo.readline()
    # Parsing stops at end-of-file or at the first line containing "Color".
    while line and "Color" not in line:
        found = header.search(line)
        if found:
            # The next four lines hold one nucleotide each: a letter
            # followed by space-separated values.
            by_letter = {}
            for _ in range(4):
                tokens = [tok.strip() for tok in fo.readline().strip().split(" ") if tok]
                by_letter[tokens[0].upper()] = tokens[1:]

            pwm = [
                [float(by_letter[letter][pos]) for letter in ("A", "C", "G", "T")]
                for pos in range(len(by_letter["A"]))
            ]
            motif = Motif(pwm)
            motifs.append(motif)
            motif.id = "%s_%s" % (self.name, found.group(1))
        line = fo.readline()

    return motifs
def parse(self, fo):
    """
    Convert BioProspector output to motifs

    Parameters
    ----------
    fo : file-like
        File object containing BioProspector output.

    Returns
    -------
    motifs : list
        List of Motif instances.
    """
    # Matrix row: an integer followed by four decimal percentages.
    row_re = re.compile(
        r"^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")

    motifs = []

    def store(rows, number):
        # Append a motif for the collected rows; empty matrices are skipped.
        if rows:
            motif = Motif(rows)
            motif.id = "BioProspector_w%s_%s" % (len(motif), number)
            motifs.append(motif)

    pwm = []
    motif_id = ""
    for line in fo.readlines():
        if line.startswith("Motif #"):
            # A header closes the previous matrix and names the next one.
            store(pwm, motif_id)
            motif_id = line.split("#")[1].split(":")[0]
            pwm = []
        else:
            found = row_re.search(line)
            if found:
                # Percentages are stored as fractions.
                pwm.append([float(found.group(x)) / 100.0 for x in range(1, 5)])

    store(pwm, motif_id)
    return motifs
def cluster_motifs(motifs, match="total", metric="wic", combine="mean",
                   pval=True, threshold=0.95, trim_edges=False,
                   edge_ic_cutoff=0.2, include_bg=True, progress=True,
                   ncpus=None):
    """
    Clusters a set of sequence motifs.

    Motifs are merged pairwise, best-scoring pair first, into a binary
    tree of MotifTree nodes until a single root remains.

    Parameters
    ----------
    motifs : list or str
        Required. A file containing positional frequency matrices or an
        array (list) with motifs.
    match : str, optional
        'match', 'metric' and 'combine' specify the method used to compare
        and score the motifs. By default the WIC score is used
        (metric='wic'), using the score over the whole alignment
        (match='total'), with the total motif score calculated as the mean
        score of all positions (combine='mean'). 'match' can be either
        'total' for the total alignment or 'subtotal' for the maximum
        scoring subsequence of the alignment.
    metric : str, optional
        Can be any metric defined in MotifComparer, currently: 'pcc', 'ed',
        'distance', 'wic' or 'chisq'.
    combine : str, optional
        Determines how the total score is calculated from the score of
        individual positions and can be either 'sum' or 'mean'.
    pval : bool, optional
        Determines if the score should be converted to an empirical
        p-value.
    threshold : float, optional
        Determines the score (or p-value) cutoff.
    trim_edges : bool, optional
        If set to True, all motif edges with an IC below 'edge_ic_cutoff'
        will be removed before clustering.
    edge_ic_cutoff : float, optional
        IC cutoff used for trimming; it is always applied to averaged
        motifs.
    include_bg : bool, optional
        When computing the average of two motifs 'include_bg' determines
        if, at a position only present in one motif, the information in
        that motif should be kept, or if it should be averaged with
        background frequencies. Should probably be left set to True.
    progress : bool, optional
        Write progress information to stderr.
    ncpus : int, optional
        Passed on to ``MotifComparer.get_all_scores``.

    Returns
    -------
    root : MotifTree
        Root node of the cluster tree.
    """
    # First read pfm or pfm formatted motiffile
    if not isinstance(motifs, list):
        motifs = read_motifs(motifs, fmt="pwm")

    mc = MotifComparer()

    # Trim edges with low information content
    if trim_edges:
        for motif in motifs:
            motif.trim(edge_ic_cutoff)

    # Make a MotifTree node for every motif
    nodes = [MotifTree(m) for m in motifs]

    # Determine all pairwise scores and maxscore per motif
    scores = {}
    motif_nodes = dict((n.motif.id, n) for n in nodes)
    motifs = [n.motif for n in nodes]

    if progress:
        sys.stderr.write("Calculating initial scores\n")
    result = mc.get_all_scores(
        motifs, motifs, match, metric, combine, pval,
        parallel=True, ncpus=ncpus)

    for m1, other_motifs in result.items():
        for m2, score in other_motifs.items():
            if m1 == m2:
                # Self-comparison gives the maximum attainable score.
                if pval:
                    motif_nodes[m1].maxscore = 1 - score[0]
                else:
                    motif_nodes[m1].maxscore = score[0]
            else:
                # With p-values, 1 - p is stored so that higher is better.
                if pval:
                    score = [1 - score[0]] + score[1:]
                scores[(motif_nodes[m1], motif_nodes[m2])] = score

    cluster_nodes = list(nodes)
    ave_count = 1
    total = len(cluster_nodes)

    while len(cluster_nodes) > 1:
        # Pick the best-scoring pair whose members are both still unmerged.
        ranked = sorted(scores.keys(), key=lambda pair: scores[pair][0])
        i = -1
        (n1, n2) = ranked[i]
        while n1 not in cluster_nodes or n2 not in cluster_nodes:
            i -= 1
            (n1, n2) = ranked[i]

        # NOTE(review): if either motif is empty nothing is merged and the
        # same pair is selected again on the next iteration - confirm that
        # empty motifs cannot reach this point.
        if len(n1.motif) > 0 and len(n2.motif) > 0:
            (score, pos, orientation) = scores[(n1, n2)]
            ave_motif = n1.motif.average_motifs(
                n2.motif, pos, orientation, include_bg=include_bg)

            ave_motif.trim(edge_ic_cutoff)

            # Check if the motif is not empty
            if len(ave_motif) == 0:
                ave_motif = Motif([[0.25, 0.25, 0.25, 0.25]])

            ave_motif.id = "Average_%s" % ave_count
            ave_count += 1

            new_node = MotifTree(ave_motif)
            self_score = mc.compare_motifs(
                new_node.motif, new_node.motif, match, metric, combine, pval)[0]
            new_node.maxscore = 1 - self_score if pval else self_score

            new_node.mergescore = score
            n1.parent = new_node
            n2.parent = new_node
            new_node.left = n1
            new_node.right = n2

            # All remaining unmerged nodes (new_node is not in `nodes` yet).
            cmp_nodes = dict(
                (node.motif, node) for node in nodes if not node.parent)

            if progress:
                # Kept in its own variable; the percentage used to
                # overwrite the `progress` flag itself.
                percent = int((1 - len(cmp_nodes) / float(total)) * 100)
                sys.stderr.write('\rClustering [{0}{1}] {2}%'.format(
                    '#' * (percent // 10),
                    " " * (10 - percent // 10),
                    percent))

            # Score the new average motif against the remaining nodes,
            # honouring the requested number of CPUs like the initial
            # scoring does.
            result = mc.get_all_scores(
                [new_node.motif], list(cmp_nodes.keys()),
                match, metric, combine, pval,
                parallel=True, ncpus=ncpus)

            for motif, n in cmp_nodes.items():
                x = result[new_node.motif.id][motif.id]
                if pval:
                    x = [1 - x[0]] + x[1:]
                scores[(new_node, n)] = x

            nodes.append(new_node)

        cluster_nodes = [node for node in nodes if not node.parent]

    if progress:
        sys.stderr.write("\n")

    root = nodes[-1]
    for node in nodes:
        # Leaves only; a leaf without a parent occurs when a single motif
        # was given and there is nothing to check.
        if not node.left and node.parent:
            node.parent.checkMerge(root, threshold)

    return root