Ejemplo n.º 1
0
    def parse(self, fo):
        """
        Convert BioProspector output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing BioProspector output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        row_re = re.compile(
            r'^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
        results = []
        rows = []
        label = ""

        for line in fo.readlines():
            if not line.startswith("Motif #"):
                # Matrix row: a leading index followed by four decimal
                # columns, each of which is divided by 100.
                hit = row_re.search(line)
                if hit is not None:
                    rows.append(
                        [float(value) / 100.0 for value in hit.groups()])
                continue

            # A new header closes the motif collected so far (if any).
            if rows:
                finished = Motif(rows)
                finished.id = "BioProspector_w%s_%s" % (len(finished), label)
                results.append(finished)
            label = line.split("#")[1].split(":")[0]
            rows = []

        # The last motif has no following header to trigger the flush.
        if rows:
            finished = Motif(rows)
            finished.id = "BioProspector_w%s_%s" % (len(finished), label)
            results.append(finished)
        return results
Ejemplo n.º 2
0
def match(args):
    """
    Print the closest database match for every motif and optionally plot them.

    Parameters
    ----------
    args : object
        Must provide ``pwmfile`` and ``dbpwmfile`` (passed to
        ``pwmfile_to_motifs``) and ``img``; when ``img`` is truthy the
        aligned motif pairs are passed to ``match_plot`` together with it.
    """
    sample = {m.id: m for m in pwmfile_to_motifs(args.pwmfile)}
    db = {m.id: m for m in pwmfile_to_motifs(args.dbpwmfile)}

    mc = MotifComparer()
    result = mc.get_closest_match(sample.values(), db.values(), "partial", "wic", "mean")

    # Keep each comparison so the plotting pass does not have to redo it.
    compared = {}
    print("Motif\tMatch\tScore\tP-value")
    for query, hit in result.items():
        compared[query] = mc.compare_motifs(
            sample[query], db[hit[0]], "partial", "wic", "mean", pval=True)
        pval = compared[query][0]
        print("%s\t%s\t%0.2f\t%0.3e" % (query, hit[0], hit[1][0], pval))

    if not args.img:
        return

    plotdata = []
    for query, hit in result.items():
        motif = sample[query]
        dbmotif = db[hit[0]]
        pval, pos, orient = compared[query]

        # orient == -1: use the reverse complement of the database motif,
        # keeping its original id.
        if orient == -1:
            tmp = dbmotif.id
            dbmotif = dbmotif.rc()
            dbmotif.id = tmp

        # Shift whichever motif starts later by prepending uniform columns.
        if pos < 0:
            tmp = motif.id
            motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
            motif.id = tmp
        elif pos > 0:
            tmp = dbmotif.id
            dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
            dbmotif.id = tmp

        plotdata.append((motif, dbmotif, pval))

    # Render once, after all pairs are collected (previously this ran on
    # every loop iteration). Nothing is plotted when there are no matches,
    # as before.
    if plotdata:
        match_plot(plotdata, args.img)
Ejemplo n.º 3
0
    def test6_pcc(self):
        """max_pcc between two identical four-position motifs starts with 4."""
        rows = [[5, 0, 0, 0], [0, 5, 0, 0], [0, 5, 0, 0], [0, 0, 0, 5]]

        # Each motif gets its own copy of the count matrix.
        first = Motif([list(row) for row in rows])
        second = Motif([list(row) for row in rows])

        best = first.max_pcc(second)
        self.assertEqual(4, best[0])
Ejemplo n.º 4
0
    def parse(self, fo):
        """
        Convert MDmodule output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MDmodule output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        nuc_index = {"A": 0, "C": 1, "G": 2, "T": 3}
        row_re = re.compile(
            r"(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")
        site_re = re.compile(r">.+\s+[bf]\d+\s+(\w+)")

        def build(name, weights, counts, sites):
            # Assemble one Motif from the pieces gathered for a section.
            motif = Motif()
            motif.id = name
            motif.pwm = weights
            motif.pfm = counts
            motif.align = sites
            return motif

        motifs = []
        weights, counts, sites = [], [], []
        current = ""
        for line in fo.readlines():
            if line.startswith("Motif"):
                # A header closes the previous section, if there was one.
                if current:
                    motifs.append(build(current, weights, counts, sites))
                    weights, counts, sites = [], [], []
                current = line.split("\t")[0]
                continue

            row = row_re.search(line)
            if row:
                weights.append([float(v) / 100 for v in row.group(2, 3, 4, 5)])

            site = site_re.search(line)
            if site:
                seq = site.group(1)
                if not counts:
                    counts = [[0] * 4 for _ in seq]
                for idx, nuc in enumerate(seq):
                    counts[idx][nuc_index[nuc]] += 1
                sites.append(seq)

        # The final section is only emitted when matrix rows were read.
        if weights:
            motifs.append(build(current, weights, counts, sites))

        return motifs
Ejemplo n.º 5
0
    def test8_pwm_to_str_hash(self):
        """hash() is the same for two spellings of the same matrix values."""
        expected = "1f260320cac8c26a"

        short_form = [[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]]
        padded_form = [
            [0.010000, 0.010000, 0.010000, 0.970000],
            [0.12300, 0.45600, 0.22200, 0.19900],
        ]

        for matrix in (short_form, padded_form):
            self.assertEqual(expected, Motif(matrix).hash())
Ejemplo n.º 6
0
    def test_motif_export_import(self):
        """Exported motifs can be read back with the same consensus and id."""
        motif = Motif(
            [
                [120, 0, 0, 0],
                [120, 0, 0, 0],
                [0, 60, 60, 0],
                [0, 0, 0, 120],
                [0, 0, 0, 120],
            ]
        )
        motif.id = "test_motif"

        # (exporter, reader format) pairs; the motevo export is read back
        # with the transfac reader.
        round_trips = (
            (motif.to_transfac, "transfac"),
            (motif.to_meme, "meme"),
            (motif.to_motevo, "transfac"),
        )
        for export, fmt in round_trips:
            handle = StringIO(export())
            restored = read_motifs(handle, fmt=fmt)[0]
            self.assertEqual("AASTT", restored.to_consensus().upper())
            self.assertEqual("test_motif", restored.id)
Ejemplo n.º 7
0
    def parse(self, fo):
        """
        Convert ChIPMunk output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing ChIPMunk output.

        Returns
        -------
        motifs : list
            List with a single Motif instance, or an empty list when the
            input contains no line starting with "A|".
        """
        line = fo.readline()
        # Skip ahead to the first matrix row. readline() returns "" at EOF,
        # so the emptiness check is what stops the scan on input without
        # an "A|" line.
        while line and not line.startswith("A|"):
            line = fo.readline()
        if not line:
            return []

        # Four consecutive "<letter>|v1 v2 ..." rows, starting at the A row.
        matrix = []
        for _ in range(4):
            matrix.append(
                [float(x) for x in line.strip().split("|")[1].split(" ")])
            line = fo.readline()

        # Transpose: one [row1, row2, row3, row4] entry per column.
        matrix = [[matrix[x][y] for x in range(4)]
                  for y in range(len(matrix[0]))]
        m = Motif(matrix)
        m.id = "ChIPMunk_w%s" % len(m)
        return [m]
Ejemplo n.º 8
0
    def parse(self, fo):
        """
        Convert MotifSampler output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MotifSampler output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []
        header = {}
        rows = []

        for line in fo.readlines():
            if line.startswith("#"):
                # "#key = value" comment lines carry the motif metadata.
                fields = line.strip()[1:].split(" = ")
                if len(fields) > 1:
                    header[fields[0]] = fields[1]
                continue

            if len(line) > 1:
                # Tab-separated numeric matrix row.
                rows.append([float(x) for x in line.strip().split("\t")])
                continue

            # A line of at most one character terminates the current motif.
            motif = Motif()
            motif.consensus = header["Consensus"]
            motif.width = header["W"]
            motif.id = header["ID"]
            motif.pwm = rows[:]
            motifs.append(motif)
            rows = []

        return motifs
Ejemplo n.º 9
0
    def parse(self, fo):
        """
        Convert MDmodule output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MDmodule output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        column_of = {"A": 0, "C": 1, "G": 2, "T": 3}
        matrix_line = re.compile(
            r'(\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)')
        site_line = re.compile(r'>.+\s+[bf]\d+\s+(\w+)')

        parsed = []
        motif_name = ""
        pwm, pfm, align = [], [], []
        for line in fo.readlines():
            if not line.startswith("Motif"):
                # Matrix row: columns 2-5 are divided by 100.
                hit = matrix_line.search(line)
                if hit:
                    pwm.append(
                        [float(hit.group(col)) / 100 for col in (2, 3, 4, 5)])

                # Site line: count each letter of the trailing word.
                hit = site_line.search(line)
                if hit:
                    site = hit.group(1)
                    if not pfm:
                        pfm = [[0, 0, 0, 0] for _ in range(len(site))]
                    for position, letter in enumerate(site):
                        pfm[position][column_of[letter]] += 1
                    align.append(site)
                continue

            # Header line: store the previous motif before starting anew.
            if motif_name:
                finished = Motif()
                finished.id = motif_name
                finished.pwm = pwm
                finished.pfm = pfm
                finished.align = align
                parsed.append(finished)
                pwm, pfm, align = [], [], []
            motif_name = line.split("\t")[0]

        if pwm:
            finished = Motif()
            finished.id = motif_name
            finished.pwm = pwm
            finished.pfm = pfm
            finished.align = align
            parsed.append(finished)

        return parsed
Ejemplo n.º 10
0
    def test9_logodds_matrix(self):
        """Motif.logodds matches the reference matrix to five decimals."""
        expected = np.array(
            [
                [0.69813, 0.47623, -0.89160, -4.60517],
                [0.00995] * 4,
            ]
        )

        motif = Motif([[0.5, 0.4, 0.1, 0.0], [0.25, 0.25, 0.25, 0.25]])
        observed = np.array(motif.logodds)

        np.testing.assert_almost_equal(expected, observed, decimal=5)
Ejemplo n.º 11
0
def match(args):
    """
    Print the best database matches for every motif and optionally plot them.

    Parameters
    ----------
    args : object
        Must provide ``pfmfile`` and ``dbpfmfile`` (passed to
        ``read_motifs``), ``nmatches`` (passed to ``get_best_matches``) and
        ``img``; when ``img`` is truthy the aligned motif pairs are passed
        to ``match_plot`` together with it.
    """
    sample = {m.id: m for m in read_motifs(args.pfmfile)}
    db = {m.id: m for m in read_motifs(args.dbpfmfile)}

    mc = MotifComparer()
    result = mc.get_best_matches(
        sample.values(), args.nmatches, db.values(), "partial", "seqcor", "mean"
    )

    plotdata = []
    print("Motif\tMatch\tScore\tP-value")
    for motif_name, matches in result.items():
        for hit in matches:
            pval, pos, orient = mc.compare_motifs(
                sample[motif_name], db[hit[0]], "partial", "seqcor", "mean", pval=True
            )
            print("%s\t%s\t%0.2f\t%0.3e" % (motif_name, hit[0], hit[1][0], pval))

            if not args.img:
                continue

            motif = sample[motif_name]
            dbmotif = db[hit[0]]

            # orient == -1: use the reverse complement, keeping the id.
            if orient == -1:
                tmp = dbmotif.id
                dbmotif = dbmotif.rc()
                dbmotif.id = tmp

            # Shift whichever motif starts later with uniform columns.
            if pos < 0:
                tmp = motif.id
                motif = Motif([[0.25, 0.25, 0.25, 0.25]] * -pos + motif.pwm)
                motif.id = tmp
            elif pos > 0:
                tmp = dbmotif.id
                dbmotif = Motif([[0.25, 0.25, 0.25, 0.25]] * pos + dbmotif.pwm)
                dbmotif.id = tmp

            # Right-pad the shorter motif so both have the same length.
            # The id is restored on the rebuilt Motif (it used to be lost
            # here), and nothing is rebuilt when the lengths already agree.
            diff = len(motif) - len(dbmotif)
            if diff > 0:
                tmp = dbmotif.id
                dbmotif = Motif(dbmotif.pwm + [[0.25, 0.25, 0.25, 0.25]] * diff)
                dbmotif.id = tmp
            elif diff < 0:
                tmp = motif.id
                motif = Motif(motif.pwm + [[0.25, 0.25, 0.25, 0.25]] * -diff)
                motif.id = tmp

            plotdata.append((motif, dbmotif, pval))

    if args.img:
        match_plot(plotdata, args.img)
Ejemplo n.º 12
0
    def parse(self, fo):
        """
        Convert HMS output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing HMS output.

        Returns
        -------
        motifs : list
            List with a single Motif instance whose id is ``self.name``.
        """
        # The first four lines are read as space-separated rows of floats.
        rows = []
        for _ in range(4):
            rows.append(
                [float(token) for token in fo.readline().strip().split(" ")])

        # Transpose: one four-value entry per column of the first row.
        matrix = [[rows[0][pos], rows[1][pos], rows[2][pos], rows[3][pos]]
                  for pos in range(len(rows[0]))]

        motif = Motif(matrix)
        motif.id = self.name
        return [motif]
Ejemplo n.º 13
0
 def test5_motif_to_img(self):
     """Write a motif image with seqlogo; only a notice is printed without it."""
     seqlogo = which("seqlogo")
     if not seqlogo:
         print("seqlogo not found, skipping.")
         return

     png_path = "test/test.png"
     motif = Motif(self.pfm)
     motif.to_img(png_path, fmt="png", seqlogo=seqlogo)
     self.assertTrue(os.path.exists(png_path))
     os.unlink(png_path)
Ejemplo n.º 14
0
    def test8_pwm_to_str(self):
        """_pwm_to_str gives tab/newline separated text at each precision."""
        motif = Motif([[0.01, 0.01, 0.01, 0.97], [0.123, 0.456, 0.222, 0.199]])

        expectations = (
            (2, "0.01\t0.01\t0.01\t0.97\n0.12\t0.46\t0.22\t0.20"),
            (3, "0.010\t0.010\t0.010\t0.970\n0.123\t0.456\t0.222\t0.199"),
        )
        for precision, expected in expectations:
            self.assertEqual(expected, motif._pwm_to_str(precision=precision))
Ejemplo n.º 15
0
    def parse(self, fo):
        """
        Convert GADEM output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing GADEM output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        nuc_index = {"A": 0, "C": 1, "G": 2, "T": 3}
        motifs = []

        lines = fo.readlines()
        # Records are five lines long: a header and four matrix lines.
        for start in range(0, len(lines), 5):
            m_id = lines[start].strip()[1:]
            number = m_id.split("_")[0][1:]

            # Optional "<number>.seq" file in the working directory with
            # the sites; lines containing "x" or "n" are ignored.
            align = []
            pfm = []
            seq_file = "%s.seq" % number
            if os.path.exists(seq_file):
                with open(seq_file) as handle:
                    for raw in handle:
                        if "x" in raw or "n" in raw:
                            continue
                        site = raw.strip().upper()
                        align.append(site)
                        if not pfm:
                            pfm = [[0] * 4 for _ in site]
                        for idx, nuc in enumerate(site):
                            pfm[idx][nuc_index[nuc]] += 1

            # Second space-separated field of each matrix line, split on tabs.
            cells = [
                row.strip().split(" ")[1].split("\t")
                for row in lines[start + 1:start + 5]
            ]
            pwm = [[float(cells[r][c]) for r in range(4)]
                   for c in range(len(cells[0]))]

            motif = Motif(pwm)
            motif.id = "{}_{}".format(self.name, m_id)
            if align:
                motif.pfm = pfm
                motif.align = align
            motifs.append(motif)

        return motifs
Ejemplo n.º 16
0
    def parse(self, fname):
        """
        Convert RPMCMC output to motifs

        Parameters
        ----------
        fname : str
            File containing RPMCMC output.

        Returns
        -------
        motifs : list
            List of Motif instances; empty when the file holds no matrix
            rows.
        """
        motifs = []
        pfm = []
        name = ""
        # The context manager closes the file even if parsing raises.
        with open(fname) as handle:
            for line in handle:
                line = line.strip()
                if line.startswith("PFM"):
                    continue
                if line.startswith("Motif"):
                    # A header closes the motif collected so far (if any).
                    if pfm:
                        motif = Motif(pfm)
                        motif.id = name
                        motifs.append(motif)
                    name = line
                    pfm = []
                elif line != "A C G T":
                    # Only lines with exactly four space-separated fields
                    # are treated as matrix rows.
                    row = line.split(" ")
                    if len(row) == 4:
                        pfm.append([float(x) for x in row])

        # Same guard as inside the loop: never emit a motif without rows.
        if pfm:
            motif = Motif(pfm)
            motif.id = name
            motifs.append(motif)

        return motifs
Ejemplo n.º 17
0
    def parse(self, fo):
        """
        Convert MEME output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MEME output.

        Returns
        -------
        motifs : list
            List of Motif instances. A motif block in which no site line
            is found is skipped.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

        p = re.compile(
            r"MOTIF.+MEME-(\d+)\s*width\s*=\s*(\d+)\s+sites\s*=\s*(\d+)")
        pa = re.compile(r"\)\s+([A-Z]+)")
        line = fo.readline()
        while line:
            m = p.search(line)
            if m:
                motif_id = "%s_%s_w%s" % (self.name, m.group(1), m.group(2))
                align = []
                pfm = None
                # Read up to the "//" terminator. readline() returns "" at
                # EOF, so `line` must be checked as well or a truncated
                # file would loop forever.
                while line and not line.startswith("//"):
                    ma = pa.search(line)
                    if ma:
                        site = ma.group(1)
                        align.append(site)
                        if not pfm:
                            pfm = [[0 for _ in range(4)]
                                   for _ in range(len(site))]
                        for pos, nuc in enumerate(site):
                            if nuc in nucs:
                                pfm[pos][nucs[nuc]] += 1
                            else:
                                # Letters other than A/C/G/T add 0.25 to
                                # every column.
                                for i in range(4):
                                    pfm[pos][i] += 0.25

                    line = fo.readline()

                # Without any site there is no matrix to build a Motif from.
                if pfm:
                    motifs.append(Motif(pfm[:]))
                    motifs[-1].id = motif_id
                    motifs[-1].align = align[:]
            line = fo.readline()

        return motifs
Ejemplo n.º 18
0
    def parse_out(self, fo):
        """
        Convert MotifSampler output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MotifSampler output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}
        pseudo = 0.0  # Should be 1/sqrt(# of seqs)
        aligns = {}
        for line in fo.readlines():
            # Lines starting with "#" and lines of at most one character
            # carry no site.
            if line.startswith("#") or len(line) <= 1:
                continue
            vals = line.strip().split("\t")
            # The ninth tab-separated field holds two ';'-separated
            # 'key "value"' pairs: the motif id and the site.
            m_id, site = [
                x.strip().split(" ")[1].replace('"', "")
                for x in vals[8].split(";")
                if x
            ]
            # Sites containing N are skipped. vals[6] is not consulted;
            # every site is stored exactly as written.
            if site.upper().find("N") == -1:
                aligns.setdefault(m_id, []).append(site)

        for m_id, align in aligns.items():
            width = len(align[0])
            pfm = [[0 for _ in range(4)] for _ in range(width)]
            for row in align:
                for i, nuc in enumerate(row):
                    pfm[i][nucs[nuc]] += 1
            total = float(len(align))
            # Spread the pseudocount over the four columns and normalise by
            # the padded total so every row sums to 1. With pseudo == 0.0
            # this equals the previous x / total.
            pwm = [[(x + pseudo / 4) / (total + pseudo) for x in row]
                   for row in pfm]
            m = Motif()
            m.align = align[:]
            m.pwm = pwm[:]
            m.pfm = pfm[:]
            m.id = m_id
            motifs.append(m)
        return motifs
Ejemplo n.º 19
0
    def parse(self, fo):
        """
        Convert BioProspector output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing BioProspector output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        pattern = re.compile(
            r"^\d+\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)\s+(\d+\.\d+)")

        # First pass: gather (motif id, matrix rows) pairs in file order.
        collected = []
        rows = []
        motif_id = ""
        for line in fo.readlines():
            if line.startswith("Motif #"):
                if rows:
                    collected.append((motif_id, rows))
                motif_id = line.split("#")[1].split(":")[0]
                rows = []
                continue
            found = pattern.search(line)
            if found:
                rows.append([float(value) / 100.0 for value in found.groups()])
        if rows:
            collected.append((motif_id, rows))

        # Second pass: turn every non-empty matrix into a Motif.
        motifs = []
        for ident, matrix in collected:
            motif = Motif(matrix)
            motif.id = "BioProspector_w%s_%s" % (len(motif), ident)
            motifs.append(motif)
        return motifs
Ejemplo n.º 20
0
    def parse(self, fo):
        """
        Convert AMD output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing AMD output.

        Returns
        -------
        motifs : list
            List of Motif instances; empty when the input contains neither
            a motif header nor a matrix row.
        """
        motifs = []

        #160:  112  CACGTGC      7.25   chr14:32308489-32308689
        p = re.compile(r'\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)')
        wm = []
        name = ""
        for line in fo.readlines():
            if line.startswith("Motif") and line.strip().endswith(":"):
                # A header closes the previously named motif.
                if name:
                    motifs.append(Motif(wm))
                    motifs[-1].id = name
                    wm = []
                name = "%s_%s" % (self.name, line.split(":")[0])
            else:
                m = p.search(line)
                if m:
                    wm.append([float(m.group(x)) for x in range(1, 5)])

        # Emit the trailing motif only if something was parsed; otherwise
        # empty input would produce an empty, unnamed Motif.
        if name or wm:
            motifs.append(Motif(wm))
            motifs[-1].id = name

        return motifs
Ejemplo n.º 21
0
    def parse(self, fo):
        """
        Convert AMD output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing AMD output.

        Returns
        -------
        motifs : list
            List of Motif instances; empty when the input contains neither
            a motif header nor a matrix row.
        """
        motifs = []

        # 160:  112  CACGTGC      7.25   chr14:32308489-32308689
        p = re.compile(r"\d+\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)")
        wm = []
        name = ""
        for line in fo.readlines():
            if line.startswith("Motif") and line.strip().endswith(":"):
                # A header closes the previously named motif.
                if name:
                    motifs.append(Motif(wm))
                    motifs[-1].id = name
                    wm = []
                name = "%s_%s" % (self.name, line.split(":")[0])
            else:
                m = p.search(line)
                if m:
                    wm.append([float(m.group(x)) for x in range(1, 5)])

        # Emit the trailing motif only if something was parsed; otherwise
        # empty input would produce an empty, unnamed Motif.
        if name or wm:
            motifs.append(Motif(wm))
            motifs[-1].id = name

        return motifs
Ejemplo n.º 22
0
	def get_gimmemotif(self):
		""" Build a gimmemotifs Motif from self.counts and store it on self.

			Sets self.length and self.gimme_obj; returns self. """

		from gimmemotifs.motif import Motif

		self.length = len(self.counts[0])

		# self.counts is indexed [letter][position]; Motif gets one
		# four-value row ( A C G T ) per position instead.
		per_position = [
			[self.counts[letter][pos_id] for letter in range(4)]
			for pos_id in range(self.length)
		]

		self.gimme_obj = Motif(per_position)
		self.gimme_obj.id = self.id + " " + self.name

		return self
Ejemplo n.º 23
0
    def test11_slice_motif(self):
        """Slicing a motif keeps the matching consensus and pfm rows."""
        counts = [
            [120, 0, 0, 0],
            [120, 0, 0, 0],
            [0, 60, 60, 0],
            [0, 0, 0, 120],
            [0, 0, 0, 120],
        ]

        full = Motif(counts)
        full.to_consensus()

        # Drop the first and the last position.
        inner = full[1:-1]

        self.assertEqual("AST", inner.consensus.upper())
        self.assertEqual(counts[1:-1], inner.pfm)
Ejemplo n.º 24
0
    def parse(self, fo):
        """
        Convert six-line matrix records to motifs

        Parameters
        ----------
        fo : file-like
            File object read six lines at a time; the last four lines of
            every record are tab-separated numeric rows.

        Returns
        -------
        motifs : list
            List of Motif instances, renamed to "<self.name>_<index>" and
            trimmed with ``trim(0.25)``.
        """
        motifs = []
        while True:
            record = [fo.readline() for _ in range(6)]
            # An empty first line means the input is exhausted.
            if not record[0]:
                break
            rows = [[float(value) for value in row.strip().split("\t")]
                    for row in record[2:]]
            # Transpose the four rows into one entry per column.
            columns = [[rows[r][c] for r in range(4)]
                       for c in range(len(rows[0]))]
            motif = Motif(columns)
            motif.id = record[0].strip().split(" ")[-1]
            motifs.append(motif)

        # Final ids are numbered in file order, starting at 1.
        for number, motif in enumerate(motifs, start=1):
            motif.id = "%s_%s" % (self.name, number)
            motif.trim(0.25)

        return motifs
Ejemplo n.º 25
0
    def parse(self, fo):
        """
        Convert MotifSampler output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing MotifSampler output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []
        matrix = []
        meta = {}

        for line in fo.readlines():
            if line.startswith("#"):
                # "#key = value" lines update the metadata of the motif.
                key_value = line.strip()[1:].split(" = ")
                if len(key_value) > 1:
                    meta[key_value[0]] = key_value[1]
            elif len(line) <= 1:
                # A line of at most one character ends the current motif.
                motif = Motif()
                motif.consensus = meta["Consensus"]
                motif.width = meta["W"]
                motif.id = meta["ID"]
                motif.pwm = matrix[:]
                motifs.append(motif)
                matrix = []
            else:
                matrix.append(
                    [float(field) for field in line.strip().split("\t")])

        return motifs
Ejemplo n.º 26
0
    def parse(self, fo):
        """
        Convert HMS output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing HMS output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        # The first four lines are space-separated rows of floats.
        per_row = []
        for _ in range(4):
            fields = fo.readline().strip().split(" ")
            per_row.append([float(field) for field in fields])

        # Transpose: one four-value entry per column of the first row.
        width = len(per_row[0])
        matrix = [[per_row[r][col] for r in range(4)] for col in range(width)]

        motif = Motif(matrix)
        motif.id = self.name
        return [motif]
Ejemplo n.º 27
0
    def parse(self, fo):
        """
        Convert GADEM output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing GADEM output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        motifs = []
        nucs = {"A": 0, "C": 1, "G": 2, "T": 3}

        lines = fo.readlines()
        # Records are five lines long: a header and four matrix lines.
        for i in range(0, len(lines), 5):
            align = []
            pfm = []
            motif_id = lines[i].strip()[1:]
            number = motif_id.split("_")[0][1:]

            # Optional "<number>.seq" file in the working directory with
            # the sites; lines containing "x" or "n" are ignored. The
            # context manager makes sure the file is closed again.
            seq_file = "%s.seq" % number
            if os.path.exists(seq_file):
                with open(seq_file) as handle:
                    for raw in handle:
                        if "x" in raw or "n" in raw:
                            continue
                        site = raw.strip().upper()
                        align.append(site)
                        if not pfm:
                            pfm = [[0 for _ in range(4)]
                                   for _ in range(len(site))]
                        for p, nuc in enumerate(site):
                            pfm[p][nucs[nuc]] += 1

            # Second space-separated field of each matrix line, split on tabs.
            m = [
                row.strip().split(" ")[1].split("\t")
                for row in lines[i + 1:i + 5]
            ]

            pwm = [[float(m[x][y]) for x in range(4)]
                   for y in range(len(m[0]))]

            motifs.append(Motif(pwm))
            motifs[-1].id = motif_id
            if align:
                motifs[-1].pfm = pfm
                motifs[-1].align = align

        return motifs
Ejemplo n.º 28
0
    def parse(self, fo):
        """
        Convert Weeder output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing Weeder output.

        Returns
        -------
        motifs : list
            List of Motif instances with ids "Weeder_1", "Weeder_2", ...
        """
        nuc_index = {"A": 0, "C": 1, "G": 2, "T": 3}
        count_re = re.compile(r'(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)')
        site_re = re.compile(r'(\s*\d+\s+.\s+([ACGT]+)\s+.+\))')

        motifs = []
        pwm = []
        align = []
        c = 1
        for line in fo.readlines():
            # Integer rows are collected here, but the stored matrices are
            # always recomputed from the aligned sites below.
            counts = count_re.search(line)
            if counts:
                pwm.append([int(value) for value in counts.group(2, 3, 4, 5)])

            site = site_re.search(line)
            if site:
                align.append(site.group(2))
                continue
            if not line.startswith("===="):
                continue

            # A "====" line closes the current motif.
            motif = Motif()
            motif.id = "Weeder_%s" % c
            motif.align = align[:]

            width = len(align[0])
            pfm = [[0] * 4 for _ in range(width)]
            for row in align:
                for idx, nuc in enumerate(row):
                    pfm[idx][nuc_index[nuc]] += 1
            total = float(len(align))
            pwm = [[count / total for count in row] for row in pfm]
            motif.pwm = pwm[:]
            motif.pfm = pfm[:]
            motifs.append(motif)

            align = []
            c += 1
            pwm = []

        return motifs
Ejemplo n.º 29
0
 def parse(self, fo):
     """
     Convert ChIPMunk output to motifs

     Parameters
     ----------
     fo : file-like
         File object containing ChIPMunk output.

     Returns
     -------
     motifs : list
         List with a single Motif instance, or an empty list when the
         input contains no line starting with "A|".
     """
     # Example input:
     #KDIC|6.124756232026243
     #A|517.9999999999999 42.99999999999999 345.99999999999994 25.999999999999996 602.9999999999999 155.99999999999997 2.9999999999999996 91.99999999999999
     #C|5.999999999999999 4.999999999999999 2.9999999999999996 956.9999999999999 91.99999999999999 17.999999999999996 22.999999999999996 275.99999999999994
     #G|340.99999999999994 943.9999999999999 630.9999999999999 6.999999999999999 16.999999999999996 48.99999999999999 960.9999999999999 14.999999999999998
     #T|134.99999999999997 7.999999999999999 19.999999999999996 9.999999999999998 287.99999999999994 776.9999999999999 12.999999999999998 616.9999999999999
     #N|999.9999999999998
     line = fo.readline()
     # Skip ahead to the first matrix row. readline() returns "" at EOF,
     # so the emptiness check is what stops the scan on empty input or
     # input without an "A|" line.
     while line and not line.startswith("A|"):
         line = fo.readline()
     if not line:
         return []

     # Four consecutive "<letter>|v1 v2 ..." rows, starting at the A row.
     matrix = []
     for _ in range(4):
         matrix.append(
             [float(x) for x in line.strip().split("|")[1].split(" ")])
         line = fo.readline()

     # Transpose: one [row1, row2, row3, row4] entry per column.
     matrix = [[matrix[x][y] for x in range(4)]
               for y in range(len(matrix[0]))]
     m = Motif(matrix)
     m.id = "ChIPMunk_w%s" % len(m)
     return [m]
Ejemplo n.º 30
0
    def parse(self, fo):
        """
        Convert Improbizer output to motifs

        Parameters
        ----------
        fo : file-like
            File object containing Improbizer output.

        Returns
        -------
        motifs : list
            List of Motif instances.
        """
        header_re = re.compile(r"\d+\s+@\s+\d+\.\d+\s+sd\s+\d+\.\d+\s+(\w+)$")
        motifs = []

        line = fo.readline()
        # Stop at EOF or at the first line containing "Color".
        while line and "Color" not in line:
            header = header_re.search(line)
            if header:
                # The next four lines each start with a letter followed by
                # that letter's space-separated values.
                by_letter = {}
                for _ in range(4):
                    fields = [
                        token.strip()
                        for token in fo.readline().strip().split(" ")
                        if token
                    ]
                    by_letter[fields[0].upper()] = fields[1:]

                # One [A, C, G, T] row per position of the A line.
                pwm = [
                    [float(by_letter[letter][pos]) for letter in "ACGT"]
                    for pos in range(len(by_letter["A"]))
                ]
                motif = Motif(pwm)
                motif.id = "%s_%s" % (self.name, header.group(1))
                motifs.append(motif)
            line = fo.readline()

        return motifs