Esempio n. 1
0
 def GetLookup(self, token):
     """Return the transliterated, diacritic-free lookup form of ``token``.

     Two substitutions are applied before ``buck.uni2buck`` is called:
     the tatweel character is deleted, then every code point in the range
     U+064B..U+0652, plus U+0670, is deleted (vowels/diacritics).
     """
     # FIXME do something about \u0671, ALIF WASLA ?
     without_tatweel = re.sub(u'ـ', '', token)  # drop any tatweel
     bare = re.sub(u'[\u064b-\u0652\u0670]', '', without_tatweel)
     return buck.uni2buck(bare)
Esempio n. 2
0
def mergeMadaFiles(top=".", out=False):
    madafiles = {}
    for d, p, files in os.walk(top):
        for f in files:
            if f == "originalprompts.segments.mada":
                for s in sPattern.finditer(
                        codecs.open(os.path.join(d, f),
                                    encoding="UTF-8").read()):
                    try:
                        words = []
                        for w in WPATTERN.finditer(s.group("words")):
                            try:
                                form = w.group("word")
                                form = replaceAll(form, [
                                    u"\u2018", u"\u060c", u"\u00F7", u"\u00A7",
                                    u'\u200f', u'\u06a9', u'\u0022', u'\u0023',
                                    u'\u003B', u'\u0040'
                                ])
                                form = str(buck.uni2buck(form))
                                d = DPATTERN.match(w.group(0))
                                diac = str(buck.uni2buck(d.group("diac")))
                            except Exception as e:
                                print "Couldn't do conversion"
                                print w.group(0)
                                print buck.uni2buck(w.group("word"))
                                raise e
                            try:
                                gloss = str(d.group("gloss"))
                            except:
                                gloss = d.group("none")
                            if "foreign" in form:
                                raise foreign(form)
                            if re.compile(".*\d.*").match(
                                    form) and not form.startswith("*/test"):
                                raise foreign(form)
                            words.append([form, diac, gloss])
                    except foreign:
                        continue
                    if len(words) > 3:
                        madafiles[str("%s.wav" %
                                      (s.group("test")))] = fixDashes(words)
    if out:
        out = open(os.path.join(top, out), "w")
        json.dump(madafiles, out)
        out.close()
    return madafiles
Esempio n. 3
0
def borrowDiacritics(w0, w1):
    """Return the leading part of ``w1[DIACRITICS]`` that covers ``w0.form``.

    ``w0.form`` is converted with ``buck.uni2buck``.  The items of
    ``w1[DIACRITICS]`` are then copied one at a time; whenever the copied
    item equals the first remaining character of the converted form, that
    character is consumed.  Copying stops as soon as the converted form is
    used up.  If exactly one character has been copied at that point, the
    item that follows it (when there is one) is copied as well.

    Returns:
        The copied text.  This is all of ``w1[DIACRITICS]`` when the
        converted form is never fully consumed, and an empty string when
        the converted form is empty.
    """
    remaining = buck.uni2buck(w0.form)
    borrowed = u""
    if not remaining:
        # Nothing to align against; indexing remaining[0] would fail.
        return borrowed
    source = w1[DIACRITICS]  # assumes a sized sequence (e.g. a string)
    for i, c in enumerate(source):
        borrowed += c
        if c != remaining[0]:
            continue
        remaining = remaining[1:]
        if remaining:
            continue
        # Form fully consumed: take one trailing item when only a single
        # character has been copied so far and a following item exists.
        if len(borrowed) == 1 and i + 1 < len(source):
            borrowed += source[i + 1]
        break
    return borrowed
Esempio n. 4
0
def getUnknowns(
        ifile="XXX/ABUDHABI_ABUDHNEWS2_ARB_20070228_000000/originalprompts.segments.mada",
        useBW=False,
        samaknowns=False,
        madaknowns=False,
        samaunknowns=False,
        madaunknowns=False,
        sentences=False):
    """Count which word forms the two analysers do and do not recognise.

    Every sentence yielded by ``readRawMada(ifile)`` is visited.  Words
    whose form contains "!E" or "*/test" are ignored.  For each other word:

    * a ``gloss`` of None counts the form in ``madaunknowns``, anything
      else counts it in ``madaknowns``;
    * if the first entry of ``pya.getSolutions(form)`` cannot be read the
      form is counted in ``samaunknowns``, otherwise in ``samaknowns``.

    A tab-separated line with six fields (form, form again, diacritics,
    gloss, solution, solution gloss) is built per word; with ``useBW`` the
    second and third fields are passed through ``buck.uni2buck``.

    Args:
        ifile: Path handed to ``readRawMada``.
        useBW: Transliterate the second and third output fields.
        samaknowns, madaknowns, samaunknowns, madaunknowns: Dicts of
            form -> count, updated in place; ``False`` means start from a
            fresh dict.
        sentences: Currently ignored (see the note in the body).

    No ``return`` statement is visible in this block, so the counts reach
    the caller only through the dicts that were passed in.
    """
    if samaunknowns is False:
        samaunknowns = {}
    if madaunknowns is False:
        madaunknowns = {}
    if samaknowns is False:
        samaknowns = {}
    if madaknowns is False:
        madaknowns = {}
    # NOTE(review): the list always starts empty, so a caller-supplied
    # ``sentences`` value has no effect.  Confirm whether accumulating into
    # the caller's list was intended before changing this.
    sentences = []

    def _count(table, key):
        # Increment table[key], starting from zero for unseen keys.
        table[key] = table.get(key, 0) + 1

    for sentence in readRawMada(ifile):
        # Get all examples of the pattern, dig out the bit we want
        dforms = []
        for w in sentence:
            form = w.form
            if "!E" in form or "*/test" in form:
                continue
            diacritics = w.diacritics
            mgloss = w.gloss
            if mgloss is None:
                mgloss = "***"
                _count(madaunknowns, form)
            else:
                _count(madaknowns, form)
            pyasolutions = pya.getSolutions(form)
            try:
                first = pyasolutions[0]
                pyasolution, pyagloss = first.buckvoc, first.gloss_b
            except Exception:
                # No usable solution (failure mode depends on ``pya``).
                pyasolution, pyagloss = "***", ""
                _count(samaunknowns, form)
            else:
                _count(samaknowns, form)
            if useBW:
                fields = (form, buck.uni2buck(form),
                          buck.uni2buck(diacritics), mgloss, pyasolution,
                          pyagloss)
            else:
                fields = (form, form, diacritics, mgloss, pyasolution,
                          pyagloss)
            dforms.append("%s\t%s\t%s\t%s\t%s\t%s" % fields)
        sentences.append(dforms)
Esempio n. 5
0
def segment(src=os.path.join(TDF, TESTPROMPT),
            dest="TEMP",
            wav=WAV,
            N=sys.maxint,
            segments=False,
            prompts="",
            promptsfile="originalprompts.segments",
            useBW=False,
            rawPrompts=True,
            copywavfiles=False,
            separatedashes=True):
    print "SRC %s, N %s" % (src, N)
    if segments == False:
        segments = []
        try:
            os.makedirs(os.path.join(dest, "wav"))
        except:
            print "%s already exists" % (os.path.join(dest, "wav"))
    segments = []
    if os.path.isdir(src):
        for f in os.listdir(src):
            if N > 0:
                N, prompts = segment(src=os.path.join(src, f),
                                     dest=dest,
                                     wav=WAV,
                                     N=N,
                                     segments=segments,
                                     prompts=prompts)
    else:
        prompt = src.split("/")[-1]
        if not prompt.endswith(".qrtr.tdf"):
            return N, prompts
        print prompt
        prompt = prompt[:-len(".qrtr.tdf")]
        if copywavfiles:
            sound = sounds.readsound(os.path.join(wav, "%s.wav" % (prompt)))
        for i, line in enumerate(open(src)):
            m = P.match(line.strip())
            if m:
                if N <= 0:
                    break
                N -= 1
                transcript = m.group("transcript").decode("UTF-8")
                if useBW:
                    transcript = buck.uni2buck(transcript, buck._uni2buck)
                if separatedashes:
                    transcript = transcript.replace("-", " - ")
                transcript = respace.sub(
                    " ", brackets.sub("", tag.sub("", transcript)))
                s = [
                    m.group("prompt"),
                    float(m.group("start")),
                    float(m.group("end")), transcript
                ]
                test = "test-%s-%s-%s-%s" % (prompt, m.group("gender"),
                                             m.group("dialect"), i)
                if transcript.strip() == "":
                    continue
                if rawPrompts:
                    prompts += "%s\n" % (transcript)
                else:
                    prompts += "*/%s !ENTER %s !EXIT\n" % (test, transcript)
                segments.append(s)
                start = s[1]
                end = s[2]
                w = os.path.join(dest, "wav", "%s.wav" % (test))
                if copywavfiles and not os.path.isfile(w):
                    sound.frames = False
                    sound.save(w,
                               start=int(start * sound.params[2]),
                               end=int(end * sound.params[2]))
    return N, prompts
Esempio n. 6
0
 def lookup(self, word):
     """Analyse ``word`` after converting it with ``buck.uni2buck``."""
     transliterated = buck.uni2buck(word)
     return self.analyse(transliterated)