def GetLookup(self, token):
    """Return the lookup key for *token*.

    The tatweel (kashida) elongation mark U+0640 and the combining marks
    U+064B-U+0652 and U+0670 are deleted from the token, and what is left
    is handed to ``buck.uni2buck`` (judging by its name, a Unicode to
    Buckwalter transliteration).
    """
    # FIXME do something about \u0671, ALIF WASLA ?
    # First pass: drop any tatweel characters.
    without_tatweel = re.sub(u'ـ', '', token)
    # Second pass: drop any vowels/diacritics.
    bare = re.sub(u'[\u064b-\u0652\u0670]', '', without_tatweel)
    return buck.uni2buck(bare)
def mergeMadaFiles(top=".", out=False): madafiles = {} for d, p, files in os.walk(top): for f in files: if f == "originalprompts.segments.mada": for s in sPattern.finditer( codecs.open(os.path.join(d, f), encoding="UTF-8").read()): try: words = [] for w in WPATTERN.finditer(s.group("words")): try: form = w.group("word") form = replaceAll(form, [ u"\u2018", u"\u060c", u"\u00F7", u"\u00A7", u'\u200f', u'\u06a9', u'\u0022', u'\u0023', u'\u003B', u'\u0040' ]) form = str(buck.uni2buck(form)) d = DPATTERN.match(w.group(0)) diac = str(buck.uni2buck(d.group("diac"))) except Exception as e: print "Couldn't do conversion" print w.group(0) print buck.uni2buck(w.group("word")) raise e try: gloss = str(d.group("gloss")) except: gloss = d.group("none") if "foreign" in form: raise foreign(form) if re.compile(".*\d.*").match( form) and not form.startswith("*/test"): raise foreign(form) words.append([form, diac, gloss]) except foreign: continue if len(words) > 3: madafiles[str("%s.wav" % (s.group("test")))] = fixDashes(words) if out: out = open(os.path.join(top, out), "w") json.dump(madafiles, out) out.close() return madafiles
def borrowDiacritics(w0, w1):
    """Copy the start of *w1*'s diacritised text that spells out *w0*.

    ``w0.form`` is passed through ``buck.uni2buck`` and ``w1[DIACRITICS]``
    is scanned from the left.  Every scanned character is copied to the
    result; a character equal to the next pending character of the
    converted form also consumes that character.  Scanning stops as soon as
    the whole form has been consumed.  If that happens after a single
    character has been copied, the character following it (when there is
    one) is copied as well.

    Args:
        w0: object with a ``form`` attribute.
        w1: object indexable by ``DIACRITICS``.

    Returns:
        The copied characters as a unicode string.  If ``w1[DIACRITICS]``
        runs out before the form is consumed, all of it is returned.

    Raises:
        IndexError: if the converted form is empty while ``w1[DIACRITICS]``
            is not.
    """
    pending = buck.uni2buck(w0.form)
    diacritised = w1[DIACRITICS]
    borrowed = u""
    for i, c in enumerate(diacritised):
        borrowed += c
        if c != pending[0]:
            continue
        pending = pending[1:]
        if pending == "":
            if len(borrowed) == 1:
                # Look one character ahead; only running off the end of the
                # text is expected here, so only IndexError is ignored.
                try:
                    borrowed += diacritised[i + 1]
                except IndexError:
                    pass
            break
    return borrowed
def getUnknowns(
        ifile="XXX/ABUDHABI_ABUDHNEWS2_ARB_20070228_000000/originalprompts.segments.mada",
        useBW=False,
        samaknowns=False,
        madaknowns=False,
        samaunknowns=False,
        madaunknowns=False,
        sentences=False):
    """Tally which word forms MADA and the ``pya`` analyser can analyse.

    Every sentence yielded by ``readRawMada(ifile)`` is scanned word by
    word.  Words whose form contains ``!E`` or ``*/test`` are skipped.  For
    each remaining word:

    * if ``w.gloss`` is None the form is counted in *madaunknowns* (and the
      gloss shown as ``***``), otherwise in *madaknowns*;
    * if the first result of ``pya.getSolutions(form)`` cannot be read, the
      form is counted in *samaunknowns* (solution shown as ``***``),
      otherwise in *samaknowns*;
    * a tab-separated line ``form, form, diacritics, gloss, solution,
      solution-gloss`` is built; with *useBW* the second and third fields
      are passed through ``buck.uni2buck``.

    The lines of one sentence form a list which is appended to *sentences*.

    Args:
        ifile: path handed to ``readRawMada``.
        useBW: convert the second and third output fields with
            ``buck.uni2buck``.
        samaknowns, madaknowns, samaunknowns, madaunknowns: dicts mapping a
            form to its count; updated in place.  A fresh dict is used when
            the argument is left at False.
        sentences: list that receives one list of lines per sentence; a
            fresh list is used when left at False.  Previously this argument
            was always replaced by a new list, so a caller's list was never
            filled.

    Returns:
        None.  Results are delivered through the mutable arguments.
    """
    if samaunknowns is False:
        samaunknowns = {}
    if madaunknowns is False:
        madaunknowns = {}
    if samaknowns is False:
        samaknowns = {}
    if madaknowns is False:
        madaknowns = {}
    if sentences is False:
        sentences = []
    for sentence in readRawMada(ifile):
        # Get all examples of the pattern, dig out the bit we want
        dforms = []
        for w in sentence:
            form = w.form
            if "!E" in form or "*/test" in form:
                continue
            diacritics = w.diacritics
            mgloss = w.gloss
            if mgloss is None:
                mgloss = "***"
                madaunknowns[form] = madaunknowns.get(form, 0) + 1
            else:
                madaknowns[form] = madaknowns.get(form, 0) + 1
            pyasolutions = pya.getSolutions(form)
            try:
                best = pyasolutions[0]
                pyasolution, pyagloss = best.buckvoc, best.gloss_b
            except Exception:
                # No usable first solution: count the form as unknown.
                pyasolution, pyagloss = "***", ""
                samaunknowns[form] = samaunknowns.get(form, 0) + 1
            else:
                samaknowns[form] = samaknowns.get(form, 0) + 1
            if useBW:
                shownForm = buck.uni2buck(form)
                shownDiacritics = buck.uni2buck(diacritics)
            else:
                shownForm = form
                shownDiacritics = diacritics
            x = "%s\t%s\t%s\t%s\t%s\t%s" % (form, shownForm, shownDiacritics,
                                            mgloss, pyasolution, pyagloss)
            dforms.append(x)
        sentences.append(dforms)
def segment(src=os.path.join(TDF, TESTPROMPT),
            dest="TEMP",
            wav=WAV,
            N=sys.maxint,
            segments=False,
            prompts="",
            promptsfile="originalprompts.segments",
            useBW=False,
            rawPrompts=True,
            copywavfiles=False,
            separatedashes=True):
    """Extract up to *N* transcript segments from ``.qrtr.tdf`` files.

    If *src* is a directory, every entry in it is processed recursively
    (with all options forwarded) for as long as *N* is positive.  If *src*
    is a file it is ignored unless its name ends in ``.qrtr.tdf``; otherwise
    each line matching ``P`` is one segment:

    * *N* is decremented (before the empty-transcript check, so empty
      transcripts use up budget too);
    * the ``transcript`` group is decoded from UTF-8, optionally converted
      with ``buck.uni2buck``, optionally has its dashes surrounded by
      spaces, and is then filtered through ``tag``, ``brackets`` and
      ``respace``;
    * segments whose transcript is blank are skipped;
    * the transcript is appended to *prompts*, either on its own line
      (*rawPrompts*) or as ``*/<test-id> !ENTER <transcript> !EXIT``;
    * ``[prompt, start, end, transcript]`` is appended to *segments*;
    * with *copywavfiles*, the start-end slice of ``<wav>/<name>.wav`` is
      saved as ``<dest>/wav/<test-id>.wav`` unless that file already exists.

    Args:
        src: a ``.qrtr.tdf`` file or a directory containing such files.
        dest: output directory; ``<dest>/wav`` is created if missing.
        wav: directory holding the source recordings.
        N: maximum number of matching lines still to be consumed.
        segments: list that receives the segment records; a fresh list is
            used when left at False.
        prompts: text accumulated so far; extended and returned.
        promptsfile: not used by this function; forwarded on recursion only.
        useBW: convert transcripts with ``buck.uni2buck``.
        rawPrompts: choose the plain prompt format over the ``!ENTER`` /
            ``!EXIT`` one.
        copywavfiles: also cut and save the audio for each segment.
        separatedashes: replace ``-`` by `` - `` in transcripts.

    Returns:
        Tuple ``(N, prompts)``: the remaining budget and the accumulated
        prompt text.
    """
    print("SRC %s, N %s" % (src, N))
    if segments is False:
        segments = []
    wavdir = os.path.join(dest, "wav")
    try:
        os.makedirs(wavdir)
    except OSError as e:
        # Only claim "already exists" when that is really the reason;
        # other failures are reported but, as before, not fatal.
        if os.path.isdir(wavdir):
            print("%s already exists" % (wavdir))
        else:
            print("could not create %s: %s" % (wavdir, e))

    if os.path.isdir(src):
        for f in os.listdir(src):
            if N > 0:
                # Forward every option: the recursion used to fall back to
                # the defaults (and to the global WAV) for nested entries.
                N, prompts = segment(src=os.path.join(src, f),
                                     dest=dest,
                                     wav=wav,
                                     N=N,
                                     segments=segments,
                                     prompts=prompts,
                                     promptsfile=promptsfile,
                                     useBW=useBW,
                                     rawPrompts=rawPrompts,
                                     copywavfiles=copywavfiles,
                                     separatedashes=separatedashes)
        return N, prompts

    prompt = os.path.basename(src)
    if not prompt.endswith(".qrtr.tdf"):
        return N, prompts
    print(prompt)
    prompt = prompt[:-len(".qrtr.tdf")]
    if copywavfiles:
        sound = sounds.readsound(os.path.join(wav, "%s.wav" % (prompt)))
    # The with-block closes the transcript file, also on early exit.
    with open(src) as lines:
        for i, line in enumerate(lines):
            m = P.match(line.strip())
            if not m:
                continue
            if N <= 0:
                break
            N -= 1
            transcript = m.group("transcript").decode("UTF-8")
            if useBW:
                transcript = buck.uni2buck(transcript, buck._uni2buck)
            if separatedashes:
                transcript = transcript.replace("-", " - ")
            transcript = respace.sub(
                " ", brackets.sub("", tag.sub("", transcript)))
            s = [
                m.group("prompt"),
                float(m.group("start")),
                float(m.group("end")),
                transcript
            ]
            # The line index keeps ids unique within one file.
            test = "test-%s-%s-%s-%s" % (prompt, m.group("gender"),
                                         m.group("dialect"), i)
            if transcript.strip() == "":
                continue
            if rawPrompts:
                prompts += "%s\n" % (transcript)
            else:
                prompts += "*/%s !ENTER %s !EXIT\n" % (test, transcript)
            segments.append(s)
            start = s[1]
            end = s[2]
            w = os.path.join(dest, "wav", "%s.wav" % (test))
            if copywavfiles and not os.path.isfile(w):
                # NOTE(review): frames is cleared before every save; the
                # reason is not visible here - confirm against sounds.
                sound.frames = False
                # presumably params[2] is the sample rate, turning seconds
                # into frame offsets - TODO confirm
                sound.save(w,
                           start=int(start * sound.params[2]),
                           end=int(end * sound.params[2]))
    return N, prompts
def lookup(self, word):
    """Analyse *word* after converting it with ``buck.uni2buck``.

    Returns whatever ``self.analyse`` returns for the converted word.
    """
    converted = buck.uni2buck(word)
    return self.analyse(converted)