def _lujvo_analyze(self, value):
    """Try to split ``value`` into a chain of rafsi forms (i.e. a lujvo).

    The word is re-read through ``orthography.stream_char`` and matched,
    with backtracking, against a table of rafsi shapes via
    ``self._match_form``.  Shapes ending in ``|`` are terminal (they end
    the word); shapes ending in ``-`` carry a hyphen letter.

    Side effects: ``self.ve_lujvo_rafsi`` is reset to ``[]`` on entry and,
    on success, is filled through ``self._add_rafsi`` (one call per matched
    form) and then trimmed as described below.

    :param value: the word to analyze, as a string.
    :returns: ``True`` if a decomposition was found and passed the
        follow-up checks, ``False`` otherwise.
    """
    # XXX TODO Not everything that isn't a gismu/lujvo/cmavo is a fu'ivla
    # XXX Move somewhere else?
    self.ve_lujvo_rafsi = []
    # XXX why you need that extra space, eh?
    orig_chars = list(
        orthography.stream_char(
            config.Configuration(stdin=io.StringIO(value + " "), args=[])
        )
    )

    ##http://www.lojban.org/sv/lists/lojban-list/msg16628.html
    terminal_rafsi = "CCV| CVV| CVhV| CVCCV| CCVCV|".split(" ")
    rafsi4 = "CVCCy CCVCy".split(" ")
    rafsi3 = "CVV- CVhV- CVC CVCy CVV CVhV CCV".split(" ")
    all_ = terminal_rafsi + rafsi4 + rafsi3

    # Stack of indices into all_: forms[:-1] are the shapes accepted so
    # far, forms[-1] is the candidate currently being tried.
    forms = [0]

    def next_form():
        """Advance to the next untried candidate, backtracking as needed.

        Returns True when every combination has been exhausted.
        """
        while forms:
            forms[-1] += 1
            if forms[-1] < len(all_):
                return False
            # No candidates left at this depth; drop it and advance the
            # previous level instead.
            forms.pop()
        return True  # FUHI!

    while True:
        # Number of characters already claimed by the accepted shapes.
        consumed = sum(len(all_[index]) for index in forms[:-1])
        remaining = orig_chars[consumed:]
        if not remaining:
            break
        candidate = all_[forms[-1]]
        try:
            matched = self._match_form(remaining, candidate, len(forms) == 1)
        except Exception:
            # Deliberately best-effort: any failure inside the matcher is
            # treated as "this shape does not fit here".
            matched = False
        if matched:
            if candidate[-1] == "|":
                break  # A terminal shape ends the word.
            forms.append(0)
        elif next_form():
            return False  # Ran out of checks. FUHI!

    # You think you found a lujvo
    # XXX: Do I need to check for doubling?
    first = all_[forms[0]]
    if first in ("CVV", "CVhV"):
        # xorxes says a CVV must end with CCV.
        if len(forms) != 2 or all_[forms[1]] != "CCV|":
            return False

    # "Check the consonant that follows CVC or CVCy." - I trust he refers
    # to the following paragraph? >_>  Maybe consonant clusters?
    offset = 0
    for form in forms:
        shape = all_[form]
        if shape[:2] == "CC":
            # Check consonant clusters
            pair = orig_chars[offset].value + orig_chars[offset + 1].value
            if not orthography.valid_init_cc(pair):
                self.config.debug("Invalid consonant cluster for lujvo: {0}".format(self))  # XXX - config.warn
                return False
        offset += len(shape)

    # Hand each matched chunk to _add_rafsi, which returns the characters
    # still to be consumed.
    for form in forms:
        orig_chars = self._add_rafsi(orig_chars, len(all_[form]))

    # Remove the hyphen letter from a leading CVV-/CVhV- rafsi.  (The two
    # shapes used to be written without a separating comma, which fused
    # them into one string that could never match.)
    if first in ("CVV-", "CVhV-"):
        self.ve_lujvo_rafsi[0] = self.ve_lujvo_rafsi[0][:-1]
    # Dump the extra whitespace that was appended to value above.
    self.ve_lujvo_rafsi[-1] = self.ve_lujvo_rafsi[-1][:-1]
    return True
def break_a_selbri(self):
    """Split the text at the head of the buffer into brivla/cmavo tokens.

    This function is called when a CC (consonant cluster) is in the text.
    It deals with the notable issue of separating selbri and cmavo,
    following the numbered steps ({2.C...}) quoted in the comments.

    Returns either a single token (the result of ``self.tokenize``) or a
    list of tokens.  Should all fail, the word is made a garbage token.

    TODO : Include brkword's error detections
    TODO : Look at all the pretty self.tokenize's
    """
    #{2.C.}
    # Load up on positions
    cc, cc_location = self.locate_cc()
    ps = self.locate_ps(cc)
    word_end = self.word()  # This is indexed from 1
    self.config.debug("""BUFFER: {0} cc_location: {1} cc: {2} PS: {3} we: {4}""".format(self.bit.buffer, cc_location, cc, ps, word_end))

    letters = []  # This is used for some form-checking
    for bit in self.bit.items(word_end):
        if bit.wordsep:
            break
        letters.extend(bit.chars)

    #{2.C.1)} - error checking
    # NOTE(review): `ps == False` also holds when ps is 0; left as written.
    if ps == False and cc == -1:
        #{2.C.1)a)} - cc but no ps
        self.config.warn("Has a consonant cluster, but no penultimate stress", self.bit[0].position)
        return self.tokenize(word_end, GARBAGE)
    #{2.C.1)b)} is taken care of in get_token()

    #{2.C.3)}
    # Locate the end of the brivla: walk forward from the penultimate
    # stress until the next bit containing a vowel.
    end_of_brivla = ps
    found_v = False
    END_WORD = False
    while True:
        end_of_brivla += 1  # First time, we move off of the vowel
        try:
            bit = self.bit[end_of_brivla]
            if bit.wordsep:
                END_WORD = True
        except EOFError:
            self.config.debug("EOF hit in break_a_selbri()")
            END_WORD = True
        if END_WORD:
            end_of_brivla -= 1
            if not found_v:
                # part of {2.C.3)b)} - it is required to have a vowel!
                self.config.warn("This supposed selbri is supposed to have a vowel", self.bit.buffer[0].position)
                return self.tokenize(end_of_brivla, GARBAGE)
            break  # There's the end. Stops the while loop
        if bit.has_V:
            found_v = True
            break
    #{2.C.3)c)} - end_of_brivla has now been set to the true end.
    self.config.debug("eb: {0}".format(end_of_brivla))

    #{2.C.4)} - Now begins the epic journey to the beginning
    init_cmavo_tokens = []
    #{2.C.4)a)]} - break off beginning cmavo pieces, like {lonudoklama}
    while cc_location >= 5:
        #{2.C.4)a)1]} and {2.C.4)a)2]}
        # cc is the LETTER position of the consonant cluster, not the
        # actual location.
        if self.bit[0].has_V:
            init_cmavo_tokens.append(self.tokenize(1, CMAVO))
        elif self.bit[0].has_C:
            init_cmavo_tokens.append(self.tokenize(2, CMAVO))
        else:
            break
        cc, cc_location = self.locate_cc()
    if init_cmavo_tokens:
        return init_cmavo_tokens

    #{2.C.4)a)3]}
    if letters[0].V:
        find_first_consonant = 0
        for bit in self.bit:
            if bit.has_C or bit.whitespace:
                break
            find_first_consonant += 1
        if self.bit[find_first_consonant].counts_CC:
            return self.tokenize(word_end, BRIVLA)
        return self.tokenize(find_first_consonant, CMAVO)

    #{2.C.4)b)} - it starts with a consonant. And should have a CC by this point
    # Things are really get cracking now!
    if self.bit[0].counts_CC or (self.bit[0].C and self.bit[1].V and self.bit[2].CyC):
        # CC... CVCyC...
        #{2.C.4)b)1]} - more selbri should be like this
        for bit in self.bit.items(end_of_brivla+1):
            if bit.garbage:
                self.config.warn("This supposed selbri has garbage in it", bit.position)
                return self.tokenize(end_of_brivla+1, CIZBRIVLA)
        return self.tokenize(end_of_brivla+1, BRIVLA)
    elif self.bit[0].CyC:
        #{2.C.4)b)2]} - I'm not sure this is necessary. Why is this necessary?
        # Would it not be handled with everything else?
        # TODO - test what happens if you remove it
        return self.tokenize(word_end, GARBAGE)
    elif self.bit[0].C and self.bit[1].counts_VV and (self.bit[2].CC or self.bit[2].CCC):
        # CVVCC
        #{2.C.4)b)3]}
        if ps == 1 or not valid_init_cc(self.bit[2]):
            #{2.C.4)b)3]a]}
            return self.tokenize(end_of_brivla+1, BRIVLA)
        #{2.C.4)b)3]b]}
        return self.tokenize(2, CMAVO)
    elif self.bit[0].C and self.bit[1].VhV and self.bit[2].CC:
        # CVhVCC
        #{2.C.4)b)4]}
        if ps != 1 and valid_init_cc(self.bit[2]):
            return self.tokenize(2, CMAVO)
        return self.tokenize(end_of_brivla+1, BRIVLA)
    elif self.bit[0].C and self.bit[1].V and self.bit[2].counts_CC:
        # CVCC..
        #{2.C.4)b)5]]}
        if self.bit[3].V and end_of_brivla == 3:
            # CVCCV
            #{2.C.4)b)5]a]} - gismu
            return self.tokenize(4, BRIVLA)
        if not valid_init_cc(self.bit[2]):
            #{2.C.4)b)5]b]}
            OKAY = True
            for bit in self.bit.items(end_of_brivla)[2:end_of_brivla]:
                if bit.CC and not valid_init_cc(bit):
                    OKAY = False
            if OKAY:
                return self.tokenize(end_of_brivla+1, BRIVLA)

        #{2.C.4)b)5]c]}
        first_v_is_ps = True
        for bit in self.bit.items(ps-1):
            if bit.has_V:
                first_v_is_ps = False
        if first_v_is_ps:
            return self.tokenize(end_of_brivla+1, BRIVLA)

        #{2.C.4)b)5]d]}
        # Index of the first 'y' among the letters, or None if there is none.
        y_index = None
        for index, letter in enumerate(letters):
            if letter.y:
                y_index = index
                break

        # has_y and !has_y are fairly similar.
        # TODO : Consider merging these two cases?
        if y_index is not None:
            #{2.C.4)b)5]d]}
            letters = letters[:y_index]  # Adjust letters to look at only stuff before the y
            i = 0
            #{2.C.4)b)5]d]1>} - match (CVC){2,}y..
            CVC_MATCH = False
            while True:
                if i+2 >= len(letters):
                    # End of word... End of selbri..
                    if i >= 3*2:
                        # Then it's probably okay! But one more thing...
                        CVC_MATCH = True
                        return self.tokenize(end_of_brivla+1, BRIVLA)
                    break
                if letters[i+0].C and letters[i+1].V and letters[i+2].C:
                    i += 3  # Match a CVC
                else:
                    break  # Okay, it's not a (CVC)*
                if i >= len(letters):
                    break
            # And then make sure the CC's are all valid init pairs
            # NOTE(review): unreachable as written - CVC_MATCH only becomes
            # True immediately before the return above.  This block also
            # uses self.debug / self.bits where the rest of the method
            # uses self.config.debug / self.bit; confirm before reviving.
            if CVC_MATCH:
                self.debug("CVC_MATCH")
                for bit in self.bits:
                    self.debug(bit)
                    if not valid_init_cc(bit.CC):
                        self.debug("Has invalid init CC", bit.position)
                        break
                    elif bit.wordsep or bit.y:
                        return self.tokenize(end_of_brivla+1, BRIVLA)

            #{2.C.4)b)5]d]2>}
            # NOTE(review): "CV'V" here vs 'CVhV' in the no-y branch below -
            # confirm which spelling the matcher expects.
            frontmiddles = ["CVC", "CVV", "CV'V", "CCV"]
            ends = ["CVC", "CCVC", "CVCC"]
            i = 0
            while True:
                if test(letters, i, ends, matchall):
                    # Can match ZERO frontmiddles
                    # So, it is a selbri!
                    return self.tokenize(end_of_brivla+1, BRIVLA)
                fms = test(letters, i, frontmiddles, match)
                if not fms:
                    #{2.C.4)b)5]d]3>} - Doesn't match a front-middle
                    return [self.tokenize(2, CMAVO), self.tokenize(y_index-2, BRIVLA)]
                i += fms
        else:
            # !has_y
            #{2.C.4)b)5]e]}
            i = 0
            while True:
                #{2.C.4)b)5]e]1>} - If (CVC){2,}CV , break at first CV and make the rest a selbri
                # In other words
                #  Starts with C. Ends with C.
                #  Odd bits are V
                #  Second-to-last is C, last is V
                #  Even bits are CC, and those CC are valid
                # TODO (This part's brother uses letters instead of bits, which seems nicer.)
                if i == 0:
                    # Starts with C
                    if not self.bit[0].C:
                        # When would it ever start with a c?
                        break
                elif i == end_of_brivla - 1:
                    # Ends with CV
                    if self.bit[end_of_brivla-1].CC and self.bit[end_of_brivla].V:
                        # Actually, ends with CCV. Check that CC!
                        if not valid_init_cc(self.bit[end_of_brivla-1]):
                            break
                        # First two are cmavo, rest is selbri
                        return [self.tokenize(2, CMAVO), self.tokenize(end_of_brivla-1, BRIVLA)]
                    break
                elif i % 2 == 0:
                    # Even bit, a CC
                    if not self.bit[i].CC or not valid_init_cc(self.bit[i]):
                        break
                else:
                    # An odd bit. A single vowel.
                    if not self.bit[i].V:
                        break
                i += 1

            # Try: nuncasnu
            # If any CC's are not valid_init_cc, then BRIVLA
            old = None
            for bit in self.bit.items(end_of_brivla):
                if bit.counts_CC:
                    if old:
                        b = old.chars[-1].value
                        c = bit.chars[0].value
                        if not valid_init_cc([b, c]):
                            return self.tokenize(end_of_brivla+1, BRIVLA)
                    if not valid_init_cc(bit):
                        return self.tokenize(end_of_brivla+1, BRIVLA)
                    else:
                        # There might be a case of [CC][CC], in which case
                        # we'd need to check the C][C
                        old = bit
                else:
                    old = None

            #{2.C.4)b)5]e]2>}
            i = 0
            frontmiddles = ['CVC', 'CVV', 'CVhV', 'CCV']
            # NOTICE! I've added a CCV front-middle form, which isn't mentioned in brkwords.txt and so
            # is probably wrong. Despite the fact that it actually lets it parse everything.
            ends = ['CVhV', 'CVV', 'CCV', 'CCVCV', 'CVCCV']
            # miklAma
            # bakrecpa'o
            while True:
                # Here is a regex-like form for what we're looking for:
                #     (Front_Middle)*(End)
                # If the entire chunk matches that form, then it is a selbri.
                # If not, then the first two bits make a cmavo, and the rest
                # is a selbri.
                end_test = test(letters, i, ends, matchall)
                if end_test:
                    # XXX I guess... matchall -> mi klama & mi tsmuvla, match -> miklama mitsmuvla
                    # Can match ZERO frontmiddles
                    # So, it is a selbri!
                    self.config.debug("{0}\n{1}".format(letters[i:], ends))
                    if test(letters, i+end_test, ends, match):
                        # There is more than one end, so it must be cmavo(CV) selbri()
                        break
                    else:
                        return self.tokenize(end_of_brivla+1, BRIVLA)
                frontmiddle_test = test(letters, i, frontmiddles, match)
                if not frontmiddle_test:
                    # Doesn't match a front-middle
                    break
                i += frontmiddle_test
                if i >= len(letters):
                    self.config.error("How did I get here?", letters[0].position)

            #{2.C.4)b)5]e]3>}
            # Returned as a list (not a tuple) so that every multi-token
            # result of this method has the same type.
            return [self.tokenize(2, CMAVO), self.tokenize(end_of_brivla-1, FUHIVLA)]
        #endif has_y
    else:
        # Some other beginning we don't do nothing with!
        #{2.C.4)b)6]}
        self.config.warn("Don't know how to deal with this supposed selbri, it has a strange begining", self.bit[0].position)
        return self.tokenize(end_of_brivla+1, GARBAGE)