Beispiel #1
0
    def _lujvo_analyze(self, value):
        """Try to decompose ``value`` into rafsi, recording them on ``self``.

        The characters of ``value`` (read through ``orthography.stream_char``)
        are matched against a fixed table of rafsi shapes by a depth-first
        search with backtracking: ``forms`` holds, for every rafsi position
        tried so far, the index of the shape currently being attempted.

        ``self.ve_lujvo_rafsi`` is reset on entry and is populated through
        ``self._add_rafsi`` only when a decomposition is accepted.

        Args:
            value: The word to analyse, as a string.

        Returns:
            True if a decomposition was found and passed the follow-up checks,
            False otherwise.
        """
        # XXX TODO Not everything that isn't a gismu/lujvo/cmavo is a fu'ivla
        # XXX Move somewhere else?
        self.ve_lujvo_rafsi = []
        # XXX why you need that extra space, eh?
        orig_chars = list(
            orthography.stream_char(config.Configuration(stdin=io.StringIO(value + " "), args=[]))
        )

        ##http://www.lojban.org/sv/lists/lojban-list/msg16628.html
        # Shapes ending in "|" are terminal: the search stops as soon as one of
        # them matches.  A trailing "-" presumably stands for the hyphen letter
        # (TODO confirm against _match_form).  The marker characters count
        # towards the consumed length, because offsets are computed from
        # len(shape).
        terminal_rafsi = ["CCV|", "CVV|", "CVhV|", "CVCCV|", "CCVCV|"]
        rafsi4 = ["CVCCy", "CCVCy"]
        rafsi3 = ["CVV-", "CVhV-", "CVC", "CVCy", "CVV", "CVhV", "CCV"]
        all_ = terminal_rafsi + rafsi4 + rafsi3
        last_index = len(all_) - 1

        forms = [0]

        def next_form():
            """Advance the search; return True once every combination failed."""
            # Drop every level that has already tried its last shape, then
            # advance the deepest level that still has shapes left.  The bounds
            # are explicit so that no out-of-range index is ever produced.
            while forms and forms[-1] >= last_index:
                forms.pop()
            if not forms:
                return True  # FUHI!
            forms[-1] += 1
            return False

        def consumed():
            """Number of characters covered by the shapes accepted so far."""
            return sum(len(all_[index]) for index in forms[:-1])

        while True:
            remaining = orig_chars[consumed():]
            if not remaining:
                break

            try:
                matched = self._match_form(remaining, all_[forms[-1]], len(forms) == 1)
            except Exception:
                # Best effort: any failure inside the matcher counts as "this
                # shape does not fit here".
                matched = False

            if matched:
                if all_[forms[-1]][-1] == "|":
                    break
                forms.append(0)
            elif next_form():
                return False  # Ran out of checks. FUHI!

        # You think you found a lujvo
        # XXX: Do I need to check for doubling?
        first = all_[forms[0]]
        if first in ("CVV", "CVhV"):
            # xorxes says a CVV must end with CCV.
            if len(forms) != 2 or all_[forms[1]] != "CCV|":
                return False

        # "Check the consonant that follows CVC or CVCy." - I trust he refers to
        # the following paragraph? >_>  Maybe consonant clusters?
        offset = 0
        for form in forms:
            shape = all_[form]
            if shape[:2] == "CC":
                # Every rafsi starting with CC must begin with a valid initial pair.
                pair = orig_chars[offset].value + orig_chars[offset + 1].value
                if not orthography.valid_init_cc(pair):
                    self.config.debug("Invalid consonant cluster for lujvo: {0}".format(self))  # XXX - config.warn
                    return False
            offset += len(shape)
        # (A recursive re-check of the tail after a leading CVC/CVCy was sketched
        # here but never enabled.)

        # Hand the characters of each accepted shape to _add_rafsi, which
        # returns the characters still to be distributed.
        for form in forms:
            orig_chars = self._add_rafsi(orig_chars, len(all_[form]))

        if first in ("CVV-", "CVhV-"):
            # Remove the hyphen letter that trails the first rafsi.
            self.ve_lujvo_rafsi[0] = self.ve_lujvo_rafsi[0][:-1]

        # Drop the last character of the final rafsi - presumably the padding
        # space appended to ``value`` above.
        self.ve_lujvo_rafsi[-1] = self.ve_lujvo_rafsi[-1][:-1]
        return True
Beispiel #2
0
    def break_a_selbri(self):
        """This function is called when a CC is in the text. It deals with the notable issue of seperating selbri and cmavo. It returns either a single token, or a list of tokens. Should all fail, it will make the word a garbage token
        TODO : Include brkword's error detections
        TODO : Look at all the pretty self.tokenize's
        """
        #{2.C.}
        
        #Load up on positions
        
        cc, cc_location = self.locate_cc()
        ps = self.locate_ps(cc)
        word_end = self.word() #This is indexed from 1
        
        self.config.debug("""BUFFER: {0}
cc_location: {1}
cc: {2}
PS: {3}
we: {4}""".format(self.bit.buffer, cc_location, cc, ps, word_end))
        
        
        
        letters = [] #This is used for some form-checking
        for bit in self.bit.items(word_end):
            if bit.wordsep:
                break
            for char in bit.chars:
                letters.append(char)
        
        #{2.C.1)} - error checking
        
        if ps == False and cc == -1:
            #{2.C.1)a)} - cc but no ps
            self.config.warn("Has a consonant cluster, but no penultimate stress", self.bit[0].position)
            return self.tokenize(word_end, GARBAGE)
        #{2.C.1)b)} is taken care of in get_token()
        
        
        #{2.C.3)}
        #Locate the end of the brivla
        end_of_brivla = ps
        alter_PS = False
        found_v = False
        
        END_WORD = False
        
        while 1:
            end_of_brivla += 1 #First time, we move off of the vowel
            
            try:
                bit = self.bit[end_of_brivla]
                if bit.wordsep:
                    END_WORD = True
            except EOFError:
                self.config.debug("EOF hit in break_a_selbri()")
                END_WORD = True
            
            if END_WORD:
                end_of_brivla -= 1
                if not found_v:
                    #part of {2.C.3)b)} - it is required to have a vowel!
                    self.config.warn("This supposed selbri is supposed to have a vowel", self.bit.buffer[0].position)
                    return self.tokenize(end_of_brivla, GARBAGE)
                break #There's the end. Stops the while loop
            if bit.has_V:
                found_v = True
                break

        #{2.C.3)c)} - end_of_brivla has now been set to the true end. 
        
        self.config.debug("eb: {0}".format(end_of_brivla))
        
        #{2.C.4)} - Now begins the epic journey to the begining
        
        
        init_cmavo_tokens = []
        
        #{2.C.4)a)]} - break off begining cmavo pieces, like {lonudoklama}
        while cc_location >= 5:
            #{2.C.4)a)1]} and {2.C.4)a)2]}
            #cc is the LETER position of the con cluster, not the actual loccccatttioonn..
            if self.bit[0].has_V:
                
                init_cmavo_tokens.append(self.tokenize(1, CMAVO))
            elif self.bit[0].has_C:
                init_cmavo_tokens.append(self.tokenize(2, CMAVO))
            else:
                break
            cc, cc_location = self.locate_cc()
        if init_cmavo_tokens:
            return init_cmavo_tokens
        
        #self.config.debug(has_starting_cc)
        
        
        #{2.C.4)a)3]}
        if letters[0].V:
            find_first_consonant = 0
            for bit in self.bit:
                if bit.has_C:
                    break
                if bit.whitespace:
                    break
                find_first_consonant += 1
            
            
            if self.bit[find_first_consonant].counts_CC:
                return self.tokenize(word_end, BRIVLA)
            else:
                return self.tokenize(find_first_consonant, CMAVO)
            
        else:
            #{2.C.4)b)} - it starts with a consonant. And should have a CC by this point
            #Things are really get cracking now!
            #self.config.debug(self.bit.buffer)
            if self.bit[0].counts_CC or    (self.bit[0].C and self.bit[1].V and self.bit[2].CyC) : #CC... CVCyC...
                #{2.C.4)b)1]} - more selbri should be like this
                
                for bit in self.bit.items(end_of_brivla+1):
                    if bit.garbage:
                        self.config.warn("This supposed selbri has garbage in it", bit.position)
                        return self.tokenize(end_of_brivla+1, CIZBRIVLA)
                return self.tokenize(end_of_brivla+1, BRIVLA)
            elif self.bit[0].CyC:
                #{2.C.4)b)2]} - I'm not sure this is neccessary. Why is this neccessary?
                # Would it not be handled with everything else? TODO - test what happens if you remove it
                return self.tokenize(word_end, GARBAGE)
            elif self.bit[0].C and self.bit[1].counts_VV and (self.bit[2].CC or self.bit[2].CCC): #CVVCC
                #{2.C.4)b)3]}
                
                if ps == 1 or not valid_init_cc(self.bit[2]):
                    #{2.C.4)b)3]a]}
                    return self.tokenize(end_of_brivla+1, BRIVLA)
                else:
                    #{2.C.4)b)3]b]}
                    return self.tokenize(2, CMAVO)
            elif self.bit[0].C and self.bit[1].VhV and self.bit[2].CC: #CVhVCC
                #{2.C.4)b)4]}
                if ps != 1 and valid_init_cc(self.bit[2]):
                    return self.tokenize(2, CMAVO)
                else:
                    return self.tokenize(end_of_brivla+1, BRIVLA)
            elif self.bit[0].C and self.bit[1].V and self.bit[2].counts_CC: #CVCC..
                #{2.C.4)b)5]]}
                if self.bit[3].V and end_of_brivla == 3: #CVCCV
                    #{2.C.4)b)5]a]} - gismu
                    return self.tokenize(4, BRIVLA)
                
                if not valid_init_cc(self.bit[2]):
                    #{2.C.4)b)5]b]}
                    OKAY = True
                    for bit in self.bit.items(end_of_brivla)[2:end_of_brivla]:
                        if bit.CC and not valid_init_cc(bit):
                            OKAY = False
                    if OKAY:
                        return self.tokenize(end_of_brivla+1, BRIVLA)
                
                
                #{2.C.4)b)5]c]}
                first_v_is_ps = True
                for bit in self.bit.items(ps-1):
                    if bit.has_V:
                        first_v_is_ps = False
                if first_v_is_ps:
                    return self.tokenize(end_of_brivla+1, BRIVLA)
                
                
                #{2.C.4)b)5]d]}
                
                has_y = False
                i = 0
                for l in letters:
                    i += 1
                    if l.y:
                        has_y = True
                        break
                
                #has_y and !has_y are fairly similiar.
                # TODO : Consider merging these two cases?
                
                if has_y:
                    #{2.C.4)b)5]d]}
                    has_y = i-1
                    letters = letters[:has_y] #Adjust letters to look at only stuff before the y
                    i = 0
                    
                    #self.config.debug(letters)
                    
                    #{2.C.4)b)5]d]1>} - match (CVC){2,}y..
                    CVC_MATCH = False
                    while 1:
                        #self.config.debug(i)
                        if i+2 >= len(letters): #End of word... End of selbri..
                            if i >= 3*2: #Then it's probably okay! But one more thing...
                                CVC_MATCH = True
                                return self.tokenize(end_of_brivla+1, BRIVLA)
                            break
                        if letters[i+0].C and letters[i+1].V and letters[i+2].C:
                            i += 3 #Match a CVC
                        else:
                            break #Okay, it's not a (CVC)*
                        if i >= len(letters):
                            break
                    
                    # And then make sure the CC's are all valid init pairs
                    if CVC_MATCH:
                        self.debug("CVC_MATCH")
                        for bit in self.bits:
                            self.debug(bit)
                            if not valid_init_cc(bit.CC):
                                self.debug("Has invalid init CC", bit.position)
                                break
                            elif bit.wordsep or bit.y:
                                return self.tokenize(end_of_brivla+1, BRIVLA)
                    
                    #{2.C.4)b)5]d]2>}
                    frontmiddles = ["CVC", "CVV", "CV'V", "CCV"]
                    ends = ["CVC", "CCVC", "CVCC"]
                    
                    i = 0
                    while 1:
                        if test(letters, i, ends, matchall):
                            #Can match ZERO frontmiddles
                            #So, it is a selbri!
                            return self.tokenize(end_of_brivla+1, BRIVLA)
                        
                        fms = test(letters, i, frontmiddles, match)
                        if not fms:
                            #{2.C.4)b)5]d]3>} - Doesn't match a front-middle
                            return [self.tokenize(2, CMAVO), self.tokenize(has_y-2, BRIVLA)]
                        i += fms
                    
                    
                else: #!has_y
                    #{2.C.4)b)5]e]}
                    i = 0
                    
                    while 1:
                        #{2.C.4)b)5]e]1>} - If (CVC)}{2,}CV , break at first CV and make the rest a selbri
                        
                        #In other words
                        #Starts with C. Ends with C.
                        #Odd bits are V
                        #Second-to-last is C, last is V
                        #Even bits are CC, and those CC are valid
                        # TODO (This parts' brother uses letters instead of bits, which seems nicer.)
                        
                        if i == 0: #Starts with C
                            if not self.bit[0].C: #When would it ever start with a c?
                                break
                                
                        elif i == end_of_brivla - 1: #Ends with CV
                            if self.bit[end_of_brivla-1].CC and self.bit[end_of_brivla].V:
                                #Actually, ends with CCV. Check that CC!
                                if not(valid_init_cc(self.bit[end_of_brivla-1])):
                                    break
                                #First two are cmavo, rest is selbri
                                return [self.tokenize(2, CMAVO), self.tokenize(end_of_brivla-1, BRIVLA)]
                            break
                        elif i % 2 == 0: #Even bit, a CC
                            if not self.bit[i].CC or not valid_init_cc(self.bit[i]):
                                break
                            #if not (self.bit[i].CC and valid_init_cc(self.bit[i])):
                                #break
                        else: #An odd bit. A single vowel.
                            if not (self.bit[i].V):
                                break
                        i += 1
                    
                    
                    #Try: nuncasnu
                    
                    #If any CC's are not valid_init_cc, then BRIVLA
                    old = None
                    for bit in self.bit.items(end_of_brivla):
                        if bit.counts_CC:
                            #self.config.debug(bit, valid_init_cc(bit))
                            if old:
                                b = old.chars[-1].value
                                c = bit.chars[0].value
                                if not valid_init_cc([b, c]):
                                    return self.tokenize(end_of_brivla+1, BRIVLA)
                            if not valid_init_cc(bit):
                                return self.tokenize(end_of_brivla+1, BRIVLA)
                            else:
                                old = bit #There might be a case of [CC][CC], in which case we'd need to check the C][C
                        else:
                            old = None
                            
                    
                    
                    #{2.C.4)b)5]e]2>}
                    i = 0
                    frontmiddles = ['CVC', 'CVV', 'CVhV', 'CCV'] 
                    # NOTICE! I've added a CCV front-middle form, which isn't mentioned in brkwords.txt and so
                    # is probably wrong. Despite the fact that it actually lets it parse everything.
                    ends = ['CVhV', 'CVV', 'CCV', 'CCVCV', 'CVCCV']
                    #miklAma
                    #bakrecpa'o
                    while 1:
                        """
                        Here is a regex-like form for what we're looking for:
                            (Front_Middle)*(End)
                        
                        If the entire chunk matches that form, then it is a selbri. If not,
                        then the first two bits make a cmavo, and the rest is a selbri.
                        """
                        end_test = test(letters, i, ends, matchall)
                        if end_test: #XXX I guess... matchall -> mi klama & mi tsmuvla, match -> miklama mitsmuvla
                            #Can match ZERO frontmiddles
                            #So, it is a selbri!
                            self.config.debug("{0}\n{1}".format(letters[i:], ends))
                            
                            if test(letters, i+end_test, ends, match):
                                #There is more than one end, so it must be cmavo(CV) selbri()
                                break
                            else:
                                return self.tokenize(end_of_brivla+1, BRIVLA)
                        
                        
                        
                        frontmiddle_test = test(letters, i, frontmiddles, match)
                        if not frontmiddle_test:
                            #Doesn't match a front-middle
                            break
                            #return [self.tokenize(2, CMAVO), self.tokenize(has_y-2, BRIVLA)]
                        
                        i += frontmiddle_test
                        
                        if i >= len(letters):
                            self.config.error("How did I get here?", letters[0].position)
                    
                    
                    #{2.C.4)b)5]e]3>}
                    return self.tokenize(2, CMAVO), self.tokenize(end_of_brivla-1, FUHIVLA)
                #endif has_y
            else: #Some other begining we don't do nothing with!
                #{2.C.4)b)6]}
                self.config.warn("Don't know how to deal with this supposed selbri, it has a strange begining", self.bit[0].position)
                return self.tokenize(end_of_brivla+1, GARBAGE)