Ejemplo n.º 1
0
def _collect_adaptation_operations(source_list, match_source_list):
    """ Diff a matching source against the new source and collect operations

    The two split lists are compared with SequenceMatcher and every opcode
    that differs is registered into an operations dictionary through
    check_sequence_operations().

    source_list       -- the split new source text
    match_source_list -- the split source text of the old match

    Returns a (safe, operations) tuple where safe is:
        True:  every difference was accepted
        False: accepted, but the letter case of an equal text item differs
        None:  a difference was not accepted, the match must be skipped
    """
    # Genera groups inside which a sequence may be replaced by another one;
    # both sides of a replacement have to pass check_sequence() for the same
    # group
    replaceable_groups = (
        ('placeholder', 'branding', 'syntax'),
        ('number',),
        ('fixed',),
        ('punctuation',),
    )
    # Genera accepted for deletion and for insertion at the edges
    droppable_genera = ('placeholder', 'branding', 'syntax',
                        'number', 'punctuation', 'control')
    # Consider the operation is safe for the moment
    safe = True
    # The accepted operations
    operations = {}
    seq = SequenceMatcher(None, match_source_list, source_list)
    for tag, i1, i2, j1, j2 in seq.get_opcodes():
        from_seq = match_source_list[i1:i2]
        to_seq = source_list[j1:j2]
        if tag == 'equal':
            # Equal sequences are kept, but a letter case difference in a
            # text item downgrades the result to fuzzy
            for from_word, to_word in zip(from_seq, to_seq):
                if from_word.genus == 'text' and \
                   from_word.key(ignorecase=False) != to_word.key(ignorecase=False):
                    safe = False
                    break
        elif tag == 'replace':
            # Accept the replacement only inside one of the genera groups
            accepted = any(check_sequence(from_seq, genera=list(group)) and
                           check_sequence(to_seq, genera=list(group))
                           for group in replaceable_groups)
            if not accepted:
                return None, operations
            if not check_sequence_operations(operations, from_seq, to_seq):
                return None, operations
        elif tag == 'delete':
            # Accept deletion of several genera only
            if not check_sequence(from_seq, genera=list(droppable_genera)):
                return None, operations
            if not check_sequence_operations(operations, from_seq, []):
                return None, operations
        elif tag == 'insert':
            # Accept insertion of several genera only
            if not check_sequence(to_seq, genera=list(droppable_genera)):
                return None, operations
            # Insertion is allowed only at the beginning ('^') or at the
            # end ('$') of the matching source
            if i1 == 0:
                species = '^'
            elif i1 == len(match_source_list):
                species = '$'
            else:
                return None, operations
            # A synthetic node with the '~' genus stands for the edge
            marker = patterns.SplitNode(text='', genus='~', species=species)
            if not check_sequence_operations(operations, [marker], to_seq):
                return None, operations
        else:
            # Unknown operation
            return None, operations
    return safe, operations


def _apply_adaptation_operations(operations, translation_list,
                                 match_source_list, safe):
    """ Apply the collected operations on a split translation

    operations        -- dictionary mapping (from, to) tuples to the expected
                         number of occurrences
    translation_list  -- the split matching translation; it may be modified
                         in place
    match_source_list -- the split source text of the old match
    safe              -- the safety level so far (True or False)

    Operations whose 'from' sequence does not occur in the translation the
    expected number of times are retried; when all the remaining ones keep
    failing, a last round is tried with extrapolation, which accepts a count
    mismatch if the count equals the number of occurrences in the matching
    source.

    Returns a (safe, translation_list) tuple.
    """
    # Materialize the items, so the comparison with the list of failed
    # operations is a list comparison on both Python 2 and Python 3
    ops = list(operations.items())
    # Start with an empty list of failed operations
    failed = []
    # Do not extrapolate from the start
    extrapolate = False
    # Continue as long as there are any operations left
    while ops:
        for op, count in ops:
            # Convert back the tuples into lists
            from_seq = list(op[0])
            to_seq = list(op[1])
            # A single '~' node means insertion at one of the edges
            if len(from_seq) == 1 and from_seq[0].genus == '~':
                if from_seq[0].species == '$':
                    translation_list.extend(to_seq)
                elif from_seq[0].species == '^':
                    translation_list = to_seq + translation_list
                continue
            # Get the number of the occurrences of this operation in the
            # translation we are building
            found = sublist_count(from_seq, translation_list, ignore=['space'])
            if found != count:
                if extrapolate and \
                   sublist_count(from_seq, match_source_list, ignore=['space']) == count:
                    # The count means 'all' anyway, operate on all of them,
                    # but the result will be fuzzy at best
                    safe = False
                else:
                    # We can not extrapolate or the count does not mean
                    # 'all', so fail this operation
                    failed.append((op, count))
                    continue
            # Replace the original sequences, in reversed order so the
            # earlier positions stay valid
            positions = sublist_positions(from_seq, translation_list, ignore=['space'])
            for start, end in reversed(positions):
                translation_list[start:end] = to_seq
        if not failed:
            # No failed operations, go on with the current safety level
            break
        if ops == failed:
            if not extrapolate:
                # Everything failed, try the same operations once more with
                # extrapolation. The list of failures has to be cleared,
                # otherwise the next round would append to the stale entries,
                # never compare equal to ops and re-run operations that have
                # already been applied.
                extrapolate = True
                failed = []
            else:
                # All the operations have failed, even in extrapolation.
                # If safe was True (first iteration), switch to None, if it
                # was False (later iteration), keep it
                ops = []
                if safe:
                    safe = None
        else:
            # Only some operations have failed, try them again
            ops = failed[:]
            failed = []
            safe = False
    return safe, translation_list


def adapt_translation(source, matches=None):
    """ Translate new messages by adapting an old translation

    source  -- the new source text to translate
    matches -- iterable of (match_source, match_translation) pairs of old
               messages to adapt; None or empty means no matches

    Each match is diffed against the source; when all the differences are
    accepted, the same changes are applied on the translation of the match.
    The first fully safe match wins, otherwise the first fuzzy one is kept.

    Returns a (safe, translation) tuple where safe is:
        True:  translated
        False: fuzzy
        None:  untranslated (translation is None too)
    """
    # Start with a null result
    result = None, None
    if matches is None:
        matches = ()
    for match_source, match_translation in matches:
        # Split both source texts by using the Splitter
        source_list = patterns.split(source)
        match_source_list = patterns.split(match_source)
        safe, operations = _collect_adaptation_operations(source_list,
                                                          match_source_list)
        if safe is None:
            # We are not safe, try another match
            continue
        # Work on a copy of the split matching translation
        translation_list = list(patterns.split(match_translation))
        safe, translation_list = _apply_adaptation_operations(
            operations, translation_list, match_source_list, safe)
        # Keep this match if there is no previous result, or if we are safe
        # now and the previous result was not
        if (result[0] is None and safe is not None) or (not result[0] and safe):
            result = (safe, patterns.join(translation_list))
            if safe:
                # We can break if we are safe, no more matches are needed
                break
    # Return the result
    return result