import sys
import copy
from time import time

# ruleLearningLib, AlignmentTemplateSet, AlignmentTemplateGenerationMethod,
# AT_GeneralisationOptions, powerset and debug are assumed to be defined in
# the surrounding module (sketches for powerset and debug appear below).

def process_bilingual_phrases(atListWithLemmasList, bilingualPhrases, generalisationOptions, generationMethod, allowedSLLemmas):
    finalAlignmentTemplates=ruleLearningLib.AlignmentTemplateSet()
    idAt=1
    
    structuralVariationsDictionary=dict()
    lexicalVariationsDictionary=dict()
    afterwardsDictionary=dict()
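    # memoisation caches: structural generalisations per AT (keyed by the AT
    # itself), lexical generalisations per lemma/PoS/alignment signature, and
    # the afterwards restrictions recorded for every generalised AT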
    
    timeStructuralvariations=0.0
    timeLexicalVariations=0.0
    timeRemovingWrongAlignments=0.0
    timeCorrectAndIncorrect=0.0
    timeAfterwardsRestrictions=0.0
    
    for atWithLemmas in atListWithLemmasList:
        at=atWithLemmas[0]
        sllemmas=atWithLemmas[1]
        tllemmas=atWithLemmas[2]
        tllemmasfromdictionary=atWithLemmas[3]
        
        debug("Generalising "+str(at)+" | "+str(sllemmas)+" | "+str(tllemmas))
        
        if generationMethod==AlignmentTemplateGenerationMethod.FIRST_APPROACH:
            subsetsGraph=ruleLearningLib.SubsetGraph()
            idAt=ruleLearningLib.AlignmentTemplate_generate_all_generalisations_and_add_them(at,sllemmas,tllemmas,tllemmasfromdictionary,finalAlignmentTemplates,idAt,subsetsGraph,True,True,generalisationOptions.get_genWhenEmptyTLCats(),generalisationOptions.get_genWhenEmptySLCats())
        elif generationMethod==AlignmentTemplateGenerationMethod.TL_VARIABLES:
            debug("Checking whether hash '"+str(hash(at))+"' is in the dictionary |d| = "+str(len(structuralVariationsDictionary))+".")
            #wildcard and reference values
            if at not in structuralVariationsDictionary:
                debug("AT not found in structural generalisations")
                starttime=time()
                structuralVariationsAts=ruleLearningLib.AlignmentTemplate_generate_all_structural_generalisations(at,generalisationOptions)
                timeStructuralvariations+=(time()-starttime)
                structuralVariationsDictionary[at]=structuralVariationsAts
            else:
                debug("AT already found in structural generalisations. Not repeating work")
            
            lemmasposandalignments=at.fast_clone()
            lemmasposandalignments.remove_all_inflection_tags()
            cleanAT=lemmasposandalignments.fast_clone()
            lemmasposandalignments.set_lemmas(sllemmas,tllemmas)
            lemmasposandalignments.tl_lemmas_from_dictionary=tllemmasfromdictionary
            
            #lexicalisations
            if lemmasposandalignments not in lexicalVariationsDictionary:
                starttime=time()
                lexicalVariationsAtsF=ruleLearningLib.AlignmentTemplate_generate_all_lexical_generalisations(cleanAT,sllemmas,tllemmas,tllemmasfromdictionary,generalisationOptions.is_unlexicaliseUnalignedSL())
                if allowedSLLemmas:
                    lexicalVariationsAts=[myat for myat in lexicalVariationsAtsF if tuple(myat.get_sl_lemmas()) in allowedSLLemmas]
                else:
                    lexicalVariationsAts=lexicalVariationsAtsF
                    
                timeLexicalVariations+=(time()-starttime)
                lexicalVariationsDictionary[lemmasposandalignments]=lexicalVariationsAts
            
            #removing alignments
            starttime=time()
            for atstruct in structuralVariationsDictionary[at]:
                for atlex in lexicalVariationsDictionary[lemmasposandalignments]:
                    newat=atstruct.fast_clone()
                    newat.set_lemmas_from_other_at(atlex)
                    options=newat.get_unalignment_options_for_multiple_aligned_unlexicalised_tl_words(lemmasposandalignments)
                    for option in options:
                        atcopy=newat.fast_clone()
                        atcopy.remove_alignments(option)
                        atcopy.alignments.sort()
                        debug("Obtained AT: "+str(atcopy))
                        
                        if atcopy not in afterwardsDictionary:
                            afterwardsDictionary[atcopy]=list()
                        afterwardsDictionary[atcopy].append(atcopy.afterwards_restrictions)
                        
                        if not finalAlignmentTemplates.is_in_set(atcopy):
                            debug("is NOT in set")
                            idAt+=1
                            atcopy.id=idAt
                            finalAlignmentTemplates.add(atcopy)
            timeRemovingWrongAlignments+=(time()-starttime)
        else:   
            print >> sys.stderr, "WRONG GENERATION METHOD"
    
    
    # restart id numbering from the number of ATs generated so far
    idAt=len(finalAlignmentTemplates.get_all_ats_list())
    finalAlignmentTemplatesAfterwardsRestrictions=AlignmentTemplateSet()
    
    
    if ruleLearningLib.DEBUG:
        debug("All the bilingual phrases:")
        for bilphrase in bilingualPhrases.get_all_ats_list():
            debug("\t"+str(bilphrase))
            tllemmaslocal=u" ".join([ "'"+lem+"'" for lem in bilphrase.tl_lemmas_from_dictionary  ])
            debug("TL lemmas: "+tllemmaslocal.encode('utf-8'))
    
    
    matchingBilphrasesDict=dict()
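    # run the expensive matching/compatibility check once per AT and cache the
    # result; it is reused below both to set at.freq and to build the
    # incorrect/reproduced bilphrase sets for the restriction search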
    for at in finalAlignmentTemplates.get_all_ats_list():
        starttime=time()
        idsOk,idMatching,numOk,numMatching=bilingualPhrases.get_ids_of_matching_and_compatible_phrases(at)
        timeCorrectAndIncorrect+=(time()-starttime)
        matchingBilphrasesDict[at]=(idsOk,idMatching,numOk,numMatching)
        at.freq=numOk
        debug("precomputing matching and OK bilingual phrases for at: "+str(at))
        debug("numOK: "+str(numOk)+" numMatching: "+str(numMatching))
        
    
    
    debug("Final ATs:")
    for at in finalAlignmentTemplates.get_all_ats_list():
        if generalisationOptions.is_refToBiling() and not generalisationOptions.is_differentRestrictionOptions() and generalisationOptions.is_generalise() and not generalisationOptions.is_addRestrictionsForEveryTag():
            at.shorten_restrictions()
        
        idsOk,idMatching,numOk,numMatching=matchingBilphrasesDict[at]
        debug(str(at))
        debug("with numOK = "+str(numOk)+" and freq = "+str(at.freq))
        
        if generalisationOptions.get_possibleValuesForRestrictions() == AT_GeneralisationOptions.VALUE_FOR_RESTRICTION_TRIGGERINGCHANGE:
            starttime=time()
            
            atsSharingLeftSide=list()
            for atSharing in finalAlignmentTemplates.get_ats_with_same_sllex_and_restrictions(at):
                if atSharing != at:
                    reproducedBilphrasesOfSharing=AlignmentTemplateSet()
                    incorrectBilphrasesOfSharing=AlignmentTemplateSet()
                    idsOkS,idMatchingS,numOkS,numMatchingS=matchingBilphrasesDict[atSharing]
                    incorrectIds=set(idMatchingS) - set(idsOkS)
                    for incorrectId in incorrectIds:
                        incorrectBilphrasesOfSharing.add(bilingualPhrases.get_by_id(incorrectId))
                    for idOK in idsOkS:
                        reproducedBilphrasesOfSharing.add(bilingualPhrases.get_by_id(idOK))
                    atsSharingLeftSide.append((atSharing,reproducedBilphrasesOfSharing,incorrectBilphrasesOfSharing,numOkS))
            
            incorrectBilphrases=AlignmentTemplateSet()
            incorrectIds=set(idMatching) - set(idsOk)
            for incorrectId in incorrectIds:
                incorrectBilphrases.add(bilingualPhrases.get_by_id(incorrectId))
            
            reproducedBilphrases=AlignmentTemplateSet()
            for idOK in idsOk:
                reproducedBilphrases.add(bilingualPhrases.get_by_id(idOK))
            
            debug("Processing AT to add restrictions: "+str(at))
            debug("Matching bilphrases ("+str(len(idMatching))+"):") 
            if ruleLearningLib.DEBUG:
                for bid in idMatching:
                    debug("\t"+str(bilingualPhrases.get_by_id(bid)))
            debug("Reproduced bilphrases ("+str(len(idsOk))+"):")
            if ruleLearningLib.DEBUG:
                for bid in idsOk:
                    debug("\t"+str(bilingualPhrases.get_by_id(bid)))
            debug("Incorrect bilphrases ("+str(len(incorrectIds))+") :")
            if ruleLearningLib.DEBUG:
                for inat in incorrectBilphrases.get_all_ats_list():
                    debug("\t"+str(inat.id)+": "+inat.to_string(removeRestrictionsFromLexicalised=False))
            
            #represent possible restrictions to be added as tuples
            allOptions=list()
            afterwardsRestrictionItemIndex=0
            for afterwards_restriction_item in afterwardsDictionary[at]:
                afterwardsRestrictionItemIndex+=1
                restrictionsAsTuples=list()
                for i in range(len(afterwards_restriction_item)):
                    #only add restrictions for non-lexicalised words 
                    if not at.parsed_sl_lexforms[i].has_lemma():
                        afterwardDict=afterwards_restriction_item[i]
                        for key in afterwardDict:
                            tuplerep=(i,key,afterwardDict[key])
                            restrictionsAsTuples.append(tuplerep)
            
                debug("Possible values for restrictions "+str(afterwardsRestrictionItemIndex)+": "+str(restrictionsAsTuples))
            
                #compute power set
                options=powerset(restrictionsAsTuples)
                allOptions.extend(options)
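                # 'options' is the power set of the candidate restrictions, so
                # allOptions grows as 2**len(restrictionsAsTuples); each tuple
                # is (SL word position, restriction key, restriction value)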
            
            allOptionsFrozenUniq=list(set([frozenset(o) for o in allOptions]))
            
            #sort options by number of components
            sortedOptions=sorted(allOptionsFrozenUniq,key=len)
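            # shortest options are tried first: once a small restriction set
            # matches zero incorrect bilphrases, all of its supersets can be
            # pruned from sortedOptions (see the superset removal below)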
            if generalisationOptions.is_triggeringLimitedLength():
                positionOfFirstInvalidOption=None
                for k in range(len(sortedOptions)):
                    if len(sortedOptions[k]) > len(at.parsed_sl_lexforms):
                        positionOfFirstInvalidOption=k
                        break
                if positionOfFirstInvalidOption is not None:
                    sortedOptions=sortedOptions[:positionOfFirstInvalidOption]
                
            
            incorrectIdsNotMatchingDict=dict()
            
            while len(sortedOptions) > 0:
                opt=sortedOptions.pop(0)
                optlen=len(opt)
                debug("Added restrictions option: "+str(opt))
                
                
                #matchesZero=False
                #for resSetMatchingZero in restrictionsSetsMatchingZero:
                #    if opt <= resSetMatchingZero:
                #        matchesZero=True
                #        break
                #if matchesZero:
                #    break
                
                newAT=at.fast_clone()
                newAT.add_restrictions_from_tuples(opt)    
                
                idsOk,idMatching,numOk,numMatching=incorrectBilphrases.get_ids_of_matching_and_compatible_phrases(newAT)
                incorrectIdsNotMatching=frozenset(incorrectIds - idMatching)
                
                idsOKFromReproducible,idsMatchingFromReproducible,numOkFromRepr,numMatchingFromRepr= reproducedBilphrases.get_ids_of_matching_and_compatible_phrases(newAT)
                totalReproduciblePhrases=len(reproducedBilphrases.get_all_ids())
                numReproduciblePhrasesNowNotMatching=totalReproduciblePhrases-len(idsOKFromReproducible)
                debug("Reproducible phrases which now don't match: "+str(numReproduciblePhrasesNowNotMatching))
                
                atLeastOneValid=False
                if generalisationOptions.is_discardRestrictionsNotImproving():
                    for atSharing,reproducedSharing,incorrectSharing,numOkofSharing in atsSharingLeftSide:
                        idsOkS,idMatchingS,numOkS,numMatchingS=incorrectSharing.get_ids_of_matching_and_compatible_phrases(newAT)
                        idsOKFromReproducibleS,idsMatchingFromReproducibleS,numOkFromReprS,numMatchingFromReprS= reproducedSharing.get_ids_of_matching_and_compatible_phrases(newAT)
                        if ruleLearningLib.DEBUG:
                            debug("\tAT sharing left side: "+str(atSharing))
                            debug("\t New AT matches "+str(numMatchingS)+" bilphrases out of "+str(incorrectSharing.get_total_freq())+" incorrect bilphrases" )
                            debug("\t  reproduces "+str(numOkS)+"/"+str(numMatchingS) ) 
                            debug("\t New AT matches "+str(numMatchingFromReprS)+" bilphrases out of "+str(reproducedSharing.get_total_freq())+" reproduced bilphrases" )
                            debug("\t  reproduces "+str(numOkFromReprS)+"/"+str(numMatchingFromReprS) )
                        phrasesCorrectlyReproducedByCombo=set()
                        
                        #first, the bilingual phrases correctly reproduced by atSharing minus the bilingual phrases matched by newAT
                        phrasesCorrectlyReproducedByCombo.update(reproducedSharing.get_all_ids())
                        phrasesCorrectlyReproducedByCombo.difference_update(idMatchingS)
                        phrasesCorrectlyReproducedByCombo.difference_update(idsMatchingFromReproducibleS)
                        
                        #in addition, the bilingual phrases correctly reproduced by 'newAT' which were matched by AtSharing
                        phrasesCorrectlyReproducedByCombo.update(idsOkS)
                        phrasesCorrectlyReproducedByCombo.update(idsOKFromReproducibleS)
                        
                        totalFreqOfPhrasesReproducedByCombo=sum( bilingualPhrases.get_by_id(bid).freq for bid in phrasesCorrectlyReproducedByCombo )
                        totalFreqOfPhrasesReproducedBySharingAT=numOkofSharing
                        debug("\t"+str(totalFreqOfPhrasesReproducedByCombo)+" phrases reproduced by combo vs. "+str(totalFreqOfPhrasesReproducedBySharingAT)+"phrases reproduced by AT sharing left side")
                        debug("\t"+str(numOkFromRepr)+" phrases reproduced by newAT vs. "+str(totalFreqOfPhrasesReproducedBySharingAT)+"phrases reproduced by AT sharing left side")
                        if numOkFromRepr < totalFreqOfPhrasesReproducedBySharingAT and totalFreqOfPhrasesReproducedByCombo > totalFreqOfPhrasesReproducedBySharingAT and numOkS > numMatchingS/2:
                            debug("\tRestriction VALID for this shared AT")
                            atLeastOneValid=True
                        else:
                            debug("\tRestriction NOT valid for this shared AT")

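                # an option survives only if (a) it does not stop matching any
                # phrase the unrestricted AT reproduced (unless the
                # triggeringNoGoodDiscarded flag is off) and (b) when
                # discardRestrictionsNotImproving is on, it helped at least one
                # AT sharing this left side, or it is the empty option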
                if (numReproduciblePhrasesNowNotMatching==0 or not generalisationOptions.is_triggeringNoGoodDiscarded()) and (not generalisationOptions.is_discardRestrictionsNotImproving() or atLeastOneValid or optlen==0):
                    if ruleLearningLib.DEBUG:
                        debug("Incorrect bilphrases which now don't match ("+str(len(incorrectIdsNotMatching))+"):")
                        for bid in incorrectIdsNotMatching:
                            debug("\t"+str(bilingualPhrases.get_by_id(bid)))
                    
                    if len(incorrectIdsNotMatching) > 0:
                        validAT=True
                        if incorrectIdsNotMatching in incorrectIdsNotMatchingDict:
                            debug("The same set of bilingual phrases was removed by other sets of restrictions...")
                            for pastoption in incorrectIdsNotMatchingDict[incorrectIdsNotMatching]:
                                if pastoption <= opt:
                                    debug("... and there is a subset of this one: "+str(pastoption))                                
                                    validAT=False
                                    break
                            if validAT:
                                debug("... but no set is a subset of this one")
                        else:
                            debug("The same set of bilingual phrases was NOT removed by other sets of restrictions.")
                            incorrectIdsNotMatchingDict[incorrectIdsNotMatching]=set()
                            incorrectIdsNotMatchingDict[incorrectIdsNotMatching].add(opt)
                        if validAT:
                            debug("SET OF RESTRICTIONS OK")
                            idAt+=1
                            newAT.id=idAt
                            finalAlignmentTemplatesAfterwardsRestrictions.add(newAT)
                    if len(idMatching) == 0:
                        debug("This AT does not match any incorrect bilingual phrase. Removing all its supersets")
                        #restrictionsSetsMatchingZero.add(opt)
                        sortedOptionsCopy=list()
                        for sopt in sortedOptions:
                            if not opt <= sopt:
                                sortedOptionsCopy.append(sopt)
                        sortedOptions=sortedOptionsCopy
                else:
                    debug("Set of restrictions not generated")
                debug("")
                        
            timeAfterwardsRestrictions+=(time()-starttime)
    
    
    debug("Final ATs with afterwards restrictions:")
    for at in finalAlignmentTemplatesAfterwardsRestrictions.get_all_ats_list():
        starttime=time()
        idsOk,idMatching,numOk,numMatching=bilingualPhrases.get_ids_of_matching_and_compatible_phrases(at)
        timeCorrectAndIncorrect+=(time()-starttime)
        at.freq=numOk
        debug(str(at))
        
    finalAlignmentTemplates.write(sys.stdout)
    finalAlignmentTemplatesAfterwardsRestrictions.write(sys.stdout)
    
    
    print >>sys.stderr, "Time performing structural generalisation: "+str(timeStructuralvariations)
    print >>sys.stderr, "Time performing lexical generalisation: "+str(timeLexicalVariations)
    print >>sys.stderr, "Time removing wrong alignments: "+str(timeRemovingWrongAlignments)
    print >>sys.stderr, "Time computing correct and matching ATs: "+str(timeCorrectAndIncorrect)
    print >>sys.stderr, "Time generating afterwards restrictions: "+str(timeAfterwardsRestrictions)
            # (excerpt resumes inside the loop that reads one bilingual phrase
            # per input line; piecesOfline, textat, at and bilid are set by the
            # elided lines above)
            freq=piecesOfline[0].strip()
            
            sllemmastext=piecesOfline[5].strip()
            tllemmastext=piecesOfline[6].strip()
            sllemmas=sllemmastext.split(u'\t')
            tllemmas=tllemmastext.split(u'\t')
            
            at.parse(textat)
            at.add_explicit_empty_tags()
            at.freq=int(freq)
            tl_lemmas_from_dictionary_text=piecesOfline[7]
            tl_lemmas_from_dictionary_list=[ l.strip() for l in tl_lemmas_from_dictionary_text.split(u'\t')]

            originalATList.append((at,sllemmas,tllemmas,tl_lemmas_from_dictionary_list))
            
            bilphrase=copy.deepcopy(at)
            bilphrase.set_lemmas(sllemmas,tllemmas)
            bilphrase.tl_lemmas_from_dictionary=tl_lemmas_from_dictionary_list
            bilid+=1
            bilphrase.id=bilid
            bilingualPhrases.add(bilphrase)

        print >> sys.stderr, " ....."+str(len(originalATList))+" items."
        
        debug("All the bilingual phrases at the beginning:")
        for bilphrase in bilingualPhrases.get_all_ats_list():
            debug("\t"+str(bilphrase))
        
        #process
        process_bilingual_phrases(originalATList,bilingualPhrases,generalisationOptions,generationMethod, allowedSLLemmas)
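        # each originalATList record is the 4-tuple unpacked positionally at
        # the top of process_bilingual_phrases: (at, sllemmas, tllemmas,
        # tl_lemmas_from_dictionary_list); bilingualPhrases holds the same ATs
        # deep-copied with their lemmas filled in, which is what the
        # matching/compatibility checks compare the generalised ATs against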
Example #4
        # (excerpt from a rule-selection script: ll_hypothesis holds, per
        # sentence, a list of RuleApplicationHypothesis objects apparently
        # ordered best-score-first; args are the script's CLI options)
        #remove all non-maximum hypotheses
        for numSentence, l_hypothesis in enumerate(ll_hypothesis):
            firstNotMaximumIndex = len(l_hypothesis)
            if firstNotMaximumIndex > 0:
                maximumScore = l_hypothesis[0].get_score()
                for index in range(len(l_hypothesis)):
                    if l_hypothesis[index].get_score() < maximumScore:
                        firstNotMaximumIndex = index
                        break
                if firstNotMaximumIndex == len(
                        l_hypothesis) and args.discard_sentences_all_maximum:
                    l_hypothesis[:] = [RuleApplicationHypothesis()]
                else:
                    l_hypothesis[:] = l_hypothesis[:firstNotMaximumIndex]
            debug("Sentence " + str(numSentence) + ": " +
                  str(firstNotMaximumIndex) + " hypothesses with maximum BLEU")

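    # after pruning, every per-sentence list keeps only its hypotheses tied
    # for the best score (or a single fresh RuleApplicationHypothesis when a
    # sentence whose hypotheses all score the maximum is discarded); the
    # selection strategies below pick rule ids from these survivors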
    if args.beam:
        appliedRules, valueOfSolution = RuleApplicationHypothesis.select_rules_maximize_score_with_beam_search(
            ll_hypothesis, beamSize=int(args.beam_size), isDiff=True)
        for ruleid in sorted(appliedRules):
            print str(ruleid)
        print >> sys.stderr, "Value: " + str(valueOfSolution)
    elif args.super_heuristic:
        appliedRules = RuleApplicationHypothesis.select_rules_maximize_score_with_super_heuristic(
            ll_hypothesis)
        for ruleid in sorted(appliedRules):
            print str(ruleid)
    elif args.select_boxes_minimum or args.compute_key_segment_breaking_prob:

        supersegmentsWithMaxScore = list()