import sys
import copy
from time import time

# ruleLearningLib, AlignmentTemplateSet, AlignmentTemplateGenerationMethod,
# AT_GeneralisationOptions, powerset and debug are assumed to be defined in
# the surrounding module (sketches for powerset and debug appear below).

def process_bilingual_phrases(atListWithLemmasList, bilingualPhrases, generalisationOptions, generationMethod, allowedSLLemmas):
    finalAlignmentTemplates=ruleLearningLib.AlignmentTemplateSet()
    idAt=1
    
    structuralVariationsDictionary=dict()
    lexicalVariationsDictionary=dict()
    afterwardsDictionary=dict()
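    # memoisation caches: structural generalisations per AT (keyed by the AT
    # itself), lexical generalisations per lemma/PoS/alignment signature, and
    # the afterwards restrictions recorded for every generalised AT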
    
    timeStructuralvariations=0.0
    timeLexicalVariations=0.0
    timeRemovingWrongAlignments=0.0
    timeCorrectAndIncorrect=0.0
    timeAfterwardsRestrictions=0.0
    
    for atWithLemmas in atListWithLemmasList:
        at=atWithLemmas[0]
        sllemmas=atWithLemmas[1]
        tllemmas=atWithLemmas[2]
        tllemmasfromdictionary=atWithLemmas[3]
        
        debug("Generalising "+str(at)+" | "+str(sllemmas)+" | "+str(tllemmas))
        
        if generationMethod==AlignmentTemplateGenerationMethod.FIRST_APPROACH:
            subsetsGraph=ruleLearningLib.SubsetGraph()
            idAt=ruleLearningLib.AlignmentTemplate_generate_all_generalisations_and_add_them(at,sllemmas,tllemmas,tllemmasfromdictionary,finalAlignmentTemplates,idAt,subsetsGraph,True,True,generalisationOptions.get_genWhenEmptyTLCats(),generalisationOptions.get_genWhenEmptySLCats())
        elif generationMethod==AlignmentTemplateGenerationMethod.TL_VARIABLES:
            debug("Checking whether hash '"+str(hash(at))+"' is in the dictionary |d| = "+str(len(structuralVariationsDictionary))+".")
            #wildcard and reference values
            if at not in structuralVariationsDictionary:
                debug("AT not found in structural generalisations")
                starttime=time()
                structuralVariationsAts=ruleLearningLib.AlignmentTemplate_generate_all_structural_generalisations(at,generalisationOptions)
                timeStructuralvariations+=(time()-starttime)
                structuralVariationsDictionary[at]=structuralVariationsAts
            else:
                debug("AT already found in structural generalisations. Not repeating work")
            
            lemmasposandalignments=at.fast_clone()
            lemmasposandalignments.remove_all_inflection_tags()
            cleanAT=lemmasposandalignments.fast_clone()
            lemmasposandalignments.set_lemmas(sllemmas,tllemmas)
            lemmasposandalignments.tl_lemmas_from_dictionary=tllemmasfromdictionary
            
            #lexicalisations
            if lemmasposandalignments not in lexicalVariationsDictionary:
                starttime=time()
                lexicalVariationsAtsF=ruleLearningLib.AlignmentTemplate_generate_all_lexical_generalisations(cleanAT,sllemmas,tllemmas,tllemmasfromdictionary,generalisationOptions.is_unlexicaliseUnalignedSL())
                if allowedSLLemmas:
                    lexicalVariationsAts=[myat for myat in lexicalVariationsAtsF if tuple(myat.get_sl_lemmas()) in allowedSLLemmas]
                else:
                    lexicalVariationsAts=lexicalVariationsAtsF
                    
                timeLexicalVariations+=(time()-starttime)
                lexicalVariationsDictionary[lemmasposandalignments]=lexicalVariationsAts
            
            #removing alignments
            starttime=time()
            for atstruct in structuralVariationsDictionary[at]:
                for atlex in lexicalVariationsDictionary[lemmasposandalignments]:
                    newat=atstruct.fast_clone()
                    newat.set_lemmas_from_other_at(atlex)
                    options=newat.get_unalignment_options_for_multiple_aligned_unlexicalised_tl_words(lemmasposandalignments)
                    for option in options:
                        atcopy=newat.fast_clone()
                        atcopy.remove_alignments(option)
                        atcopy.alignments.sort()
                        debug("Obtained AT: "+str(atcopy))
                        
                        if atcopy not in afterwardsDictionary:
                            afterwardsDictionary[atcopy]=list()
                        afterwardsDictionary[atcopy].append(atcopy.afterwards_restrictions)
                        
                        if not finalAlignmentTemplates.is_in_set(atcopy):
                            debug("is NOT in set")
                            idAt+=1
                            atcopy.id=idAt
                            finalAlignmentTemplates.add(atcopy)
            timeRemovingWrongAlignments+=(time()-starttime)
        else:   
            print >> sys.stderr, "WRONG GENERATION METHOD"
    
    
    # restart id numbering from the number of ATs generated so far
    idAt=len(finalAlignmentTemplates.get_all_ats_list())
    finalAlignmentTemplatesAfterwardsRestrictions=AlignmentTemplateSet()
    
    
    if ruleLearningLib.DEBUG:
        debug("All the bilingual phrases:")
        for bilphrase in bilingualPhrases.get_all_ats_list():
            debug("\t"+str(bilphrase))
            tllemmaslocal=u" ".join([ "'"+lem+"'" for lem in bilphrase.tl_lemmas_from_dictionary  ])
            debug("TL lemmas: "+tllemmaslocal.encode('utf-8'))
    
    
    matchingBilphrasesDict=dict()
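    # run the expensive matching/compatibility check once per AT and cache the
    # result; it is reused below both to set at.freq and to build the
    # incorrect/reproduced bilphrase sets for the restriction search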
    for at in finalAlignmentTemplates.get_all_ats_list():
        starttime=time()
        idsOk,idMatching,numOk,numMatching=bilingualPhrases.get_ids_of_matching_and_compatible_phrases(at)
        timeCorrectAndIncorrect+=(time()-starttime)
        matchingBilphrasesDict[at]=(idsOk,idMatching,numOk,numMatching)
        at.freq=numOk
        debug("precomputing matching and OK bilingual phrases for at: "+str(at))
        debug("numOK: "+str(numOk)+" numMatching: "+str(numMatching))
        
    
    
    debug("Final ATs:")
    for at in finalAlignmentTemplates.get_all_ats_list():
        if generalisationOptions.is_refToBiling() and not generalisationOptions.is_differentRestrictionOptions() and generalisationOptions.is_generalise() and not generalisationOptions.is_addRestrictionsForEveryTag():
            at.shorten_restrictions()
        
        idsOk,idMatching,numOk,numMatching=matchingBilphrasesDict[at]
        debug(str(at))
        debug("with numOK = "+str(numOk)+" and freq = "+str(at.freq))
        
        if generalisationOptions.get_possibleValuesForRestrictions() == AT_GeneralisationOptions.VALUE_FOR_RESTRICTION_TRIGGERINGCHANGE:
            starttime=time()
            
            atsSharingLeftSide=list()
            for atSharing in finalAlignmentTemplates.get_ats_with_same_sllex_and_restrictions(at):
                if atSharing != at:
                    reproducedBilphrasesOfSharing=AlignmentTemplateSet()
                    incorrectBilphrasesOfSharing=AlignmentTemplateSet()
                    idsOkS,idMatchingS,numOkS,numMatchingS=matchingBilphrasesDict[atSharing]
                    incorrectIds=set(idMatchingS) - set(idsOkS)
                    for incorrectId in incorrectIds:
                        incorrectBilphrasesOfSharing.add(bilingualPhrases.get_by_id(incorrectId))
                    for idOK in idsOkS:
                        reproducedBilphrasesOfSharing.add(bilingualPhrases.get_by_id(idOK))
                    atsSharingLeftSide.append((atSharing,reproducedBilphrasesOfSharing,incorrectBilphrasesOfSharing,numOkS))
            
            incorrectBilphrases=AlignmentTemplateSet()
            incorrectIds=set(idMatching) - set(idsOk)
            for incorrectId in incorrectIds:
                incorrectBilphrases.add(bilingualPhrases.get_by_id(incorrectId))
            
            reproducedBilphrases=AlignmentTemplateSet()
            for idOK in idsOk:
                reproducedBilphrases.add(bilingualPhrases.get_by_id(idOK))
            
            debug("Processing AT to add restrictions: "+str(at))
            debug("Matching bilphrases ("+str(len(idMatching))+"):") 
            if ruleLearningLib.DEBUG:
                for bid in idMatching:
                    debug("\t"+str(bilingualPhrases.get_by_id(bid)))
            debug("Reproduced bilphrases ("+str(len(idsOk))+"):")
            if ruleLearningLib.DEBUG:
                for bid in idsOk:
                    debug("\t"+str(bilingualPhrases.get_by_id(bid)))
            debug("Incorrect bilphrases ("+str(len(incorrectIds))+") :")
            if ruleLearningLib.DEBUG:
                for inat in incorrectBilphrases.get_all_ats_list():
                    debug("\t"+str(inat.id)+": "+inat.to_string(removeRestrictionsFromLexicalised=False))
            
            #represent possible restrictions to be added as tuples
            allOptions=list()
            afterwardsRestrictionItemIndex=0
            for afterwards_restriction_item in afterwardsDictionary[at]:
                afterwardsRestrictionItemIndex+=1
                restrictionsAsTuples=list()
                for i in range(len(afterwards_restriction_item)):
                    #only add restrictions for non-lexicalised words 
                    if not at.parsed_sl_lexforms[i].has_lemma():
                        afterwardDict=afterwards_restriction_item[i]
                        for key in afterwardDict:
                            tuplerep=(i,key,afterwardDict[key])
                            restrictionsAsTuples.append(tuplerep)
            
                debug("Possible values for restrictions "+str(afterwardsRestrictionItemIndex)+": "+str(restrictionsAsTuples))
            
                #compute power set
                options=powerset(restrictionsAsTuples)
                allOptions.extend(options)
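                # 'options' is the power set of the candidate restrictions, so
                # allOptions grows as 2**len(restrictionsAsTuples); each tuple
                # is (SL word position, restriction key, restriction value)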
            
            allOptionsFrozenUniq=list(set([frozenset(o) for o in allOptions]))
            
            #sort options by number of components
            sortedOptions=sorted(allOptionsFrozenUniq,key=len)
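            # shortest options are tried first: once a small restriction set
            # matches zero incorrect bilphrases, all of its supersets can be
            # pruned from sortedOptions (see the superset removal below)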
            if generalisationOptions.is_triggeringLimitedLength():
                positionOfFirstInvalidOption=None
                for k in range(len(sortedOptions)):
                    if len(sortedOptions[k]) > len(at.parsed_sl_lexforms):
                        positionOfFirstInvalidOption=k
                        break
                if positionOfFirstInvalidOption is not None:
                    sortedOptions=sortedOptions[:positionOfFirstInvalidOption]
                
            
            incorrectIdsNotMatchingDict=dict()
            
            while len(sortedOptions) > 0:
                opt=sortedOptions.pop(0)
                optlen=len(opt)
                debug("Added restrictions option: "+str(opt))
                
                
                #matchesZero=False
                #for resSetMatchingZero in restrictionsSetsMatchingZero:
                #    if opt <= resSetMatchingZero:
                #        matchesZero=True
                #        break
                #if matchesZero:
                #    break
                
                newAT=at.fast_clone()
                newAT.add_restrictions_from_tuples(opt)    
                
                idsOk,idMatching,numOk,numMatching=incorrectBilphrases.get_ids_of_matching_and_compatible_phrases(newAT)
                incorrectIdsNotMatching=frozenset(incorrectIds - idMatching)
                
                idsOKFromReproducible,idsMatchingFromReproducible,numOkFromRepr,numMatchingFromRepr= reproducedBilphrases.get_ids_of_matching_and_compatible_phrases(newAT)
                totalReproduciblePhrases=len(reproducedBilphrases.get_all_ids())
                numReproduciblePhrasesNowNotMatching=totalReproduciblePhrases-len(idsOKFromReproducible)
                debug("Reproducible phrases which now don't match: "+str(numReproduciblePhrasesNowNotMatching))
                
                atLeastOneValid=False
                if generalisationOptions.is_discardRestrictionsNotImproving():
                    for atSharing,reproducedSharing,incorrectSharing,numOkofSharing in atsSharingLeftSide:
                        idsOkS,idMatchingS,numOkS,numMatchingS=incorrectSharing.get_ids_of_matching_and_compatible_phrases(newAT)
                        idsOKFromReproducibleS,idsMatchingFromReproducibleS,numOkFromReprS,numMatchingFromReprS= reproducedSharing.get_ids_of_matching_and_compatible_phrases(newAT)
                        if ruleLearningLib.DEBUG:
                            debug("\tAT sharing left side: "+str(atSharing))
                            debug("\t New AT matches "+str(numMatchingS)+" bilphrases out of "+str(incorrectSharing.get_total_freq())+" incorrect bilphrases" )
                            debug("\t  reproduces "+str(numOkS)+"/"+str(numMatchingS) ) 
                            debug("\t New AT matches "+str(numMatchingFromReprS)+" bilphrases out of "+str(reproducedSharing.get_total_freq())+" reproduced bilphrases" )
                            debug("\t  reproduces "+str(numOkFromReprS)+"/"+str(numMatchingFromReprS) )
                        phrasesCorrectlyReproducedByCombo=set()
                        
                        #first, the bilingual phrases correctly reproduced by atSharing minus the bilingual phrases matched by newAT
                        phrasesCorrectlyReproducedByCombo.update(reproducedSharing.get_all_ids())
                        phrasesCorrectlyReproducedByCombo.difference_update(idMatchingS)
                        phrasesCorrectlyReproducedByCombo.difference_update(idsMatchingFromReproducibleS)
                        
                        #in addition, the bilingual phrases correctly reproduced by 'newAT' which were matched by AtSharing
                        phrasesCorrectlyReproducedByCombo.update(idsOkS)
                        phrasesCorrectlyReproducedByCombo.update(idsOKFromReproducibleS)
                        
                        totalFreqOfPhrasesReproducedByCombo=sum( bilingualPhrases.get_by_id(bid).freq for bid in phrasesCorrectlyReproducedByCombo )
                        totalFreqOfPhrasesReproducedBySharingAT=numOkofSharing
                        debug("\t"+str(totalFreqOfPhrasesReproducedByCombo)+" phrases reproduced by combo vs. "+str(totalFreqOfPhrasesReproducedBySharingAT)+"phrases reproduced by AT sharing left side")
                        debug("\t"+str(numOkFromRepr)+" phrases reproduced by newAT vs. "+str(totalFreqOfPhrasesReproducedBySharingAT)+"phrases reproduced by AT sharing left side")
                        if numOkFromRepr < totalFreqOfPhrasesReproducedBySharingAT and totalFreqOfPhrasesReproducedByCombo > totalFreqOfPhrasesReproducedBySharingAT and numOkS > numMatchingS/2:
                            debug("\tRestriction VALID for this shared AT")
                            atLeastOneValid=True
                        else:
                            debug("\tRestriction NOT valid for this shared AT")

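                # an option survives only if (a) it does not stop matching any
                # phrase the unrestricted AT reproduced (unless the
                # triggeringNoGoodDiscarded flag is off) and (b) when
                # discardRestrictionsNotImproving is on, it helped at least one
                # AT sharing this left side, or it is the empty option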
                if (numReproduciblePhrasesNowNotMatching==0 or not generalisationOptions.is_triggeringNoGoodDiscarded()) and (not generalisationOptions.is_discardRestrictionsNotImproving() or atLeastOneValid or optlen==0):
                    if ruleLearningLib.DEBUG:
                        debug("Incorrect bilphrases which now don't match ("+str(len(incorrectIdsNotMatching))+"):")
                        for bid in incorrectIdsNotMatching:
                            debug("\t"+str(bilingualPhrases.get_by_id(bid)))
                    
                    if len(incorrectIdsNotMatching) > 0:
                        validAT=True
                        if incorrectIdsNotMatching in incorrectIdsNotMatchingDict:
                            debug("The same set of bilingual phrases was removed by other sets of restrictions...")
                            for pastoption in incorrectIdsNotMatchingDict[incorrectIdsNotMatching]:
                                if pastoption <= opt:
                                    debug("... and there is a subset of this one: "+str(pastoption))                                
                                    validAT=False
                                    break
                            if validAT:
                                debug("... but no set is a subset of this one")
                        else:
                            debug("The same set of bilingual phrases was NOT removed by other sets of restrictions.")
                            incorrectIdsNotMatchingDict[incorrectIdsNotMatching]=set()
                            incorrectIdsNotMatchingDict[incorrectIdsNotMatching].add(opt)
                        if validAT:
                            debug("SET OF RESTRICTIONS OK")
                            idAt+=1
                            newAT.id=idAt
                            finalAlignmentTemplatesAfterwardsRestrictions.add(newAT)
                    if len(idMatching) == 0:
                        debug("This AT does not match any incorrect bilingual phrase. Removing all its supersets")
                        #restrictionsSetsMatchingZero.add(opt)
                        sortedOptionsCopy=list()
                        for sopt in sortedOptions:
                            if not opt <= sopt:
                                sortedOptionsCopy.append(sopt)
                        sortedOptions=sortedOptionsCopy
                else:
                    debug("Set of restrictions not generated")
                debug("")
                        
            timeAfterwardsRestrictions+=(time()-starttime)
    
    
    debug("Final ATs with afterwards restrictions:")
    for at in finalAlignmentTemplatesAfterwardsRestrictions.get_all_ats_list():
        starttime=time()
        idsOk,idMatching,numOk,numMatching=bilingualPhrases.get_ids_of_matching_and_compatible_phrases(at)
        timeCorrectAndIncorrect+=(time()-starttime)
        at.freq=numOk
        debug(str(at))
        
    finalAlignmentTemplates.write(sys.stdout)
    finalAlignmentTemplatesAfterwardsRestrictions.write(sys.stdout)
    
    
    print >>sys.stderr, "Time performing structural generalisation: "+str(timeStructuralvariations)
    print >>sys.stderr, "Time performing lexical generalisation: "+str(timeLexicalVariations)
    print >>sys.stderr, "Time removing wrong alignments: "+str(timeRemovingWrongAlignments)
    print >>sys.stderr, "Time computing correct and matching ATs: "+str(timeCorrectAndIncorrect)
    print >>sys.stderr, "Time generating afterwards restrictions: "+str(timeAfterwardsRestrictions)
            # (excerpt resumes inside the loop that reads one bilingual phrase
            # per input line; piecesOfline, textat, at and bilid are set by the
            # elided lines above)
            freq=piecesOfline[0].strip()
            
            sllemmastext=piecesOfline[5].strip()
            tllemmastext=piecesOfline[6].strip()
            sllemmas=sllemmastext.split(u'\t')
            tllemmas=tllemmastext.split(u'\t')
            
            at.parse(textat)
            at.add_explicit_empty_tags()
            at.freq=int(freq)
            tl_lemmas_from_dictionary_text=piecesOfline[7]
            tl_lemmas_from_dictionary_list=[ l.strip() for l in tl_lemmas_from_dictionary_text.split(u'\t')]

            originalATList.append((at,sllemmas,tllemmas,tl_lemmas_from_dictionary_list))
            
            bilphrase=copy.deepcopy(at)
            bilphrase.set_lemmas(sllemmas,tllemmas)
            bilphrase.tl_lemmas_from_dictionary=tl_lemmas_from_dictionary_list
            bilid+=1
            bilphrase.id=bilid
            bilingualPhrases.add(bilphrase)

        print >> sys.stderr, " ....."+str(len(originalATList))+" items."
        
        debug("All the bilingual phrases at the beginning:")
        for bilphrase in bilingualPhrases.get_all_ats_list():
            debug("\t"+str(bilphrase))
        
        #process
        process_bilingual_phrases(originalATList,bilingualPhrases,generalisationOptions,generationMethod, allowedSLLemmas)
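        # each originalATList record is the 4-tuple unpacked positionally at
        # the top of process_bilingual_phrases: (at, sllemmas, tllemmas,
        # tl_lemmas_from_dictionary_list); bilingualPhrases holds the same ATs
        # deep-copied with their lemmas filled in, which is what the
        # matching/compatibility checks compare the generalised ATs against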
Example #4
        # (excerpt from a rule-selection script: ll_hypothesis holds, per
        # sentence, a list of RuleApplicationHypothesis objects apparently
        # ordered best-score-first; args are the script's CLI options)
        #remove all non-maximum hypotheses
        for numSentence, l_hypothesis in enumerate(ll_hypothesis):
            firstNotMaximumIndex = len(l_hypothesis)
            if firstNotMaximumIndex > 0:
                maximumScore = l_hypothesis[0].get_score()
                for index in range(len(l_hypothesis)):
                    if l_hypothesis[index].get_score() < maximumScore:
                        firstNotMaximumIndex = index
                        break
                if firstNotMaximumIndex == len(
                        l_hypothesis) and args.discard_sentences_all_maximum:
                    l_hypothesis[:] = [RuleApplicationHypothesis()]
                else:
                    l_hypothesis[:] = l_hypothesis[:firstNotMaximumIndex]
            debug("Sentence " + str(numSentence) + ": " +
                  str(firstNotMaximumIndex) + " hypothesses with maximum BLEU")

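    # after pruning, every per-sentence list keeps only its hypotheses tied
    # for the best score (or a single fresh RuleApplicationHypothesis when a
    # sentence whose hypotheses all score the maximum is discarded); the
    # selection strategies below pick rule ids from these survivors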
    if args.beam:
        appliedRules, valueOfSolution = RuleApplicationHypothesis.select_rules_maximize_score_with_beam_search(
            ll_hypothesis, beamSize=int(args.beam_size), isDiff=True)
        for ruleid in sorted(appliedRules):
            print str(ruleid)
        print >> sys.stderr, "Value: " + str(valueOfSolution)
    elif args.super_heuristic:
        appliedRules = RuleApplicationHypothesis.select_rules_maximize_score_with_super_heuristic(
            ll_hypothesis)
        for ruleid in sorted(appliedRules):
            print str(ruleid)
    elif args.select_boxes_minimum or args.compute_key_segment_breaking_prob:

        supersegmentsWithMaxScore = list()