Exemple #1
0
    def extendChainsInSentence(self, sentence, foundChains):
        ''' Apply self.extendChainsInClause() to every clause of the given sentence.

        Parameters
        ----------
        sentence
            The sentence to process; it is split into clauses with
            getClausesByClauseIDs().
        foundChains
            Previously detected verb chains; handed over as-is to
            self.extendChainsInClause() for each clause.
            (Presumably the chains are extended in place -- confirm against
            extendChainsInClause.)

        Returns
        -------
        None
            Whatever self.extendChainsInClause() returns is discarded.
        '''
        # 1) Preprocessing: group the words of the sentence by clause
        clauses = getClausesByClauseIDs(sentence)

        # 2) Extend verb chains in each clause
        for clauseID in clauses:
            self.extendChainsInClause(clauses[clauseID], clauseID, foundChains)
    def extendChainsInSentence( self, sentence, foundChains ):
        ''' Run self.extendChainsInClause() on each clause of the given sentence.

        Parameters
        ----------
        sentence
            The sentence whose clauses are processed; the clauses are obtained
            from getClausesByClauseIDs().
        foundChains
            Verb chains detected so far; forwarded unchanged to every
            self.extendChainsInClause() call.
            (Presumably modified in place by that method -- TODO confirm.)

        Returns
        -------
        None
            The results of the individual self.extendChainsInClause() calls
            are not collected.
        '''
        # 1) Preprocessing: split the sentence into clauses
        clauses = getClausesByClauseIDs( sentence )

        # 2) Extend verb chains clause by clause
        for clauseID in clauses:
            clause = clauses[clauseID]
            self.extendChainsInClause( clause, clauseID, foundChains )
    def detectVerbChainsFromSent( self, sentence, **kwargs):
        ''' Find the verb chains of a single sentence.

        Parameters
        ----------
        sentence:  list of dict
            Words of the sentence; every word is a dictionary that carries its
            morphological analysis and clause boundary annotations (CLAUSE_IDX
            is required);

        Keyword parameters
        ------------------
        expand2ndTime: boolean
            When True, regular verb chains (those not ending with 'olema') go
            through the expansion step twice.
            (default: False)
        breakOnPunctuation: boolean
            When True, intervening punctuation stops the expansion of regular
            verb chains.
            (default: False)
        removeSingleAraEi: boolean
            When True, chains made up of the single word 'ära' or 'ei' are
            discarded.
            (default: True)
        removeOverlapping: boolean
            When True, overlapping verb chains are discarded.
            (default: True)

        Returns
        -------
        list of dict
            The detected verb chains; every chain is a dict with these keys:
             PHRASE       -- list of int : positions of the chain's words in the
                                           sentence;
             PATTERN      -- list of str : per word of the phrase, one of 'ega',
                                           'ei', 'ära', 'pole', 'ole', '&'
                                           (conjunction: ja/ning/ega/või),
                                           'verb' (a verb other than 'ole') or
                                           'nom/adv';
             ANALYSIS_IDS -- list of (list of int) : per word of the phrase, the
                                           index(es) of the morphological
                                           analyses that belong to the chain;
             ROOTS        -- list of str : per word of the phrase, the ROOT value
                                           of its morphological analysis; e.g.
                                           the chain 'püüab kodeerida' yields
                                           ['püüd', 'kodeeri'];
             MORPH        -- list of str : per word of the phrase, part-of-speech
                                           and morphological form joined with
                                           '_', alternative pos/form variants
                                           joined with '/'; e.g. the chain
                                           'on tulnud' yields
                                           ['V_vad/b', 'V_nud'];
             OTHER_VERBS  -- bool : True if the context holds further verbs that
                                    might belong to the chain, i.e. it is not
                                    certain that the chain is complete;

             POLARITY     -- 'POS', 'NEG' or '??' : grammatical polarity of the
                                    chain; negative polarity means the phrase
                                    starts with 'ei', 'ega', 'ära' or 'pole';
             TENSE        -- tense of the main verb:  'present', 'imperfect',
                                    'perfect', 'pluperfect', 'past', '??';
             MOOD         -- mood of the main verb:   'indic', 'imper', 'condit',
                                    'quotat', '??';
             VOICE        -- voice of the main verb:  'personal', 'impersonal',
                                    '??';

        Raises
        ------
        Exception
            If a keyword argument other than the four listed above is given.
        '''
        # 0) Keyword arguments: begin with the defaults, then overwrite them
        #    with the boolean-coerced values supplied by the caller
        settings = {
            'expand2ndTime':      False,
            'removeOverlapping':  True,
            'removeSingleAraEi':  True,
            'breakOnPunctuation': False,
        }
        for name, value in kwargs.items():
            if name not in settings:
                raise Exception(' Unsupported argument given: '+name)
            settings[name] = bool(value)

        # 1) Preprocessing
        sentence = addWordIDs( sentence )
        clauses  = getClausesByClauseIDs( sentence )

        verbChains = []

        def expandBySubcat( clauseWords, cid ):
            # Subcategorization-based expansion of non-olema chains; the same
            # call is needed in steps 2.4, 2.6 and 3.1
            _expandVerbChainsBySubcat( clauseWords, cid, verbChains, self.verbInfSubcatLexicon, False, settings['breakOnPunctuation'] )

        # 2) Clause-level detection; the helpers update verbChains as they go,
        #    so the order of the steps matters
        for cid in clauses:
            clauseWords = clauses[cid]

            # 2.1) Predicate-centric basic chains of the clause
            verbChains.extend( _extractBasicPredicateFromClause( clauseWords, cid ) )

            # 2.2) 'saama' + 'tud' verb phrases (typically rare)
            _expandSaamaWithTud( clauseWords, cid, verbChains )

            # 2.3) 'olema' chains + 'nud/tud/mas/mata' verbs (if possible)
            _expandOlemaVerbChains( clauseWords, cid, verbChains )

            # 2.4) First expansion of non-olema chains (verb+verb chains)
            expandBySubcat( clauseWords, cid )

            # 2.5) Mark the chains whose context is clear (no further verbs
            #      can be added to the phrase)
            _determineVerbChainContextualAmbiguity( clauseWords, cid, verbChains )

            # 2.6) Optional second expansion (verb+verb+verb chains). Longer
            #      chains are possible, but they are rare, and building them
            #      would likely lead to errors, so three verbs is the limit.
            if settings['expand2ndTime']:
                expandBySubcat( clauseWords, cid )

        # 3) 'ega' negations need the context of the whole sentence; if an
        #    expandable one was found, try to expand it inside every clause
        if _extractEgaNegFromSent( sentence, clauses, verbChains ):
            for cid in clauses:
                # 3.1) Expand non-olema 'ega' chains inside the clause
                expandBySubcat( clauses[cid], cid )

        # 4) Extend chains with nom/adv + Vinf relations
        if self.verbNomAdvVinfExtender:
            addGrammaticalFeatsAndRoots( sentence, verbChains )
            for cid in clauses:
                clauseWords = clauses[cid]
                if self.verbNomAdvVinfExtender.extendChainsInClause( clauseWords, cid, verbChains ):
                    # The chain changed, so its contextual ambiguity has to be
                    # determined again
                    _determineVerbChainContextualAmbiguity( clauseWords, cid, verbChains )

        # 5) Remove redundant and overlapping verb phrases
        removeRedundantVerbChains(
            verbChains,
            removeOverlapping    = settings['removeOverlapping'],
            removeSingleAraAndEi = settings['removeSingleAraEi']
        )

        # 6) Add grammatical features (in the end)
        addGrammaticalFeatsAndRoots( sentence, verbChains )

        return verbChains
    def detectVerbChainsFromSent(self, sentence, **kwargs):
        ''' Detect all verb chains in the given sentence.

        Parameters
        ----------
        sentence:  list of dict
            The sentence as a list of words, each word being a dictionary with
            morphological analysis and clause boundary annotations (the
            CLAUSE_IDX annotation is mandatory);

        Keyword parameters
        ------------------
        expand2ndTime: boolean
            If True, the expansion of regular verb chains (chains that do not
            end with 'olema') is carried out two times.
            (default: False)
        breakOnPunctuation: boolean
            If True, regular verb chains are not expanded over intervening
            punctuation.
            (default: False)
        removeSingleAraEi: boolean
            If True, single-word verb chains consisting of 'ära' or 'ei' are
            removed.
            (default: True)
        removeOverlapping: boolean
            If True, verb chains overlapping with others are removed.
            (default: True)

        Returns
        -------
        list of dict
            Detected verb chains, one dict per chain, with the attributes (keys):
             PHRASE       -- list of int : indexes of the sentence elements that
                                           make up the chain;
             PATTERN      -- list of str : a label for each word in the phrase:
                                           'ega', 'ei', 'ära', 'pole', 'ole',
                                           '&' (conjunction: ja/ning/ega/või),
                                           'verb' (verb different than 'ole')
                                           or 'nom/adv';
             ANALYSIS_IDS -- list of (list of int) : for each word in the phrase,
                                           index(es) of its morphological
                                           analyses corresponding to the chain;
             ROOTS        -- list of str : for each word in the phrase, the ROOT
                                           value taken from the morphological
                                           analysis; e.g. 'püüab kodeerida'
                                           gives ['püüd', 'kodeeri'];
             MORPH        -- list of str : for each word in the phrase, its
                                           part-of-speech and form in a single
                                           string (separated by '_'; multiple
                                           pos/form variants separated by '/');
                                           e.g. 'on tulnud' gives
                                           ['V_vad/b', 'V_nud'];
             OTHER_VERBS  -- bool : whether the context contains other verbs that
                                    could be part of the chain; True means it
                                    is uncertain whether the chain is complete;

             POLARITY     -- 'POS', 'NEG' or '??' : grammatical polarity; a chain
                                    is negative when it begins with 'ei', 'ega',
                                    'ära' or 'pole';
             TENSE        -- tense of the main verb:  'present', 'imperfect',
                                    'perfect', 'pluperfect', 'past', '??';
             MOOD         -- mood of the main verb:   'indic', 'imper', 'condit',
                                    'quotat', '??';
             VOICE        -- voice of the main verb:  'personal', 'impersonal',
                                    '??';

        Raises
        ------
        Exception
            When an unsupported keyword argument is passed.
        '''
        # 0) Parse given arguments: every supported flag has a default, and the
        #    caller's value is converted to bool
        flags = dict(expand2ndTime=False,
                     removeOverlapping=True,
                     removeSingleAraEi=True,
                     breakOnPunctuation=False)
        for key, rawValue in kwargs.items():
            if key in flags:
                flags[key] = bool(rawValue)
            else:
                raise Exception(' Unsupported argument given: ' + key)
        secondPass = flags['expand2ndTime']
        stopAtPunct = flags['breakOnPunctuation']

        # 1) Preprocessing
        sentence = addWordIDs(sentence)
        clauses = getClausesByClauseIDs(sentence)

        chains = []

        def growBySubcat(words, clauseID):
            # Shared by steps 2.4, 2.6 and 3.1: expand non-olema verb chains
            # of one clause using the verb subcategorization lexicon
            _expandVerbChainsBySubcat(words, clauseID, chains,
                                      self.verbInfSubcatLexicon, False,
                                      stopAtPunct)

        # 2) Work through the clauses one by one; each helper operates on the
        #    shared list of chains, hence the fixed order of the steps
        for clauseID in clauses:
            words = clauses[clauseID]

            # 2.1) Extract predicate-centric basic chains
            basicChains = _extractBasicPredicateFromClause(words, clauseID)
            chains.extend(basicChains)

            # 2.2) Extract 'saama' + 'tud' verb phrases (typically rare)
            _expandSaamaWithTud(words, clauseID, chains)

            # 2.3) Extend 'olema' chains with 'nud/tud/mas/mata' verbs
            _expandOlemaVerbChains(words, clauseID, chains)

            # 2.4) Expand non-olema chains where possible (verb+verb)
            growBySubcat(words, clauseID)

            # 2.5) Find the chains with a clear context, i.e. chains to which
            #      no additional verbs can be added
            _determineVerbChainContextualAmbiguity(words, clauseID, chains)

            # 2.6) Expand non-olema chains for the 2nd time (verb+verb+verb).
            #      Chains of more than three verbs exist as well, but they are
            #      rare and constructing them would likely lead to errors.
            if secondPass:
                growBySubcat(words, clauseID)

        # 3) Extract 'ega' negations (considering the whole sentence context)
        egaNeedsExpansion = _extractEgaNegFromSent(sentence, clauses, chains)
        if egaNeedsExpansion:
            # 3.1) Expand non-olema 'ega' chains inside the clauses
            for clauseID in clauses:
                growBySubcat(clauses[clauseID], clauseID)

        # 4) Extend chains with nom/adv + Vinf relations
        if self.verbNomAdvVinfExtender:
            addGrammaticalFeatsAndRoots(sentence, chains)
            for clauseID in clauses:
                words = clauses[clauseID]
                extended = self.verbNomAdvVinfExtender.extendChainsInClause(
                    words, clauseID, chains)
                if not extended:
                    continue
                # An extended chain needs its ambiguity status recomputed
                _determineVerbChainContextualAmbiguity(words, clauseID, chains)

        # 5) Remove redundant and overlapping verb phrases
        removeRedundantVerbChains(
            chains,
            removeOverlapping=flags['removeOverlapping'],
            removeSingleAraAndEi=flags['removeSingleAraEi'])

        # 6) Add grammatical features (in the end)
        addGrammaticalFeatsAndRoots(sentence, chains)

        return chains