def extendChainsInSentence(self, sentence, foundChains):
    ''' Apply self.extendChainsInClause() to every clause of the given sentence.

        Parameters
        ----------
        sentence
            The sentence to process; it is handed to getClausesByClauseIDs(),
            whose result is iterated clause ID by clause ID.
        foundChains
            Verb chains found so far; passed unchanged to each
            self.extendChainsInClause() call (presumably extended in place
            there -- confirm against that method).

        Returns
        -------
        None
            Results of the per-clause calls are not collected.
    '''
    # 1) Preprocessing: group the sentence into clauses, keyed by clause ID
    clauses = getClausesByClauseIDs(sentence)
    # 2) Extend verb chains in each clause
    for clauseID in clauses:
        self.extendChainsInClause(clauses[clauseID], clauseID, foundChains)
def extendChainsInSentence(self, sentence, foundChains):
    ''' Apply self.extendChainsInClause() to every clause of the given sentence.

        Parameters
        ----------
        sentence
            The sentence to process; it is handed to getClausesByClauseIDs(),
            whose result is iterated clause ID by clause ID.
        foundChains
            Verb chains found so far; passed unchanged to each
            self.extendChainsInClause() call (presumably extended in place
            there -- confirm against that method).

        Returns
        -------
        None
            Results of the per-clause calls are not collected.
    '''
    # 1) Preprocessing: group the sentence into clauses, keyed by clause ID
    clauses = getClausesByClauseIDs(sentence)
    # 2) Extend verb chains in each clause
    for clauseID in clauses:
        self.extendChainsInClause(clauses[clauseID], clauseID, foundChains)
def detectVerbChainsFromSent(self, sentence, **kwargs):
    ''' Detect verb chains in the given sentence.

        Parameters
        ----------
        sentence: list of dict
            The words of the sentence, each one a dictionary holding
            morphological analyses and clause boundary annotations
            (CLAUSE_IDX must be present);

        Keyword parameters
        ------------------
        expand2ndTime: boolean
            If True, regular verb chains (chains not ending with 'olema')
            are expanded twice.
            (default: False)
        breakOnPunctuation: boolean
            If True, expansion of regular verb chains is broken off when
            punctuation intervenes.
            (default: False)
        removeSingleAraEi: boolean
            If True, verb chains consisting of a single word, 'ära' or 'ei',
            are removed.
            (default: True)
        removeOverlapping: boolean
            If True, overlapping verb chains are removed.
            (default: True)

        Every given value is converted with bool().

        Returns
        -------
        list of dict
            The detected verb chains; each chain has the following
            attributes (keys):
            PHRASE -- list of int : indexes of the sentence elements that
                      belong to the chain;
            PATTERN -- list of str : for each word in the phrase, marks
                      whether it is 'ega', 'ei', 'ära', 'pole', 'ole',
                      '&' (conjunction: ja/ning/ega/või), 'verb' (a verb
                      other than 'ole') or 'nom/adv';
            ANALYSIS_IDS -- list of (list of int) : for each word in the
                      phrase, index(es) of the morphological analyses that
                      correspond to the word in the verb chain;
            ROOTS -- list of str : for each word in the phrase, its ROOT
                      value from the morphological analysis; e.g. for the
                      verb chain 'püüab kodeerida', the ROOTS value is
                      ['püüd', 'kodeeri'];
            MORPH -- list of str : for each word in the phrase, its
                      part-of-speech and morphological form (in one string,
                      separated by '_', with multiple pos/form variants
                      separated by '/'); e.g. for the verb chain
                      'on tulnud', the MORPH value is ['V_vad/b', 'V_nud'];
            OTHER_VERBS -- bool : whether the context contains other verbs
                      that could potentially belong to the chain; if True,
                      it is uncertain whether the chain is complete;
            POLARITY -- 'POS', 'NEG' or '??' : grammatical polarity of the
                      chain; negative polarity indicates that the phrase
                      begins with 'ei', 'ega', 'ära' or 'pole';
            TENSE -- tense of the main verb: 'present', 'imperfect',
                      'perfect', 'pluperfect', 'past', '??';
            MOOD -- mood of the main verb: 'indic', 'imper', 'condit',
                      'quotat', '??';
            VOICE -- voice of the main verb: 'personal', 'impersonal', '??';

        Raises
        ------
        Exception
            If a keyword argument other than the four listed above is given.
    '''
    # 0) Read the keyword options; names missing from this table are rejected
    options = {
        'expand2ndTime': False,
        'removeOverlapping': True,
        'removeSingleAraEi': True,
        'breakOnPunctuation': False,
    }
    for optName, optValue in kwargs.items():
        if optName not in options:
            raise Exception(' Unsupported argument given: ' + optName)
        options[optName] = bool(optValue)

    # 1) Preprocessing
    sentence = addWordIDs(sentence)
    clauses = getClausesByClauseIDs(sentence)
    verbChains = []

    def expandBySubcat(clause, clauseID):
        # Steps 2.4, 2.6 and 3.1 all issue this very same call
        _expandVerbChainsBySubcat(
            clause, clauseID, verbChains,
            self.verbInfSubcatLexicon, False, options['breakOnPunctuation'])

    # 2) Extract predicate-centric verb chains within each clause
    for clauseID in clauses:
        clause = clauses[clauseID]
        # 2.1) Basic predicate-centric chains of the clause
        verbChains.extend(_extractBasicPredicateFromClause(clause, clauseID))
        # 2.2) 'saama' + 'tud' verb phrases (typically rare)
        _expandSaamaWithTud(clause, clauseID, verbChains)
        # 2.3) 'olema' chains extended with 'nud/tud/mas/mata' verbs (if possible)
        _expandOlemaVerbChains(clause, clauseID, verbChains)
        # 2.4) Non-olema chains expanded inside the clause (verb+verb chains)
        expandBySubcat(clause, clauseID)
        # 2.5) Determine for which chains the context should be clear
        #      (no additional verbs can be added to the phrase)
        _determineVerbChainContextualAmbiguity(clause, clauseID, verbChains)
        # 2.6) Optional second expansion of non-olema chains (verb+verb+verb).
        #      Longer chains are possible, but three verbs seems to be a
        #      critical length: longer chains are rare, so building them
        #      would likely lead to errors.
        if options['expand2ndTime']:
            expandBySubcat(clause, clauseID)

    # 3) Extract 'ega' negations (considering the whole sentence context)
    if _extractEgaNegFromSent(sentence, clauses, verbChains):
        for clauseID in clauses:
            # 3.1) Expand non-olema 'ega' chains inside the clause, if possible
            expandBySubcat(clauses[clauseID], clauseID)

    # 4) Extend chains with nom/adv + Vinf relations
    extender = self.verbNomAdvVinfExtender
    if extender:
        addGrammaticalFeatsAndRoots(sentence, verbChains)
        for clauseID in clauses:
            clause = clauses[clauseID]
            if extender.extendChainsInClause(clause, clauseID, verbChains):
                _determineVerbChainContextualAmbiguity(
                    clause, clauseID, verbChains)

    # 5) Remove redundant and overlapping verb phrases
    removeRedundantVerbChains(
        verbChains,
        removeOverlapping=options['removeOverlapping'],
        removeSingleAraAndEi=options['removeSingleAraEi'])

    # 6) Add grammatical features (in the end)
    addGrammaticalFeatsAndRoots(sentence, verbChains)
    return verbChains
def detectVerbChainsFromSent(self, sentence, **kwargs):
    ''' Detect verb chains in the given sentence.

        Parameters
        ----------
        sentence: list of dict
            The words of the sentence, each one a dictionary holding
            morphological analyses and clause boundary annotations
            (CLAUSE_IDX must be present);

        Keyword parameters
        ------------------
        expand2ndTime: boolean
            If True, regular verb chains (chains not ending with 'olema')
            are expanded twice.
            (default: False)
        breakOnPunctuation: boolean
            If True, expansion of regular verb chains is broken off when
            punctuation intervenes.
            (default: False)
        removeSingleAraEi: boolean
            If True, verb chains consisting of a single word, 'ära' or 'ei',
            are removed.
            (default: True)
        removeOverlapping: boolean
            If True, overlapping verb chains are removed.
            (default: True)

        Every given value is converted with bool().

        Returns
        -------
        list of dict
            The detected verb chains; each chain has the following
            attributes (keys):
            PHRASE -- list of int : indexes of the sentence elements that
                      belong to the chain;
            PATTERN -- list of str : for each word in the phrase, marks
                      whether it is 'ega', 'ei', 'ära', 'pole', 'ole',
                      '&' (conjunction: ja/ning/ega/või), 'verb' (a verb
                      other than 'ole') or 'nom/adv';
            ANALYSIS_IDS -- list of (list of int) : for each word in the
                      phrase, index(es) of the morphological analyses that
                      correspond to the word in the verb chain;
            ROOTS -- list of str : for each word in the phrase, its ROOT
                      value from the morphological analysis; e.g. for the
                      verb chain 'püüab kodeerida', the ROOTS value is
                      ['püüd', 'kodeeri'];
            MORPH -- list of str : for each word in the phrase, its
                      part-of-speech and morphological form (in one string,
                      separated by '_', with multiple pos/form variants
                      separated by '/'); e.g. for the verb chain
                      'on tulnud', the MORPH value is ['V_vad/b', 'V_nud'];
            OTHER_VERBS -- bool : whether the context contains other verbs
                      that could potentially belong to the chain; if True,
                      it is uncertain whether the chain is complete;
            POLARITY -- 'POS', 'NEG' or '??' : grammatical polarity of the
                      chain; negative polarity indicates that the phrase
                      begins with 'ei', 'ega', 'ära' or 'pole';
            TENSE -- tense of the main verb: 'present', 'imperfect',
                      'perfect', 'pluperfect', 'past', '??';
            MOOD -- mood of the main verb: 'indic', 'imper', 'condit',
                      'quotat', '??';
            VOICE -- voice of the main verb: 'personal', 'impersonal', '??';

        Raises
        ------
        Exception
            If a keyword argument other than the four listed above is given.
    '''
    # 0) Read the keyword options; names missing from this table are rejected
    options = {
        'expand2ndTime': False,
        'removeOverlapping': True,
        'removeSingleAraEi': True,
        'breakOnPunctuation': False,
    }
    for optName, optValue in kwargs.items():
        if optName not in options:
            raise Exception(' Unsupported argument given: ' + optName)
        options[optName] = bool(optValue)

    # 1) Preprocessing
    sentence = addWordIDs(sentence)
    clauses = getClausesByClauseIDs(sentence)
    verbChains = []

    def expandBySubcat(clause, clauseID):
        # Steps 2.4, 2.6 and 3.1 all issue this very same call
        _expandVerbChainsBySubcat(
            clause, clauseID, verbChains,
            self.verbInfSubcatLexicon, False, options['breakOnPunctuation'])

    # 2) Extract predicate-centric verb chains within each clause
    for clauseID in clauses:
        clause = clauses[clauseID]
        # 2.1) Basic predicate-centric chains of the clause
        verbChains.extend(_extractBasicPredicateFromClause(clause, clauseID))
        # 2.2) 'saama' + 'tud' verb phrases (typically rare)
        _expandSaamaWithTud(clause, clauseID, verbChains)
        # 2.3) 'olema' chains extended with 'nud/tud/mas/mata' verbs (if possible)
        _expandOlemaVerbChains(clause, clauseID, verbChains)
        # 2.4) Non-olema chains expanded inside the clause (verb+verb chains)
        expandBySubcat(clause, clauseID)
        # 2.5) Determine for which chains the context should be clear
        #      (no additional verbs can be added to the phrase)
        _determineVerbChainContextualAmbiguity(clause, clauseID, verbChains)
        # 2.6) Optional second expansion of non-olema chains (verb+verb+verb).
        #      Longer chains are possible, but three verbs seems to be a
        #      critical length: longer chains are rare, so building them
        #      would likely lead to errors.
        if options['expand2ndTime']:
            expandBySubcat(clause, clauseID)

    # 3) Extract 'ega' negations (considering the whole sentence context)
    if _extractEgaNegFromSent(sentence, clauses, verbChains):
        for clauseID in clauses:
            # 3.1) Expand non-olema 'ega' chains inside the clause, if possible
            expandBySubcat(clauses[clauseID], clauseID)

    # 4) Extend chains with nom/adv + Vinf relations
    extender = self.verbNomAdvVinfExtender
    if extender:
        addGrammaticalFeatsAndRoots(sentence, verbChains)
        for clauseID in clauses:
            clause = clauses[clauseID]
            if extender.extendChainsInClause(clause, clauseID, verbChains):
                _determineVerbChainContextualAmbiguity(
                    clause, clauseID, verbChains)

    # 5) Remove redundant and overlapping verb phrases
    removeRedundantVerbChains(
        verbChains,
        removeOverlapping=options['removeOverlapping'],
        removeSingleAraAndEi=options['removeSingleAraEi'])

    # 6) Add grammatical features (in the end)
    addGrammaticalFeatsAndRoots(sentence, verbChains)
    return verbChains