def getCost(config, transType=None):
    """Count the VMWEs that are split between the buffer head and the lower stack.

    A VMWE of ``config.sent.vMWEs`` adds 1 to the result when all of the
    following hold:

    * it is not interleaving and is not in ``config.sent.identifiedVMWEs``;
    * ``config.buffer[0]`` belongs to it;
    * at least one token of a stack element below the top belongs to it;
    * at least one token of the top stack element does not belong to it.

    :param config: configuration exposing ``sent``, ``stack`` and ``buffer``.
        ``config.buffer[0]`` is read for every examined VMWE.
    :param transType: not used by this function.
    :return: the number of VMWEs matching the conditions above.
    """
    sentence = config.sent
    stack = config.stack
    penalty = 0
    for vmwe in sentence.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sentence.identifiedVMWEs):
            continue
        if not config.buffer[0].In(vmwe):
            continue

        topCovered = False
        deeperTouches = False
        if stack:
            # Every token of the top element is tested (no early exit).
            topCovered = all([tok.In(vmwe) for tok in Sentence.getTokens(stack[-1])])
            # Stops at the first lower-stack token that belongs to the VMWE.
            deeperTouches = any(
                tok.In(vmwe)
                for element in stack[:-1]
                for tok in Sentence.getTokens(element)
            )

        if deeperTouches and not topCovered:
            penalty += 1
    return penalty
def getCost(config, transType=None):
    """Return an integer cost derived from the top stack element of ``config``.

    Each entry of ``config.sent.vMWEs`` that is neither interleaving nor
    present in ``config.sent.identifiedVMWEs`` is examined in turn.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source. Confirm against version control, in
    particular which ``if`` the lone ``else: cost += 1`` belongs to and the
    level of the final ``if increaseCost`` block.

    :param config: configuration exposing ``sent``, ``stack`` and ``buffer``;
        ``config.stack[-1]`` is read for every examined VMWE.
    :param transType: not used by this function.
    :return: the accumulated cost, or 0 immediately in the single-token case
        described in the body.
    """
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        # Skip interleaving VMWEs and those already listed as identified.
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        increaseCost = False
        # s0BelongsToVmwe stays True only if every token of the top element is in vmwe.
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
        # Single-token VMWE fully matched by the top element.
        if len(vmwe.tokens) == 1 and s0BelongsToVmwe:
            if isinstance(config.stack[-1], Token):
                # Top element is still a bare Token.
                cost += 1
                continue
            elif isinstance(config.stack[-1], list) and len(config.stack[-1]) == 1:
                # Top element is a one-item list: free if its item has exactly
                # one parent MWE, otherwise penalised.
                if len(config.stack[-1][0].parentMWEs) == 1:
                    return 0
                else:
                    cost += 1
        if not s0BelongsToVmwe:
            continue
        # Look for a token of vmwe in the stack elements below the top.
        # The inner break leaves only the inner loop; the outer loop keeps going.
        for s in config.stack[:-1]:
            for token in Sentence.getTokens(s):
                if token.In(vmwe):
                    increaseCost = True
                    break
        if increaseCost:
            cost += 1
        else:
            # Otherwise penalise once if any buffer element belongs to vmwe.
            for b in config.buffer:
                if b.In(vmwe):
                    cost += 1
                    break
    return cost
def apply(self, parent, sent, vMWEId=None, parse=False, vMWEType=None, mwtMerge=False):
    """Initialise this transition with the configuration produced by a merge.

    The top stack element (``mwtMerge`` true) or the top two stack elements
    (otherwise) of ``parent.configuration`` are replaced by a single list
    holding them. When the merged element yields more than one token, or
    exactly one token with ``mwtMerge`` set, a ``VMWE`` is created and
    appended to the copied token list; a lone token without ``mwtMerge`` is
    appended as is.

    :param parent: previous transition; its ``configuration`` is copied.
    :param sent: sentence object; its ``blackMergeNum`` is incremented when
        ``sent`` is truthy and ``parse`` is false.
    :param vMWEId: id for the new VMWE; defaults to
        ``VMWE.getVMWENumber(tokens) + 1``.
    :param parse: when true, the new VMWE is also appended to
        ``sent.identifiedVMWEs``.
    :param vMWEType: optional value stored in the new VMWE's ``type``.
    :param mwtMerge: wrap only the top stack element instead of the top two.
    """
    Counters.blackMergeNum += 1
    if sent and not parse:
        sent.blackMergeNum += 1

    previous = parent.configuration
    bufferCopy = list(previous.buffer)

    # Build the list element that replaces the merged stack entries.
    if mwtMerge:
        keptStack = list(previous.stack)[:-1]
        mergedElement = [previous.stack[-1]]
    else:
        keptStack = list(previous.stack)[:-2]
        mergedElement = [previous.stack[-2], previous.stack[-1]]
    keptStack.append(mergedElement)

    tokenPool = list(previous.tokens)
    vMWETokens = Sentence.getTokens(keptStack[-1])
    tokenCount = len(vMWETokens)

    if tokenCount > 1 or (tokenCount == 1 and mwtMerge):
        if vMWEId is None:
            vMWEId = VMWE.getVMWENumber(tokenPool) + 1
        mergedVMWE = VMWE(vMWEId, vMWETokens[0])
        if parse:
            sent.identifiedVMWEs.append(mergedVMWE)
        mergedVMWE.tokens = vMWETokens
        if vMWEType is not None:
            mergedVMWE.type = vMWEType
        tokenPool.append(mergedVMWE)
    elif tokenCount == 1:
        tokenPool.append(vMWETokens[0])

    resultConfig = Configuration(stack=keptStack, buffer=bufferCopy, tokens=tokenPool, sent=sent,
                                 transition=self)
    super(BlackMerge, self).__init__(config=resultConfig, previous=parent, sent=sent)
def getCost(config, transType=None):
    """Return an integer cost computed from the top stack element of ``config``.

    The computation depends on whether ``config.stack[-1]`` is a ``list`` or
    a ``Token``; for any other type the result is 0.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source. Confirm against version control, in
    particular whether the first ``return cost`` sits inside the ``if`` (as
    written here, so the Token branch returns at most 1) or after the loop.

    :param config: configuration exposing ``sent`` and a non-empty ``stack``.
    :param transType: not used by this function.
    :return: the computed cost.
    """
    sent = config.sent
    cost = 0
    if isinstance(config.stack[-1], list):
        for vmwe in sent.vMWEs:
            # Skip interleaving VMWEs and those already listed as identified.
            if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
                continue
            allInList, someInList = True, False
            stackTokens = Sentence.getTokens(config.stack[-1])
            # allInList: every top-element token is in vmwe;
            # someInList: at least one of them is.
            for t in stackTokens:
                if not t.In(vmwe):
                    allInList = False
                else:
                    someInList = True
            # All top-element tokens are in vmwe and the token counts match.
            if len(stackTokens) == len(vmwe.tokens) and allInList:
                return 0
            else:
                allInList = False
            # allInList is always False at this point, so only someInList matters.
            if not allInList and someInList:
                cost += 1
    elif isinstance(config.stack[-1], Token):
        # A token with no parent MWEs costs nothing.
        if config.stack[-1].parentMWEs is None or len(config.stack[-1].parentMWEs) == 0:
            return 0
        else:
            for vmwe in config.stack[-1].parentMWEs:
                # First multi-token parent containing the token: cost 1, stop.
                if config.stack[-1].In(vmwe) and len(vmwe.tokens) > 1:
                    cost += 1
                    return cost
    return cost
def checkForVMWE(transition):
    """Return an applied ``Complete`` transition when the stack holds a whole VMWE.

    The check only runs right after a MERGE transition, when the stack
    consists of a single list element. A target VMWE is selected as follows:

    1. the only parent of the first token that has exactly one parent MWE;
    2. otherwise, among the parents shared by every token of the element,
       the first one, after dropping interleaving/embedded parents when
       more than one shared parent remains.

    A ``Complete`` is created and applied only if the selected VMWE has as
    many tokens as the stack element.

    Candidate lists are rebuilt rather than edited in place. The earlier
    in-place ``remove`` during iteration skipped candidates and could raise
    ``ValueError`` or ``IndexError``.

    :param transition: transition exposing ``type`` and ``configuration``.
    :return: the applied ``Complete`` transition, or ``None``.
    """
    config = transition.configuration
    sent = config.sent

    # Check for a possible COMPLETE of an MWE after a MERGE transition.
    if transition.type != TransitionType.MERGE:
        return None
    if len(config.stack) != 1 or not isinstance(config.stack[0], list):
        return None

    tokens = Sentence.getTokens(config.stack[0])
    vMWE = None
    candidates = []
    for token in tokens:
        # parentMWEs may be None for tokens outside any MWE.
        tokenParents = token.parentMWEs or []
        if len(tokenParents) == 1:
            vMWE = tokenParents[0]
            break
        for parent in tokenParents:
            if parent not in candidates:
                candidates.append(parent)

    if vMWE is None:
        # Keep only the parents shared by every token of the stack element.
        candidates = [
            parent for parent in candidates
            if all(parent in (token.parentMWEs or []) for token in tokens)
        ]
        if len(candidates) > 1:
            candidates = [
                parent for parent in candidates
                if not (parent.isInterleaving or parent.isEmbedded)
            ]
        if candidates:
            vMWE = candidates[0]

    if vMWE is not None and len(vMWE.tokens) == len(tokens):
        complete = Complete(sent=sent)
        complete.apply(transition, sent, vMWEId=vMWE.id)
        return complete
    return None
def getCost(config, transType=None):
    """Return an integer cost computed from the two top stack elements.

    Each entry of ``config.sent.vMWEs`` that is neither interleaving nor
    present in ``config.sent.identifiedVMWEs`` contributes at most 1.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source; confirm against version control.

    :param config: configuration exposing ``sent``, ``stack`` and ``buffer``.
        ``config.stack[-2]`` is indexed before the later
        ``len(config.stack) > 1`` test, so for a list stack that test cannot
        be false when it is reached.
    :param transType: not used by this function.
    :return: the accumulated cost.
    """
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        # Skip interleaving VMWEs and those already listed as identified.
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        s1BelongsToVmwe = True
        increaseCost = False
        # s0BelongsToVmwe: every token of the top element is in vmwe.
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
        # Single-token VMWE matched by a bare Token on top of the stack.
        if len(vmwe.tokens) == 1 and s0BelongsToVmwe and isinstance(config.stack[-1], Token):
            cost += 1
            continue
        # s1BelongsToVmwe: every token of the second element is in vmwe.
        for token in Sentence.getTokens(config.stack[-2]):
            if not token.In(vmwe):
                s1BelongsToVmwe = False
        # Both elements are inside vmwe and together hold as many tokens as vmwe has.
        if (s0BelongsToVmwe and s1BelongsToVmwe) and len(config.stack) > 1 and len(vmwe.tokens) == len(
                Sentence.getTokens(config.stack[-2:])):
            cost += 1
            continue
        # Both inside or both outside: no cost for this vmwe.
        if (s0BelongsToVmwe and s1BelongsToVmwe) or (not s0BelongsToVmwe and not s1BelongsToVmwe):
            continue
        # Exactly one of the two elements is inside vmwe from here on.
        if len(config.stack) > 2:
            # Look for a token of vmwe deeper in the stack.
            for stackElement in config.stack[:-2]:
                for token in Sentence.getTokens(stackElement):
                    if token.In(vmwe):
                        increaseCost = True
                        break
                if increaseCost:
                    break
        if increaseCost:
            cost += 1
            continue
        # Otherwise penalise once if any buffer element belongs to vmwe.
        for b in config.buffer:
            if b.In(vmwe):
                cost += 1
                break
    return cost
def check(transition):
    """Return an applied merge transition for the two top stack elements, if any.

    With fewer than two stack elements nothing is done. Otherwise the tokens
    of the second element followed by those of the top element are passed to
    ``VMWE.getParents``:

    * more than one parent: the sentence text is appended to
      ``reports.annotationReport``;
    * exactly one parent: a typed merge chosen from the parent's ``type``
      (``id``, ``ireflv``, ``lvc``, ``vpc``, anything else or empty giving
      ``MergeAsOTH``) is applied and returned.

    Failing that, if ``VMWE.haveSameParents`` yields exactly one VMWE whose
    last token equals the last of the collected tokens, a ``WhiteMerge`` is
    applied and returned.

    :param transition: transition whose ``configuration`` is inspected.
    :return: the applied merge transition, or ``None``.
    """
    config = transition.configuration
    sent = config.sent

    if len(config.stack) <= 1:
        return None

    s0Tokens = Sentence.getTokens(config.stack[-1])
    s1Tokens = Sentence.getTokens(config.stack[-2])
    # TODO: get the parent MWE for the white merge.
    tokens = s1Tokens + s0Tokens

    parents = VMWE.getParents(tokens)
    if parents:
        parentCount = len(parents)
        if parentCount > 1:
            reports.annotationReport += str(sent)
        elif parentCount == 1:
            typedMerges = {
                'id': MergeAsID,
                'ireflv': MergeAsIReflV,
                'lvc': MergeAsLVC,
                'vpc': MergeAsVPC,
            }
            parentType = parents[0].type
            if parentType is not None and parentType != '':
                mergeClass = typedMerges.get(parentType.lower(), MergeAsOTH)
            else:
                mergeClass = MergeAsOTH
            merge = mergeClass(sent=sent)
            merge.apply(transition, sent=sent)
            return merge

    sameParents = VMWE.haveSameParents(tokens)
    if sameParents and len(sameParents) == 1 and sameParents[0].tokens[-1] == tokens[-1]:
        merge = WhiteMerge(sent=sent)
        merge.apply(transition, sent)
        return merge
    return None
def check(transition):
    """Return an applied ``Merge`` when the whole stack maps to one plain VMWE.

    A merge is produced only if the stack holds more than one element and
    ``VMWE.getParents`` over the tokens of the entire stack yields exactly
    one VMWE that is neither embedded nor interleaving.

    :param transition: transition whose ``configuration`` is inspected.
    :return: the applied ``Merge`` transition, or ``None``.
    """
    config = transition.configuration
    sent = config.sent

    # A merge is only considered with more than one stack element.
    if len(config.stack) <= 1:
        return None

    candidates = VMWE.getParents(Sentence.getTokens(config.stack))
    if not candidates or len(candidates) != 1:
        return None

    onlyParent = candidates[0]
    if onlyParent.isEmbedded or onlyParent.isInterleaving:
        return None

    merge = Merge(sent=sent)
    merge.apply(transition, sent)
    return merge
def generateDisconinousFeatures(configuration, sent, transDic):
    """Flag buffer elements that may continue an MWE started on the stack top.

    The lemma text of the top stack element is looked up in the keys of
    ``Corpus.mweDictionary``. For the first key that contains this text
    without being equal to it, each of the first five buffer elements is
    tested: if its lemma is non-empty and the lemma joined to the stack text
    by one space (in either order) occurs in the key, two entries are
    written to ``transDic``:

    * ``'S0B<i>ArePartsOfMWE'`` set to ``True``;
    * ``'S0B<i>ArePartsOfMWEDistance'`` set to the difference between the
      positions of the buffer element and of the last stack-top token in
      ``sent.tokens``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source. Confirm against version control, in
    particular that the final ``break`` leaves the key loop after the first
    matching key (as written here) rather than the buffer loop.

    :param configuration: configuration exposing ``stack`` and ``buffer``.
    :param sent: sentence whose ``tokens`` list is used for positions.
    :param transDic: feature dictionary, updated in place.
    """
    tokens = Sentence.getTokens([configuration.stack[-1]])
    tokenTxt = Sentence.getTokenLemmas(tokens)
    for key in Corpus.mweDictionary.keys():
        # presumably keys are strings, making `in` a substring test -- TODO confirm
        if tokenTxt in key and tokenTxt != key:
            bufidx = 0
            for bufElem in configuration.buffer[:5]:
                if bufElem.lemma != '' and (
                        (tokenTxt + ' ' + bufElem.lemma) in key or (bufElem.lemma + ' ' + tokenTxt) in key):
                    transDic['S0B' + str(bufidx) + 'ArePartsOfMWE'] = True
                    transDic['S0B' + str(bufidx) + 'ArePartsOfMWEDistance'] = sent.tokens.index(
                        bufElem) - sent.tokens.index(tokens[-1])
                # bufidx is the position of bufElem within the inspected buffer window.
                bufidx += 1
            break
def getCost(config, transType=None):
    """Return 0 if some VMWE covers both top stack elements, else a miss count.

    VMWEs of ``config.sent.vMWEs`` that are interleaving or already in
    ``config.sent.identifiedVMWEs`` are ignored. As soon as one remaining
    VMWE contains every token of ``config.stack[-1]`` and of
    ``config.stack[-2]``, 0 is returned. Otherwise the result is the number
    of remaining VMWEs examined.

    :param config: configuration exposing ``sent`` and ``stack``.
    :param transType: not used by this function.
    :return: 0 on a full match, otherwise the number of examined VMWEs.
    """
    sentence = config.sent
    misses = 0
    for vmwe in sentence.vMWEs:
        if vmwe.isInterleaving or vmwe.In(sentence.identifiedVMWEs):
            continue
        # Both flags are always computed; each stops at its first outsider token.
        topInside = all(tok.In(vmwe) for tok in Sentence.getTokens(config.stack[-1]))
        belowInside = all(tok.In(vmwe) for tok in Sentence.getTokens(config.stack[-2]))
        if topInside and belowInside:
            return 0
        misses += 1
    return misses
def getCost(config, transType=None, type=None):
    """Return an integer cost for the top stack element with respect to ``type``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source. Confirm against version control, in
    particular that the "Precision score" part runs once after the loop (as
    written here) rather than once per VMWE.

    :param config: configuration exposing ``sent`` and a non-empty ``stack``;
        ``config.stack[-1].In`` is called, so the top element must provide
        ``In``.
    :param transType: not used by this function.
    :param type: category name compared case-insensitively with
        ``vmwe.type``. ``type.lower()`` is called unconditionally in the
        precision part, so the ``None`` default raises ``AttributeError``.
    :return: 0 as soon as a single-token VMWE of the requested type contains
        the top element, otherwise the accumulated cost.
    """
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        # Skip interleaving VMWEs and those already listed as identified.
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        # The top element is a single-token VMWE of the requested type.
        if (config.stack[-1]).In(vmwe) and len(vmwe.tokens) == 1 and vmwe.type.lower() == type.lower():
            return 0
        # The top element belongs to a multi-token VMWE.
        if (config.stack[-1]).In(vmwe) and len(vmwe.tokens) > 1:
            cost += 1
            continue
    # Precision score: one more unit when no parent VMWE is returned for
    # the top element and the requested type.
    vmwes = VMWE.getParents(Sentence.getTokens(config.stack[-1]), type.lower())
    if not vmwes:
        cost += 1
    return cost
def concatenateTokens(tokens):
    """Return a flat list of fresh ``Token`` objects, one per input element.

    * A ``Token`` element is copied into a new ``Token`` with id ``-1`` and
      the same ``text``, ``lemma`` and ``posTag``.
    * A ``list`` element becomes one ``Token`` with id ``-1`` whose
      ``text``, ``lemma`` and ``posTag`` are the corresponding fields of its
      tokens (as returned by ``Sentence.getTokens``) joined with ``'_'``.
    * Elements of any other type are dropped.

    :param tokens: iterable of ``Token`` objects and/or lists of them.
    :return: list of newly created ``Token`` objects.
    """
    flattened = []
    for element in tokens:
        if isinstance(element, Token):
            flattened.append(Token(-1, element.text, element.lemma, element.posTag))
            continue
        if not isinstance(element, list):
            continue

        parts = Sentence.getTokens(element)
        merged = Token(-1, '', '', '')
        merged.text = '_'.join([part.text for part in parts])
        merged.lemma = '_'.join([part.lemma for part in parts])
        merged.posTag = '_'.join([part.posTag for part in parts])
        flattened.append(merged)
    return flattened
def check(parent):
    """Return an applied ``Reduce`` transition when the configuration calls for one.

    A reduce is applied when the stack is non-empty and any of the following
    holds, tested in this order:

    1. the top element is a ``Token`` without parent MWEs;
    2. the buffer is empty;
    3. the top element is a one-item list whose item has exactly one parent
       MWE (a single-token MWE);
    4. the top element is a list whose tokens have exactly one parent VMWE
       according to ``VMWE.getParents``, and that VMWE is not embedded;
    5. the sentence contains interleaving MWEs and the top element is a
       ``Token`` whose only parent MWE is interleaving.

    Condition 3 previously compared ``parentMWEs`` itself with ``1``, which
    never matches a collection; it now compares its length. The parents for
    condition 4 are computed once and checked for emptiness before use.

    :param parent: previous transition; its ``configuration`` is inspected.
    :return: the applied ``Reduce`` transition, or ``None``.
    """
    config = parent.configuration
    sent = config.sent
    stack = config.stack
    # Created up front, as before, even when no reduce is finally applied.
    reduce = Reduce(sent=sent)

    def applyReduce():
        # Apply the reduce to the parent transition and hand it back.
        reduce.apply(parent, sent)
        return reduce

    # Every condition below needs a non-empty stack.
    if not stack:
        return None
    top = stack[-1]

    # 1. Top token without parent MWEs.
    if isinstance(top, Token) and not top.parentMWEs:
        return applyReduce()

    # 2. Empty buffer with a non-empty stack.
    if not config.buffer:
        return applyReduce()

    if isinstance(top, list):
        # 3. Single-token MWE wrapped in a one-item list.
        if len(top) == 1:
            mwtParents = top[0].parentMWEs
            if mwtParents is not None and len(mwtParents) == 1:
                return applyReduce()

        # 4. List whose tokens share exactly one non-embedded parent.
        sharedParents = VMWE.getParents(Sentence.getTokens(top))
        if sharedParents and len(sharedParents) == 1 and not sharedParents[0].isEmbedded:
            return applyReduce()

    # 5. Top token belonging only to an interleaving MWE.
    if (sent.containsInterleaving and isinstance(top, Token) and top.parentMWEs
            and len(top.parentMWEs) == 1 and top.parentMWEs[0].isInterleaving):
        return applyReduce()

    return None
def getCost(config, transType=None):
    """Return an integer cost for a typed merge of the two top stack elements.

    The first part examines each entry of ``config.sent.vMWEs`` that is
    neither interleaving nor present in ``config.sent.identifiedVMWEs``. The
    second part ("precision score") adds 1 when no parent VMWE of the two
    top stack elements has a type contained in the transition type name.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-flattened source. Confirm against version control, in
    particular that the deeper-stack and buffer checks sit inside
    ``if s0BelongsToVmwe and s1BelongsToVmwe`` (as written here) and that
    the precision part runs once after the loop.

    :param config: configuration exposing ``sent``, ``stack`` (at least two
        elements are indexed) and ``buffer``.
    :param transType: object with a ``name`` attribute. ``transType.name`` is
        read in the precision part whenever ``VMWE.getParents`` returns a
        non-empty result, so the ``None`` default raises ``AttributeError``
        there.
    :return: 0 as soon as a VMWE exactly matches the two top elements and
        the transition type, otherwise the accumulated cost.
    """
    sent = config.sent
    cost = 0
    for vmwe in sent.vMWEs:
        # Skip interleaving VMWEs and those already listed as identified.
        if vmwe.isInterleaving or vmwe.In(sent.identifiedVMWEs):
            continue
        s0BelongsToVmwe = True
        s1BelongsToVmwe = True
        increaseCost = False
        # s0BelongsToVmwe: every token of the top element is in vmwe.
        for token in Sentence.getTokens(config.stack[-1]):
            if not token.In(vmwe):
                s0BelongsToVmwe = False
        # Single-token VMWE matched by a bare Token on top of the stack.
        if len(vmwe.tokens) == 1 and s0BelongsToVmwe and isinstance(config.stack[-1], Token):
            cost += 1
            continue
        # s1BelongsToVmwe: every token of the second element is in vmwe.
        for token in Sentence.getTokens(config.stack[-2]):
            if not token.In(vmwe):
                s1BelongsToVmwe = False
        # Exactly one of the two elements is inside vmwe.
        if (s0BelongsToVmwe and not s1BelongsToVmwe) or (not s0BelongsToVmwe and s1BelongsToVmwe):
            cost += 1
            continue
        if s0BelongsToVmwe and s1BelongsToVmwe:
            # Token counts add up to vmwe's and the type matches the name of
            # the transition type from its eighth character onwards.
            if len(Sentence.getTokens(config.stack[-1])) + len(Sentence.getTokens(config.stack[-2])) == len(
                    vmwe.tokens) and transType.name[7:].lower() == vmwe.type.lower():
                return 0
            if len(config.stack) > 2:
                # Look for a token of vmwe deeper in the stack.
                for stackElement in config.stack[:-2]:
                    for token in Sentence.getTokens(stackElement):
                        if token.In(vmwe):
                            increaseCost = True
                            break
                    if increaseCost:
                        break
            if increaseCost:
                cost += 1
                continue
            # Otherwise penalise once if any buffer element belongs to vmwe.
            for b in config.buffer:
                if b.In(vmwe):
                    cost += 1
                    break
    # Precision score: penalise when none of the parents of the two top
    # elements has a type that appears in the transition type name.
    correctlyIdentified = False
    vmwes = VMWE.getParents(Sentence.getTokens(config.stack[-2:]))
    if vmwes:
        for vmwe in vmwes:
            if vmwe.type.lower() in str(transType.name).lower():
                correctlyIdentified = True
    if not correctlyIdentified:
        cost += 1
    return cost
def apply(self, parent, sent, vMWEId=None, parse=False):
    """Initialise this transition with the configuration produced by a complete.

    The tokens of the first stack element of ``parent.configuration`` are
    collected and the last stack element is dropped. With more than one
    token a ``VMWE`` is created and appended to the copied token list; with
    exactly one token, that token is appended instead.

    NOTE(review): tokens are read from the bottom of the stack
    (``stack[0]``) while the top (``stack[:-1]``) is removed; the two
    coincide only for a one-element stack. Confirm this is intended.

    :param parent: previous transition; its ``configuration`` is copied.
    :param sent: sentence object; when a VMWE is built, ``sent`` is not
        ``None`` and ``parse`` is false, its ``blackMergeNum`` is incremented.
    :param vMWEId: id for the new VMWE; defaults to
        ``VMWE.getVMWENumber(tokens) + 1``.
    :param parse: when true, the new VMWE is also appended to
        ``sent.identifiedVMWEs``.
    """
    Counters.completeNum += 1

    previous = parent.configuration
    bufferCopy = list(previous.buffer)
    stackCopy = list(previous.stack)
    vMWETokens = Sentence.getTokens(stackCopy[0])
    remainingStack = stackCopy[:-1]
    tokenPool = list(previous.tokens)

    tokenCount = len(vMWETokens)
    if tokenCount > 1:
        if sent is not None and not parse:
            sent.blackMergeNum += 1
        if vMWEId is None:
            vMWEId = VMWE.getVMWENumber(tokenPool) + 1
        completedVMWE = VMWE(vMWEId, vMWETokens[0])
        if parse:
            sent.identifiedVMWEs.append(completedVMWE)
        completedVMWE.tokens = vMWETokens
        tokenPool.append(completedVMWE)
    elif tokenCount == 1:
        tokenPool.append(vMWETokens[0])

    resultConfig = Configuration(stack=remainingStack, buffer=bufferCopy, tokens=tokenPool, sent=sent,
                                 transition=self)
    super(Complete, self).__init__(config=resultConfig, previous=parent, sent=sent)