def keepForDiabetesCorpus(xmldoc, minTokenCount=100):
    """
    Return True if we should keep this abstract for the diabetes corpus.

    An abstract is kept when BOTH of the following hold for the
    'AbstractText' nodes under the first 'Abstract' element:

      * at least one token is a cost cue: its lemma is "cost", "QALY" or
        "QALYs", its text contains the substring 'cost', or
        cvFinder.tokenIsCostValue() accepts it;
      * the abstract has strictly more than `minTokenCount` tokens.

    xmldoc: DOM-style object providing getElementsByTagName().
    minTokenCount: token-count threshold that must be exceeded
        (default 100, the value that was previously hard-coded).

    Returns False when there is no 'Abstract' element or it has no
    'AbstractText' children.
    """
    abstractNodes = xmldoc.getElementsByTagName('Abstract')
    if not abstractNodes:
        return False

    # Only the first Abstract element is inspected.
    textNodeList = abstractNodes[0].getElementsByTagName('AbstractText')
    if not textNodeList:
        return False

    cueLemmaSet = {"cost", "QALY", "QALYs"}
    foundCostCue = False
    tokenCount = 0

    for textNode in textNodeList:
        text = xmlutil.getText(textNode)
        for sText in sentenceSplitter.tokenize(text):
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenizer.tokenize(sText))
            for token in sentence.Sentence(tokenList):
                tokenCount += 1
                if not foundCostCue:
                    # The lemma is read right after this call, so it is
                    # presumably what populates token.lemma.
                    lemmatizeabstracts.lemmatizeToken(token)
                    foundCostCue = (
                        token.lemma in cueLemmaSet
                        or 'cost' in token.text
                        or cvFinder.tokenIsCostValue(token)
                    )
                # Both conditions are monotonic, so once they hold the
                # rest of the abstract cannot change the answer.
                # NOTE(review): assumes the helper calls above have no side
                # effects beyond the local token -- confirm.
                if foundCostCue and tokenCount > minTokenCount:
                    return True

    # Either no cost cue was seen or the abstract was too short.
    return False
def keepForDiabetesCorpusCostValue(xmldoc):
    """
    Return True if we should keep this abstract for the diabetes corpus.

    The abstract is kept if at least *one* token in any 'AbstractText'
    node of the document is accepted by cvFinder.tokenIsCostValue()
    (i.e. is a currency value).

    xmldoc: DOM-style object providing getElementsByTagName().

    Returns False when the document has no 'AbstractText' nodes or none
    of their tokens is a cost value.
    """
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        text = xmlutil.getText(textNode)
        for sText in sentenceSplitter.tokenize(text):
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenizer.tokenize(sText))
            for token in sentence.Sentence(tokenList):
                # Tokens are lemmatized before the cost-value check, as in
                # the original order of calls.
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    # One hit is enough; no need to scan the remainder.
                    # NOTE(review): assumes the helpers have no side effects
                    # beyond the local token -- confirm.
                    return True
    return False