Example #1
0
def keepForDiabetesCorpus(xmldoc):
    """ Decide whether an abstract should be kept for the diabetes corpus.

        The abstract is kept only when BOTH of the following hold:
          * at least one token is a cost term (its lemma is one of
            "cost", "QALY", "QALYs", or its text contains 'cost') or is
            recognized as a cost value by cvFinder.tokenIsCostValue, and
          * the abstract contains more than 100 tokens in total.

        Only the 'AbstractText' children of the first 'Abstract' element are
        examined. Returns False when either element is missing.
    """
    abstracts = xmldoc.getElementsByTagName('Abstract')
    if not abstracts:
        return False

    abstractTexts = abstracts[0].getElementsByTagName('AbstractText')
    if not abstractTexts:
        return False

    costCues = {"cost", "QALY", "QALYs"}

    def iterTokens():
        # Yield every token of every sentence, in document order.
        for node in abstractTexts:
            for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(node)):
                words = tokenizer.tokenize(sentenceText)
                tokens = tokenlist.TokenList()
                tokens.convertStringList(words)
                for tok in sentence.Sentence(tokens):
                    yield tok

    totalTokens = 0
    sawCostTerm = False
    sawCostValue = False
    for tok in iterTokens():
        totalTokens += 1
        lemmatizeabstracts.lemmatizeToken(tok)
        if tok.lemma in costCues or 'cost' in tok.text:
            sawCostTerm = True
        if cvFinder.tokenIsCostValue(tok):
            sawCostValue = True

    # Short abstracts are rejected even if they mention costs.
    return totalTokens > 100 and (sawCostValue or sawCostTerm)
Example #2
0
def keepForDiabetesCorpus(xmldoc):
    """ Decide whether an abstract should be kept for the diabetes corpus.

        The abstract is kept only when BOTH of the following hold:
          * at least one token is a cost term (its lemma is one of
            "cost", "QALY", "QALYs", or its text contains 'cost') or is
            recognized as a cost value by cvFinder.tokenIsCostValue, and
          * the abstract contains more than 100 tokens in total.

        Only the 'AbstractText' children of the first 'Abstract' element are
        examined. Returns False when either element is missing.
    """
    abstracts = xmldoc.getElementsByTagName('Abstract')
    if not abstracts:
        return False

    abstractTexts = abstracts[0].getElementsByTagName('AbstractText')
    if not abstractTexts:
        return False

    costCues = {"cost", "QALY", "QALYs"}

    def iterTokens():
        # Yield every token of every sentence, in document order.
        for node in abstractTexts:
            for sentenceText in sentenceSplitter.tokenize(xmlutil.getText(node)):
                words = tokenizer.tokenize(sentenceText)
                tokens = tokenlist.TokenList()
                tokens.convertStringList(words)
                for tok in sentence.Sentence(tokens):
                    yield tok

    totalTokens = 0
    sawCostTerm = False
    sawCostValue = False
    for tok in iterTokens():
        totalTokens += 1
        lemmatizeabstracts.lemmatizeToken(tok)
        if tok.lemma in costCues or 'cost' in tok.text:
            sawCostTerm = True
        if cvFinder.tokenIsCostValue(tok):
            sawCostValue = True

    # Short abstracts are rejected even if they mention costs.
    return totalTokens > 100 and (sawCostValue or sawCostTerm)
Example #3
0
def keepForDiabetesCorpusCostValue(xmldoc):
    """ Return True if we should keep this abstract for the diabetes corpus.

        The abstract is kept when at least *one* token in any 'AbstractText'
        element of xmldoc is recognized as a cost value by
        cvFinder.tokenIsCostValue. Returns False when there are no
        'AbstractText' elements or none of their tokens qualifies.

        The scan stops at the first cost value found: only the existence of
        a match matters, so the rest of the abstract is not tokenized or
        lemmatized once the answer is known.
    """
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        text = xmlutil.getText(textNode)
        for sText in sentenceSplitter.tokenize(text):
            tokenTextList = tokenizer.tokenize(sText)
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenTextList)
            for token in sentence.Sentence(tokenList):
                # Lemmatize before the check, as before; presumably the
                # cost-value test may rely on the lemma -- TODO confirm.
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    return True

    return False
Example #4
0
def keepForDiabetesCorpusCostValue(xmldoc):
    """ Return True if we should keep this abstract for the diabetes corpus.

        The abstract is kept when at least *one* token in any 'AbstractText'
        element of xmldoc is recognized as a cost value by
        cvFinder.tokenIsCostValue. Returns False when there are no
        'AbstractText' elements or none of their tokens qualifies.

        The scan stops at the first cost value found: only the existence of
        a match matters, so the rest of the abstract is not tokenized or
        lemmatized once the answer is known.
    """
    for textNode in xmldoc.getElementsByTagName('AbstractText'):
        text = xmlutil.getText(textNode)
        for sText in sentenceSplitter.tokenize(text):
            tokenTextList = tokenizer.tokenize(sText)
            tokenList = tokenlist.TokenList()
            tokenList.convertStringList(tokenTextList)
            for token in sentence.Sentence(tokenList):
                # Lemmatize before the check, as before; presumably the
                # cost-value test may rely on the lemma -- TODO confirm.
                lemmatizeabstracts.lemmatizeToken(token)
                if cvFinder.tokenIsCostValue(token):
                    return True

    return False