Esempio n. 1
0
def mergeAll(input, output=None, debug=False, iterate=False):
    """
    Merge duplicate entities and then duplicate interactions in a corpus.

    Without iterate, the corpus is loaded with CorpusElements.loadCorpus,
    both merge steps are run on its sentences with printStats called after
    each step, the root element is written to output when output is given,
    and the loaded corpus object is returned.

    With iterate, both merge steps are run on every item yielded by
    SentenceElements.getCorpusIterator(input, output). The per-type counts
    returned by the merge steps are summed over the whole run, reported by a
    single printStats call, and None is returned.
    """
    if not iterate:
        corpus = CorpusElements.loadCorpus(
            input, removeIntersentenceInteractions=False)
        print >> sys.stderr, "Merging duplicate entities"
        counts, removed = mergeDuplicateEntities(corpus.sentences, debug)
        printStats(counts, removed)
        print >> sys.stderr, "Merging duplicate interactions"
        counts, removed = mergeDuplicateInteractions(corpus.sentences, debug)
        printStats(counts, removed)
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpus.rootElement, output)
        return corpus

    # Iterating mode: entity and interaction counts share the same totals.
    totalCounts = defaultdict(int)
    totalRemoved = defaultdict(int)
    for docSentences in SentenceElements.getCorpusIterator(input, output):
        # Entities are merged before interactions, as in the in-memory mode.
        for mergeStep in (mergeDuplicateEntities, mergeDuplicateInteractions):
            counts, removed = mergeStep(docSentences, debug)
            for itemType in counts:
                totalCounts[itemType] += counts[itemType]
            for itemType in removed:
                totalRemoved[itemType] += removed[itemType]
    printStats(totalCounts, totalRemoved)
    return None
Esempio n. 2
0
def mergeAll(input, output=None, debug=False, iterate=False):
    """
    Run duplicate-entity merging followed by duplicate-interaction merging.

    iterate=True: process each item from
    SentenceElements.getCorpusIterator(input, output), add up the per-type
    counts returned by both merge functions, print the totals once with
    printStats and return None.

    iterate=False: load the whole corpus with CorpusElements.loadCorpus,
    merge on corpusElements.sentences, print statistics after each merge,
    write the root element to output if output is given, and return the
    corpus object.
    """
    if iterate:
        def accumulate(totals, counts):
            # Add one merge step's per-type counts to the running totals.
            for typeName in counts:
                totals[typeName] += counts[typeName]

        seenTotals = defaultdict(int)
        removedTotals = defaultdict(int)
        for docSentences in SentenceElements.getCorpusIterator(input, output):
            seen, removed = mergeDuplicateEntities(docSentences, debug)
            accumulate(seenTotals, seen)
            accumulate(removedTotals, removed)
            seen, removed = mergeDuplicateInteractions(docSentences, debug)
            accumulate(seenTotals, seen)
            accumulate(removedTotals, removed)
        printStats(seenTotals, removedTotals)
        return None

    corpusElements = CorpusElements.loadCorpus(
        input, removeIntersentenceInteractions=False)
    allSentences = corpusElements.sentences

    print >> sys.stderr, "Merging duplicate entities"
    seen, removed = mergeDuplicateEntities(allSentences, debug)
    printStats(seen, removed)

    print >> sys.stderr, "Merging duplicate interactions"
    seen, removed = mergeDuplicateInteractions(allSentences, debug)
    printStats(seen, removed)

    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusElements.rootElement, output)
    return corpusElements
def mergeAll(input, output=None, debug=False):
    """
    Load a corpus, merge its duplicate entities and interactions, and
    optionally save the result.

    The corpus is loaded from input with CorpusElements.loadCorpus
    (removeIntersentenceInteractions=False). mergeDuplicateEntities and then
    mergeDuplicateInteractions are called on the loaded corpus object. When
    output is given, the corpus root element is written there with
    ETUtils.write. Returns the loaded corpus object.
    """
    corpus = CorpusElements.loadCorpus(
        input, removeIntersentenceInteractions=False)
    # Entities first, interactions second.
    for mergeStep in (mergeDuplicateEntities, mergeDuplicateInteractions):
        mergeStep(corpus, debug)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpus.rootElement, output)
    return corpus
Esempio n. 4
0
        print >> sys.stderr, "Psyco not installed"

    optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
    #optparser.add_option("-i", "--input", default=None, dest="input", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-s", "--source", default=None, dest="source", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-r", "--target", default=None, dest="target", help="Corpus in analysis format", metavar="FILE")
    #optparser.add_option("-o", "--output", default=None, dest="output", help="Corpus in analysis format", metavar="FILE")
    optparser.add_option("-t", "--tokenization", default="split-McClosky", dest="tokenization", help="Tokenization element name")
    optparser.add_option("-p", "--parse", default="split-McClosky", dest="parse", help="Parse element name")
    (options, args) = optparser.parse_args()
    assert(options.source != None)
    assert(options.target != None)
    #assert(options.output != None)
    
    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(options.source, options.parse, options.tokenization)
    print >> sys.stderr, "Loading target:",
    targetElements = CorpusElements.loadCorpus(options.target, options.parse, options.tokenization)
    parseCopied = None
    tokenizationCopied = None
    print >> sys.stderr, "Mapping sentences"
    origIdToSentences = {}
    for sourceSentence in sourceElements.sentences:
        origIdToSentences[sourceSentence.sentence.get("origId")] = [sourceSentence, None]
    for targetSentence in targetElements.sentences:
        assert origIdToSentences.has_key(targetSentence.sentence.get("origId")), targetSentence.sentence.get("origId")
        origIdToSentences[targetSentence.sentence.get("origId")][1] = targetSentence
    print >> sys.stderr, "Comparing sentences"
    count = 0
    for key in sorted(origIdToSentences.keys()):
        sourceSentence = origIdToSentences[key][0]
Esempio n. 5
0
def copyParse(input, source, output, parse, tokenization):
    """
    Copy parse and tokenization elements from a source corpus into the
    sentences of an input corpus. Sentences are matched by the value of
    their "text" attribute.

    input: object given to ETUtils.ETFromObj to obtain the tree to update
    source: corpus given to CorpusElements.loadCorpus; its sentences provide
        the parse and tokenization elements that are copied
    output: if not None, the updated tree is written here with ETUtils.write
    parse: parser name. A target "parse" element whose "parser" attribute
        equals this value counts as already present. Also passed to
        CorpusElements.loadCorpus.
    tokenization: tokenization name passed to CorpusElements.loadCorpus

    Returns the updated input tree.
    """
    def getOrCreateChild(parent, tag):
        # Return the first child of parent with this tag, appending a new
        # empty element when there is none.
        child = parent.find(tag)
        if child is None:
            child = ET.Element(tag)
            parent.append(child)
        return child

    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(source, parse, tokenization)
    # Index the source sentences by text. Duplicate texts are reported and
    # the later sentence replaces the earlier one.
    sourceSentencesByText = {}
    for sentence in sourceElements.sentences:
        sentenceText = sentence.sentence.get("text")
        if sentenceText in sourceSentencesByText:
            print >> sys.stderr, "Duplicate text", sentence.sentence.get("id"), sourceSentencesByText[sentenceText].sentence.get("id")
        sourceSentencesByText[sentenceText] = sentence
    # Each counter is [elements copied, sentences seen]. Sentences without a
    # matching source text are still counted as seen.
    parsesCopied = [0, 0]
    tokenizationsCopied = [0, 0]
    for sentence in inputRoot.getiterator("sentence"):
        parsesCopied[1] += 1
        tokenizationsCopied[1] += 1
        sentenceText = sentence.get("text")
        if sentenceText not in sourceSentencesByText:
            print >> sys.stderr, "Warning, no text found for sentence", sentence.get("id")
            continue
        sourceSentence = sourceSentencesByText[sentenceText]
        # Create the analyses and parses elements (if needed)
        targetAnalysesElement = getOrCreateChild(sentence, "sentenceanalyses")
        targetParsesElement = getOrCreateChild(targetAnalysesElement, "parses")
        # Check whether a parse by the requested parser already exists
        newParse = None
        for parseElement in targetParsesElement.findall("parse"):
            if parseElement.get("parser") == parse:
                newParse = parseElement
                break
        # Copy the parse if it doesn't. newParse must refer to the copied
        # element afterwards, because the tokenizer name is read from it.
        if newParse is None and sourceSentence.parseElement is not None:
            newParse = sourceSentence.parseElement
            targetParsesElement.append(newParse)
            parsesCopied[0] += 1

        # Create the tokenizations element (if needed)
        targetTokenizationsElement = getOrCreateChild(targetAnalysesElement, "tokenizations")
        sourceTokenization = sourceSentence.tokenizationElement
        # Determine which tokenizer to look for. The parse names it; when no
        # parse is available, fall back to the tokenization being copied.
        # NOTE(review): the fallback assumes the source tokenization element
        # has a "tokenizer" attribute like the target ones - confirm.
        tokenizerName = None
        if newParse is not None:
            tokenizerName = newParse.get("tokenizer")
        if tokenizerName is None and sourceTokenization is not None:
            tokenizerName = sourceTokenization.get("tokenizer")
        # Check whether the tokenization already exists
        newTokenization = None
        if tokenizerName is not None:
            for tokenizationElement in targetTokenizationsElement.findall("tokenization"):
                if tokenizationElement.get("tokenizer") == tokenizerName:
                    newTokenization = tokenizationElement
                    break
        # Copy the tokenization if it doesn't
        if newTokenization is None and sourceTokenization is not None:
            targetTokenizationsElement.append(sourceTokenization)
            tokenizationsCopied[0] += 1

    print >> sys.stderr, "Copied parse elements", parsesCopied
    print >> sys.stderr, "Copied tokenization elements", tokenizationsCopied

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(inputTree, output)
    return inputTree
Esempio n. 6
0
def copyParse(input, source, output, parse, tokenization):
    """
    Copy parse and tokenization elements from a source corpus into the
    sentences of an input corpus. Sentences are matched by the value of
    their "text" attribute.

    input: object given to ETUtils.ETFromObj to obtain the tree to update
    source: corpus given to CorpusElements.loadCorpus; its sentences provide
        the parse and tokenization elements that are copied
    output: if not None, the updated tree is written here with ETUtils.write
    parse: parser name. A target "parse" element whose "parser" attribute
        equals this value counts as already present. Also passed to
        CorpusElements.loadCorpus.
    tokenization: tokenization name passed to CorpusElements.loadCorpus

    Returns the updated input tree.
    """
    def getOrCreateChild(parent, tag):
        # Return the first child of parent with this tag, appending a new
        # empty element when there is none.
        child = parent.find(tag)
        if child is None:
            child = ET.Element(tag)
            parent.append(child)
        return child

    print >> sys.stderr, "Loading input file", input
    inputTree = ETUtils.ETFromObj(input)
    inputRoot = inputTree.getroot()
    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(source, parse, tokenization)
    # Index the source sentences by text. Duplicate texts are reported and
    # the later sentence replaces the earlier one.
    sourceSentencesByText = {}
    for sentence in sourceElements.sentences:
        sentenceText = sentence.sentence.get("text")
        if sentenceText in sourceSentencesByText:
            print >> sys.stderr, "Duplicate text", sentence.sentence.get(
                "id"), sourceSentencesByText[sentenceText].sentence.get("id")
        sourceSentencesByText[sentenceText] = sentence
    # Each counter is [elements copied, sentences seen]. Sentences without a
    # matching source text are still counted as seen.
    parsesCopied = [0, 0]
    tokenizationsCopied = [0, 0]
    for sentence in inputRoot.getiterator("sentence"):
        parsesCopied[1] += 1
        tokenizationsCopied[1] += 1
        sentenceText = sentence.get("text")
        if sentenceText not in sourceSentencesByText:
            print >> sys.stderr, "Warning, no text found for sentence", sentence.get(
                "id")
            continue
        sourceSentence = sourceSentencesByText[sentenceText]
        # Create the analyses and parses elements (if needed)
        targetAnalysesElement = getOrCreateChild(sentence, "sentenceanalyses")
        targetParsesElement = getOrCreateChild(targetAnalysesElement, "parses")
        # Check whether a parse by the requested parser already exists
        newParse = None
        for parseElement in targetParsesElement.findall("parse"):
            if parseElement.get("parser") == parse:
                newParse = parseElement
                break
        # Copy the parse if it doesn't. newParse must refer to the copied
        # element afterwards, because the tokenizer name is read from it.
        if newParse is None and sourceSentence.parseElement is not None:
            newParse = sourceSentence.parseElement
            targetParsesElement.append(newParse)
            parsesCopied[0] += 1

        # Create the tokenizations element (if needed)
        targetTokenizationsElement = getOrCreateChild(
            targetAnalysesElement, "tokenizations")
        sourceTokenization = sourceSentence.tokenizationElement
        # Determine which tokenizer to look for. The parse names it; when no
        # parse is available, fall back to the tokenization being copied.
        # NOTE(review): the fallback assumes the source tokenization element
        # has a "tokenizer" attribute like the target ones - confirm.
        tokenizerName = None
        if newParse is not None:
            tokenizerName = newParse.get("tokenizer")
        if tokenizerName is None and sourceTokenization is not None:
            tokenizerName = sourceTokenization.get("tokenizer")
        # Check whether the tokenization already exists
        newTokenization = None
        if tokenizerName is not None:
            for tokenizationElement in targetTokenizationsElement.findall(
                    "tokenization"):
                if tokenizationElement.get("tokenizer") == tokenizerName:
                    newTokenization = tokenizationElement
                    break
        # Copy the tokenization if it doesn't
        if newTokenization is None and sourceTokenization is not None:
            targetTokenizationsElement.append(sourceTokenization)
            tokenizationsCopied[0] += 1

    print >> sys.stderr, "Copied parse elements", parsesCopied
    print >> sys.stderr, "Copied tokenization elements", tokenizationsCopied

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(inputTree, output)
    return inputTree
Esempio n. 7
0
                         "--tokenization",
                         default="split-McClosky",
                         dest="tokenization",
                         help="Tokenization element name")
    optparser.add_option("-p",
                         "--parse",
                         default="split-McClosky",
                         dest="parse",
                         help="Parse element name")
    (options, args) = optparser.parse_args()
    assert (options.source != None)
    assert (options.target != None)
    #assert(options.output != None)

    print >> sys.stderr, "Loading source:",
    sourceElements = CorpusElements.loadCorpus(options.source, options.parse,
                                               options.tokenization)
    print >> sys.stderr, "Loading target:",
    targetElements = CorpusElements.loadCorpus(options.target, options.parse,
                                               options.tokenization)
    parseCopied = None
    tokenizationCopied = None
    print >> sys.stderr, "Mapping sentences"
    origIdToSentences = {}
    for sourceSentence in sourceElements.sentences:
        origIdToSentences[sourceSentence.sentence.get("origId")] = [
            sourceSentence, None
        ]
    for targetSentence in targetElements.sentences:
        assert origIdToSentences.has_key(targetSentence.sentence.get(
            "origId")), targetSentence.sentence.get("origId")
        origIdToSentences[targetSentence.sentence.get(