예제 #1
0
def getCorpusIterator(input,
                      output,
                      parse,
                      tokenization=None,
                      removeNameInfo=False,
                      removeIntersentenceInteractions=True):
    """
    Stream a corpus document by document, yielding its sentences.

    The input is read with ETUtils.ETIteratorFromObj. Every time a
    "document" element has been read completely, one SentenceElements
    object is built for each of its "sentence" children and the resulting
    list is yielded. Sentences that have at least one token get a
    SentenceGraph in their ``sentenceGraph`` attribute; sentences without
    tokens get ``sentenceGraph = None``.

    If ``output`` is not None the corpus is also written out through an
    ETUtils.ETWriter: the "corpus" element is opened/closed as it is
    encountered and each document is written after the consumer has
    finished with the yielded list (i.e. when the generator is resumed),
    so modifications made to the document by the consumer are written.

    @param input: corpus source, passed unchanged to ETUtils.ETIteratorFromObj
    @param output: target passed to ETUtils.ETWriter, or None for no output
    @param parse: parse identifier, passed to SentenceElements
    @param tokenization: tokenization identifier, passed to SentenceElements
    @param removeNameInfo: not used by this function; kept so existing
        callers that pass it keep working
    @param removeIntersentenceInteractions: passed to SentenceElements
    @return: a generator yielding one list of SentenceElements per document
    """
    import Utils.ElementTreeUtils as ETUtils
    from Utils.InteractionXML.SentenceElements import SentenceElements

    etWriter = None
    if output is not None:
        etWriter = ETUtils.ETWriter(output)
    # The try/finally guarantees that the writer is closed even when the
    # consumer abandons the generator early or an exception is raised.
    try:
        for event, element in ETUtils.ETIteratorFromObj(input,
                                                        ("start", "end")):
            # NOTE(review): "memory" is presumably the event used for trees
            # that are already in memory -- confirm against ETIteratorFromObj.
            if event in ("end", "memory") and element.tag == "document":
                sentences = []
                for sentenceElement in element.findall("sentence"):
                    sentence = SentenceElements(
                        sentenceElement,
                        parse,
                        tokenization,
                        removeIntersentenceInteractions=
                        removeIntersentenceInteractions)
                    # Only the token count is checked; a sentence with tokens
                    # but no dependencies still gets a graph.
                    if len(sentence.tokens) == 0:
                        sentence.sentenceGraph = None
                    else:
                        # Construct the basic SentenceGraph (only syntactic
                        # information)
                        graph = SentenceGraph(sentence.sentence,
                                              sentence.tokens,
                                              sentence.dependencies)
                        # Add semantic information, i.e. the interactions
                        graph.mapInteractions(sentence.entities,
                                              sentence.interactions)
                        graph.interSentenceInteractions = sentence.interSentenceInteractions
                        sentence.sentenceGraph = graph
                        graph.parseElement = sentence.parseElement
                        graph.documentElement = element
                    sentences.append(sentence)
                yield sentences
                # Written only after the consumer resumes the generator.
                if etWriter is not None:
                    etWriter.write(element)
            elif element.tag == "corpus" and etWriter is not None:
                if event == "start":
                    etWriter.begin(element)
                else:
                    etWriter.end(element)
            # Release the processed subtree to keep memory use bounded.
            if event == "end" and element.tag in ("document", "corpus"):
                element.clear()
    finally:
        if etWriter is not None:
            etWriter.close()
예제 #2
0
def getSubset(input,
              output=None,
              fraction=1.0,
              seed=0,
              ids=None,
              attributes=None,
              invert=False,
              targetElementTag="document"):
    """
    Copy a subset of the target elements of an XML stream to ``output``.

    The input is read with ETUtils.ETIteratorFromObj. For every element
    whose tag equals ``targetElementTag`` the module-level ``select``
    function decides whether the element is skipped; a skipped target
    element is dropped together with everything nested inside it. All
    other elements are copied. Per-tag "kept"/"removed" counts are printed
    to stderr at the end.

    When neither ``ids`` nor ``attributes`` is given, a pseudorandom
    distribution is generated with ``getSample`` from the target element
    count reported by ``getElementCounts`` (so ``input`` is read twice).

    @param input: XML source, passed to getElementCounts and ETIteratorFromObj
    @param output: target passed to ETUtils.ETWriter; if None nothing is
        written and only the counts are computed and reported
    @param fraction: passed to getSample
    @param seed: passed to getSample
    @param ids: passed to select
    @param attributes: dictionary whose values must all be lists or tuples;
        passed to select
    @param invert: passed to select
    @param targetElementTag: tag of the elements the selection applies to
    @raise AssertionError: if a value in ``attributes`` is not a list or tuple
    """
    distribution = None
    if ids is None and attributes is None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        distribution = getSample(
            getElementCounts(input, [targetElementTag])[targetElementTag],
            fraction, seed)
    elif attributes is not None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            assert isinstance(attributes[key], (list, tuple)), attributes

    counts = defaultdict(int)

    outWriter = None
    if output is not None:
        outWriter = ETUtils.ETWriter(output)
    targetElementCount = 0
    skip = False  # True while inside a target element that is being dropped
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event == "start":
            if element.tag == targetElementTag:
                skip = select(targetElementCount, distribution, element, ids,
                              attributes, invert)
                targetElementCount += 1
            if not skip:
                # The writer is optional: without an output only the
                # statistics are collected.
                if outWriter is not None:
                    outWriter.begin(element)
                counts[element.tag + ":kept"] += 1
            else:
                counts[element.tag + ":removed"] += 1
        elif event == "end":
            if not skip and outWriter is not None:
                outWriter.end(element)
            # Leaving the target element ends the skipped region.
            if element.tag == targetElementTag:
                skip = False
    if outWriter is not None:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)
예제 #3
0
def process(input, output=None, preprocess=True, debug=False):
    """
    Stream an XML file and re-emit its "metamap" elements.

    The input is read with ETUtils.ETIteratorFromObj and, if ``output`` is
    given, copied to it through an ETUtils.ETWriter. Each "metamap" element
    (including its children) is withheld from the copy while it is being
    read; once it has been read completely it is written as a whole, after
    being passed through ``convert(element, sentence)`` when ``preprocess``
    is true. ``sentence`` is the most recently started "sentence" element,
    or None if no sentence has been seen yet.

    NOTE(review): the code visible here does not invoke MetaMap itself and
    never uses the temporary work directory it creates -- confirm whether
    that step lives elsewhere.

    @param input: XML source, passed to ETUtils.ETIteratorFromObj
    @param output: target passed to ETUtils.ETWriter; if None nothing is
        written
    @param preprocess: if true, metamap elements are replaced by the result
        of convert() before being written
    @param debug: if true, the temporary work directory is kept and its
        path is printed to stderr; otherwise it is removed
    @return: the ``output`` argument, unchanged
    """
    counter = ProgressCounter(id="MetaMap")

    # Create working directory
    workdir = tempfile.mkdtemp()

    # The try/finally makes sure the work directory is cleaned up (or
    # reported, in debug mode) even if processing fails.
    try:
        outWriter = None
        if output is not None:
            outWriter = ETUtils.ETWriter(output)

        # Loop iteratively over elements
        sentence = None  # most recently started sentence element
        skip = False  # True while inside a metamap element
        for event, element in ETUtils.ETIteratorFromObj(input,
                                                        ("start", "end")):
            if event == "start":  # element may not be fully read yet
                if element.tag == "sentence":
                    sentence = element
                    counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                elif element.tag == "metamap":
                    # Withhold the original metamap element from the output
                    skip = True
                if not skip and outWriter is not None:
                    outWriter.begin(element)

            elif event == "end":  # element is fully read in memory
                if not skip and outWriter is not None:
                    outWriter.end(element)

                if element.tag == "metamap":
                    skip = False  # write elements again after this one
                    if preprocess:
                        element = convert(element, sentence)
                    # Insert the new metamap element into the output stream
                    if outWriter is not None:
                        outWriter.write(element)

        if outWriter is not None:
            print >> sys.stderr, "Writing output to", output
            outWriter.close()
            ETUtils.encodeNewlines(output)
    finally:
        if debug:
            print >> sys.stderr, "Work directory preserved for debugging at", workdir
        else:
            shutil.rmtree(workdir)

    return output