Beispiel #1
0
def getSubset(input,
              output=None,
              fraction=1.0,
              seed=0,
              ids=None,
              attributes=None,
              invert=False,
              targetElementTag="document"):
    """
    Stream over ``input`` and keep only a subset of its target elements.

    Every element whose tag equals ``targetElementTag`` is passed to
    ``select``; the value it returns becomes the skip flag, which stays in
    effect until the matching end event of that element. Elements seen while
    the flag is false are written to ``output`` (when one is given) and
    counted as ``"<tag>:kept"``; elements seen while it is true are counted
    as ``"<tag>:removed"``. The counts are printed to stderr at the end.

    Parameters:
        input: source object handed to ``getElementCounts`` and
            ``ETUtils.ETIteratorFromObj``.
        output: target handed to ``ETUtils.ETWriter`` and
            ``ETUtils.encodeNewlines``. When None, nothing is written and
            only the kept/removed counts are produced.
        fraction, seed: forwarded to ``getSample``; used only when neither
            ``ids`` nor ``attributes`` is given.
        ids, attributes, invert: forwarded unchanged to ``select``.
            ``attributes``, when given, must map each key to a list or tuple.
        targetElementTag: tag of the elements the selection is applied to.

    Returns:
        None.

    Raises:
        AssertionError: if a value in ``attributes`` is not a list or tuple.
    """
    distribution = None
    if ids is None and attributes is None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        elementCounts = getElementCounts(input, [targetElementTag])
        distribution = getSample(elementCounts[targetElementTag], fraction,
                                 seed)
    elif attributes is not None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            # Kept as an assert so callers still see AssertionError.
            assert isinstance(attributes[key], (list, tuple)), attributes

    counts = defaultdict(int)

    outWriter = None
    if output is not None:
        outWriter = ETUtils.ETWriter(output)
    targetElementCount = 0
    skip = False
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if event == "start":
            if element.tag == targetElementTag:
                skip = select(targetElementCount, distribution, element, ids,
                              attributes, invert)
                targetElementCount += 1
            if not skip:
                # Without an output there is no writer; still count the
                # element instead of failing on a None writer.
                if outWriter is not None:
                    outWriter.begin(element)
                counts[element.tag + ":kept"] += 1
            else:
                counts[element.tag + ":removed"] += 1
        elif event == "end":
            if not skip and outWriter is not None:
                outWriter.end(element)
            # Leaving a target element always re-enables writing.
            if element.tag == targetElementTag:
                skip = False
    if outWriter is not None:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)
Beispiel #2
0
def process(input, output=None, preprocess=True, debug=False):
    """
    Stream over ``input`` and re-emit it with its "metamap" elements replaced.

    Start/end events are copied to the output writer as they arrive, except
    for everything inside a "metamap" element, which is suppressed. When the
    end of a "metamap" element is reached, the element (passed through
    ``convert`` together with the most recently started "sentence" element
    when ``preprocess`` is true) is written in its place.

    NOTE(review): the visible code does not invoke MetaMap itself and never
    uses the temporary work directory it creates; that step appears to have
    been removed or to live elsewhere.
    NOTE(review): a "metamap" element that ends before any "sentence" element
    has started leaves ``sentence`` unbound when ``preprocess`` is true.

    Parameters:
        input: source object handed to ``ETUtils.ETIteratorFromObj``.
        output: target handed to ``ETUtils.ETWriter`` and
            ``ETUtils.encodeNewlines``. When None, nothing is written.
        preprocess: when true, each "metamap" element is replaced by the
            result of ``convert(element, sentence)``.
        debug: when true, the temporary work directory is kept and its
            location is printed to stderr; otherwise it is deleted.

    Returns:
        The ``output`` argument, unchanged.
    """
    counter = ProgressCounter(id="MetaMap")

    # Create working directory
    workdir = tempfile.mkdtemp()

    try:
        outWriter = None
        if output is not None:
            outWriter = ETUtils.ETWriter(output)

        # Loop iteratively over elements
        skip = False
        for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
            if event == "start":  # element start message, element may not be fully read yet
                if element.tag == "sentence":
                    sentence = element
                    counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                elif element.tag == "metamap":  # skip the metamap element to remove the original one
                    skip = True
                if not skip and outWriter is not None:
                    outWriter.begin(element)

            elif event == "end":  # element is fully read in memory
                if not skip and outWriter is not None:
                    outWriter.end(element)

                if element.tag == "metamap":
                    skip = False  # write elements again after this one
                    if preprocess:
                        element = convert(element, sentence)
                    # Guarded like every other writer call: without an output
                    # there is no writer to insert the element into.
                    if outWriter is not None:
                        outWriter.write(element)  # insert the new metamap element into the output stream

        if outWriter is not None:
            print >> sys.stderr, "Writing output to", output
            outWriter.close()
            ETUtils.encodeNewlines(output)
    finally:
        # Runs on failure too, so an exception no longer leaks the
        # temporary directory.
        if debug:
            print >> sys.stderr, "Work directory preserved for debugging at", workdir
        else:
            shutil.rmtree(workdir)

    return output
Beispiel #3
0
def getSubset(input, output=None, fraction=1.0, seed=0, ids=None, attributes=None, invert=False, targetElementTag="document"):
    """
    Filter the ``targetElementTag`` elements of ``input``, optionally writing
    the surviving elements to ``output``.

    For each start event of a target element, ``select`` is called and its
    result is used as the skip flag until that element's end event. While
    the flag is false, start/end events are forwarded to the writer (if there
    is one) and each started element is tallied under ``"<tag>:kept"``;
    while it is true, each started element is tallied under
    ``"<tag>:removed"``. The tallies are printed to stderr before returning.

    Parameters:
        input: source object given to ``getElementCounts`` and
            ``ETUtils.ETIteratorFromObj``.
        output: target given to ``ETUtils.ETWriter`` and
            ``ETUtils.encodeNewlines``; None means "count only, write
            nothing".
        fraction, seed: passed to ``getSample`` when both ``ids`` and
            ``attributes`` are None.
        ids, attributes, invert: passed through to ``select``. Each value in
            ``attributes`` must be a list or tuple.
        targetElementTag: tag of the elements being selected.

    Returns:
        None.

    Raises:
        AssertionError: if ``attributes`` contains a value that is neither a
            list nor a tuple.
    """
    distribution = None
    if ids is None and attributes is None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        totalTargets = getElementCounts(input, [targetElementTag])[targetElementTag]
        distribution = getSample(totalTargets, fraction, seed)
    elif attributes is not None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            # Remains an assert so the raised exception type is unchanged.
            assert isinstance(attributes[key], (list, tuple)), attributes

    counts = defaultdict(int)

    # The writer exists only when an output target was requested.
    outWriter = ETUtils.ETWriter(output) if output is not None else None
    writing = outWriter is not None
    targetElementCount = 0
    skip = False
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        isTarget = element.tag == targetElementTag
        if event == "start":
            if isTarget:
                skip = select(targetElementCount, distribution, element, ids, attributes, invert)
                targetElementCount += 1
            if skip:
                counts[element.tag + ":removed"] += 1
            else:
                # Count even when there is nowhere to write, rather than
                # dereferencing a missing writer.
                if writing:
                    outWriter.begin(element)
                counts[element.tag + ":kept"] += 1
        elif event == "end":
            if writing and not skip:
                outWriter.end(element)
            if isTarget:
                skip = False  # the skipped subtree, if any, ends here
    if writing:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)