def getSubset(input, output=None, fraction=1.0, seed=0, ids=None, attributes=None, invert=False, targetElementTag="document"):
    """
    Stream the elements of `input` and copy them to `output`, leaving out
    every `targetElementTag` element (together with everything nested
    inside it) for which `select` returns a true value.

    input: source handed to ETUtils.ETIteratorFromObj (and to
        getElementCounts when a pseudorandom distribution is built).
    output: destination handed to ETUtils.ETWriter. When None nothing is
        written and only the kept/removed counts are reported.
    fraction, seed: forwarded to getSample; only used when both `ids` and
        `attributes` are None.
    ids: forwarded to `select`.
    attributes: mapping forwarded to `select`; every value must be a list
        or a tuple.
    invert: forwarded to `select`.
    targetElementTag: tag of the elements the selection is applied to.

    Returns None. A summary of the per-tag counts is printed to stderr.
    Raises AssertionError if a value in `attributes` is not a list/tuple.
    """
    distribution = None
    if ids is None and attributes is None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        targetTotal = getElementCounts(input, [targetElementTag])[targetElementTag]
        distribution = getSample(targetTotal, fraction, seed)
    elif attributes is not None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            assert isinstance(attributes[key], (list, tuple)), attributes

    counts = defaultdict(int)

    outWriter = None
    if output is not None:
        outWriter = ETUtils.ETWriter(output)

    targetElementCount = 0
    skip = False
    for event in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        eventType, element = event[0], event[1]
        if eventType == "start":
            if element.tag == targetElementTag:
                skip = select(targetElementCount, distribution, element, ids, attributes, invert)
                targetElementCount += 1
            if not skip:
                # The writer only exists when an output was requested;
                # without it the run still produces the counts below.
                if outWriter is not None:
                    outWriter.begin(element)
                counts[element.tag + ":kept"] += 1
            else:
                counts[element.tag + ":removed"] += 1
        elif eventType == "end":
            if not skip and outWriter is not None:
                outWriter.end(element)
            # Leaving a target element ends the skipped region, so the
            # elements that follow it are written again.
            if element.tag == targetElementTag:
                skip = False

    if outWriter is not None:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)
def process(input, output=None, preprocess=True, debug=False):
    """
    Run MetaMap.

    Streams the elements of `input`. Existing "metamap" elements are left
    out of the copied stream and each one is written back in their place
    once it has been fully read, passed through `convert` first when
    `preprocess` is true.

    input: source handed to ETUtils.ETIteratorFromObj.
    output: destination handed to ETUtils.ETWriter. When None nothing is
        written.
    preprocess: when true, each "metamap" element is replaced by
        convert(element, sentence) before it is written.
    debug: when true, the temporary work directory is kept and its path
        is printed to stderr; otherwise the directory is deleted.

    Returns `output` unchanged.
    """
    counter = ProgressCounter(id="MetaMap")

    # Create working directory
    workdir = tempfile.mkdtemp()
    try:
        outWriter = None
        if output is not None:
            outWriter = ETUtils.ETWriter(output)

        # Loop iteratively over elements
        skip = False
        for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
            if event == "start":
                # Element start message, element may not be fully read yet
                if element.tag == "sentence":
                    sentence = element
                    counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                elif element.tag == "metamap":
                    # Skip the metamap element to remove the original one
                    skip = True
                if not skip and outWriter is not None:
                    outWriter.begin(element)
            elif event == "end":
                # Element is fully read in memory
                if not skip and outWriter is not None:
                    outWriter.end(element)
                if element.tag == "metamap":
                    skip = False  # write elements again after this one
                    if preprocess:
                        element = convert(element, sentence)
                    # Insert the new metamap element into the output
                    # stream; there is no writer when output is None.
                    if outWriter is not None:
                        outWriter.write(element)

        if outWriter is not None:
            print >> sys.stderr, "Writing output to", output
            outWriter.close()
            ETUtils.encodeNewlines(output)
    finally:
        # Runs on failure as well, so the temporary directory is not
        # left behind when processing raises.
        if debug:
            print >> sys.stderr, "Work directory preserved for debugging at", workdir
        else:
            shutil.rmtree(workdir)

    return output
def getSubset(input, output=None, fraction=1.0, seed=0, ids=None, attributes=None, invert=False, targetElementTag="document"):
    """
    Stream the elements of `input` and copy them to `output`, leaving out
    every `targetElementTag` element (together with everything nested
    inside it) for which `select` returns a true value.

    input: source handed to ETUtils.ETIteratorFromObj (and to
        getElementCounts when a pseudorandom distribution is built).
    output: destination handed to ETUtils.ETWriter. When None nothing is
        written and only the kept/removed counts are reported.
    fraction, seed: forwarded to getSample; only used when both `ids` and
        `attributes` are None.
    ids: forwarded to `select`.
    attributes: mapping forwarded to `select`; every value must be a list
        or a tuple.
    invert: forwarded to `select`.
    targetElementTag: tag of the elements the selection is applied to.

    Returns None. A summary of the per-tag counts is printed to stderr.
    Raises AssertionError if a value in `attributes` is not a list/tuple.
    """
    distribution = None
    if ids is None and attributes is None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        targetTotal = getElementCounts(input, [targetElementTag])[targetElementTag]
        distribution = getSample(targetTotal, fraction, seed)
    elif attributes is not None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            assert isinstance(attributes[key], (list, tuple)), attributes

    counts = defaultdict(int)

    outWriter = None
    if output is not None:
        outWriter = ETUtils.ETWriter(output)

    targetElementCount = 0
    skip = False
    for event in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        eventType, element = event[0], event[1]
        if eventType == "start":
            if element.tag == targetElementTag:
                skip = select(targetElementCount, distribution, element, ids, attributes, invert)
                targetElementCount += 1
            if not skip:
                # The writer only exists when an output was requested;
                # without it the run still produces the counts below.
                if outWriter is not None:
                    outWriter.begin(element)
                counts[element.tag + ":kept"] += 1
            else:
                counts[element.tag + ":removed"] += 1
        elif eventType == "end":
            if not skip and outWriter is not None:
                outWriter.end(element)
            # Leaving a target element ends the skipped region, so the
            # elements that follow it are written again.
            if element.tag == targetElementTag:
                skip = False

    if outWriter is not None:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)