def getCorpusIterator(input, output, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractions=True):
    """
    Generator that walks the documents of a corpus one at a time.

    For every "document" element delivered by ETUtils.ETIteratorFromObj, a
    list is yielded that holds one SentenceElements object per "sentence"
    child of that document. Each of these objects receives a sentenceGraph
    attribute: a SentenceGraph with the sentence's interactions mapped onto
    it, or None when the sentence has no tokens.

    If output is not None, the "corpus" element is begun and ended on an
    ETWriter opened on output, and each document is written to it after the
    consumer resumes the generator. Fully read "document" and "corpus"
    elements are cleared once they have been handled.

    The removeNameInfo argument is accepted but not used by this function.
    """
    import Utils.ElementTreeUtils as ETUtils
    from Utils.InteractionXML.SentenceElements import SentenceElements

    def buildSentence(sentenceElement, documentElement):
        # Wrap one sentence element and attach its graph (or None if it has no tokens)
        sentence = SentenceElements(sentenceElement, parse, tokenization, removeIntersentenceInteractions=removeIntersentenceInteractions)
        if len(sentence.tokens) > 0:
            # Syntactic information first ...
            graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
            # ... then the semantic information, i.e. the interactions
            graph.mapInteractions(sentence.entities, sentence.interactions)
            graph.interSentenceInteractions = sentence.interSentenceInteractions
            sentence.sentenceGraph = graph
            graph.parseElement = sentence.parseElement
            graph.documentElement = documentElement
        else:
            sentence.sentenceGraph = None
        return sentence

    writeOutput = output != None
    if writeOutput:
        etWriter = ETUtils.ETWriter(output)
    for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        if element.tag == "document" and event in ("end", "memory"):
            yield [buildSentence(child, element) for child in element.findall("sentence")]
            # The document is written only after the consumer has seen it
            if writeOutput:
                etWriter.write(element)
        elif writeOutput and element.tag == "corpus":
            if event == "start":
                etWriter.begin(element)
            else:
                etWriter.end(element)
        # Only "end" events trigger clearing; other events leave the element intact
        if event == "end" and element.tag in ("document", "corpus"):
            element.clear()
    if writeOutput:
        etWriter.close()
def getSubset(input, output=None, fraction=1.0, seed=0, ids=None, attributes=None, invert=False, targetElementTag="document"):
    """
    Select a subset of the target elements of input, optionally writing it out.

    The input is streamed with ETUtils.ETIteratorFromObj. For each element
    whose tag equals targetElementTag, select() decides whether the element
    (and everything nested in it) is skipped. Elements that are not skipped
    are passed to an ETWriter opened on output. Counts of kept and removed
    elements, keyed by "<tag>:kept" / "<tag>:removed", are printed to stderr.

    @param input: the source given to ETUtils.ETIteratorFromObj
    @param output: target for ETUtils.ETWriter, or None to only count elements
    @param fraction: passed to getSample when neither ids nor attributes is given
    @param seed: passed to getSample together with fraction
    @param ids: passed through to select()
    @param attributes: dictionary whose values must all be lists or tuples;
                       passed through to select()
    @param invert: passed through to select()
    @param targetElementTag: tag of the elements the selection is applied to
    """
    distribution = None
    if ids is None and attributes is None:
        print >> sys.stderr, "No id-file, using pseudorandom distribution"
        targetCount = getElementCounts(input, [targetElementTag])[targetElementTag]
        distribution = getSample(targetCount, fraction, seed)
    elif attributes is not None:
        print >> sys.stderr, "Selecting subset with attributes:", attributes
        for key in attributes:
            assert isinstance(attributes[key], (list, tuple)), attributes

    counts = defaultdict(int)

    outWriter = None
    if output is not None:
        outWriter = ETUtils.ETWriter(output)
    targetElementCount = 0
    skip = False
    for event in ETUtils.ETIteratorFromObj(input, ("start", "end")):
        eventType, element = event[0], event[1]
        if eventType == "start":
            if element.tag == targetElementTag:
                # The decision made here also applies to all nested elements
                skip = select(targetElementCount, distribution, element, ids, attributes, invert)
                targetElementCount += 1
            if not skip:
                # output defaults to None, so the writer may not exist
                if outWriter is not None:
                    outWriter.begin(element)
                counts[element.tag + ":kept"] += 1
            else:
                counts[element.tag + ":removed"] += 1
        elif eventType == "end":
            if not skip and outWriter is not None:
                outWriter.end(element)
            if element.tag == targetElementTag:
                # Leaving the target element: stop skipping
                skip = False
    if outWriter is not None:
        outWriter.close()
        ETUtils.encodeNewlines(output)

    print >> sys.stderr, "Subset for " + str(input) + ": " + str(counts)
def process(input, output=None, preprocess=True, debug=False):
    """
    Run MetaMap.

    Streams the elements of input with ETUtils.ETIteratorFromObj. If output
    is given, elements are copied to an ETWriter opened on output, except
    that each existing "metamap" element (and everything nested in it) is
    left out of the copied stream and is instead written as a whole once it
    has been fully read, after being passed through convert() when
    preprocess is true.

    A temporary work directory is created for the duration of the call. It
    is removed afterwards, also when an error occurs, unless debug is true,
    in which case its path is printed to stderr.

    @param input: the source given to ETUtils.ETIteratorFromObj
    @param output: target for ETUtils.ETWriter, or None to write nothing
    @param preprocess: if true, each metamap element is replaced by the
                       result of convert(element, sentence)
    @param debug: if true, keep the work directory
    @return: the output argument, unchanged
    """
    counter = ProgressCounter(id="MetaMap")

    # Create working directory
    workdir = tempfile.mkdtemp()
    try:
        outWriter = None
        if output is not None:
            outWriter = ETUtils.ETWriter(output)

        # Loop iteratively over elements
        skip = False
        for event, element in ETUtils.ETIteratorFromObj(input, ("start", "end")):
            if event == "start": # element start message, element may not be fully read yet
                if element.tag == "sentence":
                    sentence = element
                    counter.update(1, "Processing MetaMap ("+sentence.get("id")+"): ")
                    # Run metamap for the sentence element
                elif element.tag == "metamap": # skip the metamap element to remove the original one
                    skip = True
                if not skip and outWriter is not None:
                    outWriter.begin(element)

            elif event == "end": # element is fully read in memory
                if not skip and outWriter is not None:
                    outWriter.end(element)

                if element.tag == "metamap":
                    skip = False # write elements again after this one
                    if preprocess:
                        # NOTE(review): relies on a "sentence" start event having
                        # been seen before the first metamap element -- confirm
                        element = convert(element, sentence)
                    # output defaults to None, so the writer may not exist
                    if outWriter is not None:
                        outWriter.write(element) # insert the new metamap element into the output stream

        if outWriter is not None:
            print >> sys.stderr, "Writing output to", output
            outWriter.close()
            ETUtils.encodeNewlines(output)
    finally:
        # Clean up even if processing failed, so the temp directory is not leaked
        if debug:
            print >> sys.stderr, "Work directory preserved for debugging at", workdir
        else:
            shutil.rmtree(workdir)

    return output