def extract_feature_utterances(filenames, feature, speaker=None, cutoff=0):
    """Collect per-utterance feature lists from one or more corpus files.

    Every entry of ``filenames`` is parsed with ``MorParser`` and the results
    are consumed as one continuous stream. Each parsed item is indexed as a
    pair: element 0 is compared against ``speaker`` and element 1 is the
    sequence of tokens of that utterance.

    Args:
        filenames: Iterable of inputs, each handed to ``MorParser.parse``.
        feature: ``"pos"`` to map every token through ``rewriter``, or
            ``"word"`` to take every token's ``word`` attribute.
        speaker: Keep only utterances whose speaker equals this value;
            ``None`` (the default) keeps utterances from all speakers.
        cutoff: Minimum number of tokens an utterance needs to be kept.

    Returns:
        A list holding, for each kept utterance, the list of features
        extracted from its tokens.

    Raises:
        ValueError: If ``feature`` is neither ``"pos"`` nor ``"word"``.
    """
    # Validate first: an unknown feature used to leave the extractor unbound,
    # which only surfaced as an unbound-name error once a matching utterance
    # was processed (and went unnoticed when nothing matched).
    if feature == "pos":
        extract = rewriter
    elif feature == "word":
        def extract(token):
            return token.word
    else:
        raise ValueError(
            "unknown feature {!r}; expected 'pos' or 'word'".format(feature)
        )

    parser = MorParser("{http://www.talkbank.org/ns/talkbank}")
    # from_iterable pulls one file at a time instead of calling parse() on
    # every file up front, as argument unpacking with * would.
    corpus = itertools.chain.from_iterable(
        parser.parse(name) for name in filenames
    )

    return [
        [extract(token) for token in utterance[1]]
        for utterance in corpus
        if (speaker is None or utterance[0] == speaker)
        and len(utterance[1]) >= cutoff
    ]
def xml_to_tagfile(filename):
    """Lazily yield ``(speaker, rewritten_tokens)`` for each parsed utterance.

    ``filename`` is passed to ``MorParser.parse``; every item it produces is
    unpacked as a ``(speaker, tokens)`` pair. The speaker is passed through
    unchanged and each token is replaced by the result of ``rewriter``.

    This is a generator: nothing is parsed until iteration starts.
    """
    talkbank_parser = MorParser("{http://www.talkbank.org/ns/talkbank}")
    for who, utterance_tokens in talkbank_parser.parse(filename):
        rewritten = []
        for token in utterance_tokens:
            rewritten.append(rewriter(token))
        yield who, rewritten