def preprocess(tree, delexicalize=True):
    """Normalize a parse tree according to the module-level preprocessing flags.

    Each step is applied only when its configuration flag (a name defined
    outside this function, e.g. `convert_PRT_to_ADVP`) is true.  The steps
    run in a fixed order because some of them depend on earlier ones:

      1. Relabel 'PRT' internal nodes as 'ADVP'.
      2. Prune quotation-mark labels.
      3. Remove the leftmost and rightmost punctuation.
      4. Raise punctuation.
      5. Remove unary productions to self.
      6. Add basal NPs (after step 5).
      7. Apply the S transform (after step 4), with sanity checks.
      8. Apply the SPRIME transform (after step 4).
      9. Lowercase every leaf headword.
     10. Replace leaf headwords missing from the vocabulary by '*rare*'.
     11. Refresh the tree.

    Parameters:
        tree: the parse tree to process.  It must provide the methods used
            below (`internal_nodes`, `leaves`, `prune_labels`, ...); some
            steps modify it in place, others rebind it to a new value.
        delexicalize: when true, headwords that are not keys of
            `vocab.vocab_to_idx` are replaced by '*rare*'.

    Returns:
        The value of `refresh(tree)` after all enabled steps have run.

    Raises:
        AssertionError: if the S or SPRIME transform is enabled together
            with `add_basal_nps`, or if an S-transform sanity check fails.
    """
    if convert_PRT_to_ADVP:
        for node in tree.internal_nodes():
            if node.label == 'PRT':
                node.label = 'ADVP'

    if remove_quotation_marks:
        tree.prune_labels(["``", "''"])

    if remove_outermost_punctuation:
        tree.remove_leftmost_punctuation()
        tree.remove_rightmost_punctuation()

    if raise_punctuation:
        tree.raise_punctuation()

    # Noise that can be removed reliably should be removed as early as possible.
    if remove_unary_projections_to_self:
        tree.remove_productions_to_self()

    # This should occur *after* we remove unary projections to self.
    if add_basal_nps:
        tree = basenp.transform(tree)

    # This transform must occur *after* we raise punctuation.
    if use_S_transform:
        assert not add_basal_nps
        tree = S_transform(tree)

        # Sanity check: an S node that starts with an NP and ends with a VP
        # must not have any further NP or VP among its interior children.
        for node in tree.internal_nodes():
            if node.label != "S" or len(node.children) < 2:
                continue
            first, last = node.children[0], node.children[-1]
            if first.label == "NP" and last.label == "VP":
                interior = node.children[1:-1]
                assert not [c for c in interior if c.label in ("NP", "VP")]

        # Sanity check that the transform is stable: applying it a second
        # time must leave the string form of the tree unchanged.
        treestr = tree.to_string()
        tree = S_transform(tree)
        assert tree.to_string() == treestr

    # This transform must occur *after* we raise punctuation.
    if use_SPRIME_transform:
        assert not add_basal_nps
        tree = SPRIME_transform(tree)
        # NOTE: unlike the S transform, no stability re-check is run here;
        # the corresponding check was commented out in the original code.

    if lowercase_vocabulary:
        for leaf in tree.leaves():
            # Use the str method rather than the deprecated string.lower(),
            # which no longer exists in Python 3.
            leaf.headword = leaf.headword.lower()

    if delexicalize:
        # Delexicalize infrequent words (words not in the vocabulary).
        # NOTE(review): vocab.read() runs on every call; presumably it is
        # cheap after the first load -- confirm against the vocab module.
        vocab.read()
        for leaf in tree.leaves():
            if leaf.headword not in vocab.vocab_to_idx:
                leaf.headword = '*rare*'

    # NOTE(review): the original indentation was lost; refresh is assumed to
    # run unconditionally at the end rather than only when delexicalizing.
    tree = refresh(tree)
    return tree
# 12. Output the tree. # # # $Id: preprocess.py 1657 2006-06-04 03:03:05Z turian $ # # ####################################################################### # Copyright (c) 2004-2006, New York University. All rights reserved ####################################################################### from variables import * import parsetree import vocab vocab.read() import sys if mysys == "Linux" and not profile: try: debug(1, "Linux detected... using psyco") import psyco psyco.full(memory=4096) # psyco.full() # psyco.log() # psyco.full(memory=128) # psyco.profile(0.05, memory=1024) # psyco.profile(0.2)