def __init__(self, _model_path = paths.SEG_MODEL_PATH, _model_file = None,
             _scale_model_file = None, verbose=False, _name="segmenter",
             dependencies = False):
    """
    Set up the three collaborators used for segmentation.

    _model_path, _model_file, _scale_model_file and _name are forwarded to
    the binary libsvm SVMClassifier; verbose is forwarded to every
    collaborator; dependencies is forwarded to the SyntaxParser.
    """
    # Gather the classifier configuration in one place before building it.
    classifier_options = {
        'class_type': 'bin',
        'software': 'libsvm',
        'model_path': _model_path,
        'bin_model_file': _model_file,
        'bin_scale_model_file': _scale_model_file,
        'output_filter': float,
        'name': _name,
        'verbose': verbose,
    }

    # Construction order is kept as classifier -> feature writer -> parser.
    self.svm_classifier = SVMClassifier(**classifier_options)
    self.feature_writer = SegFeatureWriter(verbose=verbose)
    self.syntax_parser = SyntaxParser(verbose=verbose,
                                      dependencies=dependencies)
class Segmenter:
    """
    Segments lexicalized syntax trees by classifying the features extracted
    from them with a binary SVM classifier.
    """

    # Penn Treebank style escape tokens mapped to the literal characters
    # they stand for.
    penn_special_chars = {
        '-LRB-': '(',
        '-RRB-': ')',
        '-LAB-': '<',
        '-RAB-': '>',
        '-LCB-': '{',
        '-RCB-': '}',
        '-LSB-': '[',
        '-RSB-': ']',
        '\\/': '/',
        '\\*': '*',
        '``': '"',
        "''": '"',
        "`": "'",
    }

    # Leaf pattern handed to LexicalizedTree.parse (Vanessa's modification):
    # a run of non-parenthesis characters preceded by whitespace.  Written as
    # a raw string so that it does not rely on invalid escape sequences; the
    # resulting value is unchanged.
    _LEAF_PATTERN = r'(?<=\s)[^\)\(]+'

    def __init__(self, _model_path = paths.SEG_MODEL_PATH, _model_file = None,
                 _scale_model_file = None, verbose=False, _name="segmenter",
                 dependencies = False):
        """
        Set up the classifier, the feature writer and the syntax parser.

        _model_path, _model_file, _scale_model_file and _name are forwarded
        to the binary libsvm SVMClassifier; verbose is forwarded to every
        collaborator; dependencies is forwarded to the SyntaxParser.
        """
        self.svm_classifier = SVMClassifier(
            class_type='bin',
            software='libsvm',
            model_path=_model_path,
            bin_model_file=_model_file,
            bin_scale_model_file=_scale_model_file,
            output_filter=float,
            name=_name,
            verbose=verbose)
        self.feature_writer = SegFeatureWriter(verbose=verbose)
        self.syntax_parser = SyntaxParser(verbose=verbose,
                                          dependencies=dependencies)

    def create_lexicalized_tree(self, mrg, heads):
        """
        Creates a lexicalized syntax tree given a MRG-style parse and a
        Penn2Malt style heads file.

        mrg is parsed into a LexicalizedTree, which is then lexicalized with
        heads (passed as a string, hence from_string=True) and returned.
        """
        t = LexicalizedTree.parse(mrg, leaf_pattern=self._LEAF_PATTERN)
        t.lexicalize(heads, from_string=True)
        return t

    def split_by_sentence(self, text):
        """
        Takes a text and returns a list of (sentence, marker) tuples, where
        marker is the boundary token that ended the sentence: "<s>" for an
        end of sentence, "<p>" for an end of paragraph.

        Assumes that the text is pre-processed such that end of sentences
        are marked with <s>, end of paragraphs with <p>.  Newlines are
        removed before scanning and each sentence is stripped of surrounding
        whitespace.  Text after the last marker is not returned.
        """
        result = []
        text = text.replace("\n", "")

        parse_pos = 0   # current scan position
        prev_pos = 0    # start of the sentence currently being scanned
        text_len = len(text)
        while parse_pos < text_len:
            next_tok = text[parse_pos:parse_pos + 3]
            if next_tok in ("<s>", "<p>"):
                result.append((text[prev_pos:parse_pos].strip(), next_tok))
                # Skip over the marker; the next sentence starts after it.
                parse_pos += 3
                prev_pos = parse_pos
            else:
                parse_pos += 1

        return result

    def segment_tree(self, t):
        """
        Segments a text represented as a lexicalized syntax tree.

        Features are extracted from the tree by the feature writer and every
        extracted item is classified individually.  Returns the list of
        classifier outputs, in extraction order.
        """
        data_to_classify = self.feature_writer.extract_features([t])
        return [self.svm_classifier.classify(d) for d in data_to_classify]

    def get_parsed_trees_from_string(self, tree_strings):
        """
        Parses every non-blank item of tree_strings into a LexicalizedTree.

        tree_strings is iterated directly, so it must yield one tree string
        per item (for example the lines of a "\\n"-separated parse output).
        Items are stripped first and empty ones are skipped.
        NOTE: passing a single string would iterate it character by
        character; split it into lines before calling.
        """
        stripped_lines = (line.strip() for line in tree_strings)
        return [LexicalizedTree.parse(line, leaf_pattern=self._LEAF_PATTERN)
                for line in stripped_lines if line != '']

    def get_deps(self, deps_filename):
        """
        Reads a dependencies file and returns one string per sentence.

        Lines are stripped and grouped into blocks terminated by a blank
        line.  Every line of a block is prefixed with "\\n", so a non-empty
        block string starts with a newline.  Of several consecutive blank
        lines only the first one closes a block; the next one is treated as
        (empty) content of the following block.  A final block that is not
        followed by a blank line is not returned.

        Any exception raised while opening or reading the file is re-raised
        after a notice has been printed.
        """
        try:
            deps = []
            pending = []     # stripped lines of the block being collected
            started = True   # False right after a block has been closed

            # The context manager closes the file even if reading fails.
            with open(deps_filename, 'r') as dep_f:
                for line in dep_f:
                    line = line.strip()
                    if line == '' and started:
                        started = False
                        deps.append(''.join('\n' + item for item in pending))
                        pending = []
                    else:
                        started = True
                        pending.append(line)

            return deps
        except Exception:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print("*** Could not read the input file...")
            raise