def recv_line(self):
    """Receive a count-prefixed group of lines from the process.

    The first line read is parsed with int() and gives the number of
    lines that follow; exactly that many further lines are then read
    with LineByLineTagger.recv_line and returned, in order, as a list.

    Returns:
        A list with one entry per announced line (empty if the
        announced count is zero or negative).

    Raises:
        ValueError: if the first line cannot be parsed as an integer
            (propagated from int()).
    """
    num_lines = int(LineByLineTagger.recv_line(self))
    # range() rather than xrange(): xrange does not exist on Python 3,
    # and the loop index itself is never needed.
    return [LineByLineTagger.recv_line(self) for _ in range(num_lines)]
def __init__(self, params):
    """Set up the tokenizer from a dict-like parameter collection.

    Calls SentenceTokenizerWrapper.__init__ with params, adds this
    class's date and roman-number patterns to self.patterns, and then
    initializes the LineByLineTagger part with the tokenizer executable
    and the I/O encoding.

    Keys read from params:
        'hunknown_basedir':  required; the executable is looked up at
                             <basedir>/bin/tokenize.
        'hunknown_conf':     optional configuration file path; when
                             absent or None, <basedir>/huntools.conf
                             is used.
        'hunknown_encoding': optional encoding name, 'iso-8859-2' when
                             not given.

    Raises:
        KeyError: if 'hunknown_basedir' is missing from params.
    """
    SentenceTokenizerWrapper.__init__(self, params)

    # Extra patterns defined on this class, added on top of whatever
    # the wrapper's initializer put into self.patterns.
    add_pattern = self.patterns.add
    add_pattern(HunknownSentenceTokenizer._datePattern)
    add_pattern(HunknownSentenceTokenizer._romanNumberPattern)

    root_dir = params['hunknown_basedir']
    executable = os.path.join(root_dir, 'bin', 'tokenize')

    # Fall back to the config file in the base directory only when the
    # caller supplied none (or supplied None explicitly).
    user_conf = params.get('hunknown_conf')
    conf_path = (os.path.join(root_dir, 'huntools.conf')
                 if user_conf is None else user_conf)

    charset = params.get('hunknown_encoding', 'iso-8859-2')
    LineByLineTagger.__init__(self, executable, charset)

    # NOTE(review): presumably consumed by LineByLineTagger as the
    # command-line arguments of the executable -- confirm.
    self.options = [conf_path]