def _do_puddle(text, window, by_frequency, log_level, log_name):
    """Auxiliary function to segment"""
    # build a dedicated logger for this run, then a fresh Puddle model
    run_log = utils.get_logger(name=log_name, level=log_level)
    segmenter = Puddle(
        window=window,
        by_frequency=by_frequency,
        log=run_log)

    # segment the text, letting the model learn as it goes
    segmented = segmenter.segment(text, update_model=True)
    return list(segmented)
def _puddle(text, window, log_level=logging.ERROR, log_name='wordseg-puddle'):
    """Runs the puddle algorithm on the `text`"""
    # create a new puddle segmenter (with an empty lexicon)
    segmenter = _Puddle(
        window=window,
        log=utils.get_logger(name=log_name, level=log_level))

    # segment each utterance in turn, collecting the space-joined result
    output = []
    for line in text:
        tokens = line.strip().split()
        words = segmenter.update_utterance(tokens, segmented=[])
        output.append(' '.join(words))
    return output
def _dpseg(text, args, log_level=logging.ERROR, log_name='wordseg-dpseg',
           binary=None):
    """Runs the dpseg binary on `text` and returns the segmented utterances

    Parameters
    ----------
    text : sequence
        The utterances to segment, one per element; each utterance is
        stripped and fed to the binary on its stdin.
    args : str
        Command-line arguments appended verbatim to the dpseg call.
    log_level : logging.Level, optional
        Level of the wrapping logger, default to logging.ERROR.
    log_name : str, optional
        Name of the logger receiving the subprocess output, default to
        'wordseg-dpseg'.
    binary : str, optional
        Path to the dpseg executable. When None (default) it is resolved
        with ``utils.get_binary('dpseg')`` at call time.

    Raises
    ------
    RuntimeError
        If the dpseg process exits with a non-zero return code.

    """
    # resolve the binary lazily: the former default evaluated
    # utils.get_binary('dpseg') in the signature, i.e. at module import
    # time, so a missing binary broke the import of the whole module
    if binary is None:
        binary = utils.get_binary('dpseg')

    log = utils.get_logger(name=log_name, level=log_level)

    # the binary writes its segmentation to a file we read back at the end
    with tempfile.NamedTemporaryFile() as tmp_output:
        command = '{binary} --output-file {output} {args}'.format(
            binary=binary, output=tmp_output.name, args=args)
        log.debug('running "%s"', command)

        process = subprocess.Popen(
            shlex.split(command),
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)

        # feed the utterances on stdin from a separate thread so we can
        # consume stdout/stderr concurrently below
        def writer():
            for utt in text:
                process.stdin.write((utt.strip() + '\n').encode('utf8'))
            process.stdin.close()

        thread = threading.Thread(target=writer)
        thread.start()

        # Send stdout and stderr to logger, break if EOF reached.
        # NOTE(review): the two readline() calls alternate in lockstep;
        # this presumably works because dpseg's output is modest -- a
        # chatty process could fill one pipe while we block on the other.
        while True:
            line_out = process.stdout.readline().decode('utf8')
            line_err = process.stderr.readline().decode('utf8')
            if line_out == "" and line_err == "":
                break
            if line_out != "":
                log.debug(line_out.strip())
            if line_err != "":
                log.debug(line_err.strip())

        thread.join()
        process.wait()

        if process.returncode:
            raise RuntimeError('failed with error code {}'.format(
                process.returncode))

        # read back the segmentation produced by the binary
        tmp_output.seek(0)
        return tmp_output.read().decode('utf8').split('\n')
def _segment_single(parse_counter, train_text, grammar_file, category,
                    ignore_first_parses, args, test_text=None,
                    tempdir=tempfile.gettempdir(),
                    log_level=logging.ERROR, log_name='wordseg-ag'):
    """Executes a single run of the AG program and postprocessing

    The function returns nothing but updates the `parse_counter` with
    the parses built during the AG's iterations. It does the following
    steps:

    * create a logger to forward AG messages.
    * create a temporary directory, write train/test data files in it.
    * execute the AG program with the given grammar and arguments (in a
      subprocess, using a bash script as proxy).
    * postprocess the resulting parse trees to extract the segmented
      utterance from the raw PTB format.
    * update the `parse_counter` with the segmented utterances.

    Parameters
    ----------
    parse_counter : ParseCounter
        Count the segmented utterances obtained for each parses
    train_text : sequence
        The list of utterances to train the model on, and to segment if
        `test_text` is None.
    grammar_file : str
        The path to the grammar file to use for segmentation
    category : str
        The category to segment the text with, must be an existing
        parent in the grammar (i.e. the `segment_category` must be
        present in the left column of the grammar file), default to
        'Colloc0'.
    ignore_first_parses : int
        Ignore the first parses from the algorithm output
    args : str
        Command line options to run the AG program with, use
        'wordseg-ag --help' to have a complete list of available options
    test_text : sequence, optional
        If not None, the test text contains the list of utterances to
        segment on the model learned from `train_text`
    tempdir : str, optional
        A directory where to store temporary data
    log_level : logging.Level, optional
        The level of the wrapping log (must be DEBUG to display messages
        from AG, default to ERROR).
    log_name: str, optional
        The name of the logger where to send log messages, default to
        'wordseg-ag'.

    Raises
    ------
    RuntimeError
        If the AG program fails and returns an error code

    """
    log = utils.get_logger(name=log_name, level=log_level)

    # we need to write some intermediate files, so we create a
    # temporary directory. The directory and its content is
    # automatically erased when done.
    temdir = tempfile.mkdtemp(dir=tempdir)
    log.debug('created tempdir: %s', temdir)

    try:
        # setup the train text as a temp file. ylt extension is the
        # one used in the original AG implementation. TODO actually we
        # are copying train and test files for each run, this is
        # useless (maybe expose train_file and test_file as arguments
        # instead of train_text / test_text?).
        train_text = '\n'.join(utt.strip() for utt in train_text) + '\n'
        train_file = os.path.join(temdir, 'train.ylt')
        # write with a context manager so the data is flushed and the
        # descriptor closed before the subprocess reads the file (the
        # former code left the file object dangling, relying on
        # refcounting to close it)
        with codecs.open(train_file, 'w', encoding='utf8') as fout:
            fout.write(train_text)

        # setup the test text as well
        if test_text is None:
            test_file = train_file
        else:
            test_text = '\n'.join(utt.strip() for utt in test_text) + '\n'
            test_file = os.path.join(temdir, 'test.ylt')
            with codecs.open(test_file, 'w', encoding='utf8') as fout:
                fout.write(test_text)

        # create a file to store output (compressed PTB-format parse trees)
        output_file = os.path.join(temdir, 'output.gz')

        # write the call to AG in a bash script
        script_file = os.path.join(temdir, 'script.sh')
        command = ('cat {train} '
                   '| {bin} {grammar} {args} -u {test} -c {category}'
                   '| gzip -c > {output}'.format(train=train_file,
                                                 bin=utils.get_binary('ag'),
                                                 grammar=grammar_file,
                                                 args=args,
                                                 test=test_file,
                                                 category=category,
                                                 output=output_file))
        with codecs.open(script_file, 'w', encoding='utf8') as fout:
            fout.write(command + '\n')

        log.info('running "%s"', command)

        t1 = datetime.datetime.now()

        # run the command as a subprocess
        process = subprocess.Popen(
            shlex.split('bash {}'.format(script_file)),
            stdin=None,
            stdout=None,
            stderr=subprocess.PIPE)

        # log.debug the AG messages during execution
        def stderr2log(line):
            try:
                line = line.decode('utf8')
            except AttributeError:
                line = str(line)
            # AG prefixes its messages with '# '
            line = re.sub('^# ', '', line.strip())
            if line:
                log.debug(line)

        # join the command output to log (from
        # https://stackoverflow.com/questions/35488927)
        def consume_lines(pipe, consume):
            with pipe:
                # NOTE: workaround read-ahead bug
                for line in iter(pipe.readline, b''):
                    consume(line)
                consume('\n')

        # keep a handle on the consumer thread and join it after the
        # process exits, so no late stderr line races with our return
        # (the former code started the thread without ever joining it)
        stderr_thread = threading.Thread(
            target=consume_lines,
            args=[process.stderr, lambda line: stderr2log(line)])
        stderr_thread.start()

        process.wait()
        stderr_thread.join()

        t2 = datetime.datetime.now()

        # fail if AG returns an error code
        if process.returncode:
            raise RuntimeError('segmentation fails with error code {}'.format(
                process.returncode))

        log.info('segmentation done, took {}'.format(t2 - t1))

        postprocess(parse_counter, output_file, ignore_first_parses, log)

        t3 = datetime.datetime.now()
        log.info('postprocessing done, took %s', t3 - t2)
    finally:
        # always remove the temporary directory and its content
        shutil.rmtree(temdir)