Example #1
0
def _do_puddle(text, window, by_frequency, log_level, log_name):
    """Auxiliary function to segment"""
    # build a fresh Puddle segmenter wired to the requested logger
    segmenter = Puddle(
        window=window,
        by_frequency=by_frequency,
        log=utils.get_logger(name=log_name, level=log_level))

    # segment the text, letting the model update its lexicon as it goes
    segmented = segmenter.segment(text, update_model=True)
    return list(segmented)
Example #2
0
def _puddle(text, window, log_level=logging.ERROR, log_name='wordseg-puddle'):
    """Runs the puddle algorithm on the `text`"""
    # instantiate a fresh puddle segmenter (its lexicon starts empty)
    segmenter = _Puddle(
        window=window,
        log=utils.get_logger(name=log_name, level=log_level))

    # segment each utterance in turn, joining the tokens back with spaces
    segmented = []
    for utterance in text:
        tokens = utterance.strip().split()
        segmented.append(
            ' '.join(segmenter.update_utterance(tokens, segmented=[])))
    return segmented
Example #3
0
def _dpseg(text,
           args,
           log_level=logging.ERROR,
           log_name='wordseg-dpseg',
           binary=utils.get_binary('dpseg')):
    """Runs the dpseg binary on `text` and returns the segmented utterances.

    Parameters
    ----------
    text : sequence
        The utterances to segment, one utterance per element. Each
        element is stripped and fed to the binary on its stdin, one
        per line.
    args : str
        Extra command-line arguments forwarded verbatim to the dpseg
        binary.
    log_level : logging.Level, optional
        Level of the wrapping logger, default to ERROR.
    log_name : str, optional
        Name of the logger receiving the binary's stdout/stderr,
        default to 'wordseg-dpseg'.
    binary : str, optional
        Path to the dpseg executable.
        NOTE(review): this default is evaluated once at module import
        time, so importing the module fails if the binary cannot be
        located, even when this function is never called — confirm
        this is intended.

    Returns
    -------
    list of str
        The lines written by dpseg to its output file (utf8-decoded,
        split on newlines; may include a trailing empty string).

    Raises
    ------
    RuntimeError
        If the dpseg process exits with a non-zero return code.

    """
    log = utils.get_logger(name=log_name, level=log_level)
    # dpseg writes its result to a file given by --output-file; use a
    # temporary file that is deleted when the 'with' block exits
    with tempfile.NamedTemporaryFile() as tmp_output:
        command = '{binary} --output-file {output} {args}'.format(
            binary=binary, output=tmp_output.name, args=args)

        log.debug('running "%s"', command)

        process = subprocess.Popen(shlex.split(command),
                                   stdin=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)

        # feed the utterances on a separate thread so that writing to
        # stdin does not block the stdout/stderr draining loop below
        def writer():
            for utt in text:
                process.stdin.write((utt.strip() + '\n').encode('utf8'))
            process.stdin.close()

        thread = threading.Thread(target=writer)
        thread.start()

        # Send stdout and stderr to logger, break if EOF reached
        # NOTE(review): reading stdout and stderr line-by-line in
        # lockstep can deadlock if one pipe fills while readline is
        # blocked on the other — consider separate reader threads or
        # redirecting stderr to stdout; confirm dpseg's output volume
        # makes this safe in practice.
        while True:
            line_out = process.stdout.readline().decode('utf8')
            line_err = process.stderr.readline().decode('utf8')

            if line_out == "" and line_err == "":
                break

            if line_out != "":
                log.debug(line_out.strip())

            if line_err != "":
                log.debug(line_err.strip())

        # wait for the writer thread and the process to finish before
        # inspecting the return code
        thread.join()
        process.wait()
        if process.returncode:
            raise RuntimeError('failed with error code {}'.format(
                process.returncode))

        # rewind and read back the segmentation produced by the binary
        tmp_output.seek(0)
        return tmp_output.read().decode('utf8').split('\n')
Example #4
0
def _segment_single(parse_counter,
                    train_text,
                    grammar_file,
                    category,
                    ignore_first_parses,
                    args,
                    test_text=None,
                    tempdir=tempfile.gettempdir(),
                    log_level=logging.ERROR,
                    log_name='wordseg-ag'):
    """Executes a single run of the AG program and postprocessing

    The function returns nothing but updates the `parse_counter` with
    the parses built during the AG's iterations. It does the following
    steps:

    * create a logger to forward AG messages.

    * create a temporary directory, write train/test data files in it.

    * execute the AG program with the given grammar and arguments
      (in a subprocess, using a bash script as proxy).

    * postprocess the resulting parse trees to extract the segmented
      utterance from the raw PTB format.

    * update the `parse_counter` with the segmented utterances.


    Parameters
    ----------
    parse_counter : ParseCounter
        Count the segmented utterances obtained for each parses
    train_text : sequence
        The list of utterances to train the model on, and to segment
        if `test_text` is None.
    grammar_file : str
        The path to the grammar file to use for segmentation
    category : str
        The category to segment the text with, must be an existing
        parent in the grammar (i.e. the `segment_category` must be
        present in the left column of the grammar file), default to
        'Colloc0'.
    ignore_first_parses : int
        Ignore the first parses from the algorithm output
    args : str
        Command line options to run the AG program with, use
        'wordseg-ag --help' to have a complete list of available
        options
    test_text : sequence, optional
        If not None, the test text contains the list of utterances to
        segment on the model learned from `train_text`
    tempdir : str, optional
        A directory where to store temporary data
    log_level : logging.Level, optional
        The level of the wrapping log (must be DEBUG to display
        messages from AG, default to ERROR).
    log_name: str, optional
        The name of the logger where to send log messages, default to
        'wordseg-ag'.

    Raises
    ------
    RuntimeError
        If the AG program fails and returns an error code

    """
    log = utils.get_logger(name=log_name, level=log_level)

    # we need to write some intermediate files, so we create a
    # temporary directory. The directory and its content is
    # automatically erased when done.
    temp_dir = tempfile.mkdtemp(dir=tempdir)
    log.debug('created tempdir: %s', temp_dir)

    try:
        # setup the train text as a temp file. ylt extension is the
        # one used in the original AG implementation. TODO actually we
        # are copying train and test files for each run, this is
        # useless (maybe expose train_file and test_file as arguments
        # instead of train_text / test_text?).
        train_text = '\n'.join(utt.strip() for utt in train_text) + '\n'
        train_file = os.path.join(temp_dir, 'train.ylt')
        # use a context manager so the file is flushed and closed
        # before the subprocess reads it (the original relied on
        # CPython refcounting to close the handle)
        with codecs.open(train_file, 'w', encoding='utf8') as fp:
            fp.write(train_text)

        # setup the test text as well
        if test_text is None:
            test_file = train_file
        else:
            test_text = '\n'.join(utt.strip() for utt in test_text) + '\n'
            test_file = os.path.join(temp_dir, 'test.ylt')
            with codecs.open(test_file, 'w', encoding='utf8') as fp:
                fp.write(test_text)

        # create a file to store output (compressed PTB-format parse trees)
        output_file = os.path.join(temp_dir, 'output.gz')

        # write the call to AG in a bash script
        script_file = os.path.join(temp_dir, 'script.sh')
        command = ('cat {train} '
                   '| {bin} {grammar} {args} -u {test} -c {category}'
                   '| gzip -c > {output}'.format(train=train_file,
                                                 bin=utils.get_binary('ag'),
                                                 grammar=grammar_file,
                                                 args=args,
                                                 test=test_file,
                                                 category=category,
                                                 output=output_file))
        with codecs.open(script_file, 'w', encoding='utf8') as fp:
            fp.write(command + '\n')

        log.info('running "%s"', command)

        t1 = datetime.datetime.now()

        # run the command as a subprocess, only stderr is captured (AG
        # logs its progress there); stdout goes to the output file via
        # the shell redirection in the script
        process = subprocess.Popen(shlex.split('bash {}'.format(script_file)),
                                   stdin=None,
                                   stdout=None,
                                   stderr=subprocess.PIPE)

        # log.debug the AG messages during execution
        def stderr2log(line):
            try:
                line = line.decode('utf8')
            except AttributeError:
                line = str(line)
            # AG prefixes its messages with '# ', strip it for the log
            line = re.sub('^# ', '', line.strip())
            if line:
                log.debug(line)

        # join the command output to log (from
        # https://stackoverflow.com/questions/35488927)
        def consume_lines(pipe, consume):
            with pipe:
                # NOTE: workaround read-ahead bug
                for line in iter(pipe.readline, b''):
                    consume(line)
                consume('\n')

        # drain stderr on a thread so the pipe cannot fill up and
        # block the AG process
        stderr_thread = threading.Thread(
            target=consume_lines, args=[process.stderr, stderr2log])
        stderr_thread.start()

        process.wait()

        # make sure all stderr has been consumed and logged before
        # going on (the original never joined this thread)
        stderr_thread.join()

        t2 = datetime.datetime.now()

        # fail if AG returns an error code
        if process.returncode:
            raise RuntimeError('segmentation fails with error code {}'.format(
                process.returncode))

        log.info('segmentation done, took {}'.format(t2 - t1))

        # extract the segmented utterances from the PTB parse trees
        # and feed them to the counter
        postprocess(parse_counter, output_file, ignore_first_parses, log)

        t3 = datetime.datetime.now()
        log.info('postprocessing done, took %s', t3 - t2)

    finally:
        # always erase the temporary directory, even on failure
        shutil.rmtree(temp_dir)