Example #1
0
def test_fold():
    """Compare `folding.fold` with hand-written results on short lists."""
    # each case is (sequence, nfolds, expected result of the fold)
    cases = [
        # one fold only
        ([1, 2, 3], 1, ([[1, 2, 3]], [0])),
        # three folds over three elements: every group has one element
        ([1, 2, 3], 3, ([[1, 2, 3], [3, 1, 2], [2, 3, 1]], [2, 2, 2])),
        # three folds over four elements: the last group is [3, 4]
        ([1, 2, 3, 4], 3,
         ([[1, 2, 3, 4], [3, 4, 1, 2], [2, 3, 4, 1]], [2, 3, 3])),
    ]

    for sequence, nfolds, expected in cases:
        assert folding.fold(sequence, nfolds) == expected
Example #2
0
def segment(text,
            nfolds=5,
            njobs=1,
            args='--ngram 1 --a1 0 --b1 1',
            log=utils.null_logger(),
            binary=utils.get_binary('dpseg')):
    """Segment `text` by running the 'dpseg' binary on `nfolds` folds

    Parameters
    ----------
    text : iterable of str
        The utterances to segment. Within an utterance the units
        (syllables or phones) are separated by whitespace.
    nfolds : int, optional
        The number of folds built from the text.
    njobs : int, optional
        Forwarded to `joblib.Parallel` as `n_jobs`.
    args : str, optional
        Passed unchanged to `_dpseg` for each fold.
    log : logging.Logger, optional
        The logger instance where to send messages.
    binary : optional
        Passed unchanged to `_dpseg` for each fold.

    Returns
    -------
    generator
        The unfolded output of `_dpseg`, converted back from the
        intermediate unicode symbols to the original units. Empty
        utterances are dropped.

    """
    # materialize the input, it is traversed more than once below
    utterances = list(text)

    # inventory of the distinct units occurring in the utterances
    units = {unit for utt in utterances for unit in utt.split()}
    log.info('%s units found in %s utterances', len(units), len(utterances))

    # associate each unit with a symbol drawn from a UnicodeGenerator,
    # then rewrite every utterance as the concatenation of its symbols
    log.debug('converting input to unicode')
    next_symbol = UnicodeGenerator()
    to_unicode = {}
    for unit in units:
        to_unicode[unit] = next_symbol()
    encoded = [
        ''.join(to_unicode[unit] for unit in utt.split())
        for utt in utterances
    ]

    # compute the fold boundaries, pass them through _dpseg_bugfix,
    # and only then build the folds from the adjusted boundaries
    log.debug('building %s folds', nfolds)
    raw_boundaries = folding.boundaries(encoded, nfolds)
    boundaries = _dpseg_bugfix(encoded, raw_boundaries, log)
    folds, index = folding.fold(encoded, nfolds, fold_boundaries=boundaries)

    # one delayed `_dpseg` call per fold, folds being numbered from 1
    # in the logger names
    runner = joblib.Parallel(n_jobs=njobs, verbose=0)
    segmented = runner(
        joblib.delayed(_dpseg)(
            fold,
            args,
            log_level=log.getEffectiveLevel(),
            log_name='wordseg-dpseg - fold {}'.format(number),
            binary=binary)
        for number, fold in enumerate(folds, start=1))

    log.debug('unfolding the %s folds', nfolds)
    unfolded = folding.unfold(segmented, index)

    # reverse the unit -> symbol mapping; spaces found in the output
    # are kept as they are
    log.debug('converting output back from unicode')
    from_unicode = {symbol: unit for unit, symbol in to_unicode.items()}
    from_unicode[' '] = ' '
    decoded = (
        ''.join(from_unicode[char] for char in utt) for utt in unfolded)

    # drop the empty utterances
    return (utt for utt in decoded if utt)
Example #3
0
def segment(text, window=2, nfolds=5, njobs=1, log=utils.null_logger()):
    """Segment `text` into words using the puddle algorithm

    The text is split into `nfolds` folds with `folding.fold`, each
    fold is processed by `_puddle` and the results are put back
    together with `folding.unfold`.

    Parameters
    ----------
    text : sequence
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    window : int, optional
        Number of phonemes to be taken into account for boundary
        constraint.
    nfolds : int, optional
        The number of folds to segment the `text` on.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each other and can be computed in
        parallel. Requesting a number of jobs greater than `nfolds`
        has no effect.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated word boundaries,
        empty utterances being dropped.

    See also
    --------
    wordseg.folding.fold

    """
    # make sure the utterances are stored in a list
    utterances = list(text)

    log.debug('building %s folds', nfolds)
    folds, index = folding.fold(utterances, nfolds)

    # one delayed `_puddle` call per fold, folds being numbered from 1
    # in the logger names
    runner = joblib.Parallel(n_jobs=njobs, verbose=0)
    segmented = runner(
        joblib.delayed(_puddle)(
            fold,
            window,
            log_level=log.getEffectiveLevel(),
            log_name='wordseg-puddle - fold {}'.format(number))
        for number, fold in enumerate(folds, start=1))

    log.debug('unfolding the %s folds', nfolds)
    unfolded = folding.unfold(segmented, index)

    # drop the empty utterances
    return (utt for utt in unfolded if utt)
Example #4
0
def test_fold_unfold_nfolds(nfolds, tags):
    """Folding `tags` then unfolding the result must give `tags` back."""
    folded_tags, fold_index = folding.fold(tags, nfolds)
    restored = folding.unfold(folded_tags, fold_index)
    assert restored == tags
Example #5
0
def test_unfold_basic(nfolds):
    """Unfolding the folds of [1, 2, 3] must give [1, 2, 3] back."""
    folded, fold_index = folding.fold([1, 2, 3], nfolds)
    restored = folding.unfold(folded, fold_index)
    assert restored == [1, 2, 3]
Example #6
0
def segment(text,
            train_text=None,
            window=2,
            by_frequency=False,
            nfolds=5,
            njobs=1,
            log=utils.null_logger()):
    """Segment `text` into words using the puddle algorithm

    Two modes are available. With a `train_text`, a single `Puddle`
    model is trained on it and then used, without further updates, to
    segment `text`. Without it, `text` is split into `nfolds` folds
    which are processed by `_do_puddle` and then unfolded.

    Parameters
    ----------
    text : sequence of str
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    train_text : sequence of str, optional
        The list of utterances to train the model on. If None (default)
        or empty, the model is trained online during segmentation. When
        `train_text` is specified, the options `nfolds` and `njobs` are
        ignored.
    window : int, optional
        Number of phonemes to be taken into account for boundary
        constraint. Default to 2.
    by_frequency : bool, optional
        When True choose the word candidates by filtering them by
        frequency. Default to False.
    nfolds : int, optional
        The number of folds to segment the `text` on. This option is
        ignored if a `train_text` is provided.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each other and can be computed in parallel.
        Requesting a number of jobs greater than `nfolds` has no
        effect. This option is ignored if a `train_text` is provided.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated word boundaries,
        empty utterances being dropped.

    See also
    --------
    wordseg.folding.fold

    """
    # make sure the utterances are stored in a list
    utterances = list(text)

    if train_text:
        # make sure the train utterances are stored in a list as well
        train_utterances = list(train_text)
        log.info('train data: %s utterances loaded', len(train_utterances))

        # build a puddle model and train it on the train utterances
        model = Puddle(window=window, by_frequency=by_frequency, log=log)
        model.train(train_utterances)

        # segment the test utterances without updating the model
        segmented = model.segment(utterances, update_model=False)
        return (utt for utt in segmented if utt)

    log.info('not train data provided, will train model on test data')

    log.debug('building %s folds', nfolds)
    folds, index = folding.fold(utterances, nfolds)

    # one delayed `_do_puddle` call per fold, folds being numbered
    # from 1 in the logger names
    runner = joblib.Parallel(n_jobs=njobs, verbose=0)
    segmented_folds = runner(
        joblib.delayed(_do_puddle)(
            fold,
            window,
            by_frequency,
            log.getEffectiveLevel(),
            f'wordseg-puddle - fold {number}')
        for number, fold in enumerate(folds, start=1))

    log.debug('unfolding the %s folds', nfolds)
    unfolded = folding.unfold(segmented_folds, index)

    # drop the empty utterances
    return (utt for utt in unfolded if utt)