Code example #1
    def __init__(self,
                 onsets,
                 vowels,
                 separator=Separator(),
                 filling_vowel=False,
                 log=utils.null_logger()):
        self.onsets = onsets
        self.vowels = vowels
        self.separator = separator
        self.log = log

        # ensure onsets and vowels are not empty
        if not isinstance(vowels, list) or not len(vowels):
            raise ValueError('invalid or empty vowels list')
        if not isinstance(onsets, list) or not len(onsets):
            raise ValueError('invalid or empty onsets list')

        # concatenation of all chars in onsets and vowels (useful to
        # detect any char during syllabification)
        self.symbols = (set(''.join(v for v in vowels)).union(
            set(''.join(o for o in onsets))))

        # if defined, ensure the silent vowel is not already used
        if filling_vowel:
            # find a silent vowel (some char not already present in
            # the symbols)
            code = 1
            while six.unichr(code) in self.symbols:
                code += 1
            self.silent = six.unichr(code)
            self.symbols.add(self.silent)
            self.vowels.append(self.silent)
        else:
            self.silent = None
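A standalone sketch of the silent-vowel search above, using the built-in chr in place of six.unichr (they are equivalent on Python 3); the symbols set here is a made-up example:

# Find the first Unicode code point whose character is not already a
# known symbol (same loop as in the constructor above).
symbols = {'a', 'e', 'i', 'o', 'u', 'b', 'k'}  # toy symbol inventory

code = 1
while chr(code) in symbols:
    code += 1
silent = chr(code)

symbols.add(silent)
print(repr(silent))  # '\x01': the first unused character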
Code example #2
File: puddle.py Project: codebyzeb/wordseg
    def __init__(self, window=2, by_frequency=False, log=utils.null_logger()):
        self._log = log
        self.window = window
        self.by_frequency = by_frequency

        self._lexicon = collections.Counter()
        self._beginning = collections.Counter()
        self._ending = collections.Counter()
Code example #3
File: dpseg.py Project: rsantana-isg/wordseg
def _dpseg_bugfix(text, boundaries, log=utils.null_logger()):
    """Ensure all folds have their first line with more than one symbol

    There is a bug in the C++ implementation of dpseg: when the first
    input line is composed of a single character (i.e. a unicode
    symbol), the program fails. To avoid this, this method ensures all
    the folds begin with at least two symbols. If this is not the
    case, the boundary position is moved to the next line containing
    at least two symbols.

    Raises
    ------
    ValueError if the bugfix is needed and cannot be applied

    Notes
    -----
    This implementation differs from the one in CDSWordSeg. In wordseg
    we modify the fold index and thus the fold size, whereas in
    CDSWordSeg we permute lines in the text (and thus in the gold
    file). In wordseg we don't want to expose the gold file at this
    level.

    """
    # we have something to fix if one of those lengths is 1
    first_len = [len(text[i]) for i in boundaries]
    if 1 not in first_len:
        log.debug('folds boundaries are OK for dpseg')
        return boundaries

    if first_len[0] == 1:
        raise ValueError('The input text\'s first line has a single symbol, '
                         'this will cause wordseg-dpseg to fail. '
                         'Please re-arrange your text manually and try again.')

    need_to_fix = [i for i, length in enumerate(first_len) if length == 1]
    log.debug('dpseg bugfix: need to fix folds {}'.format(
        [i + 1 for i in need_to_fix]))

    for i in need_to_fix:
        # find the first line of the fold with len >= 2
        index = _find_first_line_with_min_len(text[boundaries[i]:], min_len=2)

        if index is None:
            raise ValueError(
                'dpseg bugfix failed: all lines in the fold {} have len == 1'.
                format(i + 1))

        log.debug('dpseg bugfix: fixing fold {} index from {} to {}'.format(
            i + 1, boundaries[i], boundaries[i] + index))
        boundaries[i] += index

    if boundaries != sorted(set(boundaries)):
        raise ValueError(
            'dpseg bugfix failed: broke the folds order. '
            'Please re-arrange (shuffle) your text manually and try again.')

    return boundaries
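The helper _find_first_line_with_min_len used above is not part of this snippet. A minimal sketch of what it presumably does, assuming it returns the index of the first line containing at least min_len symbols (or None when there is none):

def _find_first_line_with_min_len(text, min_len=2):
    # assumed reimplementation, not the project's actual helper
    for index, line in enumerate(text):
        if len(line) >= min_len:
            return index
    return None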
Code example #4
File: dpseg.py Project: rsantana-isg/wordseg
def segment(text,
            nfolds=5,
            njobs=1,
            args='--ngram 1 --a1 0 --b1 1',
            log=utils.null_logger(),
            binary=utils.get_binary('dpseg')):
    """Run the 'dpseg' binary on `nfolds` folds"""
    # force the text to be a list of utterances
    text = list(text)

    # set of unique units (syllables or phones) present in the text
    units = set(unit for utt in text for unit in utt.split())
    log.info('%s units found in %s utterances', len(units), len(text))

    # create a unicode equivalent for each unit and convert the text
    # to that unicode version
    log.debug('converting input to unicode')
    unicode_gen = UnicodeGenerator()
    unicode_mapping = {unit: unicode_gen() for unit in units}
    unicode_text = [
        ''.join(unicode_mapping[unit] for unit in utt.split()) for utt in text
    ]

    log.debug('building %s folds', nfolds)
    fold_boundaries = _dpseg_bugfix(unicode_text,
                                    folding.boundaries(unicode_text, nfolds),
                                    log)

    folded_texts, fold_index = folding.fold(unicode_text,
                                            nfolds,
                                            fold_boundaries=fold_boundaries)

    segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)(
        joblib.delayed(_dpseg)(fold,
                               args,
                               log_level=log.getEffectiveLevel(),
                               log_name='wordseg-dpseg - fold {}'.format(n +
                                                                         1),
                               binary=binary)
        for n, fold in enumerate(folded_texts))

    log.debug('unfolding the %s folds', nfolds)
    output_text = folding.unfold(segmented_texts, fold_index)

    # convert the text back to unit level (from unicode level)
    log.debug('converting output back from unicode')
    unit_mapping = {v: k for k, v in unicode_mapping.items()}
    unit_mapping[' '] = ' '
    segmented_text = (''.join(unit_mapping[char] for char in utt)
                      for utt in output_text)

    return (utt for utt in segmented_text if utt)
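The UnicodeGenerator class used above is not shown in this snippet. A minimal stand-in consistent with its call syntax (one fresh single-character symbol per call, starting at an arbitrary code point):

import itertools


class UnicodeGenerator:
    # assumed sketch: the real UnicodeGenerator is not displayed here
    def __init__(self, start=0x4e00):
        self._codes = itertools.count(start)

    def __call__(self):
        # each call returns a distinct single-character string
        return chr(next(self._codes))


gen = UnicodeGenerator()
print(gen(), gen())  # two distinct one-character symbols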
Code example #5
File: baseline.py Project: codebyzeb/wordseg
def segment_oracle(text,
                   oracle_text,
                   oracle_separator=Separator(),
                   oracle_level='phone',
                   log=utils.null_logger()):
    """Random oracle word segmentation

    The probability of word boundary :math:`p` is estimated from an
    `oracle` text as the ratio ``nwords / (nphones or nsyllables)``,
    according to ``oracle_level``. The segmentation is then delegated
    to the segment(text, :math:`p`) method.

    Parameters
    ----------
    text : sequence of str
        The input utterances to segment, tokens are
        assumed to be space separated.
    oracle_text : sequence of str
        The text on which to estimate the probability of word
        boundary. Must be tokenized at word and at least phone or
        syllable levels (according to ``oracle_level``).
    oracle_separator : Separator, optional
        Token separation in the oracle text.
    oracle_level : str, optional
        The level to consider when estimating :math:`p`, must be
        'phone' or 'syllable', default to 'phone'.
    log : logging.Logger
        Where to send log messages

    Yields
    ------
    segmented_text : generator
        The randomly segmented utterances.

    """
    # estimate the word boundary probability in the text
    nphones = sum(
        len(list(oracle_separator.tokenize(utt, level=oracle_level)))
        for utt in oracle_text)
    nwords = sum(
        len(list(oracle_separator.tokenize(utt, level='word')))
        for utt in oracle_text)

    log.info('nwords = %s, n%ss = %s', nwords, oracle_level, nphones)
    if nwords == nphones:
        log.warning(
            'nwords==nphones. Is the oracle\'s token separation correct?')

    probability = float(nwords) / float(nphones)
    return segment(text, probability, log=log)
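As a worked instance of the ratio above: an oracle with 5 words spread over 12 phones yields p = 5/12 ≈ 0.417 (toy counts, bypassing Separator.tokenize):

# toy oracle counts (illustration only)
nwords = 5    # word tokens counted in the oracle text
nphones = 12  # phone tokens counted in the oracle text

probability = float(nwords) / float(nphones)
print(round(probability, 3))  # 0.417: P(word boundary after a phone)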
Code example #6
File: puddle.py Project: rsantana-isg/wordseg
def segment(text, window=2, nfolds=5, njobs=1, log=utils.null_logger()):
    """Returns a word segmented version of `text` using the puddle algorithm

    Parameters
    ----------
    text : sequence
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    window : int, optional
        Number of phonemes to be taken into account for boundary constraint.
    nfolds : int, optional
        The number of folds to segment the `text` on.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each other and can be computed in
        parallel. Requesting a number of jobs greater than `nfolds`
        has no effect.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated word boundaries.

    See also
    --------
    wordseg.folding.fold

    """
    # force the text to be a list of utterances
    text = list(text)

    log.debug('building %s folds', nfolds)
    folded_texts, fold_index = folding.fold(text, nfolds)

    segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)(
        joblib.delayed(_puddle)(fold,
                                window,
                                log_level=log.getEffectiveLevel(),
                                log_name='wordseg-puddle - fold {}'.format(n +
                                                                           1))
        for n, fold in enumerate(folded_texts))

    log.debug('unfolding the %s folds', nfolds)
    output_text = folding.unfold(segmented_texts, fold_index)

    return (utt for utt in output_text if utt)
Code example #7
File: dibs.py Project: manelkhe/wordseg
    def __init__(self,
                 text,
                 separator=Separator(),
                 level='phone',
                 log=utils.null_logger()):
        if level not in ('phone', 'syllable'):
            raise ValueError(
                'Unknown level {}, must be phone or syllable'.format(level))
        log.info('reading data at %s level', level)

        self.separator = separator
        self.summary = Counter()
        self.lexicon = Counter()
        self.phrase_initial = Counter()
        self.phrase_final = Counter()
        self.internal_diphones = Counter()
        self.spanning_diphones = Counter()

        nremoved = 0
        for index, utt in enumerate(text):
            # ignore empty lines (as in wordseg-prep, to have a
            # consistent behavior between the tools) and let the user
            # know how many lines we ignored
            if utt.strip() == '':
                log.debug('ignoring empty line %d', index + 1)
                nremoved += 1
            else:
                if separator.word not in utt:
                    raise ValueError(
                        'word separator "{}" not found in train text: line {}'.
                        format(separator.word, index + 1))

                self._read_utterance(utt, level)

        self.diphones = Counter(self.internal_diphones)
        for k, v in self.spanning_diphones.items():
            self.diphones.increment(k, v)

        if nremoved > 0:
            log.info('ignored %d empty lines in train text', nremoved)

        log.info('train data summary: %s', self.summary)
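The Counter used above appears to be a project-specific subclass: the standard collections.Counter has no increment method. A minimal stand-in consistent with the merge loop above:

import collections


class Counter(collections.Counter):
    # assumed sketch of the project's Counter subclass
    def increment(self, key, value=1):
        self[key] += value


internal = Counter({('a', 'b'): 3})
spanning = Counter({('a', 'b'): 1, ('b', 'c'): 2})

# merge word-spanning diphone counts into the internal ones
diphones = Counter(internal)
for k, v in spanning.items():
    diphones.increment(k, v)
print(diphones)  # Counter({('a', 'b'): 4, ('b', 'c'): 2})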
Code example #8
File: baseline.py Project: codebyzeb/wordseg
def segment(text, probability=0.5, log=utils.null_logger()):
    """Random word segmentation given a boundary probability

    Given a probability :math:`p`, the probability :math:`P(t_i)` to
    add a word boundary after each token :math:`t_i` is:

    .. math::

        P(t_i) = P(X < p), X \\sim \\mathcal{U}(0, 1).

    Parameters
    ----------
    text : sequence
        The input utterances to segment, tokens are
        assumed to be space separated.
    probability: float, optional
        The probability to append a word boundary after each token.
    log : logging.Logger
        Where to send log messages

    Yields
    ------
    segmented_text : generator
        The randomly segmented utterances.

    Raises
    ------
    ValueError
        if the probability is not a float in [0, 1].

    """
    # make sure the probability is valid
    if not isinstance(probability, float):
        raise ValueError('probability must be a float')
    if probability < 0 or probability > 1:
        raise ValueError(
            'probability must be in [0, 1], it is {}'.format(probability))

    log.info('P(word boundary) = %s', probability)
    for utt in text:
        yield ''.join(token + ' ' if random.random() < probability else token
                      for token in utt.strip().split(' '))
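A quick standalone sketch of the same boundary-sampling loop on a toy input; seeding random makes the run reproducible (the helper name random_segment is made up for this sketch):

import random

random.seed(0)  # reproducible toy run


def random_segment(text, probability=0.5):
    # same core loop as segment() above, without validation and logging
    for utt in text:
        yield ''.join(
            token + ' ' if random.random() < probability else token
            for token in utt.strip().split(' '))


print(list(random_segment(['hh ah l ow', 'w er l d'])))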
Code example #9
    def __init__(self, corpus, separator, log=utils.null_logger()):
        self.log = log

        # check the separator has a word level and possibly phone/syllable levels
        self.separator = separator
        if not self.separator.word:
            raise ValueError('word separator not defined')
        if not self.separator.phone:
            log.warning('phone separator not defined, some stats ignored')
        if not self.separator.syllable:
            log.warning('syllable separator not defined, some stats ignored')
        self.log.info('token separator is %s', self.separator)

        # force to list and ignore empty lines
        self.corpus = [
            utt for utt in (utt.strip() for utt in corpus) if len(utt)
        ]
        self.log.info('loaded %s utterances', len(self.corpus))
        if len(self.corpus) == 0:
            raise ValueError('no text to load')

        # tokenize the entire text at each defined level ('word',
        # 'syllable' and/or 'phone'). TODO this can be optimized: we are
        # tokenizing the entire text up to 3 times (implement nested
        # tokenization).
        self.tokens = {}
        for level in self.separator.levels()[::-1]:
            self.log.debug('tokenizing %s', level)
            self.tokens[level] = [
                self.separator.tokenize(utt, level, keep_boundaries=False)
                for utt in self.corpus
            ]

            ntokens = sum(len(t) for t in self.tokens[level])
            self.log.info('parsed %s %ss', ntokens, level)
            if ntokens == 0:
                raise ValueError('{}s expected but 0 parsed'.format(level))

        # estimate token frequencies
        self.unigram = {}
        for level in self.separator.levels()[::-1]:
            self.unigram[level] = self._unigram(level)
Code example #10
File: dibs.py Project: manelkhe/wordseg
    def __init__(self,
                 summary,
                 pwb=None,
                 threshold=0.5,
                 log=utils.null_logger()):
        self.summary = summary
        self.wordsep = summary.separator.word
        self.log = log
        self.diphones = Counter()

        self.pwb = pwb
        if self.pwb and (self.pwb < 0 or self.pwb > 1):
            raise ValueError('pwb must be a float in [0, 1], it is: {}'.format(
                str(self.pwb)))

        self.thresh = threshold
        if self.thresh < 0 or self.thresh > 1:
            raise ValueError(
                'threshold must be a float in [0, 1], it is: {}'.format(
                    self.thresh))

        self.init_diphones()
Code example #11
File: dibs.py Project: manelkhe/wordseg
def segment(test_text,
            trained_model,
            type='phrasal',
            threshold=0.5,
            pwb=None,
            log=utils.null_logger()):
    """Segment a corpus from a trained DiBS model

    This method is a simple wrapper on the Segmenter classes, namely
    GoldSegmenter, PhrasalSegmenter and LexicalSegmenter.

    Parameters
    ----------
    test_text : sequence of str
        The input text to segment is a sequence (list or generator) of
        utterances. Each utterance is composed of space separated
        tokens (can be phones or syllables).
    trained_model : CorpusSummary
        The trained DiBS model used for segmentation of `test_text`.
    type : str, optional
        The type of DiBS segmenter to use, must be 'gold',
        'phrasal' or 'lexical'. Default is 'phrasal'.
    threshold: float, optional
        Threshold on word boundary probabilities. If a diphone has a
        word boundary probability greater than this threshold, a word
        boundary is added. Must be in [0, 1]. The optimal threshold is
        0.5 (default).
    pwb : float, optional
        Probability of word boundary, if not specified it is estimated
        from the train text as (nwords - nlines)/(nphones - nlines).
        This option is not used in the 'gold' segmentation type. When
        defined, it must be in [0, 1].
    log : logging.Logger, optional
        The log instance where to send messages.

    Yields
    ------
    utterance : str
        The current utterance segmented (with estimated word boundaries)

    Raises
    ------
    ValueError:
        If `type` is not 'gold', 'phrasal' or 'lexical'. If
        `threshold` and `pwb` are not floats in [0, 1].

    """
    # retrieve the segmenter from the 'type' argument
    try:
        segmenter = {
            'phrasal': PhrasalSegmenter,
            'lexical': LexicalSegmenter,
            'gold': GoldSegmenter
        }[type]
    except KeyError:
        raise ValueError(
            'unknown segmenter {}, must be phrasal, lexical or gold'.format(
                type))

    # init the segmenter with the trained model
    segmenter = segmenter(trained_model, pwb=pwb, threshold=threshold, log=log)
    for utt in test_text:
        yield segmenter.segment(utt)
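A worked instance of the fallback estimate quoted in the docstring above, (nwords - nlines) / (nphones - nlines), with toy counts:

# toy train-text counts (illustration only)
nlines, nwords, nphones = 3, 7, 15

# no boundary decision is made at the end of an utterance, hence -nlines
pwb = float(nwords - nlines) / (nphones - nlines)
print(round(pwb, 3))  # 0.333: prior probability of a word boundary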
Code example #12
File: puddle.py Project: rsantana-isg/wordseg
    def __init__(self, window=2, log=utils.null_logger()):
        self.log = log
        self.window = window
        self.lexicon = collections.Counter()
        self.beginning = collections.Counter()
        self.ending = collections.Counter()
Code example #13
File: tp.py Project: rsantana-isg/wordseg
def segment(text,
            threshold='relative',
            dependency='ftp',
            log=utils.null_logger()):
    """Returns a word segmented version of `text` using the TP algorithm

    Parameters
    ----------
    text : sequence
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    threshold : str, optional
        Type of threshold to use, must be 'relative' or 'absolute'.
    dependency : str, optional
        Type of dependency measure to compute, must be 'ftp' for
        forward transitional probability, 'btp' for backward
        transitional probability or 'mi' for mutual information.
    log : logging.Logger, optional
        The logging instance where to send messages.

    Returns
    -------
    list
        The utterances from `text` with estimated words boundaries.

    Raises
    ------
    ValueError
        If `threshold` is not 'relative' or 'absolute'.
        If `dependency` is not 'ftp', 'btp' or 'mi'.

    """
    # raise on invalid threshold type
    if threshold not in ('relative', 'absolute'):
        raise ValueError(
            "invalid threshold, must be 'relative' or 'absolute', it is '{}'".
            format(threshold))

    # raise on invalid probability type
    if dependency not in ('ftp', 'btp', 'mi'):
        raise ValueError("invalid dependency measure, must be 'ftp', 'btp' "
                         "or 'mi', it is {}".format(dependency))

    log.info('running TP with %s threshold and %s dependency measure',
             threshold, dependency)

    # join all the utterances together, separated by ' UB '
    units = ' UB '.join(line.strip() for line in text).split()

    # compute and count all the unigrams and bigrams (two successive units)
    unigrams = collections.Counter(units)
    bigrams = collections.Counter(zip(units[0:-1], units[1:]))

    # compute the transitional probabilities according to the given
    # dependency measure
    if dependency == 'ftp':
        tps = {
            bigram: float(freq) / unigrams[bigram[0]]
            for bigram, freq in bigrams.items()
        }
    elif dependency == 'btp':
        tps = {
            bigram: float(freq) / unigrams[bigram[1]]
            for bigram, freq in bigrams.items()
        }
    else:  # dependency == 'mi'
        tps = {
            bigram: math.log(
                float(freq) / (unigrams[bigram[0]] * unigrams[bigram[1]]), 2)
            for bigram, freq in bigrams.items()
        }

    # segment the input given the transition probabilities
    cwords = (_threshold_relative(units, tps)
              if threshold == 'relative' else _threshold_absolute(units, tps))

    # format the segmented text for output (' UB ' -> '\n', remove
    # multiple spaces)
    segtext = ' '.join(''.join(c) for c in cwords)
    return [utt.strip() for utt in re.sub(' +', ' ', segtext).split('UB')]
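A standalone worked example of the 'ftp' branch above, on a toy unit sequence:

import collections

units = ['ba', 'bi', 'ba', 'bu', 'ba', 'bi']

unigrams = collections.Counter(units)                      # ba: 3, bi: 2, bu: 1
bigrams = collections.Counter(zip(units[:-1], units[1:]))  # (ba, bi): 2, ...

# forward transitional probability P(y|x) = count(x, y) / count(x)
ftp = {bg: float(freq) / unigrams[bg[0]] for bg, freq in bigrams.items()}
print(ftp[('ba', 'bi')])  # 0.666...: 'bi' follows 'ba' in 2 of 3 cases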
Code example #14
File: prepare.py Project: rsantana-isg/wordseg
def prepare(text,
            separator=Separator(),
            unit='phone',
            check_punctuation=True,
            tolerant=False,
            log=utils.null_logger()):
    """Prepares a text in phonological form for word segmentation

    The returned text is ready to be segmented. It consists of a
    sequence of phonological symbols (phones or syllables depending on
    `unit`) separated by spaces.

    The function removes the word separators from all the lines in
    `text` and replaces boundaries at the unit level defined by `unit`
    by a space. If `unit` is 'phone' the syllable separators are
    removed; conversely, if `unit` is 'syllable' the phone
    separators are discarded.

    Parameters
    ----------
    text : sequence
        The input text to be prepared for segmentation. Each element
        of the sequence is assumed to be a single and complete
        utterance in valid phonological form.
    separator : Separator, optional
        Token separation in the `text`
    unit : str, optional
        The unit representation level to prepare the `text` at, must
        be 'syllable' or 'phone'.
    check_punctuation : bool, optional
        When True (default), forbid any punctuation character in the
        utterance and raise ValueError if any punctuation is
        found. When False, do not check punctuation.
    tolerant : bool, optional
        If False, raise ValueError on the first format error detected
        in the `text`. If True, the badly formatted utterances are
        filtered out from the output and a warning is issued.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    prepared_text : generator
        Utterances from the `text` with separators removed, prepared
        for segmentation at a syllable or phoneme representation level
        (separated by space).

    Raises
    ------
    ValueError
        On the first format error encountered in `text` (see the
        prepare.check_utterance function), only if `tolerant` is
        False.

    """
    # raise an error if unit is not valid
    if unit not in ('phone', 'syllable'):
        raise ValueError(
            "unit must be 'phone' or 'syllable', it is '{}'".format(unit))

    # define the function that prepares the text (removing the
    # requested separators)
    if unit == 'phone':

        def func(line):
            return line.replace(separator.syllable, '')\
                       .replace(separator.word, '')
    else:  # syllable

        def func(line):
            return line.replace(separator.word, '')\
                       .replace(' ', '')\
                       .replace(separator.syllable, ' ')

    nremoved = 0
    for n, line in enumerate(text):
        try:  # force the utf8 encoding
            line = line.encode('utf8').decode().strip()
        except ValueError:  # line is already in bytes, not str
            line = line.strip()

        # ignore empty lines
        if line == '':
            log.debug('ignoring empty line %d', n + 1)
            nremoved += 1
            continue

        try:
            check_utterance(line,
                            separator,
                            check_punctuation=check_punctuation)
            yield utils.strip(func(line))
        except ValueError as err:
            if tolerant is True:
                log.info('removing line %d: "%s"', n + 1, line)
                nremoved += 1
            else:
                raise ValueError('line {}: {}'.format(n + 1, err))

    if nremoved > 0:
        log.warning('removed %d badly formatted utterances', nremoved)
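A usage sketch, assuming wordseg's documented default separators (phone=' ', syllable=';esyll', word=';eword') and import paths; the utterance is a toy line and may need adjusting to pass check_utterance:

from wordseg.separator import Separator
from wordseg.prepare import prepare

# one toy utterance tokenized at phone, syllable and word levels
line = 'hh ax ;esyll l ow ;esyll ;eword w er l d ;esyll ;eword'

# phone level: syllable and word separators are removed
print(list(prepare([line], separator=Separator(), unit='phone')))

# syllable level: phones are joined, syllables come out space-separated
print(list(prepare([line], separator=Separator(), unit='syllable')))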
Code example #15
File: tp.py Project: manelkhe/wordseg
def segment(text,
            train_text=None,
            threshold='relative',
            dependency='ftp',
            log=utils.null_logger()):
    """Returns a word segmented version of `text` using the TP algorithm

    The parameters `text` and `train_text` must be formatted as
    follows: a sequence of lines with syllable (or phoneme) boundaries
    marked by spaces and no word boundaries. Each line in the sequence
    corresponds to a single and complete utterance.

    Parameters
    ----------
    text : sequence
        The text to segment into words
    train_text : sequence, optional
        The text used to train the model on (estimation of transition
        probabilities). If not specified, the model is trained on `text`.
    threshold : str, optional
        Type of threshold to use, must be 'relative' or 'absolute'.
    dependency : str, optional
        Type of dependency measure to compute, must be 'ftp' for
        forward transitional probability, 'btp' for backward
        transitional probability or 'mi' for mutual information.
    log : logging.Logger, optional
        The logging instance where to send messages.

    Returns
    -------
    list
        The utterances from `text` with estimated word boundaries.

    Raises
    ------
    ValueError
        If `threshold` is not 'relative' or 'absolute'.
        If `dependency` is not 'ftp', 'btp' or 'mi'.

    """
    # raise on invalid threshold type
    if threshold not in ('relative', 'absolute'):
        raise ValueError(
            "invalid threshold, must be 'relative' or 'absolute', it is '{}'".
            format(threshold))

    # raise on invalid probability type
    if dependency not in ('ftp', 'btp', 'mi'):
        raise ValueError("invalid dependency measure, must be 'ftp', 'btp' "
                         "or 'mi', it is {}".format(dependency))

    log.info('running TP with %s threshold and %s dependency measure',
             threshold, dependency)

    # compute test_units and train_units
    test_units = ' UB '.join(line.strip() for line in text).split()

    if train_text is None:
        train_units = test_units
    else:
        train_units = ' UB '.join(line.strip() for line in train_text).split()

    # estimate the transition probabilities
    tps = _train(train_units, dependency)

    # segment the text using those TPs
    return _segment(test_units, tps, threshold)
Code example #16
File: ag.py Project: manelkhe/wordseg
def segment(text,
            train_text=None,
            grammar_file=None,
            category='Colloc0',
            args=DEFAULT_ARGS,
            save_grammar_to=None,
            ignore_first_parses=0,
            nruns=8,
            njobs=1,
            tempdir=tempfile.gettempdir(),
            log=utils.null_logger()):
    """Segment a text using the Adaptor Grammar algorithm

    The algorithm is run 8 times in parallel and the results are
    collapsed. We ensure the random seed is different for each run.

    Parameters
    ----------
    text : sequence of str
        The list of utterances to segment using the model
        learned from `train_text`.
    train_text : sequence, optional
        The list of utterances to train the model on. If None train the model
        directly on `text`.
    grammar_file : str, optional
        The path to the grammar file to use for segmentation. If not
        specified, a Colloc0 grammar is generated from the input text.
    category : str, optional
        The category to segment the text with, must be an existing
        parent in the grammar (i.e. the `category` must be
        present in the left column of the grammar file), default to
        'Colloc0'.
    args : str, optional
        Command line options to run the AG program with, use
        'wordseg-ag --help' to have a complete list of available
        options
    save_grammar_to : str, optional
        If defined, this is an output file where to save the grammar
        used for segmentation. This is useful to keep track of the
        grammar used when using an auto-generated one (i.e. when
        grammar_file is None).
    ignore_first_parses : int, optional
        Ignore the first parses from the algorithm output. If
        negative, keep only the last ones (e.g. -1 keeps only the last
        one, -2 the last two).
    nruns : int, optional
        Number of runs to execute and output parses to collapse. The
        default of 8 comes from the original recipe provided by M. Johnson.
    njobs : int, optional
        The number of parallel subprocesses to run
    tempdir : str, optional
        A directory where to store temporary data
    log : logging.Logger, optional
        A logger where to send log messages

    Returns
    -------
    segmented : list
        The test utterances with estimated word boundaries

    Raises
    ------
    RuntimeError
        If one of the AG subprocesses fails or returns an error code.
        If the `category` is not found in the grammar.

    """
    t1 = datetime.datetime.now()

    # if needed, force the test text from sequence to list
    test_text = text
    if not isinstance(test_text, list):
        test_text = list(test_text)
    nutts = len(test_text)
    log.info('test data: %s utterances loaded', nutts)

    if train_text is None:
        train_text = test_text
        log.info('no train data provided, will train model on test data')
    else:
        # force the train text from sequence to list
        if not isinstance(train_text, list):
            train_text = list(train_text)
        nutts = len(train_text)
    log.info('train data: %s utterances loaded', nutts)

    # display the AG algorithm parameters
    log.info('parameters are: "%s"', args)

    # setup ignore_first_parses and make sure ignore_first_parses <=
    # niterations
    if '-n' in args:
        nparses = int(re.sub(r'^.*\-n *([0-9]+).*$', r'\g<1>', args))
        if '-x' in args:
            interval = int(re.sub(r'^.*\-x *([0-9]+).*$', r'\g<1>', args))
            nparses = int(nparses / interval)
        nparses += 1  # include the initial one (independent of iterations)
    else:
        nparses = 2000 + 1  # the default value fixed in C++
    if ignore_first_parses < 0:
        ignore_first_parses = max(0, nparses + ignore_first_parses)
    if ignore_first_parses >= nparses:
        raise RuntimeError('cannot ignore {} parses (max is {})'.format(
            ignore_first_parses, nparses - 1))

    # ensure we have a different seed for all runs. If the seed is
    # specified on the command line (-r SEED) then feed SEED+i for the
    # ith run. Otherwise give each run a random seed.
    args = _setup_seed(args, nruns)
    log.info('random seeds are: %s',
             ', '.join([arg.split('-r ')[1].split(' ')[0] for arg in args]))

    # we may use a temp file to write the grammar, it is automatically
    # erased when done
    with tempfile.NamedTemporaryFile(dir=tempdir) as grammar_temp:
        # if grammar is not specified, generate a Colloc0 one from the
        # set of phones in the input text and write it in the tempfile
        if grammar_file is None:
            grammar_file = grammar_temp.name
            log.info('generating Colloc0 grammar in %s ...', grammar_file)
            # extract all the phones in both train and test data
            phones = set(p for utt in train_text for p in utt.split() if p)
            if test_text is not None:
                phones.update(
                    set(p for utt in test_text for p in utt.split() if p))
            # build the grammar from the phoneset
            grammar = build_colloc0_grammar(phones)
            with codecs.open(grammar_file, 'w', encoding='utf8') as fout:
                fout.write(grammar)

        check_grammar(grammar_file, category)
        log.info('valid grammar for level %s: %s', category, grammar_file)

        if save_grammar_to:
            log.info('saving grammar to %s', save_grammar_to)
            shutil.copyfile(grammar_file, save_grammar_to)

        # parallel runs of the AG algorithm
        log.info('running AG (%d times)...', nruns)
        parse_counter = ParseCounter(nutts)
        try:
            joblib.Parallel(n_jobs=njobs, backend="threading",
                            verbose=0)(joblib.delayed(_segment_single)(
                                parse_counter,
                                train_text,
                                grammar_file,
                                category,
                                ignore_first_parses,
                                args[n],
                                test_text=test_text,
                                log_level=log.getEffectiveLevel(),
                                tempdir=tempdir,
                                log_name='wordseg-ag - run {}'.format(n + 1))
                                       for n in range(nruns))
        except joblib.JoblibRuntimeError as err:
            raise RuntimeError(err)

        t2 = datetime.datetime.now()
        log.info('total processing time: %s', t2 - t1)
        log.info('extracting most common utterances in %d parses',
                 parse_counter.nparses)

        return parse_counter.most_common()
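The -n/-x bookkeeping above can be checked standalone; this mirrors the regex extraction and the normalization of a negative ignore_first_parses (toy args string, not a real AG invocation):

import re

args = '--ngram 1 -n 2000 -x 10'  # toy options: 2000 iterations, trace every 10

nparses = int(re.sub(r'^.*\-n *([0-9]+).*$', r'\g<1>', args))   # 2000
interval = int(re.sub(r'^.*\-x *([0-9]+).*$', r'\g<1>', args))  # 10
nparses = int(nparses / interval) + 1  # 201, counting the initial parse

ignore_first_parses = -1  # keep only the last parse
if ignore_first_parses < 0:
    ignore_first_parses = max(0, nparses + ignore_first_parses)
print(ignore_first_parses)  # 200: skip all but the last parse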
Code example #17
File: puddle.py Project: codebyzeb/wordseg
def segment(text,
            train_text=None,
            window=2,
            by_frequency=False,
            nfolds=5,
            njobs=1,
            log=utils.null_logger()):
    """Returns a word segmented version of `text` using the puddle algorithm

    Parameters
    ----------
    text : sequence of str
        A sequence of lines with syllable (or phoneme) boundaries
        marked by spaces and no word boundaries. Each line in the
        sequence corresponds to a single and complete utterance.
    train_text : sequence of str
        The list of utterances to train the model on. If None (default) the
        model is trained online during segmentation. When `train_text` is
        specified, the options `nfolds` and `njobs` are ignored.
    window : int, optional
        Number of phonemes to be taken into account for boundary constraint.
        Default to 2.
    by_frequency : bool, optional
        When True choose the word candidates by filtering them by frequency.
        Default to False.
    nfolds : int, optional
        The number of folds to segment the `text` on. This option is ignored if
        a `train_text` is provided.
    njobs : int, optional
        The number of subprocesses to run in parallel. The folds are
        independent of each other and can be computed in parallel. Requesting
        a number of jobs greater than `nfolds` has no effect. This option is
        ignored if a `train_text` is provided.
    log : logging.Logger, optional
        The logger instance where to send messages.

    Returns
    -------
    generator
        The utterances from `text` with estimated word boundaries.

    See also
    --------
    wordseg.folding.fold

    """
    # force the text to be a list of utterances
    text = list(text)

    if not train_text:
        log.info('no train data provided, will train model on test data')

        log.debug('building %s folds', nfolds)
        folded_texts, fold_index = folding.fold(text, nfolds)

        # segment the folds in parallel
        segmented_texts = joblib.Parallel(n_jobs=njobs, verbose=0)(
            joblib.delayed(_do_puddle)(fold, window, by_frequency,
                                       log.getEffectiveLevel(),
                                       f'wordseg-puddle - fold {n+1}')
            for n, fold in enumerate(folded_texts))

        log.debug('unfolding the %s folds', nfolds)
        output_text = folding.unfold(segmented_texts, fold_index)

        return (utt for utt in output_text if utt)

    # force the train text from sequence to list
    train_text = list(train_text)
    log.info('train data: %s utterances loaded', len(train_text))

    # init a puddle model and train it
    model = Puddle(window=window, by_frequency=by_frequency, log=log)
    model.train(train_text)

    # segmentation of the test text, keeping the model constant
    return (utt for utt in model.segment(text, update_model=False) if utt)
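A closing usage sketch for the train/test path above, assuming the module is importable as wordseg.algos.puddle (toy phone-level utterances):

from wordseg.algos import puddle

train = ['yu w aa n t tu si dh ax b uh k', 'dh ax d aa g iz hh ir']
test = ['si dh ax d aa g']

# train on `train`, then segment `test` keeping the model constant
segmented = puddle.segment(test, train_text=train, window=2)
print(list(segmented))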