Example 1
    def train(labeled_featuresets, estimator=ELEProbDist):  # ELEProbDist is a class; the class itself is passed in as the estimator
        """
        :param labeled_featuresets: A list of classified featuresets,
            i.e., a list of tuples ``(featureset, label)``.
        """
        label_freqdist = FreqDist()
        feature_freqdist = defaultdict(FreqDist)    # dict whose values are FreqDist objects
        feature_values = defaultdict(set)           # dict whose values are sets
        fnames = set()

        # Count up how many times each feature value occurred, given
        # the label and featurename.
        for featureset, label in labeled_featuresets:    # raw input: a list of (feature dict, label) tuples
            label_freqdist.inc(label)
            for fname, fval in featureset.items():
                # Increment freq(fval|label, fname)
                feature_freqdist[label, fname].inc(fval)    # featureset is a dict; feature_freqdist[label, fname] is a FreqDist counting how often each value of this feature occurs with this label
                # Record that fname can take the value fval.

                feature_values[fname].add(fval)             # feature_values is a dict whose values are sets
                # Keep a list of all feature names.
                fnames.add(fname)

        # If a feature didn't have a value given for an instance, then
        # we assume that it gets the implicit value 'None.'  This loop
        # counts up the number of 'missing' feature values for each
        # (label,fname) pair, and increments the count of the fval
        # 'None' by that amount.
        for label in label_freqdist:
            num_samples = label_freqdist[label]        # total number of training samples with this label
            for fname in fnames:
                count = feature_freqdist[label, fname].N()    # FreqDist.N(): sum of all counts, i.e. how many times fname was observed with this label
                feature_freqdist[label, fname].inc(None, num_samples-count)    # FreqDist.inc(key, n): add n to key's count (n defaults to 1)
                # each feature value's probability for a label is estimated against that label's total sample count

                feature_values[fname].add(None)            # add None to the set of possible values of every feature

        # Create the P(label) distribution
        label_probdist = estimator(label_freqdist)        # default smoothing: gamma=0.5, bins=len(label_freqdist)

        # Create the P(fval|label, fname) distribution
        feature_probdist = {}
        for ((label, fname), freqdist) in feature_freqdist.items():
            probdist = estimator(freqdist, bins=len(feature_values[fname]))    # estimator is the smoothing class passed in, e.g. LidstoneProbDist
            feature_probdist[label,fname] = probdist

        return NaiveBayesClassifier(label_probdist, feature_probdist)
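
A minimal usage sketch for the training routine above, assuming the NLTK-style NaiveBayesClassifier API (the snippet uses the older FreqDist.inc interface, but train/classify are invoked the same way); the toy features and labels below are made up for illustration:

from nltk.classify import NaiveBayesClassifier

# Toy labeled featuresets: a list of (feature dict, label) pairs.
train_set = [
    ({'outlook': 'sunny', 'windy': False}, 'no'),
    ({'outlook': 'sunny', 'windy': True}, 'no'),
    ({'outlook': 'overcast'}, 'yes'),   # 'windy' missing -> counted as the implicit value None above
    ({'outlook': 'rainy', 'windy': False}, 'yes'),
]

classifier = NaiveBayesClassifier.train(train_set)
print(classifier.classify({'outlook': 'overcast'}))   # 'yes' for this toy data
classifier.show_most_informative_features(2)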
Example 2
 def __init__(self,
              mbdp=False,
              subseq_counts=None,
              witten_bell=False,
              bad_score=0):
     '''
         Initializes any counts to their default values, if necessary
         @param mbdp: Use MBDP-1 score adjustments when calculating word scores.
         @type mbdp: L{bool}
         @param subseq_counts: A frequency distribution for storing subsequence counts. Should use the same one for all L{Cues}
                               of the current L{Segmenter}.
         @type subseq_counts: L{FreqDist}
         @param witten_bell: Use Witten-Bell smoothing (like Venkataraman's model) for familiar word scores. This also multiplies
                             sub-word scores by Witten-Bell normalizing factor. This is ignored in no lexicon mode.
         @type witten_bell: L{bool}
     '''
     super(FamiliarWordCue, self).__init__(Fraction(0),
                                           subseq_counts=subseq_counts)
     self._phonotactic = False
     self._lexicon = FreqDist(counttype=Fraction)
     self._mbdp = mbdp
     self._witten_bell = witten_bell
     self._bad_score = bad_score
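
The cue keeps its counts and scores as Fractions rather than floats, so repeated probability arithmetic stays exact. A quick standard-library illustration of why that matters:

from fractions import Fraction

# Floats accumulate rounding error; Fractions do not.
print(0.1 + 0.2 == 0.3)                                       # False
print(Fraction(1, 10) + Fraction(2, 10) == Fraction(3, 10))   # True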
Example 3
 def __init__(self, mbdp=False, subseq_counts=None, witten_bell=False, bad_score=0):
     '''
         Initializes any counts to their default values, if necessary
         @param mbdp: Use MBDP-1 score adjustments when calculating word scores.
         @type mbdp: L{bool}
         @param subseq_counts: A frequency distribution for storing subsequence counts. Should use the same one for all L{Cues}
                               of the current L{Segmenter}.
         @type subseq_counts: L{FreqDist}
         @param witten_bell: Use Witten-Bell smoothing (like Venkataraman's model) for familiar word scores. This also multiplies
                             sub-word scores by Witten-Bell normalizing factor. This is ignored in no lexicon mode.
         @type witten_bell: L{bool}
     '''
     super(FamiliarWordCue, self).__init__(Fraction(0), subseq_counts=subseq_counts)
     self._phonotactic = False
     self._lexicon = FreqDist(counttype=Fraction)
     self._mbdp = mbdp
     self._witten_bell = witten_bell
     self._bad_score = bad_score
Example 4
    def train_supervised(self, seq, model):
        """
        Fits the HMM parameters, namely transition, emission and prior
        probabilities, while also initializing the list of states and
        emissions. Does this by using MLE estimates for all probabilities
        based on the labeled sequence that is passed to this method.
        Parameter estimates are then saved to the inputted model instance,
        via its '_set_parameters()' method.

        Args:
            seq (list): list of lists of (state, emission) tuple pairs
            model (HiddenMarkovModelTagger): the model instance to train
        """

        # Unpack the sentences and separate them into a tag
        # sequence (tags will be used to initialize priors)
        tags = [pair[_TAG] for sent in seq for pair in sent]

        transitions = CFD()
        emissions = CFD()
        states = set()
        symbols = set()

        # Train the conditional distributions by iterating through the
        # pairs and counting (state, emission) and (state_i, state_i+1)
        for sent in seq:
            n = len(sent)
            for i, pair in enumerate(sent):
                state, symbol = pair[_TAG], pair[_TEXT]
                if i < n - 1:
                    transitions[state][sent[i + 1][_TAG]] += 1
                emissions[state][symbol] += 1
                states.add(state)
                symbols.add(symbol)

        # Save the trained parameters to the model instance and wrap the
        # conditional frequencies with the ConditionalProbDist class
        model._set_parameters(transitions=CPD(transitions, MLEProbDist),
                              emissions=CPD(emissions, MLEProbDist),
                              priors=MLEProbDist(FreqDist(tags)),
                              states=list(states),
                              symbols=list(symbols))
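
A self-contained sketch of the same counting scheme, assuming CFD/CPD alias NLTK's ConditionalFreqDist/ConditionalProbDist and that _TEXT, _TAG = 0, 1 index (word, tag) pairs; the toy corpus is made up:

from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              FreqDist, MLEProbDist)

_TEXT, _TAG = 0, 1   # assumed layout of each (word, tag) pair

seq = [[('the', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')],
       [('the', 'DT'), ('cat', 'NN'), ('sleeps', 'VBZ')]]

transitions = ConditionalFreqDist()
emissions = ConditionalFreqDist()
for sent in seq:
    for i, pair in enumerate(sent):
        state, symbol = pair[_TAG], pair[_TEXT]
        if i < len(sent) - 1:
            transitions[state][sent[i + 1][_TAG]] += 1   # count state_i -> state_i+1
        emissions[state][symbol] += 1                    # count (state, emission)

priors = MLEProbDist(FreqDist(pair[_TAG] for sent in seq for pair in sent))
trans_pd = ConditionalProbDist(transitions, MLEProbDist)

print(priors.prob('DT'))           # 2/6 of all tags are DT
print(trans_pd['DT'].prob('NN'))   # 1.0: every DT is followed by NN in this toy data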
Example 5
def main():
    '''
    Main Program
    '''
    # Parse command-line arguments
    parser = argparse.ArgumentParser(
        description='PHOCUS is a word segmentation system.',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "corpus",
        help=
        "File to segment (any existing occurrences of word delimiter will be removed before segmenting).",
        type=argparse.FileType('r'),
        default='-')
    parser.add_argument(
        "-bs",
        "--badScore",
        help="Score assigned when word length is less than window size.",
        type=Fraction,
        default="0.0")
    parser.add_argument("-d",
                        "--diphthongs",
                        help="Comma-delimited list of possible diphthongs.",
                        type=csv_list,
                        default='9I,OI,9U')
    parser.add_argument(
        "-df",
        "--decayFactor",
        help="Exponent used to calculate memory decay (0 = no decay).",
        type=Fraction,
        default="0.0")
    parser.add_argument("-fc",
                        "--featureChart",
                        help="Feature chart file",
                        default=os.path.join(
                            os.path.dirname(os.path.realpath(__file__)), '..',
                            'corpora', 'br.tab'))
    parser.add_argument("-fo",
                        "--featureNgramsOut",
                        help="File to dump final feature n-gram counts to",
                        type=argparse.FileType('w'))
    parser.add_argument("-fw",
                        "--featureWindow",
                        help="Window size for feature n-grams",
                        type=int)
    parser.add_argument(
        "-gp",
        "--goldPhonotactics",
        help=
        "Calculate phoneme n-gram scores based on their true frequencies in the gold corpus.",
        action="store_true")
    parser.add_argument(
        "-hp",
        "--hypotheticalPhonotactics",
        help=
        "When evaluating hypothetical words' well-formedness, increment counts of all n-grams within "
        + "proposed word.",
        action="store_true")
    parser.add_argument(
        "-iw",
        "--ignoreWordBoundary",
        help=
        "When calculating phoneme/syllable/etc. n-gram scores, do not include word boundary.",
        action="store_true")
    parser.add_argument(
        "-ic",
        "--initialCount",
        help=
        "Count assigned to phonotactic largest n-grams before they are seen.",
        type=Fraction,
        default="0.0001")
    parser.add_argument(
        "-is",
        "--initializeSyllables",
        help=
        "Initialize syllable n-gram by finding all syllables in gold corpus and setting their counts to one "
        + "in advance.",
        action="store_true")
    parser.add_argument(
        "-i",
        "--interactive",
        help=
        "After reading in corpus, user can specify an utterance number to segment up to, and query scores "
        + "for possible segmentations.",
        action="store_true")
    parser.add_argument("-kb",
                        "--katzBackoff",
                        help="For all n-gram models, use Katz back-off.",
                        action="store_true")
    parser.add_argument("-jp",
                        "--jointProbability",
                        help="Use joint probabilities instead of conditional",
                        action="store_true")
    parser.add_argument("-lo",
                        "--lexiconOut",
                        help="File to dump final lexicon to",
                        type=argparse.FileType('w'))
    parser.add_argument(
        "-ln",
        "--lineNumbers",
        help="Display line numbers before each segmented utterance",
        action="store_true")
    parser.add_argument(
        "-mb",
        "--mbdp",
        help=
        "Use MBDP-1 (Brent 1999) phoneme and word scores functions.  Implies --initialCount = 0. Should "
        + "also enable --hypotheticalPhonotactics for true MBDP-1.",
        action="store_true")
    parser.add_argument(
        "-nb",
        "--nBest",
        help=
        "Number of segmentations to update evidence for for each sentence.",
        type=int,
        default=1)
    parser.add_argument(
        "-nl",
        "--noLexicon",
        help=
        "Only score words based on the phonotactics, and don't do 'familiar word' spotting.  Does NOT entail "
        + "--tokenPhonotactics.",
        action="store_true")
    parser.add_argument("-pn",
                        "--phonemeNgramsOut",
                        help="File to dump final phoneme n-gram counts to",
                        type=argparse.FileType('w'))
    parser.add_argument(
        "-pc",
        "--piecewiseCountsOut",
        help="File to dump final strictly two-piecewise counts to",
        type=argparse.FileType('w'))
    parser.add_argument("-pw",
                        "--phonemeWindow",
                        help="Window size for phoneme n-grams",
                        type=int,
                        default=1)
    parser.add_argument(
        "-pu",
        "--printUtteranceDelimiter",
        help="Print utterance delimiter at the end of each utterance",
        action="store_true")
    parser.add_argument(
        "-rs",
        "--requireSyllabic",
        help=
        "Require each proposed word to contain at least one syllabic sound. " +
        "(Requires --featureChart that includes 'syllabic' as feature)",
        action="store_true")
    parser.add_argument(
        "-sp",
        "--scorePiecewise",
        help=
        "Score potential words based on their Strictly 2-Piecewise factors " +
        "(i.e., long distance pairs for vowel and consonantal harmony).",
        action="store_true")
    parser.add_argument(
        "-st",
        "--stabilityThreshold",
        help=
        "When --waitForStablePhonemeDist is enabled, all the ratio between all phoneme counts when they are "
        +
        "updated must be greater than stabilityThreshold before model will start segmenting.",
        type=Fraction,
        default="0.99")
    parser.add_argument(
        "-sd",
        "--subseqDenominator",
        help=
        "For all scores, calculate probability sequence of characters is in a word, rather than probability "
        + "of them occuring in corpus.",
        action="store_true")
    parser.add_argument(
        "-sf",
        "--supervisedFor",
        help=
        "Number of utterances to use given word-boundaries for (0 = unsupervised).",
        type=int)
    parser.add_argument(
        "-su",
        "--supervisedUpdating",
        help=
        "When doing supervised segmenting with the supervisedFor flag, resume learning process after "
        + "supervised portion of corpus.",
        action="store_true")
    parser.add_argument("-so",
                        "--syllableNgramsOut",
                        help="File to dump final syllable n-gram counts to",
                        type=argparse.FileType('w'))
    parser.add_argument(
        "-sw",
        "--syllableWindow",
        help=
        "Window size for syllable n-grams (Note: does not entail --initialSyllables)",
        type=int,
        default=0)
    parser.add_argument(
        "-tp",
        "--tokenPhonotactics",
        help=
        "Update phoneme n-gram counts once per word occurrence, instead of per word type.",
        action="store_true")
    parser.add_argument(
        "-up",
        "--uniformPhonotactics",
        help=
        "Never update phonotactic n-gram counts.  Just use initial uniform distribution throughout.",
        action="store_true")
    parser.add_argument(
        "-ud",
        "--utteranceDelimiter",
        help=
        "Utterance delimiter (Note: Utterances are always assumed to be one-per-line, this delimiter is "
        + "the symbol used when calculating n-grams at utterance boundaries.",
        default="$")
    parser.add_argument(
        "-ul",
        "--utteranceLimit",
        help=
        "Number of utterances in input corpus to process (0 = process all).",
        type=int,
        default=0)
    parser.add_argument(
        "-v",
        "--verbose",
        help=
        "Print out scores for each possible segmentation of each utterance.",
        action="store_true")
    parser.add_argument(
        "-wf",
        "--waitForStablePhonemeDist",
        help=
        "Do not start attempting to segment until phoneme unigram has stabilized.",
        action="store_true")
    parser.add_argument(
        "-wu",
        "--waitUntilUtterance",
        help=
        "Do not start attempting to segment until we have reached the specified utterance number.",
        type=int)
    parser.add_argument(
        "-ws",
        "--weightedSum",
        help=
        "Instead of using back-off model for score combination, use weighted sum with all cues "
        + "weighted equally.",
        action="store_true")
    parser.add_argument(
        "-wb",
        "--wittenBell",
        help=
        "Use Witten-Bell smoothing (like Venkataraman's model) for familiar word scores. This also multiplies sub-word scores "
        "by Witten-Bell normalizing factor. This is ignored in no lexicon mode.",
        action="store_true")
    parser.add_argument("-wd",
                        "--wordDelimiter",
                        help="Word delimiter (default: '%(default)s')",
                        default=" ")
    args = parser.parse_args()

    if args.mbdp:
        args.initialCount = Fraction(0)

    feature_chart = PhonologicalFeatureChartReader(
        *os.path.split(args.featureChart))
    subseq_counts = FreqDist(
        counttype=Fraction) if args.subseqDenominator else None

    # Put cue list together
    cues = [FamiliarWordCue(
        subseq_counts=subseq_counts)] if not args.noLexicon else []
    # Syllables
    if args.syllableWindow:
        corpus_name = args.corpus.name
        true_words = set(args.corpus.read().replace(
            '\n', args.wordDelimiter).split(args.wordDelimiter))
        true_words.remove('')
        args.corpus.close()
        args.corpus = open(corpus_name)
        cues.append(
            SyllableNgramCue(
                args.syllableWindow,
                args.initialCount,
                true_words,
                feature_chart,
                hypothetical_phonotactics=args.hypotheticalPhonotactics,
                subseq_counts=subseq_counts,
                diphthongs=args.diphthongs,
                backoff=args.katzBackoff))
    # Phonemes
    if args.phonemeWindow:
        cues.append(
            PhonemeNgramCue(
                args.phonemeWindow,
                args.initialCount,
                len([
                    phone
                    for phone in feature_chart.phones_to_features.viewkeys()
                    if len(phone) == 1
                ]),
                hypothetical_phonotactics=args.hypotheticalPhonotactics,
                backoff=args.katzBackoff))

    if args.interactive:
        utterance_limit = int(raw_input("Utterance number to process to: "))
        args.utteranceLimit = -1 if utterance_limit == 0 else utterance_limit

    segmenter = Segmenter(
        cues,
        partial(backoff_combiner
                if not args.weightedSum else weighted_sum_combiner,
                verbose=args.verbose,
                witten_bell=args.wittenBell),
        feature_chart,
        word_delimiter=args.wordDelimiter,
        utterance_limit=args.utteranceLimit,
        supervised=args.supervisedFor,
        wait_until_utterance=args.waitUntilUtterance,
        wait_for_stable_phoneme_dist=args.waitForStablePhonemeDist,
        output_channel=sys.stdout,
        semi_supervised_updating=args.supervisedUpdating,
        uniform_phonotactics=args.uniformPhonotactics,
        display_line_numbers=args.lineNumbers,
        print_utterance_delimiter=args.printUtteranceDelimiter,
        utterance_delimiter=args.utteranceDelimiter,
        nbest_window=args.nBest)
    segmenter.incremental_processor(args.corpus)

    if args.interactive:
        import readline  # pylint: disable=W0611
        segmenter.cmdloop()

    return segmenter
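
A side note on the many type=Fraction flags above: fractions.Fraction accepts both decimal and ratio strings, and argparse runs string defaults through the same type converter, which is why defaults such as "0.0001" are written as strings. A minimal sketch (standalone parser, not the one in main):

import argparse
from fractions import Fraction

parser = argparse.ArgumentParser()
# Same pattern as --initialCount above: a string default converted by type=Fraction.
parser.add_argument("--initialCount", type=Fraction, default="0.0001")

print(parser.parse_args([]).initialCount)                          # Fraction(1, 10000)
print(parser.parse_args(["--initialCount", "1/3"]).initialCount)   # Fraction(1, 3)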
Example 6
class FamiliarWordCue(Cue):
    '''
        Feature that scores words based on their lexical frequency.
    '''
    def __init__(self,
                 mbdp=False,
                 subseq_counts=None,
                 witten_bell=False,
                 bad_score=0):
        '''
            Initializes any counts to their default values, if necessary
            @param mbdp: Use MBDP-1 score adjustments when calculating word scores.
            @type mbdp: L{bool}
            @param subseq_counts: A frequency distribution for storing subsequence counts. Should use the same one for all L{Cues}
                                  of the current L{Segmenter}.
            @type subseq_counts: L{FreqDist}
            @param witten_bell: Use Witten-Bell smoothing (like Venkataraman's model) for familiar word scores. This also multiplies
                                sub-word scores by Witten-Bell normalizing factor. This is ignored in no lexicon mode.
            @type witten_bell: L{bool}
        '''
        super(FamiliarWordCue, self).__init__(Fraction(0),
                                              subseq_counts=subseq_counts)
        self._phonotactic = False
        self._lexicon = FreqDist(counttype=Fraction)
        self._mbdp = mbdp
        self._witten_bell = witten_bell
        self._bad_score = bad_score

    def in_lexicon(self, word):
        '''
            @return: whether or not the given word is in the lexicon.
        '''
        return word in self._lexicon

    @property
    def total_words(self):
        ''' Total number of word tokens in lexicon. '''
        return self._lexicon.N()

    def eval_word(self, word):
        '''
            @return: probability that proposed word is a word.
            @todo: Implement lexical decay.
        '''
        if word in self._lexicon:
            word_count = Fraction(self._lexicon[word])
            if not self._mbdp:
                word_types = self._lexicon.B(
                )  # Unlike OCaml version we're not adding utterance delimiter to lexicon, so no subtraction.
                raw_score = word_count / (Fraction(
                    self.subseq_counts[word]) if self.subseq_counts else (
                        Fraction(self.total_words + word_types)
                        if self._witten_bell else Fraction(self.total_words)))
            else:
                raw_score = ((word_count + Fraction(1)) /
                             (self.total_words + Fraction(1))) * ((
                                 (word_count /
                                  (word_count + Fraction(1)))**Fraction(2)))
        elif self._witten_bell:
            word_types = Fraction(
                self._lexicon.B() -
                1)  # Subtract one for initial utterance delimiter addition
            raw_score = word_types / Fraction(self.total_words + word_types)
        else:
            raw_score = self._bad_score
        return raw_score  # lexical decay stuff would need to be added here

    def dump(self, dump_file):
        for word in self._lexicon.iterkeys():
            dump_file.write(word + str(self._lexicon[word]) + '\n')
        if self._subseq_counts:
            for seq in self.subseq_counts.iterkeys():
                dump_file.write(seq + str(self.subseq_counts[seq]) + '\n')
        dump_file.close()

    def use_score(self, word):
        return self.in_lexicon(word)

    def update_evidence(self, word, increase_amount):
        self._lexicon.inc(word, increase_amount)
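
Worked out by hand, the non-MBDP branch of eval_word is a relative frequency with an optional Witten-Bell denominator. A small sketch with a plain Counter standing in for the cue's Fraction-valued lexicon (toy counts, purely illustrative):

from collections import Counter
from fractions import Fraction

lexicon = Counter({'the': 3, 'dog': 1})   # toy lexicon
tokens = sum(lexicon.values())            # total_words (N) = 4
types = len(lexicon)                      # word types (B) = 2

print(Fraction(lexicon['the'], tokens))           # 3/4: known word, no smoothing
print(Fraction(lexicon['the'], tokens + types))   # 1/2: known word with the Witten-Bell denominator N + B
print(Fraction(types, tokens + types))            # 1/3: mass reserved for unseen words under Witten-Bell
# (The real cue also subtracts one type for the initial utterance-delimiter entry in the unseen-word case.)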
Example 7
 def setUp(cls):
     cls.tokens = text
     cond_text = list(zip(tags, text))
     cls.fd = FreqDist(tags)
     cls.cfd = ConditionalFreqDist(cond_text)
     cls.cpd = ConditionalProbDist(cls.cfd, MLEProbDist)
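
Once the fixture has run, the three distributions answer frequency and MLE probability queries like the ones below (the toy tags/text lists here are assumptions; the test module defines its own):

from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              FreqDist, MLEProbDist)

tags = ['DT', 'NN', 'DT', 'NN']
text = ['the', 'dog', 'a', 'cat']

fd = FreqDist(tags)
cfd = ConditionalFreqDist(zip(tags, text))
cpd = ConditionalProbDist(cfd, MLEProbDist)

print(fd['DT'])                # 2: how often the DT tag occurs
print(cfd['DT']['the'])        # 1: 'the' seen once under DT
print(cpd['DT'].prob('the'))   # 0.5: MLE estimate of P('the' | DT)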
Example 8
class FamiliarWordCue(Cue):
    '''
        Feature that scores words based on their lexical frequency.
    '''

    def __init__(self, mbdp=False, subseq_counts=None, witten_bell=False, bad_score=0):
        '''
            Initializes any counts to their default values, if necessary
            @param mbdp: Use MBDP-1 score adjustments when calculating word scores.
            @type mbdp: L{bool}
            @param subseq_counts: A frequency distribution for storing subsequence counts. Should use the same one for all L{Cues}
                                  of the current L{Segmenter}.
            @type subseq_counts: L{FreqDist}
            @param witten_bell: Use Witten-Bell smoothing (like Venkataraman's model) for familiar word scores. This also multiplies
                                sub-word scores by Witten-Bell normalizing factor. This is ignored in no lexicon mode.
            @type witten_bell: L{bool}
        '''
        super(FamiliarWordCue, self).__init__(Fraction(0), subseq_counts=subseq_counts)
        self._phonotactic = False
        self._lexicon = FreqDist(counttype=Fraction)
        self._mbdp = mbdp
        self._witten_bell = witten_bell
        self._bad_score = bad_score

    def in_lexicon(self, word):
        '''
            @return: whether or not the given word is in the lexicon.
        '''
        return word in self._lexicon

    @property
    def total_words(self):
        ''' Total number of word tokens in lexicon. '''
        return self._lexicon.N()

    def eval_word(self, word):
        '''
            @return: probability that proposed word is a word.
            @todo: Implement lexical decay.
        '''
        if word in self._lexicon:
            word_count = Fraction(self._lexicon[word])
            if not self._mbdp:
                word_types = self._lexicon.B()  # Unlike OCaml version we're not adding utterance delimiter to lexicon, so no subtraction.
                raw_score = word_count / (Fraction(self.subseq_counts[word]) if self.subseq_counts
                                                                        else (Fraction(self.total_words + word_types) if self._witten_bell
                                                                                                                      else Fraction(self.total_words)))
            else:
                raw_score = ((word_count + Fraction(1)) / (self.total_words + Fraction(1))) * (((word_count / (word_count + Fraction(1))) ** Fraction(2)))
        elif self._witten_bell:
            word_types = Fraction(self._lexicon.B() - 1)  # Subtract one for initial utterance delimiter addition
            raw_score = word_types / Fraction(self.total_words + word_types)
        else:
            raw_score = self._bad_score
        return raw_score  # lexical decay stuff would need to be added here

    def dump(self, dump_file):
        for word in self._lexicon.iterkeys():
            dump_file.write(word + str(self._lexicon[word]) + '\n')
        if self._subseq_counts:
            for seq in self.subseq_counts.iterkeys():
                dump_file.write(seq + str(self.subseq_counts[seq]) + '\n')
        dump_file.close()

    def use_score(self, word):
        return self.in_lexicon(word)

    def update_evidence(self, word, increase_amount):
        self._lexicon.inc(word, increase_amount)