import sys
import math
import random
import logging
import multiprocessing
from collections import Counter
# Project-local helpers (Vocabulary, MultinomialProduct, CRP, CRPSlave,
# Message, affixes, segmentations, segmentation_mapping, mult_sample) are
# assumed to be importable from elsewhere in the repo.

def main():
    logging.basicConfig(level=logging.INFO)
    # Read the training corpus
    word_vocabulary = Vocabulary(start_stop=False)
    analyses = {}
    for line in sys.stdin:
        # each line has three tab-separated fields: word, analysis, and an
        # ignored field; the analysis joins morphemes with '+'
        word, analysis, _ = line.decode('utf8').split('\t')
        morphemes = analysis.split('+')
        if len(morphemes) not in (1, 2):
            raise ValueError('expected 1 or 2 morphemes, got %r' % analysis)
        prefix = morphemes[0]
        suffix = '' if len(morphemes) == 1 else morphemes[1]
        word_vocabulary[word]  # intern the word (Vocabulary assigns ids on lookup)
        analyses[word] = (prefix, suffix)

    # Compute all the possible prefixes, suffixes
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)

    logging.info('%d types / %d prefixes / %d suffixes',
            len(word_vocabulary), len(prefix_vocabulary), len(suffix_vocabulary))

    prefix_counts = Counter()
    suffix_counts = Counter()
    for word, (prefix, suffix) in analyses.iteritems():
        prefix_counts[prefix_vocabulary[prefix]] += 1
        suffix_counts[suffix_vocabulary[suffix]] += 1

    ## The base distribution over (prefix, suffix) pairs
    base = MultinomialProduct(len(prefix_vocabulary), 0.001, len(suffix_vocabulary), 0.001)
    ## Update the base with the observed affix counts
    base.update(prefix_counts, suffix_counts)

    print base.log_likelihood()
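
# A minimal sketch of the affixes() helper used above -- an assumption based
# on how it is called, not the repo's actual implementation: it enumerates
# every prefix/suffix split of every known word and interns the parts into
# two new vocabularies (Vocabulary interning on __getitem__ is also assumed).
def affixes(word_vocabulary):
    prefix_vocabulary = Vocabulary(start_stop=False)
    suffix_vocabulary = Vocabulary(start_stop=False)
    for word in word_vocabulary:
        for i in xrange(1, len(word) + 1):
            prefix_vocabulary[word[:i]]  # intern the non-empty prefix
            suffix_vocabulary[word[i:]]  # intern the (possibly empty) suffix
    return prefix_vocabulary, suffix_vocabulary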
class SegmentationModel(CRP):
    """SegmentationModel ~ DP(alpha, H)"""
    def __init__(self, alpha, alpha_p, alpha_s, word_vocabulary,
            prefix_vocabulary, suffix_vocabulary, collapsed):
        super(SegmentationModel, self).__init__()
        self.alpha = alpha
        self.base = MultinomialProduct(len(prefix_vocabulary), alpha_p,
                len(suffix_vocabulary), alpha_s, collapsed)
        self.word_vocabulary = word_vocabulary
        self.prefix_vocabulary = prefix_vocabulary
        self.suffix_vocabulary = suffix_vocabulary
        self.analyses = [Counter() for _ in xrange(len(word_vocabulary))]

    def segmentations(self, w):
        word = self.word_vocabulary[w]
        for prefix, suffix in segmentations(word):
            p = self.prefix_vocabulary[prefix]
            s = self.suffix_vocabulary[suffix]
            yield p, s

    def _random_table(self, k):
        """Pick a table with dish k randomly"""
        n = random.randrange(0, self.ncustomers[k])
        tables = self.tables[k]
        for i, c in enumerate(tables):
            if n < c: return i
            n -= c

    def seating_probs(self, w, initialize=False):
        """Joint probabilities of all possible (segmentation, table assignments) of word #w"""
        for p, s in self.segmentations(w):
            yield (p, s, -1), (1 if initialize
                    else self.alpha * self.base.prob(p, s))
            if (w, p, s) not in self.tables: continue
            for seat, count in enumerate(self.tables[w, p, s]):
                yield (p, s, seat), (1 if initialize else count)

    def increment(self, w, initialize=False):
        """Sample a segmentation and a table assignment for word #w"""
        # sample a table
        (p, s, seat) = mult_sample(self.seating_probs(w, initialize))
        # seat the customer at the chosen table
        if self._seat_to((w, p, s), seat):
            self.base.increment(p, s)
        # increment dish count
        self.analyses[w][p, s] += 1

    def decrement(self, w):
        """Decrement the count for a (p, s) segmentation of w"""
        # randomly choose a dish (uniformly over this word's current dishes)
        p, s = random.choice(self.analyses[w].keys())
        # randomly choose a table labeled with this dish
        seat = self._random_table((w, p, s))
        # remove customer
        if self._unseat_from((w, p, s), seat):
            self.base.decrement(p, s)
        # decrement dish count
        self.analyses[w][p, s] -= 1
        if self.analyses[w][p, s] == 0:
            del self.analyses[w][p, s]

    def resample_labels(self):
        new_analyses = [Counter() for _ in xrange(len(self.word_vocabulary))]
        new_tables = {}
        new_ncustomers = {}
        for (w, old_p, old_s), tables in self.tables.iteritems():
            for c in tables:
                # remove (old_p, old_s)
                self.base.decrement(old_p, old_s)
                # resample
                (p, s) = mult_sample(((p, s), self.base.prob(p, s))
                        for p, s in self.segmentations(w))
                # add (p, s)
                if (w, p, s) not in new_tables:
                    new_tables[w, p, s] = []
                    new_ncustomers[w, p, s] = 0
                new_tables[w, p, s].append(c)
                new_ncustomers[w, p, s] += c
                new_analyses[w][p, s] += c
                self.base.increment(p, s)
        self.analyses = new_analyses
        self.tables = new_tables
        self.ncustomers = new_ncustomers

    def decode(self, w):
        """Compute the most likely segmentation (p, s) of word #w"""
        return max(self.segmentations(w), key=lambda ps: self.base.marginal_prob(*ps))

    def log_likelihood(self):
        r"""
        LL = \frac{\prod_{t \in tables} \alpha\, p_0(l_t) \times 1 \times 2 \times \dots \times (c_t - 1)}{\prod_{k=0}^{N-1} (\alpha + k)}
           = p_{\text{base}} \times \frac{\Gamma(\alpha)}{\Gamma(\alpha + N)} \times \alpha^{|tables|} \prod_{t \in tables} (c_t - 1)!
        """
        return (math.lgamma(self.alpha)
                - math.lgamma(self.alpha + self.total_customers)
                + self.ntables * math.log(self.alpha)
                + sum(math.lgamma(c) for tables in self.tables.itervalues()
                    for c in tables)
                + self.base.log_likelihood())

    def __repr__(self):
        return ('SegmentationModel(alpha={self.alpha}, base={self.base}, '
                '#customers={self.total_customers}, #tables={self.ntables}, '
                '#dishes={V})').format(self=self, V=len(self.tables))
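
# Minimal sketches of two helpers the class above relies on -- assumptions
# inferred from their call sites, not the repo's actual code. mult_sample()
# draws one item from an iterable of (item, unnormalized weight) pairs, and
# segmentations() yields every split of a word into a non-empty prefix and a
# possibly empty suffix (matching the 1-or-2-morpheme analyses in main()).
def mult_sample(weighted_items):
    weighted_items = list(weighted_items)
    total = sum(weight for _, weight in weighted_items)
    target = random.random() * total
    for item, weight in weighted_items:
        if target < weight:
            return item
        target -= weight
    return item  # fall back to the last item in case of rounding error

def segmentations(word):
    for i in xrange(1, len(word) + 1):
        yield word[:i], word[i:]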
class ParallelSegmentationModel(object):
    def __init__(self, alpha, alpha_p, alpha_s, corpus, w_vocabulary, p_vocabulary, s_vocabulary, n_processors, n_mh, collapsed):
        self.alpha = float(alpha)
        self.base = MultinomialProduct(len(p_vocabulary), alpha_p,
                len(s_vocabulary), alpha_s, collapsed)
        self.corpus = corpus
        self.word_vocabulary = w_vocabulary
        self.prefix_vocabulary = p_vocabulary
        self.suffix_vocabulary = s_vocabulary
        self.seg_mappings = segmentation_mapping(w_vocabulary, p_vocabulary, s_vocabulary)
        self.n_processors = n_processors
        self.n_mh = n_mh

        self._slaves = []
        for gid in xrange(self.n_processors):
            iq, oq = multiprocessing.Queue(), multiprocessing.Queue()
            s = CRPSlave(self.alpha/self.n_processors, self.seg_mappings, gid, iq, oq)
            self._slaves.append((s, iq, oq))
            s.start()

    def initialize(self):
        # Initialize H
        self.base.initialize()

        # Send tokens to processors (initialize G)
        processor_indicators = [random.randrange(self.n_processors) for _ in self.corpus]
        for gid, (_, iq, _) in enumerate(self._slaves):
            words = [w for i, w in enumerate(self.corpus) if processor_indicators[i] == gid]
            iq.put(Message('init_tokens', words))

        # Update H
        self.update_base()

    def update_base(self):
        # Receive and aggregate counts
        total_p_counts = Counter()
        total_s_counts = Counter()
        for _, iq, oq in self._slaves:
            iq.put(Message('send_counts'))
            p_counts, s_counts = oq.get()
            total_p_counts += p_counts
            total_s_counts += s_counts

        # Update the base counts
        self.base.update(total_p_counts, total_s_counts)

        # Resample the base
        self.base.resample()

    def resample(self, processors=False):
        """Run the sampler for the parallelized model."""
        # Send H to slaves
        for _, iq, _ in self._slaves:
            iq.put(Message('update_base', self.base))

        # Each slave: resample CRP
        for _, iq, _ in self._slaves:
            iq.put(Message('resample'))

        # Update H
        self.update_base()

        if processors:
            self.resample_assignments()

    def mh_step(self, old_tables):
        new_tables = [[] for _ in old_tables]
        for tables in old_tables:
            for table in tables:
                pi = random.randrange(self.n_processors)
                new_tables[pi].append(table)

        new_ccs = [self._counts_of_counts(tables) for tables in new_tables]
        old_ccs = [self._counts_of_counts(tables) for tables in old_tables]
        numer = sum(math.lgamma(v+1) for ccs in new_ccs for v in ccs.itervalues())
        denom = sum(math.lgamma(v+1) for ccs in old_ccs for v in ccs.itervalues())

        # accept with probability min(1, ratio); compare in log space to avoid overflow
        log_ratio = numer - denom
        accept_prob = 1.0 if log_ratio >= 0 else math.exp(log_ratio)

        accept = random.random() < accept_prob

        return accept, (new_tables if accept else old_tables)

    def resample_assignments(self):
        # 1. Collect tables
        tables = [] # [[(dish, count), ...], ...]
        for _, iq, oq in self._slaves:
            iq.put(Message('send_tables'))
            tables.append(oq.get())

        # 2. Resample processor assignments
        mh_steps = 0.0
        mh_accepts = 0.0
        for mh_step in xrange(self.n_mh):
            accept, tables = self.mh_step(tables)
            mh_steps += 1
            mh_accepts += accept

        acceptance_rate = mh_accepts/mh_steps if mh_steps else 0.0

        # 3. Send new table assignments to slaves
        for i, (_, iq, _) in enumerate(self._slaves):
            iq.put(Message('receive_tables', tables[i]))

        # Write log-likelihood
        total_customers = sum(c for slave_tables in tables for _, c in slave_tables)
        n_tables = sum(len(slave_tables) for slave_tables in tables)
        n_dishes = len(set(dish for slave_tables in tables for dish, _ in slave_tables))

        logging.info('MH Acceptance Rate: %f', acceptance_rate)
        logging.info('LL= %.0f\tCRPLL= %.0f\tBaseLL= %.0f', *self._log_likelihood(*tables))
        logging.info(('ParallelSegmentationModel(alpha={self.alpha}, base={self.base}, '
                '#customers={total_customers}, #tables={n_tables}, '
                '#dishes={n_dishes})').format(self=self, total_customers=total_customers,
                        n_tables=n_tables, n_dishes=n_dishes))

    def _log_likelihood(self, *tables):
        tables = [t for ts in tables for t in ts]
        ntables = len(tables)
        ncustomers = sum(c for _, c in tables)
        crp_ll = (math.lgamma(self.alpha) - math.lgamma(self.alpha + ncustomers)
              + sum(math.lgamma(c) for _, c in tables)
              + ntables * math.log(self.alpha))
        base_ll = self.base.log_likelihood()
        return crp_ll+base_ll, crp_ll, base_ll

    def _counts_of_counts(self, tables):
        return Counter(c for _, c in tables)

    def shutdown(self):
        """Shut down any resources used."""
        for p, iq, _ in self._slaves:
            iq.put(Message('shutdown'))
            p.join()

    def decode(self, word):
        analyses = self.seg_mappings[self.word_vocabulary[word]]
        probs = [self.base.marginal_prob(*a) for a in analyses]
        _, (p, s) = max(zip(probs, analyses))
        return self.prefix_vocabulary[p], self.suffix_vocabulary[s]
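
# A plausible Message container for the worker queues above -- an assumption;
# the original class is defined elsewhere in the repo. Call sites pass a tag
# and an optional payload, e.g. Message('init_tokens', words).
class Message(object):
    def __init__(self, name, data=None):
        self.name = name
        self.data = data

    def __repr__(self):
        return 'Message(%r, %r)' % (self.name, self.data)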