import sys
import math
import random
import logging
import multiprocessing
from collections import Counter

# Project-level helpers (Vocabulary, affixes, segmentations, segmentation_mapping,
# mult_sample, MultinomialProduct, CRP, CRPSlave, Message) are assumed to be
# imported from this repository's own modules.


def main():
    logging.basicConfig(level=logging.INFO)
    # Read the training corpus
    word_vocabulary = Vocabulary(start_stop=False)
    analyses = {}
    for line in sys.stdin:
        word, analysis, _ = line.decode('utf8').split('\t')
        morphemes = analysis.split('+')
        if len(morphemes) not in (1, 2):
            raise ValueError('expected 1 or 2 morphemes, got %d in %r'
                             % (len(morphemes), analysis))
        prefix = morphemes[0]
        suffix = '' if len(morphemes) == 1 else morphemes[1]
        word_vocabulary[word]  # intern the word into the vocabulary
        analyses[word] = (prefix, suffix)
    # Compute all the possible prefixes, suffixes
    prefix_vocabulary, suffix_vocabulary = affixes(word_vocabulary)
    logging.info('%d types / %d prefixes / %d suffixes',
                 len(word_vocabulary), len(prefix_vocabulary), len(suffix_vocabulary))
    prefix_counts = Counter()
    suffix_counts = Counter()
    for word, (prefix, suffix) in analyses.iteritems():
        prefix_counts[prefix_vocabulary[prefix]] += 1
        suffix_counts[suffix_vocabulary[suffix]] += 1
    ## The base
    base = MultinomialProduct(len(prefix_vocabulary), 0.001, len(suffix_vocabulary), 0.001)
    ## Updating the counts
    base.update(prefix_counts, suffix_counts)
    print base.log_likelihood()
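
# Illustrative check of the input format main() expects: one UTF-8 line per
# word with three tab-separated fields (word, analysis, extra; the third field
# is read but ignored), where the analysis is 1-2 morphemes joined by '+'.
# This sketch only mirrors the parsing logic above; 'walked' and the third
# field are made-up values.
def _example_parse_line():
    line = u'walked\twalk+ed\t1\n'.encode('utf8')
    word, analysis, _ = line.decode('utf8').split('\t')
    morphemes = analysis.split('+')
    prefix = morphemes[0]
    suffix = '' if len(morphemes) == 1 else morphemes[1]
    assert (word, prefix, suffix) == (u'walked', u'walk', u'ed')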
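
# `segmentations` and `affixes` come from the project's own modules (see the
# note at the top of the file). For reference, a minimal sketch of the
# behavior the code below assumes: every split of a word into a non-empty
# prefix and a (possibly empty) suffix. The name is a hypothetical stand-in,
# not the project's actual implementation.
def _example_segmentations(word):
    for k in xrange(1, len(word) + 1):
        yield word[:k], word[k:]

# e.g. list(_example_segmentations(u'cats')) ==
#      [(u'c', u'ats'), (u'ca', u'ts'), (u'cat', u's'), (u'cats', u'')]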
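
# `MultinomialProduct(V_p, alpha_p, V_s, alpha_s, ...)` is the base
# distribution H over analyses: as used here, a product of two multinomials,
# one over prefixes and one over suffixes, each with a symmetric Dirichlet
# prior. A standalone sketch of the collapsed log-likelihood such a base
# would assign to a set of counts (an assumption for illustration, not the
# project's implementation; uses the module-level math import):
def _example_dirichlet_multinomial_ll(counts, V, alpha):
    """log of the Dirichlet-multinomial marginal: integral over
    theta ~ Dir(alpha) of prod_k theta_k^{n_k}"""
    n = sum(counts.itervalues())
    return (math.lgamma(V * alpha) - math.lgamma(V * alpha + n)
            + sum(math.lgamma(alpha + c) - math.lgamma(alpha)
                  for c in counts.itervalues()))

# The product base would then score prefix and suffix counts independently, e.g.
#   _example_dirichlet_multinomial_ll(prefix_counts, len(prefix_vocabulary), 0.001)
#     + _example_dirichlet_multinomial_ll(suffix_counts, len(suffix_vocabulary), 0.001)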
class SegmentationModel(CRP):
    """SegmentationModel ~ DP(alpha, H)"""

    def __init__(self, alpha, alpha_p, alpha_s, word_vocabulary,
                 prefix_vocabulary, suffix_vocabulary, collapsed):
        super(SegmentationModel, self).__init__()
        self.alpha = alpha
        self.base = MultinomialProduct(len(prefix_vocabulary), alpha_p,
                                       len(suffix_vocabulary), alpha_s, collapsed)
        self.word_vocabulary = word_vocabulary
        self.prefix_vocabulary = prefix_vocabulary
        self.suffix_vocabulary = suffix_vocabulary
        self.analyses = [Counter() for _ in xrange(len(word_vocabulary))]

    def segmentations(self, w):
        """Yield all (prefix id, suffix id) segmentations of word #w."""
        word = self.word_vocabulary[w]
        for prefix, suffix in segmentations(word):
            p = self.prefix_vocabulary[prefix]
            s = self.suffix_vocabulary[suffix]
            yield p, s

    def _random_table(self, k):
        """Pick a table with dish k randomly, weighted by its number of customers"""
        n = random.randrange(0, self.ncustomers[k])
        tables = self.tables[k]
        for i, c in enumerate(tables):
            if n < c:
                return i
            n -= c

    def seating_probs(self, w, initialize=False):
        """Joint probabilities of all possible (segmentation, table assignment) of word #w"""
        for p, s in self.segmentations(w):
            # a new table serving dish (p, s)
            yield (p, s, -1), (1 if initialize else self.alpha * self.base.prob(p, s))
            if (w, p, s) not in self.tables:
                continue
            # existing tables serving dish (p, s)
            for seat, count in enumerate(self.tables[w, p, s]):
                yield (p, s, seat), (1 if initialize else count)

    def increment(self, w, initialize=False):
        """Sample a segmentation and a table assignment for word #w"""
        # sample a table
        (p, s, seat) = mult_sample(self.seating_probs(w, initialize))
        # seat to the table
        if self._seat_to((w, p, s), seat):
            self.base.increment(p, s)  # increment dish count
        self.analyses[w][p, s] += 1

    def decrement(self, w):
        """Decrement the count for a (p, s) segmentation of w"""
        # randomly choose a dish
        n = random.randrange(0, len(self.analyses[w]))
        for i, (p, s) in enumerate(self.analyses[w]):
            if n == i:
                break
        # randomly choose a table labeled with this dish
        seat = self._random_table((w, p, s))
        # remove customer
        if self._unseat_from((w, p, s), seat):
            self.base.decrement(p, s)  # decrement dish count
        self.analyses[w][p, s] -= 1
        if self.analyses[w][p, s] == 0:
            del self.analyses[w][p, s]

    def resample_labels(self):
        """Resample the dish label of every table."""
        new_analyses = [Counter() for _ in xrange(len(self.word_vocabulary))]
        new_tables = {}
        new_ncustomers = {}
        for (w, old_p, old_s), tables in self.tables.iteritems():
            for c in tables:
                # remove (old_p, old_s)
                self.base.decrement(old_p, old_s)
                # resample
                (p, s) = mult_sample(((p, s), self.base.prob(p, s))
                                     for p, s in self.segmentations(w))
                # add (p, s)
                if (w, p, s) not in new_tables:
                    new_tables[w, p, s] = []
                    new_ncustomers[w, p, s] = 0
                new_tables[w, p, s].append(c)
                new_ncustomers[w, p, s] += c
                new_analyses[w][p, s] += c
                self.base.increment(p, s)
        self.analyses = new_analyses
        self.tables = new_tables
        self.ncustomers = new_ncustomers

    def decode(self, w):
        """Compute the most likely segmentation (p, s) of word #w"""
        return max(self.segmentations(w), key=lambda ps: self.base.marginal_prob(*ps))

    def log_likelihood(self):
        r"""
        LL = \frac{\prod_{t \in \text{tables}} \alpha \, p_0(l_t)
                   \times 1 \times 2 \times \dots \times (c_t - 1)}
                  {\prod_{k=0}^{N-1} (\alpha + k)}
           = p_{\text{base}} \times \frac{\Gamma(\alpha)}{\Gamma(\alpha + N)}
             \prod_{t \in \text{tables}} \alpha \times (c_t - 1)!

        where N is the total number of customers and c_t the number of
        customers seated at table t.
""" return (math.lgamma(self.alpha) - math.lgamma(self.alpha + self.total_customers) + self.ntables * math.log(self.alpha) + sum(math.lgamma(c) for tables in self.tables.itervalues() for c in tables) + self.base.log_likelihood()) def __repr__(self): return ('SegmentationModel(alpha={self.alpha}, base={self.base}, ' '#customers={self.total_customers}, #tables={self.ntables}, ' '#dishes={V})').format(self=self, V=len(self.tables))
class ParallelSegmentationModel(object):
    def __init__(self, alpha, alpha_p, alpha_s, corpus, w_vocabulary, p_vocabulary,
                 s_vocabulary, n_processors, n_mh, collapsed):
        self.alpha = float(alpha)
        self.base = MultinomialProduct(len(p_vocabulary), alpha_p,
                                       len(s_vocabulary), alpha_s, collapsed)
        self.corpus = corpus
        self.word_vocabulary = w_vocabulary
        self.prefix_vocabulary = p_vocabulary
        self.suffix_vocabulary = s_vocabulary
        self.seg_mappings = segmentation_mapping(w_vocabulary, p_vocabulary, s_vocabulary)
        self.n_processors = n_processors
        self.n_mh = n_mh
        self._slaves = []
        for gid in xrange(self.n_processors):
            iq, oq = multiprocessing.Queue(), multiprocessing.Queue()
            s = CRPSlave(self.alpha/self.n_processors, self.seg_mappings, gid, iq, oq)
            self._slaves.append((s, iq, oq))
            s.start()

    def initialize(self):
        # Initialize H
        self.base.initialize()
        # Send tokens to processors (initialize G)
        processor_indicators = [random.randrange(self.n_processors) for _ in self.corpus]
        for gid, (_, iq, _) in enumerate(self._slaves):
            words = [w for i, w in enumerate(self.corpus) if processor_indicators[i] == gid]
            iq.put(Message('init_tokens', words))
        # Update H
        self.update_base()

    def update_base(self):
        # Receive and aggregate counts
        total_p_counts = Counter()
        total_s_counts = Counter()
        for _, iq, oq in self._slaves:
            iq.put(Message('send_counts'))
            p_counts, s_counts = oq.get()
            total_p_counts += p_counts
            total_s_counts += s_counts
        # Update the base counts
        self.base.update(total_p_counts, total_s_counts)
        # Resample the base
        self.base.resample()

    def resample(self, processors=False):
        """Run the sampler for the parallelized model."""
        # Send H to slaves
        for _, iq, _ in self._slaves:
            iq.put(Message('update_base', self.base))
        # Each slave: resample CRP
        for _, iq, _ in self._slaves:
            iq.put(Message('resample'))
        # Update H
        self.update_base()
        if processors:
            self.resample_assignments()

    def mh_step(self, old_tables):
        """Propose a uniform reassignment of tables to processors; accept or reject it."""
        new_tables = [[] for _ in old_tables]
        for tables in old_tables:
            for table in tables:
                pi = random.randrange(self.n_processors)
                new_tables[pi].append(table)
        new_ccs = [self._counts_of_counts(tables) for tables in new_tables]
        old_ccs = [self._counts_of_counts(tables) for tables in old_tables]
        numer = sum(math.lgamma(v+1) for ccs in new_ccs for v in ccs.itervalues())
        denom = sum(math.lgamma(v+1) for ccs in old_ccs for v in ccs.itervalues())
        ratio = math.exp(numer - denom)
        accept_prob = min(1.0, ratio)
        accept = random.random() < accept_prob
        return accept, (new_tables if accept else old_tables)

    def resample_assignments(self):
        """Resample the assignment of tables to processors with Metropolis-Hastings."""
        # 1. Collect tables
        tables = []  # [[(dish, count), ...], ...]
        for _, iq, oq in self._slaves:
            iq.put(Message('send_tables'))
            tables.append(oq.get())
        # 2. Resample processor assignments
        mh_steps = 0.0
        mh_accepts = 0.0
        for _ in xrange(self.n_mh):
            accept, tables = self.mh_step(tables)
            mh_steps += 1
            mh_accepts += accept
        acceptance_rate = mh_accepts/mh_steps if mh_steps else 0.0
        # 3. Send new table assignments to slaves
        for i, (_, iq, _) in enumerate(self._slaves):
            iq.put(Message('receive_tables', tables[i]))
        # Write log-likelihood
        total_customers = sum(c for ts in tables for _, c in ts)
        n_tables = sum(len(ts) for ts in tables)
        n_dishes = len(set(dish for ts in tables for dish, _ in ts))
        logging.info('MH Acceptance Rate: %f', acceptance_rate)
        logging.info('LL= %.0f\tCRPLL= %.0f\tBaseLL= %.0f', *self._log_likelihood(*tables))
        logging.info(('ParallelSegmentationModel(alpha={self.alpha}, base={self.base}, '
                      '#customers={total_customers}, #tables={n_tables}, '
                      '#dishes={n_dishes})').format(self=self,
                                                    total_customers=total_customers,
                                                    n_tables=n_tables, n_dishes=n_dishes))

    def _log_likelihood(self, *tables):
        tables = [t for ts in tables for t in ts]
        ntables = len(tables)
        ncustomers = sum(c for _, c in tables)
        crp_ll = (math.lgamma(self.alpha) - math.lgamma(self.alpha + ncustomers)
                  + sum(math.lgamma(c) for _, c in tables)
                  + ntables * math.log(self.alpha))
        base_ll = self.base.log_likelihood()
        return crp_ll+base_ll, crp_ll, base_ll

    def _counts_of_counts(self, tables):
        return Counter(c for _, c in tables)

    def shutdown(self):
        """Shut down any resources used."""
        for p, iq, _ in self._slaves:
            iq.put(Message('shutdown'))
            p.join()

    def decode(self, word):
        """Return the most likely (prefix, suffix) analysis of a word under the base."""
        analyses = self.seg_mappings[self.word_vocabulary[word]]
        probs = [self.base.marginal_prob(*a) for a in analyses]
        _, (p, s) = max(zip(probs, analyses))
        return self.prefix_vocabulary[p], self.suffix_vocabulary[s]
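
# A minimal sketch of how the parallel model above might be driven, assuming
# the corpus and vocabularies built as in main(); the hyperparameter values,
# the iteration count, and the resampling schedule are made up for
# illustration, not a committed training script.
#
#   model = ParallelSegmentationModel(alpha=100.0, alpha_p=0.001, alpha_s=0.001,
#                                     corpus=corpus,
#                                     w_vocabulary=word_vocabulary,
#                                     p_vocabulary=prefix_vocabulary,
#                                     s_vocabulary=suffix_vocabulary,
#                                     n_processors=4, n_mh=100, collapsed=True)
#   model.initialize()
#   for it in xrange(n_iterations):
#       model.resample(processors=(it % 10 == 9))  # occasionally move tables
#   print model.decode(u'walked')
#   model.shutdown()

if __name__ == '__main__':
    main()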