def test_compare(): from vsm import corpus n = 4 c = corpus.random_corpus(1e3, 20, 1, 10, tok_name='sentences') c = c.to_maskedcorpus() c.mask_terms(['0']) em = be.BeagleEnvironment() em.train(c, n_columns=n) env_matrix = em.matrix psi = rand_pt_unit_sphere(n) rand_perm = two_rand_perm(n) print 'Training single processor model' sm = BeagleOrderSingle() sm.train(c, psi=psi, env_matrix=env_matrix, rand_perm=rand_perm) print 'Training multiprocessor model' mm = BeagleOrderMulti() mm.train(c, psi=psi, env_matrix=env_matrix, rand_perm=rand_perm) assert np.allclose(sm.matrix, mm.matrix, atol=1e-07)
def train(self, corpus, env_matrix=None, psi=None, rand_perm=None,
          n_columns=2048, lmda=7, tok_name='sentences'):
    """Train a BEAGLE order model on a single process.

    For every unmasked position in every sentence, the position's term
    vector is replaced by `psi`, neighboring environment vectors are
    bound via circular convolution over n-grams (up to length `lmda`),
    and the summed result is accumulated into `self.matrix`.
    """
    # Train an environment model on the fly if the caller supplied none.
    if env_matrix is None:
        env_model = be.BeagleEnvironment()
        env_model.train(corpus, n_columns=n_columns)
        env_matrix = env_model.matrix[:, :]

    bind_fn = mk_b_conv(n_columns, rand_perm)

    if psi is None:
        psi = rand_pt_unit_sphere(env_matrix.shape[1])

    self.matrix = np.zeros_like(env_matrix)

    # View tokens with masking disabled so every position is visible;
    # masked terms are skipped per-position in the loop below.
    if isinstance(corpus, corp.MaskedCorpus):
        token_seqs = corpus.view_tokens(tok_name, unmask=True)
    else:
        token_seqs = corpus.view_tokens(tok_name)

    for seq in token_seqs:
        for pos in xrange(seq.shape[0]):
            # Skip positions whose term has been masked out.
            if corpus.terms[seq[pos]] is np.ma.masked:
                continue
            before = [env_matrix[t] for t in seq[:pos]]
            after = [env_matrix[t] for t in seq[pos + 1:]]
            # Sentence with psi standing in at the focus position.
            window = np.array(before + [psi] + after)
            ngram_convs = reduce_ngrams(bind_fn, window, lmda, pos)
            self.matrix[seq[pos], :] += np.sum(ngram_convs, axis=0)
def train(self, corpus, env_matrix=None, psi=None, rand_perm=None, n_columns=2048, lmda = 7, tok_name='sentences', n_processes=20): global _lmda _lmda = lmda del lmda global _b_conv _b_conv = mk_b_conv(n_columns, rand_perm) del rand_perm if env_matrix is None: m = be.BeagleEnvironment() m.train(corpus, n_columns=n_columns) env_matrix = m.matrix[:] del m global _shape _shape = env_matrix.shape global _env_matrix print 'Copying env matrix to shared mp array' _env_matrix = mp.Array('f', env_matrix.size, lock=False) _env_matrix[:] = env_matrix.ravel()[:] del env_matrix global _psi _psi = mp.Array('f', _shape[1], lock=False) if psi is None: _psi[:] = rand_pt_unit_sphere(_shape[1])[:] else: _psi[:] = psi[:] del psi print 'Gathering tokens over which to map' if isinstance(corpus, corp.MaskedCorpus): sents = corpus.view_tokens(tok_name, unmask=True) else: sents = corpus.view_tokens(tok_name) k = len(sents) / (n_processes - 1) sent_lists = [sents[i * k:(i + 1) * k] for i in xrange(n_processes - 1)] sent_lists.append(sents[(i + 1) * k:]) tmp_dir = tempfile.mkdtemp() tmp_files = [os.path.join(tmp_dir, 'tmp_' + str(i)) for i in xrange(len(sent_lists))] sent_lists = [(sent_lists[i], tmp_files[i]) for i in xrange(len(sent_lists))] del sents global _terms _terms = corpus.terms del corpus try: # For debugging # tmp_files = map(mpfn, sent_lists) print 'Forking' p = mp.Pool() tmp_files = p.map(mpfn, sent_lists, 1) p.close() print 'Reducing' self.matrix = np.zeros(_shape) for filename in tmp_files: result = np.memmap(filename, mode='r', shape=_shape, dtype=np.float32) self.matrix[:, :] += result[:, :] self.matrix = np.float32(self.matrix) finally: print 'Removing', tmp_dir shutil.rmtree(tmp_dir)