Exemple #1
0
def test_compare():
    """
    Check that the single-process and multiprocess order models agree.

    Both models are trained on the same masked random corpus with the
    same environment matrix, placeholder vector and permutations; the
    resulting matrices must match within an absolute tolerance of 1e-07.
    """
    from vsm import corpus

    dims = 4

    masked = corpus.random_corpus(
        1e3, 20, 1, 10, tok_name='sentences').to_maskedcorpus()
    masked.mask_terms(['0'])

    # Train the environment model once so both order models share it.
    environment = be.BeagleEnvironment()
    environment.train(masked, n_columns=dims)

    # Identical inputs for both trainers; entries are evaluated in the
    # order written, so the random helpers are called psi-first.
    shared = {
        'env_matrix': environment.matrix,
        'psi': rand_pt_unit_sphere(dims),
        'rand_perm': two_rand_perm(dims),
    }

    print('Training single processor model')
    single = BeagleOrderSingle()
    single.train(masked, **shared)

    print('Training multiprocessor model')
    multi = BeagleOrderMulti()
    multi.train(masked, **shared)

    assert np.allclose(single.matrix, multi.matrix, atol=1e-07)
Exemple #2
0
    def train(self,
              corpus,
              env_matrix=None,
              psi=None,
              rand_perm=None,
              n_columns=2048,
              lmda=7,
              tok_name='sentences'):
        """
        Build `self.matrix` by accumulating one order vector per token.

        For every position in every sentence whose term is not masked
        in `corpus.terms`, the sentence's environment vectors (with the
        vector at that position replaced by `psi`) are passed to
        `reduce_ngrams`; the result is summed along axis 0 and added to
        the row of `self.matrix` indexed by the term at that position.

        Parameters
        ----------
        corpus : object exposing `view_tokens` and `terms`
            When it is a `corp.MaskedCorpus`, tokens are viewed with
            `unmask=True`.
        env_matrix : array, optional
            Indexed by term; trained with `be.BeagleEnvironment` when
            omitted.
        psi : array, optional
            Placeholder vector; drawn with `rand_pt_unit_sphere` using
            the width of `env_matrix` when omitted.
        rand_perm : optional
            Forwarded unchanged to `mk_b_conv`.
        n_columns : int
            Forwarded to `mk_b_conv` and, when the environment model is
            trained here, to `BeagleEnvironment.train`.
        lmda : int
            Forwarded to `reduce_ngrams`.
        tok_name : str
            Tokenization name handed to `corpus.view_tokens`.
        """
        if env_matrix is None:
            environment = be.BeagleEnvironment()
            environment.train(corpus, n_columns=n_columns)
            env_matrix = environment.matrix[:, :]

        # NOTE(review): b_conv is built from n_columns even when the
        # caller supplies an env_matrix of a different width -- confirm
        # that mk_b_conv tolerates this.
        b_conv = mk_b_conv(n_columns, rand_perm)

        if psi is None:
            psi = rand_pt_unit_sphere(env_matrix.shape[1])

        self.matrix = np.zeros_like(env_matrix)

        view_kwargs = {}
        if isinstance(corpus, corp.MaskedCorpus):
            view_kwargs['unmask'] = True
        sentences = corpus.view_tokens(tok_name, **view_kwargs)

        for sentence in sentences:
            for pos in xrange(sentence.shape[0]):
                term = sentence[pos]

                # Masked terms contribute context but get no row update.
                if corpus.terms[term] is np.ma.masked:
                    continue

                # Environment vectors of the sentence, with the target
                # position swapped for the placeholder vector.
                context = [env_matrix[t] for t in sentence[:pos]]
                context.append(psi)
                context.extend(env_matrix[t] for t in sentence[pos + 1:])

                conv_ngrams = reduce_ngrams(
                    b_conv, np.array(context), lmda, pos)

                self.matrix[term, :] += np.sum(conv_ngrams, axis=0)
Exemple #3
0
    def train(self,
              corpus,
              env_matrix=None,
              psi=None,
              rand_perm=None,
              n_columns=2048,
              lmda=7,
              tok_name='sentences',
              n_processes=20):
        """
        Train the order model by mapping sentence chunks over a process
        pool and summing the per-chunk results into `self.matrix`.

        The inputs are published through the module-level globals
        `_lmda`, `_b_conv`, `_shape`, `_env_matrix`, `_psi` and `_terms`
        before the pool is created (presumably so that `mpfn` can read
        them in the workers -- confirm against `mpfn`). Each job is a
        `(sentences, filename)` pair; `mpfn` is expected to return the
        name of a file holding a float32 array of shape `_shape`, which
        is read back with `np.memmap` and accumulated.

        Parameters
        ----------
        corpus : object exposing `view_tokens` and `terms`
            When it is a `corp.MaskedCorpus`, tokens are viewed with
            `unmask=True`.
        env_matrix : 2-D array, optional
            Trained with `be.BeagleEnvironment` when omitted. Copied
            into a shared single-precision `mp.Array`.
        psi : sequence, optional
            Placeholder vector of length `env_matrix.shape[1]`; drawn
            with `rand_pt_unit_sphere` when omitted.
        rand_perm : optional
            Forwarded unchanged to `mk_b_conv`.
        n_columns : int
            Forwarded to `mk_b_conv` and, when the environment model is
            trained here, to `BeagleEnvironment.train`.
        lmda : int
            Stored in the global `_lmda`.
        tok_name : str
            Tokenization name handed to `corpus.view_tokens`.
        n_processes : int
            Number of chunks the sentences are split into (values below
            1 are treated as 1). NOTE: the pool itself is created with
            `mp.Pool()`'s default worker count, as before.
        """
        global _lmda, _b_conv, _shape, _env_matrix, _psi, _terms

        _lmda = lmda
        del lmda

        # NOTE(review): b_conv is built from n_columns even when the
        # caller supplies an env_matrix of a different width -- confirm
        # that mk_b_conv tolerates this.
        _b_conv = mk_b_conv(n_columns, rand_perm)
        del rand_perm

        if env_matrix is None:
            m = be.BeagleEnvironment()
            m.train(corpus, n_columns=n_columns)
            env_matrix = m.matrix[:]
            del m

        _shape = env_matrix.shape

        print('Copying env matrix to shared mp array')
        _env_matrix = mp.Array('f', env_matrix.size, lock=False)
        _env_matrix[:] = env_matrix.ravel()[:]
        # Drop the local reference before forking.
        del env_matrix

        _psi = mp.Array('f', _shape[1], lock=False)
        if psi is None:
            _psi[:] = rand_pt_unit_sphere(_shape[1])[:]
        else:
            _psi[:] = psi[:]
        del psi

        print('Gathering tokens over which to map')
        if isinstance(corpus, corp.MaskedCorpus):
            sents = corpus.view_tokens(tok_name, unmask=True)
        else:
            sents = corpus.view_tokens(tok_name)

        # Split into (n_chunks - 1) equal chunks plus one remainder
        # chunk. Guard against n_processes <= 1, which previously
        # divided by zero, and use floor division so the chunk size is
        # always an integer.
        n_chunks = max(int(n_processes), 1)
        if n_chunks > 1:
            k = len(sents) // (n_chunks - 1)
        else:
            k = 0
        chunks = [sents[j * k:(j + 1) * k] for j in xrange(n_chunks - 1)]
        chunks.append(sents[(n_chunks - 1) * k:])
        del sents

        _terms = corpus.terms
        del corpus

        # Create the scratch directory immediately before the try block
        # so that it is always removed, whatever fails afterwards.
        tmp_dir = tempfile.mkdtemp()
        try:
            jobs = [(chunk, os.path.join(tmp_dir, 'tmp_' + str(j)))
                    for j, chunk in enumerate(chunks)]
            del chunks

            # For debugging
            # tmp_files = map(mpfn, jobs)

            print('Forking')
            pool = mp.Pool()
            finished = False
            try:
                tmp_files = pool.map(mpfn, jobs, 1)
                finished = True
            finally:
                # Shut down gracefully on success; kill the workers if
                # the map raised so none are left running.
                if finished:
                    pool.close()
                else:
                    pool.terminate()
                pool.join()

            print('Reducing')
            # Accumulate in double precision and cast once at the end
            # rather than after every file.
            total = np.zeros(_shape)
            for filename in tmp_files:
                result = np.memmap(filename, mode='r',
                                   shape=_shape, dtype=np.float32)
                total += result
                # Release the mapping before the directory is removed.
                del result

            self.matrix = np.float32(total)

        finally:
            print('Removing ' + tmp_dir)
            shutil.rmtree(tmp_dir)