Exemple #1
0
    def load_matrix(self, filename):
        """Load a matrix from `filename` and store it as ``self.matrix``.

        The actual reading is delegated to the module-scope name
        ``load_matrix`` (inside a method body that name resolves to the
        global, not to this method).
        """
        loaded = load_matrix(filename)
        self.matrix = loaded
    def train(self,
              corpus,
              token_type='sentences',
              stoplist=None,
              n_columns=None,
              env_matrix=None,
              placeholder=None,
              right_permutation=None,
              left_permutation=None,
              lmbda=7):


        if env_matrix == None:
            env_model = BeagleEnvironment()
            env_model.train(corpus,
                            token_type,
                            stoplist,
                            n_columns)
        else:
            env_model = BeagleEnvironment(env_matrix)

        __shape = env_model.matrix.shape

        order_fn.env_matrix = env_model.matrix

        del env_model
        del env_matrix


        
        temp_dir = tempfile.mkdtemp()
        order_fn.temp_dir = temp_dir


        order_fn.lmbda = lmbda


        if not placeholder:

            placeholder = np.random.random(__shape[1])
            placeholder *= 2
            placeholder -= 1
            placeholder /= np.sum(placeholder**2)**(1./2)
            order_fn.placeholder = placeholder
                
        print 'Placeholder:', order_fn.placeholder
        print 'Norm of placeholder', np.sum(order_fn.placeholder**2)**(1./2)



        if not right_permutation or not left_permutation:
            permutations = RandomPermutations(__shape[1], 2)

        if right_permutation:
            order_fn.right_permutation = right_permutation
        else:
            order_fn.right_permutation = permutations.permutations[0]

        if left_permutation:
            order_fn.left_permutation = left_permutation
        else:
            order_fn.left_permutation = permutations.permutations[1]

        print 'Right permutation', order_fn.right_permutation(np.arange(__shape[1]))

        print 'Left permutation', order_fn.left_permutation(np.arange(__shape[1]))




        sentences = corpus.view_tokens(token_type)
        
        # number of sentences in a chunk of sentences
        n = 500

        sent_lists = np.split(np.asarray(sentences, dtype=np.object_),
                              np.arange(n, len(sentences), n))

        ind_sent_lists = list(enumerate(sent_lists))



        # Map
        p = mp.Pool()
        results = p.map(order_fn, ind_sent_lists, 1)
        p.close()



        del order_fn.env_matrix


        # Reduce
        self.matrix = np.zeros(__shape, dtype=np.float32)
        
        for result in results:

            print 'Reducing', result

            summand = load_matrix(result)

            for i,row in summand.iteritems():
                self.matrix[i,:] += row

            # self.matrix += summand


        # Clean up
        print 'Deleting temporary directory\n'\
              '  ', temp_dir

        shutil.rmtree(temp_dir)
    def train(self,
              corpus,
              token_type='sentences',
              stoplist=list(),
              n_columns=None,
              env_matrix=None):


        if env_matrix == None:
            env_model = BeagleEnvironment()
            env_model.train(corpus,
                            token_type,
                            stoplist,
                            n_columns)
        else:
            env_model = BeagleEnvironment(env_matrix)

        #Apply stoplist to environment matrix
        env_model.filter_rows(stoplist)


        __shape = env_model.matrix.shape


        context_fn.env_matrix = env_model.matrix

        del env_model
        del env_matrix


        
        temp_dir = tempfile.mkdtemp()
        context_fn.temp_dir = temp_dir


        sentences = corpus.view_tokens(token_type)
        
        # number of sentences in a chunk of sentences
        n = 500

        sent_lists = np.split(np.asarray(sentences, dtype=np.object_),
                              np.arange(n, len(sentences), n))

        ind_sent_lists = list(enumerate(sent_lists))


        # Map
        p = mp.Pool()
        results = p.map(context_fn, ind_sent_lists, 1)
        p.close()


        del context_fn.env_matrix


        # Reduce
        self.matrix = np.zeros(__shape, dtype=np.float32)
        
        for result in results:

            print 'Reducing', result

            summand = load_matrix(result)
            # self.matrix += summand

            for i,row in summand.iteritems():
                self.matrix[i,:] += row

        # Clean up
        print 'Deleting temporary directory\n'\
              '  ', temp_dir

        shutil.rmtree(temp_dir)