Example no. 1
0
    def train(self,
              corpus,
              token_type='sentences',
              stoplist=list(),
              n_columns=None,
              env_matrix=None):


        if env_matrix == None:
            env_model = BeagleEnvironment()
            env_model.train(corpus,
                            token_type,
                            stoplist,
                            n_columns)
        else:
            env_model = BeagleEnvironment(env_matrix)

        #Apply stoplist to environment matrix
        env_model.filter_rows(stoplist)


        __shape = env_model.matrix.shape


        context_fn.env_matrix = env_model.matrix

        del env_model
        del env_matrix


        
        temp_dir = tempfile.mkdtemp()
        context_fn.temp_dir = temp_dir


        sentences = corpus.view_tokens(token_type)
        
        # number of sentences in a chunk of sentences
        n = 500

        sent_lists = np.split(np.asarray(sentences, dtype=np.object_),
                              np.arange(n, len(sentences), n))

        ind_sent_lists = list(enumerate(sent_lists))


        # Map
        p = mp.Pool()
        results = p.map(context_fn, ind_sent_lists, 1)
        p.close()


        del context_fn.env_matrix


        # Reduce
        self.matrix = np.zeros(__shape, dtype=np.float32)
        
        for result in results:

            print 'Reducing', result

            summand = load_matrix(result)
            # self.matrix += summand

            for i,row in summand.iteritems():
                self.matrix[i,:] += row

        # Clean up
        print 'Deleting temporary directory\n'\
              '  ', temp_dir

        shutil.rmtree(temp_dir)