Ejemplo n.º 1
0
    def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
        """
        Estimate the word embeddings.

        Every call re-initialises ``self.word_vectors`` (uniform random
        values from ``np.random.rand``) and ``self.word_biases`` (zeros).
        For each epoch the COO ``row``/``col``/``data`` arrays of ``matrix``
        are handed to ``fit_vectors`` together with a freshly shuffled array
        of entry indices.

        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int no_threads: number of training threads
        - bool verbose: print progress messages if True

        Raises:
        - Exception: if ``matrix`` is not square or not in the COO format
        """

        shape = matrix.shape

        if (len(shape) != 2 or
            shape[0] != shape[1]):
            raise Exception('Coocurrence matrix must be square')

        if not sp.isspmatrix_coo(matrix):
            raise Exception('Coocurrence matrix must be in the COO format')

        self.word_vectors = np.random.rand(shape[0],
                                           self.no_components)
        self.word_biases = np.zeros(shape[0],
                                    dtype=np.float64)
        # One index per stored entry of the matrix; reshuffled every epoch.
        shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

        if verbose:
            # The message is formatted inside the call so that the line
            # behaves the same under the Python 2 print statement and the
            # Python 3 print function.
            print('Performing %s training epochs '
                  'with %s threads' % (epochs, no_threads))

        # range (not xrange) keeps the loop valid on Python 3 as well.
        for epoch in range(epochs):

            if verbose:
                print('Epoch %s' % epoch)

            # Shuffle the co-occurrence matrix
            np.random.shuffle(shuffle_indices)

            fit_vectors(self.word_vectors,
                        self.word_biases,
                        matrix.row,
                        matrix.col,
                        matrix.data,
                        shuffle_indices,
                        self.learning_rate,
                        self.max_count,
                        self.alpha,
                        int(no_threads))
Ejemplo n.º 2
0
    def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
        """
        Estimate the word embeddings.

        Every call starts from scratch: the word vectors are drawn
        uniformly from [-0.5, 0.5) and divided by ``no_components``, the
        biases are zeroed and both gradient accumulators are reset to one.
        Each epoch hands the COO arrays of ``matrix`` plus a freshly
        shuffled index array to ``fit_vectors``. In verbose mode the mean
        of ``learning_rate / sqrt(sum_gradients)`` is recorded after every
        epoch and plotted once training has finished.

        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int no_threads: number of training threads
        - bool verbose: print progress messages and plot the mean
          learning rates if True

        Raises:
        - Exception: if ``matrix`` is not square, is not in the COO format,
          or the word vectors contain non-finite values after an epoch
        """
        dims = matrix.shape
        is_square = len(dims) == 2 and dims[0] == dims[1]
        if not is_square:
            raise Exception('Coocurrence matrix must be square')

        if not sp.isspmatrix_coo(matrix):
            raise Exception('Coocurrence matrix must be in the COO format')

        vocab_size = dims[0]
        initial = np.random.rand(vocab_size, self.no_components)
        self.word_vectors = (initial - 0.5) / self.no_components
        self.word_biases = np.zeros(vocab_size, dtype=np.float64)

        self.vectors_sum_gradients = np.ones_like(self.word_vectors)
        self.biases_sum_gradients = np.ones_like(self.word_biases)

        order = np.arange(matrix.nnz, dtype=np.int32)

        # Per-epoch mean learning rates; only filled in when verbose.
        vector_rate_history = []
        bias_rate_history = []

        if verbose:
            print('Performing %s training epochs '
                  'with %s threads' % (epochs, no_threads))

        for epoch in range(epochs):
            if verbose:
                started = dt.datetime.now()
                print('Epoch %s' % epoch)

            # New visiting order for the stored entries on every epoch.
            np.random.shuffle(order)

            fit_vectors(self.word_vectors,
                        self.vectors_sum_gradients,
                        self.word_biases,
                        self.biases_sum_gradients,
                        matrix.row,
                        matrix.col,
                        matrix.data,
                        order,
                        self.learning_rate,
                        self.max_count,
                        self.alpha,
                        self.max_loss,
                        int(no_threads))

            all_finite = np.isfinite(self.word_vectors).all()
            if not all_finite:
                raise Exception('Non-finite values in word vectors. '
                                'Try reducing the learning rate or the '
                                'max_loss parameter.')

            if not verbose:
                continue

            per_row_rates = [self.learning_rate / np.sqrt(row)
                             for row in self.vectors_sum_gradients]
            vector_rate_history.append(np.mean(per_row_rates))

            bias_rates = self.learning_rate / np.sqrt(self.biases_sum_gradients)
            bias_rate_history.append(np.mean(bias_rates))

            finished = dt.datetime.now()
            minutes = (finished - started).total_seconds() / 60
            print('    Epoch %s took %s minutes' % (epoch, minutes))

        if verbose:
            # Show how the mean learning rates evolved over the epochs.
            plt.plot(vector_rate_history, 'k--', bias_rate_history, 'k:')
            plt.legend(('word vectors', 'word biases'))
            plt.xlabel('Epoch')
            plt.ylabel('Mean learning rate')
            plt.title('Change in mean learning rates across epochs')
            plt.show()
Ejemplo n.º 3
0
    def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
        """
        Estimate the word embeddings.

        Resets the model state (random word vectors scaled by
        ``1 / no_components``, zero biases, gradient accumulators of one)
        and then calls ``fit_vectors`` once per epoch with the COO arrays of
        ``matrix`` and a reshuffled index array. With ``verbose`` set, the
        per-epoch duration is printed and the mean of
        ``learning_rate / sqrt(sum_gradients)`` is collected for a final
        plot.

        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int no_threads: number of training threads
        - bool verbose: print progress messages and plot the mean
          learning rates if True

        Raises:
        - Exception: if ``matrix`` is not square, is not in the COO format,
          or the word vectors contain non-finite values after an epoch
        """

        shape = matrix.shape

        # Both failure modes of the shape check share one message.
        if len(shape) != 2:
            raise Exception('Coocurrence matrix must be square')
        if shape[0] != shape[1]:
            raise Exception('Coocurrence matrix must be square')

        if not sp.isspmatrix_coo(matrix):
            raise Exception('Coocurrence matrix must be in the COO format')

        n_words = shape[0]
        centred = np.random.rand(n_words, self.no_components) - 0.5
        self.word_vectors = centred / self.no_components
        self.word_biases = np.zeros(n_words, dtype=np.float64)

        self.vectors_sum_gradients = np.ones_like(self.word_vectors)
        self.biases_sum_gradients = np.ones_like(self.word_biases)

        entry_order = np.arange(matrix.nnz, dtype=np.int32)

        if verbose:
            print('Performing %s training epochs '
                  'with %s threads' % (epochs, no_threads))

            # Histories of the mean learning rates, one value per epoch.
            vector_rates, bias_rates = [], []

        for epoch in range(epochs):

            if verbose:
                tick = dt.datetime.now()
                print('Epoch %s' % epoch)

            # Visit the stored entries in a new random order.
            np.random.shuffle(entry_order)

            fit_vectors(
                self.word_vectors, self.vectors_sum_gradients,
                self.word_biases, self.biases_sum_gradients,
                matrix.row, matrix.col, matrix.data,
                entry_order,
                self.learning_rate, self.max_count, self.alpha,
                self.max_loss,
                int(no_threads))

            if not np.isfinite(self.word_vectors).all():
                raise Exception('Non-finite values in word vectors. '
                                'Try reducing the learning rate or the '
                                'max_loss parameter.')

            if verbose:
                mean_vector_rate = np.mean(
                    [self.learning_rate / np.sqrt(accumulated)
                     for accumulated in self.vectors_sum_gradients])
                vector_rates.append(mean_vector_rate)

                mean_bias_rate = np.mean(
                    self.learning_rate / np.sqrt(self.biases_sum_gradients))
                bias_rates.append(mean_bias_rate)

                elapsed = dt.datetime.now() - tick
                print('    Epoch %s took %s minutes' %
                      (epoch, elapsed.total_seconds() / 60))

        if verbose:
            # Plot the recorded learning-rate histories.
            plt.plot(vector_rates, 'k--', bias_rates, 'k:')
            plt.legend(('word vectors', 'word biases'))
            plt.xlabel('Epoch')
            plt.ylabel('Mean learning rate')
            plt.title('Change in mean learning rates across epochs')
            plt.show()
Ejemplo n.º 4
0
    def fit(self, matrix, epochs=5, no_threads=2, verbose=False):
        """
        Estimate the word embeddings.

        Re-initialises the word vectors (uniform in [-0.5, 0.5) divided by
        ``no_components``), the biases (zeros) and both gradient
        accumulators (zeros), using a random state derived from
        ``self.random_state``. Each epoch reshuffles the entry indices,
        calls ``fit_vectors`` and puts the value it returns into the
        description of the ``tqdm`` progress wrapper. That wrapper is
        created regardless of ``verbose``.

        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int no_threads: number of training threads
        - bool verbose: print a summary line before training if True

        Raises:
        - Exception: if ``matrix`` is not square, is not in the COO format,
          or the word vectors contain non-finite values after an epoch
        """
        dims = matrix.shape

        if not (len(dims) == 2 and dims[0] == dims[1]):
            raise Exception('Coocurrence matrix must be square')

        if not sp.isspmatrix_coo(matrix):
            raise Exception('Coocurrence matrix must be in the COO format')

        rng = check_random_state(self.random_state)

        n_words = dims[0]
        uniform = rng.rand(n_words, self.no_components)
        self.word_vectors = (uniform - 0.5) / self.no_components
        self.word_biases = np.zeros(n_words, dtype=np.float64)

        # Gradient accumulators start at zero in this variant.
        # NOTE(review): confirm fit_vectors copes with a zero accumulator
        # (e.g. does not divide by sqrt(0)).
        self.vectors_sum_gradients = np.zeros_like(self.word_vectors)
        self.biases_sum_gradients = np.zeros_like(self.word_biases)

        order = np.arange(matrix.nnz, dtype=np.int32)

        if verbose:
            summary = (epochs, no_threads, matrix.nnz)
            print('Performing %s training epochs '
                  'with %s threads on %s cooccurrence words' % summary)

        progress = tqdm(range(epochs), desc="Epoch 0: start training")
        for epoch in progress:
            # New visiting order for the stored entries on every epoch.
            rng.shuffle(order)

            avg_loss = fit_vectors(
                self.word_vectors,
                self.vectors_sum_gradients,
                self.word_biases,
                self.biases_sum_gradients,
                matrix.row,
                matrix.col,
                matrix.data,
                order,
                self.learning_rate,
                self.max_count,
                self.alpha,
                self.max_loss,
                int(no_threads))

            status = 'Epoch {}: average loss {}'.format(epoch, avg_loss)
            progress.set_description(status)

            if np.isfinite(self.word_vectors).all():
                continue

            raise Exception('Non-finite values in word vectors. '
                            'Try reducing the learning rate or the '
                            'max_loss parameter.')
Ejemplo n.º 5
0
    def fit(self, matrix, epochs=5, no_threads=2, verbose=False,
            wordList=None, save_gap=None, retrain=False,
            save_vectors=False):
        """
        Estimate the word embeddings.

        Unless ``retrain`` is set, the word vectors, biases and gradient
        accumulators are re-initialised first. Each epoch reshuffles the
        entry indices and calls ``fit_vectors``. When snapshots are enabled,
        the pairwise cosine similarities of the words in ``wordList`` are
        written to ``output_<n>.csv`` (and optionally their vectors to
        ``vectors_<n>``), where ``<n>`` is ``self.startIndex + epoch``.

        Parameters:
        - scipy.sparse.coo_matrix matrix: co-occurrence matrix
        - int epochs: number of training epochs
        - int no_threads: number of training threads
        - bool verbose: print progress messages if True
        - list wordList: words (keys of ``self.dictionary``) to include in
          the similarity snapshots; None disables snapshots
        - int save_gap: write a snapshot on every epoch whose number is a
          multiple of ``save_gap``; None or 0 disables snapshots
        - bool retrain: if True, keep the existing ``self.word_vectors``,
          ``self.word_biases`` and gradient accumulators instead of
          re-initialising them
        - bool save_vectors: if True, also pickle the snapshot vectors

        Raises:
        - Exception: if ``matrix`` is not square, is not in the COO format,
          or the word vectors contain non-finite values after an epoch
        """

        shape = matrix.shape

        if (len(shape) != 2 or
            shape[0] != shape[1]):
            raise Exception('Coocurrence matrix must be square')

        if not sp.isspmatrix_coo(matrix):
            raise Exception('Coocurrence matrix must be in the COO format')

        random_state = check_random_state(self.random_state)

        if not retrain:
            self.word_vectors = ((random_state.rand(shape[0],
                                                    self.no_components) - 0.5)
                                 / self.no_components)
            self.word_biases = np.zeros(shape[0],
                                        dtype=np.float64)

            self.vectors_sum_gradients = np.ones_like(self.word_vectors)
            self.biases_sum_gradients = np.ones_like(self.word_biases)

        shuffle_indices = np.arange(matrix.nnz, dtype=np.int32)

        # Decide once whether snapshots can be written at all. Checking
        # save_gap here keeps the default (None) and 0 from blowing up in
        # the modulo below.
        snapshots_enabled = wordList is not None and bool(save_gap)

        if verbose:
            print('Performing %s training epochs '
                  'with %s threads' % (epochs, no_threads))

        for epoch in range(epochs):

            if verbose:
                print('Epoch %s' % epoch)

            # Shuffle the co-occurrence matrix
            random_state.shuffle(shuffle_indices)

            fit_vectors(self.word_vectors,
                        self.vectors_sum_gradients,
                        self.word_biases,
                        self.biases_sum_gradients,
                        matrix.row,
                        matrix.col,
                        matrix.data,
                        shuffle_indices,
                        self.learning_rate,
                        self.max_count,
                        self.alpha,
                        self.max_loss,
                        int(no_threads))

            if not np.isfinite(self.word_vectors).all():
                raise Exception('Non-finite values in word vectors. '
                                'Try reducing the learning rate or the '
                                'max_loss parameter.')

            if snapshots_enabled and epoch % save_gap == 0:

                # One (1, n) row per requested word.
                vectorList = [
                    self.word_vectors[self.dictionary[word]].reshape(1, -1)
                    for word in wordList
                ]

                # Full pairwise similarity table, row i / column j.
                mat = [
                    [cosine_similarity(left, right)[0][0]
                     for right in vectorList]
                    for left in vectorList
                ]

                suffix = str(self.startIndex + epoch)

                df = pd.DataFrame(mat, columns=wordList, index=wordList)
                df.to_csv("output_" + suffix + ".csv")

                if save_vectors:
                    with open("vectors_" + suffix, 'wb') as f:
                        pickle.dump(vectorList, f)