Example #1
def main():
    # Parse command-line arguments: corpus file, output file and
    # skip-gram hyperparameters.
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', type=argparse.FileType())
    parser.add_argument('output_file', type=argparse.FileType('w'))
    parser.add_argument('vector_size', type=int)
    parser.add_argument('context_size', type=int)
    parser.add_argument('vocabulary_size', type=int)
    args = parser.parse_args()

    # Preprocess the corpus into word indices and (input, output) context pairs.
    sentences = list(lower(tokenize(args.input_file)))
    dictionary = build_dictionary(sentences, args.vocabulary_size)
    indices = to_indices(sentences, dictionary)
    inputs, outputs = create_context(indices, args.context_size)

    # Train the skip-gram model with stochastic gradient descent.
    cost_gradient = bind_cost_gradient(skip_gram_cost_gradient, inputs, outputs,
                                       sampler=get_stochastic_sampler(100))
    initial_parameters = np.random.normal(size=(2, len(dictionary) + 1,
                                                args.vector_size))
    parameters, cost_history = gradient_descent(cost_gradient,
                                                initial_parameters, 10000)

    # Combine input and output embeddings and write one word vector per line.
    input_vectors, output_vectors = parameters
    word_vectors = input_vectors + output_vectors
    sorted_pairs = sorted(dictionary.items(), key=operator.itemgetter(1))
    words = [word for word, index in sorted_pairs]

    for word in words:
        vector = word_vectors[dictionary[word]]
        vector_string = ' '.join(str(element) for element in vector)
        print(word, vector_string, file=args.output_file)
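The script writes one word and its vector per line, but the same word_vectors matrix and dictionary can also be queried in memory. Below is a minimal usage sketch, not part of the original code; most_similar is a hypothetical helper that ranks dictionary words by cosine similarity to a query word:

import numpy as np

def most_similar(word, word_vectors, dictionary, top_n=5):
    # Hypothetical helper: rank words in the dictionary by cosine
    # similarity to the query word's vector.
    query = word_vectors[dictionary[word]]
    candidates = []
    for other, index in dictionary.items():
        if other == word:
            continue
        vector = word_vectors[index]
        norm = np.linalg.norm(query) * np.linalg.norm(vector)
        similarity = float(query @ vector) / norm if norm else 0.0
        candidates.append((similarity, other))
    return [(other, similarity)
            for similarity, other in sorted(candidates, reverse=True)[:top_n]]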
Example #2
    def train(self, sentences, iterations=1000):
        # Preprocess sentences to create indices of context and next words
        self.dictionary = build_dictionary(sentences, self.vocabulary_size)
        indices = to_indices(sentences, self.dictionary)
        self.reverse_dictionary = {
            index: word
            for word, index in self.dictionary.items()
        }
        inputs, outputs = self.create_context(indices)

        # Create cost and gradient function for gradient descent
        shapes = [self.W_shape, self.U_shape, self.H_shape, self.C_shape]
        flatten_nplm_cost_gradient = flatten_cost_gradient(
            nplm_cost_gradient, shapes)
        cost_gradient = bind_cost_gradient(flatten_nplm_cost_gradient,
                                           inputs,
                                           outputs,
                                           sampler=get_stochastic_sampler(10))

        # Train neural network
        parameters_size = sum(np.prod(shape) for shape in shapes)
        initial_parameters = np.random.normal(size=parameters_size)
        self.parameters, cost_history = gradient_descent(
            cost_gradient, initial_parameters, iterations)
        return cost_history
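flatten_cost_gradient lets gradient_descent work with a single flat parameter vector even though the model has several weight matrices (W, U, H, C). As a rough sketch of that idea, under the assumption that the flat vector is simply the concatenation of the flattened matrices (this is an illustration, not the project's actual implementation):

import numpy as np

def unflatten(flat_parameters, shapes):
    # Split a flat parameter vector into one array per shape; its total
    # length must equal sum(np.prod(shape) for shape in shapes), matching
    # parameters_size in train() above.
    arrays, offset = [], 0
    for shape in shapes:
        size = int(np.prod(shape))
        arrays.append(flat_parameters[offset:offset + size].reshape(shape))
        offset += size
    return arrays

# Hypothetical usage after training:
# W, U, H, C = unflatten(self.parameters, shapes)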
Example #3
    def test_to_indices(self):
        sentences = [['a', 'man', 'a', 'woman'], ['a', 'man']]
        dictionary = build_dictionary(sentences, 2)
        actual = list(to_indices(sentences, dictionary))
        expected = [[1, 2, 1, 0], [1, 2]]
        self.assertSequenceEqual(expected, actual)
Example #4
    def test_build_dictionary(self):
        sentences = [['a', 'man', 'a', 'woman'], ['a', 'man']]
        actual = build_dictionary(sentences, 2)
        expected = {'a': 1, 'man': 2}
        self.assertDictEqual(expected, actual)
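These two tests pin down the expected behaviour of build_dictionary and to_indices: the vocabulary_size most frequent words receive indices starting at 1 in descending frequency order, and any other word maps to 0. A minimal sketch consistent with the tests, though not necessarily the project's actual implementation, is:

from collections import Counter

def build_dictionary(sentences, vocabulary_size):
    # Keep the vocabulary_size most frequent words; assign indices
    # 1..vocabulary_size by descending frequency, reserving 0 for
    # out-of-vocabulary words.
    counts = Counter(word for sentence in sentences for word in sentence)
    most_common = counts.most_common(vocabulary_size)
    return {word: index + 1 for index, (word, _) in enumerate(most_common)}

def to_indices(sentences, dictionary):
    # Map each word to its dictionary index, or 0 if out of vocabulary.
    return [[dictionary.get(word, 0) for word in sentence]
            for sentence in sentences]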