import os
import sys

import numpy
import theano

# `config`, `vocabulary`, `getNormalWord`, `random_weights`, and `floatX` are
# assumed to come from the surrounding project; they are not defined in this
# snippet.


def build_vocabulary():
    if not os.path.exists(config.TRAIN_FILE):
        print("Error: can't find train file '%s'!" % config.TRAIN_FILE)
        sys.exit(1)

    if config.INCLUDE_UNKNOWN_WORD:
        vocabulary.add(config.UNKNOWN_WORD, config.WORD_COUNT + 1)
    vocabulary.add(config.SYMBOL_WORD)
    vocabulary.add(config.PADDING_WORD, config.WORD_COUNT + 1)

    train_size = 0
    with open(config.TRAIN_FILE, 'r') as f:
        for line in f:
            line = line.strip('\n')
            words = line.split()
            for word in words:
                word = getNormalWord(word)
                if word:
                    train_size += 1
                    vocabulary.add(word)

    vocabulary.delete_word()
    vocabulary.save_vocabulary()
    print("TRAIN SIZE: %d" % train_size)
    print("VOCABULARY SIZE: %d" % vocabulary.length())
    vocabulary.dump_vocabulary()
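

# The functions here lean on a module-level `vocabulary` object whose class is
# not shown. The sketch below reconstructs the interface implied by the calls
# above (add / delete_word / id / length, plus save/load/dump persistence);
# the internals are assumptions for illustration, not the project's code.
class VocabularySketch(object):
    def __init__(self):
        self._counts = {}    # word -> raw occurrence count
        self._word2id = {}   # word -> contiguous integer id

    def add(self, word, count=1):
        # Repeated adds accumulate. The seed words above are added with
        # config.WORD_COUNT + 1 so that frequency pruning never drops them.
        self._counts[word] = self._counts.get(word, 0) + count

    def delete_word(self):
        # Prune words at or below the frequency cutoff, then re-index.
        kept = [w for w, c in self._counts.items() if c > config.WORD_COUNT]
        self._word2id = {w: i for i, w in enumerate(kept)}

    def id(self, word):
        # Unseen words fall back to the UNKNOWN_WORD id (assumed behaviour).
        return self._word2id.get(word, self._word2id.get(config.UNKNOWN_WORD, 0))

    def length(self):
        return len(self._word2id)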


class Model(object):
    def __init__(self,
                 window_size=config.WINDOW_SIZE,
                 embedding_size=config.EMBEDDING_SIZE,
                 hidden_size=config.HIDDEN_SIZE):
        """
        Initialize L{Model} parameters.
        """

        self.vocab_size = vocabulary.length()
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = 1
        self.input_size = self.embedding_size * self.window_size

        numpy.random.seed()

        self.embeddings = (
            numpy.random.rand(self.vocab_size, self.embedding_size) - 0.5) * 2
        if config.NORMALIZE_EMBEDDINGS:
            self.normalize(range(self.vocab_size))

        self.hidden_weights = random_weights(
            self.input_size,
            self.hidden_size,
            scale_by=config.SCALE_INITIAL_WEIGHTS_BY)
        self.output_weights = random_weights(
            self.hidden_size,
            self.output_size,
            scale_by=config.SCALE_INITIAL_WEIGHTS_BY)
        self.hidden_biases = numpy.zeros((1, self.hidden_size))
        self.output_biases = numpy.zeros((1, self.output_size))
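

# `random_weights` is another helper that is not shown. A minimal sketch,
# assuming a symmetric uniform draw with an optional scale factor (the exact
# initialization scheme is not confirmed by this snippet):
def random_weights_sketch(n_in, n_out, scale_by=1.0):
    # Uniform in [-scale_by, scale_by), shaped (n_in, n_out).
    return (numpy.random.rand(n_in, n_out) - 0.5) * 2 * scale_by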


# A second revision of the same initializer, storing the trainable parameters
# in Theano shared variables. It is given a distinct class name here only so
# both variants can live in one file; the original defines both as `Model`.
class TheanoModel(object):
    def __init__(self, window_size=config.WINDOW_SIZE,
                 embedding_size=config.EMBEDDING_SIZE, hidden_size=config.HIDDEN_SIZE):
        """
        Initialize L{Model} parameters.
        """

        self.vocab_size = vocabulary.length()
        self.window_size = window_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.output_size = 1
        self.input_size = self.embedding_size * self.window_size

        numpy.random.seed()
        # self.embeddings = numpy.asarray(
        #     (numpy.random.rand(self.vocab_size, self.embedding_size) - 0.5) * 2 * config.INITIAL_EMBEDDING_RANGE,
        #     dtype=floatX)
        self.embeddings = numpy.asarray((numpy.random.rand(self.vocab_size, self.embedding_size) - 0.5) * 2,
                                        dtype=floatX)
        if config.NORMALIZE_EMBEDDINGS:
            self.normalize(range(self.vocab_size))

        self.hidden_weights = theano.shared(random_weights(self.input_size, self.hidden_size))
        self.output_weights = theano.shared(random_weights(self.hidden_size, self.output_size))
        # self.hidden_weights = theano.shared(numpy.asarray(numpy.ones((self.input_size, self.hidden_size)),
        #                                                   dtype=floatX))
        # self.output_weights = theano.shared(numpy.asarray(
        #     random_weights(self.hidden_size, self.output_size, scale_by=config.SCALE_INITIAL_WEIGHTS_BY), dtype=floatX))
        self.hidden_biases = theano.shared(numpy.asarray(numpy.zeros((self.hidden_size,)), dtype=floatX))
        self.output_biases = theano.shared(numpy.asarray(numpy.zeros((self.output_size,)), dtype=floatX))
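

# Both initializers define the parameters of a window-scoring network: look up
# an embedding for each word in a window, concatenate them into a single row
# vector, apply a tanh hidden layer, and emit one scalar score. The forward
# pass itself is not shown; this is a minimal NumPy sketch of it for the
# NumPy `Model` variant (function name and tanh nonlinearity are assumptions):
def score_window_sketch(model, window_ids):
    # (window_size, embedding_size) -> (1, input_size)
    x = model.embeddings[window_ids].reshape(1, model.input_size)
    hidden = numpy.tanh(numpy.dot(x, model.hidden_weights) + model.hidden_biases)
    return numpy.dot(hidden, model.output_weights) + model.output_biases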


def build_samples():
    if not vocabulary.length():
        vocabulary.load_vocabulary()

    # Integer division: '/' yields a float on Python 3 and would break the
    # slicing and range() calls below.
    half_window = config.WINDOW_SIZE // 2
    padding_id = vocabulary.id(config.PADDING_WORD)
    unknown_id = vocabulary.id(config.UNKNOWN_WORD)
    symbol_id = vocabulary.id(config.SYMBOL_WORD)

    with open(config.SAMPLE_FILE, 'w') as sample_file, \
            open(config.TRAIN_FILE, 'r') as train_file:
        for line in train_file:
            line = line.strip('\n')
            words = [getNormalWord(word) for word in line.split() if word]
            word_ids = [vocabulary.id(word) for word in words]
            sent_length = len(word_ids)
            for index, word_id in enumerate(word_ids):
                if word_id == unknown_id:
                    continue

                # Fast path: the window fits entirely inside the sentence.
                if index - half_window >= 0 and index + half_window < sent_length:
                    window = word_ids[index - half_window:index + half_window + 1]
                    if (window.count(unknown_id) + window.count(symbol_id)
                            + window.count(padding_id)) <= half_window:
                        sample_file.write(' '.join(str(wid) for wid in window) + '\n')
                    continue

                # The window overlaps a sentence edge: pad with PADDING_WORD ids.
                window = []
                if index - half_window < 0:
                    window.extend([padding_id] * (half_window - index))
                    window.extend(word_ids[:index + 1])
                else:
                    window.extend(word_ids[index - half_window:index + 1])

                if index + half_window >= sent_length:
                    window.extend(word_ids[index + 1:])
                    window.extend([padding_id] * (index + half_window - sent_length + 1))
                else:
                    window.extend(word_ids[index + 1:index + half_window + 1])

                # Keep the window only if real words dominate it.
                if (window.count(unknown_id) + window.count(symbol_id)
                        + window.count(padding_id)) <= half_window:
                    sample_file.write(' '.join(str(wid) for wid in window) + '\n')
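

def _window_demo():
    # Standalone illustration of the padding logic in build_samples, using
    # made-up ids (hypothetical; independent of the project's config and
    # vocabulary). With half_window = 2 every position gets a 5-id window.
    word_ids, padding_id, half_window = [7, 8, 9], 0, 2
    n = len(word_ids)
    for index in range(n):
        left = max(0, index - half_window)
        window = ([padding_id] * (half_window - index)
                  + word_ids[left:min(n, index + half_window + 1)]
                  + [padding_id] * (index + half_window - n + 1))
        print(window)
    # Prints [0, 0, 7, 8, 9], then [0, 7, 8, 9, 0], then [7, 8, 9, 0, 0].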