Esempio n. 1
0
def build_samples():
    if not vocabulary.length():
        vocabulary.load_vocabulary()

    sample_file = open(config.SAMPLE_FILE, 'w')
    for line in open(config.TRAIN_FILE, 'r'):
        line = line.strip('\n')
        words = [getNormalWord(word) for word in line.split() if word]
        word_ids = [vocabulary.id(word) for word in words]
        sent_length = len(word_ids)
        half_window = config.WINDOW_SIZE / 2
        window = []
        padding_id = vocabulary.id(config.PADDING_WORD)
        unknown_id = vocabulary.id(config.UNKNOWN_WORD)
        symbol_id = vocabulary.id(config.SYMBOL_WORD)
        for index, word_id in enumerate(word_ids):
            if word_id == unknown_id:
                continue
            if index - half_window >= 0 and index + half_window < sent_length:
                window = word_ids[index - half_window:index + half_window + 1]
                if window.count(unknown_id) + window.count(
                        symbol_id) + window.count(padding_id) <= half_window:
                    sample_file.write(' '.join([str(id)
                                                for id in window]) + '\n')
                window = []
                continue

            if index - half_window < 0:
                for i in range(half_window - index):
                    window.append(padding_id)
                window.extend(word_ids[:index + 1])
            else:
                window.extend(word_ids[index - half_window:index + 1])

            if index + half_window >= sent_length:
                window.extend(word_ids[index + 1:])
                for i in range(index + half_window - sent_length + 1):
                    window.append(padding_id)
            else:
                window.extend(word_ids[index + 1:index + half_window + 1])

            if window.count(unknown_id) + window.count(
                    symbol_id) + window.count(padding_id) <= half_window:
                sample_file.write(' '.join([str(id) for id in window]) + '\n')
            window = []

    sample_file.close()
Esempio n. 2
0
def build_samples():
    if not vocabulary.length():
        vocabulary.load_vocabulary()

    sample_file = open(config.SAMPLE_FILE, 'w')
    for line in open(config.TRAIN_FILE, 'r'):
        line = line.strip('\n')
        words = [getNormalWord(word) for word in line.split() if word]
        word_ids = [vocabulary.id(word) for word in words]
        sent_length = len(word_ids)
        half_window = config.WINDOW_SIZE / 2
        window = []
        padding_id = vocabulary.id(config.PADDING_WORD)
        unknown_id = vocabulary.id(config.UNKNOWN_WORD)
        symbol_id = vocabulary.id(config.SYMBOL_WORD)
        for index, word_id in enumerate(word_ids):
            if word_id == unknown_id:
                continue
            if index - half_window >= 0 and index + half_window < sent_length:
                window = word_ids[index - half_window : index + half_window + 1]
                if window.count(unknown_id) + window.count(symbol_id) + window.count(padding_id) <= half_window:
                    sample_file.write(' '.join([str(id) for id in window]) + '\n')
                window = []
                continue

            if index - half_window < 0:
                for i in range(half_window - index):
                    window.append(padding_id)
                window.extend(word_ids[:index + 1])
            else:
                window.extend(word_ids[index - half_window : index + 1])

            if index + half_window >= sent_length:
                window.extend(word_ids[index + 1:])
                for i in range(index + half_window - sent_length + 1):
                    window.append(padding_id)
            else:
                window.extend(word_ids[index + 1 : index + half_window + 1])

            if window.count(unknown_id) + window.count(symbol_id) + window.count(padding_id) <= half_window:
                sample_file.write(' '.join([str(id) for id in window]) + '\n')
            window = []

    sample_file.close()
Esempio n. 3
0
    def save_word2vec_format(self, fname, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        """
        logger.info("storing %sx%s projection weights into %s" % (self.parameters.vocab_size, self.parameters.embedding_size, fname))

        with open(fname, 'wb') as fout:
            fout.write("%s %s\n" % self.parameters.embeddings.shape)
            # store in sorted order: most frequent words at the top
            for word, count in sorted(vocabulary.words, key=lambda item: -item[1]):
                # word = utils.to_utf8(word)  # always store in utf8
                index = vocabulary.id(word)
                row = self.parameters.embeddings[index]
                if binary:
                    fout.write("%s %s\n" % (word, row.tostring()))
                else:
                    fout.write("%s %s\n" % (word, ' '.join("%f" % val for val in row)))
Esempio n. 4
0
    def save_word2vec_format(self, fname, binary=False):
        """
        Store the input-hidden weight matrix in the same format used by the original
        C word2vec-tool, for compatibility.

        """
        logger.info("storing %sx%s projection weights into %s" %
                    (self.parameters.vocab_size,
                     self.parameters.embedding_size, fname))

        with open(fname, 'wb') as fout:
            fout.write("%s %s\n" % self.parameters.embeddings.shape)
            # store in sorted order: most frequent words at the top
            for word, count in sorted(vocabulary.words,
                                      key=lambda item: -item[1]):
                # word = utils.to_utf8(word)  # always store in utf8
                index = vocabulary.id(word)
                row = self.parameters.embeddings[index]
                if binary:
                    fout.write("%s %s\n" % (word, row.tostring()))
                else:
                    fout.write("%s %s\n" % (word, ' '.join("%f" % val
                                                           for val in row)))