def build_samples():
    """Generate fixed-size context-window training samples.

    Reads ``config.TRAIN_FILE`` line by line, maps each token to a
    vocabulary id, and for every non-unknown center word emits one window
    of ``WINDOW_SIZE + 1`` ids (padded with the PADDING_WORD id where the
    window runs past a sentence boundary). Windows containing more than
    ``half_window`` noise ids (unknown / symbol / padding) are discarded.
    Each kept window is written to ``config.SAMPLE_FILE`` as one line of
    space-separated ids.
    """
    if not vocabulary.length():
        vocabulary.load_vocabulary()
    # `//` keeps half_window an int; the original `/` produces a float in
    # Python 3, which breaks range() and list slicing below.
    half_window = config.WINDOW_SIZE // 2
    # Hoisted out of the per-line loop: these ids never change.
    padding_id = vocabulary.id(config.PADDING_WORD)
    unknown_id = vocabulary.id(config.UNKNOWN_WORD)
    symbol_id = vocabulary.id(config.SYMBOL_WORD)
    noise_ids = (unknown_id, symbol_id, padding_id)
    # `with` guarantees both handles are closed even if a read/write fails.
    with open(config.SAMPLE_FILE, 'w') as sample_file, \
            open(config.TRAIN_FILE, 'r') as train_file:
        for line in train_file:
            line = line.strip('\n')
            words = [getNormalWord(word) for word in line.split() if word]
            word_ids = [vocabulary.id(word) for word in words]
            sent_length = len(word_ids)
            for index, word_id in enumerate(word_ids):
                # Never center a window on an unknown word.
                if word_id == unknown_id:
                    continue
                left = index - half_window
                right = index + half_window + 1
                # Pad on whichever side(s) the window overruns the sentence;
                # the middle slice is clamped to valid indices.
                window = ([padding_id] * max(0, -left)
                          + word_ids[max(0, left):min(sent_length, right)]
                          + [padding_id] * max(0, right - sent_length))
                # Discard windows dominated by noise tokens.
                noise = sum(window.count(nid) for nid in noise_ids)
                if noise <= half_window:
                    sample_file.write(' '.join(str(wid) for wid in window) + '\n')
def build_samples():
    """Write context-window samples for the training corpus.

    For every sentence in ``config.TRAIN_FILE``, each non-unknown token
    becomes the center of a window of ``WINDOW_SIZE + 1`` vocabulary ids,
    left/right padded with the PADDING_WORD id at sentence edges. A window
    is kept only if its count of noise ids (unknown, symbol, padding) does
    not exceed half the window size; kept windows are written one per line
    (space-separated ids) to ``config.SAMPLE_FILE``.
    """
    if not vocabulary.length():
        vocabulary.load_vocabulary()
    # Integer division is required: with `/` (true division in Python 3)
    # half_window would be a float and range()/slicing below would raise.
    half_window = config.WINDOW_SIZE // 2
    # Look these up once instead of once per corpus line.
    padding_id = vocabulary.id(config.PADDING_WORD)
    unknown_id = vocabulary.id(config.UNKNOWN_WORD)
    symbol_id = vocabulary.id(config.SYMBOL_WORD)

    def noise_count(window):
        # Number of uninformative ids in the window.
        return (window.count(unknown_id)
                + window.count(symbol_id)
                + window.count(padding_id))

    # Context managers close both files even on an exception mid-loop.
    with open(config.TRAIN_FILE, 'r') as train_file, \
            open(config.SAMPLE_FILE, 'w') as sample_file:
        for line in train_file:
            words = [getNormalWord(w) for w in line.strip('\n').split() if w]
            word_ids = [vocabulary.id(w) for w in words]
            sent_length = len(word_ids)
            for index, word_id in enumerate(word_ids):
                if word_id == unknown_id:
                    # Unknown words are never used as window centers.
                    continue
                start = index - half_window
                stop = index + half_window + 1
                # Clamp the slice to the sentence and pad the overrun.
                window = ([padding_id] * max(0, -start)
                          + word_ids[max(0, start):min(stop, sent_length)]
                          + [padding_id] * max(0, stop - sent_length))
                if noise_count(window) <= half_window:
                    sample_file.write(' '.join(str(wid) for wid in window) + '\n')
def save_word2vec_format(self, fname, binary=False):
    """Store the input-hidden weight matrix in the format used by the
    original C word2vec tool, for compatibility.

    Args:
        fname: destination path.
        binary: if True, write each vector as raw little-endian bytes
            (word2vec binary format); otherwise write "%f"-formatted text.

    Words are written most-frequent first, as the C tool expects.
    """
    # Lazy %-args: the message is only formatted if this level is enabled.
    logger.info("storing %sx%s projection weights into %s",
                self.parameters.vocab_size, self.parameters.embedding_size, fname)
    # The file is opened in binary mode, so every write must be bytes —
    # the original passed str objects here, which raises TypeError on
    # Python 3. Text pieces are encoded as UTF-8 at the write boundary.
    with open(fname, 'wb') as fout:
        fout.write(("%s %s\n" % self.parameters.embeddings.shape).encode('utf8'))
        # Store in sorted order: most frequent words at the top.
        for word, count in sorted(vocabulary.words, key=lambda item: -item[1]):
            index = vocabulary.id(word)
            row = self.parameters.embeddings[index]
            if binary:
                # tobytes() supersedes the deprecated ndarray.tostring().
                fout.write(("%s " % word).encode('utf8') + row.tobytes() + b"\n")
            else:
                vector_text = ' '.join("%f" % val for val in row)
                fout.write(("%s %s\n" % (word, vector_text)).encode('utf8'))