def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Generate word2vec model for a given snapshot.

    :param snapshots_id: ID of the snapshot to train the model on.
    :raises McWord2vecGenerateSnapshotModelException: if snapshots_id is None.
    """

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    # Guard against a missing ID before coercing to int
    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException("'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    # Lazy %-style logging args instead of eager string interpolation
    log.info("Generating word2vec model for snapshot %d...", snapshots_id)

    sentence_iterator = SnapshotSentenceIterator(db=db, snapshots_id=snapshots_id)
    model_store = SnapshotDatabaseModelStore(db=db, snapshots_id=snapshots_id)

    train_word2vec_model(
        sentence_iterator=sentence_iterator,
        model_store=model_store,
    )

    log.info("Finished generating word2vec model for snapshot %d.", snapshots_id)
def test_train_word2vec_model(self):
    """Train a model on the test snapshot and verify the stored blob round-trips through gensim."""
    corpus = SnapshotSentenceIterator(
        db=self.db,
        topics_id=self.topics_id,
        snapshots_id=self.snapshots_id,
        stories_id_chunk_size=self.TEST_STORIES_ID_CHUNK_SIZE,
    )
    store = SnapshotDatabaseModelStore(
        db=self.db,
        topics_id=self.topics_id,
        snapshots_id=self.snapshots_id,
    )

    models_id = train_word2vec_model(sentence_iterator=corpus, model_store=store)

    # The trained model must come back from the store as a raw bytes blob
    raw_model = store.read_model(models_id=models_id)
    assert raw_model is not None
    assert isinstance(raw_model, bytes)

    # Round-trip: persist the blob to disk and reload it with gensim
    scratch_dir = tempfile.mkdtemp()
    model_path = os.path.join(scratch_dir, 'word2vec.pickle')
    with open(model_path, mode='wb') as model_file:
        model_file.write(raw_model)

    vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    assert vectors is not None

    # Words from the test corpus must be present; an unseen word must not be
    assert vectors['story'] is not None
    assert vectors['sentence'] is not None
    assert 'badger' not in vectors

    shutil.rmtree(scratch_dir)
def test_snapshot_sentence_iterator_nonexistent_snapshot(self):
    """Constructing a sentence iterator for a missing snapshot must raise."""
    nonexistent_snapshots_id = 123456

    with pytest.raises(McWord2vecException):
        SnapshotSentenceIterator(
            db=self.db,
            snapshots_id=nonexistent_snapshots_id,
            stories_id_chunk_size=self.TEST_STORIES_ID_CHUNK_SIZE,
        )
def run_word2vec_generate_snapshot_model(snapshots_id: int) -> None:
    """Generate word2vec model for a given snapshot.

    Looks up the snapshot's topic, then trains and stores a word2vec model
    built from that snapshot's sentences.

    :param snapshots_id: ID of the snapshot to train the model on.
    :raises McWord2vecGenerateSnapshotModelException: if snapshots_id is None
        or no snapshot with that ID exists.
    """

    # MC_REWRITE_TO_PYTHON: remove after Python rewrite
    if isinstance(snapshots_id, bytes):
        snapshots_id = decode_object_from_bytes_if_needed(snapshots_id)

    if snapshots_id is None:
        raise McWord2vecGenerateSnapshotModelException("'snapshots_id' is None.")

    snapshots_id = int(snapshots_id)

    db = connect_to_db()

    # FIXME might be more efficient to pass topics_id as a parameter
    topics_id_rows = db.query(
        """
        SELECT topics_id
        FROM snapshots
        WHERE snapshots_id = %(snapshots_id)s
        """, {
            'snapshots_id': snapshots_id
        }).flat()

    # Fail with a meaningful error instead of an IndexError when the snapshot
    # does not exist (e.g. a stale or mistyped job argument)
    if not topics_id_rows:
        raise McWord2vecGenerateSnapshotModelException(
            f"Snapshot {snapshots_id} was not found."
        )

    topics_id = topics_id_rows[0]

    log.info(
        f"Generating word2vec model for topic {topics_id}, snapshot {snapshots_id}..."
    )

    sentence_iterator = SnapshotSentenceIterator(
        db=db,
        topics_id=topics_id,
        snapshots_id=snapshots_id,
    )
    model_store = SnapshotDatabaseModelStore(
        db=db,
        topics_id=topics_id,
        snapshots_id=snapshots_id,
    )
    train_word2vec_model(
        sentence_iterator=sentence_iterator,
        model_store=model_store,
    )

    log.info(
        f"Finished generating word2vec model for topic {topics_id}, snapshot {snapshots_id}."
    )
def test_snapshot_sentence_iterator(self):
    """Ensure that all of the sentences get returned"""
    iterator = SnapshotSentenceIterator(
        db=self.db,
        snapshots_id=self.snapshots_id,
        stories_id_chunk_size=self.TEST_STORIES_ID_CHUNK_SIZE,
    )

    seen_sentences = set()
    for words in iterator:
        assert words, "Sentence words should be set."
        joined = ' '.join(words)
        assert joined not in seen_sentences, "Every sentence should be unique."
        seen_sentences.add(joined)

    # Because each sentence is asserted unique before being recorded, the size
    # of the seen-set equals the number of sentences the iterator yielded
    expected_count = self.TEST_STORY_COUNT * self.TEST_SENTENCE_PER_STORY_COUNT
    assert len(seen_sentences) == expected_count, \
        "All of the sentences should have been returned."