def __next__(self) -> List[str]:
    """Return the next sentence's words for the word2vec training corpus.

    Raises StopIteration once the underlying sentence source is exhausted.
    """
    sentence = self.__next_sentence()
    if sentence is None:
        raise StopIteration

    sentence = sentence.strip()
    if not sentence:
        return []

    # Tokenize with a language-specific splitter when identification is
    # reliable; otherwise fall back to the default language's splitter.
    language = None
    if identification_would_be_reliable(sentence):
        detected_code = language_code_for_text(sentence)
        language = LanguageFactory.language_for_code(detected_code)
    if language is None:
        language = LanguageFactory.default_language()

    words = language.split_sentence_to_words(sentence)
    return words if words else []
def __next__(self) -> List[str]:
    """Return the next sentence's words for the word2vec training corpus.

    Reads lines from the copy-through source, logging progress every 1000
    sentences. Raises StopIteration when the source is exhausted (and
    finalizes the copy target exactly once).
    """
    if self.__copy_to is None:
        raise StopIteration

    sentence = self.__copy_to.get_line()
    if sentence is None:
        # Source drained: close out the copy target and stop iterating.
        self.__copy_to.end()
        self.__copy_to = None
        raise StopIteration

    sentence = sentence.strip()

    self.__sentence_counter += 1
    if self.__sentence_counter % 1000 == 0:
        log.info("Feeding sentence %d..." % self.__sentence_counter)

    if not sentence:
        return []

    # Prefer a language-specific tokenizer when identification is reliable;
    # otherwise use the default language's tokenizer.
    language = None
    if identification_would_be_reliable(sentence):
        detected_code = language_code_for_text(sentence)
        language = LanguageFactory.language_for_code(detected_code)
    if language is None:
        language = LanguageFactory.default_language()

    words = language.split_sentence_to_words(sentence)
    return words or []
def test_language_is_enabled(self):
    """Enabled language codes report True; blank / unknown codes report False."""
    for enabled_code in ('en', 'lt'):
        assert LanguageFactory.language_is_enabled(enabled_code) is True
    # noinspection PyTypeChecker
    for bogus_code in (None, '', 'xx'):
        assert LanguageFactory.language_is_enabled(bogus_code) is False
def test_language_for_code(self):
    """Known codes yield their language instance; blank / unknown codes yield None."""
    expected_classes = {'en': EnglishLanguage, 'lt': LithuanianLanguage}
    for code, language_class in expected_classes.items():
        assert isinstance(LanguageFactory.language_for_code(code), language_class)
    # noinspection PyTypeChecker
    for bad_code in (None, '', 'xx'):
        assert LanguageFactory.language_for_code(bad_code) is None
def test_language_code_for_text():
    """Blank / None text maps to ''; each sample sentence maps to its own code."""
    assert language_code_for_text(text='') == ''
    # noinspection PyTypeChecker
    assert language_code_for_text(text=None) == ''
    for code in LanguageFactory.enabled_languages():
        sample = LanguageFactory.language_for_code(code).sample_sentence()
        assert language_code_for_text(text=sample) == code
def test_identification_would_be_reliable():
    """Blank input is never reliable; every enabled language's sample is."""
    # noinspection PyTypeChecker
    for blank_input in ('', None):
        assert identification_would_be_reliable(text=blank_input) is False
    for code in LanguageFactory.enabled_languages():
        sample = LanguageFactory.language_for_code(code).sample_sentence()
        assert identification_would_be_reliable(text=sample)
def test_identification_would_be_reliable():
    """Identification is unreliable for empty input, reliable for sample text."""
    assert identification_would_be_reliable(text='') is False
    # noinspection PyTypeChecker
    assert identification_would_be_reliable(text=None) is False
    sample_sentences = [
        LanguageFactory.language_for_code(code).sample_sentence()
        for code in LanguageFactory.enabled_languages()
    ]
    assert all(identification_would_be_reliable(text=s) for s in sample_sentences)
def test_language_code_for_text():
    """Empty / None text detects as ''; sample sentences round-trip to their code."""
    # noinspection PyTypeChecker
    for empty_text in ('', None):
        assert language_code_for_text(text=empty_text) == ''
    for expected_code in LanguageFactory.enabled_languages():
        language = LanguageFactory.language_for_code(expected_code)
        detected_code = language_code_for_text(text=language.sample_sentence())
        assert detected_code == expected_code
def _get_sentences_from_story_text(story_text: str, story_lang: str) -> List[str]:
    """Split story text into individual sentences.

    :param story_text: Story text to split.
    :param story_lang: Language code of the story; unknown codes fall back to
        the default language.
    :return: List of sentences.
    """
    story_text = decode_object_from_bytes_if_needed(story_text)
    story_lang = decode_object_from_bytes_if_needed(story_lang)

    # Unknown / unsupported code -> use the default language's tokenizer
    language = LanguageFactory.language_for_code(story_lang) or LanguageFactory.default_language()

    return language.split_text_to_sentences(story_text)
def _get_sentences_from_content(story_text: str) -> List[str]:
    """Given raw HTML content, extract the content and parse it into sentences.

    :param story_text: Story content to be split into sentences.
    :return: List of sentences.
    """
    story_text = decode_object_from_bytes_if_needed(story_text)

    lang = LanguageFactory.language_for_code(__AP_LANGUAGE_CODE)
    if lang is None:
        # Guard against a misconfigured module-level language code: fall back
        # to the default language (consistent with
        # _get_sentences_from_story_text()) instead of crashing with
        # AttributeError on None.
        lang = LanguageFactory.default_language()

    sentences = lang.split_text_to_sentences(text=story_text)
    return sentences
def test_enabled_languages(self):
    """Known codes are enabled; a made-up code is not."""
    for code, should_be_enabled in (('lt', True), ('en', True), ('xx', False)):
        assert (code in LanguageFactory.enabled_languages()) is should_be_enabled
def test_default_language_code(self):
    """English ('en') is the factory-wide default language code."""
    default_code = LanguageFactory.default_language_code()
    assert default_code == 'en'
def add_content_to_test_story(db: DatabaseHandler, story: dict, feed: dict) -> dict:
    """Adds a 'download' and a 'content' field to each story in the test story stack.

    Stores the content in the download store. Uses the story->{ content } field if present or otherwise generates
    the content using _get_test_content().

    Also inserts download_texts and story_sentences rows for the story and marks it as processed.

    :param db: Database handler.
    :param story: Story row (dict) to attach content to; mutated and returned.
    :param feed: Feed row (dict) the story's download is associated with.
    :return: The same story dict, with 'download', 'content' and 'download_text' fields added.
    :raises McAddContentToTestStoryException: If the inserted download_texts row cannot be read back.
    """
    story = decode_object_from_bytes_if_needed(story)
    feed = decode_object_from_bytes_if_needed(feed)

    # Prefer caller-supplied content; otherwise generate test content
    content_language_code = None
    if 'content' in story:
        content = story['content']
        content_language_code = language_code_for_text(content)
    else:
        content = _get_test_content()

    # If language code was undetermined, or if we're using Latin test content
    if not content_language_code:
        content_language_code = 'en'

    # Content is being attached directly, so the story can no longer be "full text RSS"
    if story.get('full_text_rss', None):
        story['full_text_rss'] = False
        db.update_by_id(
            table='stories',
            object_id=story['stories_id'],
            update_hash={
                'full_text_rss': False,
                'language': content_language_code,
            },
        )

    host = get_url_host(feed['url'])

    # Create the "content" download row and store the raw content in the download store
    download = db.create(table='downloads', insert_hash={
        'feeds_id': feed['feeds_id'],
        'url': story['url'],
        'host': host,
        'type': 'content',
        'sequence': 1,
        'state': 'fetching',
        'priority': 1,
        'extracted': True,
        'stories_id': story['stories_id'],
    })
    download = store_content(db=db, download=download, content=content)

    # Strip HTML to get the "extracted" text
    extracted_content = html_strip(content)

    story['download'] = download
    story['content'] = extracted_content

    db.query(
        """
        INSERT INTO download_texts (downloads_id, download_text, download_text_length)
        VALUES (%(downloads_id)s, %(download_text)s, CHAR_LENGTH(%(download_text)s))
        """, {
            'downloads_id': download['downloads_id'],
            'download_text': extracted_content,
        })

    lang = LanguageFactory.language_for_code(content_language_code)
    assert lang, f"Language is None for code {content_language_code}"

    # Split the extracted text into sentences and insert one story_sentences row per sentence;
    # per-sentence language detection falls back to 'en' when undetermined
    sentences = lang.split_text_to_sentences(extracted_content)
    sentence_number = 1
    for sentence in sentences:
        db.insert(table='story_sentences', insert_hash={
            'sentence': sentence,
            'language': language_code_for_text(sentence) or 'en',
            'sentence_number': sentence_number,
            'stories_id': story['stories_id'],
            'media_id': story['media_id'],
            'publish_date': story['publish_date'],
        })
        sentence_number += 1

    mark_as_processed(db=db, stories_id=story['stories_id'])

    # Read back the inserted download_texts row to confirm it exists
    story['download_text'] = db.query(
        """
        SELECT *
        FROM download_texts
        WHERE downloads_id = %(downloads_id)s
        """, {
            'downloads_id': download['downloads_id']
        }).hash()
    if not story['download_text']:
        raise McAddContentToTestStoryException("Unable to find download_text")

    return story
def test_default_language(self):
    """The factory's default language object is the English implementation."""
    default_language = LanguageFactory.default_language()
    assert isinstance(default_language, EnglishLanguage)
def test_language_code_for_text_uppercase():
    """Detection is case-insensitive: uppercased samples still map to their code."""
    for code in LanguageFactory.enabled_languages():
        shouted_sample = LanguageFactory.language_for_code(code).sample_sentence().upper()
        assert language_code_for_text(text=shouted_sample) == code
def test_language_code_for_text_uppercase():
    """Uppercasing a sample sentence must not change the detected language."""
    for expected_code in LanguageFactory.enabled_languages():
        language = LanguageFactory.language_for_code(expected_code)
        sample_upper = language.sample_sentence().upper()
        assert language_code_for_text(text=sample_upper) == expected_code