def build(src: str, **_) -> str:
    """Scrape one or more sources and serialize the collected words.

    Args:
        src: either a single URL, or the path of a file containing one
            URL per line.
        **_: extra keyword arguments are accepted and ignored.

    Returns:
        The accumulated word store encoded as JSON.
    """
    # An existing file is treated as a list of sources; anything else is
    # treated as a single source.
    candidate_path = os.path.abspath(os.path.expanduser(src))
    if os.path.isfile(candidate_path):
        targets = iter_sources(src)
    else:
        targets = [src]

    # Fold every scraped store into a single accumulator.
    collected = WordStore()
    for target in targets:
        collected += scrape(target)

    return collected.to_json()
def test_build_with_file(wordstore):
    """build() on a file of sources scrapes every line and merges the results."""
    rodents = wordstore(['squirrel', 'rabbit', 'capybara'])
    mustelinae = wordstore(['wolverine', 'otter', 'mink'])
    birds = wordstore(['osprey', 'pigeon', 'wren'])
    stores = [rodents, mustelinae, birds]
    gold = rodents + mustelinae + birds
    lines = ['first', 'second', 'third']

    # Each call to the patched scrape() hands back the next prepared store.
    with mock.patch('passphrase.build.scrape',
                    side_effect=stores) as mock_scrape:
        with tempfile.NamedTemporaryFile('w') as f:
            # Create sources: one entry per line.
            f.writelines(line + '\n' for line in lines)
            # seek() flushes the buffered lines so build() can read the
            # file through its name.
            f.seek(0)
            json_str = build(f.name)

    rebuilt = WordStore.from_json(json_str)
    assert rebuilt.store == gold.store
    assert mock_scrape.call_count == len(stores)
def test_build_with_url(wordstore):
    """build() on a single URL returns the JSON of whatever scrape() yields."""
    gold_wordstore = wordstore(['llama', 'rabbit', 'capybara'])
    patcher = mock.patch('passphrase.build.scrape',
                         return_value=gold_wordstore)
    with patcher:
        words_json = build('http://nowhere.in.particular')

    rebuilt = WordStore.from_json(words_json)
    assert rebuilt.store == gold_wordstore.store
def scrape_html(html: str) -> WordStore:
    """Scrape HTML of its text.

    Every string found under the document's ``<body>`` is split on
    whitespace and each resulting token is added to the store.

    Args:
        html: HTML markup to extract words from.

    Returns:
        words in HTML; an empty store when the markup has no ``<body>``.
    """
    words = WordStore()
    soup = BeautifulSoup(html, 'html.parser')

    # Check for a missing body explicitly instead of catching
    # AttributeError around the whole loop: a blanket except would also
    # hide unrelated AttributeErrors raised while collecting words and
    # mislabel them as "no body".
    body = soup.body
    if body is None:
        _logger.info('HTML has no body.')
        return words

    for text in body.strings:
        for word in text.split():
            words.add(word)
    return words
def test_add_duplicate():
    """In-place addition of a store sharing a word keeps each word once."""
    gold = ['left', 'center', 'right']
    more_gold = ['surround', 'center']

    words = WordStore()
    for word in gold:
        words.add(word)

    # Populate the *second* store; filling ``words`` here would leave
    # ``more_words`` empty and the ``+=`` below would merge nothing.
    more_words = WordStore()
    for word in more_gold:
        more_words.add(word)

    words += more_words

    # 'center' appears in both lists and must only be counted once.
    assert len(list(words.iter_words())) == len(frozenset(gold + more_gold))
    for word in words.iter_words():
        assert word in gold or word in more_gold
def test_add_nonempty_to_nonempty():
    """In-place addition of two disjoint non-empty stores keeps all words."""
    gold = ['left', 'center', 'right']
    more_gold = ['surround']

    words = WordStore()
    for word in gold:
        words.add(word)

    # Populate the *second* store; filling ``words`` here would leave
    # ``more_words`` empty and the ``+=`` below would merge nothing.
    more_words = WordStore()
    for word in more_gold:
        more_words.add(word)

    words += more_words

    assert len(list(words.iter_words())) == len(gold) + len(more_gold)
    for word in words.iter_words():
        assert word in gold or word in more_gold
def test_add_nonempty_to_empty():
    """In-place addition into an empty store yields all of the added words."""
    gold = ['alpha', 'bravo', 'charlie']

    source_store = WordStore()
    for entry in gold:
        source_store.add(entry)

    target_store = WordStore()
    target_store += source_store

    merged_words = list(target_store.iter_words())
    assert len(merged_words) == len(gold)
def load_database(src: str) -> WordStore:
    """Read a JSON-serialized word store from disk.

    Args:
        src: file path; ``~`` is expanded and the path is made absolute
            before opening.

    Returns:
        word store decoded from the file's contents.
    """
    resolved_path = os.path.abspath(os.path.expanduser(src))
    with open(resolved_path, 'r') as handle:
        payload = handle.read()
    return WordStore.from_json(payload)
def scrape(url: str, *, timeout: float = 10.0) -> WordStore:
    """Scrape HTML resource of its text.

    Args:
        url: HTML resource to scrape.
        timeout: seconds to wait for the server before giving up. Without
            a timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        words in resource; an empty store when the server answers with an
        HTTP error status.

    Raises:
        requests.exceptions.RequestException: for failures other than an
            HTTP error status (e.g. connection errors, timeouts, invalid
            URLs); these are not handled here and propagate to the caller.
    """
    r = requests.get(url, timeout=timeout)
    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError:
        # Best effort: a bad status is logged and contributes no words.
        _logger.warning('Could not scrape %s', url)
        words = WordStore()
    else:
        words = scrape_html(r.text)
    return words
def test_empty_plus_empty():
    """Adding two empty stores gives a store equal to a fresh empty one."""
    left = WordStore()
    right = WordStore()
    total = left + right
    expected = WordStore()
    assert total.store == expected.store
def test_populated(wordstore):
    """A populated store survives a to_json / from_json round trip."""
    original = wordstore(['something', 'in', 'the', 'way', 'she', 'moves'])
    round_tripped = WordStore.from_json(original.to_json())
    assert round_tripped.store == original.store
def test_add_empty():
    """In-place addition of an empty store to an empty store adds no words."""
    accumulator = WordStore()
    nothing = WordStore()
    accumulator += nothing
    assert list(accumulator.iter_words()) == []
def test_add():
    """Each add() grows the store by one word and keeps earlier words."""
    store = WordStore()
    added_so_far = []
    for new_word in ('albatross', 'ganet'):
        store.add(new_word)
        added_so_far.append(new_word)
        # The count tracks the number of adds, and every word added so
        # far is still reported by iter_words().
        assert len(list(store.iter_words())) == len(added_so_far)
        for earlier in added_so_far:
            assert earlier in store.iter_words()
def build_store(words):
    """Create a WordStore and add every item of ``words`` to it."""
    populated = WordStore()
    for entry in words:
        populated.add(entry)
    return populated
def test_empty_plus_nonempty(wordstore):
    """An empty store plus a populated one equals the populated one."""
    populated = wordstore(['a', 'b', 'c'])
    blank = WordStore()
    total = blank + populated
    assert total.store == populated.store
def test_nonempty_plus_empty(wordstore):
    """A populated store plus an empty one equals the populated one."""
    populated = wordstore(['alpha', 'bravo', 'charlie'])
    blank = WordStore()
    total = populated + blank
    assert total.store == populated.store
def test_empty():
    """An empty store survives a to_json / from_json round trip."""
    original = WordStore()
    round_tripped = WordStore.from_json(original.to_json())
    assert round_tripped.store == original.store