def _call_scrape(lang_settings, config, tsv_path):
    """Scrapes one language into a TSV file, retrying on network errors.

    Args:
        lang_settings: per-language settings dict; "key" holds the
            language code used in log messages.
        config: a wikipron.Config describing what to scrape.
        tsv_path: path of the output TSV (one "word<TAB>pron" per line).

    Retries up to 10 times. Each attempt reopens the file in "w" mode, so
    partial output from a failed attempt is truncated and rewritten. If
    every retry fails, the TSV is removed so no truncated file is left.
    """
    for unused_retries in range(10):
        # Explicit UTF-8: IPA pronunciations must not depend on the
        # platform's default text encoding.
        with open(tsv_path, "w", encoding="utf-8") as source:
            try:
                for word, pron in wikipron.scrape(config):
                    print(f"{word}\t{pron}", file=source)
                return
            except (
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
            ):
                logging.info(
                    'Exception detected while scraping: "%s", "%s".',
                    lang_settings["key"],
                    tsv_path,
                )
                # Pauses execution for 10 min before the next retry.
                time.sleep(600)
    # Log and remove TSVs for languages that failed
    # to be scraped within 10 retries.
    logging.info(
        'Failed to scrape "%s" within 10 retries. %s',
        lang_settings["key"],
        lang_settings,
    )
    os.remove(tsv_path)
def _call_scrape(lang_settings, config, tsv_path):
    """Scrapes one language into a TSV file and returns the pair count.

    Args:
        lang_settings: per-language settings dict; "key" holds the
            language code used in log messages.
        config: a wikipron.Config describing what to scrape.
        tsv_path: path of the output TSV (one "word<TAB>pron" per line).

    Returns:
        The number of (word, pron) pairs written, or 0 if scraping failed
        within 10 retries.
    """
    for unused_retries in range(10):
        # Reset per attempt: "w" mode truncates any partial output.
        count = 0
        # Explicit UTF-8: IPA output must not depend on the platform's
        # default text encoding.
        with open(tsv_path, "w", encoding="utf-8") as source:
            try:
                for word, pron in wikipron.scrape(config):
                    count += 1
                    print(f"{word}\t{pron}", file=source)
                return count
            except (
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
            ):
                logger.info(
                    'Exception detected while scraping: "%s", "%s".',
                    lang_settings["key"],
                    tsv_path,
                )
                # Pauses execution for 10 min before the next retry.
                time.sleep(600)
    logger.info(
        'Failed to scrape "%s" within 10 retries. %s',
        lang_settings["key"],
        lang_settings,
    )
    return 0
def test_scrape():
    """A smoke test for scrape()."""
    n = 10  # number of word-pron pairs to scrape
    config = _config_factory()
    pairs = []
    for word, pron in scrape(config):
        pairs.append((word, pron))
        # Stop once we have exactly n pairs. (The previous
        # enumerate-based break condition pulled one extra item from the
        # scrape generator — a wasted network fetch — before stopping.)
        if len(pairs) >= n:
            break
    assert len(pairs) == n
    assert all(word and pron for (word, pron) in pairs)
def _call_scrape(
    lang_settings: Dict[str, str],
    config: wikipron.Config,
    tsv_path: str,
    # PEP 484: a None default must be Optional, not a bare FrozenSet.
    # Quoted so the annotation is never evaluated at def time and needs
    # no runtime typing.Optional import.
    phones_set: "Optional[FrozenSet[str]]" = None,
    tsv_filtered_path: str = "",
) -> None:
    """Scrapes one language into a TSV, retrying on network errors.

    Args:
        lang_settings: per-language settings dict; "key" holds the
            language code used in log messages.
        config: a wikipron.Config describing what to scrape.
        tsv_path: path of the main output TSV.
        phones_set: if given, also write a second, filtered TSV
            containing only the pairs accepted by _filter().
        tsv_filtered_path: path of the filtered TSV (used only when
            phones_set is given).

    Retries up to 10 times; "w" mode truncates partial output on each
    attempt. If every retry fails, both TSVs are removed.
    """
    for unused_retries in range(10):
        with open(tsv_path, "w", encoding="utf-8") as source:
            try:
                scrape_results = wikipron.scrape(config)
                # Given phones, opens up a second tsv for scraping.
                if phones_set:
                    with open(
                        tsv_filtered_path, "w", encoding="utf-8"
                    ) as source_filtered:
                        for word, pron in scrape_results:
                            line = f"{word}\t{pron}"
                            if _filter(word, pron, phones_set):
                                print(line, file=source_filtered)
                            print(line, file=source)
                else:
                    for word, pron in scrape_results:
                        print(f"{word}\t{pron}", file=source)
                return
            except (
                requests.exceptions.Timeout,
                requests.exceptions.ConnectionError,
            ):
                logging.info(
                    "Exception detected while scraping: %r, %r, %r",
                    lang_settings["key"],
                    tsv_path,
                    tsv_filtered_path,
                )
                # Pauses execution for 10 min before the next retry.
                time.sleep(600)
    # Log and remove TSVs for languages that failed.
    logging.info(
        "Failed to scrape %r with 10 retries (%s)",
        lang_settings["key"],
        lang_settings,
    )
    # Checks if second TSV was opened; removing "" raises OSError, which
    # is deliberately ignored.
    try:
        os.remove(tsv_filtered_path)
    except OSError:
        pass
    os.remove(tsv_path)
def _call_scrape(
    lang_settings: Dict[str, str],
    config: wikipron.Config,
    tsv_path: str,
    # PEP 484: a None default must be Optional, not a bare FrozenSet.
    # Quoted so the annotation is never evaluated at def time and needs
    # no runtime typing.Optional import.
    phones_set: "Optional[FrozenSet[str]]" = None,
    tsv_filtered_path: str = "",
) -> None:
    """Scrapes one language into a TSV file (single attempt, no retry).

    Args:
        lang_settings: kept for signature parity with the retrying
            variant; not read by this implementation.
        config: a wikipron.Config describing what to scrape.
        tsv_path: path of the main output TSV.
        phones_set: if given, also write a second, filtered TSV
            containing only the pairs accepted by _filter().
        tsv_filtered_path: path of the filtered TSV (used only when
            phones_set is given).
    """
    with open(tsv_path, "w", encoding="utf-8") as source:
        scrape_results = wikipron.scrape(config)
        # Given phones, opens up a second TSV for scraping.
        if phones_set:
            with open(
                tsv_filtered_path, "w", encoding="utf-8"
            ) as source_filtered:
                for word, pron in scrape_results:
                    line = f"{word}\t{pron}"
                    if _filter(word, pron, phones_set):
                        print(line, file=source_filtered)
                    print(line, file=source)
        else:
            for word, pron in scrape_results:
                print(f"{word}\t{pron}", file=source)
# -*- coding: utf-8 -*-
"""
Scrape English word/pronunciation pairs from Wiktionary via wikipron
and dump them to english.json.

Created on Sat May 30 14:54:44 2020

@author: qtckp
"""

import wikipron
import os
import json

dic = {}

config = wikipron.Config(key="en")

t = 0
for word, pron in wikipron.scrape(config):
    t += 1
    if t % 100 == 0:
        # Progress heartbeat every 100 scraped pairs.
        print(f'{t} {word} {pron}')
    # Skip single-character entries (letters, symbols).
    # NOTE: a word scraped more than once keeps only its last pron —
    # earlier pronunciations are overwritten.
    if len(word) > 1:
        dic[word] = pron

# Explicit UTF-8 + ensure_ascii=False keep the IPA readable in the file
# (otherwise every non-ASCII character is \uXXXX-escaped and the byte
# encoding depends on the platform default). The JSON parses identically.
with open("english.json", "w", encoding="utf-8") as write_file:
    json.dump(dic, write_file, indent=4, ensure_ascii=False)