def yield_jpn_pron(
    request: requests_html, config: "Config"
) -> "Iterator[Pron]":
    """Yields the pronunciations under the first matching element.

    For simplicity only the first transcription element on the page is
    consulted; some words have no transcription at all, in which case
    nothing is yielded.
    """
    element = request.html.xpath(config.pron_xpath_selector, first=True)
    if not element:
        return
    yield from yield_pron(element, IPA_XPATH_SELECTOR, config)
def _yield_jpn_lower_pron(request: requests.Response, config: "Config", word_target: str, heading: str) -> "Iterator[Pron]":
    """Yields one list of pronunciations per "lower" pronunciation element.

    For each pronunciation element associated with the target word, the
    pronunciations found there are collected and the first available
    "upper" pronunciation list (one filtered by a preceding second <ul>)
    is appended. Each combined list is yielded as a unit so prons stay
    attached to the word they were scraped with.
    """
    # second_ul is left empty here: no second-<ul> filter is applied to
    # the "lower" selector.
    pron_path = _PRON_XPATH_SELECTOR.format(
        word_to_work_from=word_target,
        heading=heading,
        second_ul="",
    )
    for pron_element in request.html.xpath(pron_path):
        prons = list(yield_pron(pron_element, IPA_XPATH_SELECTOR, config))
        # A fresh generator is created on every iteration, so next()
        # below always takes the first "upper" list it produces.
        upper_prons = _yield_jpn_upper_pron(request, config, word_target, heading)
        # There is a possible, though seemingly unlikely, undesirable side
        # effect of this approach. This could potentially match a second
        # "upper" pronunciation (most likely in a second Etymology) and
        # append it to the "lower" pronunciations yielded from the first
        # Etymology. Or append the "upper" pronunciation of the first
        # Etymology to the "lower" pronunciation of the second Etymology, etc.
        # Fortunately entries with multiple <ul>'s and multiple
        # etymologies are exceedingly rare. I'm not sure I've seen any.
        try:
            prons += next(upper_prons)
        except StopIteration:
            # Did not find a second <ul>
            pass
        # Yielding here because we don't want to collect all pronunciation
        # entries on a page at the same time. Doing so would make it difficult
        # to connect words to their prons appropriately. We only want to grab
        # the prons that are linked to the word we have scraped.
        yield prons
def _yield_latin_pron(
    request: requests.Response, config: "Config", tag: str
) -> "Iterator[Pron]":
    """Yields pronunciations for the Latin section identified by tag.

    The top-level "Latin" section sits under an <h2>; any other tag is
    looked up under an <h3>. When a dialect restriction is configured,
    only pronunciation elements matching one of the requested dialect
    labels are selected; otherwise any element linked to the Latin
    pronunciation appendix is used.
    """
    if tag == "Latin":
        heading = "h2"
    else:
        heading = "h3"
    if not config.dialect:
        dialect_selector = (
            '[descendant::a[@title = "Appendix:Latin pronunciation"]]')
    else:
        # Build an XPath disjunction over the |-separated dialect labels.
        dialect_conditions = " or ".join(
            f'text() = "{d.strip()}"' for d in config.dialect.split("|")
        )
        dialect_selector = _PRON_WITH_DIALECT_XPATH_SELECTOR_TEMPLATE.format(
            dialects_text=dialect_conditions
        )
    selector = _PRON_XPATH_TEMPLATE.format(
        heading=heading, tag=tag, dialect_selector=dialect_selector
    )
    for element in request.html.xpath(selector):
        yield from yield_pron(element, IPA_XPATH_SELECTOR, config)
def _yield_jpn_upper_pron(
    request: requests.Response,
    config: "Config",
    word_target: str,
    heading: str,
) -> "Iterator[Pron]":
    """Yields one list of pronunciations per "upper" pronunciation element.

    "Upper" elements are those selected with the second-<ul> filter
    (an immediately preceding <ul> sibling). If no such element exists,
    nothing is yielded.
    """
    pron_path = _PRON_XPATH_SELECTOR.format(
        word_to_work_from=word_target,
        heading=heading,
        second_ul="[preceding-sibling::*[1][self::ul]]",
    )
    # Evaluate the XPath once. The original probed `xpath(...)[0]` and
    # caught IndexError before re-running the identical query, doing the
    # selection work twice; iterating the single result handles the
    # "no second <ul>" case for free (the loop body never runs).
    for upper_pron_ele in request.html.xpath(pron_path):
        yield list(yield_pron(upper_pron_ele, IPA_XPATH_SELECTOR, config))
def extract_word_pron_shan(
    word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
    """Pairs the scraped word with each pronunciation found on the page."""
    for pron in yield_pron(request.html, _IPA_XPATH_SELECTOR, config):
        yield word, pron
def extract_pron(
    request: requests_html, selector: str, config: "Config"
) -> "Iterator[Pron]":
    """Yields pronunciations from every element matched by the selector."""
    elements = request.html.xpath(selector)
    for element in elements:
        yield from yield_pron(element, IPA_XPATH_SELECTOR, config)
def extract_word_pron_thai(
    word: "Word", request: requests.Response, config: "Config"
) -> "Iterator[WordPronPair]":
    """Pairs the casefolded word with each pronunciation on the page."""
    # Casefold once, up front, exactly as the config dictates.
    folded = config.casefold(word)
    for pron in yield_pron(request.html, IPA_XPATH_SELECTOR, config):
        yield folded, pron
def yield_cmn_pron(
    request: requests.Response, config: "Config"
) -> "Iterator[Pron]":
    """Yields pronunciations from each matched <li> container."""
    containers = request.html.xpath(_PRON_XPATH_TEMPLATE)
    for container in containers:
        yield from yield_pron(container, IPA_XPATH_SELECTOR, config)
def _yield_latin_pron(
    request: requests.Response, config: "Config", tag: str
) -> "Iterator[Pron]":
    """Yields pronunciations for the Latin section identified by tag."""
    # The top-level "Latin" section sits under an <h2>; anything else
    # is looked up under an <h3>.
    if tag == "Latin":
        heading = "h2"
    else:
        heading = "h3"
    selector = _PRON_XPATH_TEMPLATE.format(heading=heading, tag=tag)
    for element in request.html.xpath(selector):
        yield from yield_pron(element, IPA_XPATH_SELECTOR, config)
def extract_word_pron_blt(
    word: "Word", request: requests_html, config: "Config"
) -> "Iterator[WordPronPair]":
    """Pairs the word with each pronunciation for the configured language."""
    selector = _PRON_XPATH_SELECTOR_TEMPLATE.format(language=config.language)
    for pron in yield_pron(request.html, selector, config):
        yield word, pron
def yield_nan_pron(
    request: requests_html, selector: str, config: "Config"
) -> "Iterator[Pron]":
    """Yields pronunciations from each <li> container the selector matches."""
    for node in request.html.xpath(selector):
        yield from yield_pron(node, IPA_XPATH_SELECTOR, config)