def extract_ipa(word_soup, region):
    """Build an Ipa (transcription + optional audio) for one region.

    word_soup -- soup of a single word entry
    region    -- 'br' selects British audio; any other value selects American
    """
    # Inflection nodes (say -> said, says) carry IPA but never audio;
    # drop them so their IPA is not picked up instead of the headword's.
    inflected = word_soup.find(class_='Inflections')
    if inflected:
        inflected.decompose()

    audio_marker = 'brefile' if region == 'br' else 'amefile'

    result = Ipa()
    result.region = region

    try:
        pron_node = parsing_tools.find_single_class(word_soup, 'PronCodes')
    except ClassNotFound:
        result.ipa = ''
    else:
        result.ipa = pron_node.text.strip().replace(u'/', '')

    try:
        audio_node = parsing_tools.find_single_class(word_soup, audio_marker)
    except ClassNotFound:
        # No audio for this region -- return the transcription alone.
        return result

    result.audio = cache.File(audio_node['data-src-mp3'], 'mp3')
    return result
def parse_html(html):
    """Parse a Longman results page into a list of Word objects.

    Raises ParseError (via valid() or directly) when the page contains
    no recognizable word sections.
    """
    soup = parsing_tools.html_to_soup(html)
    valid(soup)
    word_objects = []
    words = soup.find_all(class_=word_section_class)
    if not words:
        raise ParseError(
            "Can't find any '{}' classes.".format(word_section_class))
    for word in words:
        word_head = parsing_tools.find_single_class(word, word_head_class)
        word_object = Word()
        word_object.source = 'Longman'
        # Business-dictionary entries are marked by a dedicated wrapper class.
        if word.find(class_='bussdictEntry'):
            word_object.source = 'Longman Business'
        # Strip the syllable-separator dots Longman embeds in headwords.
        word_object.word = parsing_tools.find_single_class(
            word_head, name_class).string.replace(u'‧', u'')
        pos = word_head.find_all(class_=pos_class)
        if pos:
            word_object.pos = ', '.join(
                [p.text.replace(',', '').strip() for p in pos])
        else:
            word_object.pos = ''
        try:
            word_object.pos_additional = parsing_tools\
                .find_single_class(word_head, pos_additional_class).text.strip()
        except ClassNotFound:
            word_object.pos_additional = ''
        word_object.ipas.append(extract_ipa(word_head, 'br'))
        word_object.ipas.append(extract_ipa(word_head, 'us'))
        try:
            definitions = parsing_tools.find_all_classes(
                word, definition_parent_class)
        except ClassNotFound:
            # Entry without definitions (e.g. pure cross reference) is fine.
            pass
        else:
            # Cross-reference links would pollute definition text. Decompose
            # them once up front instead of re-scanning the whole soup on
            # every definition iteration (all passes after the first were
            # no-ops anyway).
            for cross_ref in soup.find_all(class_=crossref_class):
                cross_ref.decompose()
            for def_parent in definitions:
                subdefinitions = def_parent.find_all(
                    class_=subdefinition_parent_class)
                if subdefinitions:
                    for subdef in subdefinitions:
                        extract_definition(subdef, word_object)
                else:
                    extract_definition(def_parent, word_object)
        word_objects.append(word_object)
    return word_objects
def extract_definition(def_parent, word_object):
    """Parse one definition node and append a Definition to word_object.

    Returns silently when the node carries no definition text -- in that
    case it is most likely just a link to another page.
    """
    entry = Definition()
    try:
        def_node = parsing_tools.find_single_class(
            def_parent, definition_class)
    except ClassNotFound:
        # No definition text; probably a cross-page link.
        return
    entry.definition = def_node.text.strip()

    # Prefix register markers, e.g. "[informal]".
    registers = [
        '[{}]'.format(div.text.strip())
        for div in def_parent.find_all(class_=definition_register_class)
    ]
    if registers:
        entry.definition = ' '.join(registers) + ' ' + entry.definition

    # Prefix the geographic marker when present.
    geo_node = def_parent.find(class_=definition_geo_class)
    if geo_node:
        entry.definition = (
            '[{}] '.format(geo_node.text.strip()) + entry.definition)

    try:
        entry.definition_additional = parsing_tools.find_single_class(
            def_parent, definition_additional_class).text
    except ClassNotFound:
        entry.definition_additional = ''

    # Collect example sentences, attaching audio where available.
    for sentence_node in def_parent.find_all(class_=sentence_class):
        example = Sentence()
        example.content = sentence_node.text.strip()
        audio_node = sentence_node.find(class_=sentence_audio_class)
        if audio_node:
            example.audio = cache.File(
                audio_node[audio_url_param_name], 'mp3')
        entry.sentences.append(example)

    word_object.definitions.append(entry)
def parse_html(html):
    """Parse an Oxford dictionary page (legacy markup) into a Word.

    Raises ParseError when a pronunciation block is missing its 'geo'
    attribute or the region cannot be decided from it.
    """
    word_header_class = 'webtop-g'
    name_class = 'h'
    pos_class = 'pos'
    pron_section_class = 'vp-g'
    pron_top_class = 'pron-g'
    ipa_class = 'phon'
    audio_class = 'sound'
    audio_url_param_name = 'data-src-mp3'
    idioms_parent = 'idm-gs'
    definition_parent_class = 'sn-g'
    definition_class = 'def'
    definition_additional_class = 'gram-g'
    definition_label_class = 'label-g'  # "informal", "especially north american", etc
    sentence_class = 'x'
    collapse_class = 'collapse'
    soup = parsing_tools.html_to_soup(html)
    header = parsing_tools.find_single_class(soup, word_header_class)
    word = Word()
    word.source = 'Oxford'
    word.word = parsing_tools.find_single_class(header, name_class).text
    try:
        word.pos = parsing_tools.find_single_class(header, pos_class).string
    except ClassNotFound:
        word.pos = 'undefined'
    try:
        prons = parsing_tools.find_all_classes(soup, pron_top_class)
    except ClassNotFound:
        # Page without pronunciations is acceptable.
        pass
    else:
        for pron in prons:
            ipa = Ipa()
            try:
                ipa_content = parsing_tools.find_single_class(pron, ipa_class)
            except ClassNotFound:
                pass
            else:
                ipa.ipa = extract_ipa(ipa_content.text)
            audio_div = parsing_tools.find_single_class(pron, audio_class)
            audio_url = audio_div[audio_url_param_name]
            ipa.audio = cache.File(audio_url, 'mp3')
            # Region is encoded in the 'geo' attribute; it must name
            # exactly one of the two supported regions.
            try:
                geo = pron['geo']
            except KeyError:
                raise ParseError(
                    "Can't find 'geo' attribute in a pronunciation class {}"
                    .format(str(pron)))
            if 'br' in geo and 'am' in geo:
                raise ParseError(
                    "Can't decide if IPA is UK or US, geo name: '{}'"
                    .format(geo))
            if 'br' in geo:
                ipa.region = 'BR'
            elif 'am' in geo:
                ipa.region = 'US'
            else:
                raise ParseError(
                    "Can't decide if IPA is UK or US, geo name: '{}'"
                    .format(geo))
            # Verb-form pronunciations get a description with the form
            # itself emphasized, e.g. "he/she/it <b>says</b>".
            pron_section = pron.find_parent(class_=pron_section_class)
            if pron_section:
                description_words = pron_section.find(class_='vp').text.split(' ')
                description_words[-1] = '<b>' + description_words[-1] + '</b>'
                ipa.description = ' '.join(description_words)
            word.ipas.append(ipa)
    # remove idiom div, it also has definitions we don't need
    idiom_div = soup.find(class_=idioms_parent)
    if idiom_div:
        idiom_div.decompose()
    try:
        definitions = parsing_tools.find_all_classes(soup, definition_parent_class)
    except ClassNotFound:
        pass
    else:
        # Remove collapsed sections (synonyms etc.) once before extracting:
        # they can carry labels we don't want, and the old per-definition
        # removal both re-scanned the whole soup on every iteration and
        # left the first definition's text uncleaned.
        for collapsed_div in soup.find_all(class_=collapse_class):
            collapsed_div.decompose()
        for def_parent in definitions:
            definition = Definition()
            try:
                definition_header = parsing_tools.find_single_class(
                    def_parent, definition_class)
            except ClassNotFound:
                # Probably a link to some other page
                continue
            definition.definition = definition_header.text
            label = def_parent.find(class_=definition_label_class)
            if label:
                definition.definition = label.text.replace('(', '[').replace(')', ']') \
                    + ' ' + definition.definition
            try:
                # BUGFIX: was missing .text, which stored a soup Tag object
                # where the fallback branch stores a string (the newer
                # Oxford parser in this project uses .text here too).
                definition.definition_additional = parsing_tools.find_single_class(
                    def_parent, definition_additional_class).text
            except ClassNotFound:
                definition.definition_additional = ''
            sentences = def_parent.find_all(class_=sentence_class)
            for s in sentences:
                sentence = Sentence()
                sentence.content = s.text
                definition.sentences.append(sentence)
            word.definitions.append(definition)
    return word
def parse_html(html):
    """Parse an Oxford dictionary page (current markup) into a Word.

    Raises ParseError when a pronunciation block lacks a 'geo' attribute,
    when the region cannot be decided from it, or when a region has
    multiple pronunciations whose IPA and audio counts disagree.
    """
    top_container_class = 'webtop'
    name_class = 'headword'
    pos_class = 'pos'
    pos_additional_classes = ['labels', 'inflections', 'variants']
    verb_form_root_class = 'verb_form'
    verb_form_description_class = 'verb_form'
    pron_top_class = 'phonetics'
    ipa_class = 'phon'
    audio_class = 'sound'
    audio_url_param_name = 'data-src-mp3'
    idioms_parent = 'idioms'
    definition_parent_class = 'sense'
    definition_class = 'def'
    definition_additional_class = 'grammar'  # "uncountable", etc
    definition_label_class = 'labels'  # "informal", "especially north american", etc
    sentence_class = 'x'
    collapse_class = 'collapse'
    synonyms_title = 'Synonyms'
    collocations_title = 'Collocations'
    soup = parsing_tools.html_to_soup(html)
    # there are many class with this name, get the first one
    top_container = soup.find(class_=top_container_class)
    word = Word()
    word.source = 'Oxford'
    word.word = parsing_tools.find_single_class(top_container, name_class).text
    pos = top_container.find_all(class_=pos_class)
    if len(pos) > 0:
        word.pos = ', '.join([p.string for p in pos])
    else:
        word.pos = ''
    # Collect extra part-of-speech info (labels, inflections, variants),
    # converting parentheses to square brackets for display.
    pos_additionals = []
    for c in pos_additional_classes:
        pos_additional = top_container.find(class_=c, recursive=False)
        if pos_additional:
            pos_additionals.append(
                pos_additional.text.replace('(', '[').replace(')', ']'))
    word.pos_additional = ' '.join(pos_additionals)
    try:
        pron_collections = parsing_tools.find_all_classes(
            top_container, pron_top_class)
    except ClassNotFound:
        # Page without pronunciations is acceptable.
        pass
    else:
        for pron_collection in pron_collections:
            prons = pron_collection.find_all('div', recursive=False)
            for pron in prons:
                # A region block may hold several pronunciations; the
                # i-th IPA is paired with the i-th audio file by index.
                ipas = []
                ipa_contents = pron.find_all(class_=ipa_class)
                for i, ipa_content in enumerate(ipa_contents):
                    if len(ipas) == i:
                        ipas.append(Ipa())
                    ipas[i].ipa = extract_ipa(ipa_content.text)
                audio_divs = pron.find_all(class_=audio_class)
                for i, audio_div in enumerate(audio_divs):
                    if len(ipas) == i:
                        ipas.append(Ipa())
                    audio_url = audio_div[audio_url_param_name]
                    ipas[i].audio = cache.File(audio_url, 'mp3')
                # Index pairing is only trustworthy when both lists have
                # the same length (a lone pronunciation is always safe).
                if len(ipas) > 1 and (len(ipa_contents) != len(audio_divs)):
                    raise ParseError(
                        "Found multiple pronunciations for a region, "
                        "but audio and ipa length are different.")
                # Region is encoded in the 'geo' attribute; it must name
                # exactly one of the two supported regions.
                try:
                    geo = pron['geo']
                except KeyError:
                    raise ParseError(
                        "Can't find 'geo' attribute in a pronunciation class {}"
                        .format(str(pron)))
                if 'br' in geo and 'am' in geo:
                    raise ParseError(
                        "Can't decide if IPA is UK or US, geo name: '{}'".
                        format(geo))
                if 'br' in geo:
                    for ipa in ipas:
                        ipa.region = 'BR'
                elif 'am' in geo:
                    for ipa in ipas:
                        ipa.region = 'US'
                else:
                    raise ParseError(
                        "Can't decide if IPA is UK or US, geo name: '{}'".
                        format(geo))
                # Verb-form pronunciations get a description with the
                # final word (the form itself) emphasized in bold.
                pron_section = pron.find_parent(class_=verb_form_root_class)
                if pron_section:
                    description_words = pron_section.find(
                        class_=verb_form_description_class).text.split(' ')
                    description_words[
                        -1] = '<b>' + description_words[-1] + '</b>'
                    for ipa in ipas:
                        ipa.description = ' '.join(description_words)
                for ipa in ipas:
                    word.ipas.append(ipa)
    # remove idiom div, it also has definitions we don't need
    idiom_div = soup.find(class_=idioms_parent)
    if idiom_div:
        idiom_div.decompose()
    try:
        definitions = parsing_tools.find_all_classes(soup, definition_parent_class)
    except ClassNotFound:
        pass
    else:
        for def_parent in definitions:
            definition = Definition()
            try:
                definition_header = parsing_tools.find_single_class(
                    def_parent, definition_class)
            except ClassNotFound:
                # Probably a link to some other page
                continue
            definition.definition = definition_header.text
            # remove synonyms etc., they can have labels we don't need
            collapsed = soup.find_all(class_=collapse_class)
            for c in collapsed:
                c.decompose()
            label = def_parent.find(class_=definition_label_class)
            if label and len(label.text.strip()) > 0:
                definition.definition = label.text.replace('(', '[').replace(')', ']') \
                    + ' ' + definition.definition
            try:
                definition.definition_additional = parsing_tools.find_single_class(
                    def_parent, definition_additional_class).text
            except ClassNotFound:
                definition.definition_additional = ''
            sentences = def_parent.find_all(class_=sentence_class)
            for s in sentences:
                sentence = Sentence()
                sentence.content = s.text
                definition.sentences.append(sentence)
            word.definitions.append(definition)
    return word