def assign_gender_declension(noun, first_line):
    """
    Build the gender/declension code for a noun, e.g. 'nf2' or 'nm1'.

    noun: the headword being looked up.
    first_line: a parsed element supporting `.find(title=...)`; a
        "feminine" marker is checked before a "masculine" one.

    Returns None when `first_line` carries neither marker. Otherwise
    returns 'nf' or 'nm', followed by a declension digit when one can be
    found on the 'ga-fb' page or, failing that, on the 'ga-gram' page.
    """
    if first_line.find(title="feminine"):
        gender = 'nf'
        k_lookup = 'bain'
    elif first_line.find(title="masculine"):
        gender = 'nm'
        k_lookup = 'fir'
    else:
        # Not a noun line: bail out before fetching any dictionary page.
        return None

    soup_fb = get_definition_soup(noun, 'teanglann', lang='ga-fb')
    entry_fb = soup_fb.find(class_='entry')

    search_entry = entry_fb
    if entry_fb:
        for subentry in entry_fb.find_all(class_='subentry'):
            if bs4_get_text(subentry.find(class_='headword')) == noun:
                # https://www.teanglann.ie/en/fb/cainteoir
                # main entry is 'caint'
                search_entry = subentry
                break
            else:
                # https://www.teanglann.ie/en/fb/trumpa - ignore trumpadóir
                subentry.extract()

    if search_entry:
        # Labels look like 'bain2' / 'fir1'; the last character of the
        # stripped text node is taken as the declension digit.
        noun_decs = search_entry.find_all(
            string=re.compile(k_lookup + '[1-4]'))
        declensions = {
            noun_dec.string.strip()[-1] for noun_dec in noun_decs
        }
        if len(declensions) > 1:
            # Conflicting declensions: hand over to manual_debug()
            # (defined elsewhere in the project).
            manual_debug()
        elif declensions:
            gender += declensions.pop()

    if len(gender) == 2:
        # Still no declension digit: fall back to the 'ga-gram' page.
        soup_gram = get_definition_soup(noun, 'teanglann', lang='ga-gram')
        wanted = 'FEMININE' if gender == 'nf' else 'MASCULINE'
        for gram in soup_gram.find_all(class_='gram'):
            if not gram.find(text='NOUN'):
                continue
            gender_prop = gram.find(text=wanted)
            if not gender_prop:
                continue
            # The declension is read from the 'property' element that
            # follows the one holding the gender.
            gender_holder = gender_prop.find_parent(class_='property')
            if gender_holder is None:
                continue
            dec_prop = gender_holder.find_next_sibling(class_='property')
            if dec_prop:
                dec_text = bs4_get_text(dec_prop.find(class_='value'))
                dec_text = dec_text.strip()
                if dec_text.endswith('DECLENSION'):
                    # first character of the value is used as the digit
                    gender += dec_text[0]
                    break
    return gender
def find_teanglann_periphrases():
    """
    Print every multi-word headword in the teanglann A-Z listings.

    The per-letter listing pages ('_a' … '_z') are visited in random
    order. Headwords containing a space are printed as they are found;
    the remaining single-word headwords are counted and the count is
    printed at the end.

    Sample output from a past run:

    total words: 53,677
    323 multi-word entries:
    ...
    téigh trí
    thar ceann
    thar n-ais
    tit amach
    tit ar
    tit chuig
    tit do
    tit faoi
    ...
    """
    letters = list('abcdefghijklmnopqrstuvwxyz')
    single_word_total = 0
    shuffle(letters)
    for initial in letters:
        page = get_definition_soup('_' + initial, 'teanglann')
        listing = page.find(class_='abcListings')
        for item in listing.find_all(class_="abcItem"):
            headword = bs4_get_text(item.find('a'))
            if ' ' not in headword:
                single_word_total += 1
                continue
            print(headword)
    print('total words:', single_word_total)
def get_foclóir_candidates(word):
    """
    Collect the English headwords that foclóir lists for an Irish word.

    word: the Irish word to search for (lang='ga', first results page).

    Returns a set of English strings taken from the first 'result-list'
    on the page, keeping only results whose Irish side equals `word`
    exactly. Returns an empty set when the page has no result list
    (e.g. "No matches found.").
    """
    candidates = set()
    soup = get_definition_soup(word, 'foclóir', lang='ga', page_no=0)
    result_lists = soup.find_all(class_='result-list')
    if not result_lists:
        # No results at all (or an unexpected page layout): nothing to
        # offer, rather than failing on result_lists[0].
        return candidates

    for result in result_lists[0].find_all('li'):
        ga_item = result.find(class_='lang_ga')
        en_item = result.find(class_='lang_en')
        if ga_item is None or en_item is None:
            continue
        # get_text() also copes with nested markup, where `.string`
        # would be None.
        if ga_item.get_text().strip() == word:
            candidates.add(en_item.get_text().strip())
    return candidates
def foclóir_score_definition(en, ga):
    """
    Estimate how important a GA definition is in terms of the English
    word: we count what share of the translations use the word.

    en: English headword whose foclóir page is inspected.
    ga: Irish word to look for in the translations.

    Within each sense, translations are counted up to and including the
    first one that mentions `ga`; later ones in that sense are skipped.

    Returns a float between 0.0 and 1.0 - lower is better. When the
    page has no Irish translations at all, the worst score 1.0 is
    returned.
    """
    soup = get_definition_soup(en, 'foclóir', lang='en')
    found_count = 0
    lang_gas_count = 0
    for sense in soup.find_all(class_="sense"):
        lang_gas = sense.find_all(attrs={
            'xml:lang': 'ga',
            'class': 'cit_translation'
        })
        for lang_ga in lang_gas:
            lang_gas_count += 1
            if lang_ga.find(class_='quote', text=ga) or \
                    ga in bs4_get_text(lang_ga.find(class_='quote')):
                found_count += 1
                break

    if lang_gas_count == 0:
        # Nothing to compare against; avoid dividing by zero.
        return 1.0
    return 1 - (found_count / lang_gas_count)
def assign_adjectival_variants(adjective, format='html'):
    '''
    Gather the inflected forms of an adjective from the 'ga-gram' page
    and pass them to format_adjectives().

    Forms of the kind:
    rud ceart
    oibiacht cheart
    rudaí cearta

    Each form is stored under a key built from the section heading
    (h2), the subsection heading (h3) and, when present, the line
    label, joined with '-' and with spaces turned into '-'. When a key
    repeats, a later value replaces the earlier one only if it contains
    'ba ', "ní b'" or 'ab '; otherwise it is reported and ignored.
    '''
    forms = {}
    grammar_page = get_definition_soup(
        adjective, 'teanglann', lang='ga-gram')
    for gram in grammar_page.find_all(class_="gram"):
        if not gram.find(text='ADJECTIVE'):
            continue
        for section in gram.find_all(class_="section"):
            section_part = section.find('h2').text.lower()
            for subsection in section.find_all(class_="subsection"):
                subsection_part = '-' + subsection.find('h3').text.lower()
                for line in subsection.find_all(class_="line"):
                    label = line.find(class_="label")
                    label_part = ''
                    if label:
                        cleaned = label.text.lower().strip('().')
                        label_part = '-' + cleaned.replace(' ', '-')

                    values = line.find_all(class_="value")
                    if len(values) != 1:
                        manual_debug()

                    key = section_part + subsection_part + label_part
                    key = key.replace(' ', '-')
                    value = values[0]
                    if key in forms:
                        if 'primary' in value['class']:
                            manual_debug()
                        elif not any(
                                marker in value.text
                                for marker in ('ba ', "ní b'", 'ab ')):
                            print('ignoring', key, value.text)
                            continue
                    forms[key] = value.text.lower()
    return format_adjectives(adjective, forms, format=format)
def assign_verbal_noun(verb):
    """
    Find the verbal noun that belongs to `verb`.

    Two stages:

    1. For every group of subentries yielded by
       get_teanglann_subentries(verb), the first line (the part that
       before_split() returns for 'trans') is inspected. If it is
       marked as a transitive/intransitive verb, the verbal noun is
       parsed out of its text ('verbal noun ~...', 'verbal noun -...',
       or '(/,/; verbal noun ...') or read from the node following an
       element titled 'verbal noun'.
    2. If no subentry produced a result (the `for ... else` branch), a
       series of guesses is checked with verb_from_vn(): the verb
       itself, the words linked under "RELATED MATCHES" on the same
       page, and common endings (-ú, -adh, -eadh).

    Returns the verbal noun as a string, or None (implicitly) when
    nothing was found; in that case a warning is printed unless the
    verb is in a short list of known exceptions or contains a space.
    """
    for subentries, subentry_labels in get_teanglann_subentries(verb):
        first_line = before_split(subentries[0], 'trans')
        if first_line.find(title="transitive verb") or \
                first_line.find(title="intransitive verb") or \
                first_line.find(title="and intransitive"):
            flt = bs4_get_text(first_line)
            flt = re.sub(r'\s\s+', ' ', flt)  # dóigh: newlines
            vn = None
            if 'verbal noun ~' in flt:
                # '~' stands for the verb itself: append what follows it
                vn = flt.split('verbal noun ~', 1)[1]
                vn = vn.replace('feminine', '')  # pleanáil poor spacing
                vn = vn.replace('masculine', '')  # ditto
                vn = verb + vn
            elif 'verbal noun -' in flt:
                # a dashed suffix: fill_in_dash() combines it with the verb
                suffix = flt.split('verbal noun -', 1)[1]
                suffix = re.split(r'[\s,);]', suffix.lstrip())[0]
                vn = fill_in_dash('-' + suffix, verb)
            else:
                # no break: if several markers occur, the last one wins
                for good_split in [
                        '(verbal noun ',
                        ', verbal noun ',
                        '; verbal noun ',
                ]:
                    if good_split in flt:
                        vn = flt.split(good_split, 1)[1]
            if vn:
                # keep only the first token after the marker
                vn = re.split(r'[\s,);]', vn.lstrip())[0]
                return vn
            # Text parsing failed: look for an element titled
            # 'verbal noun' and read the node right after it.
            vni = first_line.find(title='verbal noun')
            if vni:
                vn = bs4_get_text(vni.next_sibling)
                vn = vn.strip()
                # unexpected shapes are handed to manual_debug()
                if ' ' in vn:
                    manual_debug()
                if 'of' in vn:
                    manual_debug()
                if '~' not in vn:
                    manual_debug()
                else:
                    return vn.replace('~', verb)
            pass
    else:
        # Reached when the loop above finished without returning.
        if verb_from_vn(verb) == verb:
            # self verbal noun, e.g. bruith
            return verb
        soup = get_definition_soup(verb, 'teanglann', lang='ga')  # same page
        rm = soup.find(text=re.compile(r"\s*RELATED\s+MATCHES\s*"))
        if rm:
            for link in rm.parent.parent.find_all('a'):
                related_word = bs4_get_text(link).strip(' »')
                if verb_from_vn(related_word) == verb:
                    return related_word
        if verb.endswith('aigh') and verb_from_vn(verb[:-4] + 'ú') == verb:
            # aontaigh / aontú
            return verb[:-4] + 'ú'
        if verb.endswith('igh') and verb_from_vn(verb[:-2] + 'ú') == verb:
            # oibrigh / oibriú
            return verb[:-2] + 'ú'
        if verb_from_vn(verb + 'adh') == verb:
            # gets 'cor'
            return verb + 'adh'
        if verb_from_vn(verb + 'eadh') == verb:
            # gets 'croith'
            return verb + 'eadh'
        if verb == 'tosnaigh':
            # https://www.gaois.ie/crp/ga/?txt=tosn%C3%BA&lang=ga&SearchMode=narrow
            return 'tosnú'  # rather than 'tosú'
        # Known verbs without a verbal noun are not reported.
        if verb not in [
                'réigh',  # is it réiteach also?
                'áil',  # literary use as a verb
                'cis',
                'gad',
                'fainic',  # used imperatively only
                'batráil',
        ] and ' ' not in verb:
            print(f'Warning: No verbal noun found for {verb}')
        pass
def get_teanglann_subentries(word):
    """
    Split the teanglann 'ga' page for `word` into numbered senses.

    This is a generator. For every top-level 'entry' element whose text
    starts with `word` (case-insensitive) it yields a pair
    (subentries, subentry_labels):

    - subentries: a list of new <div> tags; the first collects
      everything before sense '1.', each later one collects one sense
      ('1.', '2.', ...) or lettered sub-sense ('(a)', '(b)', ...).
    - subentry_labels: a parallel list of label strings such as '',
      '1. ' or '2.(a) '.

    Elements with class 'subentry' are detached from their parent entry
    and are not yielded themselves.
    """
    soup = get_definition_soup(word, 'teanglann', lang='ga')
    entry_subentries = []
    for entry in soup.find_all(class_='entry'):
        entry_subentries.append(entry)
        for subentry in entry.find_all(class_='subentry'):
            # following also removes it from first entry
            entry_subentries.append(subentry.extract())
    for entry in entry_subentries:
        expand_abbreviations(entry)
        if 'subentry' not in entry['class'] and \
                not entry.text.strip().lower().startswith(word.lower()):
            # https://www.teanglann.ie/en/fgb/i%20measc
            # gives results for 'imeasc' not 'i measc'
            continue
        if 'subentry' not in entry['class']:
            subentries = [soup.new_tag('div')]
            subentry_labels = ['']
            # first line, may contain a 'main' entry
            n = 1  # next sense number expected
            nxs = 'abcdefghijklmnopqrstuvwxyz'
            nxi = 0  # index into nxs of the next sub-sense letter expected
            # Iterate over a copy of the child list: nodes are appended
            # to other tags inside the loop (presumably this moves them
            # out of `entry` - confirm against bs4 behaviour).
            for node in entry.contents[:]:
                node_text = bs4_get_text(node)
                # NOTE(review): `{1}` is an f-string field, so the
                # pattern is literally 'adjective\s*1.' - only an
                # "adjective 1." label is masked before looking for
                # 'n.'; confirm that is what was intended.
                if f'{n}.' in re.sub(rf'adjective\s*{1}.', '', node_text):
                    # NOTE(review): '.' is unescaped here and matches
                    # any character.
                    as_subnode = node.find(text=re.compile(rf'\s+{n}.\s+'))
                    if as_subnode:
                        # siblings before the number stay with the
                        # current sense, restored to document order
                        rev = []
                        for r in as_subnode.previous_siblings:
                            rev.append(r)
                        for r in reversed(rev):
                            subentries[-1].append(r)
                    pre, post = node_text.rsplit(f'{n}.', 1)
                    if pre.strip():
                        subentries[-1].append(pre.strip())
                    # open a new sense
                    subentries.append(soup.new_tag('div'))
                    subentry_labels.append(f'{n}. ')
                    nxi = 0  # lettering restarts within each sense
                    if post.strip():
                        subentries[-1].append(post.strip())
                    if as_subnode:
                        for r in as_subnode.next_siblings:
                            subentries[-1].append(r)
                    n += 1
                # NOTE(review): nxs[nxi] has no bounds check; it raises
                # IndexError if nxi ever reaches len(nxs).
                elif (len(subentries) > 1  # we've got at least a '1.' already
                      and (f'({nxs[nxi]})' in node_text or
                           (f'{nxs[nxi]}' == node_text.strip() and
                            node.next_sibling.string.strip() == ')' and
                            subentries[-1].get_text().strip().endswith('(')))):
                    # lettered sub-sense, either '(a)' inside this node
                    # or a bare 'a' wrapped by '(' and ')' in neighbours
                    pre, post = node_text.split(f'{nxs[nxi]}', 1)
                    if pre.strip().rstrip('('):
                        subentries[-1].append(pre.strip())
                    if subentries[-1].get_text().strip().rstrip('('):
                        # current div already has content: start a new one
                        subentries.append(soup.new_tag('div'))
                        subentry_labels.append(f'{n-1}.({nxs[nxi]}) ')
                    else:
                        # current div is still empty: just relabel it
                        subentry_labels[-1] = f'{n-1}.({nxs[nxi]}) '
                    if post.strip().lstrip(')'):
                        subentries[-1].append(post.strip())
                    nxi += 1
                else:
                    subentries[-1].append(node)
            yield subentries, subentry_labels
def get_line_types_gender(word, line):
    """
    Work out the parts of speech and the gender code for an entry line.

    word: the headword the line belongs to.
    line: a parsed element supporting `.find(title=...)`; it may be
        modified in place (a gender marker that only occurs inside a
        '(Var: ...)' note is extracted from it).

    Returns a tuple (types, gender):

    - types: an OrderedDict used as an ordered set, with keys such as
      'Pronoun', 'Noun', 'Adverb', 'Preposition', 'Adjective',
      'Conjugation', 'Prefix'. 'Verb' maps to a nested OrderedDict with
      'Transitive' and/or 'Intransitive'.
    - gender: None, a noun code from assign_gender_declension(), an
      adjective code ('a', 'a1' … 'a4'), or both joined by a newline.
      When the line is both a verb and a noun, the noun type is dropped
      and gender is reset to None.
    """
    gender = None
    types = OrderedDict()  # using as ordered set

    if line.find(title="feminine") and line.find(title="masculine"):
        # 'cara' has '(Var:feminine)' at the end
        # 'neantóg' has '(Var:neantán masculine)' at the end
        # A gender seen only inside such a note is removed from the line.
        for g in ['masculine', 'feminine']:
            variant_note = re.compile(
                r'\(var(?:iant)*:[^)]*\s*' + g + r'\)', re.I)
            # text is re-read each time: the previous pass may have
            # extracted a marker
            if g not in variant_note.sub('', bs4_get_text(line)):
                line.find(title=g).extract()
        # TODO: should look at line only up to opening parenthesis

    if line.find(title="pronoun"):
        # sé/sí are not nouns
        types['Pronoun'] = True
    elif line.find(title="feminine") and \
            line.find(title="masculine") and \
            word != 'talamh':
        # 'thar': has 'thairis (m) thairsti (f)' and is not a noun
        pass
    elif line.find(title="feminine") or line.find(title="masculine"):
        types['Noun'] = True
        gender = assign_gender_declension(word, line)

    if line.find(title="adverb"):
        types['Adverb'] = True
    if line.find(title="preposition"):
        types['Preposition'] = True

    adjective_marker = line.find(title="adjective")
    if adjective_marker:
        types['Adjective'] = True
        gender_a = 'a'
        dec = adjective_marker.next_sibling
        # The sibling may be a tag rather than text; only text can hold
        # the declension digit.
        dec_digit = dec.strip().strip('.') if isinstance(dec, str) else ''
        # to check: think it only goes up to a3
        if dec_digit in ['1', '2', '3', '4']:
            gender_a += dec_digit
        else:
            # No digit on the line: look for 'aid1'..'aid3' on the
            # 'ga-fb' page instead.
            soup_fb = get_definition_soup(word, 'teanglann', lang='ga-fb')
            if soup_fb.find(text='aid3'):
                gender_a += '3'
            elif soup_fb.find(text='aid2'):
                gender_a += '2'
            elif soup_fb.find(text='aid1'):
                gender_a += '1'
            elif not soup_fb.find(text='aid'):
                # 'thar' spurious adj. in following:
                # ' of <span title="adjective">a</span> general nature'
                del types['Adjective']
                gender_a = None
        if gender_a:
            if not gender:
                gender = gender_a
            else:
                gender += '\n' + gender_a

    if line.find(title="transitive verb"):
        types.setdefault('Verb', OrderedDict())['Transitive'] = True
    if line.find(title="intransitive verb") or \
            line.find(title="and intransitive"):
        types.setdefault('Verb', OrderedDict())['Intransitive'] = True

    if line.find(title="conjunction"):
        # NOTE(review): key says 'Conjugation' although the marker is
        # "conjunction"; kept as-is because callers may rely on it.
        types['Conjugation'] = True
    if line.find(title="prefix"):
        types['Prefix'] = True

    if 'Verb' in types and 'Noun' in types:
        del types['Noun']
        gender = None
    return types, gender