Python get_definition_soupの例、download.get_definition_soup Pythonの例

コード例 #1

0

ファイルを表示

ファイル: teanglann.py プロジェクト: timczerniak/lookup-irish

def assign_gender_declension(noun, first_line):
    """
    Build the gender/declension code for `noun`, e.g. 'nf2', 'nm4' or 'nm'.

    `first_line` is the parsed first line of the dictionary entry; it is
    searched for an element titled "feminine" or "masculine".

    Returns None when neither gender marker is present, otherwise 'nf' or
    'nm' followed by a declension digit when one can be found. The
    declension is looked up first on the 'ga-fb' page (markers such as
    'bain2' / 'fir1') and, failing that, on the 'ga-gram' page.
    """
    if first_line.find(title="feminine"):
        gender = 'nf'
        k_lookup = 'bain'
        gram_gender = 'FEMININE'
    elif first_line.find(title="masculine"):
        gender = 'nm'
        k_lookup = 'fir'
        gram_gender = 'MASCULINE'
    else:
        return None

    # Only fetch the page once we know the word is a noun: the early return
    # above needs nothing from it.
    soup_fb = get_definition_soup(noun, 'teanglann', lang='ga-fb')
    entry_fb = soup_fb.find(class_='entry')
    search_entry = entry_fb
    if entry_fb:
        for subentry in entry_fb.find_all(class_='subentry'):
            if bs4_get_text(subentry.find(class_='headword')) == noun:
                # https://www.teanglann.ie/en/fb/cainteoir
                # main entry is 'caint'
                search_entry = subentry
                break
            else:
                # https://www.teanglann.ie/en/fb/trumpa - ignore trumpadóir
                subentry.extract()
    if search_entry:
        noun_decs = search_entry.find_all(
            string=re.compile(k_lookup + '[1-4]'))
        # last character of each marker is the declension digit
        declensions = {noun_dec.string.strip()[-1] for noun_dec in noun_decs}
        if len(declensions) > 1:
            # conflicting declensions: needs a human to look at it
            manual_debug()
        elif declensions:
            gender += declensions.pop()
    if len(gender) == 2:
        # No declension found above: fall back to the grammar page.
        soup_gram = get_definition_soup(noun, 'teanglann', lang='ga-gram')
        for gram in soup_gram.find_all(class_='gram'):
            if not gram.find(string='NOUN'):
                continue
            gender_prop = gram.find(string=gram_gender)
            if not gender_prop:
                continue
            gender_box = gender_prop.find_parent(class_='property')
            if gender_box is None:
                # gender text not wrapped in a 'property' element: skip it
                # instead of failing on the chained lookup
                continue
            # the declension is read from the property following the gender
            dec_prop = gender_box.find_next_sibling(class_='property')
            if dec_prop:
                dec_text = bs4_get_text(dec_prop.find(class_='value'))
                dec_text = dec_text.strip()
                if dec_text.endswith('DECLENSION'):
                    gender += dec_text[0]
                    break
    return gender

コード例 #2

0

ファイルを表示

ファイル: teanglann.py プロジェクト: timczerniak/lookup-irish

def find_teanglann_periphrases():
    """
    Print every multi-word headword found in teanglann's A-Z listings,
    followed by a count of the single-word headwords.

    Output recorded from an earlier run:

    total words: 53,677
    323 multi-word entries:
    ...
    téigh trí
    thar ceann
    thar n-ais
    tit amach
    tit ar
    tit chuig
    tit do
    tit faoi
    ...
    """
    letters = list('abcdefghijklmnopqrstuvwxyz')
    single_words = 0
    # the listing pages are visited in random order
    shuffle(letters)
    for initial in letters:
        listing_page = get_definition_soup('_' + initial, 'teanglann')
        listings = listing_page.find(class_='abcListings')
        for item in listings.find_all(class_="abcItem"):
            headword = bs4_get_text(item.find('a'))
            if ' ' not in headword:
                single_words += 1
                continue
            print(headword)
    print('total words:', single_words)

コード例 #3

0

ファイルを表示

ファイル: focloir.py プロジェクト: timczerniak/lookup-irish

def get_foclóir_candidates(word):
    """
    Return the set of English headwords that foclóir lists for the Irish
    `word`.

    Only the first 'result-list' on the page is read, and only rows whose
    Irish side equals `word` exactly (after stripping whitespace) count.
    An empty set is returned when the page has no result list at all.
    """
    soup = get_definition_soup(word, 'foclóir', lang='ga', page_no=0)
    result_lists = soup.find_all(class_='result-list')
    if not result_lists:
        # Covers the 'No matches found.' page and any other page without
        # results; indexing result_lists[0] here would raise IndexError.
        return set()
    candidates = set()
    for result in result_lists[0].find_all('li'):
        ga_tag = result.find(class_='lang_ga')
        en_tag = result.find(class_='lang_en')
        if ga_tag is None or en_tag is None:
            # row without both language cells: nothing to compare
            continue
        # get_text() rather than .string: .string is None as soon as the
        # element contains more than one child node
        if ga_tag.get_text().strip() == word:
            candidates.add(en_tag.get_text().strip())
    return candidates

コード例 #4

0

ファイルを表示

ファイル: focloir.py プロジェクト: timczerniak/lookup-irish

def foclóir_score_definition(en, ga):
    """
    Estimate how important a GA definition is in terms of the English.

    We count what fraction of the GA translations on the foclóir page for
    `en` use the word `ga`. Within each sense, counting stops at the first
    translation that matches.

    Returns a float between 0.0 and 1.0; lower is better. When the page
    holds no GA translations at all, 1.0 (the worst score) is returned.
    """
    soup = get_definition_soup(en, 'foclóir', lang='en')
    found_count = 0
    lang_gas_count = 0
    for sense in soup.find_all(class_="sense"):
        lang_gas = sense.find_all(attrs={
            'xml:lang': 'ga',
            'class': 'cit_translation'
        })
        for lang_ga in lang_gas:
            lang_gas_count += 1
            if lang_ga.find(class_='quote', text=ga) or \
               ga in bs4_get_text(lang_ga.find(class_='quote')):
                found_count += 1
                break
    if not lang_gas_count:
        # nothing to divide by: `ga` is not used on this page at all
        return 1.0
    return 1 - (found_count / lang_gas_count)

コード例 #5

0

ファイルを表示

ファイル: teanglann.py プロジェクト: timczerniak/lookup-irish

def assign_adjectival_variants(adjective, format='html'):
    '''
    Collect the inflected forms of `adjective` from teanglann's grammar
    page, e.g.

    rud ceart
    oibiacht cheart
    rudaí cearta

    Each form is stored (lower-cased) under a key made of the section
    heading, the subsection heading and the optional line label, joined
    with '-'. The first form seen for a key wins. The collected dict is
    passed to format_adjectives() together with `format`.
    '''
    forms = {}
    grammar_soup = get_definition_soup(
        adjective, 'teanglann', lang='ga-gram')
    adjective_grams = (
        g for g in grammar_soup.find_all(class_="gram")
        if g.find(text='ADJECTIVE')
    )
    for gram in adjective_grams:
        for section in gram.find_all(class_="section"):
            section_name = section.find('h2').text.lower()
            for subsection in section.find_all(class_="subsection"):
                subsection_name = subsection.find('h3').text.lower()
                for line in subsection.find_all(class_="line"):
                    key_parts = [section_name, subsection_name]
                    label_tag = line.find(class_="label")
                    if label_tag:
                        label = label_tag.text.lower().strip('().')
                        key_parts.append(label.replace(' ', '-'))
                    values = line.find_all(class_="value")
                    if len(values) != 1:
                        manual_debug()
                    value_tag = values[0]
                    value_text = value_tag.text
                    key = '-'.join(key_parts).replace(' ', '-')
                    if key not in forms:
                        forms[key] = value_text.lower()
                        continue
                    # key already filled: keep the first form
                    if 'primary' in value_tag['class']:
                        manual_debug()
                    elif not any(marker in value_text
                                 for marker in ('ba ', "ní b'", 'ab ')):
                        print('ignoring', key, value_text)
    return format_adjectives(adjective, forms, format=format)

コード例 #6

0

ファイルを表示

ファイル: teanglann.py プロジェクト: timczerniak/lookup-irish

def assign_verbal_noun(verb):
    """
    Return the verbal noun for `verb`, or None when none can be found.

    Strategies, tried in order:
    1. For each result of get_teanglann_subentries(verb), inspect the first
       line: if it is marked as a (in)transitive verb, parse the verbal
       noun out of its text or out of the element titled 'verbal noun';
       otherwise accept `verb` itself when verb_from_vn(verb) == verb.
    2. Follow the links under 'RELATED MATCHES' on the same page and accept
       one that verb_from_vn() maps back to `verb`.
    3. Try the endings -ú, -adh and -eadh, again checked via verb_from_vn().
    4. A hard-coded answer for 'tosnaigh'.
    If all fail, a warning is printed (except for a few listed verbs and
    multi-word input) and the function falls off the end, returning None.
    """
    for subentries, subentry_labels in get_teanglann_subentries(verb):
        # only the part of the first subentry before 'trans' is examined
        first_line = before_split(subentries[0], 'trans')

        if first_line.find(title="transitive verb") or \
           first_line.find(title="intransitive verb") or \
           first_line.find(title="and intransitive"):
            flt = bs4_get_text(first_line)
            # collapse runs of whitespace (the entry for 'dóigh' has
            # newlines in it)
            flt = re.sub(r'\s\s+', ' ', flt)  # dóigh: newlines
            vn = None
            if 'verbal noun ~' in flt:
                # '~' stands for the verb itself: keep what follows it
                vn = flt.split('verbal noun ~', 1)[1]
                # 'pleanáil' has poor spacing, so the gender word can end
                # up glued to the verbal noun
                vn = vn.replace('feminine', '')  # pleanáil poor spacing
                vn = vn.replace('masculine', '')  # ditto
                vn = verb + vn
            elif 'verbal noun -' in flt:
                # '-suffix' form: take the first token after the dash and
                # let fill_in_dash() combine it with the verb
                suffix = flt.split('verbal noun -', 1)[1]
                suffix = re.split(r'[\s,);]', suffix.lstrip())[0]
                vn = fill_in_dash('-' + suffix, verb)
            else:
                # verbal noun written out in full; there is no break, so
                # if several separators occur the last one listed wins
                for good_split in [
                        '(verbal noun ',
                        ', verbal noun ',
                        '; verbal noun ',
                ]:
                    if good_split in flt:
                        vn = flt.split(good_split, 1)[1]
            if vn:
                # keep only the first token
                vn = re.split(r'[\s,);]', vn.lstrip())[0]
                return vn
            # nothing in the plain text: try the element titled
            # 'verbal noun' and the node right after it
            vni = first_line.find(title='verbal noun')
            if vni:
                vn = bs4_get_text(vni.next_sibling)
                vn = vn.strip()
                # unexpected shapes are handed to manual_debug()
                if ' ' in vn:
                    manual_debug()
                if 'of' in vn:
                    manual_debug()
                if '~' not in vn:
                    manual_debug()
                else:
                    return vn.replace('~', verb)
            pass
        else:
            if verb_from_vn(verb) == verb:
                # self verbal noun, e.g. bruith
                return verb

    # Fallback: links listed under 'RELATED MATCHES'
    soup = get_definition_soup(verb, 'teanglann', lang='ga')  # same page
    rm = soup.find(text=re.compile(r"\s*RELATED\s+MATCHES\s*"))
    if rm:
        for link in rm.parent.parent.find_all('a'):
            # link text may carry a trailing ' »'
            related_word = bs4_get_text(link).strip(' »')
            if verb_from_vn(related_word) == verb:
                return related_word
    # Fallback: guess by ending, each guess confirmed with verb_from_vn()
    if verb.endswith('aigh') and verb_from_vn(verb[:-4] + 'ú') == verb:
        # aontaigh / aontú
        return verb[:-4] + 'ú'
    if verb.endswith('igh') and verb_from_vn(verb[:-2] + 'ú') == verb:
        # oibrigh / oibriú
        return verb[:-2] + 'ú'
    if verb_from_vn(verb + 'adh') == verb:
        # gets 'cor'
        return verb + 'adh'
    if verb_from_vn(verb + 'eadh') == verb:
        # gets 'croith'
        return verb + 'eadh'

    if verb == 'tosnaigh':
        # https://www.gaois.ie/crp/ga/?txt=tosn%C3%BA&lang=ga&SearchMode=narrow
        return 'tosnú'  # rather than 'tosú'
    # Verbs listed here, and multi-word input, are not warned about.
    if verb not in [
            'réigh',  # is it réiteach also?
            'áil',  # literary use as a verb
            'cis',
            'gad',
            'fainic',  # used imperatively only
            'batráil',
    ] and ' ' not in verb:
        print(f'Warning: No verbal noun found for {verb}')
    # no explicit return: the result is None
    pass

コード例 #7

0

ファイルを表示

ファイル: teanglann.py プロジェクト: timczerniak/lookup-irish

def get_teanglann_subentries(word):
    """
    Generator: split the teanglann ('ga') entries for `word` into numbered
    senses.

    For every processed element it yields a pair
    (subentries, subentry_labels):
    - subentries: list of new <div> tags, one per piece of the entry;
      index 0 holds whatever comes before the first numbered sense.
    - subentry_labels: parallel list of labels: '' for index 0, then
      '1. ', '2. ', ... and '1.(a) ', '1.(b) ', ... for lettered parts.

    The parsed page is modified in place: 'subentry' elements are extracted
    from their parent entries and nodes are moved into the new <div>s.
    """
    soup = get_definition_soup(word, 'teanglann', lang='ga')

    # Flatten to: entry, its subentries, next entry, its subentries, ...
    entry_subentries = []
    for entry in soup.find_all(class_='entry'):
        entry_subentries.append(entry)
        for subentry in entry.find_all(class_='subentry'):
            # following also removes it from first entry
            entry_subentries.append(subentry.extract())

    for entry in entry_subentries:
        expand_abbreviations(entry)
        # Top-level entries whose text does not start with the searched
        # word (case-insensitive) are skipped.
        if 'subentry' not in entry['class'] and \
           not entry.text.strip().lower().startswith(word.lower()):
            # https://www.teanglann.ie/en/fgb/i%20measc
            # gives results for 'imeasc' not 'i measc'
            continue

        # The lists are only (re)created for top-level entries; a 'subentry'
        # element keeps appending to the lists of the most recent top-level
        # entry that was not skipped.
        # NOTE(review): if the first element to reach this point is a
        # 'subentry' (its parent entry having been skipped above), these
        # names would be unbound - confirm whether that can happen.
        if 'subentry' not in entry['class']:
            subentries = [soup.new_tag('div')]
            subentry_labels = ['']  # first line, may contain a 'main' entry
        n = 1  # next sense number to look for
        nxs = 'abcdefghijklmnopqrstuvwxyz'
        nxi = 0  # index into nxs of the next sub-sense letter to look for
        # iterate over a copy of the child list, since nodes are appended to
        # other tags inside the loop
        for node in entry.contents[:]:
            node_text = bs4_get_text(node)
            # NOTE(review): '{1}' inside this rf-string is the literal 1, so
            # the pattern is always r'adjective\s*1.' whatever n is; the '.'
            # is also an unescaped regex wildcard here and on the next
            # line - confirm both are intended.
            if f'{n}.' in re.sub(rf'adjective\s*{1}.', '', node_text):
                # Start of numbered sense n.
                as_subnode = node.find(text=re.compile(rf'\s+{n}.\s+'))
                if as_subnode:
                    # the marker is a text node inside `node`: siblings
                    # before it are appended to the current piece, in
                    # document order (previous_siblings iterates backwards,
                    # hence the reversal)
                    rev = []
                    for r in as_subnode.previous_siblings:
                        rev.append(r)
                    for r in reversed(rev):
                        subentries[-1].append(r)
                # text before the last 'n.' closes the current piece,
                # text after it opens the new one
                pre, post = node_text.rsplit(f'{n}.', 1)
                if pre.strip():
                    subentries[-1].append(pre.strip())
                subentries.append(soup.new_tag('div'))
                subentry_labels.append(f'{n}. ')
                nxi = 0  # lettering restarts at 'a' within each sense
                if post.strip():
                    subentries[-1].append(post.strip())
                if as_subnode:
                    for r in as_subnode.next_siblings:
                        subentries[-1].append(r)
                n += 1
            # Lettered sub-sense: either '(a)' inside the node text, or the
            # bare letter as its own node with '(' ending the current piece
            # and ')' in the following sibling.
            elif (len(subentries) > 1  # we've got at least a '1.' already
                  and (f'({nxs[nxi]})' in node_text or
                       (f'{nxs[nxi]}' == node_text.strip()
                        and node.next_sibling.string.strip() == ')'
                        and subentries[-1].get_text().strip().endswith('(')))):
                pre, post = node_text.split(f'{nxs[nxi]}', 1)
                if pre.strip().rstrip('('):
                    subentries[-1].append(pre.strip())
                if subentries[-1].get_text().strip().rstrip('('):
                    # current piece already has content: open a new piece
                    subentries.append(soup.new_tag('div'))
                    subentry_labels.append(f'{n-1}.({nxs[nxi]}) ')
                else:
                    # current piece is still empty: just relabel it
                    subentry_labels[-1] = f'{n-1}.({nxs[nxi]}) '
                if post.strip().lstrip(')'):
                    subentries[-1].append(post.strip())
                nxi += 1
            else:
                # ordinary content: belongs to the current piece
                subentries[-1].append(node)
        yield subentries, subentry_labels

コード例 #8

0

ファイルを表示

ファイル: teanglann.py プロジェクト: timczerniak/lookup-irish

def get_line_types_gender(word, line):
    """
    Work out the parts of speech and the gender/declension code described
    by `line`, the parsed first line of the dictionary entry for `word`.

    Returns a tuple (types, gender):
    - types: OrderedDict used as an ordered set of part-of-speech names
      ('Pronoun', 'Noun', 'Adverb', 'Preposition', 'Adjective',
      'Conjunction', 'Prefix'); 'Verb' maps to a nested OrderedDict
      holding 'Transitive' and/or 'Intransitive'.
    - gender: None, a noun code from assign_gender_declension(), an
      adjective code such as 'a1', or both joined with a newline.

    Note that `line` may be modified: a gender marker that only occurs
    inside a '(Var: ...)' remark is extracted from it.
    """
    gender = None
    types = OrderedDict()  # using as ordered set
    if line.find(title="feminine") and \
       line.find(title="masculine"):
        # 'cara' has '(Var:feminine)' at the end
        # 'neantóg' has '(Var:neantán masculine)' at the end
        for g in ['masculine', 'feminine']:
            r = re.compile(r'\(var(?:iant)*:[^)]*\s*' + g + r'\)', re.I)
            if g not in re.sub(r, '', bs4_get_text(line)):
                # this gender is only mentioned for the variant: drop it
                line.find(title=g).extract()

    # TODO: should look at line only up to opening parenthesis

    if line.find(title="pronoun"):
        # sé/sí are not nouns
        types['Pronoun'] = True
    elif line.find(title="feminine") and \
            line.find(title="masculine") and \
            word != 'talamh':
        # 'thar': has 'thairis (m) thairsti (f)' and is not a noun
        pass
    elif line.find(title="feminine") or line.find(title="masculine"):
        types['Noun'] = True
        gender = assign_gender_declension(word, line)
    if line.find(title="adverb"):
        types['Adverb'] = True
    if line.find(title="preposition"):
        types['Preposition'] = True
    adjective_tag = line.find(title="adjective")
    if adjective_tag:
        types['Adjective'] = True
        gender_a = 'a'
        dec = adjective_tag.next_sibling
        # next_sibling may be another tag (or None) rather than text; only
        # text can carry the declension digit
        dec_text = dec.strip().strip('.') if isinstance(dec, str) else ''
        # to check: think it only goes up to a3
        if dec_text in ('1', '2', '3', '4'):
            gender_a += dec_text
        else:
            soup_fb = get_definition_soup(word, 'teanglann', lang='ga-fb')
            if soup_fb.find(string='aid3'):
                gender_a += '3'
            elif soup_fb.find(string='aid2'):
                gender_a += '2'
            elif soup_fb.find(string='aid1'):
                gender_a += '1'
            elif not soup_fb.find(string='aid'):
                # 'thar' spurious adj. in following:
                # ' of <span title="adjective">a</span> general nature'
                del types['Adjective']
                gender_a = None
        if gender_a:
            if not gender:
                gender = gender_a
            else:
                gender += '\n' + gender_a
    if line.find(title="transitive verb"):
        types.setdefault('Verb', OrderedDict())['Transitive'] = True
    if line.find(title="intransitive verb") or \
       line.find(title="and intransitive"):
        types.setdefault('Verb', OrderedDict())['Intransitive'] = True
    if line.find(title="conjunction"):
        # was stored as 'Conjugation', which names verb inflection rather
        # than the part of speech matched here
        types['Conjunction'] = True
    if line.find(title="prefix"):
        types['Prefix'] = True
    if 'Verb' in types and 'Noun' in types:
        # when both are present the entry is reported as a verb only
        del types['Noun']
        gender = None
    return types, gender