def build_cped():
    """Load the Concise Pali-English Dictionary (CPED) into the database.

    Reads the raw rows from ``cped_data.entries``, registers the dictionary
    in ``dicts`` under the code 'CPED', and inserts one row per entry into
    ``terms_base`` and ``entries_base`` through the module-level ``con``.

    Side effects: advances the module-level ``dict_id`` once and
    ``term_id`` / ``entry_id`` once per entry, and prints the number of
    term and entry rows that were prepared.
    """
    global dict_id, entry_id, term_id, con

    CPEDRow = collections.namedtuple(
        'CPEDRow',
        'pali phonhash boost defn grammar meaning source inflectgroup '
        'inflectinfo baseword basedefn funcstem regular'
    )

    # Build the rows from the source data without modifying it, so that
    # cped_data.entries stays intact and the build can be run again.
    rows = []
    for raw in cped_data.entries:
        pali = textfunctions.vel_to_uni(raw[0])
        # The phonetic hash and a constant boost of 1 are placed right after
        # the headword. Boost is ignored for CPED.
        rows.append(
            CPEDRow(pali, textfunctions.phonhash(pali), 1, *raw[1:]))

    # Populate dictionary information.
    dict_id += 1
    author = "A.P.Buddhadatta Mahāthera"
    about = ""
    details = ""
    con.execute('INSERT INTO dicts VALUES(?, ?, ?, ?, ?, ?)',
                (dict_id, 'CPED', 'Concise Pali-English Dictionary', author,
                 about, details))

    term_rows = []
    entry_rows = []
    for row in rows:
        # Each CPED entry has exactly one term, so both counters advance
        # together.
        term_id += 1
        entry_id += 1
        term_rows.append(
            (term_id, None, row.pali, 1, row.phonhash, row.boost, entry_id))
        entry_rows.append(
            (entry_id, term_id, None, textfunctions.mangle(row.meaning),
             row.meaning, row.grammar, dict_id))
    print(len(term_rows), len(entry_rows))

    # term_id, base_id, term, number, phon_hash, boost, entry_id
    con.executemany('INSERT INTO terms_base VALUES(?, ?, ?, ?, ?, ?, ?)',
                    term_rows)

    # entry_id, term_id, html, text, brief, info, dict_id
    con.executemany('INSERT INTO entries_base VALUES (?, ?, ?, ?, ?, ?, ?)',
                    entry_rows)
# Example #2
# 0
def build_cped():
    """Load the Concise Pali-English Dictionary (CPED) into the database.

    Reads the raw rows from ``cped_data.entries``, registers the dictionary
    in ``dicts`` under the code 'CPED', and inserts one row per entry into
    ``terms_base`` and ``entries_base`` through the module-level ``con``.

    Side effects: advances the module-level ``dict_id`` once and
    ``term_id`` / ``entry_id`` once per entry, and prints the number of
    term and entry rows that were prepared.
    """
    global dict_id, entry_id, term_id, con

    CPEDRow = collections.namedtuple(
        'CPEDRow',
        'pali phonhash boost defn grammar meaning source inflectgroup '
        'inflectinfo baseword basedefn funcstem regular'
    )

    # Build the rows from the source data without modifying it, so that
    # cped_data.entries stays intact and the build can be run again.
    rows = []
    for raw in cped_data.entries:
        pali = textfunctions.vel_to_uni(raw[0])
        # The phonetic hash and a constant boost of 1 are placed right after
        # the headword. Boost is ignored for CPED.
        rows.append(
            CPEDRow(pali, textfunctions.phonhash(pali), 1, *raw[1:]))

    # Populate dictionary information.
    dict_id += 1
    author = "A.P.Buddhadatta Mahāthera"
    about = ""
    details = ""
    con.execute('INSERT INTO dicts VALUES(?, ?, ?, ?, ?, ?)',
                (dict_id, 'CPED', 'Concise Pali-English Dictionary', author,
                 about, details))

    term_rows = []
    entry_rows = []
    for row in rows:
        # Each CPED entry has exactly one term, so both counters advance
        # together.
        term_id += 1
        entry_id += 1
        term_rows.append(
            (term_id, None, row.pali, 1, row.phonhash, row.boost, entry_id))
        entry_rows.append(
            (entry_id, term_id, None, textfunctions.mangle(row.meaning),
             row.meaning, row.grammar, dict_id))
    print(len(term_rows), len(entry_rows))

    # term_id, base_id, term, number, phon_hash, boost, entry_id
    con.executemany('INSERT INTO terms_base VALUES(?, ?, ?, ?, ?, ?, ?)',
                    term_rows)

    # entry_id, term_id, html, text, brief, info, dict_id
    con.executemany('INSERT INTO entries_base VALUES (?, ?, ?, ?, ?, ?, ?)',
                    entry_rows)
def build_dppn():
    """Load the 'Early Buddhism Proper Names' dictionary into the database.

    Parses ``sc_dppn.html`` from ``sc.dict_sources_dir``, registers the
    dictionary in ``dicts`` under the code 'EBPN', and for every
    ``person`` / ``place`` / ``thing`` element inserts its names into
    ``terms_base``, the entry itself into ``entries_base`` and its parsed
    references into ``refs``, all through the module-level ``con``.

    Side effects: advances the module-level ``dict_id`` once, ``entry_id``
    once per entry and ``term_id`` once per name; prints a message for each
    reference that does not match the reference pattern.

    Raises:
        IndexError: if the source has fewer than three ``meta`` elements,
            or an entry contains no ``name`` element.
    """
    global dict_id, entry_id, term_id, con

    dict_path = sc.dict_sources_dir / 'sc_dppn.html'
    with dict_path.open('r', encoding='utf-8') as f:
        dom = lxml.html.fromstring(f.read())

    # Sanity correction: re-attach every meta/person/place/thing element
    # that is not a direct child of the root, so that all of them are.
    for e in list(dom.cssselect('meta, person, place, thing')):
        if e.getparent() != dom:
            dom.append(e)

    # Populate dictionary information.
    dict_id += 1
    metas = dom.cssselect('meta')
    author = metas[1].attrib['content']
    about = metas[0].attrib['content']
    details = metas[2].attrib['content']
    con.execute('INSERT INTO dicts VALUES(?, ?, ?, ?, ?, ?)',
                (dict_id, 'EBPN', 'Early Buddhism Proper Names', author, about,
                 details))

    # Used to process references in DPPN
    refrex = regex.compile(r'(\w+)(?:[.]([ivxml]+|p))?[.](\d+)(?:–(\d+))?')

    # How many times each name has been seen so far, across all entries.
    count = collections.Counter()

    def loc(e):
        """Rewrite a ``location`` element in place."""
        e.attrib['class'] = 'location'
        ll = e.text
        if ll and ll[0].isdigit():
            # Text starting with a digit becomes the ``ll`` parameter of a
            # map link.
            e.tag = 'a'
            ll = ll.replace(' ', '')
            e.attrib['href'] = 'http://maps.google.com.au/maps?ll={}'.format(
                ll)
            e.text = '^ see location '
        elif ll:
            e.tag = 'span'
        else:
            # Empty location: remove it, together with a directly following
            # ``precision`` element if there is one.
            following = e.getnext()
            if following is not None and following.tag == "precision":
                following.drop_tree()
            e.drop_tree()

    # Source tag -> (new tag, class), a callable applied to the element,
    # or None to drop the tag while keeping its content.
    tfn = {
        'ref': ('a', 'ref'),
        'description': None,
        'place': ('div', 'place'),
        'person': ('div', 'person'),
        'thing': ('div', 'thing'),
        'location': loc,
        'precision': ('span', 'precision'),
        'type': ('span', 'type'),
    }

    for entry in dom.cssselect('person, place, thing'):
        entry_id += 1

        # The length of the untouched, serialized entry feeds the boost.
        html = lxml.html.tostring(entry, encoding='utf8').decode()

        name_rows = []
        # Iterate over a snapshot: every name is removed from the tree
        # inside the loop.
        for i, e in enumerate(list(entry.iter('name'))):
            name = e.text_content().strip()
            count.update([name])
            boost = textfunctions.mc4_boost(len(html), 1000)
            if i > 0:
                # Names after the first get a boost adjusted by position.
                boost = (1 + i + boost) / (2 + i)
            term_id += 1
            name_rows.append([
                term_id, 0, name, count[name],
                textfunctions.phonhash(name) if name else None, boost, entry_id
            ])
            e.drop_tree()
        alt_names = ", ".join(r[2] for r in name_rows[1:])

        # Capture the references now: the transformation below renames
        # ``ref`` elements to ``a``, so a lookup by tag afterwards would no
        # longer find them.
        ref_elements = list(entry.iter('ref'))
        refstrs = [(r.text or '').strip() for r in ref_elements]

        # Iterate over a snapshot, because the handlers remove elements and
        # tags from the tree that is being walked.
        for e in list(entry.iter()):
            if e.tag in tfn:
                value = tfn[e.tag]
                if value is None:
                    e.drop_tag()
                elif callable(value):
                    value(e)
                elif len(value) == 2:
                    e.tag = value[0]
                    e.attrib['class'] = value[1]
                else:
                    raise NotImplementedError

        html = lxml.html.tostring(entry, encoding='utf8').decode()
        html = html.replace('</a>', '</a> ').replace('<a',
                                                     ' <a').replace('  ', ' ')

        # Destructively modify the element: pad each reference with spaces
        # and merge it into its parent before the plain text is extracted.
        for ref in ref_elements:
            if ref.getparent() is None:
                # Already detached together with an enclosing element.
                continue
            ref.text = " " + (ref.text or "") + " "
            ref.drop_tag()

        paras = [p.text_content() for p in entry.iter('p')]
        text = regex.sub(" {2,}", " ", " ".join(paras))

        # terms:
        # term_id, base_id, term, number, phon_hash, boost, entry_id

        con.executemany('INSERT INTO terms_base VALUES(?, ?, ?, ?, ?, ?, ?)',
                        name_rows)

        # entries:
        # entry_id, term_id, html, text, brief, info, dict_id
        # NOTE(review): the values passed below are (entry_id, first term_id,
        # alt_names, html, mangled text, entry tag, dict_id), which does not
        # obviously line up with the column list above, and no brief is
        # stored -- confirm against the table schema.
        con.execute('INSERT INTO entries_base VALUES (?, ?, ?, ?, ?, ?, ?)',
                    (entry_id, name_rows[0][0], alt_names, html,
                     textfunctions.mangle(text), entry.tag, dict_id))

        # Populate references.
        for refstr in refstrs:
            m = refrex.match(refstr)
            if m is None:
                print("Malformed ref: {}, ignoring.".format(refstr))
                continue
            values = [entry_id]
            values.extend(m.groups())
            try:
                con.execute('INSERT INTO refs VALUES (?, ?, ?, ?, ?)', values)
            except Exception:
                # Show the offending row before propagating the error.
                print(values)
                raise
# Example #4
# 0
def build_dppn():
    """Load the 'Early Buddhism Proper Names' dictionary into the database.

    Parses ``sc_dppn.html`` from ``sc.dict_sources_dir``, registers the
    dictionary in ``dicts`` under the code 'EBPN', and for every
    ``person`` / ``place`` / ``thing`` element inserts its names into
    ``terms_base``, the entry itself into ``entries_base`` and its parsed
    references into ``refs``, all through the module-level ``con``.

    Side effects: advances the module-level ``dict_id`` once, ``entry_id``
    once per entry and ``term_id`` once per name; prints a message for each
    reference that does not match the reference pattern.

    Raises:
        IndexError: if the source has fewer than three ``meta`` elements,
            or an entry contains no ``name`` element.
    """
    global dict_id, entry_id, term_id, con

    dict_path = sc.dict_sources_dir / 'sc_dppn.html'
    with dict_path.open('r', encoding='utf-8') as f:
        dom = lxml.html.fromstring(f.read())

    # Sanity correction: re-attach every meta/person/place/thing element
    # that is not a direct child of the root, so that all of them are.
    for e in list(dom.cssselect('meta, person, place, thing')):
        if e.getparent() != dom:
            dom.append(e)

    # Populate dictionary information.
    dict_id += 1
    metas = dom.cssselect('meta')
    author = metas[1].attrib['content']
    about = metas[0].attrib['content']
    details = metas[2].attrib['content']
    con.execute('INSERT INTO dicts VALUES(?, ?, ?, ?, ?, ?)',
                (dict_id, 'EBPN', 'Early Buddhism Proper Names',
                 author, about, details))

    # Used to process references in DPPN
    refrex = regex.compile(r'(\w+)(?:[.]([ivxml]+|p))?[.](\d+)(?:–(\d+))?')

    # How many times each name has been seen so far, across all entries.
    count = collections.Counter()

    def loc(e):
        """Rewrite a ``location`` element in place."""
        e.attrib['class'] = 'location'
        ll = e.text
        if ll and ll[0].isdigit():
            # Text starting with a digit becomes the ``ll`` parameter of a
            # map link.
            e.tag = 'a'
            ll = ll.replace(' ', '')
            e.attrib['href'] = 'http://maps.google.com.au/maps?ll={}'.format(ll)
            e.text = '^ see location '
        elif ll:
            e.tag = 'span'
        else:
            # Empty location: remove it, together with a directly following
            # ``precision`` element if there is one.
            following = e.getnext()
            if following is not None and following.tag == "precision":
                following.drop_tree()
            e.drop_tree()

    # Source tag -> (new tag, class), a callable applied to the element,
    # or None to drop the tag while keeping its content.
    tfn = {
        'ref': ('a', 'ref'),
        'description': None,
        'place': ('div', 'place'),
        'person': ('div', 'person'),
        'thing': ('div', 'thing'),
        'location': loc,
        'precision': ('span', 'precision'),
        'type': ('span', 'type'),
    }

    for entry in dom.cssselect('person, place, thing'):
        entry_id += 1

        # The length of the untouched, serialized entry feeds the boost.
        html = lxml.html.tostring(entry, encoding='utf8').decode()

        name_rows = []
        # Iterate over a snapshot: every name is removed from the tree
        # inside the loop.
        for i, e in enumerate(list(entry.iter('name'))):
            name = e.text_content().strip()
            count.update([name])
            boost = textfunctions.mc4_boost(len(html), 1000)
            if i > 0:
                # Names after the first get a boost adjusted by position.
                boost = (1 + i + boost) / (2 + i)
            term_id += 1
            name_rows.append([
                term_id,
                0,
                name,
                count[name],
                textfunctions.phonhash(name) if name else None,
                boost,
                entry_id,
            ])
            e.drop_tree()
        alt_names = ", ".join(r[2] for r in name_rows[1:])

        # Capture the references now: the transformation below renames
        # ``ref`` elements to ``a``, so a lookup by tag afterwards would no
        # longer find them.
        ref_elements = list(entry.iter('ref'))
        refstrs = [(r.text or '').strip() for r in ref_elements]

        # Iterate over a snapshot, because the handlers remove elements and
        # tags from the tree that is being walked.
        for e in list(entry.iter()):
            if e.tag in tfn:
                value = tfn[e.tag]
                if value is None:
                    e.drop_tag()
                elif callable(value):
                    value(e)
                elif len(value) == 2:
                    e.tag = value[0]
                    e.attrib['class'] = value[1]
                else:
                    raise NotImplementedError

        html = lxml.html.tostring(entry, encoding='utf8').decode()
        html = html.replace('</a>', '</a> ').replace('<a', ' <a').replace('  ', ' ')

        # Destructively modify the element: pad each reference with spaces
        # and merge it into its parent before the plain text is extracted.
        for ref in ref_elements:
            if ref.getparent() is None:
                # Already detached together with an enclosing element.
                continue
            ref.text = " " + (ref.text or "") + " "
            ref.drop_tag()

        paras = [p.text_content() for p in entry.iter('p')]
        text = regex.sub(" {2,}", " ", " ".join(paras))

        # terms:
        # term_id, base_id, term, number, phon_hash, boost, entry_id

        con.executemany('INSERT INTO terms_base VALUES(?, ?, ?, ?, ?, ?, ?)',
                        name_rows)

        # entries:
        # entry_id, term_id, html, text, brief, info, dict_id
        # NOTE(review): the values passed below are (entry_id, first term_id,
        # alt_names, html, mangled text, entry tag, dict_id), which does not
        # obviously line up with the column list above, and no brief is
        # stored -- confirm against the table schema.
        con.execute('INSERT INTO entries_base VALUES (?, ?, ?, ?, ?, ?, ?)',
                    (entry_id, name_rows[0][0], alt_names, html,
                     textfunctions.mangle(text), entry.tag, dict_id))

        # Populate references.
        for refstr in refstrs:
            m = refrex.match(refstr)
            if m is None:
                print("Malformed ref: {}, ignoring.".format(refstr))
                continue
            values = [entry_id]
            values.extend(m.groups())
            try:
                con.execute('INSERT INTO refs VALUES (?, ?, ?, ?, ?)', values)
            except Exception:
                # Show the offending row before propagating the error.
                print(values)
                raise