def build_cped():
    global dict_id, entry_id, term_id, con

    entries = cped_data.entries

    # Perform corrections and stuff on the entries data.
    for entry in entries:
        # Generate phonetic hash, and boost factor. Boost is ignored for CPED.
        entry[0] = textfunctions.vel_to_uni(entry[0])  # -> utf8
        entry[1:1] = [textfunctions.phonhash(entry[0]), 1]

    CPEDRow = collections.namedtuple(
        'CPEDRow',
        'pali phonhash boost defn grammar meaning source inflectgroup '
        'inflectinfo baseword basedefn funcstem regular')
    rows = tuple(CPEDRow._make(entry) for entry in entries)

    # Populate dictionary information.
    dict_id += 1
    author = "A.P.Buddhadatta Mahāthera"
    about = ""
    details = ""
    con.execute('INSERT INTO dicts VALUES(?, ?, ?, ?, ?, ?)',
                (dict_id, 'CPED', 'Concise Pali-English Dictionary',
                 author, about, details))

    terms = []
    entries = []
    search_entries = []
    for row in rows:
        term_id += 1
        entry_id += 1
        terms.append(
            (term_id, None, row.pali, 1, row.phonhash, row.boost, entry_id))
        entries.append(
            (entry_id, term_id, None, textfunctions.mangle(row.meaning),
             row.meaning, row.grammar, dict_id))
        search_entries.append((entry_id, row.meaning))
    print(len(terms), len(entries))

    # term_id, base_id, term, number, phon_hash, boost, entry_id
    con.executemany('INSERT INTO terms_base VALUES(?, ?, ?, ?, ?, ?, ?)',
                    terms)
    # entry_id, term_id, html, text, brief, info, dict_id
    con.executemany('INSERT INTO entries_base VALUES (?, ?, ?, ?, ?, ?, ?)',
                    entries)
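# A minimal sketch, not part of the build: the table layout that the INSERT
# statements above assume, inferred from the column comments in build_cped().
# The real schema is created elsewhere; the column names for `dicts`
# (acronym, name) and all types/constraints shown here are assumptions.
def _sketch_create_schema(connection):
    """Create illustrative dicts/terms_base/entries_base tables (sqlite3)."""
    connection.executescript('''
        CREATE TABLE IF NOT EXISTS dicts (
            dict_id INTEGER PRIMARY KEY, acronym TEXT, name TEXT,
            author TEXT, about TEXT, details TEXT);
        CREATE TABLE IF NOT EXISTS terms_base (
            term_id INTEGER PRIMARY KEY, base_id INTEGER, term TEXT,
            number INTEGER, phon_hash TEXT, boost REAL, entry_id INTEGER);
        CREATE TABLE IF NOT EXISTS entries_base (
            entry_id INTEGER PRIMARY KEY, term_id INTEGER, html TEXT,
            text TEXT, brief TEXT, info TEXT, dict_id INTEGER);
    ''')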
def build_dppn():
    global dict_id, entry_id, term_id, con

    dict_path = sc.dict_sources_dir / 'sc_dppn.html'
    with dict_path.open('r', encoding='utf-8') as f:
        dom = lxml.html.fromstring(f.read())

    # Perform sanity-correction: hoist stray elements up to the document root.
    items = list(dom.cssselect('meta, person, place, thing'))
    for e in items:
        if e.getparent() != dom:
            dom.append(e)

    # Populate dictionary information.
    dict_id += 1
    metas = dom.cssselect('meta')
    author = metas[1].attrib['content']
    about = metas[0].attrib['content']
    details = metas[2].attrib['content']
    con.execute('INSERT INTO dicts VALUES(?, ?, ?, ?, ?, ?)',
                (dict_id, 'EBPN', 'Early Buddhism Proper Names',
                 author, about, details))

    # Used to process references in DPPN.
    refrex = regex.compile(r'(\w+)(?:[.]([ivxml]+|p))?[.](\d+)(?:–(\d+))?')

    count = collections.Counter()

    def loc(e):
        e.attrib['class'] = 'location'
        ll = e.text
        if ll and ll[0].isdigit():
            # Numeric coordinates become a Google Maps link.
            e.tag = 'a'
            ll = ll.replace(' ', '')
            e.attrib['href'] = 'http://maps.google.com.au/maps?ll={}'.format(ll)
            e.text = '^ see location '
        else:
            if ll:
                e.tag = 'span'
            else:
                # Empty location: drop it, along with any trailing precision tag.
                if e.getnext().tag == "precision":
                    e.getnext().drop_tree()
                e.drop_tree()

    # Map source tags to a (tag, class) pair, a callable, or None (drop the tag).
    tfn = {
        'ref': ('a', 'ref'),
        'description': None,
        'place': ('div', 'place'),
        'person': ('div', 'person'),
        'thing': ('div', 'thing'),
        'location': loc,
        'precision': ('span', 'precision'),
        'type': ('span', 'type'),
    }

    for entry in dom.cssselect('person, place, thing'):
        entry_id += 1
        html = lxml.html.tostring(entry, encoding='utf8').decode()
        name_rows = []
        for i, e in enumerate(entry.iter('name')):
            name = e.text_content().strip()
            try:
                html_id = e.attrib['id']
            except KeyError:
                html_id = None
            count.update([name])
            boost = textfunctions.mc4_boost(len(html), 1000)
            if i > 0:
                # Dampen the boost for alternative names.
                boost = (1 + i + boost) / (2 + i)
            term_id += 1
            name_rows.append([
                term_id, 0, name, count[name],
                textfunctions.phonhash(name) if name else None,
                boost, entry_id])
            e.drop_tree()
        alt_names = ", ".join(r[2] for r in name_rows[1:])

        for e in entry.iter():
            if e.tag in tfn:
                value = tfn[e.tag]
                if value is None:
                    e.drop_tag()
                elif callable(value):
                    value(e)
                elif len(value) == 2:
                    e.tag = value[0]
                    e.attrib['class'] = value[1]
                else:
                    raise NotImplementedError

        html = lxml.html.tostring(entry, encoding='utf8').decode()
        # Pad anchors with spaces, then collapse the doubled spaces this creates.
        html = html.replace('</a>', '</a> ').replace('<a', ' <a')
        html = html.replace('  ', ' ')

        # Destructively modify the element.
        refstrs = list(t.text.strip() for t in entry.iter('ref'))
        for ref in entry.iter('ref'):
            ref.text = " " + ref.text + " "
            ref.drop_tag()
        paras = [p.text_content() for p in entry.iter('p')]
        text = " ".join(paras)
        text = regex.sub(" {2,}", " ", text)
        if paras:
            brief = create_brief(paras[0])
        else:
            brief = None

        # terms:
        # term_id, base_id, term, number, phon_hash, boost, entry_id
        con.executemany('INSERT INTO terms_base VALUES(?, ?, ?, ?, ?, ?, ?)',
                        name_rows)
        # entries:
        # entry_id, term_id, html, text, brief, info, dict_id
        con.execute('INSERT INTO entries_base VALUES (?, ?, ?, ?, ?, ?, ?)',
                    (entry_id, name_rows[0][0], alt_names, html,
                     textfunctions.mangle(text), entry.tag, dict_id))

        # Populate references.
        for refstr in refstrs:
            m = refrex.match(refstr)
            try:
                values = [entry_id]
                values.extend(m[1:])
                con.execute('INSERT INTO refs VALUES (?, ?, ?, ?, ?)', values)
            except TypeError:
                print("Malformed ref: {}, ignoring.".format(refstr))
            except:
                print(values)
                raise
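# A minimal sketch of how the refrex pattern in build_dppn() decomposes a DPPN
# reference string into the four values appended after entry_id in the `refs`
# insert.  The sample string and the field names (work, vol, page, page_end)
# are illustrative assumptions; only the capture-group order comes from the
# pattern itself.
def _sketch_parse_ref(refstr):
    m = regex.match(r'(\w+)(?:[.]([ivxml]+|p))?[.](\d+)(?:–(\d+))?', refstr)
    if m is None:
        return None
    work, vol, page, page_end = m.groups()
    return work, vol, page, page_end

# e.g. _sketch_parse_ref('DN.i.123–125') -> ('DN', 'i', '123', '125')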