def make_sqlite(self):
    """Convert the MDX dictionary into a SQLite database file.

    Creates ``<mdx_file>.sqlite.db`` next to the source file (removing any
    previous copy), inserts every (key, value) record into table MDX_DICT
    with pinyin annotations and back-quoted digit markers stripped from the
    values, stores the dictionary metadata in table META, and optionally
    creates an index on the keys.

    Side effects: deletes/creates a file on disk.
    """
    sqlite_file = self._mdx_file + '.sqlite.db'
    # Start from a clean slate: a stale database would otherwise keep old rows.
    if os.path.exists(sqlite_file):
        os.remove(sqlite_file)
    mdx = MDX(self._mdx_file)
    conn = sqlite3.connect(sqlite_file)
    try:
        cursor = conn.cursor()
        cursor.execute(
            ''' CREATE TABLE MDX_DICT (key text not null, value text )''')

        # remove '(pīnyīn)', remove `1`:
        # compiled once instead of re-parsing the pattern for every record.
        aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ'
        pattern = re.compile(
            r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?" % (aeiou, aeiou, aeiou))
        tuple_list = [(key.decode(), pattern.sub('', value.decode()))
                      for key, value in mdx.items()]
        cursor.executemany('INSERT INTO MDX_DICT VALUES (?,?)', tuple_list)

        returned_index = mdx.get_index(check_block=self._check)
        meta = returned_index['meta']
        cursor.execute('''CREATE TABLE META (key text, value text)''')
        cursor.executemany(
            'INSERT INTO META VALUES (?,?)',
            [('encoding', meta['encoding']),
             ('stylesheet', meta['stylesheet']),
             ('title', meta['title']),
             ('description', meta['description']),
             ('version', version)])

        if self._sql_index:
            cursor.execute(
                ''' CREATE INDEX key_index ON MDX_DICT (key) ''')
        conn.commit()
    finally:
        # Close even on failure so the (possibly partial) db file is released;
        # previously an exception anywhere above leaked the connection.
        conn.close()
def _interleave(texts, words):
    """Weave alternating plain-text runs and highlighted words into one list.

    xpath returns the text fragments and the <span> words as two separate
    lists; the markup presumably alternates text/word/.../text, so ``texts``
    is expected to be one element longer than ``words`` — TODO confirm
    against the Collins HTML.
    """
    merged = []
    for text, word in zip(texts, words):
        merged.append(text)
        merged.append(word)
    if texts:
        # trailing text fragment after the last highlighted word
        merged.append(texts[-1])
    return merged


def collins(filename):
    """Print the first 12 entries of the Collins MDX dictionary *filename*.

    For each headword prints either a "see also" cross-reference or the
    Chinese gloss, the reassembled English explanation, and every example
    sentence (English line followed by its Chinese translation).
    Output goes to stdout; nothing is returned.
    """
    mdx = MDX(filename)
    items = mdx.items()
    # NOTE: the previous version reused `i` for three nested loops, shadowing
    # the entry counter; a stray debug print of the raw fragment list is gone.
    for _ in range(12):
        key, raw_html = next(items)
        print(key)
        dom = etree.HTML(raw_html)
        explains = dom.xpath(r'//*[@class="C1_explanation_item"]')
        for e in explains:
            seealso = e.xpath(r'div/*/a[@class="C1_explain"]/text()')
            if seealso:
                print('see also', seealso)
                continue
            cn = (e.xpath(r'div/span[@class="C1_explanation_label"]/text()')
                  + e.xpath(r'div/span[@class="C1_text_blue"]/text()'))
            print(''.join(cn))
            texts = e.xpath(r'div/text()')
            words = e.xpath(r'div/span[@class="C1_inline_word"]/text()')
            en = (e.xpath(r'div/span[@class="C1_word_gram"]/text()')
                  + _interleave(texts, words))
            print(''.join(en))
            for sen in e.xpath(r'ul/li'):
                sen_texts = sen.xpath(r'p[1]/text()')
                sen_words = sen.xpath(r'p[1]/span[@class="C1_text_blue"]/text()')
                print(''.join(_interleave(sen_texts, sen_words)))
                sentence_cn = sen.xpath(r'p[2]/text()')
                print(''.join(sentence_cn))
from readmdict import MDX, MDD dirstring = "资料来源整理/资料来源整理/" mdx = MDX("21世纪大英汉词典.mdx") r = r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?" r1 = r"`\d`" #logging.basicConfig() logger = logging.getLogger('root') fh = logging.FileHandler('test.log') fh.setLevel(logging.INFO) logger.addHandler(fh) pattern = re.compile(r) pattern1 = re.compile(r1) dic = {} valuelist = [] for key, value in mdx.items(): try: soup = BeautifulSoup(value) #print soup.prettify() phone = soup.find("span", class_="phone") tree = soup.find_all("span", class_="trs") # print key synonym = soup.find("span", class_="syno") #同义词 antonym = soup.find("span", class_="anto") #反义词 if key == "abandon": #print soup.prettify() #print tree.text
from bs4 import BeautifulSoup from readmdict import MDX,MDD dirstring="资料来源整理/资料来源整理/" mdx=MDX("21世纪大英汉词典.mdx") r=r"`1`.*?`2`(\[.*?\])?\s*(\<br\>)?" r1=r"`\d`" #logging.basicConfig() logger = logging.getLogger('root') fh = logging.FileHandler('test.log') fh.setLevel(logging.INFO) logger.addHandler(fh) pattern=re.compile(r) pattern1=re.compile(r1) dic={} valuelist=[] for key,value in mdx.items(): try: soup=BeautifulSoup(value) #print soup.prettify() phone= soup.find("span",class_="phone") tree=soup.find_all("span",class_="trs") # print key synonym=soup.find("span",class_="syno") #同义词 antonym=soup.find("span",class_="anto") #反义词 if key=="abandon": #print soup.prettify() #print tree.text
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Export an MDX dictionary file to CSV, stripping HTML from definitions.

Requires the ``readmdict`` package (plus its python-lzo, ripemd128 and
pureSalsa20 dependencies) to read the source .mdx file.
"""
import csv
import re

csvaddress = 'XXXX.csv'   # Your CSV file name to export
mdxaddress = 'XXXX.mdx'   # The original MDX Dictionary file

# Compiled once at module load instead of on every call.
_TAG_RE = re.compile('<.*?>')


def remove_html_tags(text):
    """Remove html tags from a string."""
    return _TAG_RE.sub('', text)


def export_mdx_to_csv(mdx_path, csv_path):
    """Read every (headword, definition) pair from *mdx_path*, strip the
    HTML markup from each definition, and write all rows to *csv_path*.

    Replaces the old flow that (a) used Python-2-only ``iterator.next()``,
    (b) consumed the same iterator both in the ``for`` header and inside
    the body — silently skipping every other entry — and (c) relied on a
    bare ``except:`` firing at exhaustion to trigger the CSV write.
    """
    from readmdict import MDX  # project dependency; imported lazily

    rows = []
    for key, value in MDX(mdx_path).items():
        # NOTE(review): newer readmdict versions may yield bytes here;
        # decode before stripping if so — confirm against your version.
        rows.append([key, remove_html_tags(value)])

    # newline='' lets the csv module control line endings itself.
    with open(csv_path, 'w', newline='') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(rows)


if __name__ == '__main__':
    export_mdx_to_csv(mdxaddress, csvaddress)
print(f"[ ] {path}", end='\r') path_tuple = path.split('\\') path_str = str(os.path.join( "pages", *path_tuple[:-1])) if len(path_tuple) > 1 else '' if path_str != '' and not os.path.exists(path_str): os.mkdir(path_str) src_file = open(os.path.join("pages", path), 'wb') src_file.write(data) src_file.flush() src_file.close() print("[v]") elif sys.argv[1].split('.')[-1] == "mdx": mdx_file = MDX(sys.argv[1], encoding='utf-8') index_info = {} print("正在提取词典页面文件...") for mdx_item in mdx_file.items(): item_name = base64.urlsafe_b64encode( mdx_item[0]).decode() + '.html' item_content = mdx_item[1].decode() page_path = os.path.join("pages", item_name) if os.path.exists(page_path): print(f"[x] {page_path} 已存在!") else: print(f"[ ] {page_path}", end='\r') if "@@@LINK=" in item_content: link = item_content[8:] item_content = "<a href=\"{}\">{}</a>".format( base64.urlsafe_b64encode(link.encode()).decode(), link) else: index_info.update({mdx_item[0].decode(): item_name}) page_file = open(page_path, 'w', encoding='utf-8')